; AesOpt.asm -- AES optimized code for x86 AES hardware instructions
; 2021-12-25 : Igor Pavlov : Public domain
|
||
|
|
||
|
include 7zAsm.asm
|
||
|
|
||
|
; ---------- Build configuration ----------
;
; use_vaes_256 enables the 256-bit (ymm) VAES code paths of this file.
; It is defined when __ASMC__ is defined, and otherwise only when the symbol
; ymm0 is defined (presumably: when the assembler knows the YMM registers
; -- TODO confirm for the assemblers in use).
ifdef __ASMC__
    use_vaes_256 equ 1
else
    ifdef ymm0
        use_vaes_256 equ 1
    endif
endif

; Report the selected configuration in the assembler output.
ifdef use_vaes_256
    ECHO "++ VAES 256"
else
    ECHO "-- NO VAES 256"
endif

ifdef x64
    ECHO "x86-64"
else
    ECHO "x86"
    if (IS_CDECL gt 0)
        ECHO "ABI : CDECL"
    else
        ECHO "ABI : no CDECL : FASTCALL"
    endif
endif

if (IS_LINUX gt 0)
    ECHO "ABI : LINUX"
else
    ECHO "ABI : WINDOWS"
endif
|
||
|
|
||
|
; MY_ASM_START is not defined in this file (presumably it comes from the
; included 7zAsm.asm and opens the code section -- TODO confirm).
MY_ASM_START

; 32-bit builds only: select the .686 instruction set and enable the
; XMM (SSE) instructions in the assembler.
ifndef x64
    .686
    .xmm
endif
|
||
|
|
||
|
|
||
|
; Segment alignment text macros. MY_ALIGN is currently defined as empty text;
; the ALIGN(64) variant is kept below as a disabled alternative.
; SEG_ALIGN is only referenced from commented-out code in MY_SEG_PROC.
; MY_ALIGN EQU ALIGN(64)
MY_ALIGN EQU

SEG_ALIGN EQU MY_ALIGN
|
||
|
|
||
|
; MY_SEG_PROC name, numParams
;   Starts the procedure <name> by forwarding both arguments to MY_PROC.
;   The per-procedure segment setup is disabled (commented out), so this macro
;   is currently just a wrapper around MY_PROC.
;     name      : procedure name
;     numParams : number of parameters, passed through to MY_PROC
MY_SEG_PROC macro name:req, numParams:req
        ; seg_name equ @CatStr(_TEXT$, name)
        ; seg_name SEGMENT SEG_ALIGN 'CODE'
        MY_PROC name, numParams
endm
|
||
|
|
||
|
; MY_SEG_ENDP
;   Counterpart of MY_SEG_PROC. The segment-closing statement is disabled
;   (commented out), so the macro currently expands to nothing.
MY_SEG_ENDP macro
        ; seg_name ENDS
endm
|
||
|
|
||
|
|
||
|
; Upper bound on the number of 16-byte round keys; used to size the stack
; buffers (AVX_STACK_SUB) of the 256-bit code paths.
NUM_AES_KEYS_MAX equ 15

; the number of push operators in function PROLOG
; (MY_PROLOG pushes r6 and r7 under the same condition)
if (IS_LINUX eq 0) or (IS_X64 eq 0)
num_regs_push       equ 2
; distance from r4 (after the pushes) to the first slot above the return
; address: one return-address slot plus the pushed registers
stack_param_offset  equ (REG_SIZE * (1 + num_regs_push))
endif

; num_param : operand from which MY_PROLOG loads the third argument into rN
ifdef x64
    num_param   equ REG_ABI_PARAM_2
else
  if (IS_CDECL gt 0)
    ; stack layout at function entry (highest address first):
    ;   size_t     size
    ;   void *     data
    ;   UInt32 *   aes
    ;   ret-ip <- (r4)
    aes_OFFS    equ (stack_param_offset)
    data_OFFS   equ (REG_SIZE + aes_OFFS)
    size_OFFS   equ (REG_SIZE + data_OFFS)
    num_param   equ [r4 + size_OFFS]
  else
    ; non-CDECL x86 build: only the third argument is read from the stack
    num_param   equ [r4 + stack_param_offset]
  endif
endif
|
||
|
|
||
|
; ---------- Register roles shared by all procedures of this file ----------

keys    equ  REG_PARAM_0  ; r1 : pointer to the IV / round-key area
rD      equ  REG_PARAM_1  ; r2 : pointer to the data blocks
rN      equ  r0           ; block counter, loaded from num_param by MY_PROLOG

; running round-key offset (x = name used for 32-bit style operations,
; r = name used in address expressions)
koffs_x equ  x7
koffs_r equ  r7

; round-key size/offset value derived from [keys + 16] by MY_PROLOG
ksize_x equ  x6
ksize_r equ  r6

; pointer to the broadcast (256-bit) round keys on the stack in the *_256 paths
keys2   equ  r3

; state and key both name xmm0: state is used by the one-block loops,
; key (and its ymm form) by the multi-block loops, never at the same time.
state   equ  xmm0
key     equ  xmm0
key_ymm equ  ymm0
key_ymm_n equ   0         ; register number of key_ymm for the MY_vaes* macros

; number of blocks (xmm) or block pairs (ymm) processed per loop iteration
ifdef x64
        ways = 11
else
        ways = 4
endif

; the working registers are xmm/ymm[ways_start_reg .. ways_start_reg + ways - 1]
ways_start_reg equ 1

; the chaining value / counter lives in the first register after them
iv      equ     @CatStr(xmm, %(ways_start_reg + ways))
iv_ymm  equ     @CatStr(ymm, %(ways_start_reg + ways))
|
||
|
|
||
|
|
||
|
; WOP op, op2
;   Expands "op xmmN, op2" once for each of the `ways` working registers
;   (N = ways_start_reg + i, i = 0 .. ways - 1).
;   `op` can be an instruction or a two-argument macro. The assembly-time
;   variable i is visible during each expansion, so op2 or a macro used as
;   `op` may refer to it (e.g. [rD + i * 16]).
;   Leaves i equal to `ways`.
WOP macro op, op2
    i = 0
    rept ways
        op      @CatStr(xmm, %(ways_start_reg + i)), op2
        i = i + 1
    endm
endm
|
||
|
|
||
|
|
||
|
; ---------- Saving of non-volatile xmm registers (x64 build without ABI_LINUX) ----------
ifndef ABI_LINUX
ifdef x64

; we use 32 bytes of home space in stack in WIN64-x64
NUM_HOME_MM_REGS   equ (32 / 16)
; we preserve xmm registers starting from xmm6 in WIN64-x64
MM_START_SAVE_REG  equ 6

; SAVE_XMM num_used_mm_regs
;   Saves xmm6 .. xmm(num_used_mm_regs - 1); expands to nothing when
;   num_used_mm_regs <= 6.
;   The first NUM_HOME_MM_REGS registers are stored at
;   [r4 + stack_param_offset + i * 16], i.e. above the return address.
;   For any further registers r4 is lowered once by stack_offset and they are
;   stored at the new r4.
;   Must be expanded after the pushes counted by num_regs_push.
;   Sets the assembly-time variables num_save_mm_regs, num_save_mm_regs2 and
;   stack_offset, which RESTORE_XMM reads later.
SAVE_XMM macro num_used_mm_regs:req
  num_save_mm_regs = num_used_mm_regs - MM_START_SAVE_REG
  if num_save_mm_regs GT 0
    num_save_mm_regs2 = num_save_mm_regs - NUM_HOME_MM_REGS
    ; RSP is (16*x + 8) after entering the function in WIN64-x64
    ; the (stack_param_offset mod 16) term keeps the movdqa slots 16-byte aligned
    stack_offset = 16 * num_save_mm_regs2 + (stack_param_offset mod 16)

    i = 0
    rept num_save_mm_regs

      ; allocate the extra area once, right before the first register
      ; that does not fit into the home space
      if i eq NUM_HOME_MM_REGS
        sub     r4, stack_offset
      endif

      if i lt NUM_HOME_MM_REGS
        movdqa  [r4 + stack_param_offset + i * 16], @CatStr(xmm, %(MM_START_SAVE_REG + i))
      else
        movdqa  [r4 + (i - NUM_HOME_MM_REGS) * 16], @CatStr(xmm, %(MM_START_SAVE_REG + i))
      endif

      i = i + 1
    endm
  endif
endm

; RESTORE_XMM num_used_mm_regs
;   Reverses the most recent SAVE_XMM expansion.
;   NOTE: the argument is required but never used; the macro relies on
;   num_save_mm_regs, num_save_mm_regs2 and stack_offset as left by SAVE_XMM.
RESTORE_XMM macro num_used_mm_regs:req
  if num_save_mm_regs GT 0
    i = 0
    if num_save_mm_regs2 GT 0
      ; registers saved in the extra area, then release that area
      rept num_save_mm_regs2
        movdqa  @CatStr(xmm, %(MM_START_SAVE_REG + NUM_HOME_MM_REGS + i)), [r4 + i * 16]
        i = i + 1
      endm
        add     r4, stack_offset
    endif

    ; here i is the number of registers restored so far;
    ; the remaining ones were saved in the home space
    num_low_regs = num_save_mm_regs - i
    i = 0
      rept num_low_regs
        movdqa  @CatStr(xmm, %(MM_START_SAVE_REG + i)), [r4 + stack_param_offset + i * 16]
        i = i + 1
      endm
  endif
endm

endif ; x64
endif ; ABI_LINUX
|
||
|
|
||
|
|
||
|
; MY_PROLOG num_used_mm_regs
;   Common function prolog.
;     num_used_mm_regs : number of xmm registers the function uses (xmm0 .. xmm(n-1));
;                        forwarded to SAVE_XMM in the x64 build without ABI_LINUX
;   On exit:
;     rN      = third argument (num_param)
;     rD,keys = loaded from the stack in the x86 CDECL build; otherwise left as
;               passed in (after MY_ABI_LINUX_TO_WIN_2 in the Linux x64 build)
;     ksize_x = (value at [keys + 16]) shl 5
;               (presumably a round-count field -- TODO confirm against the C caller)
;   r6 and r7 are pushed, except in the Linux x64 build.
MY_PROLOG macro num_used_mm_regs:req
        ; num_regs_push: must be equal to the number of push operators
        ; push    r3
        ; push    r5
    if (IS_LINUX eq 0) or (IS_X64 eq 0)
        push    r6
        push    r7
    endif

        mov     rN, num_param  ; don't move it; num_param can use stack pointer (r4)

    if (IS_X64 eq 0)
      if (IS_CDECL gt 0)
        mov     rD,   [r4 + data_OFFS]
        mov     keys, [r4 + aes_OFFS]
      endif
    elseif (IS_LINUX gt 0)
        MY_ABI_LINUX_TO_WIN_2
    endif


    ifndef ABI_LINUX
    ifdef x64
        ; may lower r4, so every r4-relative argument access is done above
        SAVE_XMM num_used_mm_regs
    endif
    endif

        mov     ksize_x, [keys + 16]
        shl     ksize_x, 5
endm
|
||
|
|
||
|
|
||
|
; MY_EPILOG
;   Common function epilog: undoes MY_PROLOG in reverse order and ends the
;   procedure with MY_ENDP.
;   NOTE: RESTORE_XMM does not use its argument; it works from the
;   assembly-time variables left by the SAVE_XMM expansion of the most recent
;   MY_PROLOG, so MY_EPILOG must follow the MY_PROLOG of the same function.
MY_EPILOG macro
    ifndef ABI_LINUX
    ifdef x64
        RESTORE_XMM num_save_mm_regs
    endif
    endif

    if (IS_LINUX eq 0) or (IS_X64 eq 0)
        pop     r7
        pop     r6
    endif
        ; pop     r5
        ; pop     r3
    MY_ENDP
endm
|
||
|
|
||
|
|
||
|
; OP_KEY op, offs
;   Applies instruction `op` to the one-block state register (xmm0) with the
;   16-byte memory operand at [keys + offs].
OP_KEY macro op:req, offs:req
        op      state, [keys + offs]
endm
|
||
|
|
||
|
|
||
|
; WOP_KEY op, offs
;   Loads the 16 bytes at [keys + offs] into `key` (xmm0) and applies
;   "op reg, key" to every one of the `ways` working registers.
;   Clobbers xmm0; uses movdqa, so the address must be 16-byte aligned.
WOP_KEY macro op:req, offs:req
        movdqa  key, [keys + offs]
        WOP     op, key
endm
|
||
|
|
||
|
|
||
|
; ---------- AES-CBC Decode ----------
|
||
|
|
||
|
|
||
|
; XOR_WITH_DATA reg, _ppp_
;   xors `reg` with the i-th 16-byte data block at [rD + i * 16].
;   i is the assembly-time index of the enclosing WOP expansion;
;   _ppp_ is an unused placeholder for the second argument that WOP passes.
XOR_WITH_DATA macro reg, _ppp_
        pxor    reg, [rD + i * 16]
endm
|
||
|
|
||
|
; WRITE_TO_DATA reg, _ppp_
;   Stores `reg` into the i-th 16-byte data block at [rD + i * 16] (aligned store).
;   i is the assembly-time index of the enclosing WOP expansion;
;   _ppp_ is an unused placeholder for the second argument that WOP passes.
WRITE_TO_DATA macro reg, _ppp_
        movdqa  [rD + i * 16], reg
endm
|
||
|
|
||
|
|
||
|
; Registers of the CBC decode procedures, placed after the `ways` working
; registers and iv:
;   key0     : the key that is xored into the blocks first
;   key_last : the key used with aesdeclast
; state0 equ  @CatStr(xmm, %(ways_start_reg))

key0            equ @CatStr(xmm, %(ways_start_reg + ways + 1))
key0_ymm        equ @CatStr(ymm, %(ways_start_reg + ways + 1))

key_last        equ @CatStr(xmm, %(ways_start_reg + ways + 2))
key_last_ymm    equ @CatStr(ymm, %(ways_start_reg + ways + 2))
; register number of key_last_ymm for the MY_vaes* macros
key_last_ymm_n  equ (ways_start_reg + ways + 2)

; total number of xmm registers used by CBC decode (xmm0 .. xmm(NUM_CBC_REGS - 1))
NUM_CBC_REGS    equ (ways_start_reg + ways + 3)
|
||
|
|
||
|
|
||
|
; AesCbc_Decode_HW (3 parameters: key area, data, number of blocks)
;   AES-CBC decryption, in place, with the aesdec / aesdeclast instructions.
;
;   After MY_PROLOG:
;     keys    : pointer to the key area
;     rD      : pointer to the data
;     rN      : number of 16-byte blocks
;     ksize_x : ([keys + 16]) shl 5
;
;   Key area as used below:
;     [keys]          : IV; read on entry, replaced on exit by the last
;                       ciphertext block that was processed
;     [keys + 32 ...] : round keys, 16 bytes each; the key at offset ksize is
;                       xored first, the key at offset 0 is used by aesdeclast
;
;   All data and key accesses are aligned accesses (movdqa / SSE memory
;   operands), so keys and rD must be 16-byte aligned.
;
;   AesCbc_Decode_HW_start, _start_2 and _start_3 are public entry labels used
;   by AesCbc_Decode_HW_256.
MY_SEG_PROC AesCbc_Decode_HW, 3

    AesCbc_Decode_HW_start::
        MY_PROLOG NUM_CBC_REGS

    AesCbc_Decode_HW_start_2::
        movdqa  iv, [keys]
        add     keys, 32                        ; keys -> round keys

        movdqa  key0, [keys + 1 * ksize_r]
        movdqa  key_last, [keys]
        sub     ksize_x, 16                     ; offset of the first aesdec key

        jmp     check2

    ; ----- main loop: `ways` blocks per iteration -----
    align 16
    nextBlocks2:
        WOP     movdqa, [rD + i * 16]
        mov     koffs_x, ksize_x
        ; WOP_KEY pxor, ksize_r + 16
        WOP     pxor, key0
    ; align 16
    @@:
        ; keys at offsets koffs = ksize, ksize - 16, ..., 16
        WOP_KEY aesdec, 1 * koffs_r
        sub     koffs_r, 16
        jnz     @B
        ; WOP_KEY aesdeclast, 0
        WOP     aesdeclast, key_last

        ; CBC chaining: block 0 is xored with iv, block i with ciphertext
        ; block i - 1, which is still unmodified in memory at this point
        pxor    @CatStr(xmm, %(ways_start_reg)), iv
    i = 1
    rept ways - 1
        pxor    @CatStr(xmm, %(ways_start_reg + i)), [rD + i * 16 - 16]
        i = i + 1
    endm
        ; the last ciphertext block becomes the next iv; read it before it
        ; is overwritten with plaintext
        movdqa  iv, [rD + ways * 16 - 16]
        WOP     WRITE_TO_DATA

        add     rD, ways * 16
    AesCbc_Decode_HW_start_3::
    check2:
        sub     rN, ways
        jnc     nextBlocks2
        add     rN, ways                        ; rN = remaining blocks (< ways)

        sub     ksize_x, 16                     ; the loop below uses two keys per step

        jmp     check

    ; ----- tail loop: one block per iteration -----
    nextBlock:
        movdqa  state, [rD]
        mov     koffs_x, ksize_x
        ; OP_KEY  pxor, 1 * ksize_r + 32
        pxor    state, key0
        ; movdqa  state0, [rD]
        ; movdqa  state, key0
        ; pxor    state, state0
    @@:
        OP_KEY  aesdec, 1 * koffs_r + 16
        OP_KEY  aesdec, 1 * koffs_r
        sub     koffs_r, 32
        jnz     @B
        OP_KEY  aesdec, 16
        ; OP_KEY  aesdeclast, 0
        aesdeclast state, key_last

        pxor    state, iv
        movdqa  iv, [rD]                        ; ciphertext block -> next iv
        ; movdqa  iv, state0
        movdqa  [rD], state

        add     rD, 16
    check:
        sub     rN, 1
        jnc     nextBlock

        movdqa  [keys - 32], iv                 ; store the iv back (keys was advanced by 32)
MY_EPILOG
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
; ---------- AVX ----------
|
||
|
|
||
|
|
||
|
; AVX__WOP_n op
;   Expands "op N" once for each of the `ways` working registers, where N is
;   the register NUMBER expression (ways_start_reg + i), i = 0 .. ways - 1.
;   Intended for macros that take register numbers (the MY_vaes* encoders).
;   Leaves i equal to `ways`.
AVX__WOP_n macro op
    i = 0
    rept ways
        op      (ways_start_reg + i)
        i = i + 1
    endm
endm
|
||
|
|
||
|
; AVX__WOP op
;   Expands "op ymmN" once for each of the `ways` working registers
;   (N = ways_start_reg + i, i = 0 .. ways - 1). The assembly-time variable i
;   is visible to `op` during each expansion.
;   Leaves i equal to `ways`.
AVX__WOP macro op
    i = 0
    rept ways
        op      @CatStr(ymm, %(ways_start_reg + i))
        i = i + 1
    endm
endm
|
||
|
|
||
|
|
||
|
; AVX__WOP_KEY op, offs
;   Loads the 32 bytes at [keys2 + offs] into key_ymm (ymm0) and expands
;   "op N" for every working register number (see AVX__WOP_n).
;   Uses vmovdqa, so keys2 + offs must be 32-byte aligned. Clobbers ymm0.
AVX__WOP_KEY macro op:req, offs:req
        vmovdqa  key_ymm, ymmword ptr [keys2 + offs]
        AVX__WOP_n op
endm
|
||
|
|
||
|
|
||
|
; AVX__CBC_START reg
;   reg = key0_ymm xor (the i-th 32-byte data chunk at [rD + 32 * i]).
;   i is the assembly-time index of the enclosing AVX__WOP expansion.
AVX__CBC_START macro reg
        ; vpxor reg, key_ymm, ymmword ptr [rD + 32 * i]
        vpxor   reg, key0_ymm, ymmword ptr [rD + 32 * i]
endm

; AVX__CBC_END reg
;   CBC chaining xor for the i-th register:
;     i = 0 : xor with iv_ymm
;     i > 0 : xor with the 32 bytes that start 16 bytes before the i-th chunk,
;             i.e. the two data blocks that precede the two blocks held in reg
AVX__CBC_END macro reg
    if i eq 0
        vpxor   reg, reg, iv_ymm
    else
        vpxor   reg, reg, ymmword ptr [rD + i * 32 - 16]
    endif
endm
|
||
|
|
||
|
|
||
|
; AVX__WRITE_TO_DATA reg
;   Stores reg into the i-th 32-byte data chunk at [rD + 32 * i]
;   (vmovdqu: no 32-byte alignment requirement).
;   i is the assembly-time index of the enclosing AVX__WOP expansion.
AVX__WRITE_TO_DATA macro reg
        vmovdqu ymmword ptr [rD + 32 * i], reg
endm

; AVX__XOR_WITH_DATA reg
;   reg = reg xor (the i-th 32-byte data chunk at [rD + 32 * i]).
AVX__XOR_WITH_DATA macro reg
        vpxor   reg, reg, ymmword ptr [rD + 32 * i]
endm
|
||
|
|
||
|
; AVX__CTR_START reg
;   Advances the counter pair in iv_ymm by one_ymm (64-bit lane-wise add) and
;   sets reg = iv_ymm xor key0_ymm.
;   one_ymm and key0_ymm must have been prepared by the caller.
AVX__CTR_START macro reg
        vpaddq  iv_ymm, iv_ymm, one_ymm
        ; vpxor   reg, iv_ymm, key_ymm
        vpxor   reg, iv_ymm, key0_ymm
endm
|
||
|
|
||
|
|
||
|
; MY_VAES_INSTR_2 cmd, dest, a1, a2
;   Emits, as raw bytes, a VEX-encoded 256-bit register-to-register
;   instruction "cmd ymm<dest>, ymm<a1>, ymm<a2>". This way the VAES
;   instructions do not need to be known to the assembler.
;     cmd  : opcode byte
;     dest : destination register number (0..15)
;     a1   : first source register number (0..15)
;     a2   : second source register number (0..15)
MY_VAES_INSTR_2 macro cmd, dest, a1, a2
  ; 3-byte VEX prefix
  db 0c4H
  ; opcode map 2 (0F 38), inverted X bit = 1, inverted B bit from a2,
  ; inverted R bit from dest
  db 2 + 040H + 020h * (1 - (a2) / 8) + 080h * (1 - (dest) / 8)
  ; pp = 01 (66 prefix), L = 1 (256-bit), vvvv = inverted a1, W = 0
  db 5 + 8 * ((not (a1)) and 15)
  db cmd
  ; ModRM: mod = 11 (register operand), reg = dest, rm = a2
  db 0c0H + 8 * ((dest) and 7) + ((a2) and 7)
endm
|
||
|
|
||
|
; MY_VAES_INSTR cmd, dest, a
;   Two-operand form of MY_VAES_INSTR_2: dest is also the first source,
;   i.e. "cmd ymm<dest>, ymm<dest>, ymm<a>". dest and a are register numbers.
MY_VAES_INSTR macro cmd, dest, a
        MY_VAES_INSTR_2  cmd, dest, dest, a
endm

; The four wrappers below select the opcode byte (0DCh .. 0DFh) for the
; 256-bit vaesenc / vaesenclast / vaesdec / vaesdeclast instructions.
; dest and a are register NUMBERS, not register names.
MY_vaesenc macro dest, a
        MY_VAES_INSTR  0dcH, dest, a
endm
MY_vaesenclast macro dest, a
        MY_VAES_INSTR  0ddH, dest, a
endm
MY_vaesdec macro dest, a
        MY_VAES_INSTR  0deH, dest, a
endm
MY_vaesdeclast macro dest, a
        MY_VAES_INSTR  0dfH, dest, a
endm
|
||
|
|
||
|
|
||
|
; Per-register round operations for use with AVX__WOP_n / AVX__WOP_KEY.
; `reg` is a register NUMBER. The round key is taken from key_ymm (ymm0),
; except in AVX__VAES_DEC_LAST_key_last, which uses key_last_ymm.

AVX__VAES_DEC macro reg
        MY_vaesdec reg, key_ymm_n
endm

AVX__VAES_DEC_LAST_key_last macro reg
        ; MY_vaesdeclast reg, key_ymm_n
        MY_vaesdeclast reg, key_last_ymm_n
endm

AVX__VAES_ENC macro reg
        MY_vaesenc reg, key_ymm_n
endm

AVX__VAES_ENC_LAST macro reg
        MY_vaesenclast reg, key_ymm_n
endm
|
||
|
|
||
|
; AVX__vinserti128_TO_HIGH dest, src
;   Replaces the high 128 bits of ymm `dest` with the 128-bit `src`
;   (xmm register or memory); the low 128 bits of dest are kept.
AVX__vinserti128_TO_HIGH macro dest, src
        vinserti128  dest, dest, src, 1
endm
|
||
|
|
||
|
|
||
|
; AesCbc_Decode_HW_256 (3 parameters: key area, data, number of blocks)
;   AES-CBC decryption with 256-bit VAES: each ymm register holds two blocks,
;   so one loop iteration handles ways * 2 blocks.
;   - Fewer than ways * 2 blocks: continues in the 128-bit code at
;     AesCbc_Decode_HW_start_2.
;   - Otherwise the round keys are broadcast to both 128-bit lanes and stored
;     in a 32-byte aligned stack buffer (keys2); the blocks that remain after
;     the main loop are finished by the 128-bit code at AesCbc_Decode_HW_start_3.
;   - Without use_vaes_256 the procedure is only a jump to AesCbc_Decode_HW.
;   The register state at both jumps matches what the 128-bit code expects
;   at the target label (iv, key0, key_last, keys, ksize_x, rD, rN).
MY_PROC AesCbc_Decode_HW_256, 3
  ifdef use_vaes_256
        MY_PROLOG NUM_CBC_REGS

        cmp     rN, ways * 2
        jb      AesCbc_Decode_HW_start_2

        vmovdqa iv, xmmword ptr [keys]
        add     keys, 32                        ; keys -> round keys

        vbroadcasti128  key0_ymm, xmmword ptr [keys + 1 * ksize_r]
        vbroadcasti128  key_last_ymm, xmmword ptr [keys]
        sub     ksize_x, 16
        mov     koffs_x, ksize_x                ; 16-byte key offset for the copy loop
        add     ksize_x, ksize_x                ; doubled: offsets into the 32-byte key copies

        ; stack buffer for the broadcast keys (key0 and key_last stay in registers)
        AVX_STACK_SUB = ((NUM_AES_KEYS_MAX + 1 - 2) * 32)
        push    keys2
        sub     r4, AVX_STACK_SUB
        ; sub     r4, 32
        ; sub     r4, ksize_r
        ; lea     keys2, [r4 + 32]
        mov     keys2, r4
        ; align down to 32; the lowest slot used is [keys2 + 32], which is
        ; still above r4
        and     keys2, -32
    broad:
        ; key at [keys + koffs] -> both lanes -> [keys2 + 2 * koffs],
        ; for koffs = ksize_orig - 16 down to 16
        vbroadcasti128  key_ymm, xmmword ptr [keys + 1 * koffs_r]
        vmovdqa         ymmword ptr [keys2 + koffs_r * 2], key_ymm
        sub     koffs_r, 16
        ; jnc     broad
        jnz     broad

        sub     rN, ways * 2

    ; ----- main loop: ways * 2 blocks per iteration -----
    align 16
    avx_cbcdec_nextBlock2:
        mov     koffs_x, ksize_x
        ; AVX__WOP_KEY    AVX__CBC_START, 1 * koffs_r + 32
        AVX__WOP    AVX__CBC_START
    @@:
        AVX__WOP_KEY    AVX__VAES_DEC, 1 * koffs_r
        sub     koffs_r, 32
        jnz     @B
        ; AVX__WOP_KEY    AVX__VAES_DEC_LAST, 0
        AVX__WOP_n   AVX__VAES_DEC_LAST_key_last

        ; chaining value for the first register: low lane = iv,
        ; high lane = first ciphertext block
        AVX__vinserti128_TO_HIGH  iv_ymm, xmmword ptr [rD]
        AVX__WOP        AVX__CBC_END

        ; the last ciphertext block becomes the next iv; read it before it
        ; is overwritten with plaintext
        vmovdqa         iv, xmmword ptr [rD + ways * 32 - 16]
        AVX__WOP        AVX__WRITE_TO_DATA

        add     rD, ways * 32
        sub     rN, ways * 2
        jnc     avx_cbcdec_nextBlock2
        add     rN, ways * 2                    ; rN = remaining blocks (< ways * 2)

        shr     ksize_x, 1                      ; back to the 16-byte key offset

        ; lea     r4, [r4 + 1 * ksize_r + 32]
        add     r4, AVX_STACK_SUB
        pop     keys2

        vzeroupper                              ; legacy SSE code follows
        jmp     AesCbc_Decode_HW_start_3
  else
        jmp     AesCbc_Decode_HW_start
  endif
MY_ENDP
MY_SEG_ENDP
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
; ---------- AES-CBC Encode ----------
|
||
|
|
||
|
; Registers of AesCbc_Encode_HW:
;   e0 (xmm1)   : scratch register for the input block
;   xmm(CENC_START_KEY) .. xmm(CENC_START_KEY + CENC_NUM_REG_KEYS - 1)
;               : the first CENC_NUM_REG_KEYS round keys, kept in registers
e0  equ  xmm1

CENC_START_KEY     equ 2
CENC_NUM_REG_KEYS  equ (3 * 2)
; last_key equ @CatStr(xmm, %(CENC_START_KEY + CENC_NUM_REG_KEYS))
|
||
|
|
||
|
; AesCbc_Encode_HW (3 parameters: key area, data, number of blocks)
;   AES-CBC encryption, in place, one block at a time (each block depends on
;   the previous ciphertext block).
;
;   After MY_PROLOG:
;     keys    : pointer to the key area
;     rD      : pointer to the data
;     rN      : number of 16-byte blocks
;     ksize_x : ([keys + 16]) shl 5
;
;   Key area as used below:
;     [keys]          : IV / last ciphertext block; read on entry, updated on exit
;     [keys + 32 ...] : round keys, 16 bytes each, used in ascending order
;
;   The first CENC_NUM_REG_KEYS round keys are kept in xmm registers; the
;   others are read from memory through a negative offset that counts up to 0.
;   All accesses are aligned, so keys and rD must be 16-byte aligned.
MY_SEG_PROC AesCbc_Encode_HW, 3
        MY_PROLOG (CENC_START_KEY + CENC_NUM_REG_KEYS + 0)

        movdqa  state, [keys]                   ; state = iv
        add     keys, 32                        ; keys -> round keys

    i = 0
    rept CENC_NUM_REG_KEYS
        movdqa  @CatStr(xmm, %(CENC_START_KEY + i)), [keys + i * 16]
        i = i + 1
    endm

        add     keys, ksize_r                   ; keys -> key for aesenclast
        neg     ksize_r
        add     ksize_r, (16 * CENC_NUM_REG_KEYS)   ; negative offset of the first key not in a register
        ; movdqa  last_key, [keys]
        jmp     check_e

    align 16
    nextBlock_e:
        movdqa  e0, [rD]
        mov     koffs_r, ksize_r
        pxor    e0, @CatStr(xmm, %(CENC_START_KEY))
        pxor    state, e0                       ; previous ciphertext xor plaintext xor first key

    i = 1
    rept (CENC_NUM_REG_KEYS - 1)
        aesenc  state, @CatStr(xmm, %(CENC_START_KEY + i))
        i = i + 1
    endm

    @@:
        ; round keys from memory, two per iteration, until the offset reaches 0
        OP_KEY  aesenc, 1 * koffs_r
        OP_KEY  aesenc, 1 * koffs_r + 16
        add     koffs_r, 32
        jnz     @B
        OP_KEY  aesenclast, 0
        ; aesenclast state, last_key

        movdqa  [rD], state
        add     rD, 16
    check_e:
        sub     rN, 1
        jnc     nextBlock_e

        ; movdqa  [keys - 32], state
        ; keys + ksize_r - 16 * CENC_NUM_REG_KEYS - 32 is the original keys
        ; pointer, i.e. the iv slot
        movdqa  [keys + 1 * ksize_r - (16 * CENC_NUM_REG_KEYS) - 32], state
MY_EPILOG
MY_SEG_ENDP
|
||
|
|
||
|
|
||
|
|
||
|
; ---------- AES-CTR ----------
|
||
|
|
||
|
; Optional override of `ways` for the CTR code (disabled).
ifdef x64
        ; ways = 11
endif

; Registers of the CTR procedures, placed after the `ways` working registers
; and iv:
;   one  : counter increment
;   key0 : the key that is xored with the counter first
; key0 / key0_ymm are redefined here: the CTR code uses a different register
; for them than the CBC decode code above.
one             equ @CatStr(xmm, %(ways_start_reg + ways + 1))
one_ymm         equ @CatStr(ymm, %(ways_start_reg + ways + 1))
key0            equ @CatStr(xmm, %(ways_start_reg + ways + 2))
key0_ymm        equ @CatStr(ymm, %(ways_start_reg + ways + 2))
; total number of xmm registers used by CTR (xmm0 .. xmm(NUM_CTR_REGS - 1))
NUM_CTR_REGS    equ (ways_start_reg + ways + 3)
|
||
|
|
||
|
; INIT_CTR reg, _ppp_
;   Advances the counter in iv by `one` (64-bit lane-wise add) and copies the
;   new counter value into reg.
;   _ppp_ is an unused placeholder for the second argument that WOP passes.
INIT_CTR macro reg, _ppp_
        paddq   iv, one
        movdqa  reg, iv
endm
|
||
|
|
||
|
|
||
|
; AesCtr_Code_HW (3 parameters: key area, data, number of blocks)
;   AES-CTR: for every block the counter is incremented, encrypted, and the
;   result is xored into the data in place.
;
;   After MY_PROLOG:
;     keys    : pointer to the key area
;     rD      : pointer to the data
;     rN      : number of 16-byte blocks
;     ksize_x : ([keys + 16]) shl 5
;
;   Key area as used below:
;     [keys]          : counter block; read on entry, updated on exit
;     [keys + 32 ...] : round keys, 16 bytes each, used in ascending order
;
;   The counter is incremented before each use, as a 64-bit add on the low
;   quadword (`one` holds 1 in its low lane, 0 in the high lane).
;   All accesses are aligned, so keys and rD must be 16-byte aligned.
;
;   Ctr_start, Ctr_start_2 and Ctr_start_3 are public entry labels used by
;   AesCtr_Code_HW_256.
MY_SEG_PROC AesCtr_Code_HW, 3
    Ctr_start::
        MY_PROLOG NUM_CTR_REGS

    Ctr_start_2::
        movdqa  iv, [keys]
        add     keys, 32                        ; keys -> round keys
        movdqa  key0, [keys]

        add     keys, ksize_r                   ; keys -> key for aesenclast
        neg     ksize_r
        add     ksize_r, 16                     ; negative offset of the second round key

    Ctr_start_3::
        mov     koffs_x, 1
        movd    one, koffs_x                    ; one = 1 in the low dword, rest zero
        jmp     check2_c

    ; ----- main loop: `ways` blocks per iteration -----
    align 16
    nextBlocks2_c:
        WOP     INIT_CTR, 0
        mov     koffs_r, ksize_r
        ; WOP_KEY pxor, 1 * koffs_r -16
        WOP     pxor, key0
    @@:
        WOP_KEY aesenc, 1 * koffs_r
        add     koffs_r, 16
        jnz     @B
        WOP_KEY aesenclast, 0

        WOP     XOR_WITH_DATA
        WOP     WRITE_TO_DATA
        add     rD, ways * 16
    check2_c:
        sub     rN, ways
        jnc     nextBlocks2_c
        add     rN, ways                        ; rN = remaining blocks (< ways)

        ; re-base for the loop below, which uses two keys per step and
        ; finishes with the keys at [keys] and [keys + 16]
        sub     keys, 16
        add     ksize_r, 16

        jmp     check_c

    ; ----- tail loop: one block per iteration -----
    ; align 16
    nextBlock_c:
        paddq   iv, one
        ; movdqa  state, [keys + 1 * koffs_r - 16]
        movdqa  state, key0
        mov     koffs_r, ksize_r
        pxor    state, iv

    @@:
        OP_KEY  aesenc, 1 * koffs_r
        OP_KEY  aesenc, 1 * koffs_r + 16
        add     koffs_r, 32
        jnz     @B
        OP_KEY  aesenc, 0
        OP_KEY  aesenclast, 16

        pxor    state, [rD]
        movdqa  [rD], state
        add     rD, 16
    check_c:
        sub     rN, 1
        jnc     nextBlock_c

        ; movdqa  [keys - 32], iv
        ; keys + ksize_r - 16 - 32 is the original keys pointer,
        ; i.e. the counter slot
        movdqa  [keys + 1 * ksize_r - 16 - 32], iv
MY_EPILOG
|
||
|
|
||
|
|
||
|
; AesCtr_Code_HW_256 (3 parameters: key area, data, number of blocks)
;   AES-CTR with 256-bit VAES: each ymm register holds two consecutive counter
;   values, so one loop iteration handles ways * 2 blocks.
;   - Fewer than ways * 2 blocks: continues in the 128-bit code at Ctr_start_2.
;   - Otherwise the round keys are broadcast to both 128-bit lanes and stored
;     in a 32-byte aligned stack buffer (keys2); the blocks that remain after
;     the main loop are finished by the 128-bit code at Ctr_start_3.
;   - Without use_vaes_256 the procedure is only a jump to AesCtr_Code_HW.
MY_PROC AesCtr_Code_HW_256, 3
  ifdef use_vaes_256
        MY_PROLOG NUM_CTR_REGS

        cmp     rN, ways * 2
        jb      Ctr_start_2

        vbroadcasti128  iv_ymm, xmmword ptr [keys]
        add     keys, 32                        ; keys -> round keys
        vbroadcasti128  key0_ymm, xmmword ptr [keys]
        mov     koffs_x, 1
        vmovd           one, koffs_x            ; one_ymm = 1 in the lowest lane, rest zero
        vpsubq  iv_ymm, iv_ymm, one_ymm         ; low half = ctr - 1, high half = ctr
        vpaddq  one, one, one                   ; one = 2
        AVX__vinserti128_TO_HIGH  one_ymm, one  ; both halves step by 2

        add     keys, ksize_r                   ; keys -> key for aesenclast
        sub     ksize_x, 16
        neg     ksize_r
        mov     koffs_r, ksize_r                ; negative 16-byte key offset for the copy loop
        add     ksize_r, ksize_r                ; doubled: offsets into the 32-byte key copies

        ; stack buffer for the broadcast keys (key0 stays in a register)
        AVX_STACK_SUB = ((NUM_AES_KEYS_MAX + 1 - 1) * 32)
        push    keys2
        lea     keys2, [r4 - 32]
        sub     r4, AVX_STACK_SUB
        ; keys2 -> highest slot of the buffer (32-byte aligned); the other
        ; slots are addressed with negative offsets
        and     keys2, -32
        vbroadcasti128  key_ymm, xmmword ptr [keys]
        vmovdqa         ymmword ptr [keys2], key_ymm
     @@:
        ; key at [keys + koffs] -> both lanes -> [keys2 + 2 * koffs]
        vbroadcasti128  key_ymm, xmmword ptr [keys + 1 * koffs_r]
        vmovdqa         ymmword ptr [keys2 + koffs_r * 2], key_ymm
        add     koffs_r, 16
        jnz     @B

        sub     rN, ways * 2

    ; ----- main loop: ways * 2 blocks per iteration -----
    align 16
    avx_ctr_nextBlock2:
        mov     koffs_r, ksize_r
        AVX__WOP        AVX__CTR_START
        ; AVX__WOP_KEY    AVX__CTR_START, 1 * koffs_r - 32
    @@:
        AVX__WOP_KEY    AVX__VAES_ENC, 1 * koffs_r
        add     koffs_r, 32
        jnz     @B
        AVX__WOP_KEY    AVX__VAES_ENC_LAST, 0

        AVX__WOP        AVX__XOR_WITH_DATA
        AVX__WOP        AVX__WRITE_TO_DATA

        add     rD, ways * 32
        sub     rN, ways * 2
        jnc     avx_ctr_nextBlock2
        add     rN, ways * 2                    ; rN = remaining blocks (< ways * 2)

        vextracti128    iv, iv_ymm, 1           ; high half = last counter value used
        sar     ksize_r, 1                      ; back to the (negative) 16-byte key offset

        add     r4, AVX_STACK_SUB
        pop     keys2

        vzeroupper                              ; legacy SSE code follows
        jmp     Ctr_start_3
  else
        jmp     Ctr_start
  endif
MY_ENDP
MY_SEG_ENDP
|
||
|
|
||
|
end
|