Source release 14.0.0

This commit is contained in:
John W. Bruce
2018-05-16 17:35:40 -07:00
parent 31381a1311
commit 3ab70cec4e
2053 changed files with 1585838 additions and 4614 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,773 @@
#include <openssl/arm_arch.h>
#if __ARM_MAX_ARCH__>=7
.text
.code 32
#undef __thumb2__
.align 5
Lrcon:
.long 0x01,0x01,0x01,0x01
.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d @ rotate-n-splat
.long 0x1b,0x1b,0x1b,0x1b
.globl _aes_hw_set_encrypt_key
.private_extern _aes_hw_set_encrypt_key
#ifdef __thumb2__
.thumb_func _aes_hw_set_encrypt_key
#endif
.align 5
_aes_hw_set_encrypt_key:
Lenc_key:
mov r3,#-1
cmp r0,#0
beq Lenc_key_abort
cmp r2,#0
beq Lenc_key_abort
mov r3,#-2
cmp r1,#128
blt Lenc_key_abort
cmp r1,#256
bgt Lenc_key_abort
tst r1,#0x3f
bne Lenc_key_abort
adr r3,Lrcon
cmp r1,#192
veor q0,q0,q0
vld1.8 {q3},[r0]!
mov r1,#8 @ reuse r1
vld1.32 {q1,q2},[r3]!
blt Loop128
beq L192
b L256
.align 4
Loop128:
vtbl.8 d20,{q3},d4
vtbl.8 d21,{q3},d5
vext.8 q9,q0,q3,#12
vst1.32 {q3},[r2]!
.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
subs r1,r1,#1
veor q3,q3,q9
vext.8 q9,q0,q9,#12
veor q3,q3,q9
vext.8 q9,q0,q9,#12
veor q10,q10,q1
veor q3,q3,q9
vshl.u8 q1,q1,#1
veor q3,q3,q10
bne Loop128
vld1.32 {q1},[r3]
vtbl.8 d20,{q3},d4
vtbl.8 d21,{q3},d5
vext.8 q9,q0,q3,#12
vst1.32 {q3},[r2]!
.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
veor q3,q3,q9
vext.8 q9,q0,q9,#12
veor q3,q3,q9
vext.8 q9,q0,q9,#12
veor q10,q10,q1
veor q3,q3,q9
vshl.u8 q1,q1,#1
veor q3,q3,q10
vtbl.8 d20,{q3},d4
vtbl.8 d21,{q3},d5
vext.8 q9,q0,q3,#12
vst1.32 {q3},[r2]!
.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
veor q3,q3,q9
vext.8 q9,q0,q9,#12
veor q3,q3,q9
vext.8 q9,q0,q9,#12
veor q10,q10,q1
veor q3,q3,q9
veor q3,q3,q10
vst1.32 {q3},[r2]
add r2,r2,#0x50
mov r12,#10
b Ldone
.align 4
L192:
vld1.8 {d16},[r0]!
vmov.i8 q10,#8 @ borrow q10
vst1.32 {q3},[r2]!
vsub.i8 q2,q2,q10 @ adjust the mask
Loop192:
vtbl.8 d20,{q8},d4
vtbl.8 d21,{q8},d5
vext.8 q9,q0,q3,#12
vst1.32 {d16},[r2]!
.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
subs r1,r1,#1
veor q3,q3,q9
vext.8 q9,q0,q9,#12
veor q3,q3,q9
vext.8 q9,q0,q9,#12
veor q3,q3,q9
vdup.32 q9,d7[1]
veor q9,q9,q8
veor q10,q10,q1
vext.8 q8,q0,q8,#12
vshl.u8 q1,q1,#1
veor q8,q8,q9
veor q3,q3,q10
veor q8,q8,q10
vst1.32 {q3},[r2]!
bne Loop192
mov r12,#12
add r2,r2,#0x20
b Ldone
.align 4
L256:
vld1.8 {q8},[r0]
mov r1,#7
mov r12,#14
vst1.32 {q3},[r2]!
Loop256:
vtbl.8 d20,{q8},d4
vtbl.8 d21,{q8},d5
vext.8 q9,q0,q3,#12
vst1.32 {q8},[r2]!
.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
subs r1,r1,#1
veor q3,q3,q9
vext.8 q9,q0,q9,#12
veor q3,q3,q9
vext.8 q9,q0,q9,#12
veor q10,q10,q1
veor q3,q3,q9
vshl.u8 q1,q1,#1
veor q3,q3,q10
vst1.32 {q3},[r2]!
beq Ldone
vdup.32 q10,d7[1]
vext.8 q9,q0,q8,#12
.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
veor q8,q8,q9
vext.8 q9,q0,q9,#12
veor q8,q8,q9
vext.8 q9,q0,q9,#12
veor q8,q8,q9
veor q8,q8,q10
b Loop256
Ldone:
str r12,[r2]
mov r3,#0
Lenc_key_abort:
mov r0,r3 @ return value
bx lr
.globl _aes_hw_set_decrypt_key
.private_extern _aes_hw_set_decrypt_key
#ifdef __thumb2__
.thumb_func _aes_hw_set_decrypt_key
#endif
.align 5
_aes_hw_set_decrypt_key:
stmdb sp!,{r4,lr}
bl Lenc_key
cmp r0,#0
bne Ldec_key_abort
sub r2,r2,#240 @ restore original r2
mov r4,#-16
add r0,r2,r12,lsl#4 @ end of key schedule
vld1.32 {q0},[r2]
vld1.32 {q1},[r0]
vst1.32 {q0},[r0],r4
vst1.32 {q1},[r2]!
Loop_imc:
vld1.32 {q0},[r2]
vld1.32 {q1},[r0]
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
vst1.32 {q0},[r0],r4
vst1.32 {q1},[r2]!
cmp r0,r2
bhi Loop_imc
vld1.32 {q0},[r2]
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
vst1.32 {q0},[r0]
eor r0,r0,r0 @ return value
Ldec_key_abort:
ldmia sp!,{r4,pc}
.globl _aes_hw_encrypt
.private_extern _aes_hw_encrypt
#ifdef __thumb2__
.thumb_func _aes_hw_encrypt
#endif
.align 5
_aes_hw_encrypt:
ldr r3,[r2,#240]
vld1.32 {q0},[r2]!
vld1.8 {q2},[r0]
sub r3,r3,#2
vld1.32 {q1},[r2]!
Loop_enc:
.byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0
.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2
vld1.32 {q0},[r2]!
subs r3,r3,#2
.byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1
.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2
vld1.32 {q1},[r2]!
bgt Loop_enc
.byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0
.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2
vld1.32 {q0},[r2]
.byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1
veor q2,q2,q0
vst1.8 {q2},[r1]
bx lr
.globl _aes_hw_decrypt
.private_extern _aes_hw_decrypt
#ifdef __thumb2__
.thumb_func _aes_hw_decrypt
#endif
.align 5
_aes_hw_decrypt:
ldr r3,[r2,#240]
vld1.32 {q0},[r2]!
vld1.8 {q2},[r0]
sub r3,r3,#2
vld1.32 {q1},[r2]!
Loop_dec:
.byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0
.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2
vld1.32 {q0},[r2]!
subs r3,r3,#2
.byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1
.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2
vld1.32 {q1},[r2]!
bgt Loop_dec
.byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0
.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2
vld1.32 {q0},[r2]
.byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1
veor q2,q2,q0
vst1.8 {q2},[r1]
bx lr
.globl _aes_hw_cbc_encrypt
.private_extern _aes_hw_cbc_encrypt
#ifdef __thumb2__
.thumb_func _aes_hw_cbc_encrypt
#endif
.align 5
_aes_hw_cbc_encrypt:
mov ip,sp
stmdb sp!,{r4,r5,r6,r7,r8,lr}
vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so
ldmia ip,{r4,r5} @ load remaining args
subs r2,r2,#16
mov r8,#16
blo Lcbc_abort
moveq r8,#0
cmp r5,#0 @ en- or decrypting?
ldr r5,[r3,#240]
and r2,r2,#-16
vld1.8 {q6},[r4]
vld1.8 {q0},[r0],r8
vld1.32 {q8,q9},[r3] @ load key schedule...
sub r5,r5,#6
add r7,r3,r5,lsl#4 @ pointer to last 7 round keys
sub r5,r5,#2
vld1.32 {q10,q11},[r7]!
vld1.32 {q12,q13},[r7]!
vld1.32 {q14,q15},[r7]!
vld1.32 {q7},[r7]
add r7,r3,#32
mov r6,r5
beq Lcbc_dec
cmp r5,#2
veor q0,q0,q6
veor q5,q8,q7
beq Lcbc_enc128
vld1.32 {q2,q3},[r7]
add r7,r3,#16
add r6,r3,#16*4
add r12,r3,#16*5
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
add r14,r3,#16*6
add r3,r3,#16*7
b Lenter_cbc_enc
.align 4
Loop_cbc_enc:
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
vst1.8 {q6},[r1]!
Lenter_cbc_enc:
.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x04,0x03,0xb0,0xf3 @ aese q0,q2
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
vld1.32 {q8},[r6]
cmp r5,#4
.byte 0x06,0x03,0xb0,0xf3 @ aese q0,q3
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
vld1.32 {q9},[r12]
beq Lcbc_enc192
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
vld1.32 {q8},[r14]
.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
vld1.32 {q9},[r3]
nop
Lcbc_enc192:
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
subs r2,r2,#16
.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
moveq r8,#0
.byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
vld1.8 {q8},[r0],r8
.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
veor q8,q8,q5
.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
vld1.32 {q9},[r7] @ re-pre-load rndkey[1]
.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15
veor q6,q0,q7
bhs Loop_cbc_enc
vst1.8 {q6},[r1]!
b Lcbc_done
.align 5
Lcbc_enc128:
vld1.32 {q2,q3},[r7]
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
b Lenter_cbc_enc128
Loop_cbc_enc128:
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
vst1.8 {q6},[r1]!
Lenter_cbc_enc128:
.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
subs r2,r2,#16
.byte 0x04,0x03,0xb0,0xf3 @ aese q0,q2
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
moveq r8,#0
.byte 0x06,0x03,0xb0,0xf3 @ aese q0,q3
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
vld1.8 {q8},[r0],r8
.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
veor q8,q8,q5
.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15
veor q6,q0,q7
bhs Loop_cbc_enc128
vst1.8 {q6},[r1]!
b Lcbc_done
.align 5
Lcbc_dec:
vld1.8 {q10},[r0]!
subs r2,r2,#32 @ bias
add r6,r5,#2
vorr q3,q0,q0
vorr q1,q0,q0
vorr q11,q10,q10
blo Lcbc_dec_tail
vorr q1,q10,q10
vld1.8 {q10},[r0]!
vorr q2,q0,q0
vorr q3,q1,q1
vorr q11,q10,q10
Loop3x_cbc_dec:
.byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
vld1.32 {q8},[r7]!
subs r6,r6,#2
.byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
vld1.32 {q9},[r7]!
bgt Loop3x_cbc_dec
.byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
veor q4,q6,q7
subs r2,r2,#0x30
veor q5,q2,q7
movlo r6,r2 @ r6, r6, is zero at this point
.byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
veor q9,q3,q7
add r0,r0,r6 @ r0 is adjusted in such way that
@ at exit from the loop q1-q10
@ are loaded with last "words"
vorr q6,q11,q11
mov r7,r3
.byte 0x68,0x03,0xb0,0xf3 @ aesd q0,q12
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
vld1.8 {q2},[r0]!
.byte 0x6a,0x03,0xb0,0xf3 @ aesd q0,q13
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
vld1.8 {q3},[r0]!
.byte 0x6c,0x03,0xb0,0xf3 @ aesd q0,q14
.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
.byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
vld1.8 {q11},[r0]!
.byte 0x6e,0x03,0xb0,0xf3 @ aesd q0,q15
.byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15
.byte 0x6e,0x43,0xf0,0xf3 @ aesd q10,q15
vld1.32 {q8},[r7]! @ re-pre-load rndkey[0]
add r6,r5,#2
veor q4,q4,q0
veor q5,q5,q1
veor q10,q10,q9
vld1.32 {q9},[r7]! @ re-pre-load rndkey[1]
vst1.8 {q4},[r1]!
vorr q0,q2,q2
vst1.8 {q5},[r1]!
vorr q1,q3,q3
vst1.8 {q10},[r1]!
vorr q10,q11,q11
bhs Loop3x_cbc_dec
cmn r2,#0x30
beq Lcbc_done
nop
Lcbc_dec_tail:
.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
vld1.32 {q8},[r7]!
subs r6,r6,#2
.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
vld1.32 {q9},[r7]!
bgt Lcbc_dec_tail
.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
.byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
cmn r2,#0x20
.byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
veor q5,q6,q7
.byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14
.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
.byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14
.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
veor q9,q3,q7
.byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15
.byte 0x6e,0x43,0xf0,0xf3 @ aesd q10,q15
beq Lcbc_dec_one
veor q5,q5,q1
veor q9,q9,q10
vorr q6,q11,q11
vst1.8 {q5},[r1]!
vst1.8 {q9},[r1]!
b Lcbc_done
Lcbc_dec_one:
veor q5,q5,q10
vorr q6,q11,q11
vst1.8 {q5},[r1]!
Lcbc_done:
vst1.8 {q6},[r4]
Lcbc_abort:
vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
ldmia sp!,{r4,r5,r6,r7,r8,pc}
.globl _aes_hw_ctr32_encrypt_blocks
.private_extern _aes_hw_ctr32_encrypt_blocks
#ifdef __thumb2__
.thumb_func _aes_hw_ctr32_encrypt_blocks
#endif
.align 5
_aes_hw_ctr32_encrypt_blocks:
mov ip,sp
stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,lr}
vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so
ldr r4, [ip] @ load remaining arg
ldr r5,[r3,#240]
ldr r8, [r4, #12]
vld1.32 {q0},[r4]
vld1.32 {q8,q9},[r3] @ load key schedule...
sub r5,r5,#4
mov r12,#16
cmp r2,#2
add r7,r3,r5,lsl#4 @ pointer to last 5 round keys
sub r5,r5,#2
vld1.32 {q12,q13},[r7]!
vld1.32 {q14,q15},[r7]!
vld1.32 {q7},[r7]
add r7,r3,#32
mov r6,r5
movlo r12,#0
#ifndef __ARMEB__
rev r8, r8
#endif
vorr q1,q0,q0
add r10, r8, #1
vorr q10,q0,q0
add r8, r8, #2
vorr q6,q0,q0
rev r10, r10
vmov.32 d3[1],r10
bls Lctr32_tail
rev r12, r8
sub r2,r2,#3 @ bias
vmov.32 d21[1],r12
b Loop3x_ctr32
.align 4
Loop3x_ctr32:
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
.byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8
.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10
vld1.32 {q8},[r7]!
subs r6,r6,#2
.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
.byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9
.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10
vld1.32 {q9},[r7]!
bgt Loop3x_ctr32
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
.byte 0x80,0x83,0xb0,0xf3 @ aesmc q4,q0
.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
.byte 0x82,0xa3,0xb0,0xf3 @ aesmc q5,q1
vld1.8 {q2},[r0]!
vorr q0,q6,q6
.byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8
.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10
vld1.8 {q3},[r0]!
vorr q1,q6,q6
.byte 0x22,0x83,0xb0,0xf3 @ aese q4,q9
.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
.byte 0x22,0xa3,0xb0,0xf3 @ aese q5,q9
.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
vld1.8 {q11},[r0]!
mov r7,r3
.byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9
.byte 0xa4,0x23,0xf0,0xf3 @ aesmc q9,q10
vorr q10,q6,q6
add r9,r8,#1
.byte 0x28,0x83,0xb0,0xf3 @ aese q4,q12
.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
.byte 0x28,0xa3,0xb0,0xf3 @ aese q5,q12
.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
veor q2,q2,q7
add r10,r8,#2
.byte 0x28,0x23,0xf0,0xf3 @ aese q9,q12
.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9
veor q3,q3,q7
add r8,r8,#3
.byte 0x2a,0x83,0xb0,0xf3 @ aese q4,q13
.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
.byte 0x2a,0xa3,0xb0,0xf3 @ aese q5,q13
.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
veor q11,q11,q7
rev r9,r9
.byte 0x2a,0x23,0xf0,0xf3 @ aese q9,q13
.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9
vmov.32 d1[1], r9
rev r10,r10
.byte 0x2c,0x83,0xb0,0xf3 @ aese q4,q14
.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
.byte 0x2c,0xa3,0xb0,0xf3 @ aese q5,q14
.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
vmov.32 d3[1], r10
rev r12,r8
.byte 0x2c,0x23,0xf0,0xf3 @ aese q9,q14
.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9
vmov.32 d21[1], r12
subs r2,r2,#3
.byte 0x2e,0x83,0xb0,0xf3 @ aese q4,q15
.byte 0x2e,0xa3,0xb0,0xf3 @ aese q5,q15
.byte 0x2e,0x23,0xf0,0xf3 @ aese q9,q15
veor q2,q2,q4
vld1.32 {q8},[r7]! @ re-pre-load rndkey[0]
vst1.8 {q2},[r1]!
veor q3,q3,q5
mov r6,r5
vst1.8 {q3},[r1]!
veor q11,q11,q9
vld1.32 {q9},[r7]! @ re-pre-load rndkey[1]
vst1.8 {q11},[r1]!
bhs Loop3x_ctr32
adds r2,r2,#3
beq Lctr32_done
cmp r2,#1
mov r12,#16
moveq r12,#0
Lctr32_tail:
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
vld1.32 {q8},[r7]!
subs r6,r6,#2
.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
vld1.32 {q9},[r7]!
bgt Lctr32_tail
.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
vld1.8 {q2},[r0],r12
.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x28,0x23,0xb0,0xf3 @ aese q1,q12
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
vld1.8 {q3},[r0]
.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x2a,0x23,0xb0,0xf3 @ aese q1,q13
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
veor q2,q2,q7
.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14
.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
.byte 0x2c,0x23,0xb0,0xf3 @ aese q1,q14
.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
veor q3,q3,q7
.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15
.byte 0x2e,0x23,0xb0,0xf3 @ aese q1,q15
cmp r2,#1
veor q2,q2,q0
veor q3,q3,q1
vst1.8 {q2},[r1]!
beq Lctr32_done
vst1.8 {q3},[r1]
Lctr32_done:
vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,pc}
#endif

View File

@@ -0,0 +1,966 @@
#include <openssl/arm_arch.h>
@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
.text
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code 32
#endif
#if __ARM_MAX_ARCH__>=7
.align 5
LOPENSSL_armcap:
.word OPENSSL_armcap_P-Lbn_mul_mont
#endif
.globl _bn_mul_mont
.private_extern _bn_mul_mont
#ifdef __thumb2__
.thumb_func _bn_mul_mont
#endif
.align 5
_bn_mul_mont:
Lbn_mul_mont:
ldr ip,[sp,#4] @ load num
stmdb sp!,{r0,r2} @ sp points at argument block
#if __ARM_MAX_ARCH__>=7
tst ip,#7
bne Lialu
adr r0,Lbn_mul_mont
ldr r2,LOPENSSL_armcap
ldr r0,[r0,r2]
#ifdef __APPLE__
ldr r0,[r0]
#endif
tst r0,#ARMV7_NEON @ NEON available?
ldmia sp, {r0,r2}
beq Lialu
add sp,sp,#8
b bn_mul8x_mont_neon
.align 4
Lialu:
#endif
cmp ip,#2
mov r0,ip @ load num
#ifdef __thumb2__
ittt lt
#endif
movlt r0,#0
addlt sp,sp,#2*4
blt Labrt
stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} @ save 10 registers
mov r0,r0,lsl#2 @ rescale r0 for byte count
sub sp,sp,r0 @ alloca(4*num)
sub sp,sp,#4 @ +extra dword
sub r0,r0,#4 @ "num=num-1"
add r4,r2,r0 @ &bp[num-1]
add r0,sp,r0 @ r0 to point at &tp[num-1]
ldr r8,[r0,#14*4] @ &n0
ldr r2,[r2] @ bp[0]
ldr r5,[r1],#4 @ ap[0],ap++
ldr r6,[r3],#4 @ np[0],np++
ldr r8,[r8] @ *n0
str r4,[r0,#15*4] @ save &bp[num]
umull r10,r11,r5,r2 @ ap[0]*bp[0]
str r8,[r0,#14*4] @ save n0 value
mul r8,r10,r8 @ "tp[0]"*n0
mov r12,#0
umlal r10,r12,r6,r8 @ np[0]*n0+"t[0]"
mov r4,sp
L1st:
ldr r5,[r1],#4 @ ap[j],ap++
mov r10,r11
ldr r6,[r3],#4 @ np[j],np++
mov r11,#0
umlal r10,r11,r5,r2 @ ap[j]*bp[0]
mov r14,#0
umlal r12,r14,r6,r8 @ np[j]*n0
adds r12,r12,r10
str r12,[r4],#4 @ tp[j-1]=,tp++
adc r12,r14,#0
cmp r4,r0
bne L1st
adds r12,r12,r11
ldr r4,[r0,#13*4] @ restore bp
mov r14,#0
ldr r8,[r0,#14*4] @ restore n0
adc r14,r14,#0
str r12,[r0] @ tp[num-1]=
mov r7,sp
str r14,[r0,#4] @ tp[num]=
Louter:
sub r7,r0,r7 @ "original" r0-1 value
sub r1,r1,r7 @ "rewind" ap to &ap[1]
ldr r2,[r4,#4]! @ *(++bp)
sub r3,r3,r7 @ "rewind" np to &np[1]
ldr r5,[r1,#-4] @ ap[0]
ldr r10,[sp] @ tp[0]
ldr r6,[r3,#-4] @ np[0]
ldr r7,[sp,#4] @ tp[1]
mov r11,#0
umlal r10,r11,r5,r2 @ ap[0]*bp[i]+tp[0]
str r4,[r0,#13*4] @ save bp
mul r8,r10,r8
mov r12,#0
umlal r10,r12,r6,r8 @ np[0]*n0+"tp[0]"
mov r4,sp
Linner:
ldr r5,[r1],#4 @ ap[j],ap++
adds r10,r11,r7 @ +=tp[j]
ldr r6,[r3],#4 @ np[j],np++
mov r11,#0
umlal r10,r11,r5,r2 @ ap[j]*bp[i]
mov r14,#0
umlal r12,r14,r6,r8 @ np[j]*n0
adc r11,r11,#0
ldr r7,[r4,#8] @ tp[j+1]
adds r12,r12,r10
str r12,[r4],#4 @ tp[j-1]=,tp++
adc r12,r14,#0
cmp r4,r0
bne Linner
adds r12,r12,r11
mov r14,#0
ldr r4,[r0,#13*4] @ restore bp
adc r14,r14,#0
ldr r8,[r0,#14*4] @ restore n0
adds r12,r12,r7
ldr r7,[r0,#15*4] @ restore &bp[num]
adc r14,r14,#0
str r12,[r0] @ tp[num-1]=
str r14,[r0,#4] @ tp[num]=
cmp r4,r7
#ifdef __thumb2__
itt ne
#endif
movne r7,sp
bne Louter
ldr r2,[r0,#12*4] @ pull rp
mov r5,sp
add r0,r0,#4 @ r0 to point at &tp[num]
sub r5,r0,r5 @ "original" num value
mov r4,sp @ "rewind" r4
mov r1,r4 @ "borrow" r1
sub r3,r3,r5 @ "rewind" r3 to &np[0]
subs r7,r7,r7 @ "clear" carry flag
Lsub: ldr r7,[r4],#4
ldr r6,[r3],#4
sbcs r7,r7,r6 @ tp[j]-np[j]
str r7,[r2],#4 @ rp[j]=
teq r4,r0 @ preserve carry
bne Lsub
sbcs r14,r14,#0 @ upmost carry
mov r4,sp @ "rewind" r4
sub r2,r2,r5 @ "rewind" r2
and r1,r4,r14
bic r3,r2,r14
orr r1,r1,r3 @ ap=borrow?tp:rp
Lcopy: ldr r7,[r1],#4 @ copy or in-place refresh
str sp,[r4],#4 @ zap tp
str r7,[r2],#4
cmp r4,r0
bne Lcopy
mov sp,r0
add sp,sp,#4 @ skip over tp[num+1]
ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} @ restore registers
add sp,sp,#2*4 @ skip over {r0,r2}
mov r0,#1
Labrt:
#if __ARM_ARCH__>=5
bx lr @ bx lr
#else
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
#endif
#if __ARM_MAX_ARCH__>=7
#ifdef __thumb2__
.thumb_func bn_mul8x_mont_neon
#endif
.align 5
bn_mul8x_mont_neon:
mov ip,sp
stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so
ldmia ip,{r4,r5} @ load rest of parameter block
mov ip,sp
cmp r5,#8
bhi LNEON_8n
@ special case for r5==8, everything is in register bank...
vld1.32 {d28[0]}, [r2,:32]!
veor d8,d8,d8
sub r7,sp,r5,lsl#4
vld1.32 {d0,d1,d2,d3}, [r1]! @ can't specify :32 :-(
and r7,r7,#-64
vld1.32 {d30[0]}, [r4,:32]
mov sp,r7 @ alloca
vzip.16 d28,d8
vmull.u32 q6,d28,d0[0]
vmull.u32 q7,d28,d0[1]
vmull.u32 q8,d28,d1[0]
vshl.i64 d29,d13,#16
vmull.u32 q9,d28,d1[1]
vadd.u64 d29,d29,d12
veor d8,d8,d8
vmul.u32 d29,d29,d30
vmull.u32 q10,d28,d2[0]
vld1.32 {d4,d5,d6,d7}, [r3]!
vmull.u32 q11,d28,d2[1]
vmull.u32 q12,d28,d3[0]
vzip.16 d29,d8
vmull.u32 q13,d28,d3[1]
vmlal.u32 q6,d29,d4[0]
sub r9,r5,#1
vmlal.u32 q7,d29,d4[1]
vmlal.u32 q8,d29,d5[0]
vmlal.u32 q9,d29,d5[1]
vmlal.u32 q10,d29,d6[0]
vmov q5,q6
vmlal.u32 q11,d29,d6[1]
vmov q6,q7
vmlal.u32 q12,d29,d7[0]
vmov q7,q8
vmlal.u32 q13,d29,d7[1]
vmov q8,q9
vmov q9,q10
vshr.u64 d10,d10,#16
vmov q10,q11
vmov q11,q12
vadd.u64 d10,d10,d11
vmov q12,q13
veor q13,q13
vshr.u64 d10,d10,#16
b LNEON_outer8
.align 4
LNEON_outer8:
vld1.32 {d28[0]}, [r2,:32]!
veor d8,d8,d8
vzip.16 d28,d8
vadd.u64 d12,d12,d10
vmlal.u32 q6,d28,d0[0]
vmlal.u32 q7,d28,d0[1]
vmlal.u32 q8,d28,d1[0]
vshl.i64 d29,d13,#16
vmlal.u32 q9,d28,d1[1]
vadd.u64 d29,d29,d12
veor d8,d8,d8
subs r9,r9,#1
vmul.u32 d29,d29,d30
vmlal.u32 q10,d28,d2[0]
vmlal.u32 q11,d28,d2[1]
vmlal.u32 q12,d28,d3[0]
vzip.16 d29,d8
vmlal.u32 q13,d28,d3[1]
vmlal.u32 q6,d29,d4[0]
vmlal.u32 q7,d29,d4[1]
vmlal.u32 q8,d29,d5[0]
vmlal.u32 q9,d29,d5[1]
vmlal.u32 q10,d29,d6[0]
vmov q5,q6
vmlal.u32 q11,d29,d6[1]
vmov q6,q7
vmlal.u32 q12,d29,d7[0]
vmov q7,q8
vmlal.u32 q13,d29,d7[1]
vmov q8,q9
vmov q9,q10
vshr.u64 d10,d10,#16
vmov q10,q11
vmov q11,q12
vadd.u64 d10,d10,d11
vmov q12,q13
veor q13,q13
vshr.u64 d10,d10,#16
bne LNEON_outer8
vadd.u64 d12,d12,d10
mov r7,sp
vshr.u64 d10,d12,#16
mov r8,r5
vadd.u64 d13,d13,d10
add r6,sp,#96
vshr.u64 d10,d13,#16
vzip.16 d12,d13
b LNEON_tail_entry
.align 4
LNEON_8n:
veor q6,q6,q6
sub r7,sp,#128
veor q7,q7,q7
sub r7,r7,r5,lsl#4
veor q8,q8,q8
and r7,r7,#-64
veor q9,q9,q9
mov sp,r7 @ alloca
veor q10,q10,q10
add r7,r7,#256
veor q11,q11,q11
sub r8,r5,#8
veor q12,q12,q12
veor q13,q13,q13
LNEON_8n_init:
vst1.64 {q6,q7},[r7,:256]!
subs r8,r8,#8
vst1.64 {q8,q9},[r7,:256]!
vst1.64 {q10,q11},[r7,:256]!
vst1.64 {q12,q13},[r7,:256]!
bne LNEON_8n_init
add r6,sp,#256
vld1.32 {d0,d1,d2,d3},[r1]!
add r10,sp,#8
vld1.32 {d30[0]},[r4,:32]
mov r9,r5
b LNEON_8n_outer
.align 4
LNEON_8n_outer:
vld1.32 {d28[0]},[r2,:32]! @ *b++
veor d8,d8,d8
vzip.16 d28,d8
add r7,sp,#128
vld1.32 {d4,d5,d6,d7},[r3]!
vmlal.u32 q6,d28,d0[0]
vmlal.u32 q7,d28,d0[1]
veor d8,d8,d8
vmlal.u32 q8,d28,d1[0]
vshl.i64 d29,d13,#16
vmlal.u32 q9,d28,d1[1]
vadd.u64 d29,d29,d12
vmlal.u32 q10,d28,d2[0]
vmul.u32 d29,d29,d30
vmlal.u32 q11,d28,d2[1]
vst1.32 {d28},[sp,:64] @ put aside smashed b[8*i+0]
vmlal.u32 q12,d28,d3[0]
vzip.16 d29,d8
vmlal.u32 q13,d28,d3[1]
vld1.32 {d28[0]},[r2,:32]! @ *b++
vmlal.u32 q6,d29,d4[0]
veor d10,d10,d10
vmlal.u32 q7,d29,d4[1]
vzip.16 d28,d10
vmlal.u32 q8,d29,d5[0]
vshr.u64 d12,d12,#16
vmlal.u32 q9,d29,d5[1]
vmlal.u32 q10,d29,d6[0]
vadd.u64 d12,d12,d13
vmlal.u32 q11,d29,d6[1]
vshr.u64 d12,d12,#16
vmlal.u32 q12,d29,d7[0]
vmlal.u32 q13,d29,d7[1]
vadd.u64 d14,d14,d12
vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+0]
vmlal.u32 q7,d28,d0[0]
vld1.64 {q6},[r6,:128]!
vmlal.u32 q8,d28,d0[1]
veor d8,d8,d8
vmlal.u32 q9,d28,d1[0]
vshl.i64 d29,d15,#16
vmlal.u32 q10,d28,d1[1]
vadd.u64 d29,d29,d14
vmlal.u32 q11,d28,d2[0]
vmul.u32 d29,d29,d30
vmlal.u32 q12,d28,d2[1]
vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+1]
vmlal.u32 q13,d28,d3[0]
vzip.16 d29,d8
vmlal.u32 q6,d28,d3[1]
vld1.32 {d28[0]},[r2,:32]! @ *b++
vmlal.u32 q7,d29,d4[0]
veor d10,d10,d10
vmlal.u32 q8,d29,d4[1]
vzip.16 d28,d10
vmlal.u32 q9,d29,d5[0]
vshr.u64 d14,d14,#16
vmlal.u32 q10,d29,d5[1]
vmlal.u32 q11,d29,d6[0]
vadd.u64 d14,d14,d15
vmlal.u32 q12,d29,d6[1]
vshr.u64 d14,d14,#16
vmlal.u32 q13,d29,d7[0]
vmlal.u32 q6,d29,d7[1]
vadd.u64 d16,d16,d14
vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+1]
vmlal.u32 q8,d28,d0[0]
vld1.64 {q7},[r6,:128]!
vmlal.u32 q9,d28,d0[1]
veor d8,d8,d8
vmlal.u32 q10,d28,d1[0]
vshl.i64 d29,d17,#16
vmlal.u32 q11,d28,d1[1]
vadd.u64 d29,d29,d16
vmlal.u32 q12,d28,d2[0]
vmul.u32 d29,d29,d30
vmlal.u32 q13,d28,d2[1]
vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+2]
vmlal.u32 q6,d28,d3[0]
vzip.16 d29,d8
vmlal.u32 q7,d28,d3[1]
vld1.32 {d28[0]},[r2,:32]! @ *b++
vmlal.u32 q8,d29,d4[0]
veor d10,d10,d10
vmlal.u32 q9,d29,d4[1]
vzip.16 d28,d10
vmlal.u32 q10,d29,d5[0]
vshr.u64 d16,d16,#16
vmlal.u32 q11,d29,d5[1]
vmlal.u32 q12,d29,d6[0]
vadd.u64 d16,d16,d17
vmlal.u32 q13,d29,d6[1]
vshr.u64 d16,d16,#16
vmlal.u32 q6,d29,d7[0]
vmlal.u32 q7,d29,d7[1]
vadd.u64 d18,d18,d16
vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+2]
vmlal.u32 q9,d28,d0[0]
vld1.64 {q8},[r6,:128]!
vmlal.u32 q10,d28,d0[1]
veor d8,d8,d8
vmlal.u32 q11,d28,d1[0]
vshl.i64 d29,d19,#16
vmlal.u32 q12,d28,d1[1]
vadd.u64 d29,d29,d18
vmlal.u32 q13,d28,d2[0]
vmul.u32 d29,d29,d30
vmlal.u32 q6,d28,d2[1]
vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+3]
vmlal.u32 q7,d28,d3[0]
vzip.16 d29,d8
vmlal.u32 q8,d28,d3[1]
vld1.32 {d28[0]},[r2,:32]! @ *b++
vmlal.u32 q9,d29,d4[0]
veor d10,d10,d10
vmlal.u32 q10,d29,d4[1]
vzip.16 d28,d10
vmlal.u32 q11,d29,d5[0]
vshr.u64 d18,d18,#16
vmlal.u32 q12,d29,d5[1]
vmlal.u32 q13,d29,d6[0]
vadd.u64 d18,d18,d19
vmlal.u32 q6,d29,d6[1]
vshr.u64 d18,d18,#16
vmlal.u32 q7,d29,d7[0]
vmlal.u32 q8,d29,d7[1]
vadd.u64 d20,d20,d18
vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+3]
vmlal.u32 q10,d28,d0[0]
vld1.64 {q9},[r6,:128]!
vmlal.u32 q11,d28,d0[1]
veor d8,d8,d8
vmlal.u32 q12,d28,d1[0]
vshl.i64 d29,d21,#16
vmlal.u32 q13,d28,d1[1]
vadd.u64 d29,d29,d20
vmlal.u32 q6,d28,d2[0]
vmul.u32 d29,d29,d30
vmlal.u32 q7,d28,d2[1]
vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+4]
vmlal.u32 q8,d28,d3[0]
vzip.16 d29,d8
vmlal.u32 q9,d28,d3[1]
vld1.32 {d28[0]},[r2,:32]! @ *b++
vmlal.u32 q10,d29,d4[0]
veor d10,d10,d10
vmlal.u32 q11,d29,d4[1]
vzip.16 d28,d10
vmlal.u32 q12,d29,d5[0]
vshr.u64 d20,d20,#16
vmlal.u32 q13,d29,d5[1]
vmlal.u32 q6,d29,d6[0]
vadd.u64 d20,d20,d21
vmlal.u32 q7,d29,d6[1]
vshr.u64 d20,d20,#16
vmlal.u32 q8,d29,d7[0]
vmlal.u32 q9,d29,d7[1]
vadd.u64 d22,d22,d20
vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+4]
vmlal.u32 q11,d28,d0[0]
vld1.64 {q10},[r6,:128]!
vmlal.u32 q12,d28,d0[1]
veor d8,d8,d8
vmlal.u32 q13,d28,d1[0]
vshl.i64 d29,d23,#16
vmlal.u32 q6,d28,d1[1]
vadd.u64 d29,d29,d22
vmlal.u32 q7,d28,d2[0]
vmul.u32 d29,d29,d30
vmlal.u32 q8,d28,d2[1]
vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+5]
vmlal.u32 q9,d28,d3[0]
vzip.16 d29,d8
vmlal.u32 q10,d28,d3[1]
vld1.32 {d28[0]},[r2,:32]! @ *b++
vmlal.u32 q11,d29,d4[0]
veor d10,d10,d10
vmlal.u32 q12,d29,d4[1]
vzip.16 d28,d10
vmlal.u32 q13,d29,d5[0]
vshr.u64 d22,d22,#16
vmlal.u32 q6,d29,d5[1]
vmlal.u32 q7,d29,d6[0]
vadd.u64 d22,d22,d23
vmlal.u32 q8,d29,d6[1]
vshr.u64 d22,d22,#16
vmlal.u32 q9,d29,d7[0]
vmlal.u32 q10,d29,d7[1]
vadd.u64 d24,d24,d22
vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+5]
vmlal.u32 q12,d28,d0[0]
vld1.64 {q11},[r6,:128]!
vmlal.u32 q13,d28,d0[1]
veor d8,d8,d8
vmlal.u32 q6,d28,d1[0]
vshl.i64 d29,d25,#16
vmlal.u32 q7,d28,d1[1]
vadd.u64 d29,d29,d24
vmlal.u32 q8,d28,d2[0]
vmul.u32 d29,d29,d30
vmlal.u32 q9,d28,d2[1]
vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+6]
vmlal.u32 q10,d28,d3[0]
vzip.16 d29,d8
vmlal.u32 q11,d28,d3[1]
vld1.32 {d28[0]},[r2,:32]! @ *b++
vmlal.u32 q12,d29,d4[0]
veor d10,d10,d10
vmlal.u32 q13,d29,d4[1]
vzip.16 d28,d10
vmlal.u32 q6,d29,d5[0]
vshr.u64 d24,d24,#16
vmlal.u32 q7,d29,d5[1]
vmlal.u32 q8,d29,d6[0]
vadd.u64 d24,d24,d25
vmlal.u32 q9,d29,d6[1]
vshr.u64 d24,d24,#16
vmlal.u32 q10,d29,d7[0]
vmlal.u32 q11,d29,d7[1]
vadd.u64 d26,d26,d24
vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+6]
vmlal.u32 q13,d28,d0[0]
vld1.64 {q12},[r6,:128]!
vmlal.u32 q6,d28,d0[1]
veor d8,d8,d8
vmlal.u32 q7,d28,d1[0]
vshl.i64 d29,d27,#16
vmlal.u32 q8,d28,d1[1]
vadd.u64 d29,d29,d26
vmlal.u32 q9,d28,d2[0]
vmul.u32 d29,d29,d30
vmlal.u32 q10,d28,d2[1]
vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+7]
vmlal.u32 q11,d28,d3[0]
vzip.16 d29,d8
vmlal.u32 q12,d28,d3[1]
vld1.32 {d28},[sp,:64] @ pull smashed b[8*i+0]
vmlal.u32 q13,d29,d4[0]
vld1.32 {d0,d1,d2,d3},[r1]!
vmlal.u32 q6,d29,d4[1]
vmlal.u32 q7,d29,d5[0]
vshr.u64 d26,d26,#16
vmlal.u32 q8,d29,d5[1]
vmlal.u32 q9,d29,d6[0]
vadd.u64 d26,d26,d27
vmlal.u32 q10,d29,d6[1]
vshr.u64 d26,d26,#16
vmlal.u32 q11,d29,d7[0]
vmlal.u32 q12,d29,d7[1]
vadd.u64 d12,d12,d26
vst1.32 {d29},[r10,:64] @ put aside smashed m[8*i+7]
add r10,sp,#8 @ rewind
sub r8,r5,#8
b LNEON_8n_inner
.align 4
LNEON_8n_inner:
subs r8,r8,#8
vmlal.u32 q6,d28,d0[0]
vld1.64 {q13},[r6,:128]
vmlal.u32 q7,d28,d0[1]
vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+0]
vmlal.u32 q8,d28,d1[0]
vld1.32 {d4,d5,d6,d7},[r3]!
vmlal.u32 q9,d28,d1[1]
it ne
addne r6,r6,#16 @ don't advance in last iteration
vmlal.u32 q10,d28,d2[0]
vmlal.u32 q11,d28,d2[1]
vmlal.u32 q12,d28,d3[0]
vmlal.u32 q13,d28,d3[1]
vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+1]
vmlal.u32 q6,d29,d4[0]
vmlal.u32 q7,d29,d4[1]
vmlal.u32 q8,d29,d5[0]
vmlal.u32 q9,d29,d5[1]
vmlal.u32 q10,d29,d6[0]
vmlal.u32 q11,d29,d6[1]
vmlal.u32 q12,d29,d7[0]
vmlal.u32 q13,d29,d7[1]
vst1.64 {q6},[r7,:128]!
vmlal.u32 q7,d28,d0[0]
vld1.64 {q6},[r6,:128]
vmlal.u32 q8,d28,d0[1]
vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+1]
vmlal.u32 q9,d28,d1[0]
it ne
addne r6,r6,#16 @ don't advance in last iteration
vmlal.u32 q10,d28,d1[1]
vmlal.u32 q11,d28,d2[0]
vmlal.u32 q12,d28,d2[1]
vmlal.u32 q13,d28,d3[0]
vmlal.u32 q6,d28,d3[1]
vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+2]
vmlal.u32 q7,d29,d4[0]
vmlal.u32 q8,d29,d4[1]
vmlal.u32 q9,d29,d5[0]
vmlal.u32 q10,d29,d5[1]
vmlal.u32 q11,d29,d6[0]
vmlal.u32 q12,d29,d6[1]
vmlal.u32 q13,d29,d7[0]
vmlal.u32 q6,d29,d7[1]
vst1.64 {q7},[r7,:128]!
vmlal.u32 q8,d28,d0[0]
vld1.64 {q7},[r6,:128]
vmlal.u32 q9,d28,d0[1]
vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+2]
vmlal.u32 q10,d28,d1[0]
it ne
addne r6,r6,#16 @ don't advance in last iteration
vmlal.u32 q11,d28,d1[1]
vmlal.u32 q12,d28,d2[0]
vmlal.u32 q13,d28,d2[1]
vmlal.u32 q6,d28,d3[0]
vmlal.u32 q7,d28,d3[1]
vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+3]
vmlal.u32 q8,d29,d4[0]
vmlal.u32 q9,d29,d4[1]
vmlal.u32 q10,d29,d5[0]
vmlal.u32 q11,d29,d5[1]
vmlal.u32 q12,d29,d6[0]
vmlal.u32 q13,d29,d6[1]
vmlal.u32 q6,d29,d7[0]
vmlal.u32 q7,d29,d7[1]
vst1.64 {q8},[r7,:128]!
vmlal.u32 q9,d28,d0[0]
vld1.64 {q8},[r6,:128]
vmlal.u32 q10,d28,d0[1]
vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+3]
vmlal.u32 q11,d28,d1[0]
it ne
addne r6,r6,#16 @ don't advance in last iteration
vmlal.u32 q12,d28,d1[1]
vmlal.u32 q13,d28,d2[0]
vmlal.u32 q6,d28,d2[1]
vmlal.u32 q7,d28,d3[0]
vmlal.u32 q8,d28,d3[1]
vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+4]
vmlal.u32 q9,d29,d4[0]
vmlal.u32 q10,d29,d4[1]
vmlal.u32 q11,d29,d5[0]
vmlal.u32 q12,d29,d5[1]
vmlal.u32 q13,d29,d6[0]
vmlal.u32 q6,d29,d6[1]
vmlal.u32 q7,d29,d7[0]
vmlal.u32 q8,d29,d7[1]
vst1.64 {q9},[r7,:128]!
vmlal.u32 q10,d28,d0[0]
vld1.64 {q9},[r6,:128]
vmlal.u32 q11,d28,d0[1]
vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+4]
vmlal.u32 q12,d28,d1[0]
it ne
addne r6,r6,#16 @ don't advance in last iteration
vmlal.u32 q13,d28,d1[1]
vmlal.u32 q6,d28,d2[0]
vmlal.u32 q7,d28,d2[1]
vmlal.u32 q8,d28,d3[0]
vmlal.u32 q9,d28,d3[1]
vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+5]
vmlal.u32 q10,d29,d4[0]
vmlal.u32 q11,d29,d4[1]
vmlal.u32 q12,d29,d5[0]
vmlal.u32 q13,d29,d5[1]
vmlal.u32 q6,d29,d6[0]
vmlal.u32 q7,d29,d6[1]
vmlal.u32 q8,d29,d7[0]
vmlal.u32 q9,d29,d7[1]
vst1.64 {q10},[r7,:128]!
vmlal.u32 q11,d28,d0[0]
vld1.64 {q10},[r6,:128]
vmlal.u32 q12,d28,d0[1]
vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+5]
vmlal.u32 q13,d28,d1[0]
it ne
addne r6,r6,#16 @ don't advance in last iteration
vmlal.u32 q6,d28,d1[1]
vmlal.u32 q7,d28,d2[0]
vmlal.u32 q8,d28,d2[1]
vmlal.u32 q9,d28,d3[0]
vmlal.u32 q10,d28,d3[1]
vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+6]
vmlal.u32 q11,d29,d4[0]
vmlal.u32 q12,d29,d4[1]
vmlal.u32 q13,d29,d5[0]
vmlal.u32 q6,d29,d5[1]
vmlal.u32 q7,d29,d6[0]
vmlal.u32 q8,d29,d6[1]
vmlal.u32 q9,d29,d7[0]
vmlal.u32 q10,d29,d7[1]
vst1.64 {q11},[r7,:128]!
vmlal.u32 q12,d28,d0[0]
vld1.64 {q11},[r6,:128]
vmlal.u32 q13,d28,d0[1]
vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+6]
vmlal.u32 q6,d28,d1[0]
it ne
addne r6,r6,#16 @ don't advance in last iteration
vmlal.u32 q7,d28,d1[1]
vmlal.u32 q8,d28,d2[0]
vmlal.u32 q9,d28,d2[1]
vmlal.u32 q10,d28,d3[0]
vmlal.u32 q11,d28,d3[1]
vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+7]
vmlal.u32 q12,d29,d4[0]
vmlal.u32 q13,d29,d4[1]
vmlal.u32 q6,d29,d5[0]
vmlal.u32 q7,d29,d5[1]
vmlal.u32 q8,d29,d6[0]
vmlal.u32 q9,d29,d6[1]
vmlal.u32 q10,d29,d7[0]
vmlal.u32 q11,d29,d7[1]
vst1.64 {q12},[r7,:128]!
vmlal.u32 q13,d28,d0[0]
vld1.64 {q12},[r6,:128]
vmlal.u32 q6,d28,d0[1]
vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+7]
vmlal.u32 q7,d28,d1[0]
it ne
addne r6,r6,#16 @ don't advance in last iteration
vmlal.u32 q8,d28,d1[1]
vmlal.u32 q9,d28,d2[0]
vmlal.u32 q10,d28,d2[1]
vmlal.u32 q11,d28,d3[0]
vmlal.u32 q12,d28,d3[1]
it eq
subeq r1,r1,r5,lsl#2 @ rewind
vmlal.u32 q13,d29,d4[0]
vld1.32 {d28},[sp,:64] @ pull smashed b[8*i+0]
vmlal.u32 q6,d29,d4[1]
vld1.32 {d0,d1,d2,d3},[r1]!
vmlal.u32 q7,d29,d5[0]
add r10,sp,#8 @ rewind
vmlal.u32 q8,d29,d5[1]
vmlal.u32 q9,d29,d6[0]
vmlal.u32 q10,d29,d6[1]
vmlal.u32 q11,d29,d7[0]
vst1.64 {q13},[r7,:128]!
vmlal.u32 q12,d29,d7[1]
bne LNEON_8n_inner
add r6,sp,#128
vst1.64 {q6,q7},[r7,:256]!
veor q2,q2,q2 @ d4-d5
vst1.64 {q8,q9},[r7,:256]!
veor q3,q3,q3 @ d6-d7
vst1.64 {q10,q11},[r7,:256]!
vst1.64 {q12},[r7,:128]
subs r9,r9,#8
vld1.64 {q6,q7},[r6,:256]!
vld1.64 {q8,q9},[r6,:256]!
vld1.64 {q10,q11},[r6,:256]!
vld1.64 {q12,q13},[r6,:256]!
itt ne
subne r3,r3,r5,lsl#2 @ rewind
bne LNEON_8n_outer
add r7,sp,#128
vst1.64 {q2,q3}, [sp,:256]! @ start wiping stack frame
vshr.u64 d10,d12,#16
vst1.64 {q2,q3},[sp,:256]!
vadd.u64 d13,d13,d10
vst1.64 {q2,q3}, [sp,:256]!
vshr.u64 d10,d13,#16
vst1.64 {q2,q3}, [sp,:256]!
vzip.16 d12,d13
mov r8,r5
b LNEON_tail_entry
.align 4
LNEON_tail:
vadd.u64 d12,d12,d10
vshr.u64 d10,d12,#16
vld1.64 {q8,q9}, [r6, :256]!
vadd.u64 d13,d13,d10
vld1.64 {q10,q11}, [r6, :256]!
vshr.u64 d10,d13,#16
vld1.64 {q12,q13}, [r6, :256]!
vzip.16 d12,d13
LNEON_tail_entry:
vadd.u64 d14,d14,d10
vst1.32 {d12[0]}, [r7, :32]!
vshr.u64 d10,d14,#16
vadd.u64 d15,d15,d10
vshr.u64 d10,d15,#16
vzip.16 d14,d15
vadd.u64 d16,d16,d10
vst1.32 {d14[0]}, [r7, :32]!
vshr.u64 d10,d16,#16
vadd.u64 d17,d17,d10
vshr.u64 d10,d17,#16
vzip.16 d16,d17
vadd.u64 d18,d18,d10
vst1.32 {d16[0]}, [r7, :32]!
vshr.u64 d10,d18,#16
vadd.u64 d19,d19,d10
vshr.u64 d10,d19,#16
vzip.16 d18,d19
vadd.u64 d20,d20,d10
vst1.32 {d18[0]}, [r7, :32]!
vshr.u64 d10,d20,#16
vadd.u64 d21,d21,d10
vshr.u64 d10,d21,#16
vzip.16 d20,d21
vadd.u64 d22,d22,d10
vst1.32 {d20[0]}, [r7, :32]!
vshr.u64 d10,d22,#16
vadd.u64 d23,d23,d10
vshr.u64 d10,d23,#16
vzip.16 d22,d23
vadd.u64 d24,d24,d10
vst1.32 {d22[0]}, [r7, :32]!
vshr.u64 d10,d24,#16
vadd.u64 d25,d25,d10
vshr.u64 d10,d25,#16
vzip.16 d24,d25
vadd.u64 d26,d26,d10
vst1.32 {d24[0]}, [r7, :32]!
vshr.u64 d10,d26,#16
vadd.u64 d27,d27,d10
vshr.u64 d10,d27,#16
vzip.16 d26,d27
vld1.64 {q6,q7}, [r6, :256]!
subs r8,r8,#8
vst1.32 {d26[0]}, [r7, :32]!
bne LNEON_tail
vst1.32 {d10[0]}, [r7, :32] @ top-most bit
sub r3,r3,r5,lsl#2 @ rewind r3
subs r1,sp,#0 @ clear carry flag
add r2,sp,r5,lsl#2
LNEON_sub:
ldmia r1!, {r4,r5,r6,r7}
ldmia r3!, {r8,r9,r10,r11}
sbcs r8, r4,r8
sbcs r9, r5,r9
sbcs r10,r6,r10
sbcs r11,r7,r11
teq r1,r2 @ preserves carry
stmia r0!, {r8,r9,r10,r11}
bne LNEON_sub
ldr r10, [r1] @ load top-most bit
mov r11,sp
veor q0,q0,q0
sub r11,r2,r11 @ this is num*4
veor q1,q1,q1
mov r1,sp
sub r0,r0,r11 @ rewind r0
mov r3,r2 @ second 3/4th of frame
sbcs r10,r10,#0 @ result is carry flag
LNEON_copy_n_zap:
ldmia r1!, {r4,r5,r6,r7}
ldmia r0, {r8,r9,r10,r11}
it cc
movcc r8, r4
vst1.64 {q0,q1}, [r3,:256]! @ wipe
itt cc
movcc r9, r5
movcc r10,r6
vst1.64 {q0,q1}, [r3,:256]! @ wipe
it cc
movcc r11,r7
ldmia r1, {r4,r5,r6,r7}
stmia r0!, {r8,r9,r10,r11}
sub r1,r1,#16
ldmia r0, {r8,r9,r10,r11}
it cc
movcc r8, r4
vst1.64 {q0,q1}, [r1,:256]! @ wipe
itt cc
movcc r9, r5
movcc r10,r6
vst1.64 {q0,q1}, [r3,:256]! @ wipe
it cc
movcc r11,r7
teq r1,r2 @ preserves carry
stmia r0!, {r8,r9,r10,r11}
bne LNEON_copy_n_zap
mov sp,ip
vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
bx lr @ bx lr
#endif
.byte 77,111,110,116,103,111,109,101,114,121,32,109,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2
#if __ARM_MAX_ARCH__>=7
.comm _OPENSSL_armcap_P,4
.non_lazy_symbol_pointer
OPENSSL_armcap_P:
.indirect_symbol _OPENSSL_armcap_P
.long 0
.private_extern _OPENSSL_armcap_P
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,588 @@
#include <openssl/arm_arch.h>
@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. (ARMv8 PMULL
@ instructions are in aesv8-armx.pl.)
.text
#if defined(__thumb2__) || defined(__clang__)
.syntax unified
#endif
#if defined(__thumb2__)
.thumb
#else
.code 32
#endif
#ifdef __clang__
#define ldrplb ldrbpl
#define ldrneb ldrbne
#endif
.align 5
rem_4bit:
.short 0x0000,0x1C20,0x3840,0x2460
.short 0x7080,0x6CA0,0x48C0,0x54E0
.short 0xE100,0xFD20,0xD940,0xC560
.short 0x9180,0x8DA0,0xA9C0,0xB5E0
#ifdef __thumb2__
.thumb_func rem_4bit_get
#endif
rem_4bit_get:
#if defined(__thumb2__)
adr r2,rem_4bit
#else
sub r2,pc,#8+32 @ &rem_4bit
#endif
b Lrem_4bit_got
nop
nop
.globl _gcm_ghash_4bit
.private_extern _gcm_ghash_4bit
#ifdef __thumb2__
.thumb_func _gcm_ghash_4bit
#endif
.align 4
_gcm_ghash_4bit:
#if defined(__thumb2__)
adr r12,rem_4bit
#else
sub r12,pc,#8+48 @ &rem_4bit
#endif
add r3,r2,r3 @ r3 to point at the end
stmdb sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr} @ save r3/end too
ldmia r12,{r4,r5,r6,r7,r8,r9,r10,r11} @ copy rem_4bit ...
stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11} @ ... to stack
ldrb r12,[r2,#15]
ldrb r14,[r0,#15]
Louter:
eor r12,r12,r14
and r14,r12,#0xf0
and r12,r12,#0x0f
mov r3,#14
add r7,r1,r12,lsl#4
ldmia r7,{r4,r5,r6,r7} @ load Htbl[nlo]
add r11,r1,r14
ldrb r12,[r2,#14]
and r14,r4,#0xf @ rem
ldmia r11,{r8,r9,r10,r11} @ load Htbl[nhi]
add r14,r14,r14
eor r4,r8,r4,lsr#4
ldrh r8,[sp,r14] @ rem_4bit[rem]
eor r4,r4,r5,lsl#28
ldrb r14,[r0,#14]
eor r5,r9,r5,lsr#4
eor r5,r5,r6,lsl#28
eor r6,r10,r6,lsr#4
eor r6,r6,r7,lsl#28
eor r7,r11,r7,lsr#4
eor r12,r12,r14
and r14,r12,#0xf0
and r12,r12,#0x0f
eor r7,r7,r8,lsl#16
Linner:
add r11,r1,r12,lsl#4
and r12,r4,#0xf @ rem
subs r3,r3,#1
add r12,r12,r12
ldmia r11,{r8,r9,r10,r11} @ load Htbl[nlo]
eor r4,r8,r4,lsr#4
eor r4,r4,r5,lsl#28
eor r5,r9,r5,lsr#4
eor r5,r5,r6,lsl#28
ldrh r8,[sp,r12] @ rem_4bit[rem]
eor r6,r10,r6,lsr#4
#ifdef __thumb2__
it pl
#endif
ldrplb r12,[r2,r3]
eor r6,r6,r7,lsl#28
eor r7,r11,r7,lsr#4
add r11,r1,r14
and r14,r4,#0xf @ rem
eor r7,r7,r8,lsl#16 @ ^= rem_4bit[rem]
add r14,r14,r14
ldmia r11,{r8,r9,r10,r11} @ load Htbl[nhi]
eor r4,r8,r4,lsr#4
#ifdef __thumb2__
it pl
#endif
ldrplb r8,[r0,r3]
eor r4,r4,r5,lsl#28
eor r5,r9,r5,lsr#4
ldrh r9,[sp,r14]
eor r5,r5,r6,lsl#28
eor r6,r10,r6,lsr#4
eor r6,r6,r7,lsl#28
#ifdef __thumb2__
it pl
#endif
eorpl r12,r12,r8
eor r7,r11,r7,lsr#4
#ifdef __thumb2__
itt pl
#endif
andpl r14,r12,#0xf0
andpl r12,r12,#0x0f
eor r7,r7,r9,lsl#16 @ ^= rem_4bit[rem]
bpl Linner
ldr r3,[sp,#32] @ re-load r3/end
add r2,r2,#16
mov r14,r4
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
rev r4,r4
str r4,[r0,#12]
#elif defined(__ARMEB__)
str r4,[r0,#12]
#else
mov r9,r4,lsr#8
strb r4,[r0,#12+3]
mov r10,r4,lsr#16
strb r9,[r0,#12+2]
mov r11,r4,lsr#24
strb r10,[r0,#12+1]
strb r11,[r0,#12]
#endif
cmp r2,r3
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
rev r5,r5
str r5,[r0,#8]
#elif defined(__ARMEB__)
str r5,[r0,#8]
#else
mov r9,r5,lsr#8
strb r5,[r0,#8+3]
mov r10,r5,lsr#16
strb r9,[r0,#8+2]
mov r11,r5,lsr#24
strb r10,[r0,#8+1]
strb r11,[r0,#8]
#endif
#ifdef __thumb2__
it ne
#endif
ldrneb r12,[r2,#15]
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
rev r6,r6
str r6,[r0,#4]
#elif defined(__ARMEB__)
str r6,[r0,#4]
#else
mov r9,r6,lsr#8
strb r6,[r0,#4+3]
mov r10,r6,lsr#16
strb r9,[r0,#4+2]
mov r11,r6,lsr#24
strb r10,[r0,#4+1]
strb r11,[r0,#4]
#endif
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
rev r7,r7
str r7,[r0,#0]
#elif defined(__ARMEB__)
str r7,[r0,#0]
#else
mov r9,r7,lsr#8
strb r7,[r0,#0+3]
mov r10,r7,lsr#16
strb r9,[r0,#0+2]
mov r11,r7,lsr#24
strb r10,[r0,#0+1]
strb r11,[r0,#0]
#endif
bne Louter
add sp,sp,#36
#if __ARM_ARCH__>=5
ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
#else
ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
#endif
.globl _gcm_gmult_4bit
.private_extern _gcm_gmult_4bit
#ifdef __thumb2__
.thumb_func _gcm_gmult_4bit
#endif
_gcm_gmult_4bit:
stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
ldrb r12,[r0,#15]
b rem_4bit_get
Lrem_4bit_got:
and r14,r12,#0xf0
and r12,r12,#0x0f
mov r3,#14
add r7,r1,r12,lsl#4
ldmia r7,{r4,r5,r6,r7} @ load Htbl[nlo]
ldrb r12,[r0,#14]
add r11,r1,r14
and r14,r4,#0xf @ rem
ldmia r11,{r8,r9,r10,r11} @ load Htbl[nhi]
add r14,r14,r14
eor r4,r8,r4,lsr#4
ldrh r8,[r2,r14] @ rem_4bit[rem]
eor r4,r4,r5,lsl#28
eor r5,r9,r5,lsr#4
eor r5,r5,r6,lsl#28
eor r6,r10,r6,lsr#4
eor r6,r6,r7,lsl#28
eor r7,r11,r7,lsr#4
and r14,r12,#0xf0
eor r7,r7,r8,lsl#16
and r12,r12,#0x0f
Loop:
add r11,r1,r12,lsl#4
and r12,r4,#0xf @ rem
subs r3,r3,#1
add r12,r12,r12
ldmia r11,{r8,r9,r10,r11} @ load Htbl[nlo]
eor r4,r8,r4,lsr#4
eor r4,r4,r5,lsl#28
eor r5,r9,r5,lsr#4
eor r5,r5,r6,lsl#28
ldrh r8,[r2,r12] @ rem_4bit[rem]
eor r6,r10,r6,lsr#4
#ifdef __thumb2__
it pl
#endif
ldrplb r12,[r0,r3]
eor r6,r6,r7,lsl#28
eor r7,r11,r7,lsr#4
add r11,r1,r14
and r14,r4,#0xf @ rem
eor r7,r7,r8,lsl#16 @ ^= rem_4bit[rem]
add r14,r14,r14
ldmia r11,{r8,r9,r10,r11} @ load Htbl[nhi]
eor r4,r8,r4,lsr#4
eor r4,r4,r5,lsl#28
eor r5,r9,r5,lsr#4
ldrh r8,[r2,r14] @ rem_4bit[rem]
eor r5,r5,r6,lsl#28
eor r6,r10,r6,lsr#4
eor r6,r6,r7,lsl#28
eor r7,r11,r7,lsr#4
#ifdef __thumb2__
itt pl
#endif
andpl r14,r12,#0xf0
andpl r12,r12,#0x0f
eor r7,r7,r8,lsl#16 @ ^= rem_4bit[rem]
bpl Loop
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
rev r4,r4
str r4,[r0,#12]
#elif defined(__ARMEB__)
str r4,[r0,#12]
#else
mov r9,r4,lsr#8
strb r4,[r0,#12+3]
mov r10,r4,lsr#16
strb r9,[r0,#12+2]
mov r11,r4,lsr#24
strb r10,[r0,#12+1]
strb r11,[r0,#12]
#endif
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
rev r5,r5
str r5,[r0,#8]
#elif defined(__ARMEB__)
str r5,[r0,#8]
#else
mov r9,r5,lsr#8
strb r5,[r0,#8+3]
mov r10,r5,lsr#16
strb r9,[r0,#8+2]
mov r11,r5,lsr#24
strb r10,[r0,#8+1]
strb r11,[r0,#8]
#endif
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
rev r6,r6
str r6,[r0,#4]
#elif defined(__ARMEB__)
str r6,[r0,#4]
#else
mov r9,r6,lsr#8
strb r6,[r0,#4+3]
mov r10,r6,lsr#16
strb r9,[r0,#4+2]
mov r11,r6,lsr#24
strb r10,[r0,#4+1]
strb r11,[r0,#4]
#endif
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
rev r7,r7
str r7,[r0,#0]
#elif defined(__ARMEB__)
str r7,[r0,#0]
#else
mov r9,r7,lsr#8
strb r7,[r0,#0+3]
mov r10,r7,lsr#16
strb r9,[r0,#0+2]
mov r11,r7,lsr#24
strb r10,[r0,#0+1]
strb r11,[r0,#0]
#endif
#if __ARM_ARCH__>=5
ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
#else
ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
#endif
#if __ARM_MAX_ARCH__>=7
.globl _gcm_init_neon
.private_extern _gcm_init_neon
#ifdef __thumb2__
.thumb_func _gcm_init_neon
#endif
.align 4
_gcm_init_neon:
vld1.64 d7,[r1]! @ load H
vmov.i8 q8,#0xe1
vld1.64 d6,[r1]
vshl.i64 d17,#57
vshr.u64 d16,#63 @ t0=0xc2....01
vdup.8 q9,d7[7]
vshr.u64 d26,d6,#63
vshr.s8 q9,#7 @ broadcast carry bit
vshl.i64 q3,q3,#1
vand q8,q8,q9
vorr d7,d26 @ H<<<=1
veor q3,q3,q8 @ twisted H
vstmia r0,{q3}
bx lr @ bx lr
.globl _gcm_gmult_neon
.private_extern _gcm_gmult_neon
#ifdef __thumb2__
.thumb_func _gcm_gmult_neon
#endif
.align 4
_gcm_gmult_neon:
vld1.64 d7,[r0]! @ load Xi
vld1.64 d6,[r0]!
vmov.i64 d29,#0x0000ffffffffffff
vldmia r1,{d26,d27} @ load twisted H
vmov.i64 d30,#0x00000000ffffffff
#ifdef __ARMEL__
vrev64.8 q3,q3
#endif
vmov.i64 d31,#0x000000000000ffff
veor d28,d26,d27 @ Karatsuba pre-processing
mov r3,#16
b Lgmult_neon
.globl _gcm_ghash_neon
.private_extern _gcm_ghash_neon
#ifdef __thumb2__
.thumb_func _gcm_ghash_neon
#endif
.align 4
_gcm_ghash_neon:
vld1.64 d1,[r0]! @ load Xi
vld1.64 d0,[r0]!
vmov.i64 d29,#0x0000ffffffffffff
vldmia r1,{d26,d27} @ load twisted H
vmov.i64 d30,#0x00000000ffffffff
#ifdef __ARMEL__
vrev64.8 q0,q0
#endif
vmov.i64 d31,#0x000000000000ffff
veor d28,d26,d27 @ Karatsuba pre-processing
Loop_neon:
vld1.64 d7,[r2]! @ load inp
vld1.64 d6,[r2]!
#ifdef __ARMEL__
vrev64.8 q3,q3
#endif
veor q3,q0 @ inp^=Xi
Lgmult_neon:
vext.8 d16, d26, d26, #1 @ A1
vmull.p8 q8, d16, d6 @ F = A1*B
vext.8 d0, d6, d6, #1 @ B1
vmull.p8 q0, d26, d0 @ E = A*B1
vext.8 d18, d26, d26, #2 @ A2
vmull.p8 q9, d18, d6 @ H = A2*B
vext.8 d22, d6, d6, #2 @ B2
vmull.p8 q11, d26, d22 @ G = A*B2
vext.8 d20, d26, d26, #3 @ A3
veor q8, q8, q0 @ L = E + F
vmull.p8 q10, d20, d6 @ J = A3*B
vext.8 d0, d6, d6, #3 @ B3
veor q9, q9, q11 @ M = G + H
vmull.p8 q0, d26, d0 @ I = A*B3
veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8
vand d17, d17, d29
vext.8 d22, d6, d6, #4 @ B4
veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16
vand d19, d19, d30
vmull.p8 q11, d26, d22 @ K = A*B4
veor q10, q10, q0 @ N = I + J
veor d16, d16, d17
veor d18, d18, d19
veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24
vand d21, d21, d31
vext.8 q8, q8, q8, #15
veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32
vmov.i64 d23, #0
vext.8 q9, q9, q9, #14
veor d20, d20, d21
vmull.p8 q0, d26, d6 @ D = A*B
vext.8 q11, q11, q11, #12
vext.8 q10, q10, q10, #13
veor q8, q8, q9
veor q10, q10, q11
veor q0, q0, q8
veor q0, q0, q10
veor d6,d6,d7 @ Karatsuba pre-processing
vext.8 d16, d28, d28, #1 @ A1
vmull.p8 q8, d16, d6 @ F = A1*B
vext.8 d2, d6, d6, #1 @ B1
vmull.p8 q1, d28, d2 @ E = A*B1
vext.8 d18, d28, d28, #2 @ A2
vmull.p8 q9, d18, d6 @ H = A2*B
vext.8 d22, d6, d6, #2 @ B2
vmull.p8 q11, d28, d22 @ G = A*B2
vext.8 d20, d28, d28, #3 @ A3
veor q8, q8, q1 @ L = E + F
vmull.p8 q10, d20, d6 @ J = A3*B
vext.8 d2, d6, d6, #3 @ B3
veor q9, q9, q11 @ M = G + H
vmull.p8 q1, d28, d2 @ I = A*B3
veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8
vand d17, d17, d29
vext.8 d22, d6, d6, #4 @ B4
veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16
vand d19, d19, d30
vmull.p8 q11, d28, d22 @ K = A*B4
veor q10, q10, q1 @ N = I + J
veor d16, d16, d17
veor d18, d18, d19
veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24
vand d21, d21, d31
vext.8 q8, q8, q8, #15
veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32
vmov.i64 d23, #0
vext.8 q9, q9, q9, #14
veor d20, d20, d21
vmull.p8 q1, d28, d6 @ D = A*B
vext.8 q11, q11, q11, #12
vext.8 q10, q10, q10, #13
veor q8, q8, q9
veor q10, q10, q11
veor q1, q1, q8
veor q1, q1, q10
vext.8 d16, d27, d27, #1 @ A1
vmull.p8 q8, d16, d7 @ F = A1*B
vext.8 d4, d7, d7, #1 @ B1
vmull.p8 q2, d27, d4 @ E = A*B1
vext.8 d18, d27, d27, #2 @ A2
vmull.p8 q9, d18, d7 @ H = A2*B
vext.8 d22, d7, d7, #2 @ B2
vmull.p8 q11, d27, d22 @ G = A*B2
vext.8 d20, d27, d27, #3 @ A3
veor q8, q8, q2 @ L = E + F
vmull.p8 q10, d20, d7 @ J = A3*B
vext.8 d4, d7, d7, #3 @ B3
veor q9, q9, q11 @ M = G + H
vmull.p8 q2, d27, d4 @ I = A*B3
veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8
vand d17, d17, d29
vext.8 d22, d7, d7, #4 @ B4
veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16
vand d19, d19, d30
vmull.p8 q11, d27, d22 @ K = A*B4
veor q10, q10, q2 @ N = I + J
veor d16, d16, d17
veor d18, d18, d19
veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24
vand d21, d21, d31
vext.8 q8, q8, q8, #15
veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32
vmov.i64 d23, #0
vext.8 q9, q9, q9, #14
veor d20, d20, d21
vmull.p8 q2, d27, d7 @ D = A*B
vext.8 q11, q11, q11, #12
vext.8 q10, q10, q10, #13
veor q8, q8, q9
veor q10, q10, q11
veor q2, q2, q8
veor q2, q2, q10
veor q1,q1,q0 @ Karatsuba post-processing
veor q1,q1,q2
veor d1,d1,d2
veor d4,d4,d3 @ Xh|Xl - 256-bit result
@ equivalent of reduction_avx from ghash-x86_64.pl
vshl.i64 q9,q0,#57 @ 1st phase
vshl.i64 q10,q0,#62
veor q10,q10,q9 @
vshl.i64 q9,q0,#63
veor q10, q10, q9 @
veor d1,d1,d20 @
veor d4,d4,d21
vshr.u64 q10,q0,#1 @ 2nd phase
veor q2,q2,q0
veor q0,q0,q10 @
vshr.u64 q10,q10,#6
vshr.u64 q0,q0,#1 @
veor q0,q0,q2 @
veor q0,q0,q10 @
subs r3,#16
bne Loop_neon
#ifdef __ARMEL__
vrev64.8 q0,q0
#endif
sub r0,#16
vst1.64 d1,[r0]! @ write out Xi
vst1.64 d0,[r0]
bx lr @ bx lr
#endif
.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2

View File

@@ -0,0 +1,241 @@
#include <openssl/arm_arch.h>
.text
.code 32
#undef __thumb2__
.globl _gcm_init_v8
.private_extern _gcm_init_v8
#ifdef __thumb2__
.thumb_func _gcm_init_v8
#endif
.align 4
_gcm_init_v8:
vld1.64 {q9},[r1] @ load input H
vmov.i8 q11,#0xe1
vshl.i64 q11,q11,#57 @ 0xc2.0
vext.8 q3,q9,q9,#8
vshr.u64 q10,q11,#63
vdup.32 q9,d18[1]
vext.8 q8,q10,q11,#8 @ t0=0xc2....01
vshr.u64 q10,q3,#63
vshr.s32 q9,q9,#31 @ broadcast carry bit
vand q10,q10,q8
vshl.i64 q3,q3,#1
vext.8 q10,q10,q10,#8
vand q8,q8,q9
vorr q3,q3,q10 @ H<<<=1
veor q12,q3,q8 @ twisted H
vst1.64 {q12},[r0]! @ store Htable[0]
@ calculate H^2
vext.8 q8,q12,q12,#8 @ Karatsuba pre-processing
.byte 0xa8,0x0e,0xa8,0xf2 @ pmull q0,q12,q12
veor q8,q8,q12
.byte 0xa9,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q12
.byte 0xa0,0x2e,0xa0,0xf2 @ pmull q1,q8,q8
vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
veor q10,q0,q2
veor q1,q1,q9
veor q1,q1,q10
.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase
vmov d4,d3 @ Xh|Xm - 256-bit result
vmov d3,d0 @ Xm is rotated Xl
veor q0,q1,q10
vext.8 q10,q0,q0,#8 @ 2nd phase
.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
veor q10,q10,q2
veor q14,q0,q10
vext.8 q9,q14,q14,#8 @ Karatsuba pre-processing
veor q9,q9,q14
vext.8 q13,q8,q9,#8 @ pack Karatsuba pre-processed
vst1.64 {q13,q14},[r0] @ store Htable[1..2]
bx lr
.globl _gcm_gmult_v8
.private_extern _gcm_gmult_v8
#ifdef __thumb2__
.thumb_func _gcm_gmult_v8
#endif
.align 4
_gcm_gmult_v8:
vld1.64 {q9},[r0] @ load Xi
vmov.i8 q11,#0xe1
vld1.64 {q12,q13},[r1] @ load twisted H, ...
vshl.u64 q11,q11,#57
#ifndef __ARMEB__
vrev64.8 q9,q9
#endif
vext.8 q3,q9,q9,#8
.byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo
veor q9,q9,q3 @ Karatsuba pre-processing
.byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi
.byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
veor q10,q0,q2
veor q1,q1,q9
veor q1,q1,q10
.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction
vmov d4,d3 @ Xh|Xm - 256-bit result
vmov d3,d0 @ Xm is rotated Xl
veor q0,q1,q10
vext.8 q10,q0,q0,#8 @ 2nd phase of reduction
.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
veor q10,q10,q2
veor q0,q0,q10
#ifndef __ARMEB__
vrev64.8 q0,q0
#endif
vext.8 q0,q0,q0,#8
vst1.64 {q0},[r0] @ write out Xi
bx lr
.globl _gcm_ghash_v8
.private_extern _gcm_ghash_v8
#ifdef __thumb2__
.thumb_func _gcm_ghash_v8
#endif
.align 4
_gcm_ghash_v8:
vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ 32-bit ABI says so
vld1.64 {q0},[r0] @ load [rotated] Xi
@ "[rotated]" means that
@ loaded value would have
@ to be rotated in order to
@ make it appear as in
@ alorithm specification
subs r3,r3,#32 @ see if r3 is 32 or larger
mov r12,#16 @ r12 is used as post-
@ increment for input pointer;
@ as loop is modulo-scheduled
@ r12 is zeroed just in time
@ to preclude oversteping
@ inp[len], which means that
@ last block[s] are actually
@ loaded twice, but last
@ copy is not processed
vld1.64 {q12,q13},[r1]! @ load twisted H, ..., H^2
vmov.i8 q11,#0xe1
vld1.64 {q14},[r1]
moveq r12,#0 @ is it time to zero r12?
vext.8 q0,q0,q0,#8 @ rotate Xi
vld1.64 {q8},[r2]! @ load [rotated] I[0]
vshl.u64 q11,q11,#57 @ compose 0xc2.0 constant
#ifndef __ARMEB__
vrev64.8 q8,q8
vrev64.8 q0,q0
#endif
vext.8 q3,q8,q8,#8 @ rotate I[0]
blo Lodd_tail_v8 @ r3 was less than 32
vld1.64 {q9},[r2],r12 @ load [rotated] I[1]
#ifndef __ARMEB__
vrev64.8 q9,q9
#endif
vext.8 q7,q9,q9,#8
veor q3,q3,q0 @ I[i]^=Xi
.byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1
veor q9,q9,q7 @ Karatsuba pre-processing
.byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7
b Loop_mod2x_v8
.align 4
Loop_mod2x_v8:
vext.8 q10,q3,q3,#8
subs r3,r3,#32 @ is there more data?
.byte 0x86,0x0e,0xac,0xf2 @ pmull q0,q14,q3 @ H^2.lo·Xi.lo
movlo r12,#0 @ is it time to zero r12?
.byte 0xa2,0xae,0xaa,0xf2 @ pmull q5,q13,q9
veor q10,q10,q3 @ Karatsuba pre-processing
.byte 0x87,0x4e,0xad,0xf2 @ pmull2 q2,q14,q3 @ H^2.hi·Xi.hi
veor q0,q0,q4 @ accumulate
.byte 0xa5,0x2e,0xab,0xf2 @ pmull2 q1,q13,q10 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
vld1.64 {q8},[r2],r12 @ load [rotated] I[i+2]
veor q2,q2,q6
moveq r12,#0 @ is it time to zero r12?
veor q1,q1,q5
vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
veor q10,q0,q2
veor q1,q1,q9
vld1.64 {q9},[r2],r12 @ load [rotated] I[i+3]
#ifndef __ARMEB__
vrev64.8 q8,q8
#endif
veor q1,q1,q10
.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction
#ifndef __ARMEB__
vrev64.8 q9,q9
#endif
vmov d4,d3 @ Xh|Xm - 256-bit result
vmov d3,d0 @ Xm is rotated Xl
vext.8 q7,q9,q9,#8
vext.8 q3,q8,q8,#8
veor q0,q1,q10
.byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1
veor q3,q3,q2 @ accumulate q3 early
vext.8 q10,q0,q0,#8 @ 2nd phase of reduction
.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
veor q3,q3,q10
veor q9,q9,q7 @ Karatsuba pre-processing
veor q3,q3,q0
.byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7
bhs Loop_mod2x_v8 @ there was at least 32 more bytes
veor q2,q2,q10
vext.8 q3,q8,q8,#8 @ re-construct q3
adds r3,r3,#32 @ re-construct r3
veor q0,q0,q2 @ re-construct q0
beq Ldone_v8 @ is r3 zero?
Lodd_tail_v8:
vext.8 q10,q0,q0,#8
veor q3,q3,q0 @ inp^=Xi
veor q9,q8,q10 @ q9 is rotated inp^Xi
.byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo
veor q9,q9,q3 @ Karatsuba pre-processing
.byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi
.byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
veor q10,q0,q2
veor q1,q1,q9
veor q1,q1,q10
.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction
vmov d4,d3 @ Xh|Xm - 256-bit result
vmov d3,d0 @ Xm is rotated Xl
veor q0,q1,q10
vext.8 q10,q0,q0,#8 @ 2nd phase of reduction
.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
veor q10,q10,q2
veor q0,q0,q10
Ldone_v8:
#ifndef __ARMEB__
vrev64.8 q0,q0
#endif
vext.8 q0,q0,q0,#8
vst1.64 {q0},[r0] @ write out Xi
vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ 32-bit ABI says so
bx lr
.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff