Source release 17.1.0
2000  third_party/boringssl/kit/win-aarch64/crypto/chacha/chacha-armv8.S (vendored, new file)
      File diff suppressed because it is too large
813   third_party/boringssl/kit/win-aarch64/crypto/fipsmodule/aesv8-armx64.S (vendored, new file)
@@ -0,0 +1,813 @@
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>

#if __ARM_MAX_ARCH__>=7
.text
.arch armv8-a+crypto
.section .rodata
.align 5
Lrcon:
.long 0x01,0x01,0x01,0x01
.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
.long 0x1b,0x1b,0x1b,0x1b

.text

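Editor's note: the aes_hw_set_encrypt_key code below drives tbl (with the rotate-n-splat mask) and aese against the Lrcon constants above. For orientation only, here is a minimal C sketch of the FIPS-197 AES-128 key-expansion recurrence that this implements in hardware. The sub_word callback (the per-byte S-box substitution) and the word byte order are assumptions of this illustration, not part of the generated file.

#include <stdint.h>

/* Illustrative sketch: AES-128 key expansion (FIPS-197, Nk = 4, 44 words).
   sub_word applies the AES S-box to each byte of a word (caller-supplied);
   rcon is the standard 0x01..0x36 sequence kept at Lrcon above.
   Byte order within words is illustrative (big-endian packing assumed). */
static void aes128_expand_key_sketch(const uint32_t key[4], uint32_t rk[44],
                                     uint32_t (*sub_word)(uint32_t)) {
  static const uint32_t rcon[10] = {0x01, 0x02, 0x04, 0x08, 0x10,
                                    0x20, 0x40, 0x80, 0x1b, 0x36};
  for (int i = 0; i < 4; i++) rk[i] = key[i];
  for (int i = 4; i < 44; i++) {
    uint32_t t = rk[i - 1];
    if (i % 4 == 0) {
      /* RotWord, then SubWord, then XOR in the round constant. */
      t = sub_word((t << 8) | (t >> 24)) ^ rcon[i / 4 - 1];
    }
    rk[i] = rk[i - 4] ^ t;
  }
}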
.globl aes_hw_set_encrypt_key
|
||||
|
||||
.def aes_hw_set_encrypt_key
|
||||
.type 32
|
||||
.endef
|
||||
.align 5
|
||||
aes_hw_set_encrypt_key:
|
||||
Lenc_key:
|
||||
// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
|
||||
AARCH64_VALID_CALL_TARGET
|
||||
stp x29,x30,[sp,#-16]!
|
||||
add x29,sp,#0
|
||||
mov x3,#-1
|
||||
cmp x0,#0
|
||||
b.eq Lenc_key_abort
|
||||
cmp x2,#0
|
||||
b.eq Lenc_key_abort
|
||||
mov x3,#-2
|
||||
cmp w1,#128
|
||||
b.lt Lenc_key_abort
|
||||
cmp w1,#256
|
||||
b.gt Lenc_key_abort
|
||||
tst w1,#0x3f
|
||||
b.ne Lenc_key_abort
|
||||
|
||||
adrp x3,Lrcon
|
||||
add x3,x3,:lo12:Lrcon
|
||||
cmp w1,#192
|
||||
|
||||
eor v0.16b,v0.16b,v0.16b
|
||||
ld1 {v3.16b},[x0],#16
|
||||
mov w1,#8 // reuse w1
|
||||
ld1 {v1.4s,v2.4s},[x3],#32
|
||||
|
||||
b.lt Loop128
|
||||
b.eq L192
|
||||
b L256
|
||||
|
||||
.align 4
|
||||
Loop128:
|
||||
tbl v6.16b,{v3.16b},v2.16b
|
||||
ext v5.16b,v0.16b,v3.16b,#12
|
||||
st1 {v3.4s},[x2],#16
|
||||
aese v6.16b,v0.16b
|
||||
subs w1,w1,#1
|
||||
|
||||
eor v3.16b,v3.16b,v5.16b
|
||||
ext v5.16b,v0.16b,v5.16b,#12
|
||||
eor v3.16b,v3.16b,v5.16b
|
||||
ext v5.16b,v0.16b,v5.16b,#12
|
||||
eor v6.16b,v6.16b,v1.16b
|
||||
eor v3.16b,v3.16b,v5.16b
|
||||
shl v1.16b,v1.16b,#1
|
||||
eor v3.16b,v3.16b,v6.16b
|
||||
b.ne Loop128
|
||||
|
||||
ld1 {v1.4s},[x3]
|
||||
|
||||
tbl v6.16b,{v3.16b},v2.16b
|
||||
ext v5.16b,v0.16b,v3.16b,#12
|
||||
st1 {v3.4s},[x2],#16
|
||||
aese v6.16b,v0.16b
|
||||
|
||||
eor v3.16b,v3.16b,v5.16b
|
||||
ext v5.16b,v0.16b,v5.16b,#12
|
||||
eor v3.16b,v3.16b,v5.16b
|
||||
ext v5.16b,v0.16b,v5.16b,#12
|
||||
eor v6.16b,v6.16b,v1.16b
|
||||
eor v3.16b,v3.16b,v5.16b
|
||||
shl v1.16b,v1.16b,#1
|
||||
eor v3.16b,v3.16b,v6.16b
|
||||
|
||||
tbl v6.16b,{v3.16b},v2.16b
|
||||
ext v5.16b,v0.16b,v3.16b,#12
|
||||
st1 {v3.4s},[x2],#16
|
||||
aese v6.16b,v0.16b
|
||||
|
||||
eor v3.16b,v3.16b,v5.16b
|
||||
ext v5.16b,v0.16b,v5.16b,#12
|
||||
eor v3.16b,v3.16b,v5.16b
|
||||
ext v5.16b,v0.16b,v5.16b,#12
|
||||
eor v6.16b,v6.16b,v1.16b
|
||||
eor v3.16b,v3.16b,v5.16b
|
||||
eor v3.16b,v3.16b,v6.16b
|
||||
st1 {v3.4s},[x2]
|
||||
add x2,x2,#0x50
|
||||
|
||||
mov w12,#10
|
||||
b Ldone
|
||||
|
||||
.align 4
|
||||
L192:
|
||||
ld1 {v4.8b},[x0],#8
|
||||
movi v6.16b,#8 // borrow v6.16b
|
||||
st1 {v3.4s},[x2],#16
|
||||
sub v2.16b,v2.16b,v6.16b // adjust the mask
|
||||
|
||||
Loop192:
|
||||
tbl v6.16b,{v4.16b},v2.16b
|
||||
ext v5.16b,v0.16b,v3.16b,#12
|
||||
st1 {v4.8b},[x2],#8
|
||||
aese v6.16b,v0.16b
|
||||
subs w1,w1,#1
|
||||
|
||||
eor v3.16b,v3.16b,v5.16b
|
||||
ext v5.16b,v0.16b,v5.16b,#12
|
||||
eor v3.16b,v3.16b,v5.16b
|
||||
ext v5.16b,v0.16b,v5.16b,#12
|
||||
eor v3.16b,v3.16b,v5.16b
|
||||
|
||||
dup v5.4s,v3.s[3]
|
||||
eor v5.16b,v5.16b,v4.16b
|
||||
eor v6.16b,v6.16b,v1.16b
|
||||
ext v4.16b,v0.16b,v4.16b,#12
|
||||
shl v1.16b,v1.16b,#1
|
||||
eor v4.16b,v4.16b,v5.16b
|
||||
eor v3.16b,v3.16b,v6.16b
|
||||
eor v4.16b,v4.16b,v6.16b
|
||||
st1 {v3.4s},[x2],#16
|
||||
b.ne Loop192
|
||||
|
||||
mov w12,#12
|
||||
add x2,x2,#0x20
|
||||
b Ldone
|
||||
|
||||
.align 4
|
||||
L256:
|
||||
ld1 {v4.16b},[x0]
|
||||
mov w1,#7
|
||||
mov w12,#14
|
||||
st1 {v3.4s},[x2],#16
|
||||
|
||||
Loop256:
|
||||
tbl v6.16b,{v4.16b},v2.16b
|
||||
ext v5.16b,v0.16b,v3.16b,#12
|
||||
st1 {v4.4s},[x2],#16
|
||||
aese v6.16b,v0.16b
|
||||
subs w1,w1,#1
|
||||
|
||||
eor v3.16b,v3.16b,v5.16b
|
||||
ext v5.16b,v0.16b,v5.16b,#12
|
||||
eor v3.16b,v3.16b,v5.16b
|
||||
ext v5.16b,v0.16b,v5.16b,#12
|
||||
eor v6.16b,v6.16b,v1.16b
|
||||
eor v3.16b,v3.16b,v5.16b
|
||||
shl v1.16b,v1.16b,#1
|
||||
eor v3.16b,v3.16b,v6.16b
|
||||
st1 {v3.4s},[x2],#16
|
||||
b.eq Ldone
|
||||
|
||||
dup v6.4s,v3.s[3] // just splat
|
||||
ext v5.16b,v0.16b,v4.16b,#12
|
||||
aese v6.16b,v0.16b
|
||||
|
||||
eor v4.16b,v4.16b,v5.16b
|
||||
ext v5.16b,v0.16b,v5.16b,#12
|
||||
eor v4.16b,v4.16b,v5.16b
|
||||
ext v5.16b,v0.16b,v5.16b,#12
|
||||
eor v4.16b,v4.16b,v5.16b
|
||||
|
||||
eor v4.16b,v4.16b,v6.16b
|
||||
b Loop256
|
||||
|
||||
Ldone:
|
||||
str w12,[x2]
|
||||
mov x3,#0
|
||||
|
||||
Lenc_key_abort:
|
||||
mov x0,x3 // return value
|
||||
ldr x29,[sp],#16
|
||||
ret
|
||||
|
||||
|
||||
.globl aes_hw_set_decrypt_key
|
||||
|
||||
.def aes_hw_set_decrypt_key
|
||||
.type 32
|
||||
.endef
|
||||
.align 5
|
||||
aes_hw_set_decrypt_key:
|
||||
AARCH64_SIGN_LINK_REGISTER
|
||||
stp x29,x30,[sp,#-16]!
|
||||
add x29,sp,#0
|
||||
bl Lenc_key
|
||||
|
||||
cmp x0,#0
|
||||
b.ne Ldec_key_abort
|
||||
|
||||
sub x2,x2,#240 // restore original x2
|
||||
mov x4,#-16
|
||||
add x0,x2,x12,lsl#4 // end of key schedule
|
||||
|
||||
ld1 {v0.4s},[x2]
|
||||
ld1 {v1.4s},[x0]
|
||||
st1 {v0.4s},[x0],x4
|
||||
st1 {v1.4s},[x2],#16
|
||||
|
||||
Loop_imc:
|
||||
ld1 {v0.4s},[x2]
|
||||
ld1 {v1.4s},[x0]
|
||||
aesimc v0.16b,v0.16b
|
||||
aesimc v1.16b,v1.16b
|
||||
st1 {v0.4s},[x0],x4
|
||||
st1 {v1.4s},[x2],#16
|
||||
cmp x0,x2
|
||||
b.hi Loop_imc
|
||||
|
||||
ld1 {v0.4s},[x2]
|
||||
aesimc v0.16b,v0.16b
|
||||
st1 {v0.4s},[x0]
|
||||
|
||||
eor x0,x0,x0 // return value
|
||||
Ldec_key_abort:
|
||||
ldp x29,x30,[sp],#16
|
||||
AARCH64_VALIDATE_LINK_REGISTER
|
||||
ret
|
||||
|
||||
.globl aes_hw_encrypt
|
||||
|
||||
.def aes_hw_encrypt
|
||||
.type 32
|
||||
.endef
|
||||
.align 5
|
||||
aes_hw_encrypt:
|
||||
AARCH64_VALID_CALL_TARGET
|
||||
ldr w3,[x2,#240]
|
||||
ld1 {v0.4s},[x2],#16
|
||||
ld1 {v2.16b},[x0]
|
||||
sub w3,w3,#2
|
||||
ld1 {v1.4s},[x2],#16
|
||||
|
||||
Loop_enc:
|
||||
aese v2.16b,v0.16b
|
||||
aesmc v2.16b,v2.16b
|
||||
ld1 {v0.4s},[x2],#16
|
||||
subs w3,w3,#2
|
||||
aese v2.16b,v1.16b
|
||||
aesmc v2.16b,v2.16b
|
||||
ld1 {v1.4s},[x2],#16
|
||||
b.gt Loop_enc
|
||||
|
||||
aese v2.16b,v0.16b
|
||||
aesmc v2.16b,v2.16b
|
||||
ld1 {v0.4s},[x2]
|
||||
aese v2.16b,v1.16b
|
||||
eor v2.16b,v2.16b,v0.16b
|
||||
|
||||
st1 {v2.16b},[x1]
|
||||
ret
|
||||
|
||||
.globl aes_hw_decrypt
|
||||
|
||||
.def aes_hw_decrypt
|
||||
.type 32
|
||||
.endef
|
||||
.align 5
|
||||
aes_hw_decrypt:
|
||||
AARCH64_VALID_CALL_TARGET
|
||||
ldr w3,[x2,#240]
|
||||
ld1 {v0.4s},[x2],#16
|
||||
ld1 {v2.16b},[x0]
|
||||
sub w3,w3,#2
|
||||
ld1 {v1.4s},[x2],#16
|
||||
|
||||
Loop_dec:
|
||||
aesd v2.16b,v0.16b
|
||||
aesimc v2.16b,v2.16b
|
||||
ld1 {v0.4s},[x2],#16
|
||||
subs w3,w3,#2
|
||||
aesd v2.16b,v1.16b
|
||||
aesimc v2.16b,v2.16b
|
||||
ld1 {v1.4s},[x2],#16
|
||||
b.gt Loop_dec
|
||||
|
||||
aesd v2.16b,v0.16b
|
||||
aesimc v2.16b,v2.16b
|
||||
ld1 {v0.4s},[x2]
|
||||
aesd v2.16b,v1.16b
|
||||
eor v2.16b,v2.16b,v0.16b
|
||||
|
||||
st1 {v2.16b},[x1]
|
||||
ret
|
||||
|
||||
.globl aes_hw_cbc_encrypt
|
||||
|
||||
.def aes_hw_cbc_encrypt
|
||||
.type 32
|
||||
.endef
|
||||
.align 5
|
||||
aes_hw_cbc_encrypt:
|
||||
// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
|
||||
AARCH64_VALID_CALL_TARGET
|
||||
stp x29,x30,[sp,#-16]!
|
||||
add x29,sp,#0
|
||||
subs x2,x2,#16
|
||||
mov x8,#16
|
||||
b.lo Lcbc_abort
|
||||
csel x8,xzr,x8,eq
|
||||
|
||||
cmp w5,#0 // en- or decrypting?
|
||||
ldr w5,[x3,#240]
|
||||
and x2,x2,#-16
|
||||
ld1 {v6.16b},[x4]
|
||||
ld1 {v0.16b},[x0],x8
|
||||
|
||||
ld1 {v16.4s,v17.4s},[x3] // load key schedule...
|
||||
sub w5,w5,#6
|
||||
add x7,x3,x5,lsl#4 // pointer to last 7 round keys
|
||||
sub w5,w5,#2
|
||||
ld1 {v18.4s,v19.4s},[x7],#32
|
||||
ld1 {v20.4s,v21.4s},[x7],#32
|
||||
ld1 {v22.4s,v23.4s},[x7],#32
|
||||
ld1 {v7.4s},[x7]
|
||||
|
||||
add x7,x3,#32
|
||||
mov w6,w5
|
||||
b.eq Lcbc_dec
|
||||
|
||||
cmp w5,#2
|
||||
eor v0.16b,v0.16b,v6.16b
|
||||
eor v5.16b,v16.16b,v7.16b
|
||||
b.eq Lcbc_enc128
|
||||
|
||||
ld1 {v2.4s,v3.4s},[x7]
|
||||
add x7,x3,#16
|
||||
add x6,x3,#16*4
|
||||
add x12,x3,#16*5
|
||||
aese v0.16b,v16.16b
|
||||
aesmc v0.16b,v0.16b
|
||||
add x14,x3,#16*6
|
||||
add x3,x3,#16*7
|
||||
b Lenter_cbc_enc
|
||||
|
||||
.align 4
|
||||
Loop_cbc_enc:
|
||||
aese v0.16b,v16.16b
|
||||
aesmc v0.16b,v0.16b
|
||||
st1 {v6.16b},[x1],#16
|
||||
Lenter_cbc_enc:
|
||||
aese v0.16b,v17.16b
|
||||
aesmc v0.16b,v0.16b
|
||||
aese v0.16b,v2.16b
|
||||
aesmc v0.16b,v0.16b
|
||||
ld1 {v16.4s},[x6]
|
||||
cmp w5,#4
|
||||
aese v0.16b,v3.16b
|
||||
aesmc v0.16b,v0.16b
|
||||
ld1 {v17.4s},[x12]
|
||||
b.eq Lcbc_enc192
|
||||
|
||||
aese v0.16b,v16.16b
|
||||
aesmc v0.16b,v0.16b
|
||||
ld1 {v16.4s},[x14]
|
||||
aese v0.16b,v17.16b
|
||||
aesmc v0.16b,v0.16b
|
||||
ld1 {v17.4s},[x3]
|
||||
nop
|
||||
|
||||
Lcbc_enc192:
|
||||
aese v0.16b,v16.16b
|
||||
aesmc v0.16b,v0.16b
|
||||
subs x2,x2,#16
|
||||
aese v0.16b,v17.16b
|
||||
aesmc v0.16b,v0.16b
|
||||
csel x8,xzr,x8,eq
|
||||
aese v0.16b,v18.16b
|
||||
aesmc v0.16b,v0.16b
|
||||
aese v0.16b,v19.16b
|
||||
aesmc v0.16b,v0.16b
|
||||
ld1 {v16.16b},[x0],x8
|
||||
aese v0.16b,v20.16b
|
||||
aesmc v0.16b,v0.16b
|
||||
eor v16.16b,v16.16b,v5.16b
|
||||
aese v0.16b,v21.16b
|
||||
aesmc v0.16b,v0.16b
|
||||
ld1 {v17.4s},[x7] // re-pre-load rndkey[1]
|
||||
aese v0.16b,v22.16b
|
||||
aesmc v0.16b,v0.16b
|
||||
aese v0.16b,v23.16b
|
||||
eor v6.16b,v0.16b,v7.16b
|
||||
b.hs Loop_cbc_enc
|
||||
|
||||
st1 {v6.16b},[x1],#16
|
||||
b Lcbc_done
|
||||
|
||||
.align 5
|
||||
Lcbc_enc128:
|
||||
ld1 {v2.4s,v3.4s},[x7]
|
||||
aese v0.16b,v16.16b
|
||||
aesmc v0.16b,v0.16b
|
||||
b Lenter_cbc_enc128
|
||||
Loop_cbc_enc128:
|
||||
aese v0.16b,v16.16b
|
||||
aesmc v0.16b,v0.16b
|
||||
st1 {v6.16b},[x1],#16
|
||||
Lenter_cbc_enc128:
|
||||
aese v0.16b,v17.16b
|
||||
aesmc v0.16b,v0.16b
|
||||
subs x2,x2,#16
|
||||
aese v0.16b,v2.16b
|
||||
aesmc v0.16b,v0.16b
|
||||
csel x8,xzr,x8,eq
|
||||
aese v0.16b,v3.16b
|
||||
aesmc v0.16b,v0.16b
|
||||
aese v0.16b,v18.16b
|
||||
aesmc v0.16b,v0.16b
|
||||
aese v0.16b,v19.16b
|
||||
aesmc v0.16b,v0.16b
|
||||
ld1 {v16.16b},[x0],x8
|
||||
aese v0.16b,v20.16b
|
||||
aesmc v0.16b,v0.16b
|
||||
aese v0.16b,v21.16b
|
||||
aesmc v0.16b,v0.16b
|
||||
aese v0.16b,v22.16b
|
||||
aesmc v0.16b,v0.16b
|
||||
eor v16.16b,v16.16b,v5.16b
|
||||
aese v0.16b,v23.16b
|
||||
eor v6.16b,v0.16b,v7.16b
|
||||
b.hs Loop_cbc_enc128
|
||||
|
||||
st1 {v6.16b},[x1],#16
|
||||
b Lcbc_done
|
||||
.align 5
|
||||
Lcbc_dec:
|
||||
ld1 {v18.16b},[x0],#16
|
||||
subs x2,x2,#32 // bias
|
||||
add w6,w5,#2
|
||||
orr v3.16b,v0.16b,v0.16b
|
||||
orr v1.16b,v0.16b,v0.16b
|
||||
orr v19.16b,v18.16b,v18.16b
|
||||
b.lo Lcbc_dec_tail
|
||||
|
||||
orr v1.16b,v18.16b,v18.16b
|
||||
ld1 {v18.16b},[x0],#16
|
||||
orr v2.16b,v0.16b,v0.16b
|
||||
orr v3.16b,v1.16b,v1.16b
|
||||
orr v19.16b,v18.16b,v18.16b
|
||||
|
||||
Loop3x_cbc_dec:
|
||||
aesd v0.16b,v16.16b
|
||||
aesimc v0.16b,v0.16b
|
||||
aesd v1.16b,v16.16b
|
||||
aesimc v1.16b,v1.16b
|
||||
aesd v18.16b,v16.16b
|
||||
aesimc v18.16b,v18.16b
|
||||
ld1 {v16.4s},[x7],#16
|
||||
subs w6,w6,#2
|
||||
aesd v0.16b,v17.16b
|
||||
aesimc v0.16b,v0.16b
|
||||
aesd v1.16b,v17.16b
|
||||
aesimc v1.16b,v1.16b
|
||||
aesd v18.16b,v17.16b
|
||||
aesimc v18.16b,v18.16b
|
||||
ld1 {v17.4s},[x7],#16
|
||||
b.gt Loop3x_cbc_dec
|
||||
|
||||
aesd v0.16b,v16.16b
|
||||
aesimc v0.16b,v0.16b
|
||||
aesd v1.16b,v16.16b
|
||||
aesimc v1.16b,v1.16b
|
||||
aesd v18.16b,v16.16b
|
||||
aesimc v18.16b,v18.16b
|
||||
eor v4.16b,v6.16b,v7.16b
|
||||
subs x2,x2,#0x30
|
||||
eor v5.16b,v2.16b,v7.16b
|
||||
csel x6,x2,x6,lo // x6, w6, is zero at this point
|
||||
aesd v0.16b,v17.16b
|
||||
aesimc v0.16b,v0.16b
|
||||
aesd v1.16b,v17.16b
|
||||
aesimc v1.16b,v1.16b
|
||||
aesd v18.16b,v17.16b
|
||||
aesimc v18.16b,v18.16b
|
||||
eor v17.16b,v3.16b,v7.16b
|
||||
add x0,x0,x6 // x0 is adjusted in such way that
|
||||
// at exit from the loop v1.16b-v18.16b
|
||||
// are loaded with last "words"
|
||||
orr v6.16b,v19.16b,v19.16b
|
||||
mov x7,x3
|
||||
aesd v0.16b,v20.16b
|
||||
aesimc v0.16b,v0.16b
|
||||
aesd v1.16b,v20.16b
|
||||
aesimc v1.16b,v1.16b
|
||||
aesd v18.16b,v20.16b
|
||||
aesimc v18.16b,v18.16b
|
||||
ld1 {v2.16b},[x0],#16
|
||||
aesd v0.16b,v21.16b
|
||||
aesimc v0.16b,v0.16b
|
||||
aesd v1.16b,v21.16b
|
||||
aesimc v1.16b,v1.16b
|
||||
aesd v18.16b,v21.16b
|
||||
aesimc v18.16b,v18.16b
|
||||
ld1 {v3.16b},[x0],#16
|
||||
aesd v0.16b,v22.16b
|
||||
aesimc v0.16b,v0.16b
|
||||
aesd v1.16b,v22.16b
|
||||
aesimc v1.16b,v1.16b
|
||||
aesd v18.16b,v22.16b
|
||||
aesimc v18.16b,v18.16b
|
||||
ld1 {v19.16b},[x0],#16
|
||||
aesd v0.16b,v23.16b
|
||||
aesd v1.16b,v23.16b
|
||||
aesd v18.16b,v23.16b
|
||||
ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
|
||||
add w6,w5,#2
|
||||
eor v4.16b,v4.16b,v0.16b
|
||||
eor v5.16b,v5.16b,v1.16b
|
||||
eor v18.16b,v18.16b,v17.16b
|
||||
ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
|
||||
st1 {v4.16b},[x1],#16
|
||||
orr v0.16b,v2.16b,v2.16b
|
||||
st1 {v5.16b},[x1],#16
|
||||
orr v1.16b,v3.16b,v3.16b
|
||||
st1 {v18.16b},[x1],#16
|
||||
orr v18.16b,v19.16b,v19.16b
|
||||
b.hs Loop3x_cbc_dec
|
||||
|
||||
cmn x2,#0x30
|
||||
b.eq Lcbc_done
|
||||
nop
|
||||
|
||||
Lcbc_dec_tail:
|
||||
aesd v1.16b,v16.16b
|
||||
aesimc v1.16b,v1.16b
|
||||
aesd v18.16b,v16.16b
|
||||
aesimc v18.16b,v18.16b
|
||||
ld1 {v16.4s},[x7],#16
|
||||
subs w6,w6,#2
|
||||
aesd v1.16b,v17.16b
|
||||
aesimc v1.16b,v1.16b
|
||||
aesd v18.16b,v17.16b
|
||||
aesimc v18.16b,v18.16b
|
||||
ld1 {v17.4s},[x7],#16
|
||||
b.gt Lcbc_dec_tail
|
||||
|
||||
aesd v1.16b,v16.16b
|
||||
aesimc v1.16b,v1.16b
|
||||
aesd v18.16b,v16.16b
|
||||
aesimc v18.16b,v18.16b
|
||||
aesd v1.16b,v17.16b
|
||||
aesimc v1.16b,v1.16b
|
||||
aesd v18.16b,v17.16b
|
||||
aesimc v18.16b,v18.16b
|
||||
aesd v1.16b,v20.16b
|
||||
aesimc v1.16b,v1.16b
|
||||
aesd v18.16b,v20.16b
|
||||
aesimc v18.16b,v18.16b
|
||||
cmn x2,#0x20
|
||||
aesd v1.16b,v21.16b
|
||||
aesimc v1.16b,v1.16b
|
||||
aesd v18.16b,v21.16b
|
||||
aesimc v18.16b,v18.16b
|
||||
eor v5.16b,v6.16b,v7.16b
|
||||
aesd v1.16b,v22.16b
|
||||
aesimc v1.16b,v1.16b
|
||||
aesd v18.16b,v22.16b
|
||||
aesimc v18.16b,v18.16b
|
||||
eor v17.16b,v3.16b,v7.16b
|
||||
aesd v1.16b,v23.16b
|
||||
aesd v18.16b,v23.16b
|
||||
b.eq Lcbc_dec_one
|
||||
eor v5.16b,v5.16b,v1.16b
|
||||
eor v17.16b,v17.16b,v18.16b
|
||||
orr v6.16b,v19.16b,v19.16b
|
||||
st1 {v5.16b},[x1],#16
|
||||
st1 {v17.16b},[x1],#16
|
||||
b Lcbc_done
|
||||
|
||||
Lcbc_dec_one:
|
||||
eor v5.16b,v5.16b,v18.16b
|
||||
orr v6.16b,v19.16b,v19.16b
|
||||
st1 {v5.16b},[x1],#16
|
||||
|
||||
Lcbc_done:
|
||||
st1 {v6.16b},[x4]
|
||||
Lcbc_abort:
|
||||
ldr x29,[sp],#16
|
||||
ret
|
||||
|
||||
.globl aes_hw_ctr32_encrypt_blocks
|
||||
|
||||
.def aes_hw_ctr32_encrypt_blocks
|
||||
.type 32
|
||||
.endef
|
||||
.align 5
|
||||
aes_hw_ctr32_encrypt_blocks:
|
||||
// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
|
||||
AARCH64_VALID_CALL_TARGET
|
||||
stp x29,x30,[sp,#-16]!
|
||||
add x29,sp,#0
|
||||
ldr w5,[x3,#240]
|
||||
|
||||
ldr w8, [x4, #12]
|
||||
ld1 {v0.4s},[x4]
|
||||
|
||||
ld1 {v16.4s,v17.4s},[x3] // load key schedule...
|
||||
sub w5,w5,#4
|
||||
mov x12,#16
|
||||
cmp x2,#2
|
||||
add x7,x3,x5,lsl#4 // pointer to last 5 round keys
|
||||
sub w5,w5,#2
|
||||
ld1 {v20.4s,v21.4s},[x7],#32
|
||||
ld1 {v22.4s,v23.4s},[x7],#32
|
||||
ld1 {v7.4s},[x7]
|
||||
add x7,x3,#32
|
||||
mov w6,w5
|
||||
csel x12,xzr,x12,lo

// ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
// affected by silicon errata #1742098 [0] and #1655431 [1],
// respectively, where the second instruction of an aese/aesmc
// instruction pair may execute twice if an interrupt is taken right
// after the first instruction consumes an input register of which a
// single 32-bit lane has been updated the last time it was modified.
//
// This function uses a counter in one 32-bit lane. The vmov lines
// could write to v1.16b and v18.16b directly, but that trips these bugs.
// We write to v6.16b and copy to the final register as a workaround.
//
// [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
// [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
#ifndef __ARMEB__
rev w8, w8
#endif
add w10, w8, #1
|
||||
orr v6.16b,v0.16b,v0.16b
|
||||
rev w10, w10
|
||||
mov v6.s[3],w10
|
||||
add w8, w8, #2
|
||||
orr v1.16b,v6.16b,v6.16b
|
||||
b.ls Lctr32_tail
|
||||
rev w12, w8
|
||||
mov v6.s[3],w12
|
||||
sub x2,x2,#3 // bias
|
||||
orr v18.16b,v6.16b,v6.16b
|
||||
b Loop3x_ctr32
|
||||
|
||||
.align 4
|
||||
Loop3x_ctr32:
|
||||
aese v0.16b,v16.16b
|
||||
aesmc v0.16b,v0.16b
|
||||
aese v1.16b,v16.16b
|
||||
aesmc v1.16b,v1.16b
|
||||
aese v18.16b,v16.16b
|
||||
aesmc v18.16b,v18.16b
|
||||
ld1 {v16.4s},[x7],#16
|
||||
subs w6,w6,#2
|
||||
aese v0.16b,v17.16b
|
||||
aesmc v0.16b,v0.16b
|
||||
aese v1.16b,v17.16b
|
||||
aesmc v1.16b,v1.16b
|
||||
aese v18.16b,v17.16b
|
||||
aesmc v18.16b,v18.16b
|
||||
ld1 {v17.4s},[x7],#16
|
||||
b.gt Loop3x_ctr32
|
||||
|
||||
aese v0.16b,v16.16b
|
||||
aesmc v4.16b,v0.16b
|
||||
aese v1.16b,v16.16b
|
||||
aesmc v5.16b,v1.16b
|
||||
ld1 {v2.16b},[x0],#16
|
||||
add w9,w8,#1
|
||||
aese v18.16b,v16.16b
|
||||
aesmc v18.16b,v18.16b
|
||||
ld1 {v3.16b},[x0],#16
|
||||
rev w9,w9
|
||||
aese v4.16b,v17.16b
|
||||
aesmc v4.16b,v4.16b
|
||||
aese v5.16b,v17.16b
|
||||
aesmc v5.16b,v5.16b
|
||||
ld1 {v19.16b},[x0],#16
|
||||
mov x7,x3
|
||||
aese v18.16b,v17.16b
|
||||
aesmc v17.16b,v18.16b
|
||||
aese v4.16b,v20.16b
|
||||
aesmc v4.16b,v4.16b
|
||||
aese v5.16b,v20.16b
|
||||
aesmc v5.16b,v5.16b
|
||||
eor v2.16b,v2.16b,v7.16b
|
||||
add w10,w8,#2
|
||||
aese v17.16b,v20.16b
|
||||
aesmc v17.16b,v17.16b
|
||||
eor v3.16b,v3.16b,v7.16b
|
||||
add w8,w8,#3
|
||||
aese v4.16b,v21.16b
|
||||
aesmc v4.16b,v4.16b
|
||||
aese v5.16b,v21.16b
|
||||
aesmc v5.16b,v5.16b
|
||||
// Note the logic to update v0.16b, v1.16b, and v1.16b is written to work
|
||||
// around a bug in ARM Cortex-A57 and Cortex-A72 cores running in
|
||||
// 32-bit mode. See the comment above.
|
||||
eor v19.16b,v19.16b,v7.16b
|
||||
mov v6.s[3], w9
|
||||
aese v17.16b,v21.16b
|
||||
aesmc v17.16b,v17.16b
|
||||
orr v0.16b,v6.16b,v6.16b
|
||||
rev w10,w10
|
||||
aese v4.16b,v22.16b
|
||||
aesmc v4.16b,v4.16b
|
||||
mov v6.s[3], w10
|
||||
rev w12,w8
|
||||
aese v5.16b,v22.16b
|
||||
aesmc v5.16b,v5.16b
|
||||
orr v1.16b,v6.16b,v6.16b
|
||||
mov v6.s[3], w12
|
||||
aese v17.16b,v22.16b
|
||||
aesmc v17.16b,v17.16b
|
||||
orr v18.16b,v6.16b,v6.16b
|
||||
subs x2,x2,#3
|
||||
aese v4.16b,v23.16b
|
||||
aese v5.16b,v23.16b
|
||||
aese v17.16b,v23.16b
|
||||
|
||||
eor v2.16b,v2.16b,v4.16b
|
||||
ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
|
||||
st1 {v2.16b},[x1],#16
|
||||
eor v3.16b,v3.16b,v5.16b
|
||||
mov w6,w5
|
||||
st1 {v3.16b},[x1],#16
|
||||
eor v19.16b,v19.16b,v17.16b
|
||||
ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
|
||||
st1 {v19.16b},[x1],#16
|
||||
b.hs Loop3x_ctr32
|
||||
|
||||
adds x2,x2,#3
|
||||
b.eq Lctr32_done
|
||||
cmp x2,#1
|
||||
mov x12,#16
|
||||
csel x12,xzr,x12,eq
|
||||
|
||||
Lctr32_tail:
|
||||
aese v0.16b,v16.16b
|
||||
aesmc v0.16b,v0.16b
|
||||
aese v1.16b,v16.16b
|
||||
aesmc v1.16b,v1.16b
|
||||
ld1 {v16.4s},[x7],#16
|
||||
subs w6,w6,#2
|
||||
aese v0.16b,v17.16b
|
||||
aesmc v0.16b,v0.16b
|
||||
aese v1.16b,v17.16b
|
||||
aesmc v1.16b,v1.16b
|
||||
ld1 {v17.4s},[x7],#16
|
||||
b.gt Lctr32_tail
|
||||
|
||||
aese v0.16b,v16.16b
|
||||
aesmc v0.16b,v0.16b
|
||||
aese v1.16b,v16.16b
|
||||
aesmc v1.16b,v1.16b
|
||||
aese v0.16b,v17.16b
|
||||
aesmc v0.16b,v0.16b
|
||||
aese v1.16b,v17.16b
|
||||
aesmc v1.16b,v1.16b
|
||||
ld1 {v2.16b},[x0],x12
|
||||
aese v0.16b,v20.16b
|
||||
aesmc v0.16b,v0.16b
|
||||
aese v1.16b,v20.16b
|
||||
aesmc v1.16b,v1.16b
|
||||
ld1 {v3.16b},[x0]
|
||||
aese v0.16b,v21.16b
|
||||
aesmc v0.16b,v0.16b
|
||||
aese v1.16b,v21.16b
|
||||
aesmc v1.16b,v1.16b
|
||||
eor v2.16b,v2.16b,v7.16b
|
||||
aese v0.16b,v22.16b
|
||||
aesmc v0.16b,v0.16b
|
||||
aese v1.16b,v22.16b
|
||||
aesmc v1.16b,v1.16b
|
||||
eor v3.16b,v3.16b,v7.16b
|
||||
aese v0.16b,v23.16b
|
||||
aese v1.16b,v23.16b
|
||||
|
||||
cmp x2,#1
|
||||
eor v2.16b,v2.16b,v0.16b
|
||||
eor v3.16b,v3.16b,v1.16b
|
||||
st1 {v2.16b},[x1],#16
|
||||
b.eq Lctr32_done
|
||||
st1 {v3.16b},[x1]
|
||||
|
||||
Lctr32_done:
|
||||
ldr x29,[sp],#16
|
||||
ret
|
||||
|
||||
#endif
|
||||
#endif
|
||||
#endif // !OPENSSL_NO_ASM
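Editor's note: aes_hw_ctr32_encrypt_blocks above keeps its counter in the last 32-bit lane of the IV, big-endian, and the rev/add/mov-lane sequence simply increments it once per block. A minimal C sketch of that CTR discipline; encrypt_block is a hypothetical single-block AES primitive supplied by the caller, not a real API from this file.

#include <stdint.h>
#include <stddef.h>

/* Sketch of CTR mode with a 32-bit big-endian counter in ivec[12..15]. */
static void ctr32_encrypt_sketch(const uint8_t *in, uint8_t *out, size_t blocks,
                                 uint8_t ivec[16],
                                 void (*encrypt_block)(const uint8_t in[16],
                                                       uint8_t out[16])) {
  uint32_t ctr = ((uint32_t)ivec[12] << 24) | ((uint32_t)ivec[13] << 16) |
                 ((uint32_t)ivec[14] << 8) | (uint32_t)ivec[15];
  for (size_t i = 0; i < blocks; i++) {
    uint8_t keystream[16];
    encrypt_block(ivec, keystream);       /* E_K(nonce || counter) */
    for (int j = 0; j < 16; j++) {
      out[16 * i + j] = in[16 * i + j] ^ keystream[j];
    }
    ctr++;                                /* bump the 32-bit counter... */
    ivec[12] = (uint8_t)(ctr >> 24);      /* ...and store it back big-endian */
    ivec[13] = (uint8_t)(ctr >> 16);
    ivec[14] = (uint8_t)(ctr >> 8);
    ivec[15] = (uint8_t)ctr;
  }
}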
1441  third_party/boringssl/kit/win-aarch64/crypto/fipsmodule/armv8-mont.S (vendored, new file)
      File diff suppressed because it is too large
351   third_party/boringssl/kit/win-aarch64/crypto/fipsmodule/ghash-neon-armv8.S (vendored, new file)
@@ -0,0 +1,351 @@
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>

.text

.globl gcm_init_neon
|
||||
|
||||
.def gcm_init_neon
|
||||
.type 32
|
||||
.endef
|
||||
.align 4
|
||||
gcm_init_neon:
|
||||
AARCH64_VALID_CALL_TARGET
|
||||
// This function is adapted from gcm_init_v8. xC2 is t3.
|
||||
ld1 {v17.2d}, [x1] // load H
|
||||
movi v19.16b, #0xe1
|
||||
shl v19.2d, v19.2d, #57 // 0xc2.0
|
||||
ext v3.16b, v17.16b, v17.16b, #8
|
||||
ushr v18.2d, v19.2d, #63
|
||||
dup v17.4s, v17.s[1]
|
||||
ext v16.16b, v18.16b, v19.16b, #8 // t0=0xc2....01
|
||||
ushr v18.2d, v3.2d, #63
|
||||
sshr v17.4s, v17.4s, #31 // broadcast carry bit
|
||||
and v18.16b, v18.16b, v16.16b
|
||||
shl v3.2d, v3.2d, #1
|
||||
ext v18.16b, v18.16b, v18.16b, #8
|
||||
and v16.16b, v16.16b, v17.16b
|
||||
orr v3.16b, v3.16b, v18.16b // H<<<=1
|
||||
eor v5.16b, v3.16b, v16.16b // twisted H
|
||||
st1 {v5.2d}, [x0] // store Htable[0]
|
||||
ret
|
||||
|
||||
|
||||
.globl gcm_gmult_neon
|
||||
|
||||
.def gcm_gmult_neon
|
||||
.type 32
|
||||
.endef
|
||||
.align 4
|
||||
gcm_gmult_neon:
|
||||
AARCH64_VALID_CALL_TARGET
|
||||
ld1 {v3.16b}, [x0] // load Xi
|
||||
ld1 {v5.1d}, [x1], #8 // load twisted H
|
||||
ld1 {v6.1d}, [x1]
|
||||
adrp x9, Lmasks // load constants
|
||||
add x9, x9, :lo12:Lmasks
|
||||
ld1 {v24.2d, v25.2d}, [x9]
|
||||
rev64 v3.16b, v3.16b // byteswap Xi
|
||||
ext v3.16b, v3.16b, v3.16b, #8
|
||||
eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing
|
||||
|
||||
mov x3, #16
|
||||
b Lgmult_neon
|
||||
|
||||
|
||||
.globl gcm_ghash_neon
|
||||
|
||||
.def gcm_ghash_neon
|
||||
.type 32
|
||||
.endef
|
||||
.align 4
|
||||
gcm_ghash_neon:
|
||||
AARCH64_VALID_CALL_TARGET
|
||||
ld1 {v0.16b}, [x0] // load Xi
|
||||
ld1 {v5.1d}, [x1], #8 // load twisted H
|
||||
ld1 {v6.1d}, [x1]
|
||||
adrp x9, Lmasks // load constants
|
||||
add x9, x9, :lo12:Lmasks
|
||||
ld1 {v24.2d, v25.2d}, [x9]
|
||||
rev64 v0.16b, v0.16b // byteswap Xi
|
||||
ext v0.16b, v0.16b, v0.16b, #8
|
||||
eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing
|
||||
|
||||
Loop_neon:
|
||||
ld1 {v3.16b}, [x2], #16 // load inp
|
||||
rev64 v3.16b, v3.16b // byteswap inp
|
||||
ext v3.16b, v3.16b, v3.16b, #8
|
||||
eor v3.16b, v3.16b, v0.16b // inp ^= Xi
|
||||
|
||||
Lgmult_neon:
|
||||
// Split the input into v3 and v4. (The upper halves are unused,
|
||||
// so it is okay to leave them alone.)
|
||||
ins v4.d[0], v3.d[1]
|
||||
ext v16.8b, v5.8b, v5.8b, #1 // A1
|
||||
pmull v16.8h, v16.8b, v3.8b // F = A1*B
|
||||
ext v0.8b, v3.8b, v3.8b, #1 // B1
|
||||
pmull v0.8h, v5.8b, v0.8b // E = A*B1
|
||||
ext v17.8b, v5.8b, v5.8b, #2 // A2
|
||||
pmull v17.8h, v17.8b, v3.8b // H = A2*B
|
||||
ext v19.8b, v3.8b, v3.8b, #2 // B2
|
||||
pmull v19.8h, v5.8b, v19.8b // G = A*B2
|
||||
ext v18.8b, v5.8b, v5.8b, #3 // A3
|
||||
eor v16.16b, v16.16b, v0.16b // L = E + F
|
||||
pmull v18.8h, v18.8b, v3.8b // J = A3*B
|
||||
ext v0.8b, v3.8b, v3.8b, #3 // B3
|
||||
eor v17.16b, v17.16b, v19.16b // M = G + H
|
||||
pmull v0.8h, v5.8b, v0.8b // I = A*B3
|
||||
|
||||
// Here we diverge from the 32-bit version. It computes the following
|
||||
// (instructions reordered for clarity):
|
||||
//
|
||||
// veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
|
||||
// vand $t0#hi, $t0#hi, $k48
|
||||
// veor $t0#lo, $t0#lo, $t0#hi
|
||||
//
|
||||
// veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
|
||||
// vand $t1#hi, $t1#hi, $k32
|
||||
// veor $t1#lo, $t1#lo, $t1#hi
|
||||
//
|
||||
// veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
|
||||
// vand $t2#hi, $t2#hi, $k16
|
||||
// veor $t2#lo, $t2#lo, $t2#hi
|
||||
//
|
||||
// veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
|
||||
// vmov.i64 $t3#hi, #0
|
||||
//
|
||||
// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
|
||||
// upper halves of SIMD registers, so we must split each half into
|
||||
// separate registers. To compensate, we pair computations up and
|
||||
// parallelize.
|
||||
|
||||
ext v19.8b, v3.8b, v3.8b, #4 // B4
|
||||
eor v18.16b, v18.16b, v0.16b // N = I + J
|
||||
pmull v19.8h, v5.8b, v19.8b // K = A*B4
|
||||
|
||||
// This can probably be scheduled more efficiently. For now, we just
|
||||
// pair up independent instructions.
|
||||
zip1 v20.2d, v16.2d, v17.2d
|
||||
zip1 v22.2d, v18.2d, v19.2d
|
||||
zip2 v21.2d, v16.2d, v17.2d
|
||||
zip2 v23.2d, v18.2d, v19.2d
|
||||
eor v20.16b, v20.16b, v21.16b
|
||||
eor v22.16b, v22.16b, v23.16b
|
||||
and v21.16b, v21.16b, v24.16b
|
||||
and v23.16b, v23.16b, v25.16b
|
||||
eor v20.16b, v20.16b, v21.16b
|
||||
eor v22.16b, v22.16b, v23.16b
|
||||
zip1 v16.2d, v20.2d, v21.2d
|
||||
zip1 v18.2d, v22.2d, v23.2d
|
||||
zip2 v17.2d, v20.2d, v21.2d
|
||||
zip2 v19.2d, v22.2d, v23.2d
|
||||
|
||||
ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
|
||||
ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
|
||||
pmull v0.8h, v5.8b, v3.8b // D = A*B
|
||||
ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
|
||||
ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
|
||||
eor v16.16b, v16.16b, v17.16b
|
||||
eor v18.16b, v18.16b, v19.16b
|
||||
eor v0.16b, v0.16b, v16.16b
|
||||
eor v0.16b, v0.16b, v18.16b
|
||||
eor v3.8b, v3.8b, v4.8b // Karatsuba pre-processing
|
||||
ext v16.8b, v7.8b, v7.8b, #1 // A1
|
||||
pmull v16.8h, v16.8b, v3.8b // F = A1*B
|
||||
ext v1.8b, v3.8b, v3.8b, #1 // B1
|
||||
pmull v1.8h, v7.8b, v1.8b // E = A*B1
|
||||
ext v17.8b, v7.8b, v7.8b, #2 // A2
|
||||
pmull v17.8h, v17.8b, v3.8b // H = A2*B
|
||||
ext v19.8b, v3.8b, v3.8b, #2 // B2
|
||||
pmull v19.8h, v7.8b, v19.8b // G = A*B2
|
||||
ext v18.8b, v7.8b, v7.8b, #3 // A3
|
||||
eor v16.16b, v16.16b, v1.16b // L = E + F
|
||||
pmull v18.8h, v18.8b, v3.8b // J = A3*B
|
||||
ext v1.8b, v3.8b, v3.8b, #3 // B3
|
||||
eor v17.16b, v17.16b, v19.16b // M = G + H
|
||||
pmull v1.8h, v7.8b, v1.8b // I = A*B3
|
||||
|
||||
// Here we diverge from the 32-bit version. It computes the following
|
||||
// (instructions reordered for clarity):
|
||||
//
|
||||
// veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
|
||||
// vand $t0#hi, $t0#hi, $k48
|
||||
// veor $t0#lo, $t0#lo, $t0#hi
|
||||
//
|
||||
// veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
|
||||
// vand $t1#hi, $t1#hi, $k32
|
||||
// veor $t1#lo, $t1#lo, $t1#hi
|
||||
//
|
||||
// veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
|
||||
// vand $t2#hi, $t2#hi, $k16
|
||||
// veor $t2#lo, $t2#lo, $t2#hi
|
||||
//
|
||||
// veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
|
||||
// vmov.i64 $t3#hi, #0
|
||||
//
|
||||
// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
|
||||
// upper halves of SIMD registers, so we must split each half into
|
||||
// separate registers. To compensate, we pair computations up and
|
||||
// parallelize.
|
||||
|
||||
ext v19.8b, v3.8b, v3.8b, #4 // B4
|
||||
eor v18.16b, v18.16b, v1.16b // N = I + J
|
||||
pmull v19.8h, v7.8b, v19.8b // K = A*B4
|
||||
|
||||
// This can probably be scheduled more efficiently. For now, we just
|
||||
// pair up independent instructions.
|
||||
zip1 v20.2d, v16.2d, v17.2d
|
||||
zip1 v22.2d, v18.2d, v19.2d
|
||||
zip2 v21.2d, v16.2d, v17.2d
|
||||
zip2 v23.2d, v18.2d, v19.2d
|
||||
eor v20.16b, v20.16b, v21.16b
|
||||
eor v22.16b, v22.16b, v23.16b
|
||||
and v21.16b, v21.16b, v24.16b
|
||||
and v23.16b, v23.16b, v25.16b
|
||||
eor v20.16b, v20.16b, v21.16b
|
||||
eor v22.16b, v22.16b, v23.16b
|
||||
zip1 v16.2d, v20.2d, v21.2d
|
||||
zip1 v18.2d, v22.2d, v23.2d
|
||||
zip2 v17.2d, v20.2d, v21.2d
|
||||
zip2 v19.2d, v22.2d, v23.2d
|
||||
|
||||
ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
|
||||
ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
|
||||
pmull v1.8h, v7.8b, v3.8b // D = A*B
|
||||
ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
|
||||
ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
|
||||
eor v16.16b, v16.16b, v17.16b
|
||||
eor v18.16b, v18.16b, v19.16b
|
||||
eor v1.16b, v1.16b, v16.16b
|
||||
eor v1.16b, v1.16b, v18.16b
|
||||
ext v16.8b, v6.8b, v6.8b, #1 // A1
|
||||
pmull v16.8h, v16.8b, v4.8b // F = A1*B
|
||||
ext v2.8b, v4.8b, v4.8b, #1 // B1
|
||||
pmull v2.8h, v6.8b, v2.8b // E = A*B1
|
||||
ext v17.8b, v6.8b, v6.8b, #2 // A2
|
||||
pmull v17.8h, v17.8b, v4.8b // H = A2*B
|
||||
ext v19.8b, v4.8b, v4.8b, #2 // B2
|
||||
pmull v19.8h, v6.8b, v19.8b // G = A*B2
|
||||
ext v18.8b, v6.8b, v6.8b, #3 // A3
|
||||
eor v16.16b, v16.16b, v2.16b // L = E + F
|
||||
pmull v18.8h, v18.8b, v4.8b // J = A3*B
|
||||
ext v2.8b, v4.8b, v4.8b, #3 // B3
|
||||
eor v17.16b, v17.16b, v19.16b // M = G + H
|
||||
pmull v2.8h, v6.8b, v2.8b // I = A*B3
|
||||
|
||||
// Here we diverge from the 32-bit version. It computes the following
|
||||
// (instructions reordered for clarity):
|
||||
//
|
||||
// veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
|
||||
// vand $t0#hi, $t0#hi, $k48
|
||||
// veor $t0#lo, $t0#lo, $t0#hi
|
||||
//
|
||||
// veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
|
||||
// vand $t1#hi, $t1#hi, $k32
|
||||
// veor $t1#lo, $t1#lo, $t1#hi
|
||||
//
|
||||
// veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
|
||||
// vand $t2#hi, $t2#hi, $k16
|
||||
// veor $t2#lo, $t2#lo, $t2#hi
|
||||
//
|
||||
// veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
|
||||
// vmov.i64 $t3#hi, #0
|
||||
//
|
||||
// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
|
||||
// upper halves of SIMD registers, so we must split each half into
|
||||
// separate registers. To compensate, we pair computations up and
|
||||
// parallelize.
|
||||
|
||||
ext v19.8b, v4.8b, v4.8b, #4 // B4
|
||||
eor v18.16b, v18.16b, v2.16b // N = I + J
|
||||
pmull v19.8h, v6.8b, v19.8b // K = A*B4
|
||||
|
||||
// This can probably be scheduled more efficiently. For now, we just
|
||||
// pair up independent instructions.
|
||||
zip1 v20.2d, v16.2d, v17.2d
|
||||
zip1 v22.2d, v18.2d, v19.2d
|
||||
zip2 v21.2d, v16.2d, v17.2d
|
||||
zip2 v23.2d, v18.2d, v19.2d
|
||||
eor v20.16b, v20.16b, v21.16b
|
||||
eor v22.16b, v22.16b, v23.16b
|
||||
and v21.16b, v21.16b, v24.16b
|
||||
and v23.16b, v23.16b, v25.16b
|
||||
eor v20.16b, v20.16b, v21.16b
|
||||
eor v22.16b, v22.16b, v23.16b
|
||||
zip1 v16.2d, v20.2d, v21.2d
|
||||
zip1 v18.2d, v22.2d, v23.2d
|
||||
zip2 v17.2d, v20.2d, v21.2d
|
||||
zip2 v19.2d, v22.2d, v23.2d
|
||||
|
||||
ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
|
||||
ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
|
||||
pmull v2.8h, v6.8b, v4.8b // D = A*B
|
||||
ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
|
||||
ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
|
||||
eor v16.16b, v16.16b, v17.16b
|
||||
eor v18.16b, v18.16b, v19.16b
|
||||
eor v2.16b, v2.16b, v16.16b
|
||||
eor v2.16b, v2.16b, v18.16b
|
||||
ext v16.16b, v0.16b, v2.16b, #8
|
||||
eor v1.16b, v1.16b, v0.16b // Karatsuba post-processing
|
||||
eor v1.16b, v1.16b, v2.16b
|
||||
eor v1.16b, v1.16b, v16.16b // Xm overlaps Xh.lo and Xl.hi
|
||||
ins v0.d[1], v1.d[0] // Xh|Xl - 256-bit result
|
||||
// This is a no-op due to the ins instruction below.
|
||||
// ins v2.d[0], v1.d[1]
|
||||
|
||||
// equivalent of reduction_avx from ghash-x86_64.pl
|
||||
shl v17.2d, v0.2d, #57 // 1st phase
|
||||
shl v18.2d, v0.2d, #62
|
||||
eor v18.16b, v18.16b, v17.16b //
|
||||
shl v17.2d, v0.2d, #63
|
||||
eor v18.16b, v18.16b, v17.16b //
|
||||
// Note Xm contains {Xl.d[1], Xh.d[0]}.
|
||||
eor v18.16b, v18.16b, v1.16b
|
||||
ins v0.d[1], v18.d[0] // Xl.d[1] ^= t2.d[0]
|
||||
ins v2.d[0], v18.d[1] // Xh.d[0] ^= t2.d[1]
|
||||
|
||||
ushr v18.2d, v0.2d, #1 // 2nd phase
|
||||
eor v2.16b, v2.16b,v0.16b
|
||||
eor v0.16b, v0.16b,v18.16b //
|
||||
ushr v18.2d, v18.2d, #6
|
||||
ushr v0.2d, v0.2d, #1 //
|
||||
eor v0.16b, v0.16b, v2.16b //
|
||||
eor v0.16b, v0.16b, v18.16b //
|
||||
|
||||
subs x3, x3, #16
|
||||
bne Loop_neon
|
||||
|
||||
rev64 v0.16b, v0.16b // byteswap Xi and write
|
||||
ext v0.16b, v0.16b, v0.16b, #8
|
||||
st1 {v0.16b}, [x0]
|
||||
|
||||
ret
|
||||
|
||||
|
||||
.section .rodata
|
||||
.align 4
|
||||
Lmasks:
|
||||
.quad 0x0000ffffffffffff // k48
|
||||
.quad 0x00000000ffffffff // k32
|
||||
.quad 0x000000000000ffff // k16
|
||||
.quad 0x0000000000000000 // k0
|
||||
.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
|
||||
.align 2
|
||||
.align 2
|
||||
#endif
|
||||
#endif // !OPENSSL_NO_ASM
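Editor's note: the repeated "Karatsuba pre-processing" / "post-processing" comments above refer to the standard Karatsuba split of a 128-bit carry-less (GF(2)[x]) multiplication into three 64x64 products. A self-contained C sketch of that identity follows; it assumes nothing beyond a naive bitwise carry-less multiply and illustrates the decomposition, not the NEON scheduling used above.

#include <stdint.h>

/* Naive 64x64 -> 128-bit carry-less multiply over GF(2)[x]. */
static void clmul64(uint64_t a, uint64_t b, uint64_t r[2]) {
  r[0] = r[1] = 0;
  for (int i = 0; i < 64; i++) {
    if ((b >> i) & 1) {
      r[0] ^= a << i;
      if (i != 0) {
        r[1] ^= a >> (64 - i);
      }
    }
  }
}

/* Karatsuba: (ah,al)*(bh,bl) needs only ah*bh, al*bl and (ah^al)*(bh^bl);
   the middle term is recovered by XORing the outer products back out. */
static void clmul128_karatsuba(const uint64_t a[2], const uint64_t b[2],
                               uint64_t r[4]) {
  uint64_t lo[2], hi[2], mid[2];
  clmul64(a[0], b[0], lo);                 /* al*bl */
  clmul64(a[1], b[1], hi);                 /* ah*bh */
  clmul64(a[0] ^ a[1], b[0] ^ b[1], mid);  /* (al^ah)*(bl^bh) */
  mid[0] ^= lo[0] ^ hi[0];                 /* middle = mid ^ lo ^ hi */
  mid[1] ^= lo[1] ^ hi[1];
  r[0] = lo[0];
  r[1] = lo[1] ^ mid[0];
  r[2] = hi[0] ^ mid[1];
  r[3] = hi[1];
}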
583   third_party/boringssl/kit/win-aarch64/crypto/fipsmodule/ghashv8-armx64.S (vendored, new file)
@@ -0,0 +1,583 @@
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>

#if __ARM_MAX_ARCH__>=7
.text
.arch armv8-a+crypto
.globl gcm_init_v8
|
||||
|
||||
.def gcm_init_v8
|
||||
.type 32
|
||||
.endef
|
||||
.align 4
|
||||
gcm_init_v8:
|
||||
AARCH64_VALID_CALL_TARGET
|
||||
ld1 {v17.2d},[x1] //load input H
|
||||
movi v19.16b,#0xe1
|
||||
shl v19.2d,v19.2d,#57 //0xc2.0
|
||||
ext v3.16b,v17.16b,v17.16b,#8
|
||||
ushr v18.2d,v19.2d,#63
|
||||
dup v17.4s,v17.s[1]
|
||||
ext v16.16b,v18.16b,v19.16b,#8 //t0=0xc2....01
|
||||
ushr v18.2d,v3.2d,#63
|
||||
sshr v17.4s,v17.4s,#31 //broadcast carry bit
|
||||
and v18.16b,v18.16b,v16.16b
|
||||
shl v3.2d,v3.2d,#1
|
||||
ext v18.16b,v18.16b,v18.16b,#8
|
||||
and v16.16b,v16.16b,v17.16b
|
||||
orr v3.16b,v3.16b,v18.16b //H<<<=1
|
||||
eor v20.16b,v3.16b,v16.16b //twisted H
|
||||
st1 {v20.2d},[x0],#16 //store Htable[0]
|
||||
|
||||
//calculate H^2
|
||||
ext v16.16b,v20.16b,v20.16b,#8 //Karatsuba pre-processing
|
||||
pmull v0.1q,v20.1d,v20.1d
|
||||
eor v16.16b,v16.16b,v20.16b
|
||||
pmull2 v2.1q,v20.2d,v20.2d
|
||||
pmull v1.1q,v16.1d,v16.1d
|
||||
|
||||
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
|
||||
eor v18.16b,v0.16b,v2.16b
|
||||
eor v1.16b,v1.16b,v17.16b
|
||||
eor v1.16b,v1.16b,v18.16b
|
||||
pmull v18.1q,v0.1d,v19.1d //1st phase
|
||||
|
||||
ins v2.d[0],v1.d[1]
|
||||
ins v1.d[1],v0.d[0]
|
||||
eor v0.16b,v1.16b,v18.16b
|
||||
|
||||
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
|
||||
pmull v0.1q,v0.1d,v19.1d
|
||||
eor v18.16b,v18.16b,v2.16b
|
||||
eor v22.16b,v0.16b,v18.16b
|
||||
|
||||
ext v17.16b,v22.16b,v22.16b,#8 //Karatsuba pre-processing
|
||||
eor v17.16b,v17.16b,v22.16b
|
||||
ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
|
||||
st1 {v21.2d,v22.2d},[x0],#32 //store Htable[1..2]
|
||||
//calculate H^3 and H^4
|
||||
pmull v0.1q,v20.1d, v22.1d
|
||||
pmull v5.1q,v22.1d,v22.1d
|
||||
pmull2 v2.1q,v20.2d, v22.2d
|
||||
pmull2 v7.1q,v22.2d,v22.2d
|
||||
pmull v1.1q,v16.1d,v17.1d
|
||||
pmull v6.1q,v17.1d,v17.1d
|
||||
|
||||
ext v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
|
||||
ext v17.16b,v5.16b,v7.16b,#8
|
||||
eor v18.16b,v0.16b,v2.16b
|
||||
eor v1.16b,v1.16b,v16.16b
|
||||
eor v4.16b,v5.16b,v7.16b
|
||||
eor v6.16b,v6.16b,v17.16b
|
||||
eor v1.16b,v1.16b,v18.16b
|
||||
pmull v18.1q,v0.1d,v19.1d //1st phase
|
||||
eor v6.16b,v6.16b,v4.16b
|
||||
pmull v4.1q,v5.1d,v19.1d
|
||||
|
||||
ins v2.d[0],v1.d[1]
|
||||
ins v7.d[0],v6.d[1]
|
||||
ins v1.d[1],v0.d[0]
|
||||
ins v6.d[1],v5.d[0]
|
||||
eor v0.16b,v1.16b,v18.16b
|
||||
eor v5.16b,v6.16b,v4.16b
|
||||
|
||||
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
|
||||
ext v4.16b,v5.16b,v5.16b,#8
|
||||
pmull v0.1q,v0.1d,v19.1d
|
||||
pmull v5.1q,v5.1d,v19.1d
|
||||
eor v18.16b,v18.16b,v2.16b
|
||||
eor v4.16b,v4.16b,v7.16b
|
||||
eor v20.16b, v0.16b,v18.16b //H^3
|
||||
eor v22.16b,v5.16b,v4.16b //H^4
|
||||
|
||||
ext v16.16b,v20.16b, v20.16b,#8 //Karatsuba pre-processing
|
||||
ext v17.16b,v22.16b,v22.16b,#8
|
||||
eor v16.16b,v16.16b,v20.16b
|
||||
eor v17.16b,v17.16b,v22.16b
|
||||
ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
|
||||
st1 {v20.2d,v21.2d,v22.2d},[x0] //store Htable[3..5]
|
||||
ret
|
||||
|
||||
.globl gcm_gmult_v8
|
||||
|
||||
.def gcm_gmult_v8
|
||||
.type 32
|
||||
.endef
|
||||
.align 4
|
||||
gcm_gmult_v8:
|
||||
AARCH64_VALID_CALL_TARGET
|
||||
ld1 {v17.2d},[x0] //load Xi
|
||||
movi v19.16b,#0xe1
|
||||
ld1 {v20.2d,v21.2d},[x1] //load twisted H, ...
|
||||
shl v19.2d,v19.2d,#57
|
||||
#ifndef __ARMEB__
|
||||
rev64 v17.16b,v17.16b
|
||||
#endif
|
||||
ext v3.16b,v17.16b,v17.16b,#8
|
||||
|
||||
pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo
|
||||
eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing
|
||||
pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi
|
||||
pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi)
|
||||
|
||||
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
|
||||
eor v18.16b,v0.16b,v2.16b
|
||||
eor v1.16b,v1.16b,v17.16b
|
||||
eor v1.16b,v1.16b,v18.16b
|
||||
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
|
||||
|
||||
ins v2.d[0],v1.d[1]
|
||||
ins v1.d[1],v0.d[0]
|
||||
eor v0.16b,v1.16b,v18.16b
|
||||
|
||||
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
|
||||
pmull v0.1q,v0.1d,v19.1d
|
||||
eor v18.16b,v18.16b,v2.16b
|
||||
eor v0.16b,v0.16b,v18.16b
|
||||
|
||||
#ifndef __ARMEB__
|
||||
rev64 v0.16b,v0.16b
|
||||
#endif
|
||||
ext v0.16b,v0.16b,v0.16b,#8
|
||||
st1 {v0.2d},[x0] //write out Xi
|
||||
|
||||
ret
|
||||
|
||||
.globl gcm_ghash_v8
|
||||
|
||||
.def gcm_ghash_v8
|
||||
.type 32
|
||||
.endef
|
||||
.align 4
|
||||
gcm_ghash_v8:
|
||||
AARCH64_VALID_CALL_TARGET
|
||||
cmp x3,#64
|
||||
b.hs Lgcm_ghash_v8_4x
|
||||
ld1 {v0.2d},[x0] //load [rotated] Xi
//"[rotated]" means that
//loaded value would have
//to be rotated in order to
//make it appear as in
//algorithm specification
subs x3,x3,#32 //see if x3 is 32 or larger
mov x12,#16 //x12 is used as post-
//increment for input pointer;
//as loop is modulo-scheduled
//x12 is zeroed just in time
//to preclude overstepping
//inp[len], which means that
//last block[s] are actually
//loaded twice, but last
//copy is not processed
ld1 {v20.2d,v21.2d},[x1],#32 //load twisted H, ..., H^2
|
||||
movi v19.16b,#0xe1
|
||||
ld1 {v22.2d},[x1]
|
||||
csel x12,xzr,x12,eq //is it time to zero x12?
|
||||
ext v0.16b,v0.16b,v0.16b,#8 //rotate Xi
|
||||
ld1 {v16.2d},[x2],#16 //load [rotated] I[0]
|
||||
shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant
|
||||
#ifndef __ARMEB__
|
||||
rev64 v16.16b,v16.16b
|
||||
rev64 v0.16b,v0.16b
|
||||
#endif
|
||||
ext v3.16b,v16.16b,v16.16b,#8 //rotate I[0]
|
||||
b.lo Lodd_tail_v8 //x3 was less than 32
|
||||
ld1 {v17.2d},[x2],x12 //load [rotated] I[1]
|
||||
#ifndef __ARMEB__
|
||||
rev64 v17.16b,v17.16b
|
||||
#endif
|
||||
ext v7.16b,v17.16b,v17.16b,#8
|
||||
eor v3.16b,v3.16b,v0.16b //I[i]^=Xi
|
||||
pmull v4.1q,v20.1d,v7.1d //H·Ii+1
|
||||
eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing
|
||||
pmull2 v6.1q,v20.2d,v7.2d
|
||||
b Loop_mod2x_v8
|
||||
|
||||
.align 4
|
||||
Loop_mod2x_v8:
|
||||
ext v18.16b,v3.16b,v3.16b,#8
|
||||
subs x3,x3,#32 //is there more data?
|
||||
pmull v0.1q,v22.1d,v3.1d //H^2.lo·Xi.lo
|
||||
csel x12,xzr,x12,lo //is it time to zero x12?
|
||||
|
||||
pmull v5.1q,v21.1d,v17.1d
|
||||
eor v18.16b,v18.16b,v3.16b //Karatsuba pre-processing
|
||||
pmull2 v2.1q,v22.2d,v3.2d //H^2.hi·Xi.hi
|
||||
eor v0.16b,v0.16b,v4.16b //accumulate
|
||||
pmull2 v1.1q,v21.2d,v18.2d //(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
|
||||
ld1 {v16.2d},[x2],x12 //load [rotated] I[i+2]
|
||||
|
||||
eor v2.16b,v2.16b,v6.16b
|
||||
csel x12,xzr,x12,eq //is it time to zero x12?
|
||||
eor v1.16b,v1.16b,v5.16b
|
||||
|
||||
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
|
||||
eor v18.16b,v0.16b,v2.16b
|
||||
eor v1.16b,v1.16b,v17.16b
|
||||
ld1 {v17.2d},[x2],x12 //load [rotated] I[i+3]
|
||||
#ifndef __ARMEB__
|
||||
rev64 v16.16b,v16.16b
|
||||
#endif
|
||||
eor v1.16b,v1.16b,v18.16b
|
||||
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
|
||||
|
||||
#ifndef __ARMEB__
|
||||
rev64 v17.16b,v17.16b
|
||||
#endif
|
||||
ins v2.d[0],v1.d[1]
|
||||
ins v1.d[1],v0.d[0]
|
||||
ext v7.16b,v17.16b,v17.16b,#8
|
||||
ext v3.16b,v16.16b,v16.16b,#8
|
||||
eor v0.16b,v1.16b,v18.16b
|
||||
pmull v4.1q,v20.1d,v7.1d //H·Ii+1
|
||||
eor v3.16b,v3.16b,v2.16b //accumulate v3.16b early
|
||||
|
||||
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
|
||||
pmull v0.1q,v0.1d,v19.1d
|
||||
eor v3.16b,v3.16b,v18.16b
|
||||
eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing
|
||||
eor v3.16b,v3.16b,v0.16b
|
||||
pmull2 v6.1q,v20.2d,v7.2d
|
||||
b.hs Loop_mod2x_v8 //there was at least 32 more bytes
|
||||
|
||||
eor v2.16b,v2.16b,v18.16b
|
||||
ext v3.16b,v16.16b,v16.16b,#8 //re-construct v3.16b
|
||||
adds x3,x3,#32 //re-construct x3
|
||||
eor v0.16b,v0.16b,v2.16b //re-construct v0.16b
|
||||
b.eq Ldone_v8 //is x3 zero?
|
||||
Lodd_tail_v8:
|
||||
ext v18.16b,v0.16b,v0.16b,#8
|
||||
eor v3.16b,v3.16b,v0.16b //inp^=Xi
|
||||
eor v17.16b,v16.16b,v18.16b //v17.16b is rotated inp^Xi
|
||||
|
||||
pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo
|
||||
eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing
|
||||
pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi
|
||||
pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi)
|
||||
|
||||
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
|
||||
eor v18.16b,v0.16b,v2.16b
|
||||
eor v1.16b,v1.16b,v17.16b
|
||||
eor v1.16b,v1.16b,v18.16b
|
||||
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
|
||||
|
||||
ins v2.d[0],v1.d[1]
|
||||
ins v1.d[1],v0.d[0]
|
||||
eor v0.16b,v1.16b,v18.16b
|
||||
|
||||
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
|
||||
pmull v0.1q,v0.1d,v19.1d
|
||||
eor v18.16b,v18.16b,v2.16b
|
||||
eor v0.16b,v0.16b,v18.16b
|
||||
|
||||
Ldone_v8:
|
||||
#ifndef __ARMEB__
|
||||
rev64 v0.16b,v0.16b
|
||||
#endif
|
||||
ext v0.16b,v0.16b,v0.16b,#8
|
||||
st1 {v0.2d},[x0] //write out Xi
|
||||
|
||||
ret
|
||||
|
||||
.def gcm_ghash_v8_4x
|
||||
.type 32
|
||||
.endef
|
||||
.align 4
|
||||
gcm_ghash_v8_4x:
|
||||
Lgcm_ghash_v8_4x:
|
||||
ld1 {v0.2d},[x0] //load [rotated] Xi
|
||||
ld1 {v20.2d,v21.2d,v22.2d},[x1],#48 //load twisted H, ..., H^2
|
||||
movi v19.16b,#0xe1
|
||||
ld1 {v26.2d,v27.2d,v28.2d},[x1] //load twisted H^3, ..., H^4
|
||||
shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant
|
||||
|
||||
ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
|
||||
#ifndef __ARMEB__
|
||||
rev64 v0.16b,v0.16b
|
||||
rev64 v5.16b,v5.16b
|
||||
rev64 v6.16b,v6.16b
|
||||
rev64 v7.16b,v7.16b
|
||||
rev64 v4.16b,v4.16b
|
||||
#endif
|
||||
ext v25.16b,v7.16b,v7.16b,#8
|
||||
ext v24.16b,v6.16b,v6.16b,#8
|
||||
ext v23.16b,v5.16b,v5.16b,#8
|
||||
|
||||
pmull v29.1q,v20.1d,v25.1d //H·Ii+3
|
||||
eor v7.16b,v7.16b,v25.16b
|
||||
pmull2 v31.1q,v20.2d,v25.2d
|
||||
pmull v30.1q,v21.1d,v7.1d
|
||||
|
||||
pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2
|
||||
eor v6.16b,v6.16b,v24.16b
|
||||
pmull2 v24.1q,v22.2d,v24.2d
|
||||
pmull2 v6.1q,v21.2d,v6.2d
|
||||
|
||||
eor v29.16b,v29.16b,v16.16b
|
||||
eor v31.16b,v31.16b,v24.16b
|
||||
eor v30.16b,v30.16b,v6.16b
|
||||
|
||||
pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1
|
||||
eor v5.16b,v5.16b,v23.16b
|
||||
pmull2 v23.1q,v26.2d,v23.2d
|
||||
pmull v5.1q,v27.1d,v5.1d
|
||||
|
||||
eor v29.16b,v29.16b,v7.16b
|
||||
eor v31.16b,v31.16b,v23.16b
|
||||
eor v30.16b,v30.16b,v5.16b
|
||||
|
||||
subs x3,x3,#128
|
||||
b.lo Ltail4x
|
||||
|
||||
b Loop4x
|
||||
|
||||
.align 4
|
||||
Loop4x:
|
||||
eor v16.16b,v4.16b,v0.16b
|
||||
ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
|
||||
ext v3.16b,v16.16b,v16.16b,#8
|
||||
#ifndef __ARMEB__
|
||||
rev64 v5.16b,v5.16b
|
||||
rev64 v6.16b,v6.16b
|
||||
rev64 v7.16b,v7.16b
|
||||
rev64 v4.16b,v4.16b
|
||||
#endif
|
||||
|
||||
pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii)
|
||||
eor v16.16b,v16.16b,v3.16b
|
||||
pmull2 v2.1q,v28.2d,v3.2d
|
||||
ext v25.16b,v7.16b,v7.16b,#8
|
||||
pmull2 v1.1q,v27.2d,v16.2d
|
||||
|
||||
eor v0.16b,v0.16b,v29.16b
|
||||
eor v2.16b,v2.16b,v31.16b
|
||||
ext v24.16b,v6.16b,v6.16b,#8
|
||||
eor v1.16b,v1.16b,v30.16b
|
||||
ext v23.16b,v5.16b,v5.16b,#8
|
||||
|
||||
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
|
||||
eor v18.16b,v0.16b,v2.16b
|
||||
pmull v29.1q,v20.1d,v25.1d //H·Ii+3
|
||||
eor v7.16b,v7.16b,v25.16b
|
||||
eor v1.16b,v1.16b,v17.16b
|
||||
pmull2 v31.1q,v20.2d,v25.2d
|
||||
eor v1.16b,v1.16b,v18.16b
|
||||
pmull v30.1q,v21.1d,v7.1d
|
||||
|
||||
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
|
||||
ins v2.d[0],v1.d[1]
|
||||
ins v1.d[1],v0.d[0]
|
||||
pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2
|
||||
eor v6.16b,v6.16b,v24.16b
|
||||
pmull2 v24.1q,v22.2d,v24.2d
|
||||
eor v0.16b,v1.16b,v18.16b
|
||||
pmull2 v6.1q,v21.2d,v6.2d
|
||||
|
||||
eor v29.16b,v29.16b,v16.16b
|
||||
eor v31.16b,v31.16b,v24.16b
|
||||
eor v30.16b,v30.16b,v6.16b
|
||||
|
||||
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
|
||||
pmull v0.1q,v0.1d,v19.1d
|
||||
pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1
|
||||
eor v5.16b,v5.16b,v23.16b
|
||||
eor v18.16b,v18.16b,v2.16b
|
||||
pmull2 v23.1q,v26.2d,v23.2d
|
||||
pmull v5.1q,v27.1d,v5.1d
|
||||
|
||||
eor v0.16b,v0.16b,v18.16b
|
||||
eor v29.16b,v29.16b,v7.16b
|
||||
eor v31.16b,v31.16b,v23.16b
|
||||
ext v0.16b,v0.16b,v0.16b,#8
|
||||
eor v30.16b,v30.16b,v5.16b
|
||||
|
||||
subs x3,x3,#64
|
||||
b.hs Loop4x
|
||||
|
||||
Ltail4x:
|
||||
eor v16.16b,v4.16b,v0.16b
|
||||
ext v3.16b,v16.16b,v16.16b,#8
|
||||
|
||||
pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii)
|
||||
eor v16.16b,v16.16b,v3.16b
|
||||
pmull2 v2.1q,v28.2d,v3.2d
|
||||
pmull2 v1.1q,v27.2d,v16.2d
|
||||
|
||||
eor v0.16b,v0.16b,v29.16b
|
||||
eor v2.16b,v2.16b,v31.16b
|
||||
eor v1.16b,v1.16b,v30.16b
|
||||
|
||||
adds x3,x3,#64
|
||||
b.eq Ldone4x
|
||||
|
||||
cmp x3,#32
|
||||
b.lo Lone
|
||||
b.eq Ltwo
|
||||
Lthree:
|
||||
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
|
||||
eor v18.16b,v0.16b,v2.16b
|
||||
eor v1.16b,v1.16b,v17.16b
|
||||
ld1 {v4.2d,v5.2d,v6.2d},[x2]
|
||||
eor v1.16b,v1.16b,v18.16b
|
||||
#ifndef __ARMEB__
|
||||
rev64 v5.16b,v5.16b
|
||||
rev64 v6.16b,v6.16b
|
||||
rev64 v4.16b,v4.16b
|
||||
#endif
|
||||
|
||||
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
|
||||
ins v2.d[0],v1.d[1]
|
||||
ins v1.d[1],v0.d[0]
|
||||
ext v24.16b,v6.16b,v6.16b,#8
|
||||
ext v23.16b,v5.16b,v5.16b,#8
|
||||
eor v0.16b,v1.16b,v18.16b
|
||||
|
||||
pmull v29.1q,v20.1d,v24.1d //H·Ii+2
|
||||
eor v6.16b,v6.16b,v24.16b
|
||||
|
||||
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
|
||||
pmull v0.1q,v0.1d,v19.1d
|
||||
eor v18.16b,v18.16b,v2.16b
|
||||
pmull2 v31.1q,v20.2d,v24.2d
|
||||
pmull v30.1q,v21.1d,v6.1d
|
||||
eor v0.16b,v0.16b,v18.16b
|
||||
pmull v7.1q,v22.1d,v23.1d //H^2·Ii+1
|
||||
eor v5.16b,v5.16b,v23.16b
|
||||
ext v0.16b,v0.16b,v0.16b,#8
|
||||
|
||||
pmull2 v23.1q,v22.2d,v23.2d
|
||||
eor v16.16b,v4.16b,v0.16b
|
||||
pmull2 v5.1q,v21.2d,v5.2d
|
||||
ext v3.16b,v16.16b,v16.16b,#8
|
||||
|
||||
eor v29.16b,v29.16b,v7.16b
|
||||
eor v31.16b,v31.16b,v23.16b
|
||||
eor v30.16b,v30.16b,v5.16b
|
||||
|
||||
pmull v0.1q,v26.1d,v3.1d //H^3·(Xi+Ii)
|
||||
eor v16.16b,v16.16b,v3.16b
|
||||
pmull2 v2.1q,v26.2d,v3.2d
|
||||
pmull v1.1q,v27.1d,v16.1d
|
||||
|
||||
eor v0.16b,v0.16b,v29.16b
|
||||
eor v2.16b,v2.16b,v31.16b
|
||||
eor v1.16b,v1.16b,v30.16b
|
||||
b Ldone4x
|
||||
|
||||
.align 4
|
||||
Ltwo:
|
||||
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
|
||||
eor v18.16b,v0.16b,v2.16b
|
||||
eor v1.16b,v1.16b,v17.16b
|
||||
ld1 {v4.2d,v5.2d},[x2]
|
||||
eor v1.16b,v1.16b,v18.16b
|
||||
#ifndef __ARMEB__
|
||||
rev64 v5.16b,v5.16b
|
||||
rev64 v4.16b,v4.16b
|
||||
#endif
|
||||
|
||||
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
|
||||
ins v2.d[0],v1.d[1]
|
||||
ins v1.d[1],v0.d[0]
|
||||
ext v23.16b,v5.16b,v5.16b,#8
|
||||
eor v0.16b,v1.16b,v18.16b
|
||||
|
||||
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
|
||||
pmull v0.1q,v0.1d,v19.1d
|
||||
eor v18.16b,v18.16b,v2.16b
|
||||
eor v0.16b,v0.16b,v18.16b
|
||||
ext v0.16b,v0.16b,v0.16b,#8
|
||||
|
||||
pmull v29.1q,v20.1d,v23.1d //H·Ii+1
|
||||
eor v5.16b,v5.16b,v23.16b
|
||||
|
||||
eor v16.16b,v4.16b,v0.16b
|
||||
ext v3.16b,v16.16b,v16.16b,#8
|
||||
|
||||
pmull2 v31.1q,v20.2d,v23.2d
|
||||
pmull v30.1q,v21.1d,v5.1d
|
||||
|
||||
pmull v0.1q,v22.1d,v3.1d //H^2·(Xi+Ii)
|
||||
eor v16.16b,v16.16b,v3.16b
|
||||
pmull2 v2.1q,v22.2d,v3.2d
|
||||
pmull2 v1.1q,v21.2d,v16.2d
|
||||
|
||||
eor v0.16b,v0.16b,v29.16b
|
||||
eor v2.16b,v2.16b,v31.16b
|
||||
eor v1.16b,v1.16b,v30.16b
|
||||
b Ldone4x
|
||||
|
||||
.align 4
|
||||
Lone:
|
||||
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
|
||||
eor v18.16b,v0.16b,v2.16b
|
||||
eor v1.16b,v1.16b,v17.16b
|
||||
ld1 {v4.2d},[x2]
|
||||
eor v1.16b,v1.16b,v18.16b
|
||||
#ifndef __ARMEB__
|
||||
rev64 v4.16b,v4.16b
|
||||
#endif
|
||||
|
||||
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
|
||||
ins v2.d[0],v1.d[1]
|
||||
ins v1.d[1],v0.d[0]
|
||||
eor v0.16b,v1.16b,v18.16b
|
||||
|
||||
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
|
||||
pmull v0.1q,v0.1d,v19.1d
|
||||
eor v18.16b,v18.16b,v2.16b
|
||||
eor v0.16b,v0.16b,v18.16b
|
||||
ext v0.16b,v0.16b,v0.16b,#8
|
||||
|
||||
eor v16.16b,v4.16b,v0.16b
|
||||
ext v3.16b,v16.16b,v16.16b,#8
|
||||
|
||||
pmull v0.1q,v20.1d,v3.1d
|
||||
eor v16.16b,v16.16b,v3.16b
|
||||
pmull2 v2.1q,v20.2d,v3.2d
|
||||
pmull v1.1q,v21.1d,v16.1d
|
||||
|
||||
Ldone4x:
|
||||
ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
|
||||
eor v18.16b,v0.16b,v2.16b
|
||||
eor v1.16b,v1.16b,v17.16b
|
||||
eor v1.16b,v1.16b,v18.16b
|
||||
|
||||
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
|
||||
ins v2.d[0],v1.d[1]
|
||||
ins v1.d[1],v0.d[0]
|
||||
eor v0.16b,v1.16b,v18.16b
|
||||
|
||||
ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
|
||||
pmull v0.1q,v0.1d,v19.1d
|
||||
eor v18.16b,v18.16b,v2.16b
|
||||
eor v0.16b,v0.16b,v18.16b
|
||||
ext v0.16b,v0.16b,v0.16b,#8
|
||||
|
||||
#ifndef __ARMEB__
|
||||
rev64 v0.16b,v0.16b
|
||||
#endif
|
||||
st1 {v0.2d},[x0] //write out Xi
|
||||
|
||||
ret
|
||||
|
||||
.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
|
||||
.align 2
|
||||
.align 2
|
||||
#endif
|
||||
#endif
|
||||
#endif // !OPENSSL_NO_ASM
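Editor's note: gcm_ghash_v8 above is modulo-scheduled; the input post-increment (x12) is zeroed just in time so that the pre-load never steps past inp[len], and the last block ends up loaded twice with the second copy never consumed. A minimal C sketch of that pre-load discipline, with a hypothetical process callback standing in for the GHASH arithmetic and len assumed to be a multiple of 16.

#include <stdint.h>
#include <stddef.h>

/* Sketch: software-pipelined loop that pre-loads the next 16-byte block.
   step is zeroed on the final iteration so the pre-load re-reads the last
   block instead of running past the end of the input. */
static void ghash_pipeline_sketch(const uint8_t *inp, size_t len,
                                  void (*process)(const uint8_t block[16])) {
  if (len < 16) {
    return;                               /* nothing to do for short input */
  }
  size_t step = 16;
  const uint8_t *next = inp;              /* pre-load pointer */
  for (size_t done = 0; done < len; done += 16) {
    const uint8_t *cur = next;
    if (done + 16 >= len) {
      step = 0;                           /* last block: stop advancing */
    }
    next += step;                         /* pre-load address for next round */
    process(cur);
  }
}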
1241  third_party/boringssl/kit/win-aarch64/crypto/fipsmodule/sha1-armv8.S (vendored, new file)
      File diff suppressed because it is too large
1217  third_party/boringssl/kit/win-aarch64/crypto/fipsmodule/sha256-armv8.S (vendored, new file)
      File diff suppressed because it is too large
1085  third_party/boringssl/kit/win-aarch64/crypto/fipsmodule/sha512-armv8.S (vendored, new file)
      File diff suppressed because it is too large
1272  third_party/boringssl/kit/win-aarch64/crypto/fipsmodule/vpaes-armv8.S (vendored, new file)
      File diff suppressed because it is too large
760   third_party/boringssl/kit/win-aarch64/crypto/test/trampoline-armv8.S (vendored, new file)
@@ -0,0 +1,760 @@
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>

.text

// abi_test_trampoline loads callee-saved registers from |state|, calls |func|
// with |argv|, then saves the callee-saved registers into |state|. It returns
// the result of |func|. The |unwind| argument is unused.
// uint64_t abi_test_trampoline(void (*func)(...), CallerState *state,
//                              const uint64_t *argv, size_t argc,
//                              uint64_t unwind);

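Editor's note: a minimal C sketch of how a test harness might drive abi_test_trampoline, based only on the prototype above and the register save/restore order in the code below. The CallerState layout and all names in this sketch are assumptions of the illustration (d8-d15 followed by x19-x28, matching the ldp/stp order), not definitions from this file.

#include <stdint.h>
#include <stddef.h>

/* Assumed layout: mirrors the load/store order used by the trampoline. */
typedef struct {
  uint64_t d[8];   /* d8-d15 */
  uint64_t x[10];  /* x19-x28 */
} CallerState;

uint64_t abi_test_trampoline(void (*func)(void), CallerState *state,
                             const uint64_t *argv, size_t argc,
                             uint64_t unwind);

static uint64_t add2(uint64_t a, uint64_t b) { return a + b; }

uint64_t call_add2_via_trampoline(void) {
  CallerState state = {{0}, {0}};   /* seed values; checked by a real harness */
  uint64_t argv[2] = {40, 2};       /* first 8 argv entries land in x0-x7 */
  /* The trampoline fills callee-saved registers from |state|, calls the
     function with |argv|, then writes the registers back into |state|. */
  return abi_test_trampoline((void (*)(void))add2, &state, argv, 2, 0);
}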
.globl abi_test_trampoline
|
||||
|
||||
.align 4
|
||||
abi_test_trampoline:
|
||||
Labi_test_trampoline_begin:
|
||||
AARCH64_SIGN_LINK_REGISTER
|
||||
// Stack layout (low to high addresses)
|
||||
// x29,x30 (16 bytes)
|
||||
// d8-d15 (64 bytes)
|
||||
// x19-x28 (80 bytes)
|
||||
// x1 (8 bytes)
|
||||
// padding (8 bytes)
|
||||
stp x29, x30, [sp, #-176]!
|
||||
mov x29, sp
|
||||
|
||||
// Saved callee-saved registers and |state|.
|
||||
stp d8, d9, [sp, #16]
|
||||
stp d10, d11, [sp, #32]
|
||||
stp d12, d13, [sp, #48]
|
||||
stp d14, d15, [sp, #64]
|
||||
stp x19, x20, [sp, #80]
|
||||
stp x21, x22, [sp, #96]
|
||||
stp x23, x24, [sp, #112]
|
||||
stp x25, x26, [sp, #128]
|
||||
stp x27, x28, [sp, #144]
|
||||
str x1, [sp, #160]

// Load registers from |state|, with the exception of x29. x29 is the
// frame pointer and also callee-saved, but AAPCS64 allows platforms to
// mandate that x29 always point to a frame. iOS64 does so, which means
// we cannot fill x29 with entropy without violating ABI rules
// ourselves. x29 is tested separately below.
ldp d8, d9, [x1], #16
ldp d10, d11, [x1], #16
ldp d12, d13, [x1], #16
ldp d14, d15, [x1], #16
ldp x19, x20, [x1], #16
ldp x21, x22, [x1], #16
ldp x23, x24, [x1], #16
ldp x25, x26, [x1], #16
ldp x27, x28, [x1], #16

// Move parameters into temporary registers.
mov x9, x0
mov x10, x2
mov x11, x3

// Load parameters into registers.
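// AAPCS64 passes the first eight integer arguments in x0-x7, so at most
// eight entries of |argv| are forwarded; the unrolled sequence below stops
// as soon as |argc| (x11) is exhausted.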
cbz x11, Largs_done
ldr x0, [x10], #8
subs x11, x11, #1
b.eq Largs_done
ldr x1, [x10], #8
subs x11, x11, #1
b.eq Largs_done
ldr x2, [x10], #8
subs x11, x11, #1
b.eq Largs_done
ldr x3, [x10], #8
subs x11, x11, #1
b.eq Largs_done
ldr x4, [x10], #8
subs x11, x11, #1
b.eq Largs_done
ldr x5, [x10], #8
subs x11, x11, #1
b.eq Largs_done
ldr x6, [x10], #8
subs x11, x11, #1
b.eq Largs_done
ldr x7, [x10], #8

Largs_done:
blr x9

// Reload |state| and store registers.
ldr x1, [sp, #160]
stp d8, d9, [x1], #16
stp d10, d11, [x1], #16
stp d12, d13, [x1], #16
stp d14, d15, [x1], #16
stp x19, x20, [x1], #16
stp x21, x22, [x1], #16
stp x23, x24, [x1], #16
stp x25, x26, [x1], #16
stp x27, x28, [x1], #16

// |func| is required to preserve x29, the frame pointer. We cannot load
// random values into x29 (see comment above), so compare it against the
// expected value and zero the field of |state| if corrupted.
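// At this point x1 has been advanced past the x27/x28 slot, so the str below
// clears the |state| field corresponding to x29.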
mov x9, sp
cmp x29, x9
b.eq Lx29_ok
str xzr, [x1]

Lx29_ok:
// Restore callee-saved registers.
ldp d8, d9, [sp, #16]
ldp d10, d11, [sp, #32]
ldp d12, d13, [sp, #48]
ldp d14, d15, [sp, #64]
ldp x19, x20, [sp, #80]
ldp x21, x22, [sp, #96]
ldp x23, x24, [sp, #112]
ldp x25, x26, [sp, #128]
ldp x27, x28, [sp, #144]

ldp x29, x30, [sp], #176
AARCH64_VALIDATE_LINK_REGISTER
ret


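// Each abi_test_clobber_* helper below zeroes exactly one register and
// returns, so the ABI tests can exercise the effect of clobbering each
// register in isolation.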
.globl abi_test_clobber_x0

.align 4
abi_test_clobber_x0:
AARCH64_VALID_CALL_TARGET
mov x0, xzr
ret


.globl abi_test_clobber_x1

.align 4
abi_test_clobber_x1:
AARCH64_VALID_CALL_TARGET
mov x1, xzr
ret


.globl abi_test_clobber_x2

.align 4
abi_test_clobber_x2:
AARCH64_VALID_CALL_TARGET
mov x2, xzr
ret


.globl abi_test_clobber_x3

.align 4
abi_test_clobber_x3:
AARCH64_VALID_CALL_TARGET
mov x3, xzr
ret


.globl abi_test_clobber_x4

.align 4
abi_test_clobber_x4:
AARCH64_VALID_CALL_TARGET
mov x4, xzr
ret


.globl abi_test_clobber_x5

.align 4
abi_test_clobber_x5:
AARCH64_VALID_CALL_TARGET
mov x5, xzr
ret


.globl abi_test_clobber_x6

.align 4
abi_test_clobber_x6:
AARCH64_VALID_CALL_TARGET
mov x6, xzr
ret


.globl abi_test_clobber_x7

.align 4
abi_test_clobber_x7:
AARCH64_VALID_CALL_TARGET
mov x7, xzr
ret


.globl abi_test_clobber_x8

.align 4
abi_test_clobber_x8:
AARCH64_VALID_CALL_TARGET
mov x8, xzr
ret


.globl abi_test_clobber_x9

.align 4
abi_test_clobber_x9:
AARCH64_VALID_CALL_TARGET
mov x9, xzr
ret


.globl abi_test_clobber_x10

.align 4
abi_test_clobber_x10:
AARCH64_VALID_CALL_TARGET
mov x10, xzr
ret


.globl abi_test_clobber_x11

.align 4
abi_test_clobber_x11:
AARCH64_VALID_CALL_TARGET
mov x11, xzr
ret


.globl abi_test_clobber_x12

.align 4
abi_test_clobber_x12:
AARCH64_VALID_CALL_TARGET
mov x12, xzr
ret


.globl abi_test_clobber_x13

.align 4
abi_test_clobber_x13:
AARCH64_VALID_CALL_TARGET
mov x13, xzr
ret


.globl abi_test_clobber_x14

.align 4
abi_test_clobber_x14:
AARCH64_VALID_CALL_TARGET
mov x14, xzr
ret


.globl abi_test_clobber_x15

.align 4
abi_test_clobber_x15:
AARCH64_VALID_CALL_TARGET
mov x15, xzr
ret


.globl abi_test_clobber_x16

.align 4
abi_test_clobber_x16:
AARCH64_VALID_CALL_TARGET
mov x16, xzr
ret


.globl abi_test_clobber_x17

.align 4
abi_test_clobber_x17:
AARCH64_VALID_CALL_TARGET
mov x17, xzr
ret


.globl abi_test_clobber_x19

.align 4
abi_test_clobber_x19:
AARCH64_VALID_CALL_TARGET
mov x19, xzr
ret


.globl abi_test_clobber_x20

.align 4
abi_test_clobber_x20:
AARCH64_VALID_CALL_TARGET
mov x20, xzr
ret


.globl abi_test_clobber_x21

.align 4
abi_test_clobber_x21:
AARCH64_VALID_CALL_TARGET
mov x21, xzr
ret


.globl abi_test_clobber_x22

.align 4
abi_test_clobber_x22:
AARCH64_VALID_CALL_TARGET
mov x22, xzr
ret


.globl abi_test_clobber_x23

.align 4
abi_test_clobber_x23:
AARCH64_VALID_CALL_TARGET
mov x23, xzr
ret


.globl abi_test_clobber_x24

.align 4
abi_test_clobber_x24:
AARCH64_VALID_CALL_TARGET
mov x24, xzr
ret


.globl abi_test_clobber_x25

.align 4
abi_test_clobber_x25:
AARCH64_VALID_CALL_TARGET
mov x25, xzr
ret


.globl abi_test_clobber_x26

.align 4
abi_test_clobber_x26:
AARCH64_VALID_CALL_TARGET
mov x26, xzr
ret


.globl abi_test_clobber_x27

.align 4
abi_test_clobber_x27:
AARCH64_VALID_CALL_TARGET
mov x27, xzr
ret


.globl abi_test_clobber_x28

.align 4
abi_test_clobber_x28:
AARCH64_VALID_CALL_TARGET
mov x28, xzr
ret


.globl abi_test_clobber_x29

.align 4
abi_test_clobber_x29:
AARCH64_VALID_CALL_TARGET
mov x29, xzr
ret


.globl abi_test_clobber_d0

.align 4
abi_test_clobber_d0:
AARCH64_VALID_CALL_TARGET
fmov d0, xzr
ret


.globl abi_test_clobber_d1

.align 4
abi_test_clobber_d1:
AARCH64_VALID_CALL_TARGET
fmov d1, xzr
ret


.globl abi_test_clobber_d2

.align 4
abi_test_clobber_d2:
AARCH64_VALID_CALL_TARGET
fmov d2, xzr
ret


.globl abi_test_clobber_d3

.align 4
abi_test_clobber_d3:
AARCH64_VALID_CALL_TARGET
fmov d3, xzr
ret


.globl abi_test_clobber_d4

.align 4
abi_test_clobber_d4:
AARCH64_VALID_CALL_TARGET
fmov d4, xzr
ret


.globl abi_test_clobber_d5

.align 4
abi_test_clobber_d5:
AARCH64_VALID_CALL_TARGET
fmov d5, xzr
ret


.globl abi_test_clobber_d6

.align 4
abi_test_clobber_d6:
AARCH64_VALID_CALL_TARGET
fmov d6, xzr
ret


.globl abi_test_clobber_d7

.align 4
abi_test_clobber_d7:
AARCH64_VALID_CALL_TARGET
fmov d7, xzr
ret


.globl abi_test_clobber_d8

.align 4
abi_test_clobber_d8:
AARCH64_VALID_CALL_TARGET
fmov d8, xzr
ret


.globl abi_test_clobber_d9

.align 4
abi_test_clobber_d9:
AARCH64_VALID_CALL_TARGET
fmov d9, xzr
ret


.globl abi_test_clobber_d10

.align 4
abi_test_clobber_d10:
AARCH64_VALID_CALL_TARGET
fmov d10, xzr
ret


.globl abi_test_clobber_d11

.align 4
abi_test_clobber_d11:
AARCH64_VALID_CALL_TARGET
fmov d11, xzr
ret


.globl abi_test_clobber_d12

.align 4
abi_test_clobber_d12:
AARCH64_VALID_CALL_TARGET
fmov d12, xzr
ret


.globl abi_test_clobber_d13

.align 4
abi_test_clobber_d13:
AARCH64_VALID_CALL_TARGET
fmov d13, xzr
ret


.globl abi_test_clobber_d14

.align 4
abi_test_clobber_d14:
AARCH64_VALID_CALL_TARGET
fmov d14, xzr
ret


.globl abi_test_clobber_d15

.align 4
abi_test_clobber_d15:
AARCH64_VALID_CALL_TARGET
fmov d15, xzr
ret


.globl abi_test_clobber_d16

.align 4
abi_test_clobber_d16:
AARCH64_VALID_CALL_TARGET
fmov d16, xzr
ret


.globl abi_test_clobber_d17

.align 4
abi_test_clobber_d17:
AARCH64_VALID_CALL_TARGET
fmov d17, xzr
ret


.globl abi_test_clobber_d18

.align 4
abi_test_clobber_d18:
AARCH64_VALID_CALL_TARGET
fmov d18, xzr
ret


.globl abi_test_clobber_d19

.align 4
abi_test_clobber_d19:
AARCH64_VALID_CALL_TARGET
fmov d19, xzr
ret


.globl abi_test_clobber_d20

.align 4
abi_test_clobber_d20:
AARCH64_VALID_CALL_TARGET
fmov d20, xzr
ret


.globl abi_test_clobber_d21

.align 4
abi_test_clobber_d21:
AARCH64_VALID_CALL_TARGET
fmov d21, xzr
ret


.globl abi_test_clobber_d22

.align 4
abi_test_clobber_d22:
AARCH64_VALID_CALL_TARGET
fmov d22, xzr
ret


.globl abi_test_clobber_d23

.align 4
abi_test_clobber_d23:
AARCH64_VALID_CALL_TARGET
fmov d23, xzr
ret


.globl abi_test_clobber_d24

.align 4
abi_test_clobber_d24:
AARCH64_VALID_CALL_TARGET
fmov d24, xzr
ret


.globl abi_test_clobber_d25

.align 4
abi_test_clobber_d25:
AARCH64_VALID_CALL_TARGET
fmov d25, xzr
ret


.globl abi_test_clobber_d26

.align 4
abi_test_clobber_d26:
AARCH64_VALID_CALL_TARGET
fmov d26, xzr
ret


.globl abi_test_clobber_d27

.align 4
abi_test_clobber_d27:
AARCH64_VALID_CALL_TARGET
fmov d27, xzr
ret


.globl abi_test_clobber_d28

.align 4
abi_test_clobber_d28:
AARCH64_VALID_CALL_TARGET
fmov d28, xzr
ret


.globl abi_test_clobber_d29

.align 4
abi_test_clobber_d29:
AARCH64_VALID_CALL_TARGET
fmov d29, xzr
ret


.globl abi_test_clobber_d30

.align 4
abi_test_clobber_d30:
AARCH64_VALID_CALL_TARGET
fmov d30, xzr
ret


.globl abi_test_clobber_d31

.align 4
abi_test_clobber_d31:
AARCH64_VALID_CALL_TARGET
fmov d31, xzr
ret


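// AAPCS64 only requires the low 64 bits of v8-v15 to be preserved across
// calls, so a callee may clobber the upper halves; the helpers below do
// exactly that, one register at a time.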
.globl abi_test_clobber_v8_upper

.align 4
abi_test_clobber_v8_upper:
AARCH64_VALID_CALL_TARGET
fmov v8.d[1], xzr
ret


.globl abi_test_clobber_v9_upper

.align 4
abi_test_clobber_v9_upper:
AARCH64_VALID_CALL_TARGET
fmov v9.d[1], xzr
ret


.globl abi_test_clobber_v10_upper

.align 4
abi_test_clobber_v10_upper:
AARCH64_VALID_CALL_TARGET
fmov v10.d[1], xzr
ret


.globl abi_test_clobber_v11_upper

.align 4
abi_test_clobber_v11_upper:
AARCH64_VALID_CALL_TARGET
fmov v11.d[1], xzr
ret


.globl abi_test_clobber_v12_upper

.align 4
abi_test_clobber_v12_upper:
AARCH64_VALID_CALL_TARGET
fmov v12.d[1], xzr
ret


.globl abi_test_clobber_v13_upper

.align 4
abi_test_clobber_v13_upper:
AARCH64_VALID_CALL_TARGET
fmov v13.d[1], xzr
ret


.globl abi_test_clobber_v14_upper

.align 4
abi_test_clobber_v14_upper:
AARCH64_VALID_CALL_TARGET
fmov v14.d[1], xzr
ret


.globl abi_test_clobber_v15_upper

.align 4
abi_test_clobber_v15_upper:
AARCH64_VALID_CALL_TARGET
fmov v15.d[1], xzr
ret

#endif
#endif // !OPENSSL_NO_ASM