Source release 18.1.0
This commit is contained in:
@@ -8,8 +8,7 @@
|
||||
#define OPENSSL_NO_ASM
|
||||
#endif
|
||||
|
||||
#if !defined(OPENSSL_NO_ASM)
|
||||
#if defined(__aarch64__)
|
||||
#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) && defined(__ELF__)
|
||||
#if defined(BORINGSSL_PREFIX)
|
||||
#include <boringssl_prefix_symbols_asm.h>
|
||||
#endif
|
||||
@@ -635,7 +634,7 @@ aes_hw_ctr32_encrypt_blocks:
|
||||
//
|
||||
// [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
|
||||
// [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
|
||||
#ifndef __ARMEB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev w8, w8
|
||||
#endif
|
||||
add w10, w8, #1
|
||||
@@ -797,6 +796,8 @@ aes_hw_ctr32_encrypt_blocks:
|
||||
ret
|
||||
.size aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks
|
||||
#endif
|
||||
#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__ELF__)
|
||||
#if defined(__ELF__)
|
||||
// See https://www.airs.com/blog/archives/518.
|
||||
.section .note.GNU-stack,"",%progbits
|
||||
#endif
|
||||
#endif // !OPENSSL_NO_ASM
|
||||
.section .note.GNU-stack,"",%progbits
|
||||
1567
third_party/boringssl/kit/linux-aarch64/crypto/fipsmodule/aesv8-gcm-armv8-linux.S
vendored
Normal file
1567
third_party/boringssl/kit/linux-aarch64/crypto/fipsmodule/aesv8-gcm-armv8-linux.S
vendored
Normal file
File diff suppressed because it is too large
Load Diff
@@ -8,8 +8,7 @@
|
||||
#define OPENSSL_NO_ASM
|
||||
#endif
|
||||
|
||||
#if !defined(OPENSSL_NO_ASM)
|
||||
#if defined(__aarch64__)
|
||||
#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) && defined(__ELF__)
|
||||
#if defined(BORINGSSL_PREFIX)
|
||||
#include <boringssl_prefix_symbols_asm.h>
|
||||
#endif
|
||||
@@ -1431,6 +1430,8 @@ __bn_mul4x_mont:
|
||||
.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
|
||||
.align 2
|
||||
.align 4
|
||||
#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__ELF__)
|
||||
#if defined(__ELF__)
|
||||
// See https://www.airs.com/blog/archives/518.
|
||||
.section .note.GNU-stack,"",%progbits
|
||||
#endif
|
||||
#endif // !OPENSSL_NO_ASM
|
||||
.section .note.GNU-stack,"",%progbits
|
||||
101
third_party/boringssl/kit/linux-aarch64/crypto/fipsmodule/bn-armv8-linux.S
vendored
Normal file
101
third_party/boringssl/kit/linux-aarch64/crypto/fipsmodule/bn-armv8-linux.S
vendored
Normal file
@@ -0,0 +1,101 @@
|
||||
// This file is generated from a similarly-named Perl script in the BoringSSL
|
||||
// source tree. Do not edit by hand.
|
||||
|
||||
#if !defined(__has_feature)
|
||||
#define __has_feature(x) 0
|
||||
#endif
|
||||
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
|
||||
#define OPENSSL_NO_ASM
|
||||
#endif
|
||||
|
||||
#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) && defined(__ELF__)
|
||||
#if defined(BORINGSSL_PREFIX)
|
||||
#include <boringssl_prefix_symbols_asm.h>
|
||||
#endif
|
||||
#include <openssl/arm_arch.h>
|
||||
|
||||
.text
|
||||
|
||||
// BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
|
||||
// size_t num);
|
||||
//
// Multi-word addition with carry propagation: for each of the num 64-bit
// words, stores ap[i] + bp[i] + carry to rp[i] and returns the final carry
// (0 or 1).
//
// Registers as used below:
//   x0 = rp (post-incremented store pointer), then the return value
//   x1 = ap, x2 = bp (post-incremented load pointers)
//   x3 = num on entry, then num & 1 (is there an odd trailing word?)
//   x8 = num / 2 (count of two-word loop iterations)
//   x4-x7 = scratch words
// No stack is used and no callee-saved register is touched.
.type bn_add_words, %function
|
||||
.globl bn_add_words
|
||||
.hidden bn_add_words
|
||||
.align 4
|
||||
bn_add_words:
|
||||
AARCH64_VALID_CALL_TARGET
|
||||
# Clear the carry flag.
|
||||
cmn xzr, xzr
|
||||
|
||||
# aarch64 can load two registers at a time, so we do two loop iterations
|
||||
# at a time. Split x3 = 2 * x8 + x3. This allows loop
|
||||
# operations to use CBNZ without clobbering the carry flag.
|
||||
lsr x8, x3, #1
|
||||
and x3, x3, #1
|
||||
|
||||
// Fewer than two words: skip the paired loop.
cbz x8, .Ladd_tail
|
||||
.Ladd_loop:
|
||||
ldp x4, x5, [x1], #16
|
||||
ldp x6, x7, [x2], #16
|
||||
// Plain SUB (not SUBS) so the carry from the previous ADCS survives.
sub x8, x8, #1
|
||||
adcs x4, x4, x6
|
||||
adcs x5, x5, x7
|
||||
stp x4, x5, [x0], #16
|
||||
cbnz x8, .Ladd_loop
|
||||
|
||||
.Ladd_tail:
|
||||
// Handle the single leftover word when num is odd.
cbz x3, .Ladd_exit
|
||||
ldr x4, [x1], #8
|
||||
ldr x6, [x2], #8
|
||||
adcs x4, x4, x6
|
||||
str x4, [x0], #8
|
||||
|
||||
.Ladd_exit:
|
||||
// Return the carry out of the most significant word.
cset x0, cs
|
||||
ret
|
||||
.size bn_add_words,.-bn_add_words
|
||||
|
||||
// BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
|
||||
// size_t num);
|
||||
//
// Multi-word subtraction with borrow propagation: for each of the num 64-bit
// words, stores ap[i] - bp[i] - borrow to rp[i] and returns the final borrow
// (0 or 1).
//
// Registers as used below:
//   x0 = rp (post-incremented store pointer), then the return value
//   x1 = ap, x2 = bp (post-incremented load pointers)
//   x3 = num on entry, then num & 1 (is there an odd trailing word?)
//   x8 = num / 2 (count of two-word loop iterations)
//   x4-x7 = scratch words
// No stack is used and no callee-saved register is touched.
.type bn_sub_words, %function
|
||||
.globl bn_sub_words
|
||||
.hidden bn_sub_words
|
||||
.align 4
|
||||
bn_sub_words:
|
||||
AARCH64_VALID_CALL_TARGET
|
||||
# Set the carry flag. Arm's borrow bit is flipped from the carry flag,
|
||||
# so we want C = 1 here.
|
||||
cmp xzr, xzr
|
||||
|
||||
# aarch64 can load two registers at a time, so we do two loop iterations
|
||||
# at a time. Split x3 = 2 * x8 + x3. This allows loop
|
||||
# operations to use CBNZ without clobbering the carry flag.
|
||||
lsr x8, x3, #1
|
||||
and x3, x3, #1
|
||||
|
||||
// Fewer than two words: skip the paired loop.
cbz x8, .Lsub_tail
|
||||
.Lsub_loop:
|
||||
ldp x4, x5, [x1], #16
|
||||
ldp x6, x7, [x2], #16
|
||||
// Plain SUB (not SUBS) so the carry/borrow state from the previous SBCS
// survives.
sub x8, x8, #1
|
||||
sbcs x4, x4, x6
|
||||
sbcs x5, x5, x7
|
||||
stp x4, x5, [x0], #16
|
||||
cbnz x8, .Lsub_loop
|
||||
|
||||
.Lsub_tail:
|
||||
// Handle the single leftover word when num is odd.
cbz x3, .Lsub_exit
|
||||
ldr x4, [x1], #8
|
||||
ldr x6, [x2], #8
|
||||
sbcs x4, x4, x6
|
||||
str x4, [x0], #8
|
||||
|
||||
.Lsub_exit:
|
||||
// Carry clear means a borrow occurred, so return 1 in that case.
cset x0, cc
|
||||
ret
|
||||
.size bn_sub_words,.-bn_sub_words
|
||||
#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__ELF__)
|
||||
#if defined(__ELF__)
|
||||
// See https://www.airs.com/blog/archives/518.
|
||||
.section .note.GNU-stack,"",%progbits
|
||||
#endif
|
||||
@@ -8,8 +8,7 @@
|
||||
#define OPENSSL_NO_ASM
|
||||
#endif
|
||||
|
||||
#if !defined(OPENSSL_NO_ASM)
|
||||
#if defined(__aarch64__)
|
||||
#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) && defined(__ELF__)
|
||||
#if defined(BORINGSSL_PREFIX)
|
||||
#include <boringssl_prefix_symbols_asm.h>
|
||||
#endif
|
||||
@@ -341,6 +340,8 @@ gcm_ghash_neon:
|
||||
.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
|
||||
.align 2
|
||||
.align 2
|
||||
#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__ELF__)
|
||||
#if defined(__ELF__)
|
||||
// See https://www.airs.com/blog/archives/518.
|
||||
.section .note.GNU-stack,"",%progbits
|
||||
#endif
|
||||
#endif // !OPENSSL_NO_ASM
|
||||
.section .note.GNU-stack,"",%progbits
|
||||
@@ -8,8 +8,7 @@
|
||||
#define OPENSSL_NO_ASM
|
||||
#endif
|
||||
|
||||
#if !defined(OPENSSL_NO_ASM)
|
||||
#if defined(__aarch64__)
|
||||
#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) && defined(__ELF__)
|
||||
#if defined(BORINGSSL_PREFIX)
|
||||
#include <boringssl_prefix_symbols_asm.h>
|
||||
#endif
|
||||
@@ -120,7 +119,7 @@ gcm_gmult_v8:
|
||||
movi v19.16b,#0xe1
|
||||
ld1 {v20.2d,v21.2d},[x1] //load twisted H, ...
|
||||
shl v19.2d,v19.2d,#57
|
||||
#ifndef __ARMEB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev64 v17.16b,v17.16b
|
||||
#endif
|
||||
ext v3.16b,v17.16b,v17.16b,#8
|
||||
@@ -145,7 +144,7 @@ gcm_gmult_v8:
|
||||
eor v18.16b,v18.16b,v2.16b
|
||||
eor v0.16b,v0.16b,v18.16b
|
||||
|
||||
#ifndef __ARMEB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev64 v0.16b,v0.16b
|
||||
#endif
|
||||
ext v0.16b,v0.16b,v0.16b,#8
|
||||
@@ -184,14 +183,14 @@ gcm_ghash_v8:
|
||||
ext v0.16b,v0.16b,v0.16b,#8 //rotate Xi
|
||||
ld1 {v16.2d},[x2],#16 //load [rotated] I[0]
|
||||
shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant
|
||||
#ifndef __ARMEB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev64 v16.16b,v16.16b
|
||||
rev64 v0.16b,v0.16b
|
||||
#endif
|
||||
ext v3.16b,v16.16b,v16.16b,#8 //rotate I[0]
|
||||
b.lo .Lodd_tail_v8 //x3 was less than 32
|
||||
ld1 {v17.2d},[x2],x12 //load [rotated] I[1]
|
||||
#ifndef __ARMEB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev64 v17.16b,v17.16b
|
||||
#endif
|
||||
ext v7.16b,v17.16b,v17.16b,#8
|
||||
@@ -223,13 +222,13 @@ gcm_ghash_v8:
|
||||
eor v18.16b,v0.16b,v2.16b
|
||||
eor v1.16b,v1.16b,v17.16b
|
||||
ld1 {v17.2d},[x2],x12 //load [rotated] I[i+3]
|
||||
#ifndef __ARMEB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev64 v16.16b,v16.16b
|
||||
#endif
|
||||
eor v1.16b,v1.16b,v18.16b
|
||||
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
|
||||
|
||||
#ifndef __ARMEB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev64 v17.16b,v17.16b
|
||||
#endif
|
||||
ins v2.d[0],v1.d[1]
|
||||
@@ -279,7 +278,7 @@ gcm_ghash_v8:
|
||||
eor v0.16b,v0.16b,v18.16b
|
||||
|
||||
.Ldone_v8:
|
||||
#ifndef __ARMEB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev64 v0.16b,v0.16b
|
||||
#endif
|
||||
ext v0.16b,v0.16b,v0.16b,#8
|
||||
@@ -298,7 +297,7 @@ gcm_ghash_v8_4x:
|
||||
shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant
|
||||
|
||||
ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
|
||||
#ifndef __ARMEB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev64 v0.16b,v0.16b
|
||||
rev64 v5.16b,v5.16b
|
||||
rev64 v6.16b,v6.16b
|
||||
@@ -342,7 +341,7 @@ gcm_ghash_v8_4x:
|
||||
eor v16.16b,v4.16b,v0.16b
|
||||
ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
|
||||
ext v3.16b,v16.16b,v16.16b,#8
|
||||
#ifndef __ARMEB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev64 v5.16b,v5.16b
|
||||
rev64 v6.16b,v6.16b
|
||||
rev64 v7.16b,v7.16b
|
||||
@@ -425,7 +424,7 @@ gcm_ghash_v8_4x:
|
||||
eor v1.16b,v1.16b,v17.16b
|
||||
ld1 {v4.2d,v5.2d,v6.2d},[x2]
|
||||
eor v1.16b,v1.16b,v18.16b
|
||||
#ifndef __ARMEB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev64 v5.16b,v5.16b
|
||||
rev64 v6.16b,v6.16b
|
||||
rev64 v4.16b,v4.16b
|
||||
@@ -477,7 +476,7 @@ gcm_ghash_v8_4x:
|
||||
eor v1.16b,v1.16b,v17.16b
|
||||
ld1 {v4.2d,v5.2d},[x2]
|
||||
eor v1.16b,v1.16b,v18.16b
|
||||
#ifndef __ARMEB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev64 v5.16b,v5.16b
|
||||
rev64 v4.16b,v4.16b
|
||||
#endif
|
||||
@@ -520,7 +519,7 @@ gcm_ghash_v8_4x:
|
||||
eor v1.16b,v1.16b,v17.16b
|
||||
ld1 {v4.2d},[x2]
|
||||
eor v1.16b,v1.16b,v18.16b
|
||||
#ifndef __ARMEB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev64 v4.16b,v4.16b
|
||||
#endif
|
||||
|
||||
@@ -560,7 +559,7 @@ gcm_ghash_v8_4x:
|
||||
eor v0.16b,v0.16b,v18.16b
|
||||
ext v0.16b,v0.16b,v0.16b,#8
|
||||
|
||||
#ifndef __ARMEB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev64 v0.16b,v0.16b
|
||||
#endif
|
||||
st1 {v0.2d},[x0] //write out Xi
|
||||
@@ -571,6 +570,8 @@ gcm_ghash_v8_4x:
|
||||
.align 2
|
||||
.align 2
|
||||
#endif
|
||||
#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__ELF__)
|
||||
#if defined(__ELF__)
|
||||
// See https://www.airs.com/blog/archives/518.
|
||||
.section .note.GNU-stack,"",%progbits
|
||||
#endif
|
||||
#endif // !OPENSSL_NO_ASM
|
||||
.section .note.GNU-stack,"",%progbits
|
||||
1738
third_party/boringssl/kit/linux-aarch64/crypto/fipsmodule/p256-armv8-asm-linux.S
vendored
Normal file
1738
third_party/boringssl/kit/linux-aarch64/crypto/fipsmodule/p256-armv8-asm-linux.S
vendored
Normal file
File diff suppressed because it is too large
Load Diff
321
third_party/boringssl/kit/linux-aarch64/crypto/fipsmodule/p256_beeu-armv8-asm-linux.S
vendored
Normal file
321
third_party/boringssl/kit/linux-aarch64/crypto/fipsmodule/p256_beeu-armv8-asm-linux.S
vendored
Normal file
@@ -0,0 +1,321 @@
|
||||
// This file is generated from a similarly-named Perl script in the BoringSSL
|
||||
// source tree. Do not edit by hand.
|
||||
|
||||
#if !defined(__has_feature)
|
||||
#define __has_feature(x) 0
|
||||
#endif
|
||||
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
|
||||
#define OPENSSL_NO_ASM
|
||||
#endif
|
||||
|
||||
#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) && defined(__ELF__)
|
||||
#if defined(BORINGSSL_PREFIX)
|
||||
#include <boringssl_prefix_symbols_asm.h>
|
||||
#endif
|
||||
#include "openssl/arm_arch.h"
|
||||
|
||||
.text
|
||||
// beeu_mod_inverse_vartime: Euclid-style modular inversion on 256-bit values
// held as four 64-bit words (see the loop invariants documented in-line).
//
// Register interface as used by this code:
//   x0 = out: pointer that receives the 4-word result on success
//   x1 = a:   pointer to the 4-word input, loaded into B (x25..x28)
//   x2 = n:   pointer to the 4-word modulus, loaded into x0,x1,x2,x30 and
//             copied into A (x21..x24)
// Returns x0 = 1 on success, or x0 = 0 when the loop finishes with A != 1;
// on failure nothing is stored through out.
//
// Working registers: X = x3..x7, Y = x8..x12, temporaries x13..x15, x19, x20.
// x19-x28, x29 and x30 are saved in a 112-byte frame and restored on exit.
// Variable time: the loop and shift iteration counts depend on operand values.
.globl beeu_mod_inverse_vartime
|
||||
.hidden beeu_mod_inverse_vartime
|
||||
.type beeu_mod_inverse_vartime, %function
|
||||
.align 4
|
||||
beeu_mod_inverse_vartime:
|
||||
// Reserve enough space for 14 8-byte registers on the stack
|
||||
// in the first stp call for x29, x30.
|
||||
// Then store the remaining callee-saved registers.
|
||||
//
|
||||
// | x29 | x30 | x19 | x20 | ... | x27 | x28 | x0 | x2 |
|
||||
// ^ ^
|
||||
// sp <------------------- 112 bytes ----------------> old sp
|
||||
// x29 (FP)
|
||||
//
|
||||
AARCH64_SIGN_LINK_REGISTER
|
||||
stp x29,x30,[sp,#-112]!
|
||||
add x29,sp,#0
|
||||
stp x19,x20,[sp,#16]
|
||||
stp x21,x22,[sp,#32]
|
||||
stp x23,x24,[sp,#48]
|
||||
stp x25,x26,[sp,#64]
|
||||
stp x27,x28,[sp,#80]
|
||||
// out (x0) is reloaded from [sp,#96] before the final store; the saved
// copy of x2 is not read back anywhere in this function.
stp x0,x2,[sp,#96]
|
||||
|
||||
// B = b3..b0 := a
|
||||
ldp x25,x26,[x1]
|
||||
ldp x27,x28,[x1,#16]
|
||||
|
||||
// n3..n0 := n
|
||||
// Note: the value of input params are changed in the following.
|
||||
ldp x0,x1,[x2]
|
||||
// x30 (the link register) is reused as n3 from here on; the return address
// was saved in the frame above and is reloaded in the epilogue.
ldp x2,x30,[x2,#16]
|
||||
|
||||
// A = a3..a0 := n
|
||||
mov x21, x0
|
||||
mov x22, x1
|
||||
mov x23, x2
|
||||
mov x24, x30
|
||||
|
||||
// X = x4..x0 := 1
|
||||
mov x3, #1
|
||||
eor x4, x4, x4
|
||||
eor x5, x5, x5
|
||||
eor x6, x6, x6
|
||||
eor x7, x7, x7
|
||||
|
||||
// Y = y4..y0 := 0
|
||||
eor x8, x8, x8
|
||||
eor x9, x9, x9
|
||||
eor x10, x10, x10
|
||||
eor x11, x11, x11
|
||||
eor x12, x12, x12
|
||||
|
||||
.Lbeeu_loop:
|
||||
// if B == 0, jump to .Lbeeu_loop_end
|
||||
orr x14, x25, x26
|
||||
orr x14, x14, x27
|
||||
|
||||
// reverse the bit order of x25. This is needed for clz after this macro
|
||||
rbit x15, x25
|
||||
|
||||
orr x14, x14, x28
|
||||
cbz x14,.Lbeeu_loop_end
|
||||
|
||||
|
||||
// 0 < B < |n|,
|
||||
// 0 < A <= |n|,
|
||||
// (1) X*a == B (mod |n|),
|
||||
// (2) (-1)*Y*a == A (mod |n|)
|
||||
|
||||
// Now divide B by the maximum possible power of two in the
|
||||
// integers, and divide X by the same value mod |n|.
|
||||
// When we're done, (1) still holds.
|
||||
|
||||
// shift := number of trailing 0s in x25
|
||||
// ( = number of leading 0s in x15; see the "rbit" instruction in TEST_B_ZERO)
|
||||
clz x13, x15
|
||||
|
||||
// If there is no shift, goto shift_A_Y
|
||||
cbz x13, .Lbeeu_shift_A_Y
|
||||
|
||||
// Shift B right by "x13" bits
|
||||
// x14 = -shift. Register-specified shifts use the amount modulo 64, so
// "lsl ..., x14" shifts left by 64 - shift to carry bits down from the
// next-higher word.
neg x14, x13
|
||||
lsr x25, x25, x13
|
||||
lsl x15, x26, x14
|
||||
|
||||
lsr x26, x26, x13
|
||||
lsl x19, x27, x14
|
||||
|
||||
orr x25, x25, x15
|
||||
|
||||
lsr x27, x27, x13
|
||||
lsl x20, x28, x14
|
||||
|
||||
orr x26, x26, x19
|
||||
|
||||
lsr x28, x28, x13
|
||||
|
||||
orr x27, x27, x20
|
||||
|
||||
|
||||
// Shift X right by "x13" bits, adding n whenever X becomes odd.
|
||||
// x13--;
|
||||
// x14 := 0; needed in the addition to the most significant word in SHIFT1
|
||||
eor x14, x14, x14
|
||||
.Lbeeu_shift_loop_X:
|
||||
tbz x3, #0, .Lshift1_0
|
||||
adds x3, x3, x0
|
||||
adcs x4, x4, x1
|
||||
adcs x5, x5, x2
|
||||
adcs x6, x6, x30
|
||||
adc x7, x7, x14
|
||||
.Lshift1_0:
|
||||
// var0 := [var1|var0]<64..1>;
|
||||
// i.e. concatenate var1 and var0,
|
||||
// extract bits <64..1> from the resulting 128-bit value
|
||||
// and put them in var0
|
||||
extr x3, x4, x3, #1
|
||||
extr x4, x5, x4, #1
|
||||
extr x5, x6, x5, #1
|
||||
extr x6, x7, x6, #1
|
||||
lsr x7, x7, #1
|
||||
|
||||
subs x13, x13, #1
|
||||
bne .Lbeeu_shift_loop_X
|
||||
|
||||
// Note: the steps above perform the same sequence as in p256_beeu-x86_64-asm.pl
|
||||
// with the following differences:
|
||||
// - "x13" is set directly to the number of trailing 0s in B
|
||||
// (using rbit and clz instructions)
|
||||
// - The loop is only used to call SHIFT1(X)
|
||||
// and x13 is decreased while executing the X loop.
|
||||
// - SHIFT256(B, x13) is performed before right-shifting X; they are independent
|
||||
|
||||
.Lbeeu_shift_A_Y:
|
||||
// Same for A and Y.
|
||||
// Afterwards, (2) still holds.
|
||||
// Reverse the bit order of x21
|
||||
// x13 := number of trailing 0s in x21 (= number of leading 0s in x15)
|
||||
rbit x15, x21
|
||||
clz x13, x15
|
||||
|
||||
// If there is no shift, goto |B-A|, X+Y update
|
||||
cbz x13, .Lbeeu_update_B_X_or_A_Y
|
||||
|
||||
// Shift A right by "x13" bits
|
||||
// Same negated-shift-count technique as for B above.
neg x14, x13
|
||||
lsr x21, x21, x13
|
||||
lsl x15, x22, x14
|
||||
|
||||
lsr x22, x22, x13
|
||||
lsl x19, x23, x14
|
||||
|
||||
orr x21, x21, x15
|
||||
|
||||
lsr x23, x23, x13
|
||||
lsl x20, x24, x14
|
||||
|
||||
orr x22, x22, x19
|
||||
|
||||
lsr x24, x24, x13
|
||||
|
||||
orr x23, x23, x20
|
||||
|
||||
|
||||
// Shift Y right by "x13" bits, adding n whenever Y becomes odd.
|
||||
// x13--;
|
||||
// x14 := 0; needed in the addition to the most significant word in SHIFT1
|
||||
eor x14, x14, x14
|
||||
.Lbeeu_shift_loop_Y:
|
||||
tbz x8, #0, .Lshift1_1
|
||||
adds x8, x8, x0
|
||||
adcs x9, x9, x1
|
||||
adcs x10, x10, x2
|
||||
adcs x11, x11, x30
|
||||
adc x12, x12, x14
|
||||
.Lshift1_1:
|
||||
// var0 := [var1|var0]<64..1>;
|
||||
// i.e. concatenate var1 and var0,
|
||||
// extract bits <64..1> from the resulting 128-bit value
|
||||
// and put them in var0
|
||||
extr x8, x9, x8, #1
|
||||
extr x9, x10, x9, #1
|
||||
extr x10, x11, x10, #1
|
||||
extr x11, x12, x11, #1
|
||||
lsr x12, x12, #1
|
||||
|
||||
subs x13, x13, #1
|
||||
bne .Lbeeu_shift_loop_Y
|
||||
|
||||
.Lbeeu_update_B_X_or_A_Y:
|
||||
// Try T := B - A; if cs, continue with B > A (cs: carry set = no borrow)
|
||||
// Note: this is a case of unsigned arithmetic, where T fits in 4 64-bit words
|
||||
// without taking a sign bit if generated. The lack of a carry would
|
||||
// indicate a negative result. See, for example,
|
||||
// https://community.arm.com/developer/ip-products/processors/b/processors-ip-blog/posts/condition-codes-1-condition-flags-and-codes
|
||||
subs x14, x25, x21
|
||||
sbcs x15, x26, x22
|
||||
sbcs x19, x27, x23
|
||||
sbcs x20, x28, x24
|
||||
bcs .Lbeeu_B_greater_than_A
|
||||
|
||||
// Else A > B =>
|
||||
// A := A - B; Y := Y + X; goto beginning of the loop
|
||||
subs x21, x21, x25
|
||||
sbcs x22, x22, x26
|
||||
sbcs x23, x23, x27
|
||||
sbcs x24, x24, x28
|
||||
|
||||
adds x8, x8, x3
|
||||
adcs x9, x9, x4
|
||||
adcs x10, x10, x5
|
||||
adcs x11, x11, x6
|
||||
adc x12, x12, x7
|
||||
b .Lbeeu_loop
|
||||
|
||||
.Lbeeu_B_greater_than_A:
|
||||
// Continue with B > A =>
|
||||
// B := B - A; X := X + Y; goto beginning of the loop
|
||||
// The difference T was already computed into x14,x15,x19,x20 above.
mov x25, x14
|
||||
mov x26, x15
|
||||
mov x27, x19
|
||||
mov x28, x20
|
||||
|
||||
adds x3, x3, x8
|
||||
adcs x4, x4, x9
|
||||
adcs x5, x5, x10
|
||||
adcs x6, x6, x11
|
||||
adc x7, x7, x12
|
||||
b .Lbeeu_loop
|
||||
|
||||
.Lbeeu_loop_end:
|
||||
// The Euclid's algorithm loop ends when A == gcd(a,n);
|
||||
// this would be 1, when a and n are co-prime (i.e. do not have a common factor).
|
||||
// Since (-1)*Y*a == A (mod |n|), Y>0
|
||||
// then out = -Y mod n
|
||||
|
||||
// Verify that A = 1 ==> (-1)*Y*a = A = 1 (mod |n|)
|
||||
// Is A-1 == 0?
|
||||
// If not, fail.
|
||||
sub x14, x21, #1
|
||||
orr x14, x14, x22
|
||||
orr x14, x14, x23
|
||||
orr x14, x14, x24
|
||||
cbnz x14, .Lbeeu_err
|
||||
|
||||
// If Y>n ==> Y:=Y-n
|
||||
.Lbeeu_reduction_loop:
|
||||
// x_i := y_i - n_i (X is no longer needed, use it as temp)
|
||||
// (x14 = 0 from above)
|
||||
subs x3, x8, x0
|
||||
sbcs x4, x9, x1
|
||||
sbcs x5, x10, x2
|
||||
sbcs x6, x11, x30
|
||||
sbcs x7, x12, x14
|
||||
|
||||
// If result is non-negative (i.e., cs = carry set = no borrow),
|
||||
// y_i := x_i; goto reduce again
|
||||
// else
|
||||
// y_i := y_i; continue
|
||||
csel x8, x3, x8, cs
|
||||
csel x9, x4, x9, cs
|
||||
csel x10, x5, x10, cs
|
||||
csel x11, x6, x11, cs
|
||||
csel x12, x7, x12, cs
|
||||
bcs .Lbeeu_reduction_loop
|
||||
|
||||
// Now Y < n (Y cannot be equal to n, since the inverse cannot be 0)
|
||||
// out = -Y = n-Y
|
||||
subs x8, x0, x8
|
||||
sbcs x9, x1, x9
|
||||
sbcs x10, x2, x10
|
||||
sbcs x11, x30, x11
|
||||
|
||||
// Save Y in output (out (x0) was saved on the stack)
|
||||
ldr x3, [sp,#96]
|
||||
stp x8, x9, [x3]
|
||||
stp x10, x11, [x3,#16]
|
||||
// return 1 (success)
|
||||
mov x0, #1
|
||||
b .Lbeeu_finish
|
||||
|
||||
.Lbeeu_err:
|
||||
// return 0 (error)
|
||||
eor x0, x0, x0
|
||||
|
||||
.Lbeeu_finish:
|
||||
// Restore callee-saved registers, except x0, x2
|
||||
add sp,x29,#0
|
||||
ldp x19,x20,[sp,#16]
|
||||
ldp x21,x22,[sp,#32]
|
||||
ldp x23,x24,[sp,#48]
|
||||
ldp x25,x26,[sp,#64]
|
||||
ldp x27,x28,[sp,#80]
|
||||
// Reload the frame pointer and the real return address (x30 held n3 until
// now) and release the 112-byte frame.
ldp x29,x30,[sp],#112
|
||||
|
||||
AARCH64_VALIDATE_LINK_REGISTER
|
||||
ret
|
||||
.size beeu_mod_inverse_vartime,.-beeu_mod_inverse_vartime
|
||||
#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__ELF__)
|
||||
#if defined(__ELF__)
|
||||
// See https://www.airs.com/blog/archives/518.
|
||||
.section .note.GNU-stack,"",%progbits
|
||||
#endif
|
||||
@@ -8,8 +8,7 @@
|
||||
#define OPENSSL_NO_ASM
|
||||
#endif
|
||||
|
||||
#if !defined(OPENSSL_NO_ASM)
|
||||
#if defined(__aarch64__)
|
||||
#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) && defined(__ELF__)
|
||||
#if defined(BORINGSSL_PREFIX)
|
||||
#include <boringssl_prefix_symbols_asm.h>
|
||||
#endif
|
||||
@@ -52,7 +51,7 @@ sha1_block_data_order:
|
||||
movz w28,#0x7999
|
||||
sub x2,x2,#1
|
||||
movk w28,#0x5a82,lsl#16
|
||||
#ifdef __ARMEB__
|
||||
#ifdef __AARCH64EB__
|
||||
ror x3,x3,#32
|
||||
#else
|
||||
rev32 x3,x3
|
||||
@@ -70,7 +69,7 @@ sha1_block_data_order:
|
||||
ror w21,w21,#2
|
||||
add w23,w23,w4 // future e+=X[i]
|
||||
add w24,w24,w25 // e+=F(b,c,d)
|
||||
#ifdef __ARMEB__
|
||||
#ifdef __AARCH64EB__
|
||||
ror x5,x5,#32
|
||||
#else
|
||||
rev32 x5,x5
|
||||
@@ -95,7 +94,7 @@ sha1_block_data_order:
|
||||
ror w24,w24,#2
|
||||
add w21,w21,w6 // future e+=X[i]
|
||||
add w22,w22,w25 // e+=F(b,c,d)
|
||||
#ifdef __ARMEB__
|
||||
#ifdef __AARCH64EB__
|
||||
ror x7,x7,#32
|
||||
#else
|
||||
rev32 x7,x7
|
||||
@@ -120,7 +119,7 @@ sha1_block_data_order:
|
||||
ror w22,w22,#2
|
||||
add w24,w24,w8 // future e+=X[i]
|
||||
add w20,w20,w25 // e+=F(b,c,d)
|
||||
#ifdef __ARMEB__
|
||||
#ifdef __AARCH64EB__
|
||||
ror x9,x9,#32
|
||||
#else
|
||||
rev32 x9,x9
|
||||
@@ -145,7 +144,7 @@ sha1_block_data_order:
|
||||
ror w20,w20,#2
|
||||
add w22,w22,w10 // future e+=X[i]
|
||||
add w23,w23,w25 // e+=F(b,c,d)
|
||||
#ifdef __ARMEB__
|
||||
#ifdef __AARCH64EB__
|
||||
ror x11,x11,#32
|
||||
#else
|
||||
rev32 x11,x11
|
||||
@@ -170,7 +169,7 @@ sha1_block_data_order:
|
||||
ror w23,w23,#2
|
||||
add w20,w20,w12 // future e+=X[i]
|
||||
add w21,w21,w25 // e+=F(b,c,d)
|
||||
#ifdef __ARMEB__
|
||||
#ifdef __AARCH64EB__
|
||||
ror x13,x13,#32
|
||||
#else
|
||||
rev32 x13,x13
|
||||
@@ -195,7 +194,7 @@ sha1_block_data_order:
|
||||
ror w21,w21,#2
|
||||
add w23,w23,w14 // future e+=X[i]
|
||||
add w24,w24,w25 // e+=F(b,c,d)
|
||||
#ifdef __ARMEB__
|
||||
#ifdef __AARCH64EB__
|
||||
ror x15,x15,#32
|
||||
#else
|
||||
rev32 x15,x15
|
||||
@@ -220,7 +219,7 @@ sha1_block_data_order:
|
||||
ror w24,w24,#2
|
||||
add w21,w21,w16 // future e+=X[i]
|
||||
add w22,w22,w25 // e+=F(b,c,d)
|
||||
#ifdef __ARMEB__
|
||||
#ifdef __AARCH64EB__
|
||||
ror x17,x17,#32
|
||||
#else
|
||||
rev32 x17,x17
|
||||
@@ -1233,6 +1232,8 @@ sha1_block_armv8:
|
||||
.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
|
||||
.align 2
|
||||
.align 2
|
||||
#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__ELF__)
|
||||
#if defined(__ELF__)
|
||||
// See https://www.airs.com/blog/archives/518.
|
||||
.section .note.GNU-stack,"",%progbits
|
||||
#endif
|
||||
#endif // !OPENSSL_NO_ASM
|
||||
.section .note.GNU-stack,"",%progbits
|
||||
@@ -8,12 +8,11 @@
|
||||
#define OPENSSL_NO_ASM
|
||||
#endif
|
||||
|
||||
#if !defined(OPENSSL_NO_ASM)
|
||||
#if defined(__aarch64__)
|
||||
#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) && defined(__ELF__)
|
||||
#if defined(BORINGSSL_PREFIX)
|
||||
#include <boringssl_prefix_symbols_asm.h>
|
||||
#endif
|
||||
// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
|
||||
// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the OpenSSL license (the "License"). You may not use
|
||||
// this file except in compliance with the License. You can obtain a copy
|
||||
@@ -41,6 +40,7 @@
|
||||
// Denver 2.01 10.5 (+26%) 6.70 (+8%)
|
||||
// X-Gene 20.0 (+100%) 12.8 (+300%(***))
|
||||
// Mongoose 2.36 13.0 (+50%) 8.36 (+33%)
|
||||
// Kryo 1.92 17.4 (+30%) 11.2 (+8%)
|
||||
//
|
||||
// (*) Software SHA256 results are of lesser relevance, presented
|
||||
// mostly for informational purposes.
|
||||
@@ -49,7 +49,7 @@
|
||||
// on Cortex-A53 (or by 4 cycles per round).
|
||||
// (***) Super-impressive coefficients over gcc-generated code are
|
||||
// indication of some compiler "pathology", most notably code
|
||||
// generated with -mgeneral-regs-only is significanty faster
|
||||
// generated with -mgeneral-regs-only is significantly faster
|
||||
// and the gap is only 40-90%.
|
||||
|
||||
#ifndef __KERNEL__
|
||||
@@ -101,7 +101,7 @@ sha256_block_data_order:
|
||||
ldr w19,[x30],#4 // *K++
|
||||
eor w28,w21,w22 // magic seed
|
||||
str x1,[x29,#112]
|
||||
#ifndef __ARMEB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev w3,w3 // 0
|
||||
#endif
|
||||
ror w16,w24,#6
|
||||
@@ -124,7 +124,7 @@ sha256_block_data_order:
|
||||
add w27,w27,w28 // h+=Maj(a,b,c)
|
||||
ldr w28,[x30],#4 // *K++, w19 in next round
|
||||
//add w27,w27,w17 // h+=Sigma0(a)
|
||||
#ifndef __ARMEB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev w4,w4 // 1
|
||||
#endif
|
||||
ldp w5,w6,[x1],#2*4
|
||||
@@ -149,7 +149,7 @@ sha256_block_data_order:
|
||||
add w26,w26,w19 // h+=Maj(a,b,c)
|
||||
ldr w19,[x30],#4 // *K++, w28 in next round
|
||||
//add w26,w26,w17 // h+=Sigma0(a)
|
||||
#ifndef __ARMEB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev w5,w5 // 2
|
||||
#endif
|
||||
add w26,w26,w17 // h+=Sigma0(a)
|
||||
@@ -173,7 +173,7 @@ sha256_block_data_order:
|
||||
add w25,w25,w28 // h+=Maj(a,b,c)
|
||||
ldr w28,[x30],#4 // *K++, w19 in next round
|
||||
//add w25,w25,w17 // h+=Sigma0(a)
|
||||
#ifndef __ARMEB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev w6,w6 // 3
|
||||
#endif
|
||||
ldp w7,w8,[x1],#2*4
|
||||
@@ -198,7 +198,7 @@ sha256_block_data_order:
|
||||
add w24,w24,w19 // h+=Maj(a,b,c)
|
||||
ldr w19,[x30],#4 // *K++, w28 in next round
|
||||
//add w24,w24,w17 // h+=Sigma0(a)
|
||||
#ifndef __ARMEB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev w7,w7 // 4
|
||||
#endif
|
||||
add w24,w24,w17 // h+=Sigma0(a)
|
||||
@@ -222,7 +222,7 @@ sha256_block_data_order:
|
||||
add w23,w23,w28 // h+=Maj(a,b,c)
|
||||
ldr w28,[x30],#4 // *K++, w19 in next round
|
||||
//add w23,w23,w17 // h+=Sigma0(a)
|
||||
#ifndef __ARMEB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev w8,w8 // 5
|
||||
#endif
|
||||
ldp w9,w10,[x1],#2*4
|
||||
@@ -247,7 +247,7 @@ sha256_block_data_order:
|
||||
add w22,w22,w19 // h+=Maj(a,b,c)
|
||||
ldr w19,[x30],#4 // *K++, w28 in next round
|
||||
//add w22,w22,w17 // h+=Sigma0(a)
|
||||
#ifndef __ARMEB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev w9,w9 // 6
|
||||
#endif
|
||||
add w22,w22,w17 // h+=Sigma0(a)
|
||||
@@ -271,7 +271,7 @@ sha256_block_data_order:
|
||||
add w21,w21,w28 // h+=Maj(a,b,c)
|
||||
ldr w28,[x30],#4 // *K++, w19 in next round
|
||||
//add w21,w21,w17 // h+=Sigma0(a)
|
||||
#ifndef __ARMEB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev w10,w10 // 7
|
||||
#endif
|
||||
ldp w11,w12,[x1],#2*4
|
||||
@@ -296,7 +296,7 @@ sha256_block_data_order:
|
||||
add w20,w20,w19 // h+=Maj(a,b,c)
|
||||
ldr w19,[x30],#4 // *K++, w28 in next round
|
||||
//add w20,w20,w17 // h+=Sigma0(a)
|
||||
#ifndef __ARMEB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev w11,w11 // 8
|
||||
#endif
|
||||
add w20,w20,w17 // h+=Sigma0(a)
|
||||
@@ -320,7 +320,7 @@ sha256_block_data_order:
|
||||
add w27,w27,w28 // h+=Maj(a,b,c)
|
||||
ldr w28,[x30],#4 // *K++, w19 in next round
|
||||
//add w27,w27,w17 // h+=Sigma0(a)
|
||||
#ifndef __ARMEB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev w12,w12 // 9
|
||||
#endif
|
||||
ldp w13,w14,[x1],#2*4
|
||||
@@ -345,7 +345,7 @@ sha256_block_data_order:
|
||||
add w26,w26,w19 // h+=Maj(a,b,c)
|
||||
ldr w19,[x30],#4 // *K++, w28 in next round
|
||||
//add w26,w26,w17 // h+=Sigma0(a)
|
||||
#ifndef __ARMEB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev w13,w13 // 10
|
||||
#endif
|
||||
add w26,w26,w17 // h+=Sigma0(a)
|
||||
@@ -369,7 +369,7 @@ sha256_block_data_order:
|
||||
add w25,w25,w28 // h+=Maj(a,b,c)
|
||||
ldr w28,[x30],#4 // *K++, w19 in next round
|
||||
//add w25,w25,w17 // h+=Sigma0(a)
|
||||
#ifndef __ARMEB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev w14,w14 // 11
|
||||
#endif
|
||||
ldp w15,w0,[x1],#2*4
|
||||
@@ -395,7 +395,7 @@ sha256_block_data_order:
|
||||
add w24,w24,w19 // h+=Maj(a,b,c)
|
||||
ldr w19,[x30],#4 // *K++, w28 in next round
|
||||
//add w24,w24,w17 // h+=Sigma0(a)
|
||||
#ifndef __ARMEB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev w15,w15 // 12
|
||||
#endif
|
||||
add w24,w24,w17 // h+=Sigma0(a)
|
||||
@@ -420,7 +420,7 @@ sha256_block_data_order:
|
||||
add w23,w23,w28 // h+=Maj(a,b,c)
|
||||
ldr w28,[x30],#4 // *K++, w19 in next round
|
||||
//add w23,w23,w17 // h+=Sigma0(a)
|
||||
#ifndef __ARMEB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev w0,w0 // 13
|
||||
#endif
|
||||
ldp w1,w2,[x1]
|
||||
@@ -446,7 +446,7 @@ sha256_block_data_order:
|
||||
add w22,w22,w19 // h+=Maj(a,b,c)
|
||||
ldr w19,[x30],#4 // *K++, w28 in next round
|
||||
//add w22,w22,w17 // h+=Sigma0(a)
|
||||
#ifndef __ARMEB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev w1,w1 // 14
|
||||
#endif
|
||||
ldr w6,[sp,#12]
|
||||
@@ -472,7 +472,7 @@ sha256_block_data_order:
|
||||
add w21,w21,w28 // h+=Maj(a,b,c)
|
||||
ldr w28,[x30],#4 // *K++, w19 in next round
|
||||
//add w21,w21,w17 // h+=Sigma0(a)
|
||||
#ifndef __ARMEB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev w2,w2 // 15
|
||||
#endif
|
||||
ldr w7,[sp,#0]
|
||||
@@ -1209,6 +1209,8 @@ sha256_block_armv8:
|
||||
ret
|
||||
.size sha256_block_armv8,.-sha256_block_armv8
|
||||
#endif
|
||||
#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__ELF__)
|
||||
#if defined(__ELF__)
|
||||
// See https://www.airs.com/blog/archives/518.
|
||||
.section .note.GNU-stack,"",%progbits
|
||||
#endif
|
||||
#endif // !OPENSSL_NO_ASM
|
||||
.section .note.GNU-stack,"",%progbits
|
||||
@@ -8,12 +8,11 @@
|
||||
#define OPENSSL_NO_ASM
|
||||
#endif
|
||||
|
||||
#if !defined(OPENSSL_NO_ASM)
|
||||
#if defined(__aarch64__)
|
||||
#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) && defined(__ELF__)
|
||||
#if defined(BORINGSSL_PREFIX)
|
||||
#include <boringssl_prefix_symbols_asm.h>
|
||||
#endif
|
||||
// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
|
||||
// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the OpenSSL license (the "License"). You may not use
|
||||
// this file except in compliance with the License. You can obtain a copy
|
||||
@@ -41,6 +40,7 @@
|
||||
// Denver 2.01 10.5 (+26%) 6.70 (+8%)
|
||||
// X-Gene 20.0 (+100%) 12.8 (+300%(***))
|
||||
// Mongoose 2.36 13.0 (+50%) 8.36 (+33%)
|
||||
// Kryo 1.92 17.4 (+30%) 11.2 (+8%)
|
||||
//
|
||||
// (*) Software SHA256 results are of lesser relevance, presented
|
||||
// mostly for informational purposes.
|
||||
@@ -49,7 +49,7 @@
|
||||
// on Cortex-A53 (or by 4 cycles per round).
|
||||
// (***) Super-impressive coefficients over gcc-generated code are
|
||||
// indication of some compiler "pathology", most notably code
|
||||
// generated with -mgeneral-regs-only is significanty faster
|
||||
// generated with -mgeneral-regs-only is significantly faster
|
||||
// and the gap is only 40-90%.
|
||||
|
||||
#ifndef __KERNEL__
|
||||
@@ -65,6 +65,17 @@
|
||||
.type sha512_block_data_order,%function
|
||||
.align 6
|
||||
sha512_block_data_order:
|
||||
AARCH64_VALID_CALL_TARGET
|
||||
#ifndef __KERNEL__
|
||||
#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
|
||||
adrp x16,:pg_hi21_nc:OPENSSL_armcap_P
|
||||
#else
|
||||
adrp x16,OPENSSL_armcap_P
|
||||
#endif
|
||||
ldr w16,[x16,:lo12:OPENSSL_armcap_P]
|
||||
tst w16,#ARMV8_SHA512
|
||||
b.ne .Lv8_entry
|
||||
#endif
|
||||
AARCH64_SIGN_LINK_REGISTER
|
||||
stp x29,x30,[sp,#-128]!
|
||||
add x29,sp,#0
|
||||
@@ -90,7 +101,7 @@ sha512_block_data_order:
|
||||
ldr x19,[x30],#8 // *K++
|
||||
eor x28,x21,x22 // magic seed
|
||||
str x1,[x29,#112]
|
||||
#ifndef __ARMEB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev x3,x3 // 0
|
||||
#endif
|
||||
ror x16,x24,#14
|
||||
@@ -113,7 +124,7 @@ sha512_block_data_order:
|
||||
add x27,x27,x28 // h+=Maj(a,b,c)
|
||||
ldr x28,[x30],#8 // *K++, x19 in next round
|
||||
//add x27,x27,x17 // h+=Sigma0(a)
|
||||
#ifndef __ARMEB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev x4,x4 // 1
|
||||
#endif
|
||||
ldp x5,x6,[x1],#2*8
|
||||
@@ -138,7 +149,7 @@ sha512_block_data_order:
|
||||
add x26,x26,x19 // h+=Maj(a,b,c)
|
||||
ldr x19,[x30],#8 // *K++, x28 in next round
|
||||
//add x26,x26,x17 // h+=Sigma0(a)
|
||||
#ifndef __ARMEB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev x5,x5 // 2
|
||||
#endif
|
||||
add x26,x26,x17 // h+=Sigma0(a)
|
||||
@@ -162,7 +173,7 @@ sha512_block_data_order:
|
||||
add x25,x25,x28 // h+=Maj(a,b,c)
|
||||
ldr x28,[x30],#8 // *K++, x19 in next round
|
||||
//add x25,x25,x17 // h+=Sigma0(a)
|
||||
#ifndef __ARMEB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev x6,x6 // 3
|
||||
#endif
|
||||
ldp x7,x8,[x1],#2*8
|
||||
@@ -187,7 +198,7 @@ sha512_block_data_order:
|
||||
add x24,x24,x19 // h+=Maj(a,b,c)
|
||||
ldr x19,[x30],#8 // *K++, x28 in next round
|
||||
//add x24,x24,x17 // h+=Sigma0(a)
|
||||
#ifndef __ARMEB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev x7,x7 // 4
|
||||
#endif
|
||||
add x24,x24,x17 // h+=Sigma0(a)
|
||||
@@ -211,7 +222,7 @@ sha512_block_data_order:
|
||||
add x23,x23,x28 // h+=Maj(a,b,c)
|
||||
ldr x28,[x30],#8 // *K++, x19 in next round
|
||||
//add x23,x23,x17 // h+=Sigma0(a)
|
||||
#ifndef __ARMEB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev x8,x8 // 5
|
||||
#endif
|
||||
ldp x9,x10,[x1],#2*8
|
||||
@@ -236,7 +247,7 @@ sha512_block_data_order:
|
||||
add x22,x22,x19 // h+=Maj(a,b,c)
|
||||
ldr x19,[x30],#8 // *K++, x28 in next round
|
||||
//add x22,x22,x17 // h+=Sigma0(a)
|
||||
#ifndef __ARMEB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev x9,x9 // 6
|
||||
#endif
|
||||
add x22,x22,x17 // h+=Sigma0(a)
|
||||
@@ -260,7 +271,7 @@ sha512_block_data_order:
|
||||
add x21,x21,x28 // h+=Maj(a,b,c)
|
||||
ldr x28,[x30],#8 // *K++, x19 in next round
|
||||
//add x21,x21,x17 // h+=Sigma0(a)
|
||||
#ifndef __ARMEB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev x10,x10 // 7
|
||||
#endif
|
||||
ldp x11,x12,[x1],#2*8
|
||||
@@ -285,7 +296,7 @@ sha512_block_data_order:
|
||||
add x20,x20,x19 // h+=Maj(a,b,c)
|
||||
ldr x19,[x30],#8 // *K++, x28 in next round
|
||||
//add x20,x20,x17 // h+=Sigma0(a)
|
||||
#ifndef __ARMEB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev x11,x11 // 8
|
||||
#endif
|
||||
add x20,x20,x17 // h+=Sigma0(a)
|
||||
@@ -309,7 +320,7 @@ sha512_block_data_order:
|
||||
add x27,x27,x28 // h+=Maj(a,b,c)
|
||||
ldr x28,[x30],#8 // *K++, x19 in next round
|
||||
//add x27,x27,x17 // h+=Sigma0(a)
|
||||
#ifndef __ARMEB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev x12,x12 // 9
|
||||
#endif
|
||||
ldp x13,x14,[x1],#2*8
|
||||
@@ -334,7 +345,7 @@ sha512_block_data_order:
|
||||
add x26,x26,x19 // h+=Maj(a,b,c)
|
||||
ldr x19,[x30],#8 // *K++, x28 in next round
|
||||
//add x26,x26,x17 // h+=Sigma0(a)
|
||||
#ifndef __ARMEB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev x13,x13 // 10
|
||||
#endif
|
||||
add x26,x26,x17 // h+=Sigma0(a)
|
||||
@@ -358,7 +369,7 @@ sha512_block_data_order:
|
||||
add x25,x25,x28 // h+=Maj(a,b,c)
|
||||
ldr x28,[x30],#8 // *K++, x19 in next round
|
||||
//add x25,x25,x17 // h+=Sigma0(a)
|
||||
#ifndef __ARMEB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev x14,x14 // 11
|
||||
#endif
|
||||
ldp x15,x0,[x1],#2*8
|
||||
@@ -384,7 +395,7 @@ sha512_block_data_order:
|
||||
add x24,x24,x19 // h+=Maj(a,b,c)
|
||||
ldr x19,[x30],#8 // *K++, x28 in next round
|
||||
//add x24,x24,x17 // h+=Sigma0(a)
|
||||
#ifndef __ARMEB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev x15,x15 // 12
|
||||
#endif
|
||||
add x24,x24,x17 // h+=Sigma0(a)
|
||||
@@ -409,7 +420,7 @@ sha512_block_data_order:
|
||||
add x23,x23,x28 // h+=Maj(a,b,c)
|
||||
ldr x28,[x30],#8 // *K++, x19 in next round
|
||||
//add x23,x23,x17 // h+=Sigma0(a)
|
||||
#ifndef __ARMEB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev x0,x0 // 13
|
||||
#endif
|
||||
ldp x1,x2,[x1]
|
||||
@@ -435,7 +446,7 @@ sha512_block_data_order:
|
||||
add x22,x22,x19 // h+=Maj(a,b,c)
|
||||
ldr x19,[x30],#8 // *K++, x28 in next round
|
||||
//add x22,x22,x17 // h+=Sigma0(a)
|
||||
#ifndef __ARMEB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev x1,x1 // 14
|
||||
#endif
|
||||
ldr x6,[sp,#24]
|
||||
@@ -461,7 +472,7 @@ sha512_block_data_order:
|
||||
add x21,x21,x28 // h+=Maj(a,b,c)
|
||||
ldr x28,[x30],#8 // *K++, x19 in next round
|
||||
//add x21,x21,x17 // h+=Sigma0(a)
|
||||
#ifndef __ARMEB__
|
||||
#ifndef __AARCH64EB__
|
||||
rev x2,x2 // 15
|
||||
#endif
|
||||
ldr x7,[sp,#0]
|
||||
@@ -1079,6 +1090,529 @@ sha512_block_data_order:
|
||||
.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
|
||||
.align 2
|
||||
.align 2
|
||||
.text
|
||||
#ifndef __KERNEL__
|
||||
// sha512_block_armv8: block function that uses the ARMv8 SHA512 instructions
// (emitted in the loop as raw .inst words).
// Register use, as established by the code of this routine:
//   x0 = pointer to the context; loaded into v0-v3 here, stored back on exit
//   x1 = pointer to the input, consumed 128 bytes per loop iteration
//   x2 = block counter; decremented once per iteration, loop ends at zero
//   x3 = pointer into the .LK512 constant table
.type sha512_block_armv8,%function
|
||||
.align 6
|
||||
sha512_block_armv8:
|
||||
// sha512_block_data_order branches here when ARMV8_SHA512 is set in
// OPENSSL_armcap_P.
.Lv8_entry:
|
||||
// Push a 16-byte frame holding x29/x30. Only x29 is reloaded on exit; the
// routine makes no calls, so x30 still holds the return address at ret.
stp x29,x30,[sp,#-16]!
|
||||
add x29,sp,#0
|
||||
|
||||
// First 128-byte block -> v16-v23; x1 ends up 128 bytes past its start.
ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 // load input
|
||||
ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
|
||||
|
||||
ld1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // load context
|
||||
// x3 = address of .LK512 (PC-relative page address plus low 12 bits).
adrp x3,.LK512
|
||||
add x3,x3,:lo12:.LK512
|
||||
|
||||
// Reverse the bytes inside every 64-bit lane of the freshly loaded input.
rev64 v16.16b,v16.16b
|
||||
rev64 v17.16b,v17.16b
|
||||
rev64 v18.16b,v18.16b
|
||||
rev64 v19.16b,v19.16b
|
||||
rev64 v20.16b,v20.16b
|
||||
rev64 v21.16b,v21.16b
|
||||
rev64 v22.16b,v22.16b
|
||||
rev64 v23.16b,v23.16b
|
||||
// Jump to the loop head, which is placed after a .align 4 directive.
b .Loop_hw
|
||||
|
||||
// Head of the per-block loop. On arrival v16-v23 hold the current input
// block, v0-v3 the running state, and x1 already points 128 bytes past the
// start of the current block.
.align 4
|
||||
.Loop_hw:
|
||||
// Fetch the first two 64-bit constants from the table, advancing x3.
ld1 {v24.2d},[x3],#16
|
||||
// Count this block; the flags set here are consumed by the csel below.
subs x2,x2,#1
|
||||
// x4 = start of the block currently held in v16-v23.
sub x4,x1,#128
|
||||
// Keep a copy of the incoming state in v26-v29; it is added back to the
// result at the end of the iteration.
orr v26.16b,v0.16b,v0.16b // offload
|
||||
orr v27.16b,v1.16b,v1.16b
|
||||
orr v28.16b,v2.16b,v2.16b
|
||||
orr v29.16b,v3.16b,v3.16b
|
||||
// If the counter just reached zero (last block), step x1 back to the start of
// the current block so the "load next input" loads later in this iteration
// re-read it rather than the 128 bytes that follow the last block.
csel x1,x1,x4,ne // conditional rewind
|
||||
// First step of the unrolled loop body: it combines the two constants in v24
// with the two 64-bit message lanes in v16. The steps that follow repeat this
// instruction pattern with the register roles rotated.
// The SHA512 instructions are written as raw .inst encodings; the trailing
// comment on each names the instruction it stands for (presumably so that the
// file builds with assemblers lacking these mnemonics -- confirm).
// v24 = constants + message lanes, added lane by lane.
add v24.2d,v24.2d,v16.2d
|
||||
// Fetch the constants used by the next step.
ld1 {v25.2d},[x3],#16
|
||||
// Swap the two 64-bit halves of v24.
ext v24.16b,v24.16b,v24.16b,#8
|
||||
// v5 = upper half of v2 followed by lower half of v3.
ext v5.16b,v2.16b,v3.16b,#8
|
||||
// v6 = upper half of v1 followed by lower half of v2.
ext v6.16b,v1.16b,v2.16b,#8
|
||||
add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
|
||||
.inst 0xcec08230 //sha512su0 v16.16b,v17.16b
|
||||
// v7 = upper half of v20 followed by lower half of v21; last operand of the
// sha512su1 below.
ext v7.16b,v20.16b,v21.16b,#8
|
||||
.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
|
||||
.inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
|
||||
// v4 receives the sum before sha512h2 overwrites v3.
add v4.2d,v1.2d,v3.2d // "D + T1"
|
||||
.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
|
||||
add v25.2d,v25.2d,v17.2d
|
||||
ld1 {v24.2d},[x3],#16
|
||||
ext v25.16b,v25.16b,v25.16b,#8
|
||||
ext v5.16b,v4.16b,v2.16b,#8
|
||||
ext v6.16b,v0.16b,v4.16b,#8
|
||||
add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
|
||||
.inst 0xcec08251 //sha512su0 v17.16b,v18.16b
|
||||
ext v7.16b,v21.16b,v22.16b,#8
|
||||
.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
|
||||
.inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
|
||||
add v1.2d,v0.2d,v2.2d // "D + T1"
|
||||
.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
|
||||
add v24.2d,v24.2d,v18.2d
|
||||
ld1 {v25.2d},[x3],#16
|
||||
ext v24.16b,v24.16b,v24.16b,#8
|
||||
ext v5.16b,v1.16b,v4.16b,#8
|
||||
ext v6.16b,v3.16b,v1.16b,#8
|
||||
add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
|
||||
.inst 0xcec08272 //sha512su0 v18.16b,v19.16b
|
||||
ext v7.16b,v22.16b,v23.16b,#8
|
||||
.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
|
||||
.inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
|
||||
add v0.2d,v3.2d,v4.2d // "D + T1"
|
||||
.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
|
||||
add v25.2d,v25.2d,v19.2d
|
||||
ld1 {v24.2d},[x3],#16
|
||||
ext v25.16b,v25.16b,v25.16b,#8
|
||||
ext v5.16b,v0.16b,v1.16b,#8
|
||||
ext v6.16b,v2.16b,v0.16b,#8
|
||||
add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
|
||||
.inst 0xcec08293 //sha512su0 v19.16b,v20.16b
|
||||
ext v7.16b,v23.16b,v16.16b,#8
|
||||
.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
|
||||
.inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
|
||||
add v3.2d,v2.2d,v1.2d // "D + T1"
|
||||
.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
|
||||
add v24.2d,v24.2d,v20.2d
|
||||
ld1 {v25.2d},[x3],#16
|
||||
ext v24.16b,v24.16b,v24.16b,#8
|
||||
ext v5.16b,v3.16b,v0.16b,#8
|
||||
ext v6.16b,v4.16b,v3.16b,#8
|
||||
add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
|
||||
.inst 0xcec082b4 //sha512su0 v20.16b,v21.16b
|
||||
ext v7.16b,v16.16b,v17.16b,#8
|
||||
.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
|
||||
.inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
|
||||
add v2.2d,v4.2d,v0.2d // "D + T1"
|
||||
.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
|
||||
add v25.2d,v25.2d,v21.2d
|
||||
ld1 {v24.2d},[x3],#16
|
||||
ext v25.16b,v25.16b,v25.16b,#8
|
||||
ext v5.16b,v2.16b,v3.16b,#8
|
||||
ext v6.16b,v1.16b,v2.16b,#8
|
||||
add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
|
||||
.inst 0xcec082d5 //sha512su0 v21.16b,v22.16b
|
||||
ext v7.16b,v17.16b,v18.16b,#8
|
||||
.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
|
||||
.inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
|
||||
add v4.2d,v1.2d,v3.2d // "D + T1"
|
||||
.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
|
||||
add v24.2d,v24.2d,v22.2d
|
||||
ld1 {v25.2d},[x3],#16
|
||||
ext v24.16b,v24.16b,v24.16b,#8
|
||||
ext v5.16b,v4.16b,v2.16b,#8
|
||||
ext v6.16b,v0.16b,v4.16b,#8
|
||||
add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
|
||||
.inst 0xcec082f6 //sha512su0 v22.16b,v23.16b
|
||||
ext v7.16b,v18.16b,v19.16b,#8
|
||||
.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
|
||||
.inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
|
||||
add v1.2d,v0.2d,v2.2d // "D + T1"
|
||||
.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
|
||||
add v25.2d,v25.2d,v23.2d
|
||||
ld1 {v24.2d},[x3],#16
|
||||
ext v25.16b,v25.16b,v25.16b,#8
|
||||
ext v5.16b,v1.16b,v4.16b,#8
|
||||
ext v6.16b,v3.16b,v1.16b,#8
|
||||
add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
|
||||
.inst 0xcec08217 //sha512su0 v23.16b,v16.16b
|
||||
ext v7.16b,v19.16b,v20.16b,#8
|
||||
.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
|
||||
.inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
|
||||
add v0.2d,v3.2d,v4.2d // "D + T1"
|
||||
.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
|
||||
add v24.2d,v24.2d,v16.2d
|
||||
ld1 {v25.2d},[x3],#16
|
||||
ext v24.16b,v24.16b,v24.16b,#8
|
||||
ext v5.16b,v0.16b,v1.16b,#8
|
||||
ext v6.16b,v2.16b,v0.16b,#8
|
||||
add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
|
||||
.inst 0xcec08230 //sha512su0 v16.16b,v17.16b
|
||||
ext v7.16b,v20.16b,v21.16b,#8
|
||||
.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
|
||||
.inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
|
||||
add v3.2d,v2.2d,v1.2d // "D + T1"
|
||||
.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
|
||||
add v25.2d,v25.2d,v17.2d
|
||||
ld1 {v24.2d},[x3],#16
|
||||
ext v25.16b,v25.16b,v25.16b,#8
|
||||
ext v5.16b,v3.16b,v0.16b,#8
|
||||
ext v6.16b,v4.16b,v3.16b,#8
|
||||
add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
|
||||
.inst 0xcec08251 //sha512su0 v17.16b,v18.16b
|
||||
ext v7.16b,v21.16b,v22.16b,#8
|
||||
.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
|
||||
.inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
|
||||
add v2.2d,v4.2d,v0.2d // "D + T1"
|
||||
.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
|
||||
add v24.2d,v24.2d,v18.2d
|
||||
ld1 {v25.2d},[x3],#16
|
||||
ext v24.16b,v24.16b,v24.16b,#8
|
||||
ext v5.16b,v2.16b,v3.16b,#8
|
||||
ext v6.16b,v1.16b,v2.16b,#8
|
||||
add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
|
||||
.inst 0xcec08272 //sha512su0 v18.16b,v19.16b
|
||||
ext v7.16b,v22.16b,v23.16b,#8
|
||||
.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
|
||||
.inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
|
||||
add v4.2d,v1.2d,v3.2d // "D + T1"
|
||||
.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
|
||||
add v25.2d,v25.2d,v19.2d
|
||||
ld1 {v24.2d},[x3],#16
|
||||
ext v25.16b,v25.16b,v25.16b,#8
|
||||
ext v5.16b,v4.16b,v2.16b,#8
|
||||
ext v6.16b,v0.16b,v4.16b,#8
|
||||
add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
|
||||
.inst 0xcec08293 //sha512su0 v19.16b,v20.16b
|
||||
ext v7.16b,v23.16b,v16.16b,#8
|
||||
.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
|
||||
.inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
|
||||
add v1.2d,v0.2d,v2.2d // "D + T1"
|
||||
.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
|
||||
add v24.2d,v24.2d,v20.2d
|
||||
ld1 {v25.2d},[x3],#16
|
||||
ext v24.16b,v24.16b,v24.16b,#8
|
||||
ext v5.16b,v1.16b,v4.16b,#8
|
||||
ext v6.16b,v3.16b,v1.16b,#8
|
||||
add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
|
||||
.inst 0xcec082b4 //sha512su0 v20.16b,v21.16b
|
||||
ext v7.16b,v16.16b,v17.16b,#8
|
||||
.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
|
||||
.inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
|
||||
add v0.2d,v3.2d,v4.2d // "D + T1"
|
||||
.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
|
||||
add v25.2d,v25.2d,v21.2d
|
||||
ld1 {v24.2d},[x3],#16
|
||||
ext v25.16b,v25.16b,v25.16b,#8
|
||||
ext v5.16b,v0.16b,v1.16b,#8
|
||||
ext v6.16b,v2.16b,v0.16b,#8
|
||||
add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
|
||||
.inst 0xcec082d5 //sha512su0 v21.16b,v22.16b
|
||||
ext v7.16b,v17.16b,v18.16b,#8
|
||||
.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
|
||||
.inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
|
||||
add v3.2d,v2.2d,v1.2d // "D + T1"
|
||||
.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
|
||||
add v24.2d,v24.2d,v22.2d
|
||||
ld1 {v25.2d},[x3],#16
|
||||
ext v24.16b,v24.16b,v24.16b,#8
|
||||
ext v5.16b,v3.16b,v0.16b,#8
|
||||
ext v6.16b,v4.16b,v3.16b,#8
|
||||
add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
|
||||
.inst 0xcec082f6 //sha512su0 v22.16b,v23.16b
|
||||
ext v7.16b,v18.16b,v19.16b,#8
|
||||
.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
|
||||
.inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
|
||||
add v2.2d,v4.2d,v0.2d // "D + T1"
|
||||
.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
|
||||
add v25.2d,v25.2d,v23.2d
|
||||
ld1 {v24.2d},[x3],#16
|
||||
ext v25.16b,v25.16b,v25.16b,#8
|
||||
ext v5.16b,v2.16b,v3.16b,#8
|
||||
ext v6.16b,v1.16b,v2.16b,#8
|
||||
add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
|
||||
.inst 0xcec08217 //sha512su0 v23.16b,v16.16b
|
||||
ext v7.16b,v19.16b,v20.16b,#8
|
||||
.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
|
||||
.inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
|
||||
add v4.2d,v1.2d,v3.2d // "D + T1"
|
||||
.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
|
||||
add v24.2d,v24.2d,v16.2d
|
||||
ld1 {v25.2d},[x3],#16
|
||||
ext v24.16b,v24.16b,v24.16b,#8
|
||||
ext v5.16b,v4.16b,v2.16b,#8
|
||||
ext v6.16b,v0.16b,v4.16b,#8
|
||||
add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
|
||||
.inst 0xcec08230 //sha512su0 v16.16b,v17.16b
|
||||
ext v7.16b,v20.16b,v21.16b,#8
|
||||
.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
|
||||
.inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
|
||||
add v1.2d,v0.2d,v2.2d // "D + T1"
|
||||
.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
|
||||
add v25.2d,v25.2d,v17.2d
|
||||
ld1 {v24.2d},[x3],#16
|
||||
ext v25.16b,v25.16b,v25.16b,#8
|
||||
ext v5.16b,v1.16b,v4.16b,#8
|
||||
ext v6.16b,v3.16b,v1.16b,#8
|
||||
add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
|
||||
.inst 0xcec08251 //sha512su0 v17.16b,v18.16b
|
||||
ext v7.16b,v21.16b,v22.16b,#8
|
||||
.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
|
||||
.inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
|
||||
add v0.2d,v3.2d,v4.2d // "D + T1"
|
||||
.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
|
||||
add v24.2d,v24.2d,v18.2d
|
||||
ld1 {v25.2d},[x3],#16
|
||||
ext v24.16b,v24.16b,v24.16b,#8
|
||||
ext v5.16b,v0.16b,v1.16b,#8
|
||||
ext v6.16b,v2.16b,v0.16b,#8
|
||||
add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
|
||||
.inst 0xcec08272 //sha512su0 v18.16b,v19.16b
|
||||
ext v7.16b,v22.16b,v23.16b,#8
|
||||
.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
|
||||
.inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
|
||||
add v3.2d,v2.2d,v1.2d // "D + T1"
|
||||
.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
|
||||
add v25.2d,v25.2d,v19.2d
|
||||
ld1 {v24.2d},[x3],#16
|
||||
ext v25.16b,v25.16b,v25.16b,#8
|
||||
ext v5.16b,v3.16b,v0.16b,#8
|
||||
ext v6.16b,v4.16b,v3.16b,#8
|
||||
add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
|
||||
.inst 0xcec08293 //sha512su0 v19.16b,v20.16b
|
||||
ext v7.16b,v23.16b,v16.16b,#8
|
||||
.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
|
||||
.inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
|
||||
add v2.2d,v4.2d,v0.2d // "D + T1"
|
||||
.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
|
||||
add v24.2d,v24.2d,v20.2d
|
||||
ld1 {v25.2d},[x3],#16
|
||||
ext v24.16b,v24.16b,v24.16b,#8
|
||||
ext v5.16b,v2.16b,v3.16b,#8
|
||||
ext v6.16b,v1.16b,v2.16b,#8
|
||||
add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
|
||||
.inst 0xcec082b4 //sha512su0 v20.16b,v21.16b
|
||||
ext v7.16b,v16.16b,v17.16b,#8
|
||||
.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
|
||||
.inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
|
||||
add v4.2d,v1.2d,v3.2d // "D + T1"
|
||||
.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
|
||||
add v25.2d,v25.2d,v21.2d
|
||||
ld1 {v24.2d},[x3],#16
|
||||
ext v25.16b,v25.16b,v25.16b,#8
|
||||
ext v5.16b,v4.16b,v2.16b,#8
|
||||
ext v6.16b,v0.16b,v4.16b,#8
|
||||
add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
|
||||
.inst 0xcec082d5 //sha512su0 v21.16b,v22.16b
|
||||
ext v7.16b,v17.16b,v18.16b,#8
|
||||
.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
|
||||
.inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
|
||||
add v1.2d,v0.2d,v2.2d // "D + T1"
|
||||
.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
|
||||
add v24.2d,v24.2d,v22.2d
|
||||
ld1 {v25.2d},[x3],#16
|
||||
ext v24.16b,v24.16b,v24.16b,#8
|
||||
ext v5.16b,v1.16b,v4.16b,#8
|
||||
ext v6.16b,v3.16b,v1.16b,#8
|
||||
add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
|
||||
.inst 0xcec082f6 //sha512su0 v22.16b,v23.16b
|
||||
ext v7.16b,v18.16b,v19.16b,#8
|
||||
.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
|
||||
.inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
|
||||
add v0.2d,v3.2d,v4.2d // "D + T1"
|
||||
.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
|
||||
add v25.2d,v25.2d,v23.2d
|
||||
ld1 {v24.2d},[x3],#16
|
||||
ext v25.16b,v25.16b,v25.16b,#8
|
||||
ext v5.16b,v0.16b,v1.16b,#8
|
||||
ext v6.16b,v2.16b,v0.16b,#8
|
||||
add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
|
||||
.inst 0xcec08217 //sha512su0 v23.16b,v16.16b
|
||||
ext v7.16b,v19.16b,v20.16b,#8
|
||||
.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
|
||||
.inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
|
||||
add v3.2d,v2.2d,v1.2d // "D + T1"
|
||||
.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
|
||||
add v24.2d,v24.2d,v16.2d
|
||||
ld1 {v25.2d},[x3],#16
|
||||
ext v24.16b,v24.16b,v24.16b,#8
|
||||
ext v5.16b,v3.16b,v0.16b,#8
|
||||
ext v6.16b,v4.16b,v3.16b,#8
|
||||
add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
|
||||
.inst 0xcec08230 //sha512su0 v16.16b,v17.16b
|
||||
ext v7.16b,v20.16b,v21.16b,#8
|
||||
.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
|
||||
.inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
|
||||
add v2.2d,v4.2d,v0.2d // "D + T1"
|
||||
.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
|
||||
add v25.2d,v25.2d,v17.2d
|
||||
ld1 {v24.2d},[x3],#16
|
||||
ext v25.16b,v25.16b,v25.16b,#8
|
||||
ext v5.16b,v2.16b,v3.16b,#8
|
||||
ext v6.16b,v1.16b,v2.16b,#8
|
||||
add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
|
||||
.inst 0xcec08251 //sha512su0 v17.16b,v18.16b
|
||||
ext v7.16b,v21.16b,v22.16b,#8
|
||||
.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
|
||||
.inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
|
||||
add v4.2d,v1.2d,v3.2d // "D + T1"
|
||||
.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
|
||||
add v24.2d,v24.2d,v18.2d
|
||||
ld1 {v25.2d},[x3],#16
|
||||
ext v24.16b,v24.16b,v24.16b,#8
|
||||
ext v5.16b,v4.16b,v2.16b,#8
|
||||
ext v6.16b,v0.16b,v4.16b,#8
|
||||
add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
|
||||
.inst 0xcec08272 //sha512su0 v18.16b,v19.16b
|
||||
ext v7.16b,v22.16b,v23.16b,#8
|
||||
.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
|
||||
.inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
|
||||
add v1.2d,v0.2d,v2.2d // "D + T1"
|
||||
.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
|
||||
add v25.2d,v25.2d,v19.2d
|
||||
ld1 {v24.2d},[x3],#16
|
||||
ext v25.16b,v25.16b,v25.16b,#8
|
||||
ext v5.16b,v1.16b,v4.16b,#8
|
||||
ext v6.16b,v3.16b,v1.16b,#8
|
||||
add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
|
||||
.inst 0xcec08293 //sha512su0 v19.16b,v20.16b
|
||||
ext v7.16b,v23.16b,v16.16b,#8
|
||||
.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
|
||||
.inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
|
||||
add v0.2d,v3.2d,v4.2d // "D + T1"
|
||||
.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
|
||||
add v24.2d,v24.2d,v20.2d
|
||||
ld1 {v25.2d},[x3],#16
|
||||
ext v24.16b,v24.16b,v24.16b,#8
|
||||
ext v5.16b,v0.16b,v1.16b,#8
|
||||
ext v6.16b,v2.16b,v0.16b,#8
|
||||
add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
|
||||
.inst 0xcec082b4 //sha512su0 v20.16b,v21.16b
|
||||
ext v7.16b,v16.16b,v17.16b,#8
|
||||
.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
|
||||
.inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
|
||||
add v3.2d,v2.2d,v1.2d // "D + T1"
|
||||
.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
|
||||
add v25.2d,v25.2d,v21.2d
|
||||
ld1 {v24.2d},[x3],#16
|
||||
ext v25.16b,v25.16b,v25.16b,#8
|
||||
ext v5.16b,v3.16b,v0.16b,#8
|
||||
ext v6.16b,v4.16b,v3.16b,#8
|
||||
add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
|
||||
.inst 0xcec082d5 //sha512su0 v21.16b,v22.16b
|
||||
ext v7.16b,v17.16b,v18.16b,#8
|
||||
.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
|
||||
.inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
|
||||
add v2.2d,v4.2d,v0.2d // "D + T1"
|
||||
.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
|
||||
add v24.2d,v24.2d,v22.2d
|
||||
ld1 {v25.2d},[x3],#16
|
||||
ext v24.16b,v24.16b,v24.16b,#8
|
||||
ext v5.16b,v2.16b,v3.16b,#8
|
||||
ext v6.16b,v1.16b,v2.16b,#8
|
||||
add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
|
||||
.inst 0xcec082f6 //sha512su0 v22.16b,v23.16b
|
||||
ext v7.16b,v18.16b,v19.16b,#8
|
||||
.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
|
||||
.inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
|
||||
add v4.2d,v1.2d,v3.2d // "D + T1"
|
||||
.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
|
||||
add v25.2d,v25.2d,v23.2d
|
||||
ld1 {v24.2d},[x3],#16
|
||||
ext v25.16b,v25.16b,v25.16b,#8
|
||||
ext v5.16b,v4.16b,v2.16b,#8
|
||||
ext v6.16b,v0.16b,v4.16b,#8
|
||||
add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
|
||||
.inst 0xcec08217 //sha512su0 v23.16b,v16.16b
|
||||
ext v7.16b,v19.16b,v20.16b,#8
|
||||
.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
|
||||
.inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
|
||||
add v1.2d,v0.2d,v2.2d // "D + T1"
|
||||
.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
|
||||
ld1 {v25.2d},[x3],#16
|
||||
add v24.2d,v24.2d,v16.2d
|
||||
ld1 {v16.16b},[x1],#16 // load next input
|
||||
ext v24.16b,v24.16b,v24.16b,#8
|
||||
ext v5.16b,v1.16b,v4.16b,#8
|
||||
ext v6.16b,v3.16b,v1.16b,#8
|
||||
add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
|
||||
.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
|
||||
rev64 v16.16b,v16.16b
|
||||
add v0.2d,v3.2d,v4.2d // "D + T1"
|
||||
.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
|
||||
ld1 {v24.2d},[x3],#16
|
||||
add v25.2d,v25.2d,v17.2d
|
||||
ld1 {v17.16b},[x1],#16 // load next input
|
||||
ext v25.16b,v25.16b,v25.16b,#8
|
||||
ext v5.16b,v0.16b,v1.16b,#8
|
||||
ext v6.16b,v2.16b,v0.16b,#8
|
||||
add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
|
||||
.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
|
||||
rev64 v17.16b,v17.16b
|
||||
add v3.2d,v2.2d,v1.2d // "D + T1"
|
||||
.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
|
||||
ld1 {v25.2d},[x3],#16
|
||||
add v24.2d,v24.2d,v18.2d
|
||||
ld1 {v18.16b},[x1],#16 // load next input
|
||||
ext v24.16b,v24.16b,v24.16b,#8
|
||||
ext v5.16b,v3.16b,v0.16b,#8
|
||||
ext v6.16b,v4.16b,v3.16b,#8
|
||||
add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
|
||||
.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
|
||||
rev64 v18.16b,v18.16b
|
||||
add v2.2d,v4.2d,v0.2d // "D + T1"
|
||||
.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
|
||||
ld1 {v24.2d},[x3],#16
|
||||
add v25.2d,v25.2d,v19.2d
|
||||
ld1 {v19.16b},[x1],#16 // load next input
|
||||
ext v25.16b,v25.16b,v25.16b,#8
|
||||
ext v5.16b,v2.16b,v3.16b,#8
|
||||
ext v6.16b,v1.16b,v2.16b,#8
|
||||
add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
|
||||
.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
|
||||
rev64 v19.16b,v19.16b
|
||||
add v4.2d,v1.2d,v3.2d // "D + T1"
|
||||
.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
|
||||
ld1 {v25.2d},[x3],#16
|
||||
add v24.2d,v24.2d,v20.2d
|
||||
ld1 {v20.16b},[x1],#16 // load next input
|
||||
ext v24.16b,v24.16b,v24.16b,#8
|
||||
ext v5.16b,v4.16b,v2.16b,#8
|
||||
ext v6.16b,v0.16b,v4.16b,#8
|
||||
add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
|
||||
.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
|
||||
rev64 v20.16b,v20.16b
|
||||
add v1.2d,v0.2d,v2.2d // "D + T1"
|
||||
.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
|
||||
ld1 {v24.2d},[x3],#16
|
||||
add v25.2d,v25.2d,v21.2d
|
||||
ld1 {v21.16b},[x1],#16 // load next input
|
||||
ext v25.16b,v25.16b,v25.16b,#8
|
||||
ext v5.16b,v1.16b,v4.16b,#8
|
||||
ext v6.16b,v3.16b,v1.16b,#8
|
||||
add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
|
||||
.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
|
||||
rev64 v21.16b,v21.16b
|
||||
add v0.2d,v3.2d,v4.2d // "D + T1"
|
||||
.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
|
||||
ld1 {v25.2d},[x3],#16
|
||||
add v24.2d,v24.2d,v22.2d
|
||||
ld1 {v22.16b},[x1],#16 // load next input
|
||||
ext v24.16b,v24.16b,v24.16b,#8
|
||||
ext v5.16b,v0.16b,v1.16b,#8
|
||||
ext v6.16b,v2.16b,v0.16b,#8
|
||||
add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
|
||||
.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
|
||||
rev64 v22.16b,v22.16b
|
||||
add v3.2d,v2.2d,v1.2d // "D + T1"
|
||||
.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
|
||||
sub x3,x3,#80*8 // rewind
|
||||
add v25.2d,v25.2d,v23.2d
|
||||
ld1 {v23.16b},[x1],#16 // load next input
|
||||
ext v25.16b,v25.16b,v25.16b,#8
|
||||
ext v5.16b,v3.16b,v0.16b,#8
|
||||
ext v6.16b,v4.16b,v3.16b,#8
|
||||
add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
|
||||
.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
|
||||
rev64 v23.16b,v23.16b
|
||||
add v2.2d,v4.2d,v0.2d // "D + T1"
|
||||
.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
|
||||
// End of one loop iteration: add the state copies taken at the top of the
// iteration (v26-v29, the "offload" copies) to the newly computed state.
add v0.2d,v0.2d,v26.2d // accumulate
|
||||
add v1.2d,v1.2d,v27.2d
|
||||
add v2.2d,v2.2d,v28.2d
|
||||
add v3.2d,v3.2d,v29.2d
|
||||
|
||||
// x2 was decremented by the subs at .Loop_hw; go round again while it is
// non-zero.
cbnz x2,.Loop_hw
|
||||
|
||||
st1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // store context
|
||||
|
||||
// Reload x29 and release the 16-byte frame pushed at entry. The saved x30 slot
// is dropped without being reloaded: nothing in this routine modifies x30.
ldr x29,[sp],#16
|
||||
ret
|
||||
.size sha512_block_armv8,.-sha512_block_armv8
|
||||
#endif
|
||||
#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__ELF__)
|
||||
#if defined(__ELF__)
|
||||
// See https://www.airs.com/blog/archives/518.
|
||||
.section .note.GNU-stack,"",%progbits
|
||||
#endif
|
||||
#endif // !OPENSSL_NO_ASM
|
||||
.section .note.GNU-stack,"",%progbits
|
||||
@@ -8,8 +8,7 @@
|
||||
#define OPENSSL_NO_ASM
|
||||
#endif
|
||||
|
||||
#if !defined(OPENSSL_NO_ASM)
|
||||
#if defined(__aarch64__)
|
||||
#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) && defined(__ELF__)
|
||||
#if defined(BORINGSSL_PREFIX)
|
||||
#include <boringssl_prefix_symbols_asm.h>
|
||||
#endif
|
||||
@@ -1230,6 +1229,8 @@ vpaes_ctr32_encrypt_blocks:
|
||||
AARCH64_VALIDATE_LINK_REGISTER
|
||||
ret
|
||||
.size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks
|
||||
#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__ELF__)
|
||||
#if defined(__ELF__)
|
||||
// See https://www.airs.com/blog/archives/518.
|
||||
.section .note.GNU-stack,"",%progbits
|
||||
#endif
|
||||
#endif // !OPENSSL_NO_ASM
|
||||
.section .note.GNU-stack,"",%progbits
|
||||
Reference in New Issue
Block a user