Source release 18.1.0

John "Juce" Bruce
2023-06-23 15:45:08 -07:00
parent 2baa7c6e2b
commit b2c35151ad
2074 changed files with 196004 additions and 427059 deletions

View File

@@ -8,8 +8,7 @@
#define OPENSSL_NO_ASM
#endif
#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) && defined(__ELF__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
@@ -635,7 +634,7 @@ aes_hw_ctr32_encrypt_blocks:
//
// [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
// [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w8, w8
#endif
add w10, w8, #1
@@ -797,6 +796,8 @@ aes_hw_ctr32_encrypt_blocks:
ret
.size aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks
#endif
#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__ELF__)
#if defined(__ELF__)
// See https://www.airs.com/blog/archives/518.
.section .note.GNU-stack,"",%progbits
#endif
#endif // !OPENSSL_NO_ASM
.section .note.GNU-stack,"",%progbits

File diff suppressed because it is too large

View File

@@ -8,8 +8,7 @@
#define OPENSSL_NO_ASM
#endif
#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) && defined(__ELF__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
@@ -1431,6 +1430,8 @@ __bn_mul4x_mont:
.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 4
#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__ELF__)
#if defined(__ELF__)
// See https://www.airs.com/blog/archives/518.
.section .note.GNU-stack,"",%progbits
#endif
#endif // !OPENSSL_NO_ASM
.section .note.GNU-stack,"",%progbits

View File

@@ -0,0 +1,101 @@
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) && defined(__ELF__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>
.text
// BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
// size_t num);
.type bn_add_words, %function
.globl bn_add_words
.hidden bn_add_words
.align 4
bn_add_words:
AARCH64_VALID_CALL_TARGET
# Clear the carry flag.
cmn xzr, xzr
# aarch64 can load two registers at a time, so we do two loop iterations at
# a time. Split x3 = 2 * x8 + x3. This allows loop
# operations to use CBNZ without clobbering the carry flag.
lsr x8, x3, #1
and x3, x3, #1
cbz x8, .Ladd_tail
.Ladd_loop:
ldp x4, x5, [x1], #16
ldp x6, x7, [x2], #16
sub x8, x8, #1
adcs x4, x4, x6
adcs x5, x5, x7
stp x4, x5, [x0], #16
cbnz x8, .Ladd_loop
.Ladd_tail:
cbz x3, .Ladd_exit
ldr x4, [x1], #8
ldr x6, [x2], #8
adcs x4, x4, x6
str x4, [x0], #8
.Ladd_exit:
cset x0, cs
ret
.size bn_add_words,.-bn_add_words
// BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
// size_t num);
.type bn_sub_words, %function
.globl bn_sub_words
.hidden bn_sub_words
.align 4
bn_sub_words:
AARCH64_VALID_CALL_TARGET
# Set the carry flag. Arm's borrow bit is flipped from the carry flag,
# so we want C = 1 here.
cmp xzr, xzr
# aarch64 can load two registers at a time, so we do two loop iterations at
# a time. Split x3 = 2 * x8 + x3. This allows loop
# operations to use CBNZ without clobbering the carry flag.
lsr x8, x3, #1
and x3, x3, #1
cbz x8, .Lsub_tail
.Lsub_loop:
ldp x4, x5, [x1], #16
ldp x6, x7, [x2], #16
sub x8, x8, #1
sbcs x4, x4, x6
sbcs x5, x5, x7
stp x4, x5, [x0], #16
cbnz x8, .Lsub_loop
.Lsub_tail:
cbz x3, .Lsub_exit
ldr x4, [x1], #8
ldr x6, [x2], #8
sbcs x4, x4, x6
str x4, [x0], #8
.Lsub_exit:
cset x0, cc
ret
.size bn_sub_words,.-bn_sub_words
#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__ELF__)
#if defined(__ELF__)
// See https://www.airs.com/blog/archives/518.
.section .note.GNU-stack,"",%progbits
#endif
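
The bn_add_words/bn_sub_words file above leans on one trick: the carry flag is primed with cmn/cmp xzr, xzr and then kept live across the whole loop, which is why the iteration count is split as x3 = 2 * x8 + x3 and the counter is decremented with sub and tested with cbnz (neither touches the flags). As a plain reference for what the two routines compute, here is a minimal Python sketch; it is not BoringSSL code, and the _ref names and the return-a-list interface are mine.

MASK64 = (1 << 64) - 1

def bn_add_words_ref(ap, bp):
    # rp[i] = ap[i] + bp[i] + carry (64-bit wrap); returns (rp, final carry).
    rp, carry = [], 0
    for a, b in zip(ap, bp):
        s = a + b + carry
        rp.append(s & MASK64)
        carry = s >> 64            # 0 or 1, like "cset x0, cs"
    return rp, carry

def bn_sub_words_ref(ap, bp):
    # rp[i] = ap[i] - bp[i] - borrow (64-bit wrap); returns (rp, final borrow).
    rp, borrow = [], 0
    for a, b in zip(ap, bp):
        d = a - b - borrow
        rp.append(d & MASK64)
        borrow = int(d < 0)        # like "cset x0, cc"
    return rp, borrow

For example, bn_add_words_ref([2**64 - 1], [1]) gives ([0], 1), matching the carry the assembly returns in x0.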

View File

@@ -8,8 +8,7 @@
#define OPENSSL_NO_ASM
#endif
#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) && defined(__ELF__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
@@ -341,6 +340,8 @@ gcm_ghash_neon:
.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2
#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__ELF__)
#if defined(__ELF__)
// See https://www.airs.com/blog/archives/518.
.section .note.GNU-stack,"",%progbits
#endif
#endif // !OPENSSL_NO_ASM
.section .note.GNU-stack,"",%progbits

View File

@@ -8,8 +8,7 @@
#define OPENSSL_NO_ASM
#endif
#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) && defined(__ELF__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
@@ -120,7 +119,7 @@ gcm_gmult_v8:
movi v19.16b,#0xe1
ld1 {v20.2d,v21.2d},[x1] //load twisted H, ...
shl v19.2d,v19.2d,#57
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev64 v17.16b,v17.16b
#endif
ext v3.16b,v17.16b,v17.16b,#8
@@ -145,7 +144,7 @@ gcm_gmult_v8:
eor v18.16b,v18.16b,v2.16b
eor v0.16b,v0.16b,v18.16b
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev64 v0.16b,v0.16b
#endif
ext v0.16b,v0.16b,v0.16b,#8
@@ -184,14 +183,14 @@ gcm_ghash_v8:
ext v0.16b,v0.16b,v0.16b,#8 //rotate Xi
ld1 {v16.2d},[x2],#16 //load [rotated] I[0]
shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev64 v16.16b,v16.16b
rev64 v0.16b,v0.16b
#endif
ext v3.16b,v16.16b,v16.16b,#8 //rotate I[0]
b.lo .Lodd_tail_v8 //x3 was less than 32
ld1 {v17.2d},[x2],x12 //load [rotated] I[1]
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev64 v17.16b,v17.16b
#endif
ext v7.16b,v17.16b,v17.16b,#8
@@ -223,13 +222,13 @@ gcm_ghash_v8:
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
ld1 {v17.2d},[x2],x12 //load [rotated] I[i+3]
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev64 v16.16b,v16.16b
#endif
eor v1.16b,v1.16b,v18.16b
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev64 v17.16b,v17.16b
#endif
ins v2.d[0],v1.d[1]
@@ -279,7 +278,7 @@ gcm_ghash_v8:
eor v0.16b,v0.16b,v18.16b
.Ldone_v8:
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev64 v0.16b,v0.16b
#endif
ext v0.16b,v0.16b,v0.16b,#8
@@ -298,7 +297,7 @@ gcm_ghash_v8_4x:
shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant
ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev64 v0.16b,v0.16b
rev64 v5.16b,v5.16b
rev64 v6.16b,v6.16b
@@ -342,7 +341,7 @@ gcm_ghash_v8_4x:
eor v16.16b,v4.16b,v0.16b
ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
ext v3.16b,v16.16b,v16.16b,#8
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev64 v5.16b,v5.16b
rev64 v6.16b,v6.16b
rev64 v7.16b,v7.16b
@@ -425,7 +424,7 @@ gcm_ghash_v8_4x:
eor v1.16b,v1.16b,v17.16b
ld1 {v4.2d,v5.2d,v6.2d},[x2]
eor v1.16b,v1.16b,v18.16b
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev64 v5.16b,v5.16b
rev64 v6.16b,v6.16b
rev64 v4.16b,v4.16b
@@ -477,7 +476,7 @@ gcm_ghash_v8_4x:
eor v1.16b,v1.16b,v17.16b
ld1 {v4.2d,v5.2d},[x2]
eor v1.16b,v1.16b,v18.16b
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev64 v5.16b,v5.16b
rev64 v4.16b,v4.16b
#endif
@@ -520,7 +519,7 @@ gcm_ghash_v8_4x:
eor v1.16b,v1.16b,v17.16b
ld1 {v4.2d},[x2]
eor v1.16b,v1.16b,v18.16b
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev64 v4.16b,v4.16b
#endif
@@ -560,7 +559,7 @@ gcm_ghash_v8_4x:
eor v0.16b,v0.16b,v18.16b
ext v0.16b,v0.16b,v0.16b,#8
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev64 v0.16b,v0.16b
#endif
st1 {v0.2d},[x0] //write out Xi
@@ -571,6 +570,8 @@ gcm_ghash_v8_4x:
.align 2
.align 2
#endif
#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__ELF__)
#if defined(__ELF__)
// See https://www.airs.com/blog/archives/518.
.section .note.GNU-stack,"",%progbits
#endif
#endif // !OPENSSL_NO_ASM
.section .note.GNU-stack,"",%progbits

File diff suppressed because it is too large

View File

@@ -0,0 +1,321 @@
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) && defined(__ELF__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include "openssl/arm_arch.h"
.text
.globl beeu_mod_inverse_vartime
.hidden beeu_mod_inverse_vartime
.type beeu_mod_inverse_vartime, %function
.align 4
beeu_mod_inverse_vartime:
// Reserve enough space for 14 8-byte registers on the stack
// in the first stp call for x29, x30.
// Then store the remaining callee-saved registers.
//
// | x29 | x30 | x19 | x20 | ... | x27 | x28 | x0 | x2 |
// ^ ^
// sp <------------------- 112 bytes ----------------> old sp
// x29 (FP)
//
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-112]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]
stp x0,x2,[sp,#96]
// B = b3..b0 := a
ldp x25,x26,[x1]
ldp x27,x28,[x1,#16]
// n3..n0 := n
// Note: the values of the input params are changed in the following.
ldp x0,x1,[x2]
ldp x2,x30,[x2,#16]
// A = a3..a0 := n
mov x21, x0
mov x22, x1
mov x23, x2
mov x24, x30
// X = x4..x0 := 1
mov x3, #1
eor x4, x4, x4
eor x5, x5, x5
eor x6, x6, x6
eor x7, x7, x7
// Y = y4..y0 := 0
eor x8, x8, x8
eor x9, x9, x9
eor x10, x10, x10
eor x11, x11, x11
eor x12, x12, x12
.Lbeeu_loop:
// if B == 0, jump to .Lbeeu_loop_end
orr x14, x25, x26
orr x14, x14, x27
// reverse the bit order of x25. This is needed for clz after this macro
rbit x15, x25
orr x14, x14, x28
cbz x14,.Lbeeu_loop_end
// 0 < B < |n|,
// 0 < A <= |n|,
// (1) X*a == B (mod |n|),
// (2) (-1)*Y*a == A (mod |n|)
// Now divide B by the maximum possible power of two in the
// integers, and divide X by the same value mod |n|.
// When we're done, (1) still holds.
// shift := number of trailing 0s in x25
// ( = number of leading 0s in x15; see the "rbit" instruction in TEST_B_ZERO)
clz x13, x15
// If there is no shift, goto shift_A_Y
cbz x13, .Lbeeu_shift_A_Y
// Shift B right by "x13" bits
neg x14, x13
lsr x25, x25, x13
lsl x15, x26, x14
lsr x26, x26, x13
lsl x19, x27, x14
orr x25, x25, x15
lsr x27, x27, x13
lsl x20, x28, x14
orr x26, x26, x19
lsr x28, x28, x13
orr x27, x27, x20
// Shift X right by "x13" bits, adding n whenever X becomes odd.
// x13--;
// x14 := 0; needed in the addition to the most significant word in SHIFT1
eor x14, x14, x14
.Lbeeu_shift_loop_X:
tbz x3, #0, .Lshift1_0
adds x3, x3, x0
adcs x4, x4, x1
adcs x5, x5, x2
adcs x6, x6, x30
adc x7, x7, x14
.Lshift1_0:
// var0 := [var1|var0]<64..1>;
// i.e. concatenate var1 and var0,
// extract bits <64..1> from the resulting 128-bit value
// and put them in var0
extr x3, x4, x3, #1
extr x4, x5, x4, #1
extr x5, x6, x5, #1
extr x6, x7, x6, #1
lsr x7, x7, #1
subs x13, x13, #1
bne .Lbeeu_shift_loop_X
// Note: the steps above perform the same sequence as in p256_beeu-x86_64-asm.pl
// with the following differences:
// - "x13" is set directly to the number of trailing 0s in B
// (using rbit and clz instructions)
// - The loop is only used to call SHIFT1(X)
// and x13 is decreased while executing the X loop.
// - SHIFT256(B, x13) is performed before right-shifting X; they are independent
.Lbeeu_shift_A_Y:
// Same for A and Y.
// Afterwards, (2) still holds.
// Reverse the bit order of x21
// x13 := number of trailing 0s in x21 (= number of leading 0s in x15)
rbit x15, x21
clz x13, x15
// If there is no shift, goto |B-A|, X+Y update
cbz x13, .Lbeeu_update_B_X_or_A_Y
// Shift A right by "x13" bits
neg x14, x13
lsr x21, x21, x13
lsl x15, x22, x14
lsr x22, x22, x13
lsl x19, x23, x14
orr x21, x21, x15
lsr x23, x23, x13
lsl x20, x24, x14
orr x22, x22, x19
lsr x24, x24, x13
orr x23, x23, x20
// Shift Y right by "x13" bits, adding n whenever Y becomes odd.
// x13--;
// x14 := 0; needed in the addition to the most significant word in SHIFT1
eor x14, x14, x14
.Lbeeu_shift_loop_Y:
tbz x8, #0, .Lshift1_1
adds x8, x8, x0
adcs x9, x9, x1
adcs x10, x10, x2
adcs x11, x11, x30
adc x12, x12, x14
.Lshift1_1:
// var0 := [var1|var0]<64..1>;
// i.e. concatenate var1 and var0,
// extract bits <64..1> from the resulting 128-bit value
// and put them in var0
extr x8, x9, x8, #1
extr x9, x10, x9, #1
extr x10, x11, x10, #1
extr x11, x12, x11, #1
lsr x12, x12, #1
subs x13, x13, #1
bne .Lbeeu_shift_loop_Y
.Lbeeu_update_B_X_or_A_Y:
// Try T := B - A; if cs, continue with B > A (cs: carry set = no borrow)
// Note: this is a case of unsigned arithmetic, where T fits in 4 64-bit words
// without taking a sign bit if generated. The lack of a carry would
// indicate a negative result. See, for example,
// https://community.arm.com/developer/ip-products/processors/b/processors-ip-blog/posts/condition-codes-1-condition-flags-and-codes
subs x14, x25, x21
sbcs x15, x26, x22
sbcs x19, x27, x23
sbcs x20, x28, x24
bcs .Lbeeu_B_greater_than_A
// Else A > B =>
// A := A - B; Y := Y + X; goto beginning of the loop
subs x21, x21, x25
sbcs x22, x22, x26
sbcs x23, x23, x27
sbcs x24, x24, x28
adds x8, x8, x3
adcs x9, x9, x4
adcs x10, x10, x5
adcs x11, x11, x6
adc x12, x12, x7
b .Lbeeu_loop
.Lbeeu_B_greater_than_A:
// Continue with B > A =>
// B := B - A; X := X + Y; goto beginning of the loop
mov x25, x14
mov x26, x15
mov x27, x19
mov x28, x20
adds x3, x3, x8
adcs x4, x4, x9
adcs x5, x5, x10
adcs x6, x6, x11
adc x7, x7, x12
b .Lbeeu_loop
.Lbeeu_loop_end:
// The Euclidean algorithm loop ends when A == gcd(a,n);
// this is 1 when a and n are co-prime (i.e. have no common factor).
// Since (-1)*Y*a == A (mod |n|), Y>0
// then out = -Y mod n
// Verify that A = 1 ==> (-1)*Y*a = A = 1 (mod |n|)
// Is A-1 == 0?
// If not, fail.
sub x14, x21, #1
orr x14, x14, x22
orr x14, x14, x23
orr x14, x14, x24
cbnz x14, .Lbeeu_err
// If Y>n ==> Y:=Y-n
.Lbeeu_reduction_loop:
// x_i := y_i - n_i (X is no longer needed, use it as temp)
// (x14 = 0 from above)
subs x3, x8, x0
sbcs x4, x9, x1
sbcs x5, x10, x2
sbcs x6, x11, x30
sbcs x7, x12, x14
// If result is non-negative (i.e., cs = carry set = no borrow),
// y_i := x_i; goto reduce again
// else
// y_i := y_i; continue
csel x8, x3, x8, cs
csel x9, x4, x9, cs
csel x10, x5, x10, cs
csel x11, x6, x11, cs
csel x12, x7, x12, cs
bcs .Lbeeu_reduction_loop
// Now Y < n (Y cannot be equal to n, since the inverse cannot be 0)
// out = -Y = n-Y
subs x8, x0, x8
sbcs x9, x1, x9
sbcs x10, x2, x10
sbcs x11, x30, x11
// Save Y in output (out (x0) was saved on the stack)
ldr x3, [sp,#96]
stp x8, x9, [x3]
stp x10, x11, [x3,#16]
// return 1 (success)
mov x0, #1
b .Lbeeu_finish
.Lbeeu_err:
// return 0 (error)
eor x0, x0, x0
.Lbeeu_finish:
// Restore callee-saved registers, except x0, x2
add sp,x29,#0
ldp x19,x20,[sp,#16]
ldp x21,x22,[sp,#32]
ldp x23,x24,[sp,#48]
ldp x25,x26,[sp,#64]
ldp x27,x28,[sp,#80]
ldp x29,x30,[sp],#112
AARCH64_VALIDATE_LINK_REGISTER
ret
.size beeu_mod_inverse_vartime,.-beeu_mod_inverse_vartime
#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__ELF__)
#if defined(__ELF__)
// See https://www.airs.com/blog/archives/518.
.section .note.GNU-stack,"",%progbits
#endif
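
The beeu_mod_inverse_vartime file above is a binary extended Euclidean inverse. It keeps the documented invariants X*a == B (mod |n|) and (-1)*Y*a == A (mod |n|), strips powers of two from B (and then A) while fixing X (and Y) up with the SHIFT1 step (add n before halving whenever the value is odd), subtracts the smaller of A and B from the larger, and once B == 0 and A == 1 returns -Y mod n. Below is a minimal Python sketch of that flow, assuming n is odd as in its P-256 use; the function name and the use of arbitrary-precision integers are mine, not BoringSSL's.

def beeu_mod_inverse_vartime_ref(a, n):
    # Returns a**-1 mod n, or None when gcd(a, n) != 1.  Assumes n is odd.
    # Invariants, as in the comments above:
    #   X*a == B (mod n)   and   (-1)*Y*a == A (mod n)
    A, B, X, Y = n, a, 1, 0
    while B != 0:
        while B % 2 == 0:        # divide B by the largest possible power of 2
            B //= 2
            if X % 2:            # SHIFT1: n is odd, so X + n is even
                X += n
            X //= 2
        while A % 2 == 0:        # same for A and Y
            A //= 2
            if Y % 2:
                Y += n
            Y //= 2
        if B >= A:               # subtract the smaller from the larger;
            B, X = B - A, X + Y  # both invariants are preserved
        else:
            A, Y = A - B, Y + X
    if A != 1:                   # gcd(a, n) != 1: no inverse exists
        return None
    return (n - Y % n) % n       # (-1)*Y*a == 1 (mod n), so a**-1 == -Y mod n

For example, beeu_mod_inverse_vartime_ref(3, 7) returns 5, and 3 * 5 == 15 == 1 (mod 7).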

View File

@@ -8,8 +8,7 @@
#define OPENSSL_NO_ASM
#endif
#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) && defined(__ELF__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
@@ -52,7 +51,7 @@ sha1_block_data_order:
movz w28,#0x7999
sub x2,x2,#1
movk w28,#0x5a82,lsl#16
#ifdef __ARMEB__
#ifdef __AARCH64EB__
ror x3,x3,#32
#else
rev32 x3,x3
@@ -70,7 +69,7 @@ sha1_block_data_order:
ror w21,w21,#2
add w23,w23,w4 // future e+=X[i]
add w24,w24,w25 // e+=F(b,c,d)
#ifdef __ARMEB__
#ifdef __AARCH64EB__
ror x5,x5,#32
#else
rev32 x5,x5
@@ -95,7 +94,7 @@ sha1_block_data_order:
ror w24,w24,#2
add w21,w21,w6 // future e+=X[i]
add w22,w22,w25 // e+=F(b,c,d)
#ifdef __ARMEB__
#ifdef __AARCH64EB__
ror x7,x7,#32
#else
rev32 x7,x7
@@ -120,7 +119,7 @@ sha1_block_data_order:
ror w22,w22,#2
add w24,w24,w8 // future e+=X[i]
add w20,w20,w25 // e+=F(b,c,d)
#ifdef __ARMEB__
#ifdef __AARCH64EB__
ror x9,x9,#32
#else
rev32 x9,x9
@@ -145,7 +144,7 @@ sha1_block_data_order:
ror w20,w20,#2
add w22,w22,w10 // future e+=X[i]
add w23,w23,w25 // e+=F(b,c,d)
#ifdef __ARMEB__
#ifdef __AARCH64EB__
ror x11,x11,#32
#else
rev32 x11,x11
@@ -170,7 +169,7 @@ sha1_block_data_order:
ror w23,w23,#2
add w20,w20,w12 // future e+=X[i]
add w21,w21,w25 // e+=F(b,c,d)
#ifdef __ARMEB__
#ifdef __AARCH64EB__
ror x13,x13,#32
#else
rev32 x13,x13
@@ -195,7 +194,7 @@ sha1_block_data_order:
ror w21,w21,#2
add w23,w23,w14 // future e+=X[i]
add w24,w24,w25 // e+=F(b,c,d)
#ifdef __ARMEB__
#ifdef __AARCH64EB__
ror x15,x15,#32
#else
rev32 x15,x15
@@ -220,7 +219,7 @@ sha1_block_data_order:
ror w24,w24,#2
add w21,w21,w16 // future e+=X[i]
add w22,w22,w25 // e+=F(b,c,d)
#ifdef __ARMEB__
#ifdef __AARCH64EB__
ror x17,x17,#32
#else
rev32 x17,x17
@@ -1233,6 +1232,8 @@ sha1_block_armv8:
.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2
#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__ELF__)
#if defined(__ELF__)
// See https://www.airs.com/blog/archives/518.
.section .note.GNU-stack,"",%progbits
#endif
#endif // !OPENSSL_NO_ASM
.section .note.GNU-stack,"",%progbits

View File

@@ -8,12 +8,11 @@
#define OPENSSL_NO_ASM
#endif
#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) && defined(__ELF__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the OpenSSL license (the "License"). You may not use
// this file except in compliance with the License. You can obtain a copy
@@ -41,6 +40,7 @@
// Denver 2.01 10.5 (+26%) 6.70 (+8%)
// X-Gene 20.0 (+100%) 12.8 (+300%(***))
// Mongoose 2.36 13.0 (+50%) 8.36 (+33%)
// Kryo 1.92 17.4 (+30%) 11.2 (+8%)
//
// (*) Software SHA256 results are of lesser relevance, presented
// mostly for informational purposes.
@@ -49,7 +49,7 @@
// on Cortex-A53 (or by 4 cycles per round).
// (***) Super-impressive coefficients over gcc-generated code are
// indication of some compiler "pathology", most notably code
// generated with -mgeneral-regs-only is significanty faster
// generated with -mgeneral-regs-only is significantly faster
// and the gap is only 40-90%.
#ifndef __KERNEL__
@@ -101,7 +101,7 @@ sha256_block_data_order:
ldr w19,[x30],#4 // *K++
eor w28,w21,w22 // magic seed
str x1,[x29,#112]
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w3,w3 // 0
#endif
ror w16,w24,#6
@@ -124,7 +124,7 @@ sha256_block_data_order:
add w27,w27,w28 // h+=Maj(a,b,c)
ldr w28,[x30],#4 // *K++, w19 in next round
//add w27,w27,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w4,w4 // 1
#endif
ldp w5,w6,[x1],#2*4
@@ -149,7 +149,7 @@ sha256_block_data_order:
add w26,w26,w19 // h+=Maj(a,b,c)
ldr w19,[x30],#4 // *K++, w28 in next round
//add w26,w26,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w5,w5 // 2
#endif
add w26,w26,w17 // h+=Sigma0(a)
@@ -173,7 +173,7 @@ sha256_block_data_order:
add w25,w25,w28 // h+=Maj(a,b,c)
ldr w28,[x30],#4 // *K++, w19 in next round
//add w25,w25,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w6,w6 // 3
#endif
ldp w7,w8,[x1],#2*4
@@ -198,7 +198,7 @@ sha256_block_data_order:
add w24,w24,w19 // h+=Maj(a,b,c)
ldr w19,[x30],#4 // *K++, w28 in next round
//add w24,w24,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w7,w7 // 4
#endif
add w24,w24,w17 // h+=Sigma0(a)
@@ -222,7 +222,7 @@ sha256_block_data_order:
add w23,w23,w28 // h+=Maj(a,b,c)
ldr w28,[x30],#4 // *K++, w19 in next round
//add w23,w23,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w8,w8 // 5
#endif
ldp w9,w10,[x1],#2*4
@@ -247,7 +247,7 @@ sha256_block_data_order:
add w22,w22,w19 // h+=Maj(a,b,c)
ldr w19,[x30],#4 // *K++, w28 in next round
//add w22,w22,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w9,w9 // 6
#endif
add w22,w22,w17 // h+=Sigma0(a)
@@ -271,7 +271,7 @@ sha256_block_data_order:
add w21,w21,w28 // h+=Maj(a,b,c)
ldr w28,[x30],#4 // *K++, w19 in next round
//add w21,w21,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w10,w10 // 7
#endif
ldp w11,w12,[x1],#2*4
@@ -296,7 +296,7 @@ sha256_block_data_order:
add w20,w20,w19 // h+=Maj(a,b,c)
ldr w19,[x30],#4 // *K++, w28 in next round
//add w20,w20,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w11,w11 // 8
#endif
add w20,w20,w17 // h+=Sigma0(a)
@@ -320,7 +320,7 @@ sha256_block_data_order:
add w27,w27,w28 // h+=Maj(a,b,c)
ldr w28,[x30],#4 // *K++, w19 in next round
//add w27,w27,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w12,w12 // 9
#endif
ldp w13,w14,[x1],#2*4
@@ -345,7 +345,7 @@ sha256_block_data_order:
add w26,w26,w19 // h+=Maj(a,b,c)
ldr w19,[x30],#4 // *K++, w28 in next round
//add w26,w26,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w13,w13 // 10
#endif
add w26,w26,w17 // h+=Sigma0(a)
@@ -369,7 +369,7 @@ sha256_block_data_order:
add w25,w25,w28 // h+=Maj(a,b,c)
ldr w28,[x30],#4 // *K++, w19 in next round
//add w25,w25,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w14,w14 // 11
#endif
ldp w15,w0,[x1],#2*4
@@ -395,7 +395,7 @@ sha256_block_data_order:
add w24,w24,w19 // h+=Maj(a,b,c)
ldr w19,[x30],#4 // *K++, w28 in next round
//add w24,w24,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w15,w15 // 12
#endif
add w24,w24,w17 // h+=Sigma0(a)
@@ -420,7 +420,7 @@ sha256_block_data_order:
add w23,w23,w28 // h+=Maj(a,b,c)
ldr w28,[x30],#4 // *K++, w19 in next round
//add w23,w23,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w0,w0 // 13
#endif
ldp w1,w2,[x1]
@@ -446,7 +446,7 @@ sha256_block_data_order:
add w22,w22,w19 // h+=Maj(a,b,c)
ldr w19,[x30],#4 // *K++, w28 in next round
//add w22,w22,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w1,w1 // 14
#endif
ldr w6,[sp,#12]
@@ -472,7 +472,7 @@ sha256_block_data_order:
add w21,w21,w28 // h+=Maj(a,b,c)
ldr w28,[x30],#4 // *K++, w19 in next round
//add w21,w21,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w2,w2 // 15
#endif
ldr w7,[sp,#0]
@@ -1209,6 +1209,8 @@ sha256_block_armv8:
ret
.size sha256_block_armv8,.-sha256_block_armv8
#endif
#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__ELF__)
#if defined(__ELF__)
// See https://www.airs.com/blog/archives/518.
.section .note.GNU-stack,"",%progbits
#endif
#endif // !OPENSSL_NO_ASM
.section .note.GNU-stack,"",%progbits

View File

@@ -8,12 +8,11 @@
#define OPENSSL_NO_ASM
#endif
#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) && defined(__ELF__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the OpenSSL license (the "License"). You may not use
// this file except in compliance with the License. You can obtain a copy
@@ -41,6 +40,7 @@
// Denver 2.01 10.5 (+26%) 6.70 (+8%)
// X-Gene 20.0 (+100%) 12.8 (+300%(***))
// Mongoose 2.36 13.0 (+50%) 8.36 (+33%)
// Kryo 1.92 17.4 (+30%) 11.2 (+8%)
//
// (*) Software SHA256 results are of lesser relevance, presented
// mostly for informational purposes.
@@ -49,7 +49,7 @@
// on Cortex-A53 (or by 4 cycles per round).
// (***) Super-impressive coefficients over gcc-generated code are
// indication of some compiler "pathology", most notably code
// generated with -mgeneral-regs-only is significanty faster
// generated with -mgeneral-regs-only is significantly faster
// and the gap is only 40-90%.
#ifndef __KERNEL__
@@ -65,6 +65,17 @@
.type sha512_block_data_order,%function
.align 6
sha512_block_data_order:
AARCH64_VALID_CALL_TARGET
#ifndef __KERNEL__
#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
adrp x16,:pg_hi21_nc:OPENSSL_armcap_P
#else
adrp x16,OPENSSL_armcap_P
#endif
ldr w16,[x16,:lo12:OPENSSL_armcap_P]
tst w16,#ARMV8_SHA512
b.ne .Lv8_entry
#endif
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-128]!
add x29,sp,#0
@@ -90,7 +101,7 @@ sha512_block_data_order:
ldr x19,[x30],#8 // *K++
eor x28,x21,x22 // magic seed
str x1,[x29,#112]
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x3,x3 // 0
#endif
ror x16,x24,#14
@@ -113,7 +124,7 @@ sha512_block_data_order:
add x27,x27,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x27,x27,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x4,x4 // 1
#endif
ldp x5,x6,[x1],#2*8
@@ -138,7 +149,7 @@ sha512_block_data_order:
add x26,x26,x19 // h+=Maj(a,b,c)
ldr x19,[x30],#8 // *K++, x28 in next round
//add x26,x26,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x5,x5 // 2
#endif
add x26,x26,x17 // h+=Sigma0(a)
@@ -162,7 +173,7 @@ sha512_block_data_order:
add x25,x25,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x25,x25,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x6,x6 // 3
#endif
ldp x7,x8,[x1],#2*8
@@ -187,7 +198,7 @@ sha512_block_data_order:
add x24,x24,x19 // h+=Maj(a,b,c)
ldr x19,[x30],#8 // *K++, x28 in next round
//add x24,x24,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x7,x7 // 4
#endif
add x24,x24,x17 // h+=Sigma0(a)
@@ -211,7 +222,7 @@ sha512_block_data_order:
add x23,x23,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x23,x23,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x8,x8 // 5
#endif
ldp x9,x10,[x1],#2*8
@@ -236,7 +247,7 @@ sha512_block_data_order:
add x22,x22,x19 // h+=Maj(a,b,c)
ldr x19,[x30],#8 // *K++, x28 in next round
//add x22,x22,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x9,x9 // 6
#endif
add x22,x22,x17 // h+=Sigma0(a)
@@ -260,7 +271,7 @@ sha512_block_data_order:
add x21,x21,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x21,x21,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x10,x10 // 7
#endif
ldp x11,x12,[x1],#2*8
@@ -285,7 +296,7 @@ sha512_block_data_order:
add x20,x20,x19 // h+=Maj(a,b,c)
ldr x19,[x30],#8 // *K++, x28 in next round
//add x20,x20,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x11,x11 // 8
#endif
add x20,x20,x17 // h+=Sigma0(a)
@@ -309,7 +320,7 @@ sha512_block_data_order:
add x27,x27,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x27,x27,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x12,x12 // 9
#endif
ldp x13,x14,[x1],#2*8
@@ -334,7 +345,7 @@ sha512_block_data_order:
add x26,x26,x19 // h+=Maj(a,b,c)
ldr x19,[x30],#8 // *K++, x28 in next round
//add x26,x26,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x13,x13 // 10
#endif
add x26,x26,x17 // h+=Sigma0(a)
@@ -358,7 +369,7 @@ sha512_block_data_order:
add x25,x25,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x25,x25,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x14,x14 // 11
#endif
ldp x15,x0,[x1],#2*8
@@ -384,7 +395,7 @@ sha512_block_data_order:
add x24,x24,x19 // h+=Maj(a,b,c)
ldr x19,[x30],#8 // *K++, x28 in next round
//add x24,x24,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x15,x15 // 12
#endif
add x24,x24,x17 // h+=Sigma0(a)
@@ -409,7 +420,7 @@ sha512_block_data_order:
add x23,x23,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x23,x23,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x0,x0 // 13
#endif
ldp x1,x2,[x1]
@@ -435,7 +446,7 @@ sha512_block_data_order:
add x22,x22,x19 // h+=Maj(a,b,c)
ldr x19,[x30],#8 // *K++, x28 in next round
//add x22,x22,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x1,x1 // 14
#endif
ldr x6,[sp,#24]
@@ -461,7 +472,7 @@ sha512_block_data_order:
add x21,x21,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x21,x21,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x2,x2 // 15
#endif
ldr x7,[sp,#0]
@@ -1079,6 +1090,529 @@ sha512_block_data_order:
.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2
.text
#ifndef __KERNEL__
.type sha512_block_armv8,%function
.align 6
sha512_block_armv8:
.Lv8_entry:
stp x29,x30,[sp,#-16]!
add x29,sp,#0
ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 // load input
ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
ld1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // load context
adrp x3,.LK512
add x3,x3,:lo12:.LK512
rev64 v16.16b,v16.16b
rev64 v17.16b,v17.16b
rev64 v18.16b,v18.16b
rev64 v19.16b,v19.16b
rev64 v20.16b,v20.16b
rev64 v21.16b,v21.16b
rev64 v22.16b,v22.16b
rev64 v23.16b,v23.16b
b .Loop_hw
.align 4
.Loop_hw:
ld1 {v24.2d},[x3],#16
subs x2,x2,#1
sub x4,x1,#128
orr v26.16b,v0.16b,v0.16b // offload
orr v27.16b,v1.16b,v1.16b
orr v28.16b,v2.16b,v2.16b
orr v29.16b,v3.16b,v3.16b
csel x1,x1,x4,ne // conditional rewind
add v24.2d,v24.2d,v16.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v2.16b,v3.16b,#8
ext v6.16b,v1.16b,v2.16b,#8
add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec08230 //sha512su0 v16.16b,v17.16b
ext v7.16b,v20.16b,v21.16b,#8
.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
.inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
add v4.2d,v1.2d,v3.2d // "D + T1"
.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
add v25.2d,v25.2d,v17.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v4.16b,v2.16b,#8
ext v6.16b,v0.16b,v4.16b,#8
add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08251 //sha512su0 v17.16b,v18.16b
ext v7.16b,v21.16b,v22.16b,#8
.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
.inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
add v1.2d,v0.2d,v2.2d // "D + T1"
.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
add v24.2d,v24.2d,v18.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v1.16b,v4.16b,#8
ext v6.16b,v3.16b,v1.16b,#8
add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec08272 //sha512su0 v18.16b,v19.16b
ext v7.16b,v22.16b,v23.16b,#8
.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
.inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
add v0.2d,v3.2d,v4.2d // "D + T1"
.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
add v25.2d,v25.2d,v19.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v0.16b,v1.16b,#8
ext v6.16b,v2.16b,v0.16b,#8
add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08293 //sha512su0 v19.16b,v20.16b
ext v7.16b,v23.16b,v16.16b,#8
.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
.inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
add v3.2d,v2.2d,v1.2d // "D + T1"
.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
add v24.2d,v24.2d,v20.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v3.16b,v0.16b,#8
ext v6.16b,v4.16b,v3.16b,#8
add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec082b4 //sha512su0 v20.16b,v21.16b
ext v7.16b,v16.16b,v17.16b,#8
.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
.inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
add v2.2d,v4.2d,v0.2d // "D + T1"
.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
add v25.2d,v25.2d,v21.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v2.16b,v3.16b,#8
ext v6.16b,v1.16b,v2.16b,#8
add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec082d5 //sha512su0 v21.16b,v22.16b
ext v7.16b,v17.16b,v18.16b,#8
.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
.inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
add v4.2d,v1.2d,v3.2d // "D + T1"
.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
add v24.2d,v24.2d,v22.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v4.16b,v2.16b,#8
ext v6.16b,v0.16b,v4.16b,#8
add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec082f6 //sha512su0 v22.16b,v23.16b
ext v7.16b,v18.16b,v19.16b,#8
.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
.inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
add v1.2d,v0.2d,v2.2d // "D + T1"
.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
add v25.2d,v25.2d,v23.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v1.16b,v4.16b,#8
ext v6.16b,v3.16b,v1.16b,#8
add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08217 //sha512su0 v23.16b,v16.16b
ext v7.16b,v19.16b,v20.16b,#8
.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
.inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
add v0.2d,v3.2d,v4.2d // "D + T1"
.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
add v24.2d,v24.2d,v16.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v0.16b,v1.16b,#8
ext v6.16b,v2.16b,v0.16b,#8
add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec08230 //sha512su0 v16.16b,v17.16b
ext v7.16b,v20.16b,v21.16b,#8
.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
.inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
add v3.2d,v2.2d,v1.2d // "D + T1"
.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
add v25.2d,v25.2d,v17.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v3.16b,v0.16b,#8
ext v6.16b,v4.16b,v3.16b,#8
add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08251 //sha512su0 v17.16b,v18.16b
ext v7.16b,v21.16b,v22.16b,#8
.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
.inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
add v2.2d,v4.2d,v0.2d // "D + T1"
.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
add v24.2d,v24.2d,v18.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v2.16b,v3.16b,#8
ext v6.16b,v1.16b,v2.16b,#8
add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec08272 //sha512su0 v18.16b,v19.16b
ext v7.16b,v22.16b,v23.16b,#8
.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
.inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
add v4.2d,v1.2d,v3.2d // "D + T1"
.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
add v25.2d,v25.2d,v19.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v4.16b,v2.16b,#8
ext v6.16b,v0.16b,v4.16b,#8
add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08293 //sha512su0 v19.16b,v20.16b
ext v7.16b,v23.16b,v16.16b,#8
.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
.inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
add v1.2d,v0.2d,v2.2d // "D + T1"
.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
add v24.2d,v24.2d,v20.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v1.16b,v4.16b,#8
ext v6.16b,v3.16b,v1.16b,#8
add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec082b4 //sha512su0 v20.16b,v21.16b
ext v7.16b,v16.16b,v17.16b,#8
.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
.inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
add v0.2d,v3.2d,v4.2d // "D + T1"
.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
add v25.2d,v25.2d,v21.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v0.16b,v1.16b,#8
ext v6.16b,v2.16b,v0.16b,#8
add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec082d5 //sha512su0 v21.16b,v22.16b
ext v7.16b,v17.16b,v18.16b,#8
.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
.inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
add v3.2d,v2.2d,v1.2d // "D + T1"
.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
add v24.2d,v24.2d,v22.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v3.16b,v0.16b,#8
ext v6.16b,v4.16b,v3.16b,#8
add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec082f6 //sha512su0 v22.16b,v23.16b
ext v7.16b,v18.16b,v19.16b,#8
.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
.inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
add v2.2d,v4.2d,v0.2d // "D + T1"
.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
add v25.2d,v25.2d,v23.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v2.16b,v3.16b,#8
ext v6.16b,v1.16b,v2.16b,#8
add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08217 //sha512su0 v23.16b,v16.16b
ext v7.16b,v19.16b,v20.16b,#8
.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
.inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
add v4.2d,v1.2d,v3.2d // "D + T1"
.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
add v24.2d,v24.2d,v16.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v4.16b,v2.16b,#8
ext v6.16b,v0.16b,v4.16b,#8
add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec08230 //sha512su0 v16.16b,v17.16b
ext v7.16b,v20.16b,v21.16b,#8
.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
.inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
add v1.2d,v0.2d,v2.2d // "D + T1"
.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
add v25.2d,v25.2d,v17.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v1.16b,v4.16b,#8
ext v6.16b,v3.16b,v1.16b,#8
add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08251 //sha512su0 v17.16b,v18.16b
ext v7.16b,v21.16b,v22.16b,#8
.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
.inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
add v0.2d,v3.2d,v4.2d // "D + T1"
.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
add v24.2d,v24.2d,v18.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v0.16b,v1.16b,#8
ext v6.16b,v2.16b,v0.16b,#8
add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec08272 //sha512su0 v18.16b,v19.16b
ext v7.16b,v22.16b,v23.16b,#8
.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
.inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
add v3.2d,v2.2d,v1.2d // "D + T1"
.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
add v25.2d,v25.2d,v19.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v3.16b,v0.16b,#8
ext v6.16b,v4.16b,v3.16b,#8
add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08293 //sha512su0 v19.16b,v20.16b
ext v7.16b,v23.16b,v16.16b,#8
.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
.inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
add v2.2d,v4.2d,v0.2d // "D + T1"
.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
add v24.2d,v24.2d,v20.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v2.16b,v3.16b,#8
ext v6.16b,v1.16b,v2.16b,#8
add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec082b4 //sha512su0 v20.16b,v21.16b
ext v7.16b,v16.16b,v17.16b,#8
.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
.inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
add v4.2d,v1.2d,v3.2d // "D + T1"
.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
add v25.2d,v25.2d,v21.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v4.16b,v2.16b,#8
ext v6.16b,v0.16b,v4.16b,#8
add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec082d5 //sha512su0 v21.16b,v22.16b
ext v7.16b,v17.16b,v18.16b,#8
.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
.inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
add v1.2d,v0.2d,v2.2d // "D + T1"
.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
add v24.2d,v24.2d,v22.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v1.16b,v4.16b,#8
ext v6.16b,v3.16b,v1.16b,#8
add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec082f6 //sha512su0 v22.16b,v23.16b
ext v7.16b,v18.16b,v19.16b,#8
.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
.inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
add v0.2d,v3.2d,v4.2d // "D + T1"
.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
add v25.2d,v25.2d,v23.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v0.16b,v1.16b,#8
ext v6.16b,v2.16b,v0.16b,#8
add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08217 //sha512su0 v23.16b,v16.16b
ext v7.16b,v19.16b,v20.16b,#8
.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
.inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
add v3.2d,v2.2d,v1.2d // "D + T1"
.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
add v24.2d,v24.2d,v16.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v3.16b,v0.16b,#8
ext v6.16b,v4.16b,v3.16b,#8
add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec08230 //sha512su0 v16.16b,v17.16b
ext v7.16b,v20.16b,v21.16b,#8
.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
.inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
add v2.2d,v4.2d,v0.2d // "D + T1"
.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
add v25.2d,v25.2d,v17.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v2.16b,v3.16b,#8
ext v6.16b,v1.16b,v2.16b,#8
add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08251 //sha512su0 v17.16b,v18.16b
ext v7.16b,v21.16b,v22.16b,#8
.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
.inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
add v4.2d,v1.2d,v3.2d // "D + T1"
.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
add v24.2d,v24.2d,v18.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v4.16b,v2.16b,#8
ext v6.16b,v0.16b,v4.16b,#8
add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec08272 //sha512su0 v18.16b,v19.16b
ext v7.16b,v22.16b,v23.16b,#8
.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
.inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
add v1.2d,v0.2d,v2.2d // "D + T1"
.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
add v25.2d,v25.2d,v19.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v1.16b,v4.16b,#8
ext v6.16b,v3.16b,v1.16b,#8
add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08293 //sha512su0 v19.16b,v20.16b
ext v7.16b,v23.16b,v16.16b,#8
.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
.inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
add v0.2d,v3.2d,v4.2d // "D + T1"
.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
add v24.2d,v24.2d,v20.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v0.16b,v1.16b,#8
ext v6.16b,v2.16b,v0.16b,#8
add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec082b4 //sha512su0 v20.16b,v21.16b
ext v7.16b,v16.16b,v17.16b,#8
.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
.inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
add v3.2d,v2.2d,v1.2d // "D + T1"
.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
add v25.2d,v25.2d,v21.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v3.16b,v0.16b,#8
ext v6.16b,v4.16b,v3.16b,#8
add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec082d5 //sha512su0 v21.16b,v22.16b
ext v7.16b,v17.16b,v18.16b,#8
.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
.inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
add v2.2d,v4.2d,v0.2d // "D + T1"
.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
add v24.2d,v24.2d,v22.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v2.16b,v3.16b,#8
ext v6.16b,v1.16b,v2.16b,#8
add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec082f6 //sha512su0 v22.16b,v23.16b
ext v7.16b,v18.16b,v19.16b,#8
.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
.inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
add v4.2d,v1.2d,v3.2d // "D + T1"
.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
add v25.2d,v25.2d,v23.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v4.16b,v2.16b,#8
ext v6.16b,v0.16b,v4.16b,#8
add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08217 //sha512su0 v23.16b,v16.16b
ext v7.16b,v19.16b,v20.16b,#8
.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
.inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
add v1.2d,v0.2d,v2.2d // "D + T1"
.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
ld1 {v25.2d},[x3],#16
add v24.2d,v24.2d,v16.2d
ld1 {v16.16b},[x1],#16 // load next input
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v1.16b,v4.16b,#8
ext v6.16b,v3.16b,v1.16b,#8
add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
rev64 v16.16b,v16.16b
add v0.2d,v3.2d,v4.2d // "D + T1"
.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
ld1 {v24.2d},[x3],#16
add v25.2d,v25.2d,v17.2d
ld1 {v17.16b},[x1],#16 // load next input
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v0.16b,v1.16b,#8
ext v6.16b,v2.16b,v0.16b,#8
add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
rev64 v17.16b,v17.16b
add v3.2d,v2.2d,v1.2d // "D + T1"
.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
ld1 {v25.2d},[x3],#16
add v24.2d,v24.2d,v18.2d
ld1 {v18.16b},[x1],#16 // load next input
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v3.16b,v0.16b,#8
ext v6.16b,v4.16b,v3.16b,#8
add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
rev64 v18.16b,v18.16b
add v2.2d,v4.2d,v0.2d // "D + T1"
.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
ld1 {v24.2d},[x3],#16
add v25.2d,v25.2d,v19.2d
ld1 {v19.16b},[x1],#16 // load next input
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v2.16b,v3.16b,#8
ext v6.16b,v1.16b,v2.16b,#8
add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
rev64 v19.16b,v19.16b
add v4.2d,v1.2d,v3.2d // "D + T1"
.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
ld1 {v25.2d},[x3],#16
add v24.2d,v24.2d,v20.2d
ld1 {v20.16b},[x1],#16 // load next input
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v4.16b,v2.16b,#8
ext v6.16b,v0.16b,v4.16b,#8
add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
rev64 v20.16b,v20.16b
add v1.2d,v0.2d,v2.2d // "D + T1"
.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
ld1 {v24.2d},[x3],#16
add v25.2d,v25.2d,v21.2d
ld1 {v21.16b},[x1],#16 // load next input
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v1.16b,v4.16b,#8
ext v6.16b,v3.16b,v1.16b,#8
add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
rev64 v21.16b,v21.16b
add v0.2d,v3.2d,v4.2d // "D + T1"
.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
ld1 {v25.2d},[x3],#16
add v24.2d,v24.2d,v22.2d
ld1 {v22.16b},[x1],#16 // load next input
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v0.16b,v1.16b,#8
ext v6.16b,v2.16b,v0.16b,#8
add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
rev64 v22.16b,v22.16b
add v3.2d,v2.2d,v1.2d // "D + T1"
.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
sub x3,x3,#80*8 // rewind
add v25.2d,v25.2d,v23.2d
ld1 {v23.16b},[x1],#16 // load next input
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v3.16b,v0.16b,#8
ext v6.16b,v4.16b,v3.16b,#8
add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
rev64 v23.16b,v23.16b
add v2.2d,v4.2d,v0.2d // "D + T1"
.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
add v0.2d,v0.2d,v26.2d // accumulate
add v1.2d,v1.2d,v27.2d
add v2.2d,v2.2d,v28.2d
add v3.2d,v3.2d,v29.2d
cbnz x2,.Loop_hw
st1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // store context
ldr x29,[sp],#16
ret
.size sha512_block_armv8,.-sha512_block_armv8
#endif
#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__ELF__)
#if defined(__ELF__)
// See https://www.airs.com/blog/archives/518.
.section .note.GNU-stack,"",%progbits
#endif
#endif // !OPENSSL_NO_ASM
.section .note.GNU-stack,"",%progbits

View File

@@ -8,8 +8,7 @@
#define OPENSSL_NO_ASM
#endif
#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) && defined(__ELF__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
@@ -1230,6 +1229,8 @@ vpaes_ctr32_encrypt_blocks:
AARCH64_VALIDATE_LINK_REGISTER
ret
.size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks
#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__ELF__)
#if defined(__ELF__)
// See https://www.airs.com/blog/archives/518.
.section .note.GNU-stack,"",%progbits
#endif
#endif // !OPENSSL_NO_ASM
.section .note.GNU-stack,"",%progbits