Source release 18.1.0

John "Juce" Bruce
2023-06-23 15:45:08 -07:00
parent 2baa7c6e2b
commit b2c35151ad
2074 changed files with 196004 additions and 427059 deletions

View File

@@ -8,8 +8,7 @@
#define OPENSSL_NO_ASM
#endif
#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) && defined(__ELF__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
@@ -635,7 +634,7 @@ aes_hw_ctr32_encrypt_blocks:
//
// [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
// [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w8, w8
#endif
add w10, w8, #1
@@ -797,6 +796,8 @@ aes_hw_ctr32_encrypt_blocks:
ret
.size aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks
#endif
#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__ELF__)
#if defined(__ELF__)
// See https://www.airs.com/blog/archives/518.
.section .note.GNU-stack,"",%progbits
#endif
#endif // !OPENSSL_NO_ASM
.section .note.GNU-stack,"",%progbits

File diff suppressed because it is too large

View File

@@ -8,8 +8,7 @@
#define OPENSSL_NO_ASM
#endif
#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) && defined(__ELF__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
@@ -1431,6 +1430,8 @@ __bn_mul4x_mont:
.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 4
#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__ELF__)
#if defined(__ELF__)
// See https://www.airs.com/blog/archives/518.
.section .note.GNU-stack,"",%progbits
#endif
#endif // !OPENSSL_NO_ASM
.section .note.GNU-stack,"",%progbits

View File

@@ -0,0 +1,101 @@
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) && defined(__ELF__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>
.text
// BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
// size_t num);
.type bn_add_words, %function
.globl bn_add_words
.hidden bn_add_words
.align 4
bn_add_words:
AARCH64_VALID_CALL_TARGET
# Clear the carry flag.
cmn xzr, xzr
# aarch64 can load two registers at a time, so we do two loop iterations at
# a time. Split x3 = 2 * x8 + x3. This allows loop
# operations to use CBNZ without clobbering the carry flag.
lsr x8, x3, #1
and x3, x3, #1
cbz x8, .Ladd_tail
.Ladd_loop:
ldp x4, x5, [x1], #16
ldp x6, x7, [x2], #16
sub x8, x8, #1
adcs x4, x4, x6
adcs x5, x5, x7
stp x4, x5, [x0], #16
cbnz x8, .Ladd_loop
.Ladd_tail:
cbz x3, .Ladd_exit
ldr x4, [x1], #8
ldr x6, [x2], #8
adcs x4, x4, x6
str x4, [x0], #8
.Ladd_exit:
cset x0, cs
ret
.size bn_add_words,.-bn_add_words
// BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
// size_t num);
.type bn_sub_words, %function
.globl bn_sub_words
.hidden bn_sub_words
.align 4
bn_sub_words:
AARCH64_VALID_CALL_TARGET
# Set the carry flag. Arm's borrow bit is flipped from the carry flag,
# so we want C = 1 here.
cmp xzr, xzr
# aarch64 can load two registers at a time, so we do two loop iterations at
# a time. Split x3 = 2 * x8 + x3. This allows loop
# operations to use CBNZ without clobbering the carry flag.
lsr x8, x3, #1
and x3, x3, #1
cbz x8, .Lsub_tail
.Lsub_loop:
ldp x4, x5, [x1], #16
ldp x6, x7, [x2], #16
sub x8, x8, #1
sbcs x4, x4, x6
sbcs x5, x5, x7
stp x4, x5, [x0], #16
cbnz x8, .Lsub_loop
.Lsub_tail:
cbz x3, .Lsub_exit
ldr x4, [x1], #8
ldr x6, [x2], #8
sbcs x4, x4, x6
str x4, [x0], #8
.Lsub_exit:
cset x0, cc
ret
.size bn_sub_words,.-bn_sub_words
#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__ELF__)
#if defined(__ELF__)
// See https://www.airs.com/blog/archives/518.
.section .note.GNU-stack,"",%progbits
#endif
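
The bn_add_words/bn_sub_words file above leans on one trick: the carry flag is primed with cmn/cmp xzr, xzr and then kept live across the whole loop, which is why the iteration count is split as x3 = 2 * x8 + x3 and the counter is decremented with sub and tested with cbnz (neither touches the flags). As a plain reference for what the two routines compute, here is a minimal Python sketch; it is not BoringSSL code, and the _ref names and the return-a-list interface are mine.

MASK64 = (1 << 64) - 1

def bn_add_words_ref(ap, bp):
    # rp[i] = ap[i] + bp[i] + carry (64-bit wrap); returns (rp, final carry).
    rp, carry = [], 0
    for a, b in zip(ap, bp):
        s = a + b + carry
        rp.append(s & MASK64)
        carry = s >> 64            # 0 or 1, like "cset x0, cs"
    return rp, carry

def bn_sub_words_ref(ap, bp):
    # rp[i] = ap[i] - bp[i] - borrow (64-bit wrap); returns (rp, final borrow).
    rp, borrow = [], 0
    for a, b in zip(ap, bp):
        d = a - b - borrow
        rp.append(d & MASK64)
        borrow = int(d < 0)        # like "cset x0, cc"
    return rp, borrow

For example, bn_add_words_ref([2**64 - 1], [1]) gives ([0], 1), matching the carry the assembly returns in x0.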

View File

@@ -8,8 +8,7 @@
#define OPENSSL_NO_ASM
#endif
#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) && defined(__ELF__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
@@ -341,6 +340,8 @@ gcm_ghash_neon:
.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2
#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__ELF__)
#if defined(__ELF__)
// See https://www.airs.com/blog/archives/518.
.section .note.GNU-stack,"",%progbits
#endif
#endif // !OPENSSL_NO_ASM
.section .note.GNU-stack,"",%progbits

View File

@@ -8,8 +8,7 @@
#define OPENSSL_NO_ASM
#endif
#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) && defined(__ELF__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
@@ -120,7 +119,7 @@ gcm_gmult_v8:
movi v19.16b,#0xe1
ld1 {v20.2d,v21.2d},[x1] //load twisted H, ...
shl v19.2d,v19.2d,#57
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev64 v17.16b,v17.16b
#endif
ext v3.16b,v17.16b,v17.16b,#8
@@ -145,7 +144,7 @@ gcm_gmult_v8:
eor v18.16b,v18.16b,v2.16b
eor v0.16b,v0.16b,v18.16b
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev64 v0.16b,v0.16b
#endif
ext v0.16b,v0.16b,v0.16b,#8
@@ -184,14 +183,14 @@ gcm_ghash_v8:
ext v0.16b,v0.16b,v0.16b,#8 //rotate Xi
ld1 {v16.2d},[x2],#16 //load [rotated] I[0]
shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev64 v16.16b,v16.16b
rev64 v0.16b,v0.16b
#endif
ext v3.16b,v16.16b,v16.16b,#8 //rotate I[0]
b.lo .Lodd_tail_v8 //x3 was less than 32
ld1 {v17.2d},[x2],x12 //load [rotated] I[1]
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev64 v17.16b,v17.16b
#endif
ext v7.16b,v17.16b,v17.16b,#8
@@ -223,13 +222,13 @@ gcm_ghash_v8:
eor v18.16b,v0.16b,v2.16b
eor v1.16b,v1.16b,v17.16b
ld1 {v17.2d},[x2],x12 //load [rotated] I[i+3]
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev64 v16.16b,v16.16b
#endif
eor v1.16b,v1.16b,v18.16b
pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev64 v17.16b,v17.16b
#endif
ins v2.d[0],v1.d[1]
@@ -279,7 +278,7 @@ gcm_ghash_v8:
eor v0.16b,v0.16b,v18.16b
.Ldone_v8:
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev64 v0.16b,v0.16b
#endif
ext v0.16b,v0.16b,v0.16b,#8
@@ -298,7 +297,7 @@ gcm_ghash_v8_4x:
shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant
ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev64 v0.16b,v0.16b
rev64 v5.16b,v5.16b
rev64 v6.16b,v6.16b
@@ -342,7 +341,7 @@ gcm_ghash_v8_4x:
eor v16.16b,v4.16b,v0.16b
ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
ext v3.16b,v16.16b,v16.16b,#8
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev64 v5.16b,v5.16b
rev64 v6.16b,v6.16b
rev64 v7.16b,v7.16b
@@ -425,7 +424,7 @@ gcm_ghash_v8_4x:
eor v1.16b,v1.16b,v17.16b
ld1 {v4.2d,v5.2d,v6.2d},[x2]
eor v1.16b,v1.16b,v18.16b
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev64 v5.16b,v5.16b
rev64 v6.16b,v6.16b
rev64 v4.16b,v4.16b
@@ -477,7 +476,7 @@ gcm_ghash_v8_4x:
eor v1.16b,v1.16b,v17.16b
ld1 {v4.2d,v5.2d},[x2]
eor v1.16b,v1.16b,v18.16b
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev64 v5.16b,v5.16b
rev64 v4.16b,v4.16b
#endif
@@ -520,7 +519,7 @@ gcm_ghash_v8_4x:
eor v1.16b,v1.16b,v17.16b
ld1 {v4.2d},[x2]
eor v1.16b,v1.16b,v18.16b
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev64 v4.16b,v4.16b
#endif
@@ -560,7 +559,7 @@ gcm_ghash_v8_4x:
eor v0.16b,v0.16b,v18.16b
ext v0.16b,v0.16b,v0.16b,#8
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev64 v0.16b,v0.16b
#endif
st1 {v0.2d},[x0] //write out Xi
@@ -571,6 +570,8 @@ gcm_ghash_v8_4x:
.align 2
.align 2
#endif
#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__ELF__)
#if defined(__ELF__)
// See https://www.airs.com/blog/archives/518.
.section .note.GNU-stack,"",%progbits
#endif
#endif // !OPENSSL_NO_ASM
.section .note.GNU-stack,"",%progbits

File diff suppressed because it is too large

View File

@@ -0,0 +1,321 @@
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) && defined(__ELF__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include "openssl/arm_arch.h"
.text
.globl beeu_mod_inverse_vartime
.hidden beeu_mod_inverse_vartime
.type beeu_mod_inverse_vartime, %function
.align 4
beeu_mod_inverse_vartime:
// Reserve enough space for 14 8-byte registers on the stack
// in the first stp call for x29, x30.
// Then store the remaining callee-saved registers.
//
// | x29 | x30 | x19 | x20 | ... | x27 | x28 | x0 | x2 |
// ^ ^
// sp <------------------- 112 bytes ----------------> old sp
// x29 (FP)
//
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-112]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]
stp x0,x2,[sp,#96]
// B = b3..b0 := a
ldp x25,x26,[x1]
ldp x27,x28,[x1,#16]
// n3..n0 := n
// Note: the values of the input params are changed in the following.
ldp x0,x1,[x2]
ldp x2,x30,[x2,#16]
// A = a3..a0 := n
mov x21, x0
mov x22, x1
mov x23, x2
mov x24, x30
// X = x4..x0 := 1
mov x3, #1
eor x4, x4, x4
eor x5, x5, x5
eor x6, x6, x6
eor x7, x7, x7
// Y = y4..y0 := 0
eor x8, x8, x8
eor x9, x9, x9
eor x10, x10, x10
eor x11, x11, x11
eor x12, x12, x12
.Lbeeu_loop:
// if B == 0, jump to .Lbeeu_loop_end
orr x14, x25, x26
orr x14, x14, x27
// reverse the bit order of x25. This is needed for clz after this macro
rbit x15, x25
orr x14, x14, x28
cbz x14,.Lbeeu_loop_end
// 0 < B < |n|,
// 0 < A <= |n|,
// (1) X*a == B (mod |n|),
// (2) (-1)*Y*a == A (mod |n|)
// Now divide B by the maximum possible power of two in the
// integers, and divide X by the same value mod |n|.
// When we're done, (1) still holds.
// shift := number of trailing 0s in x25
// ( = number of leading 0s in x15; see the "rbit" instruction in TEST_B_ZERO)
clz x13, x15
// If there is no shift, goto shift_A_Y
cbz x13, .Lbeeu_shift_A_Y
// Shift B right by "x13" bits
neg x14, x13
lsr x25, x25, x13
lsl x15, x26, x14
lsr x26, x26, x13
lsl x19, x27, x14
orr x25, x25, x15
lsr x27, x27, x13
lsl x20, x28, x14
orr x26, x26, x19
lsr x28, x28, x13
orr x27, x27, x20
// Shift X right by "x13" bits, adding n whenever X becomes odd.
// x13--;
// x14 := 0; needed in the addition to the most significant word in SHIFT1
eor x14, x14, x14
.Lbeeu_shift_loop_X:
tbz x3, #0, .Lshift1_0
adds x3, x3, x0
adcs x4, x4, x1
adcs x5, x5, x2
adcs x6, x6, x30
adc x7, x7, x14
.Lshift1_0:
// var0 := [var1|var0]<64..1>;
// i.e. concatenate var1 and var0,
// extract bits <64..1> from the resulting 128-bit value
// and put them in var0
extr x3, x4, x3, #1
extr x4, x5, x4, #1
extr x5, x6, x5, #1
extr x6, x7, x6, #1
lsr x7, x7, #1
subs x13, x13, #1
bne .Lbeeu_shift_loop_X
// Note: the steps above perform the same sequence as in p256_beeu-x86_64-asm.pl
// with the following differences:
// - "x13" is set directly to the number of trailing 0s in B
// (using rbit and clz instructions)
// - The loop is only used to call SHIFT1(X)
// and x13 is decreased while executing the X loop.
// - SHIFT256(B, x13) is performed before right-shifting X; they are independent
.Lbeeu_shift_A_Y:
// Same for A and Y.
// Afterwards, (2) still holds.
// Reverse the bit order of x21
// x13 := number of trailing 0s in x21 (= number of leading 0s in x15)
rbit x15, x21
clz x13, x15
// If there is no shift, goto |B-A|, X+Y update
cbz x13, .Lbeeu_update_B_X_or_A_Y
// Shift A right by "x13" bits
neg x14, x13
lsr x21, x21, x13
lsl x15, x22, x14
lsr x22, x22, x13
lsl x19, x23, x14
orr x21, x21, x15
lsr x23, x23, x13
lsl x20, x24, x14
orr x22, x22, x19
lsr x24, x24, x13
orr x23, x23, x20
// Shift Y right by "x13" bits, adding n whenever Y becomes odd.
// x13--;
// x14 := 0; needed in the addition to the most significant word in SHIFT1
eor x14, x14, x14
.Lbeeu_shift_loop_Y:
tbz x8, #0, .Lshift1_1
adds x8, x8, x0
adcs x9, x9, x1
adcs x10, x10, x2
adcs x11, x11, x30
adc x12, x12, x14
.Lshift1_1:
// var0 := [var1|var0]<64..1>;
// i.e. concatenate var1 and var0,
// extract bits <64..1> from the resulting 128-bit value
// and put them in var0
extr x8, x9, x8, #1
extr x9, x10, x9, #1
extr x10, x11, x10, #1
extr x11, x12, x11, #1
lsr x12, x12, #1
subs x13, x13, #1
bne .Lbeeu_shift_loop_Y
.Lbeeu_update_B_X_or_A_Y:
// Try T := B - A; if cs, continue with B > A (cs: carry set = no borrow)
// Note: this is a case of unsigned arithmetic, where T fits in 4 64-bit words
// without taking a sign bit if generated. The lack of a carry would
// indicate a negative result. See, for example,
// https://community.arm.com/developer/ip-products/processors/b/processors-ip-blog/posts/condition-codes-1-condition-flags-and-codes
subs x14, x25, x21
sbcs x15, x26, x22
sbcs x19, x27, x23
sbcs x20, x28, x24
bcs .Lbeeu_B_greater_than_A
// Else A > B =>
// A := A - B; Y := Y + X; goto beginning of the loop
subs x21, x21, x25
sbcs x22, x22, x26
sbcs x23, x23, x27
sbcs x24, x24, x28
adds x8, x8, x3
adcs x9, x9, x4
adcs x10, x10, x5
adcs x11, x11, x6
adc x12, x12, x7
b .Lbeeu_loop
.Lbeeu_B_greater_than_A:
// Continue with B > A =>
// B := B - A; X := X + Y; goto beginning of the loop
mov x25, x14
mov x26, x15
mov x27, x19
mov x28, x20
adds x3, x3, x8
adcs x4, x4, x9
adcs x5, x5, x10
adcs x6, x6, x11
adc x7, x7, x12
b .Lbeeu_loop
.Lbeeu_loop_end:
// The Euclidean algorithm loop ends when A == gcd(a,n);
// this is 1 when a and n are co-prime (i.e. have no common factor).
// Since (-1)*Y*a == A (mod |n|), Y>0
// then out = -Y mod n
// Verify that A = 1 ==> (-1)*Y*a = A = 1 (mod |n|)
// Is A-1 == 0?
// If not, fail.
sub x14, x21, #1
orr x14, x14, x22
orr x14, x14, x23
orr x14, x14, x24
cbnz x14, .Lbeeu_err
// If Y>n ==> Y:=Y-n
.Lbeeu_reduction_loop:
// x_i := y_i - n_i (X is no longer needed, use it as temp)
// (x14 = 0 from above)
subs x3, x8, x0
sbcs x4, x9, x1
sbcs x5, x10, x2
sbcs x6, x11, x30
sbcs x7, x12, x14
// If result is non-negative (i.e., cs = carry set = no borrow),
// y_i := x_i; goto reduce again
// else
// y_i := y_i; continue
csel x8, x3, x8, cs
csel x9, x4, x9, cs
csel x10, x5, x10, cs
csel x11, x6, x11, cs
csel x12, x7, x12, cs
bcs .Lbeeu_reduction_loop
// Now Y < n (Y cannot be equal to n, since the inverse cannot be 0)
// out = -Y = n-Y
subs x8, x0, x8
sbcs x9, x1, x9
sbcs x10, x2, x10
sbcs x11, x30, x11
// Save Y in output (out (x0) was saved on the stack)
ldr x3, [sp,#96]
stp x8, x9, [x3]
stp x10, x11, [x3,#16]
// return 1 (success)
mov x0, #1
b .Lbeeu_finish
.Lbeeu_err:
// return 0 (error)
eor x0, x0, x0
.Lbeeu_finish:
// Restore callee-saved registers, except x0, x2
add sp,x29,#0
ldp x19,x20,[sp,#16]
ldp x21,x22,[sp,#32]
ldp x23,x24,[sp,#48]
ldp x25,x26,[sp,#64]
ldp x27,x28,[sp,#80]
ldp x29,x30,[sp],#112
AARCH64_VALIDATE_LINK_REGISTER
ret
.size beeu_mod_inverse_vartime,.-beeu_mod_inverse_vartime
#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__ELF__)
#if defined(__ELF__)
// See https://www.airs.com/blog/archives/518.
.section .note.GNU-stack,"",%progbits
#endif
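
The beeu_mod_inverse_vartime file above is a binary extended Euclidean inverse. It keeps the documented invariants X*a == B (mod |n|) and (-1)*Y*a == A (mod |n|), strips powers of two from B (and then A) while fixing X (and Y) up with the SHIFT1 step (add n before halving whenever the value is odd), subtracts the smaller of A and B from the larger, and once B == 0 and A == 1 returns -Y mod n. Below is a minimal Python sketch of that flow, assuming n is odd as in its P-256 use; the function name and the use of arbitrary-precision integers are mine, not BoringSSL's.

def beeu_mod_inverse_vartime_ref(a, n):
    # Returns a**-1 mod n, or None when gcd(a, n) != 1.  Assumes n is odd.
    # Invariants, as in the comments above:
    #   X*a == B (mod n)   and   (-1)*Y*a == A (mod n)
    A, B, X, Y = n, a, 1, 0
    while B != 0:
        while B % 2 == 0:        # divide B by the largest possible power of 2
            B //= 2
            if X % 2:            # SHIFT1: n is odd, so X + n is even
                X += n
            X //= 2
        while A % 2 == 0:        # same for A and Y
            A //= 2
            if Y % 2:
                Y += n
            Y //= 2
        if B >= A:               # subtract the smaller from the larger;
            B, X = B - A, X + Y  # both invariants are preserved
        else:
            A, Y = A - B, Y + X
    if A != 1:                   # gcd(a, n) != 1: no inverse exists
        return None
    return (n - Y % n) % n       # (-1)*Y*a == 1 (mod n), so a**-1 == -Y mod n

For example, beeu_mod_inverse_vartime_ref(3, 7) returns 5, and 3 * 5 == 15 == 1 (mod 7).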

View File

@@ -8,8 +8,7 @@
#define OPENSSL_NO_ASM
#endif
#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) && defined(__ELF__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
@@ -52,7 +51,7 @@ sha1_block_data_order:
movz w28,#0x7999
sub x2,x2,#1
movk w28,#0x5a82,lsl#16
#ifdef __ARMEB__
#ifdef __AARCH64EB__
ror x3,x3,#32
#else
rev32 x3,x3
@@ -70,7 +69,7 @@ sha1_block_data_order:
ror w21,w21,#2
add w23,w23,w4 // future e+=X[i]
add w24,w24,w25 // e+=F(b,c,d)
#ifdef __ARMEB__
#ifdef __AARCH64EB__
ror x5,x5,#32
#else
rev32 x5,x5
@@ -95,7 +94,7 @@ sha1_block_data_order:
ror w24,w24,#2
add w21,w21,w6 // future e+=X[i]
add w22,w22,w25 // e+=F(b,c,d)
#ifdef __ARMEB__
#ifdef __AARCH64EB__
ror x7,x7,#32
#else
rev32 x7,x7
@@ -120,7 +119,7 @@ sha1_block_data_order:
ror w22,w22,#2
add w24,w24,w8 // future e+=X[i]
add w20,w20,w25 // e+=F(b,c,d)
#ifdef __ARMEB__
#ifdef __AARCH64EB__
ror x9,x9,#32
#else
rev32 x9,x9
@@ -145,7 +144,7 @@ sha1_block_data_order:
ror w20,w20,#2
add w22,w22,w10 // future e+=X[i]
add w23,w23,w25 // e+=F(b,c,d)
#ifdef __ARMEB__
#ifdef __AARCH64EB__
ror x11,x11,#32
#else
rev32 x11,x11
@@ -170,7 +169,7 @@ sha1_block_data_order:
ror w23,w23,#2
add w20,w20,w12 // future e+=X[i]
add w21,w21,w25 // e+=F(b,c,d)
#ifdef __ARMEB__
#ifdef __AARCH64EB__
ror x13,x13,#32
#else
rev32 x13,x13
@@ -195,7 +194,7 @@ sha1_block_data_order:
ror w21,w21,#2
add w23,w23,w14 // future e+=X[i]
add w24,w24,w25 // e+=F(b,c,d)
#ifdef __ARMEB__
#ifdef __AARCH64EB__
ror x15,x15,#32
#else
rev32 x15,x15
@@ -220,7 +219,7 @@ sha1_block_data_order:
ror w24,w24,#2
add w21,w21,w16 // future e+=X[i]
add w22,w22,w25 // e+=F(b,c,d)
#ifdef __ARMEB__
#ifdef __AARCH64EB__
ror x17,x17,#32
#else
rev32 x17,x17
@@ -1233,6 +1232,8 @@ sha1_block_armv8:
.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2
#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__ELF__)
#if defined(__ELF__)
// See https://www.airs.com/blog/archives/518.
.section .note.GNU-stack,"",%progbits
#endif
#endif // !OPENSSL_NO_ASM
.section .note.GNU-stack,"",%progbits

View File

@@ -8,12 +8,11 @@
#define OPENSSL_NO_ASM
#endif
#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) && defined(__ELF__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the OpenSSL license (the "License"). You may not use
// this file except in compliance with the License. You can obtain a copy
@@ -41,6 +40,7 @@
// Denver 2.01 10.5 (+26%) 6.70 (+8%)
// X-Gene 20.0 (+100%) 12.8 (+300%(***))
// Mongoose 2.36 13.0 (+50%) 8.36 (+33%)
// Kryo 1.92 17.4 (+30%) 11.2 (+8%)
//
// (*) Software SHA256 results are of lesser relevance, presented
// mostly for informational purposes.
@@ -49,7 +49,7 @@
// on Cortex-A53 (or by 4 cycles per round).
// (***) Super-impressive coefficients over gcc-generated code are
// indication of some compiler "pathology", most notably code
// generated with -mgeneral-regs-only is significanty faster
// generated with -mgeneral-regs-only is significantly faster
// and the gap is only 40-90%.
#ifndef __KERNEL__
@@ -101,7 +101,7 @@ sha256_block_data_order:
ldr w19,[x30],#4 // *K++
eor w28,w21,w22 // magic seed
str x1,[x29,#112]
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w3,w3 // 0
#endif
ror w16,w24,#6
@@ -124,7 +124,7 @@ sha256_block_data_order:
add w27,w27,w28 // h+=Maj(a,b,c)
ldr w28,[x30],#4 // *K++, w19 in next round
//add w27,w27,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w4,w4 // 1
#endif
ldp w5,w6,[x1],#2*4
@@ -149,7 +149,7 @@ sha256_block_data_order:
add w26,w26,w19 // h+=Maj(a,b,c)
ldr w19,[x30],#4 // *K++, w28 in next round
//add w26,w26,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w5,w5 // 2
#endif
add w26,w26,w17 // h+=Sigma0(a)
@@ -173,7 +173,7 @@ sha256_block_data_order:
add w25,w25,w28 // h+=Maj(a,b,c)
ldr w28,[x30],#4 // *K++, w19 in next round
//add w25,w25,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w6,w6 // 3
#endif
ldp w7,w8,[x1],#2*4
@@ -198,7 +198,7 @@ sha256_block_data_order:
add w24,w24,w19 // h+=Maj(a,b,c)
ldr w19,[x30],#4 // *K++, w28 in next round
//add w24,w24,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w7,w7 // 4
#endif
add w24,w24,w17 // h+=Sigma0(a)
@@ -222,7 +222,7 @@ sha256_block_data_order:
add w23,w23,w28 // h+=Maj(a,b,c)
ldr w28,[x30],#4 // *K++, w19 in next round
//add w23,w23,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w8,w8 // 5
#endif
ldp w9,w10,[x1],#2*4
@@ -247,7 +247,7 @@ sha256_block_data_order:
add w22,w22,w19 // h+=Maj(a,b,c)
ldr w19,[x30],#4 // *K++, w28 in next round
//add w22,w22,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w9,w9 // 6
#endif
add w22,w22,w17 // h+=Sigma0(a)
@@ -271,7 +271,7 @@ sha256_block_data_order:
add w21,w21,w28 // h+=Maj(a,b,c)
ldr w28,[x30],#4 // *K++, w19 in next round
//add w21,w21,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w10,w10 // 7
#endif
ldp w11,w12,[x1],#2*4
@@ -296,7 +296,7 @@ sha256_block_data_order:
add w20,w20,w19 // h+=Maj(a,b,c)
ldr w19,[x30],#4 // *K++, w28 in next round
//add w20,w20,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w11,w11 // 8
#endif
add w20,w20,w17 // h+=Sigma0(a)
@@ -320,7 +320,7 @@ sha256_block_data_order:
add w27,w27,w28 // h+=Maj(a,b,c)
ldr w28,[x30],#4 // *K++, w19 in next round
//add w27,w27,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w12,w12 // 9
#endif
ldp w13,w14,[x1],#2*4
@@ -345,7 +345,7 @@ sha256_block_data_order:
add w26,w26,w19 // h+=Maj(a,b,c)
ldr w19,[x30],#4 // *K++, w28 in next round
//add w26,w26,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w13,w13 // 10
#endif
add w26,w26,w17 // h+=Sigma0(a)
@@ -369,7 +369,7 @@ sha256_block_data_order:
add w25,w25,w28 // h+=Maj(a,b,c)
ldr w28,[x30],#4 // *K++, w19 in next round
//add w25,w25,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w14,w14 // 11
#endif
ldp w15,w0,[x1],#2*4
@@ -395,7 +395,7 @@ sha256_block_data_order:
add w24,w24,w19 // h+=Maj(a,b,c)
ldr w19,[x30],#4 // *K++, w28 in next round
//add w24,w24,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w15,w15 // 12
#endif
add w24,w24,w17 // h+=Sigma0(a)
@@ -420,7 +420,7 @@ sha256_block_data_order:
add w23,w23,w28 // h+=Maj(a,b,c)
ldr w28,[x30],#4 // *K++, w19 in next round
//add w23,w23,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w0,w0 // 13
#endif
ldp w1,w2,[x1]
@@ -446,7 +446,7 @@ sha256_block_data_order:
add w22,w22,w19 // h+=Maj(a,b,c)
ldr w19,[x30],#4 // *K++, w28 in next round
//add w22,w22,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w1,w1 // 14
#endif
ldr w6,[sp,#12]
@@ -472,7 +472,7 @@ sha256_block_data_order:
add w21,w21,w28 // h+=Maj(a,b,c)
ldr w28,[x30],#4 // *K++, w19 in next round
//add w21,w21,w17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev w2,w2 // 15
#endif
ldr w7,[sp,#0]
@@ -1209,6 +1209,8 @@ sha256_block_armv8:
ret
.size sha256_block_armv8,.-sha256_block_armv8
#endif
#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__ELF__)
#if defined(__ELF__)
// See https://www.airs.com/blog/archives/518.
.section .note.GNU-stack,"",%progbits
#endif
#endif // !OPENSSL_NO_ASM
.section .note.GNU-stack,"",%progbits

View File

@@ -8,12 +8,11 @@
#define OPENSSL_NO_ASM
#endif
#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) && defined(__ELF__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the OpenSSL license (the "License"). You may not use
// this file except in compliance with the License. You can obtain a copy
@@ -41,6 +40,7 @@
// Denver 2.01 10.5 (+26%) 6.70 (+8%)
// X-Gene 20.0 (+100%) 12.8 (+300%(***))
// Mongoose 2.36 13.0 (+50%) 8.36 (+33%)
// Kryo 1.92 17.4 (+30%) 11.2 (+8%)
//
// (*) Software SHA256 results are of lesser relevance, presented
// mostly for informational purposes.
@@ -49,7 +49,7 @@
// on Cortex-A53 (or by 4 cycles per round).
// (***) Super-impressive coefficients over gcc-generated code are
// indication of some compiler "pathology", most notably code
// generated with -mgeneral-regs-only is significanty faster
// generated with -mgeneral-regs-only is significantly faster
// and the gap is only 40-90%.
#ifndef __KERNEL__
@@ -65,6 +65,17 @@
.type sha512_block_data_order,%function
.align 6
sha512_block_data_order:
AARCH64_VALID_CALL_TARGET
#ifndef __KERNEL__
#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
adrp x16,:pg_hi21_nc:OPENSSL_armcap_P
#else
adrp x16,OPENSSL_armcap_P
#endif
ldr w16,[x16,:lo12:OPENSSL_armcap_P]
tst w16,#ARMV8_SHA512
b.ne .Lv8_entry
#endif
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-128]!
add x29,sp,#0
@@ -90,7 +101,7 @@ sha512_block_data_order:
ldr x19,[x30],#8 // *K++
eor x28,x21,x22 // magic seed
str x1,[x29,#112]
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x3,x3 // 0
#endif
ror x16,x24,#14
@@ -113,7 +124,7 @@ sha512_block_data_order:
add x27,x27,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x27,x27,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x4,x4 // 1
#endif
ldp x5,x6,[x1],#2*8
@@ -138,7 +149,7 @@ sha512_block_data_order:
add x26,x26,x19 // h+=Maj(a,b,c)
ldr x19,[x30],#8 // *K++, x28 in next round
//add x26,x26,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x5,x5 // 2
#endif
add x26,x26,x17 // h+=Sigma0(a)
@@ -162,7 +173,7 @@ sha512_block_data_order:
add x25,x25,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x25,x25,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x6,x6 // 3
#endif
ldp x7,x8,[x1],#2*8
@@ -187,7 +198,7 @@ sha512_block_data_order:
add x24,x24,x19 // h+=Maj(a,b,c)
ldr x19,[x30],#8 // *K++, x28 in next round
//add x24,x24,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x7,x7 // 4
#endif
add x24,x24,x17 // h+=Sigma0(a)
@@ -211,7 +222,7 @@ sha512_block_data_order:
add x23,x23,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x23,x23,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x8,x8 // 5
#endif
ldp x9,x10,[x1],#2*8
@@ -236,7 +247,7 @@ sha512_block_data_order:
add x22,x22,x19 // h+=Maj(a,b,c)
ldr x19,[x30],#8 // *K++, x28 in next round
//add x22,x22,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x9,x9 // 6
#endif
add x22,x22,x17 // h+=Sigma0(a)
@@ -260,7 +271,7 @@ sha512_block_data_order:
add x21,x21,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x21,x21,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x10,x10 // 7
#endif
ldp x11,x12,[x1],#2*8
@@ -285,7 +296,7 @@ sha512_block_data_order:
add x20,x20,x19 // h+=Maj(a,b,c)
ldr x19,[x30],#8 // *K++, x28 in next round
//add x20,x20,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x11,x11 // 8
#endif
add x20,x20,x17 // h+=Sigma0(a)
@@ -309,7 +320,7 @@ sha512_block_data_order:
add x27,x27,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x27,x27,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x12,x12 // 9
#endif
ldp x13,x14,[x1],#2*8
@@ -334,7 +345,7 @@ sha512_block_data_order:
add x26,x26,x19 // h+=Maj(a,b,c)
ldr x19,[x30],#8 // *K++, x28 in next round
//add x26,x26,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x13,x13 // 10
#endif
add x26,x26,x17 // h+=Sigma0(a)
@@ -358,7 +369,7 @@ sha512_block_data_order:
add x25,x25,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x25,x25,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x14,x14 // 11
#endif
ldp x15,x0,[x1],#2*8
@@ -384,7 +395,7 @@ sha512_block_data_order:
add x24,x24,x19 // h+=Maj(a,b,c)
ldr x19,[x30],#8 // *K++, x28 in next round
//add x24,x24,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x15,x15 // 12
#endif
add x24,x24,x17 // h+=Sigma0(a)
@@ -409,7 +420,7 @@ sha512_block_data_order:
add x23,x23,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x23,x23,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x0,x0 // 13
#endif
ldp x1,x2,[x1]
@@ -435,7 +446,7 @@ sha512_block_data_order:
add x22,x22,x19 // h+=Maj(a,b,c)
ldr x19,[x30],#8 // *K++, x28 in next round
//add x22,x22,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x1,x1 // 14
#endif
ldr x6,[sp,#24]
@@ -461,7 +472,7 @@ sha512_block_data_order:
add x21,x21,x28 // h+=Maj(a,b,c)
ldr x28,[x30],#8 // *K++, x19 in next round
//add x21,x21,x17 // h+=Sigma0(a)
#ifndef __ARMEB__
#ifndef __AARCH64EB__
rev x2,x2 // 15
#endif
ldr x7,[sp,#0]
@@ -1079,6 +1090,529 @@ sha512_block_data_order:
.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2
.text
#ifndef __KERNEL__
.type sha512_block_armv8,%function
.align 6
sha512_block_armv8:
.Lv8_entry:
stp x29,x30,[sp,#-16]!
add x29,sp,#0
ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 // load input
ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
ld1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // load context
adrp x3,.LK512
add x3,x3,:lo12:.LK512
rev64 v16.16b,v16.16b
rev64 v17.16b,v17.16b
rev64 v18.16b,v18.16b
rev64 v19.16b,v19.16b
rev64 v20.16b,v20.16b
rev64 v21.16b,v21.16b
rev64 v22.16b,v22.16b
rev64 v23.16b,v23.16b
b .Loop_hw
.align 4
.Loop_hw:
ld1 {v24.2d},[x3],#16
subs x2,x2,#1
sub x4,x1,#128
orr v26.16b,v0.16b,v0.16b // offload
orr v27.16b,v1.16b,v1.16b
orr v28.16b,v2.16b,v2.16b
orr v29.16b,v3.16b,v3.16b
csel x1,x1,x4,ne // conditional rewind
add v24.2d,v24.2d,v16.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v2.16b,v3.16b,#8
ext v6.16b,v1.16b,v2.16b,#8
add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec08230 //sha512su0 v16.16b,v17.16b
ext v7.16b,v20.16b,v21.16b,#8
.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
.inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
add v4.2d,v1.2d,v3.2d // "D + T1"
.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
add v25.2d,v25.2d,v17.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v4.16b,v2.16b,#8
ext v6.16b,v0.16b,v4.16b,#8
add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08251 //sha512su0 v17.16b,v18.16b
ext v7.16b,v21.16b,v22.16b,#8
.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
.inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
add v1.2d,v0.2d,v2.2d // "D + T1"
.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
add v24.2d,v24.2d,v18.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v1.16b,v4.16b,#8
ext v6.16b,v3.16b,v1.16b,#8
add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec08272 //sha512su0 v18.16b,v19.16b
ext v7.16b,v22.16b,v23.16b,#8
.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
.inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
add v0.2d,v3.2d,v4.2d // "D + T1"
.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
add v25.2d,v25.2d,v19.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v0.16b,v1.16b,#8
ext v6.16b,v2.16b,v0.16b,#8
add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08293 //sha512su0 v19.16b,v20.16b
ext v7.16b,v23.16b,v16.16b,#8
.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
.inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
add v3.2d,v2.2d,v1.2d // "D + T1"
.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
add v24.2d,v24.2d,v20.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v3.16b,v0.16b,#8
ext v6.16b,v4.16b,v3.16b,#8
add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec082b4 //sha512su0 v20.16b,v21.16b
ext v7.16b,v16.16b,v17.16b,#8
.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
.inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
add v2.2d,v4.2d,v0.2d // "D + T1"
.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
add v25.2d,v25.2d,v21.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v2.16b,v3.16b,#8
ext v6.16b,v1.16b,v2.16b,#8
add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec082d5 //sha512su0 v21.16b,v22.16b
ext v7.16b,v17.16b,v18.16b,#8
.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
.inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
add v4.2d,v1.2d,v3.2d // "D + T1"
.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
add v24.2d,v24.2d,v22.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v4.16b,v2.16b,#8
ext v6.16b,v0.16b,v4.16b,#8
add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec082f6 //sha512su0 v22.16b,v23.16b
ext v7.16b,v18.16b,v19.16b,#8
.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
.inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
add v1.2d,v0.2d,v2.2d // "D + T1"
.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
add v25.2d,v25.2d,v23.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v1.16b,v4.16b,#8
ext v6.16b,v3.16b,v1.16b,#8
add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08217 //sha512su0 v23.16b,v16.16b
ext v7.16b,v19.16b,v20.16b,#8
.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
.inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
add v0.2d,v3.2d,v4.2d // "D + T1"
.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
add v24.2d,v24.2d,v16.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v0.16b,v1.16b,#8
ext v6.16b,v2.16b,v0.16b,#8
add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec08230 //sha512su0 v16.16b,v17.16b
ext v7.16b,v20.16b,v21.16b,#8
.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
.inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
add v3.2d,v2.2d,v1.2d // "D + T1"
.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
add v25.2d,v25.2d,v17.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v3.16b,v0.16b,#8
ext v6.16b,v4.16b,v3.16b,#8
add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08251 //sha512su0 v17.16b,v18.16b
ext v7.16b,v21.16b,v22.16b,#8
.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
.inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
add v2.2d,v4.2d,v0.2d // "D + T1"
.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
add v24.2d,v24.2d,v18.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v2.16b,v3.16b,#8
ext v6.16b,v1.16b,v2.16b,#8
add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec08272 //sha512su0 v18.16b,v19.16b
ext v7.16b,v22.16b,v23.16b,#8
.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
.inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
add v4.2d,v1.2d,v3.2d // "D + T1"
.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
add v25.2d,v25.2d,v19.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v4.16b,v2.16b,#8
ext v6.16b,v0.16b,v4.16b,#8
add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08293 //sha512su0 v19.16b,v20.16b
ext v7.16b,v23.16b,v16.16b,#8
.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
.inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
add v1.2d,v0.2d,v2.2d // "D + T1"
.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
add v24.2d,v24.2d,v20.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v1.16b,v4.16b,#8
ext v6.16b,v3.16b,v1.16b,#8
add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec082b4 //sha512su0 v20.16b,v21.16b
ext v7.16b,v16.16b,v17.16b,#8
.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
.inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
add v0.2d,v3.2d,v4.2d // "D + T1"
.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
add v25.2d,v25.2d,v21.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v0.16b,v1.16b,#8
ext v6.16b,v2.16b,v0.16b,#8
add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec082d5 //sha512su0 v21.16b,v22.16b
ext v7.16b,v17.16b,v18.16b,#8
.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
.inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
add v3.2d,v2.2d,v1.2d // "D + T1"
.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
add v24.2d,v24.2d,v22.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v3.16b,v0.16b,#8
ext v6.16b,v4.16b,v3.16b,#8
add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec082f6 //sha512su0 v22.16b,v23.16b
ext v7.16b,v18.16b,v19.16b,#8
.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
.inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
add v2.2d,v4.2d,v0.2d // "D + T1"
.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
add v25.2d,v25.2d,v23.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v2.16b,v3.16b,#8
ext v6.16b,v1.16b,v2.16b,#8
add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08217 //sha512su0 v23.16b,v16.16b
ext v7.16b,v19.16b,v20.16b,#8
.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
.inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
add v4.2d,v1.2d,v3.2d // "D + T1"
.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
add v24.2d,v24.2d,v16.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v4.16b,v2.16b,#8
ext v6.16b,v0.16b,v4.16b,#8
add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec08230 //sha512su0 v16.16b,v17.16b
ext v7.16b,v20.16b,v21.16b,#8
.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
.inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
add v1.2d,v0.2d,v2.2d // "D + T1"
.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
add v25.2d,v25.2d,v17.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v1.16b,v4.16b,#8
ext v6.16b,v3.16b,v1.16b,#8
add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08251 //sha512su0 v17.16b,v18.16b
ext v7.16b,v21.16b,v22.16b,#8
.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
.inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
add v0.2d,v3.2d,v4.2d // "D + T1"
.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
add v24.2d,v24.2d,v18.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v0.16b,v1.16b,#8
ext v6.16b,v2.16b,v0.16b,#8
add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec08272 //sha512su0 v18.16b,v19.16b
ext v7.16b,v22.16b,v23.16b,#8
.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
.inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
add v3.2d,v2.2d,v1.2d // "D + T1"
.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
add v25.2d,v25.2d,v19.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v3.16b,v0.16b,#8
ext v6.16b,v4.16b,v3.16b,#8
add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08293 //sha512su0 v19.16b,v20.16b
ext v7.16b,v23.16b,v16.16b,#8
.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
.inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
add v2.2d,v4.2d,v0.2d // "D + T1"
.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
add v24.2d,v24.2d,v20.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v2.16b,v3.16b,#8
ext v6.16b,v1.16b,v2.16b,#8
add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec082b4 //sha512su0 v20.16b,v21.16b
ext v7.16b,v16.16b,v17.16b,#8
.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
.inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
add v4.2d,v1.2d,v3.2d // "D + T1"
.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
add v25.2d,v25.2d,v21.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v4.16b,v2.16b,#8
ext v6.16b,v0.16b,v4.16b,#8
add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec082d5 //sha512su0 v21.16b,v22.16b
ext v7.16b,v17.16b,v18.16b,#8
.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
.inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
add v1.2d,v0.2d,v2.2d // "D + T1"
.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
add v24.2d,v24.2d,v22.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v1.16b,v4.16b,#8
ext v6.16b,v3.16b,v1.16b,#8
add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec082f6 //sha512su0 v22.16b,v23.16b
ext v7.16b,v18.16b,v19.16b,#8
.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
.inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
add v0.2d,v3.2d,v4.2d // "D + T1"
.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
add v25.2d,v25.2d,v23.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v0.16b,v1.16b,#8
ext v6.16b,v2.16b,v0.16b,#8
add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08217 //sha512su0 v23.16b,v16.16b
ext v7.16b,v19.16b,v20.16b,#8
.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
.inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
add v3.2d,v2.2d,v1.2d // "D + T1"
.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
add v24.2d,v24.2d,v16.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v3.16b,v0.16b,#8
ext v6.16b,v4.16b,v3.16b,#8
add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec08230 //sha512su0 v16.16b,v17.16b
ext v7.16b,v20.16b,v21.16b,#8
.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
.inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
add v2.2d,v4.2d,v0.2d // "D + T1"
.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
add v25.2d,v25.2d,v17.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v2.16b,v3.16b,#8
ext v6.16b,v1.16b,v2.16b,#8
add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08251 //sha512su0 v17.16b,v18.16b
ext v7.16b,v21.16b,v22.16b,#8
.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
.inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
add v4.2d,v1.2d,v3.2d // "D + T1"
.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
add v24.2d,v24.2d,v18.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v4.16b,v2.16b,#8
ext v6.16b,v0.16b,v4.16b,#8
add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec08272 //sha512su0 v18.16b,v19.16b
ext v7.16b,v22.16b,v23.16b,#8
.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
.inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
add v1.2d,v0.2d,v2.2d // "D + T1"
.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
add v25.2d,v25.2d,v19.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v1.16b,v4.16b,#8
ext v6.16b,v3.16b,v1.16b,#8
add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08293 //sha512su0 v19.16b,v20.16b
ext v7.16b,v23.16b,v16.16b,#8
.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
.inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
add v0.2d,v3.2d,v4.2d // "D + T1"
.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
add v24.2d,v24.2d,v20.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v0.16b,v1.16b,#8
ext v6.16b,v2.16b,v0.16b,#8
add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec082b4 //sha512su0 v20.16b,v21.16b
ext v7.16b,v16.16b,v17.16b,#8
.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
.inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
add v3.2d,v2.2d,v1.2d // "D + T1"
.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
add v25.2d,v25.2d,v21.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v3.16b,v0.16b,#8
ext v6.16b,v4.16b,v3.16b,#8
add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec082d5 //sha512su0 v21.16b,v22.16b
ext v7.16b,v17.16b,v18.16b,#8
.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
.inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
add v2.2d,v4.2d,v0.2d // "D + T1"
.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
add v24.2d,v24.2d,v22.2d
ld1 {v25.2d},[x3],#16
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v2.16b,v3.16b,#8
ext v6.16b,v1.16b,v2.16b,#8
add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xcec082f6 //sha512su0 v22.16b,v23.16b
ext v7.16b,v18.16b,v19.16b,#8
.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
.inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
add v4.2d,v1.2d,v3.2d // "D + T1"
.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
add v25.2d,v25.2d,v23.2d
ld1 {v24.2d},[x3],#16
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v4.16b,v2.16b,#8
ext v6.16b,v0.16b,v4.16b,#8
add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xcec08217 //sha512su0 v23.16b,v16.16b
ext v7.16b,v19.16b,v20.16b,#8
.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
.inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
add v1.2d,v0.2d,v2.2d // "D + T1"
.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
ld1 {v25.2d},[x3],#16
add v24.2d,v24.2d,v16.2d
ld1 {v16.16b},[x1],#16 // load next input
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v1.16b,v4.16b,#8
ext v6.16b,v3.16b,v1.16b,#8
add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
rev64 v16.16b,v16.16b
add v0.2d,v3.2d,v4.2d // "D + T1"
.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
ld1 {v24.2d},[x3],#16
add v25.2d,v25.2d,v17.2d
ld1 {v17.16b},[x1],#16 // load next input
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v0.16b,v1.16b,#8
ext v6.16b,v2.16b,v0.16b,#8
add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
rev64 v17.16b,v17.16b
add v3.2d,v2.2d,v1.2d // "D + T1"
.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
ld1 {v25.2d},[x3],#16
add v24.2d,v24.2d,v18.2d
ld1 {v18.16b},[x1],#16 // load next input
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v3.16b,v0.16b,#8
ext v6.16b,v4.16b,v3.16b,#8
add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
rev64 v18.16b,v18.16b
add v2.2d,v4.2d,v0.2d // "D + T1"
.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
ld1 {v24.2d},[x3],#16
add v25.2d,v25.2d,v19.2d
ld1 {v19.16b},[x1],#16 // load next input
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v2.16b,v3.16b,#8
ext v6.16b,v1.16b,v2.16b,#8
add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
rev64 v19.16b,v19.16b
add v4.2d,v1.2d,v3.2d // "D + T1"
.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
ld1 {v25.2d},[x3],#16
add v24.2d,v24.2d,v20.2d
ld1 {v20.16b},[x1],#16 // load next input
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v4.16b,v2.16b,#8
ext v6.16b,v0.16b,v4.16b,#8
add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
rev64 v20.16b,v20.16b
add v1.2d,v0.2d,v2.2d // "D + T1"
.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
ld1 {v24.2d},[x3],#16
add v25.2d,v25.2d,v21.2d
ld1 {v21.16b},[x1],#16 // load next input
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v1.16b,v4.16b,#8
ext v6.16b,v3.16b,v1.16b,#8
add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
rev64 v21.16b,v21.16b
add v0.2d,v3.2d,v4.2d // "D + T1"
.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
ld1 {v25.2d},[x3],#16
add v24.2d,v24.2d,v22.2d
ld1 {v22.16b},[x1],#16 // load next input
ext v24.16b,v24.16b,v24.16b,#8
ext v5.16b,v0.16b,v1.16b,#8
ext v6.16b,v2.16b,v0.16b,#8
add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
rev64 v22.16b,v22.16b
add v3.2d,v2.2d,v1.2d // "D + T1"
.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
sub x3,x3,#80*8 // rewind
add v25.2d,v25.2d,v23.2d
ld1 {v23.16b},[x1],#16 // load next input
ext v25.16b,v25.16b,v25.16b,#8
ext v5.16b,v3.16b,v0.16b,#8
ext v6.16b,v4.16b,v3.16b,#8
add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
rev64 v23.16b,v23.16b
add v2.2d,v4.2d,v0.2d // "D + T1"
.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
add v0.2d,v0.2d,v26.2d // accumulate
add v1.2d,v1.2d,v27.2d
add v2.2d,v2.2d,v28.2d
add v3.2d,v3.2d,v29.2d
cbnz x2,.Loop_hw
st1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // store context
ldr x29,[sp],#16
ret
.size sha512_block_armv8,.-sha512_block_armv8
#endif
#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__ELF__)
#if defined(__ELF__)
// See https://www.airs.com/blog/archives/518.
.section .note.GNU-stack,"",%progbits
#endif
#endif // !OPENSSL_NO_ASM
.section .note.GNU-stack,"",%progbits

View File

@@ -8,8 +8,7 @@
#define OPENSSL_NO_ASM
#endif
#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) && defined(__ELF__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
@@ -1230,6 +1229,8 @@ vpaes_ctr32_encrypt_blocks:
AARCH64_VALIDATE_LINK_REGISTER
ret
.size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks
#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__ELF__)
#if defined(__ELF__)
// See https://www.airs.com/blog/archives/518.
.section .note.GNU-stack,"",%progbits
#endif
#endif // !OPENSSL_NO_ASM
.section .note.GNU-stack,"",%progbits