Source release 17.1.0

2022-07-07 17:14:31 -07:00
parent 8c17574083
commit 694cf6fb25
2233 changed files with 272026 additions and 223371 deletions
--- a/third_party/boringssl/kit/linux-aarch64/crypto/fipsmodule/aesv8-armx64.S
+++ b/third_party/boringssl/kit/linux-aarch64/crypto/fipsmodule/aesv8-armx64.S
@@ -33,6 +33,8 @@
 .align	5
 aes_hw_set_encrypt_key:
 .Lenc_key:
+	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+	AARCH64_VALID_CALL_TARGET
 	stp	x29,x30,[sp,#-16]!
 	add	x29,sp,#0
 	mov	x3,#-1
@@ -201,6 +203,7 @@ aes_hw_set_encrypt_key:
 .type	aes_hw_set_decrypt_key,%function
 .align	5
 aes_hw_set_decrypt_key:
+	AARCH64_SIGN_LINK_REGISTER
 	stp	x29,x30,[sp,#-16]!
 	add	x29,sp,#0
 	bl	.Lenc_key
@@ -234,6 +237,7 @@ aes_hw_set_decrypt_key:
 	eor	x0,x0,x0		// return value
 .Ldec_key_abort:
 	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
 	ret
 .size	aes_hw_set_decrypt_key,.-aes_hw_set_decrypt_key
 .globl	aes_hw_encrypt
@@ -241,6 +245,7 @@ aes_hw_set_decrypt_key:
 .type	aes_hw_encrypt,%function
 .align	5
 aes_hw_encrypt:
+	AARCH64_VALID_CALL_TARGET
 	ldr	w3,[x2,#240]
 	ld1	{v0.4s},[x2],#16
 	ld1	{v2.16b},[x0]
@@ -271,6 +276,7 @@ aes_hw_encrypt:
 .type	aes_hw_decrypt,%function
 .align	5
 aes_hw_decrypt:
+	AARCH64_VALID_CALL_TARGET
 	ldr	w3,[x2,#240]
 	ld1	{v0.4s},[x2],#16
 	ld1	{v2.16b},[x0]
@@ -301,6 +307,8 @@ aes_hw_decrypt:
 .type	aes_hw_cbc_encrypt,%function
 .align	5
 aes_hw_cbc_encrypt:
+	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+	AARCH64_VALID_CALL_TARGET
 	stp	x29,x30,[sp,#-16]!
 	add	x29,sp,#0
 	subs	x2,x2,#16
@@ -592,6 +600,8 @@ aes_hw_cbc_encrypt:
 .type	aes_hw_ctr32_encrypt_blocks,%function
 .align	5
 aes_hw_ctr32_encrypt_blocks:
+	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+	AARCH64_VALID_CALL_TARGET
 	stp	x29,x30,[sp,#-16]!
 	add	x29,sp,#0
 	ldr	w5,[x3,#240]
@@ -611,20 +621,34 @@ aes_hw_ctr32_encrypt_blocks:
 	add	x7,x3,#32
 	mov	w6,w5
 	csel	x12,xzr,x12,lo
+
+	// ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
+	// affected by silicon errata #1742098 [0] and #1655431 [1],
+	// respectively, where the second instruction of an aese/aesmc
+	// instruction pair may execute twice if an interrupt is taken right
+	// after the first instruction consumes an input register of which a
+	// single 32-bit lane has been updated the last time it was modified.
+	//
+	// This function uses a counter in one 32-bit lane. The vmov lines
+	// could write to v1.16b and v18.16b directly, but that trips this bugs.
+	// We write to v6.16b and copy to the final register as a workaround.
+	//
+	// [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
+	// [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
 #ifndef __ARMEB__
 	rev	w8, w8
 #endif
-	orr	v1.16b,v0.16b,v0.16b
 	add	w10, w8, #1
-	orr	v18.16b,v0.16b,v0.16b
-	add	w8, w8, #2
 	orr	v6.16b,v0.16b,v0.16b
 	rev	w10, w10
-	mov	v1.s[3],w10
+	mov	v6.s[3],w10
+	add	w8, w8, #2
+	orr	v1.16b,v6.16b,v6.16b
 	b.ls	.Lctr32_tail
 	rev	w12, w8
+	mov	v6.s[3],w12
 	sub	x2,x2,#3		// bias
-	mov	v18.s[3],w12
+	orr	v18.16b,v6.16b,v6.16b
 	b	.Loop3x_ctr32

 .align	4
@@ -651,11 +675,11 @@ aes_hw_ctr32_encrypt_blocks:
 	aese	v1.16b,v16.16b
 	aesmc	v5.16b,v1.16b
 	ld1	{v2.16b},[x0],#16
-	orr	v0.16b,v6.16b,v6.16b
+	add	w9,w8,#1
 	aese	v18.16b,v16.16b
 	aesmc	v18.16b,v18.16b
 	ld1	{v3.16b},[x0],#16
-	orr	v1.16b,v6.16b,v6.16b
+	rev	w9,w9
 	aese	v4.16b,v17.16b
 	aesmc	v4.16b,v4.16b
 	aese	v5.16b,v17.16b
@@ -664,8 +688,6 @@ aes_hw_ctr32_encrypt_blocks:
 	mov	x7,x3
 	aese	v18.16b,v17.16b
 	aesmc	v17.16b,v18.16b
-	orr	v18.16b,v6.16b,v6.16b
-	add	w9,w8,#1
 	aese	v4.16b,v20.16b
 	aesmc	v4.16b,v4.16b
 	aese	v5.16b,v20.16b
@@ -680,21 +702,26 @@ aes_hw_ctr32_encrypt_blocks:
 	aesmc	v4.16b,v4.16b
 	aese	v5.16b,v21.16b
 	aesmc	v5.16b,v5.16b
+	 // Note the logic to update v0.16b, v1.16b, and v1.16b is written to work
+	 // around a bug in ARM Cortex-A57 and Cortex-A72 cores running in
+	 // 32-bit mode. See the comment above.
 	eor	v19.16b,v19.16b,v7.16b
-	rev	w9,w9
+	mov	v6.s[3], w9
 	aese	v17.16b,v21.16b
 	aesmc	v17.16b,v17.16b
-	mov	v0.s[3], w9
+	orr	v0.16b,v6.16b,v6.16b
 	rev	w10,w10
 	aese	v4.16b,v22.16b
 	aesmc	v4.16b,v4.16b
+	mov	v6.s[3], w10
+	rev	w12,w8
 	aese	v5.16b,v22.16b
 	aesmc	v5.16b,v5.16b
-	mov	v1.s[3], w10
-	rev	w12,w8
+	orr	v1.16b,v6.16b,v6.16b
+	mov	v6.s[3], w12
 	aese	v17.16b,v22.16b
 	aesmc	v17.16b,v17.16b
-	mov	v18.s[3], w12
+	orr	v18.16b,v6.16b,v6.16b
 	subs	x2,x2,#3
 	aese	v4.16b,v23.16b
 	aese	v5.16b,v23.16b
--- a/third_party/boringssl/kit/linux-aarch64/crypto/fipsmodule/armv8-mont.S
+++ b/third_party/boringssl/kit/linux-aarch64/crypto/fipsmodule/armv8-mont.S
@@ -13,6 +13,8 @@
 #if defined(BORINGSSL_PREFIX)
 #include <boringssl_prefix_symbols_asm.h>
 #endif
+#include <openssl/arm_arch.h>
+
 .text

 .globl	bn_mul_mont
@@ -20,6 +22,7 @@
 .type	bn_mul_mont,%function
 .align	5
 bn_mul_mont:
+	AARCH64_SIGN_LINK_REGISTER
 	tst	x5,#7
 	b.eq	__bn_sqr8x_mont
 	tst	x5,#3
@@ -217,11 +220,14 @@ bn_mul_mont:
 	mov	x0,#1
 	ldp	x23,x24,[x29,#48]
 	ldr	x29,[sp],#64
+	AARCH64_VALIDATE_LINK_REGISTER
 	ret
 .size	bn_mul_mont,.-bn_mul_mont
 .type	__bn_sqr8x_mont,%function
 .align	5
 __bn_sqr8x_mont:
+	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
+	// only from bn_mul_mont which has already signed the return address.
 	cmp	x1,x2
 	b.ne	__bn_mul4x_mont
 .Lsqr8x_mont:
@@ -975,11 +981,16 @@ __bn_sqr8x_mont:
 	ldp	x25,x26,[x29,#64]
 	ldp	x27,x28,[x29,#80]
 	ldr	x29,[sp],#128
+	// x30 is popped earlier
+	AARCH64_VALIDATE_LINK_REGISTER
 	ret
 .size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
 .type	__bn_mul4x_mont,%function
 .align	5
 __bn_mul4x_mont:
+	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
+	// only from bn_mul_mont or __bn_mul8x_mont which have already signed the
+	// return address.
 	stp	x29,x30,[sp,#-128]!
 	add	x29,sp,#0
 	stp	x19,x20,[sp,#16]
@@ -1413,6 +1424,8 @@ __bn_mul4x_mont:
 	ldp	x25,x26,[x29,#64]
 	ldp	x27,x28,[x29,#80]
 	ldr	x29,[sp],#128
+	// x30 is popped earlier
+	AARCH64_VALIDATE_LINK_REGISTER
 	ret
 .size	__bn_mul4x_mont,.-__bn_mul4x_mont
 .byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
--- a/third_party/boringssl/kit/linux-aarch64/crypto/fipsmodule/ghash-neon-armv8.S
+++ b/third_party/boringssl/kit/linux-aarch64/crypto/fipsmodule/ghash-neon-armv8.S
@@ -13,6 +13,8 @@
 #if defined(BORINGSSL_PREFIX)
 #include <boringssl_prefix_symbols_asm.h>
 #endif
+#include <openssl/arm_arch.h>
+
 .text

 .globl	gcm_init_neon
@@ -20,6 +22,7 @@
 .type	gcm_init_neon,%function
 .align	4
 gcm_init_neon:
+	AARCH64_VALID_CALL_TARGET
 	// This function is adapted from gcm_init_v8. xC2 is t3.
 	ld1	{v17.2d}, [x1]			// load H
 	movi	v19.16b, #0xe1
@@ -45,6 +48,7 @@ gcm_init_neon:
 .type	gcm_gmult_neon,%function
 .align	4
 gcm_gmult_neon:
+	AARCH64_VALID_CALL_TARGET
 	ld1	{v3.16b}, [x0]		// load Xi
 	ld1	{v5.1d}, [x1], #8		// load twisted H
 	ld1	{v6.1d}, [x1]
@@ -64,6 +68,7 @@ gcm_gmult_neon:
 .type	gcm_ghash_neon,%function
 .align	4
 gcm_ghash_neon:
+	AARCH64_VALID_CALL_TARGET
 	ld1	{v0.16b}, [x0]		// load Xi
 	ld1	{v5.1d}, [x1], #8		// load twisted H
 	ld1	{v6.1d}, [x1]
--- a/third_party/boringssl/kit/linux-aarch64/crypto/fipsmodule/ghashv8-armx64.S
+++ b/third_party/boringssl/kit/linux-aarch64/crypto/fipsmodule/ghashv8-armx64.S
@@ -15,6 +15,7 @@
 #endif
 #include <openssl/arm_arch.h>

+#if __ARM_MAX_ARCH__>=7
 .text
 .arch	armv8-a+crypto
 .globl	gcm_init_v8
@@ -22,6 +23,7 @@
 .type	gcm_init_v8,%function
 .align	4
 gcm_init_v8:
+	AARCH64_VALID_CALL_TARGET
 	ld1	{v17.2d},[x1]		//load input H
 	movi	v19.16b,#0xe1
 	shl	v19.2d,v19.2d,#57		//0xc2.0
@@ -64,8 +66,48 @@ gcm_init_v8:
 	ext	v17.16b,v22.16b,v22.16b,#8		//Karatsuba pre-processing
 	eor	v17.16b,v17.16b,v22.16b
 	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
-	st1	{v21.2d,v22.2d},[x0]		//store Htable[1..2]
+	st1	{v21.2d,v22.2d},[x0],#32	//store Htable[1..2]
+	//calculate H^3 and H^4
+	pmull	v0.1q,v20.1d, v22.1d
+	pmull	v5.1q,v22.1d,v22.1d
+	pmull2	v2.1q,v20.2d, v22.2d
+	pmull2	v7.1q,v22.2d,v22.2d
+	pmull	v1.1q,v16.1d,v17.1d
+	pmull	v6.1q,v17.1d,v17.1d

+	ext	v16.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	ext	v17.16b,v5.16b,v7.16b,#8
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v16.16b
+	eor	v4.16b,v5.16b,v7.16b
+	eor	v6.16b,v6.16b,v17.16b
+	eor	v1.16b,v1.16b,v18.16b
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase
+	eor	v6.16b,v6.16b,v4.16b
+	pmull	v4.1q,v5.1d,v19.1d
+
+	ins	v2.d[0],v1.d[1]
+	ins	v7.d[0],v6.d[1]
+	ins	v1.d[1],v0.d[0]
+	ins	v6.d[1],v5.d[0]
+	eor	v0.16b,v1.16b,v18.16b
+	eor	v5.16b,v6.16b,v4.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
+	ext	v4.16b,v5.16b,v5.16b,#8
+	pmull	v0.1q,v0.1d,v19.1d
+	pmull	v5.1q,v5.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v4.16b,v4.16b,v7.16b
+	eor	v20.16b, v0.16b,v18.16b		//H^3
+	eor	v22.16b,v5.16b,v4.16b		//H^4
+
+	ext	v16.16b,v20.16b, v20.16b,#8		//Karatsuba pre-processing
+	ext	v17.16b,v22.16b,v22.16b,#8
+	eor	v16.16b,v16.16b,v20.16b
+	eor	v17.16b,v17.16b,v22.16b
+	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
+	st1	{v20.2d,v21.2d,v22.2d},[x0]		//store Htable[3..5]
 	ret
 .size	gcm_init_v8,.-gcm_init_v8
 .globl	gcm_gmult_v8
@@ -73,6 +115,7 @@ gcm_init_v8:
 .type	gcm_gmult_v8,%function
 .align	4
 gcm_gmult_v8:
+	AARCH64_VALID_CALL_TARGET
 	ld1	{v17.2d},[x0]		//load Xi
 	movi	v19.16b,#0xe1
 	ld1	{v20.2d,v21.2d},[x1]	//load twisted H, ...
@@ -115,6 +158,9 @@ gcm_gmult_v8:
 .type	gcm_ghash_v8,%function
 .align	4
 gcm_ghash_v8:
+	AARCH64_VALID_CALL_TARGET
+	cmp	x3,#64
+	b.hs	.Lgcm_ghash_v8_4x
 	ld1	{v0.2d},[x0]		//load [rotated] Xi
 						//"[rotated]" means that
 						//loaded value would have
@@ -241,9 +287,290 @@ gcm_ghash_v8:

 	ret
 .size	gcm_ghash_v8,.-gcm_ghash_v8
+.type	gcm_ghash_v8_4x,%function
+.align	4
+gcm_ghash_v8_4x:
+.Lgcm_ghash_v8_4x:
+	ld1	{v0.2d},[x0]		//load [rotated] Xi
+	ld1	{v20.2d,v21.2d,v22.2d},[x1],#48	//load twisted H, ..., H^2
+	movi	v19.16b,#0xe1
+	ld1	{v26.2d,v27.2d,v28.2d},[x1]	//load twisted H^3, ..., H^4
+	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
+
+	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
+#ifndef __ARMEB__
+	rev64	v0.16b,v0.16b
+	rev64	v5.16b,v5.16b
+	rev64	v6.16b,v6.16b
+	rev64	v7.16b,v7.16b
+	rev64	v4.16b,v4.16b
+#endif
+	ext	v25.16b,v7.16b,v7.16b,#8
+	ext	v24.16b,v6.16b,v6.16b,#8
+	ext	v23.16b,v5.16b,v5.16b,#8
+
+	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
+	eor	v7.16b,v7.16b,v25.16b
+	pmull2	v31.1q,v20.2d,v25.2d
+	pmull	v30.1q,v21.1d,v7.1d
+
+	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
+	eor	v6.16b,v6.16b,v24.16b
+	pmull2	v24.1q,v22.2d,v24.2d
+	pmull2	v6.1q,v21.2d,v6.2d
+
+	eor	v29.16b,v29.16b,v16.16b
+	eor	v31.16b,v31.16b,v24.16b
+	eor	v30.16b,v30.16b,v6.16b
+
+	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
+	eor	v5.16b,v5.16b,v23.16b
+	pmull2	v23.1q,v26.2d,v23.2d
+	pmull	v5.1q,v27.1d,v5.1d
+
+	eor	v29.16b,v29.16b,v7.16b
+	eor	v31.16b,v31.16b,v23.16b
+	eor	v30.16b,v30.16b,v5.16b
+
+	subs	x3,x3,#128
+	b.lo	.Ltail4x
+
+	b	.Loop4x
+
+.align	4
+.Loop4x:
+	eor	v16.16b,v4.16b,v0.16b
+	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
+	ext	v3.16b,v16.16b,v16.16b,#8
+#ifndef __ARMEB__
+	rev64	v5.16b,v5.16b
+	rev64	v6.16b,v6.16b
+	rev64	v7.16b,v7.16b
+	rev64	v4.16b,v4.16b
+#endif
+
+	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
+	eor	v16.16b,v16.16b,v3.16b
+	pmull2	v2.1q,v28.2d,v3.2d
+	ext	v25.16b,v7.16b,v7.16b,#8
+	pmull2	v1.1q,v27.2d,v16.2d
+
+	eor	v0.16b,v0.16b,v29.16b
+	eor	v2.16b,v2.16b,v31.16b
+	ext	v24.16b,v6.16b,v6.16b,#8
+	eor	v1.16b,v1.16b,v30.16b
+	ext	v23.16b,v5.16b,v5.16b,#8
+
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
+	eor	v7.16b,v7.16b,v25.16b
+	eor	v1.16b,v1.16b,v17.16b
+	pmull2	v31.1q,v20.2d,v25.2d
+	eor	v1.16b,v1.16b,v18.16b
+	pmull	v30.1q,v21.1d,v7.1d
+
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
+	eor	v6.16b,v6.16b,v24.16b
+	pmull2	v24.1q,v22.2d,v24.2d
+	eor	v0.16b,v1.16b,v18.16b
+	pmull2	v6.1q,v21.2d,v6.2d
+
+	eor	v29.16b,v29.16b,v16.16b
+	eor	v31.16b,v31.16b,v24.16b
+	eor	v30.16b,v30.16b,v6.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
+	eor	v5.16b,v5.16b,v23.16b
+	eor	v18.16b,v18.16b,v2.16b
+	pmull2	v23.1q,v26.2d,v23.2d
+	pmull	v5.1q,v27.1d,v5.1d
+
+	eor	v0.16b,v0.16b,v18.16b
+	eor	v29.16b,v29.16b,v7.16b
+	eor	v31.16b,v31.16b,v23.16b
+	ext	v0.16b,v0.16b,v0.16b,#8
+	eor	v30.16b,v30.16b,v5.16b
+
+	subs	x3,x3,#64
+	b.hs	.Loop4x
+
+.Ltail4x:
+	eor	v16.16b,v4.16b,v0.16b
+	ext	v3.16b,v16.16b,v16.16b,#8
+
+	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
+	eor	v16.16b,v16.16b,v3.16b
+	pmull2	v2.1q,v28.2d,v3.2d
+	pmull2	v1.1q,v27.2d,v16.2d
+
+	eor	v0.16b,v0.16b,v29.16b
+	eor	v2.16b,v2.16b,v31.16b
+	eor	v1.16b,v1.16b,v30.16b
+
+	adds	x3,x3,#64
+	b.eq	.Ldone4x
+
+	cmp	x3,#32
+	b.lo	.Lone
+	b.eq	.Ltwo
+.Lthree:
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	ld1	{v4.2d,v5.2d,v6.2d},[x2]
+	eor	v1.16b,v1.16b,v18.16b
+#ifndef	__ARMEB__
+	rev64	v5.16b,v5.16b
+	rev64	v6.16b,v6.16b
+	rev64	v4.16b,v4.16b
+#endif
+
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	ext	v24.16b,v6.16b,v6.16b,#8
+	ext	v23.16b,v5.16b,v5.16b,#8
+	eor	v0.16b,v1.16b,v18.16b
+
+	pmull	v29.1q,v20.1d,v24.1d		//H·Ii+2
+	eor	v6.16b,v6.16b,v24.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	pmull2	v31.1q,v20.2d,v24.2d
+	pmull	v30.1q,v21.1d,v6.1d
+	eor	v0.16b,v0.16b,v18.16b
+	pmull	v7.1q,v22.1d,v23.1d		//H^2·Ii+1
+	eor	v5.16b,v5.16b,v23.16b
+	ext	v0.16b,v0.16b,v0.16b,#8
+
+	pmull2	v23.1q,v22.2d,v23.2d
+	eor	v16.16b,v4.16b,v0.16b
+	pmull2	v5.1q,v21.2d,v5.2d
+	ext	v3.16b,v16.16b,v16.16b,#8
+
+	eor	v29.16b,v29.16b,v7.16b
+	eor	v31.16b,v31.16b,v23.16b
+	eor	v30.16b,v30.16b,v5.16b
+
+	pmull	v0.1q,v26.1d,v3.1d		//H^3·(Xi+Ii)
+	eor	v16.16b,v16.16b,v3.16b
+	pmull2	v2.1q,v26.2d,v3.2d
+	pmull	v1.1q,v27.1d,v16.1d
+
+	eor	v0.16b,v0.16b,v29.16b
+	eor	v2.16b,v2.16b,v31.16b
+	eor	v1.16b,v1.16b,v30.16b
+	b	.Ldone4x
+
+.align	4
+.Ltwo:
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	ld1	{v4.2d,v5.2d},[x2]
+	eor	v1.16b,v1.16b,v18.16b
+#ifndef	__ARMEB__
+	rev64	v5.16b,v5.16b
+	rev64	v4.16b,v4.16b
+#endif
+
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	ext	v23.16b,v5.16b,v5.16b,#8
+	eor	v0.16b,v1.16b,v18.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v0.16b,v0.16b,v18.16b
+	ext	v0.16b,v0.16b,v0.16b,#8
+
+	pmull	v29.1q,v20.1d,v23.1d		//H·Ii+1
+	eor	v5.16b,v5.16b,v23.16b
+
+	eor	v16.16b,v4.16b,v0.16b
+	ext	v3.16b,v16.16b,v16.16b,#8
+
+	pmull2	v31.1q,v20.2d,v23.2d
+	pmull	v30.1q,v21.1d,v5.1d
+
+	pmull	v0.1q,v22.1d,v3.1d		//H^2·(Xi+Ii)
+	eor	v16.16b,v16.16b,v3.16b
+	pmull2	v2.1q,v22.2d,v3.2d
+	pmull2	v1.1q,v21.2d,v16.2d
+
+	eor	v0.16b,v0.16b,v29.16b
+	eor	v2.16b,v2.16b,v31.16b
+	eor	v1.16b,v1.16b,v30.16b
+	b	.Ldone4x
+
+.align	4
+.Lone:
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	ld1	{v4.2d},[x2]
+	eor	v1.16b,v1.16b,v18.16b
+#ifndef	__ARMEB__
+	rev64	v4.16b,v4.16b
+#endif
+
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	eor	v0.16b,v1.16b,v18.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v0.16b,v0.16b,v18.16b
+	ext	v0.16b,v0.16b,v0.16b,#8
+
+	eor	v16.16b,v4.16b,v0.16b
+	ext	v3.16b,v16.16b,v16.16b,#8
+
+	pmull	v0.1q,v20.1d,v3.1d
+	eor	v16.16b,v16.16b,v3.16b
+	pmull2	v2.1q,v20.2d,v3.2d
+	pmull	v1.1q,v21.1d,v16.1d
+
+.Ldone4x:
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	eor	v1.16b,v1.16b,v18.16b
+
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	eor	v0.16b,v1.16b,v18.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v0.16b,v0.16b,v18.16b
+	ext	v0.16b,v0.16b,v0.16b,#8
+
+#ifndef __ARMEB__
+	rev64	v0.16b,v0.16b
+#endif
+	st1	{v0.2d},[x0]		//write out Xi
+
+	ret
+.size	gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
 .byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
 .align	2
 .align	2
 #endif
+#endif
 #endif  // !OPENSSL_NO_ASM
 .section	.note.GNU-stack,"",%progbits
--- a/third_party/boringssl/kit/linux-aarch64/crypto/fipsmodule/sha1-armv8.S
+++ b/third_party/boringssl/kit/linux-aarch64/crypto/fipsmodule/sha1-armv8.S
@@ -18,11 +18,14 @@
 .text


+.hidden	OPENSSL_armcap_P
 .globl	sha1_block_data_order
 .hidden	sha1_block_data_order
 .type	sha1_block_data_order,%function
 .align	6
 sha1_block_data_order:
+	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+	AARCH64_VALID_CALL_TARGET
 #if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
 	adrp	x16,:pg_hi21_nc:OPENSSL_armcap_P
 #else
@@ -1090,6 +1093,8 @@ sha1_block_data_order:
 .type	sha1_block_armv8,%function
 .align	6
 sha1_block_armv8:
+	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+	AARCH64_VALID_CALL_TARGET
 .Lv8_entry:
 	stp	x29,x30,[sp,#-16]!
 	add	x29,sp,#0
@@ -1228,8 +1233,6 @@ sha1_block_armv8:
 .byte	83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
 .align	2
 .align	2
-.comm	OPENSSL_armcap_P,4,4
-.hidden	OPENSSL_armcap_P
 #endif
 #endif  // !OPENSSL_NO_ASM
 .section	.note.GNU-stack,"",%progbits
--- a/third_party/boringssl/kit/linux-aarch64/crypto/fipsmodule/sha256-armv8.S
+++ b/third_party/boringssl/kit/linux-aarch64/crypto/fipsmodule/sha256-armv8.S
@@ -59,11 +59,13 @@
 .text


+.hidden	OPENSSL_armcap_P
 .globl	sha256_block_data_order
 .hidden	sha256_block_data_order
 .type	sha256_block_data_order,%function
 .align	6
 sha256_block_data_order:
+	AARCH64_VALID_CALL_TARGET
 #ifndef	__KERNEL__
 #if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
 	adrp	x16,:pg_hi21_nc:OPENSSL_armcap_P
@@ -74,6 +76,7 @@ sha256_block_data_order:
 	tst	w16,#ARMV8_SHA256
 	b.ne	.Lv8_entry
 #endif
+	AARCH64_SIGN_LINK_REGISTER
 	stp	x29,x30,[sp,#-128]!
 	add	x29,sp,#0

@@ -1034,6 +1037,7 @@ sha256_block_data_order:
 	ldp	x25,x26,[x29,#64]
 	ldp	x27,x28,[x29,#80]
 	ldp	x29,x30,[sp],#128
+	AARCH64_VALIDATE_LINK_REGISTER
 	ret
 .size	sha256_block_data_order,.-sha256_block_data_order

@@ -1068,6 +1072,7 @@ sha256_block_data_order:
 .align	6
 sha256_block_armv8:
 .Lv8_entry:
+	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
 	stp	x29,x30,[sp,#-16]!
 	add	x29,sp,#0

@@ -1204,10 +1209,6 @@ sha256_block_armv8:
 	ret
 .size	sha256_block_armv8,.-sha256_block_armv8
 #endif
-#ifndef	__KERNEL__
-.comm	OPENSSL_armcap_P,4,4
-.hidden	OPENSSL_armcap_P
-#endif
 #endif
 #endif  // !OPENSSL_NO_ASM
 .section	.note.GNU-stack,"",%progbits
--- a/third_party/boringssl/kit/linux-aarch64/crypto/fipsmodule/sha512-armv8.S
+++ b/third_party/boringssl/kit/linux-aarch64/crypto/fipsmodule/sha512-armv8.S
@@ -59,11 +59,13 @@
 .text


+.hidden	OPENSSL_armcap_P
 .globl	sha512_block_data_order
 .hidden	sha512_block_data_order
 .type	sha512_block_data_order,%function
 .align	6
 sha512_block_data_order:
+	AARCH64_SIGN_LINK_REGISTER
 	stp	x29,x30,[sp,#-128]!
 	add	x29,sp,#0

@@ -1024,6 +1026,7 @@ sha512_block_data_order:
 	ldp	x25,x26,[x29,#64]
 	ldp	x27,x28,[x29,#80]
 	ldp	x29,x30,[sp],#128
+	AARCH64_VALIDATE_LINK_REGISTER
 	ret
 .size	sha512_block_data_order,.-sha512_block_data_order

@@ -1076,10 +1079,6 @@ sha512_block_data_order:
 .byte	83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
 .align	2
 .align	2
-#ifndef	__KERNEL__
-.comm	OPENSSL_armcap_P,4,4
-.hidden	OPENSSL_armcap_P
-#endif
 #endif
 #endif  // !OPENSSL_NO_ASM
 .section	.note.GNU-stack,"",%progbits
--- a/third_party/boringssl/kit/linux-aarch64/crypto/fipsmodule/vpaes-armv8.S
+++ b/third_party/boringssl/kit/linux-aarch64/crypto/fipsmodule/vpaes-armv8.S
@@ -13,6 +13,8 @@
 #if defined(BORINGSSL_PREFIX)
 #include <boringssl_prefix_symbols_asm.h>
 #endif
+#include <openssl/arm_arch.h>
+
 .section	.rodata

 .type	_vpaes_consts,%object
@@ -215,6 +217,7 @@ _vpaes_encrypt_core:
 .type	vpaes_encrypt,%function
 .align	4
 vpaes_encrypt:
+	AARCH64_SIGN_LINK_REGISTER
 	stp	x29,x30,[sp,#-16]!
 	add	x29,sp,#0

@@ -224,6 +227,7 @@ vpaes_encrypt:
 	st1	{v0.16b}, [x1]

 	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
 	ret
 .size	vpaes_encrypt,.-vpaes_encrypt

@@ -452,6 +456,7 @@ _vpaes_decrypt_core:
 .type	vpaes_decrypt,%function
 .align	4
 vpaes_decrypt:
+	AARCH64_SIGN_LINK_REGISTER
 	stp	x29,x30,[sp,#-16]!
 	add	x29,sp,#0

@@ -461,6 +466,7 @@ vpaes_decrypt:
 	st1	{v0.16b}, [x1]

 	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
 	ret
 .size	vpaes_decrypt,.-vpaes_decrypt

@@ -630,6 +636,7 @@ _vpaes_key_preheat:
 .type	_vpaes_schedule_core,%function
 .align	4
 _vpaes_schedule_core:
+	AARCH64_SIGN_LINK_REGISTER
 	stp	x29, x30, [sp,#-16]!
 	add	x29,sp,#0

@@ -799,6 +806,7 @@ _vpaes_schedule_core:
 	eor	v6.16b, v6.16b, v6.16b		// vpxor	%xmm6,	%xmm6,	%xmm6
 	eor	v7.16b, v7.16b, v7.16b		// vpxor	%xmm7,	%xmm7,	%xmm7
 	ldp	x29, x30, [sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
 	ret
 .size	_vpaes_schedule_core,.-_vpaes_schedule_core

@@ -1001,7 +1009,7 @@ _vpaes_schedule_mangle:

 .Lschedule_mangle_both:
 	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
-	add	x8, x8, #64-16			// add	$-16,	%r8
+	add	x8, x8, #48			// add	$-16,	%r8
 	and	x8, x8, #~(1<<6)		// and	$0x30,	%r8
 	st1	{v3.2d}, [x2]			// vmovdqu	%xmm3,	(%rdx)
 	ret
@@ -1012,6 +1020,7 @@ _vpaes_schedule_mangle:
 .type	vpaes_set_encrypt_key,%function
 .align	4
 vpaes_set_encrypt_key:
+	AARCH64_SIGN_LINK_REGISTER
 	stp	x29,x30,[sp,#-16]!
 	add	x29,sp,#0
 	stp	d8,d9,[sp,#-16]!	// ABI spec says so
@@ -1027,6 +1036,7 @@ vpaes_set_encrypt_key:

 	ldp	d8,d9,[sp],#16
 	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
 	ret
 .size	vpaes_set_encrypt_key,.-vpaes_set_encrypt_key

@@ -1035,6 +1045,7 @@ vpaes_set_encrypt_key:
 .type	vpaes_set_decrypt_key,%function
 .align	4
 vpaes_set_decrypt_key:
+	AARCH64_SIGN_LINK_REGISTER
 	stp	x29,x30,[sp,#-16]!
 	add	x29,sp,#0
 	stp	d8,d9,[sp,#-16]!	// ABI spec says so
@@ -1054,6 +1065,7 @@ vpaes_set_decrypt_key:

 	ldp	d8,d9,[sp],#16
 	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
 	ret
 .size	vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
 .globl	vpaes_cbc_encrypt
@@ -1061,6 +1073,7 @@ vpaes_set_decrypt_key:
 .type	vpaes_cbc_encrypt,%function
 .align	4
 vpaes_cbc_encrypt:
+	AARCH64_SIGN_LINK_REGISTER
 	cbz	x2, .Lcbc_abort
 	cmp	w5, #0			// check direction
 	b.eq	vpaes_cbc_decrypt
@@ -1088,12 +1101,15 @@ vpaes_cbc_encrypt:

 	ldp	x29,x30,[sp],#16
 .Lcbc_abort:
+	AARCH64_VALIDATE_LINK_REGISTER
 	ret
 .size	vpaes_cbc_encrypt,.-vpaes_cbc_encrypt

 .type	vpaes_cbc_decrypt,%function
 .align	4
 vpaes_cbc_decrypt:
+	// Not adding AARCH64_SIGN_LINK_REGISTER here because vpaes_cbc_decrypt is jumped to
+	// only from vpaes_cbc_encrypt which has already signed the return address.
 	stp	x29,x30,[sp,#-16]!
 	add	x29,sp,#0
 	stp	d8,d9,[sp,#-16]!	// ABI spec says so
@@ -1135,6 +1151,7 @@ vpaes_cbc_decrypt:
 	ldp	d10,d11,[sp],#16
 	ldp	d8,d9,[sp],#16
 	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
 	ret
 .size	vpaes_cbc_decrypt,.-vpaes_cbc_decrypt
 .globl	vpaes_ctr32_encrypt_blocks
@@ -1142,6 +1159,7 @@ vpaes_cbc_decrypt:
 .type	vpaes_ctr32_encrypt_blocks,%function
 .align	4
 vpaes_ctr32_encrypt_blocks:
+	AARCH64_SIGN_LINK_REGISTER
 	stp	x29,x30,[sp,#-16]!
 	add	x29,sp,#0
 	stp	d8,d9,[sp,#-16]!	// ABI spec says so
@@ -1209,6 +1227,7 @@ vpaes_ctr32_encrypt_blocks:
 	ldp	d10,d11,[sp],#16
 	ldp	d8,d9,[sp],#16
 	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
 	ret
 .size	vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks
 #endif