/* armv8-chacha-asm
 *
 * Copyright (C) 2006-2026 wolfSSL Inc.
 *
 * This file is part of wolfSSL.
 *
 * wolfSSL is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * wolfSSL is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
 */

#include <wolfssl/wolfcrypt/libwolfssl_sources_asm.h>

/* Generated using (from wolfssl):
 *   cd ../scripts
 *   ruby ./chacha/chacha.rb arm64 \
 *       ../wolfssl/wolfcrypt/src/port/arm/armv8-chacha-asm.S
 */
#ifdef WOLFSSL_ARMASM
#ifdef __aarch64__
#ifndef WOLFSSL_ARMASM_INLINE
#ifdef HAVE_CHACHA
/* L_chacha20_arm64_ctr: four 32-bit words {0, 1, 2, 3}.
 * wc_chacha_crypt_bytes loads this table into v28 and adds it to the
 * register holding state word 12 replicated across four lanes, so that each
 * lane gets a different offset (0..3) on that word.
 */
#ifndef __APPLE__
	.text
	.section	.rodata
	.type	L_chacha20_arm64_ctr, %object
	.size	L_chacha20_arm64_ctr, 16
#else
	.section	__DATA,__data
#endif /* __APPLE__ */
	# Align the table to an 8-byte (2^3) boundary
#ifndef __APPLE__
	.align	3
#else
	.p2align	3
#endif /* __APPLE__ */
L_chacha20_arm64_ctr:
	.long	0x00000000,0x00000001,0x00000002,0x00000003
/* L_chacha20_arm64_rol8: 16 byte indices for the TBL instruction.
 * As bytes in little-endian order the table reads 3,0,1,2, 7,4,5,6,
 * 11,8,9,10, 15,12,13,14: within every group of four bytes the top byte moves
 * to the bottom, which rotates each 32-bit lane left by 8 bits.
 * wc_chacha_crypt_bytes loads it into v30 for the "d <<<= 8" steps.
 */
#ifndef __APPLE__
	.text
	.section	.rodata
	.type	L_chacha20_arm64_rol8, %object
	.size	L_chacha20_arm64_rol8, 16
#else
	.section	__DATA,__data
#endif /* __APPLE__ */
	# Align the table to an 8-byte (2^3) boundary
#ifndef __APPLE__
	.align	3
#else
	.p2align	3
#endif /* __APPLE__ */
L_chacha20_arm64_rol8:
	.long	0x02010003,0x06050407,0x0a09080b,0x0e0d0c0f
#ifndef WOLFSSL_ARMASM_NO_NEON
/* wc_chacha_crypt_bytes
 *
 * XORs x3 bytes read from x2 with ChaCha keystream and writes the result to
 * x1, using and updating the 16-word cipher state at x0.
 *
 * In:
 *   x0 = state block: 16 32-bit state words at offset 0. When the last
 *        keystream block is only partly consumed, a 32-bit count of unused
 *        keystream bytes is written at offset 64 and the whole 64-byte
 *        keystream block at offset 68 (0x44).
 *   x1 = output pointer (post-incremented as data is written)
 *   x2 = input pointer (post-incremented as data is read)
 *   x3 = number of bytes; compared as a signed 64-bit value
 *        NOTE(review): presumably callers pass a zero-extended length -
 *        confirm against the C prototype.
 *
 * Every keystream block is 10 iterations of an odd + even round pair over a
 * copy of the state, followed by adding the state back. Data is consumed in
 * 320-byte chunks while at least 320 bytes remain, then at most one 256-byte
 * chunk, at most one 128-byte chunk, then single 64-byte blocks. State word
 * 12 (lane 0 of v19) is advanced by one for every block generated and the
 * state is stored back to x0 before returning.
 *
 * Register roles held for the whole function:
 *   v16-v19 = the 16 state words
 *   v28     = {0,1,2,3} lane offsets for word 12
 *   v29     = step added to v19 after a multi-block chunk (lane 0 only)
 *   v30     = TBL indices that rotate each 32-bit lane left by 8
 *   v31     = {1,0,0,0}, single-block step for word 12
 *   x4      = x0 + 0x44, destination for an unused keystream block
 *
 * x17, x19-x26 and d8-d15 are saved in the frame and restored on exit.
 */
#ifndef __APPLE__
.text
.globl	wc_chacha_crypt_bytes
.type	wc_chacha_crypt_bytes,@function
.align	2
wc_chacha_crypt_bytes:
#else
.section	__TEXT,__text
.globl	_wc_chacha_crypt_bytes
.p2align	2
_wc_chacha_crypt_bytes:
#endif /* __APPLE__ */
	# 160-byte frame: x29/x30 at 0, x17 and x19-x26 at 24..95, d8-d15 at 96..159
	stp	x29, x30, [sp, #-160]!
	add	x29, sp, #0
	stp	x17, x19, [x29, #24]
	stp	x20, x21, [x29, #40]
	stp	x22, x23, [x29, #56]
	stp	x24, x25, [x29, #72]
	str	x26, [x29, #88]
	stp	d8, d9, [x29, #96]
	stp	d10, d11, [x29, #112]
	stp	d12, d13, [x29, #128]
	stp	d14, d15, [x29, #144]
	# x5 = address of the rotate-left-by-8 TBL index table
#ifndef __APPLE__
	adrp x5, L_chacha20_arm64_rol8
	add  x5, x5, :lo12:L_chacha20_arm64_rol8
#else
	adrp x5, L_chacha20_arm64_rol8@PAGE
	add  x5, x5, L_chacha20_arm64_rol8@PAGEOFF
#endif /* __APPLE__ */
	# x6 = address of the {0,1,2,3} lane offset table
#ifndef __APPLE__
	adrp x6, L_chacha20_arm64_ctr
	add  x6, x6, :lo12:L_chacha20_arm64_ctr
#else
	adrp x6, L_chacha20_arm64_ctr@PAGE
	add  x6, x6, L_chacha20_arm64_ctr@PAGEOFF
#endif /* __APPLE__ */
	# Build the constants: v29 = {5,0,0,0}, v31 = {1,0,0,0},
	# v30 = rol8 indices, v28 = {0,1,2,3}, x4 = x0 + 0x44
	eor	v29.16b, v29.16b, v29.16b
	mov	x26, #5
	eor	v31.16b, v31.16b, v31.16b
	mov	w7, #1
	ld1	{v30.16b}, [x5]
	ld1	{v28.4s}, [x6]
	add	x4, x0, #0x44
	mov	v29.s[0], w26
	mov	v31.s[0], w7
	# Load state to encrypt
	ld1	{v16.4s, v17.4s, v18.4s, v19.4s}, [x0]
	cmp	x3, #0x140
	blt	L_chacha_crypt_bytes_arm64_lt_320
	# w25 = 4: offset on word 12 for the block computed in general registers
	mov	w25, #4
	# 320-byte chunk: four blocks are computed in NEON (v0-v15 each hold one
	# state word, one block per lane) and a fifth block is computed in
	# w8-w17, w19-w24, with the two instruction streams interleaved.
L_chacha_crypt_bytes_arm64_loop_320:
	# Move state into regular register
	# Each x register receives a pair of words; the lsr instructions below
	# split the upper word of each pair into the next register.
	mov	x8, v16.d[0]
	mov	x10, v16.d[1]
	mov	x12, v17.d[0]
	mov	x14, v17.d[1]
	mov	x16, v18.d[0]
	mov	x19, v18.d[1]
	mov	x21, v19.d[0]
	mov	x23, v19.d[1]
	sub	x3, x3, #0x140
	# Move state into vector registers
	dup	v0.4s, v16.s[0]
	dup	v1.4s, v16.s[1]
	lsr	x9, x8, #32
	dup	v2.4s, v16.s[2]
	dup	v3.4s, v16.s[3]
	lsr	x11, x10, #32
	dup	v4.4s, v17.s[0]
	dup	v5.4s, v17.s[1]
	lsr	x13, x12, #32
	dup	v6.4s, v17.s[2]
	dup	v7.4s, v17.s[3]
	lsr	x15, x14, #32
	dup	v8.4s, v18.s[0]
	dup	v9.4s, v18.s[1]
	lsr	x17, x16, #32
	dup	v10.4s, v18.s[2]
	dup	v11.4s, v18.s[3]
	lsr	x20, x19, #32
	dup	v12.4s, v19.s[0]
	dup	v13.4s, v19.s[1]
	lsr	x22, x21, #32
	dup	v14.4s, v19.s[2]
	dup	v15.4s, v19.s[3]
	lsr	x24, x23, #32
	# Add to counter word
	# NEON lanes get offsets 0..3, the scalar block gets offset 4
	add	v12.4s, v12.4s, v28.4s
	add	w21, w21, w25
	# Set number of odd+even rounds to perform
	mov	x26, #10
L_chacha_crypt_bytes_arm64_round_start_320:
	# Flags set here are consumed by the bne at the bottom of the loop;
	# no instruction in between modifies them.
	subs	x26, x26, #1
	# Round odd
	# a += b; d ^= a; d <<<= 16;
	# Rotation by 16 is done with rev32 on halfwords (NEON) and ror 16 (scalar)
	add	v0.4s, v0.4s, v4.4s
	add	w8, w8, w12
	add	v1.4s, v1.4s, v5.4s
	add	w9, w9, w13
	add	v2.4s, v2.4s, v6.4s
	add	w10, w10, w14
	add	v3.4s, v3.4s, v7.4s
	add	w11, w11, w15
	eor	v12.16b, v12.16b, v0.16b
	eor	w21, w21, w8
	eor	v13.16b, v13.16b, v1.16b
	eor	w22, w22, w9
	eor	v14.16b, v14.16b, v2.16b
	eor	w23, w23, w10
	eor	v15.16b, v15.16b, v3.16b
	eor	w24, w24, w11
	rev32	v12.8h, v12.8h
	ror	w21, w21, #16
	rev32	v13.8h, v13.8h
	ror	w22, w22, #16
	rev32	v14.8h, v14.8h
	ror	w23, w23, #16
	rev32	v15.8h, v15.8h
	ror	w24, w24, #16
	# c += d; b ^= c; b <<<= 12;
	# NEON rotates with a shl/sri pair through v20-v23; scalar uses ror by 32 - 12
	add	v8.4s, v8.4s, v12.4s
	add	w16, w16, w21
	add	v9.4s, v9.4s, v13.4s
	add	w17, w17, w22
	add	v10.4s, v10.4s, v14.4s
	add	w19, w19, w23
	add	v11.4s, v11.4s, v15.4s
	add	w20, w20, w24
	eor	v20.16b, v4.16b, v8.16b
	eor	w12, w12, w16
	eor	v21.16b, v5.16b, v9.16b
	eor	w13, w13, w17
	eor	v22.16b, v6.16b, v10.16b
	eor	w14, w14, w19
	eor	v23.16b, v7.16b, v11.16b
	eor	w15, w15, w20
	shl	v4.4s, v20.4s, #12
	ror	w12, w12, #20
	shl	v5.4s, v21.4s, #12
	ror	w13, w13, #20
	shl	v6.4s, v22.4s, #12
	ror	w14, w14, #20
	shl	v7.4s, v23.4s, #12
	ror	w15, w15, #20
	sri	v4.4s, v20.4s, #20
	sri	v5.4s, v21.4s, #20
	sri	v6.4s, v22.4s, #20
	sri	v7.4s, v23.4s, #20
	# a += b; d ^= a; d <<<= 8;
	# NEON rotates by 8 with a tbl byte shuffle (indices in v30); scalar uses ror 24
	add	v0.4s, v0.4s, v4.4s
	add	w8, w8, w12
	add	v1.4s, v1.4s, v5.4s
	add	w9, w9, w13
	add	v2.4s, v2.4s, v6.4s
	add	w10, w10, w14
	add	v3.4s, v3.4s, v7.4s
	add	w11, w11, w15
	eor	v12.16b, v12.16b, v0.16b
	eor	w21, w21, w8
	eor	v13.16b, v13.16b, v1.16b
	eor	w22, w22, w9
	eor	v14.16b, v14.16b, v2.16b
	eor	w23, w23, w10
	eor	v15.16b, v15.16b, v3.16b
	eor	w24, w24, w11
	tbl	v12.16b, {v12.16b}, v30.16b
	ror	w21, w21, #24
	tbl	v13.16b, {v13.16b}, v30.16b
	ror	w22, w22, #24
	tbl	v14.16b, {v14.16b}, v30.16b
	ror	w23, w23, #24
	tbl	v15.16b, {v15.16b}, v30.16b
	ror	w24, w24, #24
	# c += d; b ^= c; b <<<= 7;
	add	v8.4s, v8.4s, v12.4s
	add	w16, w16, w21
	add	v9.4s, v9.4s, v13.4s
	add	w17, w17, w22
	add	v10.4s, v10.4s, v14.4s
	add	w19, w19, w23
	add	v11.4s, v11.4s, v15.4s
	add	w20, w20, w24
	eor	v20.16b, v4.16b, v8.16b
	eor	w12, w12, w16
	eor	v21.16b, v5.16b, v9.16b
	eor	w13, w13, w17
	eor	v22.16b, v6.16b, v10.16b
	eor	w14, w14, w19
	eor	v23.16b, v7.16b, v11.16b
	eor	w15, w15, w20
	shl	v4.4s, v20.4s, #7
	ror	w12, w12, #25
	shl	v5.4s, v21.4s, #7
	ror	w13, w13, #25
	shl	v6.4s, v22.4s, #7
	ror	w14, w14, #25
	shl	v7.4s, v23.4s, #7
	ror	w15, w15, #25
	sri	v4.4s, v20.4s, #25
	sri	v5.4s, v21.4s, #25
	sri	v6.4s, v22.4s, #25
	sri	v7.4s, v23.4s, #25
	# Round even
	# Same steps as the odd round, but word groups are taken diagonally:
	# (0,5,10,15), (1,6,11,12), (2,7,8,13), (3,4,9,14)
	# a += b; d ^= a; d <<<= 16;
	add	v0.4s, v0.4s, v5.4s
	add	w8, w8, w13
	add	v1.4s, v1.4s, v6.4s
	add	w9, w9, w14
	add	v2.4s, v2.4s, v7.4s
	add	w10, w10, w15
	add	v3.4s, v3.4s, v4.4s
	add	w11, w11, w12
	eor	v15.16b, v15.16b, v0.16b
	eor	w24, w24, w8
	eor	v12.16b, v12.16b, v1.16b
	eor	w21, w21, w9
	eor	v13.16b, v13.16b, v2.16b
	eor	w22, w22, w10
	eor	v14.16b, v14.16b, v3.16b
	eor	w23, w23, w11
	rev32	v15.8h, v15.8h
	ror	w24, w24, #16
	rev32	v12.8h, v12.8h
	ror	w21, w21, #16
	rev32	v13.8h, v13.8h
	ror	w22, w22, #16
	rev32	v14.8h, v14.8h
	ror	w23, w23, #16
	# c += d; b ^= c; b <<<= 12;
	add	v10.4s, v10.4s, v15.4s
	add	w19, w19, w24
	add	v11.4s, v11.4s, v12.4s
	add	w20, w20, w21
	add	v8.4s, v8.4s, v13.4s
	add	w16, w16, w22
	add	v9.4s, v9.4s, v14.4s
	add	w17, w17, w23
	eor	v20.16b, v5.16b, v10.16b
	eor	w13, w13, w19
	eor	v21.16b, v6.16b, v11.16b
	eor	w14, w14, w20
	eor	v22.16b, v7.16b, v8.16b
	eor	w15, w15, w16
	eor	v23.16b, v4.16b, v9.16b
	eor	w12, w12, w17
	shl	v5.4s, v20.4s, #12
	ror	w13, w13, #20
	shl	v6.4s, v21.4s, #12
	ror	w14, w14, #20
	shl	v7.4s, v22.4s, #12
	ror	w15, w15, #20
	shl	v4.4s, v23.4s, #12
	ror	w12, w12, #20
	sri	v5.4s, v20.4s, #20
	sri	v6.4s, v21.4s, #20
	sri	v7.4s, v22.4s, #20
	sri	v4.4s, v23.4s, #20
	# a += b; d ^= a; d <<<= 8;
	add	v0.4s, v0.4s, v5.4s
	add	w8, w8, w13
	add	v1.4s, v1.4s, v6.4s
	add	w9, w9, w14
	add	v2.4s, v2.4s, v7.4s
	add	w10, w10, w15
	add	v3.4s, v3.4s, v4.4s
	add	w11, w11, w12
	eor	v15.16b, v15.16b, v0.16b
	eor	w24, w24, w8
	eor	v12.16b, v12.16b, v1.16b
	eor	w21, w21, w9
	eor	v13.16b, v13.16b, v2.16b
	eor	w22, w22, w10
	eor	v14.16b, v14.16b, v3.16b
	eor	w23, w23, w11
	tbl	v15.16b, {v15.16b}, v30.16b
	ror	w24, w24, #24
	tbl	v12.16b, {v12.16b}, v30.16b
	ror	w21, w21, #24
	tbl	v13.16b, {v13.16b}, v30.16b
	ror	w22, w22, #24
	tbl	v14.16b, {v14.16b}, v30.16b
	ror	w23, w23, #24
	# c += d; b ^= c; b <<<= 7;
	add	v10.4s, v10.4s, v15.4s
	add	w19, w19, w24
	add	v11.4s, v11.4s, v12.4s
	add	w20, w20, w21
	add	v8.4s, v8.4s, v13.4s
	add	w16, w16, w22
	add	v9.4s, v9.4s, v14.4s
	add	w17, w17, w23
	eor	v20.16b, v5.16b, v10.16b
	eor	w13, w13, w19
	eor	v21.16b, v6.16b, v11.16b
	eor	w14, w14, w20
	eor	v22.16b, v7.16b, v8.16b
	eor	w15, w15, w16
	eor	v23.16b, v4.16b, v9.16b
	eor	w12, w12, w17
	shl	v5.4s, v20.4s, #7
	ror	w13, w13, #25
	shl	v6.4s, v21.4s, #7
	ror	w14, w14, #25
	shl	v7.4s, v22.4s, #7
	ror	w15, w15, #25
	shl	v4.4s, v23.4s, #7
	ror	w12, w12, #25
	sri	v5.4s, v20.4s, #25
	sri	v6.4s, v21.4s, #25
	sri	v7.4s, v22.4s, #25
	sri	v4.4s, v23.4s, #25
	bne	L_chacha_crypt_bytes_arm64_round_start_320
	# Add counter now rather than after transposed
	# The same offsets used before the rounds are re-applied so that adding
	# v19 later reproduces each block's initial word 12.
	add	v12.4s, v12.4s, v28.4s
	add	w21, w21, w25
	# Load message
	ld1	{v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #0x40
	# Transpose vectors
	# After the transpose, block n (n = 0..3) is held in v(n), v(n+4),
	# v(n+8), v(n+12). The interleaved orr instructions repack the scalar
	# block into word pairs.
	trn1	v20.4s, v0.4s, v1.4s
	trn1	v22.4s, v2.4s, v3.4s
	orr	x8, x8, x9, lsl 32
	trn2	v21.4s, v0.4s, v1.4s
	trn2	v23.4s, v2.4s, v3.4s
	trn1	v0.2d, v20.2d, v22.2d
	trn1	v1.2d, v21.2d, v23.2d
	orr	x10, x10, x11, lsl 32
	trn2	v2.2d, v20.2d, v22.2d
	trn2	v3.2d, v21.2d, v23.2d
	trn1	v20.4s, v4.4s, v5.4s
	trn1	v22.4s, v6.4s, v7.4s
	orr	x12, x12, x13, lsl 32
	trn2	v21.4s, v4.4s, v5.4s
	trn2	v23.4s, v6.4s, v7.4s
	trn1	v4.2d, v20.2d, v22.2d
	trn1	v5.2d, v21.2d, v23.2d
	orr	x14, x14, x15, lsl 32
	trn2	v6.2d, v20.2d, v22.2d
	trn2	v7.2d, v21.2d, v23.2d
	trn1	v20.4s, v8.4s, v9.4s
	trn1	v22.4s, v10.4s, v11.4s
	orr	x16, x16, x17, lsl 32
	trn2	v21.4s, v8.4s, v9.4s
	trn2	v23.4s, v10.4s, v11.4s
	trn1	v8.2d, v20.2d, v22.2d
	trn1	v9.2d, v21.2d, v23.2d
	orr	x19, x19, x20, lsl 32
	trn2	v10.2d, v20.2d, v22.2d
	trn2	v11.2d, v21.2d, v23.2d
	trn1	v20.4s, v12.4s, v13.4s
	trn1	v22.4s, v14.4s, v15.4s
	orr	x21, x21, x22, lsl 32
	trn2	v21.4s, v12.4s, v13.4s
	trn2	v23.4s, v14.4s, v15.4s
	trn1	v12.2d, v20.2d, v22.2d
	trn1	v13.2d, v21.2d, v23.2d
	orr	x23, x23, x24, lsl 32
	trn2	v14.2d, v20.2d, v22.2d
	trn2	v15.2d, v21.2d, v23.2d
	# Add back state, XOR in message and store (load next block)
	add	v20.4s, v0.4s, v16.4s
	add	v21.4s, v4.4s, v17.4s
	add	v22.4s, v8.4s, v18.4s
	add	v23.4s, v12.4s, v19.4s
	eor	v20.16b, v20.16b, v24.16b
	eor	v21.16b, v21.16b, v25.16b
	eor	v22.16b, v22.16b, v26.16b
	eor	v23.16b, v23.16b, v27.16b
	ld1	{v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #0x40
	st1	{v20.16b, v21.16b, v22.16b, v23.16b}, [x1], #0x40
	add	v20.4s, v1.4s, v16.4s
	add	v21.4s, v5.4s, v17.4s
	add	v22.4s, v9.4s, v18.4s
	add	v23.4s, v13.4s, v19.4s
	eor	v20.16b, v20.16b, v24.16b
	eor	v21.16b, v21.16b, v25.16b
	eor	v22.16b, v22.16b, v26.16b
	eor	v23.16b, v23.16b, v27.16b
	ld1	{v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #0x40
	st1	{v20.16b, v21.16b, v22.16b, v23.16b}, [x1], #0x40
	add	v20.4s, v2.4s, v16.4s
	add	v21.4s, v6.4s, v17.4s
	add	v22.4s, v10.4s, v18.4s
	add	v23.4s, v14.4s, v19.4s
	eor	v20.16b, v20.16b, v24.16b
	eor	v21.16b, v21.16b, v25.16b
	eor	v22.16b, v22.16b, v26.16b
	eor	v23.16b, v23.16b, v27.16b
	ld1	{v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #0x40
	st1	{v20.16b, v21.16b, v22.16b, v23.16b}, [x1], #0x40
	add	v20.4s, v3.4s, v16.4s
	add	v21.4s, v7.4s, v17.4s
	add	v22.4s, v11.4s, v18.4s
	add	v23.4s, v15.4s, v19.4s
	eor	v20.16b, v20.16b, v24.16b
	eor	v21.16b, v21.16b, v25.16b
	eor	v22.16b, v22.16b, v26.16b
	eor	v23.16b, v23.16b, v27.16b
	ld1	{v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #0x40
	st1	{v20.16b, v21.16b, v22.16b, v23.16b}, [x1], #0x40
	# Move regular registers into vector registers for adding and xor
	mov	v0.d[0], x8
	mov	v0.d[1], x10
	mov	v1.d[0], x12
	mov	v1.d[1], x14
	mov	v2.d[0], x16
	mov	v2.d[1], x19
	mov	v3.d[0], x21
	mov	v3.d[1], x23
	# Add back state, XOR in message and store
	add	v0.4s, v0.4s, v16.4s
	add	v1.4s, v1.4s, v17.4s
	add	v2.4s, v2.4s, v18.4s
	add	v3.4s, v3.4s, v19.4s
	eor	v0.16b, v0.16b, v24.16b
	eor	v1.16b, v1.16b, v25.16b
	eor	v2.16b, v2.16b, v26.16b
	eor	v3.16b, v3.16b, v27.16b
	st1	{v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #0x40
	# Advance state word 12 by the five blocks consumed, then loop while
	# at least another 320 bytes remain
	cmp	x3, #0x140
	add	v19.4s, v19.4s, v29.4s
	bge	L_chacha_crypt_bytes_arm64_loop_320
	# Done doing 320 bytes at a time
L_chacha_crypt_bytes_arm64_lt_320:
	# 256-byte chunk: four blocks in NEON lanes, no scalar block
	cmp	x3, #0x100
	blt	L_chacha_crypt_bytes_arm64_lt_256
	# Move state into vector registers
	dup	v0.4s, v16.s[0]
	dup	v1.4s, v16.s[1]
	dup	v2.4s, v16.s[2]
	dup	v3.4s, v16.s[3]
	dup	v4.4s, v17.s[0]
	dup	v5.4s, v17.s[1]
	dup	v6.4s, v17.s[2]
	dup	v7.4s, v17.s[3]
	dup	v8.4s, v18.s[0]
	dup	v9.4s, v18.s[1]
	dup	v10.4s, v18.s[2]
	dup	v11.4s, v18.s[3]
	dup	v12.4s, v19.s[0]
	dup	v13.4s, v19.s[1]
	dup	v14.4s, v19.s[2]
	dup	v15.4s, v19.s[3]
	# Add to counter word
	add	v12.4s, v12.4s, v28.4s
	# Set number of odd+even rounds to perform
	mov	x26, #10
L_chacha_crypt_bytes_arm64_round_start_256:
	subs	x26, x26, #1
	# Round odd
	# a += b; d ^= a; d <<<= 16;
	add	v0.4s, v0.4s, v4.4s
	add	v1.4s, v1.4s, v5.4s
	add	v2.4s, v2.4s, v6.4s
	add	v3.4s, v3.4s, v7.4s
	eor	v12.16b, v12.16b, v0.16b
	eor	v13.16b, v13.16b, v1.16b
	eor	v14.16b, v14.16b, v2.16b
	eor	v15.16b, v15.16b, v3.16b
	rev32	v12.8h, v12.8h
	rev32	v13.8h, v13.8h
	rev32	v14.8h, v14.8h
	rev32	v15.8h, v15.8h
	# c += d; b ^= c; b <<<= 12;
	add	v8.4s, v8.4s, v12.4s
	add	v9.4s, v9.4s, v13.4s
	add	v10.4s, v10.4s, v14.4s
	add	v11.4s, v11.4s, v15.4s
	eor	v20.16b, v4.16b, v8.16b
	eor	v21.16b, v5.16b, v9.16b
	eor	v22.16b, v6.16b, v10.16b
	eor	v23.16b, v7.16b, v11.16b
	shl	v4.4s, v20.4s, #12
	shl	v5.4s, v21.4s, #12
	shl	v6.4s, v22.4s, #12
	shl	v7.4s, v23.4s, #12
	sri	v4.4s, v20.4s, #20
	sri	v5.4s, v21.4s, #20
	sri	v6.4s, v22.4s, #20
	sri	v7.4s, v23.4s, #20
	# a += b; d ^= a; d <<<= 8;
	add	v0.4s, v0.4s, v4.4s
	add	v1.4s, v1.4s, v5.4s
	add	v2.4s, v2.4s, v6.4s
	add	v3.4s, v3.4s, v7.4s
	eor	v12.16b, v12.16b, v0.16b
	eor	v13.16b, v13.16b, v1.16b
	eor	v14.16b, v14.16b, v2.16b
	eor	v15.16b, v15.16b, v3.16b
	tbl	v12.16b, {v12.16b}, v30.16b
	tbl	v13.16b, {v13.16b}, v30.16b
	tbl	v14.16b, {v14.16b}, v30.16b
	tbl	v15.16b, {v15.16b}, v30.16b
	# c += d; b ^= c; b <<<= 7;
	add	v8.4s, v8.4s, v12.4s
	add	v9.4s, v9.4s, v13.4s
	add	v10.4s, v10.4s, v14.4s
	add	v11.4s, v11.4s, v15.4s
	eor	v20.16b, v4.16b, v8.16b
	eor	v21.16b, v5.16b, v9.16b
	eor	v22.16b, v6.16b, v10.16b
	eor	v23.16b, v7.16b, v11.16b
	shl	v4.4s, v20.4s, #7
	shl	v5.4s, v21.4s, #7
	shl	v6.4s, v22.4s, #7
	shl	v7.4s, v23.4s, #7
	sri	v4.4s, v20.4s, #25
	sri	v5.4s, v21.4s, #25
	sri	v6.4s, v22.4s, #25
	sri	v7.4s, v23.4s, #25
	# Round even
	# a += b; d ^= a; d <<<= 16;
	add	v0.4s, v0.4s, v5.4s
	add	v1.4s, v1.4s, v6.4s
	add	v2.4s, v2.4s, v7.4s
	add	v3.4s, v3.4s, v4.4s
	eor	v15.16b, v15.16b, v0.16b
	eor	v12.16b, v12.16b, v1.16b
	eor	v13.16b, v13.16b, v2.16b
	eor	v14.16b, v14.16b, v3.16b
	rev32	v15.8h, v15.8h
	rev32	v12.8h, v12.8h
	rev32	v13.8h, v13.8h
	rev32	v14.8h, v14.8h
	# c += d; b ^= c; b <<<= 12;
	add	v10.4s, v10.4s, v15.4s
	add	v11.4s, v11.4s, v12.4s
	add	v8.4s, v8.4s, v13.4s
	add	v9.4s, v9.4s, v14.4s
	eor	v20.16b, v5.16b, v10.16b
	eor	v21.16b, v6.16b, v11.16b
	eor	v22.16b, v7.16b, v8.16b
	eor	v23.16b, v4.16b, v9.16b
	shl	v5.4s, v20.4s, #12
	shl	v6.4s, v21.4s, #12
	shl	v7.4s, v22.4s, #12
	shl	v4.4s, v23.4s, #12
	sri	v5.4s, v20.4s, #20
	sri	v6.4s, v21.4s, #20
	sri	v7.4s, v22.4s, #20
	sri	v4.4s, v23.4s, #20
	# a += b; d ^= a; d <<<= 8;
	add	v0.4s, v0.4s, v5.4s
	add	v1.4s, v1.4s, v6.4s
	add	v2.4s, v2.4s, v7.4s
	add	v3.4s, v3.4s, v4.4s
	eor	v15.16b, v15.16b, v0.16b
	eor	v12.16b, v12.16b, v1.16b
	eor	v13.16b, v13.16b, v2.16b
	eor	v14.16b, v14.16b, v3.16b
	tbl	v15.16b, {v15.16b}, v30.16b
	tbl	v12.16b, {v12.16b}, v30.16b
	tbl	v13.16b, {v13.16b}, v30.16b
	tbl	v14.16b, {v14.16b}, v30.16b
	# c += d; b ^= c; b <<<= 7;
	add	v10.4s, v10.4s, v15.4s
	add	v11.4s, v11.4s, v12.4s
	add	v8.4s, v8.4s, v13.4s
	add	v9.4s, v9.4s, v14.4s
	eor	v20.16b, v5.16b, v10.16b
	eor	v21.16b, v6.16b, v11.16b
	eor	v22.16b, v7.16b, v8.16b
	eor	v23.16b, v4.16b, v9.16b
	shl	v5.4s, v20.4s, #7
	shl	v6.4s, v21.4s, #7
	shl	v7.4s, v22.4s, #7
	shl	v4.4s, v23.4s, #7
	sri	v5.4s, v20.4s, #25
	sri	v6.4s, v21.4s, #25
	sri	v7.4s, v22.4s, #25
	sri	v4.4s, v23.4s, #25
	bne	L_chacha_crypt_bytes_arm64_round_start_256
	# x26 = 4: number of blocks this chunk consumes, moved into v29 below
	mov	x26, #4
	# Add counter now rather than after transposed
	add	v12.4s, v12.4s, v28.4s
	# Load message
	ld1	{v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #0x40
	# Transpose vectors
	trn1	v20.4s, v0.4s, v1.4s
	trn1	v22.4s, v2.4s, v3.4s
	trn2	v21.4s, v0.4s, v1.4s
	trn2	v23.4s, v2.4s, v3.4s
	trn1	v0.2d, v20.2d, v22.2d
	trn1	v1.2d, v21.2d, v23.2d
	trn2	v2.2d, v20.2d, v22.2d
	trn2	v3.2d, v21.2d, v23.2d
	trn1	v20.4s, v4.4s, v5.4s
	trn1	v22.4s, v6.4s, v7.4s
	trn2	v21.4s, v4.4s, v5.4s
	trn2	v23.4s, v6.4s, v7.4s
	trn1	v4.2d, v20.2d, v22.2d
	trn1	v5.2d, v21.2d, v23.2d
	trn2	v6.2d, v20.2d, v22.2d
	trn2	v7.2d, v21.2d, v23.2d
	trn1	v20.4s, v8.4s, v9.4s
	trn1	v22.4s, v10.4s, v11.4s
	trn2	v21.4s, v8.4s, v9.4s
	trn2	v23.4s, v10.4s, v11.4s
	trn1	v8.2d, v20.2d, v22.2d
	trn1	v9.2d, v21.2d, v23.2d
	trn2	v10.2d, v20.2d, v22.2d
	trn2	v11.2d, v21.2d, v23.2d
	trn1	v20.4s, v12.4s, v13.4s
	trn1	v22.4s, v14.4s, v15.4s
	trn2	v21.4s, v12.4s, v13.4s
	trn2	v23.4s, v14.4s, v15.4s
	trn1	v12.2d, v20.2d, v22.2d
	trn1	v13.2d, v21.2d, v23.2d
	trn2	v14.2d, v20.2d, v22.2d
	trn2	v15.2d, v21.2d, v23.2d
	# Add back state, XOR in message and store (load next block)
	add	v20.4s, v0.4s, v16.4s
	add	v21.4s, v4.4s, v17.4s
	add	v22.4s, v8.4s, v18.4s
	add	v23.4s, v12.4s, v19.4s
	eor	v20.16b, v20.16b, v24.16b
	eor	v21.16b, v21.16b, v25.16b
	eor	v22.16b, v22.16b, v26.16b
	eor	v23.16b, v23.16b, v27.16b
	ld1	{v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #0x40
	st1	{v20.16b, v21.16b, v22.16b, v23.16b}, [x1], #0x40
	add	v20.4s, v1.4s, v16.4s
	add	v21.4s, v5.4s, v17.4s
	add	v22.4s, v9.4s, v18.4s
	add	v23.4s, v13.4s, v19.4s
	eor	v20.16b, v20.16b, v24.16b
	eor	v21.16b, v21.16b, v25.16b
	eor	v22.16b, v22.16b, v26.16b
	eor	v23.16b, v23.16b, v27.16b
	ld1	{v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #0x40
	st1	{v20.16b, v21.16b, v22.16b, v23.16b}, [x1], #0x40
	add	v20.4s, v2.4s, v16.4s
	add	v21.4s, v6.4s, v17.4s
	add	v22.4s, v10.4s, v18.4s
	add	v23.4s, v14.4s, v19.4s
	eor	v20.16b, v20.16b, v24.16b
	eor	v21.16b, v21.16b, v25.16b
	eor	v22.16b, v22.16b, v26.16b
	eor	v23.16b, v23.16b, v27.16b
	ld1	{v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #0x40
	st1	{v20.16b, v21.16b, v22.16b, v23.16b}, [x1], #0x40
	add	v20.4s, v3.4s, v16.4s
	add	v21.4s, v7.4s, v17.4s
	add	v22.4s, v11.4s, v18.4s
	add	v23.4s, v15.4s, v19.4s
	eor	v20.16b, v20.16b, v24.16b
	eor	v21.16b, v21.16b, v25.16b
	eor	v22.16b, v22.16b, v26.16b
	eor	v23.16b, v23.16b, v27.16b
	st1	{v20.16b, v21.16b, v22.16b, v23.16b}, [x1], #0x40
	# Lane 0 of v29 becomes 4; advance state word 12 by the four blocks used
	mov	v29.s[0], w26
	sub	x3, x3, #0x100
	add	v19.4s, v19.4s, v29.4s
	# Done 256-byte block
L_chacha_crypt_bytes_arm64_lt_256:
	# 128-byte chunk: two blocks held row-wise, first block in v0-v3 and
	# second block in v4-v7 (word 12 + 1)
	cmp	x3, #0x80
	blt	L_chacha_crypt_bytes_arm64_lt_128
	ld1	{v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #0x40
	# Move state into vector registers
	mov	v4.16b, v16.16b
	mov	v5.16b, v17.16b
	mov	v6.16b, v18.16b
	mov	v7.16b, v19.16b
	mov	v0.16b, v16.16b
	mov	v1.16b, v17.16b
	mov	v2.16b, v18.16b
	mov	v3.16b, v19.16b
	# Add counter word
	add	v7.4s, v7.4s, v31.4s
	# Set number of odd+even rounds to perform
	mov	x26, #10
L_chacha_crypt_bytes_arm64_round_start_128:
	subs	x26, x26, #1
	# Round odd
	# a += b; d ^= a; d <<<= 16;
	add	v0.4s, v0.4s, v1.4s
	add	v4.4s, v4.4s, v5.4s
	eor	v3.16b, v3.16b, v0.16b
	eor	v7.16b, v7.16b, v4.16b
	rev32	v3.8h, v3.8h
	rev32	v7.8h, v7.8h
	# c += d; b ^= c; b <<<= 12;
	add	v2.4s, v2.4s, v3.4s
	add	v6.4s, v6.4s, v7.4s
	eor	v20.16b, v1.16b, v2.16b
	eor	v21.16b, v5.16b, v6.16b
	shl	v1.4s, v20.4s, #12
	shl	v5.4s, v21.4s, #12
	sri	v1.4s, v20.4s, #20
	sri	v5.4s, v21.4s, #20
	# a += b; d ^= a; d <<<= 8;
	add	v0.4s, v0.4s, v1.4s
	add	v4.4s, v4.4s, v5.4s
	eor	v3.16b, v3.16b, v0.16b
	eor	v7.16b, v7.16b, v4.16b
	tbl	v3.16b, {v3.16b}, v30.16b
	tbl	v7.16b, {v7.16b}, v30.16b
	# c += d; b ^= c; b <<<= 7;
	add	v2.4s, v2.4s, v3.4s
	add	v6.4s, v6.4s, v7.4s
	eor	v20.16b, v1.16b, v2.16b
	eor	v21.16b, v5.16b, v6.16b
	shl	v1.4s, v20.4s, #7
	shl	v5.4s, v21.4s, #7
	sri	v1.4s, v20.4s, #25
	sri	v5.4s, v21.4s, #25
	# Rotate the b, c and d rows by 1, 2 and 3 words so the same
	# row-wise code below performs the even round
	ext	v3.16b, v3.16b, v3.16b, #12
	ext	v7.16b, v7.16b, v7.16b, #12
	ext	v1.16b, v1.16b, v1.16b, #4
	ext	v5.16b, v5.16b, v5.16b, #4
	ext	v2.16b, v2.16b, v2.16b, #8
	ext	v6.16b, v6.16b, v6.16b, #8
	# Round even
	# a += b; d ^= a; d <<<= 16;
	add	v0.4s, v0.4s, v1.4s
	add	v4.4s, v4.4s, v5.4s
	eor	v3.16b, v3.16b, v0.16b
	eor	v7.16b, v7.16b, v4.16b
	rev32	v3.8h, v3.8h
	rev32	v7.8h, v7.8h
	# c += d; b ^= c; b <<<= 12;
	add	v2.4s, v2.4s, v3.4s
	add	v6.4s, v6.4s, v7.4s
	eor	v20.16b, v1.16b, v2.16b
	eor	v21.16b, v5.16b, v6.16b
	shl	v1.4s, v20.4s, #12
	shl	v5.4s, v21.4s, #12
	sri	v1.4s, v20.4s, #20
	sri	v5.4s, v21.4s, #20
	# a += b; d ^= a; d <<<= 8;
	add	v0.4s, v0.4s, v1.4s
	add	v4.4s, v4.4s, v5.4s
	eor	v3.16b, v3.16b, v0.16b
	eor	v7.16b, v7.16b, v4.16b
	tbl	v3.16b, {v3.16b}, v30.16b
	tbl	v7.16b, {v7.16b}, v30.16b
	# c += d; b ^= c; b <<<= 7;
	add	v2.4s, v2.4s, v3.4s
	add	v6.4s, v6.4s, v7.4s
	eor	v20.16b, v1.16b, v2.16b
	eor	v21.16b, v5.16b, v6.16b
	shl	v1.4s, v20.4s, #7
	shl	v5.4s, v21.4s, #7
	sri	v1.4s, v20.4s, #25
	sri	v5.4s, v21.4s, #25
	# Undo the row rotations applied before the even round
	ext	v3.16b, v3.16b, v3.16b, #4
	ext	v7.16b, v7.16b, v7.16b, #4
	ext	v1.16b, v1.16b, v1.16b, #12
	ext	v5.16b, v5.16b, v5.16b, #12
	ext	v2.16b, v2.16b, v2.16b, #8
	ext	v6.16b, v6.16b, v6.16b, #8
	bne	L_chacha_crypt_bytes_arm64_round_start_128
	# Add back state, XOR in message and store (load next block)
	add	v0.4s, v0.4s, v16.4s
	add	v1.4s, v1.4s, v17.4s
	add	v2.4s, v2.4s, v18.4s
	add	v3.4s, v3.4s, v19.4s
	eor	v24.16b, v24.16b, v0.16b
	eor	v25.16b, v25.16b, v1.16b
	eor	v26.16b, v26.16b, v2.16b
	eor	v27.16b, v27.16b, v3.16b
	ld1	{v20.16b, v21.16b, v22.16b, v23.16b}, [x2], #0x40
	st1	{v24.16b, v25.16b, v26.16b, v27.16b}, [x1], #0x40
	# Step word 12 before adding the state back to the second block, so
	# v19 matches the value that block started its rounds with
	add	v19.4s, v19.4s, v31.4s
	add	v4.4s, v4.4s, v16.4s
	add	v5.4s, v5.4s, v17.4s
	add	v6.4s, v6.4s, v18.4s
	add	v7.4s, v7.4s, v19.4s
	eor	v20.16b, v20.16b, v4.16b
	eor	v21.16b, v21.16b, v5.16b
	eor	v22.16b, v22.16b, v6.16b
	eor	v23.16b, v23.16b, v7.16b
	st1	{v20.16b, v21.16b, v22.16b, v23.16b}, [x1], #0x40
	add	v19.4s, v19.4s, v31.4s
	sub	x3, x3, #0x80
	# Done 128-byte block
L_chacha_crypt_bytes_arm64_lt_128:
	cmp	x3, #0
	beq	L_chacha_crypt_bytes_arm64_done_all
	# w5 = 64, the block size; used to compute the unused byte count when
	# the last block is only partly consumed
	mov	w5, #0x40
	# One 64-byte block per iteration, held row-wise in v0-v3
L_chacha_crypt_bytes_arm64_loop_64:
	# Move state into vector registers
	mov	v0.16b, v16.16b
	mov	v1.16b, v17.16b
	mov	v2.16b, v18.16b
	mov	v3.16b, v19.16b
	# Set number of odd+even rounds to perform
	mov	x26, #10
L_chacha_crypt_bytes_arm64_round_64:
	subs	x26, x26, #1
	# Round odd
	# a += b; d ^= a; d <<<= 16;
	add	v0.4s, v0.4s, v1.4s
	eor	v3.16b, v3.16b, v0.16b
	rev32	v3.8h, v3.8h
	# c += d; b ^= c; b <<<= 12;
	add	v2.4s, v2.4s, v3.4s
	eor	v20.16b, v1.16b, v2.16b
	shl	v1.4s, v20.4s, #12
	sri	v1.4s, v20.4s, #20
	# a += b; d ^= a; d <<<= 8;
	add	v0.4s, v0.4s, v1.4s
	eor	v3.16b, v3.16b, v0.16b
	tbl	v3.16b, {v3.16b}, v30.16b
	# c += d; b ^= c; b <<<= 7;
	add	v2.4s, v2.4s, v3.4s
	eor	v20.16b, v1.16b, v2.16b
	shl	v1.4s, v20.4s, #7
	sri	v1.4s, v20.4s, #25
	# Rotate rows b, c, d by 1, 2, 3 words ahead of the even round
	ext	v3.16b, v3.16b, v3.16b, #12
	ext	v1.16b, v1.16b, v1.16b, #4
	ext	v2.16b, v2.16b, v2.16b, #8
	# Round even
	# a += b; d ^= a; d <<<= 16;
	add	v0.4s, v0.4s, v1.4s
	eor	v3.16b, v3.16b, v0.16b
	rev32	v3.8h, v3.8h
	# c += d; b ^= c; b <<<= 12;
	add	v2.4s, v2.4s, v3.4s
	eor	v20.16b, v1.16b, v2.16b
	shl	v1.4s, v20.4s, #12
	sri	v1.4s, v20.4s, #20
	# a += b; d ^= a; d <<<= 8;
	add	v0.4s, v0.4s, v1.4s
	eor	v3.16b, v3.16b, v0.16b
	tbl	v3.16b, {v3.16b}, v30.16b
	# c += d; b ^= c; b <<<= 7;
	add	v2.4s, v2.4s, v3.4s
	eor	v20.16b, v1.16b, v2.16b
	shl	v1.4s, v20.4s, #7
	sri	v1.4s, v20.4s, #25
	# Undo the row rotations
	ext	v3.16b, v3.16b, v3.16b, #4
	ext	v1.16b, v1.16b, v1.16b, #12
	ext	v2.16b, v2.16b, v2.16b, #8
	bne	L_chacha_crypt_bytes_arm64_round_64
	# Add back state
	add	v0.4s, v0.4s, v16.4s
	add	v1.4s, v1.4s, v17.4s
	add	v2.4s, v2.4s, v18.4s
	add	v3.4s, v3.4s, v19.4s
	# Check if data is less than 64 bytes - store in over
	# Word 12 is stepped here for both the full and the partial block case
	cmp	x3, #0x40
	add	v19.4s, v19.4s, v31.4s
	blt	L_chacha_crypt_bytes_arm64_lt_64
	# Encipher 64 bytes
	ld1	{v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #0x40
	eor	v24.16b, v24.16b, v0.16b
	eor	v25.16b, v25.16b, v1.16b
	eor	v26.16b, v26.16b, v2.16b
	eor	v27.16b, v27.16b, v3.16b
	st1	{v24.16b, v25.16b, v26.16b, v27.16b}, [x1], #0x40
	# Check for more bytes to be enciphered
	subs	x3, x3, #0x40
	bne	L_chacha_crypt_bytes_arm64_loop_64
	b	L_chacha_crypt_bytes_arm64_done
L_chacha_crypt_bytes_arm64_lt_64:
	# Calculate bytes left in block not used
	sub	w5, w5, w3
	# Store encipher block in over for further operations and left
	# The full keystream block goes to x0 + 0x44, the unused count to x0 + 64
	st1	{v0.4s, v1.4s, v2.4s, v3.4s}, [x4]
	str	w5, [x0, #64]
	# Encipher 32 bytes
	# After each partial step the unused keystream is moved down into v0
	# (and v1) so the next smaller step always starts from v0
	cmp	x3, #32
	blt	L_chacha_crypt_bytes_arm64_lt_32
	ld1	{v24.16b, v25.16b}, [x2], #32
	eor	v24.16b, v24.16b, v0.16b
	eor	v25.16b, v25.16b, v1.16b
	st1	{v24.16b, v25.16b}, [x1], #32
	subs	x3, x3, #32
	mov	v0.16b, v2.16b
	mov	v1.16b, v3.16b
	beq	L_chacha_crypt_bytes_arm64_done
L_chacha_crypt_bytes_arm64_lt_32:
	cmp	x3, #16
	blt	L_chacha_crypt_bytes_arm64_lt_16
	# Encipher 16 bytes
	ld1	{v24.16b}, [x2], #16
	eor	v24.16b, v24.16b, v0.16b
	st1	{v24.16b}, [x1], #16
	subs	x3, x3, #16
	mov	v0.16b, v1.16b
	beq	L_chacha_crypt_bytes_arm64_done
L_chacha_crypt_bytes_arm64_lt_16:
	cmp	x3, #8
	blt	L_chacha_crypt_bytes_arm64_lt_8
	# Encipher 8 bytes
	ld1	{v24.8b}, [x2], #8
	eor	v24.8b, v24.8b, v0.8b
	st1	{v24.8b}, [x1], #8
	subs	x3, x3, #8
	mov	v0.d[0], v0.d[1]
	beq	L_chacha_crypt_bytes_arm64_done
L_chacha_crypt_bytes_arm64_lt_8:
	# x5 = next 8 keystream bytes, consumed from the low byte upwards;
	# x3 is between 1 and 7 on entry here
	mov	x5, v0.d[0]
L_chacha_crypt_bytes_arm64_loop_lt_8:
	# Encipher 1 byte at a time
	ldrb	w6, [x2], #1
	eor	w6, w6, w5
	strb	w6, [x1], #1
	subs	x3, x3, #1
	lsr	x5, x5, #8
	bgt	L_chacha_crypt_bytes_arm64_loop_lt_8
L_chacha_crypt_bytes_arm64_done:
L_chacha_crypt_bytes_arm64_done_all:
	# Write the state back with its advanced word 12, then restore the
	# saved registers and release the frame
	st1	{v16.4s, v17.4s, v18.4s, v19.4s}, [x0]
	ldp	x17, x19, [x29, #24]
	ldp	x20, x21, [x29, #40]
	ldp	x22, x23, [x29, #56]
	ldp	x24, x25, [x29, #72]
	ldr	x26, [x29, #88]
	ldp	d8, d9, [x29, #96]
	ldp	d10, d11, [x29, #112]
	ldp	d12, d13, [x29, #128]
	ldp	d14, d15, [x29, #144]
	ldp	x29, x30, [sp], #0xa0
	ret
#ifndef __APPLE__
	.size	wc_chacha_crypt_bytes,.-wc_chacha_crypt_bytes
#endif /* __APPLE__ */
/* wc_chacha_setiv
 *
 * Writes a 32-bit counter and 12 bytes read from x1 into the block at x0.
 *
 * In:
 *   x0 = destination; bytes 48..51 receive the counter and bytes 52..63
 *        receive the 12 bytes read from x1
 *        NOTE(review): presumably the 16-word ChaCha state, making these
 *        words 12..15 - confirm against the caller.
 *   x1 = source of 12 bytes (read as 8 bytes + 4 bytes)
 *   w2 = counter value; only the low 32 bits are used
 *
 * Leaf function: no stack use; x3 and x4 are used as scratch.
 */
#ifndef __APPLE__
.text
.globl	wc_chacha_setiv
.type	wc_chacha_setiv,@function
.align	2
wc_chacha_setiv:
#else
.section	__TEXT,__text
.globl	_wc_chacha_setiv
.p2align	2
_wc_chacha_setiv:
#endif /* __APPLE__ */
	ldr	x3, [x1]
	ldr	w4, [x1, #8]
	# Store the counter as a 32-bit word so that only bytes 48..51 are
	# written. A 64-bit store here would also write bytes 52..55 and rely on
	# the next store to repair them, and on a big-endian target it would
	# put the upper half of x2 into the counter word.
	str	w2, [x0, #48]
	str	x3, [x0, #52]
	str	w4, [x0, #60]
	ret
#ifndef __APPLE__
	.size	wc_chacha_setiv,.-wc_chacha_setiv
#endif /* __APPLE__ */
/* L_chacha_setkey_arm64_constant: two rows of four 32-bit words.
 * Read as little-endian bytes, row 0 spells "expand 16-byte k" and row 1
 * spells "expand 32-byte k". wc_chacha_setkey adds (key size - 16) to the
 * table address as a byte offset and loads the 16 bytes found there.
 */
#ifndef __APPLE__
	.text
	.section	.rodata
	.type	L_chacha_setkey_arm64_constant, %object
	.size	L_chacha_setkey_arm64_constant, 32
#else
	.section	__DATA,__data
#endif /* __APPLE__ */
	# Align the table to an 8-byte (2^3) boundary
#ifndef __APPLE__
	.align	3
#else
	.p2align	3
#endif /* __APPLE__ */
L_chacha_setkey_arm64_constant:
	.long	0x61707865,0x3120646e,0x79622d36,0x6b206574
	.long	0x61707865,0x3320646e,0x79622d32,0x6b206574
/* wc_chacha_setkey
 *
 * Fills the first 48 bytes at x0: a 16-byte constant row chosen by the key
 * size, then 32 bytes of key material.
 *
 * In:
 *   x0 = destination (48 bytes written; pointer is advanced)
 *   x1 = key bytes (16 or 32 bytes read; pointer is advanced)
 *   x2 = key size in bytes; 16 selects the first constant row and stores the
 *        same 16 key bytes twice, any other value reads a second 16 bytes
 *        NOTE(review): only 16 and 32 index a valid constant row.
 *
 * Leaf function: no stack use; x4, v4 and v5 are used as scratch.
 */
#ifndef __APPLE__
.text
.globl	wc_chacha_setkey
.type	wc_chacha_setkey,@function
.align	2
wc_chacha_setkey:
#else
.section	__TEXT,__text
.globl	_wc_chacha_setkey
.p2align	2
_wc_chacha_setkey:
#endif /* __APPLE__ */
	# x2 becomes the byte offset of the constant row (0 or 16). The Z flag
	# set here is consumed by the beq below; nothing in between alters flags.
	subs	x2, x2, #16
#ifndef __APPLE__
	adrp x4, L_chacha_setkey_arm64_constant
	add  x4, x4, :lo12:L_chacha_setkey_arm64_constant
#else
	adrp x4, L_chacha_setkey_arm64_constant@PAGE
	add  x4, x4, L_chacha_setkey_arm64_constant@PAGEOFF
#endif /* __APPLE__ */
	# First 16 key bytes
	ld1	{v5.16b}, [x1], #16
	# Constant row for this key size
	add	x4, x4, x2
	ld1	{v4.4s}, [x4]
#ifdef BIG_ENDIAN_ORDER
	# NOTE(review): rev32 on .8h swaps the two halfwords of each word -
	# confirm this is the intended conversion for this configuration.
	rev32	v5.8h, v5.8h
#endif /* BIG_ENDIAN_ORDER */
	st1	{v4.4s}, [x0], #16
	st1	{v5.4s}, [x0], #16
	# 16-byte key: store the same key bytes again as the second half
	beq	L_chacha_setkey_arm64_store_second_half
	# Longer key: fetch the next 16 key bytes for the second half
	ld1	{v5.16b}, [x1]
#ifdef BIG_ENDIAN_ORDER
	rev32	v5.8h, v5.8h
#endif /* BIG_ENDIAN_ORDER */
L_chacha_setkey_arm64_store_second_half:
	st1	{v5.4s}, [x0]
	ret
#ifndef __APPLE__
	.size	wc_chacha_setkey,.-wc_chacha_setkey
#endif /* __APPLE__ */
/* wc_chacha_use_over
 *
 * XORs x3 bytes read from x0 with x3 bytes read from x2 and writes the
 * result to x1.
 *
 * In:
 *   x0 = first source (the inline comments call it state bytes)
 *        NOTE(review): presumably the saved keystream left by
 *        wc_chacha_crypt_bytes - confirm against the caller.
 *   x1 = output
 *   x2 = second source (the inline comments call it the message)
 *   x3 = number of bytes; 0 returns immediately without touching memory
 *
 * Works through 16-byte pieces, then 4-byte pieces, then single bytes.
 * All three pointers are post-incremented. Leaf function: no stack use;
 * w4, w5, v0 and v1 are used as scratch.
 */
#ifndef __APPLE__
.text
.globl	wc_chacha_use_over
.type	wc_chacha_use_over,@function
.align	2
wc_chacha_use_over:
#else
.section	__TEXT,__text
.globl	_wc_chacha_use_over
.p2align	2
_wc_chacha_use_over:
#endif /* __APPLE__ */
	# A zero length must leave here: the byte loop below decrements before
	# testing for zero, so entering it with 0 would wrap and run off the
	# end of all three buffers.
	cbz	x3, L_chacha_use_over_arm64_done
L_chacha_use_over_arm64_16byte_loop:
	cmp	x3, #16
	blt	L_chacha_use_over_arm64_word_loop
	# 16 bytes of state XORed into message.
	ld1	{v0.16b}, [x0], #16
	ld1	{v1.16b}, [x2], #16
	eor	v1.16b, v1.16b, v0.16b
	subs	x3, x3, #16
	st1	{v1.16b}, [x1], #16
	beq	L_chacha_use_over_arm64_done
	b	L_chacha_use_over_arm64_16byte_loop
L_chacha_use_over_arm64_word_loop:
	cmp	x3, #4
	blt	L_chacha_use_over_arm64_byte_loop
	# 4 bytes of state XORed into message.
	ldr	w4, [x0], #4
	ldr	w5, [x2], #4
	eor	w5, w5, w4
	subs	x3, x3, #4
	str	w5, [x1], #4
	beq	L_chacha_use_over_arm64_done
	b	L_chacha_use_over_arm64_word_loop
L_chacha_use_over_arm64_byte_loop:
	# 1 byte of state XORed into message.
	# x3 is between 1 and 3 on entry here.
	ldrb	w4, [x0], #1
	ldrb	w5, [x2], #1
	eor	w5, w5, w4
	subs	x3, x3, #1
	strb	w5, [x1], #1
	beq	L_chacha_use_over_arm64_done
	b	L_chacha_use_over_arm64_byte_loop
L_chacha_use_over_arm64_done:
	ret
#ifndef __APPLE__
	.size	wc_chacha_use_over,.-wc_chacha_use_over
#endif /* __APPLE__ */
#endif /* !WOLFSSL_ARMASM_NO_NEON */
#endif /* HAVE_CHACHA */
#endif /* __aarch64__ */
#endif /* WOLFSSL_ARMASM */

#if defined(__linux__) && defined(__ELF__)
.section	.note.GNU-stack,"",%progbits
#endif
#endif /* !WOLFSSL_ARMASM_INLINE */