/* armv8-chacha-asm
 *
 * Copyright (C) 2006-2026 wolfSSL Inc.
 *
 * This file is part of wolfSSL.
 *
 * wolfSSL is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * wolfSSL is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
 */

#include <wolfssl/wolfcrypt/libwolfssl_sources_asm.h>
#include <wolfssl/wolfcrypt/error-crypt.h>

/* Generated using (from wolfssl):
 *   cd ../scripts
 *   ruby ./chacha/chacha.rb arm64 \
 *       ../wolfssl/wolfcrypt/src/port/arm/armv8-chacha-asm.c
 */
#ifdef WOLFSSL_ARMASM
#ifdef __aarch64__
#ifdef WOLFSSL_ARMASM_INLINE
#ifdef HAVE_CHACHA
#include <wolfssl/wolfcrypt/chacha.h>

/* Per-lane block counter offsets {0, 1, 2, 3}.
 *
 * wc_chacha_crypt_bytes loads these four 32-bit words into v28 and adds them
 * to the broadcast counter word (v12) in the paths that process four blocks
 * in vector lanes, so that each lane carries a different, consecutive
 * counter value.
 *
 * NOTE(review): XALIGNED is defined elsewhere; presumably it requests 8-byte
 * alignment for the table - confirm against its definition.
 */
XALIGNED(8) static const word32 L_chacha20_arm64_ctr[] = {
    0x00000000, 0x00000001, 0x00000002, 0x00000003,
};

/* Byte-shuffle indices for the TBL instruction.
 *
 * wc_chacha_crypt_bytes loads this table into v30 and uses it as the index
 * vector of TBL to implement the "d <<<= 8" step of the quarter round.
 * Within each 32-bit word the index bytes are 3, 0, 1, 2 in memory order
 * (as stored on a little-endian target): source byte 3 moves to position 0
 * and bytes 0..2 each move up one position, which is a rotate left by
 * 8 bits of every 32-bit lane.
 */
XALIGNED(8) static const word32 L_chacha20_arm64_rol8[] = {
    0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f,
};

#ifndef WOLFSSL_ARMASM_NO_NEON
void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c, const byte* m, word32 len)
{
    const word32* rol8 = L_chacha20_arm64_rol8;
    const word32* ctr = L_chacha20_arm64_ctr;
    __asm__ __volatile__ (
        "eor	v29.16b, v29.16b, v29.16b\n\t"
        "mov	x26, #5\n\t"
        "eor	v31.16b, v31.16b, v31.16b\n\t"
        "mov	w7, #1\n\t"
        "ld1	{v30.16b}, [%[rol8]]\n\t"
        "ld1	{v28.4s}, [%[ctr]]\n\t"
        "add	x4, %x[ctx], #0x44\n\t"
        "mov	v29.s[0], w26\n\t"
        "mov	v31.s[0], w7\n\t"
        /* Load state to encrypt */
        "ld1	{v16.4s, v17.4s, v18.4s, v19.4s}, [%x[ctx]]\n\t"
        "cmp	%w[len], #0x140\n\t"
        "b.lt	L_chacha_crypt_bytes_arm64_lt_320_%=\n\t"
        "mov	w25, #4\n\t"
        "\n"
    "L_chacha_crypt_bytes_arm64_loop_320_%=:\n\t"
        /* Move state into regular register */
        "mov	x8, v16.d[0]\n\t"
        "mov	x10, v16.d[1]\n\t"
        "mov	x12, v17.d[0]\n\t"
        "mov	x14, v17.d[1]\n\t"
        "mov	x16, v18.d[0]\n\t"
        "mov	x19, v18.d[1]\n\t"
        "mov	x21, v19.d[0]\n\t"
        "mov	x23, v19.d[1]\n\t"
        "sub	%w[len], %w[len], #0x140\n\t"
        /* Move state into vector registers */
        "dup	v0.4s, v16.s[0]\n\t"
        "dup	v1.4s, v16.s[1]\n\t"
        "lsr	x9, x8, #32\n\t"
        "dup	v2.4s, v16.s[2]\n\t"
        "dup	v3.4s, v16.s[3]\n\t"
        "lsr	x11, x10, #32\n\t"
        "dup	v4.4s, v17.s[0]\n\t"
        "dup	v5.4s, v17.s[1]\n\t"
        "lsr	x13, x12, #32\n\t"
        "dup	v6.4s, v17.s[2]\n\t"
        "dup	v7.4s, v17.s[3]\n\t"
        "lsr	x15, x14, #32\n\t"
        "dup	v8.4s, v18.s[0]\n\t"
        "dup	v9.4s, v18.s[1]\n\t"
        "lsr	x17, x16, #32\n\t"
        "dup	v10.4s, v18.s[2]\n\t"
        "dup	v11.4s, v18.s[3]\n\t"
        "lsr	x20, x19, #32\n\t"
        "dup	v12.4s, v19.s[0]\n\t"
        "dup	v13.4s, v19.s[1]\n\t"
        "lsr	x22, x21, #32\n\t"
        "dup	v14.4s, v19.s[2]\n\t"
        "dup	v15.4s, v19.s[3]\n\t"
        "lsr	x24, x23, #32\n\t"
        /* Add to counter word */
        "add	v12.4s, v12.4s, v28.4s\n\t"
        "add	w21, w21, w25\n\t"
        /* Set number of odd+even rounds to perform */
        "mov	x26, #10\n\t"
        "\n"
    "L_chacha_crypt_bytes_arm64_round_start_320_%=:\n\t"
        "subs	x26, x26, #1\n\t"
        /* Round odd */
        /* a += b; d ^= a; d <<<= 16; */
        "add	v0.4s, v0.4s, v4.4s\n\t"
        "add	w8, w8, w12\n\t"
        "add	v1.4s, v1.4s, v5.4s\n\t"
        "add	w9, w9, w13\n\t"
        "add	v2.4s, v2.4s, v6.4s\n\t"
        "add	w10, w10, w14\n\t"
        "add	v3.4s, v3.4s, v7.4s\n\t"
        "add	w11, w11, w15\n\t"
        "eor	v12.16b, v12.16b, v0.16b\n\t"
        "eor	w21, w21, w8\n\t"
        "eor	v13.16b, v13.16b, v1.16b\n\t"
        "eor	w22, w22, w9\n\t"
        "eor	v14.16b, v14.16b, v2.16b\n\t"
        "eor	w23, w23, w10\n\t"
        "eor	v15.16b, v15.16b, v3.16b\n\t"
        "eor	w24, w24, w11\n\t"
        "rev32	v12.8h, v12.8h\n\t"
        "ror	w21, w21, #16\n\t"
        "rev32	v13.8h, v13.8h\n\t"
        "ror	w22, w22, #16\n\t"
        "rev32	v14.8h, v14.8h\n\t"
        "ror	w23, w23, #16\n\t"
        "rev32	v15.8h, v15.8h\n\t"
        "ror	w24, w24, #16\n\t"
        /* c += d; b ^= c; b <<<= 12; */
        "add	v8.4s, v8.4s, v12.4s\n\t"
        "add	w16, w16, w21\n\t"
        "add	v9.4s, v9.4s, v13.4s\n\t"
        "add	w17, w17, w22\n\t"
        "add	v10.4s, v10.4s, v14.4s\n\t"
        "add	w19, w19, w23\n\t"
        "add	v11.4s, v11.4s, v15.4s\n\t"
        "add	w20, w20, w24\n\t"
        "eor	v20.16b, v4.16b, v8.16b\n\t"
        "eor	w12, w12, w16\n\t"
        "eor	v21.16b, v5.16b, v9.16b\n\t"
        "eor	w13, w13, w17\n\t"
        "eor	v22.16b, v6.16b, v10.16b\n\t"
        "eor	w14, w14, w19\n\t"
        "eor	v23.16b, v7.16b, v11.16b\n\t"
        "eor	w15, w15, w20\n\t"
        "shl	v4.4s, v20.4s, #12\n\t"
        "ror	w12, w12, #20\n\t"
        "shl	v5.4s, v21.4s, #12\n\t"
        "ror	w13, w13, #20\n\t"
        "shl	v6.4s, v22.4s, #12\n\t"
        "ror	w14, w14, #20\n\t"
        "shl	v7.4s, v23.4s, #12\n\t"
        "ror	w15, w15, #20\n\t"
        "sri	v4.4s, v20.4s, #20\n\t"
        "sri	v5.4s, v21.4s, #20\n\t"
        "sri	v6.4s, v22.4s, #20\n\t"
        "sri	v7.4s, v23.4s, #20\n\t"
        /* a += b; d ^= a; d <<<= 8; */
        "add	v0.4s, v0.4s, v4.4s\n\t"
        "add	w8, w8, w12\n\t"
        "add	v1.4s, v1.4s, v5.4s\n\t"
        "add	w9, w9, w13\n\t"
        "add	v2.4s, v2.4s, v6.4s\n\t"
        "add	w10, w10, w14\n\t"
        "add	v3.4s, v3.4s, v7.4s\n\t"
        "add	w11, w11, w15\n\t"
        "eor	v12.16b, v12.16b, v0.16b\n\t"
        "eor	w21, w21, w8\n\t"
        "eor	v13.16b, v13.16b, v1.16b\n\t"
        "eor	w22, w22, w9\n\t"
        "eor	v14.16b, v14.16b, v2.16b\n\t"
        "eor	w23, w23, w10\n\t"
        "eor	v15.16b, v15.16b, v3.16b\n\t"
        "eor	w24, w24, w11\n\t"
        "tbl	v12.16b, {v12.16b}, v30.16b\n\t"
        "ror	w21, w21, #24\n\t"
        "tbl	v13.16b, {v13.16b}, v30.16b\n\t"
        "ror	w22, w22, #24\n\t"
        "tbl	v14.16b, {v14.16b}, v30.16b\n\t"
        "ror	w23, w23, #24\n\t"
        "tbl	v15.16b, {v15.16b}, v30.16b\n\t"
        "ror	w24, w24, #24\n\t"
        /* c += d; b ^= c; b <<<= 7; */
        "add	v8.4s, v8.4s, v12.4s\n\t"
        "add	w16, w16, w21\n\t"
        "add	v9.4s, v9.4s, v13.4s\n\t"
        "add	w17, w17, w22\n\t"
        "add	v10.4s, v10.4s, v14.4s\n\t"
        "add	w19, w19, w23\n\t"
        "add	v11.4s, v11.4s, v15.4s\n\t"
        "add	w20, w20, w24\n\t"
        "eor	v20.16b, v4.16b, v8.16b\n\t"
        "eor	w12, w12, w16\n\t"
        "eor	v21.16b, v5.16b, v9.16b\n\t"
        "eor	w13, w13, w17\n\t"
        "eor	v22.16b, v6.16b, v10.16b\n\t"
        "eor	w14, w14, w19\n\t"
        "eor	v23.16b, v7.16b, v11.16b\n\t"
        "eor	w15, w15, w20\n\t"
        "shl	v4.4s, v20.4s, #7\n\t"
        "ror	w12, w12, #25\n\t"
        "shl	v5.4s, v21.4s, #7\n\t"
        "ror	w13, w13, #25\n\t"
        "shl	v6.4s, v22.4s, #7\n\t"
        "ror	w14, w14, #25\n\t"
        "shl	v7.4s, v23.4s, #7\n\t"
        "ror	w15, w15, #25\n\t"
        "sri	v4.4s, v20.4s, #25\n\t"
        "sri	v5.4s, v21.4s, #25\n\t"
        "sri	v6.4s, v22.4s, #25\n\t"
        "sri	v7.4s, v23.4s, #25\n\t"
        /* Round even */
        /* a += b; d ^= a; d <<<= 16; */
        "add	v0.4s, v0.4s, v5.4s\n\t"
        "add	w8, w8, w13\n\t"
        "add	v1.4s, v1.4s, v6.4s\n\t"
        "add	w9, w9, w14\n\t"
        "add	v2.4s, v2.4s, v7.4s\n\t"
        "add	w10, w10, w15\n\t"
        "add	v3.4s, v3.4s, v4.4s\n\t"
        "add	w11, w11, w12\n\t"
        "eor	v15.16b, v15.16b, v0.16b\n\t"
        "eor	w24, w24, w8\n\t"
        "eor	v12.16b, v12.16b, v1.16b\n\t"
        "eor	w21, w21, w9\n\t"
        "eor	v13.16b, v13.16b, v2.16b\n\t"
        "eor	w22, w22, w10\n\t"
        "eor	v14.16b, v14.16b, v3.16b\n\t"
        "eor	w23, w23, w11\n\t"
        "rev32	v15.8h, v15.8h\n\t"
        "ror	w24, w24, #16\n\t"
        "rev32	v12.8h, v12.8h\n\t"
        "ror	w21, w21, #16\n\t"
        "rev32	v13.8h, v13.8h\n\t"
        "ror	w22, w22, #16\n\t"
        "rev32	v14.8h, v14.8h\n\t"
        "ror	w23, w23, #16\n\t"
        /* c += d; b ^= c; b <<<= 12; */
        "add	v10.4s, v10.4s, v15.4s\n\t"
        "add	w19, w19, w24\n\t"
        "add	v11.4s, v11.4s, v12.4s\n\t"
        "add	w20, w20, w21\n\t"
        "add	v8.4s, v8.4s, v13.4s\n\t"
        "add	w16, w16, w22\n\t"
        "add	v9.4s, v9.4s, v14.4s\n\t"
        "add	w17, w17, w23\n\t"
        "eor	v20.16b, v5.16b, v10.16b\n\t"
        "eor	w13, w13, w19\n\t"
        "eor	v21.16b, v6.16b, v11.16b\n\t"
        "eor	w14, w14, w20\n\t"
        "eor	v22.16b, v7.16b, v8.16b\n\t"
        "eor	w15, w15, w16\n\t"
        "eor	v23.16b, v4.16b, v9.16b\n\t"
        "eor	w12, w12, w17\n\t"
        "shl	v5.4s, v20.4s, #12\n\t"
        "ror	w13, w13, #20\n\t"
        "shl	v6.4s, v21.4s, #12\n\t"
        "ror	w14, w14, #20\n\t"
        "shl	v7.4s, v22.4s, #12\n\t"
        "ror	w15, w15, #20\n\t"
        "shl	v4.4s, v23.4s, #12\n\t"
        "ror	w12, w12, #20\n\t"
        "sri	v5.4s, v20.4s, #20\n\t"
        "sri	v6.4s, v21.4s, #20\n\t"
        "sri	v7.4s, v22.4s, #20\n\t"
        "sri	v4.4s, v23.4s, #20\n\t"
        /* a += b; d ^= a; d <<<= 8; */
        "add	v0.4s, v0.4s, v5.4s\n\t"
        "add	w8, w8, w13\n\t"
        "add	v1.4s, v1.4s, v6.4s\n\t"
        "add	w9, w9, w14\n\t"
        "add	v2.4s, v2.4s, v7.4s\n\t"
        "add	w10, w10, w15\n\t"
        "add	v3.4s, v3.4s, v4.4s\n\t"
        "add	w11, w11, w12\n\t"
        "eor	v15.16b, v15.16b, v0.16b\n\t"
        "eor	w24, w24, w8\n\t"
        "eor	v12.16b, v12.16b, v1.16b\n\t"
        "eor	w21, w21, w9\n\t"
        "eor	v13.16b, v13.16b, v2.16b\n\t"
        "eor	w22, w22, w10\n\t"
        "eor	v14.16b, v14.16b, v3.16b\n\t"
        "eor	w23, w23, w11\n\t"
        "tbl	v15.16b, {v15.16b}, v30.16b\n\t"
        "ror	w24, w24, #24\n\t"
        "tbl	v12.16b, {v12.16b}, v30.16b\n\t"
        "ror	w21, w21, #24\n\t"
        "tbl	v13.16b, {v13.16b}, v30.16b\n\t"
        "ror	w22, w22, #24\n\t"
        "tbl	v14.16b, {v14.16b}, v30.16b\n\t"
        "ror	w23, w23, #24\n\t"
        /* c += d; b ^= c; b <<<= 7; */
        "add	v10.4s, v10.4s, v15.4s\n\t"
        "add	w19, w19, w24\n\t"
        "add	v11.4s, v11.4s, v12.4s\n\t"
        "add	w20, w20, w21\n\t"
        "add	v8.4s, v8.4s, v13.4s\n\t"
        "add	w16, w16, w22\n\t"
        "add	v9.4s, v9.4s, v14.4s\n\t"
        "add	w17, w17, w23\n\t"
        "eor	v20.16b, v5.16b, v10.16b\n\t"
        "eor	w13, w13, w19\n\t"
        "eor	v21.16b, v6.16b, v11.16b\n\t"
        "eor	w14, w14, w20\n\t"
        "eor	v22.16b, v7.16b, v8.16b\n\t"
        "eor	w15, w15, w16\n\t"
        "eor	v23.16b, v4.16b, v9.16b\n\t"
        "eor	w12, w12, w17\n\t"
        "shl	v5.4s, v20.4s, #7\n\t"
        "ror	w13, w13, #25\n\t"
        "shl	v6.4s, v21.4s, #7\n\t"
        "ror	w14, w14, #25\n\t"
        "shl	v7.4s, v22.4s, #7\n\t"
        "ror	w15, w15, #25\n\t"
        "shl	v4.4s, v23.4s, #7\n\t"
        "ror	w12, w12, #25\n\t"
        "sri	v5.4s, v20.4s, #25\n\t"
        "sri	v6.4s, v21.4s, #25\n\t"
        "sri	v7.4s, v22.4s, #25\n\t"
        "sri	v4.4s, v23.4s, #25\n\t"
        "b.ne	L_chacha_crypt_bytes_arm64_round_start_320_%=\n\t"
        /* Add counter now rather than after transposed */
        "add	v12.4s, v12.4s, v28.4s\n\t"
        "add	w21, w21, w25\n\t"
        /* Load message */
        "ld1	{v24.16b, v25.16b, v26.16b, v27.16b}, [%x[m]], #0x40\n\t"
        /* Transpose vectors */
        "trn1	v20.4s, v0.4s, v1.4s\n\t"
        "trn1	v22.4s, v2.4s, v3.4s\n\t"
        "orr	x8, x8, x9, lsl 32\n\t"
        "trn2	v21.4s, v0.4s, v1.4s\n\t"
        "trn2	v23.4s, v2.4s, v3.4s\n\t"
        "trn1	v0.2d, v20.2d, v22.2d\n\t"
        "trn1	v1.2d, v21.2d, v23.2d\n\t"
        "orr	x10, x10, x11, lsl 32\n\t"
        "trn2	v2.2d, v20.2d, v22.2d\n\t"
        "trn2	v3.2d, v21.2d, v23.2d\n\t"
        "trn1	v20.4s, v4.4s, v5.4s\n\t"
        "trn1	v22.4s, v6.4s, v7.4s\n\t"
        "orr	x12, x12, x13, lsl 32\n\t"
        "trn2	v21.4s, v4.4s, v5.4s\n\t"
        "trn2	v23.4s, v6.4s, v7.4s\n\t"
        "trn1	v4.2d, v20.2d, v22.2d\n\t"
        "trn1	v5.2d, v21.2d, v23.2d\n\t"
        "orr	x14, x14, x15, lsl 32\n\t"
        "trn2	v6.2d, v20.2d, v22.2d\n\t"
        "trn2	v7.2d, v21.2d, v23.2d\n\t"
        "trn1	v20.4s, v8.4s, v9.4s\n\t"
        "trn1	v22.4s, v10.4s, v11.4s\n\t"
        "orr	x16, x16, x17, lsl 32\n\t"
        "trn2	v21.4s, v8.4s, v9.4s\n\t"
        "trn2	v23.4s, v10.4s, v11.4s\n\t"
        "trn1	v8.2d, v20.2d, v22.2d\n\t"
        "trn1	v9.2d, v21.2d, v23.2d\n\t"
        "orr	x19, x19, x20, lsl 32\n\t"
        "trn2	v10.2d, v20.2d, v22.2d\n\t"
        "trn2	v11.2d, v21.2d, v23.2d\n\t"
        "trn1	v20.4s, v12.4s, v13.4s\n\t"
        "trn1	v22.4s, v14.4s, v15.4s\n\t"
        "orr	x21, x21, x22, lsl 32\n\t"
        "trn2	v21.4s, v12.4s, v13.4s\n\t"
        "trn2	v23.4s, v14.4s, v15.4s\n\t"
        "trn1	v12.2d, v20.2d, v22.2d\n\t"
        "trn1	v13.2d, v21.2d, v23.2d\n\t"
        "orr	x23, x23, x24, lsl 32\n\t"
        "trn2	v14.2d, v20.2d, v22.2d\n\t"
        "trn2	v15.2d, v21.2d, v23.2d\n\t"
        /* Add back state, XOR in message and store (load next block) */
        "add	v20.4s, v0.4s, v16.4s\n\t"
        "add	v21.4s, v4.4s, v17.4s\n\t"
        "add	v22.4s, v8.4s, v18.4s\n\t"
        "add	v23.4s, v12.4s, v19.4s\n\t"
        "eor	v20.16b, v20.16b, v24.16b\n\t"
        "eor	v21.16b, v21.16b, v25.16b\n\t"
        "eor	v22.16b, v22.16b, v26.16b\n\t"
        "eor	v23.16b, v23.16b, v27.16b\n\t"
        "ld1	{v24.16b, v25.16b, v26.16b, v27.16b}, [%x[m]], #0x40\n\t"
        "st1	{v20.16b, v21.16b, v22.16b, v23.16b}, [%x[c]], #0x40\n\t"
        "add	v20.4s, v1.4s, v16.4s\n\t"
        "add	v21.4s, v5.4s, v17.4s\n\t"
        "add	v22.4s, v9.4s, v18.4s\n\t"
        "add	v23.4s, v13.4s, v19.4s\n\t"
        "eor	v20.16b, v20.16b, v24.16b\n\t"
        "eor	v21.16b, v21.16b, v25.16b\n\t"
        "eor	v22.16b, v22.16b, v26.16b\n\t"
        "eor	v23.16b, v23.16b, v27.16b\n\t"
        "ld1	{v24.16b, v25.16b, v26.16b, v27.16b}, [%x[m]], #0x40\n\t"
        "st1	{v20.16b, v21.16b, v22.16b, v23.16b}, [%x[c]], #0x40\n\t"
        "add	v20.4s, v2.4s, v16.4s\n\t"
        "add	v21.4s, v6.4s, v17.4s\n\t"
        "add	v22.4s, v10.4s, v18.4s\n\t"
        "add	v23.4s, v14.4s, v19.4s\n\t"
        "eor	v20.16b, v20.16b, v24.16b\n\t"
        "eor	v21.16b, v21.16b, v25.16b\n\t"
        "eor	v22.16b, v22.16b, v26.16b\n\t"
        "eor	v23.16b, v23.16b, v27.16b\n\t"
        "ld1	{v24.16b, v25.16b, v26.16b, v27.16b}, [%x[m]], #0x40\n\t"
        "st1	{v20.16b, v21.16b, v22.16b, v23.16b}, [%x[c]], #0x40\n\t"
        "add	v20.4s, v3.4s, v16.4s\n\t"
        "add	v21.4s, v7.4s, v17.4s\n\t"
        "add	v22.4s, v11.4s, v18.4s\n\t"
        "add	v23.4s, v15.4s, v19.4s\n\t"
        "eor	v20.16b, v20.16b, v24.16b\n\t"
        "eor	v21.16b, v21.16b, v25.16b\n\t"
        "eor	v22.16b, v22.16b, v26.16b\n\t"
        "eor	v23.16b, v23.16b, v27.16b\n\t"
        "ld1	{v24.16b, v25.16b, v26.16b, v27.16b}, [%x[m]], #0x40\n\t"
        "st1	{v20.16b, v21.16b, v22.16b, v23.16b}, [%x[c]], #0x40\n\t"
        /* Move regular registers into vector registers for adding and xor */
        "mov	v0.d[0], x8\n\t"
        "mov	v0.d[1], x10\n\t"
        "mov	v1.d[0], x12\n\t"
        "mov	v1.d[1], x14\n\t"
        "mov	v2.d[0], x16\n\t"
        "mov	v2.d[1], x19\n\t"
        "mov	v3.d[0], x21\n\t"
        "mov	v3.d[1], x23\n\t"
        /* Add back state, XOR in message and store */
        "add	v0.4s, v0.4s, v16.4s\n\t"
        "add	v1.4s, v1.4s, v17.4s\n\t"
        "add	v2.4s, v2.4s, v18.4s\n\t"
        "add	v3.4s, v3.4s, v19.4s\n\t"
        "eor	v0.16b, v0.16b, v24.16b\n\t"
        "eor	v1.16b, v1.16b, v25.16b\n\t"
        "eor	v2.16b, v2.16b, v26.16b\n\t"
        "eor	v3.16b, v3.16b, v27.16b\n\t"
        "st1	{v0.4s, v1.4s, v2.4s, v3.4s}, [%x[c]], #0x40\n\t"
        "cmp	%w[len], #0x140\n\t"
        "add	v19.4s, v19.4s, v29.4s\n\t"
        "b.ge	L_chacha_crypt_bytes_arm64_loop_320_%=\n\t"
        /* Done doing 320 bytes at a time */
        "\n"
    "L_chacha_crypt_bytes_arm64_lt_320_%=:\n\t"
        "cmp	%w[len], #0x100\n\t"
        "b.lt	L_chacha_crypt_bytes_arm64_lt_256_%=\n\t"
        /* Move state into vector registers */
        "dup	v0.4s, v16.s[0]\n\t"
        "dup	v1.4s, v16.s[1]\n\t"
        "dup	v2.4s, v16.s[2]\n\t"
        "dup	v3.4s, v16.s[3]\n\t"
        "dup	v4.4s, v17.s[0]\n\t"
        "dup	v5.4s, v17.s[1]\n\t"
        "dup	v6.4s, v17.s[2]\n\t"
        "dup	v7.4s, v17.s[3]\n\t"
        "dup	v8.4s, v18.s[0]\n\t"
        "dup	v9.4s, v18.s[1]\n\t"
        "dup	v10.4s, v18.s[2]\n\t"
        "dup	v11.4s, v18.s[3]\n\t"
        "dup	v12.4s, v19.s[0]\n\t"
        "dup	v13.4s, v19.s[1]\n\t"
        "dup	v14.4s, v19.s[2]\n\t"
        "dup	v15.4s, v19.s[3]\n\t"
        /* Add to counter word */
        "add	v12.4s, v12.4s, v28.4s\n\t"
        /* Set number of odd+even rounds to perform */
        "mov	x26, #10\n\t"
        "\n"
    "L_chacha_crypt_bytes_arm64_round_start_256_%=:\n\t"
        "subs	x26, x26, #1\n\t"
        /* Round odd */
        /* a += b; d ^= a; d <<<= 16; */
        "add	v0.4s, v0.4s, v4.4s\n\t"
        "add	v1.4s, v1.4s, v5.4s\n\t"
        "add	v2.4s, v2.4s, v6.4s\n\t"
        "add	v3.4s, v3.4s, v7.4s\n\t"
        "eor	v12.16b, v12.16b, v0.16b\n\t"
        "eor	v13.16b, v13.16b, v1.16b\n\t"
        "eor	v14.16b, v14.16b, v2.16b\n\t"
        "eor	v15.16b, v15.16b, v3.16b\n\t"
        "rev32	v12.8h, v12.8h\n\t"
        "rev32	v13.8h, v13.8h\n\t"
        "rev32	v14.8h, v14.8h\n\t"
        "rev32	v15.8h, v15.8h\n\t"
        /* c += d; b ^= c; b <<<= 12; */
        "add	v8.4s, v8.4s, v12.4s\n\t"
        "add	v9.4s, v9.4s, v13.4s\n\t"
        "add	v10.4s, v10.4s, v14.4s\n\t"
        "add	v11.4s, v11.4s, v15.4s\n\t"
        "eor	v20.16b, v4.16b, v8.16b\n\t"
        "eor	v21.16b, v5.16b, v9.16b\n\t"
        "eor	v22.16b, v6.16b, v10.16b\n\t"
        "eor	v23.16b, v7.16b, v11.16b\n\t"
        "shl	v4.4s, v20.4s, #12\n\t"
        "shl	v5.4s, v21.4s, #12\n\t"
        "shl	v6.4s, v22.4s, #12\n\t"
        "shl	v7.4s, v23.4s, #12\n\t"
        "sri	v4.4s, v20.4s, #20\n\t"
        "sri	v5.4s, v21.4s, #20\n\t"
        "sri	v6.4s, v22.4s, #20\n\t"
        "sri	v7.4s, v23.4s, #20\n\t"
        /* a += b; d ^= a; d <<<= 8; */
        "add	v0.4s, v0.4s, v4.4s\n\t"
        "add	v1.4s, v1.4s, v5.4s\n\t"
        "add	v2.4s, v2.4s, v6.4s\n\t"
        "add	v3.4s, v3.4s, v7.4s\n\t"
        "eor	v12.16b, v12.16b, v0.16b\n\t"
        "eor	v13.16b, v13.16b, v1.16b\n\t"
        "eor	v14.16b, v14.16b, v2.16b\n\t"
        "eor	v15.16b, v15.16b, v3.16b\n\t"
        "tbl	v12.16b, {v12.16b}, v30.16b\n\t"
        "tbl	v13.16b, {v13.16b}, v30.16b\n\t"
        "tbl	v14.16b, {v14.16b}, v30.16b\n\t"
        "tbl	v15.16b, {v15.16b}, v30.16b\n\t"
        /* c += d; b ^= c; b <<<= 7; */
        "add	v8.4s, v8.4s, v12.4s\n\t"
        "add	v9.4s, v9.4s, v13.4s\n\t"
        "add	v10.4s, v10.4s, v14.4s\n\t"
        "add	v11.4s, v11.4s, v15.4s\n\t"
        "eor	v20.16b, v4.16b, v8.16b\n\t"
        "eor	v21.16b, v5.16b, v9.16b\n\t"
        "eor	v22.16b, v6.16b, v10.16b\n\t"
        "eor	v23.16b, v7.16b, v11.16b\n\t"
        "shl	v4.4s, v20.4s, #7\n\t"
        "shl	v5.4s, v21.4s, #7\n\t"
        "shl	v6.4s, v22.4s, #7\n\t"
        "shl	v7.4s, v23.4s, #7\n\t"
        "sri	v4.4s, v20.4s, #25\n\t"
        "sri	v5.4s, v21.4s, #25\n\t"
        "sri	v6.4s, v22.4s, #25\n\t"
        "sri	v7.4s, v23.4s, #25\n\t"
        /* Round even */
        /* a += b; d ^= a; d <<<= 16; */
        "add	v0.4s, v0.4s, v5.4s\n\t"
        "add	v1.4s, v1.4s, v6.4s\n\t"
        "add	v2.4s, v2.4s, v7.4s\n\t"
        "add	v3.4s, v3.4s, v4.4s\n\t"
        "eor	v15.16b, v15.16b, v0.16b\n\t"
        "eor	v12.16b, v12.16b, v1.16b\n\t"
        "eor	v13.16b, v13.16b, v2.16b\n\t"
        "eor	v14.16b, v14.16b, v3.16b\n\t"
        "rev32	v15.8h, v15.8h\n\t"
        "rev32	v12.8h, v12.8h\n\t"
        "rev32	v13.8h, v13.8h\n\t"
        "rev32	v14.8h, v14.8h\n\t"
        /* c += d; b ^= c; b <<<= 12; */
        "add	v10.4s, v10.4s, v15.4s\n\t"
        "add	v11.4s, v11.4s, v12.4s\n\t"
        "add	v8.4s, v8.4s, v13.4s\n\t"
        "add	v9.4s, v9.4s, v14.4s\n\t"
        "eor	v20.16b, v5.16b, v10.16b\n\t"
        "eor	v21.16b, v6.16b, v11.16b\n\t"
        "eor	v22.16b, v7.16b, v8.16b\n\t"
        "eor	v23.16b, v4.16b, v9.16b\n\t"
        "shl	v5.4s, v20.4s, #12\n\t"
        "shl	v6.4s, v21.4s, #12\n\t"
        "shl	v7.4s, v22.4s, #12\n\t"
        "shl	v4.4s, v23.4s, #12\n\t"
        "sri	v5.4s, v20.4s, #20\n\t"
        "sri	v6.4s, v21.4s, #20\n\t"
        "sri	v7.4s, v22.4s, #20\n\t"
        "sri	v4.4s, v23.4s, #20\n\t"
        /* a += b; d ^= a; d <<<= 8; */
        "add	v0.4s, v0.4s, v5.4s\n\t"
        "add	v1.4s, v1.4s, v6.4s\n\t"
        "add	v2.4s, v2.4s, v7.4s\n\t"
        "add	v3.4s, v3.4s, v4.4s\n\t"
        "eor	v15.16b, v15.16b, v0.16b\n\t"
        "eor	v12.16b, v12.16b, v1.16b\n\t"
        "eor	v13.16b, v13.16b, v2.16b\n\t"
        "eor	v14.16b, v14.16b, v3.16b\n\t"
        "tbl	v15.16b, {v15.16b}, v30.16b\n\t"
        "tbl	v12.16b, {v12.16b}, v30.16b\n\t"
        "tbl	v13.16b, {v13.16b}, v30.16b\n\t"
        "tbl	v14.16b, {v14.16b}, v30.16b\n\t"
        /* c += d; b ^= c; b <<<= 7; */
        "add	v10.4s, v10.4s, v15.4s\n\t"
        "add	v11.4s, v11.4s, v12.4s\n\t"
        "add	v8.4s, v8.4s, v13.4s\n\t"
        "add	v9.4s, v9.4s, v14.4s\n\t"
        "eor	v20.16b, v5.16b, v10.16b\n\t"
        "eor	v21.16b, v6.16b, v11.16b\n\t"
        "eor	v22.16b, v7.16b, v8.16b\n\t"
        "eor	v23.16b, v4.16b, v9.16b\n\t"
        "shl	v5.4s, v20.4s, #7\n\t"
        "shl	v6.4s, v21.4s, #7\n\t"
        "shl	v7.4s, v22.4s, #7\n\t"
        "shl	v4.4s, v23.4s, #7\n\t"
        "sri	v5.4s, v20.4s, #25\n\t"
        "sri	v6.4s, v21.4s, #25\n\t"
        "sri	v7.4s, v22.4s, #25\n\t"
        "sri	v4.4s, v23.4s, #25\n\t"
        "b.ne	L_chacha_crypt_bytes_arm64_round_start_256_%=\n\t"
        "mov	x26, #4\n\t"
        /* Add counter now rather than after transposed */
        "add	v12.4s, v12.4s, v28.4s\n\t"
        /* Load message */
        "ld1	{v24.16b, v25.16b, v26.16b, v27.16b}, [%x[m]], #0x40\n\t"
        /* Transpose vectors */
        "trn1	v20.4s, v0.4s, v1.4s\n\t"
        "trn1	v22.4s, v2.4s, v3.4s\n\t"
        "trn2	v21.4s, v0.4s, v1.4s\n\t"
        "trn2	v23.4s, v2.4s, v3.4s\n\t"
        "trn1	v0.2d, v20.2d, v22.2d\n\t"
        "trn1	v1.2d, v21.2d, v23.2d\n\t"
        "trn2	v2.2d, v20.2d, v22.2d\n\t"
        "trn2	v3.2d, v21.2d, v23.2d\n\t"
        "trn1	v20.4s, v4.4s, v5.4s\n\t"
        "trn1	v22.4s, v6.4s, v7.4s\n\t"
        "trn2	v21.4s, v4.4s, v5.4s\n\t"
        "trn2	v23.4s, v6.4s, v7.4s\n\t"
        "trn1	v4.2d, v20.2d, v22.2d\n\t"
        "trn1	v5.2d, v21.2d, v23.2d\n\t"
        "trn2	v6.2d, v20.2d, v22.2d\n\t"
        "trn2	v7.2d, v21.2d, v23.2d\n\t"
        "trn1	v20.4s, v8.4s, v9.4s\n\t"
        "trn1	v22.4s, v10.4s, v11.4s\n\t"
        "trn2	v21.4s, v8.4s, v9.4s\n\t"
        "trn2	v23.4s, v10.4s, v11.4s\n\t"
        "trn1	v8.2d, v20.2d, v22.2d\n\t"
        "trn1	v9.2d, v21.2d, v23.2d\n\t"
        "trn2	v10.2d, v20.2d, v22.2d\n\t"
        "trn2	v11.2d, v21.2d, v23.2d\n\t"
        "trn1	v20.4s, v12.4s, v13.4s\n\t"
        "trn1	v22.4s, v14.4s, v15.4s\n\t"
        "trn2	v21.4s, v12.4s, v13.4s\n\t"
        "trn2	v23.4s, v14.4s, v15.4s\n\t"
        "trn1	v12.2d, v20.2d, v22.2d\n\t"
        "trn1	v13.2d, v21.2d, v23.2d\n\t"
        "trn2	v14.2d, v20.2d, v22.2d\n\t"
        "trn2	v15.2d, v21.2d, v23.2d\n\t"
        /* Add back state, XOR in message and store (load next block) */
        "add	v20.4s, v0.4s, v16.4s\n\t"
        "add	v21.4s, v4.4s, v17.4s\n\t"
        "add	v22.4s, v8.4s, v18.4s\n\t"
        "add	v23.4s, v12.4s, v19.4s\n\t"
        "eor	v20.16b, v20.16b, v24.16b\n\t"
        "eor	v21.16b, v21.16b, v25.16b\n\t"
        "eor	v22.16b, v22.16b, v26.16b\n\t"
        "eor	v23.16b, v23.16b, v27.16b\n\t"
        "ld1	{v24.16b, v25.16b, v26.16b, v27.16b}, [%x[m]], #0x40\n\t"
        "st1	{v20.16b, v21.16b, v22.16b, v23.16b}, [%x[c]], #0x40\n\t"
        "add	v20.4s, v1.4s, v16.4s\n\t"
        "add	v21.4s, v5.4s, v17.4s\n\t"
        "add	v22.4s, v9.4s, v18.4s\n\t"
        "add	v23.4s, v13.4s, v19.4s\n\t"
        "eor	v20.16b, v20.16b, v24.16b\n\t"
        "eor	v21.16b, v21.16b, v25.16b\n\t"
        "eor	v22.16b, v22.16b, v26.16b\n\t"
        "eor	v23.16b, v23.16b, v27.16b\n\t"
        "ld1	{v24.16b, v25.16b, v26.16b, v27.16b}, [%x[m]], #0x40\n\t"
        "st1	{v20.16b, v21.16b, v22.16b, v23.16b}, [%x[c]], #0x40\n\t"
        "add	v20.4s, v2.4s, v16.4s\n\t"
        "add	v21.4s, v6.4s, v17.4s\n\t"
        "add	v22.4s, v10.4s, v18.4s\n\t"
        "add	v23.4s, v14.4s, v19.4s\n\t"
        "eor	v20.16b, v20.16b, v24.16b\n\t"
        "eor	v21.16b, v21.16b, v25.16b\n\t"
        "eor	v22.16b, v22.16b, v26.16b\n\t"
        "eor	v23.16b, v23.16b, v27.16b\n\t"
        "ld1	{v24.16b, v25.16b, v26.16b, v27.16b}, [%x[m]], #0x40\n\t"
        "st1	{v20.16b, v21.16b, v22.16b, v23.16b}, [%x[c]], #0x40\n\t"
        "add	v20.4s, v3.4s, v16.4s\n\t"
        "add	v21.4s, v7.4s, v17.4s\n\t"
        "add	v22.4s, v11.4s, v18.4s\n\t"
        "add	v23.4s, v15.4s, v19.4s\n\t"
        "eor	v20.16b, v20.16b, v24.16b\n\t"
        "eor	v21.16b, v21.16b, v25.16b\n\t"
        "eor	v22.16b, v22.16b, v26.16b\n\t"
        "eor	v23.16b, v23.16b, v27.16b\n\t"
        "st1	{v20.16b, v21.16b, v22.16b, v23.16b}, [%x[c]], #0x40\n\t"
        "mov	v29.s[0], w26\n\t"
        "sub	%w[len], %w[len], #0x100\n\t"
        "add	v19.4s, v19.4s, v29.4s\n\t"
        /* Done 256-byte block */
        "\n"
    "L_chacha_crypt_bytes_arm64_lt_256_%=:\n\t"
        "cmp	%w[len], #0x80\n\t"
        "b.lt	L_chacha_crypt_bytes_arm64_lt_128_%=\n\t"
        "ld1	{v24.16b, v25.16b, v26.16b, v27.16b}, [%x[m]], #0x40\n\t"
        /* Move state into vector registers */
        "mov	v4.16b, v16.16b\n\t"
        "mov	v5.16b, v17.16b\n\t"
        "mov	v6.16b, v18.16b\n\t"
        "mov	v7.16b, v19.16b\n\t"
        "mov	v0.16b, v16.16b\n\t"
        "mov	v1.16b, v17.16b\n\t"
        "mov	v2.16b, v18.16b\n\t"
        "mov	v3.16b, v19.16b\n\t"
        /* Add counter word */
        "add	v7.4s, v7.4s, v31.4s\n\t"
        /* Set number of odd+even rounds to perform */
        "mov	x26, #10\n\t"
        "\n"
    "L_chacha_crypt_bytes_arm64_round_start_128_%=:\n\t"
        "subs	x26, x26, #1\n\t"
        /* Round odd */
        /* a += b; d ^= a; d <<<= 16; */
        "add	v0.4s, v0.4s, v1.4s\n\t"
        "add	v4.4s, v4.4s, v5.4s\n\t"
        "eor	v3.16b, v3.16b, v0.16b\n\t"
        "eor	v7.16b, v7.16b, v4.16b\n\t"
        "rev32	v3.8h, v3.8h\n\t"
        "rev32	v7.8h, v7.8h\n\t"
        /* c += d; b ^= c; b <<<= 12; */
        "add	v2.4s, v2.4s, v3.4s\n\t"
        "add	v6.4s, v6.4s, v7.4s\n\t"
        "eor	v20.16b, v1.16b, v2.16b\n\t"
        "eor	v21.16b, v5.16b, v6.16b\n\t"
        "shl	v1.4s, v20.4s, #12\n\t"
        "shl	v5.4s, v21.4s, #12\n\t"
        "sri	v1.4s, v20.4s, #20\n\t"
        "sri	v5.4s, v21.4s, #20\n\t"
        /* a += b; d ^= a; d <<<= 8; */
        "add	v0.4s, v0.4s, v1.4s\n\t"
        "add	v4.4s, v4.4s, v5.4s\n\t"
        "eor	v3.16b, v3.16b, v0.16b\n\t"
        "eor	v7.16b, v7.16b, v4.16b\n\t"
        "tbl	v3.16b, {v3.16b}, v30.16b\n\t"
        "tbl	v7.16b, {v7.16b}, v30.16b\n\t"
        /* c += d; b ^= c; b <<<= 7; */
        "add	v2.4s, v2.4s, v3.4s\n\t"
        "add	v6.4s, v6.4s, v7.4s\n\t"
        "eor	v20.16b, v1.16b, v2.16b\n\t"
        "eor	v21.16b, v5.16b, v6.16b\n\t"
        "shl	v1.4s, v20.4s, #7\n\t"
        "shl	v5.4s, v21.4s, #7\n\t"
        "sri	v1.4s, v20.4s, #25\n\t"
        "sri	v5.4s, v21.4s, #25\n\t"
        "ext	v3.16b, v3.16b, v3.16b, #12\n\t"
        "ext	v7.16b, v7.16b, v7.16b, #12\n\t"
        "ext	v1.16b, v1.16b, v1.16b, #4\n\t"
        "ext	v5.16b, v5.16b, v5.16b, #4\n\t"
        "ext	v2.16b, v2.16b, v2.16b, #8\n\t"
        "ext	v6.16b, v6.16b, v6.16b, #8\n\t"
        /* Round even */
        /* a += b; d ^= a; d <<<= 16; */
        "add	v0.4s, v0.4s, v1.4s\n\t"
        "add	v4.4s, v4.4s, v5.4s\n\t"
        "eor	v3.16b, v3.16b, v0.16b\n\t"
        "eor	v7.16b, v7.16b, v4.16b\n\t"
        "rev32	v3.8h, v3.8h\n\t"
        "rev32	v7.8h, v7.8h\n\t"
        /* c += d; b ^= c; b <<<= 12; */
        "add	v2.4s, v2.4s, v3.4s\n\t"
        "add	v6.4s, v6.4s, v7.4s\n\t"
        "eor	v20.16b, v1.16b, v2.16b\n\t"
        "eor	v21.16b, v5.16b, v6.16b\n\t"
        "shl	v1.4s, v20.4s, #12\n\t"
        "shl	v5.4s, v21.4s, #12\n\t"
        "sri	v1.4s, v20.4s, #20\n\t"
        "sri	v5.4s, v21.4s, #20\n\t"
        /* a += b; d ^= a; d <<<= 8; */
        "add	v0.4s, v0.4s, v1.4s\n\t"
        "add	v4.4s, v4.4s, v5.4s\n\t"
        "eor	v3.16b, v3.16b, v0.16b\n\t"
        "eor	v7.16b, v7.16b, v4.16b\n\t"
        "tbl	v3.16b, {v3.16b}, v30.16b\n\t"
        "tbl	v7.16b, {v7.16b}, v30.16b\n\t"
        /* c += d; b ^= c; b <<<= 7; */
        "add	v2.4s, v2.4s, v3.4s\n\t"
        "add	v6.4s, v6.4s, v7.4s\n\t"
        "eor	v20.16b, v1.16b, v2.16b\n\t"
        "eor	v21.16b, v5.16b, v6.16b\n\t"
        "shl	v1.4s, v20.4s, #7\n\t"
        "shl	v5.4s, v21.4s, #7\n\t"
        "sri	v1.4s, v20.4s, #25\n\t"
        "sri	v5.4s, v21.4s, #25\n\t"
        "ext	v3.16b, v3.16b, v3.16b, #4\n\t"
        "ext	v7.16b, v7.16b, v7.16b, #4\n\t"
        "ext	v1.16b, v1.16b, v1.16b, #12\n\t"
        "ext	v5.16b, v5.16b, v5.16b, #12\n\t"
        "ext	v2.16b, v2.16b, v2.16b, #8\n\t"
        "ext	v6.16b, v6.16b, v6.16b, #8\n\t"
        "b.ne	L_chacha_crypt_bytes_arm64_round_start_128_%=\n\t"
        /* Add back state, XOR in message and store (load next block) */
        "add	v0.4s, v0.4s, v16.4s\n\t"
        "add	v1.4s, v1.4s, v17.4s\n\t"
        "add	v2.4s, v2.4s, v18.4s\n\t"
        "add	v3.4s, v3.4s, v19.4s\n\t"
        "eor	v24.16b, v24.16b, v0.16b\n\t"
        "eor	v25.16b, v25.16b, v1.16b\n\t"
        "eor	v26.16b, v26.16b, v2.16b\n\t"
        "eor	v27.16b, v27.16b, v3.16b\n\t"
        "ld1	{v20.16b, v21.16b, v22.16b, v23.16b}, [%x[m]], #0x40\n\t"
        "st1	{v24.16b, v25.16b, v26.16b, v27.16b}, [%x[c]], #0x40\n\t"
        "add	v19.4s, v19.4s, v31.4s\n\t"
        "add	v4.4s, v4.4s, v16.4s\n\t"
        "add	v5.4s, v5.4s, v17.4s\n\t"
        "add	v6.4s, v6.4s, v18.4s\n\t"
        "add	v7.4s, v7.4s, v19.4s\n\t"
        "eor	v20.16b, v20.16b, v4.16b\n\t"
        "eor	v21.16b, v21.16b, v5.16b\n\t"
        "eor	v22.16b, v22.16b, v6.16b\n\t"
        "eor	v23.16b, v23.16b, v7.16b\n\t"
        "st1	{v20.16b, v21.16b, v22.16b, v23.16b}, [%x[c]], #0x40\n\t"
        "add	v19.4s, v19.4s, v31.4s\n\t"
        "sub	%w[len], %w[len], #0x80\n\t"
        /* Done 128-byte block */
        "\n"
    "L_chacha_crypt_bytes_arm64_lt_128_%=:\n\t"
        "cmp	%w[len], #0\n\t"
        "b.eq	L_chacha_crypt_bytes_arm64_done_all_%=\n\t"
        "mov	%w[rol8], #0x40\n\t"
        "\n"
    "L_chacha_crypt_bytes_arm64_loop_64_%=:\n\t"
        /* Move state into vector registers */
        "mov	v0.16b, v16.16b\n\t"
        "mov	v1.16b, v17.16b\n\t"
        "mov	v2.16b, v18.16b\n\t"
        "mov	v3.16b, v19.16b\n\t"
        /* Set number of odd+even rounds to perform */
        "mov	x26, #10\n\t"
        "\n"
    "L_chacha_crypt_bytes_arm64_round_64_%=:\n\t"
        "subs	x26, x26, #1\n\t"
        /* Round odd */
        /* a += b; d ^= a; d <<<= 16; */
        "add	v0.4s, v0.4s, v1.4s\n\t"
        "eor	v3.16b, v3.16b, v0.16b\n\t"
        "rev32	v3.8h, v3.8h\n\t"
        /* c += d; b ^= c; b <<<= 12; */
        "add	v2.4s, v2.4s, v3.4s\n\t"
        "eor	v20.16b, v1.16b, v2.16b\n\t"
        "shl	v1.4s, v20.4s, #12\n\t"
        "sri	v1.4s, v20.4s, #20\n\t"
        /* a += b; d ^= a; d <<<= 8; */
        "add	v0.4s, v0.4s, v1.4s\n\t"
        "eor	v3.16b, v3.16b, v0.16b\n\t"
        "tbl	v3.16b, {v3.16b}, v30.16b\n\t"
        /* c += d; b ^= c; b <<<= 7; */
        "add	v2.4s, v2.4s, v3.4s\n\t"
        "eor	v20.16b, v1.16b, v2.16b\n\t"
        "shl	v1.4s, v20.4s, #7\n\t"
        "sri	v1.4s, v20.4s, #25\n\t"
        "ext	v3.16b, v3.16b, v3.16b, #12\n\t"
        "ext	v1.16b, v1.16b, v1.16b, #4\n\t"
        "ext	v2.16b, v2.16b, v2.16b, #8\n\t"
        /* Round even */
        /* a += b; d ^= a; d <<<= 16; */
        "add	v0.4s, v0.4s, v1.4s\n\t"
        "eor	v3.16b, v3.16b, v0.16b\n\t"
        "rev32	v3.8h, v3.8h\n\t"
        /* c += d; b ^= c; b <<<= 12; */
        "add	v2.4s, v2.4s, v3.4s\n\t"
        "eor	v20.16b, v1.16b, v2.16b\n\t"
        "shl	v1.4s, v20.4s, #12\n\t"
        "sri	v1.4s, v20.4s, #20\n\t"
        /* a += b; d ^= a; d <<<= 8; */
        "add	v0.4s, v0.4s, v1.4s\n\t"
        "eor	v3.16b, v3.16b, v0.16b\n\t"
        "tbl	v3.16b, {v3.16b}, v30.16b\n\t"
        /* c += d; b ^= c; b <<<= 7; */
        "add	v2.4s, v2.4s, v3.4s\n\t"
        "eor	v20.16b, v1.16b, v2.16b\n\t"
        "shl	v1.4s, v20.4s, #7\n\t"
        "sri	v1.4s, v20.4s, #25\n\t"
        "ext	v3.16b, v3.16b, v3.16b, #4\n\t"
        "ext	v1.16b, v1.16b, v1.16b, #12\n\t"
        "ext	v2.16b, v2.16b, v2.16b, #8\n\t"
        "b.ne	L_chacha_crypt_bytes_arm64_round_64_%=\n\t"
        /* Add back state */
        "add	v0.4s, v0.4s, v16.4s\n\t"
        "add	v1.4s, v1.4s, v17.4s\n\t"
        "add	v2.4s, v2.4s, v18.4s\n\t"
        "add	v3.4s, v3.4s, v19.4s\n\t"
        /* Check if data is less than 64 bytes - store in over */
        "cmp	%w[len], #0x40\n\t"
        "add	v19.4s, v19.4s, v31.4s\n\t"
        "b.lt	L_chacha_crypt_bytes_arm64_lt_64_%=\n\t"
        /* Encipher 64 bytes */
        "ld1	{v24.16b, v25.16b, v26.16b, v27.16b}, [%x[m]], #0x40\n\t"
        "eor	v24.16b, v24.16b, v0.16b\n\t"
        "eor	v25.16b, v25.16b, v1.16b\n\t"
        "eor	v26.16b, v26.16b, v2.16b\n\t"
        "eor	v27.16b, v27.16b, v3.16b\n\t"
        "st1	{v24.16b, v25.16b, v26.16b, v27.16b}, [%x[c]], #0x40\n\t"
        /* Check for more bytes to be enciphered */
        "subs	%w[len], %w[len], #0x40\n\t"
        "b.ne	L_chacha_crypt_bytes_arm64_loop_64_%=\n\t"
        "b	L_chacha_crypt_bytes_arm64_done_%=\n\t"
        "\n"
    "L_chacha_crypt_bytes_arm64_lt_64_%=:\n\t"
        /* Calculate bytes left in block not used */
        "sub	%w[rol8], %w[rol8], %w[len]\n\t"
        /* Store encipher block in over for further operations and left */
        "st1	{v0.4s, v1.4s, v2.4s, v3.4s}, [x4]\n\t"
        "str	%w[rol8], [%x[ctx], #64]\n\t"
        /* Encipher 32 bytes */
        "cmp	%w[len], #32\n\t"
        "b.lt	L_chacha_crypt_bytes_arm64_lt_32_%=\n\t"
        "ld1	{v24.16b, v25.16b}, [%x[m]], #32\n\t"
        "eor	v24.16b, v24.16b, v0.16b\n\t"
        "eor	v25.16b, v25.16b, v1.16b\n\t"
        "st1	{v24.16b, v25.16b}, [%x[c]], #32\n\t"
        "subs	%w[len], %w[len], #32\n\t"
        "mov	v0.16b, v2.16b\n\t"
        "mov	v1.16b, v3.16b\n\t"
        "b.eq	L_chacha_crypt_bytes_arm64_done_%=\n\t"
        "\n"
    "L_chacha_crypt_bytes_arm64_lt_32_%=:\n\t"
        "cmp	%w[len], #16\n\t"
        "b.lt	L_chacha_crypt_bytes_arm64_lt_16_%=\n\t"
        /* Encipher 16 bytes */
        "ld1	{v24.16b}, [%x[m]], #16\n\t"
        "eor	v24.16b, v24.16b, v0.16b\n\t"
        "st1	{v24.16b}, [%x[c]], #16\n\t"
        "subs	%w[len], %w[len], #16\n\t"
        "mov	v0.16b, v1.16b\n\t"
        "b.eq	L_chacha_crypt_bytes_arm64_done_%=\n\t"
        "\n"
    "L_chacha_crypt_bytes_arm64_lt_16_%=:\n\t"
        "cmp	%w[len], #8\n\t"
        "b.lt	L_chacha_crypt_bytes_arm64_lt_8_%=\n\t"
        /* Encipher 8 bytes */
        "ld1	{v24.8b}, [%x[m]], #8\n\t"
        "eor	v24.8b, v24.8b, v0.8b\n\t"
        "st1	{v24.8b}, [%x[c]], #8\n\t"
        "subs	%w[len], %w[len], #8\n\t"
        "mov	v0.d[0], v0.d[1]\n\t"
        "b.eq	L_chacha_crypt_bytes_arm64_done_%=\n\t"
        "\n"
    "L_chacha_crypt_bytes_arm64_lt_8_%=:\n\t"
        "mov	%[rol8], v0.d[0]\n\t"
        "\n"
    "L_chacha_crypt_bytes_arm64_loop_lt_8_%=:\n\t"
        /* Encipher 1 byte at a time */
        "ldrb	%w[ctr], [%x[m]], #1\n\t"
        "eor	%w[ctr], %w[ctr], %w[rol8]\n\t"
        "strb	%w[ctr], [%x[c]], #1\n\t"
        "subs	%w[len], %w[len], #1\n\t"
        "lsr	%[rol8], %[rol8], #8\n\t"
        "b.gt	L_chacha_crypt_bytes_arm64_loop_lt_8_%=\n\t"
        "\n"
    "L_chacha_crypt_bytes_arm64_done_%=:\n\t"
        "\n"
    "L_chacha_crypt_bytes_arm64_done_all_%=:\n\t"
        "st1	{v16.4s, v17.4s, v18.4s, v19.4s}, [%x[ctx]]\n\t"
        : [ctx] "+r" (ctx), [c] "+r" (c), [len] "+r" (len)
        : [m] "r" (m), [rol8] "r" (rol8), [ctr] "r" (ctr)
        : "memory", "cc", "x4", "x7", "x8", "x9", "x10", "x11", "x12", "x13",
            "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23",
            "x24", "x25", "x26", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
            "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
            "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
            "v27", "v28", "v29", "v30", "v31"
    );
}

/* Write the block counter and IV into the tail of a ChaCha state array.
 *
 * Byte offsets 48..63 of x are written:
 *   x[12]     <- counter
 *   x[13..15] <- 12 bytes read from iv
 *
 * @param [out] x        State array; only words 12..15 are touched.
 * @param [in]  iv       IV/nonce buffer; 12 bytes are read.
 * @param [in]  counter  Initial 32-bit block counter.
 */
void wc_chacha_setiv(word32* x, const byte* iv, word32 counter)
{
    __asm__ __volatile__ (
        /* Fetch the 12 IV bytes as one 8-byte and one 4-byte piece */
        "ldr	x3, [%x[iv]]\n\t"
        "ldr	w4, [%x[iv], #8]\n\t"
        /* counter is a 32-bit value: store the W view only. A 64-bit store
         * would also write the (unspecified) upper half of the register and,
         * on a big-endian target, would put that half into x[12]. */
        "str	%w[counter], [%x[x], #48]\n\t"
        /* IV occupies the three words after the counter */
        "str	x3, [%x[x], #52]\n\t"
        "str	w4, [%x[x], #60]\n\t"
        : [x] "+r" (x), [counter] "+r" (counter)
        : [iv] "r" (iv)
        : "memory", "cc", "x3", "x4"
    );
}

/* ChaCha constant words, one 16-byte row per key size.
 *
 * Viewed as little-endian ASCII the rows read "expand 16-byte k" and
 * "expand 32-byte k". wc_chacha_setkey() picks a row by adding
 * (keySz - 16) bytes to the address of this table, so the row order and
 * the 16-byte row size must not change.
 */
XALIGNED(8) static const word32 L_chacha_setkey_arm64_constant[] = {
    0x61707865, 0x3120646e, 0x79622d36, 0x6b206574,
    0x61707865, 0x3320646e, 0x79622d32, 0x6b206574,
};

/* Write the constant and key words of a ChaCha state array.
 *
 * 48 bytes are stored at x: 16 bytes of constant chosen by key size,
 * followed by 32 bytes of key material. With a 16-byte key the same
 * 16 key bytes are stored twice.
 *
 * @param [out] x      State array; words 0..11 are written.
 * @param [in]  key    Key bytes; keySz bytes are read.
 * @param [in]  keySz  Key length in bytes. Only 16 and 32 are handled here;
 *                     other values are assumed to be rejected by the caller.
 */
void wc_chacha_setkey(word32* x, const byte* key, word32 keySz)
{
    const word32* constant = L_chacha_setkey_arm64_constant;
    __asm__ __volatile__ (
        /* keySz is 32 bits wide, so operate on the W view: the flags then
         * depend only on the real value and the result is zero-extended
         * before being used as a byte offset into the constant table.
         * Z is set when the key is 16 bytes long and is tested further down;
         * nothing in between modifies the flags. */
        "subs	%w[keySz], %w[keySz], #16\n\t"
        "add	%[constant], %[constant], %x[keySz]\n\t"
        /* Start with constants */
        "ld1	{v0.4s}, [%[constant]]\n\t"
        /* First 16 bytes of key */
        "ld1	{v1.16b}, [%x[key]], #16\n\t"
#ifdef BIG_ENDIAN_ORDER
        /* NOTE(review): rev32 on .8h swaps the 16-bit halves of each word,
         * not its bytes - confirm this is the intended conversion. */
        "rev32	v1.8h, v1.8h\n\t"
#endif /* BIG_ENDIAN_ORDER */
        "st1	{v0.4s}, [%x[x]], #16\n\t"
        "st1	{v1.4s}, [%x[x]], #16\n\t"
        /* 16-byte key: reuse the bytes already in v1 for the second half */
        "b.eq	L_chacha_setkey_arm64_done_%=\n\t"
        /* 32-byte key: load the remaining 16 bytes */
        "ld1	{v1.16b}, [%x[key]]\n\t"
#ifdef BIG_ENDIAN_ORDER
        "rev32	v1.8h, v1.8h\n\t"
#endif /* BIG_ENDIAN_ORDER */
        "\n"
    "L_chacha_setkey_arm64_done_%=:\n\t"
        "st1	{v1.4s}, [%x[x]]\n\t"
        /* key and constant are advanced by the code above, so they must be
         * read-write operands rather than input-only ones. */
        : [x] "+r" (x), [key] "+r" (key), [keySz] "+r" (keySz),
          [constant] "+r" (constant)
        :
        : "memory", "cc", "v0", "v1"
    );
}

/* XOR len bytes at over with len bytes at input and write them to output.
 *
 * Data is processed in 16-byte chunks, then 4-byte words, then single
 * bytes. over is presumably the unused cipher stream saved by an earlier
 * call to wc_chacha_crypt_bytes() - confirm against the caller.
 *
 * @param [in]  over    Bytes to XOR into the message; len bytes are read.
 * @param [out] output  Destination buffer; len bytes are written.
 * @param [in]  input   Message bytes; len bytes are read.
 * @param [in]  len     Number of bytes to process. Zero is a no-op.
 */
void wc_chacha_use_over(byte* over, byte* output, const byte* input, word32 len)
{
    __asm__ __volatile__ (
        /* An empty request must not reach the byte loop: it would process
         * one byte, wrap len around and keep going. */
        "cbz	%w[len], L_chacha_use_over_arm64_done_%=\n\t"
        "\n"
    "L_chacha_use_over_arm64_16byte_loop_%=:\n\t"
        "cmp	%w[len], #16\n\t"
        "b.lt	L_chacha_use_over_arm64_word_loop_%=\n\t"
        /* 16 bytes of state XORed into message. */
        "ld1	{v0.16b}, [%x[over]], #16\n\t"
        "ld1	{v1.16b}, [%x[input]], #16\n\t"
        "eor	v1.16b, v1.16b, v0.16b\n\t"
        "subs	%w[len], %w[len], #16\n\t"
        "st1	{v1.16b}, [%x[output]], #16\n\t"
        "b.eq	L_chacha_use_over_arm64_done_%=\n\t"
        "b	L_chacha_use_over_arm64_16byte_loop_%=\n\t"
        "\n"
    "L_chacha_use_over_arm64_word_loop_%=:\n\t"
        "cmp	%w[len], #4\n\t"
        "b.lt	L_chacha_use_over_arm64_byte_loop_%=\n\t"
        /* 4 bytes of state XORed into message. */
        "ldr	w4, [%x[over]], #4\n\t"
        "ldr	w5, [%x[input]], #4\n\t"
        "eor	w5, w5, w4\n\t"
        "subs	%w[len], %w[len], #4\n\t"
        "str	w5, [%x[output]], #4\n\t"
        "b.eq	L_chacha_use_over_arm64_done_%=\n\t"
        "b	L_chacha_use_over_arm64_word_loop_%=\n\t"
        "\n"
    "L_chacha_use_over_arm64_byte_loop_%=:\n\t"
        /* 1 byte of state XORed into message. len is at least 1 here. */
        "ldrb	w4, [%x[over]], #1\n\t"
        "ldrb	w5, [%x[input]], #1\n\t"
        "eor	w5, w5, w4\n\t"
        "subs	%w[len], %w[len], #1\n\t"
        "strb	w5, [%x[output]], #1\n\t"
        "b.eq	L_chacha_use_over_arm64_done_%=\n\t"
        "b	L_chacha_use_over_arm64_byte_loop_%=\n\t"
        "\n"
    "L_chacha_use_over_arm64_done_%=:\n\t"
        /* input is advanced by the post-indexed loads, so it is a
         * read-write operand like the other pointers. */
        : [over] "+r" (over), [output] "+r" (output), [input] "+r" (input),
          [len] "+r" (len)
        :
        : "memory", "cc", "x4", "x5", "v0", "v1"
    );
}

#endif /* !WOLFSSL_ARMASM_NO_NEON */
#endif /* HAVE_CHACHA */
#endif /* WOLFSSL_ARMASM_INLINE */
#endif /* __aarch64__ */
#endif /* WOLFSSL_ARMASM */