/* armv8-chacha-asm
 *
 * Copyright (C) 2006-2026 wolfSSL Inc.
 *
 * This file is part of wolfSSL.
 *
 * wolfSSL is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * wolfSSL is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
 */

/* NOTE(review): the header names of the #include directives in this file
 * were not present in the reviewed text. The directives are kept exactly as
 * found and their targets must be restored before the file can compile. */
#include
#include

/* Generated using (from wolfssl):
 *   cd ../scripts
 *   ruby ./chacha/chacha.rb arm64 \
 *       ../wolfssl/wolfcrypt/src/port/arm/armv8-chacha-asm.c
 *
 * NOTE(review): because this file is generated, the operand-constraint,
 * operand-width and branch-condition corrections made below presumably need
 * to be mirrored in the generator script - confirm.
 */
#ifdef WOLFSSL_ARMASM
#ifdef __aarch64__
#ifdef WOLFSSL_ARMASM_INLINE
#ifdef HAVE_CHACHA
/* NOTE(review): presumably the wolfCrypt ChaCha header that declares ChaCha,
 * byte, word32 and XALIGNED - confirm. */
#include

/* Per-lane block counter offsets: added to the four copies of the counter
 * word so that lanes 0..3 work on four consecutive blocks. */
XALIGNED(8) static const word32 L_chacha20_arm64_ctr[] = {
    0x00000000, 0x00000001, 0x00000002, 0x00000003,
};

/* Byte indices for TBL that rotate every 32-bit lane left by 8 bits. */
XALIGNED(8) static const word32 L_chacha20_arm64_rol8[] = {
    0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f,
};

#ifndef WOLFSSL_ARMASM_NO_NEON
/* XOR len bytes of message with ChaCha20 keystream.
 *
 * The 16-word state is loaded from the start of ctx and written back there
 * (with the advanced block counter) on exit. Work is done in this order:
 *   - while len >= 320: five blocks per iteration, four in NEON lanes and a
 *     fifth interleaved in general-purpose registers;
 *   - at most one 256-byte pass (four blocks in NEON lanes);
 *   - at most one 128-byte pass (two blocks);
 *   - 64 bytes at a time for the remainder.
 * When the final block is only partially consumed, the whole keystream block
 * is stored at ctx + 0x44 and the count of unused bytes (64 - remaining) is
 * stored at ctx + 64 (presumably the over/left members of ChaCha - confirm
 * against the struct declaration).
 *
 * The length is compared with unsigned conditions (b.lo/b.hs) because len is
 * an unsigned 32-bit value; signed conditions would misroute lengths of
 * 2^31 bytes or more.
 *
 * m, rol8 and ctr are read-write operands: m is post-incremented, and the
 * registers holding rol8/ctr are reused as scratch once the tables have been
 * loaded. Input-only operands must not be modified by an asm statement.
 *
 * ctx  ChaCha object: state is read and updated.
 * c    Output buffer, len bytes.
 * m    Input buffer, len bytes.
 * len  Number of bytes to process.
 */
void wc_chacha_crypt_bytes(ChaCha* ctx, byte* c, const byte* m, word32 len)
{
    const word32* rol8 = L_chacha20_arm64_rol8;
    const word32* ctr = L_chacha20_arm64_ctr;
    __asm__ __volatile__ (
        /* v29 = {5,0,0,0}: counter step for a 320-byte iteration.
         * v31 = {1,0,0,0}: counter step for a single block.
         * v30 = rotate-left-8 table, v28 = lane counter offsets.
         * x4  = ctx + 0x44: where a partial keystream block is kept. */
        "eor	v29.16b, v29.16b, v29.16b\n\t"
        "mov	x26, #5\n\t"
        "eor	v31.16b, v31.16b, v31.16b\n\t"
        "mov	w7, #1\n\t"
        "ld1	{v30.16b}, [%[rol8]]\n\t"
        "ld1	{v28.4s}, [%[ctr]]\n\t"
        "add	x4, %x[ctx], #0x44\n\t"
        "mov	v29.s[0], w26\n\t"
        "mov	v31.s[0], w7\n\t"
        /* Load state to encrypt */
        "ld1	{v16.4s, v17.4s, v18.4s, v19.4s}, [%x[ctx]]\n\t"
        "cmp	%w[len], #0x140\n\t"
        "b.lo	L_chacha_crypt_bytes_arm64_lt_320_%=\n\t"
        /* The block held in general-purpose registers is the fifth one */
        "mov	w25, #4\n\t"
        "\n"
    "L_chacha_crypt_bytes_arm64_loop_320_%=:\n\t"
        /* Move state into regular register */
        "mov	x8, v16.d[0]\n\t"
        "mov	x10, v16.d[1]\n\t"
        "mov	x12, v17.d[0]\n\t"
        "mov	x14, v17.d[1]\n\t"
        "mov	x16, v18.d[0]\n\t"
        "mov	x19, v18.d[1]\n\t"
        "mov	x21, v19.d[0]\n\t"
        "mov	x23, v19.d[1]\n\t"
        "sub	%w[len], %w[len], #0x140\n\t"
        /* Move state into vector registers */
        "dup	v0.4s, v16.s[0]\n\t"
        "dup	v1.4s, v16.s[1]\n\t"
        "lsr	x9, x8, #32\n\t"
        "dup	v2.4s, v16.s[2]\n\t"
        "dup	v3.4s, v16.s[3]\n\t"
        "lsr	x11, x10, #32\n\t"
        "dup	v4.4s, v17.s[0]\n\t"
        "dup	v5.4s, v17.s[1]\n\t"
        "lsr	x13, x12, #32\n\t"
        "dup	v6.4s, v17.s[2]\n\t"
        "dup	v7.4s, v17.s[3]\n\t"
        "lsr	x15, x14, #32\n\t"
        "dup	v8.4s, v18.s[0]\n\t"
        "dup	v9.4s, v18.s[1]\n\t"
        "lsr	x17, x16, #32\n\t"
        "dup	v10.4s, v18.s[2]\n\t"
        "dup	v11.4s, v18.s[3]\n\t"
        "lsr	x20, x19, #32\n\t"
        "dup	v12.4s, v19.s[0]\n\t"
        "dup	v13.4s, v19.s[1]\n\t"
        "lsr	x22, x21, #32\n\t"
        "dup	v14.4s, v19.s[2]\n\t"
        "dup	v15.4s, v19.s[3]\n\t"
        "lsr	x24, x23, #32\n\t"
        /* Add to counter word */
        "add	v12.4s, v12.4s, v28.4s\n\t"
        "add	w21, w21, w25\n\t"
        /* Set number of odd+even rounds to perform */
        "mov	x26, #10\n\t"
        "\n"
    "L_chacha_crypt_bytes_arm64_round_start_320_%=:\n\t"
        /* Flags set here are consumed by the b.ne at the end of the round
         * pair; nothing in between modifies them. */
        "subs	x26, x26, #1\n\t"
        /* Round odd */
        /* a += b; d ^= a; d <<<= 16; */
        "add	v0.4s, v0.4s, v4.4s\n\t"
        "add	w8, w8, w12\n\t"
        "add	v1.4s, v1.4s, v5.4s\n\t"
        "add	w9, w9, w13\n\t"
        "add	v2.4s, v2.4s, v6.4s\n\t"
        "add	w10, w10, w14\n\t"
        "add	v3.4s, v3.4s, v7.4s\n\t"
        "add	w11, w11, w15\n\t"
        "eor	v12.16b, v12.16b, v0.16b\n\t"
        "eor	w21, w21, w8\n\t"
        "eor	v13.16b, v13.16b, v1.16b\n\t"
        "eor	w22, w22, w9\n\t"
        "eor	v14.16b, v14.16b, v2.16b\n\t"
        "eor	w23, w23, w10\n\t"
        "eor	v15.16b, v15.16b, v3.16b\n\t"
        "eor	w24, w24, w11\n\t"
        "rev32	v12.8h, v12.8h\n\t"
        "ror	w21, w21, #16\n\t"
        "rev32	v13.8h, v13.8h\n\t"
        "ror	w22, w22, #16\n\t"
        "rev32	v14.8h, v14.8h\n\t"
        "ror	w23, w23, #16\n\t"
        "rev32	v15.8h, v15.8h\n\t"
        "ror	w24, w24, #16\n\t"
        /* c += d; b ^= c; b <<<= 12; */
        "add	v8.4s, v8.4s, v12.4s\n\t"
        "add	w16, w16, w21\n\t"
        "add	v9.4s, v9.4s, v13.4s\n\t"
        "add	w17, w17, w22\n\t"
        "add	v10.4s, v10.4s, v14.4s\n\t"
        "add	w19, w19, w23\n\t"
        "add	v11.4s, v11.4s, v15.4s\n\t"
        "add	w20, w20, w24\n\t"
        "eor	v20.16b, v4.16b, v8.16b\n\t"
        "eor	w12, w12, w16\n\t"
        "eor	v21.16b, v5.16b, v9.16b\n\t"
        "eor	w13, w13, w17\n\t"
        "eor	v22.16b, v6.16b, v10.16b\n\t"
        "eor	w14, w14, w19\n\t"
        "eor	v23.16b, v7.16b, v11.16b\n\t"
        "eor	w15, w15, w20\n\t"
        "shl	v4.4s, v20.4s, #12\n\t"
        "ror	w12, w12, #20\n\t"
        "shl	v5.4s, v21.4s, #12\n\t"
        "ror	w13, w13, #20\n\t"
        "shl	v6.4s, v22.4s, #12\n\t"
        "ror	w14, w14, #20\n\t"
        "shl	v7.4s, v23.4s, #12\n\t"
        "ror	w15, w15, #20\n\t"
        "sri	v4.4s, v20.4s, #20\n\t"
        "sri	v5.4s, v21.4s, #20\n\t"
        "sri	v6.4s, v22.4s, #20\n\t"
        "sri	v7.4s, v23.4s, #20\n\t"
        /* a += b; d ^= a; d <<<= 8; */
        "add	v0.4s, v0.4s, v4.4s\n\t"
        "add	w8, w8, w12\n\t"
        "add	v1.4s, v1.4s, v5.4s\n\t"
        "add	w9, w9, w13\n\t"
        "add	v2.4s, v2.4s, v6.4s\n\t"
        "add	w10, w10, w14\n\t"
        "add	v3.4s, v3.4s, v7.4s\n\t"
        "add	w11, w11, w15\n\t"
        "eor	v12.16b, v12.16b, v0.16b\n\t"
        "eor	w21, w21, w8\n\t"
        "eor	v13.16b, v13.16b, v1.16b\n\t"
        "eor	w22, w22, w9\n\t"
        "eor	v14.16b, v14.16b, v2.16b\n\t"
        "eor	w23, w23, w10\n\t"
        "eor	v15.16b, v15.16b, v3.16b\n\t"
        "eor	w24, w24, w11\n\t"
        "tbl	v12.16b, {v12.16b}, v30.16b\n\t"
        "ror	w21, w21, #24\n\t"
        "tbl	v13.16b, {v13.16b}, v30.16b\n\t"
        "ror	w22, w22, #24\n\t"
        "tbl	v14.16b, {v14.16b}, v30.16b\n\t"
        "ror	w23, w23, #24\n\t"
        "tbl	v15.16b, {v15.16b}, v30.16b\n\t"
        "ror	w24, w24, #24\n\t"
        /* c += d; b ^= c; b <<<= 7; */
        "add	v8.4s, v8.4s, v12.4s\n\t"
        "add	w16, w16, w21\n\t"
        "add	v9.4s, v9.4s, v13.4s\n\t"
        "add	w17, w17, w22\n\t"
        "add	v10.4s, v10.4s, v14.4s\n\t"
        "add	w19, w19, w23\n\t"
        "add	v11.4s, v11.4s, v15.4s\n\t"
        "add	w20, w20, w24\n\t"
        "eor	v20.16b, v4.16b, v8.16b\n\t"
        "eor	w12, w12, w16\n\t"
        "eor	v21.16b, v5.16b, v9.16b\n\t"
        "eor	w13, w13, w17\n\t"
        "eor	v22.16b, v6.16b, v10.16b\n\t"
        "eor	w14, w14, w19\n\t"
        "eor	v23.16b, v7.16b, v11.16b\n\t"
        "eor	w15, w15, w20\n\t"
        "shl	v4.4s, v20.4s, #7\n\t"
        "ror	w12, w12, #25\n\t"
        "shl	v5.4s, v21.4s, #7\n\t"
        "ror	w13, w13, #25\n\t"
        "shl	v6.4s, v22.4s, #7\n\t"
        "ror	w14, w14, #25\n\t"
        "shl	v7.4s, v23.4s, #7\n\t"
        "ror	w15, w15, #25\n\t"
        "sri	v4.4s, v20.4s, #25\n\t"
        "sri	v5.4s, v21.4s, #25\n\t"
        "sri	v6.4s, v22.4s, #25\n\t"
        "sri	v7.4s, v23.4s, #25\n\t"
        /* Round even */
        /* a += b; d ^= a; d <<<= 16; */
        "add	v0.4s, v0.4s, v5.4s\n\t"
        "add	w8, w8, w13\n\t"
        "add	v1.4s, v1.4s, v6.4s\n\t"
        "add	w9, w9, w14\n\t"
        "add	v2.4s, v2.4s, v7.4s\n\t"
        "add	w10, w10, w15\n\t"
        "add	v3.4s, v3.4s, v4.4s\n\t"
        "add	w11, w11, w12\n\t"
        "eor	v15.16b, v15.16b, v0.16b\n\t"
        "eor	w24, w24, w8\n\t"
        "eor	v12.16b, v12.16b, v1.16b\n\t"
        "eor	w21, w21, w9\n\t"
        "eor	v13.16b, v13.16b, v2.16b\n\t"
        "eor	w22, w22, w10\n\t"
        "eor	v14.16b, v14.16b, v3.16b\n\t"
        "eor	w23, w23, w11\n\t"
        "rev32	v15.8h, v15.8h\n\t"
        "ror	w24, w24, #16\n\t"
        "rev32	v12.8h, v12.8h\n\t"
        "ror	w21, w21, #16\n\t"
        "rev32	v13.8h, v13.8h\n\t"
        "ror	w22, w22, #16\n\t"
        "rev32	v14.8h, v14.8h\n\t"
        "ror	w23, w23, #16\n\t"
        /* c += d; b ^= c; b <<<= 12; */
        "add	v10.4s, v10.4s, v15.4s\n\t"
        "add	w19, w19, w24\n\t"
        "add	v11.4s, v11.4s, v12.4s\n\t"
        "add	w20, w20, w21\n\t"
        "add	v8.4s, v8.4s, v13.4s\n\t"
        "add	w16, w16, w22\n\t"
        "add	v9.4s, v9.4s, v14.4s\n\t"
        "add	w17, w17, w23\n\t"
        "eor	v20.16b, v5.16b, v10.16b\n\t"
        "eor	w13, w13, w19\n\t"
        "eor	v21.16b, v6.16b, v11.16b\n\t"
        "eor	w14, w14, w20\n\t"
        "eor	v22.16b, v7.16b, v8.16b\n\t"
        "eor	w15, w15, w16\n\t"
        "eor	v23.16b, v4.16b, v9.16b\n\t"
        "eor	w12, w12, w17\n\t"
        "shl	v5.4s, v20.4s, #12\n\t"
        "ror	w13, w13, #20\n\t"
        "shl	v6.4s, v21.4s, #12\n\t"
        "ror	w14, w14, #20\n\t"
        "shl	v7.4s, v22.4s, #12\n\t"
        "ror	w15, w15, #20\n\t"
        "shl	v4.4s, v23.4s, #12\n\t"
        "ror	w12, w12, #20\n\t"
        "sri	v5.4s, v20.4s, #20\n\t"
        "sri	v6.4s, v21.4s, #20\n\t"
        "sri	v7.4s, v22.4s, #20\n\t"
        "sri	v4.4s, v23.4s, #20\n\t"
        /* a += b; d ^= a; d <<<= 8; */
        "add	v0.4s, v0.4s, v5.4s\n\t"
        "add	w8, w8, w13\n\t"
        "add	v1.4s, v1.4s, v6.4s\n\t"
        "add	w9, w9, w14\n\t"
        "add	v2.4s, v2.4s, v7.4s\n\t"
        "add	w10, w10, w15\n\t"
        "add	v3.4s, v3.4s, v4.4s\n\t"
        "add	w11, w11, w12\n\t"
        "eor	v15.16b, v15.16b, v0.16b\n\t"
        "eor	w24, w24, w8\n\t"
        "eor	v12.16b, v12.16b, v1.16b\n\t"
        "eor	w21, w21, w9\n\t"
        "eor	v13.16b, v13.16b, v2.16b\n\t"
        "eor	w22, w22, w10\n\t"
        "eor	v14.16b, v14.16b, v3.16b\n\t"
        "eor	w23, w23, w11\n\t"
        "tbl	v15.16b, {v15.16b}, v30.16b\n\t"
        "ror	w24, w24, #24\n\t"
        "tbl	v12.16b, {v12.16b}, v30.16b\n\t"
        "ror	w21, w21, #24\n\t"
        "tbl	v13.16b, {v13.16b}, v30.16b\n\t"
        "ror	w22, w22, #24\n\t"
        "tbl	v14.16b, {v14.16b}, v30.16b\n\t"
        "ror	w23, w23, #24\n\t"
        /* c += d; b ^= c; b <<<= 7; */
        "add	v10.4s, v10.4s, v15.4s\n\t"
        "add	w19, w19, w24\n\t"
        "add	v11.4s, v11.4s, v12.4s\n\t"
        "add	w20, w20, w21\n\t"
        "add	v8.4s, v8.4s, v13.4s\n\t"
        "add	w16, w16, w22\n\t"
        "add	v9.4s, v9.4s, v14.4s\n\t"
        "add	w17, w17, w23\n\t"
        "eor	v20.16b, v5.16b, v10.16b\n\t"
        "eor	w13, w13, w19\n\t"
        "eor	v21.16b, v6.16b, v11.16b\n\t"
        "eor	w14, w14, w20\n\t"
        "eor	v22.16b, v7.16b, v8.16b\n\t"
        "eor	w15, w15, w16\n\t"
        "eor	v23.16b, v4.16b, v9.16b\n\t"
        "eor	w12, w12, w17\n\t"
        "shl	v5.4s, v20.4s, #7\n\t"
        "ror	w13, w13, #25\n\t"
        "shl	v6.4s, v21.4s, #7\n\t"
        "ror	w14, w14, #25\n\t"
        "shl	v7.4s, v22.4s, #7\n\t"
        "ror	w15, w15, #25\n\t"
        "shl	v4.4s, v23.4s, #7\n\t"
        "ror	w12, w12, #25\n\t"
        "sri	v5.4s, v20.4s, #25\n\t"
        "sri	v6.4s, v21.4s, #25\n\t"
        "sri	v7.4s, v22.4s, #25\n\t"
        "sri	v4.4s, v23.4s, #25\n\t"
        "b.ne	L_chacha_crypt_bytes_arm64_round_start_320_%=\n\t"
        /* Add counter now rather than after transposed */
        "add	v12.4s, v12.4s, v28.4s\n\t"
        "add	w21, w21, w25\n\t"
        /* Load message */
        "ld1	{v24.16b, v25.16b, v26.16b, v27.16b}, [%x[m]], #0x40\n\t"
        /* Transpose vectors */
        "trn1	v20.4s, v0.4s, v1.4s\n\t"
        "trn1	v22.4s, v2.4s, v3.4s\n\t"
        "orr	x8, x8, x9, lsl 32\n\t"
        "trn2	v21.4s, v0.4s, v1.4s\n\t"
        "trn2	v23.4s, v2.4s, v3.4s\n\t"
        "trn1	v0.2d, v20.2d, v22.2d\n\t"
        "trn1	v1.2d, v21.2d, v23.2d\n\t"
        "orr	x10, x10, x11, lsl 32\n\t"
        "trn2	v2.2d, v20.2d, v22.2d\n\t"
        "trn2	v3.2d, v21.2d, v23.2d\n\t"
        "trn1	v20.4s, v4.4s, v5.4s\n\t"
        "trn1	v22.4s, v6.4s, v7.4s\n\t"
        "orr	x12, x12, x13, lsl 32\n\t"
        "trn2	v21.4s, v4.4s, v5.4s\n\t"
        "trn2	v23.4s, v6.4s, v7.4s\n\t"
        "trn1	v4.2d, v20.2d, v22.2d\n\t"
        "trn1	v5.2d, v21.2d, v23.2d\n\t"
        "orr	x14, x14, x15, lsl 32\n\t"
        "trn2	v6.2d, v20.2d, v22.2d\n\t"
        "trn2	v7.2d, v21.2d, v23.2d\n\t"
        "trn1	v20.4s, v8.4s, v9.4s\n\t"
        "trn1	v22.4s, v10.4s, v11.4s\n\t"
        "orr	x16, x16, x17, lsl 32\n\t"
        "trn2	v21.4s, v8.4s, v9.4s\n\t"
        "trn2	v23.4s, v10.4s, v11.4s\n\t"
        "trn1	v8.2d, v20.2d, v22.2d\n\t"
        "trn1	v9.2d, v21.2d, v23.2d\n\t"
        "orr	x19, x19, x20, lsl 32\n\t"
        "trn2	v10.2d, v20.2d, v22.2d\n\t"
        "trn2	v11.2d, v21.2d, v23.2d\n\t"
        "trn1	v20.4s, v12.4s, v13.4s\n\t"
        "trn1	v22.4s, v14.4s, v15.4s\n\t"
        "orr	x21, x21, x22, lsl 32\n\t"
        "trn2	v21.4s, v12.4s, v13.4s\n\t"
        "trn2	v23.4s, v14.4s, v15.4s\n\t"
        "trn1	v12.2d, v20.2d, v22.2d\n\t"
        "trn1	v13.2d, v21.2d, v23.2d\n\t"
        "orr	x23, x23, x24, lsl 32\n\t"
        "trn2	v14.2d, v20.2d, v22.2d\n\t"
        "trn2	v15.2d, v21.2d, v23.2d\n\t"
        /* Add back state, XOR in message and store (load next block) */
        "add	v20.4s, v0.4s, v16.4s\n\t"
        "add	v21.4s, v4.4s, v17.4s\n\t"
        "add	v22.4s, v8.4s, v18.4s\n\t"
        "add	v23.4s, v12.4s, v19.4s\n\t"
        "eor	v20.16b, v20.16b, v24.16b\n\t"
        "eor	v21.16b, v21.16b, v25.16b\n\t"
        "eor	v22.16b, v22.16b, v26.16b\n\t"
        "eor	v23.16b, v23.16b, v27.16b\n\t"
        "ld1	{v24.16b, v25.16b, v26.16b, v27.16b}, [%x[m]], #0x40\n\t"
        "st1	{v20.16b, v21.16b, v22.16b, v23.16b}, [%x[c]], #0x40\n\t"
        "add	v20.4s, v1.4s, v16.4s\n\t"
        "add	v21.4s, v5.4s, v17.4s\n\t"
        "add	v22.4s, v9.4s, v18.4s\n\t"
        "add	v23.4s, v13.4s, v19.4s\n\t"
        "eor	v20.16b, v20.16b, v24.16b\n\t"
        "eor	v21.16b, v21.16b, v25.16b\n\t"
        "eor	v22.16b, v22.16b, v26.16b\n\t"
        "eor	v23.16b, v23.16b, v27.16b\n\t"
        "ld1	{v24.16b, v25.16b, v26.16b, v27.16b}, [%x[m]], #0x40\n\t"
        "st1	{v20.16b, v21.16b, v22.16b, v23.16b}, [%x[c]], #0x40\n\t"
        "add	v20.4s, v2.4s, v16.4s\n\t"
        "add	v21.4s, v6.4s, v17.4s\n\t"
        "add	v22.4s, v10.4s, v18.4s\n\t"
        "add	v23.4s, v14.4s, v19.4s\n\t"
        "eor	v20.16b, v20.16b, v24.16b\n\t"
        "eor	v21.16b, v21.16b, v25.16b\n\t"
        "eor	v22.16b, v22.16b, v26.16b\n\t"
        "eor	v23.16b, v23.16b, v27.16b\n\t"
        "ld1	{v24.16b, v25.16b, v26.16b, v27.16b}, [%x[m]], #0x40\n\t"
        "st1	{v20.16b, v21.16b, v22.16b, v23.16b}, [%x[c]], #0x40\n\t"
        "add	v20.4s, v3.4s, v16.4s\n\t"
        "add	v21.4s, v7.4s, v17.4s\n\t"
        "add	v22.4s, v11.4s, v18.4s\n\t"
        "add	v23.4s, v15.4s, v19.4s\n\t"
        "eor	v20.16b, v20.16b, v24.16b\n\t"
        "eor	v21.16b, v21.16b, v25.16b\n\t"
        "eor	v22.16b, v22.16b, v26.16b\n\t"
        "eor	v23.16b, v23.16b, v27.16b\n\t"
        "ld1	{v24.16b, v25.16b, v26.16b, v27.16b}, [%x[m]], #0x40\n\t"
        "st1	{v20.16b, v21.16b, v22.16b, v23.16b}, [%x[c]], #0x40\n\t"
        /* Move regular registers into vector registers for adding and xor */
        "mov	v0.d[0], x8\n\t"
        "mov	v0.d[1], x10\n\t"
        "mov	v1.d[0], x12\n\t"
        "mov	v1.d[1], x14\n\t"
        "mov	v2.d[0], x16\n\t"
        "mov	v2.d[1], x19\n\t"
        "mov	v3.d[0], x21\n\t"
        "mov	v3.d[1], x23\n\t"
        /* Add back state, XOR in message and store */
        "add	v0.4s, v0.4s, v16.4s\n\t"
        "add	v1.4s, v1.4s, v17.4s\n\t"
        "add	v2.4s, v2.4s, v18.4s\n\t"
        "add	v3.4s, v3.4s, v19.4s\n\t"
        "eor	v0.16b, v0.16b, v24.16b\n\t"
        "eor	v1.16b, v1.16b, v25.16b\n\t"
        "eor	v2.16b, v2.16b, v26.16b\n\t"
        "eor	v3.16b, v3.16b, v27.16b\n\t"
        "st1	{v0.4s, v1.4s, v2.4s, v3.4s}, [%x[c]], #0x40\n\t"
        "cmp	%w[len], #0x140\n\t"
        /* Advance the state's block counter by five (vector add leaves the
         * flags from the cmp untouched) */
        "add	v19.4s, v19.4s, v29.4s\n\t"
        "b.hs	L_chacha_crypt_bytes_arm64_loop_320_%=\n\t"
        /* Done doing 320 bytes at a time */
        "\n"
    "L_chacha_crypt_bytes_arm64_lt_320_%=:\n\t"
        "cmp	%w[len], #0x100\n\t"
        "b.lo	L_chacha_crypt_bytes_arm64_lt_256_%=\n\t"
        /* Move state into vector registers */
        "dup	v0.4s, v16.s[0]\n\t"
        "dup	v1.4s, v16.s[1]\n\t"
        "dup	v2.4s, v16.s[2]\n\t"
        "dup	v3.4s, v16.s[3]\n\t"
        "dup	v4.4s, v17.s[0]\n\t"
        "dup	v5.4s, v17.s[1]\n\t"
        "dup	v6.4s, v17.s[2]\n\t"
        "dup	v7.4s, v17.s[3]\n\t"
        "dup	v8.4s, v18.s[0]\n\t"
        "dup	v9.4s, v18.s[1]\n\t"
        "dup	v10.4s, v18.s[2]\n\t"
        "dup	v11.4s, v18.s[3]\n\t"
        "dup	v12.4s, v19.s[0]\n\t"
        "dup	v13.4s, v19.s[1]\n\t"
        "dup	v14.4s, v19.s[2]\n\t"
        "dup	v15.4s, v19.s[3]\n\t"
        /* Add to counter word */
        "add	v12.4s, v12.4s, v28.4s\n\t"
        /* Set number of odd+even rounds to perform */
        "mov	x26, #10\n\t"
        "\n"
    "L_chacha_crypt_bytes_arm64_round_start_256_%=:\n\t"
        "subs	x26, x26, #1\n\t"
        /* Round odd */
        /* a += b; d ^= a; d <<<= 16; */
        "add	v0.4s, v0.4s, v4.4s\n\t"
        "add	v1.4s, v1.4s, v5.4s\n\t"
        "add	v2.4s, v2.4s, v6.4s\n\t"
        "add	v3.4s, v3.4s, v7.4s\n\t"
        "eor	v12.16b, v12.16b, v0.16b\n\t"
        "eor	v13.16b, v13.16b, v1.16b\n\t"
        "eor	v14.16b, v14.16b, v2.16b\n\t"
        "eor	v15.16b, v15.16b, v3.16b\n\t"
        "rev32	v12.8h, v12.8h\n\t"
        "rev32	v13.8h, v13.8h\n\t"
        "rev32	v14.8h, v14.8h\n\t"
        "rev32	v15.8h, v15.8h\n\t"
        /* c += d; b ^= c; b <<<= 12; */
        "add	v8.4s, v8.4s, v12.4s\n\t"
        "add	v9.4s, v9.4s, v13.4s\n\t"
        "add	v10.4s, v10.4s, v14.4s\n\t"
        "add	v11.4s, v11.4s, v15.4s\n\t"
        "eor	v20.16b, v4.16b, v8.16b\n\t"
        "eor	v21.16b, v5.16b, v9.16b\n\t"
        "eor	v22.16b, v6.16b, v10.16b\n\t"
        "eor	v23.16b, v7.16b, v11.16b\n\t"
        "shl	v4.4s, v20.4s, #12\n\t"
        "shl	v5.4s, v21.4s, #12\n\t"
        "shl	v6.4s, v22.4s, #12\n\t"
        "shl	v7.4s, v23.4s, #12\n\t"
        "sri	v4.4s, v20.4s, #20\n\t"
        "sri	v5.4s, v21.4s, #20\n\t"
        "sri	v6.4s, v22.4s, #20\n\t"
        "sri	v7.4s, v23.4s, #20\n\t"
        /* a += b; d ^= a; d <<<= 8; */
        "add	v0.4s, v0.4s, v4.4s\n\t"
        "add	v1.4s, v1.4s, v5.4s\n\t"
        "add	v2.4s, v2.4s, v6.4s\n\t"
        "add	v3.4s, v3.4s, v7.4s\n\t"
        "eor	v12.16b, v12.16b, v0.16b\n\t"
        "eor	v13.16b, v13.16b, v1.16b\n\t"
        "eor	v14.16b, v14.16b, v2.16b\n\t"
        "eor	v15.16b, v15.16b, v3.16b\n\t"
        "tbl	v12.16b, {v12.16b}, v30.16b\n\t"
        "tbl	v13.16b, {v13.16b}, v30.16b\n\t"
        "tbl	v14.16b, {v14.16b}, v30.16b\n\t"
        "tbl	v15.16b, {v15.16b}, v30.16b\n\t"
        /* c += d; b ^= c; b <<<= 7; */
        "add	v8.4s, v8.4s, v12.4s\n\t"
        "add	v9.4s, v9.4s, v13.4s\n\t"
        "add	v10.4s, v10.4s, v14.4s\n\t"
        "add	v11.4s, v11.4s, v15.4s\n\t"
        "eor	v20.16b, v4.16b, v8.16b\n\t"
        "eor	v21.16b, v5.16b, v9.16b\n\t"
        "eor	v22.16b, v6.16b, v10.16b\n\t"
        "eor	v23.16b, v7.16b, v11.16b\n\t"
        "shl	v4.4s, v20.4s, #7\n\t"
        "shl	v5.4s, v21.4s, #7\n\t"
        "shl	v6.4s, v22.4s, #7\n\t"
        "shl	v7.4s, v23.4s, #7\n\t"
        "sri	v4.4s, v20.4s, #25\n\t"
        "sri	v5.4s, v21.4s, #25\n\t"
        "sri	v6.4s, v22.4s, #25\n\t"
        "sri	v7.4s, v23.4s, #25\n\t"
        /* Round even */
        /* a += b; d ^= a; d <<<= 16; */
        "add	v0.4s, v0.4s, v5.4s\n\t"
        "add	v1.4s, v1.4s, v6.4s\n\t"
        "add	v2.4s, v2.4s, v7.4s\n\t"
        "add	v3.4s, v3.4s, v4.4s\n\t"
        "eor	v15.16b, v15.16b, v0.16b\n\t"
        "eor	v12.16b, v12.16b, v1.16b\n\t"
        "eor	v13.16b, v13.16b, v2.16b\n\t"
        "eor	v14.16b, v14.16b, v3.16b\n\t"
        "rev32	v15.8h, v15.8h\n\t"
        "rev32	v12.8h, v12.8h\n\t"
        "rev32	v13.8h, v13.8h\n\t"
        "rev32	v14.8h, v14.8h\n\t"
        /* c += d; b ^= c; b <<<= 12; */
        "add	v10.4s, v10.4s, v15.4s\n\t"
        "add	v11.4s, v11.4s, v12.4s\n\t"
        "add	v8.4s, v8.4s, v13.4s\n\t"
        "add	v9.4s, v9.4s, v14.4s\n\t"
        "eor	v20.16b, v5.16b, v10.16b\n\t"
        "eor	v21.16b, v6.16b, v11.16b\n\t"
        "eor	v22.16b, v7.16b, v8.16b\n\t"
        "eor	v23.16b, v4.16b, v9.16b\n\t"
        "shl	v5.4s, v20.4s, #12\n\t"
        "shl	v6.4s, v21.4s, #12\n\t"
        "shl	v7.4s, v22.4s, #12\n\t"
        "shl	v4.4s, v23.4s, #12\n\t"
        "sri	v5.4s, v20.4s, #20\n\t"
        "sri	v6.4s, v21.4s, #20\n\t"
        "sri	v7.4s, v22.4s, #20\n\t"
        "sri	v4.4s, v23.4s, #20\n\t"
        /* a += b; d ^= a; d <<<= 8; */
        "add	v0.4s, v0.4s, v5.4s\n\t"
        "add	v1.4s, v1.4s, v6.4s\n\t"
        "add	v2.4s, v2.4s, v7.4s\n\t"
        "add	v3.4s, v3.4s, v4.4s\n\t"
        "eor	v15.16b, v15.16b, v0.16b\n\t"
        "eor	v12.16b, v12.16b, v1.16b\n\t"
        "eor	v13.16b, v13.16b, v2.16b\n\t"
        "eor	v14.16b, v14.16b, v3.16b\n\t"
        "tbl	v15.16b, {v15.16b}, v30.16b\n\t"
        "tbl	v12.16b, {v12.16b}, v30.16b\n\t"
        "tbl	v13.16b, {v13.16b}, v30.16b\n\t"
        "tbl	v14.16b, {v14.16b}, v30.16b\n\t"
        /* c += d; b ^= c; b <<<= 7; */
        "add	v10.4s, v10.4s, v15.4s\n\t"
        "add	v11.4s, v11.4s, v12.4s\n\t"
        "add	v8.4s, v8.4s, v13.4s\n\t"
        "add	v9.4s, v9.4s, v14.4s\n\t"
        "eor	v20.16b, v5.16b, v10.16b\n\t"
        "eor	v21.16b, v6.16b, v11.16b\n\t"
        "eor	v22.16b, v7.16b, v8.16b\n\t"
        "eor	v23.16b, v4.16b, v9.16b\n\t"
        "shl	v5.4s, v20.4s, #7\n\t"
        "shl	v6.4s, v21.4s, #7\n\t"
        "shl	v7.4s, v22.4s, #7\n\t"
        "shl	v4.4s, v23.4s, #7\n\t"
        "sri	v5.4s, v20.4s, #25\n\t"
        "sri	v6.4s, v21.4s, #25\n\t"
        "sri	v7.4s, v22.4s, #25\n\t"
        "sri	v4.4s, v23.4s, #25\n\t"
        "b.ne	L_chacha_crypt_bytes_arm64_round_start_256_%=\n\t"
        /* Counter step for this pass: four blocks */
        "mov	x26, #4\n\t"
        /* Add counter now rather than after transposed */
        "add	v12.4s, v12.4s, v28.4s\n\t"
        /* Load message */
        "ld1	{v24.16b, v25.16b, v26.16b, v27.16b}, [%x[m]], #0x40\n\t"
        /* Transpose vectors */
        "trn1	v20.4s, v0.4s, v1.4s\n\t"
        "trn1	v22.4s, v2.4s, v3.4s\n\t"
        "trn2	v21.4s, v0.4s, v1.4s\n\t"
        "trn2	v23.4s, v2.4s, v3.4s\n\t"
        "trn1	v0.2d, v20.2d, v22.2d\n\t"
        "trn1	v1.2d, v21.2d, v23.2d\n\t"
        "trn2	v2.2d, v20.2d, v22.2d\n\t"
        "trn2	v3.2d, v21.2d, v23.2d\n\t"
        "trn1	v20.4s, v4.4s, v5.4s\n\t"
        "trn1	v22.4s, v6.4s, v7.4s\n\t"
        "trn2	v21.4s, v4.4s, v5.4s\n\t"
        "trn2	v23.4s, v6.4s, v7.4s\n\t"
        "trn1	v4.2d, v20.2d, v22.2d\n\t"
        "trn1	v5.2d, v21.2d, v23.2d\n\t"
        "trn2	v6.2d, v20.2d, v22.2d\n\t"
        "trn2	v7.2d, v21.2d, v23.2d\n\t"
        "trn1	v20.4s, v8.4s, v9.4s\n\t"
        "trn1	v22.4s, v10.4s, v11.4s\n\t"
        "trn2	v21.4s, v8.4s, v9.4s\n\t"
        "trn2	v23.4s, v10.4s, v11.4s\n\t"
        "trn1	v8.2d, v20.2d, v22.2d\n\t"
        "trn1	v9.2d, v21.2d, v23.2d\n\t"
        "trn2	v10.2d, v20.2d, v22.2d\n\t"
        "trn2	v11.2d, v21.2d, v23.2d\n\t"
        "trn1	v20.4s, v12.4s, v13.4s\n\t"
        "trn1	v22.4s, v14.4s, v15.4s\n\t"
        "trn2	v21.4s, v12.4s, v13.4s\n\t"
        "trn2	v23.4s, v14.4s, v15.4s\n\t"
        "trn1	v12.2d, v20.2d, v22.2d\n\t"
        "trn1	v13.2d, v21.2d, v23.2d\n\t"
        "trn2	v14.2d, v20.2d, v22.2d\n\t"
        "trn2	v15.2d, v21.2d, v23.2d\n\t"
        /* Add back state, XOR in message and store (load next block) */
        "add	v20.4s, v0.4s, v16.4s\n\t"
        "add	v21.4s, v4.4s, v17.4s\n\t"
        "add	v22.4s, v8.4s, v18.4s\n\t"
        "add	v23.4s, v12.4s, v19.4s\n\t"
        "eor	v20.16b, v20.16b, v24.16b\n\t"
        "eor	v21.16b, v21.16b, v25.16b\n\t"
        "eor	v22.16b, v22.16b, v26.16b\n\t"
        "eor	v23.16b, v23.16b, v27.16b\n\t"
        "ld1	{v24.16b, v25.16b, v26.16b, v27.16b}, [%x[m]], #0x40\n\t"
        "st1	{v20.16b, v21.16b, v22.16b, v23.16b}, [%x[c]], #0x40\n\t"
        "add	v20.4s, v1.4s, v16.4s\n\t"
        "add	v21.4s, v5.4s, v17.4s\n\t"
        "add	v22.4s, v9.4s, v18.4s\n\t"
        "add	v23.4s, v13.4s, v19.4s\n\t"
        "eor	v20.16b, v20.16b, v24.16b\n\t"
        "eor	v21.16b, v21.16b, v25.16b\n\t"
        "eor	v22.16b, v22.16b, v26.16b\n\t"
        "eor	v23.16b, v23.16b, v27.16b\n\t"
        "ld1	{v24.16b, v25.16b, v26.16b, v27.16b}, [%x[m]], #0x40\n\t"
        "st1	{v20.16b, v21.16b, v22.16b, v23.16b}, [%x[c]], #0x40\n\t"
        "add	v20.4s, v2.4s, v16.4s\n\t"
        "add	v21.4s, v6.4s, v17.4s\n\t"
        "add	v22.4s, v10.4s, v18.4s\n\t"
        "add	v23.4s, v14.4s, v19.4s\n\t"
        "eor	v20.16b, v20.16b, v24.16b\n\t"
        "eor	v21.16b, v21.16b, v25.16b\n\t"
        "eor	v22.16b, v22.16b, v26.16b\n\t"
        "eor	v23.16b, v23.16b, v27.16b\n\t"
        "ld1	{v24.16b, v25.16b, v26.16b, v27.16b}, [%x[m]], #0x40\n\t"
        "st1	{v20.16b, v21.16b, v22.16b, v23.16b}, [%x[c]], #0x40\n\t"
        "add	v20.4s, v3.4s, v16.4s\n\t"
        "add	v21.4s, v7.4s, v17.4s\n\t"
        "add	v22.4s, v11.4s, v18.4s\n\t"
        "add	v23.4s, v15.4s, v19.4s\n\t"
        "eor	v20.16b, v20.16b, v24.16b\n\t"
        "eor	v21.16b, v21.16b, v25.16b\n\t"
        "eor	v22.16b, v22.16b, v26.16b\n\t"
        "eor	v23.16b, v23.16b, v27.16b\n\t"
        "st1	{v20.16b, v21.16b, v22.16b, v23.16b}, [%x[c]], #0x40\n\t"
        "mov	v29.s[0], w26\n\t"
        "sub	%w[len], %w[len], #0x100\n\t"
        "add	v19.4s, v19.4s, v29.4s\n\t"
        /* Done 256-byte block */
        "\n"
    "L_chacha_crypt_bytes_arm64_lt_256_%=:\n\t"
        "cmp	%w[len], #0x80\n\t"
        "b.lo	L_chacha_crypt_bytes_arm64_lt_128_%=\n\t"
        "ld1	{v24.16b, v25.16b, v26.16b, v27.16b}, [%x[m]], #0x40\n\t"
        /* Move state into vector registers */
        "mov	v4.16b, v16.16b\n\t"
        "mov	v5.16b, v17.16b\n\t"
        "mov	v6.16b, v18.16b\n\t"
        "mov	v7.16b, v19.16b\n\t"
        "mov	v0.16b, v16.16b\n\t"
        "mov	v1.16b, v17.16b\n\t"
        "mov	v2.16b, v18.16b\n\t"
        "mov	v3.16b, v19.16b\n\t"
        /* Add counter word */
        "add	v7.4s, v7.4s, v31.4s\n\t"
        /* Set number of odd+even rounds to perform */
        "mov	x26, #10\n\t"
        "\n"
    "L_chacha_crypt_bytes_arm64_round_start_128_%=:\n\t"
        "subs	x26, x26, #1\n\t"
        /* Round odd */
        /* a += b; d ^= a; d <<<= 16; */
        "add	v0.4s, v0.4s, v1.4s\n\t"
        "add	v4.4s, v4.4s, v5.4s\n\t"
        "eor	v3.16b, v3.16b, v0.16b\n\t"
        "eor	v7.16b, v7.16b, v4.16b\n\t"
        "rev32	v3.8h, v3.8h\n\t"
        "rev32	v7.8h, v7.8h\n\t"
        /* c += d; b ^= c; b <<<= 12; */
        "add	v2.4s, v2.4s, v3.4s\n\t"
        "add	v6.4s, v6.4s, v7.4s\n\t"
        "eor	v20.16b, v1.16b, v2.16b\n\t"
        "eor	v21.16b, v5.16b, v6.16b\n\t"
        "shl	v1.4s, v20.4s, #12\n\t"
        "shl	v5.4s, v21.4s, #12\n\t"
        "sri	v1.4s, v20.4s, #20\n\t"
        "sri	v5.4s, v21.4s, #20\n\t"
        /* a += b; d ^= a; d <<<= 8; */
        "add	v0.4s, v0.4s, v1.4s\n\t"
        "add	v4.4s, v4.4s, v5.4s\n\t"
        "eor	v3.16b, v3.16b, v0.16b\n\t"
        "eor	v7.16b, v7.16b, v4.16b\n\t"
        "tbl	v3.16b, {v3.16b}, v30.16b\n\t"
        "tbl	v7.16b, {v7.16b}, v30.16b\n\t"
        /* c += d; b ^= c; b <<<= 7; */
        "add	v2.4s, v2.4s, v3.4s\n\t"
        "add	v6.4s, v6.4s, v7.4s\n\t"
        "eor	v20.16b, v1.16b, v2.16b\n\t"
        "eor	v21.16b, v5.16b, v6.16b\n\t"
        "shl	v1.4s, v20.4s, #7\n\t"
        "shl	v5.4s, v21.4s, #7\n\t"
        "sri	v1.4s, v20.4s, #25\n\t"
        "sri	v5.4s, v21.4s, #25\n\t"
        /* Rotate rows b, c, d so the even round works on diagonals */
        "ext	v3.16b, v3.16b, v3.16b, #12\n\t"
        "ext	v7.16b, v7.16b, v7.16b, #12\n\t"
        "ext	v1.16b, v1.16b, v1.16b, #4\n\t"
        "ext	v5.16b, v5.16b, v5.16b, #4\n\t"
        "ext	v2.16b, v2.16b, v2.16b, #8\n\t"
        "ext	v6.16b, v6.16b, v6.16b, #8\n\t"
        /* Round even */
        /* a += b; d ^= a; d <<<= 16; */
        "add	v0.4s, v0.4s, v1.4s\n\t"
        "add	v4.4s, v4.4s, v5.4s\n\t"
        "eor	v3.16b, v3.16b, v0.16b\n\t"
        "eor	v7.16b, v7.16b, v4.16b\n\t"
        "rev32	v3.8h, v3.8h\n\t"
        "rev32	v7.8h, v7.8h\n\t"
        /* c += d; b ^= c; b <<<= 12; */
        "add	v2.4s, v2.4s, v3.4s\n\t"
        "add	v6.4s, v6.4s, v7.4s\n\t"
        "eor	v20.16b, v1.16b, v2.16b\n\t"
        "eor	v21.16b, v5.16b, v6.16b\n\t"
        "shl	v1.4s, v20.4s, #12\n\t"
        "shl	v5.4s, v21.4s, #12\n\t"
        "sri	v1.4s, v20.4s, #20\n\t"
        "sri	v5.4s, v21.4s, #20\n\t"
        /* a += b; d ^= a; d <<<= 8; */
        "add	v0.4s, v0.4s, v1.4s\n\t"
        "add	v4.4s, v4.4s, v5.4s\n\t"
        "eor	v3.16b, v3.16b, v0.16b\n\t"
        "eor	v7.16b, v7.16b, v4.16b\n\t"
        "tbl	v3.16b, {v3.16b}, v30.16b\n\t"
        "tbl	v7.16b, {v7.16b}, v30.16b\n\t"
        /* c += d; b ^= c; b <<<= 7; */
        "add	v2.4s, v2.4s, v3.4s\n\t"
        "add	v6.4s, v6.4s, v7.4s\n\t"
        "eor	v20.16b, v1.16b, v2.16b\n\t"
        "eor	v21.16b, v5.16b, v6.16b\n\t"
        "shl	v1.4s, v20.4s, #7\n\t"
        "shl	v5.4s, v21.4s, #7\n\t"
        "sri	v1.4s, v20.4s, #25\n\t"
        "sri	v5.4s, v21.4s, #25\n\t"
        /* Rotate rows back into column order */
        "ext	v3.16b, v3.16b, v3.16b, #4\n\t"
        "ext	v7.16b, v7.16b, v7.16b, #4\n\t"
        "ext	v1.16b, v1.16b, v1.16b, #12\n\t"
        "ext	v5.16b, v5.16b, v5.16b, #12\n\t"
        "ext	v2.16b, v2.16b, v2.16b, #8\n\t"
        "ext	v6.16b, v6.16b, v6.16b, #8\n\t"
        "b.ne	L_chacha_crypt_bytes_arm64_round_start_128_%=\n\t"
        /* Add back state, XOR in message and store (load next block) */
        "add	v0.4s, v0.4s, v16.4s\n\t"
        "add	v1.4s, v1.4s, v17.4s\n\t"
        "add	v2.4s, v2.4s, v18.4s\n\t"
        "add	v3.4s, v3.4s, v19.4s\n\t"
        "eor	v24.16b, v24.16b, v0.16b\n\t"
        "eor	v25.16b, v25.16b, v1.16b\n\t"
        "eor	v26.16b, v26.16b, v2.16b\n\t"
        "eor	v27.16b, v27.16b, v3.16b\n\t"
        "ld1	{v20.16b, v21.16b, v22.16b, v23.16b}, [%x[m]], #0x40\n\t"
        "st1	{v24.16b, v25.16b, v26.16b, v27.16b}, [%x[c]], #0x40\n\t"
        /* State counter now matches the second block before adding back */
        "add	v19.4s, v19.4s, v31.4s\n\t"
        "add	v4.4s, v4.4s, v16.4s\n\t"
        "add	v5.4s, v5.4s, v17.4s\n\t"
        "add	v6.4s, v6.4s, v18.4s\n\t"
        "add	v7.4s, v7.4s, v19.4s\n\t"
        "eor	v20.16b, v20.16b, v4.16b\n\t"
        "eor	v21.16b, v21.16b, v5.16b\n\t"
        "eor	v22.16b, v22.16b, v6.16b\n\t"
        "eor	v23.16b, v23.16b, v7.16b\n\t"
        "st1	{v20.16b, v21.16b, v22.16b, v23.16b}, [%x[c]], #0x40\n\t"
        "add	v19.4s, v19.4s, v31.4s\n\t"
        "sub	%w[len], %w[len], #0x80\n\t"
        /* Done 128-byte block */
        "\n"
    "L_chacha_crypt_bytes_arm64_lt_128_%=:\n\t"
        "cmp	%w[len], #0\n\t"
        "b.eq	L_chacha_crypt_bytes_arm64_done_all_%=\n\t"
        /* The rotate table is already in v30; reuse its pointer register to
         * hold the block size for the leftover calculation. */
        "mov	%w[rol8], #0x40\n\t"
        "\n"
    "L_chacha_crypt_bytes_arm64_loop_64_%=:\n\t"
        /* Move state into vector registers */
        "mov	v0.16b, v16.16b\n\t"
        "mov	v1.16b, v17.16b\n\t"
        "mov	v2.16b, v18.16b\n\t"
        "mov	v3.16b, v19.16b\n\t"
        /* Set number of odd+even rounds to perform */
        "mov	x26, #10\n\t"
        "\n"
    "L_chacha_crypt_bytes_arm64_round_64_%=:\n\t"
        "subs	x26, x26, #1\n\t"
        /* Round odd */
        /* a += b; d ^= a; d <<<= 16; */
        "add	v0.4s, v0.4s, v1.4s\n\t"
        "eor	v3.16b, v3.16b, v0.16b\n\t"
        "rev32	v3.8h, v3.8h\n\t"
        /* c += d; b ^= c; b <<<= 12; */
        "add	v2.4s, v2.4s, v3.4s\n\t"
        "eor	v20.16b, v1.16b, v2.16b\n\t"
        "shl	v1.4s, v20.4s, #12\n\t"
        "sri	v1.4s, v20.4s, #20\n\t"
        /* a += b; d ^= a; d <<<= 8; */
        "add	v0.4s, v0.4s, v1.4s\n\t"
        "eor	v3.16b, v3.16b, v0.16b\n\t"
        "tbl	v3.16b, {v3.16b}, v30.16b\n\t"
        /* c += d; b ^= c; b <<<= 7; */
        "add	v2.4s, v2.4s, v3.4s\n\t"
        "eor	v20.16b, v1.16b, v2.16b\n\t"
        "shl	v1.4s, v20.4s, #7\n\t"
        "sri	v1.4s, v20.4s, #25\n\t"
        "ext	v3.16b, v3.16b, v3.16b, #12\n\t"
        "ext	v1.16b, v1.16b, v1.16b, #4\n\t"
        "ext	v2.16b, v2.16b, v2.16b, #8\n\t"
        /* Round even */
        /* a += b; d ^= a; d <<<= 16; */
        "add	v0.4s, v0.4s, v1.4s\n\t"
        "eor	v3.16b, v3.16b, v0.16b\n\t"
        "rev32	v3.8h, v3.8h\n\t"
        /* c += d; b ^= c; b <<<= 12; */
        "add	v2.4s, v2.4s, v3.4s\n\t"
        "eor	v20.16b, v1.16b, v2.16b\n\t"
        "shl	v1.4s, v20.4s, #12\n\t"
        "sri	v1.4s, v20.4s, #20\n\t"
        /* a += b; d ^= a; d <<<= 8; */
        "add	v0.4s, v0.4s, v1.4s\n\t"
        "eor	v3.16b, v3.16b, v0.16b\n\t"
        "tbl	v3.16b, {v3.16b}, v30.16b\n\t"
        /* c += d; b ^= c; b <<<= 7; */
        "add	v2.4s, v2.4s, v3.4s\n\t"
        "eor	v20.16b, v1.16b, v2.16b\n\t"
        "shl	v1.4s, v20.4s, #7\n\t"
        "sri	v1.4s, v20.4s, #25\n\t"
        "ext	v3.16b, v3.16b, v3.16b, #4\n\t"
        "ext	v1.16b, v1.16b, v1.16b, #12\n\t"
        "ext	v2.16b, v2.16b, v2.16b, #8\n\t"
        "b.ne	L_chacha_crypt_bytes_arm64_round_64_%=\n\t"
        /* Add back state */
        "add	v0.4s, v0.4s, v16.4s\n\t"
        "add	v1.4s, v1.4s, v17.4s\n\t"
        "add	v2.4s, v2.4s, v18.4s\n\t"
        "add	v3.4s, v3.4s, v19.4s\n\t"
        /* Check if data is less than 64 bytes - store in over */
        "cmp	%w[len], #0x40\n\t"
        "add	v19.4s, v19.4s, v31.4s\n\t"
        "b.lo	L_chacha_crypt_bytes_arm64_lt_64_%=\n\t"
        /* Encipher 64 bytes */
        "ld1	{v24.16b, v25.16b, v26.16b, v27.16b}, [%x[m]], #0x40\n\t"
        "eor	v24.16b, v24.16b, v0.16b\n\t"
        "eor	v25.16b, v25.16b, v1.16b\n\t"
        "eor	v26.16b, v26.16b, v2.16b\n\t"
        "eor	v27.16b, v27.16b, v3.16b\n\t"
        "st1	{v24.16b, v25.16b, v26.16b, v27.16b}, [%x[c]], #0x40\n\t"
        /* Check for more bytes to be enciphered */
        "subs	%w[len], %w[len], #0x40\n\t"
        "b.ne	L_chacha_crypt_bytes_arm64_loop_64_%=\n\t"
        "b	L_chacha_crypt_bytes_arm64_done_%=\n\t"
        "\n"
    "L_chacha_crypt_bytes_arm64_lt_64_%=:\n\t"
        /* Calculate bytes left in block not used */
        "sub	%w[rol8], %w[rol8], %w[len]\n\t"
        /* Store encipher block in over for further operations and left */
        "st1	{v0.4s, v1.4s, v2.4s, v3.4s}, [x4]\n\t"
        "str	%w[rol8], [%x[ctx], #64]\n\t"
        /* Encipher 32 bytes */
        "cmp	%w[len], #32\n\t"
        "b.lo	L_chacha_crypt_bytes_arm64_lt_32_%=\n\t"
        "ld1	{v24.16b, v25.16b}, [%x[m]], #32\n\t"
        "eor	v24.16b, v24.16b, v0.16b\n\t"
        "eor	v25.16b, v25.16b, v1.16b\n\t"
        "st1	{v24.16b, v25.16b}, [%x[c]], #32\n\t"
        "subs	%w[len], %w[len], #32\n\t"
        /* Shift the unused keystream down; vector moves keep the flags */
        "mov	v0.16b, v2.16b\n\t"
        "mov	v1.16b, v3.16b\n\t"
        "b.eq	L_chacha_crypt_bytes_arm64_done_%=\n\t"
        "\n"
    "L_chacha_crypt_bytes_arm64_lt_32_%=:\n\t"
        "cmp	%w[len], #16\n\t"
        "b.lo	L_chacha_crypt_bytes_arm64_lt_16_%=\n\t"
        /* Encipher 16 bytes */
        "ld1	{v24.16b}, [%x[m]], #16\n\t"
        "eor	v24.16b, v24.16b, v0.16b\n\t"
        "st1	{v24.16b}, [%x[c]], #16\n\t"
        "subs	%w[len], %w[len], #16\n\t"
        "mov	v0.16b, v1.16b\n\t"
        "b.eq	L_chacha_crypt_bytes_arm64_done_%=\n\t"
        "\n"
    "L_chacha_crypt_bytes_arm64_lt_16_%=:\n\t"
        "cmp	%w[len], #8\n\t"
        "b.lo	L_chacha_crypt_bytes_arm64_lt_8_%=\n\t"
        /* Encipher 8 bytes */
        "ld1	{v24.8b}, [%x[m]], #8\n\t"
        "eor	v24.8b, v24.8b, v0.8b\n\t"
        "st1	{v24.8b}, [%x[c]], #8\n\t"
        "subs	%w[len], %w[len], #8\n\t"
        "mov	v0.d[0], v0.d[1]\n\t"
        "b.eq	L_chacha_crypt_bytes_arm64_done_%=\n\t"
        "\n"
    "L_chacha_crypt_bytes_arm64_lt_8_%=:\n\t"
        /* Remaining keystream (up to 7 bytes used) moves to a general
         * register and is consumed from the low byte upwards. */
        "mov	%[rol8], v0.d[0]\n\t"
        "\n"
    "L_chacha_crypt_bytes_arm64_loop_lt_8_%=:\n\t"
        /* Encipher 1 byte at a time */
        "ldrb	%w[ctr], [%x[m]], #1\n\t"
        "eor	%w[ctr], %w[ctr], %w[rol8]\n\t"
        "strb	%w[ctr], [%x[c]], #1\n\t"
        "subs	%w[len], %w[len], #1\n\t"
        "lsr	%[rol8], %[rol8], #8\n\t"
        "b.gt	L_chacha_crypt_bytes_arm64_loop_lt_8_%=\n\t"
        "\n"
    "L_chacha_crypt_bytes_arm64_done_%=:\n\t"
        "\n"
    "L_chacha_crypt_bytes_arm64_done_all_%=:\n\t"
        /* Write the state, including the advanced counter, back to ctx */
        "st1	{v16.4s, v17.4s, v18.4s, v19.4s}, [%x[ctx]]\n\t"
        : [ctx] "+r" (ctx), [c] "+r" (c), [m] "+r" (m), [len] "+r" (len),
          [rol8] "+r" (rol8), [ctr] "+r" (ctr)
        :
        : "memory", "cc", "x4", "x7", "x8", "x9", "x10", "x11", "x12", "x13",
            "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23",
            "x24", "x25", "x26", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
            "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16",
            "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
            "v26", "v27", "v28", "v29", "v30", "v31"
    );
}

/* Set the block counter and the 12-byte IV in the state.
 *
 * Writes counter to x[12] (byte offset 48) and the 12 bytes at iv to
 * x[13..15] (byte offsets 52..63). The counter is stored with a 32-bit
 * store: it is a 32-bit value, and a 64-bit store would also write
 * unspecified upper register bits into x[13].
 *
 * x        State words; at least 16 words.
 * iv       12 bytes of IV/nonce.
 * counter  Initial block counter.
 */
void wc_chacha_setiv(word32* x, const byte* iv, word32 counter)
{
    __asm__ __volatile__ (
        "ldr	x3, [%x[iv]]\n\t"
        "ldr	w4, [%x[iv], #8]\n\t"
        "str	%w[counter], [%x[x], #48]\n\t"
        "str	x3, [%x[x], #52]\n\t"
        "str	w4, [%x[x], #60]\n\t"
        :
        : [x] "r" (x), [iv] "r" (iv), [counter] "r" (counter)
        : "memory", "cc", "x3", "x4"
    );
}

/* ChaCha constants as little-endian words: the first four spell
 * "expand 16-byte k", the last four "expand 32-byte k". */
XALIGNED(8) static const word32 L_chacha_setkey_arm64_constant[] = {
    0x61707865, 0x3120646e, 0x79622d36, 0x6b206574,
    0x61707865, 0x3320646e, 0x79622d32, 0x6b206574,
};

/* Load the constants and the key into the state.
 *
 * x[0..3] receive the constant row selected by the key size (byte offset
 * keySz - 16 into the table above), x[4..7] the first 16 key bytes, and
 * x[8..11] the second 16 key bytes for a 32-byte key or the first 16 bytes
 * again for a 16-byte key. Other key sizes are not handled here.
 *
 * keySz is a 32-bit value, so the subtraction uses the W view of its
 * register and the result is zero-extended explicitly for the pointer add;
 * the upper half of the register is not guaranteed to be zero on entry.
 * key and constant are read-write operands because the asm advances them.
 *
 * x      State words; at least 12 words are written.
 * key    Key bytes.
 * keySz  Key size in bytes: 16 or 32.
 */
void wc_chacha_setkey(word32* x, const byte* key, word32 keySz)
{
    const word32* constant = L_chacha_setkey_arm64_constant;
    __asm__ __volatile__ (
        /* Z flag from this subtraction is consumed by the b.eq below; the
         * loads, stores and add in between do not modify the flags. */
        "subs	%w[keySz], %w[keySz], #16\n\t"
        "add	%[constant], %[constant], %w[keySz], uxtw\n\t"
        /* Start with constants */
        "ld1	{v0.4s}, [%[constant]]\n\t"
        "ld1	{v1.16b}, [%x[key]], #16\n\t"
#ifdef BIG_ENDIAN_ORDER
        "rev32	v1.8h, v1.8h\n\t"
#endif /* BIG_ENDIAN_ORDER */
        "st1	{v0.4s}, [%x[x]], #16\n\t"
        "st1	{v1.4s}, [%x[x]], #16\n\t"
        /* 16-byte key: store the same 16 bytes a second time */
        "b.eq	L_chacha_setkey_arm64_done_%=\n\t"
        "ld1	{v1.16b}, [%x[key]]\n\t"
#ifdef BIG_ENDIAN_ORDER
        "rev32	v1.8h, v1.8h\n\t"
#endif /* BIG_ENDIAN_ORDER */
        "\n"
    "L_chacha_setkey_arm64_done_%=:\n\t"
        "st1	{v1.4s}, [%x[x]]\n\t"
        : [x] "+r" (x), [key] "+r" (key), [keySz] "+r" (keySz),
          [constant] "+r" (constant)
        :
        : "memory", "cc", "v0", "v1"
    );
}

/* XOR len bytes of input with the bytes at over and write them to output.
 *
 * Works 16 bytes at a time while at least 16 remain, then 4 bytes at a time,
 * then byte by byte. A zero len returns without touching any buffer; without
 * that guard the byte loop would decrement len past zero and keep running.
 * input is a read-write operand because the asm post-increments it.
 *
 * over    Bytes to XOR with (saved keystream); len bytes are read.
 * output  Output buffer, len bytes.
 * input   Input buffer, len bytes.
 * len     Number of bytes to process.
 */
void wc_chacha_use_over(byte* over, byte* output, const byte* input,
    word32 len)
{
    __asm__ __volatile__ (
        "cbz	%w[len], L_chacha_use_over_arm64_done_%=\n\t"
        "\n"
    "L_chacha_use_over_arm64_16byte_loop_%=:\n\t"
        "cmp	%w[len], #16\n\t"
        "b.lo	L_chacha_use_over_arm64_word_loop_%=\n\t"
        /* 16 bytes of state XORed into message. */
        "ld1	{v0.16b}, [%x[over]], #16\n\t"
        "ld1	{v1.16b}, [%x[input]], #16\n\t"
        "eor	v1.16b, v1.16b, v0.16b\n\t"
        "subs	%w[len], %w[len], #16\n\t"
        "st1	{v1.16b}, [%x[output]], #16\n\t"
        "b.eq	L_chacha_use_over_arm64_done_%=\n\t"
        "b	L_chacha_use_over_arm64_16byte_loop_%=\n\t"
        "\n"
    "L_chacha_use_over_arm64_word_loop_%=:\n\t"
        "cmp	%w[len], #4\n\t"
        "b.lo	L_chacha_use_over_arm64_byte_loop_%=\n\t"
        /* 4 bytes of state XORed into message. */
        "ldr	w4, [%x[over]], #4\n\t"
        "ldr	w5, [%x[input]], #4\n\t"
        "eor	w5, w5, w4\n\t"
        "subs	%w[len], %w[len], #4\n\t"
        "str	w5, [%x[output]], #4\n\t"
        "b.eq	L_chacha_use_over_arm64_done_%=\n\t"
        "b	L_chacha_use_over_arm64_word_loop_%=\n\t"
        "\n"
    "L_chacha_use_over_arm64_byte_loop_%=:\n\t"
        /* 1 byte of state XORed into message. */
        "ldrb	w4, [%x[over]], #1\n\t"
        "ldrb	w5, [%x[input]], #1\n\t"
        "eor	w5, w5, w4\n\t"
        "subs	%w[len], %w[len], #1\n\t"
        "strb	w5, [%x[output]], #1\n\t"
        "b.eq	L_chacha_use_over_arm64_done_%=\n\t"
        "b	L_chacha_use_over_arm64_byte_loop_%=\n\t"
        "\n"
    "L_chacha_use_over_arm64_done_%=:\n\t"
        : [over] "+r" (over), [output] "+r" (output), [input] "+r" (input),
          [len] "+r" (len)
        :
        : "memory", "cc", "x4", "x5", "v0", "v1"
    );
}

#endif /* !WOLFSSL_ARMASM_NO_NEON */
#endif /* HAVE_CHACHA */
#endif /* WOLFSSL_ARMASM_INLINE */
#endif /* __aarch64__ */
#endif /* WOLFSSL_ARMASM */