/* armv8-chacha-asm
 *
 * Copyright (C) 2006-2026 wolfSSL Inc.
 *
 * This file is part of wolfSSL.
 *
 * wolfSSL is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * wolfSSL is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
 */

#include <wolfssl/wolfcrypt/libwolfssl_sources_asm.h>

/* Generated using (from wolfssl):
 *   cd ../scripts
 *   ruby ./chacha/chacha.rb arm64 \
 *       ../wolfssl/wolfcrypt/src/port/arm/armv8-chacha-asm.S
 */

/* Target: AArch64, GNU as syntax, run through the C preprocessor.
 * Context layout as used by the code in this file (byte offsets from x0):
 *     0..63   sixteen 32-bit state words (constants, key, counter, IV)
 *    64       32-bit count of unused key-stream bytes ("left")
 *    68..131  key-stream block kept for a later call ("over")
 */
#ifdef WOLFSSL_ARMASM
#ifdef __aarch64__
#ifndef WOLFSSL_ARMASM_INLINE
#ifdef HAVE_CHACHA
/* Per-lane block counter offsets added to state word 12 when four blocks
 * are computed side by side (one block per 32-bit lane). */
#ifndef __APPLE__
        .text
        .section        .rodata
        .type   L_chacha20_arm64_ctr, %object
        .size   L_chacha20_arm64_ctr, 16
#else
        .section        __DATA,__data
#endif /* __APPLE__ */
        /* 8-byte aligned, 64-bit aligned */
#ifndef __APPLE__
        .align  3
#else
        .p2align        3
#endif /* __APPLE__ */
L_chacha20_arm64_ctr:
        .long   0x00000000,0x00000001,0x00000002,0x00000003
/* TBL byte indices that rotate every 32-bit lane left by 8 bits. */
#ifndef __APPLE__
        .text
        .section        .rodata
        .type   L_chacha20_arm64_rol8, %object
        .size   L_chacha20_arm64_rol8, 16
#else
        .section        __DATA,__data
#endif /* __APPLE__ */
        /* 8-byte aligned, 64-bit aligned */
#ifndef __APPLE__
        .align  3
#else
        .p2align        3
#endif /* __APPLE__ */
L_chacha20_arm64_rol8:
        .long   0x02010003,0x06050407,0x0a09080b,0x0e0d0c0f
#ifndef WOLFSSL_ARMASM_NO_NEON
/*-----------------------------------------------------------------------
 * wc_chacha_crypt_bytes
 *
 * XORs a message with ChaCha key stream (10 double rounds per block).
 *
 * In:    x0 = context (state words at +0, "left" at +64, "over" at +68)
 *        x1 = output buffer
 *        x2 = input buffer
 *        x3 = number of bytes to process
 * Out:   output written; the state (with advanced block counter) is
 *        stored back to [x0]. When the final block is partial, the whole
 *        key-stream block is saved at +68 and 64 - remaining is saved
 *        at +64.
 * Saves: x17, x19-x26, d8-d15 in a 160-byte frame.
 * Uses:  x4-x16 and v0-v7, v16-v31 as scratch.
 *
 * Register roles:
 *   v16-v19  input state for the current block(s); v19.s[0] is the counter
 *   v28      lane counter offsets {0,1,2,3}
 *   v29      counter step after a multi-block chunk (5, later 4)
 *   v30      rol8 TBL indices
 *   v31      counter step of 1
 *   x4       address of the "over" buffer (x0 + 0x44)
 *
 * Paths: 320 bytes per iteration (4 NEON blocks + 1 scalar block
 * interleaved), then at most one 256-byte chunk, one 128-byte chunk,
 * and finally 64 bytes (or less) at a time.
 *
 * In every round loop the flags set by the leading SUBS are consumed by
 * the trailing BNE; no instruction in between writes the flags.
 *-----------------------------------------------------------------------*/
#ifndef __APPLE__
.text
.globl  wc_chacha_crypt_bytes
.type   wc_chacha_crypt_bytes,@function
.align  2
wc_chacha_crypt_bytes:
#else
.section        __TEXT,__text
.globl  _wc_chacha_crypt_bytes
.p2align        2
_wc_chacha_crypt_bytes:
#endif /* __APPLE__ */
        stp     x29, x30, [sp, #-160]!
        add     x29, sp, #0
        stp     x17, x19, [x29, #24]
        stp     x20, x21, [x29, #40]
        stp     x22, x23, [x29, #56]
        stp     x24, x25, [x29, #72]
        str     x26, [x29, #88]
        stp     d8, d9, [x29, #96]
        stp     d10, d11, [x29, #112]
        stp     d12, d13, [x29, #128]
        stp     d14, d15, [x29, #144]
#ifndef __APPLE__
        adrp    x5, L_chacha20_arm64_rol8
        add     x5, x5, :lo12:L_chacha20_arm64_rol8
#else
        adrp    x5, L_chacha20_arm64_rol8@PAGE
        add     x5, x5, L_chacha20_arm64_rol8@PAGEOFF
#endif /* __APPLE__ */
#ifndef __APPLE__
        adrp    x6, L_chacha20_arm64_ctr
        add     x6, x6, :lo12:L_chacha20_arm64_ctr
#else
        adrp    x6, L_chacha20_arm64_ctr@PAGE
        add     x6, x6, L_chacha20_arm64_ctr@PAGEOFF
#endif /* __APPLE__ */
        eor     v29.16b, v29.16b, v29.16b
        mov     x26, #5
        eor     v31.16b, v31.16b, v31.16b
        mov     w7, #1
        ld1     {v30.16b}, [x5]
        ld1     {v28.4s}, [x6]
        add     x4, x0, #0x44
        mov     v29.s[0], w26
        mov     v31.s[0], w7
        # Load state to encrypt
        ld1     {v16.4s, v17.4s, v18.4s, v19.4s}, [x0]
        cmp     x3, #0x140
        blt     L_chacha_crypt_bytes_arm64_lt_320
        # Counter offset of the scalar (fifth) block
        mov     w25, #4
L_chacha_crypt_bytes_arm64_loop_320:
        # Move state into regular register
        mov     x8, v16.d[0]
        mov     x10, v16.d[1]
        mov     x12, v17.d[0]
        mov     x14, v17.d[1]
        mov     x16, v18.d[0]
        mov     x19, v18.d[1]
        mov     x21, v19.d[0]
        mov     x23, v19.d[1]
        sub     x3, x3, #0x140
        # Move state into vector registers
        dup     v0.4s, v16.s[0]
        dup     v1.4s, v16.s[1]
        lsr     x9, x8, #32
        dup     v2.4s, v16.s[2]
        dup     v3.4s, v16.s[3]
        lsr     x11, x10, #32
        dup     v4.4s, v17.s[0]
        dup     v5.4s, v17.s[1]
        lsr     x13, x12, #32
        dup     v6.4s, v17.s[2]
        dup     v7.4s, v17.s[3]
        lsr     x15, x14, #32
        dup     v8.4s, v18.s[0]
        dup     v9.4s, v18.s[1]
        lsr     x17, x16, #32
        dup     v10.4s, v18.s[2]
        dup     v11.4s, v18.s[3]
        lsr     x20, x19, #32
        dup     v12.4s, v19.s[0]
        dup     v13.4s, v19.s[1]
        lsr     x22, x21, #32
        dup     v14.4s, v19.s[2]
        dup     v15.4s, v19.s[3]
        lsr     x24, x23, #32
        # Add to counter word
        add     v12.4s, v12.4s, v28.4s
        add     w21, w21, w25
        # Set number of odd+even rounds to perform
        mov     x26, #10
L_chacha_crypt_bytes_arm64_round_start_320:
        subs    x26, x26, #1
        # Round odd
        # a += b; d ^= a; d <<<= 16;
        add     v0.4s, v0.4s, v4.4s
        add     w8, w8, w12
        add     v1.4s, v1.4s, v5.4s
        add     w9, w9, w13
        add     v2.4s, v2.4s, v6.4s
        add     w10, w10, w14
        add     v3.4s, v3.4s, v7.4s
        add     w11, w11, w15
        eor     v12.16b, v12.16b, v0.16b
        eor     w21, w21, w8
        eor     v13.16b, v13.16b, v1.16b
        eor     w22, w22, w9
        eor     v14.16b, v14.16b, v2.16b
        eor     w23, w23, w10
        eor     v15.16b, v15.16b, v3.16b
        eor     w24, w24, w11
        rev32   v12.8h, v12.8h
        ror     w21, w21, #16
        rev32   v13.8h, v13.8h
        ror     w22, w22, #16
        rev32   v14.8h, v14.8h
        ror     w23, w23, #16
        rev32   v15.8h, v15.8h
        ror     w24, w24, #16
        # c += d; b ^= c; b <<<= 12;
        add     v8.4s, v8.4s, v12.4s
        add     w16, w16, w21
        add     v9.4s, v9.4s, v13.4s
        add     w17, w17, w22
        add     v10.4s, v10.4s, v14.4s
        add     w19, w19, w23
        add     v11.4s, v11.4s, v15.4s
        add     w20, w20, w24
        eor     v20.16b, v4.16b, v8.16b
        eor     w12, w12, w16
        eor     v21.16b, v5.16b, v9.16b
        eor     w13, w13, w17
        eor     v22.16b, v6.16b, v10.16b
        eor     w14, w14, w19
        eor     v23.16b, v7.16b, v11.16b
        eor     w15, w15, w20
        shl     v4.4s, v20.4s, #12
        ror     w12, w12, #20
        shl     v5.4s, v21.4s, #12
        ror     w13, w13, #20
        shl     v6.4s, v22.4s, #12
        ror     w14, w14, #20
        shl     v7.4s, v23.4s, #12
        ror     w15, w15, #20
        sri     v4.4s, v20.4s, #20
        sri     v5.4s, v21.4s, #20
        sri     v6.4s, v22.4s, #20
        sri     v7.4s, v23.4s, #20
        # a += b; d ^= a; d <<<= 8;
        add     v0.4s, v0.4s, v4.4s
        add     w8, w8, w12
        add     v1.4s, v1.4s, v5.4s
        add     w9, w9, w13
        add     v2.4s, v2.4s, v6.4s
        add     w10, w10, w14
        add     v3.4s, v3.4s, v7.4s
        add     w11, w11, w15
        eor     v12.16b, v12.16b, v0.16b
        eor     w21, w21, w8
        eor     v13.16b, v13.16b, v1.16b
        eor     w22, w22, w9
        eor     v14.16b, v14.16b, v2.16b
        eor     w23, w23, w10
        eor     v15.16b, v15.16b, v3.16b
        eor     w24, w24, w11
        tbl     v12.16b, {v12.16b}, v30.16b
        ror     w21, w21, #24
        tbl     v13.16b, {v13.16b}, v30.16b
        ror     w22, w22, #24
        tbl     v14.16b, {v14.16b}, v30.16b
        ror     w23, w23, #24
        tbl     v15.16b, {v15.16b}, v30.16b
        ror     w24, w24, #24
        # c += d; b ^= c; b <<<= 7;
        add     v8.4s, v8.4s, v12.4s
        add     w16, w16, w21
        add     v9.4s, v9.4s, v13.4s
        add     w17, w17, w22
        add     v10.4s, v10.4s, v14.4s
        add     w19, w19, w23
        add     v11.4s, v11.4s, v15.4s
        add     w20, w20, w24
        eor     v20.16b, v4.16b, v8.16b
        eor     w12, w12, w16
        eor     v21.16b, v5.16b, v9.16b
        eor     w13, w13, w17
        eor     v22.16b, v6.16b, v10.16b
        eor     w14, w14, w19
        eor     v23.16b, v7.16b, v11.16b
        eor     w15, w15, w20
        shl     v4.4s, v20.4s, #7
        ror     w12, w12, #25
        shl     v5.4s, v21.4s, #7
        ror     w13, w13, #25
        shl     v6.4s, v22.4s, #7
        ror     w14, w14, #25
        shl     v7.4s, v23.4s, #7
        ror     w15, w15, #25
        sri     v4.4s, v20.4s, #25
        sri     v5.4s, v21.4s, #25
        sri     v6.4s, v22.4s, #25
        sri     v7.4s, v23.4s, #25
        # Round even
        # a += b; d ^= a; d <<<= 16;
        add     v0.4s, v0.4s, v5.4s
        add     w8, w8, w13
        add     v1.4s, v1.4s, v6.4s
        add     w9, w9, w14
        add     v2.4s, v2.4s, v7.4s
        add     w10, w10, w15
        add     v3.4s, v3.4s, v4.4s
        add     w11, w11, w12
        eor     v15.16b, v15.16b, v0.16b
        eor     w24, w24, w8
        eor     v12.16b, v12.16b, v1.16b
        eor     w21, w21, w9
        eor     v13.16b, v13.16b, v2.16b
        eor     w22, w22, w10
        eor     v14.16b, v14.16b, v3.16b
        eor     w23, w23, w11
        rev32   v15.8h, v15.8h
        ror     w24, w24, #16
        rev32   v12.8h, v12.8h
        ror     w21, w21, #16
        rev32   v13.8h, v13.8h
        ror     w22, w22, #16
        rev32   v14.8h, v14.8h
        ror     w23, w23, #16
        # c += d; b ^= c; b <<<= 12;
        add     v10.4s, v10.4s, v15.4s
        add     w19, w19, w24
        add     v11.4s, v11.4s, v12.4s
        add     w20, w20, w21
        add     v8.4s, v8.4s, v13.4s
        add     w16, w16, w22
        add     v9.4s, v9.4s, v14.4s
        add     w17, w17, w23
        eor     v20.16b, v5.16b, v10.16b
        eor     w13, w13, w19
        eor     v21.16b, v6.16b, v11.16b
        eor     w14, w14, w20
        eor     v22.16b, v7.16b, v8.16b
        eor     w15, w15, w16
        eor     v23.16b, v4.16b, v9.16b
        eor     w12, w12, w17
        shl     v5.4s, v20.4s, #12
        ror     w13, w13, #20
        shl     v6.4s, v21.4s, #12
        ror     w14, w14, #20
        shl     v7.4s, v22.4s, #12
        ror     w15, w15, #20
        shl     v4.4s, v23.4s, #12
        ror     w12, w12, #20
        sri     v5.4s, v20.4s, #20
        sri     v6.4s, v21.4s, #20
        sri     v7.4s, v22.4s, #20
        sri     v4.4s, v23.4s, #20
        # a += b; d ^= a; d <<<= 8;
        add     v0.4s, v0.4s, v5.4s
        add     w8, w8, w13
        add     v1.4s, v1.4s, v6.4s
        add     w9, w9, w14
        add     v2.4s, v2.4s, v7.4s
        add     w10, w10, w15
        add     v3.4s, v3.4s, v4.4s
        add     w11, w11, w12
        eor     v15.16b, v15.16b, v0.16b
        eor     w24, w24, w8
        eor     v12.16b, v12.16b, v1.16b
        eor     w21, w21, w9
        eor     v13.16b, v13.16b, v2.16b
        eor     w22, w22, w10
        eor     v14.16b, v14.16b, v3.16b
        eor     w23, w23, w11
        tbl     v15.16b, {v15.16b}, v30.16b
        ror     w24, w24, #24
        tbl     v12.16b, {v12.16b}, v30.16b
        ror     w21, w21, #24
        tbl     v13.16b, {v13.16b}, v30.16b
        ror     w22, w22, #24
        tbl     v14.16b, {v14.16b}, v30.16b
        ror     w23, w23, #24
        # c += d; b ^= c; b <<<= 7;
        add     v10.4s, v10.4s, v15.4s
        add     w19, w19, w24
        add     v11.4s, v11.4s, v12.4s
        add     w20, w20, w21
        add     v8.4s, v8.4s, v13.4s
        add     w16, w16, w22
        add     v9.4s, v9.4s, v14.4s
        add     w17, w17, w23
        eor     v20.16b, v5.16b, v10.16b
        eor     w13, w13, w19
        eor     v21.16b, v6.16b, v11.16b
        eor     w14, w14, w20
        eor     v22.16b, v7.16b, v8.16b
        eor     w15, w15, w16
        eor     v23.16b, v4.16b, v9.16b
        eor     w12, w12, w17
        shl     v5.4s, v20.4s, #7
        ror     w13, w13, #25
        shl     v6.4s, v21.4s, #7
        ror     w14, w14, #25
        shl     v7.4s, v22.4s, #7
        ror     w15, w15, #25
        shl     v4.4s, v23.4s, #7
        ror     w12, w12, #25
        sri     v5.4s, v20.4s, #25
        sri     v6.4s, v21.4s, #25
        sri     v7.4s, v22.4s, #25
        sri     v4.4s, v23.4s, #25
        bne     L_chacha_crypt_bytes_arm64_round_start_320
        # Add counter now rather than after transposed
        add     v12.4s, v12.4s, v28.4s
        add     w21, w21, w25
        # Load message
        ld1     {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #0x40
        # Transpose vectors
        # (the interleaved ORRs pack pairs of scalar words into 64 bits)
        trn1    v20.4s, v0.4s, v1.4s
        trn1    v22.4s, v2.4s, v3.4s
        orr     x8, x8, x9, lsl 32
        trn2    v21.4s, v0.4s, v1.4s
        trn2    v23.4s, v2.4s, v3.4s
        trn1    v0.2d, v20.2d, v22.2d
        trn1    v1.2d, v21.2d, v23.2d
        orr     x10, x10, x11, lsl 32
        trn2    v2.2d, v20.2d, v22.2d
        trn2    v3.2d, v21.2d, v23.2d
        trn1    v20.4s, v4.4s, v5.4s
        trn1    v22.4s, v6.4s, v7.4s
        orr     x12, x12, x13, lsl 32
        trn2    v21.4s, v4.4s, v5.4s
        trn2    v23.4s, v6.4s, v7.4s
        trn1    v4.2d, v20.2d, v22.2d
        trn1    v5.2d, v21.2d, v23.2d
        orr     x14, x14, x15, lsl 32
        trn2    v6.2d, v20.2d, v22.2d
        trn2    v7.2d, v21.2d, v23.2d
        trn1    v20.4s, v8.4s, v9.4s
        trn1    v22.4s, v10.4s, v11.4s
        orr     x16, x16, x17, lsl 32
        trn2    v21.4s, v8.4s, v9.4s
        trn2    v23.4s, v10.4s, v11.4s
        trn1    v8.2d, v20.2d, v22.2d
        trn1    v9.2d, v21.2d, v23.2d
        orr     x19, x19, x20, lsl 32
        trn2    v10.2d, v20.2d, v22.2d
        trn2    v11.2d, v21.2d, v23.2d
        trn1    v20.4s, v12.4s, v13.4s
        trn1    v22.4s, v14.4s, v15.4s
        orr     x21, x21, x22, lsl 32
        trn2    v21.4s, v12.4s, v13.4s
        trn2    v23.4s, v14.4s, v15.4s
        trn1    v12.2d, v20.2d, v22.2d
        trn1    v13.2d, v21.2d, v23.2d
        orr     x23, x23, x24, lsl 32
        trn2    v14.2d, v20.2d, v22.2d
        trn2    v15.2d, v21.2d, v23.2d
        # Add back state, XOR in message and store (load next block)
        add     v20.4s, v0.4s, v16.4s
        add     v21.4s, v4.4s, v17.4s
        add     v22.4s, v8.4s, v18.4s
        add     v23.4s, v12.4s, v19.4s
        eor     v20.16b, v20.16b, v24.16b
        eor     v21.16b, v21.16b, v25.16b
        eor     v22.16b, v22.16b, v26.16b
        eor     v23.16b, v23.16b, v27.16b
        ld1     {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #0x40
        st1     {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], #0x40
        add     v20.4s, v1.4s, v16.4s
        add     v21.4s, v5.4s, v17.4s
        add     v22.4s, v9.4s, v18.4s
        add     v23.4s, v13.4s, v19.4s
        eor     v20.16b, v20.16b, v24.16b
        eor     v21.16b, v21.16b, v25.16b
        eor     v22.16b, v22.16b, v26.16b
        eor     v23.16b, v23.16b, v27.16b
        ld1     {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #0x40
        st1     {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], #0x40
        add     v20.4s, v2.4s, v16.4s
        add     v21.4s, v6.4s, v17.4s
        add     v22.4s, v10.4s, v18.4s
        add     v23.4s, v14.4s, v19.4s
        eor     v20.16b, v20.16b, v24.16b
        eor     v21.16b, v21.16b, v25.16b
        eor     v22.16b, v22.16b, v26.16b
        eor     v23.16b, v23.16b, v27.16b
        ld1     {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #0x40
        st1     {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], #0x40
        add     v20.4s, v3.4s, v16.4s
        add     v21.4s, v7.4s, v17.4s
        add     v22.4s, v11.4s, v18.4s
        add     v23.4s, v15.4s, v19.4s
        eor     v20.16b, v20.16b, v24.16b
        eor     v21.16b, v21.16b, v25.16b
        eor     v22.16b, v22.16b, v26.16b
        eor     v23.16b, v23.16b, v27.16b
        ld1     {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #0x40
        st1     {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], #0x40
        # Move regular registers into vector registers for adding and xor
        mov     v0.d[0], x8
        mov     v0.d[1], x10
        mov     v1.d[0], x12
        mov     v1.d[1], x14
        mov     v2.d[0], x16
        mov     v2.d[1], x19
        mov     v3.d[0], x21
        mov     v3.d[1], x23
        # Add back state, XOR in message and store
        add     v0.4s, v0.4s, v16.4s
        add     v1.4s, v1.4s, v17.4s
        add     v2.4s, v2.4s, v18.4s
        add     v3.4s, v3.4s, v19.4s
        eor     v0.16b, v0.16b, v24.16b
        eor     v1.16b, v1.16b, v25.16b
        eor     v2.16b, v2.16b, v26.16b
        eor     v3.16b, v3.16b, v27.16b
        st1     {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #0x40
        cmp     x3, #0x140
        # Advance the block counter by 5 (vector add leaves flags intact)
        add     v19.4s, v19.4s, v29.4s
        bge     L_chacha_crypt_bytes_arm64_loop_320
        # Done doing 320 bytes at a time
L_chacha_crypt_bytes_arm64_lt_320:
        cmp     x3, #0x100
        blt     L_chacha_crypt_bytes_arm64_lt_256
        # Move state into vector registers
        dup     v0.4s, v16.s[0]
        dup     v1.4s, v16.s[1]
        dup     v2.4s, v16.s[2]
        dup     v3.4s, v16.s[3]
        dup     v4.4s, v17.s[0]
        dup     v5.4s, v17.s[1]
        dup     v6.4s, v17.s[2]
        dup     v7.4s, v17.s[3]
        dup     v8.4s, v18.s[0]
        dup     v9.4s, v18.s[1]
        dup     v10.4s, v18.s[2]
        dup     v11.4s, v18.s[3]
        dup     v12.4s, v19.s[0]
        dup     v13.4s, v19.s[1]
        dup     v14.4s, v19.s[2]
        dup     v15.4s, v19.s[3]
        # Add to counter word
        add     v12.4s, v12.4s, v28.4s
        # Set number of odd+even rounds to perform
        mov     x26, #10
L_chacha_crypt_bytes_arm64_round_start_256:
        subs    x26, x26, #1
        # Round odd
        # a += b; d ^= a; d <<<= 16;
        add     v0.4s, v0.4s, v4.4s
        add     v1.4s, v1.4s, v5.4s
        add     v2.4s, v2.4s, v6.4s
        add     v3.4s, v3.4s, v7.4s
        eor     v12.16b, v12.16b, v0.16b
        eor     v13.16b, v13.16b, v1.16b
        eor     v14.16b, v14.16b, v2.16b
        eor     v15.16b, v15.16b, v3.16b
        rev32   v12.8h, v12.8h
        rev32   v13.8h, v13.8h
        rev32   v14.8h, v14.8h
        rev32   v15.8h, v15.8h
        # c += d; b ^= c; b <<<= 12;
        add     v8.4s, v8.4s, v12.4s
        add     v9.4s, v9.4s, v13.4s
        add     v10.4s, v10.4s, v14.4s
        add     v11.4s, v11.4s, v15.4s
        eor     v20.16b, v4.16b, v8.16b
        eor     v21.16b, v5.16b, v9.16b
        eor     v22.16b, v6.16b, v10.16b
        eor     v23.16b, v7.16b, v11.16b
        shl     v4.4s, v20.4s, #12
        shl     v5.4s, v21.4s, #12
        shl     v6.4s, v22.4s, #12
        shl     v7.4s, v23.4s, #12
        sri     v4.4s, v20.4s, #20
        sri     v5.4s, v21.4s, #20
        sri     v6.4s, v22.4s, #20
        sri     v7.4s, v23.4s, #20
        # a += b; d ^= a; d <<<= 8;
        add     v0.4s, v0.4s, v4.4s
        add     v1.4s, v1.4s, v5.4s
        add     v2.4s, v2.4s, v6.4s
        add     v3.4s, v3.4s, v7.4s
        eor     v12.16b, v12.16b, v0.16b
        eor     v13.16b, v13.16b, v1.16b
        eor     v14.16b, v14.16b, v2.16b
        eor     v15.16b, v15.16b, v3.16b
        tbl     v12.16b, {v12.16b}, v30.16b
        tbl     v13.16b, {v13.16b}, v30.16b
        tbl     v14.16b, {v14.16b}, v30.16b
        tbl     v15.16b, {v15.16b}, v30.16b
        # c += d; b ^= c; b <<<= 7;
        add     v8.4s, v8.4s, v12.4s
        add     v9.4s, v9.4s, v13.4s
        add     v10.4s, v10.4s, v14.4s
        add     v11.4s, v11.4s, v15.4s
        eor     v20.16b, v4.16b, v8.16b
        eor     v21.16b, v5.16b, v9.16b
        eor     v22.16b, v6.16b, v10.16b
        eor     v23.16b, v7.16b, v11.16b
        shl     v4.4s, v20.4s, #7
        shl     v5.4s, v21.4s, #7
        shl     v6.4s, v22.4s, #7
        shl     v7.4s, v23.4s, #7
        sri     v4.4s, v20.4s, #25
        sri     v5.4s, v21.4s, #25
        sri     v6.4s, v22.4s, #25
        sri     v7.4s, v23.4s, #25
        # Round even
        # a += b; d ^= a; d <<<= 16;
        add     v0.4s, v0.4s, v5.4s
        add     v1.4s, v1.4s, v6.4s
        add     v2.4s, v2.4s, v7.4s
        add     v3.4s, v3.4s, v4.4s
        eor     v15.16b, v15.16b, v0.16b
        eor     v12.16b, v12.16b, v1.16b
        eor     v13.16b, v13.16b, v2.16b
        eor     v14.16b, v14.16b, v3.16b
        rev32   v15.8h, v15.8h
        rev32   v12.8h, v12.8h
        rev32   v13.8h, v13.8h
        rev32   v14.8h, v14.8h
        # c += d; b ^= c; b <<<= 12;
        add     v10.4s, v10.4s, v15.4s
        add     v11.4s, v11.4s, v12.4s
        add     v8.4s, v8.4s, v13.4s
        add     v9.4s, v9.4s, v14.4s
        eor     v20.16b, v5.16b, v10.16b
        eor     v21.16b, v6.16b, v11.16b
        eor     v22.16b, v7.16b, v8.16b
        eor     v23.16b, v4.16b, v9.16b
        shl     v5.4s, v20.4s, #12
        shl     v6.4s, v21.4s, #12
        shl     v7.4s, v22.4s, #12
        shl     v4.4s, v23.4s, #12
        sri     v5.4s, v20.4s, #20
        sri     v6.4s, v21.4s, #20
        sri     v7.4s, v22.4s, #20
        sri     v4.4s, v23.4s, #20
        # a += b; d ^= a; d <<<= 8;
        add     v0.4s, v0.4s, v5.4s
        add     v1.4s, v1.4s, v6.4s
        add     v2.4s, v2.4s, v7.4s
        add     v3.4s, v3.4s, v4.4s
        eor     v15.16b, v15.16b, v0.16b
        eor     v12.16b, v12.16b, v1.16b
        eor     v13.16b, v13.16b, v2.16b
        eor     v14.16b, v14.16b, v3.16b
        tbl     v15.16b, {v15.16b}, v30.16b
        tbl     v12.16b, {v12.16b}, v30.16b
        tbl     v13.16b, {v13.16b}, v30.16b
        tbl     v14.16b, {v14.16b}, v30.16b
        # c += d; b ^= c; b <<<= 7;
        add     v10.4s, v10.4s, v15.4s
        add     v11.4s, v11.4s, v12.4s
        add     v8.4s, v8.4s, v13.4s
        add     v9.4s, v9.4s, v14.4s
        eor     v20.16b, v5.16b, v10.16b
        eor     v21.16b, v6.16b, v11.16b
        eor     v22.16b, v7.16b, v8.16b
        eor     v23.16b, v4.16b, v9.16b
        shl     v5.4s, v20.4s, #7
        shl     v6.4s, v21.4s, #7
        shl     v7.4s, v22.4s, #7
        shl     v4.4s, v23.4s, #7
        sri     v5.4s, v20.4s, #25
        sri     v6.4s, v21.4s, #25
        sri     v7.4s, v22.4s, #25
        sri     v4.4s, v23.4s, #25
        bne     L_chacha_crypt_bytes_arm64_round_start_256
        # Counter step for this chunk: 4 blocks
        mov     x26, #4
        # Add counter now rather than after transposed
        add     v12.4s, v12.4s, v28.4s
        # Load message
        ld1     {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #0x40
        # Transpose vectors
        trn1    v20.4s, v0.4s, v1.4s
        trn1    v22.4s, v2.4s, v3.4s
        trn2    v21.4s, v0.4s, v1.4s
        trn2    v23.4s, v2.4s, v3.4s
        trn1    v0.2d, v20.2d, v22.2d
        trn1    v1.2d, v21.2d, v23.2d
        trn2    v2.2d, v20.2d, v22.2d
        trn2    v3.2d, v21.2d, v23.2d
        trn1    v20.4s, v4.4s, v5.4s
        trn1    v22.4s, v6.4s, v7.4s
        trn2    v21.4s, v4.4s, v5.4s
        trn2    v23.4s, v6.4s, v7.4s
        trn1    v4.2d, v20.2d, v22.2d
        trn1    v5.2d, v21.2d, v23.2d
        trn2    v6.2d, v20.2d, v22.2d
        trn2    v7.2d, v21.2d, v23.2d
        trn1    v20.4s, v8.4s, v9.4s
        trn1    v22.4s, v10.4s, v11.4s
        trn2    v21.4s, v8.4s, v9.4s
        trn2    v23.4s, v10.4s, v11.4s
        trn1    v8.2d, v20.2d, v22.2d
        trn1    v9.2d, v21.2d, v23.2d
        trn2    v10.2d, v20.2d, v22.2d
        trn2    v11.2d, v21.2d, v23.2d
        trn1    v20.4s, v12.4s, v13.4s
        trn1    v22.4s, v14.4s, v15.4s
        trn2    v21.4s, v12.4s, v13.4s
        trn2    v23.4s, v14.4s, v15.4s
        trn1    v12.2d, v20.2d, v22.2d
        trn1    v13.2d, v21.2d, v23.2d
        trn2    v14.2d, v20.2d, v22.2d
        trn2    v15.2d, v21.2d, v23.2d
        # Add back state, XOR in message and store (load next block)
        add     v20.4s, v0.4s, v16.4s
        add     v21.4s, v4.4s, v17.4s
        add     v22.4s, v8.4s, v18.4s
        add     v23.4s, v12.4s, v19.4s
        eor     v20.16b, v20.16b, v24.16b
        eor     v21.16b, v21.16b, v25.16b
        eor     v22.16b, v22.16b, v26.16b
        eor     v23.16b, v23.16b, v27.16b
        ld1     {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #0x40
        st1     {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], #0x40
        add     v20.4s, v1.4s, v16.4s
        add     v21.4s, v5.4s, v17.4s
        add     v22.4s, v9.4s, v18.4s
        add     v23.4s, v13.4s, v19.4s
        eor     v20.16b, v20.16b, v24.16b
        eor     v21.16b, v21.16b, v25.16b
        eor     v22.16b, v22.16b, v26.16b
        eor     v23.16b, v23.16b, v27.16b
        ld1     {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #0x40
        st1     {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], #0x40
        add     v20.4s, v2.4s, v16.4s
        add     v21.4s, v6.4s, v17.4s
        add     v22.4s, v10.4s, v18.4s
        add     v23.4s, v14.4s, v19.4s
        eor     v20.16b, v20.16b, v24.16b
        eor     v21.16b, v21.16b, v25.16b
        eor     v22.16b, v22.16b, v26.16b
        eor     v23.16b, v23.16b, v27.16b
        ld1     {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #0x40
        st1     {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], #0x40
        add     v20.4s, v3.4s, v16.4s
        add     v21.4s, v7.4s, v17.4s
        add     v22.4s, v11.4s, v18.4s
        add     v23.4s, v15.4s, v19.4s
        eor     v20.16b, v20.16b, v24.16b
        eor     v21.16b, v21.16b, v25.16b
        eor     v22.16b, v22.16b, v26.16b
        eor     v23.16b, v23.16b, v27.16b
        st1     {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], #0x40
        mov     v29.s[0], w26
        sub     x3, x3, #0x100
        add     v19.4s, v19.4s, v29.4s
        # Done 256-byte block
L_chacha_crypt_bytes_arm64_lt_256:
        cmp     x3, #0x80
        blt     L_chacha_crypt_bytes_arm64_lt_128
        ld1     {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #0x40
        # Move state into vector registers
        mov     v4.16b, v16.16b
        mov     v5.16b, v17.16b
        mov     v6.16b, v18.16b
        mov     v7.16b, v19.16b
        mov     v0.16b, v16.16b
        mov     v1.16b, v17.16b
        mov     v2.16b, v18.16b
        mov     v3.16b, v19.16b
        # Add counter word
        add     v7.4s, v7.4s, v31.4s
        # Set number of odd+even rounds to perform
        mov     x26, #10
L_chacha_crypt_bytes_arm64_round_start_128:
        subs    x26, x26, #1
        # Round odd
        # a += b; d ^= a; d <<<= 16;
        add     v0.4s, v0.4s, v1.4s
        add     v4.4s, v4.4s, v5.4s
        eor     v3.16b, v3.16b, v0.16b
        eor     v7.16b, v7.16b, v4.16b
        rev32   v3.8h, v3.8h
        rev32   v7.8h, v7.8h
        # c += d; b ^= c; b <<<= 12;
        add     v2.4s, v2.4s, v3.4s
        add     v6.4s, v6.4s, v7.4s
        eor     v20.16b, v1.16b, v2.16b
        eor     v21.16b, v5.16b, v6.16b
        shl     v1.4s, v20.4s, #12
        shl     v5.4s, v21.4s, #12
        sri     v1.4s, v20.4s, #20
        sri     v5.4s, v21.4s, #20
        # a += b; d ^= a; d <<<= 8;
        add     v0.4s, v0.4s, v1.4s
        add     v4.4s, v4.4s, v5.4s
        eor     v3.16b, v3.16b, v0.16b
        eor     v7.16b, v7.16b, v4.16b
        tbl     v3.16b, {v3.16b}, v30.16b
        tbl     v7.16b, {v7.16b}, v30.16b
        # c += d; b ^= c; b <<<= 7;
        add     v2.4s, v2.4s, v3.4s
        add     v6.4s, v6.4s, v7.4s
        eor     v20.16b, v1.16b, v2.16b
        eor     v21.16b, v5.16b, v6.16b
        shl     v1.4s, v20.4s, #7
        shl     v5.4s, v21.4s, #7
        sri     v1.4s, v20.4s, #25
        sri     v5.4s, v21.4s, #25
        # Rotate rows b, c, d so the diagonals line up as columns
        ext     v3.16b, v3.16b, v3.16b, #12
        ext     v7.16b, v7.16b, v7.16b, #12
        ext     v1.16b, v1.16b, v1.16b, #4
        ext     v5.16b, v5.16b, v5.16b, #4
        ext     v2.16b, v2.16b, v2.16b, #8
        ext     v6.16b, v6.16b, v6.16b, #8
        # Round even
        # a += b; d ^= a; d <<<= 16;
        add     v0.4s, v0.4s, v1.4s
        add     v4.4s, v4.4s, v5.4s
        eor     v3.16b, v3.16b, v0.16b
        eor     v7.16b, v7.16b, v4.16b
        rev32   v3.8h, v3.8h
        rev32   v7.8h, v7.8h
        # c += d; b ^= c; b <<<= 12;
        add     v2.4s, v2.4s, v3.4s
        add     v6.4s, v6.4s, v7.4s
        eor     v20.16b, v1.16b, v2.16b
        eor     v21.16b, v5.16b, v6.16b
        shl     v1.4s, v20.4s, #12
        shl     v5.4s, v21.4s, #12
        sri     v1.4s, v20.4s, #20
        sri     v5.4s, v21.4s, #20
        # a += b; d ^= a; d <<<= 8;
        add     v0.4s, v0.4s, v1.4s
        add     v4.4s, v4.4s, v5.4s
        eor     v3.16b, v3.16b, v0.16b
        eor     v7.16b, v7.16b, v4.16b
        tbl     v3.16b, {v3.16b}, v30.16b
        tbl     v7.16b, {v7.16b}, v30.16b
        # c += d; b ^= c; b <<<= 7;
        add     v2.4s, v2.4s, v3.4s
        add     v6.4s, v6.4s, v7.4s
        eor     v20.16b, v1.16b, v2.16b
        eor     v21.16b, v5.16b, v6.16b
        shl     v1.4s, v20.4s, #7
        shl     v5.4s, v21.4s, #7
        sri     v1.4s, v20.4s, #25
        sri     v5.4s, v21.4s, #25
        # Rotate rows back to column order
        ext     v3.16b, v3.16b, v3.16b, #4
        ext     v7.16b, v7.16b, v7.16b, #4
        ext     v1.16b, v1.16b, v1.16b, #12
        ext     v5.16b, v5.16b, v5.16b, #12
        ext     v2.16b, v2.16b, v2.16b, #8
        ext     v6.16b, v6.16b, v6.16b, #8
        bne     L_chacha_crypt_bytes_arm64_round_start_128
        # Add back state, XOR in message and store (load next block)
        add     v0.4s, v0.4s, v16.4s
        add     v1.4s, v1.4s, v17.4s
        add     v2.4s, v2.4s, v18.4s
        add     v3.4s, v3.4s, v19.4s
        eor     v24.16b, v24.16b, v0.16b
        eor     v25.16b, v25.16b, v1.16b
        eor     v26.16b, v26.16b, v2.16b
        eor     v27.16b, v27.16b, v3.16b
        ld1     {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], #0x40
        st1     {v24.16b, v25.16b, v26.16b, v27.16b}, [x1], #0x40
        # Counter of the second block, needed before adding the state back
        add     v19.4s, v19.4s, v31.4s
        add     v4.4s, v4.4s, v16.4s
        add     v5.4s, v5.4s, v17.4s
        add     v6.4s, v6.4s, v18.4s
        add     v7.4s, v7.4s, v19.4s
        eor     v20.16b, v20.16b, v4.16b
        eor     v21.16b, v21.16b, v5.16b
        eor     v22.16b, v22.16b, v6.16b
        eor     v23.16b, v23.16b, v7.16b
        st1     {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], #0x40
        add     v19.4s, v19.4s, v31.4s
        sub     x3, x3, #0x80
        # Done 128-byte block
L_chacha_crypt_bytes_arm64_lt_128:
        cmp     x3, #0
        beq     L_chacha_crypt_bytes_arm64_done_all
        # Block size, used to compute the unused key-stream byte count
        mov     w5, #0x40
L_chacha_crypt_bytes_arm64_loop_64:
        # Move state into vector registers
        mov     v0.16b, v16.16b
        mov     v1.16b, v17.16b
        mov     v2.16b, v18.16b
        mov     v3.16b, v19.16b
        # Set number of odd+even rounds to perform
        mov     x26, #10
L_chacha_crypt_bytes_arm64_round_64:
        subs    x26, x26, #1
        # Round odd
        # a += b; d ^= a; d <<<= 16;
        add     v0.4s, v0.4s, v1.4s
        eor     v3.16b, v3.16b, v0.16b
        rev32   v3.8h, v3.8h
        # c += d; b ^= c; b <<<= 12;
        add     v2.4s, v2.4s, v3.4s
        eor     v20.16b, v1.16b, v2.16b
        shl     v1.4s, v20.4s, #12
        sri     v1.4s, v20.4s, #20
        # a += b; d ^= a; d <<<= 8;
        add     v0.4s, v0.4s, v1.4s
        eor     v3.16b, v3.16b, v0.16b
        tbl     v3.16b, {v3.16b}, v30.16b
        # c += d; b ^= c; b <<<= 7;
        add     v2.4s, v2.4s, v3.4s
        eor     v20.16b, v1.16b, v2.16b
        shl     v1.4s, v20.4s, #7
        sri     v1.4s, v20.4s, #25
        ext     v3.16b, v3.16b, v3.16b, #12
        ext     v1.16b, v1.16b, v1.16b, #4
        ext     v2.16b, v2.16b, v2.16b, #8
        # Round even
        # a += b; d ^= a; d <<<= 16;
        add     v0.4s, v0.4s, v1.4s
        eor     v3.16b, v3.16b, v0.16b
        rev32   v3.8h, v3.8h
        # c += d; b ^= c; b <<<= 12;
        add     v2.4s, v2.4s, v3.4s
        eor     v20.16b, v1.16b, v2.16b
        shl     v1.4s, v20.4s, #12
        sri     v1.4s, v20.4s, #20
        # a += b; d ^= a; d <<<= 8;
        add     v0.4s, v0.4s, v1.4s
        eor     v3.16b, v3.16b, v0.16b
        tbl     v3.16b, {v3.16b}, v30.16b
        # c += d; b ^= c; b <<<= 7;
        add     v2.4s, v2.4s, v3.4s
        eor     v20.16b, v1.16b, v2.16b
        shl     v1.4s, v20.4s, #7
        sri     v1.4s, v20.4s, #25
        ext     v3.16b, v3.16b, v3.16b, #4
        ext     v1.16b, v1.16b, v1.16b, #12
        ext     v2.16b, v2.16b, v2.16b, #8
        bne     L_chacha_crypt_bytes_arm64_round_64
        # Add back state
        add     v0.4s, v0.4s, v16.4s
        add     v1.4s, v1.4s, v17.4s
        add     v2.4s, v2.4s, v18.4s
        add     v3.4s, v3.4s, v19.4s
        # Check if data is less than 64 bytes - store in over
        cmp     x3, #0x40
        add     v19.4s, v19.4s, v31.4s
        blt     L_chacha_crypt_bytes_arm64_lt_64
        # Encipher 64 bytes
        ld1     {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #0x40
        eor     v24.16b, v24.16b, v0.16b
        eor     v25.16b, v25.16b, v1.16b
        eor     v26.16b, v26.16b, v2.16b
        eor     v27.16b, v27.16b, v3.16b
        st1     {v24.16b, v25.16b, v26.16b, v27.16b}, [x1], #0x40
        # Check for more bytes to be enciphered
        subs    x3, x3, #0x40
        bne     L_chacha_crypt_bytes_arm64_loop_64
        b       L_chacha_crypt_bytes_arm64_done
L_chacha_crypt_bytes_arm64_lt_64:
        # Calculate bytes left in block not used
        sub     w5, w5, w3
        # Store encipher block in over for further operations and left
        st1     {v0.4s, v1.4s, v2.4s, v3.4s}, [x4]
        str     w5, [x0, #64]
        # Encipher 32 bytes
        cmp     x3, #32
        blt     L_chacha_crypt_bytes_arm64_lt_32
        ld1     {v24.16b, v25.16b}, [x2], #32
        eor     v24.16b, v24.16b, v0.16b
        eor     v25.16b, v25.16b, v1.16b
        st1     {v24.16b, v25.16b}, [x1], #32
        subs    x3, x3, #32
        # Shift the unused key stream down (vector moves keep the flags)
        mov     v0.16b, v2.16b
        mov     v1.16b, v3.16b
        beq     L_chacha_crypt_bytes_arm64_done
L_chacha_crypt_bytes_arm64_lt_32:
        cmp     x3, #16
        blt     L_chacha_crypt_bytes_arm64_lt_16
        # Encipher 16 bytes
        ld1     {v24.16b}, [x2], #16
        eor     v24.16b, v24.16b, v0.16b
        st1     {v24.16b}, [x1], #16
        subs    x3, x3, #16
        mov     v0.16b, v1.16b
        beq     L_chacha_crypt_bytes_arm64_done
L_chacha_crypt_bytes_arm64_lt_16:
        cmp     x3, #8
        blt     L_chacha_crypt_bytes_arm64_lt_8
        # Encipher 8 bytes
        ld1     {v24.8b}, [x2], #8
        eor     v24.8b, v24.8b, v0.8b
        st1     {v24.8b}, [x1], #8
        subs    x3, x3, #8
        mov     v0.d[0], v0.d[1]
        beq     L_chacha_crypt_bytes_arm64_done
L_chacha_crypt_bytes_arm64_lt_8:
        mov     x5, v0.d[0]
L_chacha_crypt_bytes_arm64_loop_lt_8:
        # Encipher 1 byte at a time
        ldrb    w6, [x2], #1
        eor     w6, w6, w5
        strb    w6, [x1], #1
        subs    x3, x3, #1
        lsr     x5, x5, #8
        bgt     L_chacha_crypt_bytes_arm64_loop_lt_8
L_chacha_crypt_bytes_arm64_done:
L_chacha_crypt_bytes_arm64_done_all:
        # Write the state, including the advanced counter, back to the context
        st1     {v16.4s, v17.4s, v18.4s, v19.4s}, [x0]
        ldp     x17, x19, [x29, #24]
        ldp     x20, x21, [x29, #40]
        ldp     x22, x23, [x29, #56]
        ldp     x24, x25, [x29, #72]
        ldr     x26, [x29, #88]
        ldp     d8, d9, [x29, #96]
        ldp     d10, d11, [x29, #112]
        ldp     d12, d13, [x29, #128]
        ldp     d14, d15, [x29, #144]
        ldp     x29, x30, [sp], #0xa0
        ret
#ifndef __APPLE__
        .size   wc_chacha_crypt_bytes,.-wc_chacha_crypt_bytes
#endif /* __APPLE__ */
/*-----------------------------------------------------------------------
 * wc_chacha_setiv
 *
 * Writes the block counter and a 12-byte IV into state words 12..15.
 *
 * In:    x0 = context
 *        x1 = IV, 12 bytes are read
 *        w2 = 32-bit block counter
 * Out:   [x0 + 48] = counter, [x0 + 52 .. x0 + 63] = IV bytes
 * Uses:  x3, x4. Leaf function, no stack use.
 *-----------------------------------------------------------------------*/
#ifndef __APPLE__
.text
.globl  wc_chacha_setiv
.type   wc_chacha_setiv,@function
.align  2
wc_chacha_setiv:
#else
.section        __TEXT,__text
.globl  _wc_chacha_setiv
.p2align        2
_wc_chacha_setiv:
#endif /* __APPLE__ */
        ldr     x3, [x1]
        ldr     w4, [x1, #8]
        /* Store exactly 32 bits: the upper half of x2 is unspecified for a
         * 32-bit argument and must not reach the state. */
        str     w2, [x0, #48]
        str     x3, [x0, #52]
        str     w4, [x0, #60]
        ret
#ifndef __APPLE__
        .size   wc_chacha_setiv,.-wc_chacha_setiv
#endif /* __APPLE__ */
/* State constants: first row "expand 16-byte k", second row
 * "expand 32-byte k" (little-endian words). */
#ifndef __APPLE__
        .text
        .section        .rodata
        .type   L_chacha_setkey_arm64_constant, %object
        .size   L_chacha_setkey_arm64_constant, 32
#else
        .section        __DATA,__data
#endif /* __APPLE__ */
        /* 8-byte aligned, 64-bit aligned */
#ifndef __APPLE__
        .align  3
#else
        .p2align        3
#endif /* __APPLE__ */
L_chacha_setkey_arm64_constant:
        .long   0x61707865,0x3120646e,0x79622d36,0x6b206574
        .long   0x61707865,0x3320646e,0x79622d32,0x6b206574
/*-----------------------------------------------------------------------
 * wc_chacha_setkey
 *
 * Writes the constants and the key into state words 0..11.
 *
 * In:    x0 = context
 *        x1 = key
 *        x2 = key size in bytes; the code distinguishes 16 from any
 *             other value and treats the latter as a 32-byte key
 * Out:   [x0 + 0]  = constant row selected by the key size
 *        [x0 + 16] = key bytes 0..15
 *        [x0 + 32] = key bytes 16..31, or bytes 0..15 again for a
 *                    16-byte key
 * Uses:  x0-x3, v0, v1, flags. Leaf function, no stack use.
 *-----------------------------------------------------------------------*/
#ifndef __APPLE__
.text
.globl  wc_chacha_setkey
.type   wc_chacha_setkey,@function
.align  2
wc_chacha_setkey:
#else
.section        __TEXT,__text
.globl  _wc_chacha_setkey
.p2align        2
_wc_chacha_setkey:
#endif /* __APPLE__ */
#ifndef __APPLE__
        adrp    x3, L_chacha_setkey_arm64_constant
        add     x3, x3, :lo12:L_chacha_setkey_arm64_constant
#else
        adrp    x3, L_chacha_setkey_arm64_constant@PAGE
        add     x3, x3, L_chacha_setkey_arm64_constant@PAGEOFF
#endif /* __APPLE__ */
        /* x2 becomes the offset of the constant row (0 or 16). The Z flag
         * set here is consumed by the BEQ below; nothing in between writes
         * the flags. */
        subs    x2, x2, #16
        add     x3, x3, x2
        # Start with constants
        ld1     {v0.4s}, [x3]
        ld1     {v1.16b}, [x1], #16
#ifdef BIG_ENDIAN_ORDER
        rev32   v1.8h, v1.8h
#endif /* BIG_ENDIAN_ORDER */
        st1     {v0.4s}, [x0], #16
        st1     {v1.4s}, [x0], #16
        beq     L_chacha_setkey_arm64_done
        ld1     {v1.16b}, [x1]
#ifdef BIG_ENDIAN_ORDER
        rev32   v1.8h, v1.8h
#endif /* BIG_ENDIAN_ORDER */
L_chacha_setkey_arm64_done:
        st1     {v1.4s}, [x0]
        ret
#ifndef __APPLE__
        .size   wc_chacha_setkey,.-wc_chacha_setkey
#endif /* __APPLE__ */
/*-----------------------------------------------------------------------
 * wc_chacha_use_over
 *
 * XORs previously generated key-stream bytes into a message.
 *
 * In:    x0 = key-stream bytes
 *        x1 = output buffer
 *        x2 = input buffer
 *        x3 = number of bytes to process (0 is allowed: nothing is done)
 * Out:   output written; x0, x1, x2 advanced past the processed bytes.
 * Uses:  x4, x5, v0, v1, flags. Leaf function, no stack use.
 *
 * Works in 16-byte, then 4-byte, then 1-byte steps.
 *-----------------------------------------------------------------------*/
#ifndef __APPLE__
.text
.globl  wc_chacha_use_over
.type   wc_chacha_use_over,@function
.align  2
wc_chacha_use_over:
#else
.section        __TEXT,__text
.globl  _wc_chacha_use_over
.p2align        2
_wc_chacha_use_over:
#endif /* __APPLE__ */
        /* Without this guard a zero length reaches the byte loop, whose
         * only exit is the count hitting exactly zero. */
        cbz     x3, L_chacha_use_over_arm64_done
L_chacha_use_over_arm64_16byte_loop:
        cmp     x3, #16
        blt     L_chacha_use_over_arm64_word_loop
        # 16 bytes of state XORed into message.
        ld1     {v0.16b}, [x0], #16
        ld1     {v1.16b}, [x2], #16
        eor     v1.16b, v1.16b, v0.16b
        subs    x3, x3, #16
        st1     {v1.16b}, [x1], #16
        beq     L_chacha_use_over_arm64_done
        b       L_chacha_use_over_arm64_16byte_loop
L_chacha_use_over_arm64_word_loop:
        cmp     x3, #4
        blt     L_chacha_use_over_arm64_byte_loop
        # 4 bytes of state XORed into message.
        ldr     w4, [x0], #4
        ldr     w5, [x2], #4
        eor     w5, w5, w4
        subs    x3, x3, #4
        str     w5, [x1], #4
        beq     L_chacha_use_over_arm64_done
        b       L_chacha_use_over_arm64_word_loop
L_chacha_use_over_arm64_byte_loop:
        # 1 byte of state XORed into message.
        ldrb    w4, [x0], #1
        ldrb    w5, [x2], #1
        eor     w5, w5, w4
        subs    x3, x3, #1
        strb    w5, [x1], #1
        beq     L_chacha_use_over_arm64_done
        b       L_chacha_use_over_arm64_byte_loop
L_chacha_use_over_arm64_done:
        ret
#ifndef __APPLE__
        .size   wc_chacha_use_over,.-wc_chacha_use_over
#endif /* __APPLE__ */
#endif /* !WOLFSSL_ARMASM_NO_NEON */
#endif /* HAVE_CHACHA */
#endif /* !WOLFSSL_ARMASM_INLINE */
#endif /* __aarch64__ */

/* Note: this directive sits inside the WOLFSSL_ARMASM conditional. */
#if defined(__linux__) && defined(__ELF__)
.section        .note.GNU-stack,"",%progbits
#endif
#endif /* WOLFSSL_ARMASM */