/* riscv-64-chacha.c * * Copyright (C) 2006-2026 wolfSSL Inc. * * This file is part of wolfSSL. * * wolfSSL is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3 of the License, or * (at your option) any later version. * * wolfSSL is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA */ #include /* The paper NEON crypto by Daniel J. Bernstein and Peter Schwabe was used to * optimize for ARM: * https://cryptojedi.org/papers/veccrypto-20120320.pdf */ #include #ifdef WOLFSSL_RISCV_ASM #ifdef HAVE_CHACHA #include #include #ifdef NO_INLINE #include #else #define WOLFSSL_MISC_INCLUDED #include #endif #ifdef CHACHA_AEAD_TEST #include #endif #ifdef CHACHA_TEST #include #endif /* Number of rounds */ #define ROUNDS 20 #define U32C(v) (v##U) #define U32V(v) ((word32)(v) & U32C(0xFFFFFFFF)) #define U8TO32_LITTLE(p) (((word32*)(p))[0]) #define PLUS(v,w) (U32V((v) + (w))) #define PLUSONE(v) (PLUS((v),1)) #define ARM_SIMD_LEN_BYTES 16 /** * Set up iv(nonce). Earlier versions used 64 bits instead of 96, this version * uses the typical AEAD 96 bit nonce and can do record sizes of 256 GB. */ int wc_Chacha_SetIV(ChaCha* ctx, const byte* inIv, word32 counter) { word32 temp[CHACHA_IV_WORDS];/* used for alignment of memory */ if (ctx == NULL) return BAD_FUNC_ARG; XMEMCPY(temp, inIv, CHACHA_IV_BYTES); ctx->left = 0; ctx->X[CHACHA_IV_BYTES+0] = counter; /* block counter */ ctx->X[CHACHA_IV_BYTES+1] = temp[0]; /* fixed variable from nonce */ ctx->X[CHACHA_IV_BYTES+2] = temp[1]; /* counter from nonce */ ctx->X[CHACHA_IV_BYTES+3] = temp[2]; /* counter from nonce */ return 0; } /* "expand 32-byte k" as unsigned 32 byte */ static const word32 sigma[4] = {0x61707865, 0x3320646e, 0x79622d32, 0x6b206574}; /* "expand 16-byte k" as unsigned 16 byte */ static const word32 tau[4] = {0x61707865, 0x3120646e, 0x79622d36, 0x6b206574}; /** * Key setup. 8 word iv (nonce) */ int wc_Chacha_SetKey(ChaCha* ctx, const byte* key, word32 keySz) { const word32* constants; const byte* k; #ifdef XSTREAM_ALIGN word32 alignKey[8]; #endif if (ctx == NULL) return BAD_FUNC_ARG; if (keySz != (CHACHA_MAX_KEY_SZ/2) && keySz != CHACHA_MAX_KEY_SZ) return BAD_FUNC_ARG; #ifdef XSTREAM_ALIGN if ((wc_ptr_t)key % 4) { WOLFSSL_MSG("wc_ChachaSetKey unaligned key"); XMEMCPY(alignKey, key, keySz); k = (byte*)alignKey; } else { k = key; } #else k = key; #endif /* XSTREAM_ALIGN */ ctx->X[4] = U8TO32_LITTLE(k + 0); ctx->X[5] = U8TO32_LITTLE(k + 4); ctx->X[6] = U8TO32_LITTLE(k + 8); ctx->X[7] = U8TO32_LITTLE(k + 12); if (keySz == CHACHA_MAX_KEY_SZ) { k += 16; constants = sigma; } else { constants = tau; } ctx->X[ 8] = U8TO32_LITTLE(k + 0); ctx->X[ 9] = U8TO32_LITTLE(k + 4); ctx->X[10] = U8TO32_LITTLE(k + 8); ctx->X[11] = U8TO32_LITTLE(k + 12); ctx->X[ 0] = constants[0]; ctx->X[ 1] = constants[1]; ctx->X[ 2] = constants[2]; ctx->X[ 3] = constants[3]; ctx->left = 0; return 0; } #define CC_A0 "a4" #define CC_A1 "a5" #define CC_A2 "a6" #define CC_A3 "a7" #define CC_B0 "t3" #define CC_B1 "t4" #define CC_B2 "t5" #define CC_B3 "t6" #define CC_C0 "s2" #define CC_C1 "s3" #define CC_C2 "s4" #define CC_C3 "s5" #define CC_D0 "s6" #define CC_D1 "s7" #define CC_D2 "s8" #define CC_D3 "s9" #define CC_T0 "t0" #define CC_T1 "t1" #define CC_T2 "t2" #define CC_T3 "s1" #if defined(WOLFSSL_RISCV_VECTOR) static const word32 L_chacha20_vec_inc_first_word[] = { 0x1, 0x0, 0x0, 0x0, }; #ifndef WOLFSSL_RISCV_VECTOR_BASE_BIT_MANIPULATION #define PART_ROUND_ODD_ABD_5(s, sr) \ VADD_VV(REG_V0, REG_V0, REG_V1) \ "add " CC_A0 ", " CC_A0 ", " CC_B0 "\n\t" \ VADD_VV(REG_V4, REG_V4, REG_V5) \ "add " CC_A1 ", " CC_A1 ", " CC_B1 "\n\t" \ VADD_VV(REG_V8, REG_V8, REG_V9) \ "add " CC_A2 ", " CC_A2 ", " CC_B2 "\n\t" \ VADD_VV(REG_V12, REG_V12, REG_V13) \ "add " CC_A3 ", " CC_A3 ", " CC_B3 "\n\t" \ VADD_VV(REG_V16, REG_V16, REG_V17) \ VXOR_VV(REG_V3, REG_V3, REG_V0) \ "xor " CC_D0 ", " CC_D0 ", " CC_A0 "\n\t" \ VXOR_VV(REG_V7, REG_V7, REG_V4) \ "xor " CC_D1 ", " CC_D1 ", " CC_A1 "\n\t" \ VXOR_VV(REG_V11, REG_V11, REG_V8) \ "xor " CC_D2 ", " CC_D2 ", " CC_A2 "\n\t" \ VXOR_VV(REG_V15, REG_V15, REG_V12) \ "xor " CC_D3 ", " CC_D3 ", " CC_A3 "\n\t" \ VXOR_VV(REG_V19, REG_V19, REG_V16) \ VSLL_VI(REG_V20, REG_V3, s) \ "slli " CC_T0 ", " CC_D0 ", " #s "\n\t" \ VSLL_VI(REG_V21, REG_V7, s) \ "slli " CC_T1 ", " CC_D1 ", " #s "\n\t" \ VSLL_VI(REG_V22, REG_V11, s) \ "slli " CC_T2 ", " CC_D2 ", " #s "\n\t" \ VSLL_VI(REG_V23, REG_V15, s) \ "slli " CC_T3 ", " CC_D3 ", " #s "\n\t" \ VSLL_VI(REG_V24, REG_V19, s) \ VSRL_VI(REG_V3, REG_V3, sr) \ "srliw " CC_D0 ", " CC_D0 ", " #sr "\n\t" \ VSRL_VI(REG_V7, REG_V7, sr) \ "srliw " CC_D1 ", " CC_D1 ", " #sr "\n\t" \ VSRL_VI(REG_V11, REG_V11, sr) \ "srliw " CC_D2 ", " CC_D2 ", " #sr "\n\t" \ VSRL_VI(REG_V15, REG_V15, sr) \ "srliw " CC_D3 ", " CC_D3 ", " #sr "\n\t" \ VSRL_VI(REG_V19, REG_V19, sr) \ VOR_VV(REG_V3, REG_V3, REG_V20) \ "or " CC_D0 ", " CC_D0 ", " CC_T0 "\n\t" \ VOR_VV(REG_V7, REG_V7, REG_V21) \ "or " CC_D1 ", " CC_D1 ", " CC_T1 "\n\t" \ VOR_VV(REG_V11, REG_V11, REG_V22) \ "or " CC_D2 ", " CC_D2 ", " CC_T2 "\n\t" \ VOR_VV(REG_V15, REG_V15, REG_V23) \ "or " CC_D3 ", " CC_D3 ", " CC_T3 "\n\t" \ VOR_VV(REG_V19, REG_V19, REG_V24) #define PART_ROUND_ODD_CDB_5(s, sr) \ VADD_VV(REG_V2, REG_V2, REG_V3) \ "add " CC_C0 ", " CC_C0 ", " CC_D0 "\n\t" \ VADD_VV(REG_V6, REG_V6, REG_V7) \ "add " CC_C1 ", " CC_C1 ", " CC_D1 "\n\t" \ VADD_VV(REG_V10, REG_V10, REG_V11) \ "add " CC_C2 ", " CC_C2 ", " CC_D2 "\n\t" \ VADD_VV(REG_V14, REG_V14, REG_V15) \ "add " CC_C3 ", " CC_C3 ", " CC_D3 "\n\t" \ VADD_VV(REG_V18, REG_V18, REG_V19) \ VXOR_VV(REG_V1, REG_V1, REG_V2) \ "xor " CC_B0 ", " CC_B0 ", " CC_C0 "\n\t" \ VXOR_VV(REG_V5, REG_V5, REG_V6) \ "xor " CC_B1 ", " CC_B1 ", " CC_C1 "\n\t" \ VXOR_VV(REG_V9, REG_V9, REG_V10) \ "xor " CC_B2 ", " CC_B2 ", " CC_C2 "\n\t" \ VXOR_VV(REG_V13, REG_V13, REG_V14) \ "xor " CC_B3 ", " CC_B3 ", " CC_C3 "\n\t" \ VXOR_VV(REG_V17, REG_V17, REG_V18) \ VSLL_VI(REG_V20, REG_V1, s) \ "slli " CC_T0 ", " CC_B0 ", " #s "\n\t" \ VSLL_VI(REG_V21, REG_V5, s) \ "slli " CC_T1 ", " CC_B1 ", " #s "\n\t" \ VSLL_VI(REG_V22, REG_V9, s) \ "slli " CC_T2 ", " CC_B2 ", " #s "\n\t" \ VSLL_VI(REG_V23, REG_V13, s) \ "slli " CC_T3 ", " CC_B3 ", " #s "\n\t" \ VSLL_VI(REG_V24, REG_V17, s) \ VSRL_VI(REG_V1, REG_V1, sr) \ "srliw " CC_B0 ", " CC_B0 ", " #sr "\n\t" \ VSRL_VI(REG_V5, REG_V5, sr) \ "srliw " CC_B1 ", " CC_B1 ", " #sr "\n\t" \ VSRL_VI(REG_V9, REG_V9, sr) \ "srliw " CC_B2 ", " CC_B2 ", " #sr "\n\t" \ VSRL_VI(REG_V13, REG_V13, sr) \ "srliw " CC_B3 ", " CC_B3 ", " #sr "\n\t" \ VSRL_VI(REG_V17, REG_V17, sr) \ VOR_VV(REG_V1, REG_V1, REG_V20) \ "or " CC_B0 ", " CC_B0 ", " CC_T0 "\n\t" \ VOR_VV(REG_V5, REG_V5, REG_V21) \ "or " CC_B1 ", " CC_B1 ", " CC_T1 "\n\t" \ VOR_VV(REG_V9, REG_V9, REG_V22) \ "or " CC_B2 ", " CC_B2 ", " CC_T2 "\n\t" \ VOR_VV(REG_V13, REG_V13, REG_V23) \ "or " CC_B3 ", " CC_B3 ", " CC_T3 "\n\t" \ VOR_VV(REG_V17, REG_V17, REG_V24) #define PART_ROUND_EVEN_ABD_5(s, sr) \ VADD_VV(REG_V0, REG_V0, REG_V1) \ "add " CC_A0 ", " CC_A0 ", " CC_B1 "\n\t" \ VADD_VV(REG_V4, REG_V4, REG_V5) \ "add " CC_A1 ", " CC_A1 ", " CC_B2 "\n\t" \ VADD_VV(REG_V8, REG_V8, REG_V9) \ "add " CC_A2 ", " CC_A2 ", " CC_B3 "\n\t" \ VADD_VV(REG_V12, REG_V12, REG_V13) \ "add " CC_A3 ", " CC_A3 ", " CC_B0 "\n\t" \ VADD_VV(REG_V16, REG_V16, REG_V17) \ VXOR_VV(REG_V3, REG_V3, REG_V0) \ "xor " CC_D3 ", " CC_D3 ", " CC_A0 "\n\t" \ VXOR_VV(REG_V7, REG_V7, REG_V4) \ "xor " CC_D0 ", " CC_D0 ", " CC_A1 "\n\t" \ VXOR_VV(REG_V11, REG_V11, REG_V8) \ "xor " CC_D1 ", " CC_D1 ", " CC_A2 "\n\t" \ VXOR_VV(REG_V15, REG_V15, REG_V12) \ "xor " CC_D2 ", " CC_D2 ", " CC_A3 "\n\t" \ VXOR_VV(REG_V19, REG_V19, REG_V16) \ VSLL_VI(REG_V20, REG_V3, s) \ "slli " CC_T0 ", " CC_D3 ", " #s "\n\t" \ VSLL_VI(REG_V21, REG_V7, s) \ "slli " CC_T1 ", " CC_D0 ", " #s "\n\t" \ VSLL_VI(REG_V22, REG_V11, s) \ "slli " CC_T2 ", " CC_D1 ", " #s "\n\t" \ VSLL_VI(REG_V23, REG_V15, s) \ "slli " CC_T3 ", " CC_D2 ", " #s "\n\t" \ VSLL_VI(REG_V24, REG_V19, s) \ VSRL_VI(REG_V3, REG_V3, sr) \ "srliw " CC_D3 ", " CC_D3 ", " #sr "\n\t" \ VSRL_VI(REG_V7, REG_V7, sr) \ "srliw " CC_D0 ", " CC_D0 ", " #sr "\n\t" \ VSRL_VI(REG_V11, REG_V11, sr) \ "srliw " CC_D1 ", " CC_D1 ", " #sr "\n\t" \ VSRL_VI(REG_V15, REG_V15, sr) \ "srliw " CC_D2 ", " CC_D2 ", " #sr "\n\t" \ VSRL_VI(REG_V19, REG_V19, sr) \ VOR_VV(REG_V3, REG_V3, REG_V20) \ "or " CC_D3 ", " CC_D3 ", " CC_T0 "\n\t" \ VOR_VV(REG_V7, REG_V7, REG_V21) \ "or " CC_D0 ", " CC_D0 ", " CC_T1 "\n\t" \ VOR_VV(REG_V11, REG_V11, REG_V22) \ "or " CC_D1 ", " CC_D1 ", " CC_T2 "\n\t" \ VOR_VV(REG_V15, REG_V15, REG_V23) \ "or " CC_D2 ", " CC_D2 ", " CC_T3 "\n\t" \ VOR_VV(REG_V19, REG_V19, REG_V24) #define PART_ROUND_EVEN_CDB_5(s, sr) \ VADD_VV(REG_V2, REG_V2, REG_V3) \ "add " CC_C2 ", " CC_C2 ", " CC_D3 "\n\t" \ VADD_VV(REG_V6, REG_V6, REG_V7) \ "add " CC_C3 ", " CC_C3 ", " CC_D0 "\n\t" \ VADD_VV(REG_V10, REG_V10, REG_V11) \ "add " CC_C0 ", " CC_C0 ", " CC_D1 "\n\t" \ VADD_VV(REG_V14, REG_V14, REG_V15) \ "add " CC_C1 ", " CC_C1 ", " CC_D2 "\n\t" \ VADD_VV(REG_V18, REG_V18, REG_V19) \ VXOR_VV(REG_V1, REG_V1, REG_V2) \ "xor " CC_B1 ", " CC_B1 ", " CC_C2 "\n\t" \ VXOR_VV(REG_V5, REG_V5, REG_V6) \ "xor " CC_B2 ", " CC_B2 ", " CC_C3 "\n\t" \ VXOR_VV(REG_V9, REG_V9, REG_V10) \ "xor " CC_B3 ", " CC_B3 ", " CC_C0 "\n\t" \ VXOR_VV(REG_V13, REG_V13, REG_V14) \ "xor " CC_B0 ", " CC_B0 ", " CC_C1 "\n\t" \ VXOR_VV(REG_V17, REG_V17, REG_V18) \ VSLL_VI(REG_V20, REG_V1, s) \ "slli " CC_T0 ", " CC_B1 ", " #s "\n\t" \ VSLL_VI(REG_V21, REG_V5, s) \ "slli " CC_T1 ", " CC_B2 ", " #s "\n\t" \ VSLL_VI(REG_V22, REG_V9, s) \ "slli " CC_T2 ", " CC_B3 ", " #s "\n\t" \ VSLL_VI(REG_V23, REG_V13, s) \ "slli " CC_T3 ", " CC_B0 ", " #s "\n\t" \ VSLL_VI(REG_V24, REG_V17, s) \ VSRL_VI(REG_V1, REG_V1, sr) \ "srliw " CC_B1 ", " CC_B1 ", " #sr "\n\t" \ VSRL_VI(REG_V5, REG_V5, sr) \ "srliw " CC_B2 ", " CC_B2 ", " #sr "\n\t" \ VSRL_VI(REG_V9, REG_V9, sr) \ "srliw " CC_B3 ", " CC_B3 ", " #sr "\n\t" \ VSRL_VI(REG_V13, REG_V13, sr) \ "srliw " CC_B0 ", " CC_B0 ", " #sr "\n\t" \ VSRL_VI(REG_V17, REG_V17, sr) \ VOR_VV(REG_V1, REG_V1, REG_V20) \ "or " CC_B1 ", " CC_B1 ", " CC_T0 "\n\t" \ VOR_VV(REG_V5, REG_V5, REG_V21) \ "or " CC_B2 ", " CC_B2 ", " CC_T1 "\n\t" \ VOR_VV(REG_V9, REG_V9, REG_V22) \ "or " CC_B3 ", " CC_B3 ", " CC_T2 "\n\t" \ VOR_VV(REG_V13, REG_V13, REG_V23) \ "or " CC_B0 ", " CC_B0 ", " CC_T3 "\n\t" \ VOR_VV(REG_V17, REG_V17, REG_V24) #elif !defined(WOLFSSL_RISCV_BASE_BIT_MANIPULATION ) #define PART_ROUND_ODD_ABD_5(s, sr) \ VADD_VV(REG_V0, REG_V0, REG_V1) \ "add " CC_A0 ", " CC_A0 ", " CC_B0 "\n\t" \ VADD_VV(REG_V4, REG_V4, REG_V5) \ "add " CC_A1 ", " CC_A1 ", " CC_B1 "\n\t" \ VADD_VV(REG_V8, REG_V8, REG_V9) \ "add " CC_A2 ", " CC_A2 ", " CC_B2 "\n\t" \ VADD_VV(REG_V12, REG_V12, REG_V13) \ "add " CC_A3 ", " CC_A3 ", " CC_B3 "\n\t" \ VADD_VV(REG_V16, REG_V16, REG_V17) \ "xor " CC_D0 ", " CC_D0 ", " CC_A0 "\n\t" \ VXOR_VV(REG_V3, REG_V3, REG_V0) \ "xor " CC_D1 ", " CC_D1 ", " CC_A1 "\n\t" \ VXOR_VV(REG_V7, REG_V7, REG_V4) \ "xor " CC_D2 ", " CC_D2 ", " CC_A2 "\n\t" \ VXOR_VV(REG_V11, REG_V11, REG_V8) \ "xor " CC_D3 ", " CC_D3 ", " CC_A3 "\n\t" \ VXOR_VV(REG_V15, REG_V15, REG_V12) \ "slli " CC_T0 ", " CC_D0 ", " #s "\n\t" \ VXOR_VV(REG_V19, REG_V19, REG_V16) \ "slli " CC_T1 ", " CC_D1 ", " #s "\n\t" \ VROR_VI(REG_V3, sr, REG_V3) \ "slli " CC_T2 ", " CC_D2 ", " #s "\n\t" \ VROR_VI(REG_V7, sr, REG_V7) \ "slli " CC_T3 ", " CC_D3 ", " #s "\n\t" \ VROR_VI(REG_V11, sr, REG_V11) \ "srliw " CC_D0 ", " CC_D0 ", " #sr "\n\t" \ VROR_VI(REG_V15, sr, REG_V15) \ "srliw " CC_D1 ", " CC_D1 ", " #sr "\n\t" \ VROR_VI(REG_V19, sr, REG_V19) \ "srliw " CC_D2 ", " CC_D2 ", " #sr "\n\t" \ "srliw " CC_D3 ", " CC_D3 ", " #sr "\n\t" \ "or " CC_D0 ", " CC_D0 ", " CC_T0 "\n\t" \ "or " CC_D1 ", " CC_D1 ", " CC_T1 "\n\t" \ "or " CC_D2 ", " CC_D2 ", " CC_T2 "\n\t" \ "or " CC_D3 ", " CC_D3 ", " CC_T3 "\n\t" #define PART_ROUND_ODD_CDB_5(s, sr) \ VADD_VV(REG_V2, REG_V2, REG_V3) \ "add " CC_C0 ", " CC_C0 ", " CC_D0 "\n\t" \ VADD_VV(REG_V6, REG_V6, REG_V7) \ "add " CC_C1 ", " CC_C1 ", " CC_D1 "\n\t" \ VADD_VV(REG_V10, REG_V10, REG_V11) \ "add " CC_C2 ", " CC_C2 ", " CC_D2 "\n\t" \ VADD_VV(REG_V14, REG_V14, REG_V15) \ "add " CC_C3 ", " CC_C3 ", " CC_D3 "\n\t" \ VADD_VV(REG_V18, REG_V18, REG_V19) \ "xor " CC_B0 ", " CC_B0 ", " CC_C0 "\n\t" \ VXOR_VV(REG_V1, REG_V1, REG_V2) \ "xor " CC_B1 ", " CC_B1 ", " CC_C1 "\n\t" \ VXOR_VV(REG_V5, REG_V5, REG_V6) \ "xor " CC_B2 ", " CC_B2 ", " CC_C2 "\n\t" \ VXOR_VV(REG_V9, REG_V9, REG_V10) \ "xor " CC_B3 ", " CC_B3 ", " CC_C3 "\n\t" \ VXOR_VV(REG_V13, REG_V13, REG_V14) \ "slli " CC_T0 ", " CC_B0 ", " #s "\n\t" \ VXOR_VV(REG_V17, REG_V17, REG_V18) \ "slli " CC_T1 ", " CC_B1 ", " #s "\n\t" \ VROR_VI(REG_V1, sr, REG_V1) \ "slli " CC_T2 ", " CC_B2 ", " #s "\n\t" \ VROR_VI(REG_V5, sr, REG_V5) \ "slli " CC_T3 ", " CC_B3 ", " #s "\n\t" \ VROR_VI(REG_V9, sr, REG_V9) \ "srliw " CC_B0 ", " CC_B0 ", " #sr "\n\t" \ VROR_VI(REG_V13, sr, REG_V13) \ "srliw " CC_B1 ", " CC_B1 ", " #sr "\n\t" \ VROR_VI(REG_V17, sr, REG_V17) \ "srliw " CC_B2 ", " CC_B2 ", " #sr "\n\t" \ "srliw " CC_B3 ", " CC_B3 ", " #sr "\n\t" \ "or " CC_B0 ", " CC_B0 ", " CC_T0 "\n\t" \ "or " CC_B1 ", " CC_B1 ", " CC_T1 "\n\t" \ "or " CC_B2 ", " CC_B2 ", " CC_T2 "\n\t" \ "or " CC_B3 ", " CC_B3 ", " CC_T3 "\n\t" #define PART_ROUND_EVEN_ABD_5(s, sr) \ VADD_VV(REG_V0, REG_V0, REG_V1) \ "add " CC_A0 ", " CC_A0 ", " CC_B1 "\n\t" \ VADD_VV(REG_V4, REG_V4, REG_V5) \ "add " CC_A1 ", " CC_A1 ", " CC_B2 "\n\t" \ VADD_VV(REG_V8, REG_V8, REG_V9) \ "add " CC_A2 ", " CC_A2 ", " CC_B3 "\n\t" \ VADD_VV(REG_V12, REG_V12, REG_V13) \ "add " CC_A3 ", " CC_A3 ", " CC_B0 "\n\t" \ VADD_VV(REG_V16, REG_V16, REG_V17) \ "xor " CC_D3 ", " CC_D3 ", " CC_A0 "\n\t" \ VXOR_VV(REG_V3, REG_V3, REG_V0) \ "xor " CC_D0 ", " CC_D0 ", " CC_A1 "\n\t" \ VXOR_VV(REG_V7, REG_V7, REG_V4) \ "xor " CC_D1 ", " CC_D1 ", " CC_A2 "\n\t" \ VXOR_VV(REG_V11, REG_V11, REG_V8) \ "xor " CC_D2 ", " CC_D2 ", " CC_A3 "\n\t" \ VXOR_VV(REG_V15, REG_V15, REG_V12) \ "slli " CC_T0 ", " CC_D3 ", " #s "\n\t" \ VXOR_VV(REG_V19, REG_V19, REG_V16) \ "slli " CC_T1 ", " CC_D0 ", " #s "\n\t" \ VROR_VI(REG_V3, sr, REG_V3) \ "slli " CC_T2 ", " CC_D1 ", " #s "\n\t" \ VROR_VI(REG_V7, sr, REG_V7) \ "slli " CC_T3 ", " CC_D2 ", " #s "\n\t" \ VROR_VI(REG_V11, sr, REG_V11) \ "srliw " CC_D3 ", " CC_D3 ", " #sr "\n\t" \ VROR_VI(REG_V15, sr, REG_V15) \ "srliw " CC_D0 ", " CC_D0 ", " #sr "\n\t" \ VROR_VI(REG_V19, sr, REG_V19) \ "srliw " CC_D1 ", " CC_D1 ", " #sr "\n\t" \ "srliw " CC_D2 ", " CC_D2 ", " #sr "\n\t" \ "or " CC_D3 ", " CC_D3 ", " CC_T0 "\n\t" \ "or " CC_D0 ", " CC_D0 ", " CC_T1 "\n\t" \ "or " CC_D1 ", " CC_D1 ", " CC_T2 "\n\t" \ "or " CC_D2 ", " CC_D2 ", " CC_T3 "\n\t" #define PART_ROUND_EVEN_CDB_5(s, sr) \ VADD_VV(REG_V2, REG_V2, REG_V3) \ "add " CC_C2 ", " CC_C2 ", " CC_D3 "\n\t" \ VADD_VV(REG_V6, REG_V6, REG_V7) \ "add " CC_C3 ", " CC_C3 ", " CC_D0 "\n\t" \ VADD_VV(REG_V10, REG_V10, REG_V11) \ "add " CC_C0 ", " CC_C0 ", " CC_D1 "\n\t" \ VADD_VV(REG_V14, REG_V14, REG_V15) \ "add " CC_C1 ", " CC_C1 ", " CC_D2 "\n\t" \ VADD_VV(REG_V18, REG_V18, REG_V19) \ "xor " CC_B1 ", " CC_B1 ", " CC_C2 "\n\t" \ VXOR_VV(REG_V1, REG_V1, REG_V2) \ "xor " CC_B2 ", " CC_B2 ", " CC_C3 "\n\t" \ VXOR_VV(REG_V5, REG_V5, REG_V6) \ "xor " CC_B3 ", " CC_B3 ", " CC_C0 "\n\t" \ VXOR_VV(REG_V9, REG_V9, REG_V10) \ "xor " CC_B0 ", " CC_B0 ", " CC_C1 "\n\t" \ VXOR_VV(REG_V13, REG_V13, REG_V14) \ "slli " CC_T0 ", " CC_B1 ", " #s "\n\t" \ VXOR_VV(REG_V17, REG_V17, REG_V18) \ "slli " CC_T1 ", " CC_B2 ", " #s "\n\t" \ VROR_VI(REG_V1, sr, REG_V1) \ "slli " CC_T2 ", " CC_B3 ", " #s "\n\t" \ VROR_VI(REG_V5, sr, REG_V5) \ "slli " CC_T3 ", " CC_B0 ", " #s "\n\t" \ VROR_VI(REG_V9, sr, REG_V9) \ "srliw " CC_B1 ", " CC_B1 ", " #sr "\n\t" \ VROR_VI(REG_V13, sr, REG_V13) \ "srliw " CC_B2 ", " CC_B2 ", " #sr "\n\t" \ VROR_VI(REG_V17, sr, REG_V17) \ "srliw " CC_B3 ", " CC_B3 ", " #sr "\n\t" \ "srliw " CC_B0 ", " CC_B0 ", " #sr "\n\t" \ "or " CC_B1 ", " CC_B1 ", " CC_T0 "\n\t" \ "or " CC_B2 ", " CC_B2 ", " CC_T1 "\n\t" \ "or " CC_B3 ", " CC_B3 ", " CC_T2 "\n\t" \ "or " CC_B0 ", " CC_B0 ", " CC_T3 "\n\t" #else #define PART_ROUND_ODD_ABD_5(s, sr) \ VADD_VV(REG_V0, REG_V0, REG_V1) \ "add " CC_A0 ", " CC_A0 ", " CC_B0 "\n\t" \ VADD_VV(REG_V4, REG_V4, REG_V5) \ "add " CC_A1 ", " CC_A1 ", " CC_B1 "\n\t" \ VADD_VV(REG_V8, REG_V8, REG_V9) \ "add " CC_A2 ", " CC_A2 ", " CC_B2 "\n\t" \ VADD_VV(REG_V12, REG_V12, REG_V13) \ "add " CC_A3 ", " CC_A3 ", " CC_B3 "\n\t" \ VADD_VV(REG_V16, REG_V16, REG_V17) \ VXOR_VV(REG_V3, REG_V3, REG_V0) \ "xor " CC_D0 ", " CC_D0 ", " CC_A0 "\n\t" \ VXOR_VV(REG_V7, REG_V7, REG_V4) \ "xor " CC_D1 ", " CC_D1 ", " CC_A1 "\n\t" \ VXOR_VV(REG_V11, REG_V11, REG_V8) \ "xor " CC_D2 ", " CC_D2 ", " CC_A2 "\n\t" \ VXOR_VV(REG_V15, REG_V15, REG_V12) \ "xor " CC_D3 ", " CC_D3 ", " CC_A3 "\n\t" \ VXOR_VV(REG_V19, REG_V19, REG_V16) \ VROR_VI(REG_V3, sr, REG_V3) \ RORIW(REG_S6, REG_S6, sr) \ VROR_VI(REG_V7, sr, REG_V7) \ RORIW(REG_S7, REG_S7, sr) \ VROR_VI(REG_V11, sr, REG_V11) \ RORIW(REG_S8, REG_S8, sr) \ VROR_VI(REG_V15, sr, REG_V15) \ RORIW(REG_S9, REG_S9, sr) \ VROR_VI(REG_V19, sr, REG_V19) #define PART_ROUND_ODD_CDB_5(s, sr) \ VADD_VV(REG_V2, REG_V2, REG_V3) \ "add " CC_C0 ", " CC_C0 ", " CC_D0 "\n\t" \ VADD_VV(REG_V6, REG_V6, REG_V7) \ "add " CC_C1 ", " CC_C1 ", " CC_D1 "\n\t" \ VADD_VV(REG_V10, REG_V10, REG_V11) \ "add " CC_C2 ", " CC_C2 ", " CC_D2 "\n\t" \ VADD_VV(REG_V14, REG_V14, REG_V15) \ "add " CC_C3 ", " CC_C3 ", " CC_D3 "\n\t" \ VADD_VV(REG_V18, REG_V18, REG_V19) \ VXOR_VV(REG_V1, REG_V1, REG_V2) \ "xor " CC_B0 ", " CC_B0 ", " CC_C0 "\n\t" \ VXOR_VV(REG_V5, REG_V5, REG_V6) \ "xor " CC_B1 ", " CC_B1 ", " CC_C1 "\n\t" \ VXOR_VV(REG_V9, REG_V9, REG_V10) \ "xor " CC_B2 ", " CC_B2 ", " CC_C2 "\n\t" \ VXOR_VV(REG_V13, REG_V13, REG_V14) \ "xor " CC_B3 ", " CC_B3 ", " CC_C3 "\n\t" \ VXOR_VV(REG_V17, REG_V17, REG_V18) \ VROR_VI(REG_V1, sr, REG_V1) \ RORIW(REG_T3, REG_T3, sr) \ VROR_VI(REG_V5, sr, REG_V5) \ RORIW(REG_T4, REG_T4, sr) \ VROR_VI(REG_V9, sr, REG_V9) \ RORIW(REG_T5, REG_T5, sr) \ VROR_VI(REG_V13, sr, REG_V13) \ RORIW(REG_T6, REG_T6, sr) \ VROR_VI(REG_V17, sr, REG_V17) #define PART_ROUND_EVEN_ABD_5(s, sr) \ VADD_VV(REG_V0, REG_V0, REG_V1) \ "add " CC_A0 ", " CC_A0 ", " CC_B1 "\n\t" \ VADD_VV(REG_V4, REG_V4, REG_V5) \ "add " CC_A1 ", " CC_A1 ", " CC_B2 "\n\t" \ VADD_VV(REG_V8, REG_V8, REG_V9) \ "add " CC_A2 ", " CC_A2 ", " CC_B3 "\n\t" \ VADD_VV(REG_V12, REG_V12, REG_V13) \ "add " CC_A3 ", " CC_A3 ", " CC_B0 "\n\t" \ VADD_VV(REG_V16, REG_V16, REG_V17) \ VXOR_VV(REG_V3, REG_V3, REG_V0) \ "xor " CC_D3 ", " CC_D3 ", " CC_A0 "\n\t" \ VXOR_VV(REG_V7, REG_V7, REG_V4) \ "xor " CC_D0 ", " CC_D0 ", " CC_A1 "\n\t" \ VXOR_VV(REG_V11, REG_V11, REG_V8) \ "xor " CC_D1 ", " CC_D1 ", " CC_A2 "\n\t" \ VXOR_VV(REG_V15, REG_V15, REG_V12) \ "xor " CC_D2 ", " CC_D2 ", " CC_A3 "\n\t" \ VXOR_VV(REG_V19, REG_V19, REG_V16) \ VROR_VI(REG_V3, sr, REG_V3) \ RORIW(REG_S9, REG_S9, sr) \ VROR_VI(REG_V7, sr, REG_V7) \ RORIW(REG_S6, REG_S6, sr) \ VROR_VI(REG_V11, sr, REG_V11) \ RORIW(REG_S7, REG_S7, sr) \ VROR_VI(REG_V15, sr, REG_V15) \ RORIW(REG_S8, REG_S8, sr) \ VROR_VI(REG_V19, sr, REG_V19) #define PART_ROUND_EVEN_CDB_5(s, sr) \ VADD_VV(REG_V2, REG_V2, REG_V3) \ "add " CC_C2 ", " CC_C2 ", " CC_D3 "\n\t" \ VADD_VV(REG_V6, REG_V6, REG_V7) \ "add " CC_C3 ", " CC_C3 ", " CC_D0 "\n\t" \ VADD_VV(REG_V10, REG_V10, REG_V11) \ "add " CC_C0 ", " CC_C0 ", " CC_D1 "\n\t" \ VADD_VV(REG_V14, REG_V14, REG_V15) \ "add " CC_C1 ", " CC_C1 ", " CC_D2 "\n\t" \ VADD_VV(REG_V18, REG_V18, REG_V19) \ VXOR_VV(REG_V1, REG_V1, REG_V2) \ "xor " CC_B1 ", " CC_B1 ", " CC_C2 "\n\t" \ VXOR_VV(REG_V5, REG_V5, REG_V6) \ "xor " CC_B2 ", " CC_B2 ", " CC_C3 "\n\t" \ VXOR_VV(REG_V9, REG_V9, REG_V10) \ "xor " CC_B3 ", " CC_B3 ", " CC_C0 "\n\t" \ VXOR_VV(REG_V13, REG_V13, REG_V14) \ "xor " CC_B0 ", " CC_B0 ", " CC_C1 "\n\t" \ VXOR_VV(REG_V17, REG_V17, REG_V18) \ VROR_VI(REG_V1, sr, REG_V1) \ RORIW(REG_T4, REG_T4, sr) \ VROR_VI(REG_V5, sr, REG_V5) \ RORIW(REG_T5, REG_T5, sr) \ VROR_VI(REG_V9, sr, REG_V9) \ RORIW(REG_T6, REG_T6, sr) \ VROR_VI(REG_V13, sr, REG_V13) \ RORIW(REG_T3, REG_T3, sr) \ VROR_VI(REG_V17, sr, REG_V17) #endif #define QUARTER_ROUND_ODD_5() \ /* a += b; d ^= a; d <<<= 16; */ \ PART_ROUND_ODD_ABD_5(16, 16) \ /* c += d; b ^= c; b <<<= 12; */ \ PART_ROUND_ODD_CDB_5(12, 20) \ /* a += b; d ^= a; d <<<= 8; */ \ PART_ROUND_ODD_ABD_5( 8, 24) \ /* c += d; b ^= c; b <<<= 7; */ \ PART_ROUND_ODD_CDB_5( 7, 25) #define QUARTER_ROUND_EVEN_5() \ /* a += b; d ^= a; d <<<= 16; */ \ PART_ROUND_EVEN_ABD_5(16, 16) \ /* c += d; b ^= c; b <<<= 12; */ \ PART_ROUND_EVEN_CDB_5(12, 20) \ /* a += b; d ^= a; d <<<= 8; */ \ PART_ROUND_EVEN_ABD_5( 8, 24) \ /* c += d; b ^= c; b <<<= 7; */ \ PART_ROUND_EVEN_CDB_5( 7, 25) #define SHUFFLE_5(r, t, i) \ VRGATHER_VV(t + 0, i, r + 0) \ VRGATHER_VV(t + 1, i, r + 4) \ VRGATHER_VV(t + 2, i, r + 8) \ VRGATHER_VV(t + 3, i, r + 12) \ VRGATHER_VV(t + 4, i, r + 16) \ VMV_V_V(r + 0, t + 0) \ VMV_V_V(r + 4, t + 1) \ VMV_V_V(r + 8, t + 2) \ VMV_V_V(r + 12, t + 3) \ VMV_V_V(r + 16, t + 4) #define ODD_SHUFFLE_5() \ /* a=0,1,2,3; b=4,5,6,7; c=8,9,10,11; d=12,13,14,15 \ * => a=0,1,2,3; b=5,6,7,4; c=10,11,8,9; d=15,12,13,14 */ \ SHUFFLE_5(REG_V3, REG_V20, REG_V27) \ SHUFFLE_5(REG_V1, REG_V20, REG_V25) \ SHUFFLE_5(REG_V2, REG_V20, REG_V26) #define EVEN_SHUFFLE_5() \ /* a=0,1,2,3; b=5,6,7,4; c=10,11,8,9; d=15,12,13,14 \ * => a=0,1,2,3; b=4,5,6,7; c=8,9,10,11; d=12,13,14,15 */ \ SHUFFLE_5(REG_V3, REG_V20, REG_V25) \ SHUFFLE_5(REG_V1, REG_V20, REG_V27) \ SHUFFLE_5(REG_V2, REG_V20, REG_V26) static WC_INLINE void wc_chacha_encrypt_384(const word32* input, const byte* m, byte* c, word32 bytes) { word64 bytes64 = (word64)bytes; __asm__ __volatile__ ( VSETIVLI(REG_X0, 4, 1, 1, 0b010, 0b000) /* The layout of used vector registers is: * v0-v3 - first block * v4-v7 - second block * v8-v11 - third block * v12-v15 - fourth block * v16-v19 - fifth block * v20-v24 - temp/message * v25-v27 - indices for rotating words in vector * v28-v31 - input * * v0 0 1 2 3 * v1 4 5 6 7 * v2 8 9 10 11 * v3 12 13 14 15 * load CHACHA state with indices placed as shown above */ /* Load state to encrypt */ "mv t2, %[input]\n\t" VL4RE32_V(REG_V28, REG_T2) VID_V(REG_V20) VSLIDEDOWN_VI(REG_V25, REG_V20, 1) VSLIDEUP_VI(REG_V25, REG_V20, 3) VSLIDEDOWN_VI(REG_V26, REG_V20, 2) VSLIDEUP_VI(REG_V26, REG_V20, 2) VSLIDEDOWN_VI(REG_V27, REG_V20, 3) VSLIDEUP_VI(REG_V27, REG_V20, 1) "\n" "L_chacha20_riscv_384_outer:\n\t" /* Move state into regular registers */ "ld a4, 0(%[input])\n\t" "ld a6, 8(%[input])\n\t" "ld t3, 16(%[input])\n\t" "ld t5, 24(%[input])\n\t" "ld s2, 32(%[input])\n\t" "ld s4, 40(%[input])\n\t" "lw s7, 52(%[input])\n\t" "ld s8, 56(%[input])\n\t" "srli a5, a4, 32\n\t" "srli a7, a6, 32\n\t" "srli t4, t3, 32\n\t" "srli t6, t5, 32\n\t" "srli s3, s2, 32\n\t" "srli s5, s4, 32\n\t" "srli s9, s8, 32\n\t" VMV_X_S(REG_S6, REG_V31) /* Move state into vector registers */ VMVR_V(REG_V0, REG_V28, 4) VMVR_V(REG_V4, REG_V28, 4) VMVR_V(REG_V8, REG_V28, 4) VMVR_V(REG_V12, REG_V28, 4) VMVR_V(REG_V16, REG_V28, 4) /* Set counter word */ "addi t1, s6, 1\n\t" VMV_S_X(REG_V7, REG_T1) "addi t1, s6, 2\n\t" VMV_S_X(REG_V11, REG_T1) "addi t1, s6, 3\n\t" VMV_S_X(REG_V15, REG_T1) "addi t1, s6, 4\n\t" VMV_S_X(REG_V19, REG_T1) "addi s6, s6, 5\n\t" /* Set number of odd+even rounds to perform */ "li a3, 10\n\t" "\n" "L_chacha20_riscv_384_loop:\n\t" /* Odd Round */ QUARTER_ROUND_ODD_5() ODD_SHUFFLE_5() /* Even Round */ QUARTER_ROUND_EVEN_5() EVEN_SHUFFLE_5() "addi a3, a3, -1\n\t" "bnez a3, L_chacha20_riscv_384_loop\n\t" /* Load message */ "mv t2, %[m]\n\t" VL4RE32_V(REG_V20, REG_T2) "addi %[m], %[m], 64\n\t" /* Add back state, XOR in message and store (load next block) */ /* BLOCK 1 */ VADD_VV(REG_V0, REG_V0, REG_V28) VADD_VV(REG_V1, REG_V1, REG_V29) VADD_VV(REG_V2, REG_V2, REG_V30) VADD_VV(REG_V3, REG_V3, REG_V31) VXOR_VV(REG_V0, REG_V0, REG_V20) VXOR_VV(REG_V1, REG_V1, REG_V21) VXOR_VV(REG_V2, REG_V2, REG_V22) VXOR_VV(REG_V3, REG_V3, REG_V23) "mv t2, %[m]\n\t" VL4RE32_V(REG_V20, REG_T2) "addi %[m], %[m], 64\n\t" VMV_X_S(REG_T0, REG_V31) "mv t2, %[c]\n\t" VS4R_V(REG_V0, REG_T2) "addi %[c], %[c], 64\n\t" /* BLOCK 2 */ "addi t0, t0, 1\n\t" VMV_S_X(REG_V31, REG_T0) VADD_VV(REG_V4, REG_V4, REG_V28) VADD_VV(REG_V5, REG_V5, REG_V29) VADD_VV(REG_V6, REG_V6, REG_V30) VADD_VV(REG_V7, REG_V7, REG_V31) VXOR_VV(REG_V4, REG_V4, REG_V20) VXOR_VV(REG_V5, REG_V5, REG_V21) VXOR_VV(REG_V6, REG_V6, REG_V22) VXOR_VV(REG_V7, REG_V7, REG_V23) "mv t2, %[m]\n\t" VL4RE32_V(REG_V20, REG_T2) "addi %[m], %[m], 64\n\t" "mv t2, %[c]\n\t" VS4R_V(REG_V4, REG_T2) "addi %[c], %[c], 64\n\t" /* BLOCK 3 */ "addi t0, t0, 1\n\t" VMV_S_X(REG_V31, REG_T0) VADD_VV(REG_V8, REG_V8, REG_V28) VADD_VV(REG_V9, REG_V9, REG_V29) VADD_VV(REG_V10, REG_V10, REG_V30) VADD_VV(REG_V11, REG_V11, REG_V31) VXOR_VV(REG_V8, REG_V8, REG_V20) VXOR_VV(REG_V9, REG_V9, REG_V21) VXOR_VV(REG_V10, REG_V10, REG_V22) VXOR_VV(REG_V11, REG_V11, REG_V23) "mv t2, %[m]\n\t" VL4RE32_V(REG_V20, REG_T2) "addi %[m], %[m], 64\n\t" "mv t2, %[c]\n\t" VS4R_V(REG_V8, REG_T2) "addi %[c], %[c], 64\n\t" /* BLOCK 4 */ "addi t0, t0, 1\n\t" VMV_S_X(REG_V31, REG_T0) VADD_VV(REG_V12, REG_V12, REG_V28) VADD_VV(REG_V13, REG_V13, REG_V29) VADD_VV(REG_V14, REG_V14, REG_V30) VADD_VV(REG_V15, REG_V15, REG_V31) VXOR_VV(REG_V12, REG_V12, REG_V20) VXOR_VV(REG_V13, REG_V13, REG_V21) VXOR_VV(REG_V14, REG_V14, REG_V22) VXOR_VV(REG_V15, REG_V15, REG_V23) "mv t2, %[m]\n\t" VL4RE32_V(REG_V20, REG_T2) "addi %[m], %[m], 64\n\t" "mv t2, %[c]\n\t" VS4R_V(REG_V12, REG_T2) "addi %[c], %[c], 64\n\t" /* BLOCK 5 */ "addi t0, t0, 1\n\t" VMV_S_X(REG_V31, REG_T0) VADD_VV(REG_V16, REG_V16, REG_V28) VADD_VV(REG_V17, REG_V17, REG_V29) VADD_VV(REG_V18, REG_V18, REG_V30) VADD_VV(REG_V19, REG_V19, REG_V31) VXOR_VV(REG_V16, REG_V16, REG_V20) VXOR_VV(REG_V17, REG_V17, REG_V21) VXOR_VV(REG_V18, REG_V18, REG_V22) VXOR_VV(REG_V19, REG_V19, REG_V23) "mv t2, %[m]\n\t" VL4RE32_V(REG_V20, REG_T2) "addi %[m], %[m], 64\n\t" "mv t2, %[c]\n\t" VS4R_V(REG_V16, REG_T2) "addi %[c], %[c], 64\n\t" /* BLOCK 6 */ /* Move regular registers into vector registers for adding and xor */ "addi t0, t0, 1\n\t" VMV_S_X(REG_V0, REG_A4) VMV_S_X(REG_V1, REG_T3) VMV_S_X(REG_V2, REG_S2) VMV_S_X(REG_V3, REG_S6) VMV_S_X(REG_V4, REG_A5) VMV_S_X(REG_V5, REG_T4) VMV_S_X(REG_V6, REG_S3) VMV_S_X(REG_V7, REG_S7) VSLIDEUP_VI(REG_V0, REG_V4, 1) VSLIDEUP_VI(REG_V1, REG_V5, 1) VSLIDEUP_VI(REG_V2, REG_V6, 1) VSLIDEUP_VI(REG_V3, REG_V7, 1) VMV_S_X(REG_V4, REG_A6) VMV_S_X(REG_V5, REG_T5) VMV_S_X(REG_V6, REG_S4) VMV_S_X(REG_V7, REG_S8) VSLIDEUP_VI(REG_V0, REG_V4, 2) VSLIDEUP_VI(REG_V1, REG_V5, 2) VSLIDEUP_VI(REG_V2, REG_V6, 2) VSLIDEUP_VI(REG_V3, REG_V7, 2) VMV_S_X(REG_V4, REG_A7) VMV_S_X(REG_V5, REG_T6) VMV_S_X(REG_V6, REG_S5) VMV_S_X(REG_V7, REG_S9) VSLIDEUP_VI(REG_V0, REG_V4, 3) VSLIDEUP_VI(REG_V1, REG_V5, 3) VSLIDEUP_VI(REG_V2, REG_V6, 3) VSLIDEUP_VI(REG_V3, REG_V7, 3) VMV_S_X(REG_V31, REG_T0) /* Add back state, XOR in message and store */ VADD_VV(REG_V0, REG_V0, REG_V28) VADD_VV(REG_V1, REG_V1, REG_V29) VADD_VV(REG_V2, REG_V2, REG_V30) VADD_VV(REG_V3, REG_V3, REG_V31) VXOR_VV(REG_V0, REG_V0, REG_V20) VXOR_VV(REG_V1, REG_V1, REG_V21) VXOR_VV(REG_V2, REG_V2, REG_V22) VXOR_VV(REG_V3, REG_V3, REG_V23) "mv t2, %[c]\n\t" VS4R_V(REG_V0, REG_T2) "addi %[c], %[c], 64\n\t" "addi %[bytes], %[bytes], -384\n\t" "addi t0, t0, 1\n\t" VMV_S_X(REG_V31, REG_T0) "bnez %[bytes], L_chacha20_riscv_384_outer\n\t" : [m] "+r" (m), [c] "+r" (c), [bytes] "+r" (bytes64) : [input] "r" (input) : "memory", "t0", "t1", "t2", "s1", "a3", "t3", "t4", "t5", "t6", "a4", "a5", "a6", "a7", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9" ); } #ifndef WOLFSSL_RISCV_VECTOR_BASE_BIT_MANIPULATION #define PART_ROUND_ODD_ABD(s, sr) \ "add " CC_A0 ", " CC_A0 ", " CC_B0 "\n\t" \ VADD_VV(REG_V0, REG_V0, REG_V1) \ "add " CC_A1 ", " CC_A1 ", " CC_B1 "\n\t" \ VADD_VV(REG_V4, REG_V4, REG_V5) \ "add " CC_A2 ", " CC_A2 ", " CC_B2 "\n\t" \ VADD_VV(REG_V8, REG_V8, REG_V9) \ "add " CC_A3 ", " CC_A3 ", " CC_B3 "\n\t" \ VXOR_VV(REG_V3, REG_V3, REG_V0) \ "xor " CC_D0 ", " CC_D0 ", " CC_A0 "\n\t" \ VXOR_VV(REG_V7, REG_V7, REG_V4) \ "xor " CC_D1 ", " CC_D1 ", " CC_A1 "\n\t" \ VXOR_VV(REG_V11, REG_V11, REG_V8) \ "xor " CC_D2 ", " CC_D2 ", " CC_A2 "\n\t" \ VSLL_VI(REG_V20, REG_V3, s) \ "xor " CC_D3 ", " CC_D3 ", " CC_A3 "\n\t" \ VSLL_VI(REG_V21, REG_V7, s) \ "slli " CC_T0 ", " CC_D0 ", " #s "\n\t" \ VSLL_VI(REG_V22, REG_V11, s) \ "slli " CC_T1 ", " CC_D1 ", " #s "\n\t" \ VSRL_VI(REG_V3, REG_V3, sr) \ "slli " CC_T2 ", " CC_D2 ", " #s "\n\t" \ VSRL_VI(REG_V7, REG_V7, sr) \ "slli " CC_T3 ", " CC_D3 ", " #s "\n\t" \ VSRL_VI(REG_V11, REG_V11, sr) \ "srliw " CC_D0 ", " CC_D0 ", " #sr "\n\t" \ VOR_VV(REG_V3, REG_V3, REG_V20) \ "srliw " CC_D1 ", " CC_D1 ", " #sr "\n\t" \ VOR_VV(REG_V7, REG_V7, REG_V21) \ "srliw " CC_D2 ", " CC_D2 ", " #sr "\n\t" \ VOR_VV(REG_V11, REG_V11, REG_V22) \ "srliw " CC_D3 ", " CC_D3 ", " #sr "\n\t" \ "or " CC_D0 ", " CC_D0 ", " CC_T0 "\n\t" \ "or " CC_D1 ", " CC_D1 ", " CC_T1 "\n\t" \ "or " CC_D2 ", " CC_D2 ", " CC_T2 "\n\t" \ "or " CC_D3 ", " CC_D3 ", " CC_T3 "\n\t" #define PART_ROUND_ODD_CDB(s, sr) \ "add " CC_C0 ", " CC_C0 ", " CC_D0 "\n\t" \ VADD_VV(REG_V2, REG_V2, REG_V3) \ "add " CC_C1 ", " CC_C1 ", " CC_D1 "\n\t" \ VADD_VV(REG_V6, REG_V6, REG_V7) \ "add " CC_C2 ", " CC_C2 ", " CC_D2 "\n\t" \ VADD_VV(REG_V10, REG_V10, REG_V11) \ "add " CC_C3 ", " CC_C3 ", " CC_D3 "\n\t" \ VXOR_VV(REG_V1, REG_V1, REG_V2) \ "xor " CC_B0 ", " CC_B0 ", " CC_C0 "\n\t" \ VXOR_VV(REG_V5, REG_V5, REG_V6) \ "xor " CC_B1 ", " CC_B1 ", " CC_C1 "\n\t" \ VXOR_VV(REG_V9, REG_V9, REG_V10) \ "xor " CC_B2 ", " CC_B2 ", " CC_C2 "\n\t" \ VSLL_VI(REG_V20, REG_V1, s) \ "xor " CC_B3 ", " CC_B3 ", " CC_C3 "\n\t" \ VSLL_VI(REG_V21, REG_V5, s) \ "slli " CC_T0 ", " CC_B0 ", " #s "\n\t" \ VSLL_VI(REG_V22, REG_V9, s) \ "slli " CC_T1 ", " CC_B1 ", " #s "\n\t" \ VSRL_VI(REG_V1, REG_V1, sr) \ "slli " CC_T2 ", " CC_B2 ", " #s "\n\t" \ VSRL_VI(REG_V5, REG_V5, sr) \ "slli " CC_T3 ", " CC_B3 ", " #s "\n\t" \ VSRL_VI(REG_V9, REG_V9, sr) \ "srliw " CC_B0 ", " CC_B0 ", " #sr "\n\t" \ VOR_VV(REG_V1, REG_V1, REG_V20) \ "srliw " CC_B1 ", " CC_B1 ", " #sr "\n\t" \ VOR_VV(REG_V5, REG_V5, REG_V21) \ "srliw " CC_B2 ", " CC_B2 ", " #sr "\n\t" \ VOR_VV(REG_V9, REG_V9, REG_V22) \ "srliw " CC_B3 ", " CC_B3 ", " #sr "\n\t" \ "or " CC_B0 ", " CC_B0 ", " CC_T0 "\n\t" \ "or " CC_B1 ", " CC_B1 ", " CC_T1 "\n\t" \ "or " CC_B2 ", " CC_B2 ", " CC_T2 "\n\t" \ "or " CC_B3 ", " CC_B3 ", " CC_T3 "\n\t" #define PART_ROUND_EVEN_ABD(s, sr) \ "add " CC_A0 ", " CC_A0 ", " CC_B1 "\n\t" \ VADD_VV(REG_V0, REG_V0, REG_V1) \ "add " CC_A1 ", " CC_A1 ", " CC_B2 "\n\t" \ VADD_VV(REG_V4, REG_V4, REG_V5) \ "add " CC_A2 ", " CC_A2 ", " CC_B3 "\n\t" \ VADD_VV(REG_V8, REG_V8, REG_V9) \ "add " CC_A3 ", " CC_A3 ", " CC_B0 "\n\t" \ VXOR_VV(REG_V3, REG_V3, REG_V0) \ "xor " CC_D3 ", " CC_D3 ", " CC_A0 "\n\t" \ VXOR_VV(REG_V7, REG_V7, REG_V4) \ "xor " CC_D0 ", " CC_D0 ", " CC_A1 "\n\t" \ VXOR_VV(REG_V11, REG_V11, REG_V8) \ "xor " CC_D1 ", " CC_D1 ", " CC_A2 "\n\t" \ VSLL_VI(REG_V20, REG_V3, s) \ "xor " CC_D2 ", " CC_D2 ", " CC_A3 "\n\t" \ VSLL_VI(REG_V21, REG_V7, s) \ "slli " CC_T0 ", " CC_D3 ", " #s "\n\t" \ VSLL_VI(REG_V22, REG_V11, s) \ "slli " CC_T1 ", " CC_D0 ", " #s "\n\t" \ VSRL_VI(REG_V3, REG_V3, sr) \ "slli " CC_T2 ", " CC_D1 ", " #s "\n\t" \ VSRL_VI(REG_V7, REG_V7, sr) \ "slli " CC_T3 ", " CC_D2 ", " #s "\n\t" \ VSRL_VI(REG_V11, REG_V11, sr) \ "srliw " CC_D3 ", " CC_D3 ", " #sr "\n\t" \ VOR_VV(REG_V3, REG_V3, REG_V20) \ "srliw " CC_D0 ", " CC_D0 ", " #sr "\n\t" \ VOR_VV(REG_V7, REG_V7, REG_V21) \ "srliw " CC_D1 ", " CC_D1 ", " #sr "\n\t" \ VOR_VV(REG_V11, REG_V11, REG_V22) \ "srliw " CC_D2 ", " CC_D2 ", " #sr "\n\t" \ "or " CC_D3 ", " CC_D3 ", " CC_T0 "\n\t" \ "or " CC_D0 ", " CC_D0 ", " CC_T1 "\n\t" \ "or " CC_D1 ", " CC_D1 ", " CC_T2 "\n\t" \ "or " CC_D2 ", " CC_D2 ", " CC_T3 "\n\t" #define PART_ROUND_EVEN_CDB(s, sr) \ "add " CC_C2 ", " CC_C2 ", " CC_D3 "\n\t" \ VADD_VV(REG_V2, REG_V2, REG_V3) \ "add " CC_C3 ", " CC_C3 ", " CC_D0 "\n\t" \ VADD_VV(REG_V6, REG_V6, REG_V7) \ "add " CC_C0 ", " CC_C0 ", " CC_D1 "\n\t" \ VADD_VV(REG_V10, REG_V10, REG_V11) \ "add " CC_C1 ", " CC_C1 ", " CC_D2 "\n\t" \ VXOR_VV(REG_V1, REG_V1, REG_V2) \ "xor " CC_B1 ", " CC_B1 ", " CC_C2 "\n\t" \ VXOR_VV(REG_V5, REG_V5, REG_V6) \ "xor " CC_B2 ", " CC_B2 ", " CC_C3 "\n\t" \ VXOR_VV(REG_V9, REG_V9, REG_V10) \ "xor " CC_B3 ", " CC_B3 ", " CC_C0 "\n\t" \ VSLL_VI(REG_V20, REG_V1, s) \ "xor " CC_B0 ", " CC_B0 ", " CC_C1 "\n\t" \ VSLL_VI(REG_V21, REG_V5, s) \ "slli " CC_T0 ", " CC_B1 ", " #s "\n\t" \ VSLL_VI(REG_V22, REG_V9, s) \ "slli " CC_T1 ", " CC_B2 ", " #s "\n\t" \ VSRL_VI(REG_V1, REG_V1, sr) \ "slli " CC_T2 ", " CC_B3 ", " #s "\n\t" \ VSRL_VI(REG_V5, REG_V5, sr) \ "slli " CC_T3 ", " CC_B0 ", " #s "\n\t" \ VSRL_VI(REG_V9, REG_V9, sr) \ "srliw " CC_B1 ", " CC_B1 ", " #sr "\n\t" \ VOR_VV(REG_V1, REG_V1, REG_V20) \ "srliw " CC_B2 ", " CC_B2 ", " #sr "\n\t" \ VOR_VV(REG_V5, REG_V5, REG_V21) \ "srliw " CC_B3 ", " CC_B3 ", " #sr "\n\t" \ VOR_VV(REG_V9, REG_V9, REG_V22) \ "srliw " CC_B0 ", " CC_B0 ", " #sr "\n\t" \ "or " CC_B1 ", " CC_B1 ", " CC_T0 "\n\t" \ "or " CC_B2 ", " CC_B2 ", " CC_T1 "\n\t" \ "or " CC_B3 ", " CC_B3 ", " CC_T2 "\n\t" \ "or " CC_B0 ", " CC_B0 ", " CC_T3 "\n\t" #elif !defined(WOLFSSL_RISCV_BASE_BIT_MANIPULATION ) #define PART_ROUND_ODD_ABD(s, sr) \ "add " CC_A0 ", " CC_A0 ", " CC_B0 "\n\t" \ VADD_VV(REG_V0, REG_V0, REG_V1) \ "add " CC_A1 ", " CC_A1 ", " CC_B1 "\n\t" \ VADD_VV(REG_V4, REG_V4, REG_V5) \ "add " CC_A2 ", " CC_A2 ", " CC_B2 "\n\t" \ VADD_VV(REG_V8, REG_V8, REG_V9) \ "add " CC_A3 ", " CC_A3 ", " CC_B3 "\n\t" \ VXOR_VV(REG_V3, REG_V3, REG_V0) \ "xor " CC_D0 ", " CC_D0 ", " CC_A0 "\n\t" \ VXOR_VV(REG_V7, REG_V7, REG_V4) \ "xor " CC_D1 ", " CC_D1 ", " CC_A1 "\n\t" \ VXOR_VV(REG_V11, REG_V11, REG_V8) \ "xor " CC_D2 ", " CC_D2 ", " CC_A2 "\n\t" \ VROR_VI(REG_V3, sr, REG_V3) \ "xor " CC_D3 ", " CC_D3 ", " CC_A3 "\n\t" \ VROR_VI(REG_V7, sr, REG_V7) \ "slli " CC_T0 ", " CC_D0 ", " #s "\n\t" \ VROR_VI(REG_V11, sr, REG_V11) \ "slli " CC_T1 ", " CC_D1 ", " #s "\n\t" \ "slli " CC_T2 ", " CC_D2 ", " #s "\n\t" \ "slli " CC_T3 ", " CC_D3 ", " #s "\n\t" \ "srliw " CC_D0 ", " CC_D0 ", " #sr "\n\t" \ "srliw " CC_D1 ", " CC_D1 ", " #sr "\n\t" \ "srliw " CC_D2 ", " CC_D2 ", " #sr "\n\t" \ "srliw " CC_D3 ", " CC_D3 ", " #sr "\n\t" \ "or " CC_D0 ", " CC_D0 ", " CC_T0 "\n\t" \ "or " CC_D1 ", " CC_D1 ", " CC_T1 "\n\t" \ "or " CC_D2 ", " CC_D2 ", " CC_T2 "\n\t" \ "or " CC_D3 ", " CC_D3 ", " CC_T3 "\n\t" #define PART_ROUND_ODD_CDB(s, sr) \ "add " CC_C0 ", " CC_C0 ", " CC_D0 "\n\t" \ VADD_VV(REG_V2, REG_V2, REG_V3) \ "add " CC_C1 ", " CC_C1 ", " CC_D1 "\n\t" \ VADD_VV(REG_V6, REG_V6, REG_V7) \ "add " CC_C2 ", " CC_C2 ", " CC_D2 "\n\t" \ VADD_VV(REG_V10, REG_V10, REG_V11) \ "add " CC_C3 ", " CC_C3 ", " CC_D3 "\n\t" \ VXOR_VV(REG_V1, REG_V1, REG_V2) \ "xor " CC_B0 ", " CC_B0 ", " CC_C0 "\n\t" \ VXOR_VV(REG_V5, REG_V5, REG_V6) \ "xor " CC_B1 ", " CC_B1 ", " CC_C1 "\n\t" \ VXOR_VV(REG_V9, REG_V9, REG_V10) \ "xor " CC_B2 ", " CC_B2 ", " CC_C2 "\n\t" \ VROR_VI(REG_V1, sr, REG_V1) \ "xor " CC_B3 ", " CC_B3 ", " CC_C3 "\n\t" \ VROR_VI(REG_V5, sr, REG_V5) \ "slli " CC_T0 ", " CC_B0 ", " #s "\n\t" \ VROR_VI(REG_V9, sr, REG_V9) \ "slli " CC_T1 ", " CC_B1 ", " #s "\n\t" \ "slli " CC_T2 ", " CC_B2 ", " #s "\n\t" \ "slli " CC_T3 ", " CC_B3 ", " #s "\n\t" \ "srliw " CC_B0 ", " CC_B0 ", " #sr "\n\t" \ "srliw " CC_B1 ", " CC_B1 ", " #sr "\n\t" \ "srliw " CC_B2 ", " CC_B2 ", " #sr "\n\t" \ "srliw " CC_B3 ", " CC_B3 ", " #sr "\n\t" \ "or " CC_B0 ", " CC_B0 ", " CC_T0 "\n\t" \ "or " CC_B1 ", " CC_B1 ", " CC_T1 "\n\t" \ "or " CC_B2 ", " CC_B2 ", " CC_T2 "\n\t" \ "or " CC_B3 ", " CC_B3 ", " CC_T3 "\n\t" #define PART_ROUND_EVEN_ABD(s, sr) \ "add " CC_A0 ", " CC_A0 ", " CC_B1 "\n\t" \ VADD_VV(REG_V0, REG_V0, REG_V1) \ "add " CC_A1 ", " CC_A1 ", " CC_B2 "\n\t" \ VADD_VV(REG_V4, REG_V4, REG_V5) \ "add " CC_A2 ", " CC_A2 ", " CC_B3 "\n\t" \ VADD_VV(REG_V8, REG_V8, REG_V9) \ "add " CC_A3 ", " CC_A3 ", " CC_B0 "\n\t" \ VXOR_VV(REG_V3, REG_V3, REG_V0) \ "xor " CC_D3 ", " CC_D3 ", " CC_A0 "\n\t" \ VXOR_VV(REG_V7, REG_V7, REG_V4) \ "xor " CC_D0 ", " CC_D0 ", " CC_A1 "\n\t" \ VXOR_VV(REG_V11, REG_V11, REG_V8) \ "xor " CC_D1 ", " CC_D1 ", " CC_A2 "\n\t" \ VROR_VI(REG_V3, sr, REG_V3) \ "xor " CC_D2 ", " CC_D2 ", " CC_A3 "\n\t" \ VROR_VI(REG_V7, sr, REG_V7) \ "slli " CC_T0 ", " CC_D3 ", " #s "\n\t" \ VROR_VI(REG_V11, sr, REG_V11) \ "slli " CC_T1 ", " CC_D0 ", " #s "\n\t" \ "slli " CC_T2 ", " CC_D1 ", " #s "\n\t" \ "slli " CC_T3 ", " CC_D2 ", " #s "\n\t" \ "srliw " CC_D3 ", " CC_D3 ", " #sr "\n\t" \ "srliw " CC_D0 ", " CC_D0 ", " #sr "\n\t" \ "srliw " CC_D1 ", " CC_D1 ", " #sr "\n\t" \ "srliw " CC_D2 ", " CC_D2 ", " #sr "\n\t" \ "or " CC_D3 ", " CC_D3 ", " CC_T0 "\n\t" \ "or " CC_D0 ", " CC_D0 ", " CC_T1 "\n\t" \ "or " CC_D1 ", " CC_D1 ", " CC_T2 "\n\t" \ "or " CC_D2 ", " CC_D2 ", " CC_T3 "\n\t" #define PART_ROUND_EVEN_CDB(s, sr) \ "add " CC_C2 ", " CC_C2 ", " CC_D3 "\n\t" \ VADD_VV(REG_V2, REG_V2, REG_V3) \ "add " CC_C3 ", " CC_C3 ", " CC_D0 "\n\t" \ VADD_VV(REG_V6, REG_V6, REG_V7) \ "add " CC_C0 ", " CC_C0 ", " CC_D1 "\n\t" \ VADD_VV(REG_V10, REG_V10, REG_V11) \ "add " CC_C1 ", " CC_C1 ", " CC_D2 "\n\t" \ VXOR_VV(REG_V1, REG_V1, REG_V2) \ "xor " CC_B1 ", " CC_B1 ", " CC_C2 "\n\t" \ VXOR_VV(REG_V5, REG_V5, REG_V6) \ "xor " CC_B2 ", " CC_B2 ", " CC_C3 "\n\t" \ VXOR_VV(REG_V9, REG_V9, REG_V10) \ "xor " CC_B3 ", " CC_B3 ", " CC_C0 "\n\t" \ VROR_VI(REG_V1, sr, REG_V1) \ "xor " CC_B0 ", " CC_B0 ", " CC_C1 "\n\t" \ VROR_VI(REG_V5, sr, REG_V5) \ "slli " CC_T0 ", " CC_B1 ", " #s "\n\t" \ VROR_VI(REG_V9, sr, REG_V9) \ "slli " CC_T1 ", " CC_B2 ", " #s "\n\t" \ "slli " CC_T2 ", " CC_B3 ", " #s "\n\t" \ "slli " CC_T3 ", " CC_B0 ", " #s "\n\t" \ "srliw " CC_B1 ", " CC_B1 ", " #sr "\n\t" \ "srliw " CC_B2 ", " CC_B2 ", " #sr "\n\t" \ "srliw " CC_B3 ", " CC_B3 ", " #sr "\n\t" \ "srliw " CC_B0 ", " CC_B0 ", " #sr "\n\t" \ "or " CC_B1 ", " CC_B1 ", " CC_T0 "\n\t" \ "or " CC_B2 ", " CC_B2 ", " CC_T1 "\n\t" \ "or " CC_B3 ", " CC_B3 ", " CC_T2 "\n\t" \ "or " CC_B0 ", " CC_B0 ", " CC_T3 "\n\t" #else #define PART_ROUND_ODD_ABD(s, sr) \ "add " CC_A0 ", " CC_A0 ", " CC_B0 "\n\t" \ VADD_VV(REG_V0, REG_V0, REG_V1) \ "add " CC_A1 ", " CC_A1 ", " CC_B1 "\n\t" \ VADD_VV(REG_V4, REG_V4, REG_V5) \ "add " CC_A2 ", " CC_A2 ", " CC_B2 "\n\t" \ VADD_VV(REG_V8, REG_V8, REG_V9) \ "add " CC_A3 ", " CC_A3 ", " CC_B3 "\n\t" \ VXOR_VV(REG_V3, REG_V3, REG_V0) \ "xor " CC_D0 ", " CC_D0 ", " CC_A0 "\n\t" \ VXOR_VV(REG_V7, REG_V7, REG_V4) \ "xor " CC_D1 ", " CC_D1 ", " CC_A1 "\n\t" \ VXOR_VV(REG_V11, REG_V11, REG_V8) \ "xor " CC_D2 ", " CC_D2 ", " CC_A2 "\n\t" \ VROR_VI(REG_V3, sr, REG_V3) \ "xor " CC_D3 ", " CC_D3 ", " CC_A3 "\n\t" \ VROR_VI(REG_V7, sr, REG_V7) \ RORIW(REG_S6, REG_S6, sr) \ VROR_VI(REG_V11, sr, REG_V11) \ RORIW(REG_S7, REG_S7, sr) \ RORIW(REG_S8, REG_S8, sr) \ RORIW(REG_S9, REG_S9, sr) #define PART_ROUND_ODD_CDB(s, sr) \ "add " CC_C0 ", " CC_C0 ", " CC_D0 "\n\t" \ VADD_VV(REG_V2, REG_V2, REG_V3) \ "add " CC_C1 ", " CC_C1 ", " CC_D1 "\n\t" \ VADD_VV(REG_V6, REG_V6, REG_V7) \ "add " CC_C2 ", " CC_C2 ", " CC_D2 "\n\t" \ VADD_VV(REG_V10, REG_V10, REG_V11) \ "add " CC_C3 ", " CC_C3 ", " CC_D3 "\n\t" \ VXOR_VV(REG_V1, REG_V1, REG_V2) \ "xor " CC_B0 ", " CC_B0 ", " CC_C0 "\n\t" \ VXOR_VV(REG_V5, REG_V5, REG_V6) \ "xor " CC_B1 ", " CC_B1 ", " CC_C1 "\n\t" \ VXOR_VV(REG_V9, REG_V9, REG_V10) \ "xor " CC_B2 ", " CC_B2 ", " CC_C2 "\n\t" \ VROR_VI(REG_V1, sr, REG_V1) \ "xor " CC_B3 ", " CC_B3 ", " CC_C3 "\n\t" \ VROR_VI(REG_V5, sr, REG_V5) \ RORIW(REG_T3, REG_T3, sr) \ VROR_VI(REG_V9, sr, REG_V9) \ RORIW(REG_T4, REG_T4, sr) \ RORIW(REG_T5, REG_T5, sr) \ RORIW(REG_T6, REG_T6, sr) #define PART_ROUND_EVEN_ABD(s, sr) \ "add " CC_A0 ", " CC_A0 ", " CC_B1 "\n\t" \ VADD_VV(REG_V0, REG_V0, REG_V1) \ "add " CC_A1 ", " CC_A1 ", " CC_B2 "\n\t" \ VADD_VV(REG_V4, REG_V4, REG_V5) \ "add " CC_A2 ", " CC_A2 ", " CC_B3 "\n\t" \ VADD_VV(REG_V8, REG_V8, REG_V9) \ "add " CC_A3 ", " CC_A3 ", " CC_B0 "\n\t" \ VXOR_VV(REG_V3, REG_V3, REG_V0) \ "xor " CC_D3 ", " CC_D3 ", " CC_A0 "\n\t" \ VXOR_VV(REG_V7, REG_V7, REG_V4) \ "xor " CC_D0 ", " CC_D0 ", " CC_A1 "\n\t" \ VXOR_VV(REG_V11, REG_V11, REG_V8) \ "xor " CC_D1 ", " CC_D1 ", " CC_A2 "\n\t" \ VROR_VI(REG_V3, sr, REG_V3) \ "xor " CC_D2 ", " CC_D2 ", " CC_A3 "\n\t" \ VROR_VI(REG_V7, sr, REG_V7) \ RORIW(REG_S9, REG_S9, sr) \ VROR_VI(REG_V11, sr, REG_V11) \ RORIW(REG_S6, REG_S6, sr) \ RORIW(REG_S7, REG_S7, sr) \ RORIW(REG_S8, REG_S8, sr) #define PART_ROUND_EVEN_CDB(s, sr) \ "add " CC_C2 ", " CC_C2 ", " CC_D3 "\n\t" \ VADD_VV(REG_V2, REG_V2, REG_V3) \ "add " CC_C3 ", " CC_C3 ", " CC_D0 "\n\t" \ VADD_VV(REG_V6, REG_V6, REG_V7) \ "add " CC_C0 ", " CC_C0 ", " CC_D1 "\n\t" \ VADD_VV(REG_V10, REG_V10, REG_V11) \ "add " CC_C1 ", " CC_C1 ", " CC_D2 "\n\t" \ VXOR_VV(REG_V1, REG_V1, REG_V2) \ "xor " CC_B1 ", " CC_B1 ", " CC_C2 "\n\t" \ VXOR_VV(REG_V5, REG_V5, REG_V6) \ "xor " CC_B2 ", " CC_B2 ", " CC_C3 "\n\t" \ VXOR_VV(REG_V9, REG_V9, REG_V10) \ "xor " CC_B3 ", " CC_B3 ", " CC_C0 "\n\t" \ VROR_VI(REG_V1, sr, REG_V1) \ "xor " CC_B0 ", " CC_B0 ", " CC_C1 "\n\t" \ VROR_VI(REG_V5, sr, REG_V5) \ "slli " CC_T0 ", " CC_B1 ", " #s "\n\t" \ RORIW(REG_T4, REG_T4, sr) \ VROR_VI(REG_V9, sr, REG_V9) \ RORIW(REG_T5, REG_T5, sr) \ RORIW(REG_T6, REG_T6, sr) \ RORIW(REG_T3, REG_T3, sr) #endif #define QUARTER_ROUND_ODD_4() \ /* a += b; d ^= a; d <<<= 16; */ \ PART_ROUND_ODD_ABD(16, 16) \ /* c += d; b ^= c; b <<<= 12; */ \ PART_ROUND_ODD_CDB(12, 20) \ /* a += b; d ^= a; d <<<= 8; */ \ PART_ROUND_ODD_ABD( 8, 24) \ /* c += d; b ^= c; b <<<= 7; */ \ PART_ROUND_ODD_CDB( 7, 25) #define QUARTER_ROUND_EVEN_4() \ /* a += b; d ^= a; d <<<= 16; */ \ PART_ROUND_EVEN_ABD(16, 16) \ /* c += d; b ^= c; b <<<= 12; */ \ PART_ROUND_EVEN_CDB(12, 20) \ /* a += b; d ^= a; d <<<= 8; */ \ PART_ROUND_EVEN_ABD( 8, 24) \ /* c += d; b ^= c; b <<<= 7; */ \ PART_ROUND_EVEN_CDB( 7, 25) #define SHUFFLE_4(r, t, i) \ VRGATHER_VV(t + 0, i, r + 0) \ VRGATHER_VV(t + 1, i, r + 4) \ VRGATHER_VV(t + 2, i, r + 8) \ VMV_V_V(r + 0, t + 0) \ VMV_V_V(r + 4, t + 1) \ VMV_V_V(r + 8, t + 2) #define ODD_SHUFFLE_4() \ /* a=0,1,2,3; b=4,5,6,7; c=8,9,10,11; d=12,13,14,15 \ * => a=0,1,2,3; b=5,6,7,4; c=10,11,8,9; d=15,12,13,14 */ \ SHUFFLE_4(REG_V3, REG_V20, REG_V25) \ SHUFFLE_4(REG_V1, REG_V20, REG_V23) \ SHUFFLE_4(REG_V2, REG_V20, REG_V24) #define EVEN_SHUFFLE_4() \ /* a=0,1,2,3; b=5,6,7,4; c=10,11,8,9; d=15,12,13,14 \ * => a=0,1,2,3; b=4,5,6,7; c=8,9,10,11; d=12,13,14,15 */ \ SHUFFLE_4(REG_V3, REG_V20, REG_V23) \ SHUFFLE_4(REG_V1, REG_V20, REG_V25) \ SHUFFLE_4(REG_V2, REG_V20, REG_V24) /** * Converts word into bytes with rotations having been done. */ static WC_INLINE int wc_chacha_encrypt_256(const word32* input, const byte* m, byte* c) { __asm__ __volatile__ ( VSETIVLI(REG_X0, 4, 1, 1, 0b010, 0b000) /* The layout of used vector registers is: * v0-v3 - first block * v4-v7 - second block * v8-v11 - third block * v12-v15 - message * v16-v19 - input * v20-v22 - temp * v23-v25 - indices for rotating words in vector * * v0 0 1 2 3 * v1 4 5 6 7 * v2 8 9 10 11 * v3 12 13 14 15 * load CHACHA state with indices placed as shown above */ /* Load state to encrypt */ "mv t2, %[input]\n\t" VL4RE32_V(REG_V16, REG_T2) VID_V(REG_V20) VSLIDEDOWN_VI(REG_V23, REG_V20, 1) VSLIDEUP_VI(REG_V23, REG_V20, 3) VSLIDEDOWN_VI(REG_V24, REG_V20, 2) VSLIDEUP_VI(REG_V24, REG_V20, 2) VSLIDEDOWN_VI(REG_V25, REG_V20, 3) VSLIDEUP_VI(REG_V25, REG_V20, 1) /* Move state into regular registers */ "ld a4, 0(%[input])\n\t" "ld a6, 8(%[input])\n\t" "ld t3, 16(%[input])\n\t" "ld t5, 24(%[input])\n\t" "ld s2, 32(%[input])\n\t" "ld s4, 40(%[input])\n\t" "ld s6, 48(%[input])\n\t" "ld s8, 56(%[input])\n\t" "srli a5, a4, 32\n\t" "srli a7, a6, 32\n\t" "srli t4, t3, 32\n\t" "srli t6, t5, 32\n\t" "srli s3, s2, 32\n\t" "srli s5, s4, 32\n\t" "srli s7, s6, 32\n\t" "srli s9, s8, 32\n\t" /* Move state into vector registers */ VMVR_V(REG_V0, REG_V16, 4) "addi t0, s6, 1\n\t" VMVR_V(REG_V4, REG_V16, 4) "addi t1, s6, 2\n\t" VMVR_V(REG_V8, REG_V16, 4) "addi s6, s6, 3\n\t" /* Set counter word */ VMV_S_X(REG_V7, REG_T0) VMV_S_X(REG_V11, REG_T1) /* Set number of odd+even rounds to perform */ "li a3, 10\n\t" "\n" "L_chacha20_riscv_256_loop:\n\t" /* Odd Round */ QUARTER_ROUND_ODD_4() ODD_SHUFFLE_4() "addi a3, a3, -1\n\t" /* Even Round */ QUARTER_ROUND_EVEN_4() EVEN_SHUFFLE_4() "bnez a3, L_chacha20_riscv_256_loop\n\t" /* Load message */ "mv t2, %[m]\n\t" VL4RE32_V(REG_V12, REG_T2) "addi %[m], %[m], 64\n\t" /* Add back state, XOR in message and store (load next block) */ /* BLOCK 1 */ VADD_VV(REG_V0, REG_V0, REG_V16) VADD_VV(REG_V1, REG_V1, REG_V17) VADD_VV(REG_V2, REG_V2, REG_V18) VADD_VV(REG_V3, REG_V3, REG_V19) VXOR_VV(REG_V0, REG_V0, REG_V12) VXOR_VV(REG_V1, REG_V1, REG_V13) VXOR_VV(REG_V2, REG_V2, REG_V14) VXOR_VV(REG_V3, REG_V3, REG_V15) "mv t2, %[m]\n\t" VL4RE32_V(REG_V12, REG_T2) "addi %[m], %[m], 64\n\t" VMV_X_S(REG_T0, REG_V19) "mv t2, %[c]\n\t" VS4R_V(REG_V0, REG_T2) "addi %[c], %[c], 64\n\t" /* BLOCK 2 */ "addi t0, t0, 1\n\t" VMV_S_X(REG_V19, REG_T0) VADD_VV(REG_V4, REG_V4, REG_V16) VADD_VV(REG_V5, REG_V5, REG_V17) VADD_VV(REG_V6, REG_V6, REG_V18) VADD_VV(REG_V7, REG_V7, REG_V19) VXOR_VV(REG_V4, REG_V4, REG_V12) VXOR_VV(REG_V5, REG_V5, REG_V13) VXOR_VV(REG_V6, REG_V6, REG_V14) VXOR_VV(REG_V7, REG_V7, REG_V15) "mv t2, %[m]\n\t" VL4RE32_V(REG_V12, REG_T2) "addi %[m], %[m], 64\n\t" "mv t2, %[c]\n\t" VS4R_V(REG_V4, REG_T2) "addi %[c], %[c], 64\n\t" /* BLOCK 3 */ "addi t0, t0, 1\n\t" VMV_S_X(REG_V19, REG_T0) VADD_VV(REG_V8, REG_V8, REG_V16) VADD_VV(REG_V9, REG_V9, REG_V17) VADD_VV(REG_V10, REG_V10, REG_V18) VADD_VV(REG_V11, REG_V11, REG_V19) VXOR_VV(REG_V8, REG_V8, REG_V12) VXOR_VV(REG_V9, REG_V9, REG_V13) VXOR_VV(REG_V10, REG_V10, REG_V14) VXOR_VV(REG_V11, REG_V11, REG_V15) "mv t2, %[m]\n\t" VL4RE32_V(REG_V12, REG_T2) "mv t2, %[c]\n\t" VS4R_V(REG_V8, REG_T2) "addi %[c], %[c], 64\n\t" /* BLOCK 4 */ /* Move regular registers into vector registers for adding and xor */ "addi t0, t0, 1\n\t" VMV_S_X(REG_V0, REG_A4) VMV_S_X(REG_V1, REG_T3) VMV_S_X(REG_V2, REG_S2) VMV_S_X(REG_V3, REG_S6) VMV_S_X(REG_V4, REG_A5) VMV_S_X(REG_V5, REG_T4) VMV_S_X(REG_V6, REG_S3) VMV_S_X(REG_V7, REG_S7) VSLIDEUP_VI(REG_V0, REG_V4, 1) VSLIDEUP_VI(REG_V1, REG_V5, 1) VSLIDEUP_VI(REG_V2, REG_V6, 1) VSLIDEUP_VI(REG_V3, REG_V7, 1) VMV_S_X(REG_V4, REG_A6) VMV_S_X(REG_V5, REG_T5) VMV_S_X(REG_V6, REG_S4) VMV_S_X(REG_V7, REG_S8) VSLIDEUP_VI(REG_V0, REG_V4, 2) VSLIDEUP_VI(REG_V1, REG_V5, 2) VSLIDEUP_VI(REG_V2, REG_V6, 2) VSLIDEUP_VI(REG_V3, REG_V7, 2) VMV_S_X(REG_V4, REG_A7) VMV_S_X(REG_V5, REG_T6) VMV_S_X(REG_V6, REG_S5) VMV_S_X(REG_V7, REG_S9) VSLIDEUP_VI(REG_V0, REG_V4, 3) VSLIDEUP_VI(REG_V1, REG_V5, 3) VSLIDEUP_VI(REG_V2, REG_V6, 3) VSLIDEUP_VI(REG_V3, REG_V7, 3) VMV_S_X(REG_V19, REG_T0) /* Add back state, XOR in message and store */ VADD_VV(REG_V0, REG_V0, REG_V16) VADD_VV(REG_V1, REG_V1, REG_V17) VADD_VV(REG_V2, REG_V2, REG_V18) VADD_VV(REG_V3, REG_V3, REG_V19) VXOR_VV(REG_V0, REG_V0, REG_V12) VXOR_VV(REG_V1, REG_V1, REG_V13) VXOR_VV(REG_V2, REG_V2, REG_V14) VXOR_VV(REG_V3, REG_V3, REG_V15) "mv t2, %[c]\n\t" VS4R_V(REG_V0, REG_T2) : [m] "+r" (m), [c] "+r" (c) : [input] "r" (input) : "memory", "t0", "t1", "t2", "s1", "a3", "t3", "t4", "t5", "t6", "a4", "a5", "a6", "a7", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9" ); return CHACHA_CHUNK_BYTES * 4; } #ifndef WOLFSSL_RISCV_VECTOR_BASE_BIT_MANIPULATION #define PART_ROUND_2(a, b, d, t, a2, b2, d2, t2, sl, sr) \ VADD_VV(a, a, b) \ VADD_VV(a2, a2, b2) \ VXOR_VV(d, d, a) \ VXOR_VV(d2, d2, a2) \ VSLL_VI(t, d, sl) \ VSLL_VI(t2, d2, sl) \ VSRL_VI(d, d, sr) \ VSRL_VI(d2, d2, sr) \ VOR_VV(d, d, t) \ VOR_VV(d2, d2, t2) #else #define PART_ROUND_2(a, b, d, t, a2, b2, d2, t2, sl, sr) \ VADD_VV(a, a, b) \ VADD_VV(a2, a2, b2) \ VXOR_VV(d, d, a) \ VXOR_VV(d2, d2, a2) \ VROR_VI(d, sr, d) \ VROR_VI(d2, sr, d2) #endif #define QUARTER_ROUND_2(a, b, c, d, t, a2, b2, c2, d2, t2) \ /* a += b; d ^= a; d <<<= 16; */ \ PART_ROUND_2(a, b, d, t, a2, b2, d2, t2, 16, 16) \ /* c += d; b ^= c; b <<<= 12; */ \ PART_ROUND_2(c, d, b, t, c2, d2, b2, t2, 12, 20) \ /* a += b; d ^= a; d <<<= 8; */ \ PART_ROUND_2(a, b, d, t, a2, b2, d2, t2, 8, 24) \ /* c += d; b ^= c; b <<<= 7; */ \ PART_ROUND_2(c, d, b, t, c2, d2, b2, t2, 7, 25) #define ODD_SHUFFLE_2(b, c, d, t, b2, c2, d2, t2) \ /* a=0,1,2,3; b=4,5,6,7; c=8,9,10,11; d=12,13,14,15 \ * => a=0,1,2,3; b=5,6,7,4; c=10,11,8,9; d=15,12,13,14 */ \ VRGATHER_VV(t, REG_V25, d) \ VRGATHER_VV(t2, REG_V25, d2) \ VMV_V_V(d, t) \ VMV_V_V(d2, t2) \ VRGATHER_VV(t, REG_V23, b) \ VRGATHER_VV(t2, REG_V23, b2) \ VMV_V_V(b, t) \ VMV_V_V(b2, t2) \ VRGATHER_VV(t, REG_V24, c) \ VRGATHER_VV(t2, REG_V24, c2) \ VMV_V_V(c, t) \ VMV_V_V(c2, t2) #define EVEN_SHUFFLE_2(b, c, d, t, b2, c2, d2, t2) \ /* a=0,1,2,3; b=5,6,7,4; c=10,11,8,9; d=15,12,13,14 \ * => a=0,1,2,3; b=4,5,6,7; c=8,9,10,11; d=12,13,14,15 */ \ VRGATHER_VV(t, REG_V23, d) \ VRGATHER_VV(t2, REG_V23, d2) \ VMV_V_V(d, t) \ VMV_V_V(d2, t2) \ VRGATHER_VV(t, REG_V25, b) \ VRGATHER_VV(t2, REG_V25, b2) \ VMV_V_V(b, t) \ VMV_V_V(b2, t2) \ VRGATHER_VV(t, REG_V24, c) \ VRGATHER_VV(t2, REG_V24, c2) \ VMV_V_V(c, t) \ VMV_V_V(c2, t2) static WC_INLINE int wc_chacha_encrypt_128(const word32* input, const byte* m, byte* c) { __asm__ __volatile__ ( VSETIVLI(REG_X0, 4, 1, 1, 0b010, 0b000) /* The layout of used vector registers is: * v0-v3 - first block * v4-v7 - second block * v12-v15 - message * v16-v19 - input * v20-v22 - temp * v23-v25 - indices for rotating words in vector * * v0 0 1 2 3 * v1 4 5 6 7 * v2 8 9 10 11 * v3 12 13 14 15 * load CHACHA state with indices placed as shown above */ /* Load incrementer register to modify counter */ "mv t2, %[L_chacha20_vec_inc_first_word]\n\t" VL1RE32_V(REG_V22, REG_T2) VID_V(REG_V20) VSLIDEDOWN_VI(REG_V23, REG_V20, 1) VSLIDEUP_VI(REG_V23, REG_V20, 3) VSLIDEDOWN_VI(REG_V24, REG_V20, 2) VSLIDEUP_VI(REG_V24, REG_V20, 2) VSLIDEDOWN_VI(REG_V25, REG_V20, 3) VSLIDEUP_VI(REG_V25, REG_V20, 1) /* Load state to encrypt */ "mv t2, %[input]\n\t" VL4RE32_V(REG_V16, REG_T2) /* Load message */ "mv t2, %[m]\n\t" VL4RE32_V(REG_V12, REG_T2) "addi %[m], %[m], 64\n\t" /* Move state into vector registers */ VMVR_V(REG_V0, REG_V16, 4) VMVR_V(REG_V4, REG_V16, 4) /* Add counter word */ VADD_VV(REG_V7, REG_V7, REG_V22) /* Set number of odd+even rounds to perform */ "li t0, 10\n\t" "\n" "L_chacha20_riscv_128_loop:\n\t" QUARTER_ROUND_2(REG_V0, REG_V1, REG_V2, REG_V3, REG_V20, REG_V4, REG_V5, REG_V6, REG_V7, REG_V21) ODD_SHUFFLE_2(REG_V1, REG_V2, REG_V3, REG_V20, REG_V5, REG_V6, REG_V7, REG_V21) QUARTER_ROUND_2(REG_V0, REG_V1, REG_V2, REG_V3, REG_V20, REG_V4, REG_V5, REG_V6, REG_V7, REG_V21) EVEN_SHUFFLE_2(REG_V1, REG_V2, REG_V3, REG_V20, REG_V5, REG_V6, REG_V7, REG_V21) "addi t0, t0, -1\n\t" "bnez t0, L_chacha20_riscv_128_loop\n\t" /* Add back state, XOR in message and store (load next block) */ VADD_VV(REG_V0, REG_V0, REG_V16) VADD_VV(REG_V1, REG_V1, REG_V17) VADD_VV(REG_V2, REG_V2, REG_V18) VADD_VV(REG_V3, REG_V3, REG_V19) VXOR_VV(REG_V0, REG_V0, REG_V12) VXOR_VV(REG_V1, REG_V1, REG_V13) VXOR_VV(REG_V2, REG_V2, REG_V14) VXOR_VV(REG_V3, REG_V3, REG_V15) "mv t2, %[m]\n\t" VL4RE32_V(REG_V12, REG_T2) "mv t2, %[c]\n\t" VS4R_V(REG_V0, REG_T2) "addi %[c], %[c], 64\n\t" VADD_VV(REG_V19, REG_V19, REG_V22) VADD_VV(REG_V4, REG_V4, REG_V16) VADD_VV(REG_V5, REG_V5, REG_V17) VADD_VV(REG_V6, REG_V6, REG_V18) VADD_VV(REG_V7, REG_V7, REG_V19) VXOR_VV(REG_V4, REG_V4, REG_V12) VXOR_VV(REG_V5, REG_V5, REG_V13) VXOR_VV(REG_V6, REG_V6, REG_V14) VXOR_VV(REG_V7, REG_V7, REG_V15) "mv t2, %[c]\n\t" VS4R_V(REG_V4, REG_T2) : [m] "+r" (m), [c] "+r" (c) : [input] "r" (input), [L_chacha20_vec_inc_first_word] "r" (L_chacha20_vec_inc_first_word) : "memory", "t0", "t1", "t2" ); return CHACHA_CHUNK_BYTES * 2; } #ifndef WOLFSSL_RISCV_VECTOR_BASE_BIT_MANIPULATION #define PART_ROUND(a, b, d, t, sl, sr) \ VADD_VV(a, a, b) \ VXOR_VV(d, d, a) \ VSLL_VI(t, d, sl) \ VSRL_VI(d, d, sr) \ VOR_VV(d, d, t) #else #define PART_ROUND(a, b, d, t, sl, sr) \ VADD_VV(a, a, b) \ VXOR_VV(d, d, a) \ VROR_VI(d, sr, d) #endif #define QUARTER_ROUND(a, b, c, d, t) \ /* a += b; d ^= a; d <<<= 16; */ \ PART_ROUND(a, b, d, t, 16, 16) \ /* c += d; b ^= c; b <<<= 12; */ \ PART_ROUND(c, d, b, t, 12, 20) \ /* a += b; d ^= a; d <<<= 8; */ \ PART_ROUND(a, b, d, t, 8, 24) \ /* c += d; b ^= c; b <<<= 7; */ \ PART_ROUND(c, d, b, t, 7, 25) #define ODD_SHUFFLE(b, c, d, t) \ /* a=0,1,2,3; b=4,5,6,7; c=8,9,10,11; d=12,13,14,15 \ * => a=0,1,2,3; b=5,6,7,4; c=10,11,8,9; d=15,12,13,14 */ \ VSLIDEDOWN_VI(t, d, 3) \ VSLIDEUP_VI(t, d, 1) \ VMV_V_V(d, t) \ VSLIDEDOWN_VI(t, b, 1) \ VSLIDEUP_VI(t, b, 3) \ VMV_V_V(b, t) \ VSLIDEDOWN_VI(t, c, 2) \ VSLIDEUP_VI(t, c, 2) \ VMV_V_V(c, t) #define EVEN_SHUFFLE(b, c, d, t) \ /* a=0,1,2,3; b=5,6,7,4; c=10,11,8,9; d=15,12,13,14 \ * => a=0,1,2,3; b=4,5,6,7; c=8,9,10,11; d=12,13,14,15 */ \ VSLIDEDOWN_VI(t, d, 1) \ VSLIDEUP_VI(t, d, 3) \ VMV_V_V(d, t) \ VSLIDEDOWN_VI(t, b, 3) \ VSLIDEUP_VI(t, b, 1) \ VMV_V_V(b, t) \ VSLIDEDOWN_VI(t, c, 2) \ VSLIDEUP_VI(t, c, 2) \ VMV_V_V(c, t) #define EIGHT_QUARTER_ROUNDS(a, b, c, d, t) \ /* Odd Round */ \ QUARTER_ROUND(a, b, c, d, t) \ ODD_SHUFFLE(b, c, d, t) \ /* Even Round */ \ QUARTER_ROUND(a, b, c, d, t) \ EVEN_SHUFFLE(b, c, d, t) static WC_INLINE void wc_chacha_encrypt_64(const word32* input, const byte* m, byte* c, word32 bytes, byte* over) { word64 bytes64 = (word64)bytes; __asm__ __volatile__ ( VSETIVLI(REG_X0, 4, 1, 1, 0b010, 0b000) /* The layout of used vector registers is: * v0-v3 - block * v4-v7 - message * v8-v11 - input * v12 - temp * * v0 0 1 2 3 * v1 4 5 6 7 * v2 8 9 10 11 * v3 12 13 14 15 * load CHACHA state with indices placed as shown above */ /* Load incrementer register to modify counter */ "mv t2, %[L_chacha20_vec_inc_first_word]\n\t" VL1RE32_V(REG_V13, REG_T2) /* Load state to encrypt */ "mv t2, %[input]\n\t" VL4RE32_V(REG_V8, REG_T2) "\n" "L_chacha20_riscv_64_loop:\n\t" /* Move state into vector registers */ VMVR_V(REG_V0, REG_V8, 4) /* Add counter word */ /* Odd Round */ EIGHT_QUARTER_ROUNDS(REG_V0, REG_V1, REG_V2, REG_V3, REG_V12) EIGHT_QUARTER_ROUNDS(REG_V0, REG_V1, REG_V2, REG_V3, REG_V12) EIGHT_QUARTER_ROUNDS(REG_V0, REG_V1, REG_V2, REG_V3, REG_V12) EIGHT_QUARTER_ROUNDS(REG_V0, REG_V1, REG_V2, REG_V3, REG_V12) EIGHT_QUARTER_ROUNDS(REG_V0, REG_V1, REG_V2, REG_V3, REG_V12) EIGHT_QUARTER_ROUNDS(REG_V0, REG_V1, REG_V2, REG_V3, REG_V12) EIGHT_QUARTER_ROUNDS(REG_V0, REG_V1, REG_V2, REG_V3, REG_V12) EIGHT_QUARTER_ROUNDS(REG_V0, REG_V1, REG_V2, REG_V3, REG_V12) EIGHT_QUARTER_ROUNDS(REG_V0, REG_V1, REG_V2, REG_V3, REG_V12) EIGHT_QUARTER_ROUNDS(REG_V0, REG_V1, REG_V2, REG_V3, REG_V12) "addi t1, %[bytes], -64\n\t" /* Add back state */ VADD_VV(REG_V0, REG_V0, REG_V8) VADD_VV(REG_V1, REG_V1, REG_V9) VADD_VV(REG_V2, REG_V2, REG_V10) VADD_VV(REG_V3, REG_V3, REG_V11) "bltz t1, L_chacha20_riscv_64_lt_64\n\t" "mv t2, %[m]\n\t" VL4RE32_V(REG_V4, REG_T2) VXOR_VV(REG_V4, REG_V4, REG_V0) VXOR_VV(REG_V5, REG_V5, REG_V1) VXOR_VV(REG_V6, REG_V6, REG_V2) VXOR_VV(REG_V7, REG_V7, REG_V3) "mv t2, %[c]\n\t" VS4R_V(REG_V4, REG_T2) "addi %[bytes], %[bytes], -64\n\t" "addi %[c], %[c], 64\n\t" "addi %[m], %[m], 64\n\t" VADD_VV(REG_V11, REG_V11, REG_V13) "bnez %[bytes], L_chacha20_riscv_64_loop\n\t" "beqz %[bytes], L_chacha20_riscv_64_done\n\t" "\n" "L_chacha20_riscv_64_lt_64:\n\t" "mv t2, %[over]\n\t" "addi t1, %[bytes], -32\n\t" VS4R_V(REG_V0, REG_T2) "bltz t1, L_chacha20_riscv_64_lt_32\n\t" "mv t2, %[m]\n\t" VL2RE32_V(REG_V4, REG_T2) VXOR_VV(REG_V4, REG_V4, REG_V0) VXOR_VV(REG_V5, REG_V5, REG_V1) "mv t2, %[c]\n\t" VS2R_V(REG_V4, REG_T2) "addi %[bytes], %[bytes], -32\n\t" "addi %[c], %[c], 32\n\t" "addi %[m], %[m], 32\n\t" "beqz %[bytes], L_chacha20_riscv_64_done\n\t" VMVR_V(REG_V0, REG_V2, 2) "\n" "L_chacha20_riscv_64_lt_32:\n\t" "addi t1, %[bytes], -16\n\t" "bltz t1, L_chacha20_riscv_64_lt_16\n\t" "mv t2, %[m]\n\t" VL1RE32_V(REG_V4, REG_T2) VXOR_VV(REG_V4, REG_V4, REG_V0) "mv t2, %[c]\n\t" VS1R_V(REG_V4, REG_T2) "addi %[bytes], %[bytes], -16\n\t" "addi %[c], %[c], 16\n\t" "addi %[m], %[m], 16\n\t" "beqz %[bytes], L_chacha20_riscv_64_done\n\t" VMV_V_V(REG_V0, REG_V1) "\n" "L_chacha20_riscv_64_lt_16:\n\t" "addi t1, %[bytes], -8\n\t" "bltz t1, L_chacha20_riscv_64_lt_8\n\t" VSETIVLI(REG_X0, 2, 1, 1, 0b011, 0b000) VMV_X_S(REG_T0, REG_V0) VSETIVLI(REG_X0, 4, 1, 1, 0b010, 0b000) "ld t1, (%[m])\n\t" "xor t1, t1, t0\n\t" "sd t1, (%[c])\n\t" "addi %[bytes], %[bytes], -8\n\t" "addi %[c], %[c], 8\n\t" "addi %[m], %[m], 8\n\t" "beqz %[bytes], L_chacha20_riscv_64_done\n\t" VSLIDEDOWN_VI(REG_V0, REG_V0, 2) "\n" "L_chacha20_riscv_64_lt_8:\n\t" "addi %[bytes], %[bytes], -1\n\t" VSETIVLI(REG_X0, 2, 1, 1, 0b011, 0b000) VMV_X_S(REG_T0, REG_V0) VSETIVLI(REG_X0, 4, 1, 1, 0b010, 0b000) "\n" "L_chacha20_riscv_64_loop_lt_8:\n\t" "addi %[bytes], %[bytes], -1\n\t" "lb t1, (%[m])\n\t" "addi %[m], %[m], 1\n\t" "xor t1, t1, t0\n\t" "sb t1, (%[c])\n\t" "addi %[c], %[c], 1\n\t" "srli t0, t0, 8\n\t" "bgez %[bytes], L_chacha20_riscv_64_loop_lt_8\n\t" "\n" "L_chacha20_riscv_64_done:\n\t" : [m] "+r" (m), [c] "+r" (c), [bytes] "+r" (bytes64) : [input] "r" (input), [over] "r" (over), [L_chacha20_vec_inc_first_word] "r" (L_chacha20_vec_inc_first_word) : "memory", "t0", "t1", "t2" ); } /** * Encrypt a stream of bytes */ static void wc_chacha_encrypt_bytes(ChaCha* ctx, const byte* m, byte* c, word32 bytes) { int processed; if (bytes >= CHACHA_CHUNK_BYTES * 6) { processed = (bytes / (CHACHA_CHUNK_BYTES * 6)) * CHACHA_CHUNK_BYTES * 6; wc_chacha_encrypt_384(ctx->X, m, c, processed); bytes -= processed; c += processed; m += processed; ctx->X[CHACHA_IV_BYTES] = PLUS(ctx->X[CHACHA_IV_BYTES], processed / CHACHA_CHUNK_BYTES); } if (bytes >= CHACHA_CHUNK_BYTES * 4) { processed = wc_chacha_encrypt_256(ctx->X, m, c); bytes -= processed; c += processed; m += processed; ctx->X[CHACHA_IV_BYTES] = PLUS(ctx->X[CHACHA_IV_BYTES], processed / CHACHA_CHUNK_BYTES); } if (bytes >= CHACHA_CHUNK_BYTES * 2) { processed = wc_chacha_encrypt_128(ctx->X, m, c); bytes -= processed; c += processed; m += processed; ctx->X[CHACHA_IV_BYTES] = PLUS(ctx->X[CHACHA_IV_BYTES], processed / CHACHA_CHUNK_BYTES); } if (bytes > 0) { wc_chacha_encrypt_64(ctx->X, m, c, bytes, (byte*)ctx->over); if (bytes > CHACHA_CHUNK_BYTES) ctx->X[CHACHA_IV_BYTES] = PLUSONE(ctx->X[CHACHA_IV_BYTES]); ctx->left = CHACHA_CHUNK_BYTES - (bytes & (CHACHA_CHUNK_BYTES - 1)); ctx->left &= CHACHA_CHUNK_BYTES - 1; ctx->X[CHACHA_IV_BYTES] = PLUSONE(ctx->X[CHACHA_IV_BYTES]); } } #else #if !defined(WOLFSSL_RISCV_BIT_MANIPULATION) #define PART_ROUND_ODD_ABD(sl, sr) \ "add " CC_A0 ", " CC_A0 ", " CC_B0 "\n\t" \ "add " CC_A1 ", " CC_A1 ", " CC_B1 "\n\t" \ "add " CC_A2 ", " CC_A2 ", " CC_B2 "\n\t" \ "add " CC_A3 ", " CC_A3 ", " CC_B3 "\n\t" \ "xor " CC_D0 ", " CC_D0 ", " CC_A0 "\n\t" \ "xor " CC_D1 ", " CC_D1 ", " CC_A1 "\n\t" \ "xor " CC_D2 ", " CC_D2 ", " CC_A2 "\n\t" \ "xor " CC_D3 ", " CC_D3 ", " CC_A3 "\n\t" \ "slli " CC_T0 ", " CC_D0 ", " #sl "\n\t" \ "slli " CC_T1 ", " CC_D1 ", " #sl "\n\t" \ "slli " CC_T2 ", " CC_D2 ", " #sl "\n\t" \ "slli " CC_T3 ", " CC_D3 ", " #sl "\n\t" \ "srliw " CC_D0 ", " CC_D0 ", " #sr "\n\t" \ "srliw " CC_D1 ", " CC_D1 ", " #sr "\n\t" \ "srliw " CC_D2 ", " CC_D2 ", " #sr "\n\t" \ "srliw " CC_D3 ", " CC_D3 ", " #sr "\n\t" \ "or " CC_D0 ", " CC_D0 ", " CC_T0 "\n\t" \ "or " CC_D1 ", " CC_D1 ", " CC_T1 "\n\t" \ "or " CC_D2 ", " CC_D2 ", " CC_T2 "\n\t" \ "or " CC_D3 ", " CC_D3 ", " CC_T3 "\n\t" #define PART_ROUND_ODD_CDB(sl, sr) \ "add " CC_C0 ", " CC_C0 ", " CC_D0 "\n\t" \ "add " CC_C1 ", " CC_C1 ", " CC_D1 "\n\t" \ "add " CC_C2 ", " CC_C2 ", " CC_D2 "\n\t" \ "add " CC_C3 ", " CC_C3 ", " CC_D3 "\n\t" \ "xor " CC_B0 ", " CC_B0 ", " CC_C0 "\n\t" \ "xor " CC_B1 ", " CC_B1 ", " CC_C1 "\n\t" \ "xor " CC_B2 ", " CC_B2 ", " CC_C2 "\n\t" \ "xor " CC_B3 ", " CC_B3 ", " CC_C3 "\n\t" \ "slli " CC_T0 ", " CC_B0 ", " #sl "\n\t" \ "slli " CC_T1 ", " CC_B1 ", " #sl "\n\t" \ "slli " CC_T2 ", " CC_B2 ", " #sl "\n\t" \ "slli " CC_T3 ", " CC_B3 ", " #sl "\n\t" \ "srliw " CC_B0 ", " CC_B0 ", " #sr "\n\t" \ "srliw " CC_B1 ", " CC_B1 ", " #sr "\n\t" \ "srliw " CC_B2 ", " CC_B2 ", " #sr "\n\t" \ "srliw " CC_B3 ", " CC_B3 ", " #sr "\n\t" \ "or " CC_B0 ", " CC_B0 ", " CC_T0 "\n\t" \ "or " CC_B1 ", " CC_B1 ", " CC_T1 "\n\t" \ "or " CC_B2 ", " CC_B2 ", " CC_T2 "\n\t" \ "or " CC_B3 ", " CC_B3 ", " CC_T3 "\n\t" #define PART_ROUND_EVEN_ABD(sl, sr) \ "add " CC_A0 ", " CC_A0 ", " CC_B1 "\n\t" \ "add " CC_A1 ", " CC_A1 ", " CC_B2 "\n\t" \ "add " CC_A2 ", " CC_A2 ", " CC_B3 "\n\t" \ "add " CC_A3 ", " CC_A3 ", " CC_B0 "\n\t" \ "xor " CC_D3 ", " CC_D3 ", " CC_A0 "\n\t" \ "xor " CC_D0 ", " CC_D0 ", " CC_A1 "\n\t" \ "xor " CC_D1 ", " CC_D1 ", " CC_A2 "\n\t" \ "xor " CC_D2 ", " CC_D2 ", " CC_A3 "\n\t" \ "slli " CC_T0 ", " CC_D3 ", " #sl "\n\t" \ "slli " CC_T1 ", " CC_D0 ", " #sl "\n\t" \ "slli " CC_T2 ", " CC_D1 ", " #sl "\n\t" \ "slli " CC_T3 ", " CC_D2 ", " #sl "\n\t" \ "srliw " CC_D3 ", " CC_D3 ", " #sr "\n\t" \ "srliw " CC_D0 ", " CC_D0 ", " #sr "\n\t" \ "srliw " CC_D1 ", " CC_D1 ", " #sr "\n\t" \ "srliw " CC_D2 ", " CC_D2 ", " #sr "\n\t" \ "or " CC_D3 ", " CC_D3 ", " CC_T0 "\n\t" \ "or " CC_D0 ", " CC_D0 ", " CC_T1 "\n\t" \ "or " CC_D1 ", " CC_D1 ", " CC_T2 "\n\t" \ "or " CC_D2 ", " CC_D2 ", " CC_T3 "\n\t" #define PART_ROUND_EVEN_CDB(sl, sr) \ "add " CC_C2 ", " CC_C2 ", " CC_D3 "\n\t" \ "add " CC_C3 ", " CC_C3 ", " CC_D0 "\n\t" \ "add " CC_C0 ", " CC_C0 ", " CC_D1 "\n\t" \ "add " CC_C1 ", " CC_C1 ", " CC_D2 "\n\t" \ "xor " CC_B1 ", " CC_B1 ", " CC_C2 "\n\t" \ "xor " CC_B2 ", " CC_B2 ", " CC_C3 "\n\t" \ "xor " CC_B3 ", " CC_B3 ", " CC_C0 "\n\t" \ "xor " CC_B0 ", " CC_B0 ", " CC_C1 "\n\t" \ "slli " CC_T0 ", " CC_B1 ", " #sl "\n\t" \ "slli " CC_T1 ", " CC_B2 ", " #sl "\n\t" \ "slli " CC_T2 ", " CC_B3 ", " #sl "\n\t" \ "slli " CC_T3 ", " CC_B0 ", " #sl "\n\t" \ "srliw " CC_B1 ", " CC_B1 ", " #sr "\n\t" \ "srliw " CC_B2 ", " CC_B2 ", " #sr "\n\t" \ "srliw " CC_B3 ", " CC_B3 ", " #sr "\n\t" \ "srliw " CC_B0 ", " CC_B0 ", " #sr "\n\t" \ "or " CC_B1 ", " CC_B1 ", " CC_T0 "\n\t" \ "or " CC_B2 ", " CC_B2 ", " CC_T1 "\n\t" \ "or " CC_B3 ", " CC_B3 ", " CC_T2 "\n\t" \ "or " CC_B0 ", " CC_B0 ", " CC_T3 "\n\t" #else #define PART_ROUND_ODD_ABD(sl, sr) \ "add " CC_A0 ", " CC_A0 ", " CC_B0 "\n\t" \ "add " CC_A1 ", " CC_A1 ", " CC_B1 "\n\t" \ "add " CC_A2 ", " CC_A2 ", " CC_B2 "\n\t" \ "add " CC_A3 ", " CC_A3 ", " CC_B3 "\n\t" \ "xor " CC_D0 ", " CC_D0 ", " CC_A0 "\n\t" \ "xor " CC_D1 ", " CC_D1 ", " CC_A1 "\n\t" \ "xor " CC_D2 ", " CC_D2 ", " CC_A2 "\n\t" \ "xor " CC_D3 ", " CC_D3 ", " CC_A3 "\n\t" \ RORIW(REG_S6, REG_S6, sr) \ RORIW(REG_S7, REG_S7, sr) \ RORIW(REG_S8, REG_S8, sr) \ RORIW(REG_S9, REG_S9, sr) #define PART_ROUND_ODD_CDB(sl, sr) \ "add " CC_C0 ", " CC_C0 ", " CC_D0 "\n\t" \ "add " CC_C1 ", " CC_C1 ", " CC_D1 "\n\t" \ "add " CC_C2 ", " CC_C2 ", " CC_D2 "\n\t" \ "add " CC_C3 ", " CC_C3 ", " CC_D3 "\n\t" \ "xor " CC_B0 ", " CC_B0 ", " CC_C0 "\n\t" \ "xor " CC_B1 ", " CC_B1 ", " CC_C1 "\n\t" \ "xor " CC_B2 ", " CC_B2 ", " CC_C2 "\n\t" \ "xor " CC_B3 ", " CC_B3 ", " CC_C3 "\n\t" \ RORIW(REG_T3, REG_T3, sr) \ RORIW(REG_T4, REG_T4, sr) \ RORIW(REG_T5, REG_T5, sr) \ RORIW(REG_T6, REG_T6, sr) #define PART_ROUND_EVEN_ABD(sl, sr) \ "add " CC_A0 ", " CC_A0 ", " CC_B1 "\n\t" \ "add " CC_A1 ", " CC_A1 ", " CC_B2 "\n\t" \ "add " CC_A2 ", " CC_A2 ", " CC_B3 "\n\t" \ "add " CC_A3 ", " CC_A3 ", " CC_B0 "\n\t" \ "xor " CC_D3 ", " CC_D3 ", " CC_A0 "\n\t" \ "xor " CC_D0 ", " CC_D0 ", " CC_A1 "\n\t" \ "xor " CC_D1 ", " CC_D1 ", " CC_A2 "\n\t" \ "xor " CC_D2 ", " CC_D2 ", " CC_A3 "\n\t" \ RORIW(REG_S9, REG_S9, sr) \ RORIW(REG_S6, REG_S6, sr) \ RORIW(REG_S7, REG_S7, sr) \ RORIW(REG_S8, REG_S8, sr) #define PART_ROUND_EVEN_CDB(sl, sr) \ "add " CC_C2 ", " CC_C2 ", " CC_D3 "\n\t" \ "add " CC_C3 ", " CC_C3 ", " CC_D0 "\n\t" \ "add " CC_C0 ", " CC_C0 ", " CC_D1 "\n\t" \ "add " CC_C1 ", " CC_C1 ", " CC_D2 "\n\t" \ "xor " CC_B1 ", " CC_B1 ", " CC_C2 "\n\t" \ "xor " CC_B2 ", " CC_B2 ", " CC_C3 "\n\t" \ "xor " CC_B3 ", " CC_B3 ", " CC_C0 "\n\t" \ "xor " CC_B0 ", " CC_B0 ", " CC_C1 "\n\t" \ RORIW(REG_T4, REG_T4, sr) \ RORIW(REG_T5, REG_T5, sr) \ RORIW(REG_T6, REG_T6, sr) \ RORIW(REG_T3, REG_T3, sr) #endif #define QUARTER_ROUND_ODD() \ /* a += b; d ^= a; d <<<= 16; */ \ PART_ROUND_ODD_ABD(16, 16) \ /* c += d; b ^= c; b <<<= 12; */ \ PART_ROUND_ODD_CDB(12, 20) \ /* a += b; d ^= a; d <<<= 8; */ \ PART_ROUND_ODD_ABD( 8, 24) \ /* c += d; b ^= c; b <<<= 7; */ \ PART_ROUND_ODD_CDB( 7, 25) #define QUARTER_ROUND_EVEN() \ /* a += b; d ^= a; d <<<= 16; */ \ PART_ROUND_EVEN_ABD(16, 16) \ /* c += d; b ^= c; b <<<= 12; */ \ PART_ROUND_EVEN_CDB(12, 20) \ /* a += b; d ^= a; d <<<= 8; */ \ PART_ROUND_EVEN_ABD( 8, 24) \ /* c += d; b ^= c; b <<<= 7; */ \ PART_ROUND_EVEN_CDB( 7, 25) static WC_INLINE void wc_chacha_encrypt(const word32* input, const byte* m, byte* c, word32 bytes, word32* over) { __asm__ __volatile__ ( /* Ensure 64-bit bytes has top bits clear. */ "slli %[bytes], %[bytes], 32\n\t" "srli %[bytes], %[bytes], 32\n\t" "L_chacha20_riscv_outer:\n\t" /* Move state into regular registers */ "ld a4, 0(%[input])\n\t" "ld a6, 8(%[input])\n\t" "ld t3, 16(%[input])\n\t" "ld t5, 24(%[input])\n\t" "ld s2, 32(%[input])\n\t" "ld s4, 40(%[input])\n\t" "ld s6, 48(%[input])\n\t" "ld s8, 56(%[input])\n\t" "srli a5, a4, 32\n\t" "srli a7, a6, 32\n\t" "srli t4, t3, 32\n\t" "srli t6, t5, 32\n\t" "srli s3, s2, 32\n\t" "srli s5, s4, 32\n\t" "srli s7, s6, 32\n\t" "srli s9, s8, 32\n\t" /* Set number of odd+even rounds to perform */ "li a3, 10\n\t" "\n" "L_chacha20_riscv_loop:\n\t" /* Odd Round */ QUARTER_ROUND_ODD() "addi a3, a3, -1\n\t" /* Even Round */ QUARTER_ROUND_EVEN() "bnez a3, L_chacha20_riscv_loop\n\t" "addi %[bytes], %[bytes], -64\n\t" "ld t0, 0(%[input])\n\t" "ld t1, 8(%[input])\n\t" "ld t2, 16(%[input])\n\t" "ld s1, 24(%[input])\n\t" "add a4, a4, t0\n\t" "add a6, a6, t1\n\t" "add t3, t3, t2\n\t" "add t5, t5, s1\n\t" "srli t0, t0, 32\n\t" "srli t1, t1, 32\n\t" "srli t2, t2, 32\n\t" "srli s1, s1, 32\n\t" "add a5, a5, t0\n\t" "add a7, a7, t1\n\t" "add t4, t4, t2\n\t" "add t6, t6, s1\n\t" "ld t0, 32(%[input])\n\t" "ld t1, 40(%[input])\n\t" "ld t2, 48(%[input])\n\t" "ld s1, 56(%[input])\n\t" "add s2, s2, t0\n\t" "add s4, s4, t1\n\t" "add s6, s6, t2\n\t" "addi t2, t2, 1\n\t" "add s8, s8, s1\n\t" "srli t0, t0, 32\n\t" "srli t1, t1, 32\n\t" "sw t2, 48(%[input])\n\t" "srli t2, t2, 32\n\t" "srli s1, s1, 32\n\t" "add s3, s3, t0\n\t" "add s5, s5, t1\n\t" "add s7, s7, t2\n\t" "add s9, s9, s1\n\t" "bltz %[bytes], L_chacha20_riscv_over\n\t" #if !defined(WOLFSSL_RISCV_BIT_MANIPULATION) "ld t0, 0(%[m])\n\t" "ld t1, 8(%[m])\n\t" "ld t2, 16(%[m])\n\t" "ld s1, 24(%[m])\n\t" "xor a4, a4, t0\n\t" "xor a6, a6, t1\n\t" "xor t3, t3, t2\n\t" "xor t5, t5, s1\n\t" "srli t0, t0, 32\n\t" "srli t1, t1, 32\n\t" "srli t2, t2, 32\n\t" "srli s1, s1, 32\n\t" "xor a5, a5, t0\n\t" "xor a7, a7, t1\n\t" "xor t4, t4, t2\n\t" "xor t6, t6, s1\n\t" "ld t0, 32(%[m])\n\t" "ld t1, 40(%[m])\n\t" "ld t2, 48(%[m])\n\t" "ld s1, 56(%[m])\n\t" "xor s2, s2, t0\n\t" "xor s4, s4, t1\n\t" "xor s6, s6, t2\n\t" "xor s8, s8, s1\n\t" "srli t0, t0, 32\n\t" "srli t1, t1, 32\n\t" "srli t2, t2, 32\n\t" "srli s1, s1, 32\n\t" "xor s3, s3, t0\n\t" "xor s5, s5, t1\n\t" "xor s7, s7, t2\n\t" "xor s9, s9, s1\n\t" "sw a4, 0(%[c])\n\t" "sw a5, 4(%[c])\n\t" "sw a6, 8(%[c])\n\t" "sw a7, 12(%[c])\n\t" "sw t3, 16(%[c])\n\t" "sw t4, 20(%[c])\n\t" "sw t5, 24(%[c])\n\t" "sw t6, 28(%[c])\n\t" "sw s2, 32(%[c])\n\t" "sw s3, 36(%[c])\n\t" "sw s4, 40(%[c])\n\t" "sw s5, 44(%[c])\n\t" "sw s6, 48(%[c])\n\t" "sw s7, 52(%[c])\n\t" "sw s8, 56(%[c])\n\t" "sw s9, 60(%[c])\n\t" #else PACK(REG_A4, REG_A4, REG_A5) PACK(REG_A6, REG_A6, REG_A7) PACK(REG_T3, REG_T3, REG_T4) PACK(REG_T5, REG_T5, REG_T6) PACK(REG_S2, REG_S2, REG_S3) PACK(REG_S4, REG_S4, REG_S5) PACK(REG_S6, REG_S6, REG_S7) PACK(REG_S8, REG_S8, REG_S9) "ld a5, 0(%[m])\n\t" "ld a7, 8(%[m])\n\t" "ld t4, 16(%[m])\n\t" "ld t6, 24(%[m])\n\t" "ld s3, 32(%[m])\n\t" "ld s5, 40(%[m])\n\t" "ld s7, 48(%[m])\n\t" "ld s9, 56(%[m])\n\t" "xor a4, a4, a5\n\t" "xor a6, a6, a7\n\t" "xor t3, t3, t4\n\t" "xor t5, t5, t6\n\t" "xor s2, s2, s3\n\t" "xor s4, s4, s5\n\t" "xor s6, s6, s7\n\t" "xor s8, s8, s9\n\t" "sd a4, 0(%[c])\n\t" "sd a6, 8(%[c])\n\t" "sd t3, 16(%[c])\n\t" "sd t5, 24(%[c])\n\t" "sd s2, 32(%[c])\n\t" "sd s4, 40(%[c])\n\t" "sd s6, 48(%[c])\n\t" "sd s8, 56(%[c])\n\t" #endif "addi %[m], %[m], 64\n\t" "addi %[c], %[c], 64\n\t" "bnez %[bytes], L_chacha20_riscv_outer\n\t" "beqz %[bytes], L_chacha20_riscv_done\n\t" "L_chacha20_riscv_over:\n\t" "addi a3, %[bytes], 64\n\t" "sw a4, 0(%[over])\n\t" "sw a5, 4(%[over])\n\t" "sw a6, 8(%[over])\n\t" "sw a7, 12(%[over])\n\t" "sw t3, 16(%[over])\n\t" "sw t4, 20(%[over])\n\t" "sw t5, 24(%[over])\n\t" "sw t6, 28(%[over])\n\t" "sw s2, 32(%[over])\n\t" "sw s3, 36(%[over])\n\t" "sw s4, 40(%[over])\n\t" "sw s5, 44(%[over])\n\t" "sw s6, 48(%[over])\n\t" "sw s7, 52(%[over])\n\t" "sw s8, 56(%[over])\n\t" "sw s9, 60(%[over])\n\t" "addi t0, a3, -8\n\t" "bltz t0, L_chacha20_riscv_32bit\n\t" "addi a3, a3, -1\n\t" "L_chacha20_riscv_64bit_loop:\n\t" "ld t0, (%[m])\n\t" "ld t1, (%[over])\n\t" "xor t0, t0, t1\n\t" "sd t0, (%[c])\n\t" "addi %[m], %[m], 8\n\t" "addi %[c], %[c], 8\n\t" "addi %[over], %[over], 8\n\t" "addi a3, a3, -8\n\t" "bgez a3, L_chacha20_riscv_64bit_loop\n\t" "addi a3, a3, 1\n\t" "L_chacha20_riscv_32bit:\n\t" "addi t0, a3, -4\n\t" "bltz t0, L_chacha20_riscv_16bit\n\t" "lw t0, (%[m])\n\t" "lw t1, (%[over])\n\t" "xor t0, t0, t1\n\t" "sw t0, (%[c])\n\t" "addi %[m], %[m], 4\n\t" "addi %[c], %[c], 4\n\t" "addi %[over], %[over], 4\n\t" "L_chacha20_riscv_16bit:\n\t" "addi t0, a3, -2\n\t" "bltz t0, L_chacha20_riscv_8bit\n\t" "lh t0, (%[m])\n\t" "lh t1, (%[over])\n\t" "xor t0, t0, t1\n\t" "sh t0, (%[c])\n\t" "addi %[m], %[m], 2\n\t" "addi %[c], %[c], 2\n\t" "addi %[over], %[over], 2\n\t" "L_chacha20_riscv_8bit:\n\t" "addi t0, a3, -1\n\t" "bltz t0, L_chacha20_riscv_done\n\t\n\t" "lb t0, (%[m])\n\t" "lb t1, (%[over])\n\t" "xor t0, t0, t1\n\t" "sb t0, (%[c])\n\t" "bltz %[bytes], L_chacha20_riscv_done\n\t" "L_chacha20_riscv_done:\n\t" : [m] "+r" (m), [c] "+r" (c), [bytes] "+r" (bytes), [over] "+r" (over) : [input] "r" (input) : "memory", "t0", "t1", "t2", "s1", "a3", "t3", "t4", "t5", "t6", "a4", "a5", "a6", "a7", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9" ); } /** * Encrypt a stream of bytes */ static WC_INLINE void wc_chacha_encrypt_bytes(ChaCha* ctx, const byte* m, byte* c, word32 bytes) { wc_chacha_encrypt(ctx->X, m, c, bytes, ctx->over); ctx->left = (CHACHA_CHUNK_BYTES - (bytes & (CHACHA_CHUNK_BYTES - 1))) & (CHACHA_CHUNK_BYTES - 1); } #endif /** * API to encrypt/decrypt a message of any size. */ int wc_Chacha_Process(ChaCha* ctx, byte* output, const byte* input, word32 msglen) { int ret = 0; if ((ctx == NULL) || (output == NULL) || (input == NULL)) { ret = BAD_FUNC_ARG; } else if (msglen > 0) { if (ctx->left > 0) { word32 processed = min(msglen, ctx->left); byte* out = (byte*)ctx->over + CHACHA_CHUNK_BYTES - ctx->left; xorbufout(output, input, out, processed); ctx->left -= processed; msglen -= processed; output += processed; input += processed; } if (msglen > 0) { wc_chacha_encrypt_bytes(ctx, input, output, msglen); } } return ret; } #endif /* HAVE_CHACHA */ #endif /* WOLFSSL_ARMASM && !WOLFSSL_ARMASM_NO_NEON */