/* armv8-32-poly1305-asm
 *
 * Copyright (C) 2006-2026 wolfSSL Inc.
 *
 * This file is part of wolfSSL.
 *
 * wolfSSL is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * wolfSSL is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
 */

/* Generated using (from wolfssl):
 *   cd ../scripts
 *   ruby ./poly1305/poly1305.rb arm32 \
 *       ../wolfssl/wolfcrypt/src/port/arm/armv8-32-poly1305-asm.S
 */

/* Syntax: GNU as, 32-bit ARM (A32), run through the C preprocessor.
 * Existing comments start with '#'; comments added during review use the
 * C block form so the preprocessor can never mistake them for directives.
 */

/* NOTE(review): the operand of the #include below is missing in this copy
 * of the file. Restore it from the generator output before building.
 */
#include
#ifdef WOLFSSL_ARMASM
#if !defined(__aarch64__) && !defined(WOLFSSL_ARMASM_THUMB2)
#ifndef WOLFSSL_ARMASM_INLINE
#ifdef HAVE_POLY1305
#ifdef WOLFSSL_ARMASM_NO_NEON

/* ---------------------------------------------------------------------
 * Build without NEON: scalar code only.
 *
 * Context offsets used by the functions in this half (bytes from r0):
 *    0  r, clamped, four 32-bit words
 *   16  h accumulator, five 32-bit words
 *   36  pad (key bytes 16-31), four 32-bit words
 *   52  one word cleared by poly1305_set_key ("leftover")
 * ---------------------------------------------------------------------
 */

/* poly1305_arm32_blocks_16
 *
 * Absorbs 16-byte message blocks into the accumulator: for every block
 * h = (h + m) * r, reduced mod 2^130 - 5 far enough to fit five words
 * again (the final comparison against p is done in poly1305_final).
 *
 * In:    r0 = context pointer
 *        r1 = message pointer
 *        r2 = length in bytes. Returns at once when 0; otherwise the loop
 *             consumes 16 bytes per pass while the remaining count is > 0,
 *             so a multiple of 16 is expected.
 *        r3 = word added (with carry) into h[4] for every block -
 *             presumably the 2^128 padding bit; confirm against the caller.
 * Stack: 28-byte frame. sp+0..11 product scratch, sp+12 context,
 *        sp+16 message pointer, sp+20 remaining length, sp+24 value of r3.
 * Saves and restores r4-r11; returns by popping the saved lr into pc.
 */
	.text
	.align	4
	.globl	poly1305_arm32_blocks_16
	.type	poly1305_arm32_blocks_16, %function
poly1305_arm32_blocks_16:
	push	{r4, r5, r6, r7, r8, r9, r10, r11, lr}
	sub	sp, sp, #28
	cmp	r2, #0
	beq	L_poly1305_arm32_16_done
	/* Spill the four arguments to sp+12..sp+27. */
	add	lr, sp, #12
	stm	lr, {r0, r1, r2, r3}
	# Get h pointer
	add	lr, r0, #16
	ldm	lr, {r4, r5, r6, r7, r8}
L_poly1305_arm32_16_loop:
	# Add m to h
	ldr	r1, [sp, #16]
	ldr	r2, [r1]
	ldr	r3, [r1, #4]
	ldr	r9, [r1, #8]
	ldr	r10, [r1, #12]
	ldr	r11, [sp, #24]
	adds	r4, r4, r2
	adcs	r5, r5, r3
	adcs	r6, r6, r9
	adcs	r7, r7, r10
	/* add without S: advances the message pointer, keeps the carry. */
	add	r1, r1, #16
	adc	r8, r8, r11
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
	stm	lr, {r4, r5, r6, r7, r8}
#else
	# h[0]-h[2] in r4-r6 for multiplication.
	str	r7, [lr, #12]
	str	r8, [lr, #16]
#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */
	str	r1, [sp, #16]
	ldr	r1, [sp, #12]
	# Multiply h by r
	/* Both variants leave product words 0-3 in r4-r7, word 4 in r8,
	 * words 5-7 in r9-r11 and r12 = 0. */
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
	# r0 = #0, r1 = r, lr = h, r2 = h[j], r3 = r[i]
	ldr	r3, [r1]
	eor	r0, r0, r0
	# r[0] * h[0]
	# h[0] in r4
	umull	r4, r5, r3, r4
	# r[0] * h[2]
	# h[2] in r6
	umull	r6, r7, r3, r6
	# r[0] * h[4]
	# h[4] in r8
	mul	r8, r3, r8
	# r[0] * h[1]
	ldr	r2, [lr, #4]
	mov	r12, r0
	umlal	r5, r12, r3, r2
	# r[0] * h[3]
	ldr	r2, [lr, #12]
	adds	r6, r6, r12
	adc	r7, r7, r0
	umlal	r7, r8, r3, r2
	# r[1] * h[0]
	ldr	r3, [r1, #4]
	ldr	r2, [lr]
	mov	r12, r0
	umlal	r5, r12, r3, r2
	# r[1] * h[1]
	ldr	r2, [lr, #4]
	adds	r6, r6, r12
	adc	r12, r0, r0
	umlal	r6, r12, r3, r2
	# r[1] * h[2]
	ldr	r2, [lr, #8]
	adds	r7, r7, r12
	adc	r12, r0, r0
	umlal	r7, r12, r3, r2
	# r[1] * h[3]
	ldr	r2, [lr, #12]
	adds	r8, r8, r12
	adc	r9, r0, r0
	umlal	r8, r9, r3, r2
	# r[1] * h[4]
	ldr	r2, [lr, #16]
	mla	r9, r3, r2, r9
	# r[2] * h[0]
	ldr	r3, [r1, #8]
	ldr	r2, [lr]
	mov	r12, r0
	umlal	r6, r12, r3, r2
	# r[2] * h[1]
	ldr	r2, [lr, #4]
	adds	r7, r7, r12
	adc	r12, r0, r0
	umlal	r7, r12, r3, r2
	# r[2] * h[2]
	ldr	r2, [lr, #8]
	adds	r8, r8, r12
	adc	r12, r0, r0
	umlal	r8, r12, r3, r2
	# r[2] * h[3]
	ldr	r2, [lr, #12]
	adds	r9, r9, r12
	adc	r10, r0, r0
	umlal	r9, r10, r3, r2
	# r[2] * h[4]
	ldr	r2, [lr, #16]
	mla	r10, r3, r2, r10
	# r[3] * h[0]
	ldr	r3, [r1, #12]
	ldr	r2, [lr]
	mov	r12, r0
	umlal	r7, r12, r3, r2
	# r[3] * h[1]
	ldr	r2, [lr, #4]
	adds	r8, r8, r12
	adc	r12, r0, r0
	umlal	r8, r12, r3, r2
	# r[3] * h[2]
	ldr	r2, [lr, #8]
	adds	r9, r9, r12
	adc	r10, r10, r0
	umlal	r9, r10, r3, r2
	# r[3] * h[3]
	ldr	r2, [lr, #12]
	mov	r11, r0
	umlal	r10, r11, r3, r2
	# r[3] * h[4]
	ldr	r2, [lr, #16]
	mov	r12, r0
	mla	r11, r3, r2, r11
#else
	/* r0-r3 = r[0..3]; this overwrites the context pointer in r0,
	 * which stays available at sp+12. */
	ldm	r1, {r0, r1, r2, r3}
	# r[0] * h[0]
	umull	r10, r11, r0, r4
	# r[1] * h[0]
	umull	r12, r7, r1, r4
	# r[0] * h[1]
	umaal	r11, r12, r0, r5
	# r[2] * h[0]
	umull	r8, r9, r2, r4
	# r[1] * h[1]
	umaal	r12, r8, r1, r5
	# r[0] * h[2]
	umaal	r12, r7, r0, r6
	# r[3] * h[0]
	umaal	r8, r9, r3, r4
	/* Product words 0-2 are complete: park them at sp+0..11 so that
	 * r10-r12 can be reused. */
	stm	sp, {r10, r11, r12}
	# r[2] * h[1]
	umaal	r7, r8, r2, r5
	# Replace h[0] with h[3]
	ldr	r4, [lr, #12]
	# r[1] * h[2]
	umull	r10, r11, r1, r6
	# r[2] * h[2]
	umaal	r8, r9, r2, r6
	# r[0] * h[3]
	umaal	r7, r10, r0, r4
	# r[3] * h[1]
	umaal	r8, r11, r3, r5
	# r[1] * h[3]
	umaal	r8, r10, r1, r4
	# r[3] * h[2]
	umaal	r9, r11, r3, r6
	# r[2] * h[3]
	umaal	r9, r10, r2, r4
	# Replace h[1] with h[4]
	ldr	r5, [lr, #16]
	# r[3] * h[3]
	umaal	r10, r11, r3, r4
	mov	r12, #0
	# r[0] * h[4]
	umaal	r8, r12, r0, r5
	# r[1] * h[4]
	umaal	r9, r12, r1, r5
	# r[2] * h[4]
	umaal	r10, r12, r2, r5
	# r[3] * h[4]
	umaal	r11, r12, r3, r5
	# DONE
	/* Reload product words 0-2 from the scratch area. */
	ldm	sp, {r4, r5, r6}
#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */
	# r12 will be zero because r is masked.
	# Load length
	ldr	r2, [sp, #20]
	# Reduce mod 2^130 - 5
	/* Let H be the part of the product at and above bit 130 (bits 2 and
	 * up of r8, then r9-r11). Since 2^130 = 5 mod p, 5 * H is folded into
	 * the low 130 bits as 4*H + H: the first carry chain adds H in its
	 * word-aligned position (4 * H), the second adds H shifted right by
	 * two bits. The lsr/orr instructions in between do not set flags, so
	 * the carry chains are not disturbed. */
	bic	r3, r8, #0x3
	and	r8, r8, #3
	adds	r4, r4, r3
	lsr	r3, r3, #2
	adcs	r5, r5, r9
	orr	r3, r3, r9, lsl #30
	adcs	r6, r6, r10
	lsr	r9, r9, #2
	adcs	r7, r7, r11
	orr	r9, r9, r10, lsl #30
	adc	r8, r8, r12
	lsr	r10, r10, #2
	adds	r4, r4, r3
	orr	r10, r10, r11, lsl #30
	adcs	r5, r5, r9
	lsr	r11, r11, #2
	adcs	r6, r6, r10
	adcs	r7, r7, r11
	adc	r8, r8, r12
	# Sub 16 from length.
	subs	r2, r2, #16
	# Store length.
	str	r2, [sp, #20]
	# Loop again if more message to do.
	bgt	L_poly1305_arm32_16_loop
	/* Write the updated accumulator back to the context. */
	stm	lr, {r4, r5, r6, r7, r8}
L_poly1305_arm32_16_done:
	add	sp, sp, #28
	pop	{r4, r5, r6, r7, r8, r9, r10, r11, pc}
	.size	poly1305_arm32_blocks_16,.-poly1305_arm32_blocks_16

/* Clamp mask applied to the four words of r in poly1305_set_key. */
#ifndef __APPLE__
	.text
	.type	L_poly1305_arm32_clamp, %object
	.size	L_poly1305_arm32_clamp, 16
#else
	.section	__DATA,__data
#endif /* __APPLE__ */
	# 8-byte aligned, 64-bit aligned
#ifndef __APPLE__
	.align	3
#else
	.p2align	3
#endif /* __APPLE__ */
L_poly1305_arm32_clamp:
	.long	0x0fffffff,0x0ffffffc,0x0ffffffc,0x0ffffffc

/* poly1305_set_key (build without NEON)
 *
 * Initialises the context from a 32-byte key.
 *
 * In:    r0 = context pointer
 *        r1 = key pointer; words at 0-15 are clamped and stored as r,
 *             words at 16-31 are stored unchanged as the pad.
 * Out:   context: r at +0, h at +16 set to zero (five words), pad at +36,
 *        word at +52 set to zero.
 * Saves and restores r4-r8; r2, r3, r12 and flags are clobbered.
 */
	.text
	.align	4
	.globl	poly1305_set_key
	.type	poly1305_set_key, %function
poly1305_set_key:
	push	{r4, r5, r6, r7, r8, lr}
	# Load mask.
	adr	lr, L_poly1305_arm32_clamp
	ldm	lr, {r6, r7, r8, r12}
	# Load and cache padding.
	ldr	r2, [r1, #16]
	ldr	r3, [r1, #20]
	ldr	r4, [r1, #24]
	ldr	r5, [r1, #28]
	add	lr, r0, #36
	stm	lr, {r2, r3, r4, r5}
	# Load, mask and store r.
	ldr	r2, [r1]
	ldr	r3, [r1, #4]
	ldr	r4, [r1, #8]
	ldr	r5, [r1, #12]
	and	r2, r2, r6
	and	r3, r3, r7
	and	r4, r4, r8
	and	r5, r5, r12
	add	lr, r0, #0
	stm	lr, {r2, r3, r4, r5}
	# h (accumulator) = 0
	eor	r6, r6, r6
	eor	r7, r7, r7
	eor	r8, r8, r8
	eor	r12, r12, r12
	add	lr, r0, #16
	eor	r5, r5, r5
	stm	lr, {r5, r6, r7, r8, r12}
	# Zero leftover
	str	r5, [r0, #52]
	pop	{r4, r5, r6, r7, r8, pc}
	.size	poly1305_set_key,.-poly1305_set_key

/* poly1305_final (build without NEON)
 *
 * Produces the 16-byte MAC: brings h into the range below p = 2^130 - 5,
 * adds the pad modulo 2^128, stores the result and wipes r, h and the pad
 * in the context.
 *
 * In:    r0 = context pointer
 *        r1 = output pointer, receives four 32-bit words
 * Saves and restores r4-r9; r2, r3, r12 and flags are clobbered.
 */
	.text
	.align	4
	.globl	poly1305_final
	.type	poly1305_final, %function
poly1305_final:
	push	{r4, r5, r6, r7, r8, r9, lr}
	add	r9, r0, #16
	ldm	r9, {r4, r5, r6, r7, r8}
	# Add 5 and check for h larger than p.
	/* Only the carry out of h + 5 matters for the first four words, so
	 * r2 is overwritten each time. After the chain r2 = top word of
	 * h + 5; it is >= 4 exactly when h + 5 >= 2^130. The sub/lsr/sub/and
	 * sequence turns that into 5 (h + 5 >= 2^130) or 0 without a branch. */
	adds	r2, r4, #5
	adcs	r2, r5, #0
	adcs	r2, r6, #0
	adcs	r2, r7, #0
	adc	r2, r8, #0
	sub	r2, r2, #4
	lsr	r2, r2, #31
	sub	r2, r2, #1
	and	r2, r2, #5
	# Add 0/5 to h.
	/* Only the low 128 bits are kept from here on; adding 5 and dropping
	 * the bits from 2^128 up equals subtracting p modulo 2^128. */
	adds	r4, r4, r2
	adcs	r5, r5, #0
	adcs	r6, r6, #0
	adc	r7, r7, #0
	# Add padding
	add	r9, r0, #36
	ldm	r9, {r2, r3, r12, lr}
	adds	r4, r4, r2
	adcs	r5, r5, r3
	adcs	r6, r6, r12
	adc	r7, r7, lr
	# Store MAC
	str	r4, [r1]
	str	r5, [r1, #4]
	str	r6, [r1, #8]
	str	r7, [r1, #12]
	# Zero out h.
	eor	r4, r4, r4
	eor	r5, r5, r5
	eor	r6, r6, r6
	eor	r7, r7, r7
	eor	r8, r8, r8
	add	r9, r0, #16
	stm	r9, {r4, r5, r6, r7, r8}
	# Zero out r.
	add	r9, r0, #0
	stm	r9, {r4, r5, r6, r7}
	# Zero out padding.
	add	r9, r0, #36
	stm	r9, {r4, r5, r6, r7}
	pop	{r4, r5, r6, r7, r8, r9, pc}
	.size	poly1305_final,.-poly1305_final
#else

/* ---------------------------------------------------------------------
 * Build with NEON.
 *
 * Context offsets used by the functions in this half (bytes from r0):
 *      0  r, clamped, four 32-bit words
 *     16  h accumulator, five 32-bit words; the word at 36 is zeroed with
 *         it by poly1305_set_key and is covered by the 24-byte vld1/vst1
 *         in poly1305_arm32_blocks
 *     40  pad (key bytes 16-31), four 32-bit words
 *     56  one word cleared by poly1305_set_key ("leftover")
 *   0x7c  five 64-bit entries; entry i = {r^2 limb i, r limb i} as two
 *         32-bit lanes holding 26-bit limbs
 *   0xa4  five 64-bit entries; entry i = {r^4 limb i, r^3 limb i}
 * ---------------------------------------------------------------------
 */

/* poly1305_arm32_blocks_16
 *
 * Scalar block function, same code as in the build without NEON: for
 * every 16-byte block h = (h + m) * r, reduced mod 2^130 - 5 far enough to
 * fit five words again.
 *
 * In:    r0 = context pointer
 *        r1 = message pointer
 *        r2 = length in bytes. Returns at once when 0; otherwise the loop
 *             consumes 16 bytes per pass while the remaining count is > 0,
 *             so a multiple of 16 is expected.
 *        r3 = word added (with carry) into h[4] for every block -
 *             presumably the 2^128 padding bit; confirm against the caller.
 * Stack: 28-byte frame. sp+0..11 product scratch, sp+12 context,
 *        sp+16 message pointer, sp+20 remaining length, sp+24 value of r3.
 * Saves and restores r4-r11; returns by popping the saved lr into pc.
 */
	.text
	.align	4
	.globl	poly1305_arm32_blocks_16
	.type	poly1305_arm32_blocks_16, %function
poly1305_arm32_blocks_16:
	push	{r4, r5, r6, r7, r8, r9, r10, r11, lr}
	sub	sp, sp, #28
	cmp	r2, #0
	beq	L_poly1305_arm32_16_done
	/* Spill the four arguments to sp+12..sp+27. */
	add	lr, sp, #12
	stm	lr, {r0, r1, r2, r3}
	# Get h pointer
	add	lr, r0, #16
	ldm	lr, {r4, r5, r6, r7, r8}
L_poly1305_arm32_16_loop:
	# Add m to h
	ldr	r1, [sp, #16]
	ldr	r2, [r1]
	ldr	r3, [r1, #4]
	ldr	r9, [r1, #8]
	ldr	r10, [r1, #12]
	ldr	r11, [sp, #24]
	adds	r4, r4, r2
	adcs	r5, r5, r3
	adcs	r6, r6, r9
	adcs	r7, r7, r10
	/* add without S: advances the message pointer, keeps the carry. */
	add	r1, r1, #16
	adc	r8, r8, r11
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
	stm	lr, {r4, r5, r6, r7, r8}
#else
	# h[0]-h[2] in r4-r6 for multiplication.
	str	r7, [lr, #12]
	str	r8, [lr, #16]
#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */
	str	r1, [sp, #16]
	ldr	r1, [sp, #12]
	# Multiply h by r
	/* Both variants leave product words 0-3 in r4-r7, word 4 in r8,
	 * words 5-7 in r9-r11 and r12 = 0. */
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
	# r0 = #0, r1 = r, lr = h, r2 = h[j], r3 = r[i]
	ldr	r3, [r1]
	eor	r0, r0, r0
	# r[0] * h[0]
	# h[0] in r4
	umull	r4, r5, r3, r4
	# r[0] * h[2]
	# h[2] in r6
	umull	r6, r7, r3, r6
	# r[0] * h[4]
	# h[4] in r8
	mul	r8, r3, r8
	# r[0] * h[1]
	ldr	r2, [lr, #4]
	mov	r12, r0
	umlal	r5, r12, r3, r2
	# r[0] * h[3]
	ldr	r2, [lr, #12]
	adds	r6, r6, r12
	adc	r7, r7, r0
	umlal	r7, r8, r3, r2
	# r[1] * h[0]
	ldr	r3, [r1, #4]
	ldr	r2, [lr]
	mov	r12, r0
	umlal	r5, r12, r3, r2
	# r[1] * h[1]
	ldr	r2, [lr, #4]
	adds	r6, r6, r12
	adc	r12, r0, r0
	umlal	r6, r12, r3, r2
	# r[1] * h[2]
	ldr	r2, [lr, #8]
	adds	r7, r7, r12
	adc	r12, r0, r0
	umlal	r7, r12, r3, r2
	# r[1] * h[3]
	ldr	r2, [lr, #12]
	adds	r8, r8, r12
	adc	r9, r0, r0
	umlal	r8, r9, r3, r2
	# r[1] * h[4]
	ldr	r2, [lr, #16]
	mla	r9, r3, r2, r9
	# r[2] * h[0]
	ldr	r3, [r1, #8]
	ldr	r2, [lr]
	mov	r12, r0
	umlal	r6, r12, r3, r2
	# r[2] * h[1]
	ldr	r2, [lr, #4]
	adds	r7, r7, r12
	adc	r12, r0, r0
	umlal	r7, r12, r3, r2
	# r[2] * h[2]
	ldr	r2, [lr, #8]
	adds	r8, r8, r12
	adc	r12, r0, r0
	umlal	r8, r12, r3, r2
	# r[2] * h[3]
	ldr	r2, [lr, #12]
	adds	r9, r9, r12
	adc	r10, r0, r0
	umlal	r9, r10, r3, r2
	# r[2] * h[4]
	ldr	r2, [lr, #16]
	mla	r10, r3, r2, r10
	# r[3] * h[0]
	ldr	r3, [r1, #12]
	ldr	r2, [lr]
	mov	r12, r0
	umlal	r7, r12, r3, r2
	# r[3] * h[1]
	ldr	r2, [lr, #4]
	adds	r8, r8, r12
	adc	r12, r0, r0
	umlal	r8, r12, r3, r2
	# r[3] * h[2]
	ldr	r2, [lr, #8]
	adds	r9, r9, r12
	adc	r10, r10, r0
	umlal	r9, r10, r3, r2
	# r[3] * h[3]
	ldr	r2, [lr, #12]
	mov	r11, r0
	umlal	r10, r11, r3, r2
	# r[3] * h[4]
	ldr	r2, [lr, #16]
	mov	r12, r0
	mla	r11, r3, r2, r11
#else
	/* r0-r3 = r[0..3]; this overwrites the context pointer in r0,
	 * which stays available at sp+12. */
	ldm	r1, {r0, r1, r2, r3}
	# r[0] * h[0]
	umull	r10, r11, r0, r4
	# r[1] * h[0]
	umull	r12, r7, r1, r4
	# r[0] * h[1]
	umaal	r11, r12, r0, r5
	# r[2] * h[0]
	umull	r8, r9, r2, r4
	# r[1] * h[1]
	umaal	r12, r8, r1, r5
	# r[0] * h[2]
	umaal	r12, r7, r0, r6
	# r[3] * h[0]
	umaal	r8, r9, r3, r4
	/* Product words 0-2 are complete: park them at sp+0..11 so that
	 * r10-r12 can be reused. */
	stm	sp, {r10, r11, r12}
	# r[2] * h[1]
	umaal	r7, r8, r2, r5
	# Replace h[0] with h[3]
	ldr	r4, [lr, #12]
	# r[1] * h[2]
	umull	r10, r11, r1, r6
	# r[2] * h[2]
	umaal	r8, r9, r2, r6
	# r[0] * h[3]
	umaal	r7, r10, r0, r4
	# r[3] * h[1]
	umaal	r8, r11, r3, r5
	# r[1] * h[3]
	umaal	r8, r10, r1, r4
	# r[3] * h[2]
	umaal	r9, r11, r3, r6
	# r[2] * h[3]
	umaal	r9, r10, r2, r4
	# Replace h[1] with h[4]
	ldr	r5, [lr, #16]
	# r[3] * h[3]
	umaal	r10, r11, r3, r4
	mov	r12, #0
	# r[0] * h[4]
	umaal	r8, r12, r0, r5
	# r[1] * h[4]
	umaal	r9, r12, r1, r5
	# r[2] * h[4]
	umaal	r10, r12, r2, r5
	# r[3] * h[4]
	umaal	r11, r12, r3, r5
	# DONE
	/* Reload product words 0-2 from the scratch area. */
	ldm	sp, {r4, r5, r6}
#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */
	# r12 will be zero because r is masked.
	# Load length
	ldr	r2, [sp, #20]
	# Reduce mod 2^130 - 5
	/* Let H be the part of the product at and above bit 130 (bits 2 and
	 * up of r8, then r9-r11). Since 2^130 = 5 mod p, 5 * H is folded into
	 * the low 130 bits as 4*H + H: the first carry chain adds H in its
	 * word-aligned position (4 * H), the second adds H shifted right by
	 * two bits. The lsr/orr instructions in between do not set flags, so
	 * the carry chains are not disturbed. */
	bic	r3, r8, #0x3
	and	r8, r8, #3
	adds	r4, r4, r3
	lsr	r3, r3, #2
	adcs	r5, r5, r9
	orr	r3, r3, r9, lsl #30
	adcs	r6, r6, r10
	lsr	r9, r9, #2
	adcs	r7, r7, r11
	orr	r9, r9, r10, lsl #30
	adc	r8, r8, r12
	lsr	r10, r10, #2
	adds	r4, r4, r3
	orr	r10, r10, r11, lsl #30
	adcs	r5, r5, r9
	lsr	r11, r11, #2
	adcs	r6, r6, r10
	adcs	r7, r7, r11
	adc	r8, r8, r12
	# Sub 16 from length.
	subs	r2, r2, #16
	# Store length.
	str	r2, [sp, #20]
	# Loop again if more message to do.
	bgt	L_poly1305_arm32_16_loop
	/* Write the updated accumulator back to the context. */
	stm	lr, {r4, r5, r6, r7, r8}
L_poly1305_arm32_16_done:
	add	sp, sp, #28
	pop	{r4, r5, r6, r7, r8, r9, r10, r11, pc}
	.size	poly1305_arm32_blocks_16,.-poly1305_arm32_blocks_16

/* poly1305_arm32_blocks
 *
 * NEON block function. Absorbs full 16-byte blocks into h using the
 * precomputed powers of r stored by poly1305_set_key.
 *
 * In:    r0 = context pointer
 *        r1 = message pointer
 *        r2 = length in bytes. Every path adds a fixed 2^128 bit to each
 *             block (mov r12, #1 / vorr #0x1000000), so only full blocks
 *             are expected - a non-zero multiple of 16.
 * Flow:  length <= 16  : one block on the scalar path (start_1).
 *        length >= 64  : four blocks per pass (start_4) using the entries
 *                        at 0xa4 and 0x7c.
 *        32 bytes left : one pass over two blocks (start_2) using the
 *                        entries at 0x7c.
 *        16 bytes left : h is repacked to 32-bit words and the last block
 *                        goes through the scalar path (begin_1).
 * NEON register roles: d0-d4 = h as five 26-bit limbs, d20-d24 =
 *        {r^2, r} limbs, d26-d30 = {r^4, r^3} limbs, d6-d9 = 5 * limbs 1-4
 *        of the multiplier in use, q5-q9 = 64-bit product columns,
 *        d31 = 0x3ffffff limb mask.
 * Saves and restores r4-r11 and d8-d15.
 */
	.text
	.align	4
	.globl	poly1305_arm32_blocks
	.type	poly1305_arm32_blocks, %function
poly1305_arm32_blocks:
	push	{r4, r5, r6, r7, r8, r9, r10, r11, lr}
	vpush	{d8-d15}
	cmp	r2, #16
	add	r12, r0, #16
	bgt	L_poly1305_arm32_blocks_begin_neon
	/* At most one block: load h as 32-bit words and go scalar. */
	ldm	r12, {r7, r8, r9, r10, r11}
	b	L_poly1305_arm32_blocks_start_1
L_poly1305_arm32_blocks_begin_neon:
	/* q15 = all ones >> 38 in each 64-bit lane = 0x3ffffff. */
	vmov.i16	q15, #0xffff
	vshr.u64	q15, q15, #38
	/* Load 24 bytes of h and split them into five 26-bit limbs, one per
	 * 64-bit register (d0-d4). */
	vld1.64	{d0-d2}, [r12]
	vshl.u64	d4, d2, #24
	vsri.u64	d4, d1, #40
	vshr.u64	d3, d1, #14
	vshl.u64	d2, d1, #12
	vsri.u64	d1, d0, #26
	vsri.u64	d2, d0, #52
	vand.u64	d0, d0, d31
	vand.u64	d3, d3, d31
	vand.u64	d2, d2, d31
	vand.u64	d1, d1, d31
	/* d20-d24 = {r^2, r} limbs. */
	add	r3, r0, #0x7c
	vldm.32	r3, {d20-d24}
	cmp	r2, #0x40
	bge	L_poly1305_arm32_blocks_begin_4
	/* Fewer than 64 bytes: d6-d9 = 5 * limbs 1-4 of {r^2, r}
	 * (x * 4 + x), then handle one pair of blocks. */
	vshl.u32	d6, d21, #2
	vshl.u32	d7, d22, #2
	vshl.u32	d8, d23, #2
	vshl.u32	d9, d24, #2
	vadd.u32	d6, d6, d21
	vadd.u32	d7, d7, d22
	vadd.u32	d8, d8, d23
	vadd.u32	d9, d9, d24
	b	L_poly1305_arm32_blocks_start_2
L_poly1305_arm32_blocks_begin_4:
	/* d26-d30 = {r^4, r^3} limbs. */
	add	r3, r0, #0xa4
	vldm.32	r3, {d26-d30}
L_poly1305_arm32_blocks_start_4:
	sub	r2, #0x40
	/* Load two blocks de-interleaved: d10-d13 = words 0-3, with the
	 * first block in lane 0 and the second in lane 1. */
	vld4.32	{d10-d13}, [r1]!
	/* d6-d9 = 5 * limbs 1-4 of {r^4, r^3}. */
	vshl.u32	d6, d27, #2
	vshl.u32	d7, d28, #2
	vshl.u32	d8, d29, #2
	vshl.u32	d9, d30, #2
	vadd.u32	d6, d6, d27
	vadd.u32	d7, d7, d28
	vadd.u32	d8, d8, d29
	vadd.u32	d9, d9, d30
	/* Convert the four message words into five 26-bit limbs (d10-d14)
	 * and set bit 24 of the top limb, i.e. 2^128. */
	vshr.u32	d14, d13, #8
	vshl.u32	d13, d13, #18
	vorr.i32	d14, d14, #0x1000000
	vsri.u32	d13, d12, #14
	vshl.u32	d12, d12, #12
	vand.i32	d13, d13, #0x3ffffff
	vsri.u32	d12, d11, #20
	vshl.u32	d11, d11, #6
	vand.i32	d12, d12, #0x3ffffff
	vsri.u32	d11, d10, #26
	vand.i32	d10, d10, #0x3ffffff
	vand.i32	d11, d11, #0x3ffffff
	/* Add the message limbs to h; lane 1 of d0-d4 is zero at this point,
	 * so h is only combined with the first block. */
	vadd.u32	d4, d4, d14
	vadd.u32	q1, q1, q6
	vadd.u32	q0, q0, q5
	/* Lane-wise product: (h + m0) * r^4 and m1 * r^3. */
	vmull.u32	q5, d0, d26
	vmull.u32	q6, d0, d27
	vmull.u32	q7, d0, d28
	vmull.u32	q8, d0, d29
	vmull.u32	q9, d0, d30
	vmlal.u32	q5, d1, d9
	vmlal.u32	q6, d1, d26
	vmlal.u32	q7, d1, d27
	vmlal.u32	q8, d1, d28
	vmlal.u32	q9, d1, d29
	vmlal.u32	q5, d2, d8
	vmlal.u32	q6, d2, d9
	vmlal.u32	q7, d2, d26
	vmlal.u32	q8, d2, d27
	vmlal.u32	q9, d2, d28
	vmlal.u32	q5, d3, d7
	vmlal.u32	q6, d3, d8
	vmlal.u32	q7, d3, d9
	vmlal.u32	q8, d3, d26
	vmlal.u32	q9, d3, d27
	vmlal.u32	q5, d4, d6
	vmlal.u32	q6, d4, d7
	vmlal.u32	q7, d4, d8
	vmlal.u32	q8, d4, d9
	vmlal.u32	q9, d4, d26
	/* Next two blocks, loaded straight into d0-d3. */
	vld4.32	{d0-d3}, [r1]!
	/* d6-d9 = 5 * limbs 1-4 of {r^2, r}. */
	vshl.u32	d6, d21, #2
	vshl.u32	d7, d22, #2
	vshl.u32	d8, d23, #2
	vshl.u32	d9, d24, #2
	vadd.u32	d6, d6, d21
	vadd.u32	d7, d7, d22
	vadd.u32	d8, d8, d23
	vadd.u32	d9, d9, d24
	/* Words to 26-bit limbs in d0-d4, plus the 2^128 bit. */
	vshr.u32	d4, d3, #8
	vshl.u32	d3, d3, #18
	vorr.i32	d4, d4, #0x1000000
	vsri.u32	d3, d2, #14
	vshl.u32	d2, d2, #12
	vand.i32	d3, d3, #0x3ffffff
	vsri.u32	d2, d1, #20
	vshl.u32	d1, d1, #6
	vand.i32	d2, d2, #0x3ffffff
	vsri.u32	d1, d0, #26
	vand.i32	d0, d0, #0x3ffffff
	vand.i32	d1, d1, #0x3ffffff
	/* Accumulate m2 * r^2 and m3 * r onto the same columns. */
	vmlal.u32	q5, d0, d20
	vmlal.u32	q6, d0, d21
	vmlal.u32	q7, d0, d22
	vmlal.u32	q8, d0, d23
	vmlal.u32	q9, d0, d24
	vmlal.u32	q5, d1, d9
	vmlal.u32	q6, d1, d20
	vmlal.u32	q7, d1, d21
	vmlal.u32	q8, d1, d22
	vmlal.u32	q9, d1, d23
	vmlal.u32	q5, d2, d8
	vmlal.u32	q6, d2, d9
	vmlal.u32	q7, d2, d20
	vmlal.u32	q8, d2, d21
	vmlal.u32	q9, d2, d22
	vmlal.u32	q5, d3, d7
	vmlal.u32	q6, d3, d8
	vmlal.u32	q7, d3, d9
	vmlal.u32	q8, d3, d20
	vmlal.u32	q9, d3, d21
	vmlal.u32	q5, d4, d6
	vmlal.u32	q6, d4, d7
	vmlal.u32	q7, d4, d8
	vmlal.u32	q8, d4, d9
	vmlal.u32	q9, d4, d20
	/* Sum the two lanes of every column into d0-d4. */
	vadd.u64	d0, d10, d11
	vadd.u64	d1, d12, d13
	vadd.u64	d2, d14, d15
	vadd.u64	d3, d16, d17
	vadd.u64	d4, d18, d19
	/* Carry propagation in 26-bit steps; the carry out of limb 4 is
	 * multiplied by 5 (c + 4 * c) and added to limb 0. */
	vsra.u64	d1, d0, #26
	vand.u64	d0, d0, d31
	vsra.u64	d2, d1, #26
	vand.u64	d1, d1, d31
	vsra.u64	d3, d2, #26
	vand.u64	d2, d2, d31
	vsra.u64	d4, d3, #26
	vand.u64	d3, d3, d31
	vshr.u64	d15, d4, #26
	vand.u64	d4, d4, d31
	vadd.u64	d0, d0, d15
	vshl.u64	d15, d15, #2
	vadd.u64	d0, d0, d15
	vsra.u64	d1, d0, #26
	vand.u64	d0, d0, d31
	cmp	r2, #0x40
	bge	L_poly1305_arm32_blocks_start_4
	cmp	r2, #32
	blt	L_poly1305_arm32_blocks_done_neon
L_poly1305_arm32_blocks_start_2:
	/* One pair of blocks: (h + m0) * r^2 and m1 * r. d6-d9 hold
	 * 5 * limbs 1-4 of {r^2, r} on both ways into this label. */
	sub	r2, #32
	vld4.32	{d10-d13}, [r1]!
	vshr.u32	d14, d13, #8
	vshl.u32	d13, d13, #18
	vorr.i32	d14, d14, #0x1000000
	vsri.u32	d13, d12, #14
	vshl.u32	d12, d12, #12
	vand.i32	d13, d13, #0x3ffffff
	vsri.u32	d12, d11, #20
	vshl.u32	d11, d11, #6
	vand.i32	d12, d12, #0x3ffffff
	vsri.u32	d11, d10, #26
	vand.i32	d10, d10, #0x3ffffff
	vand.i32	d11, d11, #0x3ffffff
	vadd.u32	d4, d4, d14
	vadd.u32	q1, q1, q6
	vadd.u32	q0, q0, q5
	vmull.u32	q5, d0, d20
	vmull.u32	q6, d0, d21
	vmull.u32	q7, d0, d22
	vmull.u32	q8, d0, d23
	vmull.u32	q9, d0, d24
	vmlal.u32	q5, d1, d9
	vmlal.u32	q6, d1, d20
	vmlal.u32	q7, d1, d21
	vmlal.u32	q8, d1, d22
	vmlal.u32	q9, d1, d23
	vmlal.u32	q5, d2, d8
	vmlal.u32	q6, d2, d9
	vmlal.u32	q7, d2, d20
	vmlal.u32	q8, d2, d21
	vmlal.u32	q9, d2, d22
	vmlal.u32	q5, d3, d7
	vmlal.u32	q6, d3, d8
	vmlal.u32	q7, d3, d9
	vmlal.u32	q8, d3, d20
	vmlal.u32	q9, d3, d21
	vmlal.u32	q5, d4, d6
	vmlal.u32	q6, d4, d7
	vmlal.u32	q7, d4, d8
	vmlal.u32	q8, d4, d9
	vmlal.u32	q9, d4, d20
	vadd.u64	d0, d10, d11
	vadd.u64	d1, d12, d13
	vadd.u64	d2, d14, d15
	vadd.u64	d3, d16, d17
	vadd.u64	d4, d18, d19
	vsra.u64	d1, d0, #26
	vand.u64	d0, d0, d31
	vsra.u64	d2, d1, #26
	vand.u64	d1, d1, d31
	vsra.u64	d3, d2, #26
	vand.u64	d2, d2, d31
	vsra.u64	d4, d3, #26
	vand.u64	d3, d3, d31
	vshr.u64	d5, d4, #26
	vand.u64	d4, d4, d31
	vadd.u64	d0, d0, d5
	vshl.u64	d5, d5, #2
	vadd.u64	d0, d0, d5
	vsra.u64	d1, d0, #26
	vand.u64	d0, d0, d31
L_poly1305_arm32_blocks_done_neon:
	cmp	r2, #16
	beq	L_poly1305_arm32_blocks_begin_1
	/* No single block left: pack the five limbs back into 24 bytes and
	 * store them as h. */
	add	r12, r0, #16
	vsli.u64	d0, d1, #26
	vsli.u64	d0, d2, #52
	vshr.u64	d1, d2, #12
	vsli.u64	d1, d3, #14
	vsli.u64	d1, d4, #40
	vshr.u64	d2, d4, #24
	vst1.64	{d0-d2}, [r12]
	b	L_poly1305_arm32_blocks_done
L_poly1305_arm32_blocks_begin_1:
	/* One block left: pack the limbs the same way, but move h into
	 * r7-r11 for the scalar path instead of storing it. */
	vsli.u64	d0, d1, #26
	vsli.u64	d0, d2, #52
	vshr.u64	d1, d2, #12
	vsli.u64	d1, d3, #14
	vsli.u64	d1, d4, #40
	vshr.u64	d2, d4, #24
	vmov	r7, r8, d0
	vmov	r9, r10, d1
	vmov	r11, d2[0]
L_poly1305_arm32_blocks_start_1:
	/* Scalar single block; h is in r7-r11. r12 = 1 is added into h[4]. */
	mov	r12, #1
	push	{r2}
	# Load message
	ldr	r2, [r1]
	ldr	r3, [r1, #4]
	ldr	r4, [r1, #8]
	ldr	r5, [r1, #12]
	# Add message
	adds	r7, r7, r2
	adcs	r8, r8, r3
	adcs	r9, r9, r4
	adcs	r10, r10, r5
	adc	r11, r11, r12
	push	{r0-r1}
	/* r1 = pointer to r, lr = pointer to h. */
	add	r1, r0, #0
	add	lr, r0, #16
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
	stm	lr, {r7, r8, r9, r10, r11}
#else
	# h[0]-h[2] in r7-r9 for multiplication.
	str	r10, [lr, #12]
	str	r11, [lr, #16]
#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */
	/* Both variants leave product words 0-3 in r7-r10, word 4 in r11,
	 * words 5-7 in r4-r6 and r12 = 0. */
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
	# r0 = #0, r1 = r, lr = h, r2 = h[j], r3 = r[i]
	ldr	r3, [r1]
	eor	r0, r0, r0
	# r[0] * h[0]
	# h[0] in r7
	umull	r7, r8, r3, r7
	# r[0] * h[2]
	# h[2] in r9
	umull	r9, r10, r3, r9
	# r[0] * h[4]
	# h[4] in r11
	mul	r11, r3, r11
	# r[0] * h[1]
	ldr	r2, [lr, #4]
	mov	r12, r0
	umlal	r8, r12, r3, r2
	# r[0] * h[3]
	ldr	r2, [lr, #12]
	adds	r9, r9, r12
	adc	r10, r10, r0
	umlal	r10, r11, r3, r2
	# r[1] * h[0]
	ldr	r3, [r1, #4]
	ldr	r2, [lr]
	mov	r12, r0
	umlal	r8, r12, r3, r2
	# r[1] * h[1]
	ldr	r2, [lr, #4]
	adds	r9, r9, r12
	adc	r12, r0, r0
	umlal	r9, r12, r3, r2
	# r[1] * h[2]
	ldr	r2, [lr, #8]
	adds	r10, r10, r12
	adc	r12, r0, r0
	umlal	r10, r12, r3, r2
	# r[1] * h[3]
	ldr	r2, [lr, #12]
	adds	r11, r11, r12
	adc	r4, r0, r0
	umlal	r11, r4, r3, r2
	# r[1] * h[4]
	ldr	r2, [lr, #16]
	mla	r4, r3, r2, r4
	# r[2] * h[0]
	ldr	r3, [r1, #8]
	ldr	r2, [lr]
	mov	r12, r0
	umlal	r9, r12, r3, r2
	# r[2] * h[1]
	ldr	r2, [lr, #4]
	adds	r10, r10, r12
	adc	r12, r0, r0
	umlal	r10, r12, r3, r2
	# r[2] * h[2]
	ldr	r2, [lr, #8]
	adds	r11, r11, r12
	adc	r12, r0, r0
	umlal	r11, r12, r3, r2
	# r[2] * h[3]
	ldr	r2, [lr, #12]
	adds	r4, r4, r12
	adc	r5, r0, r0
	umlal	r4, r5, r3, r2
	# r[2] * h[4]
	ldr	r2, [lr, #16]
	mla	r5, r3, r2, r5
	# r[3] * h[0]
	ldr	r3, [r1, #12]
	ldr	r2, [lr]
	mov	r12, r0
	umlal	r10, r12, r3, r2
	# r[3] * h[1]
	ldr	r2, [lr, #4]
	adds	r11, r11, r12
	adc	r12, r0, r0
	umlal	r11, r12, r3, r2
	# r[3] * h[2]
	ldr	r2, [lr, #8]
	adds	r4, r4, r12
	adc	r5, r5, r0
	umlal	r4, r5, r3, r2
	# r[3] * h[3]
	ldr	r2, [lr, #12]
	mov	r6, r0
	umlal	r5, r6, r3, r2
	# r[3] * h[4]
	ldr	r2, [lr, #16]
	mov	r12, r0
	mla	r6, r3, r2, r6
#else
	/* 12 bytes of scratch for product words 0-2. */
	sub	sp, sp, #12
	ldm	r1, {r0, r1, r2, r3}
	# r[0] * h[0]
	umull	r5, r6, r0, r7
	# r[1] * h[0]
	umull	r12, r10, r1, r7
	# r[0] * h[1]
	umaal	r6, r12, r0, r8
	# r[2] * h[0]
	umull	r11, r4, r2, r7
	# r[1] * h[1]
	umaal	r12, r11, r1, r8
	# r[0] * h[2]
	umaal	r12, r10, r0, r9
	# r[3] * h[0]
	umaal	r11, r4, r3, r7
	stm	sp, {r5, r6, r12}
	# r[2] * h[1]
	umaal	r10, r11, r2, r8
	# Replace h[0] with h[3]
	ldr	r7, [lr, #12]
	# r[1] * h[2]
	umull	r5, r6, r1, r9
	# r[2] * h[2]
	umaal	r11, r4, r2, r9
	# r[0] * h[3]
	umaal	r10, r5, r0, r7
	# r[3] * h[1]
	umaal	r11, r6, r3, r8
	# r[1] * h[3]
	umaal	r11, r5, r1, r7
	# r[3] * h[2]
	umaal	r4, r6, r3, r9
	# r[2] * h[3]
	umaal	r4, r5, r2, r7
	# Replace h[1] with h[4]
	ldr	r8, [lr, #16]
	# r[3] * h[3]
	umaal	r5, r6, r3, r7
	mov	r12, #0
	# r[0] * h[4]
	umaal	r11, r12, r0, r8
	# r[1] * h[4]
	umaal	r4, r12, r1, r8
	# r[2] * h[4]
	umaal	r5, r12, r2, r8
	# r[3] * h[4]
	umaal	r6, r12, r3, r8
	# DONE
	ldm	sp, {r7, r8, r9}
	add	sp, sp, #12
#endif /* WOLFSSL_ARM_ARCH && WOLFSSL_ARM_ARCH < 6 */
	# Reduce mod 2^130 - 5
	/* Same 4*H + H fold as in poly1305_arm32_blocks_16, with the product
	 * in r7-r11 (low) and r4-r6 (high). */
	bic	r3, r11, #0x3
	and	r11, r11, #3
	adds	r7, r7, r3
	lsr	r3, r3, #2
	adcs	r8, r8, r4
	orr	r3, r3, r4, lsl #30
	adcs	r9, r9, r5
	lsr	r4, r4, #2
	adcs	r10, r10, r6
	orr	r4, r4, r5, lsl #30
	adc	r11, r11, r12
	lsr	r5, r5, #2
	adds	r7, r7, r3
	orr	r5, r5, r6, lsl #30
	adcs	r8, r8, r4
	lsr	r6, r6, #2
	adcs	r9, r9, r5
	adcs	r10, r10, r6
	adc	r11, r11, r12
	pop	{r0-r1}
	pop	{r2}
	/* Store the updated h as five 32-bit words. */
	add	r12, r0, #16
	stm	r12, {r7, r8, r9, r10, r11}
L_poly1305_arm32_blocks_done:
	vpop	{d8-d15}
	pop	{r4, r5, r6, r7, r8, r9, r10, r11, pc}
	.size	poly1305_arm32_blocks,.-poly1305_arm32_blocks

/* Clamp mask applied to the four words of r in poly1305_set_key. */
#ifndef __APPLE__
	.text
	.type	L_poly1305_arm32_clamp, %object
	.size	L_poly1305_arm32_clamp, 16
#else
	.section	__DATA,__data
#endif /* __APPLE__ */
	# 8-byte aligned, 64-bit aligned
#ifndef __APPLE__
	.align	3
#else
	.p2align	3
#endif /* __APPLE__ */
L_poly1305_arm32_clamp:
	.long	0x0fffffff,0x0ffffffc,0x0ffffffc,0x0ffffffc

/* poly1305_set_key (build with NEON)
 *
 * Initialises the context from a 32-byte key and precomputes the powers of
 * r used by poly1305_arm32_blocks.
 *
 * In:    r0 = context pointer
 *        r1 = key pointer; words at 0-15 are clamped and stored as r,
 *             words at 16-31 are stored unchanged as the pad.
 * Out:   context: r at +0, six zero words at +16 (h), pad at +40, zero
 *        word at +56, {r^2, r} limb pairs at +0x7c and {r^4, r^3} limb
 *        pairs at +0xa4.
 * Saves and restores r4-r11 and d8-d15.
 */
	.text
	.align	4
	.globl	poly1305_set_key
	.type	poly1305_set_key, %function
poly1305_set_key:
	push	{r4, r5, r6, r7, r8, r9, r10, r11, lr}
	vpush	{d8-d15}
	# Load mask.
	adr	lr, L_poly1305_arm32_clamp
	ldm	lr, {r6, r7, r8, r9}
	# Load and cache padding.
	ldr	r2, [r1, #16]
	ldr	r3, [r1, #20]
	ldr	r4, [r1, #24]
	ldr	r5, [r1, #28]
	add	lr, r0, #40
	stm	lr, {r2, r3, r4, r5}
	# Load, mask and store r.
	ldr	r2, [r1]
	ldr	r3, [r1, #4]
	ldr	r4, [r1, #8]
	ldr	r5, [r1, #12]
	and	r2, r2, r6
	and	r3, r3, r7
	and	r4, r4, r8
	and	r5, r5, r9
	add	lr, r0, #0
	stm	lr, {r2, r3, r4, r5}
	/* q10 = 0x3ffffff in each 64-bit lane (26-bit limb mask). */
	vmov.i16	q10, #0xffff
	vshr.u64	q10, q10, #38
	/* Split r (r2-r5) into five 26-bit limbs r7-r11 and place them in
	 * the odd S registers, i.e. lane 1 of d0-d4. */
	lsr	r8, r2, #26
	lsr	r9, r3, #20
	lsr	r10, r4, #14
	lsr	r11, r5, #8
	eor	r8, r8, r3, lsl #6
	eor	r9, r9, r4, lsl #12
	eor	r10, r10, r5, lsl #18
	and	r7, r2, #0x3ffffff
	and	r8, r8, #0x3ffffff
	and	r9, r9, #0x3ffffff
	and	r10, r10, #0x3ffffff
	vmov.i32	s1, r7
	vmov.i32	s3, r8
	vmov.i32	s5, r9
	vmov.i32	s7, r10
	vmov.i32	s9, r11
	/* r0 and r1 are used as product registers below. */
	push	{r0-r1}
	/* Both variants leave the 256-bit square of r in
	 * r0, r1, r6, r7, r8, r9, r10, r11 (low to high) with r12 = 0. */
#if defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6)
	# Square r
	umull	r1, r6, r2, r3
	mov	r12, #0
	umull	r7, r8, r2, r5
	mov	lr, r12
	umlal	r6, lr, r2, r4
	adds	r7, r7, lr
	adc	lr, r12, r12
	umlal	r7, lr, r3, r4
	mov	r9, r12
	umlal	lr, r9, r3, r5
	adds	r8, r8, lr
	adcs	r9, r9, r12
	adc	r10, r12, r12
	umlal	r9, r10, r4, r5
	adds	r1, r1, r1
	adcs	r6, r6, r6
	adcs	r7, r7, r7
	adcs	r8, r8, r8
	adcs	r9, r9, r9
	adcs	r10, r10, r10
	adc	r11, r12, r12
	umull	r0, lr, r2, r2
	adds	r1, r1, lr
	adcs	r6, r6, r12
	adc	lr, r12, r12
	umlal	r6, lr, r3, r3
	adds	r7, r7, lr
	adcs	r8, r8, r12
	adc	lr, r12, r12
	umlal	r8, lr, r4, r4
	adds	r9, r9, lr
	adcs	r10, r10, r12
	adc	r11, r11, r12
	umlal	r10, r11, r5, r5
#else
	umull	r0, r1, r2, r2
	umull	r6, r7, r2, r3
	adds	r6, r6, r6
	mov	r12, #0
	umaal	r1, r6, r12, r12
	mov	r8, r12
	umaal	r8, r7, r2, r4
	adcs	r8, r8, r8
	umaal	r6, r8, r3, r3
	umull	r9, r10, r2, r5
	umaal	r7, r9, r3, r4
	adcs	r7, r7, r7
	umaal	r7, r8, r12, r12
	umaal	r10, r9, r3, r5
	adcs	r10, r10, r10
	umaal	r8, r10, r4, r4
	mov	r11, r12
	umaal	r9, r11, r4, r5
	adcs	r9, r9, r9
	umaal	r9, r10, r12, r12
	adcs	r11, r11, r11
	umaal	r10, r11, r5, r5
	adc	r11, r11, r12
#endif /* defined(WOLFSSL_ARM_ARCH) && (WOLFSSL_ARM_ARCH < 6) */
	# Reduce mod 2^130 - 5
	/* Same 4*H + H fold as in the block functions; r^2 ends up in
	 * r0, r1, r6, r7, r8. */
	bic	r2, r8, #0x3
	and	r8, r8, #3
	adds	r0, r0, r2
	lsr	r2, r2, #2
	adcs	r1, r1, r9
	orr	r2, r2, r9, lsl #30
	adcs	r6, r6, r10
	lsr	r9, r9, #2
	adcs	r7, r7, r11
	orr	r9, r9, r10, lsl #30
	adc	r8, r8, r12
	lsr	r10, r10, #2
	adds	r0, r0, r2
	orr	r10, r10, r11, lsl #30
	adcs	r1, r1, r9
	lsr	r11, r11, #2
	adcs	r6, r6, r10
	adcs	r7, r7, r11
	adc	r8, r8, r12
	/* Split r^2 into 26-bit limbs (r2, r3, r4, r5, r10) and place them
	 * in the even S registers, i.e. lane 0 of d0-d4. */
	lsr	r3, r0, #26
	lsr	r4, r1, #20
	lsr	r5, r6, #14
	lsr	r10, r7, #8
	eor	r3, r3, r1, lsl #6
	eor	r4, r4, r6, lsl #12
	eor	r5, r5, r7, lsl #18
	eor	r10, r10, r8, lsl #24
	and	r2, r0, #0x3ffffff
	and	r3, r3, #0x3ffffff
	and	r4, r4, #0x3ffffff
	and	r5, r5, #0x3ffffff
	vmov.i32	s0, r2
	vmov.i32	s2, r3
	vmov.i32	s4, r4
	vmov.i32	s6, r5
	vmov.i32	s8, r10
	pop	{r0-r1}
	/* Store the {r^2, r} limb pairs. */
	add	lr, r0, #0x7c
	vstm.32	lr, {d0-d4}
	# Multiply r^2, r by r^2
	/* d6-d9 = 5 * limbs 1-4; the scalar operand of every multiply is
	 * lane 0, which holds the r^2 limb. */
	vshl.u32	d6, d1, #2
	vshl.u32	d7, d2, #2
	vshl.u32	d8, d3, #2
	vshl.u32	d9, d4, #2
	vadd.u32	d6, d6, d1
	vadd.u32	d7, d7, d2
	vadd.u32	d8, d8, d3
	vadd.u32	d9, d9, d4
	vmull.u32	q5, d0, d0[0]
	vmull.u32	q6, d0, d1[0]
	vmull.u32	q7, d0, d2[0]
	vmull.u32	q8, d0, d3[0]
	vmull.u32	q9, d0, d4[0]
	vmlal.u32	q5, d1, d9[0]
	vmlal.u32	q6, d1, d0[0]
	vmlal.u32	q7, d1, d1[0]
	vmlal.u32	q8, d1, d2[0]
	vmlal.u32	q9, d1, d3[0]
	vmlal.u32	q5, d2, d8[0]
	vmlal.u32	q6, d2, d9[0]
	vmlal.u32	q7, d2, d0[0]
	vmlal.u32	q8, d2, d1[0]
	vmlal.u32	q9, d2, d2[0]
	vmlal.u32	q5, d3, d7[0]
	vmlal.u32	q6, d3, d8[0]
	vmlal.u32	q7, d3, d9[0]
	vmlal.u32	q8, d3, d0[0]
	vmlal.u32	q9, d3, d1[0]
	vmlal.u32	q5, d4, d6[0]
	vmlal.u32	q6, d4, d7[0]
	vmlal.u32	q7, d4, d8[0]
	vmlal.u32	q8, d4, d9[0]
	vmlal.u32	q9, d4, d0[0]
	/* Carry propagation in 26-bit steps on both lanes; the carry out of
	 * limb 4 is multiplied by 5 and added to limb 0. */
	vsra.u64	q6, q5, #26
	vand.u64	q5, q5, q10
	vsra.u64	q7, q6, #26
	vand.u64	q6, q6, q10
	vsra.u64	q8, q7, #26
	vand.u64	q7, q7, q10
	vsra.u64	q9, q8, #26
	vand.u64	q8, q8, q10
	vshr.u64	q3, q9, #26
	vand.u64	q9, q9, q10
	vadd.u64	q5, q5, q3
	vshl.u64	q3, q3, #2
	vadd.u64	q5, q5, q3
	vsra.u64	q6, q5, #26
	vand.u64	q5, q5, q10
	/* Narrow every 64-bit limb to 32 bits and store the {r^4, r^3}
	 * limb pairs. */
	vmovn.i64	d10, q5
	vmovn.i64	d11, q6
	vmovn.i64	d12, q7
	vmovn.i64	d13, q8
	vmovn.i64	d14, q9
	add	lr, r0, #0xa4
	vstm.32	lr, {d10-d14}
	# h (accumulator) = 0
	eor	r6, r6, r6
	eor	r7, r7, r7
	eor	r8, r8, r8
	eor	r9, r9, r9
	add	lr, r0, #16
	eor	r4, r4, r4
	eor	r5, r5, r5
	stm	lr, {r4, r5, r6, r7, r8, r9}
	# Zero leftover
	str	r5, [r0, #56]
	vpop	{d8-d15}
	pop	{r4, r5, r6, r7, r8, r9, r10, r11, pc}
	.size	poly1305_set_key,.-poly1305_set_key

/* poly1305_final (build with NEON)
 *
 * Produces the 16-byte MAC: brings h into the range below p = 2^130 - 5,
 * adds the pad modulo 2^128, stores the result and wipes r, h and the pad
 * in the context. The precomputed entries at 0x7c and 0xa4 are not
 * touched by this function.
 *
 * In:    r0 = context pointer
 *        r1 = output pointer, receives four 32-bit words
 * Saves and restores r4-r9; r2, r3, r12 and flags are clobbered.
 */
	.text
	.align	4
	.globl	poly1305_final
	.type	poly1305_final, %function
poly1305_final:
	push	{r4, r5, r6, r7, r8, r9, lr}
	add	r9, r0, #16
	ldm	r9, {r4, r5, r6, r7, r8}
	# Add 5 and check for h larger than p.
	/* Only the carry out of h + 5 matters for the first four words, so
	 * r2 is overwritten each time. After the chain r2 = top word of
	 * h + 5; it is >= 4 exactly when h + 5 >= 2^130. The sub/lsr/sub/and
	 * sequence turns that into 5 (h + 5 >= 2^130) or 0 without a branch. */
	adds	r2, r4, #5
	adcs	r2, r5, #0
	adcs	r2, r6, #0
	adcs	r2, r7, #0
	adc	r2, r8, #0
	sub	r2, r2, #4
	lsr	r2, r2, #31
	sub	r2, r2, #1
	and	r2, r2, #5
	# Add 0/5 to h.
	/* Only the low 128 bits are kept from here on; adding 5 and dropping
	 * the bits from 2^128 up equals subtracting p modulo 2^128. */
	adds	r4, r4, r2
	adcs	r5, r5, #0
	adcs	r6, r6, #0
	adc	r7, r7, #0
	# Add padding
	add	r9, r0, #40
	ldm	r9, {r2, r3, r12, lr}
	adds	r4, r4, r2
	adcs	r5, r5, r3
	adcs	r6, r6, r12
	adc	r7, r7, lr
	# Store MAC
	str	r4, [r1]
	str	r5, [r1, #4]
	str	r6, [r1, #8]
	str	r7, [r1, #12]
	# Zero out h.
	eor	r4, r4, r4
	eor	r5, r5, r5
	eor	r6, r6, r6
	eor	r7, r7, r7
	eor	r8, r8, r8
	add	r9, r0, #16
	stm	r9, {r4, r5, r6, r7, r8}
	# Zero out r.
	add	r9, r0, #0
	stm	r9, {r4, r5, r6, r7}
	# Zero out padding.
	add	r9, r0, #40
	stm	r9, {r4, r5, r6, r7}
	pop	{r4, r5, r6, r7, r8, r9, pc}
	.size	poly1305_final,.-poly1305_final
#endif /* WOLFSSL_ARMASM_NO_NEON */
#endif /* HAVE_POLY1305 */
#endif /* !__aarch64__ && !WOLFSSL_ARMASM_THUMB2 */
#endif /* WOLFSSL_ARMASM */

/* Mark the stack as non-executable on Linux ELF targets. */
#if defined(__linux__) && defined(__ELF__)
.section	.note.GNU-stack,"",%progbits
#endif
#endif /* !WOLFSSL_ARMASM_INLINE */