/* armv8-poly1305-asm
 *
 * Copyright (C) 2006-2026 wolfSSL Inc.
 *
 * This file is part of wolfSSL.
 *
 * wolfSSL is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * wolfSSL is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
 */

/* NOTE(review): the operand of this include was missing in the reviewed copy
 * and has been reconstructed - confirm the header name. */
#include <wolfssl/wolfcrypt/libwolfssl_sources_asm.h>

/* Generated using (from wolfssl):
 *   cd ../scripts
 *   ruby ./poly1305/poly1305.rb arm64 \
 *       ../wolfssl/wolfcrypt/src/port/arm/armv8-poly1305-asm.S
 *
 * This file is generated output: a lasting change also has to be made in
 * the generator script named above.
 */

/* Syntax: GNU as for AArch64, run through the C preprocessor.
 *
 * Byte offsets into the Poly1305 state (x0) as used by the code below:
 *     0.. 15  r, clamped, as two 64-bit words
 *    16.. 79  limbs 0-3 (26 bits each, one 32-bit word per limb) of
 *             r^4 (16), r^3 (32), r^2 (48) and r (64)
 *    80.. 95  limb 4 of r^4 (80), r^3 (84), r^2 (88) and r (92)
 *    96..115  accumulator h as five 26-bit limbs in 32-bit words
 *   120..135  pad (second half of the key)
 *   136..143  leftover
 *   160       finished flag
 */
#ifdef WOLFSSL_ARMASM
#ifdef __aarch64__
#ifndef WOLFSSL_ARMASM_INLINE

/* poly1305_arm64_block_16
 *
 * Processes one 16-byte block: h = ((h + m) * r) mod (2^130 - 5).
 *
 * In:    x0 = pointer to the Poly1305 state
 *        x1 = pointer to 16 bytes of message
 * Out:   h in the state is updated
 * Uses:  x2-x15 and the flags as scratch; no stack, no vector registers.
 */
#ifndef __APPLE__
	.text
	.globl	poly1305_arm64_block_16
	.type	poly1305_arm64_block_16,@function
	.align	2
poly1305_arm64_block_16:
#else
	.section	__TEXT,__text
	.globl	_poly1305_arm64_block_16
	.p2align	2
_poly1305_arm64_block_16:
#endif /* __APPLE__ */
	# Load h
	ldp	w2, w3, [x0, #96]
	ldp	w4, w11, [x0, #104]
	ldr	w12, [x0, #112]
	# Load m
	ldr	x14, [x1]
	ldr	x15, [x1, #8]
	# Load r
	ldp	x5, x6, [x0]
	# h: Base26 -> Base 64
	add	x2, x2, x3, lsl 26
	lsr	x3, x4, #12
	add	x2, x2, x4, lsl 52
	add	x3, x3, x11, lsl 14
	lsr	x4, x12, #24
	add	x3, x3, x12, lsl 40
	# Add m to h. The addend for the top word is xzr, so only the carry
	# reaches bit 128 here (poly1305_arm64_blocks adds 1 there via x17).
	# NOTE(review): presumably the block passed in is already padded by
	# the caller - confirm against the C code.
	adds	x2, x2, x14
	adcs	x3, x3, x15
	adc	x4, x4, xzr
	# Multiply h by r
	# b[0] * a[0]
	mul	x7, x5, x2
	umulh	x8, x5, x2
	# b[0] * a[1]
	mul	x10, x5, x3
	umulh	x9, x5, x3
	# b[1] * a[0]
	mul	x11, x6, x2
	umulh	x12, x6, x2
	adds	x8, x8, x10
	# b[1] * a[1]
	mul	x13, x6, x3
	umulh	x10, x6, x3
	adc	x9, x9, x12
	adds	x8, x8, x11
	# b[0] * a[2]
	mul	x11, x5, x4
	adcs	x9, x9, x13
	# b[1] * a[2]
	mul	x12, x6, x4
	adc	x10, x10, xzr
	adds	x9, x9, x11
	adc	x10, x10, x12
	# Reduce mod 2^130 - 5
	# Product is x10:x9:x8:x7. With H = bits 130 and up, the result is
	# (low 130 bits) + 4*H + H.
	# Get high bits
	and	x11, x9, #-4
	# Get top two bits
	and	x9, x9, #3
	# Add top bits * 4
	adds	x2, x7, x11
	# Move down 2 bits
	extr	x11, x10, x11, #2
	adcs	x3, x8, x10
	lsr	x10, x10, #2
	adc	x4, x9, xzr
	# Add top bits.
	adds	x2, x2, x11
	adcs	x3, x3, x10
	adc	x4, x4, xzr
	# h: Base 64 -> Base 26
	extr	x12, x4, x3, #40
	ubfx	x4, x2, #52, #12
	ubfx	x11, x3, #14, #26
	bfi	x4, x3, #12, #14
	ubfx	x3, x2, #26, #26
	ubfx	x2, x2, #0, #26
	# Store h
	stp	w2, w3, [x0, #96]
	stp	w4, w11, [x0, #104]
	str	w12, [x0, #112]
	ret
#ifndef __APPLE__
	.size	poly1305_arm64_block_16,.-poly1305_arm64_block_16
#endif /* __APPLE__ */

/* poly1305_arm64_blocks
 *
 * Absorbs message blocks into h. Every block gets the 2^128 bit set.
 *
 * In:    x0 = pointer to the Poly1305 state
 *        x1 = pointer to the message
 *        x2 = number of message bytes
 * Out:   h in the state is updated; x1 and x2 are advanced/consumed.
 *
 * Paths:
 *   - x2 >= 64: NEON loop, four blocks per iteration using r^4..r.
 *   - then more than 16 bytes left: one NEON pass over two blocks using
 *     r^2 and r.
 *   - 16 bytes left (or fewer than 64 on entry): scalar loop, one block
 *     per iteration.
 *   Fewer than 16 bytes on entry returns without touching the state.
 *   NOTE(review): the code steps in 16-byte units and never checks for a
 *   partial block; presumably x2 is always a multiple of 16 - confirm
 *   against the caller.
 *
 * Stack: 96-byte frame holding x29/x30, x17 and d8-d15.
 * Uses:  x3-x17, v0-v27 and the flags as scratch.
 */
#ifndef __APPLE__
	.text
	.globl	poly1305_arm64_blocks
	.type	poly1305_arm64_blocks,@function
	.align	2
poly1305_arm64_blocks:
#else
	.section	__TEXT,__text
	.globl	_poly1305_arm64_blocks
	.p2align	2
_poly1305_arm64_blocks:
#endif /* __APPLE__ */
	stp	x29, x30, [sp, #-96]!
	add	x29, sp, #0
	str	x17, [x29, #24]
	# v8-v15 are written below, so their low halves are saved here.
	stp	d8, d9, [x29, #32]
	stp	d10, d11, [x29, #48]
	stp	d12, d13, [x29, #64]
	stp	d14, d15, [x29, #80]
	cmp	x2, #0x40
	blt	L_poly1305_arm64_blocks_done
	# Set mask (0x3ffffff), hi bit and 5 into vector registers
	movi	v25.16b, #0xff
	movi	v27.4s, #1, lsl 24
	ushr	v25.4s, v25.4s, #6
	movi	v24.4s, #5
	uxtl	v26.2d, v25.2s
	# Load the powers of r: limb i of {r^4, r^3, r^2, r} goes to lanes
	# 0..3 of v15+i (v15-v18 for limbs 0-3, v19 for limb 4).
	add	x14, x0, #16
	ld4	{v15.4s, v16.4s, v17.4s, v18.4s}, [x14], #0x40
	ld1	{v19.4s}, [x14]
	# Load h into lane 0 of v0-v4, all other lanes zero.
	add	x14, x0, #0x60
	movi	v0.4s, #0
	movi	v1.4s, #0
	movi	v2.4s, #0
	movi	v3.4s, #0
	movi	v4.4s, #0
	ld4	{v0.s, v1.s, v2.s, v3.s}[0], [x14], #16
	ld1	{v4.s}[0], [x14]
	# Limbs 1-4 of each power times 5, used for the wrap-around terms.
	mul	v20.4s, v16.4s, v24.4s
	mul	v21.4s, v17.4s, v24.4s
	mul	v22.4s, v18.4s, v24.4s
	mul	v23.4s, v19.4s, v24.4s
L_poly1305_arm64_blocks_loop_64:
	# Load message of 64 bytes - setting hi bit for not finished
	ld4	{v5.4s, v6.4s, v7.4s, v8.4s}, [x1], #0x40
	sub	x2, x2, #0x40
	# Split the four 32-bit words of each block into five 26-bit limbs
	# (v5-v9), one block per lane.
	ushr	v9.4s, v8.4s, #8
	shl	v8.4s, v8.4s, #18
	orr	v9.16b, v9.16b, v27.16b
	sri	v8.4s, v7.4s, #14
	shl	v7.4s, v7.4s, #12
	and	v8.16b, v8.16b, v25.16b
	sri	v7.4s, v6.4s, #20
	shl	v6.4s, v6.4s, #6
	and	v7.16b, v7.16b, v25.16b
	sri	v6.4s, v5.4s, #26
	and	v5.16b, v5.16b, v25.16b
	and	v6.16b, v6.16b, v25.16b
	# Lanes 2 and 3 (third and fourth block) times r^2 and r.
	# v10-v14 accumulate result limbs 0-4.
	umull2	v10.2d, v5.4s, v15.4s
	umull2	v11.2d, v5.4s, v16.4s
	umull2	v12.2d, v5.4s, v17.4s
	umull2	v13.2d, v5.4s, v18.4s
	umull2	v14.2d, v5.4s, v19.4s
	umlal2	v10.2d, v6.4s, v23.4s
	umlal2	v11.2d, v6.4s, v15.4s
	umlal2	v12.2d, v6.4s, v16.4s
	umlal2	v13.2d, v6.4s, v17.4s
	umlal2	v14.2d, v6.4s, v18.4s
	umlal2	v10.2d, v7.4s, v22.4s
	umlal2	v11.2d, v7.4s, v23.4s
	umlal2	v12.2d, v7.4s, v15.4s
	umlal2	v13.2d, v7.4s, v16.4s
	umlal2	v14.2d, v7.4s, v17.4s
	umlal2	v10.2d, v8.4s, v21.4s
	umlal2	v11.2d, v8.4s, v22.4s
	umlal2	v12.2d, v8.4s, v23.4s
	umlal2	v13.2d, v8.4s, v15.4s
	umlal2	v14.2d, v8.4s, v16.4s
	umlal2	v10.2d, v9.4s, v20.4s
	umlal2	v11.2d, v9.4s, v21.4s
	umlal2	v12.2d, v9.4s, v22.4s
	umlal2	v13.2d, v9.4s, v23.4s
	umlal2	v14.2d, v9.4s, v15.4s
	# Add the accumulator h (v0-v4) to the message limbs.
	add	v5.4s, v5.4s, v0.4s
	add	v6.4s, v6.4s, v1.4s
	add	v7.4s, v7.4s, v2.4s
	add	v8.4s, v8.4s, v3.4s
	add	v9.4s, v9.4s, v4.4s
	# Lanes 0 and 1 (h + first block, second block) times r^4 and r^3.
	umlal	v10.2d, v5.2s, v15.2s
	umlal	v11.2d, v5.2s, v16.2s
	umlal	v12.2d, v5.2s, v17.2s
	umlal	v13.2d, v5.2s, v18.2s
	umlal	v14.2d, v5.2s, v19.2s
	umlal	v10.2d, v6.2s, v23.2s
	umlal	v11.2d, v6.2s, v15.2s
	umlal	v12.2d, v6.2s, v16.2s
	umlal	v13.2d, v6.2s, v17.2s
	umlal	v14.2d, v6.2s, v18.2s
	umlal	v10.2d, v7.2s, v22.2s
	umlal	v11.2d, v7.2s, v23.2s
	umlal	v12.2d, v7.2s, v15.2s
	umlal	v13.2d, v7.2s, v16.2s
	umlal	v14.2d, v7.2s, v17.2s
	umlal	v10.2d, v8.2s, v21.2s
	umlal	v11.2d, v8.2s, v22.2s
	umlal	v12.2d, v8.2s, v23.2s
	umlal	v13.2d, v8.2s, v15.2s
	umlal	v14.2d, v8.2s, v16.2s
	umlal	v10.2d, v9.2s, v20.2s
	umlal	v11.2d, v9.2s, v21.2s
	umlal	v12.2d, v9.2s, v22.2s
	umlal	v13.2d, v9.2s, v23.2s
	umlal	v14.2d, v9.2s, v15.2s
	# Sum the two 64-bit lanes of each result limb.
	addp	d10, v10.2d
	addp	d11, v11.2d
	addp	d12, v12.2d
	addp	d13, v13.2d
	addp	d14, v14.2d
	# Redistribute and handle overflow
	# Carries move up limb by limb; the carry out of limb 4 is
	# multiplied by 5 (shift left 2, plus itself) and added to limb 0.
	usra	v11.2d, v10.2d, #26
	and	v10.16b, v10.16b, v26.16b
	usra	v14.2d, v13.2d, #26
	and	v3.16b, v13.16b, v26.16b
	ushr	v2.2d, v14.2d, #26
	usra	v12.2d, v11.2d, #26
	shl	v0.2d, v2.2d, #2
	and	v1.16b, v11.16b, v26.16b
	add	v0.2d, v0.2d, v2.2d
	and	v4.16b, v14.16b, v26.16b
	add	v10.2d, v10.2d, v0.2d
	usra	v3.2d, v12.2d, #26
	and	v2.16b, v12.16b, v26.16b
	usra	v1.2d, v10.2d, #26
	and	v0.16b, v10.16b, v26.16b
	usra	v4.2d, v3.2d, #26
	and	v3.16b, v3.16b, v26.16b
	cmp	x2, #0x40
	bge	L_poly1305_arm64_blocks_loop_64
	cmp	x2, #16
	ble	L_poly1305_arm64_blocks_done_32
	# Start 32
	ld4	{v5.2s, v6.2s, v7.2s, v8.2s}, [x1], #32
	sub	x2, x2, #32
	# Move r^2 and r (and their multiples of 5) down into lanes 0 and 1.
	mov	v15.d[0], v15.d[1]
	mov	v16.d[0], v16.d[1]
	mov	v17.d[0], v17.d[1]
	mov	v18.d[0], v18.d[1]
	mov	v19.d[0], v19.d[1]
	mov	v20.d[0], v20.d[1]
	mov	v21.d[0], v21.d[1]
	mov	v22.d[0], v22.d[1]
	mov	v23.d[0], v23.d[1]
	# Split the two blocks into 26-bit limbs, hi bit set.
	ushr	v9.2s, v8.2s, #8
	shl	v8.2s, v8.2s, #18
	orr	v9.8b, v9.8b, v27.8b
	sri	v8.2s, v7.2s, #14
	shl	v7.2s, v7.2s, #12
	and	v8.8b, v8.8b, v25.8b
	sri	v7.2s, v6.2s, #20
	shl	v6.2s, v6.2s, #6
	and	v7.8b, v7.8b, v25.8b
	sri	v6.2s, v5.2s, #26
	and	v5.8b, v5.8b, v25.8b
	and	v6.8b, v6.8b, v25.8b
	# Add the accumulator h (v0-v4) to the message limbs.
	add	v5.2s, v5.2s, v0.2s
	add	v6.2s, v6.2s, v1.2s
	add	v7.2s, v7.2s, v2.2s
	add	v8.2s, v8.2s, v3.2s
	add	v9.2s, v9.2s, v4.2s
	# (h + first block) * r^2 and second block * r.
	umull	v10.2d, v5.2s, v15.2s
	umull	v11.2d, v5.2s, v16.2s
	umull	v12.2d, v5.2s, v17.2s
	umull	v13.2d, v5.2s, v18.2s
	umull	v14.2d, v5.2s, v19.2s
	umlal	v10.2d, v6.2s, v23.2s
	umlal	v11.2d, v6.2s, v15.2s
	umlal	v12.2d, v6.2s, v16.2s
	umlal	v13.2d, v6.2s, v17.2s
	umlal	v14.2d, v6.2s, v18.2s
	umlal	v10.2d, v7.2s, v22.2s
	umlal	v11.2d, v7.2s, v23.2s
	umlal	v12.2d, v7.2s, v15.2s
	umlal	v13.2d, v7.2s, v16.2s
	umlal	v14.2d, v7.2s, v17.2s
	umlal	v10.2d, v8.2s, v21.2s
	umlal	v11.2d, v8.2s, v22.2s
	umlal	v12.2d, v8.2s, v23.2s
	umlal	v13.2d, v8.2s, v15.2s
	umlal	v14.2d, v8.2s, v16.2s
	umlal	v10.2d, v9.2s, v20.2s
	umlal	v11.2d, v9.2s, v21.2s
	umlal	v12.2d, v9.2s, v22.2s
	umlal	v13.2d, v9.2s, v23.2s
	umlal	v14.2d, v9.2s, v15.2s
	# Sum the two 64-bit lanes of each result limb.
	addp	d10, v10.2d
	addp	d11, v11.2d
	addp	d12, v12.2d
	addp	d13, v13.2d
	addp	d14, v14.2d
	# Redistribute and handle overflow
	usra	v11.2d, v10.2d, #26
	and	v10.16b, v10.16b, v26.16b
	usra	v14.2d, v13.2d, #26
	and	v3.16b, v13.16b, v26.16b
	ushr	v2.2d, v14.2d, #26
	usra	v12.2d, v11.2d, #26
	shl	v0.2d, v2.2d, #2
	and	v1.16b, v11.16b, v26.16b
	add	v0.2d, v0.2d, v2.2d
	and	v4.16b, v14.16b, v26.16b
	add	v10.2d, v10.2d, v0.2d
	usra	v3.2d, v12.2d, #26
	and	v2.16b, v12.16b, v26.16b
	usra	v1.2d, v10.2d, #26
	and	v0.16b, v10.16b, v26.16b
	usra	v4.2d, v3.2d, #26
	and	v3.16b, v3.16b, v26.16b
L_poly1305_arm64_blocks_done_32:
	# One block left: hand h to the scalar loop in registers.
	# Otherwise store h and finish.
	cmp	x2, #16
	beq	L_poly1305_arm64_blocks_transfer
	add	x14, x0, #0x60
	st4	{v0.s, v1.s, v2.s, v3.s}[0], [x14], #16
	st1	{v4.s}[0], [x14]
	b	L_poly1305_arm64_blocks_done_all
L_poly1305_arm64_blocks_transfer:
	mov	w3, v0.s[0]
	mov	w4, v1.s[0]
	mov	w5, v2.s[0]
	mov	w6, v3.s[0]
	mov	w7, v4.s[0]
	b	L_poly1305_arm64_blocks_start
L_poly1305_arm64_blocks_done:
	# Fewer than 64 bytes on entry: scalar loop only, or nothing to do.
	cmp	x2, #16
	blt	L_poly1305_arm64_blocks_done_all
	# Load h
	ldp	w3, w4, [x0, #96]
	ldp	w5, w6, [x0, #104]
	ldr	w7, [x0, #112]
L_poly1305_arm64_blocks_start:
	# x17 = 1 is the 2^128 bit added to every block in the loop below.
	mov	x17, #1
	# Load r
	ldp	x8, x9, [x0]
	# Base26 -> Base 64
	add	x3, x3, x4, lsl 26
	lsr	x4, x5, #12
	add	x3, x3, x5, lsl 52
	add	x4, x4, x6, lsl 14
	lsr	x5, x7, #24
	add	x4, x4, x7, lsl 40
L_poly1305_arm64_blocks_loop:
	# Load m
	ldr	x14, [x1]
	ldr	x15, [x1, #8]
	# Add m and !finished at bit 128
	adds	x3, x3, x14
	adcs	x4, x4, x15
	adc	x5, x5, x17
	# Multiply h by r
	# b[0] * a[0]
	mul	x10, x8, x3
	umulh	x11, x8, x3
	# b[0] * a[1]
	mul	x13, x8, x4
	umulh	x12, x8, x4
	# b[1] * a[0]
	mul	x14, x9, x3
	umulh	x15, x9, x3
	adds	x11, x11, x13
	# b[1] * a[1]
	mul	x16, x9, x4
	umulh	x13, x9, x4
	adc	x12, x12, x15
	adds	x11, x11, x14
	# b[0] * a[2]
	mul	x14, x8, x5
	adcs	x12, x12, x16
	# b[1] * a[2]
	mul	x15, x9, x5
	adc	x13, x13, xzr
	adds	x12, x12, x14
	adc	x13, x13, x15
	# Reduce mod 2^130 - 5
	# Get high bits
	and	x14, x12, #-4
	# Get top two bits
	and	x12, x12, #3
	# Add top bits * 4
	adds	x3, x10, x14
	# Move down 2 bits
	extr	x14, x13, x14, #2
	adcs	x4, x11, x13
	lsr	x13, x13, #2
	adc	x5, x12, xzr
	# Add top bits.
	adds	x3, x3, x14
	adcs	x4, x4, x13
	adc	x5, x5, xzr
	# Sub 16 from length.
	subs	x2, x2, #16
	add	x1, x1, #16
	# Loop again if more message to do.
	bgt	L_poly1305_arm64_blocks_loop
	# h: Base 64 -> Base 26
	extr	x7, x5, x4, #40
	ubfx	x5, x3, #52, #12
	ubfx	x6, x4, #14, #26
	bfi	x5, x4, #12, #14
	ubfx	x4, x3, #26, #26
	ubfx	x3, x3, #0, #26
	# Store h
	stp	w3, w4, [x0, #96]
	stp	w5, w6, [x0, #104]
	str	w7, [x0, #112]
L_poly1305_arm64_blocks_done_all:
	ldr	x17, [x29, #24]
	ldp	d8, d9, [x29, #32]
	ldp	d10, d11, [x29, #48]
	ldp	d12, d13, [x29, #64]
	ldp	d14, d15, [x29, #80]
	ldp	x29, x30, [sp], #0x60
	ret
#ifndef __APPLE__
	.size	poly1305_arm64_blocks,.-poly1305_arm64_blocks
#endif /* __APPLE__ */

/* Clamp mask applied to the first 16 key bytes by poly1305_set_key:
 * r &= 0x0ffffffc0ffffffc0ffffffc0fffffff, stored as four 32-bit words. */
#ifndef __APPLE__
	.text
	.section	.rodata
	.type	L_poly1305_set_key_arm64_clamp, %object
	.size	L_poly1305_set_key_arm64_clamp, 16
#else
	.section	__DATA,__data
#endif /* __APPLE__ */
	# 8-byte aligned, 64-bit aligned
#ifndef __APPLE__
	.align	3
#else
	.p2align	3
#endif /* __APPLE__ */
L_poly1305_set_key_arm64_clamp:
	.long	0x0fffffff,0x0ffffffc,0x0ffffffc,0x0ffffffc

/* poly1305_set_key
 *
 * Initialises the state from a key.
 *
 * In:    x0 = pointer to the Poly1305 state
 *        x1 = pointer to the key; bytes 0..15 become r (after clamping),
 *             bytes 16..31 are stored as the pad
 * Out:   state holds r as 64-bit words, r, r^2, r^3 and r^4 as 26-bit
 *        limbs, the pad, and zeroed h, leftover and finished.
 *
 * Stack: 32-byte frame holding x29/x30 and x17.
 * Uses:  x2-x17 and the flags as scratch.
 */
#ifndef __APPLE__
	.text
	.globl	poly1305_set_key
	.type	poly1305_set_key,@function
	.align	2
poly1305_set_key:
#else
	.section	__TEXT,__text
	.globl	_poly1305_set_key
	.p2align	2
_poly1305_set_key:
#endif /* __APPLE__ */
	stp	x29, x30, [sp, #-32]!
	add	x29, sp, #0
	str	x17, [x29, #24]
#ifndef __APPLE__
	adrp	x2, L_poly1305_set_key_arm64_clamp
	add	x2, x2, :lo12:L_poly1305_set_key_arm64_clamp
#else
	adrp	x2, L_poly1305_set_key_arm64_clamp@PAGE
	add	x2, x2, L_poly1305_set_key_arm64_clamp@PAGEOFF
#endif /* __APPLE__ */
	# Load key and pad.
	ldp	x11, x12, [x1]
	ldp	x14, x15, [x1, #16]
	# Load mask.
	ldp	x16, x17, [x2]
	# Save pad for later
	stp	x14, x15, [x0, #120]
	# Apply clamp.
	# r &= 0x0ffffffc0ffffffc0ffffffc0fffffff
	and	x11, x11, x16
	and	x12, x12, x17
	# Store r - 64-bit version.
	stp	x11, x12, [x0]
	# 128-bits: Base 64 -> Base 26
	lsr	x7, x12, #40
	ubfx	x5, x11, #52, #12
	ubfx	x6, x12, #14, #26
	bfi	x5, x12, #12, #14
	ubfx	x4, x11, #26, #26
	ubfx	x3, x11, #0, #26
	# Store r: limbs 0-3 at 64, limb 4 at 92.
	stp	w3, w4, [x0, #64]
	stp	w5, w6, [x0, #72]
	str	w7, [x0, #92]
	# Compute r^2
	# a[0] * a[0]
	mul	x3, x11, x11
	umulh	x4, x11, x11
	# 2 * a[0] * a[1]
	mul	x14, x11, x12
	umulh	x5, x11, x12
	# a[1] * a[1]
	mul	x15, x12, x12
	umulh	x6, x12, x12
	# The cross product is doubled by shifting it left one bit while
	# it is added in.
	adds	x4, x4, x14, lsl 1
	extr	x5, x5, x14, #63
	adcs	x5, x5, x15
	adc	x6, x6, xzr
	# Reduce mod 2^130 - 5
	# Get high bits
	and	x14, x5, #-4
	# Get top two bits
	and	x5, x5, #3
	# Add top bits * 4
	adds	x8, x3, x14
	# Move down 2 bits
	extr	x14, x6, x14, #2
	adcs	x9, x4, x6
	lsr	x6, x6, #2
	adc	x10, x5, xzr
	# Add top bits.
	adds	x8, x8, x14
	adcs	x9, x9, x6
	adc	x10, x10, xzr
	# 130-bits: Base 64 -> Base 26
	extr	x7, x10, x9, #40
	ubfx	x5, x8, #52, #12
	ubfx	x6, x9, #14, #26
	bfi	x5, x9, #12, #14
	ubfx	x4, x8, #26, #26
	ubfx	x3, x8, #0, #26
	# Store r^2: limbs 0-3 at 48, limb 4 at 88.
	stp	w3, w4, [x0, #48]
	stp	w5, w6, [x0, #56]
	str	w7, [x0, #88]
	# Compute r^3
	# b[0] * a[0]
	mul	x3, x11, x8
	umulh	x4, x11, x8
	# b[0] * a[1]
	mul	x6, x11, x9
	umulh	x5, x11, x9
	# b[1] * a[0]
	mul	x14, x12, x8
	umulh	x15, x12, x8
	adds	x4, x4, x6
	# b[1] * a[1]
	mul	x16, x12, x9
	umulh	x6, x12, x9
	adc	x5, x5, x15
	adds	x4, x4, x14
	# b[0] * a[2]
	mul	x14, x11, x10
	adcs	x5, x5, x16
	# b[1] * a[2]
	mul	x15, x12, x10
	adc	x6, x6, xzr
	adds	x5, x5, x14
	adc	x6, x6, x15
	# Reduce mod 2^130 - 5
	# Get high bits
	and	x14, x5, #-4
	# Get top two bits
	and	x5, x5, #3
	# Add top bits * 4
	adds	x8, x3, x14
	# Move down 2 bits
	extr	x14, x6, x14, #2
	adcs	x9, x4, x6
	lsr	x6, x6, #2
	adc	x10, x5, xzr
	# Add top bits.
	adds	x8, x8, x14
	adcs	x9, x9, x6
	adc	x10, x10, xzr
	# 130-bits: Base 64 -> Base 26
	extr	x7, x10, x9, #40
	ubfx	x5, x8, #52, #12
	ubfx	x6, x9, #14, #26
	bfi	x5, x9, #12, #14
	ubfx	x4, x8, #26, #26
	ubfx	x3, x8, #0, #26
	# Store r^3: limbs 0-3 at 32, limb 4 at 84.
	stp	w3, w4, [x0, #32]
	stp	w5, w6, [x0, #40]
	str	w7, [x0, #84]
	# Compute r^4
	# b[0] * a[0]
	mul	x3, x11, x8
	umulh	x4, x11, x8
	# b[0] * a[1]
	mul	x6, x11, x9
	umulh	x5, x11, x9
	# b[1] * a[0]
	mul	x14, x12, x8
	umulh	x15, x12, x8
	adds	x4, x4, x6
	# b[1] * a[1]
	mul	x16, x12, x9
	umulh	x6, x12, x9
	adc	x5, x5, x15
	adds	x4, x4, x14
	# b[0] * a[2]
	mul	x14, x11, x10
	adcs	x5, x5, x16
	# b[1] * a[2]
	mul	x15, x12, x10
	adc	x6, x6, xzr
	adds	x5, x5, x14
	adc	x6, x6, x15
	# Reduce mod 2^130 - 5
	# Get high bits
	and	x14, x5, #-4
	# Get top two bits
	and	x5, x5, #3
	# Add top bits * 4
	adds	x11, x3, x14
	# Move down 2 bits
	extr	x14, x6, x14, #2
	adcs	x12, x4, x6
	lsr	x6, x6, #2
	adc	x13, x5, xzr
	# Add top bits.
	adds	x11, x11, x14
	adcs	x12, x12, x6
	adc	x13, x13, xzr
	# 130-bits: Base 64 -> Base 26
	extr	x7, x13, x12, #40
	ubfx	x5, x11, #52, #12
	ubfx	x6, x12, #14, #26
	bfi	x5, x12, #12, #14
	ubfx	x4, x11, #26, #26
	ubfx	x3, x11, #0, #26
	# Store r^4: limbs 0-3 at 16, limb 4 at 80.
	stp	w3, w4, [x0, #16]
	stp	w5, w6, [x0, #24]
	str	w7, [x0, #80]
	# h (accumulator) = 0
	stp	xzr, xzr, [x0, #96]
	str	wzr, [x0, #112]
	# Zero leftover
	str	xzr, [x0, #136]
	# Zero finished
	strb	wzr, [x0, #160]
	ldr	x17, [x29, #24]
	ldp	x29, x30, [sp], #32
	ret
#ifndef __APPLE__
	.size	poly1305_set_key,.-poly1305_set_key
#endif /* __APPLE__ */

/* poly1305_final
 *
 * Produces the 16-byte MAC from h and the pad, then wipes the key-derived
 * parts of the state.
 *
 * In:    x0 = pointer to the Poly1305 state
 *        x1 = pointer to 16 bytes that receive the MAC
 * Out:   MAC written to [x1]; h, r (64-bit form), every stored power of r
 *        and the pad are zeroed in the state.
 * Uses:  x2-x9 and the flags as scratch; no stack.
 */
#ifndef __APPLE__
	.text
	.globl	poly1305_final
	.type	poly1305_final,@function
	.align	2
poly1305_final:
#else
	.section	__TEXT,__text
	.globl	_poly1305_final
	.p2align	2
_poly1305_final:
#endif /* __APPLE__ */
	# Load pad and h
	ldp	x8, x9, [x0, #120]
	ldp	w2, w3, [x0, #96]
	ldp	w4, w5, [x0, #104]
	ldr	w6, [x0, #112]
	# h: Base26 -> Base 64
	add	x2, x2, x3, lsl 26
	lsr	x3, x4, #12
	add	x2, x2, x4, lsl 52
	add	x3, x3, x5, lsl 14
	lsr	x4, x6, #24
	add	x3, x3, x6, lsl 40
	# Add 5 to h.
	adds	x5, x2, #5
	adcs	x6, x3, xzr
	adc	x7, x4, xzr
	# Check if h+5 is larger than p.
	# A top word above 3 means h + 5 reached 2^130; the low 128 bits of
	# h + 5 are then used in place of h.
	cmp	x7, #3
	csel	x2, x5, x2, hi
	csel	x3, x6, x3, hi
	# Add padding
	adds	x2, x2, x8
	adc	x3, x3, x9
	# Store MAC
	stp	x2, x3, [x1]
	# Zero out h.
	stp	xzr, xzr, [x0, #96]
	str	wzr, [x0, #112]
	# Zero out r64.
	stp	xzr, xzr, [x0]
	# Zero out every stored power of r (r^4, r^3, r^2, r and their top
	# limbs): all of bytes 16..95 as written by poly1305_set_key.
	stp	xzr, xzr, [x0, #16]
	stp	xzr, xzr, [x0, #32]
	stp	xzr, xzr, [x0, #48]
	stp	xzr, xzr, [x0, #64]
	stp	xzr, xzr, [x0, #80]
	# Zero out pad.
	stp	xzr, xzr, [x0, #120]
	ret
#ifndef __APPLE__
	.size	poly1305_final,.-poly1305_final
#endif /* __APPLE__ */
#endif /* !WOLFSSL_ARMASM_INLINE */
#endif /* __aarch64__ */

#if defined(__linux__) && defined(__ELF__)
.section	.note.GNU-stack,"",%progbits
#endif
#endif /* WOLFSSL_ARMASM */