/* armv8-mlkem-asm
 *
 * Copyright (C) 2006-2026 wolfSSL Inc.
 *
 * This file is part of wolfSSL.
 *
 * wolfSSL is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * wolfSSL is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
 */

/* NOTE(review): the #include directive below has no header operand in this
 * copy of the file. Presumably an angle-bracketed header name was stripped
 * when the text was extracted - confirm against the upstream wolfSSL source
 * and restore it before building.
 */
#include

/* Generated using (from wolfssl):
 *   cd ../scripts
 *   ruby ./kyber/kyber.rb arm64 \
 *       ../wolfssl/wolfcrypt/src/port/arm/armv8-mlkem-asm.S
 */
#ifdef WOLFSSL_ARMASM
#ifdef __aarch64__
#ifndef WOLFSSL_ARMASM_INLINE
/* L_mlkem_aarch64_consts: eight 16-bit scalar constants (16 bytes).
 *
 * mlkem_ntt loads the whole table into v4 with a single "ldr q4" and selects
 * individual constants by lane (v4.h[n]).
 *
 *   lane 0  0x0d01 = 3329   Multiplier in the "sqrdmulh ..., v4.h[0]" and
 *                           "mls ..., v4.h[0]" steps of mlkem_ntt; 3329 is
 *                           the ML-KEM modulus q.
 *   lane 1  0xf301 = 62209  3329 * 62209 == 1 (mod 2^16), i.e. q^-1 mod 2^16.
 *   lane 2  0x4ebf = 20159  round(2^26 / 3329). mlkem_ntt uses it as
 *                           "sqdmulh ..., v4.h[2]", then "sshr #11", then
 *                           "mls ..., v4.h[0]" (a Barrett-style reduction).
 *   lane 3  0x0549 = 1353   2^32 mod 3329.
 *   lane 4  0x5049 = 20553  1353 * 62209 mod 2^16 (lane 3 times lane 1).
 *   lanes 5-7               Zero; these pad the table to 16 bytes.
 *
 * NOTE(review): lanes 1, 3 and 4 are not referenced in the part of mlkem_ntt
 * that follows the tables; presumably other routines in this file use them -
 * confirm.
 *
 * ELF builds place the table in .rodata and give it an object type and size;
 * Apple (Mach-O) builds place it in __DATA,__data.
 */
#ifndef __APPLE__
	.text
	.section .rodata
	.type L_mlkem_aarch64_consts, %object
	.size L_mlkem_aarch64_consts, 16
#else
	.section __DATA,__data
#endif /* __APPLE__ */
	# 8-byte aligned, 64-bit aligned
#ifndef __APPLE__
	.align 3
#else
	.p2align 3
#endif /* __APPLE__ */
L_mlkem_aarch64_consts:
	.short 0x0d01,0xf301,0x4ebf,0x0549,0x5049,0x0000,0x0000,0x0000
#ifdef WOLFSSL_HAVE_MLKEM
/* L_mlkem_aarch64_zetas: 288 16-bit multipliers (576 bytes) read by
 * mlkem_ntt through x2.
 *
 * Layout, in table order:
 *   bytes   0..63   32 distinct values, one per halfword. mlkem_ntt loads
 *                   these 16 bytes at a time and picks one value per
 *                   operation by lane (v0.h[n]).
 *   bytes  64..319  32 values, each stored 4 times in a row.
 *   bytes 320..575  64 values, each stored 2 times in a row.
 * The replicated regions are loaded as whole vectors and multiplied
 * element-wise (for example "sqrdmulh v21.8h, v6.8h, v0.8h").
 *
 * Entry 0 is 0x08ed = 2285 = 2^16 mod 3329, so the values are presumably the
 * NTT twiddle factors in Montgomery form, reduced to [0, q) - TODO confirm
 * against the generator script.
 *
 * The companion table L_mlkem_aarch64_zetas_qinv has the same layout; its
 * entries appear to be the matching entries here multiplied by 0xf301
 * modulo 2^16 (spot-checked on the first two entries).
 */
#ifndef __APPLE__
	.text
	.section .rodata
	.type L_mlkem_aarch64_zetas, %object
	.size L_mlkem_aarch64_zetas, 576
#else
	.section __DATA,__data
#endif /* __APPLE__ */
	# 8-byte aligned, 64-bit aligned
#ifndef __APPLE__
	.align 3
#else
	.p2align 3
#endif /* __APPLE__ */
L_mlkem_aarch64_zetas:
	/* 32 distinct values, selected by lane. */
	.short 0x08ed,0x0a0b,0x0b9a,0x0714,0x05d5,0x058e,0x011f,0x00ca
	.short 0x0c56,0x026e,0x0629,0x00b6,0x03c2,0x084f,0x073f,0x05bc
	.short 0x023d,0x07d4,0x0108,0x017f,0x09c4,0x05b2,0x06bf,0x0c7f
	.short 0x0a58,0x03f9,0x02dc,0x0260,0x06fb,0x019b,0x0c34,0x06de
	/* 32 values, each repeated 4 times. */
	.short 0x04c7,0x04c7,0x04c7,0x04c7,0x028c,0x028c,0x028c,0x028c
	.short 0x0ad9,0x0ad9,0x0ad9,0x0ad9,0x03f7,0x03f7,0x03f7,0x03f7
	.short 0x07f4,0x07f4,0x07f4,0x07f4,0x05d3,0x05d3,0x05d3,0x05d3
	.short 0x0be7,0x0be7,0x0be7,0x0be7,0x06f9,0x06f9,0x06f9,0x06f9
	.short 0x0204,0x0204,0x0204,0x0204,0x0cf9,0x0cf9,0x0cf9,0x0cf9
	.short 0x0bc1,0x0bc1,0x0bc1,0x0bc1,0x0a67,0x0a67,0x0a67,0x0a67
	.short 0x06af,0x06af,0x06af,0x06af,0x0877,0x0877,0x0877,0x0877
	.short 0x007e,0x007e,0x007e,0x007e,0x05bd,0x05bd,0x05bd,0x05bd
	.short 0x09ac,0x09ac,0x09ac,0x09ac,0x0ca7,0x0ca7,0x0ca7,0x0ca7
	.short 0x0bf2,0x0bf2,0x0bf2,0x0bf2,0x033e,0x033e,0x033e,0x033e
	.short 0x006b,0x006b,0x006b,0x006b,0x0774,0x0774,0x0774,0x0774
	.short 0x0c0a,0x0c0a,0x0c0a,0x0c0a,0x094a,0x094a,0x094a,0x094a
	.short 0x0b73,0x0b73,0x0b73,0x0b73,0x03c1,0x03c1,0x03c1,0x03c1
	.short 0x071d,0x071d,0x071d,0x071d,0x0a2c,0x0a2c,0x0a2c,0x0a2c
	.short 0x01c0,0x01c0,0x01c0,0x01c0,0x08d8,0x08d8,0x08d8,0x08d8
	.short 0x02a5,0x02a5,0x02a5,0x02a5,0x0806,0x0806,0x0806,0x0806
	/* 64 values, each repeated 2 times. */
	.short 0x08b2,0x08b2,0x01ae,0x01ae,0x022b,0x022b,0x034b,0x034b
	.short 0x081e,0x081e,0x0367,0x0367,0x060e,0x060e,0x0069,0x0069
	.short 0x01a6,0x01a6,0x024b,0x024b,0x00b1,0x00b1,0x0c16,0x0c16
	.short 0x0bde,0x0bde,0x0b35,0x0b35,0x0626,0x0626,0x0675,0x0675
	.short 0x0c0b,0x0c0b,0x030a,0x030a,0x0487,0x0487,0x0c6e,0x0c6e
	.short 0x09f8,0x09f8,0x05cb,0x05cb,0x0aa7,0x0aa7,0x045f,0x045f
	.short 0x06cb,0x06cb,0x0284,0x0284,0x0999,0x0999,0x015d,0x015d
	.short 0x01a2,0x01a2,0x0149,0x0149,0x0c65,0x0c65,0x0cb6,0x0cb6
	.short 0x0331,0x0331,0x0449,0x0449,0x025b,0x025b,0x0262,0x0262
	.short 0x052a,0x052a,0x07fc,0x07fc,0x0748,0x0748,0x0180,0x0180
	.short 0x0842,0x0842,0x0c79,0x0c79,0x04c2,0x04c2,0x07ca,0x07ca
	.short 0x0997,0x0997,0x00dc,0x00dc,0x085e,0x085e,0x0686,0x0686
	.short 0x0860,0x0860,0x0707,0x0707,0x0803,0x0803,0x031a,0x031a
	.short 0x071b,0x071b,0x09ab,0x09ab,0x099b,0x099b,0x01de,0x01de
	.short 0x0c95,0x0c95,0x0bcd,0x0bcd,0x03e4,0x03e4,0x03df,0x03df
	.short 0x03be,0x03be,0x074d,0x074d,0x05f2,0x05f2,0x065c,0x065c
#ifndef __APPLE__
	.text
	.section .rodata
	.type
L_mlkem_aarch64_zetas_qinv, %object .size L_mlkem_aarch64_zetas_qinv, 576 #else .section __DATA,__data #endif /* __APPLE__ */ # 8-byte aligned, 64-bit aligned #ifndef __APPLE__ .align 3 #else .p2align 3 #endif /* __APPLE__ */ L_mlkem_aarch64_zetas_qinv: .short 0xffed,0x7b0b,0x399a,0x0314,0x34d5,0xcf8e,0x6e1f,0xbeca .short 0xae56,0x6c6e,0xf129,0xc2b6,0x29c2,0x054f,0xd43f,0x79bc .short 0xe93d,0x43d4,0x9908,0x8e7f,0x15c4,0xfbb2,0x53bf,0x997f .short 0x9258,0x5ef9,0xd6dc,0x2260,0x47fb,0x229b,0x6834,0xc0de .short 0xe9c7,0xe9c7,0xe9c7,0xe9c7,0xe68c,0xe68c,0xe68c,0xe68c .short 0x05d9,0x05d9,0x05d9,0x05d9,0x78f7,0x78f7,0x78f7,0x78f7 .short 0xa3f4,0xa3f4,0xa3f4,0xa3f4,0x4ed3,0x4ed3,0x4ed3,0x4ed3 .short 0x50e7,0x50e7,0x50e7,0x50e7,0x61f9,0x61f9,0x61f9,0x61f9 .short 0xce04,0xce04,0xce04,0xce04,0x67f9,0x67f9,0x67f9,0x67f9 .short 0x3ec1,0x3ec1,0x3ec1,0x3ec1,0xcf67,0xcf67,0xcf67,0xcf67 .short 0x23af,0x23af,0x23af,0x23af,0xfd77,0xfd77,0xfd77,0xfd77 .short 0x9a7e,0x9a7e,0x9a7e,0x9a7e,0x6cbd,0x6cbd,0x6cbd,0x6cbd .short 0x4dac,0x4dac,0x4dac,0x4dac,0x91a7,0x91a7,0x91a7,0x91a7 .short 0xc1f2,0xc1f2,0xc1f2,0xc1f2,0xdd3e,0xdd3e,0xdd3e,0xdd3e .short 0x916b,0x916b,0x916b,0x916b,0x2374,0x2374,0x2374,0x2374 .short 0x8a0a,0x8a0a,0x8a0a,0x8a0a,0x474a,0x474a,0x474a,0x474a .short 0x3473,0x3473,0x3473,0x3473,0x36c1,0x36c1,0x36c1,0x36c1 .short 0x8e1d,0x8e1d,0x8e1d,0x8e1d,0xce2c,0xce2c,0xce2c,0xce2c .short 0x41c0,0x41c0,0x41c0,0x41c0,0x10d8,0x10d8,0x10d8,0x10d8 .short 0xa1a5,0xa1a5,0xa1a5,0xa1a5,0xba06,0xba06,0xba06,0xba06 .short 0xfeb2,0xfeb2,0x2bae,0x2bae,0xd32b,0xd32b,0x344b,0x344b .short 0x821e,0x821e,0xc867,0xc867,0x500e,0x500e,0xab69,0xab69 .short 0x93a6,0x93a6,0x334b,0x334b,0x03b1,0x03b1,0xee16,0xee16 .short 0xc5de,0xc5de,0x5a35,0x5a35,0x1826,0x1826,0x1575,0x1575 .short 0x7d0b,0x7d0b,0x810a,0x810a,0x2987,0x2987,0x766e,0x766e .short 0x71f8,0x71f8,0xb6cb,0xb6cb,0x8fa7,0x8fa7,0x315f,0x315f .short 0xb7cb,0xb7cb,0x4e84,0x4e84,0x4499,0x4499,0x485d,0x485d .short 
0xc7a2,0xc7a2,0x4c49,0x4c49,0xeb65,0xeb65,0xceb6,0xceb6 .short 0x8631,0x8631,0x4f49,0x4f49,0x635b,0x635b,0x0862,0x0862 .short 0xe32a,0xe32a,0x3bfc,0x3bfc,0x5f48,0x5f48,0x8180,0x8180 .short 0xae42,0xae42,0xe779,0xe779,0x2ac2,0x2ac2,0xc5ca,0xc5ca .short 0x5e97,0x5e97,0xd4dc,0xd4dc,0x425e,0x425e,0x3886,0x3886 .short 0x2860,0x2860,0xac07,0xac07,0xe103,0xe103,0xb11a,0xb11a .short 0xa81b,0xa81b,0x5aab,0x5aab,0x2a9b,0x2a9b,0xbbde,0xbbde .short 0x7b95,0x7b95,0xa2cd,0xa2cd,0x6fe4,0x6fe4,0xb0df,0xb0df .short 0x5dbe,0x5dbe,0x1e4d,0x1e4d,0xbbf2,0xbbf2,0x5a5c,0x5a5c #ifndef __APPLE__ .text .globl mlkem_ntt .type mlkem_ntt,@function .align 2 mlkem_ntt: #else .section __TEXT,__text .globl _mlkem_ntt .p2align 2 _mlkem_ntt: #endif /* __APPLE__ */ stp x29, x30, [sp, #-80]! add x29, sp, #0 stp d8, d9, [x29, #16] stp d10, d11, [x29, #32] stp d12, d13, [x29, #48] stp d14, d15, [x29, #64] #ifndef __APPLE__ adrp x2, L_mlkem_aarch64_zetas add x2, x2, :lo12:L_mlkem_aarch64_zetas #else adrp x2, L_mlkem_aarch64_zetas@PAGE add x2, x2, L_mlkem_aarch64_zetas@PAGEOFF #endif /* __APPLE__ */ #ifndef __APPLE__ adrp x3, L_mlkem_aarch64_zetas_qinv add x3, x3, :lo12:L_mlkem_aarch64_zetas_qinv #else adrp x3, L_mlkem_aarch64_zetas_qinv@PAGE add x3, x3, L_mlkem_aarch64_zetas_qinv@PAGEOFF #endif /* __APPLE__ */ #ifndef __APPLE__ adrp x4, L_mlkem_aarch64_consts add x4, x4, :lo12:L_mlkem_aarch64_consts #else adrp x4, L_mlkem_aarch64_consts@PAGE add x4, x4, L_mlkem_aarch64_consts@PAGEOFF #endif /* __APPLE__ */ add x1, x0, #0x100 ldr q4, [x4] ldr q5, [x0] ldr q6, [x0, #32] ldr q7, [x0, #64] ldr q8, [x0, #96] ldr q9, [x0, #128] ldr q10, [x0, #160] ldr q11, [x0, #192] ldr q12, [x0, #224] ldr q13, [x1] ldr q14, [x1, #32] ldr q15, [x1, #64] ldr q16, [x1, #96] ldr q17, [x1, #128] ldr q18, [x1, #160] ldr q19, [x1, #192] ldr q20, [x1, #224] ldr q0, [x2] ldr q1, [x3] mul v29.8h, v13.8h, v1.h[1] mul v30.8h, v14.8h, v1.h[1] sqrdmulh v21.8h, v13.8h, v0.h[1] sqrdmulh v22.8h, v14.8h, v0.h[1] sqrdmulh v29.8h, v29.8h, 
v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v21.8h, v21.8h, v29.8h sub v22.8h, v22.8h, v30.8h sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 mul v29.8h, v15.8h, v1.h[1] mul v30.8h, v16.8h, v1.h[1] sqrdmulh v23.8h, v15.8h, v0.h[1] sqrdmulh v24.8h, v16.8h, v0.h[1] sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v23.8h, v23.8h, v29.8h sub v24.8h, v24.8h, v30.8h sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 mul v29.8h, v17.8h, v1.h[1] mul v30.8h, v18.8h, v1.h[1] sqrdmulh v25.8h, v17.8h, v0.h[1] sqrdmulh v26.8h, v18.8h, v0.h[1] sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v25.8h, v25.8h, v29.8h sub v26.8h, v26.8h, v30.8h sshr v25.8h, v25.8h, #1 sshr v26.8h, v26.8h, #1 mul v29.8h, v19.8h, v1.h[1] mul v30.8h, v20.8h, v1.h[1] sqrdmulh v27.8h, v19.8h, v0.h[1] sqrdmulh v28.8h, v20.8h, v0.h[1] sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v27.8h, v27.8h, v29.8h sub v28.8h, v28.8h, v30.8h sshr v27.8h, v27.8h, #1 sshr v28.8h, v28.8h, #1 sub v13.8h, v5.8h, v21.8h add v5.8h, v5.8h, v21.8h sub v14.8h, v6.8h, v22.8h add v6.8h, v6.8h, v22.8h sub v15.8h, v7.8h, v23.8h add v7.8h, v7.8h, v23.8h sub v16.8h, v8.8h, v24.8h add v8.8h, v8.8h, v24.8h sub v17.8h, v9.8h, v25.8h add v9.8h, v9.8h, v25.8h sub v18.8h, v10.8h, v26.8h add v10.8h, v10.8h, v26.8h sub v19.8h, v11.8h, v27.8h add v11.8h, v11.8h, v27.8h sub v20.8h, v12.8h, v28.8h add v12.8h, v12.8h, v28.8h mul v29.8h, v9.8h, v1.h[2] mul v30.8h, v10.8h, v1.h[2] sqrdmulh v21.8h, v9.8h, v0.h[2] sqrdmulh v22.8h, v10.8h, v0.h[2] sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v21.8h, v21.8h, v29.8h sub v22.8h, v22.8h, v30.8h sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 mul v29.8h, v11.8h, v1.h[2] mul v30.8h, v12.8h, v1.h[2] sqrdmulh v23.8h, v11.8h, v0.h[2] sqrdmulh v24.8h, v12.8h, v0.h[2] sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v23.8h, v23.8h, v29.8h sub v24.8h, v24.8h, v30.8h sshr v23.8h, v23.8h, #1 sshr v24.8h, 
v24.8h, #1 mul v29.8h, v17.8h, v1.h[3] mul v30.8h, v18.8h, v1.h[3] sqrdmulh v25.8h, v17.8h, v0.h[3] sqrdmulh v26.8h, v18.8h, v0.h[3] sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v25.8h, v25.8h, v29.8h sub v26.8h, v26.8h, v30.8h sshr v25.8h, v25.8h, #1 sshr v26.8h, v26.8h, #1 mul v29.8h, v19.8h, v1.h[3] mul v30.8h, v20.8h, v1.h[3] sqrdmulh v27.8h, v19.8h, v0.h[3] sqrdmulh v28.8h, v20.8h, v0.h[3] sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v27.8h, v27.8h, v29.8h sub v28.8h, v28.8h, v30.8h sshr v27.8h, v27.8h, #1 sshr v28.8h, v28.8h, #1 sub v9.8h, v5.8h, v21.8h add v5.8h, v5.8h, v21.8h sub v10.8h, v6.8h, v22.8h add v6.8h, v6.8h, v22.8h sub v11.8h, v7.8h, v23.8h add v7.8h, v7.8h, v23.8h sub v12.8h, v8.8h, v24.8h add v8.8h, v8.8h, v24.8h sub v17.8h, v13.8h, v25.8h add v13.8h, v13.8h, v25.8h sub v18.8h, v14.8h, v26.8h add v14.8h, v14.8h, v26.8h sub v19.8h, v15.8h, v27.8h add v15.8h, v15.8h, v27.8h sub v20.8h, v16.8h, v28.8h add v16.8h, v16.8h, v28.8h mul v29.8h, v7.8h, v1.h[4] mul v30.8h, v8.8h, v1.h[4] sqrdmulh v21.8h, v7.8h, v0.h[4] sqrdmulh v22.8h, v8.8h, v0.h[4] sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v21.8h, v21.8h, v29.8h sub v22.8h, v22.8h, v30.8h sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 mul v29.8h, v11.8h, v1.h[5] mul v30.8h, v12.8h, v1.h[5] sqrdmulh v23.8h, v11.8h, v0.h[5] sqrdmulh v24.8h, v12.8h, v0.h[5] sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v23.8h, v23.8h, v29.8h sub v24.8h, v24.8h, v30.8h sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 mul v29.8h, v15.8h, v1.h[6] mul v30.8h, v16.8h, v1.h[6] sqrdmulh v25.8h, v15.8h, v0.h[6] sqrdmulh v26.8h, v16.8h, v0.h[6] sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v25.8h, v25.8h, v29.8h sub v26.8h, v26.8h, v30.8h sshr v25.8h, v25.8h, #1 sshr v26.8h, v26.8h, #1 mul v29.8h, v19.8h, v1.h[7] mul v30.8h, v20.8h, v1.h[7] sqrdmulh v27.8h, v19.8h, v0.h[7] sqrdmulh v28.8h, v20.8h, v0.h[7] 
sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v27.8h, v27.8h, v29.8h sub v28.8h, v28.8h, v30.8h sshr v27.8h, v27.8h, #1 sshr v28.8h, v28.8h, #1 sub v7.8h, v5.8h, v21.8h add v5.8h, v5.8h, v21.8h sub v8.8h, v6.8h, v22.8h add v6.8h, v6.8h, v22.8h sub v11.8h, v9.8h, v23.8h add v9.8h, v9.8h, v23.8h sub v12.8h, v10.8h, v24.8h add v10.8h, v10.8h, v24.8h sub v15.8h, v13.8h, v25.8h add v13.8h, v13.8h, v25.8h sub v16.8h, v14.8h, v26.8h add v14.8h, v14.8h, v26.8h sub v19.8h, v17.8h, v27.8h add v17.8h, v17.8h, v27.8h sub v20.8h, v18.8h, v28.8h add v18.8h, v18.8h, v28.8h ldr q0, [x2, #16] ldr q1, [x3, #16] mul v29.8h, v6.8h, v1.h[0] mul v30.8h, v8.8h, v1.h[1] sqrdmulh v21.8h, v6.8h, v0.h[0] sqrdmulh v22.8h, v8.8h, v0.h[1] sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v21.8h, v21.8h, v29.8h sub v22.8h, v22.8h, v30.8h sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 mul v29.8h, v10.8h, v1.h[2] mul v30.8h, v12.8h, v1.h[3] sqrdmulh v23.8h, v10.8h, v0.h[2] sqrdmulh v24.8h, v12.8h, v0.h[3] sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v23.8h, v23.8h, v29.8h sub v24.8h, v24.8h, v30.8h sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 mul v29.8h, v14.8h, v1.h[4] mul v30.8h, v16.8h, v1.h[5] sqrdmulh v25.8h, v14.8h, v0.h[4] sqrdmulh v26.8h, v16.8h, v0.h[5] sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v25.8h, v25.8h, v29.8h sub v26.8h, v26.8h, v30.8h sshr v25.8h, v25.8h, #1 sshr v26.8h, v26.8h, #1 mul v29.8h, v18.8h, v1.h[6] mul v30.8h, v20.8h, v1.h[7] sqrdmulh v27.8h, v18.8h, v0.h[6] sqrdmulh v28.8h, v20.8h, v0.h[7] sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v27.8h, v27.8h, v29.8h sub v28.8h, v28.8h, v30.8h sshr v27.8h, v27.8h, #1 sshr v28.8h, v28.8h, #1 sub v6.8h, v5.8h, v21.8h add v5.8h, v5.8h, v21.8h sub v8.8h, v7.8h, v22.8h add v7.8h, v7.8h, v22.8h sub v10.8h, v9.8h, v23.8h add v9.8h, v9.8h, v23.8h sub v12.8h, v11.8h, v24.8h add v11.8h, v11.8h, v24.8h sub v14.8h, 
v13.8h, v25.8h add v13.8h, v13.8h, v25.8h sub v16.8h, v15.8h, v26.8h add v15.8h, v15.8h, v26.8h sub v18.8h, v17.8h, v27.8h add v17.8h, v17.8h, v27.8h sub v20.8h, v19.8h, v28.8h add v19.8h, v19.8h, v28.8h str q5, [x0] str q6, [x0, #32] str q7, [x0, #64] str q8, [x0, #96] str q9, [x0, #128] str q10, [x0, #160] str q11, [x0, #192] str q12, [x0, #224] str q13, [x1] str q14, [x1, #32] str q15, [x1, #64] str q16, [x1, #96] str q17, [x1, #128] str q18, [x1, #160] str q19, [x1, #192] str q20, [x1, #224] ldr q5, [x0, #16] ldr q6, [x0, #48] ldr q7, [x0, #80] ldr q8, [x0, #112] ldr q9, [x0, #144] ldr q10, [x0, #176] ldr q11, [x0, #208] ldr q12, [x0, #240] ldr q13, [x1, #16] ldr q14, [x1, #48] ldr q15, [x1, #80] ldr q16, [x1, #112] ldr q17, [x1, #144] ldr q18, [x1, #176] ldr q19, [x1, #208] ldr q20, [x1, #240] ldr q0, [x2] ldr q1, [x3] mul v29.8h, v13.8h, v1.h[1] mul v30.8h, v14.8h, v1.h[1] sqrdmulh v21.8h, v13.8h, v0.h[1] sqrdmulh v22.8h, v14.8h, v0.h[1] sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v21.8h, v21.8h, v29.8h sub v22.8h, v22.8h, v30.8h sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 mul v29.8h, v15.8h, v1.h[1] mul v30.8h, v16.8h, v1.h[1] sqrdmulh v23.8h, v15.8h, v0.h[1] sqrdmulh v24.8h, v16.8h, v0.h[1] sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v23.8h, v23.8h, v29.8h sub v24.8h, v24.8h, v30.8h sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 mul v29.8h, v17.8h, v1.h[1] mul v30.8h, v18.8h, v1.h[1] sqrdmulh v25.8h, v17.8h, v0.h[1] sqrdmulh v26.8h, v18.8h, v0.h[1] sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v25.8h, v25.8h, v29.8h sub v26.8h, v26.8h, v30.8h sshr v25.8h, v25.8h, #1 sshr v26.8h, v26.8h, #1 mul v29.8h, v19.8h, v1.h[1] mul v30.8h, v20.8h, v1.h[1] sqrdmulh v27.8h, v19.8h, v0.h[1] sqrdmulh v28.8h, v20.8h, v0.h[1] sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v27.8h, v27.8h, v29.8h sub v28.8h, v28.8h, v30.8h sshr v27.8h, v27.8h, #1 sshr v28.8h, v28.8h, #1 sub 
v13.8h, v5.8h, v21.8h add v5.8h, v5.8h, v21.8h sub v14.8h, v6.8h, v22.8h add v6.8h, v6.8h, v22.8h sub v15.8h, v7.8h, v23.8h add v7.8h, v7.8h, v23.8h sub v16.8h, v8.8h, v24.8h add v8.8h, v8.8h, v24.8h sub v17.8h, v9.8h, v25.8h add v9.8h, v9.8h, v25.8h sub v18.8h, v10.8h, v26.8h add v10.8h, v10.8h, v26.8h sub v19.8h, v11.8h, v27.8h add v11.8h, v11.8h, v27.8h sub v20.8h, v12.8h, v28.8h add v12.8h, v12.8h, v28.8h mul v29.8h, v9.8h, v1.h[2] mul v30.8h, v10.8h, v1.h[2] sqrdmulh v21.8h, v9.8h, v0.h[2] sqrdmulh v22.8h, v10.8h, v0.h[2] sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v21.8h, v21.8h, v29.8h sub v22.8h, v22.8h, v30.8h sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 mul v29.8h, v11.8h, v1.h[2] mul v30.8h, v12.8h, v1.h[2] sqrdmulh v23.8h, v11.8h, v0.h[2] sqrdmulh v24.8h, v12.8h, v0.h[2] sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v23.8h, v23.8h, v29.8h sub v24.8h, v24.8h, v30.8h sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 mul v29.8h, v17.8h, v1.h[3] mul v30.8h, v18.8h, v1.h[3] sqrdmulh v25.8h, v17.8h, v0.h[3] sqrdmulh v26.8h, v18.8h, v0.h[3] sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v25.8h, v25.8h, v29.8h sub v26.8h, v26.8h, v30.8h sshr v25.8h, v25.8h, #1 sshr v26.8h, v26.8h, #1 mul v29.8h, v19.8h, v1.h[3] mul v30.8h, v20.8h, v1.h[3] sqrdmulh v27.8h, v19.8h, v0.h[3] sqrdmulh v28.8h, v20.8h, v0.h[3] sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v27.8h, v27.8h, v29.8h sub v28.8h, v28.8h, v30.8h sshr v27.8h, v27.8h, #1 sshr v28.8h, v28.8h, #1 sub v9.8h, v5.8h, v21.8h add v5.8h, v5.8h, v21.8h sub v10.8h, v6.8h, v22.8h add v6.8h, v6.8h, v22.8h sub v11.8h, v7.8h, v23.8h add v7.8h, v7.8h, v23.8h sub v12.8h, v8.8h, v24.8h add v8.8h, v8.8h, v24.8h sub v17.8h, v13.8h, v25.8h add v13.8h, v13.8h, v25.8h sub v18.8h, v14.8h, v26.8h add v14.8h, v14.8h, v26.8h sub v19.8h, v15.8h, v27.8h add v15.8h, v15.8h, v27.8h sub v20.8h, v16.8h, v28.8h add v16.8h, v16.8h, v28.8h mul 
v29.8h, v7.8h, v1.h[4] mul v30.8h, v8.8h, v1.h[4] sqrdmulh v21.8h, v7.8h, v0.h[4] sqrdmulh v22.8h, v8.8h, v0.h[4] sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v21.8h, v21.8h, v29.8h sub v22.8h, v22.8h, v30.8h sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 mul v29.8h, v11.8h, v1.h[5] mul v30.8h, v12.8h, v1.h[5] sqrdmulh v23.8h, v11.8h, v0.h[5] sqrdmulh v24.8h, v12.8h, v0.h[5] sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v23.8h, v23.8h, v29.8h sub v24.8h, v24.8h, v30.8h sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 mul v29.8h, v15.8h, v1.h[6] mul v30.8h, v16.8h, v1.h[6] sqrdmulh v25.8h, v15.8h, v0.h[6] sqrdmulh v26.8h, v16.8h, v0.h[6] sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v25.8h, v25.8h, v29.8h sub v26.8h, v26.8h, v30.8h sshr v25.8h, v25.8h, #1 sshr v26.8h, v26.8h, #1 mul v29.8h, v19.8h, v1.h[7] mul v30.8h, v20.8h, v1.h[7] sqrdmulh v27.8h, v19.8h, v0.h[7] sqrdmulh v28.8h, v20.8h, v0.h[7] sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v27.8h, v27.8h, v29.8h sub v28.8h, v28.8h, v30.8h sshr v27.8h, v27.8h, #1 sshr v28.8h, v28.8h, #1 sub v7.8h, v5.8h, v21.8h add v5.8h, v5.8h, v21.8h sub v8.8h, v6.8h, v22.8h add v6.8h, v6.8h, v22.8h sub v11.8h, v9.8h, v23.8h add v9.8h, v9.8h, v23.8h sub v12.8h, v10.8h, v24.8h add v10.8h, v10.8h, v24.8h sub v15.8h, v13.8h, v25.8h add v13.8h, v13.8h, v25.8h sub v16.8h, v14.8h, v26.8h add v14.8h, v14.8h, v26.8h sub v19.8h, v17.8h, v27.8h add v17.8h, v17.8h, v27.8h sub v20.8h, v18.8h, v28.8h add v18.8h, v18.8h, v28.8h ldr q0, [x2, #16] ldr q1, [x3, #16] mul v29.8h, v6.8h, v1.h[0] mul v30.8h, v8.8h, v1.h[1] sqrdmulh v21.8h, v6.8h, v0.h[0] sqrdmulh v22.8h, v8.8h, v0.h[1] sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v21.8h, v21.8h, v29.8h sub v22.8h, v22.8h, v30.8h sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 mul v29.8h, v10.8h, v1.h[2] mul v30.8h, v12.8h, v1.h[3] sqrdmulh v23.8h, v10.8h, v0.h[2] sqrdmulh 
v24.8h, v12.8h, v0.h[3] sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v23.8h, v23.8h, v29.8h sub v24.8h, v24.8h, v30.8h sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 mul v29.8h, v14.8h, v1.h[4] mul v30.8h, v16.8h, v1.h[5] sqrdmulh v25.8h, v14.8h, v0.h[4] sqrdmulh v26.8h, v16.8h, v0.h[5] sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v25.8h, v25.8h, v29.8h sub v26.8h, v26.8h, v30.8h sshr v25.8h, v25.8h, #1 sshr v26.8h, v26.8h, #1 mul v29.8h, v18.8h, v1.h[6] mul v30.8h, v20.8h, v1.h[7] sqrdmulh v27.8h, v18.8h, v0.h[6] sqrdmulh v28.8h, v20.8h, v0.h[7] sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v27.8h, v27.8h, v29.8h sub v28.8h, v28.8h, v30.8h sshr v27.8h, v27.8h, #1 sshr v28.8h, v28.8h, #1 sub v6.8h, v5.8h, v21.8h add v5.8h, v5.8h, v21.8h sub v8.8h, v7.8h, v22.8h add v7.8h, v7.8h, v22.8h sub v10.8h, v9.8h, v23.8h add v9.8h, v9.8h, v23.8h sub v12.8h, v11.8h, v24.8h add v11.8h, v11.8h, v24.8h sub v14.8h, v13.8h, v25.8h add v13.8h, v13.8h, v25.8h sub v16.8h, v15.8h, v26.8h add v15.8h, v15.8h, v26.8h sub v18.8h, v17.8h, v27.8h add v17.8h, v17.8h, v27.8h sub v20.8h, v19.8h, v28.8h add v19.8h, v19.8h, v28.8h str q5, [x0, #16] str q6, [x0, #48] str q7, [x0, #80] str q8, [x0, #112] str q9, [x0, #144] str q10, [x0, #176] str q11, [x0, #208] str q12, [x0, #240] str q13, [x1, #16] str q14, [x1, #48] str q15, [x1, #80] str q16, [x1, #112] str q17, [x1, #144] str q18, [x1, #176] str q19, [x1, #208] str q20, [x1, #240] ldp q5, q6, [x0] ldp q7, q8, [x0, #32] ldp q9, q10, [x0, #64] ldp q11, q12, [x0, #96] ldp q13, q14, [x0, #128] ldp q15, q16, [x0, #160] ldp q17, q18, [x0, #192] ldp q19, q20, [x0, #224] ldr q0, [x2, #32] ldr q1, [x3, #32] mul v29.8h, v6.8h, v1.h[0] mul v30.8h, v8.8h, v1.h[1] sqrdmulh v21.8h, v6.8h, v0.h[0] sqrdmulh v22.8h, v8.8h, v0.h[1] sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v21.8h, v21.8h, v29.8h sub v22.8h, v22.8h, v30.8h sshr v21.8h, v21.8h, #1 sshr v22.8h, 
v22.8h, #1 mul v29.8h, v10.8h, v1.h[2] mul v30.8h, v12.8h, v1.h[3] sqrdmulh v23.8h, v10.8h, v0.h[2] sqrdmulh v24.8h, v12.8h, v0.h[3] sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v23.8h, v23.8h, v29.8h sub v24.8h, v24.8h, v30.8h sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 mul v29.8h, v14.8h, v1.h[4] mul v30.8h, v16.8h, v1.h[5] sqrdmulh v25.8h, v14.8h, v0.h[4] sqrdmulh v26.8h, v16.8h, v0.h[5] sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v25.8h, v25.8h, v29.8h sub v26.8h, v26.8h, v30.8h sshr v25.8h, v25.8h, #1 sshr v26.8h, v26.8h, #1 mul v29.8h, v18.8h, v1.h[6] mul v30.8h, v20.8h, v1.h[7] sqrdmulh v27.8h, v18.8h, v0.h[6] sqrdmulh v28.8h, v20.8h, v0.h[7] sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v27.8h, v27.8h, v29.8h sub v28.8h, v28.8h, v30.8h sshr v27.8h, v27.8h, #1 sshr v28.8h, v28.8h, #1 sub v6.8h, v5.8h, v21.8h add v5.8h, v5.8h, v21.8h sub v8.8h, v7.8h, v22.8h add v7.8h, v7.8h, v22.8h sub v10.8h, v9.8h, v23.8h add v9.8h, v9.8h, v23.8h sub v12.8h, v11.8h, v24.8h add v11.8h, v11.8h, v24.8h sub v14.8h, v13.8h, v25.8h add v13.8h, v13.8h, v25.8h sub v16.8h, v15.8h, v26.8h add v15.8h, v15.8h, v26.8h sub v18.8h, v17.8h, v27.8h add v17.8h, v17.8h, v27.8h sub v20.8h, v19.8h, v28.8h add v19.8h, v19.8h, v28.8h ldr q0, [x2, #64] ldr q2, [x2, #80] ldr q1, [x3, #64] ldr q3, [x3, #80] mov v29.16b, v5.16b mov v30.16b, v7.16b trn1 v5.2d, v5.2d, v6.2d trn1 v7.2d, v7.2d, v8.2d trn2 v6.2d, v29.2d, v6.2d trn2 v8.2d, v30.2d, v8.2d mul v29.8h, v6.8h, v1.8h mul v30.8h, v8.8h, v3.8h sqrdmulh v21.8h, v6.8h, v0.8h sqrdmulh v22.8h, v8.8h, v2.8h sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v21.8h, v21.8h, v29.8h sub v22.8h, v22.8h, v30.8h sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 ldr q0, [x2, #96] ldr q2, [x2, #112] ldr q1, [x3, #96] ldr q3, [x3, #112] mov v29.16b, v9.16b mov v30.16b, v11.16b trn1 v9.2d, v9.2d, v10.2d trn1 v11.2d, v11.2d, v12.2d trn2 v10.2d, v29.2d, v10.2d trn2 
v12.2d, v30.2d, v12.2d mul v29.8h, v10.8h, v1.8h mul v30.8h, v12.8h, v3.8h sqrdmulh v23.8h, v10.8h, v0.8h sqrdmulh v24.8h, v12.8h, v2.8h sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v23.8h, v23.8h, v29.8h sub v24.8h, v24.8h, v30.8h sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 ldr q0, [x2, #128] ldr q2, [x2, #144] ldr q1, [x3, #128] ldr q3, [x3, #144] mov v29.16b, v13.16b mov v30.16b, v15.16b trn1 v13.2d, v13.2d, v14.2d trn1 v15.2d, v15.2d, v16.2d trn2 v14.2d, v29.2d, v14.2d trn2 v16.2d, v30.2d, v16.2d mul v29.8h, v14.8h, v1.8h mul v30.8h, v16.8h, v3.8h sqrdmulh v25.8h, v14.8h, v0.8h sqrdmulh v26.8h, v16.8h, v2.8h sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v25.8h, v25.8h, v29.8h sub v26.8h, v26.8h, v30.8h sshr v25.8h, v25.8h, #1 sshr v26.8h, v26.8h, #1 ldr q0, [x2, #160] ldr q2, [x2, #176] ldr q1, [x3, #160] ldr q3, [x3, #176] mov v29.16b, v17.16b mov v30.16b, v19.16b trn1 v17.2d, v17.2d, v18.2d trn1 v19.2d, v19.2d, v20.2d trn2 v18.2d, v29.2d, v18.2d trn2 v20.2d, v30.2d, v20.2d mul v29.8h, v18.8h, v1.8h mul v30.8h, v20.8h, v3.8h sqrdmulh v27.8h, v18.8h, v0.8h sqrdmulh v28.8h, v20.8h, v2.8h sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v27.8h, v27.8h, v29.8h sub v28.8h, v28.8h, v30.8h sshr v27.8h, v27.8h, #1 sshr v28.8h, v28.8h, #1 sub v6.8h, v5.8h, v21.8h add v5.8h, v5.8h, v21.8h sub v8.8h, v7.8h, v22.8h add v7.8h, v7.8h, v22.8h sub v10.8h, v9.8h, v23.8h add v9.8h, v9.8h, v23.8h sub v12.8h, v11.8h, v24.8h add v11.8h, v11.8h, v24.8h sub v14.8h, v13.8h, v25.8h add v13.8h, v13.8h, v25.8h sub v16.8h, v15.8h, v26.8h add v15.8h, v15.8h, v26.8h sub v18.8h, v17.8h, v27.8h add v17.8h, v17.8h, v27.8h sub v20.8h, v19.8h, v28.8h add v19.8h, v19.8h, v28.8h ldr q0, [x2, #320] ldr q2, [x2, #336] ldr q1, [x3, #320] ldr q3, [x3, #336] mov v29.16b, v5.16b mov v30.16b, v7.16b trn1 v5.4s, v5.4s, v6.4s trn1 v7.4s, v7.4s, v8.4s trn2 v6.4s, v29.4s, v6.4s trn2 v8.4s, v30.4s, v8.4s mul v29.8h, v6.8h, v1.8h mul 
v30.8h, v8.8h, v3.8h sqrdmulh v21.8h, v6.8h, v0.8h sqrdmulh v22.8h, v8.8h, v2.8h sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v21.8h, v21.8h, v29.8h sub v22.8h, v22.8h, v30.8h sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 ldr q0, [x2, #352] ldr q2, [x2, #368] ldr q1, [x3, #352] ldr q3, [x3, #368] mov v29.16b, v9.16b mov v30.16b, v11.16b trn1 v9.4s, v9.4s, v10.4s trn1 v11.4s, v11.4s, v12.4s trn2 v10.4s, v29.4s, v10.4s trn2 v12.4s, v30.4s, v12.4s mul v29.8h, v10.8h, v1.8h mul v30.8h, v12.8h, v3.8h sqrdmulh v23.8h, v10.8h, v0.8h sqrdmulh v24.8h, v12.8h, v2.8h sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v23.8h, v23.8h, v29.8h sub v24.8h, v24.8h, v30.8h sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 ldr q0, [x2, #384] ldr q2, [x2, #400] ldr q1, [x3, #384] ldr q3, [x3, #400] mov v29.16b, v13.16b mov v30.16b, v15.16b trn1 v13.4s, v13.4s, v14.4s trn1 v15.4s, v15.4s, v16.4s trn2 v14.4s, v29.4s, v14.4s trn2 v16.4s, v30.4s, v16.4s mul v29.8h, v14.8h, v1.8h mul v30.8h, v16.8h, v3.8h sqrdmulh v25.8h, v14.8h, v0.8h sqrdmulh v26.8h, v16.8h, v2.8h sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v25.8h, v25.8h, v29.8h sub v26.8h, v26.8h, v30.8h sshr v25.8h, v25.8h, #1 sshr v26.8h, v26.8h, #1 ldr q0, [x2, #416] ldr q2, [x2, #432] ldr q1, [x3, #416] ldr q3, [x3, #432] mov v29.16b, v17.16b mov v30.16b, v19.16b trn1 v17.4s, v17.4s, v18.4s trn1 v19.4s, v19.4s, v20.4s trn2 v18.4s, v29.4s, v18.4s trn2 v20.4s, v30.4s, v20.4s mul v29.8h, v18.8h, v1.8h mul v30.8h, v20.8h, v3.8h sqrdmulh v27.8h, v18.8h, v0.8h sqrdmulh v28.8h, v20.8h, v2.8h sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v27.8h, v27.8h, v29.8h sub v28.8h, v28.8h, v30.8h sshr v27.8h, v27.8h, #1 sshr v28.8h, v28.8h, #1 sub v6.8h, v5.8h, v21.8h add v5.8h, v5.8h, v21.8h sub v8.8h, v7.8h, v22.8h add v7.8h, v7.8h, v22.8h sub v10.8h, v9.8h, v23.8h add v9.8h, v9.8h, v23.8h sub v12.8h, v11.8h, v24.8h add v11.8h, v11.8h, v24.8h sub v14.8h, 
v13.8h, v25.8h add v13.8h, v13.8h, v25.8h sub v16.8h, v15.8h, v26.8h add v15.8h, v15.8h, v26.8h sub v18.8h, v17.8h, v27.8h add v17.8h, v17.8h, v27.8h sub v20.8h, v19.8h, v28.8h add v19.8h, v19.8h, v28.8h sqdmulh v21.8h, v5.8h, v4.h[2] sqdmulh v22.8h, v6.8h, v4.h[2] sshr v21.8h, v21.8h, #11 sshr v22.8h, v22.8h, #11 mls v5.8h, v21.8h, v4.h[0] mls v6.8h, v22.8h, v4.h[0] sqdmulh v21.8h, v7.8h, v4.h[2] sqdmulh v22.8h, v8.8h, v4.h[2] sshr v21.8h, v21.8h, #11 sshr v22.8h, v22.8h, #11 mls v7.8h, v21.8h, v4.h[0] mls v8.8h, v22.8h, v4.h[0] sqdmulh v21.8h, v9.8h, v4.h[2] sqdmulh v22.8h, v10.8h, v4.h[2] sshr v21.8h, v21.8h, #11 sshr v22.8h, v22.8h, #11 mls v9.8h, v21.8h, v4.h[0] mls v10.8h, v22.8h, v4.h[0] sqdmulh v21.8h, v11.8h, v4.h[2] sqdmulh v22.8h, v12.8h, v4.h[2] sshr v21.8h, v21.8h, #11 sshr v22.8h, v22.8h, #11 mls v11.8h, v21.8h, v4.h[0] mls v12.8h, v22.8h, v4.h[0] sqdmulh v21.8h, v13.8h, v4.h[2] sqdmulh v22.8h, v14.8h, v4.h[2] sshr v21.8h, v21.8h, #11 sshr v22.8h, v22.8h, #11 mls v13.8h, v21.8h, v4.h[0] mls v14.8h, v22.8h, v4.h[0] sqdmulh v21.8h, v15.8h, v4.h[2] sqdmulh v22.8h, v16.8h, v4.h[2] sshr v21.8h, v21.8h, #11 sshr v22.8h, v22.8h, #11 mls v15.8h, v21.8h, v4.h[0] mls v16.8h, v22.8h, v4.h[0] sqdmulh v21.8h, v17.8h, v4.h[2] sqdmulh v22.8h, v18.8h, v4.h[2] sshr v21.8h, v21.8h, #11 sshr v22.8h, v22.8h, #11 mls v17.8h, v21.8h, v4.h[0] mls v18.8h, v22.8h, v4.h[0] sqdmulh v21.8h, v19.8h, v4.h[2] sqdmulh v22.8h, v20.8h, v4.h[2] sshr v21.8h, v21.8h, #11 sshr v22.8h, v22.8h, #11 mls v19.8h, v21.8h, v4.h[0] mls v20.8h, v22.8h, v4.h[0] mov v29.16b, v5.16b trn1 v5.4s, v5.4s, v6.4s trn2 v6.4s, v29.4s, v6.4s mov v29.16b, v5.16b trn1 v5.2d, v5.2d, v6.2d trn2 v6.2d, v29.2d, v6.2d mov v29.16b, v7.16b trn1 v7.4s, v7.4s, v8.4s trn2 v8.4s, v29.4s, v8.4s mov v29.16b, v7.16b trn1 v7.2d, v7.2d, v8.2d trn2 v8.2d, v29.2d, v8.2d mov v29.16b, v9.16b trn1 v9.4s, v9.4s, v10.4s trn2 v10.4s, v29.4s, v10.4s mov v29.16b, v9.16b trn1 v9.2d, v9.2d, v10.2d trn2 v10.2d, v29.2d, v10.2d mov v29.16b, 
v11.16b trn1 v11.4s, v11.4s, v12.4s trn2 v12.4s, v29.4s, v12.4s mov v29.16b, v11.16b trn1 v11.2d, v11.2d, v12.2d trn2 v12.2d, v29.2d, v12.2d mov v29.16b, v13.16b trn1 v13.4s, v13.4s, v14.4s trn2 v14.4s, v29.4s, v14.4s mov v29.16b, v13.16b trn1 v13.2d, v13.2d, v14.2d trn2 v14.2d, v29.2d, v14.2d mov v29.16b, v15.16b trn1 v15.4s, v15.4s, v16.4s trn2 v16.4s, v29.4s, v16.4s mov v29.16b, v15.16b trn1 v15.2d, v15.2d, v16.2d trn2 v16.2d, v29.2d, v16.2d mov v29.16b, v17.16b trn1 v17.4s, v17.4s, v18.4s trn2 v18.4s, v29.4s, v18.4s mov v29.16b, v17.16b trn1 v17.2d, v17.2d, v18.2d trn2 v18.2d, v29.2d, v18.2d mov v29.16b, v19.16b trn1 v19.4s, v19.4s, v20.4s trn2 v20.4s, v29.4s, v20.4s mov v29.16b, v19.16b trn1 v19.2d, v19.2d, v20.2d trn2 v20.2d, v29.2d, v20.2d stp q5, q6, [x0] stp q7, q8, [x0, #32] stp q9, q10, [x0, #64] stp q11, q12, [x0, #96] stp q13, q14, [x0, #128] stp q15, q16, [x0, #160] stp q17, q18, [x0, #192] stp q19, q20, [x0, #224] ldp q5, q6, [x1] ldp q7, q8, [x1, #32] ldp q9, q10, [x1, #64] ldp q11, q12, [x1, #96] ldp q13, q14, [x1, #128] ldp q15, q16, [x1, #160] ldp q17, q18, [x1, #192] ldp q19, q20, [x1, #224] ldr q0, [x2, #48] ldr q1, [x3, #48] mul v29.8h, v6.8h, v1.h[0] mul v30.8h, v8.8h, v1.h[1] sqrdmulh v21.8h, v6.8h, v0.h[0] sqrdmulh v22.8h, v8.8h, v0.h[1] sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v21.8h, v21.8h, v29.8h sub v22.8h, v22.8h, v30.8h sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 mul v29.8h, v10.8h, v1.h[2] mul v30.8h, v12.8h, v1.h[3] sqrdmulh v23.8h, v10.8h, v0.h[2] sqrdmulh v24.8h, v12.8h, v0.h[3] sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v23.8h, v23.8h, v29.8h sub v24.8h, v24.8h, v30.8h sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 mul v29.8h, v14.8h, v1.h[4] mul v30.8h, v16.8h, v1.h[5] sqrdmulh v25.8h, v14.8h, v0.h[4] sqrdmulh v26.8h, v16.8h, v0.h[5] sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v25.8h, v25.8h, v29.8h sub v26.8h, v26.8h, v30.8h sshr v25.8h, 
v25.8h, #1 sshr v26.8h, v26.8h, #1 mul v29.8h, v18.8h, v1.h[6] mul v30.8h, v20.8h, v1.h[7] sqrdmulh v27.8h, v18.8h, v0.h[6] sqrdmulh v28.8h, v20.8h, v0.h[7] sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v27.8h, v27.8h, v29.8h sub v28.8h, v28.8h, v30.8h sshr v27.8h, v27.8h, #1 sshr v28.8h, v28.8h, #1 sub v6.8h, v5.8h, v21.8h add v5.8h, v5.8h, v21.8h sub v8.8h, v7.8h, v22.8h add v7.8h, v7.8h, v22.8h sub v10.8h, v9.8h, v23.8h add v9.8h, v9.8h, v23.8h sub v12.8h, v11.8h, v24.8h add v11.8h, v11.8h, v24.8h sub v14.8h, v13.8h, v25.8h add v13.8h, v13.8h, v25.8h sub v16.8h, v15.8h, v26.8h add v15.8h, v15.8h, v26.8h sub v18.8h, v17.8h, v27.8h add v17.8h, v17.8h, v27.8h sub v20.8h, v19.8h, v28.8h add v19.8h, v19.8h, v28.8h ldr q0, [x2, #192] ldr q2, [x2, #208] ldr q1, [x3, #192] ldr q3, [x3, #208] mov v29.16b, v5.16b mov v30.16b, v7.16b trn1 v5.2d, v5.2d, v6.2d trn1 v7.2d, v7.2d, v8.2d trn2 v6.2d, v29.2d, v6.2d trn2 v8.2d, v30.2d, v8.2d mul v29.8h, v6.8h, v1.8h mul v30.8h, v8.8h, v3.8h sqrdmulh v21.8h, v6.8h, v0.8h sqrdmulh v22.8h, v8.8h, v2.8h sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v21.8h, v21.8h, v29.8h sub v22.8h, v22.8h, v30.8h sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 ldr q0, [x2, #224] ldr q2, [x2, #240] ldr q1, [x3, #224] ldr q3, [x3, #240] mov v29.16b, v9.16b mov v30.16b, v11.16b trn1 v9.2d, v9.2d, v10.2d trn1 v11.2d, v11.2d, v12.2d trn2 v10.2d, v29.2d, v10.2d trn2 v12.2d, v30.2d, v12.2d mul v29.8h, v10.8h, v1.8h mul v30.8h, v12.8h, v3.8h sqrdmulh v23.8h, v10.8h, v0.8h sqrdmulh v24.8h, v12.8h, v2.8h sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v23.8h, v23.8h, v29.8h sub v24.8h, v24.8h, v30.8h sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 ldr q0, [x2, #256] ldr q2, [x2, #272] ldr q1, [x3, #256] ldr q3, [x3, #272] mov v29.16b, v13.16b mov v30.16b, v15.16b trn1 v13.2d, v13.2d, v14.2d trn1 v15.2d, v15.2d, v16.2d trn2 v14.2d, v29.2d, v14.2d trn2 v16.2d, v30.2d, v16.2d mul v29.8h, 
v14.8h, v1.8h mul v30.8h, v16.8h, v3.8h sqrdmulh v25.8h, v14.8h, v0.8h sqrdmulh v26.8h, v16.8h, v2.8h sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v25.8h, v25.8h, v29.8h sub v26.8h, v26.8h, v30.8h sshr v25.8h, v25.8h, #1 sshr v26.8h, v26.8h, #1 ldr q0, [x2, #288] ldr q2, [x2, #304] ldr q1, [x3, #288] ldr q3, [x3, #304] mov v29.16b, v17.16b mov v30.16b, v19.16b trn1 v17.2d, v17.2d, v18.2d trn1 v19.2d, v19.2d, v20.2d trn2 v18.2d, v29.2d, v18.2d trn2 v20.2d, v30.2d, v20.2d mul v29.8h, v18.8h, v1.8h mul v30.8h, v20.8h, v3.8h sqrdmulh v27.8h, v18.8h, v0.8h sqrdmulh v28.8h, v20.8h, v2.8h sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v27.8h, v27.8h, v29.8h sub v28.8h, v28.8h, v30.8h sshr v27.8h, v27.8h, #1 sshr v28.8h, v28.8h, #1 sub v6.8h, v5.8h, v21.8h add v5.8h, v5.8h, v21.8h sub v8.8h, v7.8h, v22.8h add v7.8h, v7.8h, v22.8h sub v10.8h, v9.8h, v23.8h add v9.8h, v9.8h, v23.8h sub v12.8h, v11.8h, v24.8h add v11.8h, v11.8h, v24.8h sub v14.8h, v13.8h, v25.8h add v13.8h, v13.8h, v25.8h sub v16.8h, v15.8h, v26.8h add v15.8h, v15.8h, v26.8h sub v18.8h, v17.8h, v27.8h add v17.8h, v17.8h, v27.8h sub v20.8h, v19.8h, v28.8h add v19.8h, v19.8h, v28.8h ldr q0, [x2, #448] ldr q2, [x2, #464] ldr q1, [x3, #448] ldr q3, [x3, #464] mov v29.16b, v5.16b mov v30.16b, v7.16b trn1 v5.4s, v5.4s, v6.4s trn1 v7.4s, v7.4s, v8.4s trn2 v6.4s, v29.4s, v6.4s trn2 v8.4s, v30.4s, v8.4s mul v29.8h, v6.8h, v1.8h mul v30.8h, v8.8h, v3.8h sqrdmulh v21.8h, v6.8h, v0.8h sqrdmulh v22.8h, v8.8h, v2.8h sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v21.8h, v21.8h, v29.8h sub v22.8h, v22.8h, v30.8h sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 ldr q0, [x2, #480] ldr q2, [x2, #496] ldr q1, [x3, #480] ldr q3, [x3, #496] mov v29.16b, v9.16b mov v30.16b, v11.16b trn1 v9.4s, v9.4s, v10.4s trn1 v11.4s, v11.4s, v12.4s trn2 v10.4s, v29.4s, v10.4s trn2 v12.4s, v30.4s, v12.4s mul v29.8h, v10.8h, v1.8h mul v30.8h, v12.8h, v3.8h sqrdmulh v23.8h, 
v10.8h, v0.8h sqrdmulh v24.8h, v12.8h, v2.8h sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v23.8h, v23.8h, v29.8h sub v24.8h, v24.8h, v30.8h sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 ldr q0, [x2, #512] ldr q2, [x2, #528] ldr q1, [x3, #512] ldr q3, [x3, #528] mov v29.16b, v13.16b mov v30.16b, v15.16b trn1 v13.4s, v13.4s, v14.4s trn1 v15.4s, v15.4s, v16.4s trn2 v14.4s, v29.4s, v14.4s trn2 v16.4s, v30.4s, v16.4s mul v29.8h, v14.8h, v1.8h mul v30.8h, v16.8h, v3.8h sqrdmulh v25.8h, v14.8h, v0.8h sqrdmulh v26.8h, v16.8h, v2.8h sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v25.8h, v25.8h, v29.8h sub v26.8h, v26.8h, v30.8h sshr v25.8h, v25.8h, #1 sshr v26.8h, v26.8h, #1 ldr q0, [x2, #544] ldr q2, [x2, #560] ldr q1, [x3, #544] ldr q3, [x3, #560] mov v29.16b, v17.16b mov v30.16b, v19.16b trn1 v17.4s, v17.4s, v18.4s trn1 v19.4s, v19.4s, v20.4s trn2 v18.4s, v29.4s, v18.4s trn2 v20.4s, v30.4s, v20.4s mul v29.8h, v18.8h, v1.8h mul v30.8h, v20.8h, v3.8h sqrdmulh v27.8h, v18.8h, v0.8h sqrdmulh v28.8h, v20.8h, v2.8h sqrdmulh v29.8h, v29.8h, v4.h[0] sqrdmulh v30.8h, v30.8h, v4.h[0] sub v27.8h, v27.8h, v29.8h sub v28.8h, v28.8h, v30.8h sshr v27.8h, v27.8h, #1 sshr v28.8h, v28.8h, #1 sub v6.8h, v5.8h, v21.8h add v5.8h, v5.8h, v21.8h sub v8.8h, v7.8h, v22.8h add v7.8h, v7.8h, v22.8h sub v10.8h, v9.8h, v23.8h add v9.8h, v9.8h, v23.8h sub v12.8h, v11.8h, v24.8h add v11.8h, v11.8h, v24.8h sub v14.8h, v13.8h, v25.8h add v13.8h, v13.8h, v25.8h sub v16.8h, v15.8h, v26.8h add v15.8h, v15.8h, v26.8h sub v18.8h, v17.8h, v27.8h add v17.8h, v17.8h, v27.8h sub v20.8h, v19.8h, v28.8h add v19.8h, v19.8h, v28.8h sqdmulh v21.8h, v5.8h, v4.h[2] sqdmulh v22.8h, v6.8h, v4.h[2] sshr v21.8h, v21.8h, #11 sshr v22.8h, v22.8h, #11 mls v5.8h, v21.8h, v4.h[0] mls v6.8h, v22.8h, v4.h[0] sqdmulh v21.8h, v7.8h, v4.h[2] sqdmulh v22.8h, v8.8h, v4.h[2] sshr v21.8h, v21.8h, #11 sshr v22.8h, v22.8h, #11 mls v7.8h, v21.8h, v4.h[0] mls v8.8h, v22.8h, v4.h[0] sqdmulh 
v21.8h, v9.8h, v4.h[2] sqdmulh v22.8h, v10.8h, v4.h[2] sshr v21.8h, v21.8h, #11 sshr v22.8h, v22.8h, #11 mls v9.8h, v21.8h, v4.h[0] mls v10.8h, v22.8h, v4.h[0] sqdmulh v21.8h, v11.8h, v4.h[2] sqdmulh v22.8h, v12.8h, v4.h[2] sshr v21.8h, v21.8h, #11 sshr v22.8h, v22.8h, #11 mls v11.8h, v21.8h, v4.h[0] mls v12.8h, v22.8h, v4.h[0] sqdmulh v21.8h, v13.8h, v4.h[2] sqdmulh v22.8h, v14.8h, v4.h[2] sshr v21.8h, v21.8h, #11 sshr v22.8h, v22.8h, #11 mls v13.8h, v21.8h, v4.h[0] mls v14.8h, v22.8h, v4.h[0] sqdmulh v21.8h, v15.8h, v4.h[2] sqdmulh v22.8h, v16.8h, v4.h[2] sshr v21.8h, v21.8h, #11 sshr v22.8h, v22.8h, #11 mls v15.8h, v21.8h, v4.h[0] mls v16.8h, v22.8h, v4.h[0] sqdmulh v21.8h, v17.8h, v4.h[2] sqdmulh v22.8h, v18.8h, v4.h[2] sshr v21.8h, v21.8h, #11 sshr v22.8h, v22.8h, #11 mls v17.8h, v21.8h, v4.h[0] mls v18.8h, v22.8h, v4.h[0] sqdmulh v21.8h, v19.8h, v4.h[2] sqdmulh v22.8h, v20.8h, v4.h[2] sshr v21.8h, v21.8h, #11 sshr v22.8h, v22.8h, #11 mls v19.8h, v21.8h, v4.h[0] mls v20.8h, v22.8h, v4.h[0] mov v29.16b, v5.16b trn1 v5.4s, v5.4s, v6.4s trn2 v6.4s, v29.4s, v6.4s mov v29.16b, v5.16b trn1 v5.2d, v5.2d, v6.2d trn2 v6.2d, v29.2d, v6.2d mov v29.16b, v7.16b trn1 v7.4s, v7.4s, v8.4s trn2 v8.4s, v29.4s, v8.4s mov v29.16b, v7.16b trn1 v7.2d, v7.2d, v8.2d trn2 v8.2d, v29.2d, v8.2d mov v29.16b, v9.16b trn1 v9.4s, v9.4s, v10.4s trn2 v10.4s, v29.4s, v10.4s mov v29.16b, v9.16b trn1 v9.2d, v9.2d, v10.2d trn2 v10.2d, v29.2d, v10.2d mov v29.16b, v11.16b trn1 v11.4s, v11.4s, v12.4s trn2 v12.4s, v29.4s, v12.4s mov v29.16b, v11.16b trn1 v11.2d, v11.2d, v12.2d trn2 v12.2d, v29.2d, v12.2d mov v29.16b, v13.16b trn1 v13.4s, v13.4s, v14.4s trn2 v14.4s, v29.4s, v14.4s mov v29.16b, v13.16b trn1 v13.2d, v13.2d, v14.2d trn2 v14.2d, v29.2d, v14.2d mov v29.16b, v15.16b trn1 v15.4s, v15.4s, v16.4s trn2 v16.4s, v29.4s, v16.4s mov v29.16b, v15.16b trn1 v15.2d, v15.2d, v16.2d trn2 v16.2d, v29.2d, v16.2d mov v29.16b, v17.16b trn1 v17.4s, v17.4s, v18.4s trn2 v18.4s, v29.4s, v18.4s mov v29.16b, 
v17.16b trn1 v17.2d, v17.2d, v18.2d trn2 v18.2d, v29.2d, v18.2d mov v29.16b, v19.16b trn1 v19.4s, v19.4s, v20.4s trn2 v20.4s, v29.4s, v20.4s mov v29.16b, v19.16b trn1 v19.2d, v19.2d, v20.2d trn2 v20.2d, v29.2d, v20.2d stp q5, q6, [x1] stp q7, q8, [x1, #32] stp q9, q10, [x1, #64] stp q11, q12, [x1, #96] stp q13, q14, [x1, #128] stp q15, q16, [x1, #160] stp q17, q18, [x1, #192] stp q19, q20, [x1, #224] ldp d8, d9, [x29, #16] ldp d10, d11, [x29, #32] ldp d12, d13, [x29, #48] ldp d14, d15, [x29, #64] ldp x29, x30, [sp], #0x50 ret #ifndef __APPLE__ .size mlkem_ntt,.-mlkem_ntt #endif /* __APPLE__ */ /* L_mlkem_aarch64_zetas_inv: 288 16-bit constants (576 bytes, matching the .size directive) that mlkem_invntt reads through x2. Layout as consumed there: bytes 0-255 are 16 rows in which every value is stored twice in succession; bytes 256-511 are 16 rows in which every value is stored four times in succession; bytes 512-575 are 32 single values. The first two regions are loaded as whole q registers and used as full-vector sqrdmulh operands; the last region is loaded at offsets 512, 528, 544 and 560 and used by element (h[0]..h[7]). Each entry must stay paired with the entry at the same index of L_mlkem_aarch64_zetas_inv_qinv, which the code loads at identical offsets through x3. The table is emitted into .rodata on non-Apple targets and into __DATA,__data on Apple, with 8-byte alignment. NOTE(review): the table is fetched with 16-byte ldr q loads but is only 8-byte aligned; presumably acceptable because ldr q does not demand 16-byte alignment by default - confirm for the target configuration. */ #ifndef __APPLE__ .text .section .rodata .type L_mlkem_aarch64_zetas_inv, %object .size L_mlkem_aarch64_zetas_inv, 576 #else .section __DATA,__data #endif /* __APPLE__ */ # 8-byte aligned, 64-bit aligned #ifndef __APPLE__ .align 3 #else .p2align 3 #endif /* __APPLE__ */ L_mlkem_aarch64_zetas_inv: .short 0x06a5,0x06a5,0x070f,0x070f,0x05b4,0x05b4,0x0943,0x0943 .short 0x0922,0x0922,0x091d,0x091d,0x0134,0x0134,0x006c,0x006c .short 0x0b23,0x0b23,0x0366,0x0366,0x0356,0x0356,0x05e6,0x05e6 .short 0x09e7,0x09e7,0x04fe,0x04fe,0x05fa,0x05fa,0x04a1,0x04a1 .short 0x067b,0x067b,0x04a3,0x04a3,0x0c25,0x0c25,0x036a,0x036a .short 0x0537,0x0537,0x083f,0x083f,0x0088,0x0088,0x04bf,0x04bf .short 0x0b81,0x0b81,0x05b9,0x05b9,0x0505,0x0505,0x07d7,0x07d7 .short 0x0a9f,0x0a9f,0x0aa6,0x0aa6,0x08b8,0x08b8,0x09d0,0x09d0 .short 0x004b,0x004b,0x009c,0x009c,0x0bb8,0x0bb8,0x0b5f,0x0b5f .short 0x0ba4,0x0ba4,0x0368,0x0368,0x0a7d,0x0a7d,0x0636,0x0636 .short 0x08a2,0x08a2,0x025a,0x025a,0x0736,0x0736,0x0309,0x0309 .short 0x0093,0x0093,0x087a,0x087a,0x09f7,0x09f7,0x00f6,0x00f6 .short 0x068c,0x068c,0x06db,0x06db,0x01cc,0x01cc,0x0123,0x0123 .short 0x00eb,0x00eb,0x0c50,0x0c50,0x0ab6,0x0ab6,0x0b5b,0x0b5b .short 0x0c98,0x0c98,0x06f3,0x06f3,0x099a,0x099a,0x04e3,0x04e3 .short 0x09b6,0x09b6,0x0ad6,0x0ad6,0x0b53,0x0b53,0x044f,0x044f .short 0x04fb,0x04fb,0x04fb,0x04fb,0x0a5c,0x0a5c,0x0a5c,0x0a5c .short 
0x0429,0x0429,0x0429,0x0429,0x0b41,0x0b41,0x0b41,0x0b41 .short 0x02d5,0x02d5,0x02d5,0x02d5,0x05e4,0x05e4,0x05e4,0x05e4 .short 0x0940,0x0940,0x0940,0x0940,0x018e,0x018e,0x018e,0x018e .short 0x03b7,0x03b7,0x03b7,0x03b7,0x00f7,0x00f7,0x00f7,0x00f7 .short 0x058d,0x058d,0x058d,0x058d,0x0c96,0x0c96,0x0c96,0x0c96 .short 0x09c3,0x09c3,0x09c3,0x09c3,0x010f,0x010f,0x010f,0x010f .short 0x005a,0x005a,0x005a,0x005a,0x0355,0x0355,0x0355,0x0355 .short 0x0744,0x0744,0x0744,0x0744,0x0c83,0x0c83,0x0c83,0x0c83 .short 0x048a,0x048a,0x048a,0x048a,0x0652,0x0652,0x0652,0x0652 .short 0x029a,0x029a,0x029a,0x029a,0x0140,0x0140,0x0140,0x0140 .short 0x0008,0x0008,0x0008,0x0008,0x0afd,0x0afd,0x0afd,0x0afd .short 0x0608,0x0608,0x0608,0x0608,0x011a,0x011a,0x011a,0x011a .short 0x072e,0x072e,0x072e,0x072e,0x050d,0x050d,0x050d,0x050d .short 0x090a,0x090a,0x090a,0x090a,0x0228,0x0228,0x0228,0x0228 .short 0x0a75,0x0a75,0x0a75,0x0a75,0x083a,0x083a,0x083a,0x083a .short 0x0623,0x00cd,0x0b66,0x0606,0x0aa1,0x0a25,0x0908,0x02a9 .short 0x0082,0x0642,0x074f,0x033d,0x0b82,0x0bf9,0x052d,0x0ac4 .short 0x0745,0x05c2,0x04b2,0x093f,0x0c4b,0x06d8,0x0a93,0x00ab .short 0x0c37,0x0be2,0x0773,0x072c,0x05ed,0x0167,0x02f6,0x05a1 /* L_mlkem_aarch64_zetas_inv_qinv: companion table to L_mlkem_aarch64_zetas_inv with the same 288-halfword (576-byte) layout: 16 rows of values stored twice, 16 rows of values stored four times, then 32 single values. mlkem_invntt addresses it through x3 at exactly the offsets it uses for x2, so entry i here must correspond to entry i of zetas_inv. Every entry has the same low byte as its zetas_inv counterpart, and the first entry satisfies 0xa5a5 = (0x06a5 * 0xf301) mod 2^16, where 0xf301 is the second halfword of L_mlkem_aarch64_consts; presumably each entry is the zetas_inv value pre-multiplied by that constant - verify against the generator script before editing either table. In the butterflies these values feed the plain mul, whose product is then passed to sqrdmulh with v8.h[0] (first halfword of L_mlkem_aarch64_consts, 0x0d01), whereas the zetas_inv values feed sqrdmulh directly. The table is emitted into .rodata on non-Apple targets and into __DATA,__data on Apple, with 8-byte alignment. */ #ifndef __APPLE__ .text .section .rodata .type L_mlkem_aarch64_zetas_inv_qinv, %object .size L_mlkem_aarch64_zetas_inv_qinv, 576 #else .section __DATA,__data #endif /* __APPLE__ */ # 8-byte aligned, 64-bit aligned #ifndef __APPLE__ .align 3 #else .p2align 3 #endif /* __APPLE__ */ L_mlkem_aarch64_zetas_inv_qinv: .short 0xa5a5,0xa5a5,0x440f,0x440f,0xe1b4,0xe1b4,0xa243,0xa243 .short 0x4f22,0x4f22,0x901d,0x901d,0x5d34,0x5d34,0x846c,0x846c .short 0x4423,0x4423,0xd566,0xd566,0xa556,0xa556,0x57e6,0x57e6 .short 0x4ee7,0x4ee7,0x1efe,0x1efe,0x53fa,0x53fa,0xd7a1,0xd7a1 .short 0xc77b,0xc77b,0xbda3,0xbda3,0x2b25,0x2b25,0xa16a,0xa16a .short 0x3a37,0x3a37,0xd53f,0xd53f,0x1888,0x1888,0x51bf,0x51bf .short 0x7e81,0x7e81,0xa0b9,0xa0b9,0xc405,0xc405,0x1cd7,0x1cd7 .short 
0xf79f,0xf79f,0x9ca6,0x9ca6,0xb0b8,0xb0b8,0x79d0,0x79d0 .short 0x314b,0x314b,0x149c,0x149c,0xb3b8,0xb3b8,0x385f,0x385f .short 0xb7a4,0xb7a4,0xbb68,0xbb68,0xb17d,0xb17d,0x4836,0x4836 .short 0xcea2,0xcea2,0x705a,0x705a,0x4936,0x4936,0x8e09,0x8e09 .short 0x8993,0x8993,0xd67a,0xd67a,0x7ef7,0x7ef7,0x82f6,0x82f6 .short 0xea8c,0xea8c,0xe7db,0xe7db,0xa5cc,0xa5cc,0x3a23,0x3a23 .short 0x11eb,0x11eb,0xfc50,0xfc50,0xccb6,0xccb6,0x6c5b,0x6c5b .short 0x5498,0x5498,0xaff3,0xaff3,0x379a,0x379a,0x7de3,0x7de3 .short 0xcbb6,0xcbb6,0x2cd6,0x2cd6,0xd453,0xd453,0x014f,0x014f .short 0x45fb,0x45fb,0x45fb,0x45fb,0x5e5c,0x5e5c,0x5e5c,0x5e5c .short 0xef29,0xef29,0xef29,0xef29,0xbe41,0xbe41,0xbe41,0xbe41 .short 0x31d5,0x31d5,0x31d5,0x31d5,0x71e4,0x71e4,0x71e4,0x71e4 .short 0xc940,0xc940,0xc940,0xc940,0xcb8e,0xcb8e,0xcb8e,0xcb8e .short 0xb8b7,0xb8b7,0xb8b7,0xb8b7,0x75f7,0x75f7,0x75f7,0x75f7 .short 0xdc8d,0xdc8d,0xdc8d,0xdc8d,0x6e96,0x6e96,0x6e96,0x6e96 .short 0x22c3,0x22c3,0x22c3,0x22c3,0x3e0f,0x3e0f,0x3e0f,0x3e0f .short 0x6e5a,0x6e5a,0x6e5a,0x6e5a,0xb255,0xb255,0xb255,0xb255 .short 0x9344,0x9344,0x9344,0x9344,0x6583,0x6583,0x6583,0x6583 .short 0x028a,0x028a,0x028a,0x028a,0xdc52,0xdc52,0xdc52,0xdc52 .short 0x309a,0x309a,0x309a,0x309a,0xc140,0xc140,0xc140,0xc140 .short 0x9808,0x9808,0x9808,0x9808,0x31fd,0x31fd,0x31fd,0x31fd .short 0x9e08,0x9e08,0x9e08,0x9e08,0xaf1a,0xaf1a,0xaf1a,0xaf1a .short 0xb12e,0xb12e,0xb12e,0xb12e,0x5c0d,0x5c0d,0x5c0d,0x5c0d .short 0x870a,0x870a,0x870a,0x870a,0xfa28,0xfa28,0xfa28,0xfa28 .short 0x1975,0x1975,0x1975,0x1975,0x163a,0x163a,0x163a,0x163a .short 0x3f23,0x97cd,0xdd66,0xb806,0xdda1,0x2925,0xa108,0x6da9 .short 0x6682,0xac42,0x044f,0xea3d,0x7182,0x66f9,0xbc2d,0x16c4 .short 0x8645,0x2bc2,0xfab2,0xd63f,0x3d4b,0x0ed8,0x9393,0x51ab .short 0x4137,0x91e2,0x3073,0xcb2c,0xfced,0xc667,0x84f6,0xd8a1 #ifndef __APPLE__ .text .globl mlkem_invntt .type mlkem_invntt,@function .align 2 mlkem_invntt: #else .section __TEXT,__text .globl _mlkem_invntt .p2align 2 _mlkem_invntt: #endif 
/* __APPLE__ */ stp x29, x30, [sp, #-80]! add x29, sp, #0 stp d8, d9, [x29, #16] stp d10, d11, [x29, #32] stp d12, d13, [x29, #48] stp d14, d15, [x29, #64] #ifndef __APPLE__ adrp x2, L_mlkem_aarch64_zetas_inv add x2, x2, :lo12:L_mlkem_aarch64_zetas_inv #else adrp x2, L_mlkem_aarch64_zetas_inv@PAGE add x2, x2, L_mlkem_aarch64_zetas_inv@PAGEOFF #endif /* __APPLE__ */ #ifndef __APPLE__ adrp x3, L_mlkem_aarch64_zetas_inv_qinv add x3, x3, :lo12:L_mlkem_aarch64_zetas_inv_qinv #else adrp x3, L_mlkem_aarch64_zetas_inv_qinv@PAGE add x3, x3, L_mlkem_aarch64_zetas_inv_qinv@PAGEOFF #endif /* __APPLE__ */ #ifndef __APPLE__ adrp x4, L_mlkem_aarch64_consts add x4, x4, :lo12:L_mlkem_aarch64_consts #else adrp x4, L_mlkem_aarch64_consts@PAGE add x4, x4, L_mlkem_aarch64_consts@PAGEOFF #endif /* __APPLE__ */ add x1, x0, #0x100 ldr q8, [x4] ldp q9, q10, [x0] ldp q11, q12, [x0, #32] ldp q13, q14, [x0, #64] ldp q15, q16, [x0, #96] ldp q17, q18, [x0, #128] ldp q19, q20, [x0, #160] ldp q21, q22, [x0, #192] ldp q23, q24, [x0, #224] mov v25.16b, v9.16b trn1 v9.2d, v9.2d, v10.2d trn2 v10.2d, v25.2d, v10.2d mov v25.16b, v9.16b trn1 v9.4s, v9.4s, v10.4s trn2 v10.4s, v25.4s, v10.4s mov v25.16b, v11.16b trn1 v11.2d, v11.2d, v12.2d trn2 v12.2d, v25.2d, v12.2d mov v25.16b, v11.16b trn1 v11.4s, v11.4s, v12.4s trn2 v12.4s, v25.4s, v12.4s mov v25.16b, v13.16b trn1 v13.2d, v13.2d, v14.2d trn2 v14.2d, v25.2d, v14.2d mov v25.16b, v13.16b trn1 v13.4s, v13.4s, v14.4s trn2 v14.4s, v25.4s, v14.4s mov v25.16b, v15.16b trn1 v15.2d, v15.2d, v16.2d trn2 v16.2d, v25.2d, v16.2d mov v25.16b, v15.16b trn1 v15.4s, v15.4s, v16.4s trn2 v16.4s, v25.4s, v16.4s mov v25.16b, v17.16b trn1 v17.2d, v17.2d, v18.2d trn2 v18.2d, v25.2d, v18.2d mov v25.16b, v17.16b trn1 v17.4s, v17.4s, v18.4s trn2 v18.4s, v25.4s, v18.4s mov v25.16b, v19.16b trn1 v19.2d, v19.2d, v20.2d trn2 v20.2d, v25.2d, v20.2d mov v25.16b, v19.16b trn1 v19.4s, v19.4s, v20.4s trn2 v20.4s, v25.4s, v20.4s mov v25.16b, v21.16b trn1 v21.2d, v21.2d, v22.2d trn2 
v22.2d, v25.2d, v22.2d mov v25.16b, v21.16b trn1 v21.4s, v21.4s, v22.4s trn2 v22.4s, v25.4s, v22.4s mov v25.16b, v23.16b trn1 v23.2d, v23.2d, v24.2d trn2 v24.2d, v25.2d, v24.2d mov v25.16b, v23.16b trn1 v23.4s, v23.4s, v24.4s trn2 v24.4s, v25.4s, v24.4s ldr q0, [x2] ldr q1, [x2, #16] ldr q2, [x3] ldr q3, [x3, #16] sub v26.8h, v9.8h, v10.8h sub v28.8h, v11.8h, v12.8h add v9.8h, v9.8h, v10.8h add v11.8h, v11.8h, v12.8h mul v25.8h, v26.8h, v2.8h mul v27.8h, v28.8h, v3.8h sqrdmulh v10.8h, v26.8h, v0.8h sqrdmulh v12.8h, v28.8h, v1.8h sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v10.8h, v10.8h, v25.8h sub v12.8h, v12.8h, v27.8h sshr v10.8h, v10.8h, #1 sshr v12.8h, v12.8h, #1 ldr q0, [x2, #32] ldr q1, [x2, #48] ldr q2, [x3, #32] ldr q3, [x3, #48] sub v26.8h, v13.8h, v14.8h sub v28.8h, v15.8h, v16.8h add v13.8h, v13.8h, v14.8h add v15.8h, v15.8h, v16.8h mul v25.8h, v26.8h, v2.8h mul v27.8h, v28.8h, v3.8h sqrdmulh v14.8h, v26.8h, v0.8h sqrdmulh v16.8h, v28.8h, v1.8h sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v14.8h, v14.8h, v25.8h sub v16.8h, v16.8h, v27.8h sshr v14.8h, v14.8h, #1 sshr v16.8h, v16.8h, #1 ldr q0, [x2, #64] ldr q1, [x2, #80] ldr q2, [x3, #64] ldr q3, [x3, #80] sub v26.8h, v17.8h, v18.8h sub v28.8h, v19.8h, v20.8h add v17.8h, v17.8h, v18.8h add v19.8h, v19.8h, v20.8h mul v25.8h, v26.8h, v2.8h mul v27.8h, v28.8h, v3.8h sqrdmulh v18.8h, v26.8h, v0.8h sqrdmulh v20.8h, v28.8h, v1.8h sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v18.8h, v18.8h, v25.8h sub v20.8h, v20.8h, v27.8h sshr v18.8h, v18.8h, #1 sshr v20.8h, v20.8h, #1 ldr q0, [x2, #96] ldr q1, [x2, #112] ldr q2, [x3, #96] ldr q3, [x3, #112] sub v26.8h, v21.8h, v22.8h sub v28.8h, v23.8h, v24.8h add v21.8h, v21.8h, v22.8h add v23.8h, v23.8h, v24.8h mul v25.8h, v26.8h, v2.8h mul v27.8h, v28.8h, v3.8h sqrdmulh v22.8h, v26.8h, v0.8h sqrdmulh v24.8h, v28.8h, v1.8h sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub 
v22.8h, v22.8h, v25.8h sub v24.8h, v24.8h, v27.8h sshr v22.8h, v22.8h, #1 sshr v24.8h, v24.8h, #1 ldr q0, [x2, #256] ldr q1, [x2, #272] ldr q2, [x3, #256] ldr q3, [x3, #272] mov v25.16b, v9.16b mov v26.16b, v11.16b trn1 v9.4s, v9.4s, v10.4s trn1 v11.4s, v11.4s, v12.4s trn2 v10.4s, v25.4s, v10.4s trn2 v12.4s, v26.4s, v12.4s sub v26.8h, v9.8h, v10.8h sub v28.8h, v11.8h, v12.8h add v9.8h, v9.8h, v10.8h add v11.8h, v11.8h, v12.8h mul v25.8h, v26.8h, v2.8h mul v27.8h, v28.8h, v3.8h sqrdmulh v10.8h, v26.8h, v0.8h sqrdmulh v12.8h, v28.8h, v1.8h sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v10.8h, v10.8h, v25.8h sub v12.8h, v12.8h, v27.8h sshr v10.8h, v10.8h, #1 sshr v12.8h, v12.8h, #1 ldr q0, [x2, #288] ldr q1, [x2, #304] ldr q2, [x3, #288] ldr q3, [x3, #304] mov v25.16b, v13.16b mov v26.16b, v15.16b trn1 v13.4s, v13.4s, v14.4s trn1 v15.4s, v15.4s, v16.4s trn2 v14.4s, v25.4s, v14.4s trn2 v16.4s, v26.4s, v16.4s sub v26.8h, v13.8h, v14.8h sub v28.8h, v15.8h, v16.8h add v13.8h, v13.8h, v14.8h add v15.8h, v15.8h, v16.8h mul v25.8h, v26.8h, v2.8h mul v27.8h, v28.8h, v3.8h sqrdmulh v14.8h, v26.8h, v0.8h sqrdmulh v16.8h, v28.8h, v1.8h sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v14.8h, v14.8h, v25.8h sub v16.8h, v16.8h, v27.8h sshr v14.8h, v14.8h, #1 sshr v16.8h, v16.8h, #1 ldr q0, [x2, #320] ldr q1, [x2, #336] ldr q2, [x3, #320] ldr q3, [x3, #336] mov v25.16b, v17.16b mov v26.16b, v19.16b trn1 v17.4s, v17.4s, v18.4s trn1 v19.4s, v19.4s, v20.4s trn2 v18.4s, v25.4s, v18.4s trn2 v20.4s, v26.4s, v20.4s sub v26.8h, v17.8h, v18.8h sub v28.8h, v19.8h, v20.8h add v17.8h, v17.8h, v18.8h add v19.8h, v19.8h, v20.8h mul v25.8h, v26.8h, v2.8h mul v27.8h, v28.8h, v3.8h sqrdmulh v18.8h, v26.8h, v0.8h sqrdmulh v20.8h, v28.8h, v1.8h sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v18.8h, v18.8h, v25.8h sub v20.8h, v20.8h, v27.8h sshr v18.8h, v18.8h, #1 sshr v20.8h, v20.8h, #1 ldr q0, [x2, #352] ldr q1, [x2, #368] ldr q2, 
[x3, #352] ldr q3, [x3, #368] mov v25.16b, v21.16b mov v26.16b, v23.16b trn1 v21.4s, v21.4s, v22.4s trn1 v23.4s, v23.4s, v24.4s trn2 v22.4s, v25.4s, v22.4s trn2 v24.4s, v26.4s, v24.4s sub v26.8h, v21.8h, v22.8h sub v28.8h, v23.8h, v24.8h add v21.8h, v21.8h, v22.8h add v23.8h, v23.8h, v24.8h mul v25.8h, v26.8h, v2.8h mul v27.8h, v28.8h, v3.8h sqrdmulh v22.8h, v26.8h, v0.8h sqrdmulh v24.8h, v28.8h, v1.8h sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v22.8h, v22.8h, v25.8h sub v24.8h, v24.8h, v27.8h sshr v22.8h, v22.8h, #1 sshr v24.8h, v24.8h, #1 ldr q0, [x2, #512] ldr q2, [x3, #512] mov v25.16b, v9.16b mov v26.16b, v11.16b trn1 v9.2d, v9.2d, v10.2d trn1 v11.2d, v11.2d, v12.2d trn2 v10.2d, v25.2d, v10.2d trn2 v12.2d, v26.2d, v12.2d sub v26.8h, v9.8h, v10.8h sub v28.8h, v11.8h, v12.8h add v9.8h, v9.8h, v10.8h add v11.8h, v11.8h, v12.8h mul v25.8h, v26.8h, v2.h[0] mul v27.8h, v28.8h, v2.h[1] sqrdmulh v10.8h, v26.8h, v0.h[0] sqrdmulh v12.8h, v28.8h, v0.h[1] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v10.8h, v10.8h, v25.8h sub v12.8h, v12.8h, v27.8h sshr v10.8h, v10.8h, #1 sshr v12.8h, v12.8h, #1 mov v25.16b, v13.16b mov v26.16b, v15.16b trn1 v13.2d, v13.2d, v14.2d trn1 v15.2d, v15.2d, v16.2d trn2 v14.2d, v25.2d, v14.2d trn2 v16.2d, v26.2d, v16.2d sub v26.8h, v13.8h, v14.8h sub v28.8h, v15.8h, v16.8h add v13.8h, v13.8h, v14.8h add v15.8h, v15.8h, v16.8h mul v25.8h, v26.8h, v2.h[2] mul v27.8h, v28.8h, v2.h[3] sqrdmulh v14.8h, v26.8h, v0.h[2] sqrdmulh v16.8h, v28.8h, v0.h[3] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v14.8h, v14.8h, v25.8h sub v16.8h, v16.8h, v27.8h sshr v14.8h, v14.8h, #1 sshr v16.8h, v16.8h, #1 mov v25.16b, v17.16b mov v26.16b, v19.16b trn1 v17.2d, v17.2d, v18.2d trn1 v19.2d, v19.2d, v20.2d trn2 v18.2d, v25.2d, v18.2d trn2 v20.2d, v26.2d, v20.2d sub v26.8h, v17.8h, v18.8h sub v28.8h, v19.8h, v20.8h add v17.8h, v17.8h, v18.8h add v19.8h, v19.8h, v20.8h mul v25.8h, v26.8h, v2.h[4] 
mul v27.8h, v28.8h, v2.h[5] sqrdmulh v18.8h, v26.8h, v0.h[4] sqrdmulh v20.8h, v28.8h, v0.h[5] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v18.8h, v18.8h, v25.8h sub v20.8h, v20.8h, v27.8h sshr v18.8h, v18.8h, #1 sshr v20.8h, v20.8h, #1 mov v25.16b, v21.16b mov v26.16b, v23.16b trn1 v21.2d, v21.2d, v22.2d trn1 v23.2d, v23.2d, v24.2d trn2 v22.2d, v25.2d, v22.2d trn2 v24.2d, v26.2d, v24.2d sub v26.8h, v21.8h, v22.8h sub v28.8h, v23.8h, v24.8h add v21.8h, v21.8h, v22.8h add v23.8h, v23.8h, v24.8h mul v25.8h, v26.8h, v2.h[6] mul v27.8h, v28.8h, v2.h[7] sqrdmulh v22.8h, v26.8h, v0.h[6] sqrdmulh v24.8h, v28.8h, v0.h[7] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v22.8h, v22.8h, v25.8h sub v24.8h, v24.8h, v27.8h sshr v22.8h, v22.8h, #1 sshr v24.8h, v24.8h, #1 sqdmulh v25.8h, v9.8h, v8.h[2] sqdmulh v26.8h, v11.8h, v8.h[2] sshr v25.8h, v25.8h, #11 sshr v26.8h, v26.8h, #11 mls v9.8h, v25.8h, v8.h[0] mls v11.8h, v26.8h, v8.h[0] sqdmulh v25.8h, v13.8h, v8.h[2] sqdmulh v26.8h, v15.8h, v8.h[2] sshr v25.8h, v25.8h, #11 sshr v26.8h, v26.8h, #11 mls v13.8h, v25.8h, v8.h[0] mls v15.8h, v26.8h, v8.h[0] sqdmulh v25.8h, v17.8h, v8.h[2] sqdmulh v26.8h, v19.8h, v8.h[2] sshr v25.8h, v25.8h, #11 sshr v26.8h, v26.8h, #11 mls v17.8h, v25.8h, v8.h[0] mls v19.8h, v26.8h, v8.h[0] sqdmulh v25.8h, v21.8h, v8.h[2] sqdmulh v26.8h, v23.8h, v8.h[2] sshr v25.8h, v25.8h, #11 sshr v26.8h, v26.8h, #11 mls v21.8h, v25.8h, v8.h[0] mls v23.8h, v26.8h, v8.h[0] stp q9, q10, [x0] stp q11, q12, [x0, #32] stp q13, q14, [x0, #64] stp q15, q16, [x0, #96] stp q17, q18, [x0, #128] stp q19, q20, [x0, #160] stp q21, q22, [x0, #192] stp q23, q24, [x0, #224] ldp q9, q10, [x1] ldp q11, q12, [x1, #32] ldp q13, q14, [x1, #64] ldp q15, q16, [x1, #96] ldp q17, q18, [x1, #128] ldp q19, q20, [x1, #160] ldp q21, q22, [x1, #192] ldp q23, q24, [x1, #224] mov v25.16b, v9.16b trn1 v9.2d, v9.2d, v10.2d trn2 v10.2d, v25.2d, v10.2d mov v25.16b, v9.16b trn1 v9.4s, v9.4s, v10.4s trn2 
v10.4s, v25.4s, v10.4s mov v25.16b, v11.16b trn1 v11.2d, v11.2d, v12.2d trn2 v12.2d, v25.2d, v12.2d mov v25.16b, v11.16b trn1 v11.4s, v11.4s, v12.4s trn2 v12.4s, v25.4s, v12.4s mov v25.16b, v13.16b trn1 v13.2d, v13.2d, v14.2d trn2 v14.2d, v25.2d, v14.2d mov v25.16b, v13.16b trn1 v13.4s, v13.4s, v14.4s trn2 v14.4s, v25.4s, v14.4s mov v25.16b, v15.16b trn1 v15.2d, v15.2d, v16.2d trn2 v16.2d, v25.2d, v16.2d mov v25.16b, v15.16b trn1 v15.4s, v15.4s, v16.4s trn2 v16.4s, v25.4s, v16.4s mov v25.16b, v17.16b trn1 v17.2d, v17.2d, v18.2d trn2 v18.2d, v25.2d, v18.2d mov v25.16b, v17.16b trn1 v17.4s, v17.4s, v18.4s trn2 v18.4s, v25.4s, v18.4s mov v25.16b, v19.16b trn1 v19.2d, v19.2d, v20.2d trn2 v20.2d, v25.2d, v20.2d mov v25.16b, v19.16b trn1 v19.4s, v19.4s, v20.4s trn2 v20.4s, v25.4s, v20.4s mov v25.16b, v21.16b trn1 v21.2d, v21.2d, v22.2d trn2 v22.2d, v25.2d, v22.2d mov v25.16b, v21.16b trn1 v21.4s, v21.4s, v22.4s trn2 v22.4s, v25.4s, v22.4s mov v25.16b, v23.16b trn1 v23.2d, v23.2d, v24.2d trn2 v24.2d, v25.2d, v24.2d mov v25.16b, v23.16b trn1 v23.4s, v23.4s, v24.4s trn2 v24.4s, v25.4s, v24.4s ldr q0, [x2, #128] ldr q1, [x2, #144] ldr q2, [x3, #128] ldr q3, [x3, #144] sub v26.8h, v9.8h, v10.8h sub v28.8h, v11.8h, v12.8h add v9.8h, v9.8h, v10.8h add v11.8h, v11.8h, v12.8h mul v25.8h, v26.8h, v2.8h mul v27.8h, v28.8h, v3.8h sqrdmulh v10.8h, v26.8h, v0.8h sqrdmulh v12.8h, v28.8h, v1.8h sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v10.8h, v10.8h, v25.8h sub v12.8h, v12.8h, v27.8h sshr v10.8h, v10.8h, #1 sshr v12.8h, v12.8h, #1 ldr q0, [x2, #160] ldr q1, [x2, #176] ldr q2, [x3, #160] ldr q3, [x3, #176] sub v26.8h, v13.8h, v14.8h sub v28.8h, v15.8h, v16.8h add v13.8h, v13.8h, v14.8h add v15.8h, v15.8h, v16.8h mul v25.8h, v26.8h, v2.8h mul v27.8h, v28.8h, v3.8h sqrdmulh v14.8h, v26.8h, v0.8h sqrdmulh v16.8h, v28.8h, v1.8h sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v14.8h, v14.8h, v25.8h sub v16.8h, v16.8h, v27.8h sshr v14.8h, 
v14.8h, #1 sshr v16.8h, v16.8h, #1 ldr q0, [x2, #192] ldr q1, [x2, #208] ldr q2, [x3, #192] ldr q3, [x3, #208] sub v26.8h, v17.8h, v18.8h sub v28.8h, v19.8h, v20.8h add v17.8h, v17.8h, v18.8h add v19.8h, v19.8h, v20.8h mul v25.8h, v26.8h, v2.8h mul v27.8h, v28.8h, v3.8h sqrdmulh v18.8h, v26.8h, v0.8h sqrdmulh v20.8h, v28.8h, v1.8h sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v18.8h, v18.8h, v25.8h sub v20.8h, v20.8h, v27.8h sshr v18.8h, v18.8h, #1 sshr v20.8h, v20.8h, #1 ldr q0, [x2, #224] ldr q1, [x2, #240] ldr q2, [x3, #224] ldr q3, [x3, #240] sub v26.8h, v21.8h, v22.8h sub v28.8h, v23.8h, v24.8h add v21.8h, v21.8h, v22.8h add v23.8h, v23.8h, v24.8h mul v25.8h, v26.8h, v2.8h mul v27.8h, v28.8h, v3.8h sqrdmulh v22.8h, v26.8h, v0.8h sqrdmulh v24.8h, v28.8h, v1.8h sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v22.8h, v22.8h, v25.8h sub v24.8h, v24.8h, v27.8h sshr v22.8h, v22.8h, #1 sshr v24.8h, v24.8h, #1 ldr q0, [x2, #384] ldr q1, [x2, #400] ldr q2, [x3, #384] ldr q3, [x3, #400] mov v25.16b, v9.16b mov v26.16b, v11.16b trn1 v9.4s, v9.4s, v10.4s trn1 v11.4s, v11.4s, v12.4s trn2 v10.4s, v25.4s, v10.4s trn2 v12.4s, v26.4s, v12.4s sub v26.8h, v9.8h, v10.8h sub v28.8h, v11.8h, v12.8h add v9.8h, v9.8h, v10.8h add v11.8h, v11.8h, v12.8h mul v25.8h, v26.8h, v2.8h mul v27.8h, v28.8h, v3.8h sqrdmulh v10.8h, v26.8h, v0.8h sqrdmulh v12.8h, v28.8h, v1.8h sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v10.8h, v10.8h, v25.8h sub v12.8h, v12.8h, v27.8h sshr v10.8h, v10.8h, #1 sshr v12.8h, v12.8h, #1 ldr q0, [x2, #416] ldr q1, [x2, #432] ldr q2, [x3, #416] ldr q3, [x3, #432] mov v25.16b, v13.16b mov v26.16b, v15.16b trn1 v13.4s, v13.4s, v14.4s trn1 v15.4s, v15.4s, v16.4s trn2 v14.4s, v25.4s, v14.4s trn2 v16.4s, v26.4s, v16.4s sub v26.8h, v13.8h, v14.8h sub v28.8h, v15.8h, v16.8h add v13.8h, v13.8h, v14.8h add v15.8h, v15.8h, v16.8h mul v25.8h, v26.8h, v2.8h mul v27.8h, v28.8h, v3.8h sqrdmulh v14.8h, v26.8h, 
v0.8h sqrdmulh v16.8h, v28.8h, v1.8h sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v14.8h, v14.8h, v25.8h sub v16.8h, v16.8h, v27.8h sshr v14.8h, v14.8h, #1 sshr v16.8h, v16.8h, #1 ldr q0, [x2, #448] ldr q1, [x2, #464] ldr q2, [x3, #448] ldr q3, [x3, #464] mov v25.16b, v17.16b mov v26.16b, v19.16b trn1 v17.4s, v17.4s, v18.4s trn1 v19.4s, v19.4s, v20.4s trn2 v18.4s, v25.4s, v18.4s trn2 v20.4s, v26.4s, v20.4s sub v26.8h, v17.8h, v18.8h sub v28.8h, v19.8h, v20.8h add v17.8h, v17.8h, v18.8h add v19.8h, v19.8h, v20.8h mul v25.8h, v26.8h, v2.8h mul v27.8h, v28.8h, v3.8h sqrdmulh v18.8h, v26.8h, v0.8h sqrdmulh v20.8h, v28.8h, v1.8h sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v18.8h, v18.8h, v25.8h sub v20.8h, v20.8h, v27.8h sshr v18.8h, v18.8h, #1 sshr v20.8h, v20.8h, #1 ldr q0, [x2, #480] ldr q1, [x2, #496] ldr q2, [x3, #480] ldr q3, [x3, #496] mov v25.16b, v21.16b mov v26.16b, v23.16b trn1 v21.4s, v21.4s, v22.4s trn1 v23.4s, v23.4s, v24.4s trn2 v22.4s, v25.4s, v22.4s trn2 v24.4s, v26.4s, v24.4s sub v26.8h, v21.8h, v22.8h sub v28.8h, v23.8h, v24.8h add v21.8h, v21.8h, v22.8h add v23.8h, v23.8h, v24.8h mul v25.8h, v26.8h, v2.8h mul v27.8h, v28.8h, v3.8h sqrdmulh v22.8h, v26.8h, v0.8h sqrdmulh v24.8h, v28.8h, v1.8h sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v22.8h, v22.8h, v25.8h sub v24.8h, v24.8h, v27.8h sshr v22.8h, v22.8h, #1 sshr v24.8h, v24.8h, #1 ldr q0, [x2, #528] ldr q2, [x3, #528] mov v25.16b, v9.16b mov v26.16b, v11.16b trn1 v9.2d, v9.2d, v10.2d trn1 v11.2d, v11.2d, v12.2d trn2 v10.2d, v25.2d, v10.2d trn2 v12.2d, v26.2d, v12.2d sub v26.8h, v9.8h, v10.8h sub v28.8h, v11.8h, v12.8h add v9.8h, v9.8h, v10.8h add v11.8h, v11.8h, v12.8h mul v25.8h, v26.8h, v2.h[0] mul v27.8h, v28.8h, v2.h[1] sqrdmulh v10.8h, v26.8h, v0.h[0] sqrdmulh v12.8h, v28.8h, v0.h[1] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v10.8h, v10.8h, v25.8h sub v12.8h, v12.8h, v27.8h sshr v10.8h, 
v10.8h, #1 sshr v12.8h, v12.8h, #1 mov v25.16b, v13.16b mov v26.16b, v15.16b trn1 v13.2d, v13.2d, v14.2d trn1 v15.2d, v15.2d, v16.2d trn2 v14.2d, v25.2d, v14.2d trn2 v16.2d, v26.2d, v16.2d sub v26.8h, v13.8h, v14.8h sub v28.8h, v15.8h, v16.8h add v13.8h, v13.8h, v14.8h add v15.8h, v15.8h, v16.8h mul v25.8h, v26.8h, v2.h[2] mul v27.8h, v28.8h, v2.h[3] sqrdmulh v14.8h, v26.8h, v0.h[2] sqrdmulh v16.8h, v28.8h, v0.h[3] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v14.8h, v14.8h, v25.8h sub v16.8h, v16.8h, v27.8h sshr v14.8h, v14.8h, #1 sshr v16.8h, v16.8h, #1 mov v25.16b, v17.16b mov v26.16b, v19.16b trn1 v17.2d, v17.2d, v18.2d trn1 v19.2d, v19.2d, v20.2d trn2 v18.2d, v25.2d, v18.2d trn2 v20.2d, v26.2d, v20.2d sub v26.8h, v17.8h, v18.8h sub v28.8h, v19.8h, v20.8h add v17.8h, v17.8h, v18.8h add v19.8h, v19.8h, v20.8h mul v25.8h, v26.8h, v2.h[4] mul v27.8h, v28.8h, v2.h[5] sqrdmulh v18.8h, v26.8h, v0.h[4] sqrdmulh v20.8h, v28.8h, v0.h[5] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v18.8h, v18.8h, v25.8h sub v20.8h, v20.8h, v27.8h sshr v18.8h, v18.8h, #1 sshr v20.8h, v20.8h, #1 mov v25.16b, v21.16b mov v26.16b, v23.16b trn1 v21.2d, v21.2d, v22.2d trn1 v23.2d, v23.2d, v24.2d trn2 v22.2d, v25.2d, v22.2d trn2 v24.2d, v26.2d, v24.2d sub v26.8h, v21.8h, v22.8h sub v28.8h, v23.8h, v24.8h add v21.8h, v21.8h, v22.8h add v23.8h, v23.8h, v24.8h mul v25.8h, v26.8h, v2.h[6] mul v27.8h, v28.8h, v2.h[7] sqrdmulh v22.8h, v26.8h, v0.h[6] sqrdmulh v24.8h, v28.8h, v0.h[7] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v22.8h, v22.8h, v25.8h sub v24.8h, v24.8h, v27.8h sshr v22.8h, v22.8h, #1 sshr v24.8h, v24.8h, #1 sqdmulh v25.8h, v9.8h, v8.h[2] sqdmulh v26.8h, v11.8h, v8.h[2] sshr v25.8h, v25.8h, #11 sshr v26.8h, v26.8h, #11 mls v9.8h, v25.8h, v8.h[0] mls v11.8h, v26.8h, v8.h[0] sqdmulh v25.8h, v13.8h, v8.h[2] sqdmulh v26.8h, v15.8h, v8.h[2] sshr v25.8h, v25.8h, #11 sshr v26.8h, v26.8h, #11 mls v13.8h, v25.8h, 
v8.h[0] mls v15.8h, v26.8h, v8.h[0] sqdmulh v25.8h, v17.8h, v8.h[2] sqdmulh v26.8h, v19.8h, v8.h[2] sshr v25.8h, v25.8h, #11 sshr v26.8h, v26.8h, #11 mls v17.8h, v25.8h, v8.h[0] mls v19.8h, v26.8h, v8.h[0] sqdmulh v25.8h, v21.8h, v8.h[2] sqdmulh v26.8h, v23.8h, v8.h[2] sshr v25.8h, v25.8h, #11 sshr v26.8h, v26.8h, #11 mls v21.8h, v25.8h, v8.h[0] mls v23.8h, v26.8h, v8.h[0] stp q9, q10, [x1] stp q11, q12, [x1, #32] stp q13, q14, [x1, #64] stp q15, q16, [x1, #96] stp q17, q18, [x1, #128] stp q19, q20, [x1, #160] stp q21, q22, [x1, #192] stp q23, q24, [x1, #224] ldr q4, [x2, #544] ldr q5, [x2, #560] ldr q6, [x3, #544] ldr q7, [x3, #560] ldr q9, [x0] ldr q10, [x0, #32] ldr q11, [x0, #64] ldr q12, [x0, #96] ldr q13, [x0, #128] ldr q14, [x0, #160] ldr q15, [x0, #192] ldr q16, [x0, #224] ldr q17, [x1] ldr q18, [x1, #32] ldr q19, [x1, #64] ldr q20, [x1, #96] ldr q21, [x1, #128] ldr q22, [x1, #160] ldr q23, [x1, #192] ldr q24, [x1, #224] sub v26.8h, v9.8h, v10.8h sub v28.8h, v11.8h, v12.8h add v9.8h, v9.8h, v10.8h add v11.8h, v11.8h, v12.8h mul v25.8h, v26.8h, v6.h[0] mul v27.8h, v28.8h, v6.h[1] sqrdmulh v10.8h, v26.8h, v4.h[0] sqrdmulh v12.8h, v28.8h, v4.h[1] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v10.8h, v10.8h, v25.8h sub v12.8h, v12.8h, v27.8h sshr v10.8h, v10.8h, #1 sshr v12.8h, v12.8h, #1 sub v26.8h, v13.8h, v14.8h sub v28.8h, v15.8h, v16.8h add v13.8h, v13.8h, v14.8h add v15.8h, v15.8h, v16.8h mul v25.8h, v26.8h, v6.h[2] mul v27.8h, v28.8h, v6.h[3] sqrdmulh v14.8h, v26.8h, v4.h[2] sqrdmulh v16.8h, v28.8h, v4.h[3] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v14.8h, v14.8h, v25.8h sub v16.8h, v16.8h, v27.8h sshr v14.8h, v14.8h, #1 sshr v16.8h, v16.8h, #1 sub v26.8h, v17.8h, v18.8h sub v28.8h, v19.8h, v20.8h add v17.8h, v17.8h, v18.8h add v19.8h, v19.8h, v20.8h mul v25.8h, v26.8h, v6.h[4] mul v27.8h, v28.8h, v6.h[5] sqrdmulh v18.8h, v26.8h, v4.h[4] sqrdmulh v20.8h, v28.8h, v4.h[5] sqrdmulh v25.8h, v25.8h, v8.h[0] 
sqrdmulh v27.8h, v27.8h, v8.h[0] sub v18.8h, v18.8h, v25.8h sub v20.8h, v20.8h, v27.8h sshr v18.8h, v18.8h, #1 sshr v20.8h, v20.8h, #1 sub v26.8h, v21.8h, v22.8h sub v28.8h, v23.8h, v24.8h add v21.8h, v21.8h, v22.8h add v23.8h, v23.8h, v24.8h mul v25.8h, v26.8h, v6.h[6] mul v27.8h, v28.8h, v6.h[7] sqrdmulh v22.8h, v26.8h, v4.h[6] sqrdmulh v24.8h, v28.8h, v4.h[7] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v22.8h, v22.8h, v25.8h sub v24.8h, v24.8h, v27.8h sshr v22.8h, v22.8h, #1 sshr v24.8h, v24.8h, #1 sub v26.8h, v9.8h, v11.8h sub v28.8h, v10.8h, v12.8h add v9.8h, v9.8h, v11.8h add v10.8h, v10.8h, v12.8h mul v25.8h, v26.8h, v7.h[0] mul v27.8h, v28.8h, v7.h[0] sqrdmulh v11.8h, v26.8h, v5.h[0] sqrdmulh v12.8h, v28.8h, v5.h[0] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v11.8h, v11.8h, v25.8h sub v12.8h, v12.8h, v27.8h sshr v11.8h, v11.8h, #1 sshr v12.8h, v12.8h, #1 sub v26.8h, v13.8h, v15.8h sub v28.8h, v14.8h, v16.8h add v13.8h, v13.8h, v15.8h add v14.8h, v14.8h, v16.8h mul v25.8h, v26.8h, v7.h[1] mul v27.8h, v28.8h, v7.h[1] sqrdmulh v15.8h, v26.8h, v5.h[1] sqrdmulh v16.8h, v28.8h, v5.h[1] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v15.8h, v15.8h, v25.8h sub v16.8h, v16.8h, v27.8h sshr v15.8h, v15.8h, #1 sshr v16.8h, v16.8h, #1 sub v26.8h, v17.8h, v19.8h sub v28.8h, v18.8h, v20.8h add v17.8h, v17.8h, v19.8h add v18.8h, v18.8h, v20.8h mul v25.8h, v26.8h, v7.h[2] mul v27.8h, v28.8h, v7.h[2] sqrdmulh v19.8h, v26.8h, v5.h[2] sqrdmulh v20.8h, v28.8h, v5.h[2] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v19.8h, v19.8h, v25.8h sub v20.8h, v20.8h, v27.8h sshr v19.8h, v19.8h, #1 sshr v20.8h, v20.8h, #1 sub v26.8h, v21.8h, v23.8h sub v28.8h, v22.8h, v24.8h add v21.8h, v21.8h, v23.8h add v22.8h, v22.8h, v24.8h mul v25.8h, v26.8h, v7.h[3] mul v27.8h, v28.8h, v7.h[3] sqrdmulh v23.8h, v26.8h, v5.h[3] sqrdmulh v24.8h, v28.8h, v5.h[3] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh 
v27.8h, v27.8h, v8.h[0] sub v23.8h, v23.8h, v25.8h sub v24.8h, v24.8h, v27.8h sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 sub v26.8h, v9.8h, v13.8h sub v28.8h, v10.8h, v14.8h add v9.8h, v9.8h, v13.8h add v10.8h, v10.8h, v14.8h mul v25.8h, v26.8h, v7.h[4] mul v27.8h, v28.8h, v7.h[4] sqrdmulh v13.8h, v26.8h, v5.h[4] sqrdmulh v14.8h, v28.8h, v5.h[4] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v13.8h, v13.8h, v25.8h sub v14.8h, v14.8h, v27.8h sshr v13.8h, v13.8h, #1 sshr v14.8h, v14.8h, #1 sub v26.8h, v11.8h, v15.8h sub v28.8h, v12.8h, v16.8h add v11.8h, v11.8h, v15.8h add v12.8h, v12.8h, v16.8h mul v25.8h, v26.8h, v7.h[4] mul v27.8h, v28.8h, v7.h[4] sqrdmulh v15.8h, v26.8h, v5.h[4] sqrdmulh v16.8h, v28.8h, v5.h[4] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v15.8h, v15.8h, v25.8h sub v16.8h, v16.8h, v27.8h sshr v15.8h, v15.8h, #1 sshr v16.8h, v16.8h, #1 sub v26.8h, v17.8h, v21.8h sub v28.8h, v18.8h, v22.8h add v17.8h, v17.8h, v21.8h add v18.8h, v18.8h, v22.8h mul v25.8h, v26.8h, v7.h[5] mul v27.8h, v28.8h, v7.h[5] sqrdmulh v21.8h, v26.8h, v5.h[5] sqrdmulh v22.8h, v28.8h, v5.h[5] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v21.8h, v21.8h, v25.8h sub v22.8h, v22.8h, v27.8h sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 sub v26.8h, v19.8h, v23.8h sub v28.8h, v20.8h, v24.8h add v19.8h, v19.8h, v23.8h add v20.8h, v20.8h, v24.8h mul v25.8h, v26.8h, v7.h[5] mul v27.8h, v28.8h, v7.h[5] sqrdmulh v23.8h, v26.8h, v5.h[5] sqrdmulh v24.8h, v28.8h, v5.h[5] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v23.8h, v23.8h, v25.8h sub v24.8h, v24.8h, v27.8h sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 sqdmulh v25.8h, v9.8h, v8.h[2] sqdmulh v26.8h, v10.8h, v8.h[2] sshr v25.8h, v25.8h, #11 sshr v26.8h, v26.8h, #11 mls v9.8h, v25.8h, v8.h[0] mls v10.8h, v26.8h, v8.h[0] sqdmulh v25.8h, v11.8h, v8.h[2] sqdmulh v26.8h, v12.8h, v8.h[2] sshr v25.8h, v25.8h, #11 sshr v26.8h, v26.8h, #11 
mls v11.8h, v25.8h, v8.h[0] mls v12.8h, v26.8h, v8.h[0] sqdmulh v25.8h, v17.8h, v8.h[2] sqdmulh v26.8h, v18.8h, v8.h[2] sshr v25.8h, v25.8h, #11 sshr v26.8h, v26.8h, #11 mls v17.8h, v25.8h, v8.h[0] mls v18.8h, v26.8h, v8.h[0] sqdmulh v25.8h, v19.8h, v8.h[2] sqdmulh v26.8h, v20.8h, v8.h[2] sshr v25.8h, v25.8h, #11 sshr v26.8h, v26.8h, #11 mls v19.8h, v25.8h, v8.h[0] mls v20.8h, v26.8h, v8.h[0] sub v26.8h, v9.8h, v17.8h sub v28.8h, v10.8h, v18.8h add v9.8h, v9.8h, v17.8h add v10.8h, v10.8h, v18.8h mul v25.8h, v26.8h, v7.h[6] mul v27.8h, v28.8h, v7.h[6] sqrdmulh v17.8h, v26.8h, v5.h[6] sqrdmulh v18.8h, v28.8h, v5.h[6] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v17.8h, v17.8h, v25.8h sub v18.8h, v18.8h, v27.8h sshr v17.8h, v17.8h, #1 sshr v18.8h, v18.8h, #1 sub v26.8h, v11.8h, v19.8h sub v28.8h, v12.8h, v20.8h add v11.8h, v11.8h, v19.8h add v12.8h, v12.8h, v20.8h mul v25.8h, v26.8h, v7.h[6] mul v27.8h, v28.8h, v7.h[6] sqrdmulh v19.8h, v26.8h, v5.h[6] sqrdmulh v20.8h, v28.8h, v5.h[6] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v19.8h, v19.8h, v25.8h sub v20.8h, v20.8h, v27.8h sshr v19.8h, v19.8h, #1 sshr v20.8h, v20.8h, #1 sub v26.8h, v13.8h, v21.8h sub v28.8h, v14.8h, v22.8h add v13.8h, v13.8h, v21.8h add v14.8h, v14.8h, v22.8h mul v25.8h, v26.8h, v7.h[6] mul v27.8h, v28.8h, v7.h[6] sqrdmulh v21.8h, v26.8h, v5.h[6] sqrdmulh v22.8h, v28.8h, v5.h[6] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v21.8h, v21.8h, v25.8h sub v22.8h, v22.8h, v27.8h sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 sub v26.8h, v15.8h, v23.8h sub v28.8h, v16.8h, v24.8h add v15.8h, v15.8h, v23.8h add v16.8h, v16.8h, v24.8h mul v25.8h, v26.8h, v7.h[6] mul v27.8h, v28.8h, v7.h[6] sqrdmulh v23.8h, v26.8h, v5.h[6] sqrdmulh v24.8h, v28.8h, v5.h[6] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v23.8h, v23.8h, v25.8h sub v24.8h, v24.8h, v27.8h sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 mul v25.8h, 
v9.8h, v7.h[7] mul v26.8h, v10.8h, v7.h[7] sqrdmulh v9.8h, v9.8h, v5.h[7] sqrdmulh v10.8h, v10.8h, v5.h[7] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v26.8h, v26.8h, v8.h[0] sub v9.8h, v9.8h, v25.8h sub v10.8h, v10.8h, v26.8h sshr v9.8h, v9.8h, #1 sshr v10.8h, v10.8h, #1 mul v25.8h, v11.8h, v7.h[7] mul v26.8h, v12.8h, v7.h[7] sqrdmulh v11.8h, v11.8h, v5.h[7] sqrdmulh v12.8h, v12.8h, v5.h[7] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v26.8h, v26.8h, v8.h[0] sub v11.8h, v11.8h, v25.8h sub v12.8h, v12.8h, v26.8h sshr v11.8h, v11.8h, #1 sshr v12.8h, v12.8h, #1 mul v25.8h, v13.8h, v7.h[7] mul v26.8h, v14.8h, v7.h[7] sqrdmulh v13.8h, v13.8h, v5.h[7] sqrdmulh v14.8h, v14.8h, v5.h[7] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v26.8h, v26.8h, v8.h[0] sub v13.8h, v13.8h, v25.8h sub v14.8h, v14.8h, v26.8h sshr v13.8h, v13.8h, #1 sshr v14.8h, v14.8h, #1 mul v25.8h, v15.8h, v7.h[7] mul v26.8h, v16.8h, v7.h[7] sqrdmulh v15.8h, v15.8h, v5.h[7] sqrdmulh v16.8h, v16.8h, v5.h[7] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v26.8h, v26.8h, v8.h[0] sub v15.8h, v15.8h, v25.8h sub v16.8h, v16.8h, v26.8h sshr v15.8h, v15.8h, #1 sshr v16.8h, v16.8h, #1 mul v25.8h, v17.8h, v7.h[7] mul v26.8h, v18.8h, v7.h[7] sqrdmulh v17.8h, v17.8h, v5.h[7] sqrdmulh v18.8h, v18.8h, v5.h[7] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v26.8h, v26.8h, v8.h[0] sub v17.8h, v17.8h, v25.8h sub v18.8h, v18.8h, v26.8h sshr v17.8h, v17.8h, #1 sshr v18.8h, v18.8h, #1 mul v25.8h, v19.8h, v7.h[7] mul v26.8h, v20.8h, v7.h[7] sqrdmulh v19.8h, v19.8h, v5.h[7] sqrdmulh v20.8h, v20.8h, v5.h[7] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v26.8h, v26.8h, v8.h[0] sub v19.8h, v19.8h, v25.8h sub v20.8h, v20.8h, v26.8h sshr v19.8h, v19.8h, #1 sshr v20.8h, v20.8h, #1 mul v25.8h, v21.8h, v7.h[7] mul v26.8h, v22.8h, v7.h[7] sqrdmulh v21.8h, v21.8h, v5.h[7] sqrdmulh v22.8h, v22.8h, v5.h[7] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v26.8h, v26.8h, v8.h[0] sub v21.8h, v21.8h, v25.8h sub v22.8h, v22.8h, v26.8h sshr v21.8h, v21.8h, #1 sshr v22.8h, 
v22.8h, #1 mul v25.8h, v23.8h, v7.h[7] mul v26.8h, v24.8h, v7.h[7] sqrdmulh v23.8h, v23.8h, v5.h[7] sqrdmulh v24.8h, v24.8h, v5.h[7] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v26.8h, v26.8h, v8.h[0] sub v23.8h, v23.8h, v25.8h sub v24.8h, v24.8h, v26.8h sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 str q9, [x0] str q10, [x0, #32] str q11, [x0, #64] str q12, [x0, #96] str q13, [x0, #128] str q14, [x0, #160] str q15, [x0, #192] str q16, [x0, #224] str q17, [x1] str q18, [x1, #32] str q19, [x1, #64] str q20, [x1, #96] str q21, [x1, #128] str q22, [x1, #160] str q23, [x1, #192] str q24, [x1, #224] ldr q9, [x0, #16] ldr q10, [x0, #48] ldr q11, [x0, #80] ldr q12, [x0, #112] ldr q13, [x0, #144] ldr q14, [x0, #176] ldr q15, [x0, #208] ldr q16, [x0, #240] ldr q17, [x1, #16] ldr q18, [x1, #48] ldr q19, [x1, #80] ldr q20, [x1, #112] ldr q21, [x1, #144] ldr q22, [x1, #176] ldr q23, [x1, #208] ldr q24, [x1, #240] sub v26.8h, v9.8h, v10.8h sub v28.8h, v11.8h, v12.8h add v9.8h, v9.8h, v10.8h add v11.8h, v11.8h, v12.8h mul v25.8h, v26.8h, v6.h[0] mul v27.8h, v28.8h, v6.h[1] sqrdmulh v10.8h, v26.8h, v4.h[0] sqrdmulh v12.8h, v28.8h, v4.h[1] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v10.8h, v10.8h, v25.8h sub v12.8h, v12.8h, v27.8h sshr v10.8h, v10.8h, #1 sshr v12.8h, v12.8h, #1 sub v26.8h, v13.8h, v14.8h sub v28.8h, v15.8h, v16.8h add v13.8h, v13.8h, v14.8h add v15.8h, v15.8h, v16.8h mul v25.8h, v26.8h, v6.h[2] mul v27.8h, v28.8h, v6.h[3] sqrdmulh v14.8h, v26.8h, v4.h[2] sqrdmulh v16.8h, v28.8h, v4.h[3] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v14.8h, v14.8h, v25.8h sub v16.8h, v16.8h, v27.8h sshr v14.8h, v14.8h, #1 sshr v16.8h, v16.8h, #1 sub v26.8h, v17.8h, v18.8h sub v28.8h, v19.8h, v20.8h add v17.8h, v17.8h, v18.8h add v19.8h, v19.8h, v20.8h mul v25.8h, v26.8h, v6.h[4] mul v27.8h, v28.8h, v6.h[5] sqrdmulh v18.8h, v26.8h, v4.h[4] sqrdmulh v20.8h, v28.8h, v4.h[5] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, 
v8.h[0] sub v18.8h, v18.8h, v25.8h sub v20.8h, v20.8h, v27.8h sshr v18.8h, v18.8h, #1 sshr v20.8h, v20.8h, #1 sub v26.8h, v21.8h, v22.8h sub v28.8h, v23.8h, v24.8h add v21.8h, v21.8h, v22.8h add v23.8h, v23.8h, v24.8h mul v25.8h, v26.8h, v6.h[6] mul v27.8h, v28.8h, v6.h[7] sqrdmulh v22.8h, v26.8h, v4.h[6] sqrdmulh v24.8h, v28.8h, v4.h[7] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v22.8h, v22.8h, v25.8h sub v24.8h, v24.8h, v27.8h sshr v22.8h, v22.8h, #1 sshr v24.8h, v24.8h, #1 sub v26.8h, v9.8h, v11.8h sub v28.8h, v10.8h, v12.8h add v9.8h, v9.8h, v11.8h add v10.8h, v10.8h, v12.8h mul v25.8h, v26.8h, v7.h[0] mul v27.8h, v28.8h, v7.h[0] sqrdmulh v11.8h, v26.8h, v5.h[0] sqrdmulh v12.8h, v28.8h, v5.h[0] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v11.8h, v11.8h, v25.8h sub v12.8h, v12.8h, v27.8h sshr v11.8h, v11.8h, #1 sshr v12.8h, v12.8h, #1 sub v26.8h, v13.8h, v15.8h sub v28.8h, v14.8h, v16.8h add v13.8h, v13.8h, v15.8h add v14.8h, v14.8h, v16.8h mul v25.8h, v26.8h, v7.h[1] mul v27.8h, v28.8h, v7.h[1] sqrdmulh v15.8h, v26.8h, v5.h[1] sqrdmulh v16.8h, v28.8h, v5.h[1] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v15.8h, v15.8h, v25.8h sub v16.8h, v16.8h, v27.8h sshr v15.8h, v15.8h, #1 sshr v16.8h, v16.8h, #1 sub v26.8h, v17.8h, v19.8h sub v28.8h, v18.8h, v20.8h add v17.8h, v17.8h, v19.8h add v18.8h, v18.8h, v20.8h mul v25.8h, v26.8h, v7.h[2] mul v27.8h, v28.8h, v7.h[2] sqrdmulh v19.8h, v26.8h, v5.h[2] sqrdmulh v20.8h, v28.8h, v5.h[2] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v19.8h, v19.8h, v25.8h sub v20.8h, v20.8h, v27.8h sshr v19.8h, v19.8h, #1 sshr v20.8h, v20.8h, #1 sub v26.8h, v21.8h, v23.8h sub v28.8h, v22.8h, v24.8h add v21.8h, v21.8h, v23.8h add v22.8h, v22.8h, v24.8h mul v25.8h, v26.8h, v7.h[3] mul v27.8h, v28.8h, v7.h[3] sqrdmulh v23.8h, v26.8h, v5.h[3] sqrdmulh v24.8h, v28.8h, v5.h[3] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub 
v23.8h, v23.8h, v25.8h sub v24.8h, v24.8h, v27.8h sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 sub v26.8h, v9.8h, v13.8h sub v28.8h, v10.8h, v14.8h add v9.8h, v9.8h, v13.8h add v10.8h, v10.8h, v14.8h mul v25.8h, v26.8h, v7.h[4] mul v27.8h, v28.8h, v7.h[4] sqrdmulh v13.8h, v26.8h, v5.h[4] sqrdmulh v14.8h, v28.8h, v5.h[4] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v13.8h, v13.8h, v25.8h sub v14.8h, v14.8h, v27.8h sshr v13.8h, v13.8h, #1 sshr v14.8h, v14.8h, #1 sub v26.8h, v11.8h, v15.8h sub v28.8h, v12.8h, v16.8h add v11.8h, v11.8h, v15.8h add v12.8h, v12.8h, v16.8h mul v25.8h, v26.8h, v7.h[4] mul v27.8h, v28.8h, v7.h[4] sqrdmulh v15.8h, v26.8h, v5.h[4] sqrdmulh v16.8h, v28.8h, v5.h[4] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v15.8h, v15.8h, v25.8h sub v16.8h, v16.8h, v27.8h sshr v15.8h, v15.8h, #1 sshr v16.8h, v16.8h, #1 sub v26.8h, v17.8h, v21.8h sub v28.8h, v18.8h, v22.8h add v17.8h, v17.8h, v21.8h add v18.8h, v18.8h, v22.8h mul v25.8h, v26.8h, v7.h[5] mul v27.8h, v28.8h, v7.h[5] sqrdmulh v21.8h, v26.8h, v5.h[5] sqrdmulh v22.8h, v28.8h, v5.h[5] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v21.8h, v21.8h, v25.8h sub v22.8h, v22.8h, v27.8h sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 sub v26.8h, v19.8h, v23.8h sub v28.8h, v20.8h, v24.8h add v19.8h, v19.8h, v23.8h add v20.8h, v20.8h, v24.8h mul v25.8h, v26.8h, v7.h[5] mul v27.8h, v28.8h, v7.h[5] sqrdmulh v23.8h, v26.8h, v5.h[5] sqrdmulh v24.8h, v28.8h, v5.h[5] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v23.8h, v23.8h, v25.8h sub v24.8h, v24.8h, v27.8h sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 sqdmulh v25.8h, v9.8h, v8.h[2] sqdmulh v26.8h, v10.8h, v8.h[2] sshr v25.8h, v25.8h, #11 sshr v26.8h, v26.8h, #11 mls v9.8h, v25.8h, v8.h[0] mls v10.8h, v26.8h, v8.h[0] sqdmulh v25.8h, v11.8h, v8.h[2] sqdmulh v26.8h, v12.8h, v8.h[2] sshr v25.8h, v25.8h, #11 sshr v26.8h, v26.8h, #11 mls v11.8h, v25.8h, v8.h[0] 
mls v12.8h, v26.8h, v8.h[0] sqdmulh v25.8h, v17.8h, v8.h[2] sqdmulh v26.8h, v18.8h, v8.h[2] sshr v25.8h, v25.8h, #11 sshr v26.8h, v26.8h, #11 mls v17.8h, v25.8h, v8.h[0] mls v18.8h, v26.8h, v8.h[0] sqdmulh v25.8h, v19.8h, v8.h[2] sqdmulh v26.8h, v20.8h, v8.h[2] sshr v25.8h, v25.8h, #11 sshr v26.8h, v26.8h, #11 mls v19.8h, v25.8h, v8.h[0] mls v20.8h, v26.8h, v8.h[0] sub v26.8h, v9.8h, v17.8h sub v28.8h, v10.8h, v18.8h add v9.8h, v9.8h, v17.8h add v10.8h, v10.8h, v18.8h mul v25.8h, v26.8h, v7.h[6] mul v27.8h, v28.8h, v7.h[6] sqrdmulh v17.8h, v26.8h, v5.h[6] sqrdmulh v18.8h, v28.8h, v5.h[6] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v17.8h, v17.8h, v25.8h sub v18.8h, v18.8h, v27.8h sshr v17.8h, v17.8h, #1 sshr v18.8h, v18.8h, #1 sub v26.8h, v11.8h, v19.8h sub v28.8h, v12.8h, v20.8h add v11.8h, v11.8h, v19.8h add v12.8h, v12.8h, v20.8h mul v25.8h, v26.8h, v7.h[6] mul v27.8h, v28.8h, v7.h[6] sqrdmulh v19.8h, v26.8h, v5.h[6] sqrdmulh v20.8h, v28.8h, v5.h[6] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v19.8h, v19.8h, v25.8h sub v20.8h, v20.8h, v27.8h sshr v19.8h, v19.8h, #1 sshr v20.8h, v20.8h, #1 sub v26.8h, v13.8h, v21.8h sub v28.8h, v14.8h, v22.8h add v13.8h, v13.8h, v21.8h add v14.8h, v14.8h, v22.8h mul v25.8h, v26.8h, v7.h[6] mul v27.8h, v28.8h, v7.h[6] sqrdmulh v21.8h, v26.8h, v5.h[6] sqrdmulh v22.8h, v28.8h, v5.h[6] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v21.8h, v21.8h, v25.8h sub v22.8h, v22.8h, v27.8h sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 sub v26.8h, v15.8h, v23.8h sub v28.8h, v16.8h, v24.8h add v15.8h, v15.8h, v23.8h add v16.8h, v16.8h, v24.8h mul v25.8h, v26.8h, v7.h[6] mul v27.8h, v28.8h, v7.h[6] sqrdmulh v23.8h, v26.8h, v5.h[6] sqrdmulh v24.8h, v28.8h, v5.h[6] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v27.8h, v27.8h, v8.h[0] sub v23.8h, v23.8h, v25.8h sub v24.8h, v24.8h, v27.8h sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 mul v25.8h, v9.8h, v7.h[7] mul v26.8h, 
v10.8h, v7.h[7] sqrdmulh v9.8h, v9.8h, v5.h[7] sqrdmulh v10.8h, v10.8h, v5.h[7] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v26.8h, v26.8h, v8.h[0] sub v9.8h, v9.8h, v25.8h sub v10.8h, v10.8h, v26.8h sshr v9.8h, v9.8h, #1 sshr v10.8h, v10.8h, #1 mul v25.8h, v11.8h, v7.h[7] mul v26.8h, v12.8h, v7.h[7] sqrdmulh v11.8h, v11.8h, v5.h[7] sqrdmulh v12.8h, v12.8h, v5.h[7] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v26.8h, v26.8h, v8.h[0] sub v11.8h, v11.8h, v25.8h sub v12.8h, v12.8h, v26.8h sshr v11.8h, v11.8h, #1 sshr v12.8h, v12.8h, #1 mul v25.8h, v13.8h, v7.h[7] mul v26.8h, v14.8h, v7.h[7] sqrdmulh v13.8h, v13.8h, v5.h[7] sqrdmulh v14.8h, v14.8h, v5.h[7] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v26.8h, v26.8h, v8.h[0] sub v13.8h, v13.8h, v25.8h sub v14.8h, v14.8h, v26.8h sshr v13.8h, v13.8h, #1 sshr v14.8h, v14.8h, #1 mul v25.8h, v15.8h, v7.h[7] mul v26.8h, v16.8h, v7.h[7] sqrdmulh v15.8h, v15.8h, v5.h[7] sqrdmulh v16.8h, v16.8h, v5.h[7] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v26.8h, v26.8h, v8.h[0] sub v15.8h, v15.8h, v25.8h sub v16.8h, v16.8h, v26.8h sshr v15.8h, v15.8h, #1 sshr v16.8h, v16.8h, #1 mul v25.8h, v17.8h, v7.h[7] mul v26.8h, v18.8h, v7.h[7] sqrdmulh v17.8h, v17.8h, v5.h[7] sqrdmulh v18.8h, v18.8h, v5.h[7] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v26.8h, v26.8h, v8.h[0] sub v17.8h, v17.8h, v25.8h sub v18.8h, v18.8h, v26.8h sshr v17.8h, v17.8h, #1 sshr v18.8h, v18.8h, #1 mul v25.8h, v19.8h, v7.h[7] mul v26.8h, v20.8h, v7.h[7] sqrdmulh v19.8h, v19.8h, v5.h[7] sqrdmulh v20.8h, v20.8h, v5.h[7] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v26.8h, v26.8h, v8.h[0] sub v19.8h, v19.8h, v25.8h sub v20.8h, v20.8h, v26.8h sshr v19.8h, v19.8h, #1 sshr v20.8h, v20.8h, #1 mul v25.8h, v21.8h, v7.h[7] mul v26.8h, v22.8h, v7.h[7] sqrdmulh v21.8h, v21.8h, v5.h[7] sqrdmulh v22.8h, v22.8h, v5.h[7] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v26.8h, v26.8h, v8.h[0] sub v21.8h, v21.8h, v25.8h sub v22.8h, v22.8h, v26.8h sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 mul v25.8h, 
v23.8h, v7.h[7] mul v26.8h, v24.8h, v7.h[7] sqrdmulh v23.8h, v23.8h, v5.h[7] sqrdmulh v24.8h, v24.8h, v5.h[7] sqrdmulh v25.8h, v25.8h, v8.h[0] sqrdmulh v26.8h, v26.8h, v8.h[0] sub v23.8h, v23.8h, v25.8h sub v24.8h, v24.8h, v26.8h sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 str q9, [x0, #16] str q10, [x0, #48] str q11, [x0, #80] str q12, [x0, #112] str q13, [x0, #144] str q14, [x0, #176] str q15, [x0, #208] str q16, [x0, #240] str q17, [x1, #16] str q18, [x1, #48] str q19, [x1, #80] str q20, [x1, #112] str q21, [x1, #144] str q22, [x1, #176] str q23, [x1, #208] str q24, [x1, #240] ldp d8, d9, [x29, #16] ldp d10, d11, [x29, #32] ldp d12, d13, [x29, #48] ldp d14, d15, [x29, #64] ldp x29, x30, [sp], #0x50 ret #ifndef __APPLE__ .size mlkem_invntt,.-mlkem_invntt #endif /* __APPLE__ */ #ifndef WOLFSSL_AARCH64_NO_SQRDMLSH #ifndef __APPLE__ .text .globl mlkem_ntt_sqrdmlsh .type mlkem_ntt_sqrdmlsh,@function .align 2 mlkem_ntt_sqrdmlsh: #else .section __TEXT,__text .globl _mlkem_ntt_sqrdmlsh .p2align 2 _mlkem_ntt_sqrdmlsh: #endif /* __APPLE__ */ stp x29, x30, [sp, #-80]! 
add x29, sp, #0 stp d8, d9, [x29, #16] stp d10, d11, [x29, #32] stp d12, d13, [x29, #48] stp d14, d15, [x29, #64] #ifndef __APPLE__ adrp x2, L_mlkem_aarch64_zetas add x2, x2, :lo12:L_mlkem_aarch64_zetas #else adrp x2, L_mlkem_aarch64_zetas@PAGE add x2, x2, L_mlkem_aarch64_zetas@PAGEOFF #endif /* __APPLE__ */ #ifndef __APPLE__ adrp x3, L_mlkem_aarch64_zetas_qinv add x3, x3, :lo12:L_mlkem_aarch64_zetas_qinv #else adrp x3, L_mlkem_aarch64_zetas_qinv@PAGE add x3, x3, L_mlkem_aarch64_zetas_qinv@PAGEOFF #endif /* __APPLE__ */ #ifndef __APPLE__ adrp x4, L_mlkem_aarch64_consts add x4, x4, :lo12:L_mlkem_aarch64_consts #else adrp x4, L_mlkem_aarch64_consts@PAGE add x4, x4, L_mlkem_aarch64_consts@PAGEOFF #endif /* __APPLE__ */ add x1, x0, #0x100 ldr q4, [x4] ldr q5, [x0] ldr q6, [x0, #32] ldr q7, [x0, #64] ldr q8, [x0, #96] ldr q9, [x0, #128] ldr q10, [x0, #160] ldr q11, [x0, #192] ldr q12, [x0, #224] ldr q13, [x1] ldr q14, [x1, #32] ldr q15, [x1, #64] ldr q16, [x1, #96] ldr q17, [x1, #128] ldr q18, [x1, #160] ldr q19, [x1, #192] ldr q20, [x1, #224] ldr q0, [x2] ldr q1, [x3] mul v29.8h, v13.8h, v1.h[1] mul v30.8h, v14.8h, v1.h[1] sqrdmulh v21.8h, v13.8h, v0.h[1] sqrdmulh v22.8h, v14.8h, v0.h[1] sqrdmlsh v21.8h, v29.8h, v4.h[0] sqrdmlsh v22.8h, v30.8h, v4.h[0] sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 mul v29.8h, v15.8h, v1.h[1] mul v30.8h, v16.8h, v1.h[1] sqrdmulh v23.8h, v15.8h, v0.h[1] sqrdmulh v24.8h, v16.8h, v0.h[1] sqrdmlsh v23.8h, v29.8h, v4.h[0] sqrdmlsh v24.8h, v30.8h, v4.h[0] sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 mul v29.8h, v17.8h, v1.h[1] mul v30.8h, v18.8h, v1.h[1] sqrdmulh v25.8h, v17.8h, v0.h[1] sqrdmulh v26.8h, v18.8h, v0.h[1] sqrdmlsh v25.8h, v29.8h, v4.h[0] sqrdmlsh v26.8h, v30.8h, v4.h[0] sshr v25.8h, v25.8h, #1 sshr v26.8h, v26.8h, #1 mul v29.8h, v19.8h, v1.h[1] mul v30.8h, v20.8h, v1.h[1] sqrdmulh v27.8h, v19.8h, v0.h[1] sqrdmulh v28.8h, v20.8h, v0.h[1] sqrdmlsh v27.8h, v29.8h, v4.h[0] sqrdmlsh v28.8h, v30.8h, v4.h[0] sshr v27.8h, v27.8h, 
#1 sshr v28.8h, v28.8h, #1 sub v13.8h, v5.8h, v21.8h add v5.8h, v5.8h, v21.8h sub v14.8h, v6.8h, v22.8h add v6.8h, v6.8h, v22.8h sub v15.8h, v7.8h, v23.8h add v7.8h, v7.8h, v23.8h sub v16.8h, v8.8h, v24.8h add v8.8h, v8.8h, v24.8h sub v17.8h, v9.8h, v25.8h add v9.8h, v9.8h, v25.8h sub v18.8h, v10.8h, v26.8h add v10.8h, v10.8h, v26.8h sub v19.8h, v11.8h, v27.8h add v11.8h, v11.8h, v27.8h sub v20.8h, v12.8h, v28.8h add v12.8h, v12.8h, v28.8h mul v29.8h, v9.8h, v1.h[2] mul v30.8h, v10.8h, v1.h[2] sqrdmulh v21.8h, v9.8h, v0.h[2] sqrdmulh v22.8h, v10.8h, v0.h[2] sqrdmlsh v21.8h, v29.8h, v4.h[0] sqrdmlsh v22.8h, v30.8h, v4.h[0] sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 mul v29.8h, v11.8h, v1.h[2] mul v30.8h, v12.8h, v1.h[2] sqrdmulh v23.8h, v11.8h, v0.h[2] sqrdmulh v24.8h, v12.8h, v0.h[2] sqrdmlsh v23.8h, v29.8h, v4.h[0] sqrdmlsh v24.8h, v30.8h, v4.h[0] sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 mul v29.8h, v17.8h, v1.h[3] mul v30.8h, v18.8h, v1.h[3] sqrdmulh v25.8h, v17.8h, v0.h[3] sqrdmulh v26.8h, v18.8h, v0.h[3] sqrdmlsh v25.8h, v29.8h, v4.h[0] sqrdmlsh v26.8h, v30.8h, v4.h[0] sshr v25.8h, v25.8h, #1 sshr v26.8h, v26.8h, #1 mul v29.8h, v19.8h, v1.h[3] mul v30.8h, v20.8h, v1.h[3] sqrdmulh v27.8h, v19.8h, v0.h[3] sqrdmulh v28.8h, v20.8h, v0.h[3] sqrdmlsh v27.8h, v29.8h, v4.h[0] sqrdmlsh v28.8h, v30.8h, v4.h[0] sshr v27.8h, v27.8h, #1 sshr v28.8h, v28.8h, #1 sub v9.8h, v5.8h, v21.8h add v5.8h, v5.8h, v21.8h sub v10.8h, v6.8h, v22.8h add v6.8h, v6.8h, v22.8h sub v11.8h, v7.8h, v23.8h add v7.8h, v7.8h, v23.8h sub v12.8h, v8.8h, v24.8h add v8.8h, v8.8h, v24.8h sub v17.8h, v13.8h, v25.8h add v13.8h, v13.8h, v25.8h sub v18.8h, v14.8h, v26.8h add v14.8h, v14.8h, v26.8h sub v19.8h, v15.8h, v27.8h add v15.8h, v15.8h, v27.8h sub v20.8h, v16.8h, v28.8h add v16.8h, v16.8h, v28.8h mul v29.8h, v7.8h, v1.h[4] mul v30.8h, v8.8h, v1.h[4] sqrdmulh v21.8h, v7.8h, v0.h[4] sqrdmulh v22.8h, v8.8h, v0.h[4] sqrdmlsh v21.8h, v29.8h, v4.h[0] sqrdmlsh v22.8h, v30.8h, v4.h[0] sshr 
v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 mul v29.8h, v11.8h, v1.h[5] mul v30.8h, v12.8h, v1.h[5] sqrdmulh v23.8h, v11.8h, v0.h[5] sqrdmulh v24.8h, v12.8h, v0.h[5] sqrdmlsh v23.8h, v29.8h, v4.h[0] sqrdmlsh v24.8h, v30.8h, v4.h[0] sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 mul v29.8h, v15.8h, v1.h[6] mul v30.8h, v16.8h, v1.h[6] sqrdmulh v25.8h, v15.8h, v0.h[6] sqrdmulh v26.8h, v16.8h, v0.h[6] sqrdmlsh v25.8h, v29.8h, v4.h[0] sqrdmlsh v26.8h, v30.8h, v4.h[0] sshr v25.8h, v25.8h, #1 sshr v26.8h, v26.8h, #1 mul v29.8h, v19.8h, v1.h[7] mul v30.8h, v20.8h, v1.h[7] sqrdmulh v27.8h, v19.8h, v0.h[7] sqrdmulh v28.8h, v20.8h, v0.h[7] sqrdmlsh v27.8h, v29.8h, v4.h[0] sqrdmlsh v28.8h, v30.8h, v4.h[0] sshr v27.8h, v27.8h, #1 sshr v28.8h, v28.8h, #1 sub v7.8h, v5.8h, v21.8h add v5.8h, v5.8h, v21.8h sub v8.8h, v6.8h, v22.8h add v6.8h, v6.8h, v22.8h sub v11.8h, v9.8h, v23.8h add v9.8h, v9.8h, v23.8h sub v12.8h, v10.8h, v24.8h add v10.8h, v10.8h, v24.8h sub v15.8h, v13.8h, v25.8h add v13.8h, v13.8h, v25.8h sub v16.8h, v14.8h, v26.8h add v14.8h, v14.8h, v26.8h sub v19.8h, v17.8h, v27.8h add v17.8h, v17.8h, v27.8h sub v20.8h, v18.8h, v28.8h add v18.8h, v18.8h, v28.8h ldr q0, [x2, #16] ldr q1, [x3, #16] mul v29.8h, v6.8h, v1.h[0] mul v30.8h, v8.8h, v1.h[1] sqrdmulh v21.8h, v6.8h, v0.h[0] sqrdmulh v22.8h, v8.8h, v0.h[1] sqrdmlsh v21.8h, v29.8h, v4.h[0] sqrdmlsh v22.8h, v30.8h, v4.h[0] sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 mul v29.8h, v10.8h, v1.h[2] mul v30.8h, v12.8h, v1.h[3] sqrdmulh v23.8h, v10.8h, v0.h[2] sqrdmulh v24.8h, v12.8h, v0.h[3] sqrdmlsh v23.8h, v29.8h, v4.h[0] sqrdmlsh v24.8h, v30.8h, v4.h[0] sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 mul v29.8h, v14.8h, v1.h[4] mul v30.8h, v16.8h, v1.h[5] sqrdmulh v25.8h, v14.8h, v0.h[4] sqrdmulh v26.8h, v16.8h, v0.h[5] sqrdmlsh v25.8h, v29.8h, v4.h[0] sqrdmlsh v26.8h, v30.8h, v4.h[0] sshr v25.8h, v25.8h, #1 sshr v26.8h, v26.8h, #1 mul v29.8h, v18.8h, v1.h[6] mul v30.8h, v20.8h, v1.h[7] sqrdmulh v27.8h, v18.8h, 
v0.h[6] sqrdmulh v28.8h, v20.8h, v0.h[7] sqrdmlsh v27.8h, v29.8h, v4.h[0] sqrdmlsh v28.8h, v30.8h, v4.h[0] sshr v27.8h, v27.8h, #1 sshr v28.8h, v28.8h, #1 sub v6.8h, v5.8h, v21.8h add v5.8h, v5.8h, v21.8h sub v8.8h, v7.8h, v22.8h add v7.8h, v7.8h, v22.8h sub v10.8h, v9.8h, v23.8h add v9.8h, v9.8h, v23.8h sub v12.8h, v11.8h, v24.8h add v11.8h, v11.8h, v24.8h sub v14.8h, v13.8h, v25.8h add v13.8h, v13.8h, v25.8h sub v16.8h, v15.8h, v26.8h add v15.8h, v15.8h, v26.8h sub v18.8h, v17.8h, v27.8h add v17.8h, v17.8h, v27.8h sub v20.8h, v19.8h, v28.8h add v19.8h, v19.8h, v28.8h str q5, [x0] str q6, [x0, #32] str q7, [x0, #64] str q8, [x0, #96] str q9, [x0, #128] str q10, [x0, #160] str q11, [x0, #192] str q12, [x0, #224] str q13, [x1] str q14, [x1, #32] str q15, [x1, #64] str q16, [x1, #96] str q17, [x1, #128] str q18, [x1, #160] str q19, [x1, #192] str q20, [x1, #224] ldr q5, [x0, #16] ldr q6, [x0, #48] ldr q7, [x0, #80] ldr q8, [x0, #112] ldr q9, [x0, #144] ldr q10, [x0, #176] ldr q11, [x0, #208] ldr q12, [x0, #240] ldr q13, [x1, #16] ldr q14, [x1, #48] ldr q15, [x1, #80] ldr q16, [x1, #112] ldr q17, [x1, #144] ldr q18, [x1, #176] ldr q19, [x1, #208] ldr q20, [x1, #240] ldr q0, [x2] ldr q1, [x3] mul v29.8h, v13.8h, v1.h[1] mul v30.8h, v14.8h, v1.h[1] sqrdmulh v21.8h, v13.8h, v0.h[1] sqrdmulh v22.8h, v14.8h, v0.h[1] sqrdmlsh v21.8h, v29.8h, v4.h[0] sqrdmlsh v22.8h, v30.8h, v4.h[0] sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 mul v29.8h, v15.8h, v1.h[1] mul v30.8h, v16.8h, v1.h[1] sqrdmulh v23.8h, v15.8h, v0.h[1] sqrdmulh v24.8h, v16.8h, v0.h[1] sqrdmlsh v23.8h, v29.8h, v4.h[0] sqrdmlsh v24.8h, v30.8h, v4.h[0] sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 mul v29.8h, v17.8h, v1.h[1] mul v30.8h, v18.8h, v1.h[1] sqrdmulh v25.8h, v17.8h, v0.h[1] sqrdmulh v26.8h, v18.8h, v0.h[1] sqrdmlsh v25.8h, v29.8h, v4.h[0] sqrdmlsh v26.8h, v30.8h, v4.h[0] sshr v25.8h, v25.8h, #1 sshr v26.8h, v26.8h, #1 mul v29.8h, v19.8h, v1.h[1] mul v30.8h, v20.8h, v1.h[1] sqrdmulh v27.8h, v19.8h, 
v0.h[1] sqrdmulh v28.8h, v20.8h, v0.h[1] sqrdmlsh v27.8h, v29.8h, v4.h[0] sqrdmlsh v28.8h, v30.8h, v4.h[0] sshr v27.8h, v27.8h, #1 sshr v28.8h, v28.8h, #1 sub v13.8h, v5.8h, v21.8h add v5.8h, v5.8h, v21.8h sub v14.8h, v6.8h, v22.8h add v6.8h, v6.8h, v22.8h sub v15.8h, v7.8h, v23.8h add v7.8h, v7.8h, v23.8h sub v16.8h, v8.8h, v24.8h add v8.8h, v8.8h, v24.8h sub v17.8h, v9.8h, v25.8h add v9.8h, v9.8h, v25.8h sub v18.8h, v10.8h, v26.8h add v10.8h, v10.8h, v26.8h sub v19.8h, v11.8h, v27.8h add v11.8h, v11.8h, v27.8h sub v20.8h, v12.8h, v28.8h add v12.8h, v12.8h, v28.8h mul v29.8h, v9.8h, v1.h[2] mul v30.8h, v10.8h, v1.h[2] sqrdmulh v21.8h, v9.8h, v0.h[2] sqrdmulh v22.8h, v10.8h, v0.h[2] sqrdmlsh v21.8h, v29.8h, v4.h[0] sqrdmlsh v22.8h, v30.8h, v4.h[0] sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 mul v29.8h, v11.8h, v1.h[2] mul v30.8h, v12.8h, v1.h[2] sqrdmulh v23.8h, v11.8h, v0.h[2] sqrdmulh v24.8h, v12.8h, v0.h[2] sqrdmlsh v23.8h, v29.8h, v4.h[0] sqrdmlsh v24.8h, v30.8h, v4.h[0] sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 mul v29.8h, v17.8h, v1.h[3] mul v30.8h, v18.8h, v1.h[3] sqrdmulh v25.8h, v17.8h, v0.h[3] sqrdmulh v26.8h, v18.8h, v0.h[3] sqrdmlsh v25.8h, v29.8h, v4.h[0] sqrdmlsh v26.8h, v30.8h, v4.h[0] sshr v25.8h, v25.8h, #1 sshr v26.8h, v26.8h, #1 mul v29.8h, v19.8h, v1.h[3] mul v30.8h, v20.8h, v1.h[3] sqrdmulh v27.8h, v19.8h, v0.h[3] sqrdmulh v28.8h, v20.8h, v0.h[3] sqrdmlsh v27.8h, v29.8h, v4.h[0] sqrdmlsh v28.8h, v30.8h, v4.h[0] sshr v27.8h, v27.8h, #1 sshr v28.8h, v28.8h, #1 sub v9.8h, v5.8h, v21.8h add v5.8h, v5.8h, v21.8h sub v10.8h, v6.8h, v22.8h add v6.8h, v6.8h, v22.8h sub v11.8h, v7.8h, v23.8h add v7.8h, v7.8h, v23.8h sub v12.8h, v8.8h, v24.8h add v8.8h, v8.8h, v24.8h sub v17.8h, v13.8h, v25.8h add v13.8h, v13.8h, v25.8h sub v18.8h, v14.8h, v26.8h add v14.8h, v14.8h, v26.8h sub v19.8h, v15.8h, v27.8h add v15.8h, v15.8h, v27.8h sub v20.8h, v16.8h, v28.8h add v16.8h, v16.8h, v28.8h mul v29.8h, v7.8h, v1.h[4] mul v30.8h, v8.8h, v1.h[4] sqrdmulh 
v21.8h, v7.8h, v0.h[4] sqrdmulh v22.8h, v8.8h, v0.h[4] sqrdmlsh v21.8h, v29.8h, v4.h[0] sqrdmlsh v22.8h, v30.8h, v4.h[0] sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 mul v29.8h, v11.8h, v1.h[5] mul v30.8h, v12.8h, v1.h[5] sqrdmulh v23.8h, v11.8h, v0.h[5] sqrdmulh v24.8h, v12.8h, v0.h[5] sqrdmlsh v23.8h, v29.8h, v4.h[0] sqrdmlsh v24.8h, v30.8h, v4.h[0] sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 mul v29.8h, v15.8h, v1.h[6] mul v30.8h, v16.8h, v1.h[6] sqrdmulh v25.8h, v15.8h, v0.h[6] sqrdmulh v26.8h, v16.8h, v0.h[6] sqrdmlsh v25.8h, v29.8h, v4.h[0] sqrdmlsh v26.8h, v30.8h, v4.h[0] sshr v25.8h, v25.8h, #1 sshr v26.8h, v26.8h, #1 mul v29.8h, v19.8h, v1.h[7] mul v30.8h, v20.8h, v1.h[7] sqrdmulh v27.8h, v19.8h, v0.h[7] sqrdmulh v28.8h, v20.8h, v0.h[7] sqrdmlsh v27.8h, v29.8h, v4.h[0] sqrdmlsh v28.8h, v30.8h, v4.h[0] sshr v27.8h, v27.8h, #1 sshr v28.8h, v28.8h, #1 sub v7.8h, v5.8h, v21.8h add v5.8h, v5.8h, v21.8h sub v8.8h, v6.8h, v22.8h add v6.8h, v6.8h, v22.8h sub v11.8h, v9.8h, v23.8h add v9.8h, v9.8h, v23.8h sub v12.8h, v10.8h, v24.8h add v10.8h, v10.8h, v24.8h sub v15.8h, v13.8h, v25.8h add v13.8h, v13.8h, v25.8h sub v16.8h, v14.8h, v26.8h add v14.8h, v14.8h, v26.8h sub v19.8h, v17.8h, v27.8h add v17.8h, v17.8h, v27.8h sub v20.8h, v18.8h, v28.8h add v18.8h, v18.8h, v28.8h ldr q0, [x2, #16] ldr q1, [x3, #16] mul v29.8h, v6.8h, v1.h[0] mul v30.8h, v8.8h, v1.h[1] sqrdmulh v21.8h, v6.8h, v0.h[0] sqrdmulh v22.8h, v8.8h, v0.h[1] sqrdmlsh v21.8h, v29.8h, v4.h[0] sqrdmlsh v22.8h, v30.8h, v4.h[0] sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 mul v29.8h, v10.8h, v1.h[2] mul v30.8h, v12.8h, v1.h[3] sqrdmulh v23.8h, v10.8h, v0.h[2] sqrdmulh v24.8h, v12.8h, v0.h[3] sqrdmlsh v23.8h, v29.8h, v4.h[0] sqrdmlsh v24.8h, v30.8h, v4.h[0] sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 mul v29.8h, v14.8h, v1.h[4] mul v30.8h, v16.8h, v1.h[5] sqrdmulh v25.8h, v14.8h, v0.h[4] sqrdmulh v26.8h, v16.8h, v0.h[5] sqrdmlsh v25.8h, v29.8h, v4.h[0] sqrdmlsh v26.8h, v30.8h, v4.h[0] sshr 
v25.8h, v25.8h, #1 sshr v26.8h, v26.8h, #1 mul v29.8h, v18.8h, v1.h[6] mul v30.8h, v20.8h, v1.h[7] sqrdmulh v27.8h, v18.8h, v0.h[6] sqrdmulh v28.8h, v20.8h, v0.h[7] sqrdmlsh v27.8h, v29.8h, v4.h[0] sqrdmlsh v28.8h, v30.8h, v4.h[0] sshr v27.8h, v27.8h, #1 sshr v28.8h, v28.8h, #1 sub v6.8h, v5.8h, v21.8h add v5.8h, v5.8h, v21.8h sub v8.8h, v7.8h, v22.8h add v7.8h, v7.8h, v22.8h sub v10.8h, v9.8h, v23.8h add v9.8h, v9.8h, v23.8h sub v12.8h, v11.8h, v24.8h add v11.8h, v11.8h, v24.8h sub v14.8h, v13.8h, v25.8h add v13.8h, v13.8h, v25.8h sub v16.8h, v15.8h, v26.8h add v15.8h, v15.8h, v26.8h sub v18.8h, v17.8h, v27.8h add v17.8h, v17.8h, v27.8h sub v20.8h, v19.8h, v28.8h add v19.8h, v19.8h, v28.8h str q5, [x0, #16] str q6, [x0, #48] str q7, [x0, #80] str q8, [x0, #112] str q9, [x0, #144] str q10, [x0, #176] str q11, [x0, #208] str q12, [x0, #240] str q13, [x1, #16] str q14, [x1, #48] str q15, [x1, #80] str q16, [x1, #112] str q17, [x1, #144] str q18, [x1, #176] str q19, [x1, #208] str q20, [x1, #240] ldp q5, q6, [x0] ldp q7, q8, [x0, #32] ldp q9, q10, [x0, #64] ldp q11, q12, [x0, #96] ldp q13, q14, [x0, #128] ldp q15, q16, [x0, #160] ldp q17, q18, [x0, #192] ldp q19, q20, [x0, #224] ldr q0, [x2, #32] ldr q1, [x3, #32] mul v29.8h, v6.8h, v1.h[0] mul v30.8h, v8.8h, v1.h[1] sqrdmulh v21.8h, v6.8h, v0.h[0] sqrdmulh v22.8h, v8.8h, v0.h[1] sqrdmlsh v21.8h, v29.8h, v4.h[0] sqrdmlsh v22.8h, v30.8h, v4.h[0] sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 mul v29.8h, v10.8h, v1.h[2] mul v30.8h, v12.8h, v1.h[3] sqrdmulh v23.8h, v10.8h, v0.h[2] sqrdmulh v24.8h, v12.8h, v0.h[3] sqrdmlsh v23.8h, v29.8h, v4.h[0] sqrdmlsh v24.8h, v30.8h, v4.h[0] sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 mul v29.8h, v14.8h, v1.h[4] mul v30.8h, v16.8h, v1.h[5] sqrdmulh v25.8h, v14.8h, v0.h[4] sqrdmulh v26.8h, v16.8h, v0.h[5] sqrdmlsh v25.8h, v29.8h, v4.h[0] sqrdmlsh v26.8h, v30.8h, v4.h[0] sshr v25.8h, v25.8h, #1 sshr v26.8h, v26.8h, #1 mul v29.8h, v18.8h, v1.h[6] mul v30.8h, v20.8h, v1.h[7] sqrdmulh 
v27.8h, v18.8h, v0.h[6] sqrdmulh v28.8h, v20.8h, v0.h[7] sqrdmlsh v27.8h, v29.8h, v4.h[0] sqrdmlsh v28.8h, v30.8h, v4.h[0] sshr v27.8h, v27.8h, #1 sshr v28.8h, v28.8h, #1 sub v6.8h, v5.8h, v21.8h add v5.8h, v5.8h, v21.8h sub v8.8h, v7.8h, v22.8h add v7.8h, v7.8h, v22.8h sub v10.8h, v9.8h, v23.8h add v9.8h, v9.8h, v23.8h sub v12.8h, v11.8h, v24.8h add v11.8h, v11.8h, v24.8h sub v14.8h, v13.8h, v25.8h add v13.8h, v13.8h, v25.8h sub v16.8h, v15.8h, v26.8h add v15.8h, v15.8h, v26.8h sub v18.8h, v17.8h, v27.8h add v17.8h, v17.8h, v27.8h sub v20.8h, v19.8h, v28.8h add v19.8h, v19.8h, v28.8h ldr q0, [x2, #64] ldr q2, [x2, #80] ldr q1, [x3, #64] ldr q3, [x3, #80] mov v29.16b, v5.16b mov v30.16b, v7.16b trn1 v5.2d, v5.2d, v6.2d trn1 v7.2d, v7.2d, v8.2d trn2 v6.2d, v29.2d, v6.2d trn2 v8.2d, v30.2d, v8.2d mul v29.8h, v6.8h, v1.8h mul v30.8h, v8.8h, v3.8h sqrdmulh v21.8h, v6.8h, v0.8h sqrdmulh v22.8h, v8.8h, v2.8h sqrdmlsh v21.8h, v29.8h, v4.h[0] sqrdmlsh v22.8h, v30.8h, v4.h[0] sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 ldr q0, [x2, #96] ldr q2, [x2, #112] ldr q1, [x3, #96] ldr q3, [x3, #112] mov v29.16b, v9.16b mov v30.16b, v11.16b trn1 v9.2d, v9.2d, v10.2d trn1 v11.2d, v11.2d, v12.2d trn2 v10.2d, v29.2d, v10.2d trn2 v12.2d, v30.2d, v12.2d mul v29.8h, v10.8h, v1.8h mul v30.8h, v12.8h, v3.8h sqrdmulh v23.8h, v10.8h, v0.8h sqrdmulh v24.8h, v12.8h, v2.8h sqrdmlsh v23.8h, v29.8h, v4.h[0] sqrdmlsh v24.8h, v30.8h, v4.h[0] sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 ldr q0, [x2, #128] ldr q2, [x2, #144] ldr q1, [x3, #128] ldr q3, [x3, #144] mov v29.16b, v13.16b mov v30.16b, v15.16b trn1 v13.2d, v13.2d, v14.2d trn1 v15.2d, v15.2d, v16.2d trn2 v14.2d, v29.2d, v14.2d trn2 v16.2d, v30.2d, v16.2d mul v29.8h, v14.8h, v1.8h mul v30.8h, v16.8h, v3.8h sqrdmulh v25.8h, v14.8h, v0.8h sqrdmulh v26.8h, v16.8h, v2.8h sqrdmlsh v25.8h, v29.8h, v4.h[0] sqrdmlsh v26.8h, v30.8h, v4.h[0] sshr v25.8h, v25.8h, #1 sshr v26.8h, v26.8h, #1 ldr q0, [x2, #160] ldr q2, [x2, #176] ldr q1, [x3, #160] 
ldr q3, [x3, #176] mov v29.16b, v17.16b mov v30.16b, v19.16b trn1 v17.2d, v17.2d, v18.2d trn1 v19.2d, v19.2d, v20.2d trn2 v18.2d, v29.2d, v18.2d trn2 v20.2d, v30.2d, v20.2d mul v29.8h, v18.8h, v1.8h mul v30.8h, v20.8h, v3.8h sqrdmulh v27.8h, v18.8h, v0.8h sqrdmulh v28.8h, v20.8h, v2.8h sqrdmlsh v27.8h, v29.8h, v4.h[0] sqrdmlsh v28.8h, v30.8h, v4.h[0] sshr v27.8h, v27.8h, #1 sshr v28.8h, v28.8h, #1 sub v6.8h, v5.8h, v21.8h add v5.8h, v5.8h, v21.8h sub v8.8h, v7.8h, v22.8h add v7.8h, v7.8h, v22.8h sub v10.8h, v9.8h, v23.8h add v9.8h, v9.8h, v23.8h sub v12.8h, v11.8h, v24.8h add v11.8h, v11.8h, v24.8h sub v14.8h, v13.8h, v25.8h add v13.8h, v13.8h, v25.8h sub v16.8h, v15.8h, v26.8h add v15.8h, v15.8h, v26.8h sub v18.8h, v17.8h, v27.8h add v17.8h, v17.8h, v27.8h sub v20.8h, v19.8h, v28.8h add v19.8h, v19.8h, v28.8h ldr q0, [x2, #320] ldr q2, [x2, #336] ldr q1, [x3, #320] ldr q3, [x3, #336] mov v29.16b, v5.16b mov v30.16b, v7.16b trn1 v5.4s, v5.4s, v6.4s trn1 v7.4s, v7.4s, v8.4s trn2 v6.4s, v29.4s, v6.4s trn2 v8.4s, v30.4s, v8.4s mul v29.8h, v6.8h, v1.8h mul v30.8h, v8.8h, v3.8h sqrdmulh v21.8h, v6.8h, v0.8h sqrdmulh v22.8h, v8.8h, v2.8h sqrdmlsh v21.8h, v29.8h, v4.h[0] sqrdmlsh v22.8h, v30.8h, v4.h[0] sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 ldr q0, [x2, #352] ldr q2, [x2, #368] ldr q1, [x3, #352] ldr q3, [x3, #368] mov v29.16b, v9.16b mov v30.16b, v11.16b trn1 v9.4s, v9.4s, v10.4s trn1 v11.4s, v11.4s, v12.4s trn2 v10.4s, v29.4s, v10.4s trn2 v12.4s, v30.4s, v12.4s mul v29.8h, v10.8h, v1.8h mul v30.8h, v12.8h, v3.8h sqrdmulh v23.8h, v10.8h, v0.8h sqrdmulh v24.8h, v12.8h, v2.8h sqrdmlsh v23.8h, v29.8h, v4.h[0] sqrdmlsh v24.8h, v30.8h, v4.h[0] sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 ldr q0, [x2, #384] ldr q2, [x2, #400] ldr q1, [x3, #384] ldr q3, [x3, #400] mov v29.16b, v13.16b mov v30.16b, v15.16b trn1 v13.4s, v13.4s, v14.4s trn1 v15.4s, v15.4s, v16.4s trn2 v14.4s, v29.4s, v14.4s trn2 v16.4s, v30.4s, v16.4s mul v29.8h, v14.8h, v1.8h mul v30.8h, v16.8h, 
v3.8h sqrdmulh v25.8h, v14.8h, v0.8h sqrdmulh v26.8h, v16.8h, v2.8h sqrdmlsh v25.8h, v29.8h, v4.h[0] sqrdmlsh v26.8h, v30.8h, v4.h[0] sshr v25.8h, v25.8h, #1 sshr v26.8h, v26.8h, #1 ldr q0, [x2, #416] ldr q2, [x2, #432] ldr q1, [x3, #416] ldr q3, [x3, #432] mov v29.16b, v17.16b mov v30.16b, v19.16b trn1 v17.4s, v17.4s, v18.4s trn1 v19.4s, v19.4s, v20.4s trn2 v18.4s, v29.4s, v18.4s trn2 v20.4s, v30.4s, v20.4s mul v29.8h, v18.8h, v1.8h mul v30.8h, v20.8h, v3.8h sqrdmulh v27.8h, v18.8h, v0.8h sqrdmulh v28.8h, v20.8h, v2.8h sqrdmlsh v27.8h, v29.8h, v4.h[0] sqrdmlsh v28.8h, v30.8h, v4.h[0] sshr v27.8h, v27.8h, #1 sshr v28.8h, v28.8h, #1 sub v6.8h, v5.8h, v21.8h add v5.8h, v5.8h, v21.8h sub v8.8h, v7.8h, v22.8h add v7.8h, v7.8h, v22.8h sub v10.8h, v9.8h, v23.8h add v9.8h, v9.8h, v23.8h sub v12.8h, v11.8h, v24.8h add v11.8h, v11.8h, v24.8h sub v14.8h, v13.8h, v25.8h add v13.8h, v13.8h, v25.8h sub v16.8h, v15.8h, v26.8h add v15.8h, v15.8h, v26.8h sub v18.8h, v17.8h, v27.8h add v17.8h, v17.8h, v27.8h sub v20.8h, v19.8h, v28.8h add v19.8h, v19.8h, v28.8h sqdmulh v21.8h, v5.8h, v4.h[2] sqdmulh v22.8h, v6.8h, v4.h[2] sshr v21.8h, v21.8h, #11 sshr v22.8h, v22.8h, #11 mls v5.8h, v21.8h, v4.h[0] mls v6.8h, v22.8h, v4.h[0] sqdmulh v21.8h, v7.8h, v4.h[2] sqdmulh v22.8h, v8.8h, v4.h[2] sshr v21.8h, v21.8h, #11 sshr v22.8h, v22.8h, #11 mls v7.8h, v21.8h, v4.h[0] mls v8.8h, v22.8h, v4.h[0] sqdmulh v21.8h, v9.8h, v4.h[2] sqdmulh v22.8h, v10.8h, v4.h[2] sshr v21.8h, v21.8h, #11 sshr v22.8h, v22.8h, #11 mls v9.8h, v21.8h, v4.h[0] mls v10.8h, v22.8h, v4.h[0] sqdmulh v21.8h, v11.8h, v4.h[2] sqdmulh v22.8h, v12.8h, v4.h[2] sshr v21.8h, v21.8h, #11 sshr v22.8h, v22.8h, #11 mls v11.8h, v21.8h, v4.h[0] mls v12.8h, v22.8h, v4.h[0] sqdmulh v21.8h, v13.8h, v4.h[2] sqdmulh v22.8h, v14.8h, v4.h[2] sshr v21.8h, v21.8h, #11 sshr v22.8h, v22.8h, #11 mls v13.8h, v21.8h, v4.h[0] mls v14.8h, v22.8h, v4.h[0] sqdmulh v21.8h, v15.8h, v4.h[2] sqdmulh v22.8h, v16.8h, v4.h[2] sshr v21.8h, v21.8h, #11 sshr 
v22.8h, v22.8h, #11 mls v15.8h, v21.8h, v4.h[0] mls v16.8h, v22.8h, v4.h[0] sqdmulh v21.8h, v17.8h, v4.h[2] sqdmulh v22.8h, v18.8h, v4.h[2] sshr v21.8h, v21.8h, #11 sshr v22.8h, v22.8h, #11 mls v17.8h, v21.8h, v4.h[0] mls v18.8h, v22.8h, v4.h[0] sqdmulh v21.8h, v19.8h, v4.h[2] sqdmulh v22.8h, v20.8h, v4.h[2] sshr v21.8h, v21.8h, #11 sshr v22.8h, v22.8h, #11 mls v19.8h, v21.8h, v4.h[0] mls v20.8h, v22.8h, v4.h[0] mov v29.16b, v5.16b trn1 v5.4s, v5.4s, v6.4s trn2 v6.4s, v29.4s, v6.4s mov v29.16b, v5.16b trn1 v5.2d, v5.2d, v6.2d trn2 v6.2d, v29.2d, v6.2d mov v29.16b, v7.16b trn1 v7.4s, v7.4s, v8.4s trn2 v8.4s, v29.4s, v8.4s mov v29.16b, v7.16b trn1 v7.2d, v7.2d, v8.2d trn2 v8.2d, v29.2d, v8.2d mov v29.16b, v9.16b trn1 v9.4s, v9.4s, v10.4s trn2 v10.4s, v29.4s, v10.4s mov v29.16b, v9.16b trn1 v9.2d, v9.2d, v10.2d trn2 v10.2d, v29.2d, v10.2d mov v29.16b, v11.16b trn1 v11.4s, v11.4s, v12.4s trn2 v12.4s, v29.4s, v12.4s mov v29.16b, v11.16b trn1 v11.2d, v11.2d, v12.2d trn2 v12.2d, v29.2d, v12.2d mov v29.16b, v13.16b trn1 v13.4s, v13.4s, v14.4s trn2 v14.4s, v29.4s, v14.4s mov v29.16b, v13.16b trn1 v13.2d, v13.2d, v14.2d trn2 v14.2d, v29.2d, v14.2d mov v29.16b, v15.16b trn1 v15.4s, v15.4s, v16.4s trn2 v16.4s, v29.4s, v16.4s mov v29.16b, v15.16b trn1 v15.2d, v15.2d, v16.2d trn2 v16.2d, v29.2d, v16.2d mov v29.16b, v17.16b trn1 v17.4s, v17.4s, v18.4s trn2 v18.4s, v29.4s, v18.4s mov v29.16b, v17.16b trn1 v17.2d, v17.2d, v18.2d trn2 v18.2d, v29.2d, v18.2d mov v29.16b, v19.16b trn1 v19.4s, v19.4s, v20.4s trn2 v20.4s, v29.4s, v20.4s mov v29.16b, v19.16b trn1 v19.2d, v19.2d, v20.2d trn2 v20.2d, v29.2d, v20.2d stp q5, q6, [x0] stp q7, q8, [x0, #32] stp q9, q10, [x0, #64] stp q11, q12, [x0, #96] stp q13, q14, [x0, #128] stp q15, q16, [x0, #160] stp q17, q18, [x0, #192] stp q19, q20, [x0, #224] ldp q5, q6, [x1] ldp q7, q8, [x1, #32] ldp q9, q10, [x1, #64] ldp q11, q12, [x1, #96] ldp q13, q14, [x1, #128] ldp q15, q16, [x1, #160] ldp q17, q18, [x1, #192] ldp q19, q20, [x1, #224] ldr q0, 
[x2, #48] ldr q1, [x3, #48] mul v29.8h, v6.8h, v1.h[0] mul v30.8h, v8.8h, v1.h[1] sqrdmulh v21.8h, v6.8h, v0.h[0] sqrdmulh v22.8h, v8.8h, v0.h[1] sqrdmlsh v21.8h, v29.8h, v4.h[0] sqrdmlsh v22.8h, v30.8h, v4.h[0] sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 mul v29.8h, v10.8h, v1.h[2] mul v30.8h, v12.8h, v1.h[3] sqrdmulh v23.8h, v10.8h, v0.h[2] sqrdmulh v24.8h, v12.8h, v0.h[3] sqrdmlsh v23.8h, v29.8h, v4.h[0] sqrdmlsh v24.8h, v30.8h, v4.h[0] sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 mul v29.8h, v14.8h, v1.h[4] mul v30.8h, v16.8h, v1.h[5] sqrdmulh v25.8h, v14.8h, v0.h[4] sqrdmulh v26.8h, v16.8h, v0.h[5] sqrdmlsh v25.8h, v29.8h, v4.h[0] sqrdmlsh v26.8h, v30.8h, v4.h[0] sshr v25.8h, v25.8h, #1 sshr v26.8h, v26.8h, #1 mul v29.8h, v18.8h, v1.h[6] mul v30.8h, v20.8h, v1.h[7] sqrdmulh v27.8h, v18.8h, v0.h[6] sqrdmulh v28.8h, v20.8h, v0.h[7] sqrdmlsh v27.8h, v29.8h, v4.h[0] sqrdmlsh v28.8h, v30.8h, v4.h[0] sshr v27.8h, v27.8h, #1 sshr v28.8h, v28.8h, #1 sub v6.8h, v5.8h, v21.8h add v5.8h, v5.8h, v21.8h sub v8.8h, v7.8h, v22.8h add v7.8h, v7.8h, v22.8h sub v10.8h, v9.8h, v23.8h add v9.8h, v9.8h, v23.8h sub v12.8h, v11.8h, v24.8h add v11.8h, v11.8h, v24.8h sub v14.8h, v13.8h, v25.8h add v13.8h, v13.8h, v25.8h sub v16.8h, v15.8h, v26.8h add v15.8h, v15.8h, v26.8h sub v18.8h, v17.8h, v27.8h add v17.8h, v17.8h, v27.8h sub v20.8h, v19.8h, v28.8h add v19.8h, v19.8h, v28.8h ldr q0, [x2, #192] ldr q2, [x2, #208] ldr q1, [x3, #192] ldr q3, [x3, #208] mov v29.16b, v5.16b mov v30.16b, v7.16b trn1 v5.2d, v5.2d, v6.2d trn1 v7.2d, v7.2d, v8.2d trn2 v6.2d, v29.2d, v6.2d trn2 v8.2d, v30.2d, v8.2d mul v29.8h, v6.8h, v1.8h mul v30.8h, v8.8h, v3.8h sqrdmulh v21.8h, v6.8h, v0.8h sqrdmulh v22.8h, v8.8h, v2.8h sqrdmlsh v21.8h, v29.8h, v4.h[0] sqrdmlsh v22.8h, v30.8h, v4.h[0] sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 ldr q0, [x2, #224] ldr q2, [x2, #240] ldr q1, [x3, #224] ldr q3, [x3, #240] mov v29.16b, v9.16b mov v30.16b, v11.16b trn1 v9.2d, v9.2d, v10.2d trn1 v11.2d, v11.2d, 
v12.2d trn2 v10.2d, v29.2d, v10.2d trn2 v12.2d, v30.2d, v12.2d mul v29.8h, v10.8h, v1.8h mul v30.8h, v12.8h, v3.8h sqrdmulh v23.8h, v10.8h, v0.8h sqrdmulh v24.8h, v12.8h, v2.8h sqrdmlsh v23.8h, v29.8h, v4.h[0] sqrdmlsh v24.8h, v30.8h, v4.h[0] sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 ldr q0, [x2, #256] ldr q2, [x2, #272] ldr q1, [x3, #256] ldr q3, [x3, #272] mov v29.16b, v13.16b mov v30.16b, v15.16b trn1 v13.2d, v13.2d, v14.2d trn1 v15.2d, v15.2d, v16.2d trn2 v14.2d, v29.2d, v14.2d trn2 v16.2d, v30.2d, v16.2d mul v29.8h, v14.8h, v1.8h mul v30.8h, v16.8h, v3.8h sqrdmulh v25.8h, v14.8h, v0.8h sqrdmulh v26.8h, v16.8h, v2.8h sqrdmlsh v25.8h, v29.8h, v4.h[0] sqrdmlsh v26.8h, v30.8h, v4.h[0] sshr v25.8h, v25.8h, #1 sshr v26.8h, v26.8h, #1 ldr q0, [x2, #288] ldr q2, [x2, #304] ldr q1, [x3, #288] ldr q3, [x3, #304] mov v29.16b, v17.16b mov v30.16b, v19.16b trn1 v17.2d, v17.2d, v18.2d trn1 v19.2d, v19.2d, v20.2d trn2 v18.2d, v29.2d, v18.2d trn2 v20.2d, v30.2d, v20.2d mul v29.8h, v18.8h, v1.8h mul v30.8h, v20.8h, v3.8h sqrdmulh v27.8h, v18.8h, v0.8h sqrdmulh v28.8h, v20.8h, v2.8h sqrdmlsh v27.8h, v29.8h, v4.h[0] sqrdmlsh v28.8h, v30.8h, v4.h[0] sshr v27.8h, v27.8h, #1 sshr v28.8h, v28.8h, #1 sub v6.8h, v5.8h, v21.8h add v5.8h, v5.8h, v21.8h sub v8.8h, v7.8h, v22.8h add v7.8h, v7.8h, v22.8h sub v10.8h, v9.8h, v23.8h add v9.8h, v9.8h, v23.8h sub v12.8h, v11.8h, v24.8h add v11.8h, v11.8h, v24.8h sub v14.8h, v13.8h, v25.8h add v13.8h, v13.8h, v25.8h sub v16.8h, v15.8h, v26.8h add v15.8h, v15.8h, v26.8h sub v18.8h, v17.8h, v27.8h add v17.8h, v17.8h, v27.8h sub v20.8h, v19.8h, v28.8h add v19.8h, v19.8h, v28.8h ldr q0, [x2, #448] ldr q2, [x2, #464] ldr q1, [x3, #448] ldr q3, [x3, #464] mov v29.16b, v5.16b mov v30.16b, v7.16b trn1 v5.4s, v5.4s, v6.4s trn1 v7.4s, v7.4s, v8.4s trn2 v6.4s, v29.4s, v6.4s trn2 v8.4s, v30.4s, v8.4s mul v29.8h, v6.8h, v1.8h mul v30.8h, v8.8h, v3.8h sqrdmulh v21.8h, v6.8h, v0.8h sqrdmulh v22.8h, v8.8h, v2.8h sqrdmlsh v21.8h, v29.8h, v4.h[0] sqrdmlsh 
v22.8h, v30.8h, v4.h[0] sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 ldr q0, [x2, #480] ldr q2, [x2, #496] ldr q1, [x3, #480] ldr q3, [x3, #496] mov v29.16b, v9.16b mov v30.16b, v11.16b trn1 v9.4s, v9.4s, v10.4s trn1 v11.4s, v11.4s, v12.4s trn2 v10.4s, v29.4s, v10.4s trn2 v12.4s, v30.4s, v12.4s mul v29.8h, v10.8h, v1.8h mul v30.8h, v12.8h, v3.8h sqrdmulh v23.8h, v10.8h, v0.8h sqrdmulh v24.8h, v12.8h, v2.8h sqrdmlsh v23.8h, v29.8h, v4.h[0] sqrdmlsh v24.8h, v30.8h, v4.h[0] sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 ldr q0, [x2, #512] ldr q2, [x2, #528] ldr q1, [x3, #512] ldr q3, [x3, #528] mov v29.16b, v13.16b mov v30.16b, v15.16b trn1 v13.4s, v13.4s, v14.4s trn1 v15.4s, v15.4s, v16.4s trn2 v14.4s, v29.4s, v14.4s trn2 v16.4s, v30.4s, v16.4s mul v29.8h, v14.8h, v1.8h mul v30.8h, v16.8h, v3.8h sqrdmulh v25.8h, v14.8h, v0.8h sqrdmulh v26.8h, v16.8h, v2.8h sqrdmlsh v25.8h, v29.8h, v4.h[0] sqrdmlsh v26.8h, v30.8h, v4.h[0] sshr v25.8h, v25.8h, #1 sshr v26.8h, v26.8h, #1 ldr q0, [x2, #544] ldr q2, [x2, #560] ldr q1, [x3, #544] ldr q3, [x3, #560] mov v29.16b, v17.16b mov v30.16b, v19.16b trn1 v17.4s, v17.4s, v18.4s trn1 v19.4s, v19.4s, v20.4s trn2 v18.4s, v29.4s, v18.4s trn2 v20.4s, v30.4s, v20.4s mul v29.8h, v18.8h, v1.8h mul v30.8h, v20.8h, v3.8h sqrdmulh v27.8h, v18.8h, v0.8h sqrdmulh v28.8h, v20.8h, v2.8h sqrdmlsh v27.8h, v29.8h, v4.h[0] sqrdmlsh v28.8h, v30.8h, v4.h[0] sshr v27.8h, v27.8h, #1 sshr v28.8h, v28.8h, #1 sub v6.8h, v5.8h, v21.8h add v5.8h, v5.8h, v21.8h sub v8.8h, v7.8h, v22.8h add v7.8h, v7.8h, v22.8h sub v10.8h, v9.8h, v23.8h add v9.8h, v9.8h, v23.8h sub v12.8h, v11.8h, v24.8h add v11.8h, v11.8h, v24.8h sub v14.8h, v13.8h, v25.8h add v13.8h, v13.8h, v25.8h sub v16.8h, v15.8h, v26.8h add v15.8h, v15.8h, v26.8h sub v18.8h, v17.8h, v27.8h add v17.8h, v17.8h, v27.8h sub v20.8h, v19.8h, v28.8h add v19.8h, v19.8h, v28.8h sqdmulh v21.8h, v5.8h, v4.h[2] sqdmulh v22.8h, v6.8h, v4.h[2] sshr v21.8h, v21.8h, #11 sshr v22.8h, v22.8h, #11 mls v5.8h, v21.8h, 
v4.h[0] mls v6.8h, v22.8h, v4.h[0] sqdmulh v21.8h, v7.8h, v4.h[2] sqdmulh v22.8h, v8.8h, v4.h[2] sshr v21.8h, v21.8h, #11 sshr v22.8h, v22.8h, #11 mls v7.8h, v21.8h, v4.h[0] mls v8.8h, v22.8h, v4.h[0] sqdmulh v21.8h, v9.8h, v4.h[2] sqdmulh v22.8h, v10.8h, v4.h[2] sshr v21.8h, v21.8h, #11 sshr v22.8h, v22.8h, #11 mls v9.8h, v21.8h, v4.h[0] mls v10.8h, v22.8h, v4.h[0] sqdmulh v21.8h, v11.8h, v4.h[2] sqdmulh v22.8h, v12.8h, v4.h[2] sshr v21.8h, v21.8h, #11 sshr v22.8h, v22.8h, #11 mls v11.8h, v21.8h, v4.h[0] mls v12.8h, v22.8h, v4.h[0] sqdmulh v21.8h, v13.8h, v4.h[2] sqdmulh v22.8h, v14.8h, v4.h[2] sshr v21.8h, v21.8h, #11 sshr v22.8h, v22.8h, #11 mls v13.8h, v21.8h, v4.h[0] mls v14.8h, v22.8h, v4.h[0] sqdmulh v21.8h, v15.8h, v4.h[2] sqdmulh v22.8h, v16.8h, v4.h[2] sshr v21.8h, v21.8h, #11 sshr v22.8h, v22.8h, #11 mls v15.8h, v21.8h, v4.h[0] mls v16.8h, v22.8h, v4.h[0] sqdmulh v21.8h, v17.8h, v4.h[2] sqdmulh v22.8h, v18.8h, v4.h[2] sshr v21.8h, v21.8h, #11 sshr v22.8h, v22.8h, #11 mls v17.8h, v21.8h, v4.h[0] mls v18.8h, v22.8h, v4.h[0] sqdmulh v21.8h, v19.8h, v4.h[2] sqdmulh v22.8h, v20.8h, v4.h[2] sshr v21.8h, v21.8h, #11 sshr v22.8h, v22.8h, #11 mls v19.8h, v21.8h, v4.h[0] mls v20.8h, v22.8h, v4.h[0] mov v29.16b, v5.16b trn1 v5.4s, v5.4s, v6.4s trn2 v6.4s, v29.4s, v6.4s mov v29.16b, v5.16b trn1 v5.2d, v5.2d, v6.2d trn2 v6.2d, v29.2d, v6.2d mov v29.16b, v7.16b trn1 v7.4s, v7.4s, v8.4s trn2 v8.4s, v29.4s, v8.4s mov v29.16b, v7.16b trn1 v7.2d, v7.2d, v8.2d trn2 v8.2d, v29.2d, v8.2d mov v29.16b, v9.16b trn1 v9.4s, v9.4s, v10.4s trn2 v10.4s, v29.4s, v10.4s mov v29.16b, v9.16b trn1 v9.2d, v9.2d, v10.2d trn2 v10.2d, v29.2d, v10.2d mov v29.16b, v11.16b trn1 v11.4s, v11.4s, v12.4s trn2 v12.4s, v29.4s, v12.4s mov v29.16b, v11.16b trn1 v11.2d, v11.2d, v12.2d trn2 v12.2d, v29.2d, v12.2d mov v29.16b, v13.16b trn1 v13.4s, v13.4s, v14.4s trn2 v14.4s, v29.4s, v14.4s mov v29.16b, v13.16b trn1 v13.2d, v13.2d, v14.2d trn2 v14.2d, v29.2d, v14.2d mov v29.16b, v15.16b trn1 v15.4s, 
v15.4s, v16.4s trn2 v16.4s, v29.4s, v16.4s mov v29.16b, v15.16b trn1 v15.2d, v15.2d, v16.2d trn2 v16.2d, v29.2d, v16.2d mov v29.16b, v17.16b trn1 v17.4s, v17.4s, v18.4s trn2 v18.4s, v29.4s, v18.4s mov v29.16b, v17.16b trn1 v17.2d, v17.2d, v18.2d trn2 v18.2d, v29.2d, v18.2d mov v29.16b, v19.16b trn1 v19.4s, v19.4s, v20.4s trn2 v20.4s, v29.4s, v20.4s mov v29.16b, v19.16b trn1 v19.2d, v19.2d, v20.2d trn2 v20.2d, v29.2d, v20.2d stp q5, q6, [x1] stp q7, q8, [x1, #32] stp q9, q10, [x1, #64] stp q11, q12, [x1, #96] stp q13, q14, [x1, #128] stp q15, q16, [x1, #160] stp q17, q18, [x1, #192] stp q19, q20, [x1, #224] ldp d8, d9, [x29, #16] ldp d10, d11, [x29, #32] ldp d12, d13, [x29, #48] ldp d14, d15, [x29, #64] ldp x29, x30, [sp], #0x50 ret #ifndef __APPLE__ .size mlkem_ntt_sqrdmlsh,.-mlkem_ntt_sqrdmlsh #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl mlkem_invntt_sqrdmlsh .type mlkem_invntt_sqrdmlsh,@function .align 2 mlkem_invntt_sqrdmlsh: #else .section __TEXT,__text .globl _mlkem_invntt_sqrdmlsh .p2align 2 _mlkem_invntt_sqrdmlsh: #endif /* __APPLE__ */ stp x29, x30, [sp, #-80]! 
add x29, sp, #0 stp d8, d9, [x29, #16] stp d10, d11, [x29, #32] stp d12, d13, [x29, #48] stp d14, d15, [x29, #64] #ifndef __APPLE__ adrp x2, L_mlkem_aarch64_zetas_inv add x2, x2, :lo12:L_mlkem_aarch64_zetas_inv #else adrp x2, L_mlkem_aarch64_zetas_inv@PAGE add x2, x2, L_mlkem_aarch64_zetas_inv@PAGEOFF #endif /* __APPLE__ */ #ifndef __APPLE__ adrp x3, L_mlkem_aarch64_zetas_inv_qinv add x3, x3, :lo12:L_mlkem_aarch64_zetas_inv_qinv #else adrp x3, L_mlkem_aarch64_zetas_inv_qinv@PAGE add x3, x3, L_mlkem_aarch64_zetas_inv_qinv@PAGEOFF #endif /* __APPLE__ */ #ifndef __APPLE__ adrp x4, L_mlkem_aarch64_consts add x4, x4, :lo12:L_mlkem_aarch64_consts #else adrp x4, L_mlkem_aarch64_consts@PAGE add x4, x4, L_mlkem_aarch64_consts@PAGEOFF #endif /* __APPLE__ */ add x1, x0, #0x100 ldr q8, [x4] ldp q9, q10, [x0] ldp q11, q12, [x0, #32] ldp q13, q14, [x0, #64] ldp q15, q16, [x0, #96] ldp q17, q18, [x0, #128] ldp q19, q20, [x0, #160] ldp q21, q22, [x0, #192] ldp q23, q24, [x0, #224] mov v25.16b, v9.16b trn1 v9.2d, v9.2d, v10.2d trn2 v10.2d, v25.2d, v10.2d mov v25.16b, v9.16b trn1 v9.4s, v9.4s, v10.4s trn2 v10.4s, v25.4s, v10.4s mov v25.16b, v11.16b trn1 v11.2d, v11.2d, v12.2d trn2 v12.2d, v25.2d, v12.2d mov v25.16b, v11.16b trn1 v11.4s, v11.4s, v12.4s trn2 v12.4s, v25.4s, v12.4s mov v25.16b, v13.16b trn1 v13.2d, v13.2d, v14.2d trn2 v14.2d, v25.2d, v14.2d mov v25.16b, v13.16b trn1 v13.4s, v13.4s, v14.4s trn2 v14.4s, v25.4s, v14.4s mov v25.16b, v15.16b trn1 v15.2d, v15.2d, v16.2d trn2 v16.2d, v25.2d, v16.2d mov v25.16b, v15.16b trn1 v15.4s, v15.4s, v16.4s trn2 v16.4s, v25.4s, v16.4s mov v25.16b, v17.16b trn1 v17.2d, v17.2d, v18.2d trn2 v18.2d, v25.2d, v18.2d mov v25.16b, v17.16b trn1 v17.4s, v17.4s, v18.4s trn2 v18.4s, v25.4s, v18.4s mov v25.16b, v19.16b trn1 v19.2d, v19.2d, v20.2d trn2 v20.2d, v25.2d, v20.2d mov v25.16b, v19.16b trn1 v19.4s, v19.4s, v20.4s trn2 v20.4s, v25.4s, v20.4s mov v25.16b, v21.16b trn1 v21.2d, v21.2d, v22.2d trn2 v22.2d, v25.2d, v22.2d mov v25.16b, v21.16b 
trn1 v21.4s, v21.4s, v22.4s trn2 v22.4s, v25.4s, v22.4s mov v25.16b, v23.16b trn1 v23.2d, v23.2d, v24.2d trn2 v24.2d, v25.2d, v24.2d mov v25.16b, v23.16b trn1 v23.4s, v23.4s, v24.4s trn2 v24.4s, v25.4s, v24.4s ldr q0, [x2] ldr q1, [x2, #16] ldr q2, [x3] ldr q3, [x3, #16] sub v26.8h, v9.8h, v10.8h sub v28.8h, v11.8h, v12.8h add v9.8h, v9.8h, v10.8h add v11.8h, v11.8h, v12.8h mul v25.8h, v26.8h, v2.8h mul v27.8h, v28.8h, v3.8h sqrdmulh v10.8h, v26.8h, v0.8h sqrdmulh v12.8h, v28.8h, v1.8h sqrdmlsh v10.8h, v25.8h, v8.h[0] sqrdmlsh v12.8h, v27.8h, v8.h[0] sshr v10.8h, v10.8h, #1 sshr v12.8h, v12.8h, #1 ldr q0, [x2, #32] ldr q1, [x2, #48] ldr q2, [x3, #32] ldr q3, [x3, #48] sub v26.8h, v13.8h, v14.8h sub v28.8h, v15.8h, v16.8h add v13.8h, v13.8h, v14.8h add v15.8h, v15.8h, v16.8h mul v25.8h, v26.8h, v2.8h mul v27.8h, v28.8h, v3.8h sqrdmulh v14.8h, v26.8h, v0.8h sqrdmulh v16.8h, v28.8h, v1.8h sqrdmlsh v14.8h, v25.8h, v8.h[0] sqrdmlsh v16.8h, v27.8h, v8.h[0] sshr v14.8h, v14.8h, #1 sshr v16.8h, v16.8h, #1 ldr q0, [x2, #64] ldr q1, [x2, #80] ldr q2, [x3, #64] ldr q3, [x3, #80] sub v26.8h, v17.8h, v18.8h sub v28.8h, v19.8h, v20.8h add v17.8h, v17.8h, v18.8h add v19.8h, v19.8h, v20.8h mul v25.8h, v26.8h, v2.8h mul v27.8h, v28.8h, v3.8h sqrdmulh v18.8h, v26.8h, v0.8h sqrdmulh v20.8h, v28.8h, v1.8h sqrdmlsh v18.8h, v25.8h, v8.h[0] sqrdmlsh v20.8h, v27.8h, v8.h[0] sshr v18.8h, v18.8h, #1 sshr v20.8h, v20.8h, #1 ldr q0, [x2, #96] ldr q1, [x2, #112] ldr q2, [x3, #96] ldr q3, [x3, #112] sub v26.8h, v21.8h, v22.8h sub v28.8h, v23.8h, v24.8h add v21.8h, v21.8h, v22.8h add v23.8h, v23.8h, v24.8h mul v25.8h, v26.8h, v2.8h mul v27.8h, v28.8h, v3.8h sqrdmulh v22.8h, v26.8h, v0.8h sqrdmulh v24.8h, v28.8h, v1.8h sqrdmlsh v22.8h, v25.8h, v8.h[0] sqrdmlsh v24.8h, v27.8h, v8.h[0] sshr v22.8h, v22.8h, #1 sshr v24.8h, v24.8h, #1 ldr q0, [x2, #256] ldr q1, [x2, #272] ldr q2, [x3, #256] ldr q3, [x3, #272] mov v25.16b, v9.16b mov v26.16b, v11.16b trn1 v9.4s, v9.4s, v10.4s trn1 v11.4s, v11.4s, 
v12.4s trn2 v10.4s, v25.4s, v10.4s trn2 v12.4s, v26.4s, v12.4s sub v26.8h, v9.8h, v10.8h sub v28.8h, v11.8h, v12.8h add v9.8h, v9.8h, v10.8h add v11.8h, v11.8h, v12.8h mul v25.8h, v26.8h, v2.8h mul v27.8h, v28.8h, v3.8h sqrdmulh v10.8h, v26.8h, v0.8h sqrdmulh v12.8h, v28.8h, v1.8h sqrdmlsh v10.8h, v25.8h, v8.h[0] sqrdmlsh v12.8h, v27.8h, v8.h[0] sshr v10.8h, v10.8h, #1 sshr v12.8h, v12.8h, #1 ldr q0, [x2, #288] ldr q1, [x2, #304] ldr q2, [x3, #288] ldr q3, [x3, #304] mov v25.16b, v13.16b mov v26.16b, v15.16b trn1 v13.4s, v13.4s, v14.4s trn1 v15.4s, v15.4s, v16.4s trn2 v14.4s, v25.4s, v14.4s trn2 v16.4s, v26.4s, v16.4s sub v26.8h, v13.8h, v14.8h sub v28.8h, v15.8h, v16.8h add v13.8h, v13.8h, v14.8h add v15.8h, v15.8h, v16.8h mul v25.8h, v26.8h, v2.8h mul v27.8h, v28.8h, v3.8h sqrdmulh v14.8h, v26.8h, v0.8h sqrdmulh v16.8h, v28.8h, v1.8h sqrdmlsh v14.8h, v25.8h, v8.h[0] sqrdmlsh v16.8h, v27.8h, v8.h[0] sshr v14.8h, v14.8h, #1 sshr v16.8h, v16.8h, #1 ldr q0, [x2, #320] ldr q1, [x2, #336] ldr q2, [x3, #320] ldr q3, [x3, #336] mov v25.16b, v17.16b mov v26.16b, v19.16b trn1 v17.4s, v17.4s, v18.4s trn1 v19.4s, v19.4s, v20.4s trn2 v18.4s, v25.4s, v18.4s trn2 v20.4s, v26.4s, v20.4s sub v26.8h, v17.8h, v18.8h sub v28.8h, v19.8h, v20.8h add v17.8h, v17.8h, v18.8h add v19.8h, v19.8h, v20.8h mul v25.8h, v26.8h, v2.8h mul v27.8h, v28.8h, v3.8h sqrdmulh v18.8h, v26.8h, v0.8h sqrdmulh v20.8h, v28.8h, v1.8h sqrdmlsh v18.8h, v25.8h, v8.h[0] sqrdmlsh v20.8h, v27.8h, v8.h[0] sshr v18.8h, v18.8h, #1 sshr v20.8h, v20.8h, #1 ldr q0, [x2, #352] ldr q1, [x2, #368] ldr q2, [x3, #352] ldr q3, [x3, #368] mov v25.16b, v21.16b mov v26.16b, v23.16b trn1 v21.4s, v21.4s, v22.4s trn1 v23.4s, v23.4s, v24.4s trn2 v22.4s, v25.4s, v22.4s trn2 v24.4s, v26.4s, v24.4s sub v26.8h, v21.8h, v22.8h sub v28.8h, v23.8h, v24.8h add v21.8h, v21.8h, v22.8h add v23.8h, v23.8h, v24.8h mul v25.8h, v26.8h, v2.8h mul v27.8h, v28.8h, v3.8h sqrdmulh v22.8h, v26.8h, v0.8h sqrdmulh v24.8h, v28.8h, v1.8h sqrdmlsh v22.8h, 
v25.8h, v8.h[0] sqrdmlsh v24.8h, v27.8h, v8.h[0] sshr v22.8h, v22.8h, #1 sshr v24.8h, v24.8h, #1 ldr q0, [x2, #512] ldr q2, [x3, #512] mov v25.16b, v9.16b mov v26.16b, v11.16b trn1 v9.2d, v9.2d, v10.2d trn1 v11.2d, v11.2d, v12.2d trn2 v10.2d, v25.2d, v10.2d trn2 v12.2d, v26.2d, v12.2d sub v26.8h, v9.8h, v10.8h sub v28.8h, v11.8h, v12.8h add v9.8h, v9.8h, v10.8h add v11.8h, v11.8h, v12.8h mul v25.8h, v26.8h, v2.h[0] mul v27.8h, v28.8h, v2.h[1] sqrdmulh v10.8h, v26.8h, v0.h[0] sqrdmulh v12.8h, v28.8h, v0.h[1] sqrdmlsh v10.8h, v25.8h, v8.h[0] sqrdmlsh v12.8h, v27.8h, v8.h[0] sshr v10.8h, v10.8h, #1 sshr v12.8h, v12.8h, #1 mov v25.16b, v13.16b mov v26.16b, v15.16b trn1 v13.2d, v13.2d, v14.2d trn1 v15.2d, v15.2d, v16.2d trn2 v14.2d, v25.2d, v14.2d trn2 v16.2d, v26.2d, v16.2d sub v26.8h, v13.8h, v14.8h sub v28.8h, v15.8h, v16.8h add v13.8h, v13.8h, v14.8h add v15.8h, v15.8h, v16.8h mul v25.8h, v26.8h, v2.h[2] mul v27.8h, v28.8h, v2.h[3] sqrdmulh v14.8h, v26.8h, v0.h[2] sqrdmulh v16.8h, v28.8h, v0.h[3] sqrdmlsh v14.8h, v25.8h, v8.h[0] sqrdmlsh v16.8h, v27.8h, v8.h[0] sshr v14.8h, v14.8h, #1 sshr v16.8h, v16.8h, #1 mov v25.16b, v17.16b mov v26.16b, v19.16b trn1 v17.2d, v17.2d, v18.2d trn1 v19.2d, v19.2d, v20.2d trn2 v18.2d, v25.2d, v18.2d trn2 v20.2d, v26.2d, v20.2d sub v26.8h, v17.8h, v18.8h sub v28.8h, v19.8h, v20.8h add v17.8h, v17.8h, v18.8h add v19.8h, v19.8h, v20.8h mul v25.8h, v26.8h, v2.h[4] mul v27.8h, v28.8h, v2.h[5] sqrdmulh v18.8h, v26.8h, v0.h[4] sqrdmulh v20.8h, v28.8h, v0.h[5] sqrdmlsh v18.8h, v25.8h, v8.h[0] sqrdmlsh v20.8h, v27.8h, v8.h[0] sshr v18.8h, v18.8h, #1 sshr v20.8h, v20.8h, #1 mov v25.16b, v21.16b mov v26.16b, v23.16b trn1 v21.2d, v21.2d, v22.2d trn1 v23.2d, v23.2d, v24.2d trn2 v22.2d, v25.2d, v22.2d trn2 v24.2d, v26.2d, v24.2d sub v26.8h, v21.8h, v22.8h sub v28.8h, v23.8h, v24.8h add v21.8h, v21.8h, v22.8h add v23.8h, v23.8h, v24.8h mul v25.8h, v26.8h, v2.h[6] mul v27.8h, v28.8h, v2.h[7] sqrdmulh v22.8h, v26.8h, v0.h[6] sqrdmulh v24.8h, v28.8h, 
v0.h[7] sqrdmlsh v22.8h, v25.8h, v8.h[0] sqrdmlsh v24.8h, v27.8h, v8.h[0] sshr v22.8h, v22.8h, #1 sshr v24.8h, v24.8h, #1 sqdmulh v25.8h, v9.8h, v8.h[2] sqdmulh v26.8h, v11.8h, v8.h[2] sshr v25.8h, v25.8h, #11 sshr v26.8h, v26.8h, #11 mls v9.8h, v25.8h, v8.h[0] mls v11.8h, v26.8h, v8.h[0] sqdmulh v25.8h, v13.8h, v8.h[2] sqdmulh v26.8h, v15.8h, v8.h[2] sshr v25.8h, v25.8h, #11 sshr v26.8h, v26.8h, #11 mls v13.8h, v25.8h, v8.h[0] mls v15.8h, v26.8h, v8.h[0] sqdmulh v25.8h, v17.8h, v8.h[2] sqdmulh v26.8h, v19.8h, v8.h[2] sshr v25.8h, v25.8h, #11 sshr v26.8h, v26.8h, #11 mls v17.8h, v25.8h, v8.h[0] mls v19.8h, v26.8h, v8.h[0] sqdmulh v25.8h, v21.8h, v8.h[2] sqdmulh v26.8h, v23.8h, v8.h[2] sshr v25.8h, v25.8h, #11 sshr v26.8h, v26.8h, #11 mls v21.8h, v25.8h, v8.h[0] mls v23.8h, v26.8h, v8.h[0] stp q9, q10, [x0] stp q11, q12, [x0, #32] stp q13, q14, [x0, #64] stp q15, q16, [x0, #96] stp q17, q18, [x0, #128] stp q19, q20, [x0, #160] stp q21, q22, [x0, #192] stp q23, q24, [x0, #224] ldp q9, q10, [x1] ldp q11, q12, [x1, #32] ldp q13, q14, [x1, #64] ldp q15, q16, [x1, #96] ldp q17, q18, [x1, #128] ldp q19, q20, [x1, #160] ldp q21, q22, [x1, #192] ldp q23, q24, [x1, #224] mov v25.16b, v9.16b trn1 v9.2d, v9.2d, v10.2d trn2 v10.2d, v25.2d, v10.2d mov v25.16b, v9.16b trn1 v9.4s, v9.4s, v10.4s trn2 v10.4s, v25.4s, v10.4s mov v25.16b, v11.16b trn1 v11.2d, v11.2d, v12.2d trn2 v12.2d, v25.2d, v12.2d mov v25.16b, v11.16b trn1 v11.4s, v11.4s, v12.4s trn2 v12.4s, v25.4s, v12.4s mov v25.16b, v13.16b trn1 v13.2d, v13.2d, v14.2d trn2 v14.2d, v25.2d, v14.2d mov v25.16b, v13.16b trn1 v13.4s, v13.4s, v14.4s trn2 v14.4s, v25.4s, v14.4s mov v25.16b, v15.16b trn1 v15.2d, v15.2d, v16.2d trn2 v16.2d, v25.2d, v16.2d mov v25.16b, v15.16b trn1 v15.4s, v15.4s, v16.4s trn2 v16.4s, v25.4s, v16.4s mov v25.16b, v17.16b trn1 v17.2d, v17.2d, v18.2d trn2 v18.2d, v25.2d, v18.2d mov v25.16b, v17.16b trn1 v17.4s, v17.4s, v18.4s trn2 v18.4s, v25.4s, v18.4s mov v25.16b, v19.16b trn1 v19.2d, v19.2d, v20.2d trn2 
v20.2d, v25.2d, v20.2d mov v25.16b, v19.16b trn1 v19.4s, v19.4s, v20.4s trn2 v20.4s, v25.4s, v20.4s mov v25.16b, v21.16b trn1 v21.2d, v21.2d, v22.2d trn2 v22.2d, v25.2d, v22.2d mov v25.16b, v21.16b trn1 v21.4s, v21.4s, v22.4s trn2 v22.4s, v25.4s, v22.4s mov v25.16b, v23.16b trn1 v23.2d, v23.2d, v24.2d trn2 v24.2d, v25.2d, v24.2d mov v25.16b, v23.16b trn1 v23.4s, v23.4s, v24.4s trn2 v24.4s, v25.4s, v24.4s ldr q0, [x2, #128] ldr q1, [x2, #144] ldr q2, [x3, #128] ldr q3, [x3, #144] sub v26.8h, v9.8h, v10.8h sub v28.8h, v11.8h, v12.8h add v9.8h, v9.8h, v10.8h add v11.8h, v11.8h, v12.8h mul v25.8h, v26.8h, v2.8h mul v27.8h, v28.8h, v3.8h sqrdmulh v10.8h, v26.8h, v0.8h sqrdmulh v12.8h, v28.8h, v1.8h sqrdmlsh v10.8h, v25.8h, v8.h[0] sqrdmlsh v12.8h, v27.8h, v8.h[0] sshr v10.8h, v10.8h, #1 sshr v12.8h, v12.8h, #1 ldr q0, [x2, #160] ldr q1, [x2, #176] ldr q2, [x3, #160] ldr q3, [x3, #176] sub v26.8h, v13.8h, v14.8h sub v28.8h, v15.8h, v16.8h add v13.8h, v13.8h, v14.8h add v15.8h, v15.8h, v16.8h mul v25.8h, v26.8h, v2.8h mul v27.8h, v28.8h, v3.8h sqrdmulh v14.8h, v26.8h, v0.8h sqrdmulh v16.8h, v28.8h, v1.8h sqrdmlsh v14.8h, v25.8h, v8.h[0] sqrdmlsh v16.8h, v27.8h, v8.h[0] sshr v14.8h, v14.8h, #1 sshr v16.8h, v16.8h, #1 ldr q0, [x2, #192] ldr q1, [x2, #208] ldr q2, [x3, #192] ldr q3, [x3, #208] sub v26.8h, v17.8h, v18.8h sub v28.8h, v19.8h, v20.8h add v17.8h, v17.8h, v18.8h add v19.8h, v19.8h, v20.8h mul v25.8h, v26.8h, v2.8h mul v27.8h, v28.8h, v3.8h sqrdmulh v18.8h, v26.8h, v0.8h sqrdmulh v20.8h, v28.8h, v1.8h sqrdmlsh v18.8h, v25.8h, v8.h[0] sqrdmlsh v20.8h, v27.8h, v8.h[0] sshr v18.8h, v18.8h, #1 sshr v20.8h, v20.8h, #1 ldr q0, [x2, #224] ldr q1, [x2, #240] ldr q2, [x3, #224] ldr q3, [x3, #240] sub v26.8h, v21.8h, v22.8h sub v28.8h, v23.8h, v24.8h add v21.8h, v21.8h, v22.8h add v23.8h, v23.8h, v24.8h mul v25.8h, v26.8h, v2.8h mul v27.8h, v28.8h, v3.8h sqrdmulh v22.8h, v26.8h, v0.8h sqrdmulh v24.8h, v28.8h, v1.8h sqrdmlsh v22.8h, v25.8h, v8.h[0] sqrdmlsh v24.8h, v27.8h, 
v8.h[0] sshr v22.8h, v22.8h, #1 sshr v24.8h, v24.8h, #1 ldr q0, [x2, #384] ldr q1, [x2, #400] ldr q2, [x3, #384] ldr q3, [x3, #400] mov v25.16b, v9.16b mov v26.16b, v11.16b trn1 v9.4s, v9.4s, v10.4s trn1 v11.4s, v11.4s, v12.4s trn2 v10.4s, v25.4s, v10.4s trn2 v12.4s, v26.4s, v12.4s sub v26.8h, v9.8h, v10.8h sub v28.8h, v11.8h, v12.8h add v9.8h, v9.8h, v10.8h add v11.8h, v11.8h, v12.8h mul v25.8h, v26.8h, v2.8h mul v27.8h, v28.8h, v3.8h sqrdmulh v10.8h, v26.8h, v0.8h sqrdmulh v12.8h, v28.8h, v1.8h sqrdmlsh v10.8h, v25.8h, v8.h[0] sqrdmlsh v12.8h, v27.8h, v8.h[0] sshr v10.8h, v10.8h, #1 sshr v12.8h, v12.8h, #1 ldr q0, [x2, #416] ldr q1, [x2, #432] ldr q2, [x3, #416] ldr q3, [x3, #432] mov v25.16b, v13.16b mov v26.16b, v15.16b trn1 v13.4s, v13.4s, v14.4s trn1 v15.4s, v15.4s, v16.4s trn2 v14.4s, v25.4s, v14.4s trn2 v16.4s, v26.4s, v16.4s sub v26.8h, v13.8h, v14.8h sub v28.8h, v15.8h, v16.8h add v13.8h, v13.8h, v14.8h add v15.8h, v15.8h, v16.8h mul v25.8h, v26.8h, v2.8h mul v27.8h, v28.8h, v3.8h sqrdmulh v14.8h, v26.8h, v0.8h sqrdmulh v16.8h, v28.8h, v1.8h sqrdmlsh v14.8h, v25.8h, v8.h[0] sqrdmlsh v16.8h, v27.8h, v8.h[0] sshr v14.8h, v14.8h, #1 sshr v16.8h, v16.8h, #1 ldr q0, [x2, #448] ldr q1, [x2, #464] ldr q2, [x3, #448] ldr q3, [x3, #464] mov v25.16b, v17.16b mov v26.16b, v19.16b trn1 v17.4s, v17.4s, v18.4s trn1 v19.4s, v19.4s, v20.4s trn2 v18.4s, v25.4s, v18.4s trn2 v20.4s, v26.4s, v20.4s sub v26.8h, v17.8h, v18.8h sub v28.8h, v19.8h, v20.8h add v17.8h, v17.8h, v18.8h add v19.8h, v19.8h, v20.8h mul v25.8h, v26.8h, v2.8h mul v27.8h, v28.8h, v3.8h sqrdmulh v18.8h, v26.8h, v0.8h sqrdmulh v20.8h, v28.8h, v1.8h sqrdmlsh v18.8h, v25.8h, v8.h[0] sqrdmlsh v20.8h, v27.8h, v8.h[0] sshr v18.8h, v18.8h, #1 sshr v20.8h, v20.8h, #1 ldr q0, [x2, #480] ldr q1, [x2, #496] ldr q2, [x3, #480] ldr q3, [x3, #496] mov v25.16b, v21.16b mov v26.16b, v23.16b trn1 v21.4s, v21.4s, v22.4s trn1 v23.4s, v23.4s, v24.4s trn2 v22.4s, v25.4s, v22.4s trn2 v24.4s, v26.4s, v24.4s sub v26.8h, v21.8h, 
v22.8h sub v28.8h, v23.8h, v24.8h add v21.8h, v21.8h, v22.8h add v23.8h, v23.8h, v24.8h mul v25.8h, v26.8h, v2.8h mul v27.8h, v28.8h, v3.8h sqrdmulh v22.8h, v26.8h, v0.8h sqrdmulh v24.8h, v28.8h, v1.8h sqrdmlsh v22.8h, v25.8h, v8.h[0] sqrdmlsh v24.8h, v27.8h, v8.h[0] sshr v22.8h, v22.8h, #1 sshr v24.8h, v24.8h, #1 ldr q0, [x2, #528] ldr q2, [x3, #528] mov v25.16b, v9.16b mov v26.16b, v11.16b trn1 v9.2d, v9.2d, v10.2d trn1 v11.2d, v11.2d, v12.2d trn2 v10.2d, v25.2d, v10.2d trn2 v12.2d, v26.2d, v12.2d sub v26.8h, v9.8h, v10.8h sub v28.8h, v11.8h, v12.8h add v9.8h, v9.8h, v10.8h add v11.8h, v11.8h, v12.8h mul v25.8h, v26.8h, v2.h[0] mul v27.8h, v28.8h, v2.h[1] sqrdmulh v10.8h, v26.8h, v0.h[0] sqrdmulh v12.8h, v28.8h, v0.h[1] sqrdmlsh v10.8h, v25.8h, v8.h[0] sqrdmlsh v12.8h, v27.8h, v8.h[0] sshr v10.8h, v10.8h, #1 sshr v12.8h, v12.8h, #1 mov v25.16b, v13.16b mov v26.16b, v15.16b trn1 v13.2d, v13.2d, v14.2d trn1 v15.2d, v15.2d, v16.2d trn2 v14.2d, v25.2d, v14.2d trn2 v16.2d, v26.2d, v16.2d sub v26.8h, v13.8h, v14.8h sub v28.8h, v15.8h, v16.8h add v13.8h, v13.8h, v14.8h add v15.8h, v15.8h, v16.8h mul v25.8h, v26.8h, v2.h[2] mul v27.8h, v28.8h, v2.h[3] sqrdmulh v14.8h, v26.8h, v0.h[2] sqrdmulh v16.8h, v28.8h, v0.h[3] sqrdmlsh v14.8h, v25.8h, v8.h[0] sqrdmlsh v16.8h, v27.8h, v8.h[0] sshr v14.8h, v14.8h, #1 sshr v16.8h, v16.8h, #1 mov v25.16b, v17.16b mov v26.16b, v19.16b trn1 v17.2d, v17.2d, v18.2d trn1 v19.2d, v19.2d, v20.2d trn2 v18.2d, v25.2d, v18.2d trn2 v20.2d, v26.2d, v20.2d sub v26.8h, v17.8h, v18.8h sub v28.8h, v19.8h, v20.8h add v17.8h, v17.8h, v18.8h add v19.8h, v19.8h, v20.8h mul v25.8h, v26.8h, v2.h[4] mul v27.8h, v28.8h, v2.h[5] sqrdmulh v18.8h, v26.8h, v0.h[4] sqrdmulh v20.8h, v28.8h, v0.h[5] sqrdmlsh v18.8h, v25.8h, v8.h[0] sqrdmlsh v20.8h, v27.8h, v8.h[0] sshr v18.8h, v18.8h, #1 sshr v20.8h, v20.8h, #1 mov v25.16b, v21.16b mov v26.16b, v23.16b trn1 v21.2d, v21.2d, v22.2d trn1 v23.2d, v23.2d, v24.2d trn2 v22.2d, v25.2d, v22.2d trn2 v24.2d, v26.2d, v24.2d sub 
v26.8h, v21.8h, v22.8h sub v28.8h, v23.8h, v24.8h add v21.8h, v21.8h, v22.8h add v23.8h, v23.8h, v24.8h mul v25.8h, v26.8h, v2.h[6] mul v27.8h, v28.8h, v2.h[7] sqrdmulh v22.8h, v26.8h, v0.h[6] sqrdmulh v24.8h, v28.8h, v0.h[7] sqrdmlsh v22.8h, v25.8h, v8.h[0] sqrdmlsh v24.8h, v27.8h, v8.h[0] sshr v22.8h, v22.8h, #1 sshr v24.8h, v24.8h, #1 sqdmulh v25.8h, v9.8h, v8.h[2] sqdmulh v26.8h, v11.8h, v8.h[2] sshr v25.8h, v25.8h, #11 sshr v26.8h, v26.8h, #11 mls v9.8h, v25.8h, v8.h[0] mls v11.8h, v26.8h, v8.h[0] sqdmulh v25.8h, v13.8h, v8.h[2] sqdmulh v26.8h, v15.8h, v8.h[2] sshr v25.8h, v25.8h, #11 sshr v26.8h, v26.8h, #11 mls v13.8h, v25.8h, v8.h[0] mls v15.8h, v26.8h, v8.h[0] sqdmulh v25.8h, v17.8h, v8.h[2] sqdmulh v26.8h, v19.8h, v8.h[2] sshr v25.8h, v25.8h, #11 sshr v26.8h, v26.8h, #11 mls v17.8h, v25.8h, v8.h[0] mls v19.8h, v26.8h, v8.h[0] sqdmulh v25.8h, v21.8h, v8.h[2] sqdmulh v26.8h, v23.8h, v8.h[2] sshr v25.8h, v25.8h, #11 sshr v26.8h, v26.8h, #11 mls v21.8h, v25.8h, v8.h[0] mls v23.8h, v26.8h, v8.h[0] stp q9, q10, [x1] stp q11, q12, [x1, #32] stp q13, q14, [x1, #64] stp q15, q16, [x1, #96] stp q17, q18, [x1, #128] stp q19, q20, [x1, #160] stp q21, q22, [x1, #192] stp q23, q24, [x1, #224] ldr q4, [x2, #544] ldr q5, [x2, #560] ldr q6, [x3, #544] ldr q7, [x3, #560] ldr q9, [x0] ldr q10, [x0, #32] ldr q11, [x0, #64] ldr q12, [x0, #96] ldr q13, [x0, #128] ldr q14, [x0, #160] ldr q15, [x0, #192] ldr q16, [x0, #224] ldr q17, [x1] ldr q18, [x1, #32] ldr q19, [x1, #64] ldr q20, [x1, #96] ldr q21, [x1, #128] ldr q22, [x1, #160] ldr q23, [x1, #192] ldr q24, [x1, #224] sub v26.8h, v9.8h, v10.8h sub v28.8h, v11.8h, v12.8h add v9.8h, v9.8h, v10.8h add v11.8h, v11.8h, v12.8h mul v25.8h, v26.8h, v6.h[0] mul v27.8h, v28.8h, v6.h[1] sqrdmulh v10.8h, v26.8h, v4.h[0] sqrdmulh v12.8h, v28.8h, v4.h[1] sqrdmlsh v10.8h, v25.8h, v8.h[0] sqrdmlsh v12.8h, v27.8h, v8.h[0] sshr v10.8h, v10.8h, #1 sshr v12.8h, v12.8h, #1 sub v26.8h, v13.8h, v14.8h sub v28.8h, v15.8h, v16.8h add v13.8h, v13.8h, 
v14.8h add v15.8h, v15.8h, v16.8h mul v25.8h, v26.8h, v6.h[2] mul v27.8h, v28.8h, v6.h[3] sqrdmulh v14.8h, v26.8h, v4.h[2] sqrdmulh v16.8h, v28.8h, v4.h[3] sqrdmlsh v14.8h, v25.8h, v8.h[0] sqrdmlsh v16.8h, v27.8h, v8.h[0] sshr v14.8h, v14.8h, #1 sshr v16.8h, v16.8h, #1 sub v26.8h, v17.8h, v18.8h sub v28.8h, v19.8h, v20.8h add v17.8h, v17.8h, v18.8h add v19.8h, v19.8h, v20.8h mul v25.8h, v26.8h, v6.h[4] mul v27.8h, v28.8h, v6.h[5] sqrdmulh v18.8h, v26.8h, v4.h[4] sqrdmulh v20.8h, v28.8h, v4.h[5] sqrdmlsh v18.8h, v25.8h, v8.h[0] sqrdmlsh v20.8h, v27.8h, v8.h[0] sshr v18.8h, v18.8h, #1 sshr v20.8h, v20.8h, #1 sub v26.8h, v21.8h, v22.8h sub v28.8h, v23.8h, v24.8h add v21.8h, v21.8h, v22.8h add v23.8h, v23.8h, v24.8h mul v25.8h, v26.8h, v6.h[6] mul v27.8h, v28.8h, v6.h[7] sqrdmulh v22.8h, v26.8h, v4.h[6] sqrdmulh v24.8h, v28.8h, v4.h[7] sqrdmlsh v22.8h, v25.8h, v8.h[0] sqrdmlsh v24.8h, v27.8h, v8.h[0] sshr v22.8h, v22.8h, #1 sshr v24.8h, v24.8h, #1 sub v26.8h, v9.8h, v11.8h sub v28.8h, v10.8h, v12.8h add v9.8h, v9.8h, v11.8h add v10.8h, v10.8h, v12.8h mul v25.8h, v26.8h, v7.h[0] mul v27.8h, v28.8h, v7.h[0] sqrdmulh v11.8h, v26.8h, v5.h[0] sqrdmulh v12.8h, v28.8h, v5.h[0] sqrdmlsh v11.8h, v25.8h, v8.h[0] sqrdmlsh v12.8h, v27.8h, v8.h[0] sshr v11.8h, v11.8h, #1 sshr v12.8h, v12.8h, #1 sub v26.8h, v13.8h, v15.8h sub v28.8h, v14.8h, v16.8h add v13.8h, v13.8h, v15.8h add v14.8h, v14.8h, v16.8h mul v25.8h, v26.8h, v7.h[1] mul v27.8h, v28.8h, v7.h[1] sqrdmulh v15.8h, v26.8h, v5.h[1] sqrdmulh v16.8h, v28.8h, v5.h[1] sqrdmlsh v15.8h, v25.8h, v8.h[0] sqrdmlsh v16.8h, v27.8h, v8.h[0] sshr v15.8h, v15.8h, #1 sshr v16.8h, v16.8h, #1 sub v26.8h, v17.8h, v19.8h sub v28.8h, v18.8h, v20.8h add v17.8h, v17.8h, v19.8h add v18.8h, v18.8h, v20.8h mul v25.8h, v26.8h, v7.h[2] mul v27.8h, v28.8h, v7.h[2] sqrdmulh v19.8h, v26.8h, v5.h[2] sqrdmulh v20.8h, v28.8h, v5.h[2] sqrdmlsh v19.8h, v25.8h, v8.h[0] sqrdmlsh v20.8h, v27.8h, v8.h[0] sshr v19.8h, v19.8h, #1 sshr v20.8h, v20.8h, #1 sub v26.8h, 
v21.8h, v23.8h sub v28.8h, v22.8h, v24.8h add v21.8h, v21.8h, v23.8h add v22.8h, v22.8h, v24.8h mul v25.8h, v26.8h, v7.h[3] mul v27.8h, v28.8h, v7.h[3] sqrdmulh v23.8h, v26.8h, v5.h[3] sqrdmulh v24.8h, v28.8h, v5.h[3] sqrdmlsh v23.8h, v25.8h, v8.h[0] sqrdmlsh v24.8h, v27.8h, v8.h[0] sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 sub v26.8h, v9.8h, v13.8h sub v28.8h, v10.8h, v14.8h add v9.8h, v9.8h, v13.8h add v10.8h, v10.8h, v14.8h mul v25.8h, v26.8h, v7.h[4] mul v27.8h, v28.8h, v7.h[4] sqrdmulh v13.8h, v26.8h, v5.h[4] sqrdmulh v14.8h, v28.8h, v5.h[4] sqrdmlsh v13.8h, v25.8h, v8.h[0] sqrdmlsh v14.8h, v27.8h, v8.h[0] sshr v13.8h, v13.8h, #1 sshr v14.8h, v14.8h, #1 sub v26.8h, v11.8h, v15.8h sub v28.8h, v12.8h, v16.8h add v11.8h, v11.8h, v15.8h add v12.8h, v12.8h, v16.8h mul v25.8h, v26.8h, v7.h[4] mul v27.8h, v28.8h, v7.h[4] sqrdmulh v15.8h, v26.8h, v5.h[4] sqrdmulh v16.8h, v28.8h, v5.h[4] sqrdmlsh v15.8h, v25.8h, v8.h[0] sqrdmlsh v16.8h, v27.8h, v8.h[0] sshr v15.8h, v15.8h, #1 sshr v16.8h, v16.8h, #1 sub v26.8h, v17.8h, v21.8h sub v28.8h, v18.8h, v22.8h add v17.8h, v17.8h, v21.8h add v18.8h, v18.8h, v22.8h mul v25.8h, v26.8h, v7.h[5] mul v27.8h, v28.8h, v7.h[5] sqrdmulh v21.8h, v26.8h, v5.h[5] sqrdmulh v22.8h, v28.8h, v5.h[5] sqrdmlsh v21.8h, v25.8h, v8.h[0] sqrdmlsh v22.8h, v27.8h, v8.h[0] sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 sub v26.8h, v19.8h, v23.8h sub v28.8h, v20.8h, v24.8h add v19.8h, v19.8h, v23.8h add v20.8h, v20.8h, v24.8h mul v25.8h, v26.8h, v7.h[5] mul v27.8h, v28.8h, v7.h[5] sqrdmulh v23.8h, v26.8h, v5.h[5] sqrdmulh v24.8h, v28.8h, v5.h[5] sqrdmlsh v23.8h, v25.8h, v8.h[0] sqrdmlsh v24.8h, v27.8h, v8.h[0] sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 sqdmulh v25.8h, v9.8h, v8.h[2] sqdmulh v26.8h, v10.8h, v8.h[2] sshr v25.8h, v25.8h, #11 sshr v26.8h, v26.8h, #11 mls v9.8h, v25.8h, v8.h[0] mls v10.8h, v26.8h, v8.h[0] sqdmulh v25.8h, v11.8h, v8.h[2] sqdmulh v26.8h, v12.8h, v8.h[2] sshr v25.8h, v25.8h, #11 sshr v26.8h, v26.8h, #11 mls v11.8h, 
v25.8h, v8.h[0] mls v12.8h, v26.8h, v8.h[0] sqdmulh v25.8h, v17.8h, v8.h[2] sqdmulh v26.8h, v18.8h, v8.h[2] sshr v25.8h, v25.8h, #11 sshr v26.8h, v26.8h, #11 mls v17.8h, v25.8h, v8.h[0] mls v18.8h, v26.8h, v8.h[0] sqdmulh v25.8h, v19.8h, v8.h[2] sqdmulh v26.8h, v20.8h, v8.h[2] sshr v25.8h, v25.8h, #11 sshr v26.8h, v26.8h, #11 mls v19.8h, v25.8h, v8.h[0] mls v20.8h, v26.8h, v8.h[0] sub v26.8h, v9.8h, v17.8h sub v28.8h, v10.8h, v18.8h add v9.8h, v9.8h, v17.8h add v10.8h, v10.8h, v18.8h mul v25.8h, v26.8h, v7.h[6] mul v27.8h, v28.8h, v7.h[6] sqrdmulh v17.8h, v26.8h, v5.h[6] sqrdmulh v18.8h, v28.8h, v5.h[6] sqrdmlsh v17.8h, v25.8h, v8.h[0] sqrdmlsh v18.8h, v27.8h, v8.h[0] sshr v17.8h, v17.8h, #1 sshr v18.8h, v18.8h, #1 sub v26.8h, v11.8h, v19.8h sub v28.8h, v12.8h, v20.8h add v11.8h, v11.8h, v19.8h add v12.8h, v12.8h, v20.8h mul v25.8h, v26.8h, v7.h[6] mul v27.8h, v28.8h, v7.h[6] sqrdmulh v19.8h, v26.8h, v5.h[6] sqrdmulh v20.8h, v28.8h, v5.h[6] sqrdmlsh v19.8h, v25.8h, v8.h[0] sqrdmlsh v20.8h, v27.8h, v8.h[0] sshr v19.8h, v19.8h, #1 sshr v20.8h, v20.8h, #1 sub v26.8h, v13.8h, v21.8h sub v28.8h, v14.8h, v22.8h add v13.8h, v13.8h, v21.8h add v14.8h, v14.8h, v22.8h mul v25.8h, v26.8h, v7.h[6] mul v27.8h, v28.8h, v7.h[6] sqrdmulh v21.8h, v26.8h, v5.h[6] sqrdmulh v22.8h, v28.8h, v5.h[6] sqrdmlsh v21.8h, v25.8h, v8.h[0] sqrdmlsh v22.8h, v27.8h, v8.h[0] sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 sub v26.8h, v15.8h, v23.8h sub v28.8h, v16.8h, v24.8h add v15.8h, v15.8h, v23.8h add v16.8h, v16.8h, v24.8h mul v25.8h, v26.8h, v7.h[6] mul v27.8h, v28.8h, v7.h[6] sqrdmulh v23.8h, v26.8h, v5.h[6] sqrdmulh v24.8h, v28.8h, v5.h[6] sqrdmlsh v23.8h, v25.8h, v8.h[0] sqrdmlsh v24.8h, v27.8h, v8.h[0] sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 mul v25.8h, v9.8h, v7.h[7] mul v26.8h, v10.8h, v7.h[7] sqrdmulh v9.8h, v9.8h, v5.h[7] sqrdmulh v10.8h, v10.8h, v5.h[7] sqrdmlsh v9.8h, v25.8h, v8.h[0] sqrdmlsh v10.8h, v26.8h, v8.h[0] sshr v9.8h, v9.8h, #1 sshr v10.8h, v10.8h, #1 mul v25.8h, 
v11.8h, v7.h[7] mul v26.8h, v12.8h, v7.h[7] sqrdmulh v11.8h, v11.8h, v5.h[7] sqrdmulh v12.8h, v12.8h, v5.h[7] sqrdmlsh v11.8h, v25.8h, v8.h[0] sqrdmlsh v12.8h, v26.8h, v8.h[0] sshr v11.8h, v11.8h, #1 sshr v12.8h, v12.8h, #1 mul v25.8h, v13.8h, v7.h[7] mul v26.8h, v14.8h, v7.h[7] sqrdmulh v13.8h, v13.8h, v5.h[7] sqrdmulh v14.8h, v14.8h, v5.h[7] sqrdmlsh v13.8h, v25.8h, v8.h[0] sqrdmlsh v14.8h, v26.8h, v8.h[0] sshr v13.8h, v13.8h, #1 sshr v14.8h, v14.8h, #1 mul v25.8h, v15.8h, v7.h[7] mul v26.8h, v16.8h, v7.h[7] sqrdmulh v15.8h, v15.8h, v5.h[7] sqrdmulh v16.8h, v16.8h, v5.h[7] sqrdmlsh v15.8h, v25.8h, v8.h[0] sqrdmlsh v16.8h, v26.8h, v8.h[0] sshr v15.8h, v15.8h, #1 sshr v16.8h, v16.8h, #1 mul v25.8h, v17.8h, v7.h[7] mul v26.8h, v18.8h, v7.h[7] sqrdmulh v17.8h, v17.8h, v5.h[7] sqrdmulh v18.8h, v18.8h, v5.h[7] sqrdmlsh v17.8h, v25.8h, v8.h[0] sqrdmlsh v18.8h, v26.8h, v8.h[0] sshr v17.8h, v17.8h, #1 sshr v18.8h, v18.8h, #1 mul v25.8h, v19.8h, v7.h[7] mul v26.8h, v20.8h, v7.h[7] sqrdmulh v19.8h, v19.8h, v5.h[7] sqrdmulh v20.8h, v20.8h, v5.h[7] sqrdmlsh v19.8h, v25.8h, v8.h[0] sqrdmlsh v20.8h, v26.8h, v8.h[0] sshr v19.8h, v19.8h, #1 sshr v20.8h, v20.8h, #1 mul v25.8h, v21.8h, v7.h[7] mul v26.8h, v22.8h, v7.h[7] sqrdmulh v21.8h, v21.8h, v5.h[7] sqrdmulh v22.8h, v22.8h, v5.h[7] sqrdmlsh v21.8h, v25.8h, v8.h[0] sqrdmlsh v22.8h, v26.8h, v8.h[0] sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 mul v25.8h, v23.8h, v7.h[7] mul v26.8h, v24.8h, v7.h[7] sqrdmulh v23.8h, v23.8h, v5.h[7] sqrdmulh v24.8h, v24.8h, v5.h[7] sqrdmlsh v23.8h, v25.8h, v8.h[0] sqrdmlsh v24.8h, v26.8h, v8.h[0] sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 str q9, [x0] str q10, [x0, #32] str q11, [x0, #64] str q12, [x0, #96] str q13, [x0, #128] str q14, [x0, #160] str q15, [x0, #192] str q16, [x0, #224] str q17, [x1] str q18, [x1, #32] str q19, [x1, #64] str q20, [x1, #96] str q21, [x1, #128] str q22, [x1, #160] str q23, [x1, #192] str q24, [x1, #224] ldr q9, [x0, #16] ldr q10, [x0, #48] ldr q11, [x0, #80] 
ldr q12, [x0, #112] ldr q13, [x0, #144] ldr q14, [x0, #176] ldr q15, [x0, #208] ldr q16, [x0, #240] ldr q17, [x1, #16] ldr q18, [x1, #48] ldr q19, [x1, #80] ldr q20, [x1, #112] ldr q21, [x1, #144] ldr q22, [x1, #176] ldr q23, [x1, #208] ldr q24, [x1, #240] sub v26.8h, v9.8h, v10.8h sub v28.8h, v11.8h, v12.8h add v9.8h, v9.8h, v10.8h add v11.8h, v11.8h, v12.8h mul v25.8h, v26.8h, v6.h[0] mul v27.8h, v28.8h, v6.h[1] sqrdmulh v10.8h, v26.8h, v4.h[0] sqrdmulh v12.8h, v28.8h, v4.h[1] sqrdmlsh v10.8h, v25.8h, v8.h[0] sqrdmlsh v12.8h, v27.8h, v8.h[0] sshr v10.8h, v10.8h, #1 sshr v12.8h, v12.8h, #1 sub v26.8h, v13.8h, v14.8h sub v28.8h, v15.8h, v16.8h add v13.8h, v13.8h, v14.8h add v15.8h, v15.8h, v16.8h mul v25.8h, v26.8h, v6.h[2] mul v27.8h, v28.8h, v6.h[3] sqrdmulh v14.8h, v26.8h, v4.h[2] sqrdmulh v16.8h, v28.8h, v4.h[3] sqrdmlsh v14.8h, v25.8h, v8.h[0] sqrdmlsh v16.8h, v27.8h, v8.h[0] sshr v14.8h, v14.8h, #1 sshr v16.8h, v16.8h, #1 sub v26.8h, v17.8h, v18.8h sub v28.8h, v19.8h, v20.8h add v17.8h, v17.8h, v18.8h add v19.8h, v19.8h, v20.8h mul v25.8h, v26.8h, v6.h[4] mul v27.8h, v28.8h, v6.h[5] sqrdmulh v18.8h, v26.8h, v4.h[4] sqrdmulh v20.8h, v28.8h, v4.h[5] sqrdmlsh v18.8h, v25.8h, v8.h[0] sqrdmlsh v20.8h, v27.8h, v8.h[0] sshr v18.8h, v18.8h, #1 sshr v20.8h, v20.8h, #1 sub v26.8h, v21.8h, v22.8h sub v28.8h, v23.8h, v24.8h add v21.8h, v21.8h, v22.8h add v23.8h, v23.8h, v24.8h mul v25.8h, v26.8h, v6.h[6] mul v27.8h, v28.8h, v6.h[7] sqrdmulh v22.8h, v26.8h, v4.h[6] sqrdmulh v24.8h, v28.8h, v4.h[7] sqrdmlsh v22.8h, v25.8h, v8.h[0] sqrdmlsh v24.8h, v27.8h, v8.h[0] sshr v22.8h, v22.8h, #1 sshr v24.8h, v24.8h, #1 sub v26.8h, v9.8h, v11.8h sub v28.8h, v10.8h, v12.8h add v9.8h, v9.8h, v11.8h add v10.8h, v10.8h, v12.8h mul v25.8h, v26.8h, v7.h[0] mul v27.8h, v28.8h, v7.h[0] sqrdmulh v11.8h, v26.8h, v5.h[0] sqrdmulh v12.8h, v28.8h, v5.h[0] sqrdmlsh v11.8h, v25.8h, v8.h[0] sqrdmlsh v12.8h, v27.8h, v8.h[0] sshr v11.8h, v11.8h, #1 sshr v12.8h, v12.8h, #1 sub v26.8h, v13.8h, v15.8h 
sub v28.8h, v14.8h, v16.8h add v13.8h, v13.8h, v15.8h add v14.8h, v14.8h, v16.8h mul v25.8h, v26.8h, v7.h[1] mul v27.8h, v28.8h, v7.h[1] sqrdmulh v15.8h, v26.8h, v5.h[1] sqrdmulh v16.8h, v28.8h, v5.h[1] sqrdmlsh v15.8h, v25.8h, v8.h[0] sqrdmlsh v16.8h, v27.8h, v8.h[0] sshr v15.8h, v15.8h, #1 sshr v16.8h, v16.8h, #1 sub v26.8h, v17.8h, v19.8h sub v28.8h, v18.8h, v20.8h add v17.8h, v17.8h, v19.8h add v18.8h, v18.8h, v20.8h mul v25.8h, v26.8h, v7.h[2] mul v27.8h, v28.8h, v7.h[2] sqrdmulh v19.8h, v26.8h, v5.h[2] sqrdmulh v20.8h, v28.8h, v5.h[2] sqrdmlsh v19.8h, v25.8h, v8.h[0] sqrdmlsh v20.8h, v27.8h, v8.h[0] sshr v19.8h, v19.8h, #1 sshr v20.8h, v20.8h, #1 sub v26.8h, v21.8h, v23.8h sub v28.8h, v22.8h, v24.8h add v21.8h, v21.8h, v23.8h add v22.8h, v22.8h, v24.8h mul v25.8h, v26.8h, v7.h[3] mul v27.8h, v28.8h, v7.h[3] sqrdmulh v23.8h, v26.8h, v5.h[3] sqrdmulh v24.8h, v28.8h, v5.h[3] sqrdmlsh v23.8h, v25.8h, v8.h[0] sqrdmlsh v24.8h, v27.8h, v8.h[0] sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 sub v26.8h, v9.8h, v13.8h sub v28.8h, v10.8h, v14.8h add v9.8h, v9.8h, v13.8h add v10.8h, v10.8h, v14.8h mul v25.8h, v26.8h, v7.h[4] mul v27.8h, v28.8h, v7.h[4] sqrdmulh v13.8h, v26.8h, v5.h[4] sqrdmulh v14.8h, v28.8h, v5.h[4] sqrdmlsh v13.8h, v25.8h, v8.h[0] sqrdmlsh v14.8h, v27.8h, v8.h[0] sshr v13.8h, v13.8h, #1 sshr v14.8h, v14.8h, #1 sub v26.8h, v11.8h, v15.8h sub v28.8h, v12.8h, v16.8h add v11.8h, v11.8h, v15.8h add v12.8h, v12.8h, v16.8h mul v25.8h, v26.8h, v7.h[4] mul v27.8h, v28.8h, v7.h[4] sqrdmulh v15.8h, v26.8h, v5.h[4] sqrdmulh v16.8h, v28.8h, v5.h[4] sqrdmlsh v15.8h, v25.8h, v8.h[0] sqrdmlsh v16.8h, v27.8h, v8.h[0] sshr v15.8h, v15.8h, #1 sshr v16.8h, v16.8h, #1 sub v26.8h, v17.8h, v21.8h sub v28.8h, v18.8h, v22.8h add v17.8h, v17.8h, v21.8h add v18.8h, v18.8h, v22.8h mul v25.8h, v26.8h, v7.h[5] mul v27.8h, v28.8h, v7.h[5] sqrdmulh v21.8h, v26.8h, v5.h[5] sqrdmulh v22.8h, v28.8h, v5.h[5] sqrdmlsh v21.8h, v25.8h, v8.h[0] sqrdmlsh v22.8h, v27.8h, v8.h[0] sshr v21.8h, 
v21.8h, #1 sshr v22.8h, v22.8h, #1 sub v26.8h, v19.8h, v23.8h sub v28.8h, v20.8h, v24.8h add v19.8h, v19.8h, v23.8h add v20.8h, v20.8h, v24.8h mul v25.8h, v26.8h, v7.h[5] mul v27.8h, v28.8h, v7.h[5] sqrdmulh v23.8h, v26.8h, v5.h[5] sqrdmulh v24.8h, v28.8h, v5.h[5] sqrdmlsh v23.8h, v25.8h, v8.h[0] sqrdmlsh v24.8h, v27.8h, v8.h[0] sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 sqdmulh v25.8h, v9.8h, v8.h[2] sqdmulh v26.8h, v10.8h, v8.h[2] sshr v25.8h, v25.8h, #11 sshr v26.8h, v26.8h, #11 mls v9.8h, v25.8h, v8.h[0] mls v10.8h, v26.8h, v8.h[0] sqdmulh v25.8h, v11.8h, v8.h[2] sqdmulh v26.8h, v12.8h, v8.h[2] sshr v25.8h, v25.8h, #11 sshr v26.8h, v26.8h, #11 mls v11.8h, v25.8h, v8.h[0] mls v12.8h, v26.8h, v8.h[0] sqdmulh v25.8h, v17.8h, v8.h[2] sqdmulh v26.8h, v18.8h, v8.h[2] sshr v25.8h, v25.8h, #11 sshr v26.8h, v26.8h, #11 mls v17.8h, v25.8h, v8.h[0] mls v18.8h, v26.8h, v8.h[0] sqdmulh v25.8h, v19.8h, v8.h[2] sqdmulh v26.8h, v20.8h, v8.h[2] sshr v25.8h, v25.8h, #11 sshr v26.8h, v26.8h, #11 mls v19.8h, v25.8h, v8.h[0] mls v20.8h, v26.8h, v8.h[0] sub v26.8h, v9.8h, v17.8h sub v28.8h, v10.8h, v18.8h add v9.8h, v9.8h, v17.8h add v10.8h, v10.8h, v18.8h mul v25.8h, v26.8h, v7.h[6] mul v27.8h, v28.8h, v7.h[6] sqrdmulh v17.8h, v26.8h, v5.h[6] sqrdmulh v18.8h, v28.8h, v5.h[6] sqrdmlsh v17.8h, v25.8h, v8.h[0] sqrdmlsh v18.8h, v27.8h, v8.h[0] sshr v17.8h, v17.8h, #1 sshr v18.8h, v18.8h, #1 sub v26.8h, v11.8h, v19.8h sub v28.8h, v12.8h, v20.8h add v11.8h, v11.8h, v19.8h add v12.8h, v12.8h, v20.8h mul v25.8h, v26.8h, v7.h[6] mul v27.8h, v28.8h, v7.h[6] sqrdmulh v19.8h, v26.8h, v5.h[6] sqrdmulh v20.8h, v28.8h, v5.h[6] sqrdmlsh v19.8h, v25.8h, v8.h[0] sqrdmlsh v20.8h, v27.8h, v8.h[0] sshr v19.8h, v19.8h, #1 sshr v20.8h, v20.8h, #1 sub v26.8h, v13.8h, v21.8h sub v28.8h, v14.8h, v22.8h add v13.8h, v13.8h, v21.8h add v14.8h, v14.8h, v22.8h mul v25.8h, v26.8h, v7.h[6] mul v27.8h, v28.8h, v7.h[6] sqrdmulh v21.8h, v26.8h, v5.h[6] sqrdmulh v22.8h, v28.8h, v5.h[6] sqrdmlsh v21.8h, v25.8h, 
v8.h[0] sqrdmlsh v22.8h, v27.8h, v8.h[0] sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 sub v26.8h, v15.8h, v23.8h sub v28.8h, v16.8h, v24.8h add v15.8h, v15.8h, v23.8h add v16.8h, v16.8h, v24.8h mul v25.8h, v26.8h, v7.h[6] mul v27.8h, v28.8h, v7.h[6] sqrdmulh v23.8h, v26.8h, v5.h[6] sqrdmulh v24.8h, v28.8h, v5.h[6] sqrdmlsh v23.8h, v25.8h, v8.h[0] sqrdmlsh v24.8h, v27.8h, v8.h[0] sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 mul v25.8h, v9.8h, v7.h[7] mul v26.8h, v10.8h, v7.h[7] sqrdmulh v9.8h, v9.8h, v5.h[7] sqrdmulh v10.8h, v10.8h, v5.h[7] sqrdmlsh v9.8h, v25.8h, v8.h[0] sqrdmlsh v10.8h, v26.8h, v8.h[0] sshr v9.8h, v9.8h, #1 sshr v10.8h, v10.8h, #1 mul v25.8h, v11.8h, v7.h[7] mul v26.8h, v12.8h, v7.h[7] sqrdmulh v11.8h, v11.8h, v5.h[7] sqrdmulh v12.8h, v12.8h, v5.h[7] sqrdmlsh v11.8h, v25.8h, v8.h[0] sqrdmlsh v12.8h, v26.8h, v8.h[0] sshr v11.8h, v11.8h, #1 sshr v12.8h, v12.8h, #1 mul v25.8h, v13.8h, v7.h[7] mul v26.8h, v14.8h, v7.h[7] sqrdmulh v13.8h, v13.8h, v5.h[7] sqrdmulh v14.8h, v14.8h, v5.h[7] sqrdmlsh v13.8h, v25.8h, v8.h[0] sqrdmlsh v14.8h, v26.8h, v8.h[0] sshr v13.8h, v13.8h, #1 sshr v14.8h, v14.8h, #1 mul v25.8h, v15.8h, v7.h[7] mul v26.8h, v16.8h, v7.h[7] sqrdmulh v15.8h, v15.8h, v5.h[7] sqrdmulh v16.8h, v16.8h, v5.h[7] sqrdmlsh v15.8h, v25.8h, v8.h[0] sqrdmlsh v16.8h, v26.8h, v8.h[0] sshr v15.8h, v15.8h, #1 sshr v16.8h, v16.8h, #1 mul v25.8h, v17.8h, v7.h[7] mul v26.8h, v18.8h, v7.h[7] sqrdmulh v17.8h, v17.8h, v5.h[7] sqrdmulh v18.8h, v18.8h, v5.h[7] sqrdmlsh v17.8h, v25.8h, v8.h[0] sqrdmlsh v18.8h, v26.8h, v8.h[0] sshr v17.8h, v17.8h, #1 sshr v18.8h, v18.8h, #1 mul v25.8h, v19.8h, v7.h[7] mul v26.8h, v20.8h, v7.h[7] sqrdmulh v19.8h, v19.8h, v5.h[7] sqrdmulh v20.8h, v20.8h, v5.h[7] sqrdmlsh v19.8h, v25.8h, v8.h[0] sqrdmlsh v20.8h, v26.8h, v8.h[0] sshr v19.8h, v19.8h, #1 sshr v20.8h, v20.8h, #1 mul v25.8h, v21.8h, v7.h[7] mul v26.8h, v22.8h, v7.h[7] sqrdmulh v21.8h, v21.8h, v5.h[7] sqrdmulh v22.8h, v22.8h, v5.h[7] sqrdmlsh v21.8h, v25.8h, v8.h[0] 
sqrdmlsh v22.8h, v26.8h, v8.h[0] sshr v21.8h, v21.8h, #1 sshr v22.8h, v22.8h, #1 mul v25.8h, v23.8h, v7.h[7] mul v26.8h, v24.8h, v7.h[7] sqrdmulh v23.8h, v23.8h, v5.h[7] sqrdmulh v24.8h, v24.8h, v5.h[7] sqrdmlsh v23.8h, v25.8h, v8.h[0] sqrdmlsh v24.8h, v26.8h, v8.h[0] sshr v23.8h, v23.8h, #1 sshr v24.8h, v24.8h, #1 str q9, [x0, #16] str q10, [x0, #48] str q11, [x0, #80] str q12, [x0, #112] str q13, [x0, #144] str q14, [x0, #176] str q15, [x0, #208] str q16, [x0, #240] str q17, [x1, #16] str q18, [x1, #48] str q19, [x1, #80] str q20, [x1, #112] str q21, [x1, #144] str q22, [x1, #176] str q23, [x1, #208] str q24, [x1, #240] ldp d8, d9, [x29, #16] ldp d10, d11, [x29, #32] ldp d12, d13, [x29, #48] ldp d14, d15, [x29, #64] ldp x29, x30, [sp], #0x50 ret #ifndef __APPLE__ .size mlkem_invntt_sqrdmlsh,.-mlkem_invntt_sqrdmlsh #endif /* __APPLE__ */ #endif /* WOLFSSL_AARCH64_NO_SQRDMLSH */ #ifndef __APPLE__ .text .section .rodata .type L_mlkem_aarch64_zetas_mul, %object .size L_mlkem_aarch64_zetas_mul, 256 #else .section __DATA,__data #endif /* __APPLE__ */ # 8-byte aligned, 64-bit aligned #ifndef __APPLE__ .align 3 #else .p2align 3 #endif /* __APPLE__ */ L_mlkem_aarch64_zetas_mul: .short 0x08b2,0xf74e,0x01ae,0xfe52,0x022b,0xfdd5,0x034b,0xfcb5 .short 0x081e,0xf7e2,0x0367,0xfc99,0x060e,0xf9f2,0x0069,0xff97 .short 0x01a6,0xfe5a,0x024b,0xfdb5,0x00b1,0xff4f,0x0c16,0xf3ea .short 0x0bde,0xf422,0x0b35,0xf4cb,0x0626,0xf9da,0x0675,0xf98b .short 0x0c0b,0xf3f5,0x030a,0xfcf6,0x0487,0xfb79,0x0c6e,0xf392 .short 0x09f8,0xf608,0x05cb,0xfa35,0x0aa7,0xf559,0x045f,0xfba1 .short 0x06cb,0xf935,0x0284,0xfd7c,0x0999,0xf667,0x015d,0xfea3 .short 0x01a2,0xfe5e,0x0149,0xfeb7,0x0c65,0xf39b,0x0cb6,0xf34a .short 0x0331,0xfccf,0x0449,0xfbb7,0x025b,0xfda5,0x0262,0xfd9e .short 0x052a,0xfad6,0x07fc,0xf804,0x0748,0xf8b8,0x0180,0xfe80 .short 0x0842,0xf7be,0x0c79,0xf387,0x04c2,0xfb3e,0x07ca,0xf836 .short 0x0997,0xf669,0x00dc,0xff24,0x085e,0xf7a2,0x0686,0xf97a .short 
0x0860,0xf7a0,0x0707,0xf8f9,0x0803,0xf7fd,0x031a,0xfce6 .short 0x071b,0xf8e5,0x09ab,0xf655,0x099b,0xf665,0x01de,0xfe22 .short 0x0c95,0xf36b,0x0bcd,0xf433,0x03e4,0xfc1c,0x03df,0xfc21 .short 0x03be,0xfc42,0x074d,0xf8b3,0x05f2,0xfa0e,0x065c,0xf9a4 #ifndef __APPLE__ .text .globl mlkem_basemul_mont .type mlkem_basemul_mont,@function .align 2 mlkem_basemul_mont: #else .section __TEXT,__text .globl _mlkem_basemul_mont .p2align 2 _mlkem_basemul_mont: #endif /* __APPLE__ */ stp x29, x30, [sp, #-80]! add x29, sp, #0 stp d8, d9, [x29, #16] stp d10, d11, [x29, #32] stp d12, d13, [x29, #48] stp d14, d15, [x29, #64] #ifndef __APPLE__ adrp x3, L_mlkem_aarch64_zetas_mul add x3, x3, :lo12:L_mlkem_aarch64_zetas_mul #else adrp x3, L_mlkem_aarch64_zetas_mul@PAGE add x3, x3, L_mlkem_aarch64_zetas_mul@PAGEOFF #endif /* __APPLE__ */ #ifndef __APPLE__ adrp x4, L_mlkem_aarch64_consts add x4, x4, :lo12:L_mlkem_aarch64_consts #else adrp x4, L_mlkem_aarch64_consts@PAGE add x4, x4, L_mlkem_aarch64_consts@PAGEOFF #endif /* __APPLE__ */ ldr q1, [x4] ldp q2, q3, [x1] ldp q4, q5, [x1, #32] ldp q6, q7, [x1, #64] ldp q8, q9, [x1, #96] ldp q10, q11, [x2] ldp q12, q13, [x2, #32] ldp q14, q15, [x2, #64] ldp q16, q17, [x2, #96] ldr q0, [x3] uzp1 v18.8h, v2.8h, v3.8h uzp2 v19.8h, v2.8h, v3.8h uzp1 v20.8h, v10.8h, v11.8h uzp2 v21.8h, v10.8h, v11.8h smull v26.4s, v18.4h, v20.4h smull2 v27.4s, v18.8h, v20.8h smull v23.4s, v19.4h, v21.4h smull2 v24.4s, v19.8h, v21.8h xtn v25.4h, v23.4s xtn2 v25.8h, v24.4s mul v25.8h, v25.8h, v1.h[1] smlsl v23.4s, v25.4h, v1.h[0] smlsl2 v24.4s, v25.8h, v1.h[0] shrn v22.4h, v23.4s, #16 shrn2 v22.8h, v24.4s, #16 smlal v26.4s, v22.4h, v0.4h smlal2 v27.4s, v22.8h, v0.8h xtn v24.4h, v26.4s xtn2 v24.8h, v27.4s mul v24.8h, v24.8h, v1.h[1] smlsl v26.4s, v24.4h, v1.h[0] smlsl2 v27.4s, v24.8h, v1.h[0] shrn v22.4h, v26.4s, #16 shrn2 v22.8h, v27.4s, #16 smull v26.4s, v18.4h, v21.4h smull2 v27.4s, v18.8h, v21.8h smlal v26.4s, v19.4h, v20.4h smlal2 v27.4s, v19.8h, v20.8h xtn v24.4h, v26.4s 
xtn2 v24.8h, v27.4s mul v24.8h, v24.8h, v1.h[1] smlsl v26.4s, v24.4h, v1.h[0] smlsl2 v27.4s, v24.8h, v1.h[0] shrn v23.4h, v26.4s, #16 shrn2 v23.8h, v27.4s, #16 zip1 v24.8h, v22.8h, v23.8h zip2 v25.8h, v22.8h, v23.8h stp q24, q25, [x0] ldr q0, [x3, #16] uzp1 v18.8h, v4.8h, v5.8h uzp2 v19.8h, v4.8h, v5.8h uzp1 v20.8h, v12.8h, v13.8h uzp2 v21.8h, v12.8h, v13.8h smull v26.4s, v18.4h, v20.4h smull2 v27.4s, v18.8h, v20.8h smull v23.4s, v19.4h, v21.4h smull2 v24.4s, v19.8h, v21.8h xtn v25.4h, v23.4s xtn2 v25.8h, v24.4s mul v25.8h, v25.8h, v1.h[1] smlsl v23.4s, v25.4h, v1.h[0] smlsl2 v24.4s, v25.8h, v1.h[0] shrn v22.4h, v23.4s, #16 shrn2 v22.8h, v24.4s, #16 smlal v26.4s, v22.4h, v0.4h smlal2 v27.4s, v22.8h, v0.8h xtn v24.4h, v26.4s xtn2 v24.8h, v27.4s mul v24.8h, v24.8h, v1.h[1] smlsl v26.4s, v24.4h, v1.h[0] smlsl2 v27.4s, v24.8h, v1.h[0] shrn v22.4h, v26.4s, #16 shrn2 v22.8h, v27.4s, #16 smull v26.4s, v18.4h, v21.4h smull2 v27.4s, v18.8h, v21.8h smlal v26.4s, v19.4h, v20.4h smlal2 v27.4s, v19.8h, v20.8h xtn v24.4h, v26.4s xtn2 v24.8h, v27.4s mul v24.8h, v24.8h, v1.h[1] smlsl v26.4s, v24.4h, v1.h[0] smlsl2 v27.4s, v24.8h, v1.h[0] shrn v23.4h, v26.4s, #16 shrn2 v23.8h, v27.4s, #16 zip1 v24.8h, v22.8h, v23.8h zip2 v25.8h, v22.8h, v23.8h stp q24, q25, [x0, #32] ldr q0, [x3, #32] uzp1 v18.8h, v6.8h, v7.8h uzp2 v19.8h, v6.8h, v7.8h uzp1 v20.8h, v14.8h, v15.8h uzp2 v21.8h, v14.8h, v15.8h smull v26.4s, v18.4h, v20.4h smull2 v27.4s, v18.8h, v20.8h smull v23.4s, v19.4h, v21.4h smull2 v24.4s, v19.8h, v21.8h xtn v25.4h, v23.4s xtn2 v25.8h, v24.4s mul v25.8h, v25.8h, v1.h[1] smlsl v23.4s, v25.4h, v1.h[0] smlsl2 v24.4s, v25.8h, v1.h[0] shrn v22.4h, v23.4s, #16 shrn2 v22.8h, v24.4s, #16 smlal v26.4s, v22.4h, v0.4h smlal2 v27.4s, v22.8h, v0.8h xtn v24.4h, v26.4s xtn2 v24.8h, v27.4s mul v24.8h, v24.8h, v1.h[1] smlsl v26.4s, v24.4h, v1.h[0] smlsl2 v27.4s, v24.8h, v1.h[0] shrn v22.4h, v26.4s, #16 shrn2 v22.8h, v27.4s, #16 smull v26.4s, v18.4h, v21.4h smull2 v27.4s, v18.8h, v21.8h smlal 
v26.4s, v19.4h, v20.4h smlal2 v27.4s, v19.8h, v20.8h xtn v24.4h, v26.4s xtn2 v24.8h, v27.4s mul v24.8h, v24.8h, v1.h[1] smlsl v26.4s, v24.4h, v1.h[0] smlsl2 v27.4s, v24.8h, v1.h[0] shrn v23.4h, v26.4s, #16 shrn2 v23.8h, v27.4s, #16 zip1 v24.8h, v22.8h, v23.8h zip2 v25.8h, v22.8h, v23.8h stp q24, q25, [x0, #64] ldr q0, [x3, #48] uzp1 v18.8h, v8.8h, v9.8h uzp2 v19.8h, v8.8h, v9.8h uzp1 v20.8h, v16.8h, v17.8h uzp2 v21.8h, v16.8h, v17.8h smull v26.4s, v18.4h, v20.4h smull2 v27.4s, v18.8h, v20.8h smull v23.4s, v19.4h, v21.4h smull2 v24.4s, v19.8h, v21.8h xtn v25.4h, v23.4s xtn2 v25.8h, v24.4s mul v25.8h, v25.8h, v1.h[1] smlsl v23.4s, v25.4h, v1.h[0] smlsl2 v24.4s, v25.8h, v1.h[0] shrn v22.4h, v23.4s, #16 shrn2 v22.8h, v24.4s, #16 smlal v26.4s, v22.4h, v0.4h smlal2 v27.4s, v22.8h, v0.8h xtn v24.4h, v26.4s xtn2 v24.8h, v27.4s mul v24.8h, v24.8h, v1.h[1] smlsl v26.4s, v24.4h, v1.h[0] smlsl2 v27.4s, v24.8h, v1.h[0] shrn v22.4h, v26.4s, #16 shrn2 v22.8h, v27.4s, #16 smull v26.4s, v18.4h, v21.4h smull2 v27.4s, v18.8h, v21.8h smlal v26.4s, v19.4h, v20.4h smlal2 v27.4s, v19.8h, v20.8h xtn v24.4h, v26.4s xtn2 v24.8h, v27.4s mul v24.8h, v24.8h, v1.h[1] smlsl v26.4s, v24.4h, v1.h[0] smlsl2 v27.4s, v24.8h, v1.h[0] shrn v23.4h, v26.4s, #16 shrn2 v23.8h, v27.4s, #16 zip1 v24.8h, v22.8h, v23.8h zip2 v25.8h, v22.8h, v23.8h stp q24, q25, [x0, #96] ldp q2, q3, [x1, #128] ldp q4, q5, [x1, #160] ldp q6, q7, [x1, #192] ldp q8, q9, [x1, #224] ldp q10, q11, [x2, #128] ldp q12, q13, [x2, #160] ldp q14, q15, [x2, #192] ldp q16, q17, [x2, #224] ldr q0, [x3, #64] uzp1 v18.8h, v2.8h, v3.8h uzp2 v19.8h, v2.8h, v3.8h uzp1 v20.8h, v10.8h, v11.8h uzp2 v21.8h, v10.8h, v11.8h smull v26.4s, v18.4h, v20.4h smull2 v27.4s, v18.8h, v20.8h smull v23.4s, v19.4h, v21.4h smull2 v24.4s, v19.8h, v21.8h xtn v25.4h, v23.4s xtn2 v25.8h, v24.4s mul v25.8h, v25.8h, v1.h[1] smlsl v23.4s, v25.4h, v1.h[0] smlsl2 v24.4s, v25.8h, v1.h[0] shrn v22.4h, v23.4s, #16 shrn2 v22.8h, v24.4s, #16 smlal v26.4s, v22.4h, v0.4h smlal2 
v27.4s, v22.8h, v0.8h xtn v24.4h, v26.4s xtn2 v24.8h, v27.4s mul v24.8h, v24.8h, v1.h[1] smlsl v26.4s, v24.4h, v1.h[0] smlsl2 v27.4s, v24.8h, v1.h[0] shrn v22.4h, v26.4s, #16 shrn2 v22.8h, v27.4s, #16 smull v26.4s, v18.4h, v21.4h smull2 v27.4s, v18.8h, v21.8h smlal v26.4s, v19.4h, v20.4h smlal2 v27.4s, v19.8h, v20.8h xtn v24.4h, v26.4s xtn2 v24.8h, v27.4s mul v24.8h, v24.8h, v1.h[1] smlsl v26.4s, v24.4h, v1.h[0] smlsl2 v27.4s, v24.8h, v1.h[0] shrn v23.4h, v26.4s, #16 shrn2 v23.8h, v27.4s, #16 zip1 v24.8h, v22.8h, v23.8h zip2 v25.8h, v22.8h, v23.8h stp q24, q25, [x0, #128] ldr q0, [x3, #80] uzp1 v18.8h, v4.8h, v5.8h uzp2 v19.8h, v4.8h, v5.8h uzp1 v20.8h, v12.8h, v13.8h uzp2 v21.8h, v12.8h, v13.8h smull v26.4s, v18.4h, v20.4h smull2 v27.4s, v18.8h, v20.8h smull v23.4s, v19.4h, v21.4h smull2 v24.4s, v19.8h, v21.8h xtn v25.4h, v23.4s xtn2 v25.8h, v24.4s mul v25.8h, v25.8h, v1.h[1] smlsl v23.4s, v25.4h, v1.h[0] smlsl2 v24.4s, v25.8h, v1.h[0] shrn v22.4h, v23.4s, #16 shrn2 v22.8h, v24.4s, #16 smlal v26.4s, v22.4h, v0.4h smlal2 v27.4s, v22.8h, v0.8h xtn v24.4h, v26.4s xtn2 v24.8h, v27.4s mul v24.8h, v24.8h, v1.h[1] smlsl v26.4s, v24.4h, v1.h[0] smlsl2 v27.4s, v24.8h, v1.h[0] shrn v22.4h, v26.4s, #16 shrn2 v22.8h, v27.4s, #16 smull v26.4s, v18.4h, v21.4h smull2 v27.4s, v18.8h, v21.8h smlal v26.4s, v19.4h, v20.4h smlal2 v27.4s, v19.8h, v20.8h xtn v24.4h, v26.4s xtn2 v24.8h, v27.4s mul v24.8h, v24.8h, v1.h[1] smlsl v26.4s, v24.4h, v1.h[0] smlsl2 v27.4s, v24.8h, v1.h[0] shrn v23.4h, v26.4s, #16 shrn2 v23.8h, v27.4s, #16 zip1 v24.8h, v22.8h, v23.8h zip2 v25.8h, v22.8h, v23.8h stp q24, q25, [x0, #160] ldr q0, [x3, #96] uzp1 v18.8h, v6.8h, v7.8h uzp2 v19.8h, v6.8h, v7.8h uzp1 v20.8h, v14.8h, v15.8h uzp2 v21.8h, v14.8h, v15.8h smull v26.4s, v18.4h, v20.4h smull2 v27.4s, v18.8h, v20.8h smull v23.4s, v19.4h, v21.4h smull2 v24.4s, v19.8h, v21.8h xtn v25.4h, v23.4s xtn2 v25.8h, v24.4s mul v25.8h, v25.8h, v1.h[1] smlsl v23.4s, v25.4h, v1.h[0] smlsl2 v24.4s, v25.8h, v1.h[0] shrn 
v22.4h, v23.4s, #16 shrn2 v22.8h, v24.4s, #16 smlal v26.4s, v22.4h, v0.4h smlal2 v27.4s, v22.8h, v0.8h xtn v24.4h, v26.4s xtn2 v24.8h, v27.4s mul v24.8h, v24.8h, v1.h[1] smlsl v26.4s, v24.4h, v1.h[0] smlsl2 v27.4s, v24.8h, v1.h[0] shrn v22.4h, v26.4s, #16 shrn2 v22.8h, v27.4s, #16 smull v26.4s, v18.4h, v21.4h smull2 v27.4s, v18.8h, v21.8h smlal v26.4s, v19.4h, v20.4h smlal2 v27.4s, v19.8h, v20.8h xtn v24.4h, v26.4s xtn2 v24.8h, v27.4s mul v24.8h, v24.8h, v1.h[1] smlsl v26.4s, v24.4h, v1.h[0] smlsl2 v27.4s, v24.8h, v1.h[0] shrn v23.4h, v26.4s, #16 shrn2 v23.8h, v27.4s, #16 zip1 v24.8h, v22.8h, v23.8h zip2 v25.8h, v22.8h, v23.8h stp q24, q25, [x0, #192] ldr q0, [x3, #112] uzp1 v18.8h, v8.8h, v9.8h uzp2 v19.8h, v8.8h, v9.8h uzp1 v20.8h, v16.8h, v17.8h uzp2 v21.8h, v16.8h, v17.8h smull v26.4s, v18.4h, v20.4h smull2 v27.4s, v18.8h, v20.8h smull v23.4s, v19.4h, v21.4h smull2 v24.4s, v19.8h, v21.8h xtn v25.4h, v23.4s xtn2 v25.8h, v24.4s mul v25.8h, v25.8h, v1.h[1] smlsl v23.4s, v25.4h, v1.h[0] smlsl2 v24.4s, v25.8h, v1.h[0] shrn v22.4h, v23.4s, #16 shrn2 v22.8h, v24.4s, #16 smlal v26.4s, v22.4h, v0.4h smlal2 v27.4s, v22.8h, v0.8h xtn v24.4h, v26.4s xtn2 v24.8h, v27.4s mul v24.8h, v24.8h, v1.h[1] smlsl v26.4s, v24.4h, v1.h[0] smlsl2 v27.4s, v24.8h, v1.h[0] shrn v22.4h, v26.4s, #16 shrn2 v22.8h, v27.4s, #16 smull v26.4s, v18.4h, v21.4h smull2 v27.4s, v18.8h, v21.8h smlal v26.4s, v19.4h, v20.4h smlal2 v27.4s, v19.8h, v20.8h xtn v24.4h, v26.4s xtn2 v24.8h, v27.4s mul v24.8h, v24.8h, v1.h[1] smlsl v26.4s, v24.4h, v1.h[0] smlsl2 v27.4s, v24.8h, v1.h[0] shrn v23.4h, v26.4s, #16 shrn2 v23.8h, v27.4s, #16 zip1 v24.8h, v22.8h, v23.8h zip2 v25.8h, v22.8h, v23.8h stp q24, q25, [x0, #224] ldp q2, q3, [x1, #256] ldp q4, q5, [x1, #288] ldp q6, q7, [x1, #320] ldp q8, q9, [x1, #352] ldp q10, q11, [x2, #256] ldp q12, q13, [x2, #288] ldp q14, q15, [x2, #320] ldp q16, q17, [x2, #352] ldr q0, [x3, #128] uzp1 v18.8h, v2.8h, v3.8h uzp2 v19.8h, v2.8h, v3.8h uzp1 v20.8h, v10.8h, v11.8h uzp2 
v21.8h, v10.8h, v11.8h smull v26.4s, v18.4h, v20.4h smull2 v27.4s, v18.8h, v20.8h smull v23.4s, v19.4h, v21.4h smull2 v24.4s, v19.8h, v21.8h xtn v25.4h, v23.4s xtn2 v25.8h, v24.4s mul v25.8h, v25.8h, v1.h[1] smlsl v23.4s, v25.4h, v1.h[0] smlsl2 v24.4s, v25.8h, v1.h[0] shrn v22.4h, v23.4s, #16 shrn2 v22.8h, v24.4s, #16 smlal v26.4s, v22.4h, v0.4h smlal2 v27.4s, v22.8h, v0.8h xtn v24.4h, v26.4s xtn2 v24.8h, v27.4s mul v24.8h, v24.8h, v1.h[1] smlsl v26.4s, v24.4h, v1.h[0] smlsl2 v27.4s, v24.8h, v1.h[0] shrn v22.4h, v26.4s, #16 shrn2 v22.8h, v27.4s, #16 smull v26.4s, v18.4h, v21.4h smull2 v27.4s, v18.8h, v21.8h smlal v26.4s, v19.4h, v20.4h smlal2 v27.4s, v19.8h, v20.8h xtn v24.4h, v26.4s xtn2 v24.8h, v27.4s mul v24.8h, v24.8h, v1.h[1] smlsl v26.4s, v24.4h, v1.h[0] smlsl2 v27.4s, v24.8h, v1.h[0] shrn v23.4h, v26.4s, #16 shrn2 v23.8h, v27.4s, #16 zip1 v24.8h, v22.8h, v23.8h zip2 v25.8h, v22.8h, v23.8h stp q24, q25, [x0, #256] ldr q0, [x3, #144] uzp1 v18.8h, v4.8h, v5.8h uzp2 v19.8h, v4.8h, v5.8h uzp1 v20.8h, v12.8h, v13.8h uzp2 v21.8h, v12.8h, v13.8h smull v26.4s, v18.4h, v20.4h smull2 v27.4s, v18.8h, v20.8h smull v23.4s, v19.4h, v21.4h smull2 v24.4s, v19.8h, v21.8h xtn v25.4h, v23.4s xtn2 v25.8h, v24.4s mul v25.8h, v25.8h, v1.h[1] smlsl v23.4s, v25.4h, v1.h[0] smlsl2 v24.4s, v25.8h, v1.h[0] shrn v22.4h, v23.4s, #16 shrn2 v22.8h, v24.4s, #16 smlal v26.4s, v22.4h, v0.4h smlal2 v27.4s, v22.8h, v0.8h xtn v24.4h, v26.4s xtn2 v24.8h, v27.4s mul v24.8h, v24.8h, v1.h[1] smlsl v26.4s, v24.4h, v1.h[0] smlsl2 v27.4s, v24.8h, v1.h[0] shrn v22.4h, v26.4s, #16 shrn2 v22.8h, v27.4s, #16 smull v26.4s, v18.4h, v21.4h smull2 v27.4s, v18.8h, v21.8h smlal v26.4s, v19.4h, v20.4h smlal2 v27.4s, v19.8h, v20.8h xtn v24.4h, v26.4s xtn2 v24.8h, v27.4s mul v24.8h, v24.8h, v1.h[1] smlsl v26.4s, v24.4h, v1.h[0] smlsl2 v27.4s, v24.8h, v1.h[0] shrn v23.4h, v26.4s, #16 shrn2 v23.8h, v27.4s, #16 zip1 v24.8h, v22.8h, v23.8h zip2 v25.8h, v22.8h, v23.8h stp q24, q25, [x0, #288] ldr q0, [x3, #160] uzp1 
v18.8h, v6.8h, v7.8h uzp2 v19.8h, v6.8h, v7.8h uzp1 v20.8h, v14.8h, v15.8h uzp2 v21.8h, v14.8h, v15.8h smull v26.4s, v18.4h, v20.4h smull2 v27.4s, v18.8h, v20.8h smull v23.4s, v19.4h, v21.4h smull2 v24.4s, v19.8h, v21.8h xtn v25.4h, v23.4s xtn2 v25.8h, v24.4s mul v25.8h, v25.8h, v1.h[1] smlsl v23.4s, v25.4h, v1.h[0] smlsl2 v24.4s, v25.8h, v1.h[0] shrn v22.4h, v23.4s, #16 shrn2 v22.8h, v24.4s, #16 smlal v26.4s, v22.4h, v0.4h smlal2 v27.4s, v22.8h, v0.8h xtn v24.4h, v26.4s xtn2 v24.8h, v27.4s mul v24.8h, v24.8h, v1.h[1] smlsl v26.4s, v24.4h, v1.h[0] smlsl2 v27.4s, v24.8h, v1.h[0] shrn v22.4h, v26.4s, #16 shrn2 v22.8h, v27.4s, #16 smull v26.4s, v18.4h, v21.4h smull2 v27.4s, v18.8h, v21.8h smlal v26.4s, v19.4h, v20.4h smlal2 v27.4s, v19.8h, v20.8h xtn v24.4h, v26.4s xtn2 v24.8h, v27.4s mul v24.8h, v24.8h, v1.h[1] smlsl v26.4s, v24.4h, v1.h[0] smlsl2 v27.4s, v24.8h, v1.h[0] shrn v23.4h, v26.4s, #16 shrn2 v23.8h, v27.4s, #16 zip1 v24.8h, v22.8h, v23.8h zip2 v25.8h, v22.8h, v23.8h stp q24, q25, [x0, #320] ldr q0, [x3, #176] uzp1 v18.8h, v8.8h, v9.8h uzp2 v19.8h, v8.8h, v9.8h uzp1 v20.8h, v16.8h, v17.8h uzp2 v21.8h, v16.8h, v17.8h smull v26.4s, v18.4h, v20.4h smull2 v27.4s, v18.8h, v20.8h smull v23.4s, v19.4h, v21.4h smull2 v24.4s, v19.8h, v21.8h xtn v25.4h, v23.4s xtn2 v25.8h, v24.4s mul v25.8h, v25.8h, v1.h[1] smlsl v23.4s, v25.4h, v1.h[0] smlsl2 v24.4s, v25.8h, v1.h[0] shrn v22.4h, v23.4s, #16 shrn2 v22.8h, v24.4s, #16 smlal v26.4s, v22.4h, v0.4h smlal2 v27.4s, v22.8h, v0.8h xtn v24.4h, v26.4s xtn2 v24.8h, v27.4s mul v24.8h, v24.8h, v1.h[1] smlsl v26.4s, v24.4h, v1.h[0] smlsl2 v27.4s, v24.8h, v1.h[0] shrn v22.4h, v26.4s, #16 shrn2 v22.8h, v27.4s, #16 smull v26.4s, v18.4h, v21.4h smull2 v27.4s, v18.8h, v21.8h smlal v26.4s, v19.4h, v20.4h smlal2 v27.4s, v19.8h, v20.8h xtn v24.4h, v26.4s xtn2 v24.8h, v27.4s mul v24.8h, v24.8h, v1.h[1] smlsl v26.4s, v24.4h, v1.h[0] smlsl2 v27.4s, v24.8h, v1.h[0] shrn v23.4h, v26.4s, #16 shrn2 v23.8h, v27.4s, #16 zip1 v24.8h, v22.8h, v23.8h 
zip2 v25.8h, v22.8h, v23.8h stp q24, q25, [x0, #352] ldp q2, q3, [x1, #384] ldp q4, q5, [x1, #416] ldp q6, q7, [x1, #448] ldp q8, q9, [x1, #480] ldp q10, q11, [x2, #384] ldp q12, q13, [x2, #416] ldp q14, q15, [x2, #448] ldp q16, q17, [x2, #480] ldr q0, [x3, #192] uzp1 v18.8h, v2.8h, v3.8h uzp2 v19.8h, v2.8h, v3.8h uzp1 v20.8h, v10.8h, v11.8h uzp2 v21.8h, v10.8h, v11.8h smull v26.4s, v18.4h, v20.4h smull2 v27.4s, v18.8h, v20.8h smull v23.4s, v19.4h, v21.4h smull2 v24.4s, v19.8h, v21.8h xtn v25.4h, v23.4s xtn2 v25.8h, v24.4s mul v25.8h, v25.8h, v1.h[1] smlsl v23.4s, v25.4h, v1.h[0] smlsl2 v24.4s, v25.8h, v1.h[0] shrn v22.4h, v23.4s, #16 shrn2 v22.8h, v24.4s, #16 smlal v26.4s, v22.4h, v0.4h smlal2 v27.4s, v22.8h, v0.8h xtn v24.4h, v26.4s xtn2 v24.8h, v27.4s mul v24.8h, v24.8h, v1.h[1] smlsl v26.4s, v24.4h, v1.h[0] smlsl2 v27.4s, v24.8h, v1.h[0] shrn v22.4h, v26.4s, #16 shrn2 v22.8h, v27.4s, #16 smull v26.4s, v18.4h, v21.4h smull2 v27.4s, v18.8h, v21.8h smlal v26.4s, v19.4h, v20.4h smlal2 v27.4s, v19.8h, v20.8h xtn v24.4h, v26.4s xtn2 v24.8h, v27.4s mul v24.8h, v24.8h, v1.h[1] smlsl v26.4s, v24.4h, v1.h[0] smlsl2 v27.4s, v24.8h, v1.h[0] shrn v23.4h, v26.4s, #16 shrn2 v23.8h, v27.4s, #16 zip1 v24.8h, v22.8h, v23.8h zip2 v25.8h, v22.8h, v23.8h stp q24, q25, [x0, #384] ldr q0, [x3, #208] uzp1 v18.8h, v4.8h, v5.8h uzp2 v19.8h, v4.8h, v5.8h uzp1 v20.8h, v12.8h, v13.8h uzp2 v21.8h, v12.8h, v13.8h smull v26.4s, v18.4h, v20.4h smull2 v27.4s, v18.8h, v20.8h smull v23.4s, v19.4h, v21.4h smull2 v24.4s, v19.8h, v21.8h xtn v25.4h, v23.4s xtn2 v25.8h, v24.4s mul v25.8h, v25.8h, v1.h[1] smlsl v23.4s, v25.4h, v1.h[0] smlsl2 v24.4s, v25.8h, v1.h[0] shrn v22.4h, v23.4s, #16 shrn2 v22.8h, v24.4s, #16 smlal v26.4s, v22.4h, v0.4h smlal2 v27.4s, v22.8h, v0.8h xtn v24.4h, v26.4s xtn2 v24.8h, v27.4s mul v24.8h, v24.8h, v1.h[1] smlsl v26.4s, v24.4h, v1.h[0] smlsl2 v27.4s, v24.8h, v1.h[0] shrn v22.4h, v26.4s, #16 shrn2 v22.8h, v27.4s, #16 smull v26.4s, v18.4h, v21.4h smull2 v27.4s, v18.8h, 
v21.8h smlal v26.4s, v19.4h, v20.4h smlal2 v27.4s, v19.8h, v20.8h xtn v24.4h, v26.4s xtn2 v24.8h, v27.4s mul v24.8h, v24.8h, v1.h[1] smlsl v26.4s, v24.4h, v1.h[0] smlsl2 v27.4s, v24.8h, v1.h[0] shrn v23.4h, v26.4s, #16 shrn2 v23.8h, v27.4s, #16 zip1 v24.8h, v22.8h, v23.8h zip2 v25.8h, v22.8h, v23.8h stp q24, q25, [x0, #416] ldr q0, [x3, #224] uzp1 v18.8h, v6.8h, v7.8h uzp2 v19.8h, v6.8h, v7.8h uzp1 v20.8h, v14.8h, v15.8h uzp2 v21.8h, v14.8h, v15.8h smull v26.4s, v18.4h, v20.4h smull2 v27.4s, v18.8h, v20.8h smull v23.4s, v19.4h, v21.4h smull2 v24.4s, v19.8h, v21.8h xtn v25.4h, v23.4s xtn2 v25.8h, v24.4s mul v25.8h, v25.8h, v1.h[1] smlsl v23.4s, v25.4h, v1.h[0] smlsl2 v24.4s, v25.8h, v1.h[0] shrn v22.4h, v23.4s, #16 shrn2 v22.8h, v24.4s, #16 smlal v26.4s, v22.4h, v0.4h smlal2 v27.4s, v22.8h, v0.8h xtn v24.4h, v26.4s xtn2 v24.8h, v27.4s mul v24.8h, v24.8h, v1.h[1] smlsl v26.4s, v24.4h, v1.h[0] smlsl2 v27.4s, v24.8h, v1.h[0] shrn v22.4h, v26.4s, #16 shrn2 v22.8h, v27.4s, #16 smull v26.4s, v18.4h, v21.4h smull2 v27.4s, v18.8h, v21.8h smlal v26.4s, v19.4h, v20.4h smlal2 v27.4s, v19.8h, v20.8h xtn v24.4h, v26.4s xtn2 v24.8h, v27.4s mul v24.8h, v24.8h, v1.h[1] smlsl v26.4s, v24.4h, v1.h[0] smlsl2 v27.4s, v24.8h, v1.h[0] shrn v23.4h, v26.4s, #16 shrn2 v23.8h, v27.4s, #16 zip1 v24.8h, v22.8h, v23.8h zip2 v25.8h, v22.8h, v23.8h stp q24, q25, [x0, #448] ldr q0, [x3, #240] uzp1 v18.8h, v8.8h, v9.8h uzp2 v19.8h, v8.8h, v9.8h uzp1 v20.8h, v16.8h, v17.8h uzp2 v21.8h, v16.8h, v17.8h smull v26.4s, v18.4h, v20.4h smull2 v27.4s, v18.8h, v20.8h smull v23.4s, v19.4h, v21.4h smull2 v24.4s, v19.8h, v21.8h xtn v25.4h, v23.4s xtn2 v25.8h, v24.4s mul v25.8h, v25.8h, v1.h[1] smlsl v23.4s, v25.4h, v1.h[0] smlsl2 v24.4s, v25.8h, v1.h[0] shrn v22.4h, v23.4s, #16 shrn2 v22.8h, v24.4s, #16 smlal v26.4s, v22.4h, v0.4h smlal2 v27.4s, v22.8h, v0.8h xtn v24.4h, v26.4s xtn2 v24.8h, v27.4s mul v24.8h, v24.8h, v1.h[1] smlsl v26.4s, v24.4h, v1.h[0] smlsl2 v27.4s, v24.8h, v1.h[0] shrn v22.4h, v26.4s, #16 
/* Remaining instructions of mlkem_basemul_mont (its start lies above this
 * point): finish the last odd-lane product, store it and return. */
        shrn2   v22.8h, v27.4s, #16
        smull   v26.4s, v18.4h, v21.4h
        smull2  v27.4s, v18.8h, v21.8h
        smlal   v26.4s, v19.4h, v20.4h
        smlal2  v27.4s, v19.8h, v20.8h
        xtn     v24.4h, v26.4s
        xtn2    v24.8h, v27.4s
        mul     v24.8h, v24.8h, v1.h[1]
        smlsl   v26.4s, v24.4h, v1.h[0]
        smlsl2  v27.4s, v24.8h, v1.h[0]
        shrn    v23.4h, v26.4s, #16
        shrn2   v23.8h, v27.4s, #16
        zip1    v24.8h, v22.8h, v23.8h
        zip2    v25.8h, v22.8h, v23.8h
        stp     q24, q25, [x0, #480]
        ldp     d8, d9, [x29, #16]
        ldp     d10, d11, [x29, #32]
        ldp     d12, d13, [x29, #48]
        ldp     d14, d15, [x29, #64]
        ldp     x29, x30, [sp], #0x50
        ret
#ifndef __APPLE__
        .size   mlkem_basemul_mont,.-mlkem_basemul_mont
#endif /* __APPLE__ */

/* ------------------------------------------------------------------------
 * Assembler macros used by mlkem_basemul_mont_add, mlkem_csubq_neon and
 * mlkem_add_reduce. Each macro expands to the same instruction sequence
 * that would otherwise be written out by hand. All of them are purged
 * again after mlkem_add_reduce.
 * ------------------------------------------------------------------------ */

/* Open an 80-byte frame: x29/x30 at [sp], d8-d15 at [x29 + 16 .. 79].
 * d8-d15 are saved because the routines below overwrite v8-v15. */
.macro mlkem_m_frame_enter
        stp     x29, x30, [sp, #-80]!
        add     x29, sp, #0
        stp     d8, d9, [x29, #16]
        stp     d10, d11, [x29, #32]
        stp     d12, d13, [x29, #48]
        stp     d14, d15, [x29, #64]
.endm

/* Restore d8-d15 and x29/x30 and release the frame (no ret). */
.macro mlkem_m_frame_leave
        ldp     d8, d9, [x29, #16]
        ldp     d10, d11, [x29, #32]
        ldp     d12, d13, [x29, #48]
        ldp     d14, d15, [x29, #64]
        ldp     x29, x30, [sp], #0x50
.endm

/* reg = address of sym, PC-relative (ELF :lo12: or Mach-O @PAGE/@PAGEOFF). */
.macro mlkem_m_adr reg, sym
#ifndef __APPLE__
        adrp    \reg, \sym
        add     \reg, \reg, :lo12:\sym
#else
        adrp    \reg, \sym@PAGE
        add     \reg, \reg, \sym@PAGEOFF
#endif /* __APPLE__ */
.endm

/* Montgomery reduction of eight 32-bit values held in v<wlo>.4s / v<whi>.4s.
 *   t   = low 16 bits of each value, times v1.h[1]   (0xf301 = q^-1 mod 2^16)
 *   val = val - t * v1.h[0]                          (0x0d01 = q)
 *   out = val >> 16, narrowed into v<out>.8h
 * v<tmp> is scratch; v<wlo>/v<whi> are modified. */
.macro mlkem_m_mont_red wlo, whi, tmp, out
        xtn     v\tmp\().4h, v\wlo\().4s
        xtn2    v\tmp\().8h, v\whi\().4s
        mul     v\tmp\().8h, v\tmp\().8h, v1.h[1]
        smlsl   v\wlo\().4s, v\tmp\().4h, v1.h[0]
        smlsl2  v\whi\().4s, v\tmp\().8h, v1.h[0]
        shrn    v\out\().4h, v\wlo\().4s, #16
        shrn2   v\out\().8h, v\whi\().4s, #16
.endm

/* Load 128 bytes from x1 into v2-v9 and 128 bytes from x2 into v10-v17.
 * o0..o3 are the byte offsets of the four 32-byte pairs. */
.macro mlkem_m_load_ab o0, o1, o2, o3
        ldp     q2, q3, [x1, #\o0]
        ldp     q4, q5, [x1, #\o1]
        ldp     q6, q7, [x1, #\o2]
        ldp     q8, q9, [x1, #\o3]
        ldp     q10, q11, [x2, #\o0]
        ldp     q12, q13, [x2, #\o1]
        ldp     q14, q15, [x2, #\o2]
        ldp     q16, q17, [x2, #\o3]
.endm

/* One 16-coefficient step of the multiply-accumulate.
 *   v<alo>:v<ahi>  16 halfwords of the x1 operand
 *   v<blo>:v<bhi>  16 halfwords of the x2 operand
 *   zoff           byte offset of 8 multipliers in the table at x3
 *   roff           byte offset of the 32-byte accumulator slot at x0
 * With even/odd meaning even-/odd-indexed halfwords and mont() the
 * reduction implemented by mlkem_m_mont_red:
 *   even = mont(a_even * b_even + mont(a_odd * b_odd) * table)
 *   odd  = mont(a_even * b_odd  + a_odd * b_even)
 * The re-interleaved result is added to the values already at x0 + roff.
 * Scratch: v0, v18-v29. */
.macro mlkem_m_bmul_acc alo, ahi, blo, bhi, zoff, roff
        ldp     q28, q29, [x0, #\roff]
        ldr     q0, [x3, #\zoff]
        /* Split both operands into even (v18, v20) and odd (v19, v21). */
        uzp1    v18.8h, v\alo\().8h, v\ahi\().8h
        uzp2    v19.8h, v\alo\().8h, v\ahi\().8h
        uzp1    v20.8h, v\blo\().8h, v\bhi\().8h
        uzp2    v21.8h, v\blo\().8h, v\bhi\().8h
        smull   v26.4s, v18.4h, v20.4h
        smull2  v27.4s, v18.8h, v20.8h
        smull   v23.4s, v19.4h, v21.4h
        smull2  v24.4s, v19.8h, v21.8h
        mlkem_m_mont_red 23, 24, 25, 22
        smlal   v26.4s, v22.4h, v0.4h
        smlal2  v27.4s, v22.8h, v0.8h
        mlkem_m_mont_red 26, 27, 24, 22
        smull   v26.4s, v18.4h, v21.4h
        smull2  v27.4s, v18.8h, v21.8h
        smlal   v26.4s, v19.4h, v20.4h
        smlal2  v27.4s, v19.8h, v20.8h
        mlkem_m_mont_red 26, 27, 24, 23
        /* Interleave even/odd results and accumulate into memory. */
        zip1    v24.8h, v22.8h, v23.8h
        zip2    v25.8h, v22.8h, v23.8h
        add     v28.8h, v28.8h, v24.8h
        add     v29.8h, v29.8h, v25.8h
        stp     q28, q29, [x0, #\roff]
.endm

/* Subtract v20 from four vectors of halfwords. */
.macro mlkem_m_subq4 ra, rb, rc, rd
        sub     v\ra\().8h, v\ra\().8h, v20.8h
        sub     v\rb\().8h, v\rb\().8h, v20.8h
        sub     v\rc\().8h, v\rc\().8h, v20.8h
        sub     v\rd\().8h, v\rd\().8h, v20.8h
.endm

/* For four vectors: add v20 back to every halfword that is negative
 * (mask = sign bit replicated by the arithmetic shift, ANDed with v20).
 * Scratch: v16-v19. */
.macro mlkem_m_csubq4 ra, rb, rc, rd
        sshr    v16.8h, v\ra\().8h, #15
        sshr    v17.8h, v\rb\().8h, #15
        sshr    v18.8h, v\rc\().8h, #15
        sshr    v19.8h, v\rd\().8h, #15
        and     v16.16b, v16.16b, v20.16b
        and     v17.16b, v17.16b, v20.16b
        and     v18.16b, v18.16b, v20.16b
        and     v19.16b, v19.16b, v20.16b
        add     v\ra\().8h, v\ra\().8h, v16.8h
        add     v\rb\().8h, v\rb\().8h, v17.8h
        add     v\rc\().8h, v\rc\().8h, v18.8h
        add     v\rd\().8h, v\rd\().8h, v19.8h
.endm

/* Barrett-style reduction of two vectors of halfwords:
 *   t = ((2 * a * v0.h[2]) >> 16) >> 11      (v0.h[2] = 0x4ebf)
 *   a = a - t * v0.h[0]                      (v0.h[0] = 0x0d01)
 * Scratch: v17, v18. */
.macro mlkem_m_barrett2 ra, rb
        sqdmulh v17.8h, v\ra\().8h, v0.h[2]
        sqdmulh v18.8h, v\rb\().8h, v0.h[2]
        sshr    v17.8h, v17.8h, #11
        sshr    v18.8h, v18.8h, #11
        mls     v\ra\().8h, v17.8h, v0.h[0]
        mls     v\rb\().8h, v18.8h, v0.h[0]
.endm

/* ------------------------------------------------------------------------
 * mlkem_basemul_mont_add
 *
 * In:  x0  512 bytes (256 halfwords), read, accumulated into, written back
 *      x1  512 bytes (256 halfwords), first operand, read only
 *      x2  512 bytes (256 halfwords), second operand, read only
 *      (presumably r, a, b of the C prototype - confirm against the header)
 * Uses x3 = L_mlkem_aarch64_zetas_mul, x4 = L_mlkem_aarch64_consts,
 * v1 = the 8 constants at x4, v0 and v2-v29 as working registers.
 * Work is done in 4 groups of 128 bytes, each split into four
 * 16-halfword steps (see mlkem_m_bmul_acc).
 * ------------------------------------------------------------------------ */
#ifndef __APPLE__
        .text
        .globl  mlkem_basemul_mont_add
        .type   mlkem_basemul_mont_add,@function
        .align  2
mlkem_basemul_mont_add:
#else
        .section        __TEXT,__text
        .globl  _mlkem_basemul_mont_add
        .p2align        2
_mlkem_basemul_mont_add:
#endif /* __APPLE__ */
        mlkem_m_frame_enter
        mlkem_m_adr x3, L_mlkem_aarch64_zetas_mul
        mlkem_m_adr x4, L_mlkem_aarch64_consts
        ldr     q1, [x4]

        /* Halfwords 0..63 */
        mlkem_m_load_ab 0, 32, 64, 96
        mlkem_m_bmul_acc 2, 3, 10, 11, 0, 0
        mlkem_m_bmul_acc 4, 5, 12, 13, 16, 32
        mlkem_m_bmul_acc 6, 7, 14, 15, 32, 64
        mlkem_m_bmul_acc 8, 9, 16, 17, 48, 96

        /* Halfwords 64..127 */
        mlkem_m_load_ab 128, 160, 192, 224
        mlkem_m_bmul_acc 2, 3, 10, 11, 64, 128
        mlkem_m_bmul_acc 4, 5, 12, 13, 80, 160
        mlkem_m_bmul_acc 6, 7, 14, 15, 96, 192
        mlkem_m_bmul_acc 8, 9, 16, 17, 112, 224

        /* Halfwords 128..191 */
        mlkem_m_load_ab 256, 288, 320, 352
        mlkem_m_bmul_acc 2, 3, 10, 11, 128, 256
        mlkem_m_bmul_acc 4, 5, 12, 13, 144, 288
        mlkem_m_bmul_acc 6, 7, 14, 15, 160, 320
        mlkem_m_bmul_acc 8, 9, 16, 17, 176, 352

        /* Halfwords 192..255 */
        mlkem_m_load_ab 384, 416, 448, 480
        mlkem_m_bmul_acc 2, 3, 10, 11, 192, 384
        mlkem_m_bmul_acc 4, 5, 12, 13, 208, 416
        mlkem_m_bmul_acc 6, 7, 14, 15, 224, 448
        mlkem_m_bmul_acc 8, 9, 16, 17, 240, 480

        mlkem_m_frame_leave
        ret
#ifndef __APPLE__
        .size   mlkem_basemul_mont_add,.-mlkem_basemul_mont_add
#endif /* __APPLE__ */

/* Eight halfword copies of 0x0d01 (3329), loaded as one 128-bit vector by
 * mlkem_csubq_neon. */
#ifndef __APPLE__
        .text
        .section        .rodata
        .type   L_mlkem_aarch64_q, %object
        .size   L_mlkem_aarch64_q, 16
#else
        .section        __DATA,__data
#endif /* __APPLE__ */
        /* Aligned to 8 bytes (64 bits). */
#ifndef __APPLE__
        .align  3
#else
        .p2align        3
#endif /* __APPLE__ */
L_mlkem_aarch64_q:
        .short  0x0d01,0x0d01,0x0d01,0x0d01,0x0d01,0x0d01,0x0d01,0x0d01

/* ------------------------------------------------------------------------
 * mlkem_csubq_neon
 *
 * In:  x0  512 bytes (256 halfwords), updated in place
 * Every halfword has 0x0d01 subtracted; where the signed result is
 * negative, 0x0d01 is added back. No branches depend on the data.
 * Two passes of 256 bytes; on return x0 has advanced by 512.
 * Uses x1, v0-v20.
 * ------------------------------------------------------------------------ */
#ifndef __APPLE__
        .text
        .globl  mlkem_csubq_neon
        .type   mlkem_csubq_neon,@function
        .align  2
mlkem_csubq_neon:
#else
        .section        __TEXT,__text
        .globl  _mlkem_csubq_neon
        .p2align        2
_mlkem_csubq_neon:
#endif /* __APPLE__ */
        mlkem_m_frame_enter
        mlkem_m_adr x1, L_mlkem_aarch64_q
        ldr     q20, [x1]
.rept 2
        /* Load 256 bytes, then step x0 back so the stores overwrite them. */
        ld4     {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #0x40
        ld4     {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #0x40
        ld4     {v8.8h, v9.8h, v10.8h, v11.8h}, [x0], #0x40
        ld4     {v12.8h, v13.8h, v14.8h, v15.8h}, [x0], #0x40
        sub     x0, x0, #0x100
        mlkem_m_subq4 0, 1, 2, 3
        mlkem_m_subq4 4, 5, 6, 7
        mlkem_m_subq4 8, 9, 10, 11
        mlkem_m_subq4 12, 13, 14, 15
        mlkem_m_csubq4 0, 1, 2, 3
        mlkem_m_csubq4 4, 5, 6, 7
        mlkem_m_csubq4 8, 9, 10, 11
        mlkem_m_csubq4 12, 13, 14, 15
        st4     {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #0x40
        st4     {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #0x40
        st4     {v8.8h, v9.8h, v10.8h, v11.8h}, [x0], #0x40
        st4     {v12.8h, v13.8h, v14.8h, v15.8h}, [x0], #0x40
.endr
        mlkem_m_frame_leave
        ret
#ifndef __APPLE__
        .size   mlkem_csubq_neon,.-mlkem_csubq_neon
#endif /* __APPLE__ */

/* ------------------------------------------------------------------------
 * mlkem_add_reduce
 *
 * In:  x0  512 bytes (256 halfwords), updated in place
 *      x1  512 bytes (256 halfwords), read only
 * Each halfword at x0 becomes the reduced sum of itself and the matching
 * halfword at x1 (reduction as in mlkem_m_barrett2, constants taken from
 * L_mlkem_aarch64_consts). Four passes of 128 bytes; on return x0 and x1
 * have both advanced by 512. Uses x2, v0-v18.
 * ------------------------------------------------------------------------ */
#ifndef __APPLE__
        .text
        .globl  mlkem_add_reduce
        .type   mlkem_add_reduce,@function
        .align  2
mlkem_add_reduce:
#else
        .section        __TEXT,__text
        .globl  _mlkem_add_reduce
        .p2align        2
_mlkem_add_reduce:
#endif /* __APPLE__ */
        mlkem_m_frame_enter
        mlkem_m_adr x2, L_mlkem_aarch64_consts
        ldr     q0, [x2]
.rept 4
        ld4     {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40
        ld4     {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40
        ld4     {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #0x40
        ld4     {v13.8h, v14.8h, v15.8h, v16.8h}, [x1], #0x40
        /* Rewind the destination so the results replace what was read. */
        sub     x0, x0, #0x80
        add     v1.8h, v1.8h, v9.8h
        add     v2.8h, v2.8h, v10.8h
        add     v3.8h, v3.8h, v11.8h
        add     v4.8h, v4.8h, v12.8h
        add     v5.8h, v5.8h, v13.8h
        add     v6.8h, v6.8h, v14.8h
        add     v7.8h, v7.8h, v15.8h
        add     v8.8h, v8.8h, v16.8h
        mlkem_m_barrett2 1, 2
        mlkem_m_barrett2 3, 4
        mlkem_m_barrett2 5, 6
        mlkem_m_barrett2 7, 8
        st4     {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40
        st4     {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40
.endr
        mlkem_m_frame_leave
        ret
#ifndef __APPLE__
        .size   mlkem_add_reduce,.-mlkem_add_reduce
#endif /* __APPLE__ */

/* The helper macros are not needed past this point. */
.purgem mlkem_m_frame_enter
.purgem mlkem_m_frame_leave
.purgem mlkem_m_adr
.purgem mlkem_m_mont_red
.purgem mlkem_m_load_ab
.purgem mlkem_m_bmul_acc
.purgem mlkem_m_subq4
.purgem mlkem_m_csubq4
.purgem mlkem_m_barrett2

#ifndef __APPLE__
        .text
        .globl  mlkem_add3_reduce
        .type   mlkem_add3_reduce,@function
        .align  2
mlkem_add3_reduce:
#else
        .section        __TEXT,__text
        .globl  _mlkem_add3_reduce
        .p2align        2
_mlkem_add3_reduce:
#endif /* __APPLE__ */
        stp     x29, x30, [sp, #-80]!
add x29, sp, #0 stp d8, d9, [x29, #16] stp d10, d11, [x29, #32] stp d12, d13, [x29, #48] stp d14, d15, [x29, #64] #ifndef __APPLE__ adrp x3, L_mlkem_aarch64_consts add x3, x3, :lo12:L_mlkem_aarch64_consts #else adrp x3, L_mlkem_aarch64_consts@PAGE add x3, x3, L_mlkem_aarch64_consts@PAGEOFF #endif /* __APPLE__ */ ldr q0, [x3] ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #0x40 ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x1], #0x40 ld4 {v17.8h, v18.8h, v19.8h, v20.8h}, [x2], #0x40 ld4 {v21.8h, v22.8h, v23.8h, v24.8h}, [x2], #0x40 sub x0, x0, #0x80 add v1.8h, v1.8h, v9.8h add v2.8h, v2.8h, v10.8h add v3.8h, v3.8h, v11.8h add v4.8h, v4.8h, v12.8h add v5.8h, v5.8h, v13.8h add v6.8h, v6.8h, v14.8h add v7.8h, v7.8h, v15.8h add v8.8h, v8.8h, v16.8h add v1.8h, v1.8h, v17.8h add v2.8h, v2.8h, v18.8h add v3.8h, v3.8h, v19.8h add v4.8h, v4.8h, v20.8h add v5.8h, v5.8h, v21.8h add v6.8h, v6.8h, v22.8h add v7.8h, v7.8h, v23.8h add v8.8h, v8.8h, v24.8h sqdmulh v25.8h, v1.8h, v0.h[2] sqdmulh v26.8h, v2.8h, v0.h[2] sshr v25.8h, v25.8h, #11 sshr v26.8h, v26.8h, #11 mls v1.8h, v25.8h, v0.h[0] mls v2.8h, v26.8h, v0.h[0] sqdmulh v25.8h, v3.8h, v0.h[2] sqdmulh v26.8h, v4.8h, v0.h[2] sshr v25.8h, v25.8h, #11 sshr v26.8h, v26.8h, #11 mls v3.8h, v25.8h, v0.h[0] mls v4.8h, v26.8h, v0.h[0] sqdmulh v25.8h, v5.8h, v0.h[2] sqdmulh v26.8h, v6.8h, v0.h[2] sshr v25.8h, v25.8h, #11 sshr v26.8h, v26.8h, #11 mls v5.8h, v25.8h, v0.h[0] mls v6.8h, v26.8h, v0.h[0] sqdmulh v25.8h, v7.8h, v0.h[2] sqdmulh v26.8h, v8.8h, v0.h[2] sshr v25.8h, v25.8h, #11 sshr v26.8h, v26.8h, #11 mls v7.8h, v25.8h, v0.h[0] mls v8.8h, v26.8h, v0.h[0] st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #0x40 ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x1], #0x40 ld4 {v17.8h, 
v18.8h, v19.8h, v20.8h}, [x2], #0x40 ld4 {v21.8h, v22.8h, v23.8h, v24.8h}, [x2], #0x40 sub x0, x0, #0x80 add v1.8h, v1.8h, v9.8h add v2.8h, v2.8h, v10.8h add v3.8h, v3.8h, v11.8h add v4.8h, v4.8h, v12.8h add v5.8h, v5.8h, v13.8h add v6.8h, v6.8h, v14.8h add v7.8h, v7.8h, v15.8h add v8.8h, v8.8h, v16.8h add v1.8h, v1.8h, v17.8h add v2.8h, v2.8h, v18.8h add v3.8h, v3.8h, v19.8h add v4.8h, v4.8h, v20.8h add v5.8h, v5.8h, v21.8h add v6.8h, v6.8h, v22.8h add v7.8h, v7.8h, v23.8h add v8.8h, v8.8h, v24.8h sqdmulh v25.8h, v1.8h, v0.h[2] sqdmulh v26.8h, v2.8h, v0.h[2] sshr v25.8h, v25.8h, #11 sshr v26.8h, v26.8h, #11 mls v1.8h, v25.8h, v0.h[0] mls v2.8h, v26.8h, v0.h[0] sqdmulh v25.8h, v3.8h, v0.h[2] sqdmulh v26.8h, v4.8h, v0.h[2] sshr v25.8h, v25.8h, #11 sshr v26.8h, v26.8h, #11 mls v3.8h, v25.8h, v0.h[0] mls v4.8h, v26.8h, v0.h[0] sqdmulh v25.8h, v5.8h, v0.h[2] sqdmulh v26.8h, v6.8h, v0.h[2] sshr v25.8h, v25.8h, #11 sshr v26.8h, v26.8h, #11 mls v5.8h, v25.8h, v0.h[0] mls v6.8h, v26.8h, v0.h[0] sqdmulh v25.8h, v7.8h, v0.h[2] sqdmulh v26.8h, v8.8h, v0.h[2] sshr v25.8h, v25.8h, #11 sshr v26.8h, v26.8h, #11 mls v7.8h, v25.8h, v0.h[0] mls v8.8h, v26.8h, v0.h[0] st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #0x40 ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x1], #0x40 ld4 {v17.8h, v18.8h, v19.8h, v20.8h}, [x2], #0x40 ld4 {v21.8h, v22.8h, v23.8h, v24.8h}, [x2], #0x40 sub x0, x0, #0x80 add v1.8h, v1.8h, v9.8h add v2.8h, v2.8h, v10.8h add v3.8h, v3.8h, v11.8h add v4.8h, v4.8h, v12.8h add v5.8h, v5.8h, v13.8h add v6.8h, v6.8h, v14.8h add v7.8h, v7.8h, v15.8h add v8.8h, v8.8h, v16.8h add v1.8h, v1.8h, v17.8h add v2.8h, v2.8h, v18.8h add v3.8h, v3.8h, v19.8h add v4.8h, v4.8h, v20.8h add v5.8h, v5.8h, v21.8h add v6.8h, v6.8h, v22.8h add v7.8h, v7.8h, v23.8h add v8.8h, v8.8h, v24.8h sqdmulh v25.8h, v1.8h, v0.h[2] 
sqdmulh v26.8h, v2.8h, v0.h[2] sshr v25.8h, v25.8h, #11 sshr v26.8h, v26.8h, #11 mls v1.8h, v25.8h, v0.h[0] mls v2.8h, v26.8h, v0.h[0] sqdmulh v25.8h, v3.8h, v0.h[2] sqdmulh v26.8h, v4.8h, v0.h[2] sshr v25.8h, v25.8h, #11 sshr v26.8h, v26.8h, #11 mls v3.8h, v25.8h, v0.h[0] mls v4.8h, v26.8h, v0.h[0] sqdmulh v25.8h, v5.8h, v0.h[2] sqdmulh v26.8h, v6.8h, v0.h[2] sshr v25.8h, v25.8h, #11 sshr v26.8h, v26.8h, #11 mls v5.8h, v25.8h, v0.h[0] mls v6.8h, v26.8h, v0.h[0] sqdmulh v25.8h, v7.8h, v0.h[2] sqdmulh v26.8h, v8.8h, v0.h[2] sshr v25.8h, v25.8h, #11 sshr v26.8h, v26.8h, #11 mls v7.8h, v25.8h, v0.h[0] mls v8.8h, v26.8h, v0.h[0] st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #0x40 ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x1], #0x40 ld4 {v17.8h, v18.8h, v19.8h, v20.8h}, [x2], #0x40 ld4 {v21.8h, v22.8h, v23.8h, v24.8h}, [x2], #0x40 sub x0, x0, #0x80 add v1.8h, v1.8h, v9.8h add v2.8h, v2.8h, v10.8h add v3.8h, v3.8h, v11.8h add v4.8h, v4.8h, v12.8h add v5.8h, v5.8h, v13.8h add v6.8h, v6.8h, v14.8h add v7.8h, v7.8h, v15.8h add v8.8h, v8.8h, v16.8h add v1.8h, v1.8h, v17.8h add v2.8h, v2.8h, v18.8h add v3.8h, v3.8h, v19.8h add v4.8h, v4.8h, v20.8h add v5.8h, v5.8h, v21.8h add v6.8h, v6.8h, v22.8h add v7.8h, v7.8h, v23.8h add v8.8h, v8.8h, v24.8h sqdmulh v25.8h, v1.8h, v0.h[2] sqdmulh v26.8h, v2.8h, v0.h[2] sshr v25.8h, v25.8h, #11 sshr v26.8h, v26.8h, #11 mls v1.8h, v25.8h, v0.h[0] mls v2.8h, v26.8h, v0.h[0] sqdmulh v25.8h, v3.8h, v0.h[2] sqdmulh v26.8h, v4.8h, v0.h[2] sshr v25.8h, v25.8h, #11 sshr v26.8h, v26.8h, #11 mls v3.8h, v25.8h, v0.h[0] mls v4.8h, v26.8h, v0.h[0] sqdmulh v25.8h, v5.8h, v0.h[2] sqdmulh v26.8h, v6.8h, v0.h[2] sshr v25.8h, v25.8h, #11 sshr v26.8h, v26.8h, #11 mls v5.8h, v25.8h, v0.h[0] mls v6.8h, v26.8h, v0.h[0] sqdmulh v25.8h, v7.8h, v0.h[2] sqdmulh v26.8h, v8.8h, v0.h[2] sshr 
v25.8h, v25.8h, #11 sshr v26.8h, v26.8h, #11 mls v7.8h, v25.8h, v0.h[0] mls v8.8h, v26.8h, v0.h[0] st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 ldp d8, d9, [x29, #16] ldp d10, d11, [x29, #32] ldp d12, d13, [x29, #48] ldp d14, d15, [x29, #64] ldp x29, x30, [sp], #0x50 ret #ifndef __APPLE__ .size mlkem_add3_reduce,.-mlkem_add3_reduce #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl mlkem_rsub_reduce .type mlkem_rsub_reduce,@function .align 2 mlkem_rsub_reduce: #else .section __TEXT,__text .globl _mlkem_rsub_reduce .p2align 2 _mlkem_rsub_reduce: #endif /* __APPLE__ */ stp x29, x30, [sp, #-80]! add x29, sp, #0 stp d8, d9, [x29, #16] stp d10, d11, [x29, #32] stp d12, d13, [x29, #48] stp d14, d15, [x29, #64] #ifndef __APPLE__ adrp x2, L_mlkem_aarch64_consts add x2, x2, :lo12:L_mlkem_aarch64_consts #else adrp x2, L_mlkem_aarch64_consts@PAGE add x2, x2, L_mlkem_aarch64_consts@PAGEOFF #endif /* __APPLE__ */ ldr q0, [x2] ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #0x40 ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x1], #0x40 sub x0, x0, #0x80 sub v1.8h, v9.8h, v1.8h sub v2.8h, v10.8h, v2.8h sub v3.8h, v11.8h, v3.8h sub v4.8h, v12.8h, v4.8h sub v5.8h, v13.8h, v5.8h sub v6.8h, v14.8h, v6.8h sub v7.8h, v15.8h, v7.8h sub v8.8h, v16.8h, v8.8h sqdmulh v17.8h, v1.8h, v0.h[2] sqdmulh v18.8h, v2.8h, v0.h[2] sshr v17.8h, v17.8h, #11 sshr v18.8h, v18.8h, #11 mls v1.8h, v17.8h, v0.h[0] mls v2.8h, v18.8h, v0.h[0] sqdmulh v17.8h, v3.8h, v0.h[2] sqdmulh v18.8h, v4.8h, v0.h[2] sshr v17.8h, v17.8h, #11 sshr v18.8h, v18.8h, #11 mls v3.8h, v17.8h, v0.h[0] mls v4.8h, v18.8h, v0.h[0] sqdmulh v17.8h, v5.8h, v0.h[2] sqdmulh v18.8h, v6.8h, v0.h[2] sshr v17.8h, v17.8h, #11 sshr v18.8h, v18.8h, #11 mls v5.8h, v17.8h, v0.h[0] mls v6.8h, v18.8h, v0.h[0] sqdmulh v17.8h, v7.8h, v0.h[2] sqdmulh v18.8h, v8.8h, v0.h[2] sshr v17.8h, v17.8h, #11 sshr v18.8h, v18.8h, #11 mls 
v7.8h, v17.8h, v0.h[0] mls v8.8h, v18.8h, v0.h[0] st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #0x40 ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x1], #0x40 sub x0, x0, #0x80 sub v1.8h, v9.8h, v1.8h sub v2.8h, v10.8h, v2.8h sub v3.8h, v11.8h, v3.8h sub v4.8h, v12.8h, v4.8h sub v5.8h, v13.8h, v5.8h sub v6.8h, v14.8h, v6.8h sub v7.8h, v15.8h, v7.8h sub v8.8h, v16.8h, v8.8h sqdmulh v17.8h, v1.8h, v0.h[2] sqdmulh v18.8h, v2.8h, v0.h[2] sshr v17.8h, v17.8h, #11 sshr v18.8h, v18.8h, #11 mls v1.8h, v17.8h, v0.h[0] mls v2.8h, v18.8h, v0.h[0] sqdmulh v17.8h, v3.8h, v0.h[2] sqdmulh v18.8h, v4.8h, v0.h[2] sshr v17.8h, v17.8h, #11 sshr v18.8h, v18.8h, #11 mls v3.8h, v17.8h, v0.h[0] mls v4.8h, v18.8h, v0.h[0] sqdmulh v17.8h, v5.8h, v0.h[2] sqdmulh v18.8h, v6.8h, v0.h[2] sshr v17.8h, v17.8h, #11 sshr v18.8h, v18.8h, #11 mls v5.8h, v17.8h, v0.h[0] mls v6.8h, v18.8h, v0.h[0] sqdmulh v17.8h, v7.8h, v0.h[2] sqdmulh v18.8h, v8.8h, v0.h[2] sshr v17.8h, v17.8h, #11 sshr v18.8h, v18.8h, #11 mls v7.8h, v17.8h, v0.h[0] mls v8.8h, v18.8h, v0.h[0] st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #0x40 ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x1], #0x40 sub x0, x0, #0x80 sub v1.8h, v9.8h, v1.8h sub v2.8h, v10.8h, v2.8h sub v3.8h, v11.8h, v3.8h sub v4.8h, v12.8h, v4.8h sub v5.8h, v13.8h, v5.8h sub v6.8h, v14.8h, v6.8h sub v7.8h, v15.8h, v7.8h sub v8.8h, v16.8h, v8.8h sqdmulh v17.8h, v1.8h, v0.h[2] sqdmulh v18.8h, v2.8h, v0.h[2] sshr v17.8h, v17.8h, #11 sshr v18.8h, v18.8h, #11 mls v1.8h, v17.8h, v0.h[0] mls v2.8h, v18.8h, v0.h[0] sqdmulh v17.8h, v3.8h, v0.h[2] sqdmulh v18.8h, v4.8h, v0.h[2] sshr v17.8h, v17.8h, #11 sshr v18.8h, v18.8h, #11 mls 
v3.8h, v17.8h, v0.h[0] mls v4.8h, v18.8h, v0.h[0] sqdmulh v17.8h, v5.8h, v0.h[2] sqdmulh v18.8h, v6.8h, v0.h[2] sshr v17.8h, v17.8h, #11 sshr v18.8h, v18.8h, #11 mls v5.8h, v17.8h, v0.h[0] mls v6.8h, v18.8h, v0.h[0] sqdmulh v17.8h, v7.8h, v0.h[2] sqdmulh v18.8h, v8.8h, v0.h[2] sshr v17.8h, v17.8h, #11 sshr v18.8h, v18.8h, #11 mls v7.8h, v17.8h, v0.h[0] mls v8.8h, v18.8h, v0.h[0] st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 ld4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 ld4 {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #0x40 ld4 {v13.8h, v14.8h, v15.8h, v16.8h}, [x1], #0x40 sub x0, x0, #0x80 sub v1.8h, v9.8h, v1.8h sub v2.8h, v10.8h, v2.8h sub v3.8h, v11.8h, v3.8h sub v4.8h, v12.8h, v4.8h sub v5.8h, v13.8h, v5.8h sub v6.8h, v14.8h, v6.8h sub v7.8h, v15.8h, v7.8h sub v8.8h, v16.8h, v8.8h sqdmulh v17.8h, v1.8h, v0.h[2] sqdmulh v18.8h, v2.8h, v0.h[2] sshr v17.8h, v17.8h, #11 sshr v18.8h, v18.8h, #11 mls v1.8h, v17.8h, v0.h[0] mls v2.8h, v18.8h, v0.h[0] sqdmulh v17.8h, v3.8h, v0.h[2] sqdmulh v18.8h, v4.8h, v0.h[2] sshr v17.8h, v17.8h, #11 sshr v18.8h, v18.8h, #11 mls v3.8h, v17.8h, v0.h[0] mls v4.8h, v18.8h, v0.h[0] sqdmulh v17.8h, v5.8h, v0.h[2] sqdmulh v18.8h, v6.8h, v0.h[2] sshr v17.8h, v17.8h, #11 sshr v18.8h, v18.8h, #11 mls v5.8h, v17.8h, v0.h[0] mls v6.8h, v18.8h, v0.h[0] sqdmulh v17.8h, v7.8h, v0.h[2] sqdmulh v18.8h, v8.8h, v0.h[2] sshr v17.8h, v17.8h, #11 sshr v18.8h, v18.8h, #11 mls v7.8h, v17.8h, v0.h[0] mls v8.8h, v18.8h, v0.h[0] st4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40 st4 {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40 ldp d8, d9, [x29, #16] ldp d10, d11, [x29, #32] ldp d12, d13, [x29, #48] ldp d14, d15, [x29, #64] ldp x29, x30, [sp], #0x50 ret #ifndef __APPLE__ .size mlkem_rsub_reduce,.-mlkem_rsub_reduce #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl mlkem_to_mont .type mlkem_to_mont,@function .align 2 mlkem_to_mont: #else .section __TEXT,__text .globl 
_mlkem_to_mont
        .p2align        2
_mlkem_to_mont:
#endif /* __APPLE__ */
/*
 * mlkem_to_mont
 *
 *   x0 : 512-byte buffer of 16-bit values, read and overwritten in place
 *
 * Multiplies every element by the constant in L_mlkem_aarch64_consts h[3]
 * (0x0549) with a Montgomery-style correction built from h[4] (0x5049) and
 * h[0] (0x0d01). Per lane, as expanded by MLKEM_TO_MONT_X2:
 *     t = (a * h[4])                 low 16 bits
 *     a = sqrdmulh(a, h[3])
 *     t = sqrdmulh(t, h[0])
 *     a = (a - t) >> 1               arithmetic shift
 * The buffer is processed in two passes of 0x100 bytes (128 elements).
 *
 * Clobbers x1 and v0-v18; the low halves of v8-v15 are saved and restored.
 *
 * The macro is defined here, after the entry label, because it emits no
 * code by itself; each invocation expands to the same ten instructions, in
 * the same order, that the generator wrote out for every register pair.
 */
.macro MLKEM_TO_MONT_X2 ra, rb
        mul      v17.8h, \ra, v0.h[4]
        mul      v18.8h, \rb, v0.h[4]
        sqrdmulh \ra, \ra, v0.h[3]
        sqrdmulh \rb, \rb, v0.h[3]
        sqrdmulh v17.8h, v17.8h, v0.h[0]
        sqrdmulh v18.8h, v18.8h, v0.h[0]
        sub      \ra, \ra, v17.8h
        sub      \rb, \rb, v18.8h
        sshr     \ra, \ra, #1
        sshr     \rb, \rb, #1
.endm
        stp     x29, x30, [sp, #-80]!
        add     x29, sp, #0
        stp     d8, d9, [x29, #16]
        stp     d10, d11, [x29, #32]
        stp     d12, d13, [x29, #48]
        stp     d14, d15, [x29, #64]
#ifndef __APPLE__
        adrp    x1, L_mlkem_aarch64_consts
        add     x1, x1, :lo12:L_mlkem_aarch64_consts
#else
        adrp    x1, L_mlkem_aarch64_consts@PAGE
        add     x1, x1, L_mlkem_aarch64_consts@PAGEOFF
#endif /* __APPLE__ */
        ldr     q0, [x1]
        /* Two identical passes, each covering 128 elements (0x100 bytes). */
        .rept   2
        ld4     {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40
        ld4     {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40
        ld4     {v9.8h, v10.8h, v11.8h, v12.8h}, [x0], #0x40
        ld4     {v13.8h, v14.8h, v15.8h, v16.8h}, [x0], #0x40
        /* Rewind x0 so the stores below overwrite what was just loaded. */
        sub     x0, x0, #0x100
        MLKEM_TO_MONT_X2 v1.8h, v2.8h
        MLKEM_TO_MONT_X2 v3.8h, v4.8h
        MLKEM_TO_MONT_X2 v5.8h, v6.8h
        MLKEM_TO_MONT_X2 v7.8h, v8.8h
        MLKEM_TO_MONT_X2 v9.8h, v10.8h
        MLKEM_TO_MONT_X2 v11.8h, v12.8h
        MLKEM_TO_MONT_X2 v13.8h, v14.8h
        MLKEM_TO_MONT_X2 v15.8h, v16.8h
        st4     {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40
        st4     {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40
        st4     {v9.8h, v10.8h, v11.8h, v12.8h}, [x0], #0x40
        st4     {v13.8h, v14.8h, v15.8h, v16.8h}, [x0], #0x40
        .endr
        ldp     d8, d9, [x29, #16]
        ldp     d10, d11, [x29, #32]
        ldp     d12, d13, [x29, #48]
        ldp     d14, d15, [x29, #64]
        ldp     x29, x30, [sp], #0x50
        ret
#ifndef __APPLE__
        .size   mlkem_to_mont,.-mlkem_to_mont
#endif /* __APPLE__ */
#ifndef WOLFSSL_AARCH64_NO_SQRDMLSH
#ifndef __APPLE__
.text
        .globl  mlkem_to_mont_sqrdmlsh
        .type   mlkem_to_mont_sqrdmlsh,@function
        .align  2
mlkem_to_mont_sqrdmlsh:
#else
        .section        __TEXT,__text
        .globl  _mlkem_to_mont_sqrdmlsh
        .p2align        2
_mlkem_to_mont_sqrdmlsh:
#endif /* __APPLE__ */
/*
 * mlkem_to_mont_sqrdmlsh
 *
 *   x0 : 512-byte buffer of 16-bit values, read and overwritten in place
 *
 * Variant of the to-Montgomery conversion that folds the second high-half
 * multiply and the subtraction into a single sqrdmlsh. The routine is only
 * assembled when WOLFSSL_AARCH64_NO_SQRDMLSH is not defined. Per lane, with
 * q0 = L_mlkem_aarch64_consts:
 *     t = (a * h[4])                 low 16 bits, h[4] = 0x5049
 *     a = sqrdmulh(a, h[3])          h[3] = 0x0549
 *     a = sqrdmlsh(a, t, h[0])       h[0] = 0x0d01
 *     a = a >> 1                     arithmetic shift
 * The buffer is processed in two passes of 0x100 bytes (128 elements).
 *
 * Clobbers x1 and v0-v18; the low halves of v8-v15 are saved and restored.
 *
 * The macro below emits nothing until invoked; each invocation expands to
 * the same eight instructions, in the same order, as the generated code.
 */
.macro MLKEM_TO_MONT_RDM_X2 ra, rb
        mul      v17.8h, \ra, v0.h[4]
        mul      v18.8h, \rb, v0.h[4]
        sqrdmulh \ra, \ra, v0.h[3]
        sqrdmulh \rb, \rb, v0.h[3]
        sqrdmlsh \ra, v17.8h, v0.h[0]
        sqrdmlsh \rb, v18.8h, v0.h[0]
        sshr     \ra, \ra, #1
        sshr     \rb, \rb, #1
.endm
        stp     x29, x30, [sp, #-80]!
        add     x29, sp, #0
        stp     d8, d9, [x29, #16]
        stp     d10, d11, [x29, #32]
        stp     d12, d13, [x29, #48]
        stp     d14, d15, [x29, #64]
#ifndef __APPLE__
        adrp    x1, L_mlkem_aarch64_consts
        add     x1, x1, :lo12:L_mlkem_aarch64_consts
#else
        adrp    x1, L_mlkem_aarch64_consts@PAGE
        add     x1, x1, L_mlkem_aarch64_consts@PAGEOFF
#endif /* __APPLE__ */
        ldr     q0, [x1]
        /* Two identical passes, each covering 128 elements (0x100 bytes). */
        .rept   2
        ld4     {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40
        ld4     {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40
        ld4     {v9.8h, v10.8h, v11.8h, v12.8h}, [x0], #0x40
        ld4     {v13.8h, v14.8h, v15.8h, v16.8h}, [x0], #0x40
        /* Rewind x0 so the stores below overwrite what was just loaded. */
        sub     x0, x0, #0x100
        MLKEM_TO_MONT_RDM_X2 v1.8h, v2.8h
        MLKEM_TO_MONT_RDM_X2 v3.8h, v4.8h
        MLKEM_TO_MONT_RDM_X2 v5.8h, v6.8h
        MLKEM_TO_MONT_RDM_X2 v7.8h, v8.8h
        MLKEM_TO_MONT_RDM_X2 v9.8h, v10.8h
        MLKEM_TO_MONT_RDM_X2 v11.8h, v12.8h
        MLKEM_TO_MONT_RDM_X2 v13.8h, v14.8h
        MLKEM_TO_MONT_RDM_X2 v15.8h, v16.8h
        st4     {v1.8h, v2.8h, v3.8h, v4.8h}, [x0], #0x40
        st4     {v5.8h, v6.8h, v7.8h, v8.8h}, [x0], #0x40
        st4     {v9.8h, v10.8h, v11.8h, v12.8h}, [x0], #0x40
        st4     {v13.8h, v14.8h, v15.8h, v16.8h}, [x0], #0x40
        .endr
        ldp     d8, d9, [x29, #16]
        ldp     d10, d11, [x29, #32]
        ldp     d12, d13, [x29, #48]
        ldp     d14, d15, [x29, #64]
        ldp     x29, x30, [sp], #0x50
        ret
#ifndef __APPLE__
        .size   mlkem_to_mont_sqrdmlsh,.-mlkem_to_mont_sqrdmlsh
#endif /* __APPLE__ */
#endif /* WOLFSSL_AARCH64_NO_SQRDMLSH */
/* Lower comparison bound used by mlkem_to_msg_neon, replicated to 8 lanes. */
#ifndef __APPLE__
        .text
        .section        .rodata
        .type   L_mlkem_to_msg_low, %object
        .size   L_mlkem_to_msg_low, 16
#else
        .section        __DATA,__data
#endif /* __APPLE__ */
# 8-byte aligned, 64-bit aligned
#ifndef __APPLE__
        .align  3
#else
        .p2align        3
#endif /* __APPLE__ */
L_mlkem_to_msg_low:
        .short  0x0373,0x0373,0x0373,0x0373,0x0373,0x0373,0x0373,0x0373
#ifndef __APPLE__
        .text
        .section        .rodata
        .type   L_mlkem_to_msg_high, %object
        .size   L_mlkem_to_msg_high, 16
#else
        .section        __DATA,__data
#endif /* __APPLE__ */
# 8-byte aligned, 64-bit aligned
#ifndef __APPLE__
        .align  3
#else
        .p2align        3
#endif /* __APPLE__ */
L_mlkem_to_msg_high:
        .short
0x09c0,0x09c0,0x09c0,0x09c0,0x09c0,0x09c0,0x09c0,0x09c0
/* One distinct bit per 16-bit lane: lane i carries the value 1 << i. */
#ifndef __APPLE__
        .text
        .section        .rodata
        .type   L_mlkem_to_msg_bits, %object
        .size   L_mlkem_to_msg_bits, 16
#else
        .section        __DATA,__data
#endif /* __APPLE__ */
# 8-byte aligned, 64-bit aligned
#ifndef __APPLE__
        .align  3
#else
        .p2align        3
#endif /* __APPLE__ */
L_mlkem_to_msg_bits:
        .short  0x0001,0x0002,0x0004,0x0008,0x0010,0x0020,0x0040,0x0080
/*
 * mlkem_to_msg_neon
 *
 *   x0 : output, 32 bytes written (8 bytes per pass, 4 passes)
 *   x1 : input, 512 bytes of 16-bit values read (0x80 bytes per pass)
 *
 * Packs one bit per input element: bit (i mod 8) of output byte (i / 8) is
 * set when element i satisfies  low <= element <= high  as a signed 16-bit
 * comparison, with low = L_mlkem_to_msg_low (0x0373) and
 * high = L_mlkem_to_msg_high (0x09c0).
 *
 * Register roles inside a pass:
 *   v0  = low bound, v1 = high bound, v26 = per-lane bit values
 *   v2-v9   = eight input vectors (8 elements each)
 *   v10-v17 = (element >= low) masks
 *   v18-v25 = (high >= element) masks, then the combined/packed result
 *
 * Clobbers x2-x4 and v0-v26; the low halves of v8-v15 are saved/restored.
 */
#ifndef __APPLE__
        .text
        .globl  mlkem_to_msg_neon
        .type   mlkem_to_msg_neon,@function
        .align  2
mlkem_to_msg_neon:
#else
        .section        __TEXT,__text
        .globl  _mlkem_to_msg_neon
        .p2align        2
_mlkem_to_msg_neon:
#endif /* __APPLE__ */
        stp     x29, x30, [sp, #-80]!
        add     x29, sp, #0
        stp     d8, d9, [x29, #16]
        stp     d10, d11, [x29, #32]
        stp     d12, d13, [x29, #48]
        stp     d14, d15, [x29, #64]
#ifndef __APPLE__
        adrp    x2, L_mlkem_to_msg_low
        add     x2, x2, :lo12:L_mlkem_to_msg_low
#else
        adrp    x2, L_mlkem_to_msg_low@PAGE
        add     x2, x2, L_mlkem_to_msg_low@PAGEOFF
#endif /* __APPLE__ */
#ifndef __APPLE__
        adrp    x3, L_mlkem_to_msg_high
        add     x3, x3, :lo12:L_mlkem_to_msg_high
#else
        adrp    x3, L_mlkem_to_msg_high@PAGE
        add     x3, x3, L_mlkem_to_msg_high@PAGEOFF
#endif /* __APPLE__ */
#ifndef __APPLE__
        adrp    x4, L_mlkem_to_msg_bits
        add     x4, x4, :lo12:L_mlkem_to_msg_bits
#else
        adrp    x4, L_mlkem_to_msg_bits@PAGE
        add     x4, x4, L_mlkem_to_msg_bits@PAGEOFF
#endif /* __APPLE__ */
        ldr     q0, [x2]
        ldr     q1, [x3]
        ldr     q26, [x4]
        /* Four identical passes: 64 elements in, 8 bytes out. */
        .rept   4
        ld1     {v2.8h, v3.8h, v4.8h, v5.8h}, [x1], #0x40
        ld1     {v6.8h, v7.8h, v8.8h, v9.8h}, [x1], #0x40
        /* Range test, both halves per input vector. */
        cmge    v10.8h, v2.8h, v0.8h
        cmge    v18.8h, v1.8h, v2.8h
        cmge    v11.8h, v3.8h, v0.8h
        cmge    v19.8h, v1.8h, v3.8h
        cmge    v12.8h, v4.8h, v0.8h
        cmge    v20.8h, v1.8h, v4.8h
        cmge    v13.8h, v5.8h, v0.8h
        cmge    v21.8h, v1.8h, v5.8h
        cmge    v14.8h, v6.8h, v0.8h
        cmge    v22.8h, v1.8h, v6.8h
        cmge    v15.8h, v7.8h, v0.8h
        cmge    v23.8h, v1.8h, v7.8h
        cmge    v16.8h, v8.8h, v0.8h
        cmge    v24.8h, v1.8h, v8.8h
        cmge    v17.8h, v9.8h, v0.8h
        cmge    v25.8h, v1.8h, v9.8h
        /* In range = (>= low) AND (<= high). */
        and     v18.16b, v18.16b, v10.16b
        and     v19.16b, v19.16b, v11.16b
        and     v20.16b, v20.16b, v12.16b
        and     v21.16b, v21.16b, v13.16b
        and     v22.16b, v22.16b, v14.16b
        and     v23.16b, v23.16b, v15.16b
        and     v24.16b, v24.16b, v16.16b
        and     v25.16b, v25.16b, v17.16b
        /* Keep only each lane's own bit value. */
        and     v18.16b, v18.16b, v26.16b
        and     v19.16b, v19.16b, v26.16b
        and     v20.16b, v20.16b, v26.16b
        and     v21.16b, v21.16b, v26.16b
        and     v22.16b, v22.16b, v26.16b
        and     v23.16b, v23.16b, v26.16b
        and     v24.16b, v24.16b, v26.16b
        and     v25.16b, v25.16b, v26.16b
        /* Lane bits are disjoint, so the horizontal sum is their OR. */
        addv    h18, v18.8h
        addv    h19, v19.8h
        addv    h20, v20.8h
        addv    h21, v21.8h
        addv    h22, v22.8h
        addv    h23, v23.8h
        addv    h24, v24.8h
        addv    h25, v25.8h
        /* Gather the eight result bytes into the low half of v18. */
        ins     v18.b[1], v19.b[0]
        ins     v18.b[2], v20.b[0]
        ins     v18.b[3], v21.b[0]
        ins     v18.b[4], v22.b[0]
        ins     v18.b[5], v23.b[0]
        ins     v18.b[6], v24.b[0]
        ins     v18.b[7], v25.b[0]
        st1     {v18.8b}, [x0], #8
        .endr
        ldp     d8, d9, [x29, #16]
        ldp     d10, d11, [x29, #32]
        ldp     d12, d13, [x29, #48]
        ldp     d14, d15, [x29, #64]
        ldp     x29, x30, [sp], #0x50
        ret
#ifndef __APPLE__
        .size   mlkem_to_msg_neon,.-mlkem_to_msg_neon
#endif /* __APPLE__ */
/* Value written for a set message bit by mlkem_from_msg_neon (8 lanes). */
#ifndef __APPLE__
        .text
        .section        .rodata
        .type   L_mlkem_from_msg_q1half, %object
        .size   L_mlkem_from_msg_q1half, 16
#else
        .section        __DATA,__data
#endif /* __APPLE__ */
# 8-byte aligned, 64-bit aligned
#ifndef __APPLE__
        .align  3
#else
        .p2align        3
#endif /* __APPLE__ */
L_mlkem_from_msg_q1half:
        .short  0x0681,0x0681,0x0681,0x0681,0x0681,0x0681,0x0681,0x0681
/* Per-byte single-bit masks, 0x01..0x80, repeated twice. */
#ifndef __APPLE__
        .text
        .section        .rodata
        .type   L_mlkem_from_msg_bits, %object
        .size   L_mlkem_from_msg_bits, 16
#else
        .section        __DATA,__data
#endif /* __APPLE__ */
# 8-byte aligned, 64-bit aligned
#ifndef __APPLE__
        .align  3
#else
        .p2align        3
#endif /* __APPLE__ */
L_mlkem_from_msg_bits:
        .byte   0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80
        .byte   0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80
#ifndef __APPLE__
        .text
        .globl  mlkem_from_msg_neon
        .type   mlkem_from_msg_neon,@function
        .align  2
mlkem_from_msg_neon:
#else
        .section        __TEXT,__text
        .globl  _mlkem_from_msg_neon
        .p2align        2
_mlkem_from_msg_neon:
#endif /* __APPLE__ */
        stp     x29, x30, [sp, #-48]!
/*
 * mlkem_from_msg_neon - body (entry label and frame allocation precede
 * this point).
 *
 *   x0 : output, 512 bytes written as 16-bit values
 *   x1 : input, 32 bytes read
 *
 * Expands every input bit into one 16-bit output element: the element is
 * L_mlkem_from_msg_q1half (0x0681) when the bit is set and 0 otherwise.
 * Bit j of input byte k becomes output element 8*k + j.
 *
 * Register roles:
 *   v0 = single-bit byte masks (L_mlkem_from_msg_bits)
 *   v1 = 0x0681 in every 16-bit lane
 *   v2, v3 = the 32 input bytes
 *   v4-v7  = working vectors for four input bytes at a time
 *
 * Only x2, x3 and v0-v7 are written, so none of the callee-saved SIMD
 * registers (v8-v15) is touched. The generated code nevertheless spilled
 * and reloaded d8-d11; those four memory operations were dead and have been
 * dropped. The frame stays 48 bytes because it is allocated by the
 * pre-indexed stp at function entry, and the matching post-indexed ldp
 * below must release exactly the same amount.
 */
        add     x29, sp, #0
#ifndef __APPLE__
        adrp    x2, L_mlkem_from_msg_q1half
        add     x2, x2, :lo12:L_mlkem_from_msg_q1half
#else
        adrp    x2, L_mlkem_from_msg_q1half@PAGE
        add     x2, x2, L_mlkem_from_msg_q1half@PAGEOFF
#endif /* __APPLE__ */
#ifndef __APPLE__
        adrp    x3, L_mlkem_from_msg_bits
        add     x3, x3, :lo12:L_mlkem_from_msg_bits
#else
        adrp    x3, L_mlkem_from_msg_bits@PAGE
        add     x3, x3, L_mlkem_from_msg_bits@PAGEOFF
#endif /* __APPLE__ */
        ld1     {v2.16b, v3.16b}, [x1]
        ldr     q1, [x2]
        ldr     q0, [x3]
        /*
         * Each group below handles four input bytes:
         *   dup   - broadcast one byte to 8 lanes
         *   cmtst - 0xff in lane j when bit j of that byte is set
         *   zip1  - double each mask byte into a 16-bit 0xffff / 0x0000
         *   and   - select 0x0681 or 0
         */
        /* Input bytes 0-3 */
        dup     v4.8b, v2.b[0]
        dup     v5.8b, v2.b[1]
        dup     v6.8b, v2.b[2]
        dup     v7.8b, v2.b[3]
        cmtst   v4.8b, v4.8b, v0.8b
        cmtst   v5.8b, v5.8b, v0.8b
        cmtst   v6.8b, v6.8b, v0.8b
        cmtst   v7.8b, v7.8b, v0.8b
        zip1    v4.16b, v4.16b, v4.16b
        zip1    v5.16b, v5.16b, v5.16b
        zip1    v6.16b, v6.16b, v6.16b
        zip1    v7.16b, v7.16b, v7.16b
        and     v4.16b, v4.16b, v1.16b
        and     v5.16b, v5.16b, v1.16b
        and     v6.16b, v6.16b, v1.16b
        and     v7.16b, v7.16b, v1.16b
        st1     {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #0x40
        /* Input bytes 4-7 */
        dup     v4.8b, v2.b[4]
        dup     v5.8b, v2.b[5]
        dup     v6.8b, v2.b[6]
        dup     v7.8b, v2.b[7]
        cmtst   v4.8b, v4.8b, v0.8b
        cmtst   v5.8b, v5.8b, v0.8b
        cmtst   v6.8b, v6.8b, v0.8b
        cmtst   v7.8b, v7.8b, v0.8b
        zip1    v4.16b, v4.16b, v4.16b
        zip1    v5.16b, v5.16b, v5.16b
        zip1    v6.16b, v6.16b, v6.16b
        zip1    v7.16b, v7.16b, v7.16b
        and     v4.16b, v4.16b, v1.16b
        and     v5.16b, v5.16b, v1.16b
        and     v6.16b, v6.16b, v1.16b
        and     v7.16b, v7.16b, v1.16b
        st1     {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #0x40
        /* Input bytes 8-11 */
        dup     v4.8b, v2.b[8]
        dup     v5.8b, v2.b[9]
        dup     v6.8b, v2.b[10]
        dup     v7.8b, v2.b[11]
        cmtst   v4.8b, v4.8b, v0.8b
        cmtst   v5.8b, v5.8b, v0.8b
        cmtst   v6.8b, v6.8b, v0.8b
        cmtst   v7.8b, v7.8b, v0.8b
        zip1    v4.16b, v4.16b, v4.16b
        zip1    v5.16b, v5.16b, v5.16b
        zip1    v6.16b, v6.16b, v6.16b
        zip1    v7.16b, v7.16b, v7.16b
        and     v4.16b, v4.16b, v1.16b
        and     v5.16b, v5.16b, v1.16b
        and     v6.16b, v6.16b, v1.16b
        and     v7.16b, v7.16b, v1.16b
        st1     {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #0x40
        /* Input bytes 12-15 */
        dup     v4.8b, v2.b[12]
        dup     v5.8b, v2.b[13]
        dup     v6.8b, v2.b[14]
        dup     v7.8b, v2.b[15]
        cmtst   v4.8b, v4.8b, v0.8b
        cmtst   v5.8b, v5.8b, v0.8b
        cmtst   v6.8b, v6.8b, v0.8b
        cmtst   v7.8b, v7.8b, v0.8b
        zip1    v4.16b, v4.16b, v4.16b
        zip1    v5.16b, v5.16b, v5.16b
        zip1    v6.16b, v6.16b, v6.16b
        zip1    v7.16b, v7.16b, v7.16b
        and     v4.16b, v4.16b, v1.16b
        and     v5.16b, v5.16b, v1.16b
        and     v6.16b, v6.16b, v1.16b
        and     v7.16b, v7.16b, v1.16b
        st1     {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #0x40
        /* Input bytes 16-19 */
        dup     v4.8b, v3.b[0]
        dup     v5.8b, v3.b[1]
        dup     v6.8b, v3.b[2]
        dup     v7.8b, v3.b[3]
        cmtst   v4.8b, v4.8b, v0.8b
        cmtst   v5.8b, v5.8b, v0.8b
        cmtst   v6.8b, v6.8b, v0.8b
        cmtst   v7.8b, v7.8b, v0.8b
        zip1    v4.16b, v4.16b, v4.16b
        zip1    v5.16b, v5.16b, v5.16b
        zip1    v6.16b, v6.16b, v6.16b
        zip1    v7.16b, v7.16b, v7.16b
        and     v4.16b, v4.16b, v1.16b
        and     v5.16b, v5.16b, v1.16b
        and     v6.16b, v6.16b, v1.16b
        and     v7.16b, v7.16b, v1.16b
        st1     {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #0x40
        /* Input bytes 20-23 */
        dup     v4.8b, v3.b[4]
        dup     v5.8b, v3.b[5]
        dup     v6.8b, v3.b[6]
        dup     v7.8b, v3.b[7]
        cmtst   v4.8b, v4.8b, v0.8b
        cmtst   v5.8b, v5.8b, v0.8b
        cmtst   v6.8b, v6.8b, v0.8b
        cmtst   v7.8b, v7.8b, v0.8b
        zip1    v4.16b, v4.16b, v4.16b
        zip1    v5.16b, v5.16b, v5.16b
        zip1    v6.16b, v6.16b, v6.16b
        zip1    v7.16b, v7.16b, v7.16b
        and     v4.16b, v4.16b, v1.16b
        and     v5.16b, v5.16b, v1.16b
        and     v6.16b, v6.16b, v1.16b
        and     v7.16b, v7.16b, v1.16b
        st1     {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #0x40
        /* Input bytes 24-27 */
        dup     v4.8b, v3.b[8]
        dup     v5.8b, v3.b[9]
        dup     v6.8b, v3.b[10]
        dup     v7.8b, v3.b[11]
        cmtst   v4.8b, v4.8b, v0.8b
        cmtst   v5.8b, v5.8b, v0.8b
        cmtst   v6.8b, v6.8b, v0.8b
        cmtst   v7.8b, v7.8b, v0.8b
        zip1    v4.16b, v4.16b, v4.16b
        zip1    v5.16b, v5.16b, v5.16b
        zip1    v6.16b, v6.16b, v6.16b
        zip1    v7.16b, v7.16b, v7.16b
        and     v4.16b, v4.16b, v1.16b
        and     v5.16b, v5.16b, v1.16b
        and     v6.16b, v6.16b, v1.16b
        and     v7.16b, v7.16b, v1.16b
        st1     {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #0x40
        /* Input bytes 28-31 */
        dup     v4.8b, v3.b[12]
        dup     v5.8b, v3.b[13]
        dup     v6.8b, v3.b[14]
        dup     v7.8b, v3.b[15]
        cmtst   v4.8b, v4.8b, v0.8b
        cmtst   v5.8b, v5.8b, v0.8b
        cmtst   v6.8b, v6.8b, v0.8b
        cmtst   v7.8b, v7.8b, v0.8b
        zip1    v4.16b, v4.16b, v4.16b
        zip1    v5.16b, v5.16b, v5.16b
        zip1    v6.16b, v6.16b, v6.16b
        zip1    v7.16b, v7.16b, v7.16b
        and     v4.16b, v4.16b, v1.16b
        and     v5.16b, v5.16b, v1.16b
        and     v6.16b, v6.16b, v1.16b
        and     v7.16b, v7.16b, v1.16b
        st1     {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #0x40
        /* Release the 48-byte frame allocated at entry. */
        ldp     x29, x30, [sp], #48
        ret
#ifndef __APPLE__
        .size   mlkem_from_msg_neon,.-mlkem_from_msg_neon
#endif /* __APPLE__ */
/*
 * Start of mlkem_cmp_neon. The routine continues below this block, so the
 * part visible here is kept instruction-for-instruction as generated.
 */
#ifndef __APPLE__
        .text
        .globl  mlkem_cmp_neon
        .type   mlkem_cmp_neon,@function
        .align  2
mlkem_cmp_neon:
#else
        .section        __TEXT,__text
        .globl  _mlkem_cmp_neon
        .p2align        2
_mlkem_cmp_neon:
#endif /* __APPLE__ */
        stp     x29, x30, [sp, #-48]!
        add     x29, sp, #0
        stp     d8, d9, [x29, #16]
        stp     d10, d11, [x29, #32]
        ld4     {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40
        ld4     {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40
        eor     v8.16b, v0.16b, v4.16b
        eor     v9.16b, v1.16b, v5.16b
        eor     v10.16b, v2.16b, v6.16b
        eor     v11.16b, v3.16b, v7.16b
        ld4     {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40
        ld4     {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40
        eor     v0.16b, v0.16b, v4.16b
        eor     v1.16b, v1.16b, v5.16b
        eor     v2.16b, v2.16b, v6.16b
        eor     v3.16b, v3.16b, v7.16b
        orr     v8.16b, v8.16b, v0.16b
        orr     v9.16b, v9.16b, v1.16b
        orr     v10.16b, v10.16b, v2.16b
        orr     v11.16b, v11.16b, v3.16b
        ld4     {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40
        ld4     {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40
        eor     v0.16b, v0.16b, v4.16b
        eor     v1.16b, v1.16b, v5.16b
        eor     v2.16b, v2.16b, v6.16b
        eor     v3.16b, v3.16b, v7.16b
        orr     v8.16b, v8.16b, v0.16b
        orr     v9.16b, v9.16b, v1.16b
        orr     v10.16b, v10.16b, v2.16b
        orr     v11.16b, v11.16b, v3.16b
        ld4     {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40
        ld4     {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40
        eor     v0.16b, v0.16b, v4.16b
        eor     v1.16b, v1.16b, v5.16b
        eor     v2.16b, v2.16b, v6.16b
        eor     v3.16b, v3.16b, v7.16b
        orr     v8.16b, v8.16b, v0.16b
        orr     v9.16b, v9.16b, v1.16b
        orr     v10.16b, v10.16b, v2.16b
        orr     v11.16b, v11.16b, v3.16b
        ld4     {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40
        ld4     {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40
        eor     v0.16b, v0.16b, v4.16b
        eor     v1.16b, v1.16b, v5.16b
        eor     v2.16b, v2.16b, v6.16b
        eor     v3.16b, v3.16b, v7.16b
        orr     v8.16b, v8.16b, v0.16b
        orr     v9.16b, v9.16b, v1.16b
orr v10.16b, v10.16b, v2.16b orr v11.16b, v11.16b, v3.16b ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 eor v0.16b, v0.16b, v4.16b eor v1.16b, v1.16b, v5.16b eor v2.16b, v2.16b, v6.16b eor v3.16b, v3.16b, v7.16b orr v8.16b, v8.16b, v0.16b orr v9.16b, v9.16b, v1.16b orr v10.16b, v10.16b, v2.16b orr v11.16b, v11.16b, v3.16b ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 eor v0.16b, v0.16b, v4.16b eor v1.16b, v1.16b, v5.16b eor v2.16b, v2.16b, v6.16b eor v3.16b, v3.16b, v7.16b orr v8.16b, v8.16b, v0.16b orr v9.16b, v9.16b, v1.16b orr v10.16b, v10.16b, v2.16b orr v11.16b, v11.16b, v3.16b ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 eor v0.16b, v0.16b, v4.16b eor v1.16b, v1.16b, v5.16b eor v2.16b, v2.16b, v6.16b eor v3.16b, v3.16b, v7.16b orr v8.16b, v8.16b, v0.16b orr v9.16b, v9.16b, v1.16b orr v10.16b, v10.16b, v2.16b orr v11.16b, v11.16b, v3.16b ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 eor v0.16b, v0.16b, v4.16b eor v1.16b, v1.16b, v5.16b eor v2.16b, v2.16b, v6.16b eor v3.16b, v3.16b, v7.16b orr v8.16b, v8.16b, v0.16b orr v9.16b, v9.16b, v1.16b orr v10.16b, v10.16b, v2.16b orr v11.16b, v11.16b, v3.16b ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 eor v0.16b, v0.16b, v4.16b eor v1.16b, v1.16b, v5.16b eor v2.16b, v2.16b, v6.16b eor v3.16b, v3.16b, v7.16b orr v8.16b, v8.16b, v0.16b orr v9.16b, v9.16b, v1.16b orr v10.16b, v10.16b, v2.16b orr v11.16b, v11.16b, v3.16b ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 eor v0.16b, v0.16b, v4.16b eor v1.16b, v1.16b, v5.16b eor v2.16b, v2.16b, v6.16b eor v3.16b, v3.16b, v7.16b orr v8.16b, v8.16b, v0.16b orr v9.16b, v9.16b, v1.16b orr v10.16b, v10.16b, v2.16b orr v11.16b, v11.16b, v3.16b ld4 {v0.16b, v1.16b, 
v2.16b, v3.16b}, [x0], #0x40 ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 eor v0.16b, v0.16b, v4.16b eor v1.16b, v1.16b, v5.16b eor v2.16b, v2.16b, v6.16b eor v3.16b, v3.16b, v7.16b orr v8.16b, v8.16b, v0.16b orr v9.16b, v9.16b, v1.16b orr v10.16b, v10.16b, v2.16b orr v11.16b, v11.16b, v3.16b subs w2, w2, #0x300 beq L_mlkem_aarch64_cmp_neon_done ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 eor v0.16b, v0.16b, v4.16b eor v1.16b, v1.16b, v5.16b eor v2.16b, v2.16b, v6.16b eor v3.16b, v3.16b, v7.16b orr v8.16b, v8.16b, v0.16b orr v9.16b, v9.16b, v1.16b orr v10.16b, v10.16b, v2.16b orr v11.16b, v11.16b, v3.16b ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 eor v0.16b, v0.16b, v4.16b eor v1.16b, v1.16b, v5.16b eor v2.16b, v2.16b, v6.16b eor v3.16b, v3.16b, v7.16b orr v8.16b, v8.16b, v0.16b orr v9.16b, v9.16b, v1.16b orr v10.16b, v10.16b, v2.16b orr v11.16b, v11.16b, v3.16b ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 eor v0.16b, v0.16b, v4.16b eor v1.16b, v1.16b, v5.16b eor v2.16b, v2.16b, v6.16b eor v3.16b, v3.16b, v7.16b orr v8.16b, v8.16b, v0.16b orr v9.16b, v9.16b, v1.16b orr v10.16b, v10.16b, v2.16b orr v11.16b, v11.16b, v3.16b ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 eor v0.16b, v0.16b, v4.16b eor v1.16b, v1.16b, v5.16b eor v2.16b, v2.16b, v6.16b eor v3.16b, v3.16b, v7.16b orr v8.16b, v8.16b, v0.16b orr v9.16b, v9.16b, v1.16b orr v10.16b, v10.16b, v2.16b orr v11.16b, v11.16b, v3.16b ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 eor v0.16b, v0.16b, v4.16b eor v1.16b, v1.16b, v5.16b eor v2.16b, v2.16b, v6.16b eor v3.16b, v3.16b, v7.16b orr v8.16b, v8.16b, v0.16b orr v9.16b, v9.16b, v1.16b orr v10.16b, v10.16b, v2.16b orr v11.16b, v11.16b, v3.16b subs w2, w2, #0x140 beq 
L_mlkem_aarch64_cmp_neon_done ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 eor v0.16b, v0.16b, v4.16b eor v1.16b, v1.16b, v5.16b eor v2.16b, v2.16b, v6.16b eor v3.16b, v3.16b, v7.16b orr v8.16b, v8.16b, v0.16b orr v9.16b, v9.16b, v1.16b orr v10.16b, v10.16b, v2.16b orr v11.16b, v11.16b, v3.16b ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 eor v0.16b, v0.16b, v4.16b eor v1.16b, v1.16b, v5.16b eor v2.16b, v2.16b, v6.16b eor v3.16b, v3.16b, v7.16b orr v8.16b, v8.16b, v0.16b orr v9.16b, v9.16b, v1.16b orr v10.16b, v10.16b, v2.16b orr v11.16b, v11.16b, v3.16b ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 eor v0.16b, v0.16b, v4.16b eor v1.16b, v1.16b, v5.16b eor v2.16b, v2.16b, v6.16b eor v3.16b, v3.16b, v7.16b orr v8.16b, v8.16b, v0.16b orr v9.16b, v9.16b, v1.16b orr v10.16b, v10.16b, v2.16b orr v11.16b, v11.16b, v3.16b ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 eor v0.16b, v0.16b, v4.16b eor v1.16b, v1.16b, v5.16b eor v2.16b, v2.16b, v6.16b eor v3.16b, v3.16b, v7.16b orr v8.16b, v8.16b, v0.16b orr v9.16b, v9.16b, v1.16b orr v10.16b, v10.16b, v2.16b orr v11.16b, v11.16b, v3.16b ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 eor v0.16b, v0.16b, v4.16b eor v1.16b, v1.16b, v5.16b eor v2.16b, v2.16b, v6.16b eor v3.16b, v3.16b, v7.16b orr v8.16b, v8.16b, v0.16b orr v9.16b, v9.16b, v1.16b orr v10.16b, v10.16b, v2.16b orr v11.16b, v11.16b, v3.16b ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 eor v0.16b, v0.16b, v4.16b eor v1.16b, v1.16b, v5.16b eor v2.16b, v2.16b, v6.16b eor v3.16b, v3.16b, v7.16b orr v8.16b, v8.16b, v0.16b orr v9.16b, v9.16b, v1.16b orr v10.16b, v10.16b, v2.16b orr v11.16b, v11.16b, v3.16b ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #0x40 
ld4 {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #0x40 eor v0.16b, v0.16b, v4.16b eor v1.16b, v1.16b, v5.16b eor v2.16b, v2.16b, v6.16b eor v3.16b, v3.16b, v7.16b orr v8.16b, v8.16b, v0.16b orr v9.16b, v9.16b, v1.16b orr v10.16b, v10.16b, v2.16b orr v11.16b, v11.16b, v3.16b ld2 {v0.16b, v1.16b}, [x0] ld2 {v4.16b, v5.16b}, [x1] eor v0.16b, v0.16b, v4.16b eor v1.16b, v1.16b, v5.16b orr v8.16b, v8.16b, v0.16b orr v9.16b, v9.16b, v1.16b L_mlkem_aarch64_cmp_neon_done: orr v8.16b, v8.16b, v9.16b orr v10.16b, v10.16b, v11.16b orr v8.16b, v8.16b, v10.16b ext v9.16b, v8.16b, v8.16b, #8 orr v8.16b, v8.16b, v9.16b mov x0, v8.d[0] subs x0, x0, xzr csetm w0, ne ldp d8, d9, [x29, #16] ldp d10, d11, [x29, #32] ldp x29, x30, [sp], #48 ret #ifndef __APPLE__ .size mlkem_cmp_neon,.-mlkem_cmp_neon #endif /* __APPLE__ */ #ifndef __APPLE__ .text .section .rodata .type L_mlkem_rej_uniform_mask, %object .size L_mlkem_rej_uniform_mask, 16 #else .section __DATA,__data #endif /* __APPLE__ */ # 8-byte aligned, 64-bit aligned #ifndef __APPLE__ .align 3 #else .p2align 3 #endif /* __APPLE__ */ L_mlkem_rej_uniform_mask: .short 0x0fff,0x0fff,0x0fff,0x0fff,0x0fff,0x0fff,0x0fff,0x0fff #ifndef __APPLE__ .text .section .rodata .type L_mlkem_rej_uniform_bits, %object .size L_mlkem_rej_uniform_bits, 16 #else .section __DATA,__data #endif /* __APPLE__ */ # 8-byte aligned, 64-bit aligned #ifndef __APPLE__ .align 3 #else .p2align 3 #endif /* __APPLE__ */ L_mlkem_rej_uniform_bits: .short 0x0001,0x0002,0x0004,0x0008,0x0010,0x0020,0x0040,0x0080 #ifndef __APPLE__ .text .section .rodata .type L_mlkem_rej_uniform_indices, %object .size L_mlkem_rej_uniform_indices, 4096 #else .section __DATA,__data #endif /* __APPLE__ */ # 8-byte aligned, 64-bit aligned #ifndef __APPLE__ .align 3 #else .p2align 3 #endif /* __APPLE__ */ L_mlkem_rej_uniform_indices: .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0xff,0xff,0xff,0xff,0xff,0xff .byte 
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0xff,0xff,0xff,0xff,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0xff,0xff,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x04,0x05,0xff,0xff,0xff,0xff,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x04,0x05,0xff,0xff,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x04,0x05,0xff,0xff,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x04,0x05,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x06,0x07,0xff,0xff,0xff,0xff,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x06,0x07,0xff,0xff,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x06,0x07,0xff,0xff,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x06,0x07,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x04,0x05,0x06,0x07,0xff,0xff,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x04,0x05,0x06,0x07,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x04,0x05,0x06,0x07,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x08,0x09,0xff,0xff,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x08,0x09,0xff,0xff,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x08,0x09,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x04,0x05,0x08,0x09,0xff,0xff,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x04,0x05,0x08,0x09,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x04,0x05,0x08,0x09,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 
0x00,0x01,0x02,0x03,0x04,0x05,0x08,0x09 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x06,0x07,0x08,0x09,0xff,0xff,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x06,0x07,0x08,0x09,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x06,0x07,0x08,0x09,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x06,0x07,0x08,0x09 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x04,0x05,0x06,0x07,0x08,0x09,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x04,0x05,0x06,0x07,0x08,0x09 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07 .byte 0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x0a,0x0b,0xff,0xff,0xff,0xff,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x0a,0x0b,0xff,0xff,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x0a,0x0b,0xff,0xff,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x0a,0x0b,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x04,0x05,0x0a,0x0b,0xff,0xff,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x04,0x05,0x0a,0x0b,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x04,0x05,0x0a,0x0b,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x0a,0x0b .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x06,0x07,0x0a,0x0b,0xff,0xff,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x06,0x07,0x0a,0x0b,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x06,0x07,0x0a,0x0b,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x06,0x07,0x0a,0x0b .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x04,0x05,0x06,0x07,0x0a,0x0b,0xff,0xff .byte 
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x04,0x05,0x06,0x07,0x0a,0x0b .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x04,0x05,0x06,0x07,0x0a,0x0b .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07 .byte 0x0a,0x0b,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x08,0x09,0x0a,0x0b,0xff,0xff,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x08,0x09,0x0a,0x0b,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x08,0x09,0x0a,0x0b,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x08,0x09,0x0a,0x0b .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x04,0x05,0x08,0x09,0x0a,0x0b,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x04,0x05,0x08,0x09,0x0a,0x0b .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x04,0x05,0x08,0x09,0x0a,0x0b .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x08,0x09 .byte 0x0a,0x0b,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x06,0x07,0x08,0x09,0x0a,0x0b,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x06,0x07,0x08,0x09,0x0a,0x0b .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x06,0x07,0x08,0x09,0x0a,0x0b .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x06,0x07,0x08,0x09 .byte 0x0a,0x0b,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x04,0x05,0x06,0x07,0x08,0x09 .byte 0x0a,0x0b,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09 .byte 0x0a,0x0b,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07 .byte 0x08,0x09,0x0a,0x0b,0xff,0xff,0xff,0xff .byte 0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x0c,0x0d,0xff,0xff,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 
0x02,0x03,0x0c,0x0d,0xff,0xff,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x0c,0x0d,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x04,0x05,0x0c,0x0d,0xff,0xff,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x04,0x05,0x0c,0x0d,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x04,0x05,0x0c,0x0d,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x0c,0x0d .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x06,0x07,0x0c,0x0d,0xff,0xff,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x06,0x07,0x0c,0x0d,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x06,0x07,0x0c,0x0d,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x06,0x07,0x0c,0x0d .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x04,0x05,0x06,0x07,0x0c,0x0d,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x04,0x05,0x06,0x07,0x0c,0x0d .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x04,0x05,0x06,0x07,0x0c,0x0d .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07 .byte 0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x08,0x09,0x0c,0x0d,0xff,0xff,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x08,0x09,0x0c,0x0d,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x08,0x09,0x0c,0x0d,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x08,0x09,0x0c,0x0d .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x04,0x05,0x08,0x09,0x0c,0x0d,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x04,0x05,0x08,0x09,0x0c,0x0d .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x04,0x05,0x08,0x09,0x0c,0x0d .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x08,0x09 .byte 
0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x06,0x07,0x08,0x09,0x0c,0x0d,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x06,0x07,0x08,0x09,0x0c,0x0d .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x06,0x07,0x08,0x09,0x0c,0x0d .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x06,0x07,0x08,0x09 .byte 0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x04,0x05,0x06,0x07,0x08,0x09,0x0c,0x0d .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x04,0x05,0x06,0x07,0x08,0x09 .byte 0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09 .byte 0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07 .byte 0x08,0x09,0x0c,0x0d,0xff,0xff,0xff,0xff .byte 0x0a,0x0b,0x0c,0x0d,0xff,0xff,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x0a,0x0b,0x0c,0x0d,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x0a,0x0b,0x0c,0x0d,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x0a,0x0b,0x0c,0x0d .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x04,0x05,0x0a,0x0b,0x0c,0x0d,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x04,0x05,0x0a,0x0b,0x0c,0x0d .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x04,0x05,0x0a,0x0b,0x0c,0x0d .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x0a,0x0b .byte 0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x06,0x07,0x0a,0x0b,0x0c,0x0d,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x06,0x07,0x0a,0x0b,0x0c,0x0d .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x06,0x07,0x0a,0x0b,0x0c,0x0d .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x06,0x07,0x0a,0x0b .byte 0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x04,0x05,0x06,0x07,0x0a,0x0b,0x0c,0x0d .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 
0x00,0x01,0x04,0x05,0x06,0x07,0x0a,0x0b .byte 0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x04,0x05,0x06,0x07,0x0a,0x0b .byte 0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07 .byte 0x0a,0x0b,0x0c,0x0d,0xff,0xff,0xff,0xff .byte 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x08,0x09,0x0a,0x0b,0x0c,0x0d .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x08,0x09,0x0a,0x0b,0x0c,0x0d .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x08,0x09,0x0a,0x0b .byte 0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x04,0x05,0x08,0x09,0x0a,0x0b,0x0c,0x0d .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x04,0x05,0x08,0x09,0x0a,0x0b .byte 0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x04,0x05,0x08,0x09,0x0a,0x0b .byte 0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x08,0x09 .byte 0x0a,0x0b,0x0c,0x0d,0xff,0xff,0xff,0xff .byte 0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x06,0x07,0x08,0x09,0x0a,0x0b .byte 0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x06,0x07,0x08,0x09,0x0a,0x0b .byte 0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x06,0x07,0x08,0x09 .byte 0x0a,0x0b,0x0c,0x0d,0xff,0xff,0xff,0xff .byte 0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b .byte 0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x04,0x05,0x06,0x07,0x08,0x09 .byte 0x0a,0x0b,0x0c,0x0d,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09 .byte 0x0a,0x0b,0x0c,0x0d,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07 .byte 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0xff,0xff .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x0e,0x0f,0xff,0xff,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x0e,0x0f,0xff,0xff,0xff,0xff .byte 
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x0e,0x0f,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x04,0x05,0x0e,0x0f,0xff,0xff,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x04,0x05,0x0e,0x0f,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x04,0x05,0x0e,0x0f,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x0e,0x0f .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x06,0x07,0x0e,0x0f,0xff,0xff,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x06,0x07,0x0e,0x0f,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x06,0x07,0x0e,0x0f,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x06,0x07,0x0e,0x0f .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x04,0x05,0x06,0x07,0x0e,0x0f,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x04,0x05,0x06,0x07,0x0e,0x0f .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x04,0x05,0x06,0x07,0x0e,0x0f .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07 .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x08,0x09,0x0e,0x0f,0xff,0xff,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x08,0x09,0x0e,0x0f,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x08,0x09,0x0e,0x0f,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x08,0x09,0x0e,0x0f .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x04,0x05,0x08,0x09,0x0e,0x0f,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x04,0x05,0x08,0x09,0x0e,0x0f .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x04,0x05,0x08,0x09,0x0e,0x0f .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x08,0x09 .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff .byte 
0x06,0x07,0x08,0x09,0x0e,0x0f,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x06,0x07,0x08,0x09,0x0e,0x0f .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x06,0x07,0x08,0x09,0x0e,0x0f .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x06,0x07,0x08,0x09 .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x04,0x05,0x06,0x07,0x08,0x09,0x0e,0x0f .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x04,0x05,0x06,0x07,0x08,0x09 .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09 .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07 .byte 0x08,0x09,0x0e,0x0f,0xff,0xff,0xff,0xff .byte 0x0a,0x0b,0x0e,0x0f,0xff,0xff,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x0a,0x0b,0x0e,0x0f,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x0a,0x0b,0x0e,0x0f,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x0a,0x0b,0x0e,0x0f .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x04,0x05,0x0a,0x0b,0x0e,0x0f,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x04,0x05,0x0a,0x0b,0x0e,0x0f .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x04,0x05,0x0a,0x0b,0x0e,0x0f .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x0a,0x0b .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x06,0x07,0x0a,0x0b,0x0e,0x0f,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x06,0x07,0x0a,0x0b,0x0e,0x0f .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x06,0x07,0x0a,0x0b,0x0e,0x0f .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x06,0x07,0x0a,0x0b .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x04,0x05,0x06,0x07,0x0a,0x0b,0x0e,0x0f .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x04,0x05,0x06,0x07,0x0a,0x0b .byte 
0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x04,0x05,0x06,0x07,0x0a,0x0b .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07 .byte 0x0a,0x0b,0x0e,0x0f,0xff,0xff,0xff,0xff .byte 0x08,0x09,0x0a,0x0b,0x0e,0x0f,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x08,0x09,0x0a,0x0b,0x0e,0x0f .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x08,0x09,0x0a,0x0b,0x0e,0x0f .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x08,0x09,0x0a,0x0b .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x04,0x05,0x08,0x09,0x0a,0x0b,0x0e,0x0f .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x04,0x05,0x08,0x09,0x0a,0x0b .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x04,0x05,0x08,0x09,0x0a,0x0b .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x08,0x09 .byte 0x0a,0x0b,0x0e,0x0f,0xff,0xff,0xff,0xff .byte 0x06,0x07,0x08,0x09,0x0a,0x0b,0x0e,0x0f .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x06,0x07,0x08,0x09,0x0a,0x0b .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x06,0x07,0x08,0x09,0x0a,0x0b .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x06,0x07,0x08,0x09 .byte 0x0a,0x0b,0x0e,0x0f,0xff,0xff,0xff,0xff .byte 0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x04,0x05,0x06,0x07,0x08,0x09 .byte 0x0a,0x0b,0x0e,0x0f,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09 .byte 0x0a,0x0b,0x0e,0x0f,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07 .byte 0x08,0x09,0x0a,0x0b,0x0e,0x0f,0xff,0xff .byte 0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x0c,0x0d,0x0e,0x0f,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x0c,0x0d,0x0e,0x0f,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 
0x00,0x01,0x02,0x03,0x0c,0x0d,0x0e,0x0f .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x04,0x05,0x0c,0x0d,0x0e,0x0f,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x04,0x05,0x0c,0x0d,0x0e,0x0f .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x04,0x05,0x0c,0x0d,0x0e,0x0f .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x0c,0x0d .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x06,0x07,0x0c,0x0d,0x0e,0x0f,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x06,0x07,0x0c,0x0d,0x0e,0x0f .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x06,0x07,0x0c,0x0d,0x0e,0x0f .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x06,0x07,0x0c,0x0d .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x04,0x05,0x06,0x07,0x0c,0x0d,0x0e,0x0f .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x04,0x05,0x06,0x07,0x0c,0x0d .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x04,0x05,0x06,0x07,0x0c,0x0d .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07 .byte 0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff .byte 0x08,0x09,0x0c,0x0d,0x0e,0x0f,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x08,0x09,0x0c,0x0d,0x0e,0x0f .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x08,0x09,0x0c,0x0d,0x0e,0x0f .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x08,0x09,0x0c,0x0d .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x04,0x05,0x08,0x09,0x0c,0x0d,0x0e,0x0f .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x04,0x05,0x08,0x09,0x0c,0x0d .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x04,0x05,0x08,0x09,0x0c,0x0d .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x08,0x09 .byte 0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff .byte 0x06,0x07,0x08,0x09,0x0c,0x0d,0x0e,0x0f .byte 
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x06,0x07,0x08,0x09,0x0c,0x0d .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x06,0x07,0x08,0x09,0x0c,0x0d .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x06,0x07,0x08,0x09 .byte 0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff .byte 0x04,0x05,0x06,0x07,0x08,0x09,0x0c,0x0d .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x04,0x05,0x06,0x07,0x08,0x09 .byte 0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09 .byte 0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07 .byte 0x08,0x09,0x0c,0x0d,0x0e,0x0f,0xff,0xff .byte 0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x0a,0x0b,0x0c,0x0d .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x04,0x05,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x04,0x05,0x0a,0x0b,0x0c,0x0d .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x04,0x05,0x0a,0x0b,0x0c,0x0d .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x0a,0x0b .byte 0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff .byte 0x06,0x07,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x06,0x07,0x0a,0x0b,0x0c,0x0d .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x02,0x03,0x06,0x07,0x0a,0x0b,0x0c,0x0d .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x02,0x03,0x06,0x07,0x0a,0x0b .byte 0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff .byte 0x04,0x05,0x06,0x07,0x0a,0x0b,0x0c,0x0d .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff .byte 0x00,0x01,0x04,0x05,0x06,0x07,0x0a,0x0b .byte 0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff .byte 
0x02,0x03,0x04,0x05,0x06,0x07,0x0a,0x0b
        .byte 0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff
        .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07
        .byte 0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff
        .byte 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
        .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
        .byte 0x00,0x01,0x08,0x09,0x0a,0x0b,0x0c,0x0d
        .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff
        .byte 0x02,0x03,0x08,0x09,0x0a,0x0b,0x0c,0x0d
        .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff
        .byte 0x00,0x01,0x02,0x03,0x08,0x09,0x0a,0x0b
        .byte 0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff
        .byte 0x04,0x05,0x08,0x09,0x0a,0x0b,0x0c,0x0d
        .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff
        .byte 0x00,0x01,0x04,0x05,0x08,0x09,0x0a,0x0b
        .byte 0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff
        .byte 0x02,0x03,0x04,0x05,0x08,0x09,0x0a,0x0b
        .byte 0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff
        .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x08,0x09
        .byte 0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff
        .byte 0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d
        .byte 0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff
        .byte 0x00,0x01,0x06,0x07,0x08,0x09,0x0a,0x0b
        .byte 0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff
        .byte 0x02,0x03,0x06,0x07,0x08,0x09,0x0a,0x0b
        .byte 0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff
        .byte 0x00,0x01,0x02,0x03,0x06,0x07,0x08,0x09
        .byte 0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff
        .byte 0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b
        .byte 0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff
        .byte 0x00,0x01,0x04,0x05,0x06,0x07,0x08,0x09
        .byte 0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff
        .byte 0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09
        .byte 0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff
        .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07
        .byte 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
/* mlkem_rej_uniform_neon
 *
 * Rejection sampling of 12-bit values from a byte stream. Every 3 input
 * bytes are split into two 12-bit candidates. The scalar paths keep a
 * candidate only when it is below 0xd01 (3329). The vector path compares
 * against the lanes loaded from L_mlkem_aarch64_q, presumably the same
 * value (the table is outside this view, confirm).
 *
 * Register use on entry, as read from the code below:
 *   x0  output pointer, written as 16-bit values
 *   w1  number of values wanted
 *   x2  input byte pointer
 *   w3  input byte count
 * Returns in x0 the number of values accepted (running count kept in x12).
 *
 * NOTE(review): every loop finishes on w3 reaching exactly zero (beq), so
 * the byte count looks like it must be a multiple of the 24 or 6 bytes
 * consumed per pass. Confirm against the callers.
 * NOTE(review): the vector path always stores 16 bytes per half and the
 * scalar paths load 8 bytes while consuming 6, so both buffers appear to
 * need slack beyond their logical length. Confirm against the callers.
 */
#ifndef __APPLE__
        .text
        .globl mlkem_rej_uniform_neon
        .type mlkem_rej_uniform_neon,@function
        .align 2
mlkem_rej_uniform_neon:
#else
        .section __TEXT,__text
        .globl _mlkem_rej_uniform_neon
        .p2align 2
_mlkem_rej_uniform_neon:
#endif /* __APPLE__ */
        stp x29, x30, [sp, #-64]!
        add x29, sp, #0
        /* v8 and v10-v13 are overwritten below, so d8-d13 are preserved. */
        stp d8, d9, [x29, #16]
        stp d10, d11, [x29, #32]
        stp d12, d13, [x29, #48]
        /* Table addresses: x4 mask, x5 modulus, x6 per-lane bits, x7 shuffle
         * indices (roles taken from the symbol names and the uses below). */
#ifndef __APPLE__
        adrp x4, L_mlkem_rej_uniform_mask
        add x4, x4, :lo12:L_mlkem_rej_uniform_mask
#else
        adrp x4, L_mlkem_rej_uniform_mask@PAGE
        add x4, x4, L_mlkem_rej_uniform_mask@PAGEOFF
#endif /* __APPLE__ */
#ifndef __APPLE__
        adrp x5, L_mlkem_aarch64_q
        add x5, x5, :lo12:L_mlkem_aarch64_q
#else
        adrp x5, L_mlkem_aarch64_q@PAGE
        add x5, x5, L_mlkem_aarch64_q@PAGEOFF
#endif /* __APPLE__ */
#ifndef __APPLE__
        adrp x6, L_mlkem_rej_uniform_bits
        add x6, x6, :lo12:L_mlkem_rej_uniform_bits
#else
        adrp x6, L_mlkem_rej_uniform_bits@PAGE
        add x6, x6, L_mlkem_rej_uniform_bits@PAGEOFF
#endif /* __APPLE__ */
#ifndef __APPLE__
        adrp x7, L_mlkem_rej_uniform_indices
        add x7, x7, :lo12:L_mlkem_rej_uniform_indices
#else
        adrp x7, L_mlkem_rej_uniform_indices@PAGE
        add x7, x7, L_mlkem_rej_uniform_indices@PAGEOFF
#endif /* __APPLE__ */
        /* v1 stays zero and is the filler when widening bytes with zip1.
         * x12 is the count of accepted values. */
        eor v1.16b, v1.16b, v1.16b
        eor v12.16b, v12.16b, v12.16b
        eor v13.16b, v13.16b, v13.16b
        eor x12, x12, x12
        eor v10.16b, v10.16b, v10.16b
        eor v11.16b, v11.16b, v11.16b
        /* x13 is the scalar acceptance bound 0xd01 (3329). */
        mov x13, #0xd01
        ldr q0, [x4]
        ldr q3, [x5]
        ldr q2, [x6]
        /* Nothing wanted: return 0. Fewer than 16 wanted: skip the vector loop. */
        subs wzr, w1, #0
        beq L_mlkem_rej_uniform_done
        subs wzr, w1, #16
        blt L_mlkem_rej_uniform_loop_4
        /* Vector loop: 24 input bytes give 16 candidates per pass. */
L_mlkem_rej_uniform_loop_16:
        /* De-interleave the 3-byte groups: v4 first, v5 second, v6 third byte. */
        ld3 {v4.8b, v5.8b, v6.8b}, [x2], #24
        /* Widen each byte to 16 bits by interleaving with the zero vector. */
        zip1 v4.16b, v4.16b, v1.16b
        zip1 v5.16b, v5.16b, v1.16b
        zip1 v6.16b, v6.16b, v1.16b
        /* First candidate: byte0 or (byte1 shifted left 8).
         * Second candidate: (byte1 shifted right 4) or (byte2 shifted left 4).
         * Both are then masked with v0 (presumably 0x0fff per lane, confirm). */
        shl v7.8h, v5.8h, #8
        ushr v8.8h, v5.8h, #4
        shl v6.8h, v6.8h, #4
        orr v4.16b, v4.16b, v7.16b
        orr v5.16b, v8.16b, v6.16b
        and v7.16b, v4.16b, v0.16b
        and v8.16b, v5.16b, v0.16b
        /* Re-interleave so v4 holds candidates 0-7 and v5 candidates 8-15. */
        zip1 v4.8h, v7.8h, v8.8h
        zip2 v5.8h, v7.8h, v8.8h
        /* Per-lane all-ones mask where the v3 lane is greater than the candidate. */
        cmgt v7.8h, v3.8h, v4.8h
        cmgt v8.8h, v3.8h, v5.8h
        /* Reduce each mask to 0 or 1 and sum the lanes: x10 and x11 are the
         * number of accepted candidates in each half. */
        ushr v12.8h, v7.8h, #15
        ushr v13.8h, v8.8h, #15
        addv h12, v12.8h
        addv h13, v13.8h
        mov x10, v12.d[0]
        mov x11, v13.d[0]
        /* Mask the v2 lanes (presumably one distinct bit per lane, confirm)
         * and sum them, giving a bitmask of the accepted lanes in w8 and w9. */
        and v10.16b, v7.16b, v2.16b
        and v11.16b, v8.16b, v2.16b
        addv h10, v10.8h
        addv h11, v11.8h
        mov w8, v10.s[0]
        mov w9, v11.s[0]
        /* Each shuffle table entry is 16 bytes, so scale the bitmask by 16. */
        lsl w8, w8, #4
        lsl w9, w9, #4
        ldr q10, [x7, x8]
        ldr q11, [x7, x9]
        /* Move the accepted halfwords to the front of the vector. */
        tbl v7.16b, {v4.16b}, v10.16b
        tbl v8.16b, {v5.16b}, v11.16b
        /* Store a full vector, then advance by the accepted count only
         * (2 bytes each), so the next store overwrites the unused tail. */
        str q7, [x0]
        add x0, x0, x10, lsl 1
        add x12, x12, x10
        str q8, [x0]
        add x0, x0, x11, lsl 1
        add x12, x12, x11
        subs w3, w3, #24
        beq L_mlkem_rej_uniform_done
        /* Stay in the vector loop only while at least 16 values are still wanted. */
        sub w10, w1, w12
        subs x10, x10, #16
        blt L_mlkem_rej_uniform_loop_4
        b L_mlkem_rej_uniform_loop_16
        /* Scalar loop: 6 input bytes give 4 candidates, used while at least
         * 4 values are still wanted. x4-x7 are reused as data registers from
         * here on, the table pointers are no longer needed. */
L_mlkem_rej_uniform_loop_4:
        subs w10, w1, w12
        beq L_mlkem_rej_uniform_done
        subs x10, x10, #4
        blt L_mlkem_rej_uniform_loop_lt_4
        /* 8 bytes are loaded but the pointer advances by the 6 consumed. */
        ldr x4, [x2], #6
        lsr x5, x4, #12
        lsr x6, x4, #24
        lsr x7, x4, #36
        and x4, x4, #0xfff
        and x5, x5, #0xfff
        and x6, x6, #0xfff
        and x7, x7, #0xfff
        /* Each candidate is stored unconditionally. The pointer (2 bytes, two
         * cinc) and the count only move when the candidate is below x13, so a
         * rejected value is overwritten by the next store. */
        strh w4, [x0]
        subs xzr, x4, x13
        cinc x0, x0, lt
        cinc x0, x0, lt
        cinc x12, x12, lt
        strh w5, [x0]
        subs xzr, x5, x13
        cinc x0, x0, lt
        cinc x0, x0, lt
        cinc x12, x12, lt
        strh w6, [x0]
        subs xzr, x6, x13
        cinc x0, x0, lt
        cinc x0, x0, lt
        cinc x12, x12, lt
        strh w7, [x0]
        subs xzr, x7, x13
        cinc x0, x0, lt
        cinc x0, x0, lt
        cinc x12, x12, lt
        subs w3, w3, #6
        beq L_mlkem_rej_uniform_done
        b L_mlkem_rej_uniform_loop_4
        /* Tail loop: fewer than 4 values wanted, so the count is compared
         * with w1 after every candidate. */
L_mlkem_rej_uniform_loop_lt_4:
        ldr x4, [x2], #6
        lsr x5, x4, #12
        lsr x6, x4, #24
        lsr x7, x4, #36
        and x4, x4, #0xfff
        and x5, x5, #0xfff
        and x6, x6, #0xfff
        and x7, x7, #0xfff
        strh w4, [x0]
        subs xzr, x4, x13
        cinc x0, x0, lt
        cinc x0, x0, lt
        cinc x12, x12, lt
        subs wzr, w1, w12
        beq L_mlkem_rej_uniform_done
        strh w5, [x0]
        subs xzr, x5, x13
        cinc x0, x0, lt
        cinc x0, x0, lt
        cinc x12, x12, lt
        subs wzr, w1, w12
        beq L_mlkem_rej_uniform_done
        strh w6, [x0]
        subs xzr, x6, x13
        cinc x0, x0, lt
        cinc x0, x0, lt
        cinc x12, x12, lt
        subs wzr, w1, w12
        beq L_mlkem_rej_uniform_done
        strh w7, [x0]
        subs xzr, x7, x13
        cinc x0, x0, lt
        cinc x0, x0, lt
        cinc x12, x12, lt
        subs wzr, w1, w12
        beq L_mlkem_rej_uniform_done
        subs w3, w3, #6
        beq L_mlkem_rej_uniform_done
        b L_mlkem_rej_uniform_loop_lt_4
L_mlkem_rej_uniform_done:
        /* Return the number of accepted values. */
        mov x0, x12
        ldp d8, d9, [x29, #16]
        ldp d10, d11, [x29, #32]
        ldp d12, d13, [x29, #48]
        ldp x29, x30, [sp], #0x40
        ret
#ifndef __APPLE__
        .size mlkem_rej_uniform_neon,.-mlkem_rej_uniform_neon
#endif /* __APPLE__ */
#ifndef __APPLE__
        .text
        .section .rodata
        .type
L_sha3_aarch64_r, %object
        .size L_sha3_aarch64_r, 192
#else
        .section __DATA,__data
#endif /* __APPLE__ */
# 16-byte aligned, 128-bit aligned
#ifndef __APPLE__
        .align 4
#else
        .p2align 4
#endif /* __APPLE__ */
/* 24 round constants of 64 bits each (192 bytes). The functions below read
 * one per round with a post-indexed ldr and xor it into word 0 of every
 * state. The values match the Keccak-f[1600] round constants of FIPS 202. */
L_sha3_aarch64_r:
        .quad 0x0000000000000001,0x0000000000008082
        .quad 0x800000000000808a,0x8000000080008000
        .quad 0x000000000000808b,0x0000000080000001
        .quad 0x8000000080008081,0x8000000000008009
        .quad 0x000000000000008a,0x0000000000000088
        .quad 0x0000000080008009,0x000000008000000a
        .quad 0x000000008000808b,0x800000000000008b
        .quad 0x8000000000008089,0x8000000000008003
        .quad 0x8000000000008002,0x8000000000000080
        .quad 0x000000000000800a,0x800000008000000a
        .quad 0x8000000080008081,0x8000000000008080
        .quad 0x0000000080000001,0x8000000080008008
#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3
/* mlkem_sha3_blocksx3_neon (build using the SHA3 instructions eor3, rax1,
 * xar and bcax)
 *
 * Runs the 24-round permutation over three 25-word (200-byte) states that
 * are stored back to back at x0, and writes all three back in place.
 *   state 0  64-bit lane 0 of v0-v24
 *   state 1  64-bit lane 1 of v0-v24
 *   state 2  x1-x17 and x19-x26 (x18 is never touched)
 * Vector instructions (states 0 and 1) are interleaved one for one with the
 * scalar instructions (state 2). The scalar temporaries x0, x27, x28 and x30
 * are reused several times per round, so the statement order matters.
 *
 * Frame of 224 bytes, offsets from x29:
 *     0  x29 and x30
 *    24  spilled scalar column parity
 *    32  spilled scalar column parity
 *    40  state pointer passed in x0
 *    48  round constant pointer, 56 rounds remaining
 *    72  x17, x19-x28 (x17 is caller-saved under AAPCS64 but saved anyway)
 *   160  d8-d15
 * x30 is a scratch register inside the loop and only holds the return
 * address again after the final ldp.
 */
#ifndef __APPLE__
        .text
        .globl mlkem_sha3_blocksx3_neon
        .type mlkem_sha3_blocksx3_neon,@function
        .align 2
mlkem_sha3_blocksx3_neon:
#else
        .section __TEXT,__text
        .globl _mlkem_sha3_blocksx3_neon
        .p2align 2
_mlkem_sha3_blocksx3_neon:
#endif /* __APPLE__ */
        stp x29, x30, [sp, #-224]!
        add x29, sp, #0
        stp x17, x19, [x29, #72]
        stp x20, x21, [x29, #88]
        stp x22, x23, [x29, #104]
        stp x24, x25, [x29, #120]
        stp x26, x27, [x29, #136]
        str x28, [x29, #152]
        stp d8, d9, [x29, #160]
        stp d10, d11, [x29, #176]
        stp d12, d13, [x29, #192]
        stp d14, d15, [x29, #208]
        /* x27 walks the round constant table. */
#ifndef __APPLE__
        adrp x27, L_sha3_aarch64_r
        add x27, x27, :lo12:L_sha3_aarch64_r
#else
        adrp x27, L_sha3_aarch64_r@PAGE
        add x27, x27, L_sha3_aarch64_r@PAGEOFF
#endif /* __APPLE__ */
        /* Keep the state pointer, x0 is reused as a temporary in the loop. */
        str x0, [x29, #40]
        /* State 0 into lane 0 of v0-v24. */
        ld4 {v0.d, v1.d, v2.d, v3.d}[0], [x0], #32
        ld4 {v4.d, v5.d, v6.d, v7.d}[0], [x0], #32
        ld4 {v8.d, v9.d, v10.d, v11.d}[0], [x0], #32
        ld4 {v12.d, v13.d, v14.d, v15.d}[0], [x0], #32
        ld4 {v16.d, v17.d, v18.d, v19.d}[0], [x0], #32
        ld4 {v20.d, v21.d, v22.d, v23.d}[0], [x0], #32
        ld1 {v24.d}[0], [x0]
        add x0, x0, #8
        /* State 1 into lane 1 of v0-v24. */
        ld4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32
        ld4 {v4.d, v5.d, v6.d, v7.d}[1], [x0], #32
        ld4 {v8.d, v9.d, v10.d, v11.d}[1], [x0], #32
        ld4 {v12.d, v13.d, v14.d, v15.d}[1], [x0], #32
        ld4 {v16.d, v17.d, v18.d, v19.d}[1], [x0], #32
        ld4 {v20.d, v21.d, v22.d, v23.d}[1], [x0], #32
        ld1 {v24.d}[1], [x0]
        add x0, x0, #8
        /* State 2 into general purpose registers. */
        ldp x1, x2, [x0]
        ldp x3, x4, [x0, #16]
        ldp x5, x6, [x0, #32]
        ldp x7, x8, [x0, #48]
        ldp x9, x10, [x0, #64]
        ldp x11, x12, [x0, #80]
        ldp x13, x14, [x0, #96]
        ldp x15, x16, [x0, #112]
        ldp x17, x19, [x0, #128]
        ldp x20, x21, [x0, #144]
        ldp x22, x23, [x0, #160]
        ldp x24, x25, [x0, #176]
        ldr x26, [x0, #192]
        mov x28, #24
        # Start of 24 rounds
L_SHA3_transform_blocksx3_neon_begin:
        /* x27 and x28 are needed as temporaries, park pointer and counter. */
        stp x27, x28, [x29, #48]
        # Col Mix
        eor3 v31.16b, v0.16b, v5.16b, v10.16b
        eor x0, x5, x10
        eor3 v27.16b, v1.16b, v6.16b, v11.16b
        eor x30, x1, x6
        eor3 v28.16b, v2.16b, v7.16b, v12.16b
        eor x28, x3, x8
        eor3 v29.16b, v3.16b, v8.16b, v13.16b
        eor x0, x0, x15
        eor3 v30.16b, v4.16b, v9.16b, v14.16b
        eor x30, x30, x11
        eor3 v31.16b, v31.16b, v15.16b, v20.16b
        eor x28, x28, x13
        eor3 v27.16b, v27.16b, v16.16b, v21.16b
        eor x0, x0, x21
        eor3 v28.16b, v28.16b, v17.16b, v22.16b
        eor x30, x30, x16
        eor3 v29.16b, v29.16b, v18.16b, v23.16b
        eor x28, x28, x19
        eor3 v30.16b, v30.16b, v19.16b, v24.16b
        eor x0, x0, x26
        rax1 v25.2d, v30.2d, v27.2d
        eor x30, x30, x22
        rax1 v26.2d, v31.2d, v28.2d
        eor x28, x28, x24
        rax1 v27.2d, v27.2d, v29.2d
        /* Two scalar column parities are spilled and reloaded further down. */
        str x0, [x29, #32]
        rax1 v28.2d, v28.2d, v30.2d
        str x28, [x29, #24]
        rax1 v29.2d, v29.2d, v31.2d
        eor x27, x2, x7
        eor v0.16b, v0.16b, v25.16b
        /* Each xar xors in the column value and rotates in one instruction.
         * v30 temporarily holds the word that ends up in v10. */
        xar v30.2d, v1.2d, v26.2d, #63
        eor x28, x4, x9
        xar v1.2d, v6.2d, v26.2d, #20
        eor x27, x27, x12
        xar v6.2d, v9.2d, v29.2d, #44
        eor x28, x28, x14
        xar v9.2d, v22.2d, v27.2d, #3
        eor x27, x27, x17
        xar v22.2d, v14.2d, v29.2d, #25
        eor x28, x28, x20
        xar v14.2d, v20.2d, v25.2d, #46
        eor x27, x27, x23
        xar v20.2d, v2.2d, v27.2d, #2
        eor x28, x28, x25
        xar v2.2d, v12.2d, v27.2d, #21
        eor x0, x0, x27, ror 63
        xar v12.2d, v13.2d, v28.2d, #39
        eor x27, x27, x28, ror 63
        xar v13.2d, v19.2d, v29.2d, #56
        eor x1, x1, x0
        xar v19.2d, v23.2d, v28.2d, #8
        eor x6, x6, x0
        xar v23.2d, v15.2d, v25.2d, #23
        eor x11, x11, x0
        xar v15.2d, v4.2d, v29.2d, #37
        eor x16, x16, x0
        xar v4.2d, v24.2d, v29.2d, #50
        eor x22, x22, x0
        xar v24.2d, v21.2d, v26.2d, #62
        eor x3, x3, x27
        xar v21.2d, v8.2d, v28.2d, #9
        eor x8, x8, x27
        xar v8.2d, v16.2d, v26.2d, #19
        eor x13, x13, x27
        xar v16.2d, v5.2d, v25.2d, #28
        eor x19, x19, x27
        xar v5.2d, v3.2d, v28.2d, #36
        eor x24, x24, x27
        xar v3.2d, v18.2d, v28.2d, #43
        ldr x0, [x29, #32]
        xar v18.2d, v17.2d, v27.2d, #49
        ldr x27, [x29, #24]
        xar v17.2d, v11.2d, v26.2d, #54
        eor x28, x28, x30, ror 63
        xar v11.2d, v7.2d, v27.2d, #58
        eor x30, x30, x27, ror 63
        xar v7.2d, v10.2d, v25.2d, #61
        eor x27, x27, x0, ror 63
        # Row Mix
        mov v25.16b, v0.16b
        eor x5, x5, x28
        mov v26.16b, v1.16b
        eor x10, x10, x28
        bcax v0.16b, v25.16b, v2.16b, v26.16b
        eor x15, x15, x28
        bcax v1.16b, v26.16b, v3.16b, v2.16b
        eor x21, x21, x28
        bcax v2.16b, v2.16b, v4.16b, v3.16b
        eor x26, x26, x28
        bcax v3.16b, v3.16b, v25.16b, v4.16b
        eor x2, x2, x30
        bcax v4.16b, v4.16b, v26.16b, v25.16b
        eor x7, x7, x30
        mov v25.16b, v5.16b
        eor x12, x12, x30
        mov v26.16b, v6.16b
        eor x17, x17, x30
        bcax v5.16b, v25.16b, v7.16b, v26.16b
        eor x23, x23, x30
        bcax v6.16b, v26.16b, v8.16b, v7.16b
        eor x4, x4, x27
        bcax v7.16b, v7.16b, v9.16b, v8.16b
        eor x9, x9, x27
        bcax v8.16b, v8.16b, v25.16b, v9.16b
        eor x14, x14, x27
        bcax v9.16b, v9.16b, v26.16b, v25.16b
        eor x20, x20, x27
        mov v26.16b, v11.16b
        eor x25, x25, x27
        # Swap Rotate Base
        bcax v10.16b, v30.16b, v12.16b, v26.16b
        ror x0, x2, #63
        bcax v11.16b, v26.16b, v13.16b, v12.16b
        ror x2, x7, #20
        bcax v12.16b, v12.16b, v14.16b, v13.16b
        ror x7, x10, #44
        bcax v13.16b, v13.16b, v30.16b, v14.16b
        ror x10, x24, #3
        bcax v14.16b, v14.16b, v26.16b, v30.16b
        ror x24, x15, #25
        mov v25.16b, v15.16b
        ror x15, x22, #46
        mov v26.16b, v16.16b
        ror x22, x3, #2
        bcax v15.16b, v25.16b, v17.16b, v26.16b
        ror x3, x13, #21
        bcax v16.16b, v26.16b, v18.16b, v17.16b
        ror x13, x14, #39
        bcax v17.16b, v17.16b, v19.16b, v18.16b
        ror x14, x21, #56
        bcax v18.16b, v18.16b, v25.16b, v19.16b
        ror x21, x25, #8
        bcax v19.16b, v19.16b, v26.16b, v25.16b
        ror x25, x16, #23
        mov v25.16b, v20.16b
        ror x16, x5, #37
        mov v26.16b, v21.16b
        ror x5, x26, #50
        bcax v20.16b, v25.16b, v22.16b, v26.16b
        ror x26, x23, #62
        bcax v21.16b, v26.16b, v23.16b, v22.16b
        ror x23, x9, #9
        bcax v22.16b, v22.16b, v24.16b, v23.16b
        ror x9, x17, #19
        bcax v23.16b, v23.16b, v25.16b, v24.16b
        ror x17, x6, #28
        bcax v24.16b, v24.16b, v26.16b, v25.16b
        ror x6, x4, #36
        ror x4, x20, #43
        ror x20, x19, #49
        ror x19, x12, #54
        ror x12, x8, #58
        ror x8, x11, #61
        # Row Mix Base
        /* Scalar rows: each word is xored with (next-but-one AND NOT next).
         * x0 still holds the rotated word that becomes x11 in the third row. */
        bic x11, x3, x2
        bic x27, x4, x3
        bic x28, x1, x5
        bic x30, x2, x1
        eor x1, x1, x11
        eor x2, x2, x27
        bic x11, x5, x4
        eor x4, x4, x28
        eor x3, x3, x11
        eor x5, x5, x30
        bic x11, x8, x7
        bic x27, x9, x8
        bic x28, x6, x10
        bic x30, x7, x6
        eor x6, x6, x11
        eor x7, x7, x27
        bic x11, x10, x9
        eor x9, x9, x28
        eor x8, x8, x11
        eor x10, x10, x30
        bic x11, x13, x12
        bic x27, x14, x13
        bic x28, x0, x15
        bic x30, x12, x0
        eor x11, x0, x11
        eor x12, x12, x27
        bic x0, x15, x14
        eor x14, x14, x28
        eor x13, x13, x0
        eor x15, x15, x30
        bic x0, x19, x17
        bic x27, x20, x19
        bic x28, x16, x21
        bic x30, x17, x16
        eor x16, x16, x0
        eor x17, x17, x27
        bic x0, x21, x20
        eor x20, x20, x28
        eor x19, x19, x0
        eor x21, x21, x30
        bic x0, x24, x23
        bic x27, x25, x24
        bic x28, x22, x26
        bic x30, x23, x22
        eor x22, x22, x0
        eor x23, x23, x27
        bic x0, x26, x25
        eor x25, x25, x28
        eor x24, x24, x0
        eor x26, x26, x30
        # Done transforming
        /* Fetch the next round constant and xor it into word 0 of all three
         * states. The flags set by subs survive until bne because the mov and
         * eor instructions in between do not write them. */
        ldp x27, x28, [x29, #48]
        ldr x0, [x27], #8
        subs x28, x28, #1
        mov v30.d[0], x0
        mov v30.d[1], x0
        eor x1, x1, x0
        eor v0.16b, v0.16b, v30.16b
        bne L_SHA3_transform_blocksx3_neon_begin
        /* Write the three states back in the layout they were loaded from. */
        ldr x0, [x29, #40]
        st4 {v0.d, v1.d, v2.d, v3.d}[0], [x0], #32
        st4 {v4.d, v5.d, v6.d, v7.d}[0], [x0], #32
        st4 {v8.d, v9.d, v10.d, v11.d}[0], [x0], #32
        st4 {v12.d, v13.d, v14.d, v15.d}[0], [x0], #32
        st4 {v16.d, v17.d, v18.d, v19.d}[0], [x0], #32
        st4 {v20.d, v21.d, v22.d, v23.d}[0], [x0], #32
        st1 {v24.d}[0], [x0]
        add x0, x0, #8
        st4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32
        st4 {v4.d, v5.d, v6.d, v7.d}[1], [x0], #32
        st4 {v8.d, v9.d, v10.d, v11.d}[1], [x0], #32
        st4 {v12.d, v13.d, v14.d, v15.d}[1], [x0], #32
        st4 {v16.d, v17.d, v18.d, v19.d}[1], [x0], #32
        st4 {v20.d, v21.d, v22.d, v23.d}[1], [x0], #32
        st1 {v24.d}[1], [x0]
        add x0, x0, #8
        stp x1, x2, [x0]
        stp x3, x4, [x0, #16]
        stp x5, x6, [x0, #32]
        stp x7, x8, [x0, #48]
        stp x9, x10, [x0, #64]
        stp x11, x12, [x0, #80]
        stp x13, x14, [x0, #96]
        stp x15, x16, [x0, #112]
        stp x17, x19, [x0, #128]
        stp x20, x21, [x0, #144]
        stp x22, x23, [x0, #160]
        stp x24, x25, [x0, #176]
        str x26, [x0, #192]
        ldp x17, x19, [x29, #72]
        ldp x20, x21, [x29, #88]
        ldp x22, x23, [x29, #104]
        ldp x24, x25, [x29, #120]
        ldp x26, x27, [x29, #136]
        ldr x28, [x29, #152]
        ldp d8, d9, [x29, #160]
        ldp d10, d11, [x29, #176]
        ldp d12, d13, [x29, #192]
        ldp d14, d15, [x29, #208]
        ldp x29, x30, [sp], #0xe0
        ret
#ifndef __APPLE__
        .size mlkem_sha3_blocksx3_neon,.-mlkem_sha3_blocksx3_neon
#endif /* __APPLE__ */
#ifndef __APPLE__
        .text
        .globl mlkem_shake128_blocksx3_seed_neon
        .type mlkem_shake128_blocksx3_seed_neon,@function
        .align 2
mlkem_shake128_blocksx3_seed_neon:
#else
        .section __TEXT,__text
        .globl _mlkem_shake128_blocksx3_seed_neon
        .p2align 2
_mlkem_shake128_blocksx3_seed_neon:
#endif /* __APPLE__ */
        /* mlkem_shake128_blocksx3_seed_neon
         *
         * Builds three 25-word states from one 32-byte seed, runs the
         * 24-round permutation on them and stores all three states at x0.
         *   x0  pointer to three 200-byte states (output, source of word 4)
         *   x1  pointer to the 32-byte seed, copied to words 0-3 of each state
         * Word 4 of every state is read from memory (x0+32, x0+232, x0+432),
         * so the caller has to set it before the call. It presumably carries
         * the per-state suffix and domain bits, confirm against the caller.
         * Word 20 is set to 0x8000000000000000, all remaining words to zero.
         * NOTE(review): word 20 is the last word of a 168-byte block, which
         * matches the SHAKE128 rate, so the set bit looks like the final
         * padding bit.
         *
         * State layout during the rounds:
         *   state 0  64-bit lane 0 of v0-v24
         *   state 1  64-bit lane 1 of v0-v24
         *   state 2  x2-x17 and x19-x27 (x18 is never touched)
         * x1 becomes the round counter and x28 the round constant pointer.
         * Both are parked at offsets 48 and 56 of the frame because x0, x1,
         * x28 and x30 are also the scalar temporaries of each round.
         * Frame slots 24 and 32 hold spilled column parities, slot 40 the
         * state pointer. x30 only holds the return address again after the
         * final ldp.
         */
        stp x29, x30, [sp, #-224]!
        add x29, sp, #0
        stp x17, x19, [x29, #72]
        stp x20, x21, [x29, #88]
        stp x22, x23, [x29, #104]
        stp x24, x25, [x29, #120]
        stp x26, x27, [x29, #136]
        str x28, [x29, #152]
        stp d8, d9, [x29, #160]
        stp d10, d11, [x29, #176]
        stp d12, d13, [x29, #192]
        stp d14, d15, [x29, #208]
#ifndef __APPLE__
        adrp x28, L_sha3_aarch64_r
        add x28, x28, :lo12:L_sha3_aarch64_r
#else
        adrp x28, L_sha3_aarch64_r@PAGE
        add x28, x28, L_sha3_aarch64_r@PAGEOFF
#endif /* __APPLE__ */
        str x0, [x29, #40]
        /* Word 4 of states 0, 1 and 2 (200 bytes apart), interleaved with
         * loading the four seed words into x2-x5. */
        add x0, x0, #32
        ld1 {v4.d}[0], [x0]
        ldp x2, x3, [x1], #16
        add x0, x0, #0xc8
        ld1 {v4.d}[1], [x0]
        ldp x4, x5, [x1], #16
        ldr x6, [x0, #200]
        /* Zero words 5-24 of all three states, except word 20. */
        eor v5.16b, v5.16b, v5.16b
        eor x7, x7, x7
        eor v6.16b, v6.16b, v6.16b
        eor x8, x8, x8
        eor v7.16b, v7.16b, v7.16b
        eor x9, x9, x9
        eor v8.16b, v8.16b, v8.16b
        eor x10, x10, x10
        eor v9.16b, v9.16b, v9.16b
        eor x11, x11, x11
        eor v10.16b, v10.16b, v10.16b
        eor x12, x12, x12
        eor v11.16b, v11.16b, v11.16b
        eor x13, x13, x13
        eor v12.16b, v12.16b, v12.16b
        eor x14, x14, x14
        eor v13.16b, v13.16b, v13.16b
        eor x15, x15, x15
        eor v14.16b, v14.16b, v14.16b
        eor x16, x16, x16
        eor v15.16b, v15.16b, v15.16b
        eor x17, x17, x17
        eor v16.16b, v16.16b, v16.16b
        eor x19, x19, x19
        eor v17.16b, v17.16b, v17.16b
        eor x20, x20, x20
        eor v18.16b, v18.16b, v18.16b
        eor x21, x21, x21
        eor v19.16b, v19.16b, v19.16b
        eor x22, x22, x22
        /* Word 20 gets only its top bit set. v20 is filled from x23 below. */
        movz x23, #0x8000, lsl 48
        eor v21.16b, v21.16b, v21.16b
        eor x24, x24, x24
        eor v22.16b, v22.16b, v22.16b
        eor x25, x25, x25
        eor v23.16b, v23.16b, v23.16b
        eor x26, x26, x26
        eor v24.16b, v24.16b, v24.16b
        eor x27, x27, x27
        /* The same seed words and word 20 go to both vector lanes. */
        dup v0.2d, x2
        dup v1.2d, x3
        dup v2.2d, x4
        dup v3.2d, x5
        dup v20.2d, x23
        mov x1, #24
        # Start of 24 rounds
L_SHA3_shake128_blocksx3_seed_neon_begin:
        stp x28, x1, [x29, #48]
        # Col Mix
        eor3 v31.16b, v0.16b, v5.16b, v10.16b
        eor x0, x6, x11
        eor3 v27.16b, v1.16b, v6.16b, v11.16b
        eor x30, x2, x7
        eor3 v28.16b, v2.16b, v7.16b, v12.16b
        eor x28, x4, x9
        eor3 v29.16b, v3.16b, v8.16b, v13.16b
        eor x0, x0, x16
        eor3 v30.16b, v4.16b, v9.16b, v14.16b
        eor x30, x30, x12
        eor3 v31.16b, v31.16b, v15.16b, v20.16b
        eor x28, x28, x14
        eor3 v27.16b, v27.16b, v16.16b, v21.16b
        eor x0, x0, x22
        eor3 v28.16b, v28.16b, v17.16b, v22.16b
        eor x30, x30, x17
        eor3 v29.16b, v29.16b, v18.16b, v23.16b
        eor x28, x28, x20
        eor3 v30.16b, v30.16b, v19.16b, v24.16b
        eor x0, x0, x27
        rax1 v25.2d, v30.2d, v27.2d
        eor x30, x30, x23
        rax1 v26.2d, v31.2d, v28.2d
        eor x28, x28, x25
        rax1 v27.2d, v27.2d, v29.2d
        str x0, [x29, #32]
        rax1 v28.2d, v28.2d, v30.2d
        str x28, [x29, #24]
        rax1 v29.2d, v29.2d, v31.2d
        eor x1, x3, x8
        eor v0.16b, v0.16b, v25.16b
        xar v30.2d, v1.2d, v26.2d, #63
        eor x28, x5, x10
        xar v1.2d, v6.2d, v26.2d, #20
        eor x1, x1, x13
        xar v6.2d, v9.2d, v29.2d, #44
        eor x28, x28, x15
        xar v9.2d, v22.2d, v27.2d, #3
        eor x1, x1, x19
        xar v22.2d, v14.2d, v29.2d, #25
        eor x28, x28, x21
        xar v14.2d, v20.2d, v25.2d, #46
        eor x1, x1, x24
        xar v20.2d, v2.2d, v27.2d, #2
        eor x28, x28, x26
        xar v2.2d, v12.2d, v27.2d, #21
        eor x0, x0, x1, ror 63
        xar v12.2d, v13.2d, v28.2d, #39
        eor x1, x1, x28, ror 63
        xar v13.2d, v19.2d, v29.2d, #56
        eor x2, x2, x0
        xar v19.2d, v23.2d, v28.2d, #8
        eor x7, x7, x0
        xar v23.2d, v15.2d, v25.2d, #23
        eor x12, x12, x0
        xar v15.2d, v4.2d, v29.2d, #37
        eor x17, x17, x0
        xar v4.2d, v24.2d, v29.2d, #50
        eor x23, x23, x0
        xar v24.2d, v21.2d, v26.2d, #62
        eor x4, x4, x1
        xar v21.2d, v8.2d, v28.2d, #9
        eor x9, x9, x1
        xar v8.2d, v16.2d, v26.2d, #19
        eor x14, x14, x1
        xar v16.2d, v5.2d, v25.2d, #28
        eor x20, x20, x1
        xar v5.2d, v3.2d, v28.2d, #36
        eor x25, x25, x1
        xar v3.2d, v18.2d, v28.2d, #43
        ldr x0, [x29, #32]
        xar v18.2d, v17.2d, v27.2d, #49
        ldr x1, [x29, #24]
        xar v17.2d, v11.2d, v26.2d, #54
        eor x28, x28, x30, ror 63
        xar v11.2d, v7.2d, v27.2d, #58
        eor x30, x30, x1, ror 63
        xar v7.2d, v10.2d, v25.2d, #61
        eor x1, x1, x0, ror 63
        # Row Mix
        mov v25.16b, v0.16b
        eor x6, x6, x28
        mov v26.16b, v1.16b
        eor x11, x11, x28
        bcax v0.16b, v25.16b, v2.16b, v26.16b
        eor x16, x16, x28
        bcax v1.16b, v26.16b, v3.16b, v2.16b
        eor x22, x22, x28
        bcax v2.16b, v2.16b, v4.16b, v3.16b
        eor x27, x27, x28
        bcax v3.16b, v3.16b, v25.16b, v4.16b
        eor x3, x3, x30
        bcax v4.16b, v4.16b, v26.16b, v25.16b
        eor x8, x8, x30
        mov v25.16b, v5.16b
        eor x13, x13, x30
        mov v26.16b, v6.16b
        eor x19, x19, x30
        bcax v5.16b, v25.16b, v7.16b, v26.16b
        eor x24, x24, x30
        bcax v6.16b, v26.16b, v8.16b, v7.16b
        eor x5, x5, x1
        bcax v7.16b, v7.16b, v9.16b, v8.16b
        eor x10, x10, x1
        bcax v8.16b, v8.16b, v25.16b, v9.16b
        eor x15, x15, x1
        bcax v9.16b, v9.16b, v26.16b, v25.16b
        eor x21, x21, x1
        mov v26.16b, v11.16b
        eor x26, x26, x1
        # Swap Rotate Base
        bcax v10.16b, v30.16b, v12.16b, v26.16b
        ror x0, x3, #63
        bcax v11.16b, v26.16b, v13.16b, v12.16b
        ror x3, x8, #20
        bcax v12.16b, v12.16b, v14.16b, v13.16b
        ror x8, x11, #44
        bcax v13.16b, v13.16b, v30.16b, v14.16b
        ror x11, x25, #3
        bcax v14.16b, v14.16b, v26.16b, v30.16b
        ror x25, x16, #25
        mov v25.16b, v15.16b
        ror x16, x23, #46
        mov v26.16b, v16.16b
        ror x23, x4, #2
        bcax v15.16b, v25.16b, v17.16b, v26.16b
        ror x4, x14, #21
        bcax v16.16b, v26.16b, v18.16b, v17.16b
        ror x14, x15, #39
        bcax v17.16b, v17.16b, v19.16b, v18.16b
        ror x15, x22, #56
        bcax v18.16b, v18.16b, v25.16b, v19.16b
        ror x22, x26, #8
        bcax v19.16b, v19.16b, v26.16b, v25.16b
        ror x26, x17, #23
        mov v25.16b, v20.16b
        ror x17, x6, #37
        mov v26.16b, v21.16b
        ror x6, x27, #50
        bcax v20.16b, v25.16b, v22.16b, v26.16b
        ror x27, x24, #62
        bcax v21.16b, v26.16b, v23.16b, v22.16b
        ror x24, x10, #9
        bcax v22.16b, v22.16b, v24.16b, v23.16b
        ror x10, x19, #19
        bcax v23.16b, v23.16b, v25.16b, v24.16b
        ror x19, x7, #28
        bcax v24.16b, v24.16b, v26.16b, v25.16b
        ror x7, x5, #36
        ror x5, x21, #43
        ror x21, x20, #49
        ror x20, x13, #54
        ror x13, x9, #58
        ror x9, x12, #61
        # Row Mix Base
        /* x0 still holds the rotated word that becomes x12 in the third row. */
        bic x12, x4, x3
        bic x1, x5, x4
        bic x28, x2, x6
        bic x30, x3, x2
        eor x2, x2, x12
        eor x3, x3, x1
        bic x12, x6, x5
        eor x5, x5, x28
        eor x4, x4, x12
        eor x6, x6, x30
        bic x12, x9, x8
        bic x1, x10, x9
        bic x28, x7, x11
        bic x30, x8, x7
        eor x7, x7, x12
        eor x8, x8, x1
        bic x12, x11, x10
        eor x10, x10, x28
        eor x9, x9, x12
        eor x11, x11, x30
        bic x12, x14, x13
        bic x1, x15, x14
        bic x28, x0, x16
        bic x30, x13, x0
        eor x12, x0, x12
        eor x13, x13, x1
        bic x0, x16, x15
        eor x15, x15, x28
        eor x14, x14, x0
        eor x16, x16, x30
        bic x0, x20, x19
        bic x1, x21, x20
        bic x28, x17, x22
        bic x30, x19, x17
        eor x17, x17, x0
        eor x19, x19, x1
        bic x0, x22, x21
        eor x21, x21, x28
        eor x20, x20, x0
        eor x22, x22, x30
        bic x0, x25, x24
        bic x1, x26, x25
        bic x28, x23, x27
        bic x30, x24, x23
        eor x23, x23, x0
        eor x24, x24, x1
        bic x0, x27, x26
        eor x26, x26, x28
        eor x25, x25, x0
        eor x27, x27, x30
        # Done transforming
        /* Next round constant into word 0 of all three states. The flags set
         * by subs are still intact at bne. */
        ldp x28, x1, [x29, #48]
        ldr x0, [x28], #8
        subs x1, x1, #1
        mov v30.d[0], x0
        mov v30.d[1], x0
        eor x2, x2, x0
        eor v0.16b, v0.16b, v30.16b
        bne L_SHA3_shake128_blocksx3_seed_neon_begin
        /* Store state 0, state 1 and then state 2, 200 bytes each. */
        ldr x0, [x29, #40]
        st4 {v0.d, v1.d, v2.d, v3.d}[0], [x0], #32
        st4 {v4.d, v5.d, v6.d, v7.d}[0], [x0], #32
        st4 {v8.d, v9.d, v10.d, v11.d}[0], [x0], #32
        st4 {v12.d, v13.d, v14.d, v15.d}[0], [x0], #32
        st4 {v16.d, v17.d, v18.d, v19.d}[0], [x0], #32
        st4 {v20.d, v21.d, v22.d, v23.d}[0], [x0], #32
        st1 {v24.d}[0], [x0]
        add x0, x0, #8
        st4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32
        st4 {v4.d, v5.d, v6.d, v7.d}[1], [x0], #32
        st4 {v8.d, v9.d, v10.d, v11.d}[1], [x0], #32
        st4 {v12.d, v13.d, v14.d, v15.d}[1], [x0], #32
        st4 {v16.d, v17.d, v18.d, v19.d}[1], [x0], #32
        st4 {v20.d, v21.d, v22.d, v23.d}[1], [x0], #32
        st1 {v24.d}[1], [x0]
        add x0, x0, #8
        stp x2, x3, [x0]
        stp x4, x5, [x0, #16]
        stp x6, x7, [x0, #32]
        stp x8, x9, [x0, #48]
        stp x10, x11, [x0, #64]
        stp x12, x13, [x0, #80]
        stp x14, x15, [x0, #96]
        stp x16, x17, [x0, #112]
        stp x19, x20, [x0, #128]
        stp x21, x22, [x0, #144]
        stp x23, x24, [x0, #160]
        stp x25, x26, [x0, #176]
        str x27, [x0, #192]
        ldp x17, x19, [x29, #72]
        ldp x20, x21, [x29, #88]
        ldp x22, x23, [x29, #104]
        ldp x24, x25, [x29, #120]
        ldp x26, x27, [x29, #136]
        ldr x28, [x29, #152]
        ldp d8, d9, [x29, #160]
        ldp d10, d11, [x29, #176]
        ldp d12, d13, [x29, #192]
        ldp d14, d15, [x29, #208]
        ldp x29, x30, [sp], #0xe0
        ret
#ifndef __APPLE__
        .size
mlkem_shake128_blocksx3_seed_neon,.-mlkem_shake128_blocksx3_seed_neon
#endif /* __APPLE__ */
/* mlkem_shake256_blocksx3_seed_neon
 *
 * Builds three 25-word states from one 32-byte seed, runs the 24-round
 * permutation on them and stores all three states at x0.
 *   x0  pointer to three 200-byte states (output, source of word 4)
 *   x1  pointer to the 32-byte seed, copied to words 0-3 of each state
 * Word 4 of every state is read from memory (x0+32, x0+232, x0+432), so the
 * caller has to set it before the call. It presumably carries the per-state
 * suffix and domain bits, confirm against the caller.
 * Word 16 is set to 0x8000000000000000, all remaining words to zero.
 * NOTE(review): word 16 is the last word of a 136-byte block, which matches
 * the SHAKE256 rate, so the set bit looks like the final padding bit.
 *
 * State layout during the rounds:
 *   state 0  64-bit lane 0 of v0-v24
 *   state 1  64-bit lane 1 of v0-v24
 *   state 2  x2-x17 and x19-x27 (x18 is never touched)
 * x1 becomes the round counter and x28 the round constant pointer. Both are
 * parked at offsets 48 and 56 of the frame because x0, x1, x28 and x30 are
 * also the scalar temporaries of each round. Frame slots 24 and 32 hold
 * spilled column parities, slot 40 the state pointer. x30 only holds the
 * return address again after the final ldp.
 */
#ifndef __APPLE__
        .text
        .globl mlkem_shake256_blocksx3_seed_neon
        .type mlkem_shake256_blocksx3_seed_neon,@function
        .align 2
mlkem_shake256_blocksx3_seed_neon:
#else
        .section __TEXT,__text
        .globl _mlkem_shake256_blocksx3_seed_neon
        .p2align 2
_mlkem_shake256_blocksx3_seed_neon:
#endif /* __APPLE__ */
        stp x29, x30, [sp, #-224]!
        add x29, sp, #0
        stp x17, x19, [x29, #72]
        stp x20, x21, [x29, #88]
        stp x22, x23, [x29, #104]
        stp x24, x25, [x29, #120]
        stp x26, x27, [x29, #136]
        str x28, [x29, #152]
        stp d8, d9, [x29, #160]
        stp d10, d11, [x29, #176]
        stp d12, d13, [x29, #192]
        stp d14, d15, [x29, #208]
#ifndef __APPLE__
        adrp x28, L_sha3_aarch64_r
        add x28, x28, :lo12:L_sha3_aarch64_r
#else
        adrp x28, L_sha3_aarch64_r@PAGE
        add x28, x28, L_sha3_aarch64_r@PAGEOFF
#endif /* __APPLE__ */
        str x0, [x29, #40]
        /* Word 4 of states 0, 1 and 2 (200 bytes apart), interleaved with
         * loading the four seed words into x2-x5. */
        add x0, x0, #32
        ld1 {v4.d}[0], [x0]
        ldp x2, x3, [x1], #16
        add x0, x0, #0xc8
        ld1 {v4.d}[1], [x0]
        ldp x4, x5, [x1], #16
        ldr x6, [x0, #200]
        /* Zero words 5-24 of all three states, except word 16. */
        eor v5.16b, v5.16b, v5.16b
        eor x7, x7, x7
        eor v6.16b, v6.16b, v6.16b
        eor x8, x8, x8
        eor v7.16b, v7.16b, v7.16b
        eor x9, x9, x9
        eor v8.16b, v8.16b, v8.16b
        eor x10, x10, x10
        eor v9.16b, v9.16b, v9.16b
        eor x11, x11, x11
        eor v10.16b, v10.16b, v10.16b
        eor x12, x12, x12
        eor v11.16b, v11.16b, v11.16b
        eor x13, x13, x13
        eor v12.16b, v12.16b, v12.16b
        eor x14, x14, x14
        eor v13.16b, v13.16b, v13.16b
        eor x15, x15, x15
        eor v14.16b, v14.16b, v14.16b
        eor x16, x16, x16
        eor v15.16b, v15.16b, v15.16b
        eor x17, x17, x17
        /* Word 16 gets only its top bit set. v16 is filled from x19 below. */
        movz x19, #0x8000, lsl 48
        eor v17.16b, v17.16b, v17.16b
        eor x20, x20, x20
        eor v18.16b, v18.16b, v18.16b
        eor x21, x21, x21
        eor v19.16b, v19.16b, v19.16b
        eor x22, x22, x22
        eor v20.16b, v20.16b, v20.16b
        eor x23, x23, x23
        eor v21.16b, v21.16b, v21.16b
        eor x24, x24, x24
        eor v22.16b, v22.16b, v22.16b
        eor x25, x25, x25
        eor v23.16b, v23.16b, v23.16b
        eor x26, x26, x26
        eor v24.16b, v24.16b, v24.16b
        eor x27, x27, x27
        /* The same seed words and word 16 go to both vector lanes. */
        dup v0.2d, x2
        dup v1.2d, x3
        dup v2.2d, x4
        dup v3.2d, x5
        dup v16.2d, x19
        mov x1, #24
        # Start of 24 rounds
L_SHA3_shake256_blocksx3_seed_neon_begin:
        stp x28, x1, [x29, #48]
        # Col Mix
        eor3 v31.16b, v0.16b, v5.16b, v10.16b
        eor x0, x6, x11
        eor3 v27.16b, v1.16b, v6.16b, v11.16b
        eor x30, x2, x7
        eor3 v28.16b, v2.16b, v7.16b, v12.16b
        eor x28, x4, x9
        eor3 v29.16b, v3.16b, v8.16b, v13.16b
        eor x0, x0, x16
        eor3 v30.16b, v4.16b, v9.16b, v14.16b
        eor x30, x30, x12
        eor3 v31.16b, v31.16b, v15.16b, v20.16b
        eor x28, x28, x14
        eor3 v27.16b, v27.16b, v16.16b, v21.16b
        eor x0, x0, x22
        eor3 v28.16b, v28.16b, v17.16b, v22.16b
        eor x30, x30, x17
        eor3 v29.16b, v29.16b, v18.16b, v23.16b
        eor x28, x28, x20
        eor3 v30.16b, v30.16b, v19.16b, v24.16b
        eor x0, x0, x27
        rax1 v25.2d, v30.2d, v27.2d
        eor x30, x30, x23
        rax1 v26.2d, v31.2d, v28.2d
        eor x28, x28, x25
        rax1 v27.2d, v27.2d, v29.2d
        str x0, [x29, #32]
        rax1 v28.2d, v28.2d, v30.2d
        str x28, [x29, #24]
        rax1 v29.2d, v29.2d, v31.2d
        eor x1, x3, x8
        eor v0.16b, v0.16b, v25.16b
        xar v30.2d, v1.2d, v26.2d, #63
        eor x28, x5, x10
        xar v1.2d, v6.2d, v26.2d, #20
        eor x1, x1, x13
        xar v6.2d, v9.2d, v29.2d, #44
        eor x28, x28, x15
        xar v9.2d, v22.2d, v27.2d, #3
        eor x1, x1, x19
        xar v22.2d, v14.2d, v29.2d, #25
        eor x28, x28, x21
        xar v14.2d, v20.2d, v25.2d, #46
        eor x1, x1, x24
        xar v20.2d, v2.2d, v27.2d, #2
        eor x28, x28, x26
        xar v2.2d, v12.2d, v27.2d, #21
        eor x0, x0, x1, ror 63
        xar v12.2d, v13.2d, v28.2d, #39
        eor x1, x1, x28, ror 63
        xar v13.2d, v19.2d, v29.2d, #56
        eor x2, x2, x0
        xar v19.2d, v23.2d, v28.2d, #8
        eor x7, x7, x0
        xar v23.2d, v15.2d, v25.2d, #23
        eor x12, x12, x0
        xar v15.2d, v4.2d, v29.2d, #37
        eor x17, x17, x0
        xar v4.2d, v24.2d, v29.2d, #50
        eor x23, x23, x0
        xar v24.2d, v21.2d, v26.2d, #62
        eor x4, x4, x1
        xar v21.2d, v8.2d, v28.2d, #9
        eor x9, x9, x1
        xar v8.2d, v16.2d, v26.2d, #19
        eor x14, x14, x1
        xar v16.2d, v5.2d, v25.2d, #28
        eor x20, x20, x1
        xar v5.2d, v3.2d, v28.2d, #36
        eor x25, x25, x1
        xar v3.2d, v18.2d, v28.2d, #43
        ldr x0, [x29, #32]
        xar v18.2d, v17.2d, v27.2d, #49
        ldr x1, [x29, #24]
        xar v17.2d, v11.2d, v26.2d, #54
        eor x28, x28, x30, ror 63
        xar v11.2d, v7.2d, v27.2d, #58
        eor x30, x30, x1, ror 63
        xar v7.2d, v10.2d, v25.2d, #61
        eor x1, x1, x0, ror 63
        # Row Mix
        mov v25.16b, v0.16b
        eor x6, x6, x28
        mov v26.16b, v1.16b
        eor x11, x11, x28
        bcax v0.16b, v25.16b, v2.16b, v26.16b
        eor x16, x16, x28
        bcax v1.16b, v26.16b, v3.16b, v2.16b
        eor x22, x22, x28
        bcax v2.16b, v2.16b, v4.16b, v3.16b
        eor x27, x27, x28
        bcax v3.16b, v3.16b, v25.16b, v4.16b
        eor x3, x3, x30
        bcax v4.16b, v4.16b, v26.16b, v25.16b
        eor x8, x8, x30
        mov v25.16b, v5.16b
        eor x13, x13, x30
        mov v26.16b, v6.16b
        eor x19, x19, x30
        bcax v5.16b, v25.16b, v7.16b, v26.16b
        eor x24, x24, x30
        bcax v6.16b, v26.16b, v8.16b, v7.16b
        eor x5, x5, x1
        bcax v7.16b, v7.16b, v9.16b, v8.16b
        eor x10, x10, x1
        bcax v8.16b, v8.16b, v25.16b, v9.16b
        eor x15, x15, x1
        bcax v9.16b, v9.16b, v26.16b, v25.16b
        eor x21, x21, x1
        mov v26.16b, v11.16b
        eor x26, x26, x1
        # Swap Rotate Base
        bcax v10.16b, v30.16b, v12.16b, v26.16b
        ror x0, x3, #63
        bcax v11.16b, v26.16b, v13.16b, v12.16b
        ror x3, x8, #20
        bcax v12.16b, v12.16b, v14.16b, v13.16b
        ror x8, x11, #44
        bcax v13.16b, v13.16b, v30.16b, v14.16b
        ror x11, x25, #3
        bcax v14.16b, v14.16b, v26.16b, v30.16b
        ror x25, x16, #25
        mov v25.16b, v15.16b
        ror x16, x23, #46
        mov v26.16b, v16.16b
        ror x23, x4, #2
        bcax v15.16b, v25.16b, v17.16b, v26.16b
        ror x4, x14, #21
        bcax v16.16b, v26.16b, v18.16b, v17.16b
        ror x14, x15, #39
        bcax v17.16b, v17.16b, v19.16b, v18.16b
        ror x15, x22, #56
        bcax v18.16b, v18.16b, v25.16b, v19.16b
        ror x22, x26, #8
        bcax v19.16b, v19.16b, v26.16b, v25.16b
        ror x26, x17, #23
        mov v25.16b, v20.16b
        ror x17, x6, #37
        mov v26.16b, v21.16b
        ror x6, x27, #50
        bcax v20.16b, v25.16b, v22.16b, v26.16b
        ror x27, x24, #62
        bcax v21.16b, v26.16b, v23.16b, v22.16b
        ror x24, x10, #9
        bcax v22.16b, v22.16b, v24.16b, v23.16b
        ror x10, x19, #19
        bcax v23.16b, v23.16b, v25.16b, v24.16b
        ror x19, x7, #28
        bcax v24.16b, v24.16b, v26.16b, v25.16b
        ror x7, x5, #36
        ror x5, x21, #43
        ror x21, x20, #49
        ror x20, x13, #54
        ror x13, x9, #58
        ror x9, x12, #61
        # Row Mix Base
        /* x0 still holds the rotated word that becomes x12 in the third row. */
        bic x12, x4, x3
        bic x1, x5, x4
        bic x28, x2, x6
        bic x30, x3, x2
        eor x2, x2, x12
        eor x3, x3, x1
        bic x12, x6, x5
        eor x5, x5, x28
        eor x4, x4, x12
        eor x6, x6, x30
        bic x12, x9, x8
        bic x1, x10, x9
        bic x28, x7, x11
        bic x30, x8, x7
        eor x7, x7, x12
        eor x8, x8, x1
        bic x12, x11, x10
        eor x10, x10, x28
        eor x9, x9, x12
        eor x11, x11, x30
        bic x12, x14, x13
        bic x1, x15, x14
        bic x28, x0, x16
        bic x30, x13, x0
        eor x12, x0, x12
        eor x13, x13, x1
        bic x0, x16, x15
        eor x15, x15, x28
        eor x14, x14, x0
        eor x16, x16, x30
        bic x0, x20, x19
        bic x1, x21, x20
        bic x28, x17, x22
        bic x30, x19, x17
        eor x17, x17, x0
        eor x19, x19, x1
        bic x0, x22, x21
        eor x21, x21, x28
        eor x20, x20, x0
        eor x22, x22, x30
        bic x0, x25, x24
        bic x1, x26, x25
        bic x28, x23, x27
        bic x30, x24, x23
        eor x23, x23, x0
        eor x24, x24, x1
        bic x0, x27, x26
        eor x26, x26, x28
        eor x25, x25, x0
        eor x27, x27, x30
        # Done transforming
        /* Next round constant into word 0 of all three states. The flags set
         * by subs are still intact at bne. */
        ldp x28, x1, [x29, #48]
        ldr x0, [x28], #8
        subs x1, x1, #1
        mov v30.d[0], x0
        mov v30.d[1], x0
        eor x2, x2, x0
        eor v0.16b, v0.16b, v30.16b
        bne L_SHA3_shake256_blocksx3_seed_neon_begin
        /* Store state 0, state 1 and then state 2, 200 bytes each. */
        ldr x0, [x29, #40]
        st4 {v0.d, v1.d, v2.d, v3.d}[0], [x0], #32
        st4 {v4.d, v5.d, v6.d, v7.d}[0], [x0], #32
        st4 {v8.d, v9.d, v10.d, v11.d}[0], [x0], #32
        st4 {v12.d, v13.d, v14.d, v15.d}[0], [x0], #32
        st4 {v16.d, v17.d, v18.d, v19.d}[0], [x0], #32
        st4 {v20.d, v21.d, v22.d, v23.d}[0], [x0], #32
        st1 {v24.d}[0], [x0]
        add x0, x0, #8
        st4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32
        st4 {v4.d, v5.d, v6.d, v7.d}[1], [x0], #32
        st4 {v8.d, v9.d, v10.d, v11.d}[1], [x0], #32
        st4 {v12.d, v13.d, v14.d, v15.d}[1], [x0], #32
        st4 {v16.d, v17.d, v18.d, v19.d}[1], [x0], #32
        st4 {v20.d, v21.d, v22.d, v23.d}[1], [x0], #32
        st1 {v24.d}[1], [x0]
        add x0, x0, #8
        stp x2, x3, [x0]
        stp x4, x5, [x0, #16]
        stp x6, x7, [x0, #32]
        stp x8, x9, [x0, #48]
        stp x10, x11, [x0, #64]
        stp x12, x13, [x0, #80]
        stp x14, x15, [x0, #96]
        stp x16, x17, [x0, #112]
        stp x19, x20, [x0, #128]
        stp x21, x22, [x0, #144]
        stp x23, x24, [x0, #160]
        stp x25, x26, [x0, #176]
        str x27, [x0, #192]
        ldp x17, x19, [x29, #72]
        ldp x20, x21, [x29, #88]
        ldp x22, x23, [x29, #104]
        ldp x24, x25, [x29, #120]
        ldp x26, x27, [x29, #136]
        ldr x28, [x29, #152]
        ldp d8, d9, [x29, #160]
        ldp d10, d11, [x29, #176]
        ldp d12, d13, [x29, #192]
        ldp d14, d15, [x29, #208]
        ldp x29, x30, [sp], #0xe0
        ret
#ifndef __APPLE__
        .size mlkem_shake256_blocksx3_seed_neon,.-mlkem_shake256_blocksx3_seed_neon
#endif /* __APPLE__ */
#else
/* Build of mlkem_sha3_blocksx3_neon that is assembled when
 * WOLFSSL_ARMASM_CRYPTO_SHA3 is not defined. Only its prologue and state
 * loads are shown here, the round loop follows. */
#ifndef __APPLE__
        .text
        .globl mlkem_sha3_blocksx3_neon
        .type mlkem_sha3_blocksx3_neon,@function
        .align 2
mlkem_sha3_blocksx3_neon:
#else
        .section __TEXT,__text
        .globl _mlkem_sha3_blocksx3_neon
        .p2align 2
_mlkem_sha3_blocksx3_neon:
#endif /* __APPLE__ */
        stp x29, x30, [sp, #-224]!
        add x29, sp, #0
        stp x17, x19, [x29, #72]
        stp x20, x21, [x29, #88]
        stp x22, x23, [x29, #104]
        stp x24, x25, [x29, #120]
        stp x26, x27, [x29, #136]
        str x28, [x29, #152]
        stp d8, d9, [x29, #160]
        stp d10, d11, [x29, #176]
        stp d12, d13, [x29, #192]
        stp d14, d15, [x29, #208]
#ifndef __APPLE__
        adrp x27, L_sha3_aarch64_r
        add x27, x27, :lo12:L_sha3_aarch64_r
#else
        adrp x27, L_sha3_aarch64_r@PAGE
        add x27, x27, L_sha3_aarch64_r@PAGEOFF
#endif /* __APPLE__ */
        str x0, [x29, #40]
        ld4 {v0.d, v1.d, v2.d, v3.d}[0], [x0], #32
        ld4 {v4.d, v5.d, v6.d, v7.d}[0], [x0], #32
        ld4 {v8.d, v9.d, v10.d, v11.d}[0], [x0], #32
        ld4 {v12.d, v13.d, v14.d, v15.d}[0], [x0], #32
        ld4 {v16.d, v17.d, v18.d, v19.d}[0], [x0], #32
        ld4 {v20.d, v21.d, v22.d, v23.d}[0], [x0], #32
        ld1 {v24.d}[0], [x0]
        add x0, x0, #8
        ld4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32
        ld4 {v4.d, v5.d, v6.d, v7.d}[1], [x0], #32
        ld4 {v8.d, v9.d, v10.d, v11.d}[1], [x0], #32
        ld4 {v12.d, v13.d, v14.d, v15.d}[1], [x0], #32
        ld4 {v16.d, v17.d, v18.d, v19.d}[1], [x0], #32
        ld4 {v20.d, v21.d, v22.d, v23.d}[1], [x0], #32
        ld1 {v24.d}[1], [x0]
        add x0, x0, #8
        ldp x1, x2, [x0]
        ldp x3, x4, [x0, #16]
        ldp x5, x6, [x0, #32]
        ldp x7, x8, [x0, #48]
        ldp x9, x10, [x0, #64]
        ldp x11, x12, [x0, #80]
        ldp x13, x14, [x0, #96]
        ldp x15, x16, [x0, #112]
        ldp x17, x19, [x0, #128]
        ldp x20, x21, [x0, #144]
        ldp x22, x23, [x0,
#160] ldp x24, x25, [x0, #176] ldr x26, [x0, #192] mov x28, #24 # Start of 24 rounds L_SHA3_transform_blocksx3_neon_begin: stp x27, x28, [x29, #48] # Col Mix NEON eor v30.16b, v4.16b, v9.16b eor x0, x5, x10 eor v27.16b, v1.16b, v6.16b eor x30, x1, x6 eor v30.16b, v30.16b, v14.16b eor x28, x3, x8 eor v27.16b, v27.16b, v11.16b eor x0, x0, x15 eor v30.16b, v30.16b, v19.16b eor x30, x30, x11 eor v27.16b, v27.16b, v16.16b eor x28, x28, x13 eor v30.16b, v30.16b, v24.16b eor x0, x0, x21 eor v27.16b, v27.16b, v21.16b eor x30, x30, x16 ushr v25.2d, v27.2d, #63 eor x28, x28, x19 sli v25.2d, v27.2d, #1 eor x0, x0, x26 eor v25.16b, v25.16b, v30.16b eor x30, x30, x22 eor v31.16b, v0.16b, v5.16b eor x28, x28, x24 eor v28.16b, v2.16b, v7.16b str x0, [x29, #32] eor v31.16b, v31.16b, v10.16b str x28, [x29, #24] eor v28.16b, v28.16b, v12.16b eor x27, x2, x7 eor v31.16b, v31.16b, v15.16b eor x28, x4, x9 eor v28.16b, v28.16b, v17.16b eor x27, x27, x12 eor v31.16b, v31.16b, v20.16b eor x28, x28, x14 eor v28.16b, v28.16b, v22.16b eor x27, x27, x17 ushr v29.2d, v30.2d, #63 eor x28, x28, x20 ushr v26.2d, v28.2d, #63 eor x27, x27, x23 sli v29.2d, v30.2d, #1 eor x28, x28, x25 sli v26.2d, v28.2d, #1 eor x0, x0, x27, ror 63 eor v28.16b, v28.16b, v29.16b eor x27, x27, x28, ror 63 eor v29.16b, v3.16b, v8.16b eor x1, x1, x0 eor v26.16b, v26.16b, v31.16b eor x6, x6, x0 eor v29.16b, v29.16b, v13.16b eor x11, x11, x0 eor v29.16b, v29.16b, v18.16b eor x16, x16, x0 eor v29.16b, v29.16b, v23.16b eor x22, x22, x0 ushr v30.2d, v29.2d, #63 eor x3, x3, x27 sli v30.2d, v29.2d, #1 eor x8, x8, x27 eor v27.16b, v27.16b, v30.16b eor x13, x13, x27 ushr v30.2d, v31.2d, #63 eor x19, x19, x27 sli v30.2d, v31.2d, #1 eor x24, x24, x27 eor v29.16b, v29.16b, v30.16b ldr x0, [x29, #32] # Swap Rotate NEON eor v0.16b, v0.16b, v25.16b eor v31.16b, v1.16b, v26.16b ldr x27, [x29, #24] eor v6.16b, v6.16b, v26.16b eor x28, x28, x30, ror 63 ushr v30.2d, v31.2d, #63 eor x30, x30, x27, ror 63 ushr v1.2d, v6.2d, #20 eor x27, x27, 
x0, ror 63 sli v30.2d, v31.2d, #1 eor x5, x5, x28 sli v1.2d, v6.2d, #44 eor x10, x10, x28 eor v31.16b, v9.16b, v29.16b eor x15, x15, x28 eor v22.16b, v22.16b, v27.16b eor x21, x21, x28 ushr v6.2d, v31.2d, #44 eor x26, x26, x28 ushr v9.2d, v22.2d, #3 eor x2, x2, x30 sli v6.2d, v31.2d, #20 eor x7, x7, x30 sli v9.2d, v22.2d, #61 eor x12, x12, x30 eor v31.16b, v14.16b, v29.16b eor x17, x17, x30 eor v20.16b, v20.16b, v25.16b eor x23, x23, x30 ushr v22.2d, v31.2d, #25 eor x4, x4, x27 ushr v14.2d, v20.2d, #46 eor x9, x9, x27 sli v22.2d, v31.2d, #39 eor x14, x14, x27 sli v14.2d, v20.2d, #18 eor x20, x20, x27 eor v31.16b, v2.16b, v27.16b eor x25, x25, x27 # Swap Rotate Base eor v12.16b, v12.16b, v27.16b ror x0, x2, #63 ushr v20.2d, v31.2d, #2 ror x2, x7, #20 ushr v2.2d, v12.2d, #21 ror x7, x10, #44 sli v20.2d, v31.2d, #62 ror x10, x24, #3 sli v2.2d, v12.2d, #43 ror x24, x15, #25 eor v31.16b, v13.16b, v28.16b ror x15, x22, #46 eor v19.16b, v19.16b, v29.16b ror x22, x3, #2 ushr v12.2d, v31.2d, #39 ror x3, x13, #21 ushr v13.2d, v19.2d, #56 ror x13, x14, #39 sli v12.2d, v31.2d, #25 ror x14, x21, #56 sli v13.2d, v19.2d, #8 ror x21, x25, #8 eor v31.16b, v23.16b, v28.16b ror x25, x16, #23 eor v15.16b, v15.16b, v25.16b ror x16, x5, #37 ushr v19.2d, v31.2d, #8 ror x5, x26, #50 ushr v23.2d, v15.2d, #23 ror x26, x23, #62 sli v19.2d, v31.2d, #56 ror x23, x9, #9 sli v23.2d, v15.2d, #41 ror x9, x17, #19 eor v31.16b, v4.16b, v29.16b ror x17, x6, #28 eor v24.16b, v24.16b, v29.16b ror x6, x4, #36 ushr v15.2d, v31.2d, #37 ror x4, x20, #43 ushr v4.2d, v24.2d, #50 ror x20, x19, #49 sli v15.2d, v31.2d, #27 ror x19, x12, #54 sli v4.2d, v24.2d, #14 ror x12, x8, #58 eor v31.16b, v21.16b, v26.16b ror x8, x11, #61 # Row Mix Base eor v8.16b, v8.16b, v28.16b bic x11, x3, x2 ushr v24.2d, v31.2d, #62 bic x27, x4, x3 ushr v21.2d, v8.2d, #9 bic x28, x1, x5 sli v24.2d, v31.2d, #2 bic x30, x2, x1 sli v21.2d, v8.2d, #55 eor x1, x1, x11 eor v31.16b, v16.16b, v26.16b eor x2, x2, x27 eor v5.16b, v5.16b, v25.16b 
bic x11, x5, x4 ushr v8.2d, v31.2d, #19 eor x4, x4, x28 ushr v16.2d, v5.2d, #28 eor x3, x3, x11 sli v8.2d, v31.2d, #45 eor x5, x5, x30 sli v16.2d, v5.2d, #36 bic x11, x8, x7 eor v31.16b, v3.16b, v28.16b bic x27, x9, x8 eor v18.16b, v18.16b, v28.16b bic x28, x6, x10 ushr v5.2d, v31.2d, #36 bic x30, x7, x6 ushr v3.2d, v18.2d, #43 eor x6, x6, x11 sli v5.2d, v31.2d, #28 eor x7, x7, x27 sli v3.2d, v18.2d, #21 bic x11, x10, x9 eor v31.16b, v17.16b, v27.16b eor x9, x9, x28 eor v11.16b, v11.16b, v26.16b eor x8, x8, x11 ushr v18.2d, v31.2d, #49 eor x10, x10, x30 ushr v17.2d, v11.2d, #54 bic x11, x13, x12 sli v18.2d, v31.2d, #15 bic x27, x14, x13 sli v17.2d, v11.2d, #10 bic x28, x0, x15 eor v31.16b, v7.16b, v27.16b bic x30, x12, x0 eor v10.16b, v10.16b, v25.16b eor x11, x0, x11 ushr v11.2d, v31.2d, #58 eor x12, x12, x27 ushr v7.2d, v10.2d, #61 bic x0, x15, x14 sli v11.2d, v31.2d, #6 eor x14, x14, x28 sli v7.2d, v10.2d, #3 eor x13, x13, x0 # Row Mix NEON bic v25.16b, v2.16b, v1.16b eor x15, x15, x30 bic v26.16b, v3.16b, v2.16b bic x0, x19, x17 bic v27.16b, v4.16b, v3.16b bic x27, x20, x19 bic v28.16b, v0.16b, v4.16b bic x28, x16, x21 bic v29.16b, v1.16b, v0.16b bic x30, x17, x16 eor v0.16b, v0.16b, v25.16b eor x16, x16, x0 eor v1.16b, v1.16b, v26.16b eor x17, x17, x27 eor v2.16b, v2.16b, v27.16b bic x0, x21, x20 eor v3.16b, v3.16b, v28.16b eor x20, x20, x28 eor v4.16b, v4.16b, v29.16b eor x19, x19, x0 bic v25.16b, v7.16b, v6.16b eor x21, x21, x30 bic v26.16b, v8.16b, v7.16b bic x0, x24, x23 bic v27.16b, v9.16b, v8.16b bic x27, x25, x24 bic v28.16b, v5.16b, v9.16b bic x28, x22, x26 bic v29.16b, v6.16b, v5.16b bic x30, x23, x22 eor v5.16b, v5.16b, v25.16b eor x22, x22, x0 eor v6.16b, v6.16b, v26.16b eor x23, x23, x27 eor v7.16b, v7.16b, v27.16b bic x0, x26, x25 eor v8.16b, v8.16b, v28.16b eor x25, x25, x28 eor v9.16b, v9.16b, v29.16b eor x24, x24, x0 bic v25.16b, v12.16b, v11.16b eor x26, x26, x30 bic v26.16b, v13.16b, v12.16b bic v27.16b, v14.16b, v13.16b bic v28.16b, v30.16b, 
v14.16b bic v29.16b, v11.16b, v30.16b eor v10.16b, v30.16b, v25.16b eor v11.16b, v11.16b, v26.16b eor v12.16b, v12.16b, v27.16b eor v13.16b, v13.16b, v28.16b eor v14.16b, v14.16b, v29.16b bic v25.16b, v17.16b, v16.16b bic v26.16b, v18.16b, v17.16b bic v27.16b, v19.16b, v18.16b bic v28.16b, v15.16b, v19.16b bic v29.16b, v16.16b, v15.16b eor v15.16b, v15.16b, v25.16b eor v16.16b, v16.16b, v26.16b eor v17.16b, v17.16b, v27.16b eor v18.16b, v18.16b, v28.16b eor v19.16b, v19.16b, v29.16b bic v25.16b, v22.16b, v21.16b bic v26.16b, v23.16b, v22.16b bic v27.16b, v24.16b, v23.16b bic v28.16b, v20.16b, v24.16b bic v29.16b, v21.16b, v20.16b eor v20.16b, v20.16b, v25.16b eor v21.16b, v21.16b, v26.16b eor v22.16b, v22.16b, v27.16b eor v23.16b, v23.16b, v28.16b eor v24.16b, v24.16b, v29.16b # Done transforming ldp x27, x28, [x29, #48] ldr x0, [x27], #8 subs x28, x28, #1 mov v30.d[0], x0 mov v30.d[1], x0 eor x1, x1, x0 eor v0.16b, v0.16b, v30.16b bne L_SHA3_transform_blocksx3_neon_begin ldr x0, [x29, #40] st4 {v0.d, v1.d, v2.d, v3.d}[0], [x0], #32 st4 {v4.d, v5.d, v6.d, v7.d}[0], [x0], #32 st4 {v8.d, v9.d, v10.d, v11.d}[0], [x0], #32 st4 {v12.d, v13.d, v14.d, v15.d}[0], [x0], #32 st4 {v16.d, v17.d, v18.d, v19.d}[0], [x0], #32 st4 {v20.d, v21.d, v22.d, v23.d}[0], [x0], #32 st1 {v24.d}[0], [x0] add x0, x0, #8 st4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32 st4 {v4.d, v5.d, v6.d, v7.d}[1], [x0], #32 st4 {v8.d, v9.d, v10.d, v11.d}[1], [x0], #32 st4 {v12.d, v13.d, v14.d, v15.d}[1], [x0], #32 st4 {v16.d, v17.d, v18.d, v19.d}[1], [x0], #32 st4 {v20.d, v21.d, v22.d, v23.d}[1], [x0], #32 st1 {v24.d}[1], [x0] add x0, x0, #8 stp x1, x2, [x0] stp x3, x4, [x0, #16] stp x5, x6, [x0, #32] stp x7, x8, [x0, #48] stp x9, x10, [x0, #64] stp x11, x12, [x0, #80] stp x13, x14, [x0, #96] stp x15, x16, [x0, #112] stp x17, x19, [x0, #128] stp x20, x21, [x0, #144] stp x22, x23, [x0, #160] stp x24, x25, [x0, #176] str x26, [x0, #192] ldp x17, x19, [x29, #72] ldp x20, x21, [x29, #88] ldp x22, x23, [x29, #104] 
ldp x24, x25, [x29, #120] ldp x26, x27, [x29, #136] ldr x28, [x29, #152] ldp d8, d9, [x29, #160] ldp d10, d11, [x29, #176] ldp d12, d13, [x29, #192] ldp d14, d15, [x29, #208] ldp x29, x30, [sp], #0xe0 ret #ifndef __APPLE__ .size mlkem_sha3_blocksx3_neon,.-mlkem_sha3_blocksx3_neon #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl mlkem_shake128_blocksx3_seed_neon .type mlkem_shake128_blocksx3_seed_neon,@function .align 2 mlkem_shake128_blocksx3_seed_neon: #else .section __TEXT,__text .globl _mlkem_shake128_blocksx3_seed_neon .p2align 2 _mlkem_shake128_blocksx3_seed_neon: #endif /* __APPLE__ */ stp x29, x30, [sp, #-224]! add x29, sp, #0 stp x17, x19, [x29, #72] stp x20, x21, [x29, #88] stp x22, x23, [x29, #104] stp x24, x25, [x29, #120] stp x26, x27, [x29, #136] str x28, [x29, #152] stp d8, d9, [x29, #160] stp d10, d11, [x29, #176] stp d12, d13, [x29, #192] stp d14, d15, [x29, #208] #ifndef __APPLE__ adrp x28, L_sha3_aarch64_r add x28, x28, :lo12:L_sha3_aarch64_r #else adrp x28, L_sha3_aarch64_r@PAGE add x28, x28, L_sha3_aarch64_r@PAGEOFF #endif /* __APPLE__ */ str x0, [x29, #40] add x0, x0, #32 ld1 {v4.d}[0], [x0] ldp x2, x3, [x1], #16 add x0, x0, #0xc8 ld1 {v4.d}[1], [x0] ldp x4, x5, [x1], #16 ldr x6, [x0, #200] eor v5.16b, v5.16b, v5.16b eor x7, x7, x7 eor v6.16b, v6.16b, v6.16b eor x8, x8, x8 eor v7.16b, v7.16b, v7.16b eor x9, x9, x9 eor v8.16b, v8.16b, v8.16b eor x10, x10, x10 eor v9.16b, v9.16b, v9.16b eor x11, x11, x11 eor v10.16b, v10.16b, v10.16b eor x12, x12, x12 eor v11.16b, v11.16b, v11.16b eor x13, x13, x13 eor v12.16b, v12.16b, v12.16b eor x14, x14, x14 eor v13.16b, v13.16b, v13.16b eor x15, x15, x15 eor v14.16b, v14.16b, v14.16b eor x16, x16, x16 eor v15.16b, v15.16b, v15.16b eor x17, x17, x17 eor v16.16b, v16.16b, v16.16b eor x19, x19, x19 eor v17.16b, v17.16b, v17.16b eor x20, x20, x20 eor v18.16b, v18.16b, v18.16b eor x21, x21, x21 eor v19.16b, v19.16b, v19.16b eor x22, x22, x22 movz x23, #0x8000, lsl 48 eor v21.16b, v21.16b, v21.16b eor x24, 
x24, x24 eor v22.16b, v22.16b, v22.16b eor x25, x25, x25 eor v23.16b, v23.16b, v23.16b eor x26, x26, x26 eor v24.16b, v24.16b, v24.16b eor x27, x27, x27 dup v0.2d, x2 dup v1.2d, x3 dup v2.2d, x4 dup v3.2d, x5 dup v20.2d, x23 mov x1, #24 # Start of 24 rounds L_SHA3_shake128_blocksx3_seed_neon_begin: stp x28, x1, [x29, #48] # Col Mix NEON eor v30.16b, v4.16b, v9.16b eor x0, x6, x11 eor v27.16b, v1.16b, v6.16b eor x30, x2, x7 eor v30.16b, v30.16b, v14.16b eor x28, x4, x9 eor v27.16b, v27.16b, v11.16b eor x0, x0, x16 eor v30.16b, v30.16b, v19.16b eor x30, x30, x12 eor v27.16b, v27.16b, v16.16b eor x28, x28, x14 eor v30.16b, v30.16b, v24.16b eor x0, x0, x22 eor v27.16b, v27.16b, v21.16b eor x30, x30, x17 ushr v25.2d, v27.2d, #63 eor x28, x28, x20 sli v25.2d, v27.2d, #1 eor x0, x0, x27 eor v25.16b, v25.16b, v30.16b eor x30, x30, x23 eor v31.16b, v0.16b, v5.16b eor x28, x28, x25 eor v28.16b, v2.16b, v7.16b str x0, [x29, #32] eor v31.16b, v31.16b, v10.16b str x28, [x29, #24] eor v28.16b, v28.16b, v12.16b eor x1, x3, x8 eor v31.16b, v31.16b, v15.16b eor x28, x5, x10 eor v28.16b, v28.16b, v17.16b eor x1, x1, x13 eor v31.16b, v31.16b, v20.16b eor x28, x28, x15 eor v28.16b, v28.16b, v22.16b eor x1, x1, x19 ushr v29.2d, v30.2d, #63 eor x28, x28, x21 ushr v26.2d, v28.2d, #63 eor x1, x1, x24 sli v29.2d, v30.2d, #1 eor x28, x28, x26 sli v26.2d, v28.2d, #1 eor x0, x0, x1, ror 63 eor v28.16b, v28.16b, v29.16b eor x1, x1, x28, ror 63 eor v29.16b, v3.16b, v8.16b eor x2, x2, x0 eor v26.16b, v26.16b, v31.16b eor x7, x7, x0 eor v29.16b, v29.16b, v13.16b eor x12, x12, x0 eor v29.16b, v29.16b, v18.16b eor x17, x17, x0 eor v29.16b, v29.16b, v23.16b eor x23, x23, x0 ushr v30.2d, v29.2d, #63 eor x4, x4, x1 sli v30.2d, v29.2d, #1 eor x9, x9, x1 eor v27.16b, v27.16b, v30.16b eor x14, x14, x1 ushr v30.2d, v31.2d, #63 eor x20, x20, x1 sli v30.2d, v31.2d, #1 eor x25, x25, x1 eor v29.16b, v29.16b, v30.16b ldr x0, [x29, #32] # Swap Rotate NEON eor v0.16b, v0.16b, v25.16b eor v31.16b, v1.16b, v26.16b 
ldr x1, [x29, #24] eor v6.16b, v6.16b, v26.16b eor x28, x28, x30, ror 63 ushr v30.2d, v31.2d, #63 eor x30, x30, x1, ror 63 ushr v1.2d, v6.2d, #20 eor x1, x1, x0, ror 63 sli v30.2d, v31.2d, #1 eor x6, x6, x28 sli v1.2d, v6.2d, #44 eor x11, x11, x28 eor v31.16b, v9.16b, v29.16b eor x16, x16, x28 eor v22.16b, v22.16b, v27.16b eor x22, x22, x28 ushr v6.2d, v31.2d, #44 eor x27, x27, x28 ushr v9.2d, v22.2d, #3 eor x3, x3, x30 sli v6.2d, v31.2d, #20 eor x8, x8, x30 sli v9.2d, v22.2d, #61 eor x13, x13, x30 eor v31.16b, v14.16b, v29.16b eor x19, x19, x30 eor v20.16b, v20.16b, v25.16b eor x24, x24, x30 ushr v22.2d, v31.2d, #25 eor x5, x5, x1 ushr v14.2d, v20.2d, #46 eor x10, x10, x1 sli v22.2d, v31.2d, #39 eor x15, x15, x1 sli v14.2d, v20.2d, #18 eor x21, x21, x1 eor v31.16b, v2.16b, v27.16b eor x26, x26, x1 # Swap Rotate Base eor v12.16b, v12.16b, v27.16b ror x0, x3, #63 ushr v20.2d, v31.2d, #2 ror x3, x8, #20 ushr v2.2d, v12.2d, #21 ror x8, x11, #44 sli v20.2d, v31.2d, #62 ror x11, x25, #3 sli v2.2d, v12.2d, #43 ror x25, x16, #25 eor v31.16b, v13.16b, v28.16b ror x16, x23, #46 eor v19.16b, v19.16b, v29.16b ror x23, x4, #2 ushr v12.2d, v31.2d, #39 ror x4, x14, #21 ushr v13.2d, v19.2d, #56 ror x14, x15, #39 sli v12.2d, v31.2d, #25 ror x15, x22, #56 sli v13.2d, v19.2d, #8 ror x22, x26, #8 eor v31.16b, v23.16b, v28.16b ror x26, x17, #23 eor v15.16b, v15.16b, v25.16b ror x17, x6, #37 ushr v19.2d, v31.2d, #8 ror x6, x27, #50 ushr v23.2d, v15.2d, #23 ror x27, x24, #62 sli v19.2d, v31.2d, #56 ror x24, x10, #9 sli v23.2d, v15.2d, #41 ror x10, x19, #19 eor v31.16b, v4.16b, v29.16b ror x19, x7, #28 eor v24.16b, v24.16b, v29.16b ror x7, x5, #36 ushr v15.2d, v31.2d, #37 ror x5, x21, #43 ushr v4.2d, v24.2d, #50 ror x21, x20, #49 sli v15.2d, v31.2d, #27 ror x20, x13, #54 sli v4.2d, v24.2d, #14 ror x13, x9, #58 eor v31.16b, v21.16b, v26.16b ror x9, x12, #61 # Row Mix Base eor v8.16b, v8.16b, v28.16b bic x12, x4, x3 ushr v24.2d, v31.2d, #62 bic x1, x5, x4 ushr v21.2d, v8.2d, #9 bic x28, 
x2, x6 sli v24.2d, v31.2d, #2 bic x30, x3, x2 sli v21.2d, v8.2d, #55 eor x2, x2, x12 eor v31.16b, v16.16b, v26.16b eor x3, x3, x1 eor v5.16b, v5.16b, v25.16b bic x12, x6, x5 ushr v8.2d, v31.2d, #19 eor x5, x5, x28 ushr v16.2d, v5.2d, #28 eor x4, x4, x12 sli v8.2d, v31.2d, #45 eor x6, x6, x30 sli v16.2d, v5.2d, #36 bic x12, x9, x8 eor v31.16b, v3.16b, v28.16b bic x1, x10, x9 eor v18.16b, v18.16b, v28.16b bic x28, x7, x11 ushr v5.2d, v31.2d, #36 bic x30, x8, x7 ushr v3.2d, v18.2d, #43 eor x7, x7, x12 sli v5.2d, v31.2d, #28 eor x8, x8, x1 sli v3.2d, v18.2d, #21 bic x12, x11, x10 eor v31.16b, v17.16b, v27.16b eor x10, x10, x28 eor v11.16b, v11.16b, v26.16b eor x9, x9, x12 ushr v18.2d, v31.2d, #49 eor x11, x11, x30 ushr v17.2d, v11.2d, #54 bic x12, x14, x13 sli v18.2d, v31.2d, #15 bic x1, x15, x14 sli v17.2d, v11.2d, #10 bic x28, x0, x16 eor v31.16b, v7.16b, v27.16b bic x30, x13, x0 eor v10.16b, v10.16b, v25.16b eor x12, x0, x12 ushr v11.2d, v31.2d, #58 eor x13, x13, x1 ushr v7.2d, v10.2d, #61 bic x0, x16, x15 sli v11.2d, v31.2d, #6 eor x15, x15, x28 sli v7.2d, v10.2d, #3 eor x14, x14, x0 # Row Mix NEON bic v25.16b, v2.16b, v1.16b eor x16, x16, x30 bic v26.16b, v3.16b, v2.16b bic x0, x20, x19 bic v27.16b, v4.16b, v3.16b bic x1, x21, x20 bic v28.16b, v0.16b, v4.16b bic x28, x17, x22 bic v29.16b, v1.16b, v0.16b bic x30, x19, x17 eor v0.16b, v0.16b, v25.16b eor x17, x17, x0 eor v1.16b, v1.16b, v26.16b eor x19, x19, x1 eor v2.16b, v2.16b, v27.16b bic x0, x22, x21 eor v3.16b, v3.16b, v28.16b eor x21, x21, x28 eor v4.16b, v4.16b, v29.16b eor x20, x20, x0 bic v25.16b, v7.16b, v6.16b eor x22, x22, x30 bic v26.16b, v8.16b, v7.16b bic x0, x25, x24 bic v27.16b, v9.16b, v8.16b bic x1, x26, x25 bic v28.16b, v5.16b, v9.16b bic x28, x23, x27 bic v29.16b, v6.16b, v5.16b bic x30, x24, x23 eor v5.16b, v5.16b, v25.16b eor x23, x23, x0 eor v6.16b, v6.16b, v26.16b eor x24, x24, x1 eor v7.16b, v7.16b, v27.16b bic x0, x27, x26 eor v8.16b, v8.16b, v28.16b eor x26, x26, x28 eor v9.16b, v9.16b, 
v29.16b eor x25, x25, x0 bic v25.16b, v12.16b, v11.16b eor x27, x27, x30 bic v26.16b, v13.16b, v12.16b bic v27.16b, v14.16b, v13.16b bic v28.16b, v30.16b, v14.16b bic v29.16b, v11.16b, v30.16b eor v10.16b, v30.16b, v25.16b eor v11.16b, v11.16b, v26.16b eor v12.16b, v12.16b, v27.16b eor v13.16b, v13.16b, v28.16b eor v14.16b, v14.16b, v29.16b bic v25.16b, v17.16b, v16.16b bic v26.16b, v18.16b, v17.16b bic v27.16b, v19.16b, v18.16b bic v28.16b, v15.16b, v19.16b bic v29.16b, v16.16b, v15.16b eor v15.16b, v15.16b, v25.16b eor v16.16b, v16.16b, v26.16b eor v17.16b, v17.16b, v27.16b eor v18.16b, v18.16b, v28.16b eor v19.16b, v19.16b, v29.16b bic v25.16b, v22.16b, v21.16b bic v26.16b, v23.16b, v22.16b bic v27.16b, v24.16b, v23.16b bic v28.16b, v20.16b, v24.16b bic v29.16b, v21.16b, v20.16b eor v20.16b, v20.16b, v25.16b eor v21.16b, v21.16b, v26.16b eor v22.16b, v22.16b, v27.16b eor v23.16b, v23.16b, v28.16b eor v24.16b, v24.16b, v29.16b # Done transforming ldp x28, x1, [x29, #48] ldr x0, [x28], #8 subs x1, x1, #1 mov v30.d[0], x0 mov v30.d[1], x0 eor x2, x2, x0 eor v0.16b, v0.16b, v30.16b bne L_SHA3_shake128_blocksx3_seed_neon_begin ldr x0, [x29, #40] st4 {v0.d, v1.d, v2.d, v3.d}[0], [x0], #32 st4 {v4.d, v5.d, v6.d, v7.d}[0], [x0], #32 st4 {v8.d, v9.d, v10.d, v11.d}[0], [x0], #32 st4 {v12.d, v13.d, v14.d, v15.d}[0], [x0], #32 st4 {v16.d, v17.d, v18.d, v19.d}[0], [x0], #32 st4 {v20.d, v21.d, v22.d, v23.d}[0], [x0], #32 st1 {v24.d}[0], [x0] add x0, x0, #8 st4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32 st4 {v4.d, v5.d, v6.d, v7.d}[1], [x0], #32 st4 {v8.d, v9.d, v10.d, v11.d}[1], [x0], #32 st4 {v12.d, v13.d, v14.d, v15.d}[1], [x0], #32 st4 {v16.d, v17.d, v18.d, v19.d}[1], [x0], #32 st4 {v20.d, v21.d, v22.d, v23.d}[1], [x0], #32 st1 {v24.d}[1], [x0] add x0, x0, #8 stp x2, x3, [x0] stp x4, x5, [x0, #16] stp x6, x7, [x0, #32] stp x8, x9, [x0, #48] stp x10, x11, [x0, #64] stp x12, x13, [x0, #80] stp x14, x15, [x0, #96] stp x16, x17, [x0, #112] stp x19, x20, [x0, #128] stp x21, x22, 
[x0, #144] stp x23, x24, [x0, #160] stp x25, x26, [x0, #176] str x27, [x0, #192] ldp x17, x19, [x29, #72] ldp x20, x21, [x29, #88] ldp x22, x23, [x29, #104] ldp x24, x25, [x29, #120] ldp x26, x27, [x29, #136] ldr x28, [x29, #152] ldp d8, d9, [x29, #160] ldp d10, d11, [x29, #176] ldp d12, d13, [x29, #192] ldp d14, d15, [x29, #208] ldp x29, x30, [sp], #0xe0 ret #ifndef __APPLE__ .size mlkem_shake128_blocksx3_seed_neon,.-mlkem_shake128_blocksx3_seed_neon #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl mlkem_shake256_blocksx3_seed_neon .type mlkem_shake256_blocksx3_seed_neon,@function .align 2 mlkem_shake256_blocksx3_seed_neon: #else .section __TEXT,__text .globl _mlkem_shake256_blocksx3_seed_neon .p2align 2 _mlkem_shake256_blocksx3_seed_neon: #endif /* __APPLE__ */ stp x29, x30, [sp, #-224]! add x29, sp, #0 stp x17, x19, [x29, #72] stp x20, x21, [x29, #88] stp x22, x23, [x29, #104] stp x24, x25, [x29, #120] stp x26, x27, [x29, #136] str x28, [x29, #152] stp d8, d9, [x29, #160] stp d10, d11, [x29, #176] stp d12, d13, [x29, #192] stp d14, d15, [x29, #208] #ifndef __APPLE__ adrp x28, L_sha3_aarch64_r add x28, x28, :lo12:L_sha3_aarch64_r #else adrp x28, L_sha3_aarch64_r@PAGE add x28, x28, L_sha3_aarch64_r@PAGEOFF #endif /* __APPLE__ */ str x0, [x29, #40] add x0, x0, #32 ld1 {v4.d}[0], [x0] ldp x2, x3, [x1], #16 add x0, x0, #0xc8 ld1 {v4.d}[1], [x0] ldp x4, x5, [x1], #16 ldr x6, [x0, #200] eor v5.16b, v5.16b, v5.16b eor x7, x7, x7 eor v6.16b, v6.16b, v6.16b eor x8, x8, x8 eor v7.16b, v7.16b, v7.16b eor x9, x9, x9 eor v8.16b, v8.16b, v8.16b eor x10, x10, x10 eor v9.16b, v9.16b, v9.16b eor x11, x11, x11 eor v10.16b, v10.16b, v10.16b eor x12, x12, x12 eor v11.16b, v11.16b, v11.16b eor x13, x13, x13 eor v12.16b, v12.16b, v12.16b eor x14, x14, x14 eor v13.16b, v13.16b, v13.16b eor x15, x15, x15 eor v14.16b, v14.16b, v14.16b eor x16, x16, x16 eor v15.16b, v15.16b, v15.16b eor x17, x17, x17 movz x19, #0x8000, lsl 48 eor v17.16b, v17.16b, v17.16b eor x20, x20, x20 eor 
v18.16b, v18.16b, v18.16b eor x21, x21, x21 eor v19.16b, v19.16b, v19.16b eor x22, x22, x22 eor v20.16b, v20.16b, v20.16b eor x23, x23, x23 eor v21.16b, v21.16b, v21.16b eor x24, x24, x24 eor v22.16b, v22.16b, v22.16b eor x25, x25, x25 eor v23.16b, v23.16b, v23.16b eor x26, x26, x26 eor v24.16b, v24.16b, v24.16b eor x27, x27, x27 dup v0.2d, x2 dup v1.2d, x3 dup v2.2d, x4 dup v3.2d, x5 dup v16.2d, x19 mov x1, #24 # Start of 24 rounds L_SHA3_shake256_blocksx3_seed_neon_begin: stp x28, x1, [x29, #48] # Col Mix NEON eor v30.16b, v4.16b, v9.16b eor x0, x6, x11 eor v27.16b, v1.16b, v6.16b eor x30, x2, x7 eor v30.16b, v30.16b, v14.16b eor x28, x4, x9 eor v27.16b, v27.16b, v11.16b eor x0, x0, x16 eor v30.16b, v30.16b, v19.16b eor x30, x30, x12 eor v27.16b, v27.16b, v16.16b eor x28, x28, x14 eor v30.16b, v30.16b, v24.16b eor x0, x0, x22 eor v27.16b, v27.16b, v21.16b eor x30, x30, x17 ushr v25.2d, v27.2d, #63 eor x28, x28, x20 sli v25.2d, v27.2d, #1 eor x0, x0, x27 eor v25.16b, v25.16b, v30.16b eor x30, x30, x23 eor v31.16b, v0.16b, v5.16b eor x28, x28, x25 eor v28.16b, v2.16b, v7.16b str x0, [x29, #32] eor v31.16b, v31.16b, v10.16b str x28, [x29, #24] eor v28.16b, v28.16b, v12.16b eor x1, x3, x8 eor v31.16b, v31.16b, v15.16b eor x28, x5, x10 eor v28.16b, v28.16b, v17.16b eor x1, x1, x13 eor v31.16b, v31.16b, v20.16b eor x28, x28, x15 eor v28.16b, v28.16b, v22.16b eor x1, x1, x19 ushr v29.2d, v30.2d, #63 eor x28, x28, x21 ushr v26.2d, v28.2d, #63 eor x1, x1, x24 sli v29.2d, v30.2d, #1 eor x28, x28, x26 sli v26.2d, v28.2d, #1 eor x0, x0, x1, ror 63 eor v28.16b, v28.16b, v29.16b eor x1, x1, x28, ror 63 eor v29.16b, v3.16b, v8.16b eor x2, x2, x0 eor v26.16b, v26.16b, v31.16b eor x7, x7, x0 eor v29.16b, v29.16b, v13.16b eor x12, x12, x0 eor v29.16b, v29.16b, v18.16b eor x17, x17, x0 eor v29.16b, v29.16b, v23.16b eor x23, x23, x0 ushr v30.2d, v29.2d, #63 eor x4, x4, x1 sli v30.2d, v29.2d, #1 eor x9, x9, x1 eor v27.16b, v27.16b, v30.16b eor x14, x14, x1 ushr v30.2d, v31.2d, #63 
eor x20, x20, x1 sli v30.2d, v31.2d, #1 eor x25, x25, x1 eor v29.16b, v29.16b, v30.16b ldr x0, [x29, #32] # Swap Rotate NEON eor v0.16b, v0.16b, v25.16b eor v31.16b, v1.16b, v26.16b ldr x1, [x29, #24] eor v6.16b, v6.16b, v26.16b eor x28, x28, x30, ror 63 ushr v30.2d, v31.2d, #63 eor x30, x30, x1, ror 63 ushr v1.2d, v6.2d, #20 eor x1, x1, x0, ror 63 sli v30.2d, v31.2d, #1 eor x6, x6, x28 sli v1.2d, v6.2d, #44 eor x11, x11, x28 eor v31.16b, v9.16b, v29.16b eor x16, x16, x28 eor v22.16b, v22.16b, v27.16b eor x22, x22, x28 ushr v6.2d, v31.2d, #44 eor x27, x27, x28 ushr v9.2d, v22.2d, #3 eor x3, x3, x30 sli v6.2d, v31.2d, #20 eor x8, x8, x30 sli v9.2d, v22.2d, #61 eor x13, x13, x30 eor v31.16b, v14.16b, v29.16b eor x19, x19, x30 eor v20.16b, v20.16b, v25.16b eor x24, x24, x30 ushr v22.2d, v31.2d, #25 eor x5, x5, x1 ushr v14.2d, v20.2d, #46 eor x10, x10, x1 sli v22.2d, v31.2d, #39 eor x15, x15, x1 sli v14.2d, v20.2d, #18 eor x21, x21, x1 eor v31.16b, v2.16b, v27.16b eor x26, x26, x1 # Swap Rotate Base eor v12.16b, v12.16b, v27.16b ror x0, x3, #63 ushr v20.2d, v31.2d, #2 ror x3, x8, #20 ushr v2.2d, v12.2d, #21 ror x8, x11, #44 sli v20.2d, v31.2d, #62 ror x11, x25, #3 sli v2.2d, v12.2d, #43 ror x25, x16, #25 eor v31.16b, v13.16b, v28.16b ror x16, x23, #46 eor v19.16b, v19.16b, v29.16b ror x23, x4, #2 ushr v12.2d, v31.2d, #39 ror x4, x14, #21 ushr v13.2d, v19.2d, #56 ror x14, x15, #39 sli v12.2d, v31.2d, #25 ror x15, x22, #56 sli v13.2d, v19.2d, #8 ror x22, x26, #8 eor v31.16b, v23.16b, v28.16b ror x26, x17, #23 eor v15.16b, v15.16b, v25.16b ror x17, x6, #37 ushr v19.2d, v31.2d, #8 ror x6, x27, #50 ushr v23.2d, v15.2d, #23 ror x27, x24, #62 sli v19.2d, v31.2d, #56 ror x24, x10, #9 sli v23.2d, v15.2d, #41 ror x10, x19, #19 eor v31.16b, v4.16b, v29.16b ror x19, x7, #28 eor v24.16b, v24.16b, v29.16b ror x7, x5, #36 ushr v15.2d, v31.2d, #37 ror x5, x21, #43 ushr v4.2d, v24.2d, #50 ror x21, x20, #49 sli v15.2d, v31.2d, #27 ror x20, x13, #54 sli v4.2d, v24.2d, #14 ror x13, x9, 
#58 eor v31.16b, v21.16b, v26.16b ror x9, x12, #61 # Row Mix Base eor v8.16b, v8.16b, v28.16b bic x12, x4, x3 ushr v24.2d, v31.2d, #62 bic x1, x5, x4 ushr v21.2d, v8.2d, #9 bic x28, x2, x6 sli v24.2d, v31.2d, #2 bic x30, x3, x2 sli v21.2d, v8.2d, #55 eor x2, x2, x12 eor v31.16b, v16.16b, v26.16b eor x3, x3, x1 eor v5.16b, v5.16b, v25.16b bic x12, x6, x5 ushr v8.2d, v31.2d, #19 eor x5, x5, x28 ushr v16.2d, v5.2d, #28 eor x4, x4, x12 sli v8.2d, v31.2d, #45 eor x6, x6, x30 sli v16.2d, v5.2d, #36 bic x12, x9, x8 eor v31.16b, v3.16b, v28.16b bic x1, x10, x9 eor v18.16b, v18.16b, v28.16b bic x28, x7, x11 ushr v5.2d, v31.2d, #36 bic x30, x8, x7 ushr v3.2d, v18.2d, #43 eor x7, x7, x12 sli v5.2d, v31.2d, #28 eor x8, x8, x1 sli v3.2d, v18.2d, #21 bic x12, x11, x10 eor v31.16b, v17.16b, v27.16b eor x10, x10, x28 eor v11.16b, v11.16b, v26.16b eor x9, x9, x12 ushr v18.2d, v31.2d, #49 eor x11, x11, x30 ushr v17.2d, v11.2d, #54 bic x12, x14, x13 sli v18.2d, v31.2d, #15 bic x1, x15, x14 sli v17.2d, v11.2d, #10 bic x28, x0, x16 eor v31.16b, v7.16b, v27.16b bic x30, x13, x0 eor v10.16b, v10.16b, v25.16b eor x12, x0, x12 ushr v11.2d, v31.2d, #58 eor x13, x13, x1 ushr v7.2d, v10.2d, #61 bic x0, x16, x15 sli v11.2d, v31.2d, #6 eor x15, x15, x28 sli v7.2d, v10.2d, #3 eor x14, x14, x0 # Row Mix NEON bic v25.16b, v2.16b, v1.16b eor x16, x16, x30 bic v26.16b, v3.16b, v2.16b bic x0, x20, x19 bic v27.16b, v4.16b, v3.16b bic x1, x21, x20 bic v28.16b, v0.16b, v4.16b bic x28, x17, x22 bic v29.16b, v1.16b, v0.16b bic x30, x19, x17 eor v0.16b, v0.16b, v25.16b eor x17, x17, x0 eor v1.16b, v1.16b, v26.16b eor x19, x19, x1 eor v2.16b, v2.16b, v27.16b bic x0, x22, x21 eor v3.16b, v3.16b, v28.16b eor x21, x21, x28 eor v4.16b, v4.16b, v29.16b eor x20, x20, x0 bic v25.16b, v7.16b, v6.16b eor x22, x22, x30 bic v26.16b, v8.16b, v7.16b bic x0, x25, x24 bic v27.16b, v9.16b, v8.16b bic x1, x26, x25 bic v28.16b, v5.16b, v9.16b bic x28, x23, x27 bic v29.16b, v6.16b, v5.16b bic x30, x24, x23 eor v5.16b, v5.16b, 
v25.16b eor x23, x23, x0 eor v6.16b, v6.16b, v26.16b eor x24, x24, x1 eor v7.16b, v7.16b, v27.16b bic x0, x27, x26 eor v8.16b, v8.16b, v28.16b eor x26, x26, x28 eor v9.16b, v9.16b, v29.16b eor x25, x25, x0 bic v25.16b, v12.16b, v11.16b eor x27, x27, x30 bic v26.16b, v13.16b, v12.16b bic v27.16b, v14.16b, v13.16b bic v28.16b, v30.16b, v14.16b bic v29.16b, v11.16b, v30.16b eor v10.16b, v30.16b, v25.16b eor v11.16b, v11.16b, v26.16b eor v12.16b, v12.16b, v27.16b eor v13.16b, v13.16b, v28.16b eor v14.16b, v14.16b, v29.16b bic v25.16b, v17.16b, v16.16b bic v26.16b, v18.16b, v17.16b bic v27.16b, v19.16b, v18.16b bic v28.16b, v15.16b, v19.16b bic v29.16b, v16.16b, v15.16b eor v15.16b, v15.16b, v25.16b eor v16.16b, v16.16b, v26.16b eor v17.16b, v17.16b, v27.16b eor v18.16b, v18.16b, v28.16b eor v19.16b, v19.16b, v29.16b bic v25.16b, v22.16b, v21.16b bic v26.16b, v23.16b, v22.16b bic v27.16b, v24.16b, v23.16b bic v28.16b, v20.16b, v24.16b bic v29.16b, v21.16b, v20.16b eor v20.16b, v20.16b, v25.16b eor v21.16b, v21.16b, v26.16b eor v22.16b, v22.16b, v27.16b eor v23.16b, v23.16b, v28.16b eor v24.16b, v24.16b, v29.16b # Done transforming ldp x28, x1, [x29, #48] ldr x0, [x28], #8 subs x1, x1, #1 mov v30.d[0], x0 mov v30.d[1], x0 eor x2, x2, x0 eor v0.16b, v0.16b, v30.16b bne L_SHA3_shake256_blocksx3_seed_neon_begin ldr x0, [x29, #40] st4 {v0.d, v1.d, v2.d, v3.d}[0], [x0], #32 st4 {v4.d, v5.d, v6.d, v7.d}[0], [x0], #32 st4 {v8.d, v9.d, v10.d, v11.d}[0], [x0], #32 st4 {v12.d, v13.d, v14.d, v15.d}[0], [x0], #32 st4 {v16.d, v17.d, v18.d, v19.d}[0], [x0], #32 st4 {v20.d, v21.d, v22.d, v23.d}[0], [x0], #32 st1 {v24.d}[0], [x0] add x0, x0, #8 st4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32 st4 {v4.d, v5.d, v6.d, v7.d}[1], [x0], #32 st4 {v8.d, v9.d, v10.d, v11.d}[1], [x0], #32 st4 {v12.d, v13.d, v14.d, v15.d}[1], [x0], #32 st4 {v16.d, v17.d, v18.d, v19.d}[1], [x0], #32 st4 {v20.d, v21.d, v22.d, v23.d}[1], [x0], #32 st1 {v24.d}[1], [x0] add x0, x0, #8 stp x2, x3, [x0] stp x4, x5, [x0, #16] 
stp x6, x7, [x0, #32] stp x8, x9, [x0, #48] stp x10, x11, [x0, #64] stp x12, x13, [x0, #80] stp x14, x15, [x0, #96] stp x16, x17, [x0, #112] stp x19, x20, [x0, #128] stp x21, x22, [x0, #144] stp x23, x24, [x0, #160] stp x25, x26, [x0, #176] str x27, [x0, #192] ldp x17, x19, [x29, #72] ldp x20, x21, [x29, #88] ldp x22, x23, [x29, #104] ldp x24, x25, [x29, #120] ldp x26, x27, [x29, #136] ldr x28, [x29, #152] ldp d8, d9, [x29, #160] ldp d10, d11, [x29, #176] ldp d12, d13, [x29, #192] ldp d14, d15, [x29, #208] ldp x29, x30, [sp], #0xe0 ret #ifndef __APPLE__ .size mlkem_shake256_blocksx3_seed_neon,.-mlkem_shake256_blocksx3_seed_neon #endif /* __APPLE__ */ #endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ #endif /* WOLFSSL_HAVE_MLKEM */ #endif /* __aarch64__ */ #endif /* WOLFSSL_ARMASM */ #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif #endif /* !WOLFSSL_ARMASM_INLINE */