/* fe_x25519_asm.S */ /* * Copyright (C) 2006-2026 wolfSSL Inc. * * This file is part of wolfSSL. * * wolfSSL is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3 of the License, or * (at your option) any later version. * * wolfSSL is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA */ #ifdef WOLFSSL_USER_SETTINGS #ifdef WOLFSSL_USER_SETTINGS_ASM /* * user_settings_asm.h is a file generated by the script user_settings_asm.sh. * The script takes in a user_settings.h and produces user_settings_asm.h, which * is a stripped down version of user_settings.h containing only preprocessor * directives. This makes the header safe to include in assembly (.S) files. */ #include "user_settings_asm.h" #else /* * Note: if user_settings.h contains any C code (e.g. a typedef or function * prototype), including it here in an assembly (.S) file will cause an * assembler failure. See user_settings_asm.h above. 
*/
#include "user_settings.h"
#endif /* WOLFSSL_USER_SETTINGS_ASM */
#endif /* WOLFSSL_USER_SETTINGS */

#ifndef HAVE_INTEL_AVX1
#define HAVE_INTEL_AVX1
#endif /* HAVE_INTEL_AVX1 */
#ifndef NO_AVX2_SUPPORT
#ifndef HAVE_INTEL_AVX2
#define HAVE_INTEL_AVX2
#endif /* HAVE_INTEL_AVX2 */
#endif /* NO_AVX2_SUPPORT */

/*
 * fe_init - one-time selection of the implementations used by this file.
 *
 * Takes no arguments. When HAVE_INTEL_AVX2 is defined:
 *   - returns immediately if cpuFlagsSet is already non-zero;
 *   - otherwise calls cpuid_get_flags(), stores the result in intelFlags,
 *     and, when (flags & 0x50) == 0x50, overwrites every *_p dispatch
 *     pointer with the address of the matching *_avx2 routine;
 *   - finally sets cpuFlagsSet to 1.
 * When HAVE_INTEL_AVX2 is not defined the function only returns.
 *
 * NOTE(review): 0x50 is presumably the AVX2 and BMI2 bits of the
 * cpuid_get_flags() mask - confirm against the cpuid header.
 *
 * Clobbers: rax, rdx, flags, plus whatever cpuid_get_flags() clobbers.
 */
#ifndef __APPLE__
.text
.globl fe_init
.type fe_init,@function
.align 16
fe_init:
#else
.section __TEXT,__text
.globl _fe_init
.p2align 4
_fe_init:
#endif /* __APPLE__ */
#ifdef HAVE_INTEL_AVX2
#ifndef __APPLE__
        movq cpuFlagsSet@GOTPCREL(%rip), %rax
        movl (%rax), %eax
#else
        movl _cpuFlagsSet(%rip), %eax
#endif /* __APPLE__ */
        testl %eax, %eax
        je L_fe_init_get_flags
        repz retq
L_fe_init_get_flags:
        /*
         * On entry rsp is 8 modulo 16 (the return address was pushed).
         * The System V ABI requires rsp to be 16-byte aligned at a call,
         * so drop 8 bytes around the call into C code.
         */
        subq $8, %rsp
#ifndef __APPLE__
        callq cpuid_get_flags@plt
#else
        callq _cpuid_get_flags
#endif /* __APPLE__ */
        addq $8, %rsp
#ifndef __APPLE__
        movq intelFlags@GOTPCREL(%rip), %rdx
        movl %eax, (%rdx)
#else
        movl %eax, _intelFlags(%rip)
#endif /* __APPLE__ */
        /* Switch to the *_avx2 routines only when both 0x50 bits are set. */
        andl $0x50, %eax
        cmpl $0x50, %eax
        jne L_fe_init_flags_done
        /* Each stanza below: rax = address of X_avx2, then X_p = rax. */
#ifndef __APPLE__
        movq fe_cmov_table_avx2@GOTPCREL(%rip), %rax
#else
        leaq _fe_cmov_table_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
        movq fe_cmov_table_p@GOTPCREL(%rip), %rdx
        movq %rax, (%rdx)
#else
        movq %rax, _fe_cmov_table_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
        movq fe_mul_avx2@GOTPCREL(%rip), %rax
#else
        leaq _fe_mul_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
        movq fe_mul_p@GOTPCREL(%rip), %rdx
        movq %rax, (%rdx)
#else
        movq %rax, _fe_mul_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
        movq fe_sq_avx2@GOTPCREL(%rip), %rax
#else
        leaq _fe_sq_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
        movq fe_sq_p@GOTPCREL(%rip), %rdx
        movq %rax, (%rdx)
#else
        movq %rax, _fe_sq_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
        movq fe_mul121666_avx2@GOTPCREL(%rip), %rax
#else
        leaq _fe_mul121666_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
        movq fe_mul121666_p@GOTPCREL(%rip), %rdx
        movq %rax, (%rdx)
#else
        movq %rax, _fe_mul121666_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
        movq fe_invert_avx2@GOTPCREL(%rip), %rax
#else
        leaq _fe_invert_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
        movq fe_invert_p@GOTPCREL(%rip), %rdx
        movq %rax, (%rdx)
#else
        movq %rax, _fe_invert_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
        movq curve25519_avx2@GOTPCREL(%rip), %rax
#else
        leaq _curve25519_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
        movq curve25519_p@GOTPCREL(%rip), %rdx
        movq %rax, (%rdx)
#else
        movq %rax, _curve25519_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
        movq fe_pow22523_avx2@GOTPCREL(%rip), %rax
#else
        leaq _fe_pow22523_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
        movq fe_pow22523_p@GOTPCREL(%rip), %rdx
        movq %rax, (%rdx)
#else
        movq %rax, _fe_pow22523_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
        movq ge_p1p1_to_p2_avx2@GOTPCREL(%rip), %rax
#else
        leaq _ge_p1p1_to_p2_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
        movq ge_p1p1_to_p2_p@GOTPCREL(%rip), %rdx
        movq %rax, (%rdx)
#else
        movq %rax, _ge_p1p1_to_p2_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
        movq ge_p1p1_to_p3_avx2@GOTPCREL(%rip), %rax
#else
        leaq _ge_p1p1_to_p3_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
        movq ge_p1p1_to_p3_p@GOTPCREL(%rip), %rdx
        movq %rax, (%rdx)
#else
        movq %rax, _ge_p1p1_to_p3_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
        movq ge_p2_dbl_avx2@GOTPCREL(%rip), %rax
#else
        leaq _ge_p2_dbl_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
        movq ge_p2_dbl_p@GOTPCREL(%rip), %rdx
        movq %rax, (%rdx)
#else
        movq %rax, _ge_p2_dbl_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
        movq ge_madd_avx2@GOTPCREL(%rip), %rax
#else
        leaq _ge_madd_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
        movq ge_madd_p@GOTPCREL(%rip), %rdx
        movq %rax, (%rdx)
#else
        movq %rax, _ge_madd_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
        movq ge_msub_avx2@GOTPCREL(%rip), %rax
#else
        leaq _ge_msub_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
        movq ge_msub_p@GOTPCREL(%rip), %rdx
        movq %rax, (%rdx)
#else
        movq %rax, _ge_msub_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
        movq ge_add_avx2@GOTPCREL(%rip), %rax
#else
        leaq _ge_add_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
        movq ge_add_p@GOTPCREL(%rip), %rdx
        movq %rax, (%rdx)
#else
        movq %rax, _ge_add_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
        movq ge_sub_avx2@GOTPCREL(%rip), %rax
#else
        leaq _ge_sub_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
        movq ge_sub_p@GOTPCREL(%rip), %rdx
        movq %rax, (%rdx)
#else
        movq %rax, _ge_sub_p(%rip)
#endif /* __APPLE__ */
#if defined(WOLFSSL_CURVE25519_NOT_USE_ED25519)
#ifndef __APPLE__
        movq curve25519_base_avx2@GOTPCREL(%rip), %rax
#else
        leaq _curve25519_base_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
        movq curve25519_base_p@GOTPCREL(%rip), %rdx
        movq %rax, (%rdx)
#else
        movq %rax, _curve25519_base_p(%rip)
#endif /* __APPLE__ */
#endif /* WOLFSSL_CURVE25519_NOT_USE_ED25519 */
#ifdef HAVE_ED25519
#ifndef __APPLE__
        movq fe_sq2_avx2@GOTPCREL(%rip), %rax
#else
        leaq _fe_sq2_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
        movq fe_sq2_p@GOTPCREL(%rip), %rdx
        movq %rax, (%rdx)
#else
        movq %rax, _fe_sq2_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
        movq fe_invert_nct_avx2@GOTPCREL(%rip), %rax
#else
        leaq _fe_invert_nct_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
        movq fe_invert_nct_p@GOTPCREL(%rip), %rdx
        movq %rax, (%rdx)
#else
        movq %rax, _fe_invert_nct_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
        movq sc_reduce_avx2@GOTPCREL(%rip), %rax
#else
        leaq _sc_reduce_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
        movq sc_reduce_p@GOTPCREL(%rip), %rdx
        movq %rax, (%rdx)
#else
        movq %rax, _sc_reduce_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
        movq sc_muladd_avx2@GOTPCREL(%rip), %rax
#else
        leaq _sc_muladd_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
        movq sc_muladd_p@GOTPCREL(%rip), %rdx
        movq %rax, (%rdx)
#else
        movq %rax, _sc_muladd_p(%rip)
#endif /* __APPLE__ */
#endif /* HAVE_ED25519 */
L_fe_init_flags_done:
        /* Record that fe_init has run so later calls return early. */
#ifndef __APPLE__
        movq cpuFlagsSet@GOTPCREL(%rip), %rdx
        movl $0x1, (%rdx)
#else
        movl $0x1, _cpuFlagsSet(%rip)
#endif /* __APPLE__ */
#endif /* HAVE_INTEL_AVX2 */
        repz retq
#ifndef __APPLE__
.size fe_init,.-fe_init
#endif /* __APPLE__ */
/*
 * Conventions for the small helpers below: a field element is four 64-bit
 * little-endian words (word 0 at offset 0). Arguments arrive in rdi, rsi,
 * rdx (System V order). Only caller-saved registers are used.
 */
/*
 * fe_frombytes - rdi: output, rsi: 32 input bytes.
 * Copies the four words and clears bit 63 of the top word (bit 255).
 */
#ifndef __APPLE__
.text
.globl fe_frombytes
.type fe_frombytes,@function
.align 16
fe_frombytes:
#else
.section __TEXT,__text
.globl _fe_frombytes
.p2align 4
_fe_frombytes:
#endif /* __APPLE__ */
        movq $0x7fffffffffffffff, %r9
        movq (%rsi), %rdx
        movq 8(%rsi), %rax
        movq 16(%rsi), %rcx
        movq 24(%rsi), %r8
        andq %r9, %r8
        movq %rdx, (%rdi)
        movq %rax, 8(%rdi)
        movq %rcx, 16(%rdi)
        movq %r8, 24(%rdi)
        repz retq
#ifndef __APPLE__
.size fe_frombytes,.-fe_frombytes
#endif /* __APPLE__ */
/*
 * fe_tobytes - rdi: 32 output bytes, rsi: input element.
 * Adds 19 to a scratch copy and takes bit 255 of that sum as a flag, then
 * stores (input + 19 * flag) with bit 255 cleared.
 */
#ifndef __APPLE__
.text
.globl fe_tobytes
.type fe_tobytes,@function
.align 16
fe_tobytes:
#else
.section __TEXT,__text
.globl _fe_tobytes
.p2align 4
_fe_tobytes:
#endif /* __APPLE__ */
        movq $0x7fffffffffffffff, %r10
        movq (%rsi), %rdx
        movq 8(%rsi), %rax
        movq 16(%rsi), %rcx
        movq 24(%rsi), %r8
        /* Trial add of 19; only bit 255 of the result is kept. */
        addq $19, %rdx
        adcq $0x00, %rax
        adcq $0x00, %rcx
        adcq $0x00, %r8
        shrq $63, %r8
        imulq $19, %r8, %r9
        /* Reload the input and add 0 or 19, then drop bit 255. */
        movq (%rsi), %rdx
        movq 8(%rsi), %rax
        movq 16(%rsi), %rcx
        movq 24(%rsi), %r8
        addq %r9, %rdx
        adcq $0x00, %rax
        adcq $0x00, %rcx
        adcq $0x00, %r8
        andq %r10, %r8
        movq %rdx, (%rdi)
        movq %rax, 8(%rdi)
        movq %rcx, 16(%rdi)
        movq %r8, 24(%rdi)
        repz retq
#ifndef __APPLE__
.size fe_tobytes,.-fe_tobytes
#endif /* __APPLE__ */
/* fe_1 - rdi: output. Stores the value 1 (word 0 = 1, words 1..3 = 0). */
#ifndef __APPLE__
.text
.globl fe_1
.type fe_1,@function
.align 16
fe_1:
#else
.section __TEXT,__text
.globl _fe_1
.p2align 4
_fe_1:
#endif /* __APPLE__ */
        # Set one
        movq $0x01, (%rdi)
        movq $0x00, 8(%rdi)
        movq $0x00, 16(%rdi)
        movq $0x00, 24(%rdi)
        repz retq
#ifndef __APPLE__
.size fe_1,.-fe_1
#endif /* __APPLE__ */
/* fe_0 - rdi: output. Stores the value 0 in all four words. */
#ifndef __APPLE__
.text
.globl fe_0
.type fe_0,@function
.align 16
fe_0:
#else
.section __TEXT,__text
.globl _fe_0
.p2align 4
_fe_0:
#endif /* __APPLE__ */
        # Set zero
        movq $0x00, (%rdi)
        movq $0x00, 8(%rdi)
        movq $0x00, 16(%rdi)
        movq $0x00, 24(%rdi)
        repz retq
#ifndef __APPLE__
.size fe_0,.-fe_0
#endif /* __APPLE__ */
/*
 * fe_copy - rdi: output, rsi: input. Copies four words; all loads happen
 * before the first store.
 */
#ifndef __APPLE__
.text
.globl fe_copy
.type fe_copy,@function
.align 16
fe_copy:
#else
.section __TEXT,__text
.globl _fe_copy
.p2align 4
_fe_copy:
#endif /* __APPLE__ */
        # Copy
        movq (%rsi), %rdx
        movq 8(%rsi), %rax
        movq 16(%rsi), %rcx
        movq 24(%rsi), %r8
        movq %rdx, (%rdi)
        movq %rax, 8(%rdi)
        movq %rcx, 16(%rdi)
        movq %r8, 24(%rdi)
        repz retq
#ifndef __APPLE__
.size fe_copy,.-fe_copy
#endif /* __APPLE__ */
/*
 * fe_sub - rdi: output, rsi: a, rdx: b. Stores a - b.
 * The final borrow and bit 255 of the raw difference are combined into a
 * multiple of 19 that is subtracted after bit 255 has been cleared.
 */
#ifndef __APPLE__
.text
.globl fe_sub
.type fe_sub,@function
.align 16
fe_sub:
#else
.section __TEXT,__text
.globl _fe_sub
.p2align 4
_fe_sub:
#endif /* __APPLE__ */
        # Sub
        movq (%rsi), %rax
        movq 8(%rsi), %rcx
        movq 16(%rsi), %r8
        movq 24(%rsi), %r9
        subq (%rdx), %rax
        sbbq 8(%rdx), %rcx
        sbbq 16(%rdx), %r8
        sbbq 24(%rdx), %r9
        /* r10 = -borrow, then r10 = 2 * r10 + bit 255, then r10 *= -19. */
        sbbq %r10, %r10
        shldq $0x01, %r9, %r10
        imulq $-19, %r10
        btr $63, %r9
        # Add modulus (if underflow)
        subq %r10, %rax
        sbbq $0x00, %rcx
        sbbq $0x00, %r8
        sbbq $0x00, %r9
        movq %rax, (%rdi)
        movq %rcx, 8(%rdi)
        movq %r8, 16(%rdi)
        movq %r9, 24(%rdi)
        repz retq
#ifndef __APPLE__
.size fe_sub,.-fe_sub
#endif /* __APPLE__ */
/*
 * fe_add - rdi: output, rsi: a, rdx: b. Stores a + b.
 * The final carry and bit 255 of the raw sum are combined into a multiple
 * of 19 that is added back after bit 255 has been cleared.
 */
#ifndef __APPLE__
.text
.globl fe_add
.type fe_add,@function
.align 16
fe_add:
#else
.section __TEXT,__text
.globl _fe_add
.p2align 4
_fe_add:
#endif /* __APPLE__ */
        # Add
        movq (%rsi), %rax
        movq 8(%rsi), %rcx
        addq (%rdx), %rax
        movq 16(%rsi), %r8
        adcq 8(%rdx), %rcx
        movq 24(%rsi), %r9
        adcq 16(%rdx), %r8
        adcq 24(%rdx), %r9
        /* movq (not xor) is used so the carry flag survives into adcq. */
        movq $0x00, %r10
        adcq $0x00, %r10
        shldq $0x01, %r9, %r10
        imulq $19, %r10
        btr $63, %r9
        # Sub modulus (if overflow)
        addq %r10, %rax
        adcq $0x00, %rcx
        adcq $0x00, %r8
        adcq $0x00, %r9
        movq %rax, (%rdi)
        movq %rcx, 8(%rdi)
        movq %r8, 16(%rdi)
        movq %r9, 24(%rdi)
        repz retq
#ifndef __APPLE__
.size fe_add,.-fe_add
#endif /* __APPLE__ */
/*
 * fe_neg - rdi: output, rsi: a. Stores (2^255 - 19) - a, computed as a
 * four-word subtraction from the constant words -19, -1, -1, 2^63 - 1.
 */
#ifndef __APPLE__
.text
.globl fe_neg
.type fe_neg,@function
.align 16
fe_neg:
#else
.section __TEXT,__text
.globl _fe_neg
.p2align 4
_fe_neg:
#endif /* __APPLE__ */
        movq $-19, %rdx
        movq $-1, %rax
        movq $-1, %rcx
        movq $0x7fffffffffffffff, %r8
        subq (%rsi), %rdx
        sbbq 8(%rsi), %rax
        sbbq 16(%rsi), %rcx
        sbbq 24(%rsi), %r8
        movq %rdx, (%rdi)
        movq %rax, 8(%rdi)
        movq %rcx, 16(%rdi)
        movq %r8, 24(%rdi)
        repz retq
#ifndef __APPLE__
.size fe_neg,.-fe_neg
#endif /* __APPLE__ */
/*
 * fe_cmov - rdi: f (in/out), rsi: g, edx: selector.
 * Replaces f with g when edx == 1, otherwise rewrites f with its own
 * value. Uses cmove rather than a branch; both operands are always read.
 */
#ifndef __APPLE__
.text
.globl fe_cmov
.type fe_cmov,@function
.align 16
fe_cmov:
#else
.section __TEXT,__text
.globl _fe_cmov
.p2align 4
_fe_cmov:
#endif /* __APPLE__ */
        cmpl $0x01, %edx
        movq (%rdi), %rcx
        movq 8(%rdi), %r8
        movq 16(%rdi), %r9
        movq 24(%rdi), %r10
        cmoveq (%rsi), %rcx
        cmoveq 8(%rsi), %r8
        cmoveq 16(%rsi), %r9
        cmoveq 24(%rsi), %r10
        movq %rcx, (%rdi)
        movq %r8, 8(%rdi)
        movq %r9, 16(%rdi)
        movq %r10, 24(%rdi)
        repz retq
#ifndef __APPLE__
.size fe_cmov,.-fe_cmov
#endif /* __APPLE__ */
/*
 * fe_isnonzero - rdi: input element.
 * Reduces the value the same way fe_tobytes does and returns non-zero in
 * rax when any bit of the reduced value is set, zero otherwise.
 * The result is folded into the low 32 bits so that a caller inspecting
 * only eax gets the same answer.
 * NOTE(review): the C prototype is not visible in this file; the fold is
 * needed if it returns a 32-bit int - confirm.
 */
#ifndef __APPLE__
.text
.globl fe_isnonzero
.type fe_isnonzero,@function
.align 16
fe_isnonzero:
#else
.section __TEXT,__text
.globl _fe_isnonzero
.p2align 4
_fe_isnonzero:
#endif /* __APPLE__ */
        movq $0x7fffffffffffffff, %r10
        movq (%rdi), %rax
        movq 8(%rdi), %rdx
        movq 16(%rdi), %rcx
        movq 24(%rdi), %r8
        addq $19, %rax
        adcq $0x00, %rdx
        adcq $0x00, %rcx
        adcq $0x00, %r8
        shrq $63, %r8
        imulq $19, %r8, %r9
        movq (%rdi), %rax
        movq 8(%rdi), %rdx
        movq 16(%rdi), %rcx
        movq 24(%rdi), %r8
        addq %r9, %rax
        adcq $0x00, %rdx
        adcq $0x00, %rcx
        adcq $0x00, %r8
        andq %r10, %r8
        orq %rdx, %rax
        orq %rcx, %rax
        orq %r8, %rax
        /* Fold bits 32..63 into bits 0..31; writing eax zeroes the top. */
        movq %rax, %rdx
        shrq $32, %rdx
        orl %edx, %eax
        repz retq
#ifndef __APPLE__
.size fe_isnonzero,.-fe_isnonzero
#endif /* __APPLE__ */
/*
 * fe_isnegative - rdi: input element.
 * Returns in rax the least significant bit of (word 0 + 19 * flag), where
 * flag is bit 255 of (input + 19).
 */
#ifndef __APPLE__
.text
.globl fe_isnegative
.type fe_isnegative,@function
.align 16
fe_isnegative:
#else
.section __TEXT,__text
.globl _fe_isnegative
.p2align 4
_fe_isnegative:
#endif /* __APPLE__ */
        movq (%rdi), %rdx
        movq 8(%rdi), %rcx
        movq 16(%rdi), %r8
        movq 24(%rdi), %r9
        movq %rdx, %rax
        addq $19, %rdx
        adcq $0x00, %rcx
        adcq $0x00, %r8
        adcq $0x00, %r9
        shrq $63, %r9
        imulq $19, %r9, %r10
        addq %r10, %rax
        andq $0x01, %rax
        repz retq
#ifndef __APPLE__
.size fe_isnegative,.-fe_isnegative
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_cmov_table
.type fe_cmov_table,@function
.align 16
fe_cmov_table:
#else
.section __TEXT,__text
.globl _fe_cmov_table
.p2align 4
_fe_cmov_table:
#endif /* __APPLE__ */
/*
 * Dispatch thunks. Each public symbol below is a single indirect jump
 * through the matching NAME_p pointer, so arguments and the return address
 * pass through untouched. The pointers are initialised to the NAME_x64
 * routines and fe_init may repoint them at the NAME_avx2 routines.
 */
#ifndef __APPLE__
        jmpq *fe_cmov_table_p(%rip)
#else
        jmpq *_fe_cmov_table_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
.size fe_cmov_table,.-fe_cmov_table
#endif /* __APPLE__ */
/* fe_mul: jump through fe_mul_p. */
#ifndef __APPLE__
.text
.globl fe_mul
.type fe_mul,@function
.align 16
fe_mul:
#else
.section __TEXT,__text
.globl _fe_mul
.p2align 4
_fe_mul:
#endif /* __APPLE__ */
#ifndef __APPLE__
        jmpq *fe_mul_p(%rip)
#else
        jmpq *_fe_mul_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
.size fe_mul,.-fe_mul
#endif /* __APPLE__ */
/* fe_sq: jump through fe_sq_p. */
#ifndef __APPLE__
.text
.globl fe_sq
.type fe_sq,@function
.align 16
fe_sq:
#else
.section __TEXT,__text
.globl _fe_sq
.p2align 4
_fe_sq:
#endif /* __APPLE__ */
#ifndef __APPLE__
        jmpq *fe_sq_p(%rip)
#else
        jmpq *_fe_sq_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
.size fe_sq,.-fe_sq
#endif /* __APPLE__ */
/* fe_mul121666: jump through fe_mul121666_p. */
#ifndef __APPLE__
.text
.globl fe_mul121666
.type fe_mul121666,@function
.align 16
fe_mul121666:
#else
.section __TEXT,__text
.globl _fe_mul121666
.p2align 4
_fe_mul121666:
#endif /* __APPLE__ */
#ifndef __APPLE__
        jmpq *fe_mul121666_p(%rip)
#else
        jmpq *_fe_mul121666_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
.size fe_mul121666,.-fe_mul121666
#endif /* __APPLE__ */
/* fe_invert: jump through fe_invert_p. */
#ifndef __APPLE__
.text
.globl fe_invert
.type fe_invert,@function
.align 16
fe_invert:
#else
.section __TEXT,__text
.globl _fe_invert
.p2align 4
_fe_invert:
#endif /* __APPLE__ */
#ifndef __APPLE__
        jmpq *fe_invert_p(%rip)
#else
        jmpq *_fe_invert_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
.size fe_invert,.-fe_invert
#endif /* __APPLE__ */
/* curve25519: jump through curve25519_p. */
#ifndef __APPLE__
.text
.globl curve25519
.type curve25519,@function
.align 16
curve25519:
#else
.section __TEXT,__text
.globl _curve25519
.p2align 4
_curve25519:
#endif /* __APPLE__ */
#ifndef __APPLE__
        jmpq *curve25519_p(%rip)
#else
        jmpq *_curve25519_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
.size curve25519,.-curve25519
#endif /* __APPLE__ */
/* fe_pow22523: jump through fe_pow22523_p. */
#ifndef __APPLE__
.text
.globl fe_pow22523
.type fe_pow22523,@function
.align 16
fe_pow22523:
#else
.section __TEXT,__text
.globl _fe_pow22523
.p2align 4
_fe_pow22523:
#endif /* __APPLE__ */
#ifndef __APPLE__
        jmpq *fe_pow22523_p(%rip)
#else
        jmpq *_fe_pow22523_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
.size fe_pow22523,.-fe_pow22523
#endif /* __APPLE__ */
/* ge_p1p1_to_p2: jump through ge_p1p1_to_p2_p. */
#ifndef __APPLE__
.text
.globl ge_p1p1_to_p2
.type ge_p1p1_to_p2,@function
.align 16
ge_p1p1_to_p2:
#else
.section __TEXT,__text
.globl _ge_p1p1_to_p2
.p2align 4
_ge_p1p1_to_p2:
#endif /* __APPLE__ */
#ifndef __APPLE__
        jmpq *ge_p1p1_to_p2_p(%rip)
#else
        jmpq *_ge_p1p1_to_p2_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
.size ge_p1p1_to_p2,.-ge_p1p1_to_p2
#endif /* __APPLE__ */
/* ge_p1p1_to_p3: jump through ge_p1p1_to_p3_p. */
#ifndef __APPLE__
.text
.globl ge_p1p1_to_p3
.type ge_p1p1_to_p3,@function
.align 16
ge_p1p1_to_p3:
#else
.section __TEXT,__text
.globl _ge_p1p1_to_p3
.p2align 4
_ge_p1p1_to_p3:
#endif /* __APPLE__ */
#ifndef __APPLE__
        jmpq *ge_p1p1_to_p3_p(%rip)
#else
        jmpq *_ge_p1p1_to_p3_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
.size ge_p1p1_to_p3,.-ge_p1p1_to_p3
#endif /* __APPLE__ */
/* ge_p2_dbl: jump through ge_p2_dbl_p. */
#ifndef __APPLE__
.text
.globl ge_p2_dbl
.type ge_p2_dbl,@function
.align 16
ge_p2_dbl:
#else
.section __TEXT,__text
.globl _ge_p2_dbl
.p2align 4
_ge_p2_dbl:
#endif /* __APPLE__ */
#ifndef __APPLE__
        jmpq *ge_p2_dbl_p(%rip)
#else
        jmpq *_ge_p2_dbl_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
.size ge_p2_dbl,.-ge_p2_dbl
#endif /* __APPLE__ */
/* ge_madd: jump through ge_madd_p. */
#ifndef __APPLE__
.text
.globl ge_madd
.type ge_madd,@function
.align 16
ge_madd:
#else
.section __TEXT,__text
.globl _ge_madd
.p2align 4
_ge_madd:
#endif /* __APPLE__ */
#ifndef __APPLE__
        jmpq *ge_madd_p(%rip)
#else
        jmpq *_ge_madd_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
.size ge_madd,.-ge_madd
#endif /* __APPLE__ */
/* ge_msub: jump through ge_msub_p. */
#ifndef __APPLE__
.text
.globl ge_msub
.type ge_msub,@function
.align 16
ge_msub:
#else
.section __TEXT,__text
.globl _ge_msub
.p2align 4
_ge_msub:
#endif /* __APPLE__ */
/* Each entry point here is one indirect jump through its NAME_p pointer. */
#ifndef __APPLE__
        jmpq *ge_msub_p(%rip)
#else
        jmpq *_ge_msub_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
.size ge_msub,.-ge_msub
#endif /* __APPLE__ */
/* ge_add: jump through ge_add_p. */
#ifndef __APPLE__
.text
.globl ge_add
.type ge_add,@function
.align 16
ge_add:
#else
.section __TEXT,__text
.globl _ge_add
.p2align 4
_ge_add:
#endif /* __APPLE__ */
#ifndef __APPLE__
        jmpq *ge_add_p(%rip)
#else
        jmpq *_ge_add_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
.size ge_add,.-ge_add
#endif /* __APPLE__ */
/* ge_sub: jump through ge_sub_p. */
#ifndef __APPLE__
.text
.globl ge_sub
.type ge_sub,@function
.align 16
ge_sub:
#else
.section __TEXT,__text
.globl _ge_sub
.p2align 4
_ge_sub:
#endif /* __APPLE__ */
#ifndef __APPLE__
        jmpq *ge_sub_p(%rip)
#else
        jmpq *_ge_sub_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
.size ge_sub,.-ge_sub
#endif /* __APPLE__ */
/*
 * curve25519_base: jump through curve25519_base_p. Only assembled when
 * WOLFSSL_CURVE25519_NOT_USE_ED25519 is defined. The guard appears twice
 * in a row; the inner test repeats the outer one.
 */
#if defined(WOLFSSL_CURVE25519_NOT_USE_ED25519)
#if defined(WOLFSSL_CURVE25519_NOT_USE_ED25519)
#ifndef __APPLE__
.text
.globl curve25519_base
.type curve25519_base,@function
.align 16
curve25519_base:
#else
.section __TEXT,__text
.globl _curve25519_base
.p2align 4
_curve25519_base:
#endif /* __APPLE__ */
#ifndef __APPLE__
        jmpq *curve25519_base_p(%rip)
#else
        jmpq *_curve25519_base_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
.size curve25519_base,.-curve25519_base
#endif /* __APPLE__ */
#endif /* WOLFSSL_CURVE25519_NOT_USE_ED25519 */
#endif /* WOLFSSL_CURVE25519_NOT_USE_ED25519 */
/*
 * Entry points from here on are only assembled when HAVE_ED25519 is
 * defined. The first #ifdef is an outer guard around the whole group;
 * each routine then carries its own inner guard as well.
 */
#ifdef HAVE_ED25519
#ifdef HAVE_ED25519
/* fe_sq2: jump through fe_sq2_p. */
#ifndef __APPLE__
.text
.globl fe_sq2
.type fe_sq2,@function
.align 16
fe_sq2:
#else
.section __TEXT,__text
.globl _fe_sq2
.p2align 4
_fe_sq2:
#endif /* __APPLE__ */
#ifndef __APPLE__
        jmpq *fe_sq2_p(%rip)
#else
        jmpq *_fe_sq2_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
.size fe_sq2,.-fe_sq2
#endif /* __APPLE__ */
#endif /* HAVE_ED25519 */
#ifdef HAVE_ED25519
/* fe_invert_nct: jump through fe_invert_nct_p. */
#ifndef __APPLE__
.text
.globl fe_invert_nct
.type fe_invert_nct,@function
.align 16
fe_invert_nct:
#else
.section __TEXT,__text
.globl _fe_invert_nct
.p2align 4
_fe_invert_nct:
#endif /* __APPLE__ */
#ifndef __APPLE__
        jmpq *fe_invert_nct_p(%rip)
#else
        jmpq *_fe_invert_nct_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
.size fe_invert_nct,.-fe_invert_nct
#endif /* __APPLE__ */
#endif /* HAVE_ED25519 */
#ifdef HAVE_ED25519
/* sc_reduce: one indirect jump through sc_reduce_p. */
#ifndef __APPLE__
.text
.globl sc_reduce
.type sc_reduce,@function
.align 16
sc_reduce:
#else
.section __TEXT,__text
.globl _sc_reduce
.p2align 4
_sc_reduce:
#endif /* __APPLE__ */
#ifndef __APPLE__
        jmpq *sc_reduce_p(%rip)
#else
        jmpq *_sc_reduce_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
.size sc_reduce,.-sc_reduce
#endif /* __APPLE__ */
#endif /* HAVE_ED25519 */
#ifdef HAVE_ED25519
/* sc_muladd: one indirect jump through sc_muladd_p. */
#ifndef __APPLE__
.text
.globl sc_muladd
.type sc_muladd,@function
.align 16
sc_muladd:
#else
.section __TEXT,__text
.globl _sc_muladd
.p2align 4
_sc_muladd:
#endif /* __APPLE__ */
#ifndef __APPLE__
        jmpq *sc_muladd_p(%rip)
#else
        jmpq *_sc_muladd_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
.size sc_muladd,.-sc_muladd
#endif /* __APPLE__ */
#endif /* HAVE_ED25519 */
#endif /* HAVE_ED25519 */
/*
 * cpuFlagsSet: 32-bit flag, initially 0. fe_init tests it on entry and
 * stores 1 into it before returning.
 */
#ifndef __APPLE__
.data
.type cpuFlagsSet, @object
.size cpuFlagsSet,4
cpuFlagsSet:
        .long 0
#else
.section __DATA,__data
.p2align 3
_cpuFlagsSet:
        .long 0
#endif /* __APPLE__ */
/* intelFlags: 32-bit copy of the value returned by cpuid_get_flags(). */
#ifndef __APPLE__
.data
.type intelFlags, @object
.size intelFlags,4
intelFlags:
        .long 0
#else
.section __DATA,__data
.p2align 3
_intelFlags:
        .long 0
#endif /* __APPLE__ */
/*
 * Dispatch pointers. Each NAME_p is an 8-byte code address initialised to
 * NAME_x64 and is the target of the indirect jump in the NAME entry point.
 * They live in writable data because fe_init may overwrite them.
 */
#ifndef __APPLE__
.data
.type fe_cmov_table_p, @object
.size fe_cmov_table_p,8
fe_cmov_table_p:
        .quad fe_cmov_table_x64
#else
.section __DATA,__data
.p2align 3
_fe_cmov_table_p:
        .quad _fe_cmov_table_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.data
.type fe_mul_p, @object
.size fe_mul_p,8
fe_mul_p:
        .quad fe_mul_x64
#else
.section __DATA,__data
.p2align 3
_fe_mul_p:
        .quad _fe_mul_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.data
.type fe_sq_p, @object
.size fe_sq_p,8
fe_sq_p:
        .quad fe_sq_x64
#else
.section __DATA,__data
.p2align 3
_fe_sq_p:
        .quad _fe_sq_x64
#endif /* __APPLE__ */
/*
 * Dispatch pointers (continued). Each NAME_p below is an 8-byte code
 * address initialised to the NAME_x64 routine.
 */
#ifndef __APPLE__
.data
.type fe_mul121666_p, @object
.size fe_mul121666_p,8
fe_mul121666_p:
        .quad fe_mul121666_x64
#else
.section __DATA,__data
.p2align 3
_fe_mul121666_p:
        .quad _fe_mul121666_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.data
.type fe_invert_p, @object
.size fe_invert_p,8
fe_invert_p:
        .quad fe_invert_x64
#else
.section __DATA,__data
.p2align 3
_fe_invert_p:
        .quad _fe_invert_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.data
.type curve25519_p, @object
.size curve25519_p,8
curve25519_p:
        .quad curve25519_x64
#else
.section __DATA,__data
.p2align 3
_curve25519_p:
        .quad _curve25519_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.data
.type fe_pow22523_p, @object
.size fe_pow22523_p,8
fe_pow22523_p:
        .quad fe_pow22523_x64
#else
.section __DATA,__data
.p2align 3
_fe_pow22523_p:
        .quad _fe_pow22523_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.data
.type ge_p1p1_to_p2_p, @object
.size ge_p1p1_to_p2_p,8
ge_p1p1_to_p2_p:
        .quad ge_p1p1_to_p2_x64
#else
.section __DATA,__data
.p2align 3
_ge_p1p1_to_p2_p:
        .quad _ge_p1p1_to_p2_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.data
.type ge_p1p1_to_p3_p, @object
.size ge_p1p1_to_p3_p,8
ge_p1p1_to_p3_p:
        .quad ge_p1p1_to_p3_x64
#else
.section __DATA,__data
.p2align 3
_ge_p1p1_to_p3_p:
        .quad _ge_p1p1_to_p3_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.data
.type ge_p2_dbl_p, @object
.size ge_p2_dbl_p,8
ge_p2_dbl_p:
        .quad ge_p2_dbl_x64
#else
.section __DATA,__data
.p2align 3
_ge_p2_dbl_p:
        .quad _ge_p2_dbl_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.data
.type ge_madd_p, @object
.size ge_madd_p,8
ge_madd_p:
        .quad ge_madd_x64
#else
.section __DATA,__data
.p2align 3
_ge_madd_p:
        .quad _ge_madd_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.data
.type ge_msub_p, @object
.size ge_msub_p,8
ge_msub_p:
        .quad ge_msub_x64
#else
.section __DATA,__data
.p2align 3
_ge_msub_p:
        .quad _ge_msub_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.data
.type ge_add_p, @object
.size ge_add_p,8
ge_add_p:
        .quad ge_add_x64
#else
.section __DATA,__data
.p2align 3
_ge_add_p:
        .quad _ge_add_x64
#endif /* __APPLE__ */
/* Remaining dispatch pointers, each initialised to its NAME_x64 routine. */
#ifndef __APPLE__
.data
.type ge_sub_p, @object
.size ge_sub_p,8
ge_sub_p:
        .quad ge_sub_x64
#else
.section __DATA,__data
.p2align 3
_ge_sub_p:
        .quad _ge_sub_x64
#endif /* __APPLE__ */
#if defined(WOLFSSL_CURVE25519_NOT_USE_ED25519)
#ifndef __APPLE__
.data
.type curve25519_base_p, @object
.size curve25519_base_p,8
curve25519_base_p:
        .quad curve25519_base_x64
#else
.section __DATA,__data
.p2align 3
_curve25519_base_p:
        .quad _curve25519_base_x64
#endif /* __APPLE__ */
#endif /* WOLFSSL_CURVE25519_NOT_USE_ED25519 */
#ifdef HAVE_ED25519
#ifndef __APPLE__
.data
.type fe_sq2_p, @object
.size fe_sq2_p,8
fe_sq2_p:
        .quad fe_sq2_x64
#else
.section __DATA,__data
.p2align 3
_fe_sq2_p:
        .quad _fe_sq2_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.data
.type fe_invert_nct_p, @object
.size fe_invert_nct_p,8
fe_invert_nct_p:
        .quad fe_invert_nct_x64
#else
.section __DATA,__data
.p2align 3
_fe_invert_nct_p:
        .quad _fe_invert_nct_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.data
.type sc_reduce_p, @object
.size sc_reduce_p,8
sc_reduce_p:
        .quad sc_reduce_x64
#else
.section __DATA,__data
.p2align 3
_sc_reduce_p:
        .quad _sc_reduce_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.data
.type sc_muladd_p, @object
.size sc_muladd_p,8
sc_muladd_p:
        .quad sc_muladd_x64
#else
.section __DATA,__data
.p2align 3
_sc_muladd_p:
        .quad _sc_muladd_x64
#endif /* __APPLE__ */
#endif /* HAVE_ED25519 */
/*
 * fe_cmov_table_x64 - select one 96-byte table entry without branching.
 *
 * In:  rdi = output (three 32-byte values at offsets 0, 32 and 64)
 *      rsi = table of 96-byte entries; entry k (1..8) is at (k - 1) * 96
 *      dl  = signed byte index b
 * The absolute value of b is compared against 1..8 in turn. Every entry
 * is loaded and cmove keeps the one that matches, so the loads do not
 * depend on b. With no match the first two values stay 1 and the third
 * stays 0. When b is negative the first two values are swapped and the
 * third is replaced by (2^255 - 19) minus itself.
 * Saves and restores r12-r15; clobbers rax, rcx, rdx, r8-r11, flags.
 */
#ifndef __APPLE__
.text
.globl fe_cmov_table_x64
.type fe_cmov_table_x64,@function
.align 16
fe_cmov_table_x64:
#else
.section __TEXT,__text
.globl _fe_cmov_table_x64
.p2align 4
_fe_cmov_table_x64:
#endif /* __APPLE__ */
        pushq %r12
        pushq %r13
        pushq %r14
        pushq %r15
        /* cl keeps the signed index; r15b = |b| via sign mask in dl. */
        movq %rdx, %rcx
        movsbq %cl, %rax
        cdq
        xorb %dl, %al
        subb %dl, %al
        movb %al, %r15b
        /* Defaults: first value = 1 (rax,rdx,r8,r9), second = 1 (r10-r13). */
        movq $0x01, %rax
        xorq %rdx, %rdx
        xorq %r8, %r8
        xorq %r9, %r9
        movq $0x01, %r10
        xorq %r11, %r11
        xorq %r12, %r12
        xorq %r13, %r13
        /* Entry 1: bytes 0..63. */
        cmpb $0x01, %r15b
        movq (%rsi), %r14
        cmoveq %r14, %rax
        movq 8(%rsi), %r14
        cmoveq %r14, %rdx
        movq 16(%rsi), %r14
        cmoveq %r14, %r8
        movq 24(%rsi), %r14
        cmoveq %r14, %r9
        movq 32(%rsi), %r14
        cmoveq %r14, %r10
        movq 40(%rsi), %r14
        cmoveq %r14, %r11
        movq 48(%rsi), %r14
        cmoveq %r14, %r12
        movq 56(%rsi), %r14
        cmoveq %r14, %r13
        /* Entry 2: bytes 96..159. */
        cmpb $2, %r15b
        movq 96(%rsi), %r14
        cmoveq %r14, %rax
        movq 104(%rsi), %r14
        cmoveq %r14, %rdx
        movq 112(%rsi), %r14
        cmoveq %r14, %r8
        movq 120(%rsi), %r14
        cmoveq %r14, %r9
        movq 128(%rsi), %r14
        cmoveq %r14, %r10
        movq 136(%rsi), %r14
        cmoveq %r14, %r11
        movq 144(%rsi), %r14
        cmoveq %r14, %r12
        movq 152(%rsi), %r14
        cmoveq %r14, %r13
        /* Entry 3: bytes 192..255. */
        cmpb $3, %r15b
        movq 192(%rsi), %r14
        cmoveq %r14, %rax
        movq 200(%rsi), %r14
        cmoveq %r14, %rdx
        movq 208(%rsi), %r14
        cmoveq %r14, %r8
        movq 216(%rsi), %r14
        cmoveq %r14, %r9
        movq 224(%rsi), %r14
        cmoveq %r14, %r10
        movq 232(%rsi), %r14
        cmoveq %r14, %r11
        movq 240(%rsi), %r14
        cmoveq %r14, %r12
        movq 248(%rsi), %r14
        cmoveq %r14, %r13
        /* Entry 4: bytes 288..351. */
        cmpb $4, %r15b
        movq 288(%rsi), %r14
        cmoveq %r14, %rax
        movq 296(%rsi), %r14
        cmoveq %r14, %rdx
        movq 304(%rsi), %r14
        cmoveq %r14, %r8
        movq 312(%rsi), %r14
        cmoveq %r14, %r9
        movq 320(%rsi), %r14
        cmoveq %r14, %r10
        movq 328(%rsi), %r14
        cmoveq %r14, %r11
        movq 336(%rsi), %r14
        cmoveq %r14, %r12
        movq 344(%rsi), %r14
        cmoveq %r14, %r13
        /* Entry 5: bytes 384..447. */
        cmpb $5, %r15b
        movq 384(%rsi), %r14
        cmoveq %r14, %rax
        movq 392(%rsi), %r14
        cmoveq %r14, %rdx
        movq 400(%rsi), %r14
        cmoveq %r14, %r8
        movq 408(%rsi), %r14
        cmoveq %r14, %r9
        movq 416(%rsi), %r14
        cmoveq %r14, %r10
        movq 424(%rsi), %r14
        cmoveq %r14, %r11
        movq 432(%rsi), %r14
        cmoveq %r14, %r12
        movq 440(%rsi), %r14
        cmoveq %r14, %r13
        /* Entry 6: bytes 480..543. */
        cmpb $6, %r15b
        movq 480(%rsi), %r14
        cmoveq %r14, %rax
        movq 488(%rsi), %r14
        cmoveq %r14, %rdx
        movq 496(%rsi), %r14
        cmoveq %r14, %r8
        movq 504(%rsi), %r14
        cmoveq %r14, %r9
        movq 512(%rsi), %r14
        cmoveq %r14, %r10
        movq 520(%rsi), %r14
        cmoveq %r14, %r11
        movq 528(%rsi), %r14
        cmoveq %r14, %r12
        movq 536(%rsi), %r14
        cmoveq %r14, %r13
        /* Entry 7: bytes 576..639. */
        cmpb $7, %r15b
        movq 576(%rsi), %r14
        cmoveq %r14, %rax
        movq 584(%rsi), %r14
        cmoveq %r14, %rdx
        movq 592(%rsi), %r14
        cmoveq %r14, %r8
        movq 600(%rsi), %r14
        cmoveq %r14, %r9
        movq 608(%rsi), %r14
        cmoveq %r14, %r10
        movq 616(%rsi), %r14
        cmoveq %r14, %r11
        movq 624(%rsi), %r14
        cmoveq %r14, %r12
        movq 632(%rsi), %r14
        cmoveq %r14, %r13
        /* Entry 8: bytes 672..735. */
        cmpb $8, %r15b
        movq 672(%rsi), %r14
        cmoveq %r14, %rax
        movq 680(%rsi), %r14
        cmoveq %r14, %rdx
        movq 688(%rsi), %r14
        cmoveq %r14, %r8
        movq 696(%rsi), %r14
        cmoveq %r14, %r9
        movq 704(%rsi), %r14
        cmoveq %r14, %r10
        movq 712(%rsi), %r14
        cmoveq %r14, %r11
        movq 720(%rsi), %r14
        cmoveq %r14, %r12
        movq 728(%rsi), %r14
        cmoveq %r14, %r13
        /* Negative index: swap the first and second values word by word. */
        cmpb $0x00, %cl
        movq %rax, %r14
        cmovlq %r10, %rax
        cmovlq %r14, %r10
        movq %rdx, %r14
        cmovlq %r11, %rdx
        cmovlq %r14, %r11
        movq %r8, %r14
        cmovlq %r12, %r8
        cmovlq %r14, %r12
        movq %r9, %r14
        cmovlq %r13, %r9
        cmovlq %r14, %r13
        movq %rax, (%rdi)
        movq %rdx, 8(%rdi)
        movq %r8, 16(%rdi)
        movq %r9, 24(%rdi)
        movq %r10, 32(%rdi)
        movq %r11, 40(%rdi)
        movq %r12, 48(%rdi)
        movq %r13, 56(%rdi)
        /* Third value (bytes 64..95 of each entry), default 0. */
        xorq %rax, %rax
        xorq %rdx, %rdx
        xorq %r8, %r8
        xorq %r9, %r9
        cmpb $0x01, %r15b
        movq 64(%rsi), %r14
        cmoveq %r14, %rax
        movq 72(%rsi), %r14
        cmoveq %r14, %rdx
        movq 80(%rsi), %r14
        cmoveq %r14, %r8
        movq 88(%rsi), %r14
        cmoveq %r14, %r9
        cmpb $2, %r15b
        movq 160(%rsi), %r14
        cmoveq %r14, %rax
        movq 168(%rsi), %r14
        cmoveq %r14, %rdx
        movq 176(%rsi), %r14
        cmoveq %r14, %r8
        movq 184(%rsi), %r14
        cmoveq %r14, %r9
        cmpb $3, %r15b
        movq 256(%rsi), %r14
        cmoveq %r14, %rax
        movq 264(%rsi), %r14
        cmoveq %r14, %rdx
        movq 272(%rsi), %r14
        cmoveq %r14, %r8
        movq 280(%rsi), %r14
        cmoveq %r14, %r9
        cmpb $4, %r15b
        movq 352(%rsi), %r14
        cmoveq %r14, %rax
        movq 360(%rsi), %r14
        cmoveq %r14, %rdx
        movq 368(%rsi), %r14
        cmoveq %r14, %r8
        movq 376(%rsi), %r14
        cmoveq %r14, %r9
        cmpb $5, %r15b
        movq 448(%rsi), %r14
        cmoveq %r14, %rax
        movq 456(%rsi), %r14
        cmoveq %r14, %rdx
        movq 464(%rsi), %r14
        cmoveq %r14, %r8
        movq 472(%rsi), %r14
        cmoveq %r14, %r9
        cmpb $6, %r15b
        movq 544(%rsi), %r14
        cmoveq %r14, %rax
        movq 552(%rsi), %r14
        cmoveq %r14, %rdx
        movq 560(%rsi), %r14
        cmoveq %r14, %r8
        movq 568(%rsi), %r14
        cmoveq %r14, %r9
        cmpb $7, %r15b
        movq 640(%rsi), %r14
        cmoveq %r14, %rax
        movq 648(%rsi), %r14
        cmoveq %r14, %rdx
        movq 656(%rsi), %r14
        cmoveq %r14, %r8
        movq 664(%rsi), %r14
        cmoveq %r14, %r9
        cmpb $8, %r15b
        movq 736(%rsi), %r14
        cmoveq %r14, %rax
        movq 744(%rsi), %r14
        cmoveq %r14, %rdx
        movq 752(%rsi), %r14
        cmoveq %r14, %r8
        movq 760(%rsi), %r14
        cmoveq %r14, %r9
        /*
         * r10..r13 = (2^255 - 19) - selected value; kept only when the
         * signed index in cl is negative.
         */
        movq $-19, %r10
        movq $-1, %r11
        movq $-1, %r12
        movq $0x7fffffffffffffff, %r13
        subq %rax, %r10
        sbbq %rdx, %r11
        sbbq %r8, %r12
        sbbq %r9, %r13
        cmpb $0x00, %cl
        cmovlq %r10, %rax
        cmovlq %r11, %rdx
        cmovlq %r12, %r8
        cmovlq %r13, %r9
        movq %rax, 64(%rdi)
        movq %rdx, 72(%rdi)
        movq %r8, 80(%rdi)
        movq %r9, 88(%rdi)
        popq %r15
        popq %r14
        popq %r13
        popq %r12
        repz retq
#ifndef __APPLE__
.size fe_cmov_table_x64,.-fe_cmov_table_x64
#endif /* __APPLE__ */
/*
 * fe_mul_x64 - multiply two field elements of four 64-bit words each.
 *
 * In:  rdi = output, rsi = first operand, rdx = second operand (moved to
 *      rcx because mulq overwrites rdx).
 * The 4x4 schoolbook product is accumulated column by column into
 * r8..r15 (r8 = lowest word) and then folded back to four words using the
 * constants 38 and 19 before being stored.
 * Saves and restores r12-r15 and rbx; clobbers rax, rcx, rdx, r8-r11,
 * flags.
 */
#ifndef __APPLE__
.text
.globl fe_mul_x64
.type fe_mul_x64,@function
.align 16
fe_mul_x64:
#else
.section __TEXT,__text
.globl _fe_mul_x64
.p2align 4
_fe_mul_x64:
#endif /* __APPLE__ */
        pushq %r12
        pushq %r13
        pushq %r14
        pushq %r15
        pushq %rbx
        movq %rdx, %rcx
        # Multiply
        # A[0] * B[0]
        movq (%rcx), %rax
        mulq (%rsi)
        movq %rax, %r8
        movq %rdx, %r9
        # A[0] * B[1]
        movq 8(%rcx), %rax
        mulq (%rsi)
        xorq %r10, %r10
        addq %rax, %r9
        adcq %rdx, %r10
        # A[1] * B[0]
        movq (%rcx), %rax
        mulq 8(%rsi)
        xorq %r11, %r11
        addq %rax, %r9
        adcq %rdx, %r10
        adcq $0x00, %r11
        # A[0] * B[2]
        movq 16(%rcx), %rax
        mulq (%rsi)
        addq %rax, %r10
        adcq %rdx, %r11
        # A[1] * B[1]
        movq 8(%rcx), %rax
        mulq 8(%rsi)
        xorq %r12, %r12
        addq %rax, %r10
        adcq %rdx, %r11
        adcq $0x00, %r12
        # A[2] * B[0]
        movq (%rcx), %rax
        mulq 16(%rsi)
        addq %rax, %r10
        adcq %rdx, %r11
        adcq $0x00, %r12
        # A[0] * B[3]
        movq 24(%rcx), %rax
        mulq (%rsi)
        xorq %r13, %r13
        addq %rax, %r11
        adcq %rdx, %r12
        adcq $0x00, %r13
        # A[1] * B[2]
        movq 16(%rcx), %rax
        mulq 8(%rsi)
        addq %rax, %r11
        adcq %rdx, %r12
        adcq $0x00, %r13
        # A[2] * B[1]
        movq 8(%rcx), %rax
        mulq 16(%rsi)
        addq %rax, %r11
        adcq %rdx, %r12
        adcq $0x00, %r13
        # A[3] * B[0]
        movq (%rcx), %rax
        mulq 24(%rsi)
        addq %rax, %r11
        adcq %rdx, %r12
        adcq $0x00, %r13
        # A[1] * B[3]
        movq 24(%rcx), %rax
mulq 8(%rsi) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[2] movq 16(%rcx), %rax mulq 16(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[1] movq 8(%rcx), %rax mulq 24(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[3] movq 24(%rcx), %rax mulq 16(%rsi) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[2] movq 16(%rcx), %rax mulq 24(%rsi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[3] movq 24(%rcx), %rax mulq 24(%rsi) addq %rax, %r14 adcq %rdx, %r15 movq $38, %rax mulq %r15 addq %rax, %r11 adcq $0x00, %rdx movq $0x7fffffffffffffff, %rbx shldq $0x01, %r11, %rdx imulq $19, %rdx, %rdx andq %rbx, %r11 movq %rdx, %rbx movq $38, %rax mulq %r12 xorq %r12, %r12 addq %rax, %r8 movq $38, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $38, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 adcq %rdx, %r14 addq %rbx, %r8 adcq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 movq $0x7fffffffffffffff, %rbx movq %r11, %rax sarq $63, %rax andq $19, %rax andq %rbx, %r11 addq %rax, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) popq %rbx popq %r15 popq %r14 popq %r13 popq %r12 repz retq #ifndef __APPLE__ .size fe_mul_x64,.-fe_mul_x64 #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_sq_x64 .type fe_sq_x64,@function .align 16 fe_sq_x64: #else .section __TEXT,__text .globl _fe_sq_x64 .p2align 4 _fe_sq_x64: #endif /* __APPLE__ */ pushq %r12 pushq %r13 pushq %r14 pushq %r15 # Square # A[0] * A[1] movq (%rsi), %rax mulq 8(%rsi) movq %rax, %r8 movq %rdx, %r9 # A[0] * A[2] movq (%rsi), %rax mulq 16(%rsi) xorq %r10, %r10 addq %rax, %r9 adcq %rdx, %r10 # A[0] * A[3] movq (%rsi), %rax mulq 24(%rsi) xorq %r11, %r11 addq %rax, %r10 adcq %rdx, %r11 # A[1] * A[2] movq 8(%rsi), %rax mulq 16(%rsi) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[1] * 
A[3]
        movq 8(%rsi), %rax
        mulq 24(%rsi)
        addq %rax, %r11
        adcq %rdx, %r12
        # A[2] * A[3]
        movq 16(%rsi), %rax
        mulq 24(%rsi)
        xorq %r13, %r13
        addq %rax, %r12
        adcq %rdx, %r13
        # Double
        /* %r8-%r13 hold the cross-product sum; %r14 catches the bit shifted out */
        xorq %r14, %r14
        addq %r8, %r8
        adcq %r9, %r9
        adcq %r10, %r10
        adcq %r11, %r11
        adcq %r12, %r12
        adcq %r13, %r13
        adcq $0x00, %r14
        # A[0] * A[0]
        movq (%rsi), %rax
        mulq %rax
        movq %rax, %rcx
        movq %rdx, %r15
        # A[1] * A[1]
        movq 8(%rsi), %rax
        mulq %rax
        addq %r15, %r8
        adcq %rax, %r9
        adcq $0x00, %rdx
        movq %rdx, %r15
        # A[2] * A[2]
        movq 16(%rsi), %rax
        mulq %rax
        addq %r15, %r10
        adcq %rax, %r11
        adcq $0x00, %rdx
        movq %rdx, %r15
        # A[3] * A[3]
        movq 24(%rsi), %rax
        mulq %rax
        addq %rax, %r13
        adcq %rdx, %r14
        addq %r15, %r12
        adcq $0x00, %r13
        adcq $0x00, %r14
        /* Low half is %rcx, %r8, %r9, %r10; high half is %r11-%r14. */
        /* Fold the top limb: %rdx:%r10 = %r10 + 38 * %r14           */
        movq $38, %rax
        mulq %r14
        addq %rax, %r10
        adcq $0x00, %rdx
        movq $0x7fffffffffffffff, %r15
        /* %rdx = bits from position 255 upwards, scaled by 19; kept in %r15 */
        shldq $0x01, %r10, %rdx
        imulq $19, %rdx, %rdx
        andq %r15, %r10
        movq %rdx, %r15
        /* Fold %r11-%r13 into the three lowest limbs with a factor of 38 */
        movq $38, %rax
        mulq %r11
        xorq %r11, %r11
        addq %rax, %rcx
        movq $38, %rax
        adcq %rdx, %r11
        mulq %r12
        xorq %r12, %r12
        addq %rax, %r8
        movq $38, %rax
        adcq %rdx, %r12
        mulq %r13
        xorq %r13, %r13
        addq %rax, %r9
        adcq %rdx, %r13
        addq %r15, %rcx
        adcq %r11, %r8
        adcq %r12, %r9
        adcq %r13, %r10
        /* If bit 255 is set: clear it and add 19 */
        movq $0x7fffffffffffffff, %r15
        movq %r10, %rax
        sarq $63, %rax
        andq $19, %rax
        andq %r15, %r10
        addq %rax, %rcx
        adcq $0x00, %r8
        adcq $0x00, %r9
        adcq $0x00, %r10
        # Store
        movq %rcx, (%rdi)
        movq %r8, 8(%rdi)
        movq %r9, 16(%rdi)
        movq %r10, 24(%rdi)
        popq %r15
        popq %r14
        popq %r13
        popq %r12
        repz retq
#ifndef __APPLE__
.size fe_sq_x64,.-fe_sq_x64
#endif /* __APPLE__ */
/*
 * fe_sq_n_x64: square a four-limb value n times in succession.
 *
 *   %rdi  destination; four limbs, rewritten on every pass
 *   %rsi  source; four limbs, may be the same buffer as the destination
 *   %rdx  n; copied to %rcx and only the low byte (%cl) is used as the
 *         loop counter.  The counter is decremented and tested after each
 *         pass, so a low byte of 0 gives 256 passes.
 *
 * %r12-%r15 and %rbx are pushed on entry and popped before the return.
 * %rax, %rcx, %rdx, %rsi, %r8-%r11 and the flags are overwritten.
 *
 * The first pass squares the source.  After each store %rsi is pointed at
 * the destination, so every later pass squares the value just written and
 * the result is the source raised to 2^n whether or not the two buffers
 * are the same.  (Without that step each pass would square the original
 * source again when the buffers differ.)
 *
 * Each pass folds the 512-bit square back to four limbs with the factors
 * 38 and 19.  There is no final "add 19 and clear bit 255" step here, so
 * bit 255 of the stored value is not forced to zero.
 */
#ifndef __APPLE__
.text
.globl fe_sq_n_x64
.type fe_sq_n_x64,@function
.align 16
fe_sq_n_x64:
#else
.section __TEXT,__text
.globl _fe_sq_n_x64
.p2align 4
_fe_sq_n_x64:
#endif /* __APPLE__ */
        pushq %r12
        pushq %r13
        pushq %r14
        pushq %r15
        pushq %rbx
        /* Loop counter lives in %cl; %rdx is needed by mulq */
        movq %rdx, %rcx
L_fe_sq_n_x64:
        # Square
        # A[0] * A[1]
        movq (%rsi), %rax
        mulq 8(%rsi)
        movq %rax, %r9
        movq %rdx, %r10
        # A[0] * A[2]
        movq (%rsi), %rax
        mulq 16(%rsi)
        xorq %r11, %r11
        addq %rax, %r10
        adcq %rdx, %r11
        # A[0] * A[3]
        movq (%rsi), %rax
        mulq 24(%rsi)
        xorq %r12, %r12
        addq %rax, %r11
        adcq %rdx, %r12
        # A[1] * A[2]
        movq 8(%rsi), %rax
        mulq 16(%rsi)
        xorq %r13, %r13
        addq %rax, %r11
        adcq %rdx, %r12
        adcq $0x00, %r13
        # A[1] * A[3]
        movq 8(%rsi), %rax
        mulq 24(%rsi)
        addq %rax, %r12
        adcq %rdx, %r13
        # A[2] * A[3]
        movq 16(%rsi), %rax
        mulq 24(%rsi)
        xorq %r14, %r14
        addq %rax, %r13
        adcq %rdx, %r14
        # Double
        xorq %r15, %r15
        addq %r9, %r9
        adcq %r10, %r10
        adcq %r11, %r11
        adcq %r12, %r12
        adcq %r13, %r13
        adcq %r14, %r14
        adcq $0x00, %r15
        # A[0] * A[0]
        movq (%rsi), %rax
        mulq %rax
        movq %rax, %r8
        movq %rdx, %rbx
        # A[1] * A[1]
        movq 8(%rsi), %rax
        mulq %rax
        addq %rbx, %r9
        adcq %rax, %r10
        adcq $0x00, %rdx
        movq %rdx, %rbx
        # A[2] * A[2]
        movq 16(%rsi), %rax
        mulq %rax
        addq %rbx, %r11
        adcq %rax, %r12
        adcq $0x00, %rdx
        movq %rdx, %rbx
        # A[3] * A[3]
        movq 24(%rsi), %rax
        mulq %rax
        addq %rax, %r14
        adcq %rdx, %r15
        addq %rbx, %r13
        adcq $0x00, %r14
        adcq $0x00, %r15
        /* Low half is %r8-%r11, high half is %r12-%r15.      */
        /* Fold the top limb: %rdx:%r11 = %r11 + 38 * %r15    */
        movq $38, %rax
        mulq %r15
        addq %rax, %r11
        adcq $0x00, %rdx
        movq $0x7fffffffffffffff, %rbx
        /* %rdx = bits from position 255 upwards, scaled by 19; kept in %rbx */
        shldq $0x01, %r11, %rdx
        imulq $19, %rdx, %rdx
        andq %rbx, %r11
        movq %rdx, %rbx
        /* Fold %r12-%r14 into the three lowest limbs with a factor of 38 */
        movq $38, %rax
        mulq %r12
        xorq %r12, %r12
        addq %rax, %r8
        movq $38, %rax
        adcq %rdx, %r12
        mulq %r13
        xorq %r13, %r13
        addq %rax, %r9
        movq $38, %rax
        adcq %rdx, %r13
        mulq %r14
        xorq %r14, %r14
        addq %rax, %r10
        adcq %rdx, %r14
        addq %rbx, %r8
        adcq %r12, %r9
        adcq %r13, %r10
        adcq %r14, %r11
        # Store
        movq %r8, (%rdi)
        movq %r9, 8(%rdi)
        movq %r10, 16(%rdi)
        movq %r11, 24(%rdi)
        /* Later passes square the value just stored, not the original source. */
        /* movq leaves the flags alone; decb sets ZF for the jnz below.        */
        movq %rdi, %rsi
        decb %cl
        jnz L_fe_sq_n_x64
        popq %rbx
        popq %r15
        popq %r14
        popq %r13
        popq %r12
        repz retq
#ifndef __APPLE__
.size fe_sq_n_x64,.-fe_sq_n_x64
#endif /* __APPLE__ */
/*
 * fe_mul121666_x64: multiply the four-limb value at (%rsi) by the constant
 * 121666 (0x1db42) and store a four-limb result at (%rdi).
 *
 * %r12 is pushed on entry and popped before the return.  The five-limb
 * product is built in %r8-%r12; everything from bit 255 upwards is then
 * multiplied by 19 and added back to the low limbs.
 */
#ifndef __APPLE__
.text
.globl fe_mul121666_x64
.type fe_mul121666_x64,@function
.align 16
fe_mul121666_x64:
#else
.section __TEXT,__text
.globl _fe_mul121666_x64
.p2align 4
_fe_mul121666_x64:
#endif /* __APPLE__ */
        pushq %r12
        # Multiply by 121666
        movq $0x1db42, %rax
        mulq (%rsi)
        xorq %r10, %r10
        movq %rax, %r8
        movq %rdx, %r9
        movq $0x1db42, %rax
        mulq 8(%rsi)
        xorq %r11, %r11
        addq %rax, %r9
        adcq
%rdx, %r10
        movq $0x1db42, %rax
        mulq 16(%rsi)
        xorq %r12, %r12
        addq %rax, %r10
        adcq %rdx, %r11
        movq $0x1db42, %rax
        mulq 24(%rsi)
        movq $0x7fffffffffffffff, %rcx
        addq %rax, %r11
        adcq %rdx, %r12
        /* %r12 = bits from position 255 upwards; clear bit 255 in %r11 */
        shldq $0x01, %r11, %r12
        andq %rcx, %r11
        /* Add 19 * overflow back into the low limbs */
        movq $19, %rax
        mulq %r12
        addq %rax, %r8
        adcq $0x00, %r9
        adcq $0x00, %r10
        adcq $0x00, %r11
        movq %r8, (%rdi)
        movq %r9, 8(%rdi)
        movq %r10, 16(%rdi)
        movq %r11, 24(%rdi)
        popq %r12
        repz retq
#ifndef __APPLE__
.size fe_mul121666_x64,.-fe_mul121666_x64
#endif /* __APPLE__ */
/*
 * fe_invert_x64: raise the four-limb value z at (%rsi) to the power
 * 2^255 - 21 and store the result at (%rdi).  2^255 - 21 equals
 * (2^255 - 19) - 2, so for non-zero z the result is the multiplicative
 * inverse of z modulo 2^255 - 19.
 *
 * The work is done entirely by calls to fe_sq_x64, fe_mul_x64 and
 * fe_sq_n_x64; every fe_sq_n_x64 call is made with source == destination.
 *
 * Stack frame (0x98 bytes):
 *       0(%rsp)  t0  (32 bytes)
 *      32(%rsp)  t1  (32 bytes)
 *      64(%rsp)  t2  (32 bytes)
 *      96(%rsp)  t3  (32 bytes)
 *     128(%rsp)  saved %rdi (result pointer)
 *     136(%rsp)  saved %rsi (input pointer)
 *     144(%rsp)  8 bytes of padding
 *
 * The padding makes %rsp a multiple of 16 at every callq, as the System V
 * AMD64 ABI requires (assuming the caller followed the ABI, %rsp is
 * 8 modulo 16 on entry, and 8 + 0x98 is a multiple of 16).
 *
 * No callee-saved register is modified by this function itself.  On
 * return %rdi and %rsi hold the values they had on entry.
 */
#ifndef __APPLE__
.text
.globl fe_invert_x64
.type fe_invert_x64,@function
.align 16
fe_invert_x64:
#else
.section __TEXT,__text
.globl _fe_invert_x64
.p2align 4
_fe_invert_x64:
#endif /* __APPLE__ */
        subq $0x98, %rsp
        # Invert
        movq %rdi, 128(%rsp)
        movq %rsi, 136(%rsp)
        /* t0 = z^2 */
        movq %rsp, %rdi
        movq 136(%rsp), %rsi
#ifndef __APPLE__
        callq fe_sq_x64@plt
#else
        callq _fe_sq_x64
#endif /* __APPLE__ */
        /* t1 = t0^2 = z^4 */
        leaq 32(%rsp), %rdi
        movq %rsp, %rsi
#ifndef __APPLE__
        callq fe_sq_x64@plt
#else
        callq _fe_sq_x64
#endif /* __APPLE__ */
        /* t1 = t1^2 = z^8 */
        leaq 32(%rsp), %rdi
        leaq 32(%rsp), %rsi
#ifndef __APPLE__
        callq fe_sq_x64@plt
#else
        callq _fe_sq_x64
#endif /* __APPLE__ */
        /* t1 = z * t1 = z^9 */
        leaq 32(%rsp), %rdi
        movq 136(%rsp), %rsi
        leaq 32(%rsp), %rdx
#ifndef __APPLE__
        callq fe_mul_x64@plt
#else
        callq _fe_mul_x64
#endif /* __APPLE__ */
        /* t0 = t0 * t1 = z^11 */
        movq %rsp, %rdi
        movq %rsp, %rsi
        leaq 32(%rsp), %rdx
#ifndef __APPLE__
        callq fe_mul_x64@plt
#else
        callq _fe_mul_x64
#endif /* __APPLE__ */
        /* t2 = t0^2 = z^22 */
        leaq 64(%rsp), %rdi
        movq %rsp, %rsi
#ifndef __APPLE__
        callq fe_sq_x64@plt
#else
        callq _fe_sq_x64
#endif /* __APPLE__ */
        /* t1 = t1 * t2 = z^31 = z^(2^5 - 1) */
        leaq 32(%rsp), %rdi
        leaq 32(%rsp), %rsi
        leaq 64(%rsp), %rdx
#ifndef __APPLE__
        callq fe_mul_x64@plt
#else
        callq _fe_mul_x64
#endif /* __APPLE__ */
        /* t2 = t1^(2^5): one squaring, then four more */
        leaq 64(%rsp), %rdi
        leaq 32(%rsp), %rsi
#ifndef __APPLE__
        callq fe_sq_x64@plt
#else
        callq _fe_sq_x64
#endif /* __APPLE__ */
        leaq 64(%rsp), %rdi
        leaq 64(%rsp), %rsi
        movq $4, %rdx
#ifndef __APPLE__
        callq fe_sq_n_x64@plt
#else
        callq _fe_sq_n_x64
#endif /* __APPLE__ */
        /* t1 = t2 * t1 = z^(2^10 - 1) */
        leaq 32(%rsp), %rdi
        leaq 64(%rsp), %rsi
        leaq 32(%rsp), %rdx
#ifndef __APPLE__
        callq fe_mul_x64@plt
#else
        callq _fe_mul_x64
#endif /* __APPLE__ */
        /* t2 = t1^(2^10): one squaring, then nine more */
        leaq 64(%rsp), %rdi
        leaq 32(%rsp), %rsi
#ifndef __APPLE__
        callq fe_sq_x64@plt
#else
        callq _fe_sq_x64
#endif /* __APPLE__ */
        leaq 64(%rsp), %rdi
        leaq 64(%rsp), %rsi
        movq $9, %rdx
#ifndef __APPLE__
        callq fe_sq_n_x64@plt
#else
        callq _fe_sq_n_x64
#endif /* __APPLE__ */
        /* t2 = t2 * t1 = z^(2^20 - 1) */
        leaq 64(%rsp), %rdi
        leaq 64(%rsp), %rsi
        leaq 32(%rsp), %rdx
#ifndef __APPLE__
        callq fe_mul_x64@plt
#else
        callq _fe_mul_x64
#endif /* __APPLE__ */
        /* t3 = t2^(2^20): one squaring, then nineteen more */
        leaq 96(%rsp), %rdi
        leaq 64(%rsp), %rsi
#ifndef __APPLE__
        callq fe_sq_x64@plt
#else
        callq _fe_sq_x64
#endif /* __APPLE__ */
        leaq 96(%rsp), %rdi
        leaq 96(%rsp), %rsi
        movq $19, %rdx
#ifndef __APPLE__
        callq fe_sq_n_x64@plt
#else
        callq _fe_sq_n_x64
#endif /* __APPLE__ */
        /* t2 = t3 * t2 = z^(2^40 - 1) */
        leaq 64(%rsp), %rdi
        leaq 96(%rsp), %rsi
        leaq 64(%rsp), %rdx
#ifndef __APPLE__
        callq fe_mul_x64@plt
#else
        callq _fe_mul_x64
#endif /* __APPLE__ */
        /* t2 = t2^(2^10) = z^(2^50 - 2^10): one squaring, then nine more */
        leaq 64(%rsp), %rdi
        leaq 64(%rsp), %rsi
#ifndef __APPLE__
        callq fe_sq_x64@plt
#else
        callq _fe_sq_x64
#endif /* __APPLE__ */
        leaq 64(%rsp), %rdi
        leaq 64(%rsp), %rsi
        movq $9, %rdx
#ifndef __APPLE__
        callq fe_sq_n_x64@plt
#else
        callq _fe_sq_n_x64
#endif /* __APPLE__ */
        /* t1 = t2 * t1 = z^(2^50 - 1) */
        leaq 32(%rsp), %rdi
        leaq 64(%rsp), %rsi
        leaq 32(%rsp), %rdx
#ifndef __APPLE__
        callq fe_mul_x64@plt
#else
        callq _fe_mul_x64
#endif /* __APPLE__ */
        /* t2 = t1^(2^50): one squaring, then forty-nine more */
        leaq 64(%rsp), %rdi
        leaq 32(%rsp), %rsi
#ifndef __APPLE__
        callq fe_sq_x64@plt
#else
        callq _fe_sq_x64
#endif /* __APPLE__ */
        leaq 64(%rsp), %rdi
        leaq 64(%rsp), %rsi
        movq $49, %rdx
#ifndef __APPLE__
        callq fe_sq_n_x64@plt
#else
        callq _fe_sq_n_x64
#endif /* __APPLE__ */
        /* t2 = t2 * t1 = z^(2^100 - 1) */
        leaq 64(%rsp), %rdi
        leaq 64(%rsp), %rsi
        leaq 32(%rsp), %rdx
#ifndef __APPLE__
        callq fe_mul_x64@plt
#else
        callq _fe_mul_x64
#endif /* __APPLE__ */
        /* t3 = t2^(2^100): one squaring, then ninety-nine (0x63) more */
        leaq 96(%rsp), %rdi
        leaq 64(%rsp), %rsi
#ifndef __APPLE__
        callq fe_sq_x64@plt
#else
        callq _fe_sq_x64
#endif /* __APPLE__ */
        leaq 96(%rsp), %rdi
        leaq 96(%rsp), %rsi
        movq $0x63, %rdx
#ifndef __APPLE__
        callq fe_sq_n_x64@plt
#else
        callq _fe_sq_n_x64
#endif /* __APPLE__ */
        /* t2 = t3 * t2 = z^(2^200 - 1) */
        leaq 64(%rsp), %rdi
        leaq 96(%rsp), %rsi
        leaq 64(%rsp), %rdx
#ifndef __APPLE__
        callq fe_mul_x64@plt
#else
        callq _fe_mul_x64
#endif /* __APPLE__ */
        /* t2 = t2^(2^50) = z^(2^250 - 2^50): one squaring, then forty-nine more */
        leaq 64(%rsp), %rdi
        leaq 64(%rsp), %rsi
#ifndef __APPLE__
        callq fe_sq_x64@plt
#else
        callq _fe_sq_x64
#endif /* __APPLE__ */
        leaq 64(%rsp), %rdi
        leaq 64(%rsp), %rsi
        movq $49, %rdx
#ifndef __APPLE__
        callq fe_sq_n_x64@plt
#else
        callq _fe_sq_n_x64
#endif /* __APPLE__ */
        /* t1 = t2 * t1 = z^(2^250 - 1) */
        leaq 32(%rsp), %rdi
        leaq 64(%rsp), %rsi
        leaq 32(%rsp), %rdx
#ifndef __APPLE__
        callq fe_mul_x64@plt
#else
        callq _fe_mul_x64
#endif /* __APPLE__ */
        /* t1 = t1^(2^5) = z^(2^255 - 2^5): one squaring, then four more */
        leaq 32(%rsp), %rdi
        leaq 32(%rsp), %rsi
#ifndef __APPLE__
        callq fe_sq_x64@plt
#else
        callq _fe_sq_x64
#endif /* __APPLE__ */
        leaq 32(%rsp), %rdi
        leaq 32(%rsp), %rsi
        movq $4, %rdx
#ifndef __APPLE__
        callq fe_sq_n_x64@plt
#else
        callq _fe_sq_n_x64
#endif /* __APPLE__ */
        /* result = t1 * t0 = z^(2^255 - 32 + 11) = z^(2^255 - 21) */
        movq 128(%rsp), %rdi
        leaq 32(%rsp), %rsi
        movq %rsp, %rdx
#ifndef __APPLE__
        callq fe_mul_x64@plt
#else
        callq _fe_mul_x64
#endif /* __APPLE__ */
        /* Reload the original argument registers and release the frame */
        movq 136(%rsp), %rsi
        movq 128(%rsp), %rdi
        addq $0x98, %rsp
        repz retq
#ifndef __APPLE__
.size fe_invert_x64,.-fe_invert_x64
#endif /* __APPLE__ */
#if defined(WOLFSSL_CURVE25519_NOT_USE_ED25519)
#ifndef __APPLE__
.data
#else
.section __DATA,__data
#endif /* __APPLE__ */
#ifndef __APPLE__
.align 32
#else
.p2align 5
#endif /* __APPLE__ */
L_curve25519_base_x64_x2:
.quad 0x5cae469cdd684efb,0x8f3f5ced1e350b5c
.quad 0xd9750c687d157114,0x20d342d51873f1b7
#ifndef __APPLE__
.text
.globl curve25519_base_x64
.type curve25519_base_x64,@function
.align 16
curve25519_base_x64:
#else
.section __TEXT,__text
.globl _curve25519_base_x64
.p2align 4
_curve25519_base_x64:
#endif /* __APPLE__ */
        pushq %r12
        pushq %r13
        pushq %r14
        pushq %r15
        pushq %rbx
        pushq %rbp
        subq $0xa8, %rsp
        xorq %r15, %r15
        movq %rdi, 160(%rsp)
        # Set base point x
        movq $9, (%rdi)
        movq $0x00, 8(%rdi)
        movq $0x00, 16(%rdi)
        movq $0x00, 24(%rdi)
        # Set one
        movq $0x01, (%rsp)
        movq $0x00, 8(%rsp)
        movq $0x00, 16(%rsp)
        movq $0x00, 24(%rsp)
        movq 0+L_curve25519_base_x64_x2(%rip), %rcx
        movq 8+L_curve25519_base_x64_x2(%rip), %r8
        movq
16+L_curve25519_base_x64_x2(%rip), %r9 movq 24+L_curve25519_base_x64_x2(%rip), %r10 # Set one movq $0x01, 32(%rsp) movq $0x00, 40(%rsp) movq $0x00, 48(%rsp) movq $0x00, 56(%rsp) movq %rcx, 64(%rsp) movq %r8, 72(%rsp) movq %r9, 80(%rsp) movq %r10, 88(%rsp) movq $0xfd, %rbp L_curve25519_base_x64_bits: movq %rbp, %r8 movq %rbp, %rcx andq $63, %rcx shrq $6, %r8 movq (%rsi,%r8,8), %rbx shrq %cl, %rbx andq $0x01, %rbx xorq %rbx, %r15 negq %r15 # Conditional Swap movq (%rdi), %rcx movq 8(%rdi), %r8 movq 16(%rdi), %r9 movq 24(%rdi), %r10 movq (%rsp), %r11 movq 8(%rsp), %r12 movq 16(%rsp), %r13 movq 24(%rsp), %r14 xorq 64(%rsp), %rcx xorq 72(%rsp), %r8 xorq 80(%rsp), %r9 xorq 88(%rsp), %r10 xorq 32(%rsp), %r11 xorq 40(%rsp), %r12 xorq 48(%rsp), %r13 xorq 56(%rsp), %r14 andq %r15, %rcx andq %r15, %r8 andq %r15, %r9 andq %r15, %r10 andq %r15, %r11 andq %r15, %r12 andq %r15, %r13 andq %r15, %r14 xorq %rcx, (%rdi) xorq %r8, 8(%rdi) xorq %r9, 16(%rdi) xorq %r10, 24(%rdi) xorq %r11, (%rsp) xorq %r12, 8(%rsp) xorq %r13, 16(%rsp) xorq %r14, 24(%rsp) xorq %rcx, 64(%rsp) xorq %r8, 72(%rsp) xorq %r9, 80(%rsp) xorq %r10, 88(%rsp) xorq %r11, 32(%rsp) xorq %r12, 40(%rsp) xorq %r13, 48(%rsp) xorq %r14, 56(%rsp) movq %rbx, %r15 # Add-Sub # Add movq (%rdi), %rcx movq 8(%rdi), %r8 movq 16(%rdi), %r9 movq 24(%rdi), %r10 movq %rcx, %r11 addq (%rsp), %rcx movq %r8, %r12 adcq 8(%rsp), %r8 movq %r9, %r13 adcq 16(%rsp), %r9 movq %r10, %r14 adcq 24(%rsp), %r10 movq $0x00, %rbx adcq $0x00, %rbx shldq $0x01, %r10, %rbx imulq $19, %rbx btr $63, %r10 # Sub modulus (if overflow) addq %rbx, %rcx adcq $0x00, %r8 adcq $0x00, %r9 adcq $0x00, %r10 # Sub subq (%rsp), %r11 sbbq 8(%rsp), %r12 sbbq 16(%rsp), %r13 sbbq 24(%rsp), %r14 sbbq %rbx, %rbx shldq $0x01, %r14, %rbx imulq $-19, %rbx btr $63, %r14 # Add modulus (if underflow) subq %rbx, %r11 sbbq $0x00, %r12 sbbq $0x00, %r13 sbbq $0x00, %r14 movq %rcx, (%rdi) movq %r8, 8(%rdi) movq %r9, 16(%rdi) movq %r10, 24(%rdi) movq %r11, 128(%rsp) movq %r12, 136(%rsp) 
movq %r13, 144(%rsp) movq %r14, 152(%rsp) # Add-Sub # Add movq 64(%rsp), %rcx movq 72(%rsp), %r8 movq 80(%rsp), %r9 movq 88(%rsp), %r10 movq %rcx, %r11 addq 32(%rsp), %rcx movq %r8, %r12 adcq 40(%rsp), %r8 movq %r9, %r13 adcq 48(%rsp), %r9 movq %r10, %r14 adcq 56(%rsp), %r10 movq $0x00, %rbx adcq $0x00, %rbx shldq $0x01, %r10, %rbx imulq $19, %rbx btr $63, %r10 # Sub modulus (if overflow) addq %rbx, %rcx adcq $0x00, %r8 adcq $0x00, %r9 adcq $0x00, %r10 # Sub subq 32(%rsp), %r11 sbbq 40(%rsp), %r12 sbbq 48(%rsp), %r13 sbbq 56(%rsp), %r14 sbbq %rbx, %rbx shldq $0x01, %r14, %rbx imulq $-19, %rbx btr $63, %r14 # Add modulus (if underflow) subq %rbx, %r11 sbbq $0x00, %r12 sbbq $0x00, %r13 sbbq $0x00, %r14 movq %rcx, 32(%rsp) movq %r8, 40(%rsp) movq %r9, 48(%rsp) movq %r10, 56(%rsp) movq %r11, 96(%rsp) movq %r12, 104(%rsp) movq %r13, 112(%rsp) movq %r14, 120(%rsp) # Multiply # A[0] * B[0] movq 128(%rsp), %rax mulq 32(%rsp) movq %rax, %rcx movq %rdx, %r8 # A[0] * B[1] movq 136(%rsp), %rax mulq 32(%rsp) xorq %r9, %r9 addq %rax, %r8 adcq %rdx, %r9 # A[1] * B[0] movq 128(%rsp), %rax mulq 40(%rsp) xorq %r10, %r10 addq %rax, %r8 adcq %rdx, %r9 adcq $0x00, %r10 # A[0] * B[2] movq 144(%rsp), %rax mulq 32(%rsp) addq %rax, %r9 adcq %rdx, %r10 # A[1] * B[1] movq 136(%rsp), %rax mulq 40(%rsp) xorq %r11, %r11 addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[2] * B[0] movq 128(%rsp), %rax mulq 48(%rsp) addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[0] * B[3] movq 152(%rsp), %rax mulq 32(%rsp) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[1] * B[2] movq 144(%rsp), %rax mulq 40(%rsp) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[2] * B[1] movq 136(%rsp), %rax mulq 48(%rsp) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[3] * B[0] movq 128(%rsp), %rax mulq 56(%rsp) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[1] * B[3] movq 152(%rsp), %rax mulq 40(%rsp) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[2] movq 
144(%rsp), %rax mulq 48(%rsp) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[3] * B[1] movq 136(%rsp), %rax mulq 56(%rsp) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[3] movq 152(%rsp), %rax mulq 48(%rsp) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[2] movq 144(%rsp), %rax mulq 56(%rsp) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[3] movq 152(%rsp), %rax mulq 56(%rsp) addq %rax, %r13 adcq %rdx, %r14 movq $38, %rax mulq %r14 addq %rax, %r10 adcq $0x00, %rdx movq $0x7fffffffffffffff, %rbx shldq $0x01, %r10, %rdx imulq $19, %rdx, %rdx andq %rbx, %r10 movq %rdx, %rbx movq $38, %rax mulq %r11 xorq %r11, %r11 addq %rax, %rcx movq $38, %rax adcq %rdx, %r11 mulq %r12 xorq %r12, %r12 addq %rax, %r8 movq $38, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 adcq %rdx, %r13 addq %rbx, %rcx adcq %r11, %r8 adcq %r12, %r9 adcq %r13, %r10 # Store movq %rcx, 32(%rsp) movq %r8, 40(%rsp) movq %r9, 48(%rsp) movq %r10, 56(%rsp) # Multiply # A[0] * B[0] movq (%rdi), %rax mulq 96(%rsp) movq %rax, %rcx movq %rdx, %r8 # A[0] * B[1] movq 8(%rdi), %rax mulq 96(%rsp) xorq %r9, %r9 addq %rax, %r8 adcq %rdx, %r9 # A[1] * B[0] movq (%rdi), %rax mulq 104(%rsp) xorq %r10, %r10 addq %rax, %r8 adcq %rdx, %r9 adcq $0x00, %r10 # A[0] * B[2] movq 16(%rdi), %rax mulq 96(%rsp) addq %rax, %r9 adcq %rdx, %r10 # A[1] * B[1] movq 8(%rdi), %rax mulq 104(%rsp) xorq %r11, %r11 addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[2] * B[0] movq (%rdi), %rax mulq 112(%rsp) addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[0] * B[3] movq 24(%rdi), %rax mulq 96(%rsp) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[1] * B[2] movq 16(%rdi), %rax mulq 104(%rsp) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[2] * B[1] movq 8(%rdi), %rax mulq 112(%rsp) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[3] * B[0] movq (%rdi), %rax mulq 120(%rsp) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[1] * B[3] movq 
24(%rdi), %rax mulq 104(%rsp) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[2] movq 16(%rdi), %rax mulq 112(%rsp) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[3] * B[1] movq 8(%rdi), %rax mulq 120(%rsp) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[3] movq 24(%rdi), %rax mulq 112(%rsp) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[2] movq 16(%rdi), %rax mulq 120(%rsp) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[3] movq 24(%rdi), %rax mulq 120(%rsp) addq %rax, %r13 adcq %rdx, %r14 movq $38, %rax mulq %r14 addq %rax, %r10 adcq $0x00, %rdx movq $0x7fffffffffffffff, %rbx shldq $0x01, %r10, %rdx imulq $19, %rdx, %rdx andq %rbx, %r10 movq %rdx, %rbx movq $38, %rax mulq %r11 xorq %r11, %r11 addq %rax, %rcx movq $38, %rax adcq %rdx, %r11 mulq %r12 xorq %r12, %r12 addq %rax, %r8 movq $38, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 adcq %rdx, %r13 addq %rbx, %rcx adcq %r11, %r8 adcq %r12, %r9 adcq %r13, %r10 # Store movq %rcx, (%rsp) movq %r8, 8(%rsp) movq %r9, 16(%rsp) movq %r10, 24(%rsp) # Square # A[0] * A[1] movq 128(%rsp), %rax mulq 136(%rsp) movq %rax, %r8 movq %rdx, %r9 # A[0] * A[2] movq 128(%rsp), %rax mulq 144(%rsp) xorq %r10, %r10 addq %rax, %r9 adcq %rdx, %r10 # A[0] * A[3] movq 128(%rsp), %rax mulq 152(%rsp) xorq %r11, %r11 addq %rax, %r10 adcq %rdx, %r11 # A[1] * A[2] movq 136(%rsp), %rax mulq 144(%rsp) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[1] * A[3] movq 136(%rsp), %rax mulq 152(%rsp) addq %rax, %r11 adcq %rdx, %r12 # A[2] * A[3] movq 144(%rsp), %rax mulq 152(%rsp) xorq %r13, %r13 addq %rax, %r12 adcq %rdx, %r13 # Double xorq %r14, %r14 addq %r8, %r8 adcq %r9, %r9 adcq %r10, %r10 adcq %r11, %r11 adcq %r12, %r12 adcq %r13, %r13 adcq $0x00, %r14 # A[0] * A[0] movq 128(%rsp), %rax mulq %rax movq %rax, %rcx movq %rdx, %rbx # A[1] * A[1] movq 136(%rsp), %rax mulq %rax addq %rbx, %r8 adcq %rax, %r9 adcq $0x00, %rdx movq 
%rdx, %rbx # A[2] * A[2] movq 144(%rsp), %rax mulq %rax addq %rbx, %r10 adcq %rax, %r11 adcq $0x00, %rdx movq %rdx, %rbx # A[3] * A[3] movq 152(%rsp), %rax mulq %rax addq %rax, %r13 adcq %rdx, %r14 addq %rbx, %r12 adcq $0x00, %r13 adcq $0x00, %r14 movq $38, %rax mulq %r14 addq %rax, %r10 adcq $0x00, %rdx movq $0x7fffffffffffffff, %rbx shldq $0x01, %r10, %rdx imulq $19, %rdx, %rdx andq %rbx, %r10 movq %rdx, %rbx movq $38, %rax mulq %r11 xorq %r11, %r11 addq %rax, %rcx movq $38, %rax adcq %rdx, %r11 mulq %r12 xorq %r12, %r12 addq %rax, %r8 movq $38, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 adcq %rdx, %r13 addq %rbx, %rcx adcq %r11, %r8 adcq %r12, %r9 adcq %r13, %r10 # Store movq %rcx, 96(%rsp) movq %r8, 104(%rsp) movq %r9, 112(%rsp) movq %r10, 120(%rsp) # Square # A[0] * A[1] movq (%rdi), %rax mulq 8(%rdi) movq %rax, %r8 movq %rdx, %r9 # A[0] * A[2] movq (%rdi), %rax mulq 16(%rdi) xorq %r10, %r10 addq %rax, %r9 adcq %rdx, %r10 # A[0] * A[3] movq (%rdi), %rax mulq 24(%rdi) xorq %r11, %r11 addq %rax, %r10 adcq %rdx, %r11 # A[1] * A[2] movq 8(%rdi), %rax mulq 16(%rdi) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[1] * A[3] movq 8(%rdi), %rax mulq 24(%rdi) addq %rax, %r11 adcq %rdx, %r12 # A[2] * A[3] movq 16(%rdi), %rax mulq 24(%rdi) xorq %r13, %r13 addq %rax, %r12 adcq %rdx, %r13 # Double xorq %r14, %r14 addq %r8, %r8 adcq %r9, %r9 adcq %r10, %r10 adcq %r11, %r11 adcq %r12, %r12 adcq %r13, %r13 adcq $0x00, %r14 # A[0] * A[0] movq (%rdi), %rax mulq %rax movq %rax, %rcx movq %rdx, %rbx # A[1] * A[1] movq 8(%rdi), %rax mulq %rax addq %rbx, %r8 adcq %rax, %r9 adcq $0x00, %rdx movq %rdx, %rbx # A[2] * A[2] movq 16(%rdi), %rax mulq %rax addq %rbx, %r10 adcq %rax, %r11 adcq $0x00, %rdx movq %rdx, %rbx # A[3] * A[3] movq 24(%rdi), %rax mulq %rax addq %rax, %r13 adcq %rdx, %r14 addq %rbx, %r12 adcq $0x00, %r13 adcq $0x00, %r14 movq $38, %rax mulq %r14 addq %rax, %r10 adcq $0x00, %rdx movq $0x7fffffffffffffff, %rbx shldq $0x01, %r10, 
%rdx imulq $19, %rdx, %rdx andq %rbx, %r10 movq %rdx, %rbx movq $38, %rax mulq %r11 xorq %r11, %r11 addq %rax, %rcx movq $38, %rax adcq %rdx, %r11 mulq %r12 xorq %r12, %r12 addq %rax, %r8 movq $38, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 adcq %rdx, %r13 addq %rbx, %rcx adcq %r11, %r8 adcq %r12, %r9 adcq %r13, %r10 # Store movq %rcx, 128(%rsp) movq %r8, 136(%rsp) movq %r9, 144(%rsp) movq %r10, 152(%rsp) # Add-Sub # Add movq (%rsp), %rcx movq 8(%rsp), %r8 movq 16(%rsp), %r9 movq 24(%rsp), %r10 movq %rcx, %r11 addq 32(%rsp), %rcx movq %r8, %r12 adcq 40(%rsp), %r8 movq %r9, %r13 adcq 48(%rsp), %r9 movq %r10, %r14 adcq 56(%rsp), %r10 movq $0x00, %rbx adcq $0x00, %rbx shldq $0x01, %r10, %rbx imulq $19, %rbx btr $63, %r10 # Sub modulus (if overflow) addq %rbx, %rcx adcq $0x00, %r8 adcq $0x00, %r9 adcq $0x00, %r10 # Sub subq 32(%rsp), %r11 sbbq 40(%rsp), %r12 sbbq 48(%rsp), %r13 sbbq 56(%rsp), %r14 sbbq %rbx, %rbx shldq $0x01, %r14, %rbx imulq $-19, %rbx btr $63, %r14 # Add modulus (if underflow) subq %rbx, %r11 sbbq $0x00, %r12 sbbq $0x00, %r13 sbbq $0x00, %r14 movq %rcx, 64(%rsp) movq %r8, 72(%rsp) movq %r9, 80(%rsp) movq %r10, 88(%rsp) movq %r11, 32(%rsp) movq %r12, 40(%rsp) movq %r13, 48(%rsp) movq %r14, 56(%rsp) # Multiply # A[0] * B[0] movq 96(%rsp), %rax mulq 128(%rsp) movq %rax, %rcx movq %rdx, %r8 # A[0] * B[1] movq 104(%rsp), %rax mulq 128(%rsp) xorq %r9, %r9 addq %rax, %r8 adcq %rdx, %r9 # A[1] * B[0] movq 96(%rsp), %rax mulq 136(%rsp) xorq %r10, %r10 addq %rax, %r8 adcq %rdx, %r9 adcq $0x00, %r10 # A[0] * B[2] movq 112(%rsp), %rax mulq 128(%rsp) addq %rax, %r9 adcq %rdx, %r10 # A[1] * B[1] movq 104(%rsp), %rax mulq 136(%rsp) xorq %r11, %r11 addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[2] * B[0] movq 96(%rsp), %rax mulq 144(%rsp) addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[0] * B[3] movq 120(%rsp), %rax mulq 128(%rsp) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[1] * B[2] movq 112(%rsp), %rax mulq 
136(%rsp) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[2] * B[1] movq 104(%rsp), %rax mulq 144(%rsp) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[3] * B[0] movq 96(%rsp), %rax mulq 152(%rsp) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[1] * B[3] movq 120(%rsp), %rax mulq 136(%rsp) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[2] movq 112(%rsp), %rax mulq 144(%rsp) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[3] * B[1] movq 104(%rsp), %rax mulq 152(%rsp) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[3] movq 120(%rsp), %rax mulq 144(%rsp) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[2] movq 112(%rsp), %rax mulq 152(%rsp) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[3] movq 120(%rsp), %rax mulq 152(%rsp) addq %rax, %r13 adcq %rdx, %r14 movq $38, %rax mulq %r14 addq %rax, %r10 adcq $0x00, %rdx movq $0x7fffffffffffffff, %rbx shldq $0x01, %r10, %rdx imulq $19, %rdx, %rdx andq %rbx, %r10 movq %rdx, %rbx movq $38, %rax mulq %r11 xorq %r11, %r11 addq %rax, %rcx movq $38, %rax adcq %rdx, %r11 mulq %r12 xorq %r12, %r12 addq %rax, %r8 movq $38, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 adcq %rdx, %r13 addq %rbx, %rcx adcq %r11, %r8 adcq %r12, %r9 adcq %r13, %r10 # Store movq %rcx, (%rdi) movq %r8, 8(%rdi) movq %r9, 16(%rdi) movq %r10, 24(%rdi) # Sub movq 128(%rsp), %rcx movq 136(%rsp), %r8 movq 144(%rsp), %r9 movq 152(%rsp), %r10 subq 96(%rsp), %rcx sbbq 104(%rsp), %r8 sbbq 112(%rsp), %r9 sbbq 120(%rsp), %r10 sbbq %rbx, %rbx shldq $0x01, %r10, %rbx imulq $-19, %rbx btr $63, %r10 # Add modulus (if underflow) subq %rbx, %rcx sbbq $0x00, %r8 sbbq $0x00, %r9 sbbq $0x00, %r10 movq %rcx, 128(%rsp) movq %r8, 136(%rsp) movq %r9, 144(%rsp) movq %r10, 152(%rsp) # Square # A[0] * A[1] movq 32(%rsp), %rax mulq 40(%rsp) movq %rax, %r8 movq %rdx, %r9 # A[0] * A[2] movq 32(%rsp), %rax mulq 48(%rsp) xorq %r10, %r10 addq %rax, %r9 adcq %rdx, %r10 # A[0] 
* A[3] movq 32(%rsp), %rax mulq 56(%rsp) xorq %r11, %r11 addq %rax, %r10 adcq %rdx, %r11 # A[1] * A[2] movq 40(%rsp), %rax mulq 48(%rsp) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[1] * A[3] movq 40(%rsp), %rax mulq 56(%rsp) addq %rax, %r11 adcq %rdx, %r12 # A[2] * A[3] movq 48(%rsp), %rax mulq 56(%rsp) xorq %r13, %r13 addq %rax, %r12 adcq %rdx, %r13 # Double xorq %r14, %r14 addq %r8, %r8 adcq %r9, %r9 adcq %r10, %r10 adcq %r11, %r11 adcq %r12, %r12 adcq %r13, %r13 adcq $0x00, %r14 # A[0] * A[0] movq 32(%rsp), %rax mulq %rax movq %rax, %rcx movq %rdx, %rbx # A[1] * A[1] movq 40(%rsp), %rax mulq %rax addq %rbx, %r8 adcq %rax, %r9 adcq $0x00, %rdx movq %rdx, %rbx # A[2] * A[2] movq 48(%rsp), %rax mulq %rax addq %rbx, %r10 adcq %rax, %r11 adcq $0x00, %rdx movq %rdx, %rbx # A[3] * A[3] movq 56(%rsp), %rax mulq %rax addq %rax, %r13 adcq %rdx, %r14 addq %rbx, %r12 adcq $0x00, %r13 adcq $0x00, %r14 movq $38, %rax mulq %r14 addq %rax, %r10 adcq $0x00, %rdx movq $0x7fffffffffffffff, %rbx shldq $0x01, %r10, %rdx imulq $19, %rdx, %rdx andq %rbx, %r10 movq %rdx, %rbx movq $38, %rax mulq %r11 xorq %r11, %r11 addq %rax, %rcx movq $38, %rax adcq %rdx, %r11 mulq %r12 xorq %r12, %r12 addq %rax, %r8 movq $38, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 adcq %rdx, %r13 addq %rbx, %rcx adcq %r11, %r8 adcq %r12, %r9 adcq %r13, %r10 # Store movq %rcx, 32(%rsp) movq %r8, 40(%rsp) movq %r9, 48(%rsp) movq %r10, 56(%rsp) # Square # A[0] * A[1] movq 64(%rsp), %rax mulq 72(%rsp) movq %rax, %r8 movq %rdx, %r9 # A[0] * A[2] movq 64(%rsp), %rax mulq 80(%rsp) xorq %r10, %r10 addq %rax, %r9 adcq %rdx, %r10 # A[0] * A[3] movq 64(%rsp), %rax mulq 88(%rsp) xorq %r11, %r11 addq %rax, %r10 adcq %rdx, %r11 # A[1] * A[2] movq 72(%rsp), %rax mulq 80(%rsp) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[1] * A[3] movq 72(%rsp), %rax mulq 88(%rsp) addq %rax, %r11 adcq %rdx, %r12 # A[2] * A[3] movq 80(%rsp), %rax mulq 88(%rsp) xorq %r13, %r13 
addq %rax, %r12 adcq %rdx, %r13 # Double xorq %r14, %r14 addq %r8, %r8 adcq %r9, %r9 adcq %r10, %r10 adcq %r11, %r11 adcq %r12, %r12 adcq %r13, %r13 adcq $0x00, %r14 # A[0] * A[0] movq 64(%rsp), %rax mulq %rax movq %rax, %rcx movq %rdx, %rbx # A[1] * A[1] movq 72(%rsp), %rax mulq %rax addq %rbx, %r8 adcq %rax, %r9 adcq $0x00, %rdx movq %rdx, %rbx # A[2] * A[2] movq 80(%rsp), %rax mulq %rax addq %rbx, %r10 adcq %rax, %r11 adcq $0x00, %rdx movq %rdx, %rbx # A[3] * A[3] movq 88(%rsp), %rax mulq %rax addq %rax, %r13 adcq %rdx, %r14 addq %rbx, %r12 adcq $0x00, %r13 adcq $0x00, %r14 movq $38, %rax mulq %r14 addq %rax, %r10 adcq $0x00, %rdx movq $0x7fffffffffffffff, %rbx shldq $0x01, %r10, %rdx imulq $19, %rdx, %rdx andq %rbx, %r10 movq %rdx, %rbx movq $38, %rax mulq %r11 xorq %r11, %r11 addq %rax, %rcx movq $38, %rax adcq %rdx, %r11 mulq %r12 xorq %r12, %r12 addq %rax, %r8 movq $38, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 adcq %rdx, %r13 addq %rbx, %rcx adcq %r11, %r8 adcq %r12, %r9 adcq %r13, %r10 # Store movq %rcx, 64(%rsp) movq %r8, 72(%rsp) movq %r9, 80(%rsp) movq %r10, 88(%rsp) # Multiply by 121666 movq $0x1db42, %rax mulq 128(%rsp) xorq %r9, %r9 movq %rax, %rcx movq %rdx, %r8 movq $0x1db42, %rax mulq 136(%rsp) xorq %r10, %r10 addq %rax, %r8 adcq %rdx, %r9 movq $0x1db42, %rax mulq 144(%rsp) xorq %r12, %r12 addq %rax, %r9 adcq %rdx, %r10 movq $0x1db42, %rax mulq 152(%rsp) movq $0x7fffffffffffffff, %r11 addq %rax, %r10 adcq %rdx, %r12 addq 96(%rsp), %rcx adcq 104(%rsp), %r8 adcq 112(%rsp), %r9 adcq 120(%rsp), %r10 adcq $0x00, %r12 shldq $0x01, %r10, %r12 andq %r11, %r10 movq $19, %rax mulq %r12 addq %rax, %rcx adcq $0x00, %r8 adcq $0x00, %r9 adcq $0x00, %r10 movq %rcx, 96(%rsp) movq %r8, 104(%rsp) movq %r9, 112(%rsp) movq %r10, 120(%rsp) # Multiply by 9 movq $9, %rax mulq 32(%rsp) xorq %r9, %r9 movq %rax, %rcx movq %rdx, %r8 movq $9, %rax mulq 40(%rsp) xorq %r10, %r10 addq %rax, %r8 adcq %rdx, %r9 movq $9, %rax mulq 48(%rsp) xorq %r12, %r12 addq 
%rax, %r9 adcq %rdx, %r10 movq $9, %rax mulq 56(%rsp) movq $0x7fffffffffffffff, %r11 addq %rax, %r10 adcq %rdx, %r12 shldq $0x01, %r10, %r12 andq %r11, %r10 movq $19, %rax mulq %r12 addq %rax, %rcx adcq $0x00, %r8 adcq $0x00, %r9 adcq $0x00, %r10 movq %rcx, 32(%rsp) movq %r8, 40(%rsp) movq %r9, 48(%rsp) movq %r10, 56(%rsp) # Multiply # A[0] * B[0] movq 96(%rsp), %rax mulq 128(%rsp) movq %rax, %rcx movq %rdx, %r8 # A[0] * B[1] movq 104(%rsp), %rax mulq 128(%rsp) xorq %r9, %r9 addq %rax, %r8 adcq %rdx, %r9 # A[1] * B[0] movq 96(%rsp), %rax mulq 136(%rsp) xorq %r10, %r10 addq %rax, %r8 adcq %rdx, %r9 adcq $0x00, %r10 # A[0] * B[2] movq 112(%rsp), %rax mulq 128(%rsp) addq %rax, %r9 adcq %rdx, %r10 # A[1] * B[1] movq 104(%rsp), %rax mulq 136(%rsp) xorq %r11, %r11 addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[2] * B[0] movq 96(%rsp), %rax mulq 144(%rsp) addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[0] * B[3] movq 120(%rsp), %rax mulq 128(%rsp) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[1] * B[2] movq 112(%rsp), %rax mulq 136(%rsp) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[2] * B[1] movq 104(%rsp), %rax mulq 144(%rsp) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[3] * B[0] movq 96(%rsp), %rax mulq 152(%rsp) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[1] * B[3] movq 120(%rsp), %rax mulq 136(%rsp) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[2] movq 112(%rsp), %rax mulq 144(%rsp) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[3] * B[1] movq 104(%rsp), %rax mulq 152(%rsp) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[3] movq 120(%rsp), %rax mulq 144(%rsp) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[2] movq 112(%rsp), %rax mulq 152(%rsp) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[3] movq 120(%rsp), %rax mulq 152(%rsp) addq %rax, %r13 adcq %rdx, %r14 movq $38, %rax mulq %r14 addq %rax, %r10 adcq $0x00, %rdx movq 
$0x7fffffffffffffff, %rbx shldq $0x01, %r10, %rdx imulq $19, %rdx, %rdx andq %rbx, %r10 movq %rdx, %rbx movq $38, %rax mulq %r11 xorq %r11, %r11 addq %rax, %rcx movq $38, %rax adcq %rdx, %r11 mulq %r12 xorq %r12, %r12 addq %rax, %r8 movq $38, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 adcq %rdx, %r13 addq %rbx, %rcx adcq %r11, %r8 adcq %r12, %r9 adcq %r13, %r10 # Store movq %rcx, (%rsp) movq %r8, 8(%rsp) movq %r9, 16(%rsp) movq %r10, 24(%rsp) decq %rbp cmpq $3, %rbp jge L_curve25519_base_x64_bits negq %r15 # Conditional Swap movq (%rdi), %rcx movq 8(%rdi), %r8 movq 16(%rdi), %r9 movq 24(%rdi), %r10 movq (%rsp), %r11 movq 8(%rsp), %r12 movq 16(%rsp), %r13 movq 24(%rsp), %r14 xorq 64(%rsp), %rcx xorq 72(%rsp), %r8 xorq 80(%rsp), %r9 xorq 88(%rsp), %r10 xorq 32(%rsp), %r11 xorq 40(%rsp), %r12 xorq 48(%rsp), %r13 xorq 56(%rsp), %r14 andq %r15, %rcx andq %r15, %r8 andq %r15, %r9 andq %r15, %r10 andq %r15, %r11 andq %r15, %r12 andq %r15, %r13 andq %r15, %r14 xorq %rcx, (%rdi) xorq %r8, 8(%rdi) xorq %r9, 16(%rdi) xorq %r10, 24(%rdi) xorq %r11, (%rsp) xorq %r12, 8(%rsp) xorq %r13, 16(%rsp) xorq %r14, 24(%rsp) xorq %rcx, 64(%rsp) xorq %r8, 72(%rsp) xorq %r9, 80(%rsp) xorq %r10, 88(%rsp) xorq %r11, 32(%rsp) xorq %r12, 40(%rsp) xorq %r13, 48(%rsp) xorq %r14, 56(%rsp) L_curve25519_base_x64_3: # Add-Sub # Add movq (%rdi), %rcx movq 8(%rdi), %r8 movq 16(%rdi), %r9 movq 24(%rdi), %r10 movq %rcx, %r11 addq (%rsp), %rcx movq %r8, %r12 adcq 8(%rsp), %r8 movq %r9, %r13 adcq 16(%rsp), %r9 movq %r10, %r14 adcq 24(%rsp), %r10 movq $0x00, %rbx adcq $0x00, %rbx shldq $0x01, %r10, %rbx imulq $19, %rbx btr $63, %r10 # Sub modulus (if overflow) addq %rbx, %rcx adcq $0x00, %r8 adcq $0x00, %r9 adcq $0x00, %r10 # Sub subq (%rsp), %r11 sbbq 8(%rsp), %r12 sbbq 16(%rsp), %r13 sbbq 24(%rsp), %r14 sbbq %rbx, %rbx shldq $0x01, %r14, %rbx imulq $-19, %rbx btr $63, %r14 # Add modulus (if underflow) subq %rbx, %r11 sbbq $0x00, %r12 sbbq $0x00, %r13 sbbq $0x00, %r14 movq %rcx, (%rdi) 
movq %r8, 8(%rdi) movq %r9, 16(%rdi) movq %r10, 24(%rdi) movq %r11, 128(%rsp) movq %r12, 136(%rsp) movq %r13, 144(%rsp) movq %r14, 152(%rsp) # Square # A[0] * A[1] movq 128(%rsp), %rax mulq 136(%rsp) movq %rax, %r8 movq %rdx, %r9 # A[0] * A[2] movq 128(%rsp), %rax mulq 144(%rsp) xorq %r10, %r10 addq %rax, %r9 adcq %rdx, %r10 # A[0] * A[3] movq 128(%rsp), %rax mulq 152(%rsp) xorq %r11, %r11 addq %rax, %r10 adcq %rdx, %r11 # A[1] * A[2] movq 136(%rsp), %rax mulq 144(%rsp) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[1] * A[3] movq 136(%rsp), %rax mulq 152(%rsp) addq %rax, %r11 adcq %rdx, %r12 # A[2] * A[3] movq 144(%rsp), %rax mulq 152(%rsp) xorq %r13, %r13 addq %rax, %r12 adcq %rdx, %r13 # Double xorq %r14, %r14 addq %r8, %r8 adcq %r9, %r9 adcq %r10, %r10 adcq %r11, %r11 adcq %r12, %r12 adcq %r13, %r13 adcq $0x00, %r14 # A[0] * A[0] movq 128(%rsp), %rax mulq %rax movq %rax, %rcx movq %rdx, %rbx # A[1] * A[1] movq 136(%rsp), %rax mulq %rax addq %rbx, %r8 adcq %rax, %r9 adcq $0x00, %rdx movq %rdx, %rbx # A[2] * A[2] movq 144(%rsp), %rax mulq %rax addq %rbx, %r10 adcq %rax, %r11 adcq $0x00, %rdx movq %rdx, %rbx # A[3] * A[3] movq 152(%rsp), %rax mulq %rax addq %rax, %r13 adcq %rdx, %r14 addq %rbx, %r12 adcq $0x00, %r13 adcq $0x00, %r14 movq $38, %rax mulq %r14 addq %rax, %r10 adcq $0x00, %rdx movq $0x7fffffffffffffff, %rbx shldq $0x01, %r10, %rdx imulq $19, %rdx, %rdx andq %rbx, %r10 movq %rdx, %rbx movq $38, %rax mulq %r11 xorq %r11, %r11 addq %rax, %rcx movq $38, %rax adcq %rdx, %r11 mulq %r12 xorq %r12, %r12 addq %rax, %r8 movq $38, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 adcq %rdx, %r13 addq %rbx, %rcx adcq %r11, %r8 adcq %r12, %r9 adcq %r13, %r10 # Store movq %rcx, 96(%rsp) movq %r8, 104(%rsp) movq %r9, 112(%rsp) movq %r10, 120(%rsp) # Square # A[0] * A[1] movq (%rdi), %rax mulq 8(%rdi) movq %rax, %r8 movq %rdx, %r9 # A[0] * A[2] movq (%rdi), %rax mulq 16(%rdi) xorq %r10, %r10 addq %rax, %r9 adcq %rdx, %r10 # A[0] * 
A[3] movq (%rdi), %rax mulq 24(%rdi) xorq %r11, %r11 addq %rax, %r10 adcq %rdx, %r11 # A[1] * A[2] movq 8(%rdi), %rax mulq 16(%rdi) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[1] * A[3] movq 8(%rdi), %rax mulq 24(%rdi) addq %rax, %r11 adcq %rdx, %r12 # A[2] * A[3] movq 16(%rdi), %rax mulq 24(%rdi) xorq %r13, %r13 addq %rax, %r12 adcq %rdx, %r13 # Double xorq %r14, %r14 addq %r8, %r8 adcq %r9, %r9 adcq %r10, %r10 adcq %r11, %r11 adcq %r12, %r12 adcq %r13, %r13 adcq $0x00, %r14 # A[0] * A[0] movq (%rdi), %rax mulq %rax movq %rax, %rcx movq %rdx, %rbx # A[1] * A[1] movq 8(%rdi), %rax mulq %rax addq %rbx, %r8 adcq %rax, %r9 adcq $0x00, %rdx movq %rdx, %rbx # A[2] * A[2] movq 16(%rdi), %rax mulq %rax addq %rbx, %r10 adcq %rax, %r11 adcq $0x00, %rdx movq %rdx, %rbx # A[3] * A[3] movq 24(%rdi), %rax mulq %rax addq %rax, %r13 adcq %rdx, %r14 addq %rbx, %r12 adcq $0x00, %r13 adcq $0x00, %r14 movq $38, %rax mulq %r14 addq %rax, %r10 adcq $0x00, %rdx movq $0x7fffffffffffffff, %rbx shldq $0x01, %r10, %rdx imulq $19, %rdx, %rdx andq %rbx, %r10 movq %rdx, %rbx movq $38, %rax mulq %r11 xorq %r11, %r11 addq %rax, %rcx movq $38, %rax adcq %rdx, %r11 mulq %r12 xorq %r12, %r12 addq %rax, %r8 movq $38, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 adcq %rdx, %r13 addq %rbx, %rcx adcq %r11, %r8 adcq %r12, %r9 adcq %r13, %r10 # Store movq %rcx, 128(%rsp) movq %r8, 136(%rsp) movq %r9, 144(%rsp) movq %r10, 152(%rsp) # Multiply # A[0] * B[0] movq 96(%rsp), %rax mulq 128(%rsp) movq %rax, %rcx movq %rdx, %r8 # A[0] * B[1] movq 104(%rsp), %rax mulq 128(%rsp) xorq %r9, %r9 addq %rax, %r8 adcq %rdx, %r9 # A[1] * B[0] movq 96(%rsp), %rax mulq 136(%rsp) xorq %r10, %r10 addq %rax, %r8 adcq %rdx, %r9 adcq $0x00, %r10 # A[0] * B[2] movq 112(%rsp), %rax mulq 128(%rsp) addq %rax, %r9 adcq %rdx, %r10 # A[1] * B[1] movq 104(%rsp), %rax mulq 136(%rsp) xorq %r11, %r11 addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[2] * B[0] movq 96(%rsp), %rax mulq 144(%rsp) 
addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[0] * B[3] movq 120(%rsp), %rax mulq 128(%rsp) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[1] * B[2] movq 112(%rsp), %rax mulq 136(%rsp) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[2] * B[1] movq 104(%rsp), %rax mulq 144(%rsp) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[3] * B[0] movq 96(%rsp), %rax mulq 152(%rsp) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[1] * B[3] movq 120(%rsp), %rax mulq 136(%rsp) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[2] movq 112(%rsp), %rax mulq 144(%rsp) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[3] * B[1] movq 104(%rsp), %rax mulq 152(%rsp) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[3] movq 120(%rsp), %rax mulq 144(%rsp) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[2] movq 112(%rsp), %rax mulq 152(%rsp) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[3] movq 120(%rsp), %rax mulq 152(%rsp) addq %rax, %r13 adcq %rdx, %r14 movq $38, %rax mulq %r14 addq %rax, %r10 adcq $0x00, %rdx movq $0x7fffffffffffffff, %rbx shldq $0x01, %r10, %rdx imulq $19, %rdx, %rdx andq %rbx, %r10 movq %rdx, %rbx movq $38, %rax mulq %r11 xorq %r11, %r11 addq %rax, %rcx movq $38, %rax adcq %rdx, %r11 mulq %r12 xorq %r12, %r12 addq %rax, %r8 movq $38, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 adcq %rdx, %r13 addq %rbx, %rcx adcq %r11, %r8 adcq %r12, %r9 adcq %r13, %r10 # Store movq %rcx, (%rdi) movq %r8, 8(%rdi) movq %r9, 16(%rdi) movq %r10, 24(%rdi) # Sub movq 128(%rsp), %rcx movq 136(%rsp), %r8 movq 144(%rsp), %r9 movq 152(%rsp), %r10 subq 96(%rsp), %rcx sbbq 104(%rsp), %r8 sbbq 112(%rsp), %r9 sbbq 120(%rsp), %r10 sbbq %rbx, %rbx shldq $0x01, %r10, %rbx imulq $-19, %rbx btr $63, %r10 # Add modulus (if underflow) subq %rbx, %rcx sbbq $0x00, %r8 sbbq $0x00, %r9 sbbq $0x00, %r10 movq %rcx, 128(%rsp) movq %r8, 136(%rsp) movq %r9, 144(%rsp) movq 
%r10, 152(%rsp) # Multiply by 121666 movq $0x1db42, %rax mulq 128(%rsp) xorq %r9, %r9 movq %rax, %rcx movq %rdx, %r8 movq $0x1db42, %rax mulq 136(%rsp) xorq %r10, %r10 addq %rax, %r8 adcq %rdx, %r9 movq $0x1db42, %rax mulq 144(%rsp) xorq %r12, %r12 addq %rax, %r9 adcq %rdx, %r10 movq $0x1db42, %rax mulq 152(%rsp) movq $0x7fffffffffffffff, %r11 addq %rax, %r10 adcq %rdx, %r12 addq 96(%rsp), %rcx adcq 104(%rsp), %r8 adcq 112(%rsp), %r9 adcq 120(%rsp), %r10 adcq $0x00, %r12 shldq $0x01, %r10, %r12 andq %r11, %r10 movq $19, %rax mulq %r12 addq %rax, %rcx adcq $0x00, %r8 adcq $0x00, %r9 adcq $0x00, %r10 movq %rcx, 96(%rsp) movq %r8, 104(%rsp) movq %r9, 112(%rsp) movq %r10, 120(%rsp) # Multiply # A[0] * B[0] movq 96(%rsp), %rax mulq 128(%rsp) movq %rax, %rcx movq %rdx, %r8 # A[0] * B[1] movq 104(%rsp), %rax mulq 128(%rsp) xorq %r9, %r9 addq %rax, %r8 adcq %rdx, %r9 # A[1] * B[0] movq 96(%rsp), %rax mulq 136(%rsp) xorq %r10, %r10 addq %rax, %r8 adcq %rdx, %r9 adcq $0x00, %r10 # A[0] * B[2] movq 112(%rsp), %rax mulq 128(%rsp) addq %rax, %r9 adcq %rdx, %r10 # A[1] * B[1] movq 104(%rsp), %rax mulq 136(%rsp) xorq %r11, %r11 addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[2] * B[0] movq 96(%rsp), %rax mulq 144(%rsp) addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[0] * B[3] movq 120(%rsp), %rax mulq 128(%rsp) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[1] * B[2] movq 112(%rsp), %rax mulq 136(%rsp) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[2] * B[1] movq 104(%rsp), %rax mulq 144(%rsp) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[3] * B[0] movq 96(%rsp), %rax mulq 152(%rsp) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[1] * B[3] movq 120(%rsp), %rax mulq 136(%rsp) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[2] movq 112(%rsp), %rax mulq 144(%rsp) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[3] * B[1] movq 104(%rsp), %rax mulq 152(%rsp) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 
# A[2] * B[3] movq 120(%rsp), %rax mulq 144(%rsp) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[2] movq 112(%rsp), %rax mulq 152(%rsp) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[3] movq 120(%rsp), %rax mulq 152(%rsp) addq %rax, %r13 adcq %rdx, %r14 movq $38, %rax mulq %r14 addq %rax, %r10 adcq $0x00, %rdx movq $0x7fffffffffffffff, %rbx shldq $0x01, %r10, %rdx imulq $19, %rdx, %rdx andq %rbx, %r10 movq %rdx, %rbx movq $38, %rax mulq %r11 xorq %r11, %r11 addq %rax, %rcx movq $38, %rax adcq %rdx, %r11 mulq %r12 xorq %r12, %r12 addq %rax, %r8 movq $38, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 adcq %rdx, %r13 addq %rbx, %rcx adcq %r11, %r8 adcq %r12, %r9 adcq %r13, %r10 # Store movq %rcx, (%rsp) movq %r8, 8(%rsp) movq %r9, 16(%rsp) movq %r10, 24(%rsp) decq %rbp jge L_curve25519_base_x64_3 # Invert leaq 32(%rsp), %rdi movq %rsp, %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 32(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 64(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi movq %rsp, %rsi leaq 64(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi leaq 32(%rsp), %rsi leaq 64(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 32(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 64(%rsp), %rsi leaq 96(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 64(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi movq $4, %rdx #ifndef __APPLE__ callq 
fe_sq_n_x64@plt #else callq _fe_sq_n_x64 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 96(%rsp), %rsi leaq 64(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 64(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi movq $9, %rdx #ifndef __APPLE__ callq fe_sq_n_x64@plt #else callq _fe_sq_n_x64 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi leaq 64(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ leaq 128(%rsp), %rdi leaq 96(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 128(%rsp), %rdi leaq 128(%rsp), %rsi movq $19, %rdx #ifndef __APPLE__ callq fe_sq_n_x64@plt #else callq _fe_sq_n_x64 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 128(%rsp), %rsi leaq 96(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi movq $9, %rdx #ifndef __APPLE__ callq fe_sq_n_x64@plt #else callq _fe_sq_n_x64 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 96(%rsp), %rsi leaq 64(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 64(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi movq $49, %rdx #ifndef __APPLE__ callq fe_sq_n_x64@plt #else callq _fe_sq_n_x64 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi leaq 64(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ leaq 128(%rsp), %rdi leaq 96(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 
128(%rsp), %rdi leaq 128(%rsp), %rsi movq $0x63, %rdx #ifndef __APPLE__ callq fe_sq_n_x64@plt #else callq _fe_sq_n_x64 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 128(%rsp), %rsi leaq 96(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi movq $49, %rdx #ifndef __APPLE__ callq fe_sq_n_x64@plt #else callq _fe_sq_n_x64 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 96(%rsp), %rsi leaq 64(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 64(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 64(%rsp), %rsi movq $4, %rdx #ifndef __APPLE__ callq fe_sq_n_x64@plt #else callq _fe_sq_n_x64 #endif /* __APPLE__ */ movq %rsp, %rdi leaq 64(%rsp), %rsi leaq 32(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ movq 160(%rsp), %rdi # Multiply # A[0] * B[0] movq (%rsp), %rax mulq (%rdi) movq %rax, %rcx movq %rdx, %r8 # A[0] * B[1] movq 8(%rsp), %rax mulq (%rdi) xorq %r9, %r9 addq %rax, %r8 adcq %rdx, %r9 # A[1] * B[0] movq (%rsp), %rax mulq 8(%rdi) xorq %r10, %r10 addq %rax, %r8 adcq %rdx, %r9 adcq $0x00, %r10 # A[0] * B[2] movq 16(%rsp), %rax mulq (%rdi) addq %rax, %r9 adcq %rdx, %r10 # A[1] * B[1] movq 8(%rsp), %rax mulq 8(%rdi) xorq %r11, %r11 addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[2] * B[0] movq (%rsp), %rax mulq 16(%rdi) addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[0] * B[3] movq 24(%rsp), %rax mulq (%rdi) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[1] * B[2] movq 16(%rsp), %rax mulq 8(%rdi) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[2] * B[1] movq 8(%rsp), %rax mulq 16(%rdi) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # 
A[3] * B[0] movq (%rsp), %rax mulq 24(%rdi) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[1] * B[3] movq 24(%rsp), %rax mulq 8(%rdi) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[2] movq 16(%rsp), %rax mulq 16(%rdi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[3] * B[1] movq 8(%rsp), %rax mulq 24(%rdi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[3] movq 24(%rsp), %rax mulq 16(%rdi) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[2] movq 16(%rsp), %rax mulq 24(%rdi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[3] movq 24(%rsp), %rax mulq 24(%rdi) addq %rax, %r13 adcq %rdx, %r14 movq $38, %rax mulq %r14 addq %rax, %r10 adcq $0x00, %rdx movq $0x7fffffffffffffff, %rbx shldq $0x01, %r10, %rdx imulq $19, %rdx, %rdx andq %rbx, %r10 movq %rdx, %rbx movq $38, %rax mulq %r11 xorq %r11, %r11 addq %rax, %rcx movq $38, %rax adcq %rdx, %r11 mulq %r12 xorq %r12, %r12 addq %rax, %r8 movq $38, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 adcq %rdx, %r13 addq %rbx, %rcx adcq %r11, %r8 adcq %r12, %r9 adcq %r13, %r10 movq $0x7fffffffffffffff, %rbx movq %r10, %rax sarq $63, %rax andq $19, %rax andq %rbx, %r10 addq %rax, %rcx adcq $0x00, %r8 adcq $0x00, %r9 adcq $0x00, %r10 movq $0x7fffffffffffffff, %rax movq %rcx, %rdx addq $19, %rdx movq %r8, %rdx adcq $0x00, %rdx movq %r9, %rdx adcq $0x00, %rdx movq %r10, %rdx adcq $0x00, %rdx sarq $63, %rdx andq $19, %rdx andq %rax, %r10 addq %rdx, %rcx adcq $0x00, %r8 adcq $0x00, %r9 adcq $0x00, %r10 # Store movq %rcx, (%rdi) movq %r8, 8(%rdi) movq %r9, 16(%rdi) movq %r10, 24(%rdi) xorq %rax, %rax addq $0xa8, %rsp popq %rbp popq %rbx popq %r15 popq %r14 popq %r13 popq %r12 repz retq #ifndef __APPLE__ .size curve25519_base_x64,.-curve25519_base_x64 #endif /* __APPLE__ */ #endif /* WOLFSSL_CURVE25519_NOT_USE_ED25519 */ #ifndef __APPLE__ .text .globl curve25519_x64 .type curve25519_x64,@function .align 16 curve25519_x64: #else 
.section __TEXT,__text .globl _curve25519_x64 .p2align 4 _curve25519_x64: #endif /* __APPLE__ */ pushq %r12 pushq %r13 pushq %r14 pushq %r15 pushq %rbx pushq %rbp movq %rdx, %r8 subq $0xb0, %rsp xorq %rbx, %rbx movq %rdi, 168(%rsp) # Set one movq $0x01, (%rdi) movq $0x00, 8(%rdi) movq $0x00, 16(%rdi) movq $0x00, 24(%rdi) # Set zero movq $0x00, (%rsp) movq $0x00, 8(%rsp) movq $0x00, 16(%rsp) movq $0x00, 24(%rsp) # Set one movq $0x01, 32(%rsp) movq $0x00, 40(%rsp) movq $0x00, 48(%rsp) movq $0x00, 56(%rsp) # Copy movq (%r8), %rcx movq 8(%r8), %r9 movq 16(%r8), %r10 movq 24(%r8), %r11 movq %rcx, 64(%rsp) movq %r9, 72(%rsp) movq %r10, 80(%rsp) movq %r11, 88(%rsp) movq $0xfe, %r9 L_curve25519_x64_bits: movq %r9, 160(%rsp) movq %r9, %rcx andq $63, %rcx shrq $6, %r9 movq (%rsi,%r9,8), %rbp shrq %cl, %rbp andq $0x01, %rbp xorq %rbp, %rbx negq %rbx # Conditional Swap movq (%rdi), %rcx movq 8(%rdi), %r9 movq 16(%rdi), %r10 movq 24(%rdi), %r11 movq (%rsp), %r12 movq 8(%rsp), %r13 movq 16(%rsp), %r14 movq 24(%rsp), %r15 xorq 64(%rsp), %rcx xorq 72(%rsp), %r9 xorq 80(%rsp), %r10 xorq 88(%rsp), %r11 xorq 32(%rsp), %r12 xorq 40(%rsp), %r13 xorq 48(%rsp), %r14 xorq 56(%rsp), %r15 andq %rbx, %rcx andq %rbx, %r9 andq %rbx, %r10 andq %rbx, %r11 andq %rbx, %r12 andq %rbx, %r13 andq %rbx, %r14 andq %rbx, %r15 xorq %rcx, (%rdi) xorq %r9, 8(%rdi) xorq %r10, 16(%rdi) xorq %r11, 24(%rdi) xorq %r12, (%rsp) xorq %r13, 8(%rsp) xorq %r14, 16(%rsp) xorq %r15, 24(%rsp) xorq %rcx, 64(%rsp) xorq %r9, 72(%rsp) xorq %r10, 80(%rsp) xorq %r11, 88(%rsp) xorq %r12, 32(%rsp) xorq %r13, 40(%rsp) xorq %r14, 48(%rsp) xorq %r15, 56(%rsp) movq %rbp, %rbx # Add-Sub # Add movq (%rdi), %rcx movq 8(%rdi), %r9 movq 16(%rdi), %r10 movq 24(%rdi), %r11 movq %rcx, %r12 addq (%rsp), %rcx movq %r9, %r13 adcq 8(%rsp), %r9 movq %r10, %r14 adcq 16(%rsp), %r10 movq %r11, %r15 adcq 24(%rsp), %r11 movq $0x00, %rbp adcq $0x00, %rbp shldq $0x01, %r11, %rbp imulq $19, %rbp btr $63, %r11 # Sub modulus (if overflow) addq %rbp, %rcx 
adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Sub subq (%rsp), %r12 sbbq 8(%rsp), %r13 sbbq 16(%rsp), %r14 sbbq 24(%rsp), %r15 sbbq %rbp, %rbp shldq $0x01, %r15, %rbp imulq $-19, %rbp btr $63, %r15 # Add modulus (if underflow) subq %rbp, %r12 sbbq $0x00, %r13 sbbq $0x00, %r14 sbbq $0x00, %r15 movq %rcx, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq %r12, 128(%rsp) movq %r13, 136(%rsp) movq %r14, 144(%rsp) movq %r15, 152(%rsp) # Add-Sub # Add movq 64(%rsp), %rcx movq 72(%rsp), %r9 movq 80(%rsp), %r10 movq 88(%rsp), %r11 movq %rcx, %r12 addq 32(%rsp), %rcx movq %r9, %r13 adcq 40(%rsp), %r9 movq %r10, %r14 adcq 48(%rsp), %r10 movq %r11, %r15 adcq 56(%rsp), %r11 movq $0x00, %rbp adcq $0x00, %rbp shldq $0x01, %r11, %rbp imulq $19, %rbp btr $63, %r11 # Sub modulus (if overflow) addq %rbp, %rcx adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Sub subq 32(%rsp), %r12 sbbq 40(%rsp), %r13 sbbq 48(%rsp), %r14 sbbq 56(%rsp), %r15 sbbq %rbp, %rbp shldq $0x01, %r15, %rbp imulq $-19, %rbp btr $63, %r15 # Add modulus (if underflow) subq %rbp, %r12 sbbq $0x00, %r13 sbbq $0x00, %r14 sbbq $0x00, %r15 movq %rcx, 32(%rsp) movq %r9, 40(%rsp) movq %r10, 48(%rsp) movq %r11, 56(%rsp) movq %r12, 96(%rsp) movq %r13, 104(%rsp) movq %r14, 112(%rsp) movq %r15, 120(%rsp) # Multiply # A[0] * B[0] movq 128(%rsp), %rax mulq 32(%rsp) movq %rax, %rcx movq %rdx, %r9 # A[0] * B[1] movq 136(%rsp), %rax mulq 32(%rsp) xorq %r10, %r10 addq %rax, %r9 adcq %rdx, %r10 # A[1] * B[0] movq 128(%rsp), %rax mulq 40(%rsp) xorq %r11, %r11 addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[0] * B[2] movq 144(%rsp), %rax mulq 32(%rsp) addq %rax, %r10 adcq %rdx, %r11 # A[1] * B[1] movq 136(%rsp), %rax mulq 40(%rsp) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[2] * B[0] movq 128(%rsp), %rax mulq 48(%rsp) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[0] * B[3] movq 152(%rsp), %rax mulq 32(%rsp) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, 
%r13 # A[1] * B[2] movq 144(%rsp), %rax mulq 40(%rsp) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[1] movq 136(%rsp), %rax mulq 48(%rsp) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[3] * B[0] movq 128(%rsp), %rax mulq 56(%rsp) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[3] movq 152(%rsp), %rax mulq 40(%rsp) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[2] movq 144(%rsp), %rax mulq 48(%rsp) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[1] movq 136(%rsp), %rax mulq 56(%rsp) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[3] movq 152(%rsp), %rax mulq 48(%rsp) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[2] movq 144(%rsp), %rax mulq 56(%rsp) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[3] movq 152(%rsp), %rax mulq 56(%rsp) addq %rax, %r14 adcq %rdx, %r15 movq $38, %rax mulq %r15 addq %rax, %r11 adcq $0x00, %rdx movq $0x7fffffffffffffff, %rbp shldq $0x01, %r11, %rdx imulq $19, %rdx, %rdx andq %rbp, %r11 movq %rdx, %rbp movq $38, %rax mulq %r12 xorq %r12, %r12 addq %rax, %rcx movq $38, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $38, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 adcq %rdx, %r14 addq %rbp, %rcx adcq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 # Store movq %rcx, 32(%rsp) movq %r9, 40(%rsp) movq %r10, 48(%rsp) movq %r11, 56(%rsp) # Multiply # A[0] * B[0] movq (%rdi), %rax mulq 96(%rsp) movq %rax, %rcx movq %rdx, %r9 # A[0] * B[1] movq 8(%rdi), %rax mulq 96(%rsp) xorq %r10, %r10 addq %rax, %r9 adcq %rdx, %r10 # A[1] * B[0] movq (%rdi), %rax mulq 104(%rsp) xorq %r11, %r11 addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[0] * B[2] movq 16(%rdi), %rax mulq 96(%rsp) addq %rax, %r10 adcq %rdx, %r11 # A[1] * B[1] movq 8(%rdi), %rax mulq 104(%rsp) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[2] * B[0] movq (%rdi), %rax mulq 112(%rsp) addq %rax, %r10 adcq %rdx, 
%r11 adcq $0x00, %r12 # A[0] * B[3] movq 24(%rdi), %rax mulq 96(%rsp) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[2] movq 16(%rdi), %rax mulq 104(%rsp) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[1] movq 8(%rdi), %rax mulq 112(%rsp) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[3] * B[0] movq (%rdi), %rax mulq 120(%rsp) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[3] movq 24(%rdi), %rax mulq 104(%rsp) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[2] movq 16(%rdi), %rax mulq 112(%rsp) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[1] movq 8(%rdi), %rax mulq 120(%rsp) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[3] movq 24(%rdi), %rax mulq 112(%rsp) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[2] movq 16(%rdi), %rax mulq 120(%rsp) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[3] movq 24(%rdi), %rax mulq 120(%rsp) addq %rax, %r14 adcq %rdx, %r15 movq $38, %rax mulq %r15 addq %rax, %r11 adcq $0x00, %rdx movq $0x7fffffffffffffff, %rbp shldq $0x01, %r11, %rdx imulq $19, %rdx, %rdx andq %rbp, %r11 movq %rdx, %rbp movq $38, %rax mulq %r12 xorq %r12, %r12 addq %rax, %rcx movq $38, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $38, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 adcq %rdx, %r14 addq %rbp, %rcx adcq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 # Store movq %rcx, (%rsp) movq %r9, 8(%rsp) movq %r10, 16(%rsp) movq %r11, 24(%rsp) # Square # A[0] * A[1] movq 128(%rsp), %rax mulq 136(%rsp) movq %rax, %r9 movq %rdx, %r10 # A[0] * A[2] movq 128(%rsp), %rax mulq 144(%rsp) xorq %r11, %r11 addq %rax, %r10 adcq %rdx, %r11 # A[0] * A[3] movq 128(%rsp), %rax mulq 152(%rsp) xorq %r12, %r12 addq %rax, %r11 adcq %rdx, %r12 # A[1] * A[2] movq 136(%rsp), %rax mulq 144(%rsp) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * A[3] movq 136(%rsp), %rax 
mulq 152(%rsp) addq %rax, %r12 adcq %rdx, %r13 # A[2] * A[3] movq 144(%rsp), %rax mulq 152(%rsp) xorq %r14, %r14 addq %rax, %r13 adcq %rdx, %r14 # Double xorq %r15, %r15 addq %r9, %r9 adcq %r10, %r10 adcq %r11, %r11 adcq %r12, %r12 adcq %r13, %r13 adcq %r14, %r14 adcq $0x00, %r15 # A[0] * A[0] movq 128(%rsp), %rax mulq %rax movq %rax, %rcx movq %rdx, %rbp # A[1] * A[1] movq 136(%rsp), %rax mulq %rax addq %rbp, %r9 adcq %rax, %r10 adcq $0x00, %rdx movq %rdx, %rbp # A[2] * A[2] movq 144(%rsp), %rax mulq %rax addq %rbp, %r11 adcq %rax, %r12 adcq $0x00, %rdx movq %rdx, %rbp # A[3] * A[3] movq 152(%rsp), %rax mulq %rax addq %rax, %r14 adcq %rdx, %r15 addq %rbp, %r13 adcq $0x00, %r14 adcq $0x00, %r15 movq $38, %rax mulq %r15 addq %rax, %r11 adcq $0x00, %rdx movq $0x7fffffffffffffff, %rbp shldq $0x01, %r11, %rdx imulq $19, %rdx, %rdx andq %rbp, %r11 movq %rdx, %rbp movq $38, %rax mulq %r12 xorq %r12, %r12 addq %rax, %rcx movq $38, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $38, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 adcq %rdx, %r14 addq %rbp, %rcx adcq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 # Store movq %rcx, 96(%rsp) movq %r9, 104(%rsp) movq %r10, 112(%rsp) movq %r11, 120(%rsp) # Square # A[0] * A[1] movq (%rdi), %rax mulq 8(%rdi) movq %rax, %r9 movq %rdx, %r10 # A[0] * A[2] movq (%rdi), %rax mulq 16(%rdi) xorq %r11, %r11 addq %rax, %r10 adcq %rdx, %r11 # A[0] * A[3] movq (%rdi), %rax mulq 24(%rdi) xorq %r12, %r12 addq %rax, %r11 adcq %rdx, %r12 # A[1] * A[2] movq 8(%rdi), %rax mulq 16(%rdi) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * A[3] movq 8(%rdi), %rax mulq 24(%rdi) addq %rax, %r12 adcq %rdx, %r13 # A[2] * A[3] movq 16(%rdi), %rax mulq 24(%rdi) xorq %r14, %r14 addq %rax, %r13 adcq %rdx, %r14 # Double xorq %r15, %r15 addq %r9, %r9 adcq %r10, %r10 adcq %r11, %r11 adcq %r12, %r12 adcq %r13, %r13 adcq %r14, %r14 adcq $0x00, %r15 # A[0] * A[0] movq (%rdi), %rax mulq %rax movq %rax, %rcx 
movq %rdx, %rbp # A[1] * A[1] movq 8(%rdi), %rax mulq %rax addq %rbp, %r9 adcq %rax, %r10 adcq $0x00, %rdx movq %rdx, %rbp # A[2] * A[2] movq 16(%rdi), %rax mulq %rax addq %rbp, %r11 adcq %rax, %r12 adcq $0x00, %rdx movq %rdx, %rbp # A[3] * A[3] movq 24(%rdi), %rax mulq %rax addq %rax, %r14 adcq %rdx, %r15 addq %rbp, %r13 adcq $0x00, %r14 adcq $0x00, %r15 movq $38, %rax mulq %r15 addq %rax, %r11 adcq $0x00, %rdx movq $0x7fffffffffffffff, %rbp shldq $0x01, %r11, %rdx imulq $19, %rdx, %rdx andq %rbp, %r11 movq %rdx, %rbp movq $38, %rax mulq %r12 xorq %r12, %r12 addq %rax, %rcx movq $38, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $38, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 adcq %rdx, %r14 addq %rbp, %rcx adcq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 # Store movq %rcx, 128(%rsp) movq %r9, 136(%rsp) movq %r10, 144(%rsp) movq %r11, 152(%rsp) # Add-Sub # Add movq (%rsp), %rcx movq 8(%rsp), %r9 movq 16(%rsp), %r10 movq 24(%rsp), %r11 movq %rcx, %r12 addq 32(%rsp), %rcx movq %r9, %r13 adcq 40(%rsp), %r9 movq %r10, %r14 adcq 48(%rsp), %r10 movq %r11, %r15 adcq 56(%rsp), %r11 movq $0x00, %rbp adcq $0x00, %rbp shldq $0x01, %r11, %rbp imulq $19, %rbp btr $63, %r11 # Sub modulus (if overflow) addq %rbp, %rcx adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Sub subq 32(%rsp), %r12 sbbq 40(%rsp), %r13 sbbq 48(%rsp), %r14 sbbq 56(%rsp), %r15 sbbq %rbp, %rbp shldq $0x01, %r15, %rbp imulq $-19, %rbp btr $63, %r15 # Add modulus (if underflow) subq %rbp, %r12 sbbq $0x00, %r13 sbbq $0x00, %r14 sbbq $0x00, %r15 movq %rcx, 64(%rsp) movq %r9, 72(%rsp) movq %r10, 80(%rsp) movq %r11, 88(%rsp) movq %r12, 32(%rsp) movq %r13, 40(%rsp) movq %r14, 48(%rsp) movq %r15, 56(%rsp) # Multiply # A[0] * B[0] movq 96(%rsp), %rax mulq 128(%rsp) movq %rax, %rcx movq %rdx, %r9 # A[0] * B[1] movq 104(%rsp), %rax mulq 128(%rsp) xorq %r10, %r10 addq %rax, %r9 adcq %rdx, %r10 # A[1] * B[0] movq 96(%rsp), %rax mulq 136(%rsp) xorq %r11, %r11 addq %rax, %r9 adcq 
%rdx, %r10 adcq $0x00, %r11 # A[0] * B[2] movq 112(%rsp), %rax mulq 128(%rsp) addq %rax, %r10 adcq %rdx, %r11 # A[1] * B[1] movq 104(%rsp), %rax mulq 136(%rsp) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[2] * B[0] movq 96(%rsp), %rax mulq 144(%rsp) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[0] * B[3] movq 120(%rsp), %rax mulq 128(%rsp) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[2] movq 112(%rsp), %rax mulq 136(%rsp) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[1] movq 104(%rsp), %rax mulq 144(%rsp) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[3] * B[0] movq 96(%rsp), %rax mulq 152(%rsp) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[3] movq 120(%rsp), %rax mulq 136(%rsp) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[2] movq 112(%rsp), %rax mulq 144(%rsp) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[1] movq 104(%rsp), %rax mulq 152(%rsp) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[3] movq 120(%rsp), %rax mulq 144(%rsp) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[2] movq 112(%rsp), %rax mulq 152(%rsp) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[3] movq 120(%rsp), %rax mulq 152(%rsp) addq %rax, %r14 adcq %rdx, %r15 movq $38, %rax mulq %r15 addq %rax, %r11 adcq $0x00, %rdx movq $0x7fffffffffffffff, %rbp shldq $0x01, %r11, %rdx imulq $19, %rdx, %rdx andq %rbp, %r11 movq %rdx, %rbp movq $38, %rax mulq %r12 xorq %r12, %r12 addq %rax, %rcx movq $38, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $38, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 adcq %rdx, %r14 addq %rbp, %rcx adcq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 # Store movq %rcx, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) # Sub movq 128(%rsp), %rcx movq 136(%rsp), %r9 movq 144(%rsp), %r10 movq 152(%rsp), %r11 subq 96(%rsp), %rcx sbbq 104(%rsp), 
%r9 sbbq 112(%rsp), %r10 sbbq 120(%rsp), %r11 sbbq %rbp, %rbp shldq $0x01, %r11, %rbp imulq $-19, %rbp btr $63, %r11 # Add modulus (if underflow) subq %rbp, %rcx sbbq $0x00, %r9 sbbq $0x00, %r10 sbbq $0x00, %r11 movq %rcx, 128(%rsp) movq %r9, 136(%rsp) movq %r10, 144(%rsp) movq %r11, 152(%rsp) # Square # A[0] * A[1] movq 32(%rsp), %rax mulq 40(%rsp) movq %rax, %r9 movq %rdx, %r10 # A[0] * A[2] movq 32(%rsp), %rax mulq 48(%rsp) xorq %r11, %r11 addq %rax, %r10 adcq %rdx, %r11 # A[0] * A[3] movq 32(%rsp), %rax mulq 56(%rsp) xorq %r12, %r12 addq %rax, %r11 adcq %rdx, %r12 # A[1] * A[2] movq 40(%rsp), %rax mulq 48(%rsp) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * A[3] movq 40(%rsp), %rax mulq 56(%rsp) addq %rax, %r12 adcq %rdx, %r13 # A[2] * A[3] movq 48(%rsp), %rax mulq 56(%rsp) xorq %r14, %r14 addq %rax, %r13 adcq %rdx, %r14 # Double xorq %r15, %r15 addq %r9, %r9 adcq %r10, %r10 adcq %r11, %r11 adcq %r12, %r12 adcq %r13, %r13 adcq %r14, %r14 adcq $0x00, %r15 # A[0] * A[0] movq 32(%rsp), %rax mulq %rax movq %rax, %rcx movq %rdx, %rbp # A[1] * A[1] movq 40(%rsp), %rax mulq %rax addq %rbp, %r9 adcq %rax, %r10 adcq $0x00, %rdx movq %rdx, %rbp # A[2] * A[2] movq 48(%rsp), %rax mulq %rax addq %rbp, %r11 adcq %rax, %r12 adcq $0x00, %rdx movq %rdx, %rbp # A[3] * A[3] movq 56(%rsp), %rax mulq %rax addq %rax, %r14 adcq %rdx, %r15 addq %rbp, %r13 adcq $0x00, %r14 adcq $0x00, %r15 movq $38, %rax mulq %r15 addq %rax, %r11 adcq $0x00, %rdx movq $0x7fffffffffffffff, %rbp shldq $0x01, %r11, %rdx imulq $19, %rdx, %rdx andq %rbp, %r11 movq %rdx, %rbp movq $38, %rax mulq %r12 xorq %r12, %r12 addq %rax, %rcx movq $38, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $38, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 adcq %rdx, %r14 addq %rbp, %rcx adcq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 # Store movq %rcx, 32(%rsp) movq %r9, 40(%rsp) movq %r10, 48(%rsp) movq %r11, 56(%rsp) # Square # A[0] * A[1] movq 64(%rsp), %rax 
mulq 72(%rsp) movq %rax, %r9 movq %rdx, %r10 # A[0] * A[2] movq 64(%rsp), %rax mulq 80(%rsp) xorq %r11, %r11 addq %rax, %r10 adcq %rdx, %r11 # A[0] * A[3] movq 64(%rsp), %rax mulq 88(%rsp) xorq %r12, %r12 addq %rax, %r11 adcq %rdx, %r12 # A[1] * A[2] movq 72(%rsp), %rax mulq 80(%rsp) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * A[3] movq 72(%rsp), %rax mulq 88(%rsp) addq %rax, %r12 adcq %rdx, %r13 # A[2] * A[3] movq 80(%rsp), %rax mulq 88(%rsp) xorq %r14, %r14 addq %rax, %r13 adcq %rdx, %r14 # Double xorq %r15, %r15 addq %r9, %r9 adcq %r10, %r10 adcq %r11, %r11 adcq %r12, %r12 adcq %r13, %r13 adcq %r14, %r14 adcq $0x00, %r15 # A[0] * A[0] movq 64(%rsp), %rax mulq %rax movq %rax, %rcx movq %rdx, %rbp # A[1] * A[1] movq 72(%rsp), %rax mulq %rax addq %rbp, %r9 adcq %rax, %r10 adcq $0x00, %rdx movq %rdx, %rbp # A[2] * A[2] movq 80(%rsp), %rax mulq %rax addq %rbp, %r11 adcq %rax, %r12 adcq $0x00, %rdx movq %rdx, %rbp # A[3] * A[3] movq 88(%rsp), %rax mulq %rax addq %rax, %r14 adcq %rdx, %r15 addq %rbp, %r13 adcq $0x00, %r14 adcq $0x00, %r15 movq $38, %rax mulq %r15 addq %rax, %r11 adcq $0x00, %rdx movq $0x7fffffffffffffff, %rbp shldq $0x01, %r11, %rdx imulq $19, %rdx, %rdx andq %rbp, %r11 movq %rdx, %rbp movq $38, %rax mulq %r12 xorq %r12, %r12 addq %rax, %rcx movq $38, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $38, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 adcq %rdx, %r14 addq %rbp, %rcx adcq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 # Store movq %rcx, 64(%rsp) movq %r9, 72(%rsp) movq %r10, 80(%rsp) movq %r11, 88(%rsp) # Multiply by 121666 movq $0x1db42, %rax mulq 128(%rsp) xorq %r10, %r10 movq %rax, %rcx movq %rdx, %r9 movq $0x1db42, %rax mulq 136(%rsp) xorq %r11, %r11 addq %rax, %r9 adcq %rdx, %r10 movq $0x1db42, %rax mulq 144(%rsp) xorq %r13, %r13 addq %rax, %r10 adcq %rdx, %r11 movq $0x1db42, %rax mulq 152(%rsp) movq $0x7fffffffffffffff, %r12 addq %rax, %r11 adcq %rdx, %r13 addq 96(%rsp), 
%rcx adcq 104(%rsp), %r9 adcq 112(%rsp), %r10 adcq 120(%rsp), %r11 adcq $0x00, %r13 shldq $0x01, %r11, %r13 andq %r12, %r11 movq $19, %rax mulq %r13 addq %rax, %rcx adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 movq %rcx, 96(%rsp) movq %r9, 104(%rsp) movq %r10, 112(%rsp) movq %r11, 120(%rsp) # Multiply # A[0] * B[0] movq 32(%rsp), %rax mulq (%r8) movq %rax, %rcx movq %rdx, %r9 # A[0] * B[1] movq 40(%rsp), %rax mulq (%r8) xorq %r10, %r10 addq %rax, %r9 adcq %rdx, %r10 # A[1] * B[0] movq 32(%rsp), %rax mulq 8(%r8) xorq %r11, %r11 addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[0] * B[2] movq 48(%rsp), %rax mulq (%r8) addq %rax, %r10 adcq %rdx, %r11 # A[1] * B[1] movq 40(%rsp), %rax mulq 8(%r8) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[2] * B[0] movq 32(%rsp), %rax mulq 16(%r8) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[0] * B[3] movq 56(%rsp), %rax mulq (%r8) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[2] movq 48(%rsp), %rax mulq 8(%r8) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[1] movq 40(%rsp), %rax mulq 16(%r8) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[3] * B[0] movq 32(%rsp), %rax mulq 24(%r8) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[3] movq 56(%rsp), %rax mulq 8(%r8) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[2] movq 48(%rsp), %rax mulq 16(%r8) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[1] movq 40(%rsp), %rax mulq 24(%r8) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[3] movq 56(%rsp), %rax mulq 16(%r8) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[2] movq 48(%rsp), %rax mulq 24(%r8) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[3] movq 56(%rsp), %rax mulq 24(%r8) addq %rax, %r14 adcq %rdx, %r15 movq $38, %rax mulq %r15 addq %rax, %r11 adcq $0x00, %rdx movq $0x7fffffffffffffff, %rbp shldq $0x01, %r11, %rdx imulq $19, %rdx, %rdx andq 
%rbp, %r11 movq %rdx, %rbp movq $38, %rax mulq %r12 xorq %r12, %r12 addq %rax, %rcx movq $38, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $38, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 adcq %rdx, %r14 addq %rbp, %rcx adcq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 # Store movq %rcx, 32(%rsp) movq %r9, 40(%rsp) movq %r10, 48(%rsp) movq %r11, 56(%rsp) # Multiply # A[0] * B[0] movq 96(%rsp), %rax mulq 128(%rsp) movq %rax, %rcx movq %rdx, %r9 # A[0] * B[1] movq 104(%rsp), %rax mulq 128(%rsp) xorq %r10, %r10 addq %rax, %r9 adcq %rdx, %r10 # A[1] * B[0] movq 96(%rsp), %rax mulq 136(%rsp) xorq %r11, %r11 addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[0] * B[2] movq 112(%rsp), %rax mulq 128(%rsp) addq %rax, %r10 adcq %rdx, %r11 # A[1] * B[1] movq 104(%rsp), %rax mulq 136(%rsp) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[2] * B[0] movq 96(%rsp), %rax mulq 144(%rsp) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[0] * B[3] movq 120(%rsp), %rax mulq 128(%rsp) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[2] movq 112(%rsp), %rax mulq 136(%rsp) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[1] movq 104(%rsp), %rax mulq 144(%rsp) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[3] * B[0] movq 96(%rsp), %rax mulq 152(%rsp) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[3] movq 120(%rsp), %rax mulq 136(%rsp) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[2] movq 112(%rsp), %rax mulq 144(%rsp) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[1] movq 104(%rsp), %rax mulq 152(%rsp) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[3] movq 120(%rsp), %rax mulq 144(%rsp) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[2] movq 112(%rsp), %rax mulq 152(%rsp) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[3] movq 120(%rsp), %rax mulq 152(%rsp) addq %rax, %r14 adcq 
%rdx, %r15 movq $38, %rax mulq %r15 addq %rax, %r11 adcq $0x00, %rdx movq $0x7fffffffffffffff, %rbp shldq $0x01, %r11, %rdx imulq $19, %rdx, %rdx andq %rbp, %r11 movq %rdx, %rbp movq $38, %rax mulq %r12 xorq %r12, %r12 addq %rax, %rcx movq $38, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $38, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 adcq %rdx, %r14 addq %rbp, %rcx adcq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 # Store movq %rcx, (%rsp) movq %r9, 8(%rsp) movq %r10, 16(%rsp) movq %r11, 24(%rsp) movq 160(%rsp), %r9 decq %r9 cmpq $3, %r9 jge L_curve25519_x64_bits movq $2, 160(%rsp) negq %rbx # Conditional Swap movq (%rdi), %rcx movq 8(%rdi), %r9 movq 16(%rdi), %r10 movq 24(%rdi), %r11 movq (%rsp), %r12 movq 8(%rsp), %r13 movq 16(%rsp), %r14 movq 24(%rsp), %r15 xorq 64(%rsp), %rcx xorq 72(%rsp), %r9 xorq 80(%rsp), %r10 xorq 88(%rsp), %r11 xorq 32(%rsp), %r12 xorq 40(%rsp), %r13 xorq 48(%rsp), %r14 xorq 56(%rsp), %r15 andq %rbx, %rcx andq %rbx, %r9 andq %rbx, %r10 andq %rbx, %r11 andq %rbx, %r12 andq %rbx, %r13 andq %rbx, %r14 andq %rbx, %r15 xorq %rcx, (%rdi) xorq %r9, 8(%rdi) xorq %r10, 16(%rdi) xorq %r11, 24(%rdi) xorq %r12, (%rsp) xorq %r13, 8(%rsp) xorq %r14, 16(%rsp) xorq %r15, 24(%rsp) xorq %rcx, 64(%rsp) xorq %r9, 72(%rsp) xorq %r10, 80(%rsp) xorq %r11, 88(%rsp) xorq %r12, 32(%rsp) xorq %r13, 40(%rsp) xorq %r14, 48(%rsp) xorq %r15, 56(%rsp) L_curve25519_x64_3: # Add-Sub # Add movq (%rdi), %rcx movq 8(%rdi), %r9 movq 16(%rdi), %r10 movq 24(%rdi), %r11 movq %rcx, %r12 addq (%rsp), %rcx movq %r9, %r13 adcq 8(%rsp), %r9 movq %r10, %r14 adcq 16(%rsp), %r10 movq %r11, %r15 adcq 24(%rsp), %r11 movq $0x00, %rbp adcq $0x00, %rbp shldq $0x01, %r11, %rbp imulq $19, %rbp btr $63, %r11 # Sub modulus (if overflow) addq %rbp, %rcx adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Sub subq (%rsp), %r12 sbbq 8(%rsp), %r13 sbbq 16(%rsp), %r14 sbbq 24(%rsp), %r15 sbbq %rbp, %rbp shldq $0x01, %r15, %rbp imulq $-19, %rbp btr $63, %r15 # 
Add modulus (if underflow) subq %rbp, %r12 sbbq $0x00, %r13 sbbq $0x00, %r14 sbbq $0x00, %r15 movq %rcx, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq %r12, 128(%rsp) movq %r13, 136(%rsp) movq %r14, 144(%rsp) movq %r15, 152(%rsp) # Square # A[0] * A[1] movq 128(%rsp), %rax mulq 136(%rsp) movq %rax, %r9 movq %rdx, %r10 # A[0] * A[2] movq 128(%rsp), %rax mulq 144(%rsp) xorq %r11, %r11 addq %rax, %r10 adcq %rdx, %r11 # A[0] * A[3] movq 128(%rsp), %rax mulq 152(%rsp) xorq %r12, %r12 addq %rax, %r11 adcq %rdx, %r12 # A[1] * A[2] movq 136(%rsp), %rax mulq 144(%rsp) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * A[3] movq 136(%rsp), %rax mulq 152(%rsp) addq %rax, %r12 adcq %rdx, %r13 # A[2] * A[3] movq 144(%rsp), %rax mulq 152(%rsp) xorq %r14, %r14 addq %rax, %r13 adcq %rdx, %r14 # Double xorq %r15, %r15 addq %r9, %r9 adcq %r10, %r10 adcq %r11, %r11 adcq %r12, %r12 adcq %r13, %r13 adcq %r14, %r14 adcq $0x00, %r15 # A[0] * A[0] movq 128(%rsp), %rax mulq %rax movq %rax, %rcx movq %rdx, %rbp # A[1] * A[1] movq 136(%rsp), %rax mulq %rax addq %rbp, %r9 adcq %rax, %r10 adcq $0x00, %rdx movq %rdx, %rbp # A[2] * A[2] movq 144(%rsp), %rax mulq %rax addq %rbp, %r11 adcq %rax, %r12 adcq $0x00, %rdx movq %rdx, %rbp # A[3] * A[3] movq 152(%rsp), %rax mulq %rax addq %rax, %r14 adcq %rdx, %r15 addq %rbp, %r13 adcq $0x00, %r14 adcq $0x00, %r15 movq $38, %rax mulq %r15 addq %rax, %r11 adcq $0x00, %rdx movq $0x7fffffffffffffff, %rbp shldq $0x01, %r11, %rdx imulq $19, %rdx, %rdx andq %rbp, %r11 movq %rdx, %rbp movq $38, %rax mulq %r12 xorq %r12, %r12 addq %rax, %rcx movq $38, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $38, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 adcq %rdx, %r14 addq %rbp, %rcx adcq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 # Store movq %rcx, 96(%rsp) movq %r9, 104(%rsp) movq %r10, 112(%rsp) movq %r11, 120(%rsp) # Square # A[0] * A[1] movq (%rdi), %rax mulq 8(%rdi) movq %rax, 
%r9 movq %rdx, %r10 # A[0] * A[2] movq (%rdi), %rax mulq 16(%rdi) xorq %r11, %r11 addq %rax, %r10 adcq %rdx, %r11 # A[0] * A[3] movq (%rdi), %rax mulq 24(%rdi) xorq %r12, %r12 addq %rax, %r11 adcq %rdx, %r12 # A[1] * A[2] movq 8(%rdi), %rax mulq 16(%rdi) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * A[3] movq 8(%rdi), %rax mulq 24(%rdi) addq %rax, %r12 adcq %rdx, %r13 # A[2] * A[3] movq 16(%rdi), %rax mulq 24(%rdi) xorq %r14, %r14 addq %rax, %r13 adcq %rdx, %r14 # Double xorq %r15, %r15 addq %r9, %r9 adcq %r10, %r10 adcq %r11, %r11 adcq %r12, %r12 adcq %r13, %r13 adcq %r14, %r14 adcq $0x00, %r15 # A[0] * A[0] movq (%rdi), %rax mulq %rax movq %rax, %rcx movq %rdx, %rbp # A[1] * A[1] movq 8(%rdi), %rax mulq %rax addq %rbp, %r9 adcq %rax, %r10 adcq $0x00, %rdx movq %rdx, %rbp # A[2] * A[2] movq 16(%rdi), %rax mulq %rax addq %rbp, %r11 adcq %rax, %r12 adcq $0x00, %rdx movq %rdx, %rbp # A[3] * A[3] movq 24(%rdi), %rax mulq %rax addq %rax, %r14 adcq %rdx, %r15 addq %rbp, %r13 adcq $0x00, %r14 adcq $0x00, %r15 movq $38, %rax mulq %r15 addq %rax, %r11 adcq $0x00, %rdx movq $0x7fffffffffffffff, %rbp shldq $0x01, %r11, %rdx imulq $19, %rdx, %rdx andq %rbp, %r11 movq %rdx, %rbp movq $38, %rax mulq %r12 xorq %r12, %r12 addq %rax, %rcx movq $38, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $38, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 adcq %rdx, %r14 addq %rbp, %rcx adcq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 # Store movq %rcx, 128(%rsp) movq %r9, 136(%rsp) movq %r10, 144(%rsp) movq %r11, 152(%rsp) # Multiply # A[0] * B[0] movq 96(%rsp), %rax mulq 128(%rsp) movq %rax, %rcx movq %rdx, %r9 # A[0] * B[1] movq 104(%rsp), %rax mulq 128(%rsp) xorq %r10, %r10 addq %rax, %r9 adcq %rdx, %r10 # A[1] * B[0] movq 96(%rsp), %rax mulq 136(%rsp) xorq %r11, %r11 addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[0] * B[2] movq 112(%rsp), %rax mulq 128(%rsp) addq %rax, %r10 adcq %rdx, %r11 # A[1] * B[1] movq 104(%rsp), 
%rax mulq 136(%rsp) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[2] * B[0] movq 96(%rsp), %rax mulq 144(%rsp) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[0] * B[3] movq 120(%rsp), %rax mulq 128(%rsp) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[2] movq 112(%rsp), %rax mulq 136(%rsp) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[1] movq 104(%rsp), %rax mulq 144(%rsp) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[3] * B[0] movq 96(%rsp), %rax mulq 152(%rsp) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[3] movq 120(%rsp), %rax mulq 136(%rsp) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[2] movq 112(%rsp), %rax mulq 144(%rsp) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[1] movq 104(%rsp), %rax mulq 152(%rsp) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[3] movq 120(%rsp), %rax mulq 144(%rsp) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[2] movq 112(%rsp), %rax mulq 152(%rsp) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[3] movq 120(%rsp), %rax mulq 152(%rsp) addq %rax, %r14 adcq %rdx, %r15 movq $38, %rax mulq %r15 addq %rax, %r11 adcq $0x00, %rdx movq $0x7fffffffffffffff, %rbp shldq $0x01, %r11, %rdx imulq $19, %rdx, %rdx andq %rbp, %r11 movq %rdx, %rbp movq $38, %rax mulq %r12 xorq %r12, %r12 addq %rax, %rcx movq $38, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $38, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 adcq %rdx, %r14 addq %rbp, %rcx adcq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 # Store movq %rcx, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) # Sub movq 128(%rsp), %rcx movq 136(%rsp), %r9 movq 144(%rsp), %r10 movq 152(%rsp), %r11 subq 96(%rsp), %rcx sbbq 104(%rsp), %r9 sbbq 112(%rsp), %r10 sbbq 120(%rsp), %r11 sbbq %rbp, %rbp shldq $0x01, %r11, %rbp imulq $-19, %rbp btr $63, %r11 # Add modulus (if 
underflow) subq %rbp, %rcx sbbq $0x00, %r9 sbbq $0x00, %r10 sbbq $0x00, %r11 movq %rcx, 128(%rsp) movq %r9, 136(%rsp) movq %r10, 144(%rsp) movq %r11, 152(%rsp) # Multiply by 121666 movq $0x1db42, %rax mulq 128(%rsp) xorq %r10, %r10 movq %rax, %rcx movq %rdx, %r9 movq $0x1db42, %rax mulq 136(%rsp) xorq %r11, %r11 addq %rax, %r9 adcq %rdx, %r10 movq $0x1db42, %rax mulq 144(%rsp) xorq %r13, %r13 addq %rax, %r10 adcq %rdx, %r11 movq $0x1db42, %rax mulq 152(%rsp) movq $0x7fffffffffffffff, %r12 addq %rax, %r11 adcq %rdx, %r13 addq 96(%rsp), %rcx adcq 104(%rsp), %r9 adcq 112(%rsp), %r10 adcq 120(%rsp), %r11 adcq $0x00, %r13 shldq $0x01, %r11, %r13 andq %r12, %r11 movq $19, %rax mulq %r13 addq %rax, %rcx adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 movq %rcx, 96(%rsp) movq %r9, 104(%rsp) movq %r10, 112(%rsp) movq %r11, 120(%rsp) # Multiply # A[0] * B[0] movq 96(%rsp), %rax mulq 128(%rsp) movq %rax, %rcx movq %rdx, %r9 # A[0] * B[1] movq 104(%rsp), %rax mulq 128(%rsp) xorq %r10, %r10 addq %rax, %r9 adcq %rdx, %r10 # A[1] * B[0] movq 96(%rsp), %rax mulq 136(%rsp) xorq %r11, %r11 addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[0] * B[2] movq 112(%rsp), %rax mulq 128(%rsp) addq %rax, %r10 adcq %rdx, %r11 # A[1] * B[1] movq 104(%rsp), %rax mulq 136(%rsp) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[2] * B[0] movq 96(%rsp), %rax mulq 144(%rsp) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[0] * B[3] movq 120(%rsp), %rax mulq 128(%rsp) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[2] movq 112(%rsp), %rax mulq 136(%rsp) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[1] movq 104(%rsp), %rax mulq 144(%rsp) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[3] * B[0] movq 96(%rsp), %rax mulq 152(%rsp) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[3] movq 120(%rsp), %rax mulq 136(%rsp) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[2] movq 112(%rsp), %rax mulq 
144(%rsp) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[1] movq 104(%rsp), %rax mulq 152(%rsp) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[3] movq 120(%rsp), %rax mulq 144(%rsp) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[2] movq 112(%rsp), %rax mulq 152(%rsp) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[3] movq 120(%rsp), %rax mulq 152(%rsp) addq %rax, %r14 adcq %rdx, %r15 movq $38, %rax mulq %r15 addq %rax, %r11 adcq $0x00, %rdx movq $0x7fffffffffffffff, %rbp shldq $0x01, %r11, %rdx imulq $19, %rdx, %rdx andq %rbp, %r11 movq %rdx, %rbp movq $38, %rax mulq %r12 xorq %r12, %r12 addq %rax, %rcx movq $38, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $38, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 adcq %rdx, %r14 addq %rbp, %rcx adcq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 # Store movq %rcx, (%rsp) movq %r9, 8(%rsp) movq %r10, 16(%rsp) movq %r11, 24(%rsp) decq 160(%rsp) jge L_curve25519_x64_3 # Invert leaq 32(%rsp), %rdi movq %rsp, %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 32(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 64(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi movq %rsp, %rsi leaq 64(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi leaq 32(%rsp), %rsi leaq 64(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 32(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 64(%rsp), %rsi leaq 96(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 64(%rsp), %rsi 
#ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi movq $4, %rdx #ifndef __APPLE__ callq fe_sq_n_x64@plt #else callq _fe_sq_n_x64 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 96(%rsp), %rsi leaq 64(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 64(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi movq $9, %rdx #ifndef __APPLE__ callq fe_sq_n_x64@plt #else callq _fe_sq_n_x64 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi leaq 64(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ leaq 128(%rsp), %rdi leaq 96(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 128(%rsp), %rdi leaq 128(%rsp), %rsi movq $19, %rdx #ifndef __APPLE__ callq fe_sq_n_x64@plt #else callq _fe_sq_n_x64 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 128(%rsp), %rsi leaq 96(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi movq $9, %rdx #ifndef __APPLE__ callq fe_sq_n_x64@plt #else callq _fe_sq_n_x64 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 96(%rsp), %rsi leaq 64(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 64(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi movq $49, %rdx #ifndef __APPLE__ callq fe_sq_n_x64@plt #else callq _fe_sq_n_x64 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi leaq 64(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq 
_fe_mul_x64 #endif /* __APPLE__ */ leaq 128(%rsp), %rdi leaq 96(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 128(%rsp), %rdi leaq 128(%rsp), %rsi movq $0x63, %rdx #ifndef __APPLE__ callq fe_sq_n_x64@plt #else callq _fe_sq_n_x64 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 128(%rsp), %rsi leaq 96(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi movq $49, %rdx #ifndef __APPLE__ callq fe_sq_n_x64@plt #else callq _fe_sq_n_x64 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 96(%rsp), %rsi leaq 64(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 64(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 64(%rsp), %rsi movq $4, %rdx #ifndef __APPLE__ callq fe_sq_n_x64@plt #else callq _fe_sq_n_x64 #endif /* __APPLE__ */ movq %rsp, %rdi leaq 64(%rsp), %rsi leaq 32(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ movq 168(%rsp), %rdi # Multiply # A[0] * B[0] movq (%rsp), %rax mulq (%rdi) movq %rax, %rcx movq %rdx, %r9 # A[0] * B[1] movq 8(%rsp), %rax mulq (%rdi) xorq %r10, %r10 addq %rax, %r9 adcq %rdx, %r10 # A[1] * B[0] movq (%rsp), %rax mulq 8(%rdi) xorq %r11, %r11 addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[0] * B[2] movq 16(%rsp), %rax mulq (%rdi) addq %rax, %r10 adcq %rdx, %r11 # A[1] * B[1] movq 8(%rsp), %rax mulq 8(%rdi) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[2] * B[0] movq (%rsp), %rax mulq 16(%rdi) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[0] * B[3] movq 24(%rsp), %rax mulq (%rdi) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[2] movq 
16(%rsp), %rax mulq 8(%rdi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[1] movq 8(%rsp), %rax mulq 16(%rdi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[3] * B[0] movq (%rsp), %rax mulq 24(%rdi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[3] movq 24(%rsp), %rax mulq 8(%rdi) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[2] movq 16(%rsp), %rax mulq 16(%rdi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[1] movq 8(%rsp), %rax mulq 24(%rdi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[3] movq 24(%rsp), %rax mulq 16(%rdi) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[2] movq 16(%rsp), %rax mulq 24(%rdi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[3] movq 24(%rsp), %rax mulq 24(%rdi) addq %rax, %r14 adcq %rdx, %r15 movq $38, %rax mulq %r15 addq %rax, %r11 adcq $0x00, %rdx movq $0x7fffffffffffffff, %rbp shldq $0x01, %r11, %rdx imulq $19, %rdx, %rdx andq %rbp, %r11 movq %rdx, %rbp movq $38, %rax mulq %r12 xorq %r12, %r12 addq %rax, %rcx movq $38, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $38, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 adcq %rdx, %r14 addq %rbp, %rcx adcq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 movq $0x7fffffffffffffff, %rbp movq %r11, %rax sarq $63, %rax andq $19, %rax andq %rbp, %r11 addq %rax, %rcx adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 movq $0x7fffffffffffffff, %rax movq %rcx, %rdx addq $19, %rdx movq %r9, %rdx adcq $0x00, %rdx movq %r10, %rdx adcq $0x00, %rdx movq %r11, %rdx adcq $0x00, %rdx sarq $63, %rdx andq $19, %rdx andq %rax, %r11 addq %rdx, %rcx adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %rcx, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) xorq %rax, %rax addq $0xb0, %rsp popq %rbp popq %rbx popq %r15 popq %r14 popq %r13 popq %r12 repz retq #ifndef __APPLE__ .size curve25519_x64,.-curve25519_x64 #endif /* 
__APPLE__ */
/*
 * fe_pow22523_x64
 *
 * ABI:  System V AMD64.
 * In:   %rdi = r (result field element), %rsi = a (input field element).
 * Out:  result written through r.
 *
 * The routine is a fixed sequence of calls to fe_sq_x64, fe_sq_n_x64 and
 * fe_mul_x64 operating on three 32-byte temporaries kept on the stack.
 * At every call site %rdi is loaded with the slot that receives the result,
 * %rsi (and %rdx for fe_mul_x64) with the operand slots, and %rdx with the
 * repeat count for fe_sq_n_x64.
 *
 * NOTE(review): the exponent annotations on the calls below assume that
 * fe_sq_x64 squares once, fe_sq_n_x64 squares %rdx times and fe_mul_x64
 * multiplies - confirm against their definitions. Under that assumption
 * the value stored to r is a^(2^252 - 3).
 *
 * Stack frame (0x78 bytes):
 *     0(%rsp)  t0
 *    32(%rsp)  t1
 *    64(%rsp)  t2
 *    96(%rsp)  saved r pointer
 *   104(%rsp)  saved a pointer
 *   112(%rsp)  padding
 *
 * The frame is 0x78 rather than 0x70 bytes: %rsp is 8 mod 16 on entry, so
 * an odd multiple of 8 is needed to have %rsp 16-byte aligned at each
 * callq, as the SysV ABI requires. All offsets are unchanged by the padding.
 */
#ifndef __APPLE__
.text
.globl fe_pow22523_x64
.type fe_pow22523_x64,@function
.align 16
fe_pow22523_x64:
#else
.section __TEXT,__text
.globl _fe_pow22523_x64
.p2align 4
_fe_pow22523_x64:
#endif /* __APPLE__ */
        subq $0x78, %rsp
        # pow22523
        /* The argument registers do not survive the calls; keep copies. */
        movq %rdi, 96(%rsp)
        movq %rsi, 104(%rsp)
        /* t0 = a^2 */
        movq %rsp, %rdi
        movq 104(%rsp), %rsi
#ifndef __APPLE__
        callq fe_sq_x64@plt
#else
        callq _fe_sq_x64
#endif /* __APPLE__ */
        /* t1 = t0^2 = a^4 */
        leaq 32(%rsp), %rdi
        movq %rsp, %rsi
#ifndef __APPLE__
        callq fe_sq_x64@plt
#else
        callq _fe_sq_x64
#endif /* __APPLE__ */
        /* t1 = t1^2 = a^8 */
        leaq 32(%rsp), %rdi
        leaq 32(%rsp), %rsi
#ifndef __APPLE__
        callq fe_sq_x64@plt
#else
        callq _fe_sq_x64
#endif /* __APPLE__ */
        /* t1 = a * t1 = a^9 */
        leaq 32(%rsp), %rdi
        movq 104(%rsp), %rsi
        leaq 32(%rsp), %rdx
#ifndef __APPLE__
        callq fe_mul_x64@plt
#else
        callq _fe_mul_x64
#endif /* __APPLE__ */
        /* t0 = t0 * t1 = a^11 */
        movq %rsp, %rdi
        movq %rsp, %rsi
        leaq 32(%rsp), %rdx
#ifndef __APPLE__
        callq fe_mul_x64@plt
#else
        callq _fe_mul_x64
#endif /* __APPLE__ */
        /* t0 = t0^2 = a^22 */
        movq %rsp, %rdi
        movq %rsp, %rsi
#ifndef __APPLE__
        callq fe_sq_x64@plt
#else
        callq _fe_sq_x64
#endif /* __APPLE__ */
        /* t0 = t1 * t0 = a^31 = a^(2^5 - 1) */
        movq %rsp, %rdi
        leaq 32(%rsp), %rsi
        movq %rsp, %rdx
#ifndef __APPLE__
        callq fe_mul_x64@plt
#else
        callq _fe_mul_x64
#endif /* __APPLE__ */
        /* t1 = t0^(2^5): one squaring here plus four in fe_sq_n_x64 */
        leaq 32(%rsp), %rdi
        movq %rsp, %rsi
#ifndef __APPLE__
        callq fe_sq_x64@plt
#else
        callq _fe_sq_x64
#endif /* __APPLE__ */
        leaq 32(%rsp), %rdi
        leaq 32(%rsp), %rsi
        movq $4, %rdx
#ifndef __APPLE__
        callq fe_sq_n_x64@plt
#else
        callq _fe_sq_n_x64
#endif /* __APPLE__ */
        /* t0 = t1 * t0 = a^(2^10 - 1) */
        movq %rsp, %rdi
        leaq 32(%rsp), %rsi
        movq %rsp, %rdx
#ifndef __APPLE__
        callq fe_mul_x64@plt
#else
        callq _fe_mul_x64
#endif /* __APPLE__ */
        /* t1 = t0^(2^10): 1 + 9 squarings */
        leaq 32(%rsp), %rdi
        movq %rsp, %rsi
#ifndef __APPLE__
        callq fe_sq_x64@plt
#else
        callq _fe_sq_x64
#endif /* __APPLE__ */
        leaq 32(%rsp), %rdi
        leaq 32(%rsp), %rsi
        movq $9, %rdx
#ifndef __APPLE__
        callq fe_sq_n_x64@plt
#else
        callq _fe_sq_n_x64
#endif /* __APPLE__ */
        /* t1 = t1 * t0 = a^(2^20 - 1) */
        leaq 32(%rsp), %rdi
        leaq 32(%rsp), %rsi
        movq %rsp, %rdx
#ifndef __APPLE__
        callq fe_mul_x64@plt
#else
        callq _fe_mul_x64
#endif /* __APPLE__ */
        /* t2 = t1^(2^20): 1 + 19 squarings */
        leaq 64(%rsp), %rdi
        leaq 32(%rsp), %rsi
#ifndef __APPLE__
        callq fe_sq_x64@plt
#else
        callq _fe_sq_x64
#endif /* __APPLE__ */
        leaq 64(%rsp), %rdi
        leaq 64(%rsp), %rsi
        movq $19, %rdx
#ifndef __APPLE__
        callq fe_sq_n_x64@plt
#else
        callq _fe_sq_n_x64
#endif /* __APPLE__ */
        /* t1 = t2 * t1 = a^(2^40 - 1) */
        leaq 32(%rsp), %rdi
        leaq 64(%rsp), %rsi
        leaq 32(%rsp), %rdx
#ifndef __APPLE__
        callq fe_mul_x64@plt
#else
        callq _fe_mul_x64
#endif /* __APPLE__ */
        /* t1 = t1^(2^10): 1 + 9 squarings */
        leaq 32(%rsp), %rdi
        leaq 32(%rsp), %rsi
#ifndef __APPLE__
        callq fe_sq_x64@plt
#else
        callq _fe_sq_x64
#endif /* __APPLE__ */
        leaq 32(%rsp), %rdi
        leaq 32(%rsp), %rsi
        movq $9, %rdx
#ifndef __APPLE__
        callq fe_sq_n_x64@plt
#else
        callq _fe_sq_n_x64
#endif /* __APPLE__ */
        /* t0 = t1 * t0 = a^(2^50 - 1) */
        movq %rsp, %rdi
        leaq 32(%rsp), %rsi
        movq %rsp, %rdx
#ifndef __APPLE__
        callq fe_mul_x64@plt
#else
        callq _fe_mul_x64
#endif /* __APPLE__ */
        /* t1 = t0^(2^50): 1 + 49 squarings */
        leaq 32(%rsp), %rdi
        movq %rsp, %rsi
#ifndef __APPLE__
        callq fe_sq_x64@plt
#else
        callq _fe_sq_x64
#endif /* __APPLE__ */
        leaq 32(%rsp), %rdi
        leaq 32(%rsp), %rsi
        movq $49, %rdx
#ifndef __APPLE__
        callq fe_sq_n_x64@plt
#else
        callq _fe_sq_n_x64
#endif /* __APPLE__ */
        /* t1 = t1 * t0 = a^(2^100 - 1) */
        leaq 32(%rsp), %rdi
        leaq 32(%rsp), %rsi
        movq %rsp, %rdx
#ifndef __APPLE__
        callq fe_mul_x64@plt
#else
        callq _fe_mul_x64
#endif /* __APPLE__ */
        /* t2 = t1^(2^100): 1 + 99 (0x63) squarings */
        leaq 64(%rsp), %rdi
        leaq 32(%rsp), %rsi
#ifndef __APPLE__
        callq fe_sq_x64@plt
#else
        callq _fe_sq_x64
#endif /* __APPLE__ */
        leaq 64(%rsp), %rdi
        leaq 64(%rsp), %rsi
        movq $0x63, %rdx
#ifndef __APPLE__
        callq fe_sq_n_x64@plt
#else
        callq _fe_sq_n_x64
#endif /* __APPLE__ */
        /* t1 = t2 * t1 = a^(2^200 - 1) */
        leaq 32(%rsp), %rdi
        leaq 64(%rsp), %rsi
        leaq 32(%rsp), %rdx
#ifndef __APPLE__
        callq fe_mul_x64@plt
#else
        callq _fe_mul_x64
#endif /* __APPLE__ */
        /* t1 = t1^(2^50): 1 + 49 squarings */
        leaq 32(%rsp), %rdi
        leaq 32(%rsp), %rsi
#ifndef __APPLE__
        callq fe_sq_x64@plt
#else
        callq _fe_sq_x64
#endif /* __APPLE__ */
        leaq 32(%rsp), %rdi
        leaq 32(%rsp), %rsi
        movq $49, %rdx
#ifndef __APPLE__
        callq fe_sq_n_x64@plt
#else
        callq _fe_sq_n_x64
#endif /* __APPLE__ */
        /* t0 = t1 * t0 = a^(2^250 - 1) */
        movq %rsp, %rdi
        leaq 32(%rsp), %rsi
        movq %rsp, %rdx
#ifndef __APPLE__
        callq fe_mul_x64@plt
#else
        callq _fe_mul_x64
#endif /* __APPLE__ */
        /* t0 = t0^4 = a^(2^252 - 4): two single squarings */
        movq %rsp, %rdi
        movq %rsp, %rsi
#ifndef __APPLE__
        callq fe_sq_x64@plt
#else
        callq _fe_sq_x64
#endif /* __APPLE__ */
        movq %rsp, %rdi
        movq %rsp, %rsi
#ifndef __APPLE__
        callq fe_sq_x64@plt
#else
        callq _fe_sq_x64
#endif /* __APPLE__ */
        /* r = t0 * a = a^(2^252 - 3) */
        movq 96(%rsp), %rdi
        movq %rsp, %rsi
        movq 104(%rsp), %rdx
#ifndef __APPLE__
        callq fe_mul_x64@plt
#else
        callq _fe_mul_x64
#endif /* __APPLE__ */
        /* Reload the incoming %rsi/%rdi values before returning. */
        movq 104(%rsp), %rsi
        movq 96(%rsp), %rdi
        addq $0x78, %rsp
        repz retq
#ifndef __APPLE__
.size fe_pow22523_x64,.-fe_pow22523_x64
#endif /* __APPLE__ */
/*
 * ge_p1p1_to_p2_x64
 *
 * ABI:  System V AMD64.
 * In:   %rdi = r (destination), %rsi = p (source).
 *
 * Performs three multiplications of 4 x 64-bit limb values, each followed
 * by a fold of the 512-bit product back to four limbs:
 *     r +  0 = (p +  0) * (p + 96)
 *     r + 64 = (p + 64) * (p + 96)
 *     r + 32 = (p + 32) * (p + 64)
 * NOTE(review): byte offsets 0/32/64/96 presumably are the X/Y/Z/T members
 * of the point structures - confirm against the C declarations.
 *
 * %r12-%r15 and %rbx are pushed on entry and popped before return.
 * %rax, %rcx, %rdx, %rsi, %rdi and %r8-%r11 are overwritten.
 * The copies of %rdi/%rsi stored in the 16-byte stack area are not read
 * back anywhere in this function.
 */
#ifndef __APPLE__
.text
.globl ge_p1p1_to_p2_x64
.type ge_p1p1_to_p2_x64,@function
.align 16
ge_p1p1_to_p2_x64:
#else
.section __TEXT,__text
.globl _ge_p1p1_to_p2_x64
.p2align 4
_ge_p1p1_to_p2_x64:
#endif /* __APPLE__ */
        pushq %r12
        pushq %r13
        pushq %r14
        pushq %r15
        pushq %rbx
        subq $16, %rsp
        movq %rdi, (%rsp)
        movq %rsi, 8(%rsp)
        /* rcx = p + 0x60: first operand of the first two multiplications */
        movq %rsi, %rcx
        addq $0x60, %rcx
        /*
         * Schoolbook 4x4 limb product of (%rcx) and (%rsi). Column sums are
         * accumulated in r9 (limb 0) upwards; each xorq zeroes the next
         * accumulator before the carry of the current column lands in it.
         */
        # Multiply
        # A[0] * B[0]
        movq (%rcx), %rax
        mulq (%rsi)
        movq %rax, %r9
        movq %rdx, %r10
        # A[0] * B[1]
        movq 8(%rcx), %rax
        mulq (%rsi)
        xorq %r11, %r11
        addq %rax, %r10
        adcq %rdx, %r11
        # A[1] * B[0]
        movq (%rcx), %rax
        mulq 8(%rsi)
        xorq %r12, %r12
        addq %rax, %r10
        adcq %rdx, %r11
        adcq $0x00, %r12
        # A[0] * B[2]
        movq 16(%rcx), %rax
        mulq (%rsi)
        addq %rax, %r11
        adcq %rdx, %r12
        # A[1] * B[1]
        movq 8(%rcx), %rax
        mulq 8(%rsi)
        xorq %r13, %r13
        addq %rax, %r11
        adcq %rdx, %r12
        adcq $0x00, %r13
        # A[2] * B[0]
        movq (%rcx), %rax
        mulq 16(%rsi)
        addq %rax, %r11
        adcq %rdx, %r12
        adcq $0x00, %r13
        # A[0] * B[3]
        movq 24(%rcx), %rax
        mulq (%rsi)
        xorq %r14, %r14
        addq %rax, %r12
        adcq %rdx, %r13
        adcq $0x00, %r14
        # A[1] * B[2]
        movq 16(%rcx), %rax
        mulq 8(%rsi)
        addq %rax, %r12
        adcq %rdx, %r13
        adcq $0x00, %r14
        # A[2] * B[1]
        movq 8(%rcx), %rax
        mulq 16(%rsi)
        addq %rax, %r12
        adcq %rdx, %r13
        adcq $0x00, %r14
        # A[3] * B[0]
        movq (%rcx), %rax
        mulq 24(%rsi)
        addq %rax, %r12
        adcq %rdx, %r13
        adcq $0x00, %r14
        # A[1] * B[3]
        movq 24(%rcx), %rax
        mulq 8(%rsi)
        xorq %r15, %r15
        addq %rax, %r13
        adcq %rdx, %r14
        adcq $0x00, %r15
        # A[2] * B[2]
        movq 16(%rcx), %rax
        mulq 16(%rsi)
        addq %rax, %r13
/*
 * ge_p1p1_to_p2_x64, continued: remaining partial products of the first
 * 4x4 limb multiplication, (%rcx) times (%rsi). The low four product limbs
 * are in r9..r12, the high four accumulate in r13, r14, r15 and rbx.
 */
        adcq %rdx, %r14
        adcq $0x00, %r15
        # A[3] * B[1]
        movq 8(%rcx), %rax
        mulq 24(%rsi)
        addq %rax, %r13
        adcq %rdx, %r14
        adcq $0x00, %r15
        # A[2] * B[3]
        movq 24(%rcx), %rax
        mulq 16(%rsi)
        xorq %rbx, %rbx
        addq %rax, %r14
        adcq %rdx, %r15
        adcq $0x00, %rbx
        # A[3] * B[2]
        movq 16(%rcx), %rax
        mulq 24(%rsi)
        addq %rax, %r14
        adcq %rdx, %r15
        adcq $0x00, %rbx
        # A[3] * B[3]
        movq 24(%rcx), %rax
        mulq 24(%rsi)
        addq %rax, %r15
        adcq %rdx, %rbx
        /*
         * Fold the high limbs (r13, r14, r15, rbx) into the low limbs
         * (r9..r12): each high limb is multiplied by 38 and added one
         * position lower, consistent with 2^256 = 38 mod (2^255 - 19).
         * The top limb rbx goes first so that everything at or above bit
         * 255 of r12 can be shifted out (shldq), multiplied by 19 and kept
         * in r8, while r12 is masked to 63 bits.
         * The movq $38 instructions sit between addq and adcq on purpose:
         * movq leaves the carry flag untouched.
         */
        movq $38, %rax
        mulq %rbx
        addq %rax, %r12
        adcq $0x00, %rdx
        movq $0x7fffffffffffffff, %r8
        shldq $0x01, %r12, %rdx
        imulq $19, %rdx, %rdx
        andq %r8, %r12
        movq %rdx, %r8
        movq $38, %rax
        mulq %r13
        xorq %r13, %r13
        addq %rax, %r9
        movq $38, %rax
        adcq %rdx, %r13
        mulq %r14
        xorq %r14, %r14
        addq %rax, %r10
        movq $38, %rax
        adcq %rdx, %r14
        mulq %r15
        xorq %r15, %r15
        addq %rax, %r11
        adcq %rdx, %r15
        /* Add 19 * overflow (r8) and the saved upper halves, rippling carry. */
        addq %r8, %r9
        adcq %r13, %r10
        adcq %r14, %r11
        adcq %r15, %r12
        # Store
        movq %r9, (%rdi)
        movq %r10, 8(%rdi)
        movq %r11, 16(%rdi)
        movq %r12, 24(%rdi)
        /*
         * Second multiplication: advance both the source operand in rsi and
         * the destination in rdi by 0x40 bytes; rcx is left unchanged.
         */
        addq $0x40, %rsi
        addq $0x40, %rdi
        # Multiply
        # A[0] * B[0]
        movq (%rcx), %rax
        mulq (%rsi)
        movq %rax, %r9
        movq %rdx, %r10
        # A[0] * B[1]
        movq 8(%rcx), %rax
        mulq (%rsi)
        xorq %r11, %r11
        addq %rax, %r10
        adcq %rdx, %r11
        # A[1] * B[0]
        movq (%rcx), %rax
        mulq 8(%rsi)
        xorq %r12, %r12
        addq %rax, %r10
        adcq %rdx, %r11
        adcq $0x00, %r12
        # A[0] * B[2]
        movq 16(%rcx), %rax
        mulq (%rsi)
        addq %rax, %r11
        adcq %rdx, %r12
        # A[1] * B[1]
        movq 8(%rcx), %rax
        mulq 8(%rsi)
        xorq %r13, %r13
        addq %rax, %r11
        adcq %rdx, %r12
        adcq $0x00, %r13
        # A[2] * B[0]
        movq (%rcx), %rax
        mulq 16(%rsi)
        addq %rax, %r11
        adcq %rdx, %r12
        adcq $0x00, %r13
        # A[0] * B[3]
        movq 24(%rcx), %rax
        mulq (%rsi)
        xorq %r14, %r14
        addq %rax, %r12
        adcq %rdx, %r13
        adcq $0x00, %r14
        # A[1] * B[2]
        movq 16(%rcx), %rax
        mulq 8(%rsi)
        addq %rax, %r12
        adcq %rdx, %r13
        adcq $0x00, %r14
        # A[2] * B[1]
        movq 8(%rcx), %rax
        mulq 16(%rsi)
        addq %rax, %r12
        adcq %rdx, %r13
        adcq $0x00, %r14
        # A[3] * B[0]
        movq (%rcx), %rax
        mulq 24(%rsi)
        addq %rax, %r12
        adcq %rdx, %r13
        adcq $0x00, %r14
        # A[1] * B[3]
        movq 24(%rcx), %rax
        mulq 8(%rsi)
        xorq %r15, %r15
        addq %rax, %r13
        adcq %rdx, %r14
        adcq $0x00, %r15
        # A[2] * B[2]
        movq 16(%rcx), %rax
        mulq 16(%rsi)
        addq %rax, %r13
        adcq %rdx, %r14
        adcq $0x00, %r15
        # A[3] * B[1]
        movq 8(%rcx), %rax
        mulq 24(%rsi)
        addq %rax, %r13
        adcq %rdx, %r14
        adcq $0x00, %r15
        # A[2] * B[3]
        movq 24(%rcx), %rax
        mulq 16(%rsi)
        xorq %rbx, %rbx
        addq %rax, %r14
        adcq %rdx, %r15
        adcq $0x00, %rbx
        # A[3] * B[2]
        movq 16(%rcx), %rax
        mulq 24(%rsi)
        addq %rax, %r14
        adcq %rdx, %r15
        adcq $0x00, %rbx
        # A[3] * B[3]
        movq 24(%rcx), %rax
        mulq 24(%rsi)
        addq %rax, %r15
        adcq %rdx, %rbx
        /* Same fold of r13, r14, r15, rbx into r9..r12 as after the first product. */
        movq $38, %rax
        mulq %rbx
        addq %rax, %r12
        adcq $0x00, %rdx
        movq $0x7fffffffffffffff, %r8
        shldq $0x01, %r12, %rdx
        imulq $19, %rdx, %rdx
        andq %r8, %r12
        movq %rdx, %r8
        movq $38, %rax
        mulq %r13
        xorq %r13, %r13
        addq %rax, %r9
        movq $38, %rax
        adcq %rdx, %r13
        mulq %r14
        xorq %r14, %r14
        addq %rax, %r10
        movq $38, %rax
        adcq %rdx, %r14
        mulq %r15
        xorq %r15, %r15
        addq %rax, %r11
        adcq %rdx, %r15
        addq %r8, %r9
        adcq %r13, %r10
        adcq %r14, %r11
        adcq %r15, %r12
        # Store
        movq %r9, (%rdi)
        movq %r10, 8(%rdi)
        movq %r11, 16(%rdi)
        movq %r12, 24(%rdi)
        /*
         * Third multiplication: rcx = rsi - 32 (the element just below the
         * one rsi points at) and the destination moves back by 32 bytes.
         */
        movq %rsi, %rcx
        subq $32, %rcx
        subq $32, %rdi
        # Multiply
        # A[0] * B[0]
        movq (%rcx), %rax
        mulq (%rsi)
        movq %rax, %r9
        movq %rdx, %r10
        # A[0] * B[1]
        movq 8(%rcx), %rax
        mulq (%rsi)
        xorq %r11, %r11
        addq %rax, %r10
        adcq %rdx, %r11
        # A[1] * B[0]
        movq (%rcx), %rax
        mulq 8(%rsi)
        xorq %r12, %r12
        addq %rax, %r10
        adcq %rdx, %r11
        adcq $0x00, %r12
        # A[0] * B[2]
        movq 16(%rcx), %rax
        mulq (%rsi)
        addq %rax, %r11
        adcq %rdx, %r12
        # A[1] * B[1]
        movq 8(%rcx), %rax
        mulq 8(%rsi)
        xorq %r13, %r13
        addq %rax, %r11
        adcq %rdx, %r12
        adcq $0x00, %r13
        # A[2] * B[0]
        movq (%rcx), %rax
        mulq 16(%rsi)
        addq %rax, %r11
        adcq %rdx, %r12
        adcq $0x00, %r13
        # A[0] * B[3]
        movq 24(%rcx), %rax
        mulq (%rsi)
        xorq %r14, %r14
        addq %rax, %r12
        adcq %rdx, %r13
        adcq $0x00, %r14
        # A[1] * B[2]
        movq 16(%rcx), %rax
        mulq 8(%rsi)
        addq %rax, %r12
        adcq %rdx, %r13
        adcq $0x00, %r14
        # A[2] * B[1]
        movq 8(%rcx), %rax
        mulq 16(%rsi)
        addq %rax, %r12
        adcq %rdx, %r13
        adcq $0x00, %r14
        # A[3] * B[0]
        movq (%rcx), %rax
        mulq 24(%rsi)
        addq %rax, %r12
        adcq %rdx, %r13
        adcq $0x00, %r14
        # A[1] * B[3]
        movq 24(%rcx), %rax
        mulq 8(%rsi)
        xorq %r15, %r15
        addq %rax, %r13
        adcq %rdx, %r14
        adcq $0x00, %r15
        # A[2] * B[2]
        movq 16(%rcx), %rax
        mulq 16(%rsi)
        addq %rax, %r13
        adcq %rdx, %r14
        adcq $0x00, %r15
        # A[3] * B[1]
        movq 8(%rcx), %rax
        mulq 24(%rsi)
        addq %rax, %r13
        adcq %rdx, %r14
        adcq $0x00, %r15
        # A[2] * B[3]
        movq 24(%rcx), %rax
        mulq 16(%rsi)
        xorq %rbx, %rbx
        addq %rax, %r14
        adcq %rdx, %r15
        adcq $0x00, %rbx
        # A[3] * B[2]
        movq 16(%rcx), %rax
        mulq 24(%rsi)
        addq %rax, %r14
        adcq %rdx, %r15
        adcq $0x00, %rbx
        # A[3] * B[3]
        movq 24(%rcx), %rax
        mulq 24(%rsi)
        addq %rax, %r15
        adcq %rdx, %rbx
        /* Same fold of r13, r14, r15, rbx into r9..r12 as after the first product. */
        movq $38, %rax
        mulq %rbx
        addq %rax, %r12
        adcq $0x00, %rdx
        movq $0x7fffffffffffffff, %r8
        shldq $0x01, %r12, %rdx
        imulq $19, %rdx, %rdx
        andq %r8, %r12
        movq %rdx, %r8
        movq $38, %rax
        mulq %r13
        xorq %r13, %r13
        addq %rax, %r9
        movq $38, %rax
        adcq %rdx, %r13
        mulq %r14
        xorq %r14, %r14
        addq %rax, %r10
        movq $38, %rax
        adcq %rdx, %r14
        mulq %r15
        xorq %r15, %r15
        addq %rax, %r11
        adcq %rdx, %r15
        addq %r8, %r9
        adcq %r13, %r10
        adcq %r14, %r11
        adcq %r15, %r12
        # Store
        movq %r9, (%rdi)
        movq %r10, 8(%rdi)
        movq %r11, 16(%rdi)
        movq %r12, 24(%rdi)
        /* Release the 16-byte scratch area and restore the pushed registers. */
        addq $16, %rsp
        popq %rbx
        popq %r15
        popq %r14
        popq %r13
        popq %r12
        repz retq
#ifndef __APPLE__
.size ge_p1p1_to_p2_x64,.-ge_p1p1_to_p2_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl ge_p1p1_to_p3_x64
.type ge_p1p1_to_p3_x64,@function
.align 16
ge_p1p1_to_p3_x64:
#else
.section __TEXT,__text
.globl _ge_p1p1_to_p3_x64
.p2align 4
_ge_p1p1_to_p3_x64:
#endif /* __APPLE__ */
        pushq %r12
        pushq %r13
        pushq %r14
        pushq %r15
        pushq %rbx
        subq $16, %rsp
        movq %rdi, (%rsp)
        movq %rsi, 8(%rsp)
        movq %rsi, %rcx
        addq $0x60, %rcx
        # Multiply
        # A[0] * B[0]
        movq (%rcx), %rax
        mulq (%rsi)
        movq %rax, %r9
        movq %rdx, %r10
        # A[0] * B[1]
        movq 8(%rcx), %rax
        mulq (%rsi)
        xorq %r11, %r11
        addq %rax, %r10
        adcq %rdx, %r11
        # A[1] * B[0]
        movq (%rcx), %rax
        mulq 8(%rsi)
        xorq %r12, %r12
        addq %rax, %r10
        adcq
%rdx, %r11 adcq $0x00, %r12 # A[0] * B[2] movq 16(%rcx), %rax mulq (%rsi) addq %rax, %r11 adcq %rdx, %r12 # A[1] * B[1] movq 8(%rcx), %rax mulq 8(%rsi) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[0] movq (%rcx), %rax mulq 16(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[0] * B[3] movq 24(%rcx), %rax mulq (%rsi) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[1] * B[2] movq 16(%rcx), %rax mulq 8(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[1] movq 8(%rcx), %rax mulq 16(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[0] movq (%rcx), %rax mulq 24(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[1] * B[3] movq 24(%rcx), %rax mulq 8(%rsi) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[2] * B[2] movq 16(%rcx), %rax mulq 16(%rsi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[1] movq 8(%rcx), %rax mulq 24(%rsi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[2] * B[3] movq 24(%rcx), %rax mulq 16(%rsi) xorq %rbx, %rbx addq %rax, %r14 adcq %rdx, %r15 adcq $0x00, %rbx # A[3] * B[2] movq 16(%rcx), %rax mulq 24(%rsi) addq %rax, %r14 adcq %rdx, %r15 adcq $0x00, %rbx # A[3] * B[3] movq 24(%rcx), %rax mulq 24(%rsi) addq %rax, %r15 adcq %rdx, %rbx movq $38, %rax mulq %rbx addq %rax, %r12 adcq $0x00, %rdx movq $0x7fffffffffffffff, %r8 shldq $0x01, %r12, %rdx imulq $19, %rdx, %rdx andq %r8, %r12 movq %rdx, %r8 movq $38, %rax mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $38, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 movq $38, %rax adcq %rdx, %r14 mulq %r15 xorq %r15, %r15 addq %rax, %r11 adcq %rdx, %r15 addq %r8, %r9 adcq %r13, %r10 adcq %r14, %r11 adcq %r15, %r12 # Store movq %r9, (%rdi) movq %r10, 8(%rdi) movq %r11, 16(%rdi) movq %r12, 24(%rdi) movq %rsi, %rcx addq $32, %rcx addq $0x60, %rdi # Multiply # A[0] * B[0] movq (%rcx), %rax mulq (%rsi) movq %rax, %r9 movq %rdx, %r10 # A[0] * B[1] movq 8(%rcx), %rax 
mulq (%rsi) xorq %r11, %r11 addq %rax, %r10 adcq %rdx, %r11 # A[1] * B[0] movq (%rcx), %rax mulq 8(%rsi) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[0] * B[2] movq 16(%rcx), %rax mulq (%rsi) addq %rax, %r11 adcq %rdx, %r12 # A[1] * B[1] movq 8(%rcx), %rax mulq 8(%rsi) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[0] movq (%rcx), %rax mulq 16(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[0] * B[3] movq 24(%rcx), %rax mulq (%rsi) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[1] * B[2] movq 16(%rcx), %rax mulq 8(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[1] movq 8(%rcx), %rax mulq 16(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[0] movq (%rcx), %rax mulq 24(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[1] * B[3] movq 24(%rcx), %rax mulq 8(%rsi) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[2] * B[2] movq 16(%rcx), %rax mulq 16(%rsi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[1] movq 8(%rcx), %rax mulq 24(%rsi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[2] * B[3] movq 24(%rcx), %rax mulq 16(%rsi) xorq %rbx, %rbx addq %rax, %r14 adcq %rdx, %r15 adcq $0x00, %rbx # A[3] * B[2] movq 16(%rcx), %rax mulq 24(%rsi) addq %rax, %r14 adcq %rdx, %r15 adcq $0x00, %rbx # A[3] * B[3] movq 24(%rcx), %rax mulq 24(%rsi) addq %rax, %r15 adcq %rdx, %rbx movq $38, %rax mulq %rbx addq %rax, %r12 adcq $0x00, %rdx movq $0x7fffffffffffffff, %r8 shldq $0x01, %r12, %rdx imulq $19, %rdx, %rdx andq %r8, %r12 movq %rdx, %r8 movq $38, %rax mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $38, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 movq $38, %rax adcq %rdx, %r14 mulq %r15 xorq %r15, %r15 addq %rax, %r11 adcq %rdx, %r15 addq %r8, %r9 adcq %r13, %r10 adcq %r14, %r11 adcq %r15, %r12 # Store movq %r9, (%rdi) movq %r10, 8(%rdi) movq %r11, 16(%rdi) movq %r12, 24(%rdi) addq $0x40, %rsi subq 
$0x40, %rdi # Multiply # A[0] * B[0] movq (%rcx), %rax mulq (%rsi) movq %rax, %r9 movq %rdx, %r10 # A[0] * B[1] movq 8(%rcx), %rax mulq (%rsi) xorq %r11, %r11 addq %rax, %r10 adcq %rdx, %r11 # A[1] * B[0] movq (%rcx), %rax mulq 8(%rsi) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[0] * B[2] movq 16(%rcx), %rax mulq (%rsi) addq %rax, %r11 adcq %rdx, %r12 # A[1] * B[1] movq 8(%rcx), %rax mulq 8(%rsi) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[0] movq (%rcx), %rax mulq 16(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[0] * B[3] movq 24(%rcx), %rax mulq (%rsi) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[1] * B[2] movq 16(%rcx), %rax mulq 8(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[1] movq 8(%rcx), %rax mulq 16(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[0] movq (%rcx), %rax mulq 24(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[1] * B[3] movq 24(%rcx), %rax mulq 8(%rsi) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[2] * B[2] movq 16(%rcx), %rax mulq 16(%rsi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[1] movq 8(%rcx), %rax mulq 24(%rsi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[2] * B[3] movq 24(%rcx), %rax mulq 16(%rsi) xorq %rbx, %rbx addq %rax, %r14 adcq %rdx, %r15 adcq $0x00, %rbx # A[3] * B[2] movq 16(%rcx), %rax mulq 24(%rsi) addq %rax, %r14 adcq %rdx, %r15 adcq $0x00, %rbx # A[3] * B[3] movq 24(%rcx), %rax mulq 24(%rsi) addq %rax, %r15 adcq %rdx, %rbx movq $38, %rax mulq %rbx addq %rax, %r12 adcq $0x00, %rdx movq $0x7fffffffffffffff, %r8 shldq $0x01, %r12, %rdx imulq $19, %rdx, %rdx andq %r8, %r12 movq %rdx, %r8 movq $38, %rax mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $38, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 movq $38, %rax adcq %rdx, %r14 mulq %r15 xorq %r15, %r15 addq %rax, %r11 adcq %rdx, %r15 addq %r8, %r9 adcq %r13, %r10 adcq 
%r14, %r11 adcq %r15, %r12 # Store movq %r9, (%rdi) movq %r10, 8(%rdi) movq %r11, 16(%rdi) movq %r12, 24(%rdi) movq %rsi, %rcx addq $32, %rcx addq $32, %rdi # Multiply # A[0] * B[0] movq (%rcx), %rax mulq (%rsi) movq %rax, %r9 movq %rdx, %r10 # A[0] * B[1] movq 8(%rcx), %rax mulq (%rsi) xorq %r11, %r11 addq %rax, %r10 adcq %rdx, %r11 # A[1] * B[0] movq (%rcx), %rax mulq 8(%rsi) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[0] * B[2] movq 16(%rcx), %rax mulq (%rsi) addq %rax, %r11 adcq %rdx, %r12 # A[1] * B[1] movq 8(%rcx), %rax mulq 8(%rsi) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[0] movq (%rcx), %rax mulq 16(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[0] * B[3] movq 24(%rcx), %rax mulq (%rsi) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[1] * B[2] movq 16(%rcx), %rax mulq 8(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[1] movq 8(%rcx), %rax mulq 16(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[0] movq (%rcx), %rax mulq 24(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[1] * B[3] movq 24(%rcx), %rax mulq 8(%rsi) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[2] * B[2] movq 16(%rcx), %rax mulq 16(%rsi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[1] movq 8(%rcx), %rax mulq 24(%rsi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[2] * B[3] movq 24(%rcx), %rax mulq 16(%rsi) xorq %rbx, %rbx addq %rax, %r14 adcq %rdx, %r15 adcq $0x00, %rbx # A[3] * B[2] movq 16(%rcx), %rax mulq 24(%rsi) addq %rax, %r14 adcq %rdx, %r15 adcq $0x00, %rbx # A[3] * B[3] movq 24(%rcx), %rax mulq 24(%rsi) addq %rax, %r15 adcq %rdx, %rbx movq $38, %rax mulq %rbx addq %rax, %r12 adcq $0x00, %rdx movq $0x7fffffffffffffff, %r8 shldq $0x01, %r12, %rdx imulq $19, %rdx, %rdx andq %r8, %r12 movq %rdx, %r8 movq $38, %rax mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $38, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 
addq %rax, %r10 movq $38, %rax adcq %rdx, %r14 mulq %r15 xorq %r15, %r15 addq %rax, %r11 adcq %rdx, %r15 addq %r8, %r9 adcq %r13, %r10 adcq %r14, %r11 adcq %r15, %r12 # Store movq %r9, (%rdi) movq %r10, 8(%rdi) movq %r11, 16(%rdi) movq %r12, 24(%rdi) addq $16, %rsp popq %rbx popq %r15 popq %r14 popq %r13 popq %r12 repz retq #ifndef __APPLE__ .size ge_p1p1_to_p3_x64,.-ge_p1p1_to_p3_x64 #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl ge_p2_dbl_x64 .type ge_p2_dbl_x64,@function .align 16 ge_p2_dbl_x64: #else .section __TEXT,__text .globl _ge_p2_dbl_x64 .p2align 4 _ge_p2_dbl_x64: #endif /* __APPLE__ */ pushq %r12 pushq %r13 pushq %r14 pushq %r15 pushq %rbx subq $16, %rsp movq %rdi, (%rsp) movq %rsi, 8(%rsp) addq $0x40, %rdi # Square # A[0] * A[1] movq (%rsi), %rax mulq 8(%rsi) movq %rax, %r10 movq %rdx, %r11 # A[0] * A[2] movq (%rsi), %rax mulq 16(%rsi) xorq %r12, %r12 addq %rax, %r11 adcq %rdx, %r12 # A[0] * A[3] movq (%rsi), %rax mulq 24(%rsi) xorq %r13, %r13 addq %rax, %r12 adcq %rdx, %r13 # A[1] * A[2] movq 8(%rsi), %rax mulq 16(%rsi) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[1] * A[3] movq 8(%rsi), %rax mulq 24(%rsi) addq %rax, %r13 adcq %rdx, %r14 # A[2] * A[3] movq 16(%rsi), %rax mulq 24(%rsi) xorq %r15, %r15 addq %rax, %r14 adcq %rdx, %r15 # Double xorq %rbx, %rbx addq %r10, %r10 adcq %r11, %r11 adcq %r12, %r12 adcq %r13, %r13 adcq %r14, %r14 adcq %r15, %r15 adcq $0x00, %rbx # A[0] * A[0] movq (%rsi), %rax mulq %rax movq %rax, %r9 movq %rdx, %r8 # A[1] * A[1] movq 8(%rsi), %rax mulq %rax addq %r8, %r10 adcq %rax, %r11 adcq $0x00, %rdx movq %rdx, %r8 # A[2] * A[2] movq 16(%rsi), %rax mulq %rax addq %r8, %r12 adcq %rax, %r13 adcq $0x00, %rdx movq %rdx, %r8 # A[3] * A[3] movq 24(%rsi), %rax mulq %rax addq %rax, %r15 adcq %rdx, %rbx addq %r8, %r14 adcq $0x00, %r15 adcq $0x00, %rbx movq $38, %rax mulq %rbx addq %rax, %r12 adcq $0x00, %rdx movq $0x7fffffffffffffff, %r8 shldq $0x01, %r12, %rdx imulq $19, %rdx, %rdx andq %r8, %r12 movq 
%rdx, %r8 movq $38, %rax mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $38, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 movq $38, %rax adcq %rdx, %r14 mulq %r15 xorq %r15, %r15 addq %rax, %r11 adcq %rdx, %r15 addq %r8, %r9 adcq %r13, %r10 adcq %r14, %r11 adcq %r15, %r12 # Store movq %r9, (%rdi) movq %r10, 8(%rdi) movq %r11, 16(%rdi) movq %r12, 24(%rdi) addq $32, %rsi # Square # A[0] * A[1] movq (%rsi), %rax mulq 8(%rsi) movq %rax, %r10 movq %rdx, %r11 # A[0] * A[2] movq (%rsi), %rax mulq 16(%rsi) xorq %r12, %r12 addq %rax, %r11 adcq %rdx, %r12 # A[0] * A[3] movq (%rsi), %rax mulq 24(%rsi) xorq %r13, %r13 addq %rax, %r12 adcq %rdx, %r13 # A[1] * A[2] movq 8(%rsi), %rax mulq 16(%rsi) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[1] * A[3] movq 8(%rsi), %rax mulq 24(%rsi) addq %rax, %r13 adcq %rdx, %r14 # A[2] * A[3] movq 16(%rsi), %rax mulq 24(%rsi) xorq %r15, %r15 addq %rax, %r14 adcq %rdx, %r15 # Double xorq %rbx, %rbx addq %r10, %r10 adcq %r11, %r11 adcq %r12, %r12 adcq %r13, %r13 adcq %r14, %r14 adcq %r15, %r15 adcq $0x00, %rbx # A[0] * A[0] movq (%rsi), %rax mulq %rax movq %rax, %r9 movq %rdx, %r8 # A[1] * A[1] movq 8(%rsi), %rax mulq %rax addq %r8, %r10 adcq %rax, %r11 adcq $0x00, %rdx movq %rdx, %r8 # A[2] * A[2] movq 16(%rsi), %rax mulq %rax addq %r8, %r12 adcq %rax, %r13 adcq $0x00, %rdx movq %rdx, %r8 # A[3] * A[3] movq 24(%rsi), %rax mulq %rax addq %rax, %r15 adcq %rdx, %rbx addq %r8, %r14 adcq $0x00, %r15 adcq $0x00, %rbx movq $38, %rax mulq %rbx addq %rax, %r12 adcq $0x00, %rdx movq $0x7fffffffffffffff, %r8 shldq $0x01, %r12, %rdx imulq $19, %rdx, %rdx andq %r8, %r12 movq %rdx, %r8 movq $38, %rax mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $38, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 movq $38, %rax adcq %rdx, %r14 mulq %r15 xorq %r15, %r15 addq %rax, %r11 adcq %rdx, %r15 addq %r8, %r9 adcq %r13, %r10 adcq %r14, %r11 adcq %r15, %r12 # Store movq %rdi, %rsi subq $32, %rdi # Add-Sub # Add 
movq %r9, %r13 addq (%rsi), %r9 movq %r10, %r14 adcq 8(%rsi), %r10 movq %r11, %r15 adcq 16(%rsi), %r11 movq %r12, %rbx adcq 24(%rsi), %r12 movq $0x00, %r8 adcq $0x00, %r8 shldq $0x01, %r12, %r8 imulq $19, %r8 btr $63, %r12 # Sub modulus (if overflow) addq %r8, %r9 adcq $0x00, %r10 adcq $0x00, %r11 adcq $0x00, %r12 # Sub subq (%rsi), %r13 sbbq 8(%rsi), %r14 sbbq 16(%rsi), %r15 sbbq 24(%rsi), %rbx sbbq %r8, %r8 shldq $0x01, %rbx, %r8 imulq $-19, %r8 btr $63, %rbx # Add modulus (if underflow) subq %r8, %r13 sbbq $0x00, %r14 sbbq $0x00, %r15 sbbq $0x00, %rbx movq %r9, (%rdi) movq %r10, 8(%rdi) movq %r11, 16(%rdi) movq %r12, 24(%rdi) movq %r13, (%rsi) movq %r14, 8(%rsi) movq %r15, 16(%rsi) movq %rbx, 24(%rsi) movq 8(%rsp), %rcx movq %rcx, %rsi addq $32, %rsi subq $32, %rdi # Add movq (%rsi), %r9 movq 8(%rsi), %r10 addq (%rcx), %r9 movq 16(%rsi), %r11 adcq 8(%rcx), %r10 movq 24(%rsi), %r12 adcq 16(%rcx), %r11 adcq 24(%rcx), %r12 movq $0x00, %r8 adcq $0x00, %r8 shldq $0x01, %r12, %r8 imulq $19, %r8 btr $63, %r12 # Sub modulus (if overflow) addq %r8, %r9 adcq $0x00, %r10 adcq $0x00, %r11 adcq $0x00, %r12 movq %r9, (%rdi) movq %r10, 8(%rdi) movq %r11, 16(%rdi) movq %r12, 24(%rdi) # Square # A[0] * A[1] movq (%rdi), %rax mulq 8(%rdi) movq %rax, %r10 movq %rdx, %r11 # A[0] * A[2] movq (%rdi), %rax mulq 16(%rdi) xorq %r12, %r12 addq %rax, %r11 adcq %rdx, %r12 # A[0] * A[3] movq (%rdi), %rax mulq 24(%rdi) xorq %r13, %r13 addq %rax, %r12 adcq %rdx, %r13 # A[1] * A[2] movq 8(%rdi), %rax mulq 16(%rdi) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[1] * A[3] movq 8(%rdi), %rax mulq 24(%rdi) addq %rax, %r13 adcq %rdx, %r14 # A[2] * A[3] movq 16(%rdi), %rax mulq 24(%rdi) xorq %r15, %r15 addq %rax, %r14 adcq %rdx, %r15 # Double xorq %rbx, %rbx addq %r10, %r10 adcq %r11, %r11 adcq %r12, %r12 adcq %r13, %r13 adcq %r14, %r14 adcq %r15, %r15 adcq $0x00, %rbx # A[0] * A[0] movq (%rdi), %rax mulq %rax movq %rax, %r9 movq %rdx, %r8 # A[1] * A[1] movq 8(%rdi), %rax mulq 
%rax addq %r8, %r10 adcq %rax, %r11 adcq $0x00, %rdx movq %rdx, %r8 # A[2] * A[2] movq 16(%rdi), %rax mulq %rax addq %r8, %r12 adcq %rax, %r13 adcq $0x00, %rdx movq %rdx, %r8 # A[3] * A[3] movq 24(%rdi), %rax mulq %rax addq %rax, %r15 adcq %rdx, %rbx addq %r8, %r14 adcq $0x00, %r15 adcq $0x00, %rbx movq $38, %rax mulq %rbx addq %rax, %r12 adcq $0x00, %rdx movq $0x7fffffffffffffff, %r8 shldq $0x01, %r12, %rdx imulq $19, %rdx, %rdx andq %r8, %r12 movq %rdx, %r8 movq $38, %rax mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $38, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 movq $38, %rax adcq %rdx, %r14 mulq %r15 xorq %r15, %r15 addq %rax, %r11 adcq %rdx, %r15 addq %r8, %r9 adcq %r13, %r10 adcq %r14, %r11 adcq %r15, %r12 # Store movq %rdi, %rsi addq $32, %rsi # Sub subq (%rsi), %r9 sbbq 8(%rsi), %r10 sbbq 16(%rsi), %r11 sbbq 24(%rsi), %r12 sbbq %r8, %r8 shldq $0x01, %r12, %r8 imulq $-19, %r8 btr $63, %r12 # Add modulus (if underflow) subq %r8, %r9 sbbq $0x00, %r10 sbbq $0x00, %r11 sbbq $0x00, %r12 movq %r9, (%rdi) movq %r10, 8(%rdi) movq %r11, 16(%rdi) movq %r12, 24(%rdi) addq $0x40, %rcx # Square * 2 # A[0] * A[1] movq (%rcx), %rax mulq 8(%rcx) movq %rax, %r10 movq %rdx, %r11 # A[0] * A[2] movq (%rcx), %rax mulq 16(%rcx) xorq %r12, %r12 addq %rax, %r11 adcq %rdx, %r12 # A[0] * A[3] movq (%rcx), %rax mulq 24(%rcx) xorq %r13, %r13 addq %rax, %r12 adcq %rdx, %r13 # A[1] * A[2] movq 8(%rcx), %rax mulq 16(%rcx) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[1] * A[3] movq 8(%rcx), %rax mulq 24(%rcx) addq %rax, %r13 adcq %rdx, %r14 # A[2] * A[3] movq 16(%rcx), %rax mulq 24(%rcx) xorq %r15, %r15 addq %rax, %r14 adcq %rdx, %r15 # Double xorq %rbx, %rbx addq %r10, %r10 adcq %r11, %r11 adcq %r12, %r12 adcq %r13, %r13 adcq %r14, %r14 adcq %r15, %r15 adcq $0x00, %rbx # A[0] * A[0] movq (%rcx), %rax mulq %rax movq %rax, %r9 movq %rdx, %r8 # A[1] * A[1] movq 8(%rcx), %rax mulq %rax addq %r8, %r10 adcq %rax, %r11 adcq $0x00, %rdx movq %rdx, %r8 
# A[2] * A[2] movq 16(%rcx), %rax mulq %rax addq %r8, %r12 adcq %rax, %r13 adcq $0x00, %rdx movq %rdx, %r8 # A[3] * A[3] movq 24(%rcx), %rax mulq %rax addq %rax, %r15 adcq %rdx, %rbx addq %r8, %r14 adcq $0x00, %r15 adcq $0x00, %rbx movq $38, %rax mulq %rbx addq %rax, %r12 adcq $0x00, %rdx movq $0x7fffffffffffffff, %r8 shldq $0x01, %r12, %rdx imulq $19, %rdx, %rdx andq %r8, %r12 movq %rdx, %r8 movq $38, %rax mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $38, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 movq $38, %rax adcq %rdx, %r14 mulq %r15 xorq %r15, %r15 addq %rax, %r11 adcq %rdx, %r15 addq %r8, %r9 adcq %r13, %r10 adcq %r14, %r11 adcq %r15, %r12 movq %r12, %rax shldq $0x01, %r11, %r12 shldq $0x01, %r10, %r11 shldq $0x01, %r9, %r10 shlq $1, %r9 movq $0x7fffffffffffffff, %r8 shrq $62, %rax andq %r8, %r12 imulq $19, %rax, %rax addq %rax, %r9 adcq $0x00, %r10 adcq $0x00, %r11 adcq $0x00, %r12 # Store movq %rdi, %rsi addq $0x40, %rsi addq $0x60, %rdi # Sub subq (%rsi), %r9 sbbq 8(%rsi), %r10 sbbq 16(%rsi), %r11 sbbq 24(%rsi), %r12 sbbq %r8, %r8 shldq $0x01, %r12, %r8 imulq $-19, %r8 btr $63, %r12 # Add modulus (if underflow) subq %r8, %r9 sbbq $0x00, %r10 sbbq $0x00, %r11 sbbq $0x00, %r12 movq %r9, (%rdi) movq %r10, 8(%rdi) movq %r11, 16(%rdi) movq %r12, 24(%rdi) addq $16, %rsp popq %rbx popq %r15 popq %r14 popq %r13 popq %r12 repz retq #ifndef __APPLE__ .size ge_p2_dbl_x64,.-ge_p2_dbl_x64 #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl ge_madd_x64 .type ge_madd_x64,@function .align 16 ge_madd_x64: #else .section __TEXT,__text .globl _ge_madd_x64 .p2align 4 _ge_madd_x64: #endif /* __APPLE__ */ pushq %r12 pushq %r13 pushq %r14 pushq %r15 pushq %rbx pushq %rbp movq %rdx, %rcx subq $24, %rsp movq %rdi, (%rsp) movq %rsi, 8(%rsp) movq %rcx, 16(%rsp) movq %rsi, %r8 movq %rsi, %rcx addq $32, %rcx movq %rdi, %rsi addq $32, %rsi # Add-Sub # Add movq (%rcx), %r10 movq 8(%rcx), %r11 movq 16(%rcx), %r12 movq 24(%rcx), %r13 movq %r10, %r14 addq (%r8), 
%r10 movq %r11, %r15 adcq 8(%r8), %r11 movq %r12, %rbx adcq 16(%r8), %r12 movq %r13, %rbp adcq 24(%r8), %r13 movq $0x00, %r9 adcq $0x00, %r9 shldq $0x01, %r13, %r9 imulq $19, %r9 btr $63, %r13 # Sub modulus (if overflow) addq %r9, %r10 adcq $0x00, %r11 adcq $0x00, %r12 adcq $0x00, %r13 # Sub subq (%r8), %r14 sbbq 8(%r8), %r15 sbbq 16(%r8), %rbx sbbq 24(%r8), %rbp sbbq %r9, %r9 shldq $0x01, %rbp, %r9 imulq $-19, %r9 btr $63, %rbp # Add modulus (if underflow) subq %r9, %r14 sbbq $0x00, %r15 sbbq $0x00, %rbx sbbq $0x00, %rbp movq %r10, (%rdi) movq %r11, 8(%rdi) movq %r12, 16(%rdi) movq %r13, 24(%rdi) movq %r14, (%rsi) movq %r15, 8(%rsi) movq %rbx, 16(%rsi) movq %rbp, 24(%rsi) movq 16(%rsp), %rcx addq $32, %rcx # Multiply # A[0] * B[0] movq (%rcx), %rax mulq (%rsi) movq %rax, %r10 movq %rdx, %r11 # A[0] * B[1] movq 8(%rcx), %rax mulq (%rsi) xorq %r12, %r12 addq %rax, %r11 adcq %rdx, %r12 # A[1] * B[0] movq (%rcx), %rax mulq 8(%rsi) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[0] * B[2] movq 16(%rcx), %rax mulq (%rsi) addq %rax, %r12 adcq %rdx, %r13 # A[1] * B[1] movq 8(%rcx), %rax mulq 8(%rsi) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[0] movq (%rcx), %rax mulq 16(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[0] * B[3] movq 24(%rcx), %rax mulq (%rsi) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[1] * B[2] movq 16(%rcx), %rax mulq 8(%rsi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[2] * B[1] movq 8(%rcx), %rax mulq 16(%rsi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[0] movq (%rcx), %rax mulq 24(%rsi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[1] * B[3] movq 24(%rcx), %rax mulq 8(%rsi) xorq %rbx, %rbx addq %rax, %r14 adcq %rdx, %r15 adcq $0x00, %rbx # A[2] * B[2] movq 16(%rcx), %rax mulq 16(%rsi) addq %rax, %r14 adcq %rdx, %r15 adcq $0x00, %rbx # A[3] * B[1] movq 8(%rcx), %rax mulq 24(%rsi) addq %rax, %r14 adcq %rdx, %r15 adcq $0x00, %rbx # 
A[2] * B[3] movq 24(%rcx), %rax mulq 16(%rsi) xorq %rbp, %rbp addq %rax, %r15 adcq %rdx, %rbx adcq $0x00, %rbp # A[3] * B[2] movq 16(%rcx), %rax mulq 24(%rsi) addq %rax, %r15 adcq %rdx, %rbx adcq $0x00, %rbp # A[3] * B[3] movq 24(%rcx), %rax mulq 24(%rsi) addq %rax, %rbx adcq %rdx, %rbp movq $38, %rax mulq %rbp addq %rax, %r13 adcq $0x00, %rdx movq $0x7fffffffffffffff, %r9 shldq $0x01, %r13, %rdx imulq $19, %rdx, %rdx andq %r9, %r13 movq %rdx, %r9 movq $38, %rax mulq %r14 xorq %r14, %r14 addq %rax, %r10 movq $38, %rax adcq %rdx, %r14 mulq %r15 xorq %r15, %r15 addq %rax, %r11 movq $38, %rax adcq %rdx, %r15 mulq %rbx xorq %rbx, %rbx addq %rax, %r12 adcq %rdx, %rbx addq %r9, %r10 adcq %r14, %r11 adcq %r15, %r12 adcq %rbx, %r13 # Store movq %r10, (%rsi) movq %r11, 8(%rsi) movq %r12, 16(%rsi) movq %r13, 24(%rsi) addq $0x60, %r8 addq $32, %rcx addq $0x60, %rdi # Multiply # A[0] * B[0] movq (%rcx), %rax mulq (%r8) movq %rax, %r10 movq %rdx, %r11 # A[0] * B[1] movq 8(%rcx), %rax mulq (%r8) xorq %r12, %r12 addq %rax, %r11 adcq %rdx, %r12 # A[1] * B[0] movq (%rcx), %rax mulq 8(%r8) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[0] * B[2] movq 16(%rcx), %rax mulq (%r8) addq %rax, %r12 adcq %rdx, %r13 # A[1] * B[1] movq 8(%rcx), %rax mulq 8(%r8) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[0] movq (%rcx), %rax mulq 16(%r8) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[0] * B[3] movq 24(%rcx), %rax mulq (%r8) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[1] * B[2] movq 16(%rcx), %rax mulq 8(%r8) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[2] * B[1] movq 8(%rcx), %rax mulq 16(%r8) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[0] movq (%rcx), %rax mulq 24(%r8) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[1] * B[3] movq 24(%rcx), %rax mulq 8(%r8) xorq %rbx, %rbx addq %rax, %r14 adcq %rdx, %r15 adcq $0x00, %rbx # A[2] * B[2] movq 16(%rcx), %rax mulq 16(%r8) addq %rax, 
%r14 adcq %rdx, %r15 adcq $0x00, %rbx # A[3] * B[1] movq 8(%rcx), %rax mulq 24(%r8) addq %rax, %r14 adcq %rdx, %r15 adcq $0x00, %rbx # A[2] * B[3] movq 24(%rcx), %rax mulq 16(%r8) xorq %rbp, %rbp addq %rax, %r15 adcq %rdx, %rbx adcq $0x00, %rbp # A[3] * B[2] movq 16(%rcx), %rax mulq 24(%r8) addq %rax, %r15 adcq %rdx, %rbx adcq $0x00, %rbp # A[3] * B[3] movq 24(%rcx), %rax mulq 24(%r8) addq %rax, %rbx adcq %rdx, %rbp movq $38, %rax mulq %rbp addq %rax, %r13 adcq $0x00, %rdx movq $0x7fffffffffffffff, %r9 shldq $0x01, %r13, %rdx imulq $19, %rdx, %rdx andq %r9, %r13 movq %rdx, %r9 movq $38, %rax mulq %r14 xorq %r14, %r14 addq %rax, %r10 movq $38, %rax adcq %rdx, %r14 mulq %r15 xorq %r15, %r15 addq %rax, %r11 movq $38, %rax adcq %rdx, %r15 mulq %rbx xorq %rbx, %rbx addq %rax, %r12 adcq %rdx, %rbx addq %r9, %r10 adcq %r14, %r11 adcq %r15, %r12 adcq %rbx, %r13 # Store movq %r10, (%rdi) movq %r11, 8(%rdi) movq %r12, 16(%rdi) movq %r13, 24(%rdi) subq $0x40, %rcx subq $0x60, %rdi # Multiply # A[0] * B[0] movq (%rcx), %rax mulq (%rdi) movq %rax, %r10 movq %rdx, %r11 # A[0] * B[1] movq 8(%rcx), %rax mulq (%rdi) xorq %r12, %r12 addq %rax, %r11 adcq %rdx, %r12 # A[1] * B[0] movq (%rcx), %rax mulq 8(%rdi) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[0] * B[2] movq 16(%rcx), %rax mulq (%rdi) addq %rax, %r12 adcq %rdx, %r13 # A[1] * B[1] movq 8(%rcx), %rax mulq 8(%rdi) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[0] movq (%rcx), %rax mulq 16(%rdi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[0] * B[3] movq 24(%rcx), %rax mulq (%rdi) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[1] * B[2] movq 16(%rcx), %rax mulq 8(%rdi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[2] * B[1] movq 8(%rcx), %rax mulq 16(%rdi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[0] movq (%rcx), %rax mulq 24(%rdi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[1] * B[3] movq 24(%rcx), %rax mulq 
8(%rdi) xorq %rbx, %rbx addq %rax, %r14 adcq %rdx, %r15 adcq $0x00, %rbx # A[2] * B[2] movq 16(%rcx), %rax mulq 16(%rdi) addq %rax, %r14 adcq %rdx, %r15 adcq $0x00, %rbx # A[3] * B[1] movq 8(%rcx), %rax mulq 24(%rdi) addq %rax, %r14 adcq %rdx, %r15 adcq $0x00, %rbx # A[2] * B[3] movq 24(%rcx), %rax mulq 16(%rdi) xorq %rbp, %rbp addq %rax, %r15 adcq %rdx, %rbx adcq $0x00, %rbp # A[3] * B[2] movq 16(%rcx), %rax mulq 24(%rdi) addq %rax, %r15 adcq %rdx, %rbx adcq $0x00, %rbp # A[3] * B[3] movq 24(%rcx), %rax mulq 24(%rdi) addq %rax, %rbx adcq %rdx, %rbp movq $38, %rax mulq %rbp addq %rax, %r13 adcq $0x00, %rdx movq $0x7fffffffffffffff, %r9 shldq $0x01, %r13, %rdx imulq $19, %rdx, %rdx andq %r9, %r13 movq %rdx, %r9 movq $38, %rax mulq %r14 xorq %r14, %r14 addq %rax, %r10 movq $38, %rax adcq %rdx, %r14 mulq %r15 xorq %r15, %r15 addq %rax, %r11 movq $38, %rax adcq %rdx, %r15 mulq %rbx xorq %rbx, %rbx addq %rax, %r12 adcq %rdx, %rbx addq %r9, %r10 adcq %r14, %r11 adcq %r15, %r12 adcq %rbx, %r13 # Store # Add-Sub # Add movq %r10, %r14 addq (%rsi), %r10 movq %r11, %r15 adcq 8(%rsi), %r11 movq %r12, %rbx adcq 16(%rsi), %r12 movq %r13, %rbp adcq 24(%rsi), %r13 movq $0x00, %r9 adcq $0x00, %r9 shldq $0x01, %r13, %r9 imulq $19, %r9 btr $63, %r13 # Sub modulus (if overflow) addq %r9, %r10 adcq $0x00, %r11 adcq $0x00, %r12 adcq $0x00, %r13 # Sub subq (%rsi), %r14 sbbq 8(%rsi), %r15 sbbq 16(%rsi), %rbx sbbq 24(%rsi), %rbp sbbq %r9, %r9 shldq $0x01, %rbp, %r9 imulq $-19, %r9 btr $63, %rbp # Add modulus (if underflow) subq %r9, %r14 sbbq $0x00, %r15 sbbq $0x00, %rbx sbbq $0x00, %rbp movq %r10, (%rsi) movq %r11, 8(%rsi) movq %r12, 16(%rsi) movq %r13, 24(%rsi) movq %r14, (%rdi) movq %r15, 8(%rdi) movq %rbx, 16(%rdi) movq %rbp, 24(%rdi) subq $32, %r8 # Double movq (%r8), %r10 movq 8(%r8), %r11 addq %r10, %r10 movq 16(%r8), %r12 adcq %r11, %r11 movq 24(%r8), %r13 adcq %r12, %r12 adcq %r13, %r13 movq $0x00, %r9 adcq $0x00, %r9 shldq $0x01, %r13, %r9 imulq $19, %r9 btr $63, %r13 # Sub 
modulus (if overflow) addq %r9, %r10 adcq $0x00, %r11 adcq $0x00, %r12 adcq $0x00, %r13 movq %rdi, %rsi addq $0x60, %rsi addq $0x40, %rdi # Add-Sub # Add movq %r10, %r14 addq (%rsi), %r10 movq %r11, %r15 adcq 8(%rsi), %r11 movq %r12, %rbx adcq 16(%rsi), %r12 movq %r13, %rbp adcq 24(%rsi), %r13 movq $0x00, %r9 adcq $0x00, %r9 shldq $0x01, %r13, %r9 imulq $19, %r9 btr $63, %r13 # Sub modulus (if overflow) addq %r9, %r10 adcq $0x00, %r11 adcq $0x00, %r12 adcq $0x00, %r13 # Sub subq (%rsi), %r14 sbbq 8(%rsi), %r15 sbbq 16(%rsi), %rbx sbbq 24(%rsi), %rbp sbbq %r9, %r9 shldq $0x01, %rbp, %r9 imulq $-19, %r9 btr $63, %rbp # Add modulus (if underflow) subq %r9, %r14 sbbq $0x00, %r15 sbbq $0x00, %rbx sbbq $0x00, %rbp movq %r10, (%rdi) movq %r11, 8(%rdi) movq %r12, 16(%rdi) movq %r13, 24(%rdi) movq %r14, (%rsi) movq %r15, 8(%rsi) movq %rbx, 16(%rsi) movq %rbp, 24(%rsi) addq $24, %rsp popq %rbp popq %rbx popq %r15 popq %r14 popq %r13 popq %r12 repz retq #ifndef __APPLE__ .size ge_madd_x64,.-ge_madd_x64 #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl ge_msub_x64 .type ge_msub_x64,@function .align 16 ge_msub_x64: #else .section __TEXT,__text .globl _ge_msub_x64 .p2align 4 _ge_msub_x64: #endif /* __APPLE__ */ pushq %r12 pushq %r13 pushq %r14 pushq %r15 pushq %rbx pushq %rbp movq %rdx, %rcx subq $24, %rsp movq %rdi, (%rsp) movq %rsi, 8(%rsp) movq %rcx, 16(%rsp) movq %rsi, %r8 movq %rsi, %rcx addq $32, %rcx movq %rdi, %rsi addq $32, %rsi # Add-Sub # Add movq (%rcx), %r10 movq 8(%rcx), %r11 movq 16(%rcx), %r12 movq 24(%rcx), %r13 movq %r10, %r14 addq (%r8), %r10 movq %r11, %r15 adcq 8(%r8), %r11 movq %r12, %rbx adcq 16(%r8), %r12 movq %r13, %rbp adcq 24(%r8), %r13 movq $0x00, %r9 adcq $0x00, %r9 shldq $0x01, %r13, %r9 imulq $19, %r9 btr $63, %r13 # Sub modulus (if overflow) addq %r9, %r10 adcq $0x00, %r11 adcq $0x00, %r12 adcq $0x00, %r13 # Sub subq (%r8), %r14 sbbq 8(%r8), %r15 sbbq 16(%r8), %rbx sbbq 24(%r8), %rbp sbbq %r9, %r9 shldq $0x01, %rbp, %r9 imulq $-19, %r9 btr 
$63, %rbp # Add modulus (if underflow) subq %r9, %r14 sbbq $0x00, %r15 sbbq $0x00, %rbx sbbq $0x00, %rbp movq %r10, (%rdi) movq %r11, 8(%rdi) movq %r12, 16(%rdi) movq %r13, 24(%rdi) movq %r14, (%rsi) movq %r15, 8(%rsi) movq %rbx, 16(%rsi) movq %rbp, 24(%rsi) movq 16(%rsp), %rcx addq $32, %rdi # Multiply # A[0] * B[0] movq (%rcx), %rax mulq (%rdi) movq %rax, %r10 movq %rdx, %r11 # A[0] * B[1] movq 8(%rcx), %rax mulq (%rdi) xorq %r12, %r12 addq %rax, %r11 adcq %rdx, %r12 # A[1] * B[0] movq (%rcx), %rax mulq 8(%rdi) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[0] * B[2] movq 16(%rcx), %rax mulq (%rdi) addq %rax, %r12 adcq %rdx, %r13 # A[1] * B[1] movq 8(%rcx), %rax mulq 8(%rdi) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[0] movq (%rcx), %rax mulq 16(%rdi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[0] * B[3] movq 24(%rcx), %rax mulq (%rdi) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[1] * B[2] movq 16(%rcx), %rax mulq 8(%rdi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[2] * B[1] movq 8(%rcx), %rax mulq 16(%rdi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[0] movq (%rcx), %rax mulq 24(%rdi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[1] * B[3] movq 24(%rcx), %rax mulq 8(%rdi) xorq %rbx, %rbx addq %rax, %r14 adcq %rdx, %r15 adcq $0x00, %rbx # A[2] * B[2] movq 16(%rcx), %rax mulq 16(%rdi) addq %rax, %r14 adcq %rdx, %r15 adcq $0x00, %rbx # A[3] * B[1] movq 8(%rcx), %rax mulq 24(%rdi) addq %rax, %r14 adcq %rdx, %r15 adcq $0x00, %rbx # A[2] * B[3] movq 24(%rcx), %rax mulq 16(%rdi) xorq %rbp, %rbp addq %rax, %r15 adcq %rdx, %rbx adcq $0x00, %rbp # A[3] * B[2] movq 16(%rcx), %rax mulq 24(%rdi) addq %rax, %r15 adcq %rdx, %rbx adcq $0x00, %rbp # A[3] * B[3] movq 24(%rcx), %rax mulq 24(%rdi) addq %rax, %rbx adcq %rdx, %rbp movq $38, %rax mulq %rbp addq %rax, %r13 adcq $0x00, %rdx movq $0x7fffffffffffffff, %r9 shldq $0x01, %r13, %rdx imulq $19, %rdx, %rdx andq 
%r9, %r13 movq %rdx, %r9 movq $38, %rax mulq %r14 xorq %r14, %r14 addq %rax, %r10 movq $38, %rax adcq %rdx, %r14 mulq %r15 xorq %r15, %r15 addq %rax, %r11 movq $38, %rax adcq %rdx, %r15 mulq %rbx xorq %rbx, %rbx addq %rax, %r12 adcq %rdx, %rbx addq %r9, %r10 adcq %r14, %r11 adcq %r15, %r12 adcq %rbx, %r13 # Store movq %r10, (%rdi) movq %r11, 8(%rdi) movq %r12, 16(%rdi) movq %r13, 24(%rdi) addq $0x60, %r8 addq $0x40, %rcx addq $0x40, %rdi # Multiply # A[0] * B[0] movq (%rcx), %rax mulq (%r8) movq %rax, %r10 movq %rdx, %r11 # A[0] * B[1] movq 8(%rcx), %rax mulq (%r8) xorq %r12, %r12 addq %rax, %r11 adcq %rdx, %r12 # A[1] * B[0] movq (%rcx), %rax mulq 8(%r8) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[0] * B[2] movq 16(%rcx), %rax mulq (%r8) addq %rax, %r12 adcq %rdx, %r13 # A[1] * B[1] movq 8(%rcx), %rax mulq 8(%r8) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[0] movq (%rcx), %rax mulq 16(%r8) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[0] * B[3] movq 24(%rcx), %rax mulq (%r8) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[1] * B[2] movq 16(%rcx), %rax mulq 8(%r8) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[2] * B[1] movq 8(%rcx), %rax mulq 16(%r8) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[0] movq (%rcx), %rax mulq 24(%r8) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[1] * B[3] movq 24(%rcx), %rax mulq 8(%r8) xorq %rbx, %rbx addq %rax, %r14 adcq %rdx, %r15 adcq $0x00, %rbx # A[2] * B[2] movq 16(%rcx), %rax mulq 16(%r8) addq %rax, %r14 adcq %rdx, %r15 adcq $0x00, %rbx # A[3] * B[1] movq 8(%rcx), %rax mulq 24(%r8) addq %rax, %r14 adcq %rdx, %r15 adcq $0x00, %rbx # A[2] * B[3] movq 24(%rcx), %rax mulq 16(%r8) xorq %rbp, %rbp addq %rax, %r15 adcq %rdx, %rbx adcq $0x00, %rbp # A[3] * B[2] movq 16(%rcx), %rax mulq 24(%r8) addq %rax, %r15 adcq %rdx, %rbx adcq $0x00, %rbp # A[3] * B[3] movq 24(%rcx), %rax mulq 24(%r8) addq %rax, %rbx adcq %rdx, %rbp movq 
$38, %rax mulq %rbp addq %rax, %r13 adcq $0x00, %rdx movq $0x7fffffffffffffff, %r9 shldq $0x01, %r13, %rdx imulq $19, %rdx, %rdx andq %r9, %r13 movq %rdx, %r9 movq $38, %rax mulq %r14 xorq %r14, %r14 addq %rax, %r10 movq $38, %rax adcq %rdx, %r14 mulq %r15 xorq %r15, %r15 addq %rax, %r11 movq $38, %rax adcq %rdx, %r15 mulq %rbx xorq %rbx, %rbx addq %rax, %r12 adcq %rdx, %rbx addq %r9, %r10 adcq %r14, %r11 adcq %r15, %r12 adcq %rbx, %r13 # Store movq %r10, (%rdi) movq %r11, 8(%rdi) movq %r12, 16(%rdi) movq %r13, 24(%rdi) subq $32, %rcx subq $0x60, %rdi # Multiply # A[0] * B[0] movq (%rcx), %rax mulq (%rdi) movq %rax, %r10 movq %rdx, %r11 # A[0] * B[1] movq 8(%rcx), %rax mulq (%rdi) xorq %r12, %r12 addq %rax, %r11 adcq %rdx, %r12 # A[1] * B[0] movq (%rcx), %rax mulq 8(%rdi) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[0] * B[2] movq 16(%rcx), %rax mulq (%rdi) addq %rax, %r12 adcq %rdx, %r13 # A[1] * B[1] movq 8(%rcx), %rax mulq 8(%rdi) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[0] movq (%rcx), %rax mulq 16(%rdi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[0] * B[3] movq 24(%rcx), %rax mulq (%rdi) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[1] * B[2] movq 16(%rcx), %rax mulq 8(%rdi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[2] * B[1] movq 8(%rcx), %rax mulq 16(%rdi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[0] movq (%rcx), %rax mulq 24(%rdi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[1] * B[3] movq 24(%rcx), %rax mulq 8(%rdi) xorq %rbx, %rbx addq %rax, %r14 adcq %rdx, %r15 adcq $0x00, %rbx # A[2] * B[2] movq 16(%rcx), %rax mulq 16(%rdi) addq %rax, %r14 adcq %rdx, %r15 adcq $0x00, %rbx # A[3] * B[1] movq 8(%rcx), %rax mulq 24(%rdi) addq %rax, %r14 adcq %rdx, %r15 adcq $0x00, %rbx # A[2] * B[3] movq 24(%rcx), %rax mulq 16(%rdi) xorq %rbp, %rbp addq %rax, %r15 adcq %rdx, %rbx adcq $0x00, %rbp # A[3] * B[2] movq 16(%rcx), %rax mulq 24(%rdi) 
addq %rax, %r15 adcq %rdx, %rbx adcq $0x00, %rbp # A[3] * B[3] movq 24(%rcx), %rax mulq 24(%rdi) addq %rax, %rbx adcq %rdx, %rbp movq $38, %rax mulq %rbp addq %rax, %r13 adcq $0x00, %rdx movq $0x7fffffffffffffff, %r9 shldq $0x01, %r13, %rdx imulq $19, %rdx, %rdx andq %r9, %r13 movq %rdx, %r9 movq $38, %rax mulq %r14 xorq %r14, %r14 addq %rax, %r10 movq $38, %rax adcq %rdx, %r14 mulq %r15 xorq %r15, %r15 addq %rax, %r11 movq $38, %rax adcq %rdx, %r15 mulq %rbx xorq %rbx, %rbx addq %rax, %r12 adcq %rdx, %rbx addq %r9, %r10 adcq %r14, %r11 adcq %r15, %r12 adcq %rbx, %r13 # Store # Add-Sub # Add movq %r10, %r14 addq (%rsi), %r10 movq %r11, %r15 adcq 8(%rsi), %r11 movq %r12, %rbx adcq 16(%rsi), %r12 movq %r13, %rbp adcq 24(%rsi), %r13 movq $0x00, %r9 adcq $0x00, %r9 shldq $0x01, %r13, %r9 imulq $19, %r9 btr $63, %r13 # Sub modulus (if overflow) addq %r9, %r10 adcq $0x00, %r11 adcq $0x00, %r12 adcq $0x00, %r13 # Sub subq (%rsi), %r14 sbbq 8(%rsi), %r15 sbbq 16(%rsi), %rbx sbbq 24(%rsi), %rbp sbbq %r9, %r9 shldq $0x01, %rbp, %r9 imulq $-19, %r9 btr $63, %rbp # Add modulus (if underflow) subq %r9, %r14 sbbq $0x00, %r15 sbbq $0x00, %rbx sbbq $0x00, %rbp movq %r10, (%rsi) movq %r11, 8(%rsi) movq %r12, 16(%rsi) movq %r13, 24(%rsi) movq %r14, (%rdi) movq %r15, 8(%rdi) movq %rbx, 16(%rdi) movq %rbp, 24(%rdi) subq $32, %r8 addq $0x40, %rdi # Double movq (%r8), %r10 movq 8(%r8), %r11 addq %r10, %r10 movq 16(%r8), %r12 adcq %r11, %r11 movq 24(%r8), %r13 adcq %r12, %r12 adcq %r13, %r13 movq $0x00, %r9 adcq $0x00, %r9 shldq $0x01, %r13, %r9 imulq $19, %r9 btr $63, %r13 # Sub modulus (if overflow) addq %r9, %r10 adcq $0x00, %r11 adcq $0x00, %r12 adcq $0x00, %r13 movq %rdi, %rsi addq $32, %rsi # Add-Sub # Add movq %r10, %r14 addq (%rsi), %r10 movq %r11, %r15 adcq 8(%rsi), %r11 movq %r12, %rbx adcq 16(%rsi), %r12 movq %r13, %rbp adcq 24(%rsi), %r13 movq $0x00, %r9 adcq $0x00, %r9 shldq $0x01, %r13, %r9 imulq $19, %r9 btr $63, %r13 # Sub modulus (if overflow) addq %r9, %r10 adcq $0x00, 
%r11
        /* Tail of ge_msub_x64; its beginning lies before this span. */
        adcq $0x00, %r12
        adcq $0x00, %r13
        # Sub
        subq (%rsi), %r14
        sbbq 8(%rsi), %r15
        sbbq 16(%rsi), %rbx
        sbbq 24(%rsi), %rbp
        sbbq %r9, %r9
        shldq $0x01, %rbp, %r9
        imulq $-19, %r9
        btr $63, %rbp
        # Add modulus (if underflow)
        subq %r9, %r14
        sbbq $0x00, %r15
        sbbq $0x00, %rbx
        sbbq $0x00, %rbp
        movq %r10, (%rsi)
        movq %r11, 8(%rsi)
        movq %r12, 16(%rsi)
        movq %r13, 24(%rsi)
        movq %r14, (%rdi)
        movq %r15, 8(%rdi)
        movq %rbx, 16(%rdi)
        movq %rbp, 24(%rdi)
        addq $24, %rsp
        popq %rbp
        popq %rbx
        popq %r15
        popq %r14
        popq %r13
        popq %r12
        repz retq
#ifndef __APPLE__
.size ge_msub_x64,.-ge_msub_x64
#endif /* __APPLE__ */
/* ge_add_x64
 *
 * In:  rdi = r (written), rsi = p (read), rdx = q (read).
 * Each operand is addressed as consecutive 32-byte slots of four 64-bit
 * limbs; the slots at byte offsets 0, 32, 64 and 96 are called [0]..[3].
 * NOTE(review): presumably r, p and q are the Ed25519 group element
 * structures (p1p1, p3, cached) - confirm against the C prototype.
 *
 * Steps performed, each result folded with 2^255 = 19 (mod 2^255 - 19):
 *   r[0] = p[1] + p[0]          r[1] = p[1] - p[0]
 *   r[1] = r[1] * q[1]
 *   r[3] = p[3] * q[3]
 *   t    = r[0] * q[0]          (kept in registers)
 *   r[1] = t + r[1]             r[0] = t - r[1]   (same old r[1] for both)
 *   u    = 2 * (p[2] * q[2])    (kept in registers)
 *   r[2] = u + r[3]             r[3] = u - r[3]   (same old r[3] for both)
 *
 * Pushes/pops r12-r15, rbx, rbp and reserves 24 bytes of stack in which
 * the three argument pointers are saved.
 */
#ifndef __APPLE__
.text
.globl ge_add_x64
.type ge_add_x64,@function
.align 16
ge_add_x64:
#else
.section __TEXT,__text
.globl _ge_add_x64
.p2align 4
_ge_add_x64:
#endif /* __APPLE__ */
        pushq %r12
        pushq %r13
        pushq %r14
        pushq %r15
        pushq %rbx
        pushq %rbp
        /* rdx is overwritten by every mulq, so q is moved out of it. */
        movq %rdx, %rcx
        subq $24, %rsp
        movq %rdi, (%rsp)
        movq %rsi, 8(%rsp)
        movq %rcx, 16(%rsp)
        /* r8 = p+0, rcx = p+32, rsi = r+32 (rdi stays r+0). */
        movq %rsi, %r8
        movq %rsi, %rcx
        addq $32, %rcx
        movq %rdi, %rsi
        addq $32, %rsi
        # Add-Sub
        # Add
        movq (%rcx), %r10
        movq 8(%rcx), %r11
        movq 16(%rcx), %r12
        movq 24(%rcx), %r13
        /* Copy each limb to r14,r15,rbx,rbp for the subtraction below. */
        movq %r10, %r14
        addq (%r8), %r10
        movq %r11, %r15
        adcq 8(%r8), %r11
        movq %r12, %rbx
        adcq 16(%r8), %r12
        movq %r13, %rbp
        adcq 24(%r8), %r13
        /* movq (not xorq) so the carry flag survives into the adcq. */
        movq $0x00, %r9
        adcq $0x00, %r9
        /* r9 = (carry:bit 63 of r13) * 19; bit 63 is then cleared. */
        shldq $0x01, %r13, %r9
        imulq $19, %r9
        btr $63, %r13
        # Sub modulus (if overflow)
        addq %r9, %r10
        adcq $0x00, %r11
        adcq $0x00, %r12
        adcq $0x00, %r13
        # Sub
        subq (%r8), %r14
        sbbq 8(%r8), %r15
        sbbq 16(%r8), %rbx
        sbbq 24(%r8), %rbp
        /* r9 = 0 or -1 from the borrow, then bit 63 of rbp is shifted in
         * and the value is scaled by -19 before being subtracted. */
        sbbq %r9, %r9
        shldq $0x01, %rbp, %r9
        imulq $-19, %r9
        btr $63, %rbp
        # Add modulus (if underflow)
        subq %r9, %r14
        sbbq $0x00, %r15
        sbbq $0x00, %rbx
        sbbq $0x00, %rbp
        /* Sum goes to r+0, difference to r+32. */
        movq %r10, (%rdi)
        movq %r11, 8(%rdi)
        movq %r12, 16(%rdi)
        movq %r13, 24(%rdi)
        movq %r14, (%rsi)
        movq %r15, 8(%rsi)
        movq %rbx, 16(%rsi)
        movq %rbp, 24(%rsi)
        /* Reload q; rcx = q+32, rdi = r+32: product is stored at r+32. */
        movq 16(%rsp), %rcx
        addq $32, %rcx
        addq $32, %rdi
        # Multiply
        # A[0] * B[0]
        movq (%rcx), %rax
        mulq (%rdi)
        movq %rax, %r10
        movq %rdx, %r11
        # A[0] * B[1]
        movq 8(%rcx), %rax
        mulq (%rdi)
        xorq %r12, %r12
        addq %rax, %r11
        adcq %rdx, %r12
        # A[1] * B[0]
        movq (%rcx), %rax
        mulq 8(%rdi)
        xorq %r13, %r13
        addq %rax, %r11
        adcq %rdx, %r12
        adcq $0x00, %r13
        # A[0] * B[2]
        movq 16(%rcx), %rax
        mulq (%rdi)
        addq %rax, %r12
        adcq %rdx, %r13
        # A[1] * B[1]
        movq 8(%rcx), %rax
        mulq 8(%rdi)
        xorq %r14, %r14
        addq %rax, %r12
        adcq %rdx, %r13
        adcq $0x00, %r14
        # A[2] * B[0]
        movq (%rcx), %rax
        mulq 16(%rdi)
        addq %rax, %r12
        adcq %rdx, %r13
        adcq $0x00, %r14
        # A[0] * B[3]
        movq 24(%rcx), %rax
        mulq (%rdi)
        xorq %r15, %r15
        addq %rax, %r13
        adcq %rdx, %r14
        adcq $0x00, %r15
        # A[1] * B[2]
        movq 16(%rcx), %rax
        mulq 8(%rdi)
        addq %rax, %r13
        adcq %rdx, %r14
        adcq $0x00, %r15
        # A[2] * B[1]
        movq 8(%rcx), %rax
        mulq 16(%rdi)
        addq %rax, %r13
        adcq %rdx, %r14
        adcq $0x00, %r15
        # A[3] * B[0]
        movq (%rcx), %rax
        mulq 24(%rdi)
        addq %rax, %r13
        adcq %rdx, %r14
        adcq $0x00, %r15
        # A[1] * B[3]
        movq 24(%rcx), %rax
        mulq 8(%rdi)
        xorq %rbx, %rbx
        addq %rax, %r14
        adcq %rdx, %r15
        adcq $0x00, %rbx
        # A[2] * B[2]
        movq 16(%rcx), %rax
        mulq 16(%rdi)
        addq %rax, %r14
        adcq %rdx, %r15
        adcq $0x00, %rbx
        # A[3] * B[1]
        movq 8(%rcx), %rax
        mulq 24(%rdi)
        addq %rax, %r14
        adcq %rdx, %r15
        adcq $0x00, %rbx
        # A[2] * B[3]
        movq 24(%rcx), %rax
        mulq 16(%rdi)
        xorq %rbp, %rbp
        addq %rax, %r15
        adcq %rdx, %rbx
        adcq $0x00, %rbp
        # A[3] * B[2]
        movq 16(%rcx), %rax
        mulq 24(%rdi)
        addq %rax, %r15
        adcq %rdx, %rbx
        adcq $0x00, %rbp
        # A[3] * B[3]
        movq 24(%rcx), %rax
        mulq 24(%rdi)
        addq %rax, %rbx
        adcq %rdx, %rbp
        /* Fold the 512-bit product r10..r15,rbx,rbp into r10..r13:
         * the upper four words are multiplied by 38 and added to the lower
         * four; the bits shifted out of r13 (from bit 63 up) are multiplied
         * by 19 and added at the bottom. */
        movq $38, %rax
        mulq %rbp
        addq %rax, %r13
        adcq $0x00, %rdx
        movq $0x7fffffffffffffff, %r9
        shldq $0x01, %r13, %rdx
        imulq $19, %rdx, %rdx
        andq %r9, %r13
        movq %rdx, %r9
        movq $38, %rax
        mulq %r14
        xorq %r14, %r14
        addq %rax, %r10
        movq $38, %rax
        adcq %rdx, %r14
        mulq %r15
        xorq %r15, %r15
        addq %rax, %r11
        movq $38, %rax
        adcq %rdx, %r15
        mulq %rbx
        xorq %rbx, %rbx
        addq %rax, %r12
        adcq %rdx, %rbx
        addq %r9, %r10
        adcq %r14, %r11
        adcq %r15, %r12
        adcq %rbx, %r13
        # Store
        movq %r10, (%rdi)
        movq %r11, 8(%rdi)
        movq %r12, 16(%rdi)
        movq %r13, 24(%rdi)
        /* r8 = p+96, rcx = q+96, rdi = r+96: product is stored at r+96. */
        addq $0x60, %r8
        addq $0x40, %rcx
        addq $0x40, %rdi
        # Multiply
        # A[0] * B[0]
        movq (%rcx), %rax
        mulq (%r8)
        movq %rax, %r10
        movq %rdx, %r11
        # A[0] * B[1]
        movq 8(%rcx), %rax
        mulq (%r8)
        xorq %r12, %r12
        addq %rax, %r11
        adcq %rdx, %r12
        # A[1] * B[0]
        movq (%rcx), %rax
        mulq 8(%r8)
        xorq %r13, %r13
        addq %rax, %r11
        adcq %rdx, %r12
        adcq $0x00, %r13
        # A[0] * B[2]
        movq 16(%rcx), %rax
        mulq (%r8)
        addq %rax, %r12
        adcq %rdx, %r13
        # A[1] * B[1]
        movq 8(%rcx), %rax
        mulq 8(%r8)
        xorq %r14, %r14
        addq %rax, %r12
        adcq %rdx, %r13
        adcq $0x00, %r14
        # A[2] * B[0]
        movq (%rcx), %rax
        mulq 16(%r8)
        addq %rax, %r12
        adcq %rdx, %r13
        adcq $0x00, %r14
        # A[0] * B[3]
        movq 24(%rcx), %rax
        mulq (%r8)
        xorq %r15, %r15
        addq %rax, %r13
        adcq %rdx, %r14
        adcq $0x00, %r15
        # A[1] * B[2]
        movq 16(%rcx), %rax
        mulq 8(%r8)
        addq %rax, %r13
        adcq %rdx, %r14
        adcq $0x00, %r15
        # A[2] * B[1]
        movq 8(%rcx), %rax
        mulq 16(%r8)
        addq %rax, %r13
        adcq %rdx, %r14
        adcq $0x00, %r15
        # A[3] * B[0]
        movq (%rcx), %rax
        mulq 24(%r8)
        addq %rax, %r13
        adcq %rdx, %r14
        adcq $0x00, %r15
        # A[1] * B[3]
        movq 24(%rcx), %rax
        mulq 8(%r8)
        xorq %rbx, %rbx
        addq %rax, %r14
        adcq %rdx, %r15
        adcq $0x00, %rbx
        # A[2] * B[2]
        movq 16(%rcx), %rax
        mulq 16(%r8)
        addq %rax, %r14
        adcq %rdx, %r15
        adcq $0x00, %rbx
        # A[3] * B[1]
        movq 8(%rcx), %rax
        mulq 24(%r8)
        addq %rax, %r14
        adcq %rdx, %r15
        adcq $0x00, %rbx
        # A[2] * B[3]
        movq 24(%rcx), %rax
        mulq 16(%r8)
        xorq %rbp, %rbp
        addq %rax, %r15
        adcq %rdx, %rbx
        adcq $0x00, %rbp
        # A[3] * B[2]
        movq 16(%rcx), %rax
        mulq 24(%r8)
        addq %rax, %r15
        adcq %rdx, %rbx
        adcq $0x00, %rbp
        # A[3] * B[3]
        movq 24(%rcx), %rax
        mulq 24(%r8)
        addq %rax, %rbx
        adcq %rdx, %rbp
        /* Same 38/19 fold as after the first multiplication. */
        movq $38, %rax
        mulq %rbp
        addq %rax, %r13
        adcq $0x00, %rdx
        movq $0x7fffffffffffffff, %r9
        shldq $0x01, %r13, %rdx
        imulq $19, %rdx, %rdx
        andq %r9, %r13
        movq %rdx, %r9
        movq $38, %rax
        mulq %r14
        xorq %r14, %r14
        addq %rax, %r10
        movq $38, %rax
        adcq %rdx, %r14
        mulq %r15
        xorq %r15, %r15
        addq %rax, %r11
        movq $38, %rax
        adcq %rdx, %r15
        mulq %rbx
        xorq %rbx, %rbx
        addq %rax, %r12
        adcq %rdx, %rbx
        addq %r9, %r10
        adcq %r14, %r11
        adcq %r15, %r12
        adcq %rbx, %r13
        # Store
        movq %r10, (%rdi)
        movq %r11, 8(%rdi)
        movq %r12, 16(%rdi)
        movq %r13, 24(%rdi)
        /* rcx = q+0, rdi = r+0: product stays in r10..r13 (no store). */
        subq $0x60, %rcx
        subq $0x60, %rdi
        # Multiply
        # A[0] * B[0]
        movq (%rcx), %rax
        mulq (%rdi)
        movq %rax, %r10
        movq %rdx, %r11
        # A[0] * B[1]
        movq 8(%rcx), %rax
        mulq (%rdi)
        xorq %r12, %r12
        addq %rax, %r11
        adcq %rdx, %r12
        # A[1] * B[0]
        movq (%rcx), %rax
        mulq 8(%rdi)
        xorq %r13, %r13
        addq %rax, %r11
        adcq %rdx, %r12
        adcq $0x00, %r13
        # A[0] * B[2]
        movq 16(%rcx), %rax
        mulq (%rdi)
        addq %rax, %r12
        adcq %rdx, %r13
        # A[1] * B[1]
        movq 8(%rcx), %rax
        mulq 8(%rdi)
        xorq %r14, %r14
        addq %rax, %r12
        adcq %rdx, %r13
        adcq $0x00, %r14
        # A[2] * B[0]
        movq (%rcx), %rax
        mulq 16(%rdi)
        addq %rax, %r12
        adcq %rdx, %r13
        adcq $0x00, %r14
        # A[0] * B[3]
        movq 24(%rcx), %rax
        mulq (%rdi)
        xorq %r15, %r15
        addq %rax, %r13
        adcq %rdx, %r14
        adcq $0x00, %r15
        # A[1] * B[2]
        movq 16(%rcx), %rax
        mulq 8(%rdi)
        addq %rax, %r13
        adcq %rdx, %r14
        adcq $0x00, %r15
        # A[2] * B[1]
        movq 8(%rcx), %rax
        mulq 16(%rdi)
        addq %rax, %r13
        adcq %rdx, %r14
        adcq $0x00, %r15
        # A[3] * B[0]
        movq (%rcx), %rax
        mulq 24(%rdi)
        addq %rax, %r13
        adcq %rdx, %r14
        adcq $0x00, %r15
        # A[1] * B[3]
        movq 24(%rcx), %rax
        mulq 8(%rdi)
        xorq %rbx, %rbx
        addq %rax, %r14
        adcq %rdx, %r15
        adcq $0x00, %rbx
        # A[2] * B[2]
        movq 16(%rcx), %rax
        mulq 16(%rdi)
        addq %rax, %r14
        adcq %rdx, %r15
        adcq $0x00, %rbx
        # A[3] * B[1]
        movq 8(%rcx), %rax
        mulq 24(%rdi)
        addq %rax, %r14
        adcq %rdx, %r15
        adcq $0x00, %rbx
        # A[2] * B[3]
        movq 24(%rcx), %rax
        mulq 16(%rdi)
        xorq %rbp, %rbp
        addq %rax, %r15
        adcq %rdx, %rbx
        adcq $0x00, %rbp
        # A[3] * B[2]
        movq 16(%rcx), %rax
        mulq 24(%rdi)
        addq %rax, %r15
        adcq %rdx, %rbx
        adcq $0x00, %rbp
        # A[3] * B[3]
        movq 24(%rcx), %rax
        mulq 24(%rdi)
        addq %rax, %rbx
        adcq %rdx, %rbp
        /* Same 38/19 fold as after the first multiplication. */
        movq $38, %rax
        mulq %rbp
        addq %rax, %r13
        adcq $0x00, %rdx
        movq $0x7fffffffffffffff, %r9
        shldq $0x01, %r13, %rdx
        imulq $19, %rdx, %rdx
        andq %r9, %r13
        movq %rdx, %r9
        movq $38, %rax
        mulq %r14
        xorq %r14, %r14
        addq %rax, %r10
        movq $38, %rax
        adcq %rdx, %r14
        mulq %r15
        xorq %r15, %r15
        addq %rax, %r11
        movq $38, %rax
        adcq %rdx, %r15
        mulq %rbx
        xorq %rbx, %rbx
        addq %rax, %r12
        adcq %rdx, %rbx
        addq %r9, %r10
        adcq %r14, %r11
        adcq %r15, %r12
        adcq %rbx, %r13
        # Store
        /* Combine the product in r10..r13 with the value at rsi = r+32. */
        # Add-Sub
        # Add
        movq %r10, %r14
        addq (%rsi), %r10
        movq %r11, %r15
        adcq 8(%rsi), %r11
        movq %r12, %rbx
        adcq 16(%rsi), %r12
        movq %r13, %rbp
        adcq 24(%rsi), %r13
        movq $0x00, %r9
        adcq $0x00, %r9
        shldq $0x01, %r13, %r9
        imulq $19, %r9
        btr $63, %r13
        # Sub modulus (if overflow)
        addq %r9, %r10
        adcq $0x00, %r11
        adcq $0x00, %r12
        adcq $0x00, %r13
        # Sub
        subq (%rsi), %r14
        sbbq 8(%rsi), %r15
        sbbq 16(%rsi), %rbx
        sbbq 24(%rsi), %rbp
        sbbq %r9, %r9
        shldq $0x01, %rbp, %r9
        imulq $-19, %r9
        btr $63, %rbp
        # Add modulus (if underflow)
        subq %r9, %r14
        sbbq $0x00, %r15
        sbbq $0x00, %rbx
        sbbq $0x00, %rbp
        /* Sum goes to r+32, difference to r+0. */
        movq %r10, (%rsi)
        movq %r11, 8(%rsi)
        movq %r12, 16(%rsi)
        movq %r13, 24(%rsi)
        movq %r14, (%rdi)
        movq %r15, 8(%rdi)
        movq %rbx, 16(%rdi)
        movq %rbp, 24(%rdi)
        /* r8 = p+64, rcx = q+64: product stays in r10..r13. */
        subq $32, %r8
        addq $0x40, %rcx
        # Multiply
        # A[0] * B[0]
        movq (%rcx), %rax
        mulq (%r8)
        movq %rax, %r10
        movq %rdx, %r11
        # A[0] * B[1]
        movq 8(%rcx), %rax
        mulq (%r8)
        xorq %r12, %r12
        addq %rax, %r11
        adcq %rdx, %r12
        # A[1] * B[0]
        movq (%rcx), %rax
        mulq 8(%r8)
        xorq %r13, %r13
        addq %rax, %r11
        adcq %rdx, %r12
        adcq $0x00, %r13
        # A[0] * B[2]
        movq 16(%rcx), %rax
        mulq (%r8)
        addq %rax, %r12
        adcq %rdx, %r13
        # A[1] * B[1]
        movq 8(%rcx), %rax
        mulq 8(%r8)
        xorq %r14, %r14
        addq %rax, %r12
        adcq %rdx, %r13
        adcq $0x00, %r14
        # A[2] * B[0]
        movq (%rcx), %rax
        mulq 16(%r8)
        addq %rax, %r12
        adcq %rdx, %r13
        adcq $0x00, %r14
        # A[0] * B[3]
        movq 24(%rcx), %rax
        mulq (%r8)
        xorq %r15, %r15
        addq %rax, %r13
        adcq %rdx, %r14
        adcq $0x00, %r15
        # A[1] * B[2]
        movq 16(%rcx), %rax
        mulq 8(%r8)
        addq %rax, %r13
        adcq %rdx, %r14
        adcq $0x00, %r15
        # A[2] * B[1]
        movq 8(%rcx), %rax
        mulq 16(%r8)
        addq %rax, %r13
        adcq %rdx, %r14
        adcq $0x00, %r15
        # A[3] * B[0]
        movq (%rcx), %rax
        mulq 24(%r8)
        addq %rax, %r13
        adcq %rdx, %r14
        adcq $0x00, %r15
        # A[1] * B[3]
        movq 24(%rcx), %rax
        mulq 8(%r8)
        xorq %rbx, %rbx
        addq %rax, %r14
        adcq %rdx, %r15
        adcq $0x00, %rbx
        # A[2] * B[2]
        movq 16(%rcx), %rax
        mulq 16(%r8)
        addq %rax, %r14
        adcq %rdx, %r15
        adcq $0x00, %rbx
        # A[3] * B[1]
        movq 8(%rcx), %rax
        mulq 24(%r8)
        addq %rax, %r14
        adcq %rdx, %r15
        adcq $0x00, %rbx
        # A[2] * B[3]
        movq 24(%rcx), %rax
        mulq 16(%r8)
        xorq %rbp, %rbp
        addq %rax, %r15
        adcq %rdx, %rbx
        adcq $0x00, %rbp
        # A[3] * B[2]
        movq 16(%rcx), %rax
        mulq 24(%r8)
        addq %rax, %r15
        adcq %rdx, %rbx
        adcq $0x00, %rbp
        # A[3] * B[3]
        movq 24(%rcx), %rax
        mulq 24(%r8)
        addq %rax, %rbx
        adcq %rdx, %rbp
        /* Same 38/19 fold as after the first multiplication. */
        movq $38, %rax
        mulq %rbp
        addq %rax, %r13
        adcq $0x00, %rdx
        movq $0x7fffffffffffffff, %r9
        shldq $0x01, %r13, %rdx
        imulq $19, %rdx, %rdx
        andq %r9, %r13
        movq %rdx, %r9
        movq $38, %rax
        mulq %r14
        xorq %r14, %r14
        addq %rax, %r10
        movq $38, %rax
        adcq %rdx, %r14
        mulq %r15
        xorq %r15, %r15
        addq %rax, %r11
        movq $38, %rax
        adcq %rdx, %r15
        mulq %rbx
        xorq %rbx, %rbx
        addq %rax, %r12
        adcq %rdx, %rbx
        addq %r9, %r10
        adcq %r14, %r11
        adcq %r15, %r12
        adcq %rbx, %r13
        # Store
        /* rdi = r+64. */
        addq $0x40, %rdi
        # Double
        addq %r10, %r10
        adcq %r11, %r11
        adcq %r12, %r12
        adcq %r13, %r13
        movq $0x00, %r9
        adcq $0x00, %r9
        shldq $0x01, %r13, %r9
        imulq $19, %r9
        btr $63, %r13
        # Sub modulus (if overflow)
        addq %r9, %r10
        adcq $0x00, %r11
        adcq $0x00, %r12
        adcq $0x00, %r13
        /* rsi = r+96; combine the doubled product with the value there. */
        movq %rdi, %rsi
        addq $32, %rsi
        # Add-Sub
        # Add
        movq %r10, %r14
        addq (%rsi), %r10
        movq %r11, %r15
        adcq 8(%rsi), %r11
        movq %r12, %rbx
        adcq 16(%rsi), %r12
        movq %r13, %rbp
        adcq 24(%rsi), %r13
        movq $0x00, %r9
        adcq $0x00, %r9
        shldq $0x01, %r13, %r9
        imulq $19, %r9
        btr $63, %r13
        # Sub modulus (if overflow)
        addq %r9, %r10
        adcq $0x00, %r11
        adcq $0x00, %r12
        adcq $0x00, %r13
        # Sub
        subq (%rsi), %r14
        sbbq 8(%rsi), %r15
        sbbq 16(%rsi), %rbx
        sbbq 24(%rsi), %rbp
        sbbq %r9, %r9
        shldq $0x01, %rbp, %r9
        imulq $-19, %r9
        btr $63, %rbp
        # Add modulus (if underflow)
        subq %r9, %r14
        sbbq $0x00, %r15
        sbbq $0x00, %rbx
        sbbq $0x00, %rbp
        /* Sum goes to r+64, difference to r+96. */
        movq %r10, (%rdi)
        movq %r11, 8(%rdi)
        movq %r12, 16(%rdi)
        movq %r13, 24(%rdi)
        movq %r14, (%rsi)
        movq %r15, 8(%rsi)
        movq %rbx, 16(%rsi)
        movq %rbp, 24(%rsi)
        /* Release the 24-byte argument save area, then restore registers
         * in reverse push order. */
        addq $24, %rsp
        popq %rbp
        popq %rbx
        /* Remainder of the ge_add_x64 epilogue. */
        popq %r15
        popq %r14
        popq %r13
        popq %r12
        repz retq
#ifndef __APPLE__
.size ge_add_x64,.-ge_add_x64
#endif /* __APPLE__ */
/* ge_sub_x64
 *
 * In:  rdi = r (written), rsi = p (read), rdx = q (read).
 * Each operand is addressed as consecutive 32-byte slots of four 64-bit
 * limbs; the slots at byte offsets 0, 32, 64 and 96 are called [0]..[3].
 * NOTE(review): presumably r, p and q are the Ed25519 group element
 * structures (p1p1, p3, cached) - confirm against the C prototype.
 *
 * Steps performed, each result folded with 2^255 = 19 (mod 2^255 - 19):
 *   r[0] = p[1] + p[0]          r[1] = p[1] - p[0]
 *   r[1] = r[1] * q[0]
 *   r[3] = p[3] * q[3]
 *   t    = r[0] * q[1]          (kept in registers)
 *   r[1] = t + r[1]             r[0] = t - r[1]   (same old r[1] for both)
 *   u    = 2 * (p[2] * q[2])    (kept in registers)
 *   r[3] = u + r[3]             r[2] = u - r[3]   (same old r[3] for both)
 *
 * Pushes/pops r12-r15, rbx, rbp and reserves 24 bytes of stack in which
 * the three argument pointers are saved.
 */
#ifndef __APPLE__
.text
.globl ge_sub_x64
.type ge_sub_x64,@function
.align 16
ge_sub_x64:
#else
.section __TEXT,__text
.globl _ge_sub_x64
.p2align 4
_ge_sub_x64:
#endif /* __APPLE__ */
        pushq %r12
        pushq %r13
        pushq %r14
        pushq %r15
        pushq %rbx
        pushq %rbp
        /* rdx is overwritten by every mulq, so q is moved out of it. */
        movq %rdx, %rcx
        subq $24, %rsp
        movq %rdi, (%rsp)
        movq %rsi, 8(%rsp)
        movq %rcx, 16(%rsp)
        /* r8 = p+0, rcx = p+32, rsi = r+32 (rdi stays r+0). */
        movq %rsi, %r8
        movq %rsi, %rcx
        addq $32, %rcx
        movq %rdi, %rsi
        addq $32, %rsi
        # Add-Sub
        # Add
        movq (%rcx), %r10
        movq 8(%rcx), %r11
        movq 16(%rcx), %r12
        movq 24(%rcx), %r13
        /* Copy each limb to r14,r15,rbx,rbp for the subtraction below. */
        movq %r10, %r14
        addq (%r8), %r10
        movq %r11, %r15
        adcq 8(%r8), %r11
        movq %r12, %rbx
        adcq 16(%r8), %r12
        movq %r13, %rbp
        adcq 24(%r8), %r13
        /* movq (not xorq) so the carry flag survives into the adcq. */
        movq $0x00, %r9
        adcq $0x00, %r9
        /* r9 = (carry:bit 63 of r13) * 19; bit 63 is then cleared. */
        shldq $0x01, %r13, %r9
        imulq $19, %r9
        btr $63, %r13
        # Sub modulus (if overflow)
        addq %r9, %r10
        adcq $0x00, %r11
        adcq $0x00, %r12
        adcq $0x00, %r13
        # Sub
        subq (%r8), %r14
        sbbq 8(%r8), %r15
        sbbq 16(%r8), %rbx
        sbbq 24(%r8), %rbp
        /* r9 = 0 or -1 from the borrow, then bit 63 of rbp is shifted in
         * and the value is scaled by -19 before being subtracted. */
        sbbq %r9, %r9
        shldq $0x01, %rbp, %r9
        imulq $-19, %r9
        btr $63, %rbp
        # Add modulus (if underflow)
        subq %r9, %r14
        sbbq $0x00, %r15
        sbbq $0x00, %rbx
        sbbq $0x00, %rbp
        /* Sum goes to r+0, difference to r+32. */
        movq %r10, (%rdi)
        movq %r11, 8(%rdi)
        movq %r12, 16(%rdi)
        movq %r13, 24(%rdi)
        movq %r14, (%rsi)
        movq %r15, 8(%rsi)
        movq %rbx, 16(%rsi)
        movq %rbp, 24(%rsi)
        /* Reload q; rcx = q+0, rdi = r+32: product is stored at r+32. */
        movq 16(%rsp), %rcx
        addq $32, %rdi
        # Multiply
        # A[0] * B[0]
        movq (%rcx), %rax
        mulq (%rdi)
        movq %rax, %r10
        movq %rdx, %r11
        # A[0] * B[1]
        movq 8(%rcx), %rax
        mulq (%rdi)
        xorq %r12, %r12
        addq %rax, %r11
        adcq %rdx, %r12
        # A[1] * B[0]
        movq (%rcx), %rax
        mulq 8(%rdi)
        xorq %r13, %r13
        addq %rax, %r11
        adcq %rdx, %r12
        adcq $0x00, %r13
        # A[0] * B[2]
        movq 16(%rcx), %rax
        mulq (%rdi)
        addq %rax, %r12
        adcq %rdx, %r13
        # A[1] * B[1]
        movq 8(%rcx), %rax
        mulq 8(%rdi)
        xorq %r14, %r14
        addq %rax, %r12
        adcq %rdx, %r13
        adcq $0x00, %r14
        # A[2] * B[0]
        movq (%rcx), %rax
        mulq 16(%rdi)
        addq %rax, %r12
        adcq %rdx, %r13
        adcq $0x00, %r14
        # A[0] * B[3]
        movq 24(%rcx), %rax
        mulq (%rdi)
        xorq %r15, %r15
        addq %rax, %r13
        adcq %rdx, %r14
        adcq $0x00, %r15
        # A[1] * B[2]
        movq 16(%rcx), %rax
        mulq 8(%rdi)
        addq %rax, %r13
        adcq %rdx, %r14
        adcq $0x00, %r15
        # A[2] * B[1]
        movq 8(%rcx), %rax
        mulq 16(%rdi)
        addq %rax, %r13
        adcq %rdx, %r14
        adcq $0x00, %r15
        # A[3] * B[0]
        movq (%rcx), %rax
        mulq 24(%rdi)
        addq %rax, %r13
        adcq %rdx, %r14
        adcq $0x00, %r15
        # A[1] * B[3]
        movq 24(%rcx), %rax
        mulq 8(%rdi)
        xorq %rbx, %rbx
        addq %rax, %r14
        adcq %rdx, %r15
        adcq $0x00, %rbx
        # A[2] * B[2]
        movq 16(%rcx), %rax
        mulq 16(%rdi)
        addq %rax, %r14
        adcq %rdx, %r15
        adcq $0x00, %rbx
        # A[3] * B[1]
        movq 8(%rcx), %rax
        mulq 24(%rdi)
        addq %rax, %r14
        adcq %rdx, %r15
        adcq $0x00, %rbx
        # A[2] * B[3]
        movq 24(%rcx), %rax
        mulq 16(%rdi)
        xorq %rbp, %rbp
        addq %rax, %r15
        adcq %rdx, %rbx
        adcq $0x00, %rbp
        # A[3] * B[2]
        movq 16(%rcx), %rax
        mulq 24(%rdi)
        addq %rax, %r15
        adcq %rdx, %rbx
        adcq $0x00, %rbp
        # A[3] * B[3]
        movq 24(%rcx), %rax
        mulq 24(%rdi)
        addq %rax, %rbx
        adcq %rdx, %rbp
        /* Fold the 512-bit product r10..r15,rbx,rbp into r10..r13:
         * the upper four words are multiplied by 38 and added to the lower
         * four; the bits shifted out of r13 (from bit 63 up) are multiplied
         * by 19 and added at the bottom. */
        movq $38, %rax
        mulq %rbp
        addq %rax, %r13
        adcq $0x00, %rdx
        movq $0x7fffffffffffffff, %r9
        shldq $0x01, %r13, %rdx
        imulq $19, %rdx, %rdx
        andq %r9, %r13
        movq %rdx, %r9
        movq $38, %rax
        mulq %r14
        xorq %r14, %r14
        addq %rax, %r10
        movq $38, %rax
        adcq %rdx, %r14
        mulq %r15
        xorq %r15, %r15
        addq %rax, %r11
        movq $38, %rax
        adcq %rdx, %r15
        mulq %rbx
        xorq %rbx, %rbx
        addq %rax, %r12
        adcq %rdx, %rbx
        addq %r9, %r10
        adcq %r14, %r11
        adcq %r15, %r12
        adcq %rbx, %r13
        # Store
        movq %r10, (%rdi)
        movq %r11, 8(%rdi)
        movq %r12, 16(%rdi)
        movq %r13, 24(%rdi)
        /* r8 = p+96, rcx = q+96, rdi = r+96: product is stored at r+96. */
        addq $0x60, %r8
        addq $0x60, %rcx
        addq $0x40, %rdi
        # Multiply
        # A[0] * B[0]
        movq (%rcx), %rax
        mulq (%r8)
        movq %rax, %r10
        movq %rdx, %r11
        # A[0] * B[1]
        movq 8(%rcx), %rax
        mulq (%r8)
        xorq %r12, %r12
        addq %rax, %r11
        adcq %rdx, %r12
        # A[1] * B[0]
        movq (%rcx), %rax
        mulq 8(%r8)
        xorq %r13, %r13
        addq %rax, %r11
        adcq %rdx, %r12
        adcq $0x00, %r13
        # A[0] * B[2]
        movq 16(%rcx), %rax
        mulq (%r8)
        addq %rax, %r12
        adcq %rdx, %r13
        # A[1] * B[1]
        movq 8(%rcx), %rax
        mulq 8(%r8)
        xorq %r14, %r14
        addq %rax, %r12
        adcq %rdx, %r13
        adcq $0x00, %r14
        # A[2] * B[0]
        movq (%rcx), %rax
        mulq 16(%r8)
        addq %rax, %r12
        adcq %rdx, %r13
        adcq $0x00, %r14
        # A[0] * B[3]
        movq 24(%rcx), %rax
        mulq (%r8)
        xorq %r15, %r15
        addq %rax, %r13
        adcq %rdx, %r14
        adcq $0x00, %r15
        # A[1] * B[2]
        movq 16(%rcx), %rax
        mulq 8(%r8)
        addq %rax, %r13
        adcq %rdx, %r14
        adcq $0x00, %r15
        # A[2] * B[1]
        movq 8(%rcx), %rax
        mulq 16(%r8)
        addq %rax, %r13
        adcq %rdx, %r14
        adcq $0x00, %r15
        # A[3] * B[0]
        movq (%rcx), %rax
        mulq 24(%r8)
        addq %rax, %r13
        adcq %rdx, %r14
        adcq $0x00, %r15
        # A[1] * B[3]
        movq 24(%rcx), %rax
        mulq 8(%r8)
        xorq %rbx, %rbx
        addq %rax, %r14
        adcq %rdx, %r15
        adcq $0x00, %rbx
        # A[2] * B[2]
        movq 16(%rcx), %rax
        mulq 16(%r8)
        addq %rax, %r14
        adcq %rdx, %r15
        adcq $0x00, %rbx
        # A[3] * B[1]
        movq 8(%rcx), %rax
        mulq 24(%r8)
        addq %rax, %r14
        adcq %rdx, %r15
        adcq $0x00, %rbx
        # A[2] * B[3]
        movq 24(%rcx), %rax
        mulq 16(%r8)
        xorq %rbp, %rbp
        addq %rax, %r15
        adcq %rdx, %rbx
        adcq $0x00, %rbp
        # A[3] * B[2]
        movq 16(%rcx), %rax
        mulq 24(%r8)
        addq %rax, %r15
        adcq %rdx, %rbx
        adcq $0x00, %rbp
        # A[3] * B[3]
        movq 24(%rcx), %rax
        mulq 24(%r8)
        addq %rax, %rbx
        adcq %rdx, %rbp
        /* Same 38/19 fold as after the first multiplication. */
        movq $38, %rax
        mulq %rbp
        addq %rax, %r13
        adcq $0x00, %rdx
        movq $0x7fffffffffffffff, %r9
        shldq $0x01, %r13, %rdx
        imulq $19, %rdx, %rdx
        andq %r9, %r13
        movq %rdx, %r9
        movq $38, %rax
        mulq %r14
        xorq %r14, %r14
        addq %rax, %r10
        movq $38, %rax
        adcq %rdx, %r14
        mulq %r15
        xorq %r15, %r15
        addq %rax, %r11
        movq $38, %rax
        adcq %rdx, %r15
        mulq %rbx
        xorq %rbx, %rbx
        addq %rax, %r12
        adcq %rdx, %rbx
        addq %r9, %r10
        adcq %r14, %r11
        adcq %r15, %r12
        adcq %rbx, %r13
        # Store
        movq %r10, (%rdi)
        movq %r11, 8(%rdi)
        movq %r12, 16(%rdi)
        movq %r13, 24(%rdi)
        /* rcx = q+32, rdi = r+0: product stays in r10..r13 (no store). */
        subq $0x40, %rcx
        subq $0x60, %rdi
        # Multiply
        # A[0] * B[0]
        movq (%rcx), %rax
        mulq (%rdi)
        movq %rax, %r10
        movq %rdx, %r11
        # A[0] * B[1]
        movq 8(%rcx), %rax
        mulq (%rdi)
        xorq %r12, %r12
        addq %rax, %r11
        adcq %rdx, %r12
        # A[1] * B[0]
        movq (%rcx), %rax
        mulq 8(%rdi)
        xorq %r13, %r13
        addq %rax, %r11
        adcq %rdx, %r12
        adcq $0x00, %r13
        # A[0] * B[2]
        movq 16(%rcx), %rax
        mulq (%rdi)
        addq %rax, %r12
        adcq %rdx, %r13
        # A[1] * B[1]
        movq 8(%rcx), %rax
        mulq 8(%rdi)
        xorq %r14, %r14
        addq %rax, %r12
        adcq %rdx, %r13
        adcq $0x00, %r14
        # A[2] * B[0]
        movq (%rcx), %rax
        mulq 16(%rdi)
        addq %rax, %r12
        adcq %rdx, %r13
        adcq $0x00, %r14
        # A[0] * B[3]
        movq 24(%rcx), %rax
        mulq (%rdi)
        xorq %r15, %r15
        addq %rax, %r13
        adcq %rdx, %r14
        adcq $0x00, %r15
        # A[1] * B[2]
        movq 16(%rcx), %rax
        mulq 8(%rdi)
        addq %rax, %r13
        adcq %rdx, %r14
        adcq $0x00, %r15
        # A[2] * B[1]
        movq 8(%rcx), %rax
        mulq 16(%rdi)
        addq %rax, %r13
        adcq %rdx, %r14
        adcq $0x00, %r15
        # A[3] * B[0]
        movq (%rcx), %rax
        mulq 24(%rdi)
        addq %rax, %r13
        adcq %rdx, %r14
        adcq $0x00, %r15
        # A[1] * B[3]
        movq 24(%rcx), %rax
        mulq 8(%rdi)
        xorq %rbx, %rbx
        addq %rax, %r14
        adcq %rdx, %r15
        adcq $0x00, %rbx
        # A[2] * B[2]
        movq 16(%rcx), %rax
        mulq 16(%rdi)
        addq %rax, %r14
        adcq %rdx, %r15
        adcq $0x00, %rbx
        # A[3] * B[1]
        movq 8(%rcx), %rax
        mulq 24(%rdi)
        addq %rax, %r14
        adcq %rdx, %r15
        adcq $0x00, %rbx
        # A[2] * B[3]
        movq 24(%rcx), %rax
        mulq 16(%rdi)
        xorq %rbp, %rbp
        addq %rax, %r15
        adcq %rdx, %rbx
        adcq $0x00, %rbp
        # A[3] * B[2]
        movq 16(%rcx), %rax
        mulq 24(%rdi)
        addq %rax, %r15
        adcq %rdx, %rbx
        adcq $0x00, %rbp
        # A[3] * B[3]
        movq 24(%rcx), %rax
        mulq 24(%rdi)
        addq %rax, %rbx
        adcq %rdx, %rbp
        /* Same 38/19 fold as after the first multiplication. */
        movq $38, %rax
        mulq %rbp
        addq %rax, %r13
        adcq $0x00, %rdx
        movq $0x7fffffffffffffff, %r9
        shldq $0x01, %r13, %rdx
        imulq $19, %rdx, %rdx
        andq %r9, %r13
        movq %rdx, %r9
        movq $38, %rax
        mulq %r14
        xorq %r14, %r14
        addq %rax, %r10
        movq $38, %rax
        adcq %rdx, %r14
        mulq %r15
        xorq %r15, %r15
        addq %rax, %r11
        movq $38, %rax
        adcq %rdx, %r15
        mulq %rbx
        xorq %rbx, %rbx
        addq %rax, %r12
        adcq %rdx, %rbx
        addq %r9, %r10
        adcq %r14, %r11
        adcq %r15, %r12
        adcq %rbx, %r13
        # Store
        /* Combine the product in r10..r13 with the value at rsi = r+32. */
        # Add-Sub
        # Add
        movq %r10, %r14
        addq (%rsi), %r10
        movq %r11, %r15
        adcq 8(%rsi), %r11
        movq %r12, %rbx
        adcq 16(%rsi), %r12
        movq %r13, %rbp
        adcq 24(%rsi), %r13
        movq $0x00, %r9
        adcq $0x00, %r9
        shldq $0x01, %r13, %r9
        imulq $19, %r9
        btr $63, %r13
        # Sub modulus (if overflow)
        addq %r9, %r10
        adcq $0x00, %r11
        adcq $0x00, %r12
        adcq $0x00, %r13
        # Sub
        subq (%rsi), %r14
        sbbq 8(%rsi), %r15
        sbbq 16(%rsi), %rbx
        sbbq 24(%rsi), %rbp
        sbbq %r9, %r9
        shldq $0x01, %rbp, %r9
        imulq $-19, %r9
        btr $63, %rbp
        # Add modulus (if underflow)
        subq %r9, %r14
        sbbq $0x00, %r15
        sbbq $0x00, %rbx
        sbbq $0x00, %rbp
        /* Sum goes to r+32, difference to r+0. */
        movq %r10, (%rsi)
        movq %r11, 8(%rsi)
        movq %r12, 16(%rsi)
        movq %r13, 24(%rsi)
        movq %r14, (%rdi)
        movq %r15, 8(%rdi)
        movq %rbx, 16(%rdi)
        movq %rbp, 24(%rdi)
        /* r8 = p+64, rcx = q+64: product stays in r10..r13. */
        subq $32, %r8
        addq $32, %rcx
        # Multiply
        # A[0] * B[0]
        movq (%rcx), %rax
        mulq (%r8)
        movq %rax, %r10
        movq %rdx, %r11
        # A[0] * B[1]
        movq 8(%rcx), %rax
        mulq (%r8)
        xorq %r12, %r12
        addq %rax, %r11
        adcq %rdx, %r12
        # A[1] * B[0]
        movq (%rcx), %rax
        mulq 8(%r8)
        xorq %r13, %r13
        addq %rax, %r11
        adcq %rdx, %r12
        adcq $0x00, %r13
        # A[0] * B[2]
        movq 16(%rcx), %rax
        mulq (%r8)
        addq %rax, %r12
        adcq %rdx, %r13
        # A[1] * B[1]
        movq 8(%rcx), %rax
        mulq 8(%r8)
        xorq %r14, %r14
        addq %rax, %r12
        adcq %rdx, %r13
        adcq $0x00, %r14
        # A[2] * B[0]
        movq (%rcx), %rax
        mulq 16(%r8)
        addq %rax, %r12
        adcq %rdx, %r13
        adcq $0x00, %r14
        # A[0] * B[3]
        movq 24(%rcx), %rax
        mulq (%r8)
        xorq %r15, %r15
        addq %rax, %r13
        adcq %rdx, %r14
        adcq $0x00, %r15
        # A[1] * B[2]
        movq 16(%rcx), %rax
        mulq 8(%r8)
        addq %rax, %r13
        adcq %rdx, %r14
        adcq $0x00, %r15
        # A[2] * B[1]
        movq 8(%rcx), %rax
        mulq 16(%r8)
        addq %rax, %r13
        adcq %rdx, %r14
        adcq $0x00, %r15
        # A[3] * B[0]
        movq (%rcx), %rax
        mulq 24(%r8)
        addq %rax, %r13
        adcq %rdx, %r14
        adcq $0x00, %r15
        # A[1] * B[3]
        movq 24(%rcx), %rax
        mulq 8(%r8)
        xorq %rbx, %rbx
        addq %rax, %r14
        adcq %rdx, %r15
        adcq $0x00, %rbx
        # A[2] * B[2]
        movq 16(%rcx), %rax
        mulq 16(%r8)
        addq %rax, %r14
        adcq %rdx, %r15
        adcq $0x00, %rbx
        # A[3] * B[1]
        movq 8(%rcx), %rax
        mulq 24(%r8)
        addq %rax, %r14
        adcq %rdx, %r15
        adcq $0x00, %rbx
        # A[2] * B[3]
        movq 24(%rcx), %rax
        mulq 16(%r8)
        xorq %rbp, %rbp
        addq %rax, %r15
        adcq %rdx, %rbx
        adcq $0x00, %rbp
        # A[3] * B[2]
        movq 16(%rcx), %rax
        mulq 24(%r8)
        addq %rax, %r15
        adcq %rdx, %rbx
        adcq $0x00, %rbp
        # A[3] * B[3]
        movq 24(%rcx), %rax
        mulq 24(%r8)
        addq %rax, %rbx
        adcq %rdx, %rbp
        /* Start of the 38/19 fold of the product held in
         * r10..r15,rbx,rbp. */
        movq $38, %rax
        /* ge_sub_x64, continued: finish folding the last product into
         * r10..r13 (rax holds 38 at this point). */
        mulq %rbp
        addq %rax, %r13
        adcq $0x00, %rdx
        movq $0x7fffffffffffffff, %r9
        shldq $0x01, %r13, %rdx
        imulq $19, %rdx, %rdx
        andq %r9, %r13
        movq %rdx, %r9
        movq $38, %rax
        mulq %r14
        xorq %r14, %r14
        addq %rax, %r10
        movq $38, %rax
        adcq %rdx, %r14
        mulq %r15
        xorq %r15, %r15
        addq %rax, %r11
        movq $38, %rax
        adcq %rdx, %r15
        mulq %rbx
        xorq %rbx, %rbx
        addq %rax, %r12
        adcq %rdx, %rbx
        addq %r9, %r10
        adcq %r14, %r11
        adcq %r15, %r12
        adcq %rbx, %r13
        # Store
        # Double
        addq %r10, %r10
        adcq %r11, %r11
        adcq %r12, %r12
        adcq %r13, %r13
        /* movq (not xorq) so the carry flag survives into the adcq. */
        movq $0x00, %r9
        adcq $0x00, %r9
        shldq $0x01, %r13, %r9
        imulq $19, %r9
        btr $63, %r13
        # Sub modulus (if overflow)
        addq %r9, %r10
        adcq $0x00, %r11
        adcq $0x00, %r12
        adcq $0x00, %r13
        /* rsi = rdi + 0x40 and rdi = rdi + 0x60: the sum below is written
         * through rdi and the difference through rsi. */
        movq %rdi, %rsi
        addq $0x40, %rsi
        addq $0x60, %rdi
        # Add-Sub
        # Add
        movq %r10, %r14
        addq (%rdi), %r10
        movq %r11, %r15
        adcq 8(%rdi), %r11
        movq %r12, %rbx
        adcq 16(%rdi), %r12
        movq %r13, %rbp
        adcq 24(%rdi), %r13
        movq $0x00, %r9
        adcq $0x00, %r9
        shldq $0x01, %r13, %r9
        imulq $19, %r9
        btr $63, %r13
        # Sub modulus (if overflow)
        addq %r9, %r10
        adcq $0x00, %r11
        adcq $0x00, %r12
        adcq $0x00, %r13
        # Sub
        subq (%rdi), %r14
        sbbq 8(%rdi), %r15
        sbbq 16(%rdi), %rbx
        sbbq 24(%rdi), %rbp
        sbbq %r9, %r9
        shldq $0x01, %rbp, %r9
        imulq $-19, %r9
        btr $63, %rbp
        # Add modulus (if underflow)
        subq %r9, %r14
        sbbq $0x00, %r15
        sbbq $0x00, %rbx
        sbbq $0x00, %rbp
        movq %r10, (%rdi)
        movq %r11, 8(%rdi)
        movq %r12, 16(%rdi)
        movq %r13, 24(%rdi)
        movq %r14, (%rsi)
        movq %r15, 8(%rsi)
        movq %rbx, 16(%rsi)
        movq %rbp, 24(%rsi)
        /* Release the 24-byte argument save area, then restore registers
         * in reverse push order. */
        addq $24, %rsp
        popq %rbp
        popq %rbx
        popq %r15
        popq %r14
        popq %r13
        popq %r12
        repz retq
#ifndef __APPLE__
.size ge_sub_x64,.-ge_sub_x64
#endif /* __APPLE__ */
#ifdef HAVE_ED25519
/* fe_sq2_x64
 *
 * In:  rdi = r (four 64-bit limbs written), rsi = a (four limbs read).
 * Computes r = 2 * a * a, folded with 2^255 = 19 (mod 2^255 - 19).
 *
 * The off-diagonal products A[i]*A[j] (i < j) are accumulated once and
 * doubled, then the diagonal squares are added, giving the 512-bit square
 * in rcx,r8,r9,r10,r11,r12,r13,r14 (low to high). That is folded to four
 * limbs, shifted left by one bit, and the bits pushed out at 2^255 are
 * folded back in multiplied by 19.
 *
 * Pushes/pops r12-r15; uses no other stack.
 */
#ifndef __APPLE__
.text
.globl fe_sq2_x64
.type fe_sq2_x64,@function
.align 16
fe_sq2_x64:
#else
.section __TEXT,__text
.globl _fe_sq2_x64
.p2align 4
_fe_sq2_x64:
#endif /* __APPLE__ */
        pushq %r12
        pushq %r13
        pushq %r14
        pushq %r15
        # Square * 2
        # A[0] * A[1]
        movq (%rsi), %rax
        mulq 8(%rsi)
        movq %rax, %r8
        movq %rdx, %r9
        # A[0] * A[2]
        movq (%rsi), %rax
        mulq 16(%rsi)
        xorq %r10, %r10
        addq %rax, %r9
        adcq %rdx, %r10
        # A[0] * A[3]
        movq (%rsi), %rax
        mulq 24(%rsi)
        xorq %r11, %r11
        addq %rax, %r10
        adcq %rdx, %r11
        # A[1] * A[2]
        movq 8(%rsi), %rax
        mulq 16(%rsi)
        xorq %r12, %r12
        addq %rax, %r10
        adcq %rdx, %r11
        adcq $0x00, %r12
        # A[1] * A[3]
        movq 8(%rsi), %rax
        mulq 24(%rsi)
        addq %rax, %r11
        adcq %rdx, %r12
        # A[2] * A[3]
        movq 16(%rsi), %rax
        mulq 24(%rsi)
        xorq %r13, %r13
        addq %rax, %r12
        adcq %rdx, %r13
        # Double
        xorq %r14, %r14
        addq %r8, %r8
        adcq %r9, %r9
        adcq %r10, %r10
        adcq %r11, %r11
        adcq %r12, %r12
        adcq %r13, %r13
        adcq $0x00, %r14
        # A[0] * A[0]
        movq (%rsi), %rax
        mulq %rax
        movq %rax, %rcx
        movq %rdx, %r15
        /* r15 carries the high half of each square into the next step. */
        # A[1] * A[1]
        movq 8(%rsi), %rax
        mulq %rax
        addq %r15, %r8
        adcq %rax, %r9
        adcq $0x00, %rdx
        movq %rdx, %r15
        # A[2] * A[2]
        movq 16(%rsi), %rax
        mulq %rax
        addq %r15, %r10
        adcq %rax, %r11
        adcq $0x00, %rdx
        movq %rdx, %r15
        # A[3] * A[3]
        movq 24(%rsi), %rax
        mulq %rax
        addq %rax, %r13
        adcq %rdx, %r14
        addq %r15, %r12
        adcq $0x00, %r13
        adcq $0x00, %r14
        /* Fold: upper words r11..r14 times 38 are added to rcx,r8,r9,r10;
         * the bits shifted out of r10 (from bit 63 up) are multiplied by
         * 19 and added at the bottom. */
        movq $38, %rax
        mulq %r14
        addq %rax, %r10
        adcq $0x00, %rdx
        movq $0x7fffffffffffffff, %r15
        shldq $0x01, %r10, %rdx
        imulq $19, %rdx, %rdx
        andq %r15, %r10
        movq %rdx, %r15
        movq $38, %rax
        mulq %r11
        xorq %r11, %r11
        addq %rax, %rcx
        movq $38, %rax
        adcq %rdx, %r11
        mulq %r12
        xorq %r12, %r12
        addq %rax, %r8
        movq $38, %rax
        adcq %rdx, %r12
        mulq %r13
        xorq %r13, %r13
        addq %rax, %r9
        adcq %rdx, %r13
        addq %r15, %rcx
        adcq %r11, %r8
        adcq %r12, %r9
        adcq %r13, %r10
        /* Multiply by two: shift the four limbs left one bit. rax keeps
         * the old top two bits of r10, which are folded back in times 19
         * after the new top limb is masked to 63 bits. */
        movq %r10, %rax
        shldq $0x01, %r9, %r10
        shldq $0x01, %r8, %r9
        shldq $0x01, %rcx, %r8
        shlq $1, %rcx
        movq $0x7fffffffffffffff, %r15
        shrq $62, %rax
        andq %r15, %r10
        imulq $19, %rax, %rax
        addq %rax, %rcx
        adcq $0x00, %r8
        adcq $0x00, %r9
        adcq $0x00, %r10
        # Store
        movq %rcx, (%rdi)
        movq %r8, 8(%rdi)
        movq %r9, 16(%rdi)
        movq %r10, 24(%rdi)
        popq %r15
        popq %r14
        popq %r13
        popq %r12
        repz retq
#ifndef __APPLE__
.size fe_sq2_x64,.-fe_sq2_x64
#endif /* __APPLE__ */
/* sc_reduce_x64
 *
 * In/out: rdi = s. Eight 64-bit words (64 bytes) are read from s and a
 * four-word (32-byte) result is written back to the start of s.
 * The value is reduced using the constants of the order
 * 2^252 + 0x14def9dea2f79cd65812631a5cf5d3ed.
 */
#ifndef __APPLE__
.text
.globl sc_reduce_x64
.type sc_reduce_x64,@function
.align 16
sc_reduce_x64:
#else
.section __TEXT,__text
.globl _sc_reduce_x64
.p2align 4
_sc_reduce_x64:
#endif /* __APPLE__ */
        pushq %r12
        pushq %r13
        pushq %r14
        pushq %r15
        pushq %rbx
        pushq %rbp
        /* Load the eight input words: r8..r11 low half, r12..r15 high. */
        movq (%rdi), %r8
        movq 8(%rdi), %r9
        movq 16(%rdi), %r10
        movq 24(%rdi), %r11
        movq 32(%rdi), %r12
        movq 40(%rdi), %r13
        movq 48(%rdi), %r14
        movq 56(%rdi), %r15
        /* rcx = top 8 bits of the input. r12..r15 are shifted left by four
         * bits, taking in the top four bits of r11, and r11 and r15 are
         * masked to 60 bits: the value is now split at bit 252. */
        movq %r15, %rcx
        movq $0xfffffffffffffff, %rsi
        shrq $56, %rcx
        shldq $4, %r14, %r15
        shldq $4, %r13, %r14
        shldq $4, %r12, %r13
        shldq $4, %r11, %r12
        andq %rsi, %r11
        andq %rsi, %r15
        # Add order times bits 504..511
        subq %rcx, %r14
        sbbq $0x00, %r15
        movq $0xeb2106215d086329, %rax
        mulq %rcx
        /* movq (not xorq) is used for zeroing throughout: it leaves the
         * flags alone. */
        movq $0x00, %rsi
        addq %rax, %r13
        movq $0xa7ed9ce5a30a2c13, %rax
        adcq %rdx, %rsi
        mulq %rcx
        addq %rax, %r12
        adcq %rdx, %r13
        adcq %rsi, %r14
        adcq $0x00, %r15
        # Sub product of top 4 words and order
        /* The two 64-bit multipliers are the two's complement of the low
         * 128 bits of the order (0x14def9dea2f79cd6:5812631a5cf5d3ed). */
        movq $0xa7ed9ce5a30a2c13, %rcx
        movq %r12, %rax
        mulq %rcx
        movq $0x00, %rbp
        addq %rax, %r8
        adcq %rdx, %rbp
        movq %r13, %rax
        mulq %rcx
        movq $0x00, %rsi
        addq %rax, %r9
        adcq %rdx, %rsi
        movq %r14, %rax
        mulq %rcx
        addq %rbp, %r9
        adcq %rax, %r10
        adcq %rdx, %r11
        movq $0x00, %rbx
        adcq $0x00, %rbx
        movq %r15, %rax
        mulq %rcx
        addq %rsi, %r10
        adcq %rax, %r11
        adcq %rdx, %rbx
        movq $0xeb2106215d086329, %rcx
        movq %r12, %rax
        mulq %rcx
        movq $0x00, %rbp
        addq %rax, %r9
        adcq %rdx, %rbp
        movq %r13, %rax
        mulq %rcx
        movq $0x00, %rsi
        addq %rax, %r10
        adcq %rdx, %rsi
        movq %r14, %rax
        mulq %rcx
        addq %rbp, %r10
        adcq %rax, %r11
        adcq %rdx, %rbx
        movq $0x00, %rbp
        adcq $0x00, %rbp
        movq %r15, %rax
        mulq %rcx
        addq %rsi, %r11
        adcq %rax, %rbx
        adcq %rdx, %rbp
        /* Subtract the top words two words up; the accumulator is now
         * r8,r9,r10,r11,r12,r13 (low to high). */
        subq %r12, %r10
        movq %rbx, %r12
        sbbq %r13, %r11
        movq %rbp, %r13
        sbbq %r14, %r12
        sbbq %r15, %r13
        /* rcx = arithmetic shift of the top word by 57 bits: an all-ones
         * or all-zeros mask when the word's top bits are sign bits. */
        movq %r13, %rcx
        sarq $57, %rcx
        # Conditionally subtract order starting at bit 125
        movq $0xa000000000000000, %rax
        movq $0xcb024c634b9eba7d, %rdx
        movq $0x29bdf3bd45ef39a, %rbx
        movq $0x200000000000000, %rbp
        andq %rcx, %rax
        andq %rcx, %rdx
        andq %rcx, %rbx
        andq %rcx, %rbp
        addq %rax, %r9
        adcq %rdx, %r10
        adcq %rbx, %r11
        adcq $0x00, %r12
        adcq %rbp, %r13
        # Move bits 252-376 to own registers
movq $0xfffffffffffffff, %rcx shldq $4, %r12, %r13 shldq $4, %r11, %r12 andq %rcx, %r11 # Sub product of top 2 words and order # * -5812631a5cf5d3ed movq $0xa7ed9ce5a30a2c13, %rcx movq %r12, %rax mulq %rcx movq $0x00, %rbx addq %rax, %r8 adcq %rdx, %r9 adcq $0x00, %rbx movq %r13, %rax mulq %rcx addq %rax, %r9 adcq %rdx, %rbx # * -14def9dea2f79cd7 movq $0xeb2106215d086329, %rcx movq %r12, %rax mulq %rcx movq $0x00, %rbp addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %rbp movq %r13, %rax mulq %rcx addq %rax, %r10 adcq %rdx, %rbp # Add overflows at 2 * 64 movq $0xfffffffffffffff, %rsi andq %rsi, %r11 addq %rbx, %r10 adcq %rbp, %r11 # Subtract top at 2 * 64 subq %r12, %r10 sbbq %r13, %r11 sbbq %rsi, %rsi # Conditional sub order movq $0x5812631a5cf5d3ed, %rax movq $0x14def9dea2f79cd6, %rdx movq $0x1000000000000000, %rbx andq %rsi, %rax andq %rsi, %rdx andq %rsi, %rbx addq %rax, %r8 movq $0xfffffffffffffff, %rax adcq %rdx, %r9 adcq $0x00, %r10 adcq %rbx, %r11 andq %rax, %r11 # Store result movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) popq %rbp popq %rbx popq %r15 popq %r14 popq %r13 popq %r12 repz retq #ifndef __APPLE__ .size sc_reduce_x64,.-sc_reduce_x64 #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl sc_muladd_x64 .type sc_muladd_x64,@function .align 16 sc_muladd_x64: #else .section __TEXT,__text .globl _sc_muladd_x64 .p2align 4 _sc_muladd_x64: #endif /* __APPLE__ */ pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 pushq %rbx movq %rdx, %rbp # Multiply # A[0] * B[0] movq (%rbp), %rax mulq (%rsi) movq %rax, %r8 movq %rdx, %r9 # A[0] * B[1] movq 8(%rbp), %rax mulq (%rsi) xorq %r10, %r10 addq %rax, %r9 adcq %rdx, %r10 # A[1] * B[0] movq (%rbp), %rax mulq 8(%rsi) xorq %r11, %r11 addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[0] * B[2] movq 16(%rbp), %rax mulq (%rsi) addq %rax, %r10 adcq %rdx, %r11 # A[1] * B[1] movq 8(%rbp), %rax mulq 8(%rsi) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[2] * B[0] movq 
(%rbp), %rax mulq 16(%rsi) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[0] * B[3] movq 24(%rbp), %rax mulq (%rsi) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[2] movq 16(%rbp), %rax mulq 8(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[1] movq 8(%rbp), %rax mulq 16(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[3] * B[0] movq (%rbp), %rax mulq 24(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[3] movq 24(%rbp), %rax mulq 8(%rsi) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[2] movq 16(%rbp), %rax mulq 16(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[1] movq 8(%rbp), %rax mulq 24(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[3] movq 24(%rbp), %rax mulq 16(%rsi) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[2] movq 16(%rbp), %rax mulq 24(%rsi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[3] movq 24(%rbp), %rax mulq 24(%rsi) addq %rax, %r14 adcq %rdx, %r15 # Add c to a * b addq (%rcx), %r8 adcq 8(%rcx), %r9 adcq 16(%rcx), %r10 adcq 24(%rcx), %r11 adcq $0x00, %r12 adcq $0x00, %r13 adcq $0x00, %r14 adcq $0x00, %r15 movq %r15, %rbx movq $0xfffffffffffffff, %rcx shrq $56, %rbx shldq $4, %r14, %r15 shldq $4, %r13, %r14 shldq $4, %r12, %r13 shldq $4, %r11, %r12 andq %rcx, %r11 andq %rcx, %r15 # Add order times bits 504..507 subq %rbx, %r14 sbbq $0x00, %r15 movq $0xeb2106215d086329, %rax mulq %rbx movq $0x00, %rcx addq %rax, %r13 movq $0xa7ed9ce5a30a2c13, %rax adcq %rdx, %rcx mulq %rbx addq %rax, %r12 adcq %rdx, %r13 adcq %rcx, %r14 adcq $0x00, %r15 # Sub product of top 4 words and order movq $0xa7ed9ce5a30a2c13, %rbx movq %r12, %rax mulq %rbx movq $0x00, %rbp addq %rax, %r8 adcq %rdx, %rbp movq %r13, %rax mulq %rbx movq $0x00, %rcx addq %rax, %r9 adcq %rdx, %rcx movq %r14, %rax mulq %rbx addq %rbp, %r9 adcq %rax, %r10 adcq %rdx, %r11 movq $0x00, %rsi adcq $0x00, %rsi movq 
%r15, %rax mulq %rbx addq %rcx, %r10 adcq %rax, %r11 adcq %rdx, %rsi movq $0xeb2106215d086329, %rbx movq %r12, %rax mulq %rbx movq $0x00, %rbp addq %rax, %r9 adcq %rdx, %rbp movq %r13, %rax mulq %rbx movq $0x00, %rcx addq %rax, %r10 adcq %rdx, %rcx movq %r14, %rax mulq %rbx addq %rbp, %r10 adcq %rax, %r11 adcq %rdx, %rsi movq $0x00, %rbp adcq $0x00, %rbp movq %r15, %rax mulq %rbx addq %rcx, %r11 adcq %rax, %rsi adcq %rdx, %rbp subq %r12, %r10 movq %rsi, %r12 sbbq %r13, %r11 movq %rbp, %r13 sbbq %r14, %r12 sbbq %r15, %r13 movq %r13, %rbx sarq $57, %rbx # Conditionally subtract order starting at bit 125 movq $0xa000000000000000, %rax movq $0xcb024c634b9eba7d, %rdx movq $0x29bdf3bd45ef39a, %rsi movq $0x200000000000000, %rbp andq %rbx, %rax andq %rbx, %rdx andq %rbx, %rsi andq %rbx, %rbp addq %rax, %r9 adcq %rdx, %r10 adcq %rsi, %r11 adcq $0x00, %r12 adcq %rbp, %r13 # Move bits 252-376 to own registers movq $0xfffffffffffffff, %rbx shldq $4, %r12, %r13 shldq $4, %r11, %r12 andq %rbx, %r11 # Sub product of top 2 words and order # * -5812631a5cf5d3ed movq $0xa7ed9ce5a30a2c13, %rbx movq %r12, %rax mulq %rbx movq $0x00, %rsi addq %rax, %r8 adcq %rdx, %r9 adcq $0x00, %rsi movq %r13, %rax mulq %rbx addq %rax, %r9 adcq %rdx, %rsi # * -14def9dea2f79cd7 movq $0xeb2106215d086329, %rbx movq %r12, %rax mulq %rbx movq $0x00, %rbp addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %rbp movq %r13, %rax mulq %rbx addq %rax, %r10 adcq %rdx, %rbp # Add overflows at 2 * 64 movq $0xfffffffffffffff, %rcx andq %rcx, %r11 addq %rsi, %r10 adcq %rbp, %r11 # Subtract top at 2 * 64 subq %r12, %r10 sbbq %r13, %r11 sbbq %rcx, %rcx # Conditional sub order movq $0x5812631a5cf5d3ed, %rax movq $0x14def9dea2f79cd6, %rdx movq $0x1000000000000000, %rsi andq %rcx, %rax andq %rcx, %rdx andq %rcx, %rsi addq %rax, %r8 movq $0xfffffffffffffff, %rax adcq %rdx, %r9 adcq $0x00, %r10 adcq %rsi, %r11 andq %rax, %r11 # Store result movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) popq %rbx popq 
%r15
        popq    %r14
        popq    %r13
        popq    %r12
        popq    %rbp
        repz retq
#ifndef __APPLE__
.size sc_muladd_x64,.-sc_muladd_x64
#endif /* __APPLE__ */
/* Non-constant time modular inversion modulo 2^255 - 19.
 *
 * In:    %rdi = r, four 64-bit limbs written with the result
 *        %rsi = a, four 64-bit limbs, the number to invert
 * Stack: 0x208 bytes of scratch holding one op-code byte per step
 *
 * Phase 1 runs a binary GCD on u = 2^255-19 (%rcx,%r8,%r9,%r10) and
 * v = a (%r11..%r14) and logs each step as a byte at (%rsp,%r15):
 *   0 = halve u, 1 = halve v, 2 = u -= v then halve, 3 = v -= u then halve.
 * It stops when u or v equals 1; %al records which one (1 = u, 0 = v).
 * Phase 2 appends the terminator byte 7 and replays the log on
 * b (%rcx,%r8,%r9,%r10, starts as the modulus) and d (%r11..%r14, starts
 * as 1), adding the modulus whenever a subtraction borrows or a value
 * to be halved is odd.  b is stored when %al == 1, otherwise d.
 *
 * An all-zero input is answered with zero.  Without that early exit the
 * first halving loop never sees an odd v and keeps logging bytes beyond
 * the scratch area.  An input equal to the modulus itself still does not
 * terminate (u - v becomes 0), so callers must pass a reduced value.
 *
 * NOTE(review): the previous header advertised "MP_OKAY on success", but
 * no instruction here loads a status into %eax; only %al is written.
 * Confirm the C prototype before relying on a return value.
 */
#ifndef __APPLE__
.text
.globl fe_invert_nct_x64
.type fe_invert_nct_x64,@function
.align 16
fe_invert_nct_x64:
#else
.section __TEXT,__text
.globl _fe_invert_nct_x64
.p2align 4
_fe_invert_nct_x64:
#endif /* __APPLE__ */
        pushq   %r12
        pushq   %r13
        pushq   %r14
        pushq   %r15
        /* 0x201 bytes were reserved originally; rounded up to 0x208 so
         * that %rsp does not end up on an odd address. */
        subq    $0x208, %rsp
        movq    $-19, %rcx
        movq    $-1, %r8
        movq    $-1, %r9
        movq    $0x7fffffffffffffff, %r10
        movq    (%rsi), %r11
        movq    8(%rsi), %r12
        movq    16(%rsi), %r13
        movq    24(%rsi), %r14
        movq    $0x00, %r15
        /* a == 0: store the zero limbs held in %r11..%r14 and return. */
        movq    %r11, %rdx
        orq     %r12, %rdx
        orq     %r13, %rdx
        orq     %r14, %rdx
        jz      L_fe_invert_nct_store_d
        testb   $0x01, %r11b
        jnz     fe_invert_nct_v_even_end
fe_invert_nct_v_even_start:
        shrdq   $1, %r12, %r11
        shrdq   $1, %r13, %r12
        shrdq   $1, %r14, %r13
        shrq    $1, %r14
        movb    $0x01, (%rsp,%r15,1)
        incq    %r15
        testb   $0x01, %r11b
        jz      fe_invert_nct_v_even_start
fe_invert_nct_v_even_end:
L_fe_invert_nct_uv_start:
        /* Unsigned compare of u against v, most significant limb first. */
        cmpq    %r14, %r10
        jb      L_fe_invert_nct_uv_v
        ja      L_fe_invert_nct_uv_u
        cmpq    %r13, %r9
        jb      L_fe_invert_nct_uv_v
        ja      L_fe_invert_nct_uv_u
        cmpq    %r12, %r8
        jb      L_fe_invert_nct_uv_v
        ja      L_fe_invert_nct_uv_u
        cmpq    %r11, %rcx
        jb      L_fe_invert_nct_uv_v
L_fe_invert_nct_uv_u:
        movb    $2, (%rsp,%r15,1)
        incq    %r15
        subq    %r11, %rcx
        sbbq    %r12, %r8
        sbbq    %r13, %r9
        sbbq    %r14, %r10
        shrdq   $1, %r8, %rcx
        shrdq   $1, %r9, %r8
        shrdq   $1, %r10, %r9
        shrq    $1, %r10
        testb   $0x01, %cl
        jnz     fe_invert_nct_usubv_even_end
fe_invert_nct_usubv_even_start:
        shrdq   $1, %r8, %rcx
        shrdq   $1, %r9, %r8
        shrdq   $1, %r10, %r9
        shrq    $1, %r10
        movb    $0x00, (%rsp,%r15,1)
        incq    %r15
        testb   $0x01, %cl
        jz      fe_invert_nct_usubv_even_start
fe_invert_nct_usubv_even_end:
        /* Finished once u == 1. */
        cmpq    $0x01, %rcx
        jne     L_fe_invert_nct_uv_start
        movq    %r8, %rdx
        orq     %r9, %rdx
        jne     L_fe_invert_nct_uv_start
        orq     %r10, %rdx
        jne     L_fe_invert_nct_uv_start
        movb    $0x01, %al
        jmp     L_fe_invert_nct_uv_end
L_fe_invert_nct_uv_v:
        movb    $3, (%rsp,%r15,1)
        incq    %r15
        subq    %rcx, %r11
        sbbq    %r8, %r12
        sbbq    %r9, %r13
        sbbq    %r10, %r14
        shrdq   $1, %r12, %r11
        shrdq   $1, %r13, %r12
        shrdq   $1, %r14, %r13
        shrq    $1, %r14
        testb   $0x01, %r11b
        jnz     fe_invert_nct_vsubu_even_end
fe_invert_nct_vsubu_even_start:
        shrdq   $1, %r12, %r11
        shrdq   $1, %r13, %r12
        shrdq   $1, %r14, %r13
        shrq    $1, %r14
        movb    $0x01, (%rsp,%r15,1)
        incq    %r15
        testb   $0x01, %r11b
        jz      fe_invert_nct_vsubu_even_start
fe_invert_nct_vsubu_even_end:
        /* Finished once v == 1. */
        cmpq    $0x01, %r11
        jne     L_fe_invert_nct_uv_start
        movq    %r12, %rdx
        orq     %r13, %rdx
        jne     L_fe_invert_nct_uv_start
        orq     %r14, %rdx
        jne     L_fe_invert_nct_uv_start
        movb    $0x00, %al
L_fe_invert_nct_uv_end:
        /* Phase 2: b = modulus, d = 1, then replay the op log from byte 0. */
        movq    $-19, %rcx
        movq    $-1, %r8
        movq    $-1, %r9
        movq    $0x7fffffffffffffff, %r10
        movq    $0x01, %r11
        xorq    %r12, %r12
        xorq    %r13, %r13
        xorq    %r14, %r14
        movb    $7, (%rsp,%r15,1)
        movb    (%rsp), %dl
        movq    $0x01, %r15
        cmpb    $0x01, %dl
        je      L_fe_invert_nct_op_div2_d
        jl      L_fe_invert_nct_op_div2_b
        cmpb    $3, %dl
        je      L_fe_invert_nct_op_d_sub_b
        jl      L_fe_invert_nct_op_b_sub_d
        jmp     L_fe_invert_nct_op_end
L_fe_invert_nct_op_b_sub_d:
        subq    %r11, %rcx
        sbbq    %r12, %r8
        sbbq    %r13, %r9
        sbbq    %r14, %r10
        jnc     L_fe_invert_nct_op_div2_b
        /* Borrow: add the modulus back. */
        movq    $-1, %rdx
        addq    $-19, %rcx
        adcq    %rdx, %r8
        adcq    %rdx, %r9
        movq    $0x7fffffffffffffff, %rdx
        adcq    %rdx, %r10
L_fe_invert_nct_op_div2_b:
        testb   $0x01, %cl
        jz      L_fe_invert_nct_op_div2_b_mod
        /* Odd: add the modulus so the shift below is exact. */
        addq    $-19, %rcx
        movq    $-1, %rdx
        adcq    %rdx, %r8
        adcq    %rdx, %r9
        movq    $0x7fffffffffffffff, %rdx
        adcq    %rdx, %r10
L_fe_invert_nct_op_div2_b_mod:
        shrdq   $1, %r8, %rcx
        shrdq   $1, %r9, %r8
        shrdq   $1, %r10, %r9
        shrq    $1, %r10
        movb    (%rsp,%r15,1), %dl
        incq    %r15
        cmpb    $0x01, %dl
        je      L_fe_invert_nct_op_div2_d
        jl      L_fe_invert_nct_op_div2_b
        cmpb    $3, %dl
        je      L_fe_invert_nct_op_d_sub_b
        jl      L_fe_invert_nct_op_b_sub_d
        jmp     L_fe_invert_nct_op_end
L_fe_invert_nct_op_d_sub_b:
        subq    %rcx, %r11
        sbbq    %r8, %r12
        sbbq    %r9, %r13
        sbbq    %r10, %r14
        jnc     L_fe_invert_nct_op_div2_d
        /* Borrow: add the modulus back. */
        movq    $-1, %rdx
        addq    $-19, %r11
        adcq    %rdx, %r12
        adcq    %rdx, %r13
        movq    $0x7fffffffffffffff, %rdx
        adcq    %rdx, %r14
L_fe_invert_nct_op_div2_d:
        testb   $0x01, %r11b
        jz      L_fe_invert_nct_op_div2_d_mod
        /* Odd: add the modulus so the shift below is exact. */
        addq    $-19, %r11
        movq    $-1, %rdx
        adcq    %rdx, %r12
        adcq    %rdx, %r13
        movq    $0x7fffffffffffffff, %rdx
        adcq    %rdx, %r14
L_fe_invert_nct_op_div2_d_mod:
        shrdq   $1, %r12, %r11
        shrdq   $1, %r13, %r12
        shrdq   $1, %r14, %r13
        shrq    $1, %r14
        movb    (%rsp,%r15,1), %dl
        incq    %r15
        cmpb    $0x01, %dl
        je      L_fe_invert_nct_op_div2_d
        jl      L_fe_invert_nct_op_div2_b
        cmpb    $3, %dl
        je      L_fe_invert_nct_op_d_sub_b
        jl      L_fe_invert_nct_op_b_sub_d
L_fe_invert_nct_op_end:
        /* %al == 1 selects b, anything else selects d. */
        cmpb    $0x01, %al
        jne     L_fe_invert_nct_store_d
        movq    %rcx, (%rdi)
        movq    %r8, 8(%rdi)
        movq    %r9, 16(%rdi)
        movq    %r10, 24(%rdi)
        jmp     L_fe_invert_nct_store_end
L_fe_invert_nct_store_d:
        movq    %r11, (%rdi)
        movq    %r12, 8(%rdi)
        movq    %r13, 16(%rdi)
        movq    %r14, 24(%rdi)
L_fe_invert_nct_store_end:
        addq    $0x208, %rsp
        popq    %r15
        popq    %r14
        popq    %r13
        popq    %r12
        repz retq
#ifndef __APPLE__
.size fe_invert_nct_x64,.-fe_invert_nct_x64
#endif /* __APPLE__ */
#endif /* HAVE_ED25519 */
#ifdef HAVE_INTEL_AVX2
/* Select one 96-byte table entry by a signed byte index, touching every
 * entry so the memory access pattern does not depend on the index.
 *
 * In:  %rdi = r, 96 bytes written (three 32-byte values)
 *      %rsi = table base, eight entries of 96 bytes (offsets 0..767)
 *      %dl  = b, signed index; only the low byte of %rdx is used
 *
 * |b| is broadcast to every dword of %ymm7 and compared against the
 * running counter in %ymm8 (0, then 1..8).  |b| == 0 keeps the default
 * (1, 1, 0) in %ymm3/%ymm4/%ymm5; |b| == k ORs in entry k-1.  When b is
 * negative the first two values are swapped and the third is replaced by
 * (2^255-19) minus itself.
 *
 * The scalar-to-vector moves use the VEX forms: they clear bits 255:128
 * of the destination, and %ymm9 is copied whole into %ymm3/%ymm4 as the
 * default value 1, so stale upper-half contents must not survive.
 */
#ifndef __APPLE__
.text
.globl fe_cmov_table_avx2
.type fe_cmov_table_avx2,@function
.align 16
fe_cmov_table_avx2:
#else
.section __TEXT,__text
.globl _fe_cmov_table_avx2
.p2align 4
_fe_cmov_table_avx2:
#endif /* __APPLE__ */
        pushq   %r12
        pushq   %r13
        pushq   %r14
        pushq   %r15
        pushq   %rbx
        movq    %rdx, %rcx
        xorq    %rbx, %rbx
        /* %al = |b| via (b ^ sign) - sign. */
        movsbq  %cl, %rax
        cdq
        xorb    %dl, %al
        subb    %dl, %al
        movb    %al, %bl
        vmovd   %ebx, %xmm7
        movq    $0x01, %rbx
        vmovq   %rbx, %xmm9
        vmovdqa %ymm9, %ymm3
        vmovdqa %ymm9, %ymm4
        vpxor   %ymm8, %ymm8, %ymm8
        /* An all-zero index vector makes vpermd broadcast dword 0. */
        vpermd  %ymm7, %ymm8, %ymm7
        vpermd  %ymm9, %ymm8, %ymm9
        vpxor   %ymm0, %ymm0, %ymm0
        vpxor   %ymm1, %ymm1, %ymm1
        vpxor   %ymm2, %ymm2, %ymm2
        vpcmpeqd %ymm7, %ymm8, %ymm6
        vpxor   %ymm5, %ymm5, %ymm5
        vpand   %ymm6, %ymm3, %ymm3
        vpand   %ymm6, %ymm4, %ymm4
        vmovdqa %ymm9, %ymm8
        /* Entry 0, selected when |b| == 1. */
        vpcmpeqd %ymm7, %ymm8, %ymm6
        vpaddd  %ymm9, %ymm8, %ymm8
        vmovupd (%rsi), %ymm0
        vmovupd 32(%rsi), %ymm1
        vmovupd 64(%rsi), %ymm2
        vpand   %ymm6, %ymm0, %ymm0
        vpand   %ymm6, %ymm1, %ymm1
        vpand   %ymm6, %ymm2, %ymm2
        vpor    %ymm0, %ymm3, %ymm3
        vpor    %ymm1, %ymm4, %ymm4
        vpor    %ymm2, %ymm5, %ymm5
        /* Entry 1 */
        vpcmpeqd %ymm7, %ymm8, %ymm6
        vpaddd  %ymm9, %ymm8, %ymm8
        vmovupd 96(%rsi), %ymm0
        vmovupd 128(%rsi), %ymm1
        vmovupd 160(%rsi), %ymm2
        vpand   %ymm6, %ymm0, %ymm0
        vpand   %ymm6, %ymm1, %ymm1
        vpand   %ymm6, %ymm2, %ymm2
        vpor    %ymm0, %ymm3, %ymm3
        vpor    %ymm1, %ymm4, %ymm4
        vpor    %ymm2, %ymm5, %ymm5
        /* Entry 2 */
        vpcmpeqd %ymm7, %ymm8, %ymm6
        vpaddd  %ymm9, %ymm8, %ymm8
        vmovupd 192(%rsi), %ymm0
        vmovupd 224(%rsi), %ymm1
        vmovupd 256(%rsi), %ymm2
        vpand   %ymm6, %ymm0, %ymm0
        vpand   %ymm6, %ymm1, %ymm1
        vpand   %ymm6, %ymm2, %ymm2
        vpor    %ymm0, %ymm3, %ymm3
        vpor    %ymm1, %ymm4, %ymm4
        vpor    %ymm2, %ymm5, %ymm5
        /* Entry 3 */
        vpcmpeqd %ymm7, %ymm8, %ymm6
        vpaddd  %ymm9, %ymm8, %ymm8
        vmovupd 288(%rsi), %ymm0
        vmovupd 320(%rsi), %ymm1
        vmovupd 352(%rsi), %ymm2
        vpand   %ymm6, %ymm0, %ymm0
        vpand   %ymm6, %ymm1, %ymm1
        vpand   %ymm6, %ymm2, %ymm2
        vpor    %ymm0, %ymm3, %ymm3
        vpor    %ymm1, %ymm4, %ymm4
        vpor    %ymm2, %ymm5, %ymm5
        /* Entry 4 */
        vpcmpeqd %ymm7, %ymm8, %ymm6
        vpaddd  %ymm9, %ymm8, %ymm8
        vmovupd 384(%rsi), %ymm0
        vmovupd 416(%rsi), %ymm1
        vmovupd 448(%rsi), %ymm2
        vpand   %ymm6, %ymm0, %ymm0
        vpand   %ymm6, %ymm1, %ymm1
        vpand   %ymm6, %ymm2, %ymm2
        vpor    %ymm0, %ymm3, %ymm3
        vpor    %ymm1, %ymm4, %ymm4
        vpor    %ymm2, %ymm5, %ymm5
        /* Entry 5 */
        vpcmpeqd %ymm7, %ymm8, %ymm6
        vpaddd  %ymm9, %ymm8, %ymm8
        vmovupd 480(%rsi), %ymm0
        vmovupd 512(%rsi), %ymm1
        vmovupd 544(%rsi), %ymm2
        vpand   %ymm6, %ymm0, %ymm0
        vpand   %ymm6, %ymm1, %ymm1
        vpand   %ymm6, %ymm2, %ymm2
        vpor    %ymm0, %ymm3, %ymm3
        vpor    %ymm1, %ymm4, %ymm4
        vpor    %ymm2, %ymm5, %ymm5
        /* Entry 6 */
        vpcmpeqd %ymm7, %ymm8, %ymm6
        vpaddd  %ymm9, %ymm8, %ymm8
        vmovupd 576(%rsi), %ymm0
        vmovupd 608(%rsi), %ymm1
        vmovupd 640(%rsi), %ymm2
        vpand   %ymm6, %ymm0, %ymm0
        vpand   %ymm6, %ymm1, %ymm1
        vpand   %ymm6, %ymm2, %ymm2
        vpor    %ymm0, %ymm3, %ymm3
        vpor    %ymm1, %ymm4, %ymm4
        vpor    %ymm2, %ymm5, %ymm5
        /* Entry 7 */
        vpcmpeqd %ymm7, %ymm8, %ymm6
        vpaddd  %ymm9, %ymm8, %ymm8
        vmovupd 672(%rsi), %ymm0
        vmovupd 704(%rsi), %ymm1
        vmovupd 736(%rsi), %ymm2
        vpand   %ymm6, %ymm0, %ymm0
        vpand   %ymm6, %ymm1, %ymm1
        vpand   %ymm6, %ymm2, %ymm2
        vpor    %ymm0, %ymm3, %ymm3
        vpor    %ymm1, %ymm4, %ymm4
        vpor    %ymm2, %ymm5, %ymm5
        /* Broadcast the sign of b and swap the first two values under it. */
        movsbq  %cl, %rax
        sarq    $63, %rax
        vmovd   %eax, %xmm6
        vpxor   %ymm8, %ymm8, %ymm8
        vpermd  %ymm6, %ymm8, %ymm6
        vpxor   %ymm4, %ymm3, %ymm8
        vpand   %ymm6, %ymm8, %ymm8
        vpxor   %ymm8, %ymm3, %ymm3
        vpxor   %ymm8, %ymm4, %ymm4
        vmovupd %ymm3, (%rdi)
        vmovupd %ymm4, 32(%rdi)
        vmovupd %ymm5, 64(%rdi)
        /* Third value: replace by (2^255-19) - value when b < 0. */
        movq    64(%rdi), %r8
        movq    72(%rdi), %r9
        movq    80(%rdi), %r10
        movq    88(%rdi), %r11
        movq    $-19, %r12
        movq    $-1, %r13
        movq    $-1, %r14
        movq    $0x7fffffffffffffff, %r15
        subq    %r8, %r12
        sbbq    %r9, %r13
        sbbq    %r10, %r14
        sbbq    %r11, %r15
        cmpb    $0x00, %cl
        cmovlq  %r12, %r8
        cmovlq  %r13, %r9
        cmovlq  %r14, %r10
        cmovlq  %r15, %r11
        movq    %r8, 64(%rdi)
        movq    %r9, 72(%rdi)
        movq    %r10, 80(%rdi)
        movq    %r11, 88(%rdi)
        vzeroupper
        popq    %rbx
        popq    %r15
        popq    %r14
        popq    %r13
        popq    %r12
        repz retq
#ifndef __APPLE__
.size fe_cmov_table_avx2,.-fe_cmov_table_avx2
#endif /* __APPLE__ */
/* Field multiplication using mulx/adcx/adox.
 *
 * In:  %rdi = r, four limbs written
 *      %rsi = a, four limbs (labelled A[] in the comments below)
 *      %rdx = b, four limbs (labelled B[]); copied to %rbp because mulx
 *             takes one factor implicitly from %rdx
 *
 * The 512-bit product is built in %r8..%r15.  The upper four words are
 * folded into the lower four with the factor 38, bits 255 and above with
 * the factor 19, and a last pass clears bit 255 again, adding 19 when it
 * was set.
 */
#ifndef __APPLE__
.text
.globl fe_mul_avx2
.type fe_mul_avx2,@function
.align 16
fe_mul_avx2:
#else
.section __TEXT,__text
.globl _fe_mul_avx2
.p2align 4
_fe_mul_avx2:
#endif /* __APPLE__ */
        pushq   %rbx
        pushq   %r12
        pushq   %r13
        pushq   %r14
        pushq   %r15
        pushq   %rbp
        movq    %rdx, %rbp
        movq    (%rsi), %rbx
        # Multiply
        # A[0] * B[0]
        movq    (%rbp), %rdx
        mulxq   %rbx, %r8, %r9
        # A[2] * B[0]
        mulxq   16(%rsi), %r10, %r11
        # A[1] * B[0]
        mulxq   8(%rsi), %rax, %rcx
        xorq    %r15, %r15
        adcxq   %rax, %r9
        # A[3] * B[1]
        movq    8(%rbp), %rdx
        mulxq   24(%rsi), %r12, %r13
        adcxq   %rcx, %r10
        # A[0] * B[1]
        mulxq   %rbx, %rax, %rcx
        adoxq   %rax, %r9
        # A[2] * B[1]
        mulxq   16(%rsi), %rax, %r14
        adoxq   %rcx, %r10
        adcxq   %rax, %r11
        # A[1] * B[2]
        movq    16(%rbp), %rdx
        mulxq   8(%rsi), %rax, %rcx
        adcxq   %r14, %r12
        adoxq   %rax, %r11
        adcxq   %r15, %r13
        adoxq   %rcx, %r12
        # A[0] * B[2]
        mulxq   %rbx, %rax, %rcx
        adoxq   %r15, %r13
        xorq    %r14, %r14
        adcxq   %rax, %r10
        # A[1] * B[1]
        movq    8(%rbp), %rdx
        mulxq   8(%rsi), %rdx, %rax
        adcxq   %rcx, %r11
        adoxq   %rdx, %r10
        # A[1] * B[3]
        movq    24(%rbp), %rdx
        adoxq   %rax, %r11
        mulxq   8(%rsi), %rax, %rcx
        adcxq   %rax, %r12
        # A[2] * B[2]
        movq    16(%rbp), %rdx
        mulxq   16(%rsi), %rdx, %rax
        adcxq   %rcx, %r13
        adoxq   %rdx, %r12
        # A[3] * B[3]
        movq    24(%rbp), %rdx
        adoxq   %rax, %r13
        mulxq   24(%rsi), %rax, %rcx
        adoxq   %r15, %r14
        adcxq   %rax, %r14
        # A[0] * B[3]
        mulxq   %rbx, %rdx, %rax
        adcxq   %rcx, %r15
        xorq    %rcx, %rcx
        adcxq   %rdx, %r11
        # A[3] * B[0]
        movq    24(%rsi), %rdx
        adcxq   %rax, %r12
        mulxq   (%rbp), %rdx, %rax
        adoxq   %rdx, %r11
        adoxq   %rax, %r12
        # A[3] * B[2]
        movq    24(%rsi), %rdx
        mulxq   16(%rbp), %rdx, %rax
        adcxq   %rdx, %r13
        # A[2] * B[3]
        movq    24(%rbp), %rdx
        adcxq   %rax, %r14
        mulxq   16(%rsi), %rax, %rdx
        adcxq   %rcx, %r15
        adoxq   %rax, %r13
        adoxq   %rdx, %r14
        adoxq   %rcx, %r15
        /* Fold the upper four words with 38 and the top bits with 19. */
        movq    $38, %rdx
        mulxq   %r15, %r15, %rax
        addq    %r15, %r11
        adcq    $0x00, %rax
        movq    $0x7fffffffffffffff, %rcx
        shldq   $0x01, %r11, %rax
        imulq   $19, %rax, %rax
        andq    %rcx, %r11
        xorq    %rcx, %rcx
        adoxq   %rax, %r8
        mulxq   %r12, %rax, %r12
        adcxq   %rax, %r8
        adoxq   %r12, %r9
        mulxq   %r13, %rax, %r13
        adcxq   %rax, %r9
        adoxq   %r13, %r10
        mulxq   %r14, %rax, %r14
        adcxq   %rax, %r10
        adoxq   %r14, %r11
        adcxq   %rcx, %r11
        /* Clear bit 255 and add 19 when it was set. */
        movq    $0x7fffffffffffffff, %rcx
        movq    %r11, %rdx
        sarq    $63, %rdx
        andq    $19, %rdx
        andq    %rcx, %r11
        addq    %rdx, %r8
        adcq    $0x00, %r9
        adcq    $0x00, %r10
        adcq    $0x00, %r11
        # Store
        movq    %r8, (%rdi)
        movq    %r9, 8(%rdi)
        movq    %r10, 16(%rdi)
        movq    %r11, 24(%rdi)
        popq    %rbp
        popq    %r15
        popq    %r14
        popq    %r13
        popq    %r12
        popq    %rbx
        repz retq
#ifndef __APPLE__
.size fe_mul_avx2,.-fe_mul_avx2
#endif /* __APPLE__ */
/* Field squaring using mulx/adcx/adox.
 *
 * In:  %rdi = r, four limbs written
 *      %rsi = a, four limbs
 *
 * The six cross products are summed once and doubled (adcx reg,reg)
 * while the four diagonal squares are added through the adox chain.
 * The reduction is the same as in fe_mul_avx2: fold with 38, fold the
 * top bits with 19, then clear bit 255 and add 19 when it was set.
 */
#ifndef __APPLE__
.text
.globl fe_sq_avx2
.type fe_sq_avx2,@function
.align 16
fe_sq_avx2:
#else
.section __TEXT,__text
.globl _fe_sq_avx2
.p2align 4
_fe_sq_avx2:
#endif /* __APPLE__ */
        pushq   %rbx
        pushq   %r12
        pushq   %r13
        pushq   %r14
        pushq   %r15
        # Square
        movq    (%rsi), %rdx
        movq    8(%rsi), %rax
        # A[0] * A[1]
        movq    %rdx, %r15
        mulxq   %rax, %r9, %r10
        # A[0] * A[3]
        mulxq   24(%rsi), %r11, %r12
        # A[2] * A[1]
        movq    16(%rsi), %rdx
        mulxq   %rax, %rcx, %rbx
        xorq    %r8, %r8
        adoxq   %rcx, %r11
        # A[2] * A[3]
        mulxq   24(%rsi), %r13, %r14
        adoxq   %rbx, %r12
        # A[2] * A[0]
        mulxq   %r15, %rcx, %rbx
        adoxq   %r8, %r13
        adcxq   %rcx, %r10
        adoxq   %r8, %r14
        # A[1] * A[3]
        movq    %rax, %rdx
        mulxq   24(%rsi), %rcx, %rdx
        adcxq   %rbx, %r11
        adcxq   %rcx, %r12
        adcxq   %rdx, %r13
        adcxq   %r8, %r14
        # A[0] * A[0]
        movq    %r15, %rdx
        mulxq   %rdx, %r8, %rcx
        xorq    %r15, %r15
        adcxq   %r9, %r9
        # A[1] * A[1]
        movq    %rax, %rdx
        adoxq   %rcx, %r9
        mulxq   %rdx, %rcx, %rbx
        adcxq   %r10, %r10
        adoxq   %rcx, %r10
        adcxq   %r11, %r11
        # A[2] * A[2]
        movq    16(%rsi), %rdx
        adoxq   %rbx, %r11
        mulxq   %rdx, %rbx, %rcx
        adcxq   %r12, %r12
        adoxq   %rbx, %r12
        adcxq   %r13, %r13
        # A[3] * A[3]
        movq    24(%rsi), %rdx
        adoxq   %rcx, %r13
        mulxq   %rdx, %rcx, %rbx
        adcxq   %r14, %r14
        adoxq   %rcx, %r14
        adcxq   %r15, %r15
        adoxq   %rbx, %r15
        /* Fold the upper four words with 38 and the top bits with 19. */
        movq    $38, %rdx
        mulxq   %r15, %r15, %rbx
        addq    %r15, %r11
        adcq    $0x00, %rbx
        movq    $0x7fffffffffffffff, %rcx
        shldq   $0x01, %r11, %rbx
        imulq   $19, %rbx, %rbx
        andq    %rcx, %r11
        xorq    %rcx, %rcx
        adoxq   %rbx, %r8
        mulxq   %r12, %rbx, %r12
        adcxq   %rbx, %r8
        adoxq   %r12, %r9
        mulxq   %r13, %rbx, %r13
        adcxq   %rbx, %r9
        adoxq   %r13, %r10
        mulxq   %r14, %rbx, %r14
        adcxq   %rbx, %r10
        adoxq   %r14, %r11
        adcxq   %rcx, %r11
        /* Clear bit 255 and add 19 when it was set. */
        movq    $0x7fffffffffffffff, %rcx
        movq    %r11, %rdx
        sarq    $63, %rdx
        andq    $19, %rdx
        andq    %rcx, %r11
        addq    %rdx, %r8
        adcq    $0x00, %r9
        adcq    $0x00, %r10
        adcq    $0x00, %r11
        # Store
        movq    %r8, (%rdi)
        movq    %r9, 8(%rdi)
        movq    %r10, 16(%rdi)
        movq    %r11, 24(%rdi)
        popq    %r15
        popq    %r14
        popq    %r13
        popq    %r12
        popq    %rbx
        repz retq
#ifndef __APPLE__
.size fe_sq_avx2,.-fe_sq_avx2
#endif /* __APPLE__ */
/* Repeated field squaring.
 *
 * In:  %rdi = r, four limbs written on every pass
 *      %rsi = a, four limbs read on every pass
 *      %rdx = n, pass count; copied to %rbp and counted down with
 *             decb %bpl, so only the low 8 bits are used and a low byte
 *             of 0 yields 256 passes
 *
 * Every pass reloads its operand from (%rsi) and stores to (%rdi), so
 * the squarings only chain when r and a are the same buffer.
 * Unlike fe_sq_avx2 there is no final clear-bit-255/add-19 pass, so the
 * stored top limb may still have bit 63 set.
 */
#ifndef __APPLE__
.text
.globl fe_sq_n_avx2
.type fe_sq_n_avx2,@function
.align 16
fe_sq_n_avx2:
#else
.section __TEXT,__text
.globl _fe_sq_n_avx2
.p2align 4
_fe_sq_n_avx2:
#endif /* __APPLE__ */
        pushq   %rbx
        pushq   %r12
        pushq   %r13
        pushq   %r14
        pushq   %r15
        pushq   %rbp
        movq    %rdx, %rbp
L_fe_sq_n_avx2:
        # Square
        movq    (%rsi), %rdx
        movq    8(%rsi), %rax
        # A[0] * A[1]
        movq    %rdx, %r15
        mulxq   %rax, %r9, %r10
        # A[0] * A[3]
        mulxq   24(%rsi), %r11, %r12
        # A[2] * A[1]
        movq    16(%rsi), %rdx
        mulxq   %rax, %rcx, %rbx
        xorq    %r8, %r8
        adoxq   %rcx, %r11
        # A[2] * A[3]
        mulxq   24(%rsi), %r13, %r14
        adoxq   %rbx, %r12
        # A[2] * A[0]
        mulxq   %r15, %rcx, %rbx
        adoxq   %r8, %r13
        adcxq   %rcx, %r10
        adoxq   %r8, %r14
        # A[1] * A[3]
        movq    %rax, %rdx
        mulxq   24(%rsi), %rcx, %rdx
        adcxq   %rbx, %r11
        adcxq   %rcx, %r12
        adcxq   %rdx, %r13
        adcxq   %r8, %r14
        # A[0] * A[0]
        movq    %r15, %rdx
        mulxq   %rdx, %r8, %rcx
        xorq    %r15, %r15
        adcxq   %r9, %r9
        # A[1] * A[1]
        movq    %rax, %rdx
        adoxq   %rcx, %r9
        mulxq   %rdx, %rcx, %rbx
        adcxq   %r10, %r10
        adoxq   %rcx, %r10
        adcxq   %r11, %r11
        # A[2] * A[2]
        movq    16(%rsi), %rdx
        adoxq   %rbx, %r11
        mulxq   %rdx, %rbx, %rcx
        adcxq   %r12, %r12
        adoxq   %rbx, %r12
        adcxq   %r13, %r13
        # A[3] * A[3]
        movq    24(%rsi), %rdx
        adoxq   %rcx, %r13
        mulxq   %rdx, %rcx, %rbx
        adcxq   %r14, %r14
        adoxq   %rcx, %r14
        adcxq   %r15, %r15
        adoxq   %rbx, %r15
        /* Fold the upper four words with 38 and the top bits with 19. */
        movq    $38, %rdx
        mulxq   %r15, %r15, %rbx
        addq    %r15, %r11
        adcq    $0x00, %rbx
        movq    $0x7fffffffffffffff, %rcx
        shldq   $0x01, %r11, %rbx
        imulq   $19, %rbx, %rbx
        andq    %rcx, %r11
        xorq    %rcx, %rcx
        adoxq   %rbx, %r8
        mulxq   %r12, %rbx, %r12
        adcxq   %rbx, %r8
        adoxq   %r12, %r9
        mulxq   %r13, %rbx, %r13
        adcxq   %rbx, %r9
        adoxq   %r13, %r10
        mulxq   %r14, %rbx, %r14
        adcxq   %rbx, %r10
        adoxq   %r14, %r11
        adcxq   %rcx, %r11
        # Store
        movq    %r8, (%rdi)
        movq    %r9, 8(%rdi)
        movq    %r10, 16(%rdi)
        movq    %r11, 24(%rdi)
        decb    %bpl
        jnz     L_fe_sq_n_avx2
        popq    %rbp
        popq    %r15
        popq    %r14
        popq    %r13
        popq    %r12
        popq    %rbx
        repz retq
#ifndef __APPLE__
.size fe_sq_n_avx2,.-fe_sq_n_avx2
#endif /* __APPLE__ */
/* Multiply a field element by the constant 0x1db42 (121666).
 *
 * In:  %rdi = r, four limbs written
 *      %rsi = a, four limbs
 *
 * The four partial products are chained into a five-word value; the bits
 * from position 255 upwards are multiplied by 19 and added back at the
 * bottom after bit 255 has been cleared.
 */
#ifndef __APPLE__
.text
.globl fe_mul121666_avx2
.type fe_mul121666_avx2,@function
.align 16
fe_mul121666_avx2:
#else
.section __TEXT,__text
.globl _fe_mul121666_avx2
.p2align 4
_fe_mul121666_avx2:
#endif /* __APPLE__ */
        pushq   %r12
        pushq   %r13
        movq    $0x1db42, %rdx
        mulxq   (%rsi), %rax, %r13
        mulxq   8(%rsi), %rcx, %r12
        mulxq   16(%rsi), %r8, %r11
        addq    %r13, %rcx
        mulxq   24(%rsi), %r9, %r10
        adcq    %r12, %r8
        adcq    %r11, %r9
        adcq    $0x00, %r10
        shldq   $0x01, %r9, %r10
        btr     $63, %r9
        imulq   $19, %r10, %r10
        addq    %r10, %rax
        adcq    $0x00, %rcx
        adcq    $0x00, %r8
        adcq    $0x00, %r9
        movq    %rax, (%rdi)
        movq    %rcx, 8(%rdi)
        movq    %r8, 16(%rdi)
        movq    %r9, 24(%rdi)
        popq    %r13
        popq    %r12
        repz retq
#ifndef __APPLE__
.size fe_mul121666_avx2,.-fe_mul121666_avx2
#endif /* __APPLE__ */
/* Field inversion by a fixed chain of squarings and multiplications.
 *
 * In:  %rdi = r, four limbs written with the result
 *      %rsi = a, four limbs
 *
 * Stack frame (0x98 bytes):
 *     0(%rsp)  t0     32(%rsp)  t1     64(%rsp)  t2     96(%rsp)  t3
 *   128(%rsp)  saved r pointer        136(%rsp)  saved a pointer
 *   144(%rsp)  padding
 * On entry %rsp is 8 modulo 16, so reserving 0x98 rather than 0x90 makes
 * %rsp a multiple of 16 at every callq below; no slot offset changes.
 *
 * The chain raises a to the power 2^255 - 21: t0 = a^11, t1 runs through
 * a^(2^k - 1) for k = 5, 10, 50 and 250, and the last step multiplies
 * t1^(2^5) by t0.
 *
 * FE_INVERT_CALL expands to the platform specific call of one of the
 * field helpers: through the PLT on ELF targets, with a leading
 * underscore on Apple targets.
 */
#ifndef __APPLE__
#define FE_INVERT_CALL(fn) callq fn@plt
#else
#define FE_INVERT_CALL(fn) callq _##fn
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_invert_avx2
.type fe_invert_avx2,@function
.align 16
fe_invert_avx2:
#else
.section __TEXT,__text
.globl _fe_invert_avx2
.p2align 4
_fe_invert_avx2:
#endif /* __APPLE__ */
        subq    $0x98, %rsp
        # Invert
        movq    %rdi, 128(%rsp)
        movq    %rsi, 136(%rsp)
        /* t0 = a^2 */
        movq    %rsp, %rdi
        movq    136(%rsp), %rsi
        FE_INVERT_CALL(fe_sq_avx2)
        /* t1 = t0^4 = a^8 */
        leaq    32(%rsp), %rdi
        movq    %rsp, %rsi
        FE_INVERT_CALL(fe_sq_avx2)
        leaq    32(%rsp), %rdi
        leaq    32(%rsp), %rsi
        FE_INVERT_CALL(fe_sq_avx2)
        /* t1 = a * t1 = a^9 */
        leaq    32(%rsp), %rdi
        movq    136(%rsp), %rsi
        leaq    32(%rsp), %rdx
        FE_INVERT_CALL(fe_mul_avx2)
        /* t0 = t0 * t1 = a^11 */
        movq    %rsp, %rdi
        movq    %rsp, %rsi
        leaq    32(%rsp), %rdx
        FE_INVERT_CALL(fe_mul_avx2)
        /* t2 = t0^2 = a^22 */
        leaq    64(%rsp), %rdi
        movq    %rsp, %rsi
        FE_INVERT_CALL(fe_sq_avx2)
        /* t1 = t1 * t2 = a^(2^5 - 1) */
        leaq    32(%rsp), %rdi
        leaq    32(%rsp), %rsi
        leaq    64(%rsp), %rdx
        FE_INVERT_CALL(fe_mul_avx2)
        /* t2 = t1^(2^5) */
        leaq    64(%rsp), %rdi
        leaq    32(%rsp), %rsi
        FE_INVERT_CALL(fe_sq_avx2)
        leaq    64(%rsp), %rdi
        leaq    64(%rsp), %rsi
        movq    $4, %rdx
        FE_INVERT_CALL(fe_sq_n_avx2)
        /* t1 = t2 * t1 = a^(2^10 - 1) */
        leaq    32(%rsp), %rdi
        leaq    64(%rsp), %rsi
        leaq    32(%rsp), %rdx
        FE_INVERT_CALL(fe_mul_avx2)
        /* t2 = t1^(2^10) */
        leaq    64(%rsp), %rdi
        leaq    32(%rsp), %rsi
        FE_INVERT_CALL(fe_sq_avx2)
        leaq    64(%rsp), %rdi
        leaq    64(%rsp), %rsi
        movq    $9, %rdx
        FE_INVERT_CALL(fe_sq_n_avx2)
        /* t2 = t2 * t1 = a^(2^20 - 1) */
        leaq    64(%rsp), %rdi
        leaq    64(%rsp), %rsi
        leaq    32(%rsp), %rdx
        FE_INVERT_CALL(fe_mul_avx2)
        /* t3 = t2^(2^20) */
        leaq    96(%rsp), %rdi
        leaq    64(%rsp), %rsi
        FE_INVERT_CALL(fe_sq_avx2)
        leaq    96(%rsp), %rdi
        leaq    96(%rsp), %rsi
        movq    $19, %rdx
        FE_INVERT_CALL(fe_sq_n_avx2)
        /* t2 = t3 * t2 = a^(2^40 - 1) */
        leaq    64(%rsp), %rdi
        leaq    96(%rsp), %rsi
        leaq    64(%rsp), %rdx
        FE_INVERT_CALL(fe_mul_avx2)
        /* t2 = t2^(2^10) */
        leaq    64(%rsp), %rdi
        leaq    64(%rsp), %rsi
        FE_INVERT_CALL(fe_sq_avx2)
        leaq    64(%rsp), %rdi
        leaq    64(%rsp), %rsi
        movq    $9, %rdx
        FE_INVERT_CALL(fe_sq_n_avx2)
        /* t1 = t2 * t1 = a^(2^50 - 1) */
        leaq    32(%rsp), %rdi
        leaq    64(%rsp), %rsi
        leaq    32(%rsp), %rdx
        FE_INVERT_CALL(fe_mul_avx2)
        /* t2 = t1^(2^50) */
        leaq    64(%rsp), %rdi
        leaq    32(%rsp), %rsi
        FE_INVERT_CALL(fe_sq_avx2)
        leaq    64(%rsp), %rdi
        leaq    64(%rsp), %rsi
        movq    $49, %rdx
        FE_INVERT_CALL(fe_sq_n_avx2)
        /* t2 = t2 * t1 = a^(2^100 - 1) */
        leaq    64(%rsp), %rdi
        leaq    64(%rsp), %rsi
        leaq    32(%rsp), %rdx
        FE_INVERT_CALL(fe_mul_avx2)
        /* t3 = t2^(2^100) */
        leaq    96(%rsp), %rdi
        leaq    64(%rsp), %rsi
        FE_INVERT_CALL(fe_sq_avx2)
        leaq    96(%rsp), %rdi
        leaq    96(%rsp), %rsi
        movq    $0x63, %rdx
        FE_INVERT_CALL(fe_sq_n_avx2)
        /* t2 = t3 * t2 = a^(2^200 - 1) */
        leaq    64(%rsp), %rdi
        leaq    96(%rsp), %rsi
        leaq    64(%rsp), %rdx
        FE_INVERT_CALL(fe_mul_avx2)
        /* t2 = t2^(2^50) */
        leaq    64(%rsp), %rdi
        leaq    64(%rsp), %rsi
        FE_INVERT_CALL(fe_sq_avx2)
        leaq    64(%rsp), %rdi
        leaq    64(%rsp), %rsi
        movq    $49, %rdx
        FE_INVERT_CALL(fe_sq_n_avx2)
        /* t1 = t2 * t1 = a^(2^250 - 1) */
        leaq    32(%rsp), %rdi
        leaq    64(%rsp), %rsi
        leaq    32(%rsp), %rdx
        FE_INVERT_CALL(fe_mul_avx2)
        /* t1 = t1^(2^5) */
        leaq    32(%rsp), %rdi
        leaq    32(%rsp), %rsi
        FE_INVERT_CALL(fe_sq_avx2)
        leaq    32(%rsp), %rdi
        leaq    32(%rsp), %rsi
        movq    $4, %rdx
        FE_INVERT_CALL(fe_sq_n_avx2)
        /* r = t1 * t0 */
        movq    128(%rsp), %rdi
        leaq    32(%rsp), %rsi
        movq    %rsp, %rdx
        FE_INVERT_CALL(fe_mul_avx2)
        movq    136(%rsp), %rsi
        movq    128(%rsp), %rdi
        addq    $0x98, %rsp
        repz retq
#ifndef __APPLE__
.size fe_invert_avx2,.-fe_invert_avx2
#endif /* __APPLE__ */
#undef FE_INVERT_CALL
#if defined(WOLFSSL_CURVE25519_NOT_USE_ED25519)
#ifndef __APPLE__
.data
#else
.section __DATA,__data
#endif /* __APPLE__ */
#ifndef __APPLE__
.align 32
#else
.p2align 5
#endif /* __APPLE__ */
L_curve25519_base_avx2_x2:
.quad 0x5cae469cdd684efb,0x8f3f5ced1e350b5c
.quad 0xd9750c687d157114,0x20d342d51873f1b7
#ifndef __APPLE__
.text
.globl curve25519_base_avx2
.type curve25519_base_avx2,@function
.align 16
curve25519_base_avx2:
#else
.section __TEXT,__text
.globl _curve25519_base_avx2
.p2align 4
_curve25519_base_avx2:
#endif /* __APPLE__ */
        pushq   %rbx
        pushq   %r12
        pushq   %r13
        pushq   %r14
        pushq   %r15
        pushq   %rbp
        subq    $0xb0, %rsp
        movq    $0x00, 168(%rsp)
        movq    %rdi, 160(%rsp)
        # Set base point x
        movq    $9, (%rdi)
        movq    $0x00, 8(%rdi)
        movq    $0x00, 16(%rdi)
        movq    $0x00, 24(%rdi)
        # Set one
        movq    $0x01, (%rsp)
        movq    $0x00, 8(%rsp)
        movq    $0x00, 16(%rsp)
        movq    $0x00, 24(%rsp)
        movq    0+L_curve25519_base_avx2_x2(%rip), %r8
        movq    8+L_curve25519_base_avx2_x2(%rip), %r9
        movq    16+L_curve25519_base_avx2_x2(%rip), %r10
        movq    24+L_curve25519_base_avx2_x2(%rip), %r11
        # Set one
        movq    $0x01, 32(%rsp)
        movq    $0x00, 40(%rsp)
        movq    $0x00, 48(%rsp)
        movq    $0x00, 56(%rsp)
        movq    %r8, 64(%rsp)
        movq    %r9, 72(%rsp)
        movq    %r10, 80(%rsp)
        movq    %r11, 88(%rsp)
        movq    $0xfd, %rbp
L_curve25519_base_avx2_bits:
        movq    168(%rsp), %rax
        movq    %rbp, %rbx
        movq    %rbp, %rcx
        shrq    $6, %rbx
        andq    $63, %rcx
        movq    (%rsi,%rbx,8), %rbx
        shrq    %cl, %rbx
        andq    $0x01, %rbx
        xorq    %rbx, %rax
        negq    %rax
        # Conditional Swap
        movq    (%rdi), %r8
        movq    8(%rdi), %r9
        movq    16(%rdi), %r10
        movq    24(%rdi), %r11
        movq    (%rsp), %r12
        movq    8(%rsp), %r13
        movq
16(%rsp), %r14 movq 24(%rsp), %r15 xorq 64(%rsp), %r8 xorq 72(%rsp), %r9 xorq 80(%rsp), %r10 xorq 88(%rsp), %r11 xorq 32(%rsp), %r12 xorq 40(%rsp), %r13 xorq 48(%rsp), %r14 xorq 56(%rsp), %r15 andq %rax, %r8 andq %rax, %r9 andq %rax, %r10 andq %rax, %r11 andq %rax, %r12 andq %rax, %r13 andq %rax, %r14 andq %rax, %r15 xorq %r8, (%rdi) xorq %r9, 8(%rdi) xorq %r10, 16(%rdi) xorq %r11, 24(%rdi) xorq %r12, (%rsp) xorq %r13, 8(%rsp) xorq %r14, 16(%rsp) xorq %r15, 24(%rsp) xorq %r8, 64(%rsp) xorq %r9, 72(%rsp) xorq %r10, 80(%rsp) xorq %r11, 88(%rsp) xorq %r12, 32(%rsp) xorq %r13, 40(%rsp) xorq %r14, 48(%rsp) xorq %r15, 56(%rsp) movq %rbx, 168(%rsp) # Add-Sub # Add movq (%rdi), %r8 movq 8(%rdi), %r9 movq 16(%rdi), %r10 movq 24(%rdi), %r11 movq %r8, %r12 addq (%rsp), %r8 movq %r9, %r13 adcq 8(%rsp), %r9 movq %r10, %r14 adcq 16(%rsp), %r10 movq %r11, %r15 adcq 24(%rsp), %r11 movq $0x00, %rbx adcq $0x00, %rbx shldq $0x01, %r11, %rbx imulq $19, %rbx btr $63, %r11 # Sub modulus (if overflow) addq %rbx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Sub subq (%rsp), %r12 sbbq 8(%rsp), %r13 sbbq 16(%rsp), %r14 sbbq 24(%rsp), %r15 sbbq %rbx, %rbx shldq $0x01, %r15, %rbx imulq $-19, %rbx btr $63, %r15 # Add modulus (if underflow) subq %rbx, %r12 sbbq $0x00, %r13 sbbq $0x00, %r14 sbbq $0x00, %r15 movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq %r12, 128(%rsp) movq %r13, 136(%rsp) movq %r14, 144(%rsp) movq %r15, 152(%rsp) # Add-Sub # Add movq 64(%rsp), %r8 movq 72(%rsp), %r9 movq 80(%rsp), %r10 movq 88(%rsp), %r11 movq %r8, %r12 addq 32(%rsp), %r8 movq %r9, %r13 adcq 40(%rsp), %r9 movq %r10, %r14 adcq 48(%rsp), %r10 movq %r11, %r15 adcq 56(%rsp), %r11 movq $0x00, %rbx adcq $0x00, %rbx shldq $0x01, %r11, %rbx imulq $19, %rbx btr $63, %r11 # Sub modulus (if overflow) addq %rbx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Sub subq 32(%rsp), %r12 sbbq 40(%rsp), %r13 sbbq 48(%rsp), %r14 sbbq 56(%rsp), %r15 sbbq %rbx, %rbx shldq $0x01, 
%r15, %rbx imulq $-19, %rbx btr $63, %r15 # Add modulus (if underflow) subq %rbx, %r12 sbbq $0x00, %r13 sbbq $0x00, %r14 sbbq $0x00, %r15 movq %r8, 32(%rsp) movq %r9, 40(%rsp) movq %r10, 48(%rsp) movq %r11, 56(%rsp) movq %r12, 96(%rsp) movq %r13, 104(%rsp) movq %r14, 112(%rsp) movq %r15, 120(%rsp) movq 32(%rsp), %rax # Multiply # A[0] * B[0] movq 128(%rsp), %rdx mulxq %rax, %r8, %r9 # A[2] * B[0] mulxq 48(%rsp), %r10, %r11 # A[1] * B[0] mulxq 40(%rsp), %rcx, %rbx xorq %r15, %r15 adcxq %rcx, %r9 # A[3] * B[1] movq 136(%rsp), %rdx mulxq 56(%rsp), %r12, %r13 adcxq %rbx, %r10 # A[0] * B[1] mulxq %rax, %rcx, %rbx adoxq %rcx, %r9 # A[2] * B[1] mulxq 48(%rsp), %rcx, %r14 adoxq %rbx, %r10 adcxq %rcx, %r11 # A[1] * B[2] movq 144(%rsp), %rdx mulxq 40(%rsp), %rcx, %rbx adcxq %r14, %r12 adoxq %rcx, %r11 adcxq %r15, %r13 adoxq %rbx, %r12 # A[0] * B[2] mulxq %rax, %rcx, %rbx adoxq %r15, %r13 xorq %r14, %r14 adcxq %rcx, %r10 # A[1] * B[1] movq 136(%rsp), %rdx mulxq 40(%rsp), %rdx, %rcx adcxq %rbx, %r11 adoxq %rdx, %r10 # A[1] * B[3] movq 152(%rsp), %rdx adoxq %rcx, %r11 mulxq 40(%rsp), %rcx, %rbx adcxq %rcx, %r12 # A[2] * B[2] movq 144(%rsp), %rdx mulxq 48(%rsp), %rdx, %rcx adcxq %rbx, %r13 adoxq %rdx, %r12 # A[3] * B[3] movq 152(%rsp), %rdx adoxq %rcx, %r13 mulxq 56(%rsp), %rcx, %rbx adoxq %r15, %r14 adcxq %rcx, %r14 # A[0] * B[3] mulxq %rax, %rdx, %rcx adcxq %rbx, %r15 xorq %rbx, %rbx adcxq %rdx, %r11 # A[3] * B[0] movq 56(%rsp), %rdx adcxq %rcx, %r12 mulxq 128(%rsp), %rdx, %rcx adoxq %rdx, %r11 adoxq %rcx, %r12 # A[3] * B[2] movq 56(%rsp), %rdx mulxq 144(%rsp), %rdx, %rcx adcxq %rdx, %r13 # A[2] * B[3] movq 152(%rsp), %rdx adcxq %rcx, %r14 mulxq 48(%rsp), %rcx, %rdx adcxq %rbx, %r15 adoxq %rcx, %r13 adoxq %rdx, %r14 adoxq %rbx, %r15 movq $38, %rdx mulxq %r15, %r15, %rcx addq %r15, %r11 adcq $0x00, %rcx movq $0x7fffffffffffffff, %rbx shldq $0x01, %r11, %rcx imulq $19, %rcx, %rcx andq %rbx, %r11 xorq %rbx, %rbx adoxq %rcx, %r8 mulxq %r12, %rcx, %r12 adcxq %rcx, %r8 adoxq %r12, 
%r9 mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 adcxq %rbx, %r11 # Store movq %r8, 32(%rsp) movq %r9, 40(%rsp) movq %r10, 48(%rsp) movq %r11, 56(%rsp) movq 96(%rsp), %rax # Multiply # A[0] * B[0] movq (%rdi), %rdx mulxq %rax, %r8, %r9 # A[2] * B[0] mulxq 112(%rsp), %r10, %r11 # A[1] * B[0] mulxq 104(%rsp), %rcx, %rbx xorq %r15, %r15 adcxq %rcx, %r9 # A[3] * B[1] movq 8(%rdi), %rdx mulxq 120(%rsp), %r12, %r13 adcxq %rbx, %r10 # A[0] * B[1] mulxq %rax, %rcx, %rbx adoxq %rcx, %r9 # A[2] * B[1] mulxq 112(%rsp), %rcx, %r14 adoxq %rbx, %r10 adcxq %rcx, %r11 # A[1] * B[2] movq 16(%rdi), %rdx mulxq 104(%rsp), %rcx, %rbx adcxq %r14, %r12 adoxq %rcx, %r11 adcxq %r15, %r13 adoxq %rbx, %r12 # A[0] * B[2] mulxq %rax, %rcx, %rbx adoxq %r15, %r13 xorq %r14, %r14 adcxq %rcx, %r10 # A[1] * B[1] movq 8(%rdi), %rdx mulxq 104(%rsp), %rdx, %rcx adcxq %rbx, %r11 adoxq %rdx, %r10 # A[1] * B[3] movq 24(%rdi), %rdx adoxq %rcx, %r11 mulxq 104(%rsp), %rcx, %rbx adcxq %rcx, %r12 # A[2] * B[2] movq 16(%rdi), %rdx mulxq 112(%rsp), %rdx, %rcx adcxq %rbx, %r13 adoxq %rdx, %r12 # A[3] * B[3] movq 24(%rdi), %rdx adoxq %rcx, %r13 mulxq 120(%rsp), %rcx, %rbx adoxq %r15, %r14 adcxq %rcx, %r14 # A[0] * B[3] mulxq %rax, %rdx, %rcx adcxq %rbx, %r15 xorq %rbx, %rbx adcxq %rdx, %r11 # A[3] * B[0] movq 120(%rsp), %rdx adcxq %rcx, %r12 mulxq (%rdi), %rdx, %rcx adoxq %rdx, %r11 adoxq %rcx, %r12 # A[3] * B[2] movq 120(%rsp), %rdx mulxq 16(%rdi), %rdx, %rcx adcxq %rdx, %r13 # A[2] * B[3] movq 24(%rdi), %rdx adcxq %rcx, %r14 mulxq 112(%rsp), %rcx, %rdx adcxq %rbx, %r15 adoxq %rcx, %r13 adoxq %rdx, %r14 adoxq %rbx, %r15 movq $38, %rdx mulxq %r15, %r15, %rcx addq %r15, %r11 adcq $0x00, %rcx movq $0x7fffffffffffffff, %rbx shldq $0x01, %r11, %rcx imulq $19, %rcx, %rcx andq %rbx, %r11 xorq %rbx, %rbx adoxq %rcx, %r8 mulxq %r12, %rcx, %r12 adcxq %rcx, %r8 adoxq %r12, %r9 mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 mulxq %r14, %rcx, %r14 
adcxq %rcx, %r10 adoxq %r14, %r11 adcxq %rbx, %r11 # Store movq %r8, (%rsp) movq %r9, 8(%rsp) movq %r10, 16(%rsp) movq %r11, 24(%rsp) # Square movq 128(%rsp), %rdx movq 136(%rsp), %rax # A[0] * A[1] movq %rdx, %r15 mulxq %rax, %r9, %r10 # A[0] * A[3] mulxq 152(%rsp), %r11, %r12 # A[2] * A[1] movq 144(%rsp), %rdx mulxq %rax, %rcx, %rbx xorq %r8, %r8 adoxq %rcx, %r11 # A[2] * A[3] mulxq 152(%rsp), %r13, %r14 adoxq %rbx, %r12 # A[2] * A[0] mulxq %r15, %rcx, %rbx adoxq %r8, %r13 adcxq %rcx, %r10 adoxq %r8, %r14 # A[1] * A[3] movq %rax, %rdx mulxq 152(%rsp), %rcx, %rdx adcxq %rbx, %r11 adcxq %rcx, %r12 adcxq %rdx, %r13 adcxq %r8, %r14 # A[0] * A[0] movq %r15, %rdx mulxq %rdx, %r8, %rcx xorq %r15, %r15 adcxq %r9, %r9 # A[1] * A[1] movq %rax, %rdx adoxq %rcx, %r9 mulxq %rdx, %rcx, %rbx adcxq %r10, %r10 adoxq %rcx, %r10 adcxq %r11, %r11 # A[2] * A[2] movq 144(%rsp), %rdx adoxq %rbx, %r11 mulxq %rdx, %rbx, %rcx adcxq %r12, %r12 adoxq %rbx, %r12 adcxq %r13, %r13 # A[3] * A[3] movq 152(%rsp), %rdx adoxq %rcx, %r13 mulxq %rdx, %rcx, %rbx adcxq %r14, %r14 adoxq %rcx, %r14 adcxq %r15, %r15 adoxq %rbx, %r15 movq $38, %rdx mulxq %r15, %r15, %rbx addq %r15, %r11 adcq $0x00, %rbx movq $0x7fffffffffffffff, %rcx shldq $0x01, %r11, %rbx imulq $19, %rbx, %rbx andq %rcx, %r11 xorq %rcx, %rcx adoxq %rbx, %r8 mulxq %r12, %rbx, %r12 adcxq %rbx, %r8 adoxq %r12, %r9 mulxq %r13, %rbx, %r13 adcxq %rbx, %r9 adoxq %r13, %r10 mulxq %r14, %rbx, %r14 adcxq %rbx, %r10 adoxq %r14, %r11 adcxq %rcx, %r11 # Store movq %r8, 96(%rsp) movq %r9, 104(%rsp) movq %r10, 112(%rsp) movq %r11, 120(%rsp) # Square movq (%rdi), %rdx movq 8(%rdi), %rax # A[0] * A[1] movq %rdx, %r15 mulxq %rax, %r9, %r10 # A[0] * A[3] mulxq 24(%rdi), %r11, %r12 # A[2] * A[1] movq 16(%rdi), %rdx mulxq %rax, %rcx, %rbx xorq %r8, %r8 adoxq %rcx, %r11 # A[2] * A[3] mulxq 24(%rdi), %r13, %r14 adoxq %rbx, %r12 # A[2] * A[0] mulxq %r15, %rcx, %rbx adoxq %r8, %r13 adcxq %rcx, %r10 adoxq %r8, %r14 # A[1] * A[3] movq %rax, %rdx mulxq 24(%rdi), 
%rcx, %rdx adcxq %rbx, %r11 adcxq %rcx, %r12 adcxq %rdx, %r13 adcxq %r8, %r14 # A[0] * A[0] movq %r15, %rdx mulxq %rdx, %r8, %rcx xorq %r15, %r15 adcxq %r9, %r9 # A[1] * A[1] movq %rax, %rdx adoxq %rcx, %r9 mulxq %rdx, %rcx, %rbx adcxq %r10, %r10 adoxq %rcx, %r10 adcxq %r11, %r11 # A[2] * A[2] movq 16(%rdi), %rdx adoxq %rbx, %r11 mulxq %rdx, %rbx, %rcx adcxq %r12, %r12 adoxq %rbx, %r12 adcxq %r13, %r13 # A[3] * A[3] movq 24(%rdi), %rdx adoxq %rcx, %r13 mulxq %rdx, %rcx, %rbx adcxq %r14, %r14 adoxq %rcx, %r14 adcxq %r15, %r15 adoxq %rbx, %r15 movq $38, %rdx mulxq %r15, %r15, %rbx addq %r15, %r11 adcq $0x00, %rbx movq $0x7fffffffffffffff, %rcx shldq $0x01, %r11, %rbx imulq $19, %rbx, %rbx andq %rcx, %r11 xorq %rcx, %rcx adoxq %rbx, %r8 mulxq %r12, %rbx, %r12 adcxq %rbx, %r8 adoxq %r12, %r9 mulxq %r13, %rbx, %r13 adcxq %rbx, %r9 adoxq %r13, %r10 mulxq %r14, %rbx, %r14 adcxq %rbx, %r10 adoxq %r14, %r11 adcxq %rcx, %r11 # Store movq %r8, 128(%rsp) movq %r9, 136(%rsp) movq %r10, 144(%rsp) movq %r11, 152(%rsp) # Add-Sub # Add movq (%rsp), %r8 movq 8(%rsp), %r9 movq 16(%rsp), %r10 movq 24(%rsp), %r11 movq %r8, %r12 addq 32(%rsp), %r8 movq %r9, %r13 adcq 40(%rsp), %r9 movq %r10, %r14 adcq 48(%rsp), %r10 movq %r11, %r15 adcq 56(%rsp), %r11 movq $0x00, %rbx adcq $0x00, %rbx shldq $0x01, %r11, %rbx imulq $19, %rbx btr $63, %r11 # Sub modulus (if overflow) addq %rbx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Sub subq 32(%rsp), %r12 sbbq 40(%rsp), %r13 sbbq 48(%rsp), %r14 sbbq 56(%rsp), %r15 sbbq %rbx, %rbx shldq $0x01, %r15, %rbx imulq $-19, %rbx btr $63, %r15 # Add modulus (if underflow) subq %rbx, %r12 sbbq $0x00, %r13 sbbq $0x00, %r14 sbbq $0x00, %r15 movq %r8, 64(%rsp) movq %r9, 72(%rsp) movq %r10, 80(%rsp) movq %r11, 88(%rsp) movq %r12, 32(%rsp) movq %r13, 40(%rsp) movq %r14, 48(%rsp) movq %r15, 56(%rsp) movq 128(%rsp), %rax # Multiply # A[0] * B[0] movq 96(%rsp), %rdx mulxq %rax, %r8, %r9 # A[2] * B[0] mulxq 144(%rsp), %r10, %r11 # A[1] * B[0] mulxq 
136(%rsp), %rcx, %rbx xorq %r15, %r15 adcxq %rcx, %r9 # A[3] * B[1] movq 104(%rsp), %rdx mulxq 152(%rsp), %r12, %r13 adcxq %rbx, %r10 # A[0] * B[1] mulxq %rax, %rcx, %rbx adoxq %rcx, %r9 # A[2] * B[1] mulxq 144(%rsp), %rcx, %r14 adoxq %rbx, %r10 adcxq %rcx, %r11 # A[1] * B[2] movq 112(%rsp), %rdx mulxq 136(%rsp), %rcx, %rbx adcxq %r14, %r12 adoxq %rcx, %r11 adcxq %r15, %r13 adoxq %rbx, %r12 # A[0] * B[2] mulxq %rax, %rcx, %rbx adoxq %r15, %r13 xorq %r14, %r14 adcxq %rcx, %r10 # A[1] * B[1] movq 104(%rsp), %rdx mulxq 136(%rsp), %rdx, %rcx adcxq %rbx, %r11 adoxq %rdx, %r10 # A[1] * B[3] movq 120(%rsp), %rdx adoxq %rcx, %r11 mulxq 136(%rsp), %rcx, %rbx adcxq %rcx, %r12 # A[2] * B[2] movq 112(%rsp), %rdx mulxq 144(%rsp), %rdx, %rcx adcxq %rbx, %r13 adoxq %rdx, %r12 # A[3] * B[3] movq 120(%rsp), %rdx adoxq %rcx, %r13 mulxq 152(%rsp), %rcx, %rbx adoxq %r15, %r14 adcxq %rcx, %r14 # A[0] * B[3] mulxq %rax, %rdx, %rcx adcxq %rbx, %r15 xorq %rbx, %rbx adcxq %rdx, %r11 # A[3] * B[0] movq 152(%rsp), %rdx adcxq %rcx, %r12 mulxq 96(%rsp), %rdx, %rcx adoxq %rdx, %r11 adoxq %rcx, %r12 # A[3] * B[2] movq 152(%rsp), %rdx mulxq 112(%rsp), %rdx, %rcx adcxq %rdx, %r13 # A[2] * B[3] movq 120(%rsp), %rdx adcxq %rcx, %r14 mulxq 144(%rsp), %rcx, %rdx adcxq %rbx, %r15 adoxq %rcx, %r13 adoxq %rdx, %r14 adoxq %rbx, %r15 movq $38, %rdx mulxq %r15, %r15, %rcx addq %r15, %r11 adcq $0x00, %rcx movq $0x7fffffffffffffff, %rbx shldq $0x01, %r11, %rcx imulq $19, %rcx, %rcx andq %rbx, %r11 xorq %rbx, %rbx adoxq %rcx, %r8 mulxq %r12, %rcx, %r12 adcxq %rcx, %r8 adoxq %r12, %r9 mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 adcxq %rbx, %r11 # Store movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) # Sub movq 128(%rsp), %r8 movq 136(%rsp), %r9 movq 144(%rsp), %r10 movq 152(%rsp), %r11 subq 96(%rsp), %r8 sbbq 104(%rsp), %r9 sbbq 112(%rsp), %r10 sbbq 120(%rsp), %r11 sbbq %rbx, %rbx shldq $0x01, %r11, %rbx imulq $-19, 
%rbx btr $63, %r11 # Add modulus (if underflow) subq %rbx, %r8 sbbq $0x00, %r9 sbbq $0x00, %r10 sbbq $0x00, %r11 movq %r8, 128(%rsp) movq %r9, 136(%rsp) movq %r10, 144(%rsp) movq %r11, 152(%rsp) # Square movq 32(%rsp), %rdx movq 40(%rsp), %rax # A[0] * A[1] movq %rdx, %r15 mulxq %rax, %r9, %r10 # A[0] * A[3] mulxq 56(%rsp), %r11, %r12 # A[2] * A[1] movq 48(%rsp), %rdx mulxq %rax, %rcx, %rbx xorq %r8, %r8 adoxq %rcx, %r11 # A[2] * A[3] mulxq 56(%rsp), %r13, %r14 adoxq %rbx, %r12 # A[2] * A[0] mulxq %r15, %rcx, %rbx adoxq %r8, %r13 adcxq %rcx, %r10 adoxq %r8, %r14 # A[1] * A[3] movq %rax, %rdx mulxq 56(%rsp), %rcx, %rdx adcxq %rbx, %r11 adcxq %rcx, %r12 adcxq %rdx, %r13 adcxq %r8, %r14 # A[0] * A[0] movq %r15, %rdx mulxq %rdx, %r8, %rcx xorq %r15, %r15 adcxq %r9, %r9 # A[1] * A[1] movq %rax, %rdx adoxq %rcx, %r9 mulxq %rdx, %rcx, %rbx adcxq %r10, %r10 adoxq %rcx, %r10 adcxq %r11, %r11 # A[2] * A[2] movq 48(%rsp), %rdx adoxq %rbx, %r11 mulxq %rdx, %rbx, %rcx adcxq %r12, %r12 adoxq %rbx, %r12 adcxq %r13, %r13 # A[3] * A[3] movq 56(%rsp), %rdx adoxq %rcx, %r13 mulxq %rdx, %rcx, %rbx adcxq %r14, %r14 adoxq %rcx, %r14 adcxq %r15, %r15 adoxq %rbx, %r15 movq $38, %rdx mulxq %r15, %r15, %rbx addq %r15, %r11 adcq $0x00, %rbx movq $0x7fffffffffffffff, %rcx shldq $0x01, %r11, %rbx imulq $19, %rbx, %rbx andq %rcx, %r11 xorq %rcx, %rcx adoxq %rbx, %r8 mulxq %r12, %rbx, %r12 adcxq %rbx, %r8 adoxq %r12, %r9 mulxq %r13, %rbx, %r13 adcxq %rbx, %r9 adoxq %r13, %r10 mulxq %r14, %rbx, %r14 adcxq %rbx, %r10 adoxq %r14, %r11 adcxq %rcx, %r11 # Store movq %r8, 32(%rsp) movq %r9, 40(%rsp) movq %r10, 48(%rsp) movq %r11, 56(%rsp) # Square movq 64(%rsp), %rdx movq 72(%rsp), %rax # A[0] * A[1] movq %rdx, %r15 mulxq %rax, %r9, %r10 # A[0] * A[3] mulxq 88(%rsp), %r11, %r12 # A[2] * A[1] movq 80(%rsp), %rdx mulxq %rax, %rcx, %rbx xorq %r8, %r8 adoxq %rcx, %r11 # A[2] * A[3] mulxq 88(%rsp), %r13, %r14 adoxq %rbx, %r12 # A[2] * A[0] mulxq %r15, %rcx, %rbx adoxq %r8, %r13 adcxq %rcx, %r10 adoxq %r8, 
%r14 # A[1] * A[3] movq %rax, %rdx mulxq 88(%rsp), %rcx, %rdx adcxq %rbx, %r11 adcxq %rcx, %r12 adcxq %rdx, %r13 adcxq %r8, %r14 # A[0] * A[0] movq %r15, %rdx mulxq %rdx, %r8, %rcx xorq %r15, %r15 adcxq %r9, %r9 # A[1] * A[1] movq %rax, %rdx adoxq %rcx, %r9 mulxq %rdx, %rcx, %rbx adcxq %r10, %r10 adoxq %rcx, %r10 adcxq %r11, %r11 # A[2] * A[2] movq 80(%rsp), %rdx adoxq %rbx, %r11 mulxq %rdx, %rbx, %rcx adcxq %r12, %r12 adoxq %rbx, %r12 adcxq %r13, %r13 # A[3] * A[3] movq 88(%rsp), %rdx adoxq %rcx, %r13 mulxq %rdx, %rcx, %rbx adcxq %r14, %r14 adoxq %rcx, %r14 adcxq %r15, %r15 adoxq %rbx, %r15 movq $38, %rdx mulxq %r15, %r15, %rbx addq %r15, %r11 adcq $0x00, %rbx movq $0x7fffffffffffffff, %rcx shldq $0x01, %r11, %rbx imulq $19, %rbx, %rbx andq %rcx, %r11 xorq %rcx, %rcx adoxq %rbx, %r8 mulxq %r12, %rbx, %r12 adcxq %rbx, %r8 adoxq %r12, %r9 mulxq %r13, %rbx, %r13 adcxq %rbx, %r9 adoxq %r13, %r10 mulxq %r14, %rbx, %r14 adcxq %rbx, %r10 adoxq %r14, %r11 adcxq %rcx, %r11 # Store movq %r8, 64(%rsp) movq %r9, 72(%rsp) movq %r10, 80(%rsp) movq %r11, 88(%rsp) movq $0x1db42, %rdx mulxq 128(%rsp), %r8, %r15 mulxq 136(%rsp), %r9, %r14 mulxq 144(%rsp), %r10, %r13 addq %r15, %r9 mulxq 152(%rsp), %r11, %r12 adcq %r14, %r10 adcq %r13, %r11 adcq $0x00, %r12 addq 96(%rsp), %r8 adcq 104(%rsp), %r9 adcq 112(%rsp), %r10 adcq 120(%rsp), %r11 adcq $0x00, %r12 shldq $0x01, %r11, %r12 btr $63, %r11 imulq $19, %r12, %r12 addq %r12, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 movq %r8, 96(%rsp) movq %r9, 104(%rsp) movq %r10, 112(%rsp) movq %r11, 120(%rsp) movq $9, %rdx mulxq 32(%rsp), %r8, %r15 mulxq 40(%rsp), %r9, %r14 mulxq 48(%rsp), %r10, %r13 addq %r15, %r9 mulxq 56(%rsp), %r11, %r12 adcq %r14, %r10 adcq %r13, %r11 adcq $0x00, %r12 shldq $0x01, %r11, %r12 btr $63, %r11 imulq $19, %r12, %r12 addq %r12, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 movq %r8, 32(%rsp) movq %r9, 40(%rsp) movq %r10, 48(%rsp) movq %r11, 56(%rsp) movq 128(%rsp), %rax # Multiply # A[0] * B[0] 
movq 96(%rsp), %rdx mulxq %rax, %r8, %r9 # A[2] * B[0] mulxq 144(%rsp), %r10, %r11 # A[1] * B[0] mulxq 136(%rsp), %rcx, %rbx xorq %r15, %r15 adcxq %rcx, %r9 # A[3] * B[1] movq 104(%rsp), %rdx mulxq 152(%rsp), %r12, %r13 adcxq %rbx, %r10 # A[0] * B[1] mulxq %rax, %rcx, %rbx adoxq %rcx, %r9 # A[2] * B[1] mulxq 144(%rsp), %rcx, %r14 adoxq %rbx, %r10 adcxq %rcx, %r11 # A[1] * B[2] movq 112(%rsp), %rdx mulxq 136(%rsp), %rcx, %rbx adcxq %r14, %r12 adoxq %rcx, %r11 adcxq %r15, %r13 adoxq %rbx, %r12 # A[0] * B[2] mulxq %rax, %rcx, %rbx adoxq %r15, %r13 xorq %r14, %r14 adcxq %rcx, %r10 # A[1] * B[1] movq 104(%rsp), %rdx mulxq 136(%rsp), %rdx, %rcx adcxq %rbx, %r11 adoxq %rdx, %r10 # A[1] * B[3] movq 120(%rsp), %rdx adoxq %rcx, %r11 mulxq 136(%rsp), %rcx, %rbx adcxq %rcx, %r12 # A[2] * B[2] movq 112(%rsp), %rdx mulxq 144(%rsp), %rdx, %rcx adcxq %rbx, %r13 adoxq %rdx, %r12 # A[3] * B[3] movq 120(%rsp), %rdx adoxq %rcx, %r13 mulxq 152(%rsp), %rcx, %rbx adoxq %r15, %r14 adcxq %rcx, %r14 # A[0] * B[3] mulxq %rax, %rdx, %rcx adcxq %rbx, %r15 xorq %rbx, %rbx adcxq %rdx, %r11 # A[3] * B[0] movq 152(%rsp), %rdx adcxq %rcx, %r12 mulxq 96(%rsp), %rdx, %rcx adoxq %rdx, %r11 adoxq %rcx, %r12 # A[3] * B[2] movq 152(%rsp), %rdx mulxq 112(%rsp), %rdx, %rcx adcxq %rdx, %r13 # A[2] * B[3] movq 120(%rsp), %rdx adcxq %rcx, %r14 mulxq 144(%rsp), %rcx, %rdx adcxq %rbx, %r15 adoxq %rcx, %r13 adoxq %rdx, %r14 adoxq %rbx, %r15 movq $38, %rdx mulxq %r15, %r15, %rcx addq %r15, %r11 adcq $0x00, %rcx movq $0x7fffffffffffffff, %rbx shldq $0x01, %r11, %rcx imulq $19, %rcx, %rcx andq %rbx, %r11 xorq %rbx, %rbx adoxq %rcx, %r8 mulxq %r12, %rcx, %r12 adcxq %rcx, %r8 adoxq %r12, %r9 mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 adcxq %rbx, %r11 # Store movq %r8, (%rsp) movq %r9, 8(%rsp) movq %r10, 16(%rsp) movq %r11, 24(%rsp) decq %rbp cmpq $3, %rbp jge L_curve25519_base_avx2_bits movq 168(%rsp), %rax negq %rax # Conditional Swap movq (%rdi), 
%r8 movq 8(%rdi), %r9 movq 16(%rdi), %r10 movq 24(%rdi), %r11 movq (%rsp), %r12 movq 8(%rsp), %r13 movq 16(%rsp), %r14 movq 24(%rsp), %r15 xorq 64(%rsp), %r8 xorq 72(%rsp), %r9 xorq 80(%rsp), %r10 xorq 88(%rsp), %r11 xorq 32(%rsp), %r12 xorq 40(%rsp), %r13 xorq 48(%rsp), %r14 xorq 56(%rsp), %r15 andq %rax, %r8 andq %rax, %r9 andq %rax, %r10 andq %rax, %r11 andq %rax, %r12 andq %rax, %r13 andq %rax, %r14 andq %rax, %r15 xorq %r8, (%rdi) xorq %r9, 8(%rdi) xorq %r10, 16(%rdi) xorq %r11, 24(%rdi) xorq %r12, (%rsp) xorq %r13, 8(%rsp) xorq %r14, 16(%rsp) xorq %r15, 24(%rsp) xorq %r8, 64(%rsp) xorq %r9, 72(%rsp) xorq %r10, 80(%rsp) xorq %r11, 88(%rsp) xorq %r12, 32(%rsp) xorq %r13, 40(%rsp) xorq %r14, 48(%rsp) xorq %r15, 56(%rsp) L_curve25519_base_avx2_last_3: # Add-Sub # Add movq (%rdi), %r8 movq 8(%rdi), %r9 movq 16(%rdi), %r10 movq 24(%rdi), %r11 movq %r8, %r12 addq (%rsp), %r8 movq %r9, %r13 adcq 8(%rsp), %r9 movq %r10, %r14 adcq 16(%rsp), %r10 movq %r11, %r15 adcq 24(%rsp), %r11 movq $0x00, %rbx adcq $0x00, %rbx shldq $0x01, %r11, %rbx imulq $19, %rbx btr $63, %r11 # Sub modulus (if overflow) addq %rbx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Sub subq (%rsp), %r12 sbbq 8(%rsp), %r13 sbbq 16(%rsp), %r14 sbbq 24(%rsp), %r15 sbbq %rbx, %rbx shldq $0x01, %r15, %rbx imulq $-19, %rbx btr $63, %r15 # Add modulus (if underflow) subq %rbx, %r12 sbbq $0x00, %r13 sbbq $0x00, %r14 sbbq $0x00, %r15 movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq %r12, 128(%rsp) movq %r13, 136(%rsp) movq %r14, 144(%rsp) movq %r15, 152(%rsp) # Square movq 128(%rsp), %rdx movq 136(%rsp), %rax # A[0] * A[1] movq %rdx, %r15 mulxq %rax, %r9, %r10 # A[0] * A[3] mulxq 152(%rsp), %r11, %r12 # A[2] * A[1] movq 144(%rsp), %rdx mulxq %rax, %rcx, %rbx xorq %r8, %r8 adoxq %rcx, %r11 # A[2] * A[3] mulxq 152(%rsp), %r13, %r14 adoxq %rbx, %r12 # A[2] * A[0] mulxq %r15, %rcx, %rbx adoxq %r8, %r13 adcxq %rcx, %r10 adoxq %r8, %r14 # A[1] * A[3] movq %rax, %rdx mulxq 
152(%rsp), %rcx, %rdx adcxq %rbx, %r11 adcxq %rcx, %r12 adcxq %rdx, %r13 adcxq %r8, %r14 # A[0] * A[0] movq %r15, %rdx mulxq %rdx, %r8, %rcx xorq %r15, %r15 adcxq %r9, %r9 # A[1] * A[1] movq %rax, %rdx adoxq %rcx, %r9 mulxq %rdx, %rcx, %rbx adcxq %r10, %r10 adoxq %rcx, %r10 adcxq %r11, %r11 # A[2] * A[2] movq 144(%rsp), %rdx adoxq %rbx, %r11 mulxq %rdx, %rbx, %rcx adcxq %r12, %r12 adoxq %rbx, %r12 adcxq %r13, %r13 # A[3] * A[3] movq 152(%rsp), %rdx adoxq %rcx, %r13 mulxq %rdx, %rcx, %rbx adcxq %r14, %r14 adoxq %rcx, %r14 adcxq %r15, %r15 adoxq %rbx, %r15 movq $38, %rdx mulxq %r15, %r15, %rbx addq %r15, %r11 adcq $0x00, %rbx movq $0x7fffffffffffffff, %rcx shldq $0x01, %r11, %rbx imulq $19, %rbx, %rbx andq %rcx, %r11 xorq %rcx, %rcx adoxq %rbx, %r8 mulxq %r12, %rbx, %r12 adcxq %rbx, %r8 adoxq %r12, %r9 mulxq %r13, %rbx, %r13 adcxq %rbx, %r9 adoxq %r13, %r10 mulxq %r14, %rbx, %r14 adcxq %rbx, %r10 adoxq %r14, %r11 adcxq %rcx, %r11 # Store movq %r8, 96(%rsp) movq %r9, 104(%rsp) movq %r10, 112(%rsp) movq %r11, 120(%rsp) # Square movq (%rdi), %rdx movq 8(%rdi), %rax # A[0] * A[1] movq %rdx, %r15 mulxq %rax, %r9, %r10 # A[0] * A[3] mulxq 24(%rdi), %r11, %r12 # A[2] * A[1] movq 16(%rdi), %rdx mulxq %rax, %rcx, %rbx xorq %r8, %r8 adoxq %rcx, %r11 # A[2] * A[3] mulxq 24(%rdi), %r13, %r14 adoxq %rbx, %r12 # A[2] * A[0] mulxq %r15, %rcx, %rbx adoxq %r8, %r13 adcxq %rcx, %r10 adoxq %r8, %r14 # A[1] * A[3] movq %rax, %rdx mulxq 24(%rdi), %rcx, %rdx adcxq %rbx, %r11 adcxq %rcx, %r12 adcxq %rdx, %r13 adcxq %r8, %r14 # A[0] * A[0] movq %r15, %rdx mulxq %rdx, %r8, %rcx xorq %r15, %r15 adcxq %r9, %r9 # A[1] * A[1] movq %rax, %rdx adoxq %rcx, %r9 mulxq %rdx, %rcx, %rbx adcxq %r10, %r10 adoxq %rcx, %r10 adcxq %r11, %r11 # A[2] * A[2] movq 16(%rdi), %rdx adoxq %rbx, %r11 mulxq %rdx, %rbx, %rcx adcxq %r12, %r12 adoxq %rbx, %r12 adcxq %r13, %r13 # A[3] * A[3] movq 24(%rdi), %rdx adoxq %rcx, %r13 mulxq %rdx, %rcx, %rbx adcxq %r14, %r14 adoxq %rcx, %r14 adcxq %r15, %r15 adoxq %rbx, %r15 
movq $38, %rdx mulxq %r15, %r15, %rbx addq %r15, %r11 adcq $0x00, %rbx movq $0x7fffffffffffffff, %rcx shldq $0x01, %r11, %rbx imulq $19, %rbx, %rbx andq %rcx, %r11 xorq %rcx, %rcx adoxq %rbx, %r8 mulxq %r12, %rbx, %r12 adcxq %rbx, %r8 adoxq %r12, %r9 mulxq %r13, %rbx, %r13 adcxq %rbx, %r9 adoxq %r13, %r10 mulxq %r14, %rbx, %r14 adcxq %rbx, %r10 adoxq %r14, %r11 adcxq %rcx, %r11 # Store movq %r8, 128(%rsp) movq %r9, 136(%rsp) movq %r10, 144(%rsp) movq %r11, 152(%rsp) movq 128(%rsp), %rax # Multiply # A[0] * B[0] movq 96(%rsp), %rdx mulxq %rax, %r8, %r9 # A[2] * B[0] mulxq 144(%rsp), %r10, %r11 # A[1] * B[0] mulxq 136(%rsp), %rcx, %rbx xorq %r15, %r15 adcxq %rcx, %r9 # A[3] * B[1] movq 104(%rsp), %rdx mulxq 152(%rsp), %r12, %r13 adcxq %rbx, %r10 # A[0] * B[1] mulxq %rax, %rcx, %rbx adoxq %rcx, %r9 # A[2] * B[1] mulxq 144(%rsp), %rcx, %r14 adoxq %rbx, %r10 adcxq %rcx, %r11 # A[1] * B[2] movq 112(%rsp), %rdx mulxq 136(%rsp), %rcx, %rbx adcxq %r14, %r12 adoxq %rcx, %r11 adcxq %r15, %r13 adoxq %rbx, %r12 # A[0] * B[2] mulxq %rax, %rcx, %rbx adoxq %r15, %r13 xorq %r14, %r14 adcxq %rcx, %r10 # A[1] * B[1] movq 104(%rsp), %rdx mulxq 136(%rsp), %rdx, %rcx adcxq %rbx, %r11 adoxq %rdx, %r10 # A[1] * B[3] movq 120(%rsp), %rdx adoxq %rcx, %r11 mulxq 136(%rsp), %rcx, %rbx adcxq %rcx, %r12 # A[2] * B[2] movq 112(%rsp), %rdx mulxq 144(%rsp), %rdx, %rcx adcxq %rbx, %r13 adoxq %rdx, %r12 # A[3] * B[3] movq 120(%rsp), %rdx adoxq %rcx, %r13 mulxq 152(%rsp), %rcx, %rbx adoxq %r15, %r14 adcxq %rcx, %r14 # A[0] * B[3] mulxq %rax, %rdx, %rcx adcxq %rbx, %r15 xorq %rbx, %rbx adcxq %rdx, %r11 # A[3] * B[0] movq 152(%rsp), %rdx adcxq %rcx, %r12 mulxq 96(%rsp), %rdx, %rcx adoxq %rdx, %r11 adoxq %rcx, %r12 # A[3] * B[2] movq 152(%rsp), %rdx mulxq 112(%rsp), %rdx, %rcx adcxq %rdx, %r13 # A[2] * B[3] movq 120(%rsp), %rdx adcxq %rcx, %r14 mulxq 144(%rsp), %rcx, %rdx adcxq %rbx, %r15 adoxq %rcx, %r13 adoxq %rdx, %r14 adoxq %rbx, %r15 movq $38, %rdx mulxq %r15, %r15, %rcx addq %r15, %r11 adcq $0x00, 
%rcx movq $0x7fffffffffffffff, %rbx shldq $0x01, %r11, %rcx imulq $19, %rcx, %rcx andq %rbx, %r11 xorq %rbx, %rbx adoxq %rcx, %r8 mulxq %r12, %rcx, %r12 adcxq %rcx, %r8 adoxq %r12, %r9 mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 adcxq %rbx, %r11 # Store movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) # Sub movq 128(%rsp), %r8 movq 136(%rsp), %r9 movq 144(%rsp), %r10 movq 152(%rsp), %r11 subq 96(%rsp), %r8 sbbq 104(%rsp), %r9 sbbq 112(%rsp), %r10 sbbq 120(%rsp), %r11 sbbq %rbx, %rbx shldq $0x01, %r11, %rbx imulq $-19, %rbx btr $63, %r11 # Add modulus (if underflow) subq %rbx, %r8 sbbq $0x00, %r9 sbbq $0x00, %r10 sbbq $0x00, %r11 movq %r8, 128(%rsp) movq %r9, 136(%rsp) movq %r10, 144(%rsp) movq %r11, 152(%rsp) movq $0x1db42, %rdx mulxq 128(%rsp), %r8, %r15 mulxq 136(%rsp), %r9, %r14 mulxq 144(%rsp), %r10, %r13 addq %r15, %r9 mulxq 152(%rsp), %r11, %r12 adcq %r14, %r10 adcq %r13, %r11 adcq $0x00, %r12 addq 96(%rsp), %r8 adcq 104(%rsp), %r9 adcq 112(%rsp), %r10 adcq 120(%rsp), %r11 adcq $0x00, %r12 shldq $0x01, %r11, %r12 btr $63, %r11 imulq $19, %r12, %r12 addq %r12, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 movq %r8, 96(%rsp) movq %r9, 104(%rsp) movq %r10, 112(%rsp) movq %r11, 120(%rsp) movq 128(%rsp), %rax # Multiply # A[0] * B[0] movq 96(%rsp), %rdx mulxq %rax, %r8, %r9 # A[2] * B[0] mulxq 144(%rsp), %r10, %r11 # A[1] * B[0] mulxq 136(%rsp), %rcx, %rbx xorq %r15, %r15 adcxq %rcx, %r9 # A[3] * B[1] movq 104(%rsp), %rdx mulxq 152(%rsp), %r12, %r13 adcxq %rbx, %r10 # A[0] * B[1] mulxq %rax, %rcx, %rbx adoxq %rcx, %r9 # A[2] * B[1] mulxq 144(%rsp), %rcx, %r14 adoxq %rbx, %r10 adcxq %rcx, %r11 # A[1] * B[2] movq 112(%rsp), %rdx mulxq 136(%rsp), %rcx, %rbx adcxq %r14, %r12 adoxq %rcx, %r11 adcxq %r15, %r13 adoxq %rbx, %r12 # A[0] * B[2] mulxq %rax, %rcx, %rbx adoxq %r15, %r13 xorq %r14, %r14 adcxq %rcx, %r10 # A[1] * B[1] movq 104(%rsp), %rdx mulxq 136(%rsp), %rdx, 
%rcx adcxq %rbx, %r11 adoxq %rdx, %r10 # A[1] * B[3] movq 120(%rsp), %rdx adoxq %rcx, %r11 mulxq 136(%rsp), %rcx, %rbx adcxq %rcx, %r12 # A[2] * B[2] movq 112(%rsp), %rdx mulxq 144(%rsp), %rdx, %rcx adcxq %rbx, %r13 adoxq %rdx, %r12 # A[3] * B[3] movq 120(%rsp), %rdx adoxq %rcx, %r13 mulxq 152(%rsp), %rcx, %rbx adoxq %r15, %r14 adcxq %rcx, %r14 # A[0] * B[3] mulxq %rax, %rdx, %rcx adcxq %rbx, %r15 xorq %rbx, %rbx adcxq %rdx, %r11 # A[3] * B[0] movq 152(%rsp), %rdx adcxq %rcx, %r12 mulxq 96(%rsp), %rdx, %rcx adoxq %rdx, %r11 adoxq %rcx, %r12 # A[3] * B[2] movq 152(%rsp), %rdx mulxq 112(%rsp), %rdx, %rcx adcxq %rdx, %r13 # A[2] * B[3] movq 120(%rsp), %rdx adcxq %rcx, %r14 mulxq 144(%rsp), %rcx, %rdx adcxq %rbx, %r15 adoxq %rcx, %r13 adoxq %rdx, %r14 adoxq %rbx, %r15 movq $38, %rdx mulxq %r15, %r15, %rcx addq %r15, %r11 adcq $0x00, %rcx movq $0x7fffffffffffffff, %rbx shldq $0x01, %r11, %rcx imulq $19, %rcx, %rcx andq %rbx, %r11 xorq %rbx, %rbx adoxq %rcx, %r8 mulxq %r12, %rcx, %r12 adcxq %rcx, %r8 adoxq %r12, %r9 mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 adcxq %rbx, %r11 # Store movq %r8, (%rsp) movq %r9, 8(%rsp) movq %r10, 16(%rsp) movq %r11, 24(%rsp) decq %rbp jge L_curve25519_base_avx2_last_3 # Invert leaq 32(%rsp), %rdi movq %rsp, %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 32(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 64(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi movq %rsp, %rsi leaq 64(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi leaq 32(%rsp), %rsi leaq 64(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 32(%rsp), %rsi 
#ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 64(%rsp), %rsi leaq 96(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 64(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi movq $4, %rdx #ifndef __APPLE__ callq fe_sq_n_avx2@plt #else callq _fe_sq_n_avx2 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 96(%rsp), %rsi leaq 64(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 64(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi movq $9, %rdx #ifndef __APPLE__ callq fe_sq_n_avx2@plt #else callq _fe_sq_n_avx2 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi leaq 64(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ leaq 128(%rsp), %rdi leaq 96(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 128(%rsp), %rdi leaq 128(%rsp), %rsi movq $19, %rdx #ifndef __APPLE__ callq fe_sq_n_avx2@plt #else callq _fe_sq_n_avx2 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 128(%rsp), %rsi leaq 96(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi movq $9, %rdx #ifndef __APPLE__ callq fe_sq_n_avx2@plt #else callq _fe_sq_n_avx2 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 96(%rsp), %rsi leaq 64(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 64(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else 
callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi movq $49, %rdx #ifndef __APPLE__ callq fe_sq_n_avx2@plt #else callq _fe_sq_n_avx2 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi leaq 64(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ leaq 128(%rsp), %rdi leaq 96(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 128(%rsp), %rdi leaq 128(%rsp), %rsi movq $0x63, %rdx #ifndef __APPLE__ callq fe_sq_n_avx2@plt #else callq _fe_sq_n_avx2 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 128(%rsp), %rsi leaq 96(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi movq $49, %rdx #ifndef __APPLE__ callq fe_sq_n_avx2@plt #else callq _fe_sq_n_avx2 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 96(%rsp), %rsi leaq 64(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 64(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 64(%rsp), %rsi movq $4, %rdx #ifndef __APPLE__ callq fe_sq_n_avx2@plt #else callq _fe_sq_n_avx2 #endif /* __APPLE__ */ movq %rsp, %rdi leaq 64(%rsp), %rsi leaq 32(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ movq 160(%rsp), %rdi movq (%rdi), %rax # Multiply # A[0] * B[0] movq (%rsp), %rdx mulxq %rax, %r8, %r9 # A[2] * B[0] mulxq 16(%rdi), %r10, %r11 # A[1] * B[0] mulxq 8(%rdi), %rcx, %rbx xorq %r15, %r15 adcxq %rcx, %r9 # A[3] * B[1] movq 8(%rsp), %rdx mulxq 24(%rdi), %r12, %r13 adcxq %rbx, %r10 # A[0] * B[1] mulxq %rax, %rcx, %rbx adoxq %rcx, %r9 # A[2] * B[1] mulxq 16(%rdi), %rcx, %r14 adoxq %rbx, %r10 
adcxq %rcx, %r11 # A[1] * B[2] movq 16(%rsp), %rdx mulxq 8(%rdi), %rcx, %rbx adcxq %r14, %r12 adoxq %rcx, %r11 adcxq %r15, %r13 adoxq %rbx, %r12 # A[0] * B[2] mulxq %rax, %rcx, %rbx adoxq %r15, %r13 xorq %r14, %r14 adcxq %rcx, %r10 # A[1] * B[1] movq 8(%rsp), %rdx mulxq 8(%rdi), %rdx, %rcx adcxq %rbx, %r11 adoxq %rdx, %r10 # A[1] * B[3] movq 24(%rsp), %rdx adoxq %rcx, %r11 mulxq 8(%rdi), %rcx, %rbx adcxq %rcx, %r12 # A[2] * B[2] movq 16(%rsp), %rdx mulxq 16(%rdi), %rdx, %rcx adcxq %rbx, %r13 adoxq %rdx, %r12 # A[3] * B[3] movq 24(%rsp), %rdx adoxq %rcx, %r13 mulxq 24(%rdi), %rcx, %rbx adoxq %r15, %r14 adcxq %rcx, %r14 # A[0] * B[3] mulxq %rax, %rdx, %rcx adcxq %rbx, %r15 xorq %rbx, %rbx adcxq %rdx, %r11 # A[3] * B[0] movq 24(%rdi), %rdx adcxq %rcx, %r12 mulxq (%rsp), %rdx, %rcx adoxq %rdx, %r11 adoxq %rcx, %r12 # A[3] * B[2] movq 24(%rdi), %rdx mulxq 16(%rsp), %rdx, %rcx adcxq %rdx, %r13 # A[2] * B[3] movq 24(%rsp), %rdx adcxq %rcx, %r14 mulxq 16(%rdi), %rcx, %rdx adcxq %rbx, %r15 adoxq %rcx, %r13 adoxq %rdx, %r14 adoxq %rbx, %r15 movq $38, %rdx mulxq %r15, %r15, %rcx addq %r15, %r11 adcq $0x00, %rcx movq $0x7fffffffffffffff, %rbx shldq $0x01, %r11, %rcx imulq $19, %rcx, %rcx andq %rbx, %r11 xorq %rbx, %rbx adoxq %rcx, %r8 mulxq %r12, %rcx, %r12 adcxq %rcx, %r8 adoxq %r12, %r9 mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 adcxq %rbx, %r11 movq $0x7fffffffffffffff, %rbx movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rbx, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 movq $0x7fffffffffffffff, %rcx movq %r8, %rdx addq $19, %rdx movq %r9, %rdx adcq $0x00, %rdx movq %r10, %rdx adcq $0x00, %rdx movq %r11, %rdx adcq $0x00, %rdx sarq $63, %rdx andq $19, %rdx andq %rcx, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) xorq %rax, %rax addq $0xb0, %rsp popq %rbp popq %r15 popq 
%r14 popq %r13 popq %r12 popq %rbx repz retq #ifndef __APPLE__ .size curve25519_base_avx2,.-curve25519_base_avx2 #endif /* __APPLE__ */ #endif /* WOLFSSL_CURVE25519_NOT_USE_ED25519 */ #ifndef __APPLE__ .text .globl curve25519_avx2 .type curve25519_avx2,@function .align 16 curve25519_avx2: #else .section __TEXT,__text .globl _curve25519_avx2 .p2align 4 _curve25519_avx2: #endif /* __APPLE__ */ pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 pushq %rbp movq %rdx, %r8 subq $0xb8, %rsp movq $0x00, 176(%rsp) movq %rdi, 168(%rsp) # Set one movq $0x01, (%rdi) movq $0x00, 8(%rdi) movq $0x00, 16(%rdi) movq $0x00, 24(%rdi) # Set zero movq $0x00, (%rsp) movq $0x00, 8(%rsp) movq $0x00, 16(%rsp) movq $0x00, 24(%rsp) # Set one movq $0x01, 32(%rsp) movq $0x00, 40(%rsp) movq $0x00, 48(%rsp) movq $0x00, 56(%rsp) # Copy movq (%r8), %r9 movq 8(%r8), %r10 movq 16(%r8), %r11 movq 24(%r8), %r12 movq %r9, 64(%rsp) movq %r10, 72(%rsp) movq %r11, 80(%rsp) movq %r12, 88(%rsp) movq $0xfe, %rbx L_curve25519_avx2_bits: movq %rbx, 160(%rsp) movq %rbx, %rcx movq 176(%rsp), %rax andq $63, %rcx shrq $6, %rbx movq (%rsi,%rbx,8), %rbx shrq %cl, %rbx andq $0x01, %rbx xorq %rbx, %rax movq %rbx, 176(%rsp) negq %rax # Conditional Swap movq (%rdi), %r9 movq 8(%rdi), %r10 movq 16(%rdi), %r11 movq 24(%rdi), %r12 movq (%rsp), %r13 movq 8(%rsp), %r14 movq 16(%rsp), %r15 movq 24(%rsp), %rbp xorq 64(%rsp), %r9 xorq 72(%rsp), %r10 xorq 80(%rsp), %r11 xorq 88(%rsp), %r12 xorq 32(%rsp), %r13 xorq 40(%rsp), %r14 xorq 48(%rsp), %r15 xorq 56(%rsp), %rbp andq %rax, %r9 andq %rax, %r10 andq %rax, %r11 andq %rax, %r12 andq %rax, %r13 andq %rax, %r14 andq %rax, %r15 andq %rax, %rbp xorq %r9, (%rdi) xorq %r10, 8(%rdi) xorq %r11, 16(%rdi) xorq %r12, 24(%rdi) xorq %r13, (%rsp) xorq %r14, 8(%rsp) xorq %r15, 16(%rsp) xorq %rbp, 24(%rsp) xorq %r9, 64(%rsp) xorq %r10, 72(%rsp) xorq %r11, 80(%rsp) xorq %r12, 88(%rsp) xorq %r13, 32(%rsp) xorq %r14, 40(%rsp) xorq %r15, 48(%rsp) xorq %rbp, 56(%rsp) # Add-Sub # Add movq 
(%rdi), %r9 movq 8(%rdi), %r10 movq 16(%rdi), %r11 movq 24(%rdi), %r12 movq %r9, %r13 addq (%rsp), %r9 movq %r10, %r14 adcq 8(%rsp), %r10 movq %r11, %r15 adcq 16(%rsp), %r11 movq %r12, %rbp adcq 24(%rsp), %r12 movq $0x00, %rbx adcq $0x00, %rbx shldq $0x01, %r12, %rbx imulq $19, %rbx btr $63, %r12 # Sub modulus (if overflow) addq %rbx, %r9 adcq $0x00, %r10 adcq $0x00, %r11 adcq $0x00, %r12 # Sub subq (%rsp), %r13 sbbq 8(%rsp), %r14 sbbq 16(%rsp), %r15 sbbq 24(%rsp), %rbp sbbq %rbx, %rbx shldq $0x01, %rbp, %rbx imulq $-19, %rbx btr $63, %rbp # Add modulus (if underflow) subq %rbx, %r13 sbbq $0x00, %r14 sbbq $0x00, %r15 sbbq $0x00, %rbp movq %r9, (%rdi) movq %r10, 8(%rdi) movq %r11, 16(%rdi) movq %r12, 24(%rdi) movq %r13, 128(%rsp) movq %r14, 136(%rsp) movq %r15, 144(%rsp) movq %rbp, 152(%rsp) # Add-Sub # Add movq 64(%rsp), %r9 movq 72(%rsp), %r10 movq 80(%rsp), %r11 movq 88(%rsp), %r12 movq %r9, %r13 addq 32(%rsp), %r9 movq %r10, %r14 adcq 40(%rsp), %r10 movq %r11, %r15 adcq 48(%rsp), %r11 movq %r12, %rbp adcq 56(%rsp), %r12 movq $0x00, %rbx adcq $0x00, %rbx shldq $0x01, %r12, %rbx imulq $19, %rbx btr $63, %r12 # Sub modulus (if overflow) addq %rbx, %r9 adcq $0x00, %r10 adcq $0x00, %r11 adcq $0x00, %r12 # Sub subq 32(%rsp), %r13 sbbq 40(%rsp), %r14 sbbq 48(%rsp), %r15 sbbq 56(%rsp), %rbp sbbq %rbx, %rbx shldq $0x01, %rbp, %rbx imulq $-19, %rbx btr $63, %rbp # Add modulus (if underflow) subq %rbx, %r13 sbbq $0x00, %r14 sbbq $0x00, %r15 sbbq $0x00, %rbp movq %r9, 32(%rsp) movq %r10, 40(%rsp) movq %r11, 48(%rsp) movq %r12, 56(%rsp) movq %r13, 96(%rsp) movq %r14, 104(%rsp) movq %r15, 112(%rsp) movq %rbp, 120(%rsp) movq 32(%rsp), %rax # Multiply # A[0] * B[0] movq 128(%rsp), %rdx mulxq %rax, %r9, %r10 # A[2] * B[0] mulxq 48(%rsp), %r11, %r12 # A[1] * B[0] mulxq 40(%rsp), %rcx, %rbx xorq %rbp, %rbp adcxq %rcx, %r10 # A[3] * B[1] movq 136(%rsp), %rdx mulxq 56(%rsp), %r13, %r14 adcxq %rbx, %r11 # A[0] * B[1] mulxq %rax, %rcx, %rbx adoxq %rcx, %r10 # A[2] * B[1] mulxq 
48(%rsp), %rcx, %r15 adoxq %rbx, %r11 adcxq %rcx, %r12 # A[1] * B[2] movq 144(%rsp), %rdx mulxq 40(%rsp), %rcx, %rbx adcxq %r15, %r13 adoxq %rcx, %r12 adcxq %rbp, %r14 adoxq %rbx, %r13 # A[0] * B[2] mulxq %rax, %rcx, %rbx adoxq %rbp, %r14 xorq %r15, %r15 adcxq %rcx, %r11 # A[1] * B[1] movq 136(%rsp), %rdx mulxq 40(%rsp), %rdx, %rcx adcxq %rbx, %r12 adoxq %rdx, %r11 # A[1] * B[3] movq 152(%rsp), %rdx adoxq %rcx, %r12 mulxq 40(%rsp), %rcx, %rbx adcxq %rcx, %r13 # A[2] * B[2] movq 144(%rsp), %rdx mulxq 48(%rsp), %rdx, %rcx adcxq %rbx, %r14 adoxq %rdx, %r13 # A[3] * B[3] movq 152(%rsp), %rdx adoxq %rcx, %r14 mulxq 56(%rsp), %rcx, %rbx adoxq %rbp, %r15 adcxq %rcx, %r15 # A[0] * B[3] mulxq %rax, %rdx, %rcx adcxq %rbx, %rbp xorq %rbx, %rbx adcxq %rdx, %r12 # A[3] * B[0] movq 56(%rsp), %rdx adcxq %rcx, %r13 mulxq 128(%rsp), %rdx, %rcx adoxq %rdx, %r12 adoxq %rcx, %r13 # A[3] * B[2] movq 56(%rsp), %rdx mulxq 144(%rsp), %rdx, %rcx adcxq %rdx, %r14 # A[2] * B[3] movq 152(%rsp), %rdx adcxq %rcx, %r15 mulxq 48(%rsp), %rcx, %rdx adcxq %rbx, %rbp adoxq %rcx, %r14 adoxq %rdx, %r15 adoxq %rbx, %rbp movq $38, %rdx mulxq %rbp, %rbp, %rcx addq %rbp, %r12 adcq $0x00, %rcx movq $0x7fffffffffffffff, %rbx shldq $0x01, %r12, %rcx imulq $19, %rcx, %rcx andq %rbx, %r12 xorq %rbx, %rbx adoxq %rcx, %r9 mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 mulxq %r15, %rcx, %r15 adcxq %rcx, %r11 adoxq %r15, %r12 adcxq %rbx, %r12 # Store movq %r9, 32(%rsp) movq %r10, 40(%rsp) movq %r11, 48(%rsp) movq %r12, 56(%rsp) movq 96(%rsp), %rax # Multiply # A[0] * B[0] movq (%rdi), %rdx mulxq %rax, %r9, %r10 # A[2] * B[0] mulxq 112(%rsp), %r11, %r12 # A[1] * B[0] mulxq 104(%rsp), %rcx, %rbx xorq %rbp, %rbp adcxq %rcx, %r10 # A[3] * B[1] movq 8(%rdi), %rdx mulxq 120(%rsp), %r13, %r14 adcxq %rbx, %r11 # A[0] * B[1] mulxq %rax, %rcx, %rbx adoxq %rcx, %r10 # A[2] * B[1] mulxq 112(%rsp), %rcx, %r15 adoxq %rbx, %r11 adcxq %rcx, %r12 # A[1] * B[2] movq 
16(%rdi), %rdx mulxq 104(%rsp), %rcx, %rbx adcxq %r15, %r13 adoxq %rcx, %r12 adcxq %rbp, %r14 adoxq %rbx, %r13 # A[0] * B[2] mulxq %rax, %rcx, %rbx adoxq %rbp, %r14 xorq %r15, %r15 adcxq %rcx, %r11 # A[1] * B[1] movq 8(%rdi), %rdx mulxq 104(%rsp), %rdx, %rcx adcxq %rbx, %r12 adoxq %rdx, %r11 # A[1] * B[3] movq 24(%rdi), %rdx adoxq %rcx, %r12 mulxq 104(%rsp), %rcx, %rbx adcxq %rcx, %r13 # A[2] * B[2] movq 16(%rdi), %rdx mulxq 112(%rsp), %rdx, %rcx adcxq %rbx, %r14 adoxq %rdx, %r13 # A[3] * B[3] movq 24(%rdi), %rdx adoxq %rcx, %r14 mulxq 120(%rsp), %rcx, %rbx adoxq %rbp, %r15 adcxq %rcx, %r15 # A[0] * B[3] mulxq %rax, %rdx, %rcx adcxq %rbx, %rbp xorq %rbx, %rbx adcxq %rdx, %r12 # A[3] * B[0] movq 120(%rsp), %rdx adcxq %rcx, %r13 mulxq (%rdi), %rdx, %rcx adoxq %rdx, %r12 adoxq %rcx, %r13 # A[3] * B[2] movq 120(%rsp), %rdx mulxq 16(%rdi), %rdx, %rcx adcxq %rdx, %r14 # A[2] * B[3] movq 24(%rdi), %rdx adcxq %rcx, %r15 mulxq 112(%rsp), %rcx, %rdx adcxq %rbx, %rbp adoxq %rcx, %r14 adoxq %rdx, %r15 adoxq %rbx, %rbp movq $38, %rdx mulxq %rbp, %rbp, %rcx addq %rbp, %r12 adcq $0x00, %rcx movq $0x7fffffffffffffff, %rbx shldq $0x01, %r12, %rcx imulq $19, %rcx, %rcx andq %rbx, %r12 xorq %rbx, %rbx adoxq %rcx, %r9 mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 mulxq %r15, %rcx, %r15 adcxq %rcx, %r11 adoxq %r15, %r12 adcxq %rbx, %r12 # Store movq %r9, (%rsp) movq %r10, 8(%rsp) movq %r11, 16(%rsp) movq %r12, 24(%rsp) # Square movq 128(%rsp), %rdx movq 136(%rsp), %rax # A[0] * A[1] movq %rdx, %rbp mulxq %rax, %r10, %r11 # A[0] * A[3] mulxq 152(%rsp), %r12, %r13 # A[2] * A[1] movq 144(%rsp), %rdx mulxq %rax, %rcx, %rbx xorq %r9, %r9 adoxq %rcx, %r12 # A[2] * A[3] mulxq 152(%rsp), %r14, %r15 adoxq %rbx, %r13 # A[2] * A[0] mulxq %rbp, %rcx, %rbx adoxq %r9, %r14 adcxq %rcx, %r11 adoxq %r9, %r15 # A[1] * A[3] movq %rax, %rdx mulxq 152(%rsp), %rcx, %rdx adcxq %rbx, %r12 adcxq %rcx, %r13 adcxq %rdx, %r14 adcxq %r9, %r15 # 
A[0] * A[0] movq %rbp, %rdx mulxq %rdx, %r9, %rcx xorq %rbp, %rbp adcxq %r10, %r10 # A[1] * A[1] movq %rax, %rdx adoxq %rcx, %r10 mulxq %rdx, %rcx, %rbx adcxq %r11, %r11 adoxq %rcx, %r11 adcxq %r12, %r12 # A[2] * A[2] movq 144(%rsp), %rdx adoxq %rbx, %r12 mulxq %rdx, %rbx, %rcx adcxq %r13, %r13 adoxq %rbx, %r13 adcxq %r14, %r14 # A[3] * A[3] movq 152(%rsp), %rdx adoxq %rcx, %r14 mulxq %rdx, %rcx, %rbx adcxq %r15, %r15 adoxq %rcx, %r15 adcxq %rbp, %rbp adoxq %rbx, %rbp movq $38, %rdx mulxq %rbp, %rbp, %rbx addq %rbp, %r12 adcq $0x00, %rbx movq $0x7fffffffffffffff, %rcx shldq $0x01, %r12, %rbx imulq $19, %rbx, %rbx andq %rcx, %r12 xorq %rcx, %rcx adoxq %rbx, %r9 mulxq %r13, %rbx, %r13 adcxq %rbx, %r9 adoxq %r13, %r10 mulxq %r14, %rbx, %r14 adcxq %rbx, %r10 adoxq %r14, %r11 mulxq %r15, %rbx, %r15 adcxq %rbx, %r11 adoxq %r15, %r12 adcxq %rcx, %r12 # Store movq %r9, 96(%rsp) movq %r10, 104(%rsp) movq %r11, 112(%rsp) movq %r12, 120(%rsp) # Square movq (%rdi), %rdx movq 8(%rdi), %rax # A[0] * A[1] movq %rdx, %rbp mulxq %rax, %r10, %r11 # A[0] * A[3] mulxq 24(%rdi), %r12, %r13 # A[2] * A[1] movq 16(%rdi), %rdx mulxq %rax, %rcx, %rbx xorq %r9, %r9 adoxq %rcx, %r12 # A[2] * A[3] mulxq 24(%rdi), %r14, %r15 adoxq %rbx, %r13 # A[2] * A[0] mulxq %rbp, %rcx, %rbx adoxq %r9, %r14 adcxq %rcx, %r11 adoxq %r9, %r15 # A[1] * A[3] movq %rax, %rdx mulxq 24(%rdi), %rcx, %rdx adcxq %rbx, %r12 adcxq %rcx, %r13 adcxq %rdx, %r14 adcxq %r9, %r15 # A[0] * A[0] movq %rbp, %rdx mulxq %rdx, %r9, %rcx xorq %rbp, %rbp adcxq %r10, %r10 # A[1] * A[1] movq %rax, %rdx adoxq %rcx, %r10 mulxq %rdx, %rcx, %rbx adcxq %r11, %r11 adoxq %rcx, %r11 adcxq %r12, %r12 # A[2] * A[2] movq 16(%rdi), %rdx adoxq %rbx, %r12 mulxq %rdx, %rbx, %rcx adcxq %r13, %r13 adoxq %rbx, %r13 adcxq %r14, %r14 # A[3] * A[3] movq 24(%rdi), %rdx adoxq %rcx, %r14 mulxq %rdx, %rcx, %rbx adcxq %r15, %r15 adoxq %rcx, %r15 adcxq %rbp, %rbp adoxq %rbx, %rbp movq $38, %rdx mulxq %rbp, %rbp, %rbx addq %rbp, %r12 adcq $0x00, %rbx movq 
$0x7fffffffffffffff, %rcx shldq $0x01, %r12, %rbx imulq $19, %rbx, %rbx andq %rcx, %r12 xorq %rcx, %rcx adoxq %rbx, %r9 mulxq %r13, %rbx, %r13 adcxq %rbx, %r9 adoxq %r13, %r10 mulxq %r14, %rbx, %r14 adcxq %rbx, %r10 adoxq %r14, %r11 mulxq %r15, %rbx, %r15 adcxq %rbx, %r11 adoxq %r15, %r12 adcxq %rcx, %r12 # Store movq %r9, 128(%rsp) movq %r10, 136(%rsp) movq %r11, 144(%rsp) movq %r12, 152(%rsp) # Add-Sub # Add movq (%rsp), %r9 movq 8(%rsp), %r10 movq 16(%rsp), %r11 movq 24(%rsp), %r12 movq %r9, %r13 addq 32(%rsp), %r9 movq %r10, %r14 adcq 40(%rsp), %r10 movq %r11, %r15 adcq 48(%rsp), %r11 movq %r12, %rbp adcq 56(%rsp), %r12 movq $0x00, %rbx adcq $0x00, %rbx shldq $0x01, %r12, %rbx imulq $19, %rbx btr $63, %r12 # Sub modulus (if overflow) addq %rbx, %r9 adcq $0x00, %r10 adcq $0x00, %r11 adcq $0x00, %r12 # Sub subq 32(%rsp), %r13 sbbq 40(%rsp), %r14 sbbq 48(%rsp), %r15 sbbq 56(%rsp), %rbp sbbq %rbx, %rbx shldq $0x01, %rbp, %rbx imulq $-19, %rbx btr $63, %rbp # Add modulus (if underflow) subq %rbx, %r13 sbbq $0x00, %r14 sbbq $0x00, %r15 sbbq $0x00, %rbp movq %r9, 64(%rsp) movq %r10, 72(%rsp) movq %r11, 80(%rsp) movq %r12, 88(%rsp) movq %r13, 32(%rsp) movq %r14, 40(%rsp) movq %r15, 48(%rsp) movq %rbp, 56(%rsp) movq 128(%rsp), %rax # Multiply # A[0] * B[0] movq 96(%rsp), %rdx mulxq %rax, %r9, %r10 # A[2] * B[0] mulxq 144(%rsp), %r11, %r12 # A[1] * B[0] mulxq 136(%rsp), %rcx, %rbx xorq %rbp, %rbp adcxq %rcx, %r10 # A[3] * B[1] movq 104(%rsp), %rdx mulxq 152(%rsp), %r13, %r14 adcxq %rbx, %r11 # A[0] * B[1] mulxq %rax, %rcx, %rbx adoxq %rcx, %r10 # A[2] * B[1] mulxq 144(%rsp), %rcx, %r15 adoxq %rbx, %r11 adcxq %rcx, %r12 # A[1] * B[2] movq 112(%rsp), %rdx mulxq 136(%rsp), %rcx, %rbx adcxq %r15, %r13 adoxq %rcx, %r12 adcxq %rbp, %r14 adoxq %rbx, %r13 # A[0] * B[2] mulxq %rax, %rcx, %rbx adoxq %rbp, %r14 xorq %r15, %r15 adcxq %rcx, %r11 # A[1] * B[1] movq 104(%rsp), %rdx mulxq 136(%rsp), %rdx, %rcx adcxq %rbx, %r12 adoxq %rdx, %r11 # A[1] * B[3] movq 120(%rsp), %rdx adoxq 
%rcx, %r12 mulxq 136(%rsp), %rcx, %rbx adcxq %rcx, %r13 # A[2] * B[2] movq 112(%rsp), %rdx mulxq 144(%rsp), %rdx, %rcx adcxq %rbx, %r14 adoxq %rdx, %r13 # A[3] * B[3] movq 120(%rsp), %rdx adoxq %rcx, %r14 mulxq 152(%rsp), %rcx, %rbx adoxq %rbp, %r15 adcxq %rcx, %r15 # A[0] * B[3] mulxq %rax, %rdx, %rcx adcxq %rbx, %rbp xorq %rbx, %rbx adcxq %rdx, %r12 # A[3] * B[0] movq 152(%rsp), %rdx adcxq %rcx, %r13 mulxq 96(%rsp), %rdx, %rcx adoxq %rdx, %r12 adoxq %rcx, %r13 # A[3] * B[2] movq 152(%rsp), %rdx mulxq 112(%rsp), %rdx, %rcx adcxq %rdx, %r14 # A[2] * B[3] movq 120(%rsp), %rdx adcxq %rcx, %r15 mulxq 144(%rsp), %rcx, %rdx adcxq %rbx, %rbp adoxq %rcx, %r14 adoxq %rdx, %r15 adoxq %rbx, %rbp movq $38, %rdx mulxq %rbp, %rbp, %rcx addq %rbp, %r12 adcq $0x00, %rcx movq $0x7fffffffffffffff, %rbx shldq $0x01, %r12, %rcx imulq $19, %rcx, %rcx andq %rbx, %r12 xorq %rbx, %rbx adoxq %rcx, %r9 mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 mulxq %r15, %rcx, %r15 adcxq %rcx, %r11 adoxq %r15, %r12 adcxq %rbx, %r12 # Store movq %r9, (%rdi) movq %r10, 8(%rdi) movq %r11, 16(%rdi) movq %r12, 24(%rdi) # Sub movq 128(%rsp), %r9 movq 136(%rsp), %r10 movq 144(%rsp), %r11 movq 152(%rsp), %r12 subq 96(%rsp), %r9 sbbq 104(%rsp), %r10 sbbq 112(%rsp), %r11 sbbq 120(%rsp), %r12 sbbq %rbx, %rbx shldq $0x01, %r12, %rbx imulq $-19, %rbx btr $63, %r12 # Add modulus (if underflow) subq %rbx, %r9 sbbq $0x00, %r10 sbbq $0x00, %r11 sbbq $0x00, %r12 movq %r9, 128(%rsp) movq %r10, 136(%rsp) movq %r11, 144(%rsp) movq %r12, 152(%rsp) # Square movq 32(%rsp), %rdx movq 40(%rsp), %rax # A[0] * A[1] movq %rdx, %rbp mulxq %rax, %r10, %r11 # A[0] * A[3] mulxq 56(%rsp), %r12, %r13 # A[2] * A[1] movq 48(%rsp), %rdx mulxq %rax, %rcx, %rbx xorq %r9, %r9 adoxq %rcx, %r12 # A[2] * A[3] mulxq 56(%rsp), %r14, %r15 adoxq %rbx, %r13 # A[2] * A[0] mulxq %rbp, %rcx, %rbx adoxq %r9, %r14 adcxq %rcx, %r11 adoxq %r9, %r15 # A[1] * A[3] movq %rax, %rdx mulxq 
56(%rsp), %rcx, %rdx adcxq %rbx, %r12 adcxq %rcx, %r13 adcxq %rdx, %r14 adcxq %r9, %r15 # A[0] * A[0] movq %rbp, %rdx mulxq %rdx, %r9, %rcx xorq %rbp, %rbp adcxq %r10, %r10 # A[1] * A[1] movq %rax, %rdx adoxq %rcx, %r10 mulxq %rdx, %rcx, %rbx adcxq %r11, %r11 adoxq %rcx, %r11 adcxq %r12, %r12 # A[2] * A[2] movq 48(%rsp), %rdx adoxq %rbx, %r12 mulxq %rdx, %rbx, %rcx adcxq %r13, %r13 adoxq %rbx, %r13 adcxq %r14, %r14 # A[3] * A[3] movq 56(%rsp), %rdx adoxq %rcx, %r14 mulxq %rdx, %rcx, %rbx adcxq %r15, %r15 adoxq %rcx, %r15 adcxq %rbp, %rbp adoxq %rbx, %rbp movq $38, %rdx mulxq %rbp, %rbp, %rbx addq %rbp, %r12 adcq $0x00, %rbx movq $0x7fffffffffffffff, %rcx shldq $0x01, %r12, %rbx imulq $19, %rbx, %rbx andq %rcx, %r12 xorq %rcx, %rcx adoxq %rbx, %r9 mulxq %r13, %rbx, %r13 adcxq %rbx, %r9 adoxq %r13, %r10 mulxq %r14, %rbx, %r14 adcxq %rbx, %r10 adoxq %r14, %r11 mulxq %r15, %rbx, %r15 adcxq %rbx, %r11 adoxq %r15, %r12 adcxq %rcx, %r12 # Store movq %r9, 32(%rsp) movq %r10, 40(%rsp) movq %r11, 48(%rsp) movq %r12, 56(%rsp) # Square movq 64(%rsp), %rdx movq 72(%rsp), %rax # A[0] * A[1] movq %rdx, %rbp mulxq %rax, %r10, %r11 # A[0] * A[3] mulxq 88(%rsp), %r12, %r13 # A[2] * A[1] movq 80(%rsp), %rdx mulxq %rax, %rcx, %rbx xorq %r9, %r9 adoxq %rcx, %r12 # A[2] * A[3] mulxq 88(%rsp), %r14, %r15 adoxq %rbx, %r13 # A[2] * A[0] mulxq %rbp, %rcx, %rbx adoxq %r9, %r14 adcxq %rcx, %r11 adoxq %r9, %r15 # A[1] * A[3] movq %rax, %rdx mulxq 88(%rsp), %rcx, %rdx adcxq %rbx, %r12 adcxq %rcx, %r13 adcxq %rdx, %r14 adcxq %r9, %r15 # A[0] * A[0] movq %rbp, %rdx mulxq %rdx, %r9, %rcx xorq %rbp, %rbp adcxq %r10, %r10 # A[1] * A[1] movq %rax, %rdx adoxq %rcx, %r10 mulxq %rdx, %rcx, %rbx adcxq %r11, %r11 adoxq %rcx, %r11 adcxq %r12, %r12 # A[2] * A[2] movq 80(%rsp), %rdx adoxq %rbx, %r12 mulxq %rdx, %rbx, %rcx adcxq %r13, %r13 adoxq %rbx, %r13 adcxq %r14, %r14 # A[3] * A[3] movq 88(%rsp), %rdx adoxq %rcx, %r14 mulxq %rdx, %rcx, %rbx adcxq %r15, %r15 adoxq %rcx, %r15 adcxq %rbp, %rbp adoxq %rbx, 
%rbp movq $38, %rdx mulxq %rbp, %rbp, %rbx addq %rbp, %r12 adcq $0x00, %rbx movq $0x7fffffffffffffff, %rcx shldq $0x01, %r12, %rbx imulq $19, %rbx, %rbx andq %rcx, %r12 xorq %rcx, %rcx adoxq %rbx, %r9 mulxq %r13, %rbx, %r13 adcxq %rbx, %r9 adoxq %r13, %r10 mulxq %r14, %rbx, %r14 adcxq %rbx, %r10 adoxq %r14, %r11 mulxq %r15, %rbx, %r15 adcxq %rbx, %r11 adoxq %r15, %r12 adcxq %rcx, %r12 # Store movq %r9, 64(%rsp) movq %r10, 72(%rsp) movq %r11, 80(%rsp) movq %r12, 88(%rsp) movq $0x1db42, %rdx mulxq 128(%rsp), %r9, %rbp mulxq 136(%rsp), %r10, %r15 mulxq 144(%rsp), %r11, %r14 addq %rbp, %r10 mulxq 152(%rsp), %r12, %r13 adcq %r15, %r11 adcq %r14, %r12 adcq $0x00, %r13 addq 96(%rsp), %r9 adcq 104(%rsp), %r10 adcq 112(%rsp), %r11 adcq 120(%rsp), %r12 adcq $0x00, %r13 shldq $0x01, %r12, %r13 btr $63, %r12 imulq $19, %r13, %r13 addq %r13, %r9 adcq $0x00, %r10 adcq $0x00, %r11 adcq $0x00, %r12 movq %r9, 96(%rsp) movq %r10, 104(%rsp) movq %r11, 112(%rsp) movq %r12, 120(%rsp) movq (%r8), %rax # Multiply # A[0] * B[0] movq 32(%rsp), %rdx mulxq %rax, %r9, %r10 # A[2] * B[0] mulxq 16(%r8), %r11, %r12 # A[1] * B[0] mulxq 8(%r8), %rcx, %rbx xorq %rbp, %rbp adcxq %rcx, %r10 # A[3] * B[1] movq 40(%rsp), %rdx mulxq 24(%r8), %r13, %r14 adcxq %rbx, %r11 # A[0] * B[1] mulxq %rax, %rcx, %rbx adoxq %rcx, %r10 # A[2] * B[1] mulxq 16(%r8), %rcx, %r15 adoxq %rbx, %r11 adcxq %rcx, %r12 # A[1] * B[2] movq 48(%rsp), %rdx mulxq 8(%r8), %rcx, %rbx adcxq %r15, %r13 adoxq %rcx, %r12 adcxq %rbp, %r14 adoxq %rbx, %r13 # A[0] * B[2] mulxq %rax, %rcx, %rbx adoxq %rbp, %r14 xorq %r15, %r15 adcxq %rcx, %r11 # A[1] * B[1] movq 40(%rsp), %rdx mulxq 8(%r8), %rdx, %rcx adcxq %rbx, %r12 adoxq %rdx, %r11 # A[1] * B[3] movq 56(%rsp), %rdx adoxq %rcx, %r12 mulxq 8(%r8), %rcx, %rbx adcxq %rcx, %r13 # A[2] * B[2] movq 48(%rsp), %rdx mulxq 16(%r8), %rdx, %rcx adcxq %rbx, %r14 adoxq %rdx, %r13 # A[3] * B[3] movq 56(%rsp), %rdx adoxq %rcx, %r14 mulxq 24(%r8), %rcx, %rbx adoxq %rbp, %r15 adcxq %rcx, %r15 # A[0] * B[3] 
mulxq %rax, %rdx, %rcx adcxq %rbx, %rbp xorq %rbx, %rbx adcxq %rdx, %r12 # A[3] * B[0] movq 24(%r8), %rdx adcxq %rcx, %r13 mulxq 32(%rsp), %rdx, %rcx adoxq %rdx, %r12 adoxq %rcx, %r13 # A[3] * B[2] movq 24(%r8), %rdx mulxq 48(%rsp), %rdx, %rcx adcxq %rdx, %r14 # A[2] * B[3] movq 56(%rsp), %rdx adcxq %rcx, %r15 mulxq 16(%r8), %rcx, %rdx adcxq %rbx, %rbp adoxq %rcx, %r14 adoxq %rdx, %r15 adoxq %rbx, %rbp movq $38, %rdx mulxq %rbp, %rbp, %rcx addq %rbp, %r12 adcq $0x00, %rcx movq $0x7fffffffffffffff, %rbx shldq $0x01, %r12, %rcx imulq $19, %rcx, %rcx andq %rbx, %r12 xorq %rbx, %rbx adoxq %rcx, %r9 mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 mulxq %r15, %rcx, %r15 adcxq %rcx, %r11 adoxq %r15, %r12 adcxq %rbx, %r12 # Store movq %r9, 32(%rsp) movq %r10, 40(%rsp) movq %r11, 48(%rsp) movq %r12, 56(%rsp) movq 96(%rsp), %rax # Multiply # A[0] * B[0] movq 128(%rsp), %rdx mulxq %rax, %r9, %r10 # A[2] * B[0] mulxq 112(%rsp), %r11, %r12 # A[1] * B[0] mulxq 104(%rsp), %rcx, %rbx xorq %rbp, %rbp adcxq %rcx, %r10 # A[3] * B[1] movq 136(%rsp), %rdx mulxq 120(%rsp), %r13, %r14 adcxq %rbx, %r11 # A[0] * B[1] mulxq %rax, %rcx, %rbx adoxq %rcx, %r10 # A[2] * B[1] mulxq 112(%rsp), %rcx, %r15 adoxq %rbx, %r11 adcxq %rcx, %r12 # A[1] * B[2] movq 144(%rsp), %rdx mulxq 104(%rsp), %rcx, %rbx adcxq %r15, %r13 adoxq %rcx, %r12 adcxq %rbp, %r14 adoxq %rbx, %r13 # A[0] * B[2] mulxq %rax, %rcx, %rbx adoxq %rbp, %r14 xorq %r15, %r15 adcxq %rcx, %r11 # A[1] * B[1] movq 136(%rsp), %rdx mulxq 104(%rsp), %rdx, %rcx adcxq %rbx, %r12 adoxq %rdx, %r11 # A[1] * B[3] movq 152(%rsp), %rdx adoxq %rcx, %r12 mulxq 104(%rsp), %rcx, %rbx adcxq %rcx, %r13 # A[2] * B[2] movq 144(%rsp), %rdx mulxq 112(%rsp), %rdx, %rcx adcxq %rbx, %r14 adoxq %rdx, %r13 # A[3] * B[3] movq 152(%rsp), %rdx adoxq %rcx, %r14 mulxq 120(%rsp), %rcx, %rbx adoxq %rbp, %r15 adcxq %rcx, %r15 # A[0] * B[3] mulxq %rax, %rdx, %rcx adcxq %rbx, %rbp xorq %rbx, %rbx adcxq %rdx, 
%r12 # A[3] * B[0] movq 120(%rsp), %rdx adcxq %rcx, %r13 mulxq 128(%rsp), %rdx, %rcx adoxq %rdx, %r12 adoxq %rcx, %r13 # A[3] * B[2] movq 120(%rsp), %rdx mulxq 144(%rsp), %rdx, %rcx adcxq %rdx, %r14 # A[2] * B[3] movq 152(%rsp), %rdx adcxq %rcx, %r15 mulxq 112(%rsp), %rcx, %rdx adcxq %rbx, %rbp adoxq %rcx, %r14 adoxq %rdx, %r15 adoxq %rbx, %rbp movq $38, %rdx mulxq %rbp, %rbp, %rcx addq %rbp, %r12 adcq $0x00, %rcx movq $0x7fffffffffffffff, %rbx shldq $0x01, %r12, %rcx imulq $19, %rcx, %rcx andq %rbx, %r12 xorq %rbx, %rbx adoxq %rcx, %r9 mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 mulxq %r15, %rcx, %r15 adcxq %rcx, %r11 adoxq %r15, %r12 adcxq %rbx, %r12 # Store movq %r9, (%rsp) movq %r10, 8(%rsp) movq %r11, 16(%rsp) movq %r12, 24(%rsp) movq 160(%rsp), %rbx decq %rbx cmpq $3, %rbx jge L_curve25519_avx2_bits movq $2, 160(%rsp) movq 176(%rsp), %rax negq %rax # Conditional Swap movq (%rdi), %r9 movq 8(%rdi), %r10 movq 16(%rdi), %r11 movq 24(%rdi), %r12 movq (%rsp), %r13 movq 8(%rsp), %r14 movq 16(%rsp), %r15 movq 24(%rsp), %rbp xorq 64(%rsp), %r9 xorq 72(%rsp), %r10 xorq 80(%rsp), %r11 xorq 88(%rsp), %r12 xorq 32(%rsp), %r13 xorq 40(%rsp), %r14 xorq 48(%rsp), %r15 xorq 56(%rsp), %rbp andq %rax, %r9 andq %rax, %r10 andq %rax, %r11 andq %rax, %r12 andq %rax, %r13 andq %rax, %r14 andq %rax, %r15 andq %rax, %rbp xorq %r9, (%rdi) xorq %r10, 8(%rdi) xorq %r11, 16(%rdi) xorq %r12, 24(%rdi) xorq %r13, (%rsp) xorq %r14, 8(%rsp) xorq %r15, 16(%rsp) xorq %rbp, 24(%rsp) xorq %r9, 64(%rsp) xorq %r10, 72(%rsp) xorq %r11, 80(%rsp) xorq %r12, 88(%rsp) xorq %r13, 32(%rsp) xorq %r14, 40(%rsp) xorq %r15, 48(%rsp) xorq %rbp, 56(%rsp) L_curve25519_avx2_last_3: # Add-Sub # Add movq (%rdi), %r9 movq 8(%rdi), %r10 movq 16(%rdi), %r11 movq 24(%rdi), %r12 movq %r9, %r13 addq (%rsp), %r9 movq %r10, %r14 adcq 8(%rsp), %r10 movq %r11, %r15 adcq 16(%rsp), %r11 movq %r12, %rbp adcq 24(%rsp), %r12 movq $0x00, %rbx adcq $0x00, %rbx 
shldq $0x01, %r12, %rbx imulq $19, %rbx btr $63, %r12 # Sub modulus (if overflow) addq %rbx, %r9 adcq $0x00, %r10 adcq $0x00, %r11 adcq $0x00, %r12 # Sub subq (%rsp), %r13 sbbq 8(%rsp), %r14 sbbq 16(%rsp), %r15 sbbq 24(%rsp), %rbp sbbq %rbx, %rbx shldq $0x01, %rbp, %rbx imulq $-19, %rbx btr $63, %rbp # Add modulus (if underflow) subq %rbx, %r13 sbbq $0x00, %r14 sbbq $0x00, %r15 sbbq $0x00, %rbp movq %r9, (%rdi) movq %r10, 8(%rdi) movq %r11, 16(%rdi) movq %r12, 24(%rdi) movq %r13, 128(%rsp) movq %r14, 136(%rsp) movq %r15, 144(%rsp) movq %rbp, 152(%rsp) # Square movq 128(%rsp), %rdx movq 136(%rsp), %rax # A[0] * A[1] movq %rdx, %rbp mulxq %rax, %r10, %r11 # A[0] * A[3] mulxq 152(%rsp), %r12, %r13 # A[2] * A[1] movq 144(%rsp), %rdx mulxq %rax, %rcx, %rbx xorq %r9, %r9 adoxq %rcx, %r12 # A[2] * A[3] mulxq 152(%rsp), %r14, %r15 adoxq %rbx, %r13 # A[2] * A[0] mulxq %rbp, %rcx, %rbx adoxq %r9, %r14 adcxq %rcx, %r11 adoxq %r9, %r15 # A[1] * A[3] movq %rax, %rdx mulxq 152(%rsp), %rcx, %rdx adcxq %rbx, %r12 adcxq %rcx, %r13 adcxq %rdx, %r14 adcxq %r9, %r15 # A[0] * A[0] movq %rbp, %rdx mulxq %rdx, %r9, %rcx xorq %rbp, %rbp adcxq %r10, %r10 # A[1] * A[1] movq %rax, %rdx adoxq %rcx, %r10 mulxq %rdx, %rcx, %rbx adcxq %r11, %r11 adoxq %rcx, %r11 adcxq %r12, %r12 # A[2] * A[2] movq 144(%rsp), %rdx adoxq %rbx, %r12 mulxq %rdx, %rbx, %rcx adcxq %r13, %r13 adoxq %rbx, %r13 adcxq %r14, %r14 # A[3] * A[3] movq 152(%rsp), %rdx adoxq %rcx, %r14 mulxq %rdx, %rcx, %rbx adcxq %r15, %r15 adoxq %rcx, %r15 adcxq %rbp, %rbp adoxq %rbx, %rbp movq $38, %rdx mulxq %rbp, %rbp, %rbx addq %rbp, %r12 adcq $0x00, %rbx movq $0x7fffffffffffffff, %rcx shldq $0x01, %r12, %rbx imulq $19, %rbx, %rbx andq %rcx, %r12 xorq %rcx, %rcx adoxq %rbx, %r9 mulxq %r13, %rbx, %r13 adcxq %rbx, %r9 adoxq %r13, %r10 mulxq %r14, %rbx, %r14 adcxq %rbx, %r10 adoxq %r14, %r11 mulxq %r15, %rbx, %r15 adcxq %rbx, %r11 adoxq %r15, %r12 adcxq %rcx, %r12 # Store movq %r9, 96(%rsp) movq %r10, 104(%rsp) movq %r11, 112(%rsp) movq 
%r12, 120(%rsp) # Square movq (%rdi), %rdx movq 8(%rdi), %rax # A[0] * A[1] movq %rdx, %rbp mulxq %rax, %r10, %r11 # A[0] * A[3] mulxq 24(%rdi), %r12, %r13 # A[2] * A[1] movq 16(%rdi), %rdx mulxq %rax, %rcx, %rbx xorq %r9, %r9 adoxq %rcx, %r12 # A[2] * A[3] mulxq 24(%rdi), %r14, %r15 adoxq %rbx, %r13 # A[2] * A[0] mulxq %rbp, %rcx, %rbx adoxq %r9, %r14 adcxq %rcx, %r11 adoxq %r9, %r15 # A[1] * A[3] movq %rax, %rdx mulxq 24(%rdi), %rcx, %rdx adcxq %rbx, %r12 adcxq %rcx, %r13 adcxq %rdx, %r14 adcxq %r9, %r15 # A[0] * A[0] movq %rbp, %rdx mulxq %rdx, %r9, %rcx xorq %rbp, %rbp adcxq %r10, %r10 # A[1] * A[1] movq %rax, %rdx adoxq %rcx, %r10 mulxq %rdx, %rcx, %rbx adcxq %r11, %r11 adoxq %rcx, %r11 adcxq %r12, %r12 # A[2] * A[2] movq 16(%rdi), %rdx adoxq %rbx, %r12 mulxq %rdx, %rbx, %rcx adcxq %r13, %r13 adoxq %rbx, %r13 adcxq %r14, %r14 # A[3] * A[3] movq 24(%rdi), %rdx adoxq %rcx, %r14 mulxq %rdx, %rcx, %rbx adcxq %r15, %r15 adoxq %rcx, %r15 adcxq %rbp, %rbp adoxq %rbx, %rbp movq $38, %rdx mulxq %rbp, %rbp, %rbx addq %rbp, %r12 adcq $0x00, %rbx movq $0x7fffffffffffffff, %rcx shldq $0x01, %r12, %rbx imulq $19, %rbx, %rbx andq %rcx, %r12 xorq %rcx, %rcx adoxq %rbx, %r9 mulxq %r13, %rbx, %r13 adcxq %rbx, %r9 adoxq %r13, %r10 mulxq %r14, %rbx, %r14 adcxq %rbx, %r10 adoxq %r14, %r11 mulxq %r15, %rbx, %r15 adcxq %rbx, %r11 adoxq %r15, %r12 adcxq %rcx, %r12 # Store movq %r9, 128(%rsp) movq %r10, 136(%rsp) movq %r11, 144(%rsp) movq %r12, 152(%rsp) movq 128(%rsp), %rax # Multiply # A[0] * B[0] movq 96(%rsp), %rdx mulxq %rax, %r9, %r10 # A[2] * B[0] mulxq 144(%rsp), %r11, %r12 # A[1] * B[0] mulxq 136(%rsp), %rcx, %rbx xorq %rbp, %rbp adcxq %rcx, %r10 # A[3] * B[1] movq 104(%rsp), %rdx mulxq 152(%rsp), %r13, %r14 adcxq %rbx, %r11 # A[0] * B[1] mulxq %rax, %rcx, %rbx adoxq %rcx, %r10 # A[2] * B[1] mulxq 144(%rsp), %rcx, %r15 adoxq %rbx, %r11 adcxq %rcx, %r12 # A[1] * B[2] movq 112(%rsp), %rdx mulxq 136(%rsp), %rcx, %rbx adcxq %r15, %r13 adoxq %rcx, %r12 adcxq %rbp, %r14 adoxq %rbx, 
%r13 # A[0] * B[2] mulxq %rax, %rcx, %rbx adoxq %rbp, %r14 xorq %r15, %r15 adcxq %rcx, %r11 # A[1] * B[1] movq 104(%rsp), %rdx mulxq 136(%rsp), %rdx, %rcx adcxq %rbx, %r12 adoxq %rdx, %r11 # A[1] * B[3] movq 120(%rsp), %rdx adoxq %rcx, %r12 mulxq 136(%rsp), %rcx, %rbx adcxq %rcx, %r13 # A[2] * B[2] movq 112(%rsp), %rdx mulxq 144(%rsp), %rdx, %rcx adcxq %rbx, %r14 adoxq %rdx, %r13 # A[3] * B[3] movq 120(%rsp), %rdx adoxq %rcx, %r14 mulxq 152(%rsp), %rcx, %rbx adoxq %rbp, %r15 adcxq %rcx, %r15 # A[0] * B[3] mulxq %rax, %rdx, %rcx adcxq %rbx, %rbp xorq %rbx, %rbx adcxq %rdx, %r12 # A[3] * B[0] movq 152(%rsp), %rdx adcxq %rcx, %r13 mulxq 96(%rsp), %rdx, %rcx adoxq %rdx, %r12 adoxq %rcx, %r13 # A[3] * B[2] movq 152(%rsp), %rdx mulxq 112(%rsp), %rdx, %rcx adcxq %rdx, %r14 # A[2] * B[3] movq 120(%rsp), %rdx adcxq %rcx, %r15 mulxq 144(%rsp), %rcx, %rdx adcxq %rbx, %rbp adoxq %rcx, %r14 adoxq %rdx, %r15 adoxq %rbx, %rbp movq $38, %rdx mulxq %rbp, %rbp, %rcx addq %rbp, %r12 adcq $0x00, %rcx movq $0x7fffffffffffffff, %rbx shldq $0x01, %r12, %rcx imulq $19, %rcx, %rcx andq %rbx, %r12 xorq %rbx, %rbx adoxq %rcx, %r9 mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 mulxq %r15, %rcx, %r15 adcxq %rcx, %r11 adoxq %r15, %r12 adcxq %rbx, %r12 # Store movq %r9, (%rdi) movq %r10, 8(%rdi) movq %r11, 16(%rdi) movq %r12, 24(%rdi) # Sub movq 128(%rsp), %r9 movq 136(%rsp), %r10 movq 144(%rsp), %r11 movq 152(%rsp), %r12 subq 96(%rsp), %r9 sbbq 104(%rsp), %r10 sbbq 112(%rsp), %r11 sbbq 120(%rsp), %r12 sbbq %rbx, %rbx shldq $0x01, %r12, %rbx imulq $-19, %rbx btr $63, %r12 # Add modulus (if underflow) subq %rbx, %r9 sbbq $0x00, %r10 sbbq $0x00, %r11 sbbq $0x00, %r12 movq %r9, 128(%rsp) movq %r10, 136(%rsp) movq %r11, 144(%rsp) movq %r12, 152(%rsp) movq $0x1db42, %rdx mulxq 128(%rsp), %r9, %rbp mulxq 136(%rsp), %r10, %r15 mulxq 144(%rsp), %r11, %r14 addq %rbp, %r10 mulxq 152(%rsp), %r12, %r13 adcq %r15, %r11 adcq %r14, %r12 adcq 
$0x00, %r13 addq 96(%rsp), %r9 adcq 104(%rsp), %r10 adcq 112(%rsp), %r11 adcq 120(%rsp), %r12 adcq $0x00, %r13 shldq $0x01, %r12, %r13 btr $63, %r12 imulq $19, %r13, %r13 addq %r13, %r9 adcq $0x00, %r10 adcq $0x00, %r11 adcq $0x00, %r12 movq %r9, 96(%rsp) movq %r10, 104(%rsp) movq %r11, 112(%rsp) movq %r12, 120(%rsp) movq 96(%rsp), %rax # Multiply # A[0] * B[0] movq 128(%rsp), %rdx mulxq %rax, %r9, %r10 # A[2] * B[0] mulxq 112(%rsp), %r11, %r12 # A[1] * B[0] mulxq 104(%rsp), %rcx, %rbx xorq %rbp, %rbp adcxq %rcx, %r10 # A[3] * B[1] movq 136(%rsp), %rdx mulxq 120(%rsp), %r13, %r14 adcxq %rbx, %r11 # A[0] * B[1] mulxq %rax, %rcx, %rbx adoxq %rcx, %r10 # A[2] * B[1] mulxq 112(%rsp), %rcx, %r15 adoxq %rbx, %r11 adcxq %rcx, %r12 # A[1] * B[2] movq 144(%rsp), %rdx mulxq 104(%rsp), %rcx, %rbx adcxq %r15, %r13 adoxq %rcx, %r12 adcxq %rbp, %r14 adoxq %rbx, %r13 # A[0] * B[2] mulxq %rax, %rcx, %rbx adoxq %rbp, %r14 xorq %r15, %r15 adcxq %rcx, %r11 # A[1] * B[1] movq 136(%rsp), %rdx mulxq 104(%rsp), %rdx, %rcx adcxq %rbx, %r12 adoxq %rdx, %r11 # A[1] * B[3] movq 152(%rsp), %rdx adoxq %rcx, %r12 mulxq 104(%rsp), %rcx, %rbx adcxq %rcx, %r13 # A[2] * B[2] movq 144(%rsp), %rdx mulxq 112(%rsp), %rdx, %rcx adcxq %rbx, %r14 adoxq %rdx, %r13 # A[3] * B[3] movq 152(%rsp), %rdx adoxq %rcx, %r14 mulxq 120(%rsp), %rcx, %rbx adoxq %rbp, %r15 adcxq %rcx, %r15 # A[0] * B[3] mulxq %rax, %rdx, %rcx adcxq %rbx, %rbp xorq %rbx, %rbx adcxq %rdx, %r12 # A[3] * B[0] movq 120(%rsp), %rdx adcxq %rcx, %r13 mulxq 128(%rsp), %rdx, %rcx adoxq %rdx, %r12 adoxq %rcx, %r13 # A[3] * B[2] movq 120(%rsp), %rdx mulxq 144(%rsp), %rdx, %rcx adcxq %rdx, %r14 # A[2] * B[3] movq 152(%rsp), %rdx adcxq %rcx, %r15 mulxq 112(%rsp), %rcx, %rdx adcxq %rbx, %rbp adoxq %rcx, %r14 adoxq %rdx, %r15 adoxq %rbx, %rbp movq $38, %rdx mulxq %rbp, %rbp, %rcx addq %rbp, %r12 adcq $0x00, %rcx movq $0x7fffffffffffffff, %rbx shldq $0x01, %r12, %rcx imulq $19, %rcx, %rcx andq %rbx, %r12 xorq %rbx, %rbx adoxq %rcx, %r9 mulxq %r13, %rcx, 
%r13 adcxq %rcx, %r9 adoxq %r13, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 mulxq %r15, %rcx, %r15 adcxq %rcx, %r11 adoxq %r15, %r12 adcxq %rbx, %r12 # Store movq %r9, (%rsp) movq %r10, 8(%rsp) movq %r11, 16(%rsp) movq %r12, 24(%rsp) decq 160(%rsp) jge L_curve25519_avx2_last_3 # Invert leaq 32(%rsp), %rdi movq %rsp, %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 32(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 64(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi movq %rsp, %rsi leaq 64(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi leaq 32(%rsp), %rsi leaq 64(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 32(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 64(%rsp), %rsi leaq 96(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 64(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi movq $4, %rdx #ifndef __APPLE__ callq fe_sq_n_avx2@plt #else callq _fe_sq_n_avx2 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 96(%rsp), %rsi leaq 64(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 64(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi movq $9, %rdx #ifndef __APPLE__ callq fe_sq_n_avx2@plt #else callq _fe_sq_n_avx2 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi leaq 64(%rsp), %rdx 
#ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ leaq 128(%rsp), %rdi leaq 96(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 128(%rsp), %rdi leaq 128(%rsp), %rsi movq $19, %rdx #ifndef __APPLE__ callq fe_sq_n_avx2@plt #else callq _fe_sq_n_avx2 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 128(%rsp), %rsi leaq 96(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi movq $9, %rdx #ifndef __APPLE__ callq fe_sq_n_avx2@plt #else callq _fe_sq_n_avx2 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 96(%rsp), %rsi leaq 64(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 64(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi movq $49, %rdx #ifndef __APPLE__ callq fe_sq_n_avx2@plt #else callq _fe_sq_n_avx2 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi leaq 64(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ leaq 128(%rsp), %rdi leaq 96(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 128(%rsp), %rdi leaq 128(%rsp), %rsi movq $0x63, %rdx #ifndef __APPLE__ callq fe_sq_n_avx2@plt #else callq _fe_sq_n_avx2 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 128(%rsp), %rsi leaq 96(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi movq $49, %rdx #ifndef __APPLE__ callq fe_sq_n_avx2@plt 
#else callq _fe_sq_n_avx2 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 96(%rsp), %rsi leaq 64(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 64(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 64(%rsp), %rsi movq $4, %rdx #ifndef __APPLE__ callq fe_sq_n_avx2@plt #else callq _fe_sq_n_avx2 #endif /* __APPLE__ */ movq %rsp, %rdi leaq 64(%rsp), %rsi leaq 32(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ movq 168(%rsp), %rdi movq (%rdi), %rax # Multiply # A[0] * B[0] movq (%rsp), %rdx mulxq %rax, %r9, %r10 # A[2] * B[0] mulxq 16(%rdi), %r11, %r12 # A[1] * B[0] mulxq 8(%rdi), %rcx, %rbx xorq %rbp, %rbp adcxq %rcx, %r10 # A[3] * B[1] movq 8(%rsp), %rdx mulxq 24(%rdi), %r13, %r14 adcxq %rbx, %r11 # A[0] * B[1] mulxq %rax, %rcx, %rbx adoxq %rcx, %r10 # A[2] * B[1] mulxq 16(%rdi), %rcx, %r15 adoxq %rbx, %r11 adcxq %rcx, %r12 # A[1] * B[2] movq 16(%rsp), %rdx mulxq 8(%rdi), %rcx, %rbx adcxq %r15, %r13 adoxq %rcx, %r12 adcxq %rbp, %r14 adoxq %rbx, %r13 # A[0] * B[2] mulxq %rax, %rcx, %rbx adoxq %rbp, %r14 xorq %r15, %r15 adcxq %rcx, %r11 # A[1] * B[1] movq 8(%rsp), %rdx mulxq 8(%rdi), %rdx, %rcx adcxq %rbx, %r12 adoxq %rdx, %r11 # A[1] * B[3] movq 24(%rsp), %rdx adoxq %rcx, %r12 mulxq 8(%rdi), %rcx, %rbx adcxq %rcx, %r13 # A[2] * B[2] movq 16(%rsp), %rdx mulxq 16(%rdi), %rdx, %rcx adcxq %rbx, %r14 adoxq %rdx, %r13 # A[3] * B[3] movq 24(%rsp), %rdx adoxq %rcx, %r14 mulxq 24(%rdi), %rcx, %rbx adoxq %rbp, %r15 adcxq %rcx, %r15 # A[0] * B[3] mulxq %rax, %rdx, %rcx adcxq %rbx, %rbp xorq %rbx, %rbx adcxq %rdx, %r12 # A[3] * B[0] movq 24(%rdi), %rdx adcxq %rcx, %r13 mulxq (%rsp), %rdx, %rcx adoxq %rdx, %r12 adoxq %rcx, %r13 # A[3] * B[2] movq 24(%rdi), %rdx mulxq 16(%rsp), %rdx, %rcx adcxq %rdx, %r14 # A[2] * B[3] movq 24(%rsp), %rdx adcxq %rcx, %r15 mulxq 16(%rdi), %rcx, %rdx 
/* curve25519_avx2 (continued): tail of the last multiply, two conditional
 * add-19 passes, store through %rdi, then return 0 in %rax. */
        adcxq %rbx, %rbp
        adoxq %rcx, %r14
        adoxq %rdx, %r15
        adoxq %rbx, %rbp
        movq $38, %rdx
        mulxq %rbp, %rbp, %rcx
        addq %rbp, %r12
        adcq $0x00, %rcx
        movq $0x7fffffffffffffff, %rbx
        shldq $0x01, %r12, %rcx
        imulq $19, %rcx, %rcx
        andq %rbx, %r12
        xorq %rbx, %rbx
        adoxq %rcx, %r9
        mulxq %r13, %rcx, %r13
        adcxq %rcx, %r9
        adoxq %r13, %r10
        mulxq %r14, %rcx, %r14
        adcxq %rcx, %r10
        adoxq %r14, %r11
        mulxq %r15, %rcx, %r15
        adcxq %rcx, %r11
        adoxq %r15, %r12
        adcxq %rbx, %r12
        movq $0x7fffffffffffffff, %rbx
        movq %r12, %rdx
        sarq $63, %rdx
        andq $19, %rdx
        andq %rbx, %r12
        addq %rdx, %r9
        adcq $0x00, %r10
        adcq $0x00, %r11
        adcq $0x00, %r12
        movq $0x7fffffffffffffff, %rcx
        movq %r9, %rdx
        addq $19, %rdx
        movq %r10, %rdx
        adcq $0x00, %rdx
        movq %r11, %rdx
        adcq $0x00, %rdx
        movq %r12, %rdx
        adcq $0x00, %rdx
        sarq $63, %rdx
        andq $19, %rdx
        andq %rcx, %r12
        addq %rdx, %r9
        adcq $0x00, %r10
        adcq $0x00, %r11
        adcq $0x00, %r12
        # Store
        movq %r9, (%rdi)
        movq %r10, 8(%rdi)
        movq %r11, 16(%rdi)
        movq %r12, 24(%rdi)
        xorq %rax, %rax
        addq $0xb8, %rsp
        popq %rbp
        popq %r15
        popq %r14
        popq %r13
        popq %r12
        popq %rbx
        repz retq
#ifndef __APPLE__
.size curve25519_avx2,.-curve25519_avx2
#endif /* __APPLE__ */
/*
 * fe_pow22523_avx2
 *
 * In:    %rdi = destination pointer, %rsi = source pointer. Both are
 *        spilled to the frame and handed to the helper calls below.
 * Out:   the result is written through the destination pointer by the
 *        final fe_mul_avx2 call. %rdi and %rsi are reloaded with their
 *        incoming values just before the return.
 * Frame: 0x78 bytes
 *            0(%rsp)  t0  (32-byte slot)
 *           32(%rsp)  t1  (32-byte slot)
 *           64(%rsp)  t2  (32-byte slot)
 *           96(%rsp)  saved destination pointer
 *          104(%rsp)  saved source pointer
 *          112(%rsp)  padding
 *        The function is entered with %rsp congruent to 8 mod 16, so the
 *        0x78-byte frame leaves %rsp 16-byte aligned at every callq, as
 *        the System V AMD64 ABI requires.
 * Calls: fe_sq_avx2, fe_sq_n_avx2, fe_mul_avx2 (defined elsewhere).
 *
 * NOTE(review): the per-call annotations below assume
 *   fe_sq_avx2(r, a)      -> r = a^2
 *   fe_sq_n_avx2(r, a, n) -> r = a squared n times
 *   fe_mul_avx2(r, a, b)  -> r = a * b
 * which is inferred from the helper names and argument registers only.
 * The function name and the "pow22523" marker suggest the overall result
 * is a^(2^252 - 3); confirm against the C reference implementation.
 */
#ifndef __APPLE__
.text
.globl fe_pow22523_avx2
.type fe_pow22523_avx2,@function
.align 16
fe_pow22523_avx2:
#else
.section __TEXT,__text
.globl _fe_pow22523_avx2
.p2align 4
_fe_pow22523_avx2:
#endif /* __APPLE__ */
        subq $0x78, %rsp
        # pow22523
        movq %rdi, 96(%rsp)
        movq %rsi, 104(%rsp)
        # t0 = sq(a)
        movq %rsp, %rdi
        movq 104(%rsp), %rsi
#ifndef __APPLE__
        callq fe_sq_avx2@plt
#else
        callq _fe_sq_avx2
#endif /* __APPLE__ */
        # t1 = sq(t0)
        leaq 32(%rsp), %rdi
        movq %rsp, %rsi
#ifndef __APPLE__
        callq fe_sq_avx2@plt
#else
        callq _fe_sq_avx2
#endif /* __APPLE__ */
        # t1 = sq(t1)
        leaq 32(%rsp), %rdi
        leaq 32(%rsp), %rsi
#ifndef __APPLE__
        callq fe_sq_avx2@plt
#else
        callq _fe_sq_avx2
#endif /* __APPLE__ */
        # t1 = mul(a, t1)
        leaq 32(%rsp), %rdi
        movq 104(%rsp), %rsi
        leaq 32(%rsp), %rdx
#ifndef __APPLE__
        callq fe_mul_avx2@plt
#else
        callq _fe_mul_avx2
#endif /* __APPLE__ */
        # t0 = mul(t0, t1)
        movq %rsp, %rdi
        movq %rsp, %rsi
        leaq 32(%rsp), %rdx
#ifndef __APPLE__
        callq fe_mul_avx2@plt
#else
        callq _fe_mul_avx2
#endif /* __APPLE__ */
        # t0 = sq(t0)
        movq %rsp, %rdi
        movq %rsp, %rsi
#ifndef __APPLE__
        callq fe_sq_avx2@plt
#else
        callq _fe_sq_avx2
#endif /* __APPLE__ */
        # t0 = mul(t1, t0)
        movq %rsp, %rdi
        leaq 32(%rsp), %rsi
        movq %rsp, %rdx
#ifndef __APPLE__
        callq fe_mul_avx2@plt
#else
        callq _fe_mul_avx2
#endif /* __APPLE__ */
        # t1 = sq(t0)
        leaq 32(%rsp), %rdi
        movq %rsp, %rsi
#ifndef __APPLE__
        callq fe_sq_avx2@plt
#else
        callq _fe_sq_avx2
#endif /* __APPLE__ */
        # t1 = sq_n(t1, 4)
        leaq 32(%rsp), %rdi
        leaq 32(%rsp), %rsi
        movq $4, %rdx
#ifndef __APPLE__
        callq fe_sq_n_avx2@plt
#else
        callq _fe_sq_n_avx2
#endif /* __APPLE__ */
        # t0 = mul(t1, t0)
        movq %rsp, %rdi
        leaq 32(%rsp), %rsi
        movq %rsp, %rdx
#ifndef __APPLE__
        callq fe_mul_avx2@plt
#else
        callq _fe_mul_avx2
#endif /* __APPLE__ */
        # t1 = sq(t0)
        leaq 32(%rsp), %rdi
        movq %rsp, %rsi
#ifndef __APPLE__
        callq fe_sq_avx2@plt
#else
        callq _fe_sq_avx2
#endif /* __APPLE__ */
        # t1 = sq_n(t1, 9)
        leaq 32(%rsp), %rdi
        leaq 32(%rsp), %rsi
        movq $9, %rdx
#ifndef __APPLE__
        callq fe_sq_n_avx2@plt
#else
        callq _fe_sq_n_avx2
#endif /* __APPLE__ */
        # t1 = mul(t1, t0)
        leaq 32(%rsp), %rdi
        leaq 32(%rsp), %rsi
        movq %rsp, %rdx
#ifndef __APPLE__
        callq fe_mul_avx2@plt
#else
        callq _fe_mul_avx2
#endif /* __APPLE__ */
        # t2 = sq(t1)
        leaq 64(%rsp), %rdi
        leaq 32(%rsp), %rsi
#ifndef __APPLE__
        callq fe_sq_avx2@plt
#else
        callq _fe_sq_avx2
#endif /* __APPLE__ */
        # t2 = sq_n(t2, 19)
        leaq 64(%rsp), %rdi
        leaq 64(%rsp), %rsi
        movq $19, %rdx
#ifndef __APPLE__
        callq fe_sq_n_avx2@plt
#else
        callq _fe_sq_n_avx2
#endif /* __APPLE__ */
        # t1 = mul(t2, t1)
        leaq 32(%rsp), %rdi
        leaq 64(%rsp), %rsi
        leaq 32(%rsp), %rdx
#ifndef __APPLE__
        callq fe_mul_avx2@plt
#else
        callq _fe_mul_avx2
#endif /* __APPLE__ */
        # t1 = sq(t1)
        leaq 32(%rsp), %rdi
        leaq 32(%rsp), %rsi
#ifndef __APPLE__
        callq fe_sq_avx2@plt
#else
        callq _fe_sq_avx2
#endif /* __APPLE__ */
        # t1 = sq_n(t1, 9)
        leaq 32(%rsp), %rdi
        leaq 32(%rsp), %rsi
        movq $9, %rdx
#ifndef __APPLE__
        callq fe_sq_n_avx2@plt
#else
        callq _fe_sq_n_avx2
#endif /* __APPLE__ */
        # t0 = mul(t1, t0)
        movq %rsp, %rdi
        leaq 32(%rsp), %rsi
        movq %rsp, %rdx
#ifndef __APPLE__
        callq fe_mul_avx2@plt
#else
        callq _fe_mul_avx2
#endif /* __APPLE__ */
        # t1 = sq(t0)
        leaq 32(%rsp), %rdi
        movq %rsp, %rsi
#ifndef __APPLE__
        callq fe_sq_avx2@plt
#else
        callq _fe_sq_avx2
#endif /* __APPLE__ */
        # t1 = sq_n(t1, 49)
        leaq 32(%rsp), %rdi
        leaq 32(%rsp), %rsi
        movq $49, %rdx
#ifndef __APPLE__
        callq fe_sq_n_avx2@plt
#else
        callq _fe_sq_n_avx2
#endif /* __APPLE__ */
        # t1 = mul(t1, t0)
        leaq 32(%rsp), %rdi
        leaq 32(%rsp), %rsi
        movq %rsp, %rdx
#ifndef __APPLE__
        callq fe_mul_avx2@plt
#else
        callq _fe_mul_avx2
#endif /* __APPLE__ */
        # t2 = sq(t1)
        leaq 64(%rsp), %rdi
        leaq 32(%rsp), %rsi
#ifndef __APPLE__
        callq fe_sq_avx2@plt
#else
        callq _fe_sq_avx2
#endif /* __APPLE__ */
        # t2 = sq_n(t2, 0x63)
        leaq 64(%rsp), %rdi
        leaq 64(%rsp), %rsi
        movq $0x63, %rdx
#ifndef __APPLE__
        callq fe_sq_n_avx2@plt
#else
        callq _fe_sq_n_avx2
#endif /* __APPLE__ */
        # t1 = mul(t2, t1)
        leaq 32(%rsp), %rdi
        leaq 64(%rsp), %rsi
        leaq 32(%rsp), %rdx
#ifndef __APPLE__
        callq fe_mul_avx2@plt
#else
        callq _fe_mul_avx2
#endif /* __APPLE__ */
        # t1 = sq(t1)
        leaq 32(%rsp), %rdi
        leaq 32(%rsp), %rsi
#ifndef __APPLE__
        callq fe_sq_avx2@plt
#else
        callq _fe_sq_avx2
#endif /* __APPLE__ */
        # t1 = sq_n(t1, 49)
        leaq 32(%rsp), %rdi
        leaq 32(%rsp), %rsi
        movq $49, %rdx
#ifndef __APPLE__
        callq fe_sq_n_avx2@plt
#else
        callq _fe_sq_n_avx2
#endif /* __APPLE__ */
        # t0 = mul(t1, t0)
        movq %rsp, %rdi
        leaq 32(%rsp), %rsi
        movq %rsp, %rdx
#ifndef __APPLE__
        callq fe_mul_avx2@plt
#else
        callq _fe_mul_avx2
#endif /* __APPLE__ */
        # t0 = sq(t0)
        movq %rsp, %rdi
        movq %rsp, %rsi
#ifndef __APPLE__
        callq fe_sq_avx2@plt
#else
        callq _fe_sq_avx2
#endif /* __APPLE__ */
        # t0 = sq(t0)
        movq %rsp, %rdi
        movq %rsp, %rsi
#ifndef __APPLE__
        callq fe_sq_avx2@plt
#else
        callq _fe_sq_avx2
#endif /* __APPLE__ */
        # r = mul(t0, a), written through the saved destination pointer
        movq 96(%rsp), %rdi
        movq %rsp, %rsi
        movq 104(%rsp), %rdx
#ifndef __APPLE__
        callq fe_mul_avx2@plt
#else
        callq _fe_mul_avx2
#endif /* __APPLE__ */
        # Reload the incoming argument registers, then release the frame
        movq 104(%rsp), %rsi
        movq 96(%rsp), %rdi
        addq $0x78, %rsp
        repz retq
#ifndef __APPLE__
.size fe_pow22523_avx2,.-fe_pow22523_avx2
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl ge_p1p1_to_p2_avx2
.type ge_p1p1_to_p2_avx2,@function
.align 16
ge_p1p1_to_p2_avx2:
#else
.section __TEXT,__text
.globl _ge_p1p1_to_p2_avx2
.p2align 4
_ge_p1p1_to_p2_avx2:
#endif /* __APPLE__ */
        pushq %r12
        pushq %r13
        pushq %r14
        pushq %r15
        pushq %rbx
        pushq %rbp
/*
 * ge_p1p1_to_p2_avx2 (body; the entry label and the six callee-saved
 * register pushes precede this point).
 *
 * At this point %rdi = output pointer and %rsi = input pointer. Three
 * 4x64-bit by 4x64-bit multiplications follow. Each one uses mulx with
 * two interleaved carry chains (adcx on CF, adox on OF) and ends with the
 * same reduction: the upper four product limbs are scaled by 38 and added
 * to the lower four, and bits 255 and above are folded back in times 19
 * (consistent with arithmetic modulo 2^255 - 19).
 *
 *     out[ 0..31] = in[ 0..31] * in[96..127]
 *     out[64..95] = in[64..95] * in[96..127]
 *     out[32..63] = in[32..63] * in[64..95]
 *
 * NOTE(review): the 32-byte groups are presumably the X, Y, Z and T
 * coordinates of the ge_p1p1 input and the ge_p2 output; confirm against
 * the C structure definitions.
 *
 * Needs BMI2 (mulx) and ADX (adcx, adox). Makes no calls.
 */
        # Two 8-byte slots hold the incoming pointers; they are not read
        # back before the frame is released below.
        subq $16, %rsp
        movq %rdi, (%rsp)
        movq %rsi, 8(%rsp)
        # First product: (%rax) = in + 96 times (%rsi) = in + 0 -> out + 0
        leaq 96(%rsi), %rax
        movq (%rsi), %r9
        # Multiply
        # A[0] * B[0]
        movq (%rax), %rdx
        mulxq %r9, %r10, %r11
        # A[2] * B[0]
        mulxq 16(%rsi), %r12, %r13
        # A[1] * B[0]
        mulxq 8(%rsi), %rcx, %r8
        # rbp = 0; xor also clears CF and OF so both carry chains start clean
        xorq %rbp, %rbp
        adcxq %rcx, %r11
        # A[3] * B[1]
        movq 8(%rax), %rdx
        mulxq 24(%rsi), %r14, %r15
        adcxq %r8, %r12
        # A[0] * B[1]
        mulxq %r9, %rcx, %r8
        adoxq %rcx, %r11
        # A[2] * B[1]
        mulxq 16(%rsi), %rcx, %rbx
        adoxq %r8, %r12
        adcxq %rcx, %r13
        # A[1] * B[2]
        movq 16(%rax), %rdx
        mulxq 8(%rsi), %rcx, %r8
        adcxq %rbx, %r14
        adoxq %rcx, %r13
        adcxq %rbp, %r15
        adoxq %r8, %r14
        # A[0] * B[2]
        mulxq %r9, %rcx, %r8
        adoxq %rbp, %r15
        xorq %rbx, %rbx
        adcxq %rcx, %r12
        # A[1] * B[1]
        movq 8(%rax), %rdx
        mulxq 8(%rsi), %rdx, %rcx
        adcxq %r8, %r13
        adoxq %rdx, %r12
        # A[1] * B[3]
        movq 24(%rax), %rdx
        adoxq %rcx, %r13
        mulxq 8(%rsi), %rcx, %r8
        adcxq %rcx, %r14
        # A[2] * B[2]
        movq 16(%rax), %rdx
        mulxq 16(%rsi), %rdx, %rcx
        adcxq %r8, %r15
        adoxq %rdx, %r14
        # A[3] * B[3]
        movq 24(%rax), %rdx
        adoxq %rcx, %r15
        mulxq 24(%rsi), %rcx, %r8
        adoxq %rbp, %rbx
        adcxq %rcx, %rbx
        # A[0] * B[3]
        mulxq %r9, %rdx, %rcx
        adcxq %r8, %rbp
        xorq %r8, %r8
        adcxq %rdx, %r13
        # A[3] * B[0]
        movq 24(%rsi), %rdx
        adcxq %rcx, %r14
        mulxq (%rax), %rdx, %rcx
        adoxq %rdx, %r13
        adoxq %rcx, %r14
        # A[3] * B[2]
        movq 24(%rsi), %rdx
        mulxq 16(%rax), %rdx, %rcx
        adcxq %rdx, %r15
        # A[2] * B[3]
        movq 24(%rax), %rdx
        adcxq %rcx, %rbx
        mulxq 16(%rsi), %rcx, %rdx
        adcxq %r8, %rbp
        adoxq %rcx, %r15
        adoxq %rdx, %rbx
        adoxq %r8, %rbp
        # Reduce: product limbs are r10..r13 (low) and r14, r15, rbx, rbp
        # (high). The high limbs are scaled by 38 and added to the low ones.
        movq $38, %rdx
        mulxq %rbp, %rbp, %rcx
        addq %rbp, %r13
        adcq $0x00, %rcx
        movq $0x7fffffffffffffff, %r8
        # rcx = overflow above bit 254; fold it back in times 19
        shldq $0x01, %r13, %rcx
        imulq $19, %rcx, %rcx
        andq %r8, %r13
        xorq %r8, %r8
        adoxq %rcx, %r10
        mulxq %r14, %rcx, %r14
        adcxq %rcx, %r10
        adoxq %r14, %r11
        mulxq %r15, %rcx, %r15
        adcxq %rcx, %r11
        adoxq %r15, %r12
        mulxq %rbx, %rcx, %rbx
        adcxq %rcx, %r12
        adoxq %rbx, %r13
        adcxq %r8, %r13
        # Store
        movq %r10, (%rdi)
        movq %r11, 8(%rdi)
        movq %r12, 16(%rdi)
        movq %r13, 24(%rdi)
        # Second product: (%rax) = in + 96 times (%rsi) = in + 64 -> out + 64
        leaq 64(%rsi), %rsi
        leaq 64(%rdi), %rdi
        movq (%rsi), %r9
        # Multiply
        # A[0] * B[0]
        movq (%rax), %rdx
        mulxq %r9, %r10, %r11
        # A[2] * B[0]
        mulxq 16(%rsi), %r12, %r13
        # A[1] * B[0]
        mulxq 8(%rsi), %rcx, %r8
        xorq %rbp, %rbp
        adcxq %rcx, %r11
        # A[3] * B[1]
        movq 8(%rax), %rdx
        mulxq 24(%rsi), %r14, %r15
        adcxq %r8, %r12
        # A[0] * B[1]
        mulxq %r9, %rcx, %r8
        adoxq %rcx, %r11
        # A[2] * B[1]
        mulxq 16(%rsi), %rcx, %rbx
        adoxq %r8, %r12
        adcxq %rcx, %r13
        # A[1] * B[2]
        movq 16(%rax), %rdx
        mulxq 8(%rsi), %rcx, %r8
        adcxq %rbx, %r14
        adoxq %rcx, %r13
        adcxq %rbp, %r15
        adoxq %r8, %r14
        # A[0] * B[2]
        mulxq %r9, %rcx, %r8
        adoxq %rbp, %r15
        xorq %rbx, %rbx
        adcxq %rcx, %r12
        # A[1] * B[1]
        movq 8(%rax), %rdx
        mulxq 8(%rsi), %rdx, %rcx
        adcxq %r8, %r13
        adoxq %rdx, %r12
        # A[1] * B[3]
        movq 24(%rax), %rdx
        adoxq %rcx, %r13
        mulxq 8(%rsi), %rcx, %r8
        adcxq %rcx, %r14
        # A[2] * B[2]
        movq 16(%rax), %rdx
        mulxq 16(%rsi), %rdx, %rcx
        adcxq %r8, %r15
        adoxq %rdx, %r14
        # A[3] * B[3]
        movq 24(%rax), %rdx
        adoxq %rcx, %r15
        mulxq 24(%rsi), %rcx, %r8
        adoxq %rbp, %rbx
        adcxq %rcx, %rbx
        # A[0] * B[3]
        mulxq %r9, %rdx, %rcx
        adcxq %r8, %rbp
        xorq %r8, %r8
        adcxq %rdx, %r13
        # A[3] * B[0]
        movq 24(%rsi), %rdx
        adcxq %rcx, %r14
        mulxq (%rax), %rdx, %rcx
        adoxq %rdx, %r13
        adoxq %rcx, %r14
        # A[3] * B[2]
        movq 24(%rsi), %rdx
        mulxq 16(%rax), %rdx, %rcx
        adcxq %rdx, %r15
        # A[2] * B[3]
        movq 24(%rax), %rdx
        adcxq %rcx, %rbx
        mulxq 16(%rsi), %rcx, %rdx
        adcxq %r8, %rbp
        adoxq %rcx, %r15
        adoxq %rdx, %rbx
        adoxq %r8, %rbp
        # Reduce: same 38 / 19 folding as after the first product
        movq $38, %rdx
        mulxq %rbp, %rbp, %rcx
        addq %rbp, %r13
        adcq $0x00, %rcx
        movq $0x7fffffffffffffff, %r8
        shldq $0x01, %r13, %rcx
        imulq $19, %rcx, %rcx
        andq %r8, %r13
        xorq %r8, %r8
        adoxq %rcx, %r10
        mulxq %r14, %rcx, %r14
        adcxq %rcx, %r10
        adoxq %r14, %r11
        mulxq %r15, %rcx, %r15
        adcxq %rcx, %r11
        adoxq %r15, %r12
        mulxq %rbx, %rcx, %rbx
        adcxq %rcx, %r12
        adoxq %rbx, %r13
        adcxq %r8, %r13
        # Store
        movq %r10, (%rdi)
        movq %r11, 8(%rdi)
        movq %r12, 16(%rdi)
        movq %r13, 24(%rdi)
        # Third product: (%rax) = in + 32 times (%rsi) = in + 64 -> out + 32
        leaq -32(%rsi), %rax
        leaq -32(%rdi), %rdi
        movq (%rsi), %r9
        # Multiply
        # A[0] * B[0]
        movq (%rax), %rdx
        mulxq %r9, %r10, %r11
        # A[2] * B[0]
        mulxq 16(%rsi), %r12, %r13
        # A[1] * B[0]
        mulxq 8(%rsi), %rcx, %r8
        xorq %rbp, %rbp
        adcxq %rcx, %r11
        # A[3] * B[1]
        movq 8(%rax), %rdx
        mulxq 24(%rsi), %r14, %r15
        adcxq %r8, %r12
        # A[0] * B[1]
        mulxq %r9, %rcx, %r8
        adoxq %rcx, %r11
        # A[2] * B[1]
        mulxq 16(%rsi), %rcx, %rbx
        adoxq %r8, %r12
        adcxq %rcx, %r13
        # A[1] * B[2]
        movq 16(%rax), %rdx
        mulxq 8(%rsi), %rcx, %r8
        adcxq %rbx, %r14
        adoxq %rcx, %r13
        adcxq %rbp, %r15
        adoxq %r8, %r14
        # A[0] * B[2]
        mulxq %r9, %rcx, %r8
        adoxq %rbp, %r15
        xorq %rbx, %rbx
        adcxq %rcx, %r12
        # A[1] * B[1]
        movq 8(%rax), %rdx
        mulxq 8(%rsi), %rdx, %rcx
        adcxq %r8, %r13
        adoxq %rdx, %r12
        # A[1] * B[3]
        movq 24(%rax), %rdx
        adoxq %rcx, %r13
        mulxq 8(%rsi), %rcx, %r8
        adcxq %rcx, %r14
        # A[2] * B[2]
        movq 16(%rax), %rdx
        mulxq 16(%rsi), %rdx, %rcx
        adcxq %r8, %r15
        adoxq %rdx, %r14
        # A[3] * B[3]
        movq 24(%rax), %rdx
        adoxq %rcx, %r15
        mulxq 24(%rsi), %rcx, %r8
        adoxq %rbp, %rbx
        adcxq %rcx, %rbx
        # A[0] * B[3]
        mulxq %r9, %rdx, %rcx
        adcxq %r8, %rbp
        xorq %r8, %r8
        adcxq %rdx, %r13
        # A[3] * B[0]
        movq 24(%rsi), %rdx
        adcxq %rcx, %r14
        mulxq (%rax), %rdx, %rcx
        adoxq %rdx, %r13
        adoxq %rcx, %r14
        # A[3] * B[2]
        movq 24(%rsi), %rdx
        mulxq 16(%rax), %rdx, %rcx
        adcxq %rdx, %r15
        # A[2] * B[3]
        movq 24(%rax), %rdx
        adcxq %rcx, %rbx
        mulxq 16(%rsi), %rcx, %rdx
        adcxq %r8, %rbp
        adoxq %rcx, %r15
        adoxq %rdx, %rbx
        adoxq %r8, %rbp
        # Reduce: same 38 / 19 folding as after the first product
        movq $38, %rdx
        mulxq %rbp, %rbp, %rcx
        addq %rbp, %r13
        adcq $0x00, %rcx
        movq $0x7fffffffffffffff, %r8
        shldq $0x01, %r13, %rcx
        imulq $19, %rcx, %rcx
        andq %r8, %r13
        xorq %r8, %r8
        adoxq %rcx, %r10
        mulxq %r14, %rcx, %r14
        adcxq %rcx, %r10
        adoxq %r14, %r11
        mulxq %r15, %rcx, %r15
        adcxq %rcx, %r11
        adoxq %r15, %r12
        mulxq %rbx, %rcx, %rbx
        adcxq %rcx, %r12
        adoxq %rbx, %r13
        adcxq %r8, %r13
        # Store
        movq %r10, (%rdi)
        movq %r11, 8(%rdi)
        movq %r12, 16(%rdi)
        movq %r13, 24(%rdi)
        # Release the 16-byte frame and restore the callee-saved registers
        addq $16, %rsp
        popq %rbp
        popq %rbx
        popq %r15
        popq %r14
        popq %r13
        popq %r12
        repz retq
#ifndef __APPLE__
.size ge_p1p1_to_p2_avx2,.-ge_p1p1_to_p2_avx2
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl
ge_p1p1_to_p3_avx2
.type	ge_p1p1_to_p3_avx2,@function
.align	16
ge_p1p1_to_p3_avx2:
#else
.section	__TEXT,__text
.globl	_ge_p1p1_to_p3_avx2
.p2align	4
_ge_p1p1_to_p3_avx2:
#endif /* __APPLE__ */
/*
 * ge_p1p1_to_p3_avx2
 *
 * Performs four multiplications of 4 x 64-bit limb values. Each product is
 * reduced back to four limbs by folding the upper limbs in with the
 * constants 38 and 19 (consistent with arithmetic modulo 2^255 - 19).
 *
 * In:   %rdi = destination, %rsi = source. Both are accessed as four
 *       32-byte elements at byte offsets 0, 32, 64 and 96.
 * Out:  dst+0  = (src+0)  * (src+96)
 *       dst+96 = (src+0)  * (src+32)
 *       dst+32 = (src+64) * (src+32)
 *       dst+64 = (src+64) * (src+96)
 * NOTE(review): the four elements are presumably the coordinate fields of
 * the ge_p1p1 / ge_p3 structures - confirm against the C declarations.
 *
 * Uses MULX with the ADCX (CF) and ADOX (OF) carry chains; the instruction
 * order inside each multiply is significant. %r12-%r15, %rbx and %rbp are
 * pushed on entry and popped before returning. Both pointer arguments are
 * stored in a 16-byte stack area but are not read back by this function.
 */
        pushq	%r12
        pushq	%r13
        pushq	%r14
        pushq	%r15
        pushq	%rbx
        pushq	%rbp
        subq	$16, %rsp
        movq	%rdi, (%rsp)
        movq	%rsi, 8(%rsp)
        /* dst+0 = (src+0) * (src+96): A = (%rsi), B = (%rax), %r9 = A[0] */
        leaq	96(%rsi), %rax
        movq	(%rsi), %r9
        # Multiply
        # A[0] * B[0]
        movq	(%rax), %rdx
        mulxq	%r9, %r10, %r11
        # A[2] * B[0]
        mulxq	16(%rsi), %r12, %r13
        # A[1] * B[0]
        mulxq	8(%rsi), %rcx, %r8
        xorq	%rbp, %rbp
        adcxq	%rcx, %r11
        # A[3] * B[1]
        movq	8(%rax), %rdx
        mulxq	24(%rsi), %r14, %r15
        adcxq	%r8, %r12
        # A[0] * B[1]
        mulxq	%r9, %rcx, %r8
        adoxq	%rcx, %r11
        # A[2] * B[1]
        mulxq	16(%rsi), %rcx, %rbx
        adoxq	%r8, %r12
        adcxq	%rcx, %r13
        # A[1] * B[2]
        movq	16(%rax), %rdx
        mulxq	8(%rsi), %rcx, %r8
        adcxq	%rbx, %r14
        adoxq	%rcx, %r13
        adcxq	%rbp, %r15
        adoxq	%r8, %r14
        # A[0] * B[2]
        mulxq	%r9, %rcx, %r8
        adoxq	%rbp, %r15
        xorq	%rbx, %rbx
        adcxq	%rcx, %r12
        # A[1] * B[1]
        movq	8(%rax), %rdx
        mulxq	8(%rsi), %rdx, %rcx
        adcxq	%r8, %r13
        adoxq	%rdx, %r12
        # A[1] * B[3]
        movq	24(%rax), %rdx
        adoxq	%rcx, %r13
        mulxq	8(%rsi), %rcx, %r8
        adcxq	%rcx, %r14
        # A[2] * B[2]
        movq	16(%rax), %rdx
        mulxq	16(%rsi), %rdx, %rcx
        adcxq	%r8, %r15
        adoxq	%rdx, %r14
        # A[3] * B[3]
        movq	24(%rax), %rdx
        adoxq	%rcx, %r15
        mulxq	24(%rsi), %rcx, %r8
        adoxq	%rbp, %rbx
        adcxq	%rcx, %rbx
        # A[0] * B[3]
        mulxq	%r9, %rdx, %rcx
        adcxq	%r8, %rbp
        xorq	%r8, %r8
        adcxq	%rdx, %r13
        # A[3] * B[0]
        movq	24(%rsi), %rdx
        adcxq	%rcx, %r14
        mulxq	(%rax), %rdx, %rcx
        adoxq	%rdx, %r13
        adoxq	%rcx, %r14
        # A[3] * B[2]
        movq	24(%rsi), %rdx
        mulxq	16(%rax), %rdx, %rcx
        adcxq	%rdx, %r15
        # A[2] * B[3]
        movq	24(%rax), %rdx
        adcxq	%rcx, %rbx
        mulxq	16(%rsi), %rcx, %rdx
        adcxq	%r8, %rbp
        adoxq	%rcx, %r15
        adoxq	%rdx, %rbx
        adoxq	%r8, %rbp
        /* Reduce: upper limbs (%r14,%r15,%rbx,%rbp) * 38 and the bits from
         * bit 255 upward * 19 are added into %r10..%r13. */
        movq	$38, %rdx
        mulxq	%rbp, %rbp, %rcx
        addq	%rbp, %r13
        adcq	$0x00, %rcx
        movq	$0x7fffffffffffffff, %r8
        shldq	$0x01, %r13, %rcx
        imulq	$19, %rcx, %rcx
        andq	%r8, %r13
        xorq	%r8, %r8
        adoxq	%rcx, %r10
        mulxq	%r14, %rcx, %r14
        adcxq	%rcx, %r10
        adoxq	%r14, %r11
        mulxq	%r15, %rcx, %r15
        adcxq	%rcx, %r11
        adoxq	%r15, %r12
        mulxq	%rbx, %rcx, %rbx
        adcxq	%rcx, %r12
        adoxq	%rbx, %r13
        adcxq	%r8, %r13
        # Store
        movq	%r10, (%rdi)
        movq	%r11, 8(%rdi)
        movq	%r12, 16(%rdi)
        movq	%r13, 24(%rdi)
        /* dst+96 = (src+0) * (src+32) */
        leaq	32(%rsi), %rax
        leaq	96(%rdi), %rdi
        movq	(%rsi), %r9
        # Multiply
        # A[0] * B[0]
        movq	(%rax), %rdx
        mulxq	%r9, %r10, %r11
        # A[2] * B[0]
        mulxq	16(%rsi), %r12, %r13
        # A[1] * B[0]
        mulxq	8(%rsi), %rcx, %r8
        xorq	%rbp, %rbp
        adcxq	%rcx, %r11
        # A[3] * B[1]
        movq	8(%rax), %rdx
        mulxq	24(%rsi), %r14, %r15
        adcxq	%r8, %r12
        # A[0] * B[1]
        mulxq	%r9, %rcx, %r8
        adoxq	%rcx, %r11
        # A[2] * B[1]
        mulxq	16(%rsi), %rcx, %rbx
        adoxq	%r8, %r12
        adcxq	%rcx, %r13
        # A[1] * B[2]
        movq	16(%rax), %rdx
        mulxq	8(%rsi), %rcx, %r8
        adcxq	%rbx, %r14
        adoxq	%rcx, %r13
        adcxq	%rbp, %r15
        adoxq	%r8, %r14
        # A[0] * B[2]
        mulxq	%r9, %rcx, %r8
        adoxq	%rbp, %r15
        xorq	%rbx, %rbx
        adcxq	%rcx, %r12
        # A[1] * B[1]
        movq	8(%rax), %rdx
        mulxq	8(%rsi), %rdx, %rcx
        adcxq	%r8, %r13
        adoxq	%rdx, %r12
        # A[1] * B[3]
        movq	24(%rax), %rdx
        adoxq	%rcx, %r13
        mulxq	8(%rsi), %rcx, %r8
        adcxq	%rcx, %r14
        # A[2] * B[2]
        movq	16(%rax), %rdx
        mulxq	16(%rsi), %rdx, %rcx
        adcxq	%r8, %r15
        adoxq	%rdx, %r14
        # A[3] * B[3]
        movq	24(%rax), %rdx
        adoxq	%rcx, %r15
        mulxq	24(%rsi), %rcx, %r8
        adoxq	%rbp, %rbx
        adcxq	%rcx, %rbx
        # A[0] * B[3]
        mulxq	%r9, %rdx, %rcx
        adcxq	%r8, %rbp
        xorq	%r8, %r8
        adcxq	%rdx, %r13
        # A[3] * B[0]
        movq	24(%rsi), %rdx
        adcxq	%rcx, %r14
        mulxq	(%rax), %rdx, %rcx
        adoxq	%rdx, %r13
        adoxq	%rcx, %r14
        # A[3] * B[2]
        movq	24(%rsi), %rdx
        mulxq	16(%rax), %rdx, %rcx
        adcxq	%rdx, %r15
        # A[2] * B[3]
        movq	24(%rax), %rdx
        adcxq	%rcx, %rbx
        mulxq	16(%rsi), %rcx, %rdx
        adcxq	%r8, %rbp
        adoxq	%rcx, %r15
        adoxq	%rdx, %rbx
        adoxq	%r8, %rbp
        /* Reduce: upper limbs * 38, bits from bit 255 upward * 19 */
        movq	$38, %rdx
        mulxq	%rbp, %rbp, %rcx
        addq	%rbp, %r13
        adcq	$0x00, %rcx
        movq	$0x7fffffffffffffff, %r8
        shldq	$0x01, %r13, %rcx
        imulq	$19, %rcx, %rcx
        andq	%r8, %r13
        xorq	%r8, %r8
        adoxq	%rcx, %r10
        mulxq	%r14, %rcx, %r14
        adcxq	%rcx, %r10
        adoxq	%r14, %r11
        mulxq	%r15, %rcx, %r15
        adcxq	%rcx, %r11
        adoxq	%r15, %r12
        mulxq	%rbx, %rcx, %rbx
        adcxq	%rcx, %r12
        adoxq	%rbx, %r13
        adcxq	%r8, %r13
        # Store
        movq	%r10, (%rdi)
        movq	%r11, 8(%rdi)
        movq	%r12, 16(%rdi)
        movq	%r13, 24(%rdi)
        /* dst+32 = (src+64) * (src+32); %rax still points at src+32 */
        leaq	64(%rsi), %rsi
        leaq	-64(%rdi), %rdi
        movq	(%rsi), %r9
        # Multiply
        # A[0] * B[0]
        movq	(%rax), %rdx
        mulxq	%r9, %r10, %r11
        # A[2] * B[0]
        mulxq	16(%rsi), %r12, %r13
        # A[1] * B[0]
        mulxq	8(%rsi), %rcx, %r8
        xorq	%rbp, %rbp
        adcxq	%rcx, %r11
        # A[3] * B[1]
        movq	8(%rax), %rdx
        mulxq	24(%rsi), %r14, %r15
        adcxq	%r8, %r12
        # A[0] * B[1]
        mulxq	%r9, %rcx, %r8
        adoxq	%rcx, %r11
        # A[2] * B[1]
        mulxq	16(%rsi), %rcx, %rbx
        adoxq	%r8, %r12
        adcxq	%rcx, %r13
        # A[1] * B[2]
        movq	16(%rax), %rdx
        mulxq	8(%rsi), %rcx, %r8
        adcxq	%rbx, %r14
        adoxq	%rcx, %r13
        adcxq	%rbp, %r15
        adoxq	%r8, %r14
        # A[0] * B[2]
        mulxq	%r9, %rcx, %r8
        adoxq	%rbp, %r15
        xorq	%rbx, %rbx
        adcxq	%rcx, %r12
        # A[1] * B[1]
        movq	8(%rax), %rdx
        mulxq	8(%rsi), %rdx, %rcx
        adcxq	%r8, %r13
        adoxq	%rdx, %r12
        # A[1] * B[3]
        movq	24(%rax), %rdx
        adoxq	%rcx, %r13
        mulxq	8(%rsi), %rcx, %r8
        adcxq	%rcx, %r14
        # A[2] * B[2]
        movq	16(%rax), %rdx
        mulxq	16(%rsi), %rdx, %rcx
        adcxq	%r8, %r15
        adoxq	%rdx, %r14
        # A[3] * B[3]
        movq	24(%rax), %rdx
        adoxq	%rcx, %r15
        mulxq	24(%rsi), %rcx, %r8
        adoxq	%rbp, %rbx
        adcxq	%rcx, %rbx
        # A[0] * B[3]
        mulxq	%r9, %rdx, %rcx
        adcxq	%r8, %rbp
        xorq	%r8, %r8
        adcxq	%rdx, %r13
        # A[3] * B[0]
        movq	24(%rsi), %rdx
        adcxq	%rcx, %r14
        mulxq	(%rax), %rdx, %rcx
        adoxq	%rdx, %r13
        adoxq	%rcx, %r14
        # A[3] * B[2]
        movq	24(%rsi), %rdx
        mulxq	16(%rax), %rdx, %rcx
        adcxq	%rdx, %r15
        # A[2] * B[3]
        movq	24(%rax), %rdx
        adcxq	%rcx, %rbx
        mulxq	16(%rsi), %rcx, %rdx
        adcxq	%r8, %rbp
        adoxq	%rcx, %r15
        adoxq	%rdx, %rbx
        adoxq	%r8, %rbp
        /* Reduce: upper limbs * 38, bits from bit 255 upward * 19 */
        movq	$38, %rdx
        mulxq	%rbp, %rbp, %rcx
        addq	%rbp, %r13
        adcq	$0x00, %rcx
        movq	$0x7fffffffffffffff, %r8
        shldq	$0x01, %r13, %rcx
        imulq	$19, %rcx, %rcx
        andq	%r8, %r13
        xorq	%r8, %r8
        adoxq	%rcx, %r10
        mulxq	%r14, %rcx, %r14
        adcxq	%rcx, %r10
        adoxq	%r14, %r11
        mulxq	%r15, %rcx, %r15
        adcxq	%rcx, %r11
        adoxq	%r15, %r12
        mulxq	%rbx, %rcx, %rbx
        adcxq	%rcx, %r12
        adoxq	%rbx, %r13
        adcxq	%r8, %r13
        # Store
        movq	%r10, (%rdi)
        movq	%r11, 8(%rdi)
        movq	%r12, 16(%rdi)
        movq	%r13, 24(%rdi)
        /* dst+64 = (src+64) * (src+96); %rsi already points at src+64 */
        leaq	32(%rsi), %rax
        leaq	32(%rdi), %rdi
        movq	(%rsi), %r9
        # Multiply
        # A[0] * B[0]
        movq	(%rax), %rdx
        mulxq	%r9, %r10, %r11
        # A[2] * B[0]
        mulxq	16(%rsi), %r12, %r13
        # A[1] * B[0]
        mulxq	8(%rsi), %rcx, %r8
        xorq	%rbp, %rbp
        adcxq	%rcx, %r11
        # A[3] * B[1]
        movq	8(%rax), %rdx
        mulxq	24(%rsi), %r14, %r15
        adcxq	%r8, %r12
        # A[0] * B[1]
        mulxq	%r9, %rcx, %r8
        adoxq	%rcx, %r11
        # A[2] * B[1]
        mulxq	16(%rsi), %rcx, %rbx
        adoxq	%r8, %r12
        adcxq	%rcx, %r13
        # A[1] * B[2]
        movq	16(%rax), %rdx
        mulxq	8(%rsi), %rcx, %r8
        adcxq	%rbx, %r14
        adoxq	%rcx, %r13
        adcxq	%rbp, %r15
        adoxq	%r8, %r14
        # A[0] * B[2]
        mulxq	%r9, %rcx, %r8
        adoxq	%rbp, %r15
        xorq	%rbx, %rbx
        adcxq	%rcx, %r12
        # A[1] * B[1]
        movq	8(%rax), %rdx
        mulxq	8(%rsi), %rdx, %rcx
        adcxq	%r8, %r13
        adoxq	%rdx, %r12
        # A[1] * B[3]
        movq	24(%rax), %rdx
        adoxq	%rcx, %r13
        mulxq	8(%rsi), %rcx, %r8
        adcxq	%rcx, %r14
        # A[2] * B[2]
        movq	16(%rax), %rdx
        mulxq	16(%rsi), %rdx, %rcx
        adcxq	%r8, %r15
        adoxq	%rdx, %r14
        # A[3] * B[3]
        movq	24(%rax), %rdx
        adoxq	%rcx, %r15
        mulxq	24(%rsi), %rcx, %r8
        adoxq	%rbp, %rbx
        adcxq	%rcx, %rbx
        # A[0] * B[3]
        mulxq	%r9, %rdx, %rcx
        adcxq	%r8, %rbp
        xorq	%r8, %r8
        adcxq	%rdx, %r13
        # A[3] * B[0]
        movq	24(%rsi), %rdx
        adcxq	%rcx, %r14
        mulxq	(%rax), %rdx, %rcx
        adoxq	%rdx, %r13
        adoxq	%rcx, %r14
        # A[3] * B[2]
        movq	24(%rsi), %rdx
        mulxq	16(%rax), %rdx, %rcx
        adcxq	%rdx, %r15
        # A[2] * B[3]
        movq	24(%rax), %rdx
        adcxq	%rcx, %rbx
        mulxq	16(%rsi), %rcx, %rdx
        adcxq	%r8, %rbp
        adoxq	%rcx, %r15
        adoxq	%rdx, %rbx
        adoxq	%r8, %rbp
        /* Reduce: upper limbs * 38, bits from bit 255 upward * 19 */
        movq	$38, %rdx
        mulxq	%rbp, %rbp, %rcx
        addq	%rbp, %r13
        adcq	$0x00, %rcx
        movq	$0x7fffffffffffffff, %r8
        shldq	$0x01, %r13, %rcx
        imulq	$19, %rcx, %rcx
        andq	%r8, %r13
        xorq	%r8, %r8
        adoxq	%rcx, %r10
        mulxq	%r14, %rcx, %r14
        adcxq	%rcx, %r10
        adoxq	%r14, %r11
        mulxq	%r15, %rcx, %r15
        adcxq	%rcx, %r11
        adoxq	%r15, %r12
        mulxq	%rbx, %rcx, %rbx
        adcxq	%rcx, %r12
        adoxq	%rbx, %r13
        adcxq	%r8, %r13
        # Store
        movq	%r10, (%rdi)
        movq	%r11, 8(%rdi)
        movq	%r12, 16(%rdi)
        movq	%r13, 24(%rdi)
        /* Epilogue: release the 16-byte stack area, then restore registers */
        addq	$16, %rsp
        popq	%rbp
/* Tail of ge_p1p1_to_p3_avx2: restore the remaining saved registers, return. */
        popq	%rbx
        popq	%r15
        popq	%r14
        popq	%r13
        popq	%r12
        repz retq
#ifndef __APPLE__
.size	ge_p1p1_to_p3_avx2,.-ge_p1p1_to_p3_avx2
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl	ge_p2_dbl_avx2
.type	ge_p2_dbl_avx2,@function
.align	16
ge_p2_dbl_avx2:
#else
.section	__TEXT,__text
.globl	_ge_p2_dbl_avx2
.p2align	4
_ge_p2_dbl_avx2:
#endif /* __APPLE__ */
/*
 * ge_p2_dbl_avx2
 *
 * In:   %rdi = destination, %rsi = source. Both are accessed as 32-byte
 *       elements (four 64-bit limbs) at byte offsets 0, 32, 64 (and 96 for
 *       the destination). The source pointer is kept at 8(%rsp).
 * Steps, in execution order (all results reduced with the 38 / 19 folding,
 * consistent with arithmetic modulo 2^255 - 19):
 *   1. dst+64 = (src+0)^2
 *   2. t = (src+32)^2;  dst+32 = t + dst[64];  dst+64 = t - dst[64]
 *   3. dst+0 = (src+32) + (src+0);  t = (dst+0)^2;  dst+0 = t - dst[32]
 *   4. t = 2 * (src+64)^2;  dst+96 = t - dst[64]
 * NOTE(review): the elements are presumably the coordinate fields of a
 * ge_p2 source and a ge_p1p1 destination - confirm against the C code.
 *
 * Uses MULX/ADCX/ADOX; instruction order inside each square is significant.
 * %r12-%r15, %rbx and %rbp are pushed on entry and popped before returning.
 */
        pushq	%r12
        pushq	%r13
        pushq	%r14
        pushq	%r15
        pushq	%rbx
        pushq	%rbp
        subq	$16, %rsp
        movq	%rdi, (%rsp)
        movq	%rsi, 8(%rsp)
        /* Step 1: dst+64 = (src+0)^2 */
        leaq	64(%rdi), %rdi
        # Square
        movq	(%rsi), %rdx
        movq	8(%rsi), %r9
        # A[0] * A[1]
        movq	%rdx, %rbp
        mulxq	%r9, %r11, %r12
        # A[0] * A[3]
        mulxq	24(%rsi), %r13, %r14
        # A[2] * A[1]
        movq	16(%rsi), %rdx
        mulxq	%r9, %rcx, %r8
        xorq	%r10, %r10
        adoxq	%rcx, %r13
        # A[2] * A[3]
        mulxq	24(%rsi), %r15, %rbx
        adoxq	%r8, %r14
        # A[2] * A[0]
        mulxq	%rbp, %rcx, %r8
        adoxq	%r10, %r15
        adcxq	%rcx, %r12
        adoxq	%r10, %rbx
        # A[1] * A[3]
        movq	%r9, %rdx
        mulxq	24(%rsi), %rcx, %rdx
        adcxq	%r8, %r13
        adcxq	%rcx, %r14
        adcxq	%rdx, %r15
        adcxq	%r10, %rbx
        /* Double the cross products (adcx r,r) while adding the squares (adox) */
        # A[0] * A[0]
        movq	%rbp, %rdx
        mulxq	%rdx, %r10, %rcx
        xorq	%rbp, %rbp
        adcxq	%r11, %r11
        # A[1] * A[1]
        movq	%r9, %rdx
        adoxq	%rcx, %r11
        mulxq	%rdx, %rcx, %r8
        adcxq	%r12, %r12
        adoxq	%rcx, %r12
        adcxq	%r13, %r13
        # A[2] * A[2]
        movq	16(%rsi), %rdx
        adoxq	%r8, %r13
        mulxq	%rdx, %r8, %rcx
        adcxq	%r14, %r14
        adoxq	%r8, %r14
        adcxq	%r15, %r15
        # A[3] * A[3]
        movq	24(%rsi), %rdx
        adoxq	%rcx, %r15
        mulxq	%rdx, %rcx, %r8
        adcxq	%rbx, %rbx
        adoxq	%rcx, %rbx
        adcxq	%rbp, %rbp
        adoxq	%r8, %rbp
        /* Reduce: upper limbs (%r14,%r15,%rbx,%rbp) * 38 and the bits from
         * bit 255 upward * 19 are added into %r10..%r13. */
        movq	$38, %rdx
        mulxq	%rbp, %rbp, %r8
        addq	%rbp, %r13
        adcq	$0x00, %r8
        movq	$0x7fffffffffffffff, %rcx
        shldq	$0x01, %r13, %r8
        imulq	$19, %r8, %r8
        andq	%rcx, %r13
        xorq	%rcx, %rcx
        adoxq	%r8, %r10
        mulxq	%r14, %r8, %r14
        adcxq	%r8, %r10
        adoxq	%r14, %r11
        mulxq	%r15, %r8, %r15
        adcxq	%r8, %r11
        adoxq	%r15, %r12
        mulxq	%rbx, %r8, %rbx
        adcxq	%r8, %r12
        adoxq	%rbx, %r13
        adcxq	%rcx, %r13
        # Store
        movq	%r10, (%rdi)
        movq	%r11, 8(%rdi)
        movq	%r12, 16(%rdi)
        movq	%r13, 24(%rdi)
        /* Step 2: t = (src+32)^2, kept in %r10..%r13 */
        leaq	32(%rsi), %rsi
        # Square
        movq	(%rsi), %rdx
        movq	8(%rsi), %r9
        # A[0] * A[1]
        movq	%rdx, %rbp
        mulxq	%r9, %r11, %r12
        # A[0] * A[3]
        mulxq	24(%rsi), %r13, %r14
        # A[2] * A[1]
        movq	16(%rsi), %rdx
        mulxq	%r9, %rcx, %r8
        xorq	%r10, %r10
        adoxq	%rcx, %r13
        # A[2] * A[3]
        mulxq	24(%rsi), %r15, %rbx
        adoxq	%r8, %r14
        # A[2] * A[0]
        mulxq	%rbp, %rcx, %r8
        adoxq	%r10, %r15
        adcxq	%rcx, %r12
        adoxq	%r10, %rbx
        # A[1] * A[3]
        movq	%r9, %rdx
        mulxq	24(%rsi), %rcx, %rdx
        adcxq	%r8, %r13
        adcxq	%rcx, %r14
        adcxq	%rdx, %r15
        adcxq	%r10, %rbx
        # A[0] * A[0]
        movq	%rbp, %rdx
        mulxq	%rdx, %r10, %rcx
        xorq	%rbp, %rbp
        adcxq	%r11, %r11
        # A[1] * A[1]
        movq	%r9, %rdx
        adoxq	%rcx, %r11
        mulxq	%rdx, %rcx, %r8
        adcxq	%r12, %r12
        adoxq	%rcx, %r12
        adcxq	%r13, %r13
        # A[2] * A[2]
        movq	16(%rsi), %rdx
        adoxq	%r8, %r13
        mulxq	%rdx, %r8, %rcx
        adcxq	%r14, %r14
        adoxq	%r8, %r14
        adcxq	%r15, %r15
        # A[3] * A[3]
        movq	24(%rsi), %rdx
        adoxq	%rcx, %r15
        mulxq	%rdx, %rcx, %r8
        adcxq	%rbx, %rbx
        adoxq	%rcx, %rbx
        adcxq	%rbp, %rbp
        adoxq	%r8, %rbp
        /* Reduce: upper limbs * 38, bits from bit 255 upward * 19 */
        movq	$38, %rdx
        mulxq	%rbp, %rbp, %r8
        addq	%rbp, %r13
        adcq	$0x00, %r8
        movq	$0x7fffffffffffffff, %rcx
        shldq	$0x01, %r13, %r8
        imulq	$19, %r8, %r8
        andq	%rcx, %r13
        xorq	%rcx, %rcx
        adoxq	%r8, %r10
        mulxq	%r14, %r8, %r14
        adcxq	%r8, %r10
        adoxq	%r14, %r11
        mulxq	%r15, %r8, %r15
        adcxq	%r8, %r11
        adoxq	%r15, %r12
        mulxq	%rbx, %r8, %rbx
        adcxq	%r8, %r12
        adoxq	%rbx, %r13
        adcxq	%rcx, %r13
        # Store
        /* %rsi = dst+64 (result of step 1), %rdi = dst+32 */
        movq	%rdi, %rsi
        leaq	-32(%rdi), %rdi
        # Add-Sub
        # Add
        movq	%r10, %r14
        addq	(%rsi), %r10
        movq	%r11, %r15
        adcq	8(%rsi), %r11
        movq	%r12, %rbx
        adcq	16(%rsi), %r12
        movq	%r13, %rbp
        adcq	24(%rsi), %r13
        movq	$0x00, %rdx
        adcq	$0x00, %rdx
        shldq	$0x01, %r13, %rdx
        imulq	$19, %rdx
        btr	$63, %r13
        # Sub modulus (if overflow)
        addq	%rdx, %r10
        adcq	$0x00, %r11
        adcq	$0x00, %r12
        adcq	$0x00, %r13
        # Sub
        subq	(%rsi), %r14
        sbbq	8(%rsi), %r15
        sbbq	16(%rsi), %rbx
        sbbq	24(%rsi), %rbp
        sbbq	%rdx, %rdx
        shldq	$0x01, %rbp, %rdx
        imulq	$-19, %rdx
        btr	$63, %rbp
        # Add modulus (if underflow)
        subq	%rdx, %r14
        sbbq	$0x00, %r15
        sbbq	$0x00, %rbx
        sbbq	$0x00, %rbp
        /* Sum goes to dst+32, difference overwrites dst+64 */
        movq	%r10, (%rdi)
        movq	%r11, 8(%rdi)
        movq	%r12, 16(%rdi)
        movq	%r13, 24(%rdi)
        movq	%r14, (%rsi)
        movq	%r15, 8(%rsi)
        movq	%rbx, 16(%rsi)
        movq	%rbp, 24(%rsi)
        /* Step 3: reload the source pointer; dst+0 = (src+32) + (src+0) */
        movq	8(%rsp), %rax
        leaq	32(%rax), %rsi
        leaq	-32(%rdi), %rdi
        # Add
        movq	(%rsi), %r10
        movq	8(%rsi), %r11
        addq	(%rax), %r10
        movq	16(%rsi), %r12
        adcq	8(%rax), %r11
        movq	24(%rsi), %r13
        adcq	16(%rax), %r12
        adcq	24(%rax), %r13
        movq	$0x00, %rdx
        adcq	$0x00, %rdx
        shldq	$0x01, %r13, %rdx
        imulq	$19, %rdx
        btr	$63, %r13
        # Sub modulus (if overflow)
        addq	%rdx, %r10
        adcq	$0x00, %r11
        adcq	$0x00, %r12
        adcq	$0x00, %r13
        movq	%r10, (%rdi)
        movq	%r11, 8(%rdi)
        movq	%r12, 16(%rdi)
        movq	%r13, 24(%rdi)
        /* t = (dst+0)^2, kept in %r10..%r13 */
        # Square
        movq	(%rdi), %rdx
        movq	8(%rdi), %r9
        # A[0] * A[1]
        movq	%rdx, %rbp
        mulxq	%r9, %r11, %r12
        # A[0] * A[3]
        mulxq	24(%rdi), %r13, %r14
        # A[2] * A[1]
        movq	16(%rdi), %rdx
        mulxq	%r9, %rcx, %r8
        xorq	%r10, %r10
        adoxq	%rcx, %r13
        # A[2] * A[3]
        mulxq	24(%rdi), %r15, %rbx
        adoxq	%r8, %r14
        # A[2] * A[0]
        mulxq	%rbp, %rcx, %r8
        adoxq	%r10, %r15
        adcxq	%rcx, %r12
        adoxq	%r10, %rbx
        # A[1] * A[3]
        movq	%r9, %rdx
        mulxq	24(%rdi), %rcx, %rdx
        adcxq	%r8, %r13
        adcxq	%rcx, %r14
        adcxq	%rdx, %r15
        adcxq	%r10, %rbx
        # A[0] * A[0]
        movq	%rbp, %rdx
        mulxq	%rdx, %r10, %rcx
        xorq	%rbp, %rbp
        adcxq	%r11, %r11
        # A[1] * A[1]
        movq	%r9, %rdx
        adoxq	%rcx, %r11
        mulxq	%rdx, %rcx, %r8
        adcxq	%r12, %r12
        adoxq	%rcx, %r12
        adcxq	%r13, %r13
        # A[2] * A[2]
        movq	16(%rdi), %rdx
        adoxq	%r8, %r13
        mulxq	%rdx, %r8, %rcx
        adcxq	%r14, %r14
        adoxq	%r8, %r14
        adcxq	%r15, %r15
        # A[3] * A[3]
        movq	24(%rdi), %rdx
        adoxq	%rcx, %r15
        mulxq	%rdx, %rcx, %r8
        adcxq	%rbx, %rbx
        adoxq	%rcx, %rbx
        adcxq	%rbp, %rbp
        adoxq	%r8, %rbp
        /* Reduce: upper limbs * 38, bits from bit 255 upward * 19 */
        movq	$38, %rdx
        mulxq	%rbp, %rbp, %r8
        addq	%rbp, %r13
        adcq	$0x00, %r8
        movq	$0x7fffffffffffffff, %rcx
        shldq	$0x01, %r13, %r8
        imulq	$19, %r8, %r8
        andq	%rcx, %r13
        xorq	%rcx, %rcx
        adoxq	%r8, %r10
        mulxq	%r14, %r8, %r14
        adcxq	%r8, %r10
        adoxq	%r14, %r11
        mulxq	%r15, %r8, %r15
        adcxq	%r8, %r11
        adoxq	%r15, %r12
        mulxq	%rbx, %r8, %rbx
        adcxq	%r8, %r12
        adoxq	%rbx, %r13
        adcxq	%rcx, %r13
        # Store
        /* dst+0 = t - dst[32] */
        leaq	32(%rdi), %rsi
        # Sub
        subq	(%rsi), %r10
        sbbq	8(%rsi), %r11
        sbbq	16(%rsi), %r12
        sbbq	24(%rsi), %r13
        sbbq	%rdx, %rdx
        shldq	$0x01, %r13, %rdx
        imulq	$-19, %rdx
        btr	$63, %r13
        # Add modulus (if underflow)
        subq	%rdx, %r10
        sbbq	$0x00, %r11
        sbbq	$0x00, %r12
        sbbq	$0x00, %r13
        movq	%r10, (%rdi)
        movq	%r11, 8(%rdi)
        movq	%r12, 16(%rdi)
        movq	%r13, 24(%rdi)
        /* Step 4: t = 2 * (src+64)^2; %r9 is the scratch register here */
        leaq	64(%rax), %rax
        # Square * 2
        movq	(%rax), %rdx
        movq	8(%rax), %r9
        # A[0] * A[1]
        movq	%rdx, %rbp
        mulxq	%r9, %r11, %r12
        # A[0] * A[3]
        mulxq	24(%rax), %r13, %r14
        # A[2] * A[1]
        movq	16(%rax), %rdx
        mulxq	%r9, %rcx, %r8
        xorq	%r10, %r10
        adoxq	%rcx, %r13
        # A[2] * A[3]
        mulxq	24(%rax), %r15, %rbx
        adoxq	%r8, %r14
        # A[2] * A[0]
        mulxq	%rbp, %rcx, %r8
        adoxq	%r10, %r15
        adcxq	%rcx, %r12
        adoxq	%r10, %rbx
        # A[1] * A[3]
        movq	%r9, %rdx
        mulxq	24(%rax), %rcx, %rdx
        adcxq	%r8, %r13
        adcxq	%rcx, %r14
        adcxq	%rdx, %r15
        adcxq	%r10, %rbx
        # A[0] * A[0]
        movq	%rbp, %rdx
        mulxq	%rdx, %r10, %rcx
        xorq	%rbp, %rbp
        adcxq	%r11, %r11
        # A[1] * A[1]
        movq	%r9, %rdx
        adoxq	%rcx, %r11
        mulxq	%rdx, %rcx, %r8
        adcxq	%r12, %r12
        adoxq	%rcx, %r12
        adcxq	%r13, %r13
        # A[2] * A[2]
        movq	16(%rax), %rdx
        adoxq	%r8, %r13
        mulxq	%rdx, %r8, %rcx
        adcxq	%r14, %r14
        adoxq	%r8, %r14
        adcxq	%r15, %r15
        # A[3] * A[3]
        movq	24(%rax), %rdx
        adoxq	%rcx, %r15
        mulxq	%rdx, %rcx, %r8
        adcxq	%rbx, %rbx
        adoxq	%rcx, %rbx
        adcxq	%rbp, %rbp
        adoxq	%r8, %rbp
        /* Reduce: upper limbs * 38, bits from bit 255 upward * 19 */
        movq	$38, %rdx
        mulxq	%rbp, %rbp, %r9
        addq	%rbp, %r13
        adcq	$0x00, %r9
        movq	$0x7fffffffffffffff, %rcx
        shldq	$0x01, %r13, %r9
        imulq	$19, %r9, %r9
        andq	%rcx, %r13
        xorq	%rcx, %rcx
        adoxq	%r9, %r10
        mulxq	%r14, %r9, %r14
        adcxq	%r9, %r10
        adoxq	%r14, %r11
        mulxq	%r15, %r9, %r15
        adcxq	%r9, %r11
        adoxq	%r15, %r12
        mulxq	%rbx, %r9, %rbx
        adcxq	%r9, %r12
        adoxq	%rbx, %r13
        adcxq	%rcx, %r13
        /* Double: shift the four limbs left by one bit. The two bits that
         * leave the low 255 bits (old bits 62 and 63 of %r13, saved in %r9)
         * are multiplied by 19 and added back. */
        movq	%r13, %r9
        shldq	$0x01, %r12, %r13
        shldq	$0x01, %r11, %r12
        shldq	$0x01, %r10, %r11
        shlq	$1, %r10
        movq	$0x7fffffffffffffff, %rcx
        shrq	$62, %r9
        andq	%rcx, %r13
        imulq	$19, %r9, %r9
        addq	%r9, %r10
        adcq	$0x00, %r11
        adcq	$0x00, %r12
        adcq	$0x00, %r13
        # Store
        /* dst+96 = t - dst[64] */
        leaq	64(%rdi), %rsi
        leaq	96(%rdi), %rdi
        # Sub
        subq	(%rsi), %r10
        sbbq	8(%rsi), %r11
        sbbq	16(%rsi), %r12
        sbbq	24(%rsi), %r13
        sbbq	%rdx, %rdx
        shldq	$0x01, %r13, %rdx
/* Tail of ge_p2_dbl_avx2: finish the last subtraction, store dst+96, return. */
        imulq	$-19, %rdx
        btr	$63, %r13
        # Add modulus (if underflow)
        subq	%rdx, %r10
        sbbq	$0x00, %r11
        sbbq	$0x00, %r12
        sbbq	$0x00, %r13
        movq	%r10, (%rdi)
        movq	%r11, 8(%rdi)
        movq	%r12, 16(%rdi)
        movq	%r13, 24(%rdi)
        addq	$16, %rsp
        popq	%rbp
        popq	%rbx
        popq	%r15
        popq	%r14
        popq	%r13
        popq	%r12
        repz retq
#ifndef __APPLE__
.size	ge_p2_dbl_avx2,.-ge_p2_dbl_avx2
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl	ge_madd_avx2
.type	ge_madd_avx2,@function
.align	16
ge_madd_avx2:
#else
.section	__TEXT,__text
.globl	_ge_madd_avx2
.p2align	4
_ge_madd_avx2:
#endif /* __APPLE__ */
/*
 * ge_madd_avx2
 *
 * In:   %rdi = destination, %rsi = first source (p), %rdx = second source
 *       (q). %rdx is copied to %rax at once because MULX needs %rdx as its
 *       implicit operand; q is also kept at 16(%rsp). All three are accessed
 *       as 32-byte elements (four 64-bit limbs) at byte offsets 0, 32, 64
 *       and, for dst and p, 96.
 * Steps, in execution order (results reduced with the 38 / 19 folding,
 * consistent with arithmetic modulo 2^255 - 19):
 *   1. dst+96 = (p+96) * (q+64)
 *   2. dst+0  = (p+32) + (p+0);   dst+32 = (p+32) - (p+0)
 *   3. dst+0  = dst[0]  * (q+0)
 *   4. dst+32 = dst[32] * (q+32)
 *   5. a = dst[0], b = dst[32]:  dst+32 = a + b;  dst+0 = a - b
 *   6. d = 2 * (p+64):  dst+64 = d + dst[96];  dst+96 = d - dst[96]
 * NOTE(review): p is presumably a ge_p3, q a ge_precomp and dst a ge_p1p1 -
 * confirm against the C declarations.
 *
 * Uses MULX/ADCX/ADOX; instruction order inside each multiply is
 * significant. %r12-%r15, %rbx and %rbp are pushed on entry and popped
 * before returning.
 */
        pushq	%r12
        pushq	%r13
        pushq	%r14
        pushq	%r15
        pushq	%rbx
        pushq	%rbp
        movq	%rdx, %rax
        subq	$24, %rsp
        movq	%rdi, (%rsp)
        movq	%rsi, 8(%rsp)
        movq	%rax, 16(%rsp)
        /* Step 1: dst+96 = (p+96) * (q+64): A = (%rcx), B = (%rax) */
        leaq	96(%rsi), %rcx
        leaq	64(%rax), %rax
        leaq	96(%rdi), %rdi
        # Multiply
        # A[0] * B[0]
        movq	(%rax), %rdx
        mulxq	(%rcx), %r10, %r11
        # A[2] * B[0]
        mulxq	16(%rcx), %r12, %r13
        # A[1] * B[0]
        mulxq	8(%rcx), %r8, %r9
        xorq	%rbp, %rbp
        adcxq	%r8, %r11
        # A[3] * B[1]
        movq	8(%rax), %rdx
        mulxq	24(%rcx), %r14, %r15
        adcxq	%r9, %r12
        # A[0] * B[1]
        mulxq	(%rcx), %r8, %r9
        adoxq	%r8, %r11
        # A[2] * B[1]
        mulxq	16(%rcx), %r8, %rbx
        adoxq	%r9, %r12
        adcxq	%r8, %r13
        # A[1] * B[2]
        movq	16(%rax), %rdx
        mulxq	8(%rcx), %r8, %r9
        adcxq	%rbx, %r14
        adoxq	%r8, %r13
        adcxq	%rbp, %r15
        adoxq	%r9, %r14
        # A[0] * B[2]
        mulxq	(%rcx), %r8, %r9
        adoxq	%rbp, %r15
        xorq	%rbx, %rbx
        adcxq	%r8, %r12
        # A[1] * B[1]
        movq	8(%rax), %rdx
        mulxq	8(%rcx), %rdx, %r8
        adcxq	%r9, %r13
        adoxq	%rdx, %r12
        # A[1] * B[3]
        movq	24(%rax), %rdx
        adoxq	%r8, %r13
        mulxq	8(%rcx), %r8, %r9
        adcxq	%r8, %r14
        # A[2] * B[2]
        movq	16(%rax), %rdx
        mulxq	16(%rcx), %rdx, %r8
        adcxq	%r9, %r15
        adoxq	%rdx, %r14
        # A[3] * B[3]
        movq	24(%rax), %rdx
        adoxq	%r8, %r15
        mulxq	24(%rcx), %r8, %r9
        adoxq	%rbp, %rbx
        adcxq	%r8, %rbx
        # A[0] * B[3]
        mulxq	(%rcx), %rdx, %r8
        adcxq	%r9, %rbp
        xorq	%r9, %r9
        adcxq	%rdx, %r13
        # A[3] * B[0]
        movq	24(%rcx), %rdx
        adcxq	%r8, %r14
        mulxq	(%rax), %rdx, %r8
        adoxq	%rdx, %r13
        adoxq	%r8, %r14
        # A[3] * B[2]
        movq	24(%rcx), %rdx
        mulxq	16(%rax), %rdx, %r8
        adcxq	%rdx, %r15
        # A[2] * B[3]
        movq	24(%rax), %rdx
        adcxq	%r8, %rbx
        mulxq	16(%rcx), %r8, %rdx
        adcxq	%r9, %rbp
        adoxq	%r8, %r15
        adoxq	%rdx, %rbx
        adoxq	%r9, %rbp
        /* Reduce: upper limbs (%r14,%r15,%rbx,%rbp) * 38 and the bits from
         * bit 255 upward * 19 are added into %r10..%r13. */
        movq	$38, %rdx
        mulxq	%rbp, %rbp, %r8
        addq	%rbp, %r13
        adcq	$0x00, %r8
        movq	$0x7fffffffffffffff, %r9
        shldq	$0x01, %r13, %r8
        imulq	$19, %r8, %r8
        andq	%r9, %r13
        xorq	%r9, %r9
        adoxq	%r8, %r10
        mulxq	%r14, %r8, %r14
        adcxq	%r8, %r10
        adoxq	%r14, %r11
        mulxq	%r15, %r8, %r15
        adcxq	%r8, %r11
        adoxq	%r15, %r12
        mulxq	%rbx, %r8, %rbx
        adcxq	%r8, %r12
        adoxq	%rbx, %r13
        adcxq	%r9, %r13
        # Store
        movq	%r10, (%rdi)
        movq	%r11, 8(%rdi)
        movq	%r12, 16(%rdi)
        movq	%r13, 24(%rdi)
        /* Step 2: %rcx = p+0, %rax = p+32, %rsi = dst+32, %rdi = dst+0 */
        movq	%rsi, %rcx
        leaq	32(%rsi), %rax
        leaq	-64(%rdi), %rsi
        leaq	-96(%rdi), %rdi
        # Add-Sub
        # Add
        movq	(%rax), %r10
        movq	8(%rax), %r11
        movq	16(%rax), %r12
        movq	24(%rax), %r13
        movq	%r10, %r14
        addq	(%rcx), %r10
        movq	%r11, %r15
        adcq	8(%rcx), %r11
        movq	%r12, %rbx
        adcq	16(%rcx), %r12
        movq	%r13, %rbp
        adcq	24(%rcx), %r13
        movq	$0x00, %rdx
        adcq	$0x00, %rdx
        shldq	$0x01, %r13, %rdx
        imulq	$19, %rdx
        btr	$63, %r13
        # Sub modulus (if overflow)
        addq	%rdx, %r10
        adcq	$0x00, %r11
        adcq	$0x00, %r12
        adcq	$0x00, %r13
        # Sub
        subq	(%rcx), %r14
        sbbq	8(%rcx), %r15
        sbbq	16(%rcx), %rbx
        sbbq	24(%rcx), %rbp
        sbbq	%rdx, %rdx
        shldq	$0x01, %rbp, %rdx
        imulq	$-19, %rdx
        btr	$63, %rbp
        # Add modulus (if underflow)
        subq	%rdx, %r14
        sbbq	$0x00, %r15
        sbbq	$0x00, %rbx
        sbbq	$0x00, %rbp
        movq	%r10, (%rdi)
        movq	%r11, 8(%rdi)
        movq	%r12, 16(%rdi)
        movq	%r13, 24(%rdi)
        movq	%r14, (%rsi)
        movq	%r15, 8(%rsi)
        movq	%rbx, 16(%rsi)
        movq	%rbp, 24(%rsi)
        /* Step 3: reload q; dst+0 = dst[0] * (q+0) */
        movq	16(%rsp), %rax
        # Multiply
        # A[0] * B[0]
        movq	(%rax), %rdx
        mulxq	(%rdi), %r10, %r11
        # A[2] * B[0]
        mulxq	16(%rdi), %r12, %r13
        # A[1] * B[0]
        mulxq	8(%rdi), %r8, %r9
        xorq	%rbp, %rbp
        adcxq	%r8, %r11
        # A[3] * B[1]
        movq	8(%rax), %rdx
        mulxq	24(%rdi), %r14, %r15
        adcxq	%r9, %r12
        # A[0] * B[1]
        mulxq	(%rdi), %r8, %r9
        adoxq	%r8, %r11
        # A[2] * B[1]
        mulxq	16(%rdi), %r8, %rbx
        adoxq	%r9, %r12
        adcxq	%r8, %r13
        # A[1] * B[2]
        movq	16(%rax), %rdx
        mulxq	8(%rdi), %r8, %r9
        adcxq	%rbx, %r14
        adoxq	%r8, %r13
        adcxq	%rbp, %r15
        adoxq	%r9, %r14
        # A[0] * B[2]
        mulxq	(%rdi), %r8, %r9
        adoxq	%rbp, %r15
        xorq	%rbx, %rbx
        adcxq	%r8, %r12
        # A[1] * B[1]
        movq	8(%rax), %rdx
        mulxq	8(%rdi), %rdx, %r8
        adcxq	%r9, %r13
        adoxq	%rdx, %r12
        # A[1] * B[3]
        movq	24(%rax), %rdx
        adoxq	%r8, %r13
        mulxq	8(%rdi), %r8, %r9
        adcxq	%r8, %r14
        # A[2] * B[2]
        movq	16(%rax), %rdx
        mulxq	16(%rdi), %rdx, %r8
        adcxq	%r9, %r15
        adoxq	%rdx, %r14
        # A[3] * B[3]
        movq	24(%rax), %rdx
        adoxq	%r8, %r15
        mulxq	24(%rdi), %r8, %r9
        adoxq	%rbp, %rbx
        adcxq	%r8, %rbx
        # A[0] * B[3]
        mulxq	(%rdi), %rdx, %r8
        adcxq	%r9, %rbp
        xorq	%r9, %r9
        adcxq	%rdx, %r13
        # A[3] * B[0]
        movq	24(%rdi), %rdx
        adcxq	%r8, %r14
        mulxq	(%rax), %rdx, %r8
        adoxq	%rdx, %r13
        adoxq	%r8, %r14
        # A[3] * B[2]
        movq	24(%rdi), %rdx
        mulxq	16(%rax), %rdx, %r8
        adcxq	%rdx, %r15
        # A[2] * B[3]
        movq	24(%rax), %rdx
        adcxq	%r8, %rbx
        mulxq	16(%rdi), %r8, %rdx
        adcxq	%r9, %rbp
        adoxq	%r8, %r15
        adoxq	%rdx, %rbx
        adoxq	%r9, %rbp
        /* Reduce: upper limbs * 38, bits from bit 255 upward * 19 */
        movq	$38, %rdx
        mulxq	%rbp, %rbp, %r8
        addq	%rbp, %r13
        adcq	$0x00, %r8
        movq	$0x7fffffffffffffff, %r9
        shldq	$0x01, %r13, %r8
        imulq	$19, %r8, %r8
        andq	%r9, %r13
        xorq	%r9, %r9
        adoxq	%r8, %r10
        mulxq	%r14, %r8, %r14
        adcxq	%r8, %r10
        adoxq	%r14, %r11
        mulxq	%r15, %r8, %r15
        adcxq	%r8, %r11
        adoxq	%r15, %r12
        mulxq	%rbx, %r8, %rbx
        adcxq	%r8, %r12
        adoxq	%rbx, %r13
        adcxq	%r9, %r13
        # Store
        movq	%r10, (%rdi)
        movq	%r11, 8(%rdi)
        movq	%r12, 16(%rdi)
        movq	%r13, 24(%rdi)
        /* Step 4: dst+32 = dst[32] * (q+32) */
        leaq	32(%rax), %rax
        # Multiply
        # A[0] * B[0]
        movq	(%rax), %rdx
        mulxq	(%rsi), %r10, %r11
        # A[2] * B[0]
        mulxq	16(%rsi), %r12, %r13
        # A[1] * B[0]
        mulxq	8(%rsi), %r8, %r9
        xorq	%rbp, %rbp
        adcxq	%r8, %r11
        # A[3] * B[1]
        movq	8(%rax), %rdx
        mulxq	24(%rsi), %r14, %r15
        adcxq	%r9, %r12
        # A[0] * B[1]
        mulxq	(%rsi), %r8, %r9
        adoxq	%r8, %r11
        # A[2] * B[1]
        mulxq	16(%rsi), %r8, %rbx
        adoxq	%r9, %r12
        adcxq	%r8, %r13
        # A[1] * B[2]
        movq	16(%rax), %rdx
        mulxq	8(%rsi), %r8, %r9
        adcxq	%rbx, %r14
        adoxq	%r8, %r13
        adcxq	%rbp, %r15
        adoxq	%r9, %r14
        # A[0] * B[2]
        mulxq	(%rsi), %r8, %r9
        adoxq	%rbp, %r15
        xorq	%rbx, %rbx
        adcxq	%r8, %r12
        # A[1] * B[1]
        movq	8(%rax), %rdx
        mulxq	8(%rsi), %rdx, %r8
        adcxq	%r9, %r13
        adoxq	%rdx, %r12
        # A[1] * B[3]
        movq	24(%rax), %rdx
        adoxq	%r8, %r13
        mulxq	8(%rsi), %r8, %r9
        adcxq	%r8, %r14
        # A[2] * B[2]
        movq	16(%rax), %rdx
        mulxq	16(%rsi), %rdx, %r8
        adcxq	%r9, %r15
        adoxq	%rdx, %r14
        # A[3] * B[3]
        movq	24(%rax), %rdx
        adoxq	%r8, %r15
        mulxq	24(%rsi), %r8, %r9
        adoxq	%rbp, %rbx
        adcxq	%r8, %rbx
        # A[0] * B[3]
        mulxq	(%rsi), %rdx, %r8
        adcxq	%r9, %rbp
        xorq	%r9, %r9
        adcxq	%rdx, %r13
        # A[3] * B[0]
        movq	24(%rsi), %rdx
        adcxq	%r8, %r14
        mulxq	(%rax), %rdx, %r8
        adoxq	%rdx, %r13
        adoxq	%r8, %r14
        # A[3] * B[2]
        movq	24(%rsi), %rdx
        mulxq	16(%rax), %rdx, %r8
        adcxq	%rdx, %r15
        # A[2] * B[3]
        movq	24(%rax), %rdx
        adcxq	%r8, %rbx
        mulxq	16(%rsi), %r8, %rdx
        adcxq	%r9, %rbp
        adoxq	%r8, %r15
        adoxq	%rdx, %rbx
        adoxq	%r9, %rbp
        /* Reduce: upper limbs * 38, bits from bit 255 upward * 19 */
        movq	$38, %rdx
        mulxq	%rbp, %rbp, %r8
        addq	%rbp, %r13
        adcq	$0x00, %r8
        movq	$0x7fffffffffffffff, %r9
        shldq	$0x01, %r13, %r8
        imulq	$19, %r8, %r8
        andq	%r9, %r13
        xorq	%r9, %r9
        adoxq	%r8, %r10
        mulxq	%r14, %r8, %r14
        adcxq	%r8, %r10
        adoxq	%r14, %r11
        mulxq	%r15, %r8, %r15
        adcxq	%r8, %r11
        adoxq	%r15, %r12
        mulxq	%rbx, %r8, %rbx
        adcxq	%r8, %r12
        adoxq	%rbx, %r13
        adcxq	%r9, %r13
        # Store
        movq	%r10, (%rsi)
        movq	%r11, 8(%rsi)
        movq	%r12, 16(%rsi)
        movq	%r13, 24(%rsi)
        /* Step 5: sum of dst[0] and dst[32] goes to dst+32, difference to dst+0 */
        # Add-Sub
        # Add
        movq	(%rdi), %r10
        movq	8(%rdi), %r11
        movq	16(%rdi), %r12
        movq	24(%rdi), %r13
        movq	%r10, %r14
        addq	(%rsi), %r10
        movq	%r11, %r15
        adcq	8(%rsi), %r11
        movq	%r12, %rbx
        adcq	16(%rsi), %r12
        movq	%r13, %rbp
        adcq	24(%rsi), %r13
        movq	$0x00, %rdx
        adcq	$0x00, %rdx
        shldq	$0x01, %r13, %rdx
        imulq	$19, %rdx
        btr	$63, %r13
        # Sub modulus (if overflow)
        addq	%rdx, %r10
        adcq	$0x00, %r11
        adcq	$0x00, %r12
        adcq	$0x00, %r13
        # Sub
        subq	(%rsi), %r14
        sbbq	8(%rsi), %r15
        sbbq	16(%rsi), %rbx
        sbbq	24(%rsi), %rbp
        sbbq	%rdx, %rdx
        shldq	$0x01, %rbp, %rdx
        imulq	$-19, %rdx
        btr	$63, %rbp
        # Add modulus (if underflow)
        subq	%rdx, %r14
        sbbq	$0x00, %r15
        sbbq	$0x00, %rbx
        sbbq	$0x00, %rbp
        movq	%r10, (%rsi)
        movq	%r11, 8(%rsi)
        movq	%r12, 16(%rsi)
        movq	%r13, 24(%rsi)
        movq	%r14, (%rdi)
        /* NOTE(review): the source line wrap splits the next instruction;
         * its destination operand begins the following source line. */
        movq	%r15,
8(%rdi)
/* Tail of ge_madd_avx2. The operand above completes the store that was
 * split by the source line wrap. */
        movq	%rbx, 16(%rdi)
        movq	%rbp, 24(%rdi)
        /* d = 2 * (p+64), kept in %r10..%r13 */
        leaq	64(%rcx), %rcx
        # Double
        movq	(%rcx), %r10
        movq	8(%rcx), %r11
        addq	%r10, %r10
        movq	16(%rcx), %r12
        adcq	%r11, %r11
        movq	24(%rcx), %r13
        adcq	%r12, %r12
        adcq	%r13, %r13
        movq	$0x00, %rdx
        adcq	$0x00, %rdx
        shldq	$0x01, %r13, %rdx
        imulq	$19, %rdx
        btr	$63, %r13
        # Sub modulus (if overflow)
        addq	%rdx, %r10
        adcq	$0x00, %r11
        adcq	$0x00, %r12
        adcq	$0x00, %r13
        /* %rsi = dst+96, %rdi = dst+64: dst+64 = d + dst[96], dst+96 = d - dst[96] */
        leaq	96(%rdi), %rsi
        leaq	64(%rdi), %rdi
        # Add-Sub
        # Add
        movq	%r10, %r14
        addq	(%rsi), %r10
        movq	%r11, %r15
        adcq	8(%rsi), %r11
        movq	%r12, %rbx
        adcq	16(%rsi), %r12
        movq	%r13, %rbp
        adcq	24(%rsi), %r13
        movq	$0x00, %rdx
        adcq	$0x00, %rdx
        shldq	$0x01, %r13, %rdx
        imulq	$19, %rdx
        btr	$63, %r13
        # Sub modulus (if overflow)
        addq	%rdx, %r10
        adcq	$0x00, %r11
        adcq	$0x00, %r12
        adcq	$0x00, %r13
        # Sub
        subq	(%rsi), %r14
        sbbq	8(%rsi), %r15
        sbbq	16(%rsi), %rbx
        sbbq	24(%rsi), %rbp
        sbbq	%rdx, %rdx
        shldq	$0x01, %rbp, %rdx
        imulq	$-19, %rdx
        btr	$63, %rbp
        # Add modulus (if underflow)
        subq	%rdx, %r14
        sbbq	$0x00, %r15
        sbbq	$0x00, %rbx
        sbbq	$0x00, %rbp
        movq	%r10, (%rdi)
        movq	%r11, 8(%rdi)
        movq	%r12, 16(%rdi)
        movq	%r13, 24(%rdi)
        movq	%r14, (%rsi)
        movq	%r15, 8(%rsi)
        movq	%rbx, 16(%rsi)
        movq	%rbp, 24(%rsi)
        addq	$24, %rsp
        popq	%rbp
        popq	%rbx
        popq	%r15
        popq	%r14
        popq	%r13
        popq	%r12
        repz retq
#ifndef __APPLE__
.size	ge_madd_avx2,.-ge_madd_avx2
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl	ge_msub_avx2
.type	ge_msub_avx2,@function
.align	16
ge_msub_avx2:
#else
.section	__TEXT,__text
.globl	_ge_msub_avx2
.p2align	4
_ge_msub_avx2:
#endif /* __APPLE__ */
/*
 * ge_msub_avx2
 *
 * In:   %rdi = destination, %rsi = first source (p), %rdx = second source
 *       (q). %rdx is copied to %rax at once because MULX needs %rdx as its
 *       implicit operand; q is also kept at 16(%rsp). All three are accessed
 *       as 32-byte elements (four 64-bit limbs) at byte offsets 0, 32, 64
 *       and, for dst and p, 96.
 * Steps, in execution order (results reduced with the 38 / 19 folding,
 * consistent with arithmetic modulo 2^255 - 19):
 *   1. dst+96 = (p+96) * (q+64)
 *   2. dst+0  = (p+32) + (p+0);   dst+32 = (p+32) - (p+0)
 *   3. dst+0  = dst[0]  * (q+32)
 *   4. dst+32 = dst[32] * (q+0)
 *   5. a = dst[0], b = dst[32]:  dst+32 = a + b;  dst+0 = a - b
 *   6. d = 2 * (p+64):  dst+96 = d + dst[96];  dst+64 = d - dst[96]
 * Compared with ge_madd_avx2, steps 3 and 4 use the other q element and
 * step 6 swaps the destinations of the sum and the difference.
 * NOTE(review): p is presumably a ge_p3, q a ge_precomp and dst a ge_p1p1 -
 * confirm against the C declarations.
 *
 * Uses MULX/ADCX/ADOX; instruction order inside each multiply is
 * significant. %r12-%r15, %rbx and %rbp are pushed on entry and popped
 * before returning.
 */
        pushq	%r12
        pushq	%r13
        pushq	%r14
        pushq	%r15
        pushq	%rbx
        pushq	%rbp
        movq	%rdx, %rax
        subq	$24, %rsp
        movq	%rdi, (%rsp)
        movq	%rsi, 8(%rsp)
        movq	%rax, 16(%rsp)
        /* Step 1: dst+96 = (p+96) * (q+64): A = (%rcx), B = (%rax) */
        leaq	96(%rsi), %rcx
        leaq	64(%rax), %rax
        leaq	96(%rdi), %rdi
        # Multiply
        # A[0] * B[0]
        movq	(%rax), %rdx
        mulxq	(%rcx), %r10, %r11
        # A[2] * B[0]
        mulxq	16(%rcx), %r12, %r13
        # A[1] * B[0]
        mulxq	8(%rcx), %r8, %r9
        xorq	%rbp, %rbp
        adcxq	%r8, %r11
        # A[3] * B[1]
        movq	8(%rax), %rdx
        mulxq	24(%rcx), %r14, %r15
        adcxq	%r9, %r12
        # A[0] * B[1]
        mulxq	(%rcx), %r8, %r9
        adoxq	%r8, %r11
        # A[2] * B[1]
        mulxq	16(%rcx), %r8, %rbx
        adoxq	%r9, %r12
        adcxq	%r8, %r13
        # A[1] * B[2]
        movq	16(%rax), %rdx
        mulxq	8(%rcx), %r8, %r9
        adcxq	%rbx, %r14
        adoxq	%r8, %r13
        adcxq	%rbp, %r15
        adoxq	%r9, %r14
        # A[0] * B[2]
        mulxq	(%rcx), %r8, %r9
        adoxq	%rbp, %r15
        xorq	%rbx, %rbx
        adcxq	%r8, %r12
        # A[1] * B[1]
        movq	8(%rax), %rdx
        mulxq	8(%rcx), %rdx, %r8
        adcxq	%r9, %r13
        adoxq	%rdx, %r12
        # A[1] * B[3]
        movq	24(%rax), %rdx
        adoxq	%r8, %r13
        mulxq	8(%rcx), %r8, %r9
        adcxq	%r8, %r14
        # A[2] * B[2]
        movq	16(%rax), %rdx
        mulxq	16(%rcx), %rdx, %r8
        adcxq	%r9, %r15
        adoxq	%rdx, %r14
        # A[3] * B[3]
        movq	24(%rax), %rdx
        adoxq	%r8, %r15
        mulxq	24(%rcx), %r8, %r9
        adoxq	%rbp, %rbx
        adcxq	%r8, %rbx
        # A[0] * B[3]
        mulxq	(%rcx), %rdx, %r8
        adcxq	%r9, %rbp
        xorq	%r9, %r9
        adcxq	%rdx, %r13
        # A[3] * B[0]
        movq	24(%rcx), %rdx
        adcxq	%r8, %r14
        mulxq	(%rax), %rdx, %r8
        adoxq	%rdx, %r13
        adoxq	%r8, %r14
        # A[3] * B[2]
        movq	24(%rcx), %rdx
        mulxq	16(%rax), %rdx, %r8
        adcxq	%rdx, %r15
        # A[2] * B[3]
        movq	24(%rax), %rdx
        adcxq	%r8, %rbx
        mulxq	16(%rcx), %r8, %rdx
        adcxq	%r9, %rbp
        adoxq	%r8, %r15
        adoxq	%rdx, %rbx
        adoxq	%r9, %rbp
        /* Reduce: upper limbs (%r14,%r15,%rbx,%rbp) * 38 and the bits from
         * bit 255 upward * 19 are added into %r10..%r13. */
        movq	$38, %rdx
        mulxq	%rbp, %rbp, %r8
        addq	%rbp, %r13
        adcq	$0x00, %r8
        movq	$0x7fffffffffffffff, %r9
        shldq	$0x01, %r13, %r8
        imulq	$19, %r8, %r8
        andq	%r9, %r13
        xorq	%r9, %r9
        adoxq	%r8, %r10
        mulxq	%r14, %r8, %r14
        adcxq	%r8, %r10
        adoxq	%r14, %r11
        mulxq	%r15, %r8, %r15
        adcxq	%r8, %r11
        adoxq	%r15, %r12
        mulxq	%rbx, %r8, %rbx
        adcxq	%r8, %r12
        adoxq	%rbx, %r13
        adcxq	%r9, %r13
        # Store
        movq	%r10, (%rdi)
        movq	%r11, 8(%rdi)
        movq	%r12, 16(%rdi)
        movq	%r13, 24(%rdi)
        /* Step 2: %rcx = p+0, %rax = p+32, %rsi = dst+32, %rdi = dst+0 */
        movq	%rsi, %rcx
        leaq	32(%rsi), %rax
        leaq	-64(%rdi), %rsi
        leaq	-96(%rdi), %rdi
        # Add-Sub
        # Add
        movq	(%rax), %r10
        movq	8(%rax), %r11
        movq	16(%rax), %r12
        movq	24(%rax), %r13
        movq	%r10, %r14
        addq	(%rcx), %r10
        movq	%r11, %r15
        adcq	8(%rcx), %r11
        movq	%r12, %rbx
        adcq	16(%rcx), %r12
        movq	%r13, %rbp
        adcq	24(%rcx), %r13
        movq	$0x00, %rdx
        adcq	$0x00, %rdx
        shldq	$0x01, %r13, %rdx
        imulq	$19, %rdx
        btr	$63, %r13
        # Sub modulus (if overflow)
        addq	%rdx, %r10
        adcq	$0x00, %r11
        adcq	$0x00, %r12
        adcq	$0x00, %r13
        # Sub
        subq	(%rcx), %r14
        sbbq	8(%rcx), %r15
        sbbq	16(%rcx), %rbx
        sbbq	24(%rcx), %rbp
        sbbq	%rdx, %rdx
        shldq	$0x01, %rbp, %rdx
        imulq	$-19, %rdx
        btr	$63, %rbp
        # Add modulus (if underflow)
        subq	%rdx, %r14
        sbbq	$0x00, %r15
        sbbq	$0x00, %rbx
        sbbq	$0x00, %rbp
        movq	%r10, (%rdi)
        movq	%r11, 8(%rdi)
        movq	%r12, 16(%rdi)
        movq	%r13, 24(%rdi)
        movq	%r14, (%rsi)
        movq	%r15, 8(%rsi)
        movq	%rbx, 16(%rsi)
        movq	%rbp, 24(%rsi)
        /* Step 3: reload q; dst+0 = dst[0] * (q+32) */
        movq	16(%rsp), %rax
        leaq	32(%rax), %rax
        # Multiply
        # A[0] * B[0]
        movq	(%rax), %rdx
        mulxq	(%rdi), %r10, %r11
        # A[2] * B[0]
        mulxq	16(%rdi), %r12, %r13
        # A[1] * B[0]
        mulxq	8(%rdi), %r8, %r9
        xorq	%rbp, %rbp
        adcxq	%r8, %r11
        # A[3] * B[1]
        movq	8(%rax), %rdx
        mulxq	24(%rdi), %r14, %r15
        adcxq	%r9, %r12
        # A[0] * B[1]
        mulxq	(%rdi), %r8, %r9
        adoxq	%r8, %r11
        # A[2] * B[1]
        mulxq	16(%rdi), %r8, %rbx
        adoxq	%r9, %r12
        adcxq	%r8, %r13
        # A[1] * B[2]
        movq	16(%rax), %rdx
        mulxq	8(%rdi), %r8, %r9
        adcxq	%rbx, %r14
        adoxq	%r8, %r13
        adcxq	%rbp, %r15
        adoxq	%r9, %r14
        # A[0] * B[2]
        mulxq	(%rdi), %r8, %r9
        adoxq	%rbp, %r15
        xorq	%rbx, %rbx
        adcxq	%r8, %r12
        # A[1] * B[1]
        movq	8(%rax), %rdx
        mulxq	8(%rdi), %rdx, %r8
        adcxq	%r9, %r13
        adoxq	%rdx, %r12
        # A[1] * B[3]
        movq	24(%rax), %rdx
        adoxq	%r8, %r13
        mulxq	8(%rdi), %r8, %r9
        adcxq	%r8, %r14
        # A[2] * B[2]
        movq	16(%rax), %rdx
        mulxq	16(%rdi), %rdx, %r8
        adcxq	%r9, %r15
        adoxq	%rdx, %r14
        # A[3] * B[3]
        movq	24(%rax), %rdx
        adoxq	%r8, %r15
        mulxq	24(%rdi), %r8, %r9
        adoxq	%rbp, %rbx
        adcxq	%r8, %rbx
        # A[0] * B[3]
        mulxq	(%rdi), %rdx, %r8
        adcxq	%r9, %rbp
        xorq	%r9, %r9
        adcxq	%rdx, %r13
        # A[3] * B[0]
        movq	24(%rdi), %rdx
        adcxq	%r8, %r14
        mulxq	(%rax), %rdx, %r8
        adoxq	%rdx, %r13
        adoxq	%r8, %r14
        # A[3] * B[2]
        movq	24(%rdi), %rdx
        mulxq	16(%rax), %rdx, %r8
        adcxq	%rdx, %r15
        # A[2] * B[3]
        movq	24(%rax), %rdx
        adcxq	%r8, %rbx
        mulxq	16(%rdi), %r8, %rdx
        adcxq	%r9, %rbp
        adoxq	%r8, %r15
        adoxq	%rdx, %rbx
        adoxq	%r9, %rbp
        /* Reduce: upper limbs * 38, bits from bit 255 upward * 19 */
        movq	$38, %rdx
        mulxq	%rbp, %rbp, %r8
        addq	%rbp, %r13
        adcq	$0x00, %r8
        movq	$0x7fffffffffffffff, %r9
        shldq	$0x01, %r13, %r8
        imulq	$19, %r8, %r8
        andq	%r9, %r13
        xorq	%r9, %r9
        adoxq	%r8, %r10
        mulxq	%r14, %r8, %r14
        adcxq	%r8, %r10
        adoxq	%r14, %r11
        mulxq	%r15, %r8, %r15
        adcxq	%r8, %r11
        adoxq	%r15, %r12
        mulxq	%rbx, %r8, %rbx
        adcxq	%r8, %r12
        adoxq	%rbx, %r13
        adcxq	%r9, %r13
        # Store
        movq	%r10, (%rdi)
        movq	%r11, 8(%rdi)
        movq	%r12, 16(%rdi)
        movq	%r13, 24(%rdi)
        /* Step 4: dst+32 = dst[32] * (q+0) */
        leaq	-32(%rax), %rax
        # Multiply
        # A[0] * B[0]
        movq	(%rax), %rdx
        mulxq	(%rsi), %r10, %r11
        # A[2] * B[0]
        mulxq	16(%rsi), %r12, %r13
        # A[1] * B[0]
        mulxq	8(%rsi), %r8, %r9
        xorq	%rbp, %rbp
        adcxq	%r8, %r11
        # A[3] * B[1]
        movq	8(%rax), %rdx
        mulxq	24(%rsi), %r14, %r15
        adcxq	%r9, %r12
        # A[0] * B[1]
        mulxq	(%rsi), %r8, %r9
        adoxq	%r8, %r11
        # A[2] * B[1]
        mulxq	16(%rsi), %r8, %rbx
        adoxq	%r9, %r12
        adcxq	%r8, %r13
        # A[1] * B[2]
        movq	16(%rax), %rdx
        mulxq	8(%rsi), %r8, %r9
        adcxq	%rbx, %r14
        adoxq	%r8, %r13
        adcxq	%rbp, %r15
        adoxq	%r9, %r14
        # A[0] * B[2]
        mulxq	(%rsi), %r8, %r9
        adoxq	%rbp, %r15
        xorq	%rbx, %rbx
        adcxq	%r8, %r12
        # A[1] * B[1]
        movq	8(%rax), %rdx
        mulxq	8(%rsi), %rdx, %r8
        adcxq	%r9, %r13
        adoxq	%rdx, %r12
        # A[1] * B[3]
        movq	24(%rax), %rdx
        adoxq	%r8, %r13
        mulxq	8(%rsi), %r8, %r9
        adcxq	%r8, %r14
        # A[2] * B[2]
        movq	16(%rax), %rdx
        mulxq	16(%rsi), %rdx, %r8
        adcxq	%r9, %r15
        adoxq	%rdx, %r14
        # A[3] * B[3]
        movq	24(%rax), %rdx
        adoxq	%r8, %r15
        mulxq	24(%rsi), %r8, %r9
        adoxq	%rbp, %rbx
        adcxq	%r8, %rbx
        # A[0] * B[3]
        mulxq	(%rsi), %rdx, %r8
        adcxq	%r9, %rbp
        xorq	%r9, %r9
        adcxq	%rdx, %r13
        # A[3] * B[0]
        movq	24(%rsi), %rdx
        adcxq	%r8, %r14
        mulxq	(%rax), %rdx, %r8
        adoxq	%rdx, %r13
        adoxq	%r8, %r14
        # A[3] * B[2]
        movq	24(%rsi), %rdx
        mulxq	16(%rax), %rdx, %r8
        adcxq	%rdx, %r15
        # A[2] * B[3]
        movq	24(%rax), %rdx
        adcxq	%r8, %rbx
        mulxq	16(%rsi), %r8, %rdx
        adcxq	%r9, %rbp
        adoxq	%r8, %r15
        adoxq	%rdx, %rbx
        adoxq	%r9, %rbp
        /* Reduce: upper limbs * 38, bits from bit 255 upward * 19 */
        movq	$38, %rdx
        mulxq	%rbp, %rbp, %r8
        addq	%rbp, %r13
        adcq	$0x00, %r8
        movq	$0x7fffffffffffffff, %r9
        shldq	$0x01, %r13, %r8
        imulq	$19, %r8, %r8
        andq	%r9, %r13
        xorq	%r9, %r9
        adoxq	%r8, %r10
        mulxq	%r14, %r8, %r14
        adcxq	%r8, %r10
        adoxq	%r14, %r11
        mulxq	%r15, %r8, %r15
        adcxq	%r8, %r11
        adoxq	%r15, %r12
        mulxq	%rbx, %r8, %rbx
        adcxq	%r8, %r12
        adoxq	%rbx, %r13
        adcxq	%r9, %r13
        # Store
        movq	%r10, (%rsi)
        movq	%r11, 8(%rsi)
        movq	%r12, 16(%rsi)
        movq	%r13, 24(%rsi)
        /* Step 5: sum of dst[0] and dst[32] goes to dst+32, difference to dst+0 */
        # Add-Sub
        # Add
        movq	(%rdi), %r10
        movq	8(%rdi), %r11
        movq	16(%rdi), %r12
        movq	24(%rdi), %r13
        movq	%r10, %r14
        addq	(%rsi), %r10
        movq	%r11, %r15
        adcq	8(%rsi), %r11
        movq	%r12, %rbx
        adcq	16(%rsi), %r12
        movq	%r13, %rbp
        adcq	24(%rsi), %r13
        movq	$0x00, %rdx
        adcq	$0x00, %rdx
        shldq	$0x01, %r13, %rdx
        imulq	$19, %rdx
        btr	$63, %r13
        # Sub modulus (if overflow)
        addq	%rdx, %r10
        adcq	$0x00, %r11
        adcq	$0x00, %r12
        adcq	$0x00, %r13
        # Sub
        subq	(%rsi), %r14
        sbbq	8(%rsi), %r15
        sbbq	16(%rsi), %rbx
        sbbq	24(%rsi), %rbp
        sbbq	%rdx, %rdx
        shldq	$0x01, %rbp, %rdx
        imulq	$-19, %rdx
        btr	$63, %rbp
        # Add modulus (if underflow)
        subq	%rdx, %r14
        sbbq	$0x00, %r15
        sbbq	$0x00, %rbx
        sbbq	$0x00, %rbp
        movq	%r10, (%rsi)
        movq	%r11, 8(%rsi)
        movq	%r12, 16(%rsi)
        movq	%r13, 24(%rsi)
        movq	%r14, (%rdi)
        movq	%r15, 8(%rdi)
        movq	%rbx, 16(%rdi)
        movq	%rbp, 24(%rdi)
        /* Step 6: d = 2 * (p+64), kept in %r10..%r13 */
        leaq	64(%rcx), %rcx
        # Double
        movq	(%rcx), %r10
        movq	8(%rcx), %r11
        addq	%r10, %r10
        movq	16(%rcx), %r12
        adcq	%r11, %r11
        movq	24(%rcx), %r13
        adcq	%r12, %r12
        adcq	%r13, %r13
        movq	$0x00, %rdx
        adcq	$0x00, %rdx
        shldq	$0x01, %r13, %rdx
        imulq	$19, %rdx
        btr	$63, %r13
        # Sub modulus (if overflow)
        addq	%rdx, %r10
        adcq	$0x00, %r11
        adcq	$0x00, %r12
        adcq	$0x00, %r13
        /* %rsi = dst+96, %rdi = dst+64 */
        leaq	96(%rdi), %rsi
        leaq	64(%rdi), %rdi
        # Add-Sub
        # Add
        movq	%r10, %r14
        addq	(%rsi), %r10
        movq	%r11, %r15
        adcq	8(%rsi), %r11
        movq	%r12, %rbx
        adcq	16(%rsi), %r12
        movq	%r13, %rbp
        adcq	24(%rsi), %r13
        movq	$0x00, %rdx
        adcq	$0x00, %rdx
        shldq	$0x01, %r13, %rdx
        imulq	$19, %rdx
        btr	$63, %r13
        # Sub modulus (if overflow)
        addq	%rdx, %r10
        adcq	$0x00, %r11
        adcq	$0x00, %r12
        adcq	$0x00, %r13
        # Sub
        subq	(%rsi), %r14
        sbbq	8(%rsi), %r15
        sbbq	16(%rsi), %rbx
        sbbq	24(%rsi), %rbp
        sbbq	%rdx, %rdx
        shldq	$0x01, %rbp, %rdx
        imulq	$-19, %rdx
        btr	$63, %rbp
        # Add modulus (if underflow)
        subq	%rdx, %r14
        /* NOTE(review): the source line wrap splits the next instruction;
         * its operands begin the following source line. */
        sbbq
$0x00, %r15
        /* The operands above complete the sbbq begun on the previous line;
         * what follows is the tail of ge_msub_avx2 (stores + epilogue). */
        sbbq $0x00, %rbx
        sbbq $0x00, %rbp
        movq %r10, (%rsi)
        movq %r11, 8(%rsi)
        movq %r12, 16(%rsi)
        movq %r13, 24(%rsi)
        movq %r14, (%rdi)
        movq %r15, 8(%rdi)
        movq %rbx, 16(%rdi)
        movq %rbp, 24(%rdi)
        addq $24, %rsp
        popq %rbp
        popq %rbx
        popq %r15
        popq %r14
        popq %r13
        popq %r12
        repz retq
#ifndef __APPLE__
.size ge_msub_avx2,.-ge_msub_avx2
#endif /* __APPLE__ */
/* ge_add_avx2(r, p, q)
 *
 * In:  %rdi = r, %rsi = p, %rdx = q.  q is copied to %rax straight away
 *      because mulx takes %rdx as its implicit multiplicand.
 * Every pointer is addressed as four consecutive 32-byte values (4 x 64-bit
 * limbs each) at byte offsets 0, 32, 64 and 96; x[n] below means the value
 * at byte offset n of x.
 * NOTE(review): presumably r is a ge_p1p1, p a ge_p3 and q a ge_cached;
 * confirm against the C prototype.
 *
 * Data flow (every result is congruent modulo 2^255 - 19):
 *   r[96] = p[96] * q[96]
 *   r[0]  = p[32] + p[0]        r[32] = p[32] - p[0]
 *   r[0]  = r[0]  * q[0]
 *   r[32] = r[32] * q[32]
 *   r[64] = 2 * (p[64] * q[64])
 *   r[0], r[32]  = r[0] - r[32],  r[0] + r[32]
 *   r[64], r[96] = r[64] + r[96], r[64] - r[96]
 *
 * Saves and restores %r12-%r15, %rbx and %rbp.  24 bytes of stack are
 * reserved for r, p and q; only the q slot at 16(%rsp) is read back.
 */
#ifndef __APPLE__
.text
.globl ge_add_avx2
.type ge_add_avx2,@function
.align 16
ge_add_avx2:
#else
.section __TEXT,__text
.globl _ge_add_avx2
.p2align 4
_ge_add_avx2:
#endif /* __APPLE__ */
        pushq %r12
        pushq %r13
        pushq %r14
        pushq %r15
        pushq %rbx
        pushq %rbp
        movq %rdx, %rax
        subq $24, %rsp
        movq %rdi, (%rsp)
        movq %rsi, 8(%rsp)
        movq %rax, 16(%rsp)
        leaq 96(%rsi), %rcx
        leaq 96(%rax), %rax
        leaq 96(%rdi), %rdi
        /* r[96] = p[96] * q[96]   (A = p+96 in %rcx, B = q+96 in %rax).
         * The 4x4 limb product is built with two independent carry chains:
         * adcx uses CF, adox uses OF, and each xorq clears both. */
        # Multiply
        # A[0] * B[0]
        movq (%rax), %rdx
        mulxq (%rcx), %r10, %r11
        # A[2] * B[0]
        mulxq 16(%rcx), %r12, %r13
        # A[1] * B[0]
        mulxq 8(%rcx), %r8, %r9
        xorq %rbp, %rbp
        adcxq %r8, %r11
        # A[3] * B[1]
        movq 8(%rax), %rdx
        mulxq 24(%rcx), %r14, %r15
        adcxq %r9, %r12
        # A[0] * B[1]
        mulxq (%rcx), %r8, %r9
        adoxq %r8, %r11
        # A[2] * B[1]
        mulxq 16(%rcx), %r8, %rbx
        adoxq %r9, %r12
        adcxq %r8, %r13
        # A[1] * B[2]
        movq 16(%rax), %rdx
        mulxq 8(%rcx), %r8, %r9
        adcxq %rbx, %r14
        adoxq %r8, %r13
        adcxq %rbp, %r15
        adoxq %r9, %r14
        # A[0] * B[2]
        mulxq (%rcx), %r8, %r9
        adoxq %rbp, %r15
        xorq %rbx, %rbx
        adcxq %r8, %r12
        # A[1] * B[1]
        movq 8(%rax), %rdx
        mulxq 8(%rcx), %rdx, %r8
        adcxq %r9, %r13
        adoxq %rdx, %r12
        # A[1] * B[3]
        movq 24(%rax), %rdx
        adoxq %r8, %r13
        mulxq 8(%rcx), %r8, %r9
        adcxq %r8, %r14
        # A[2] * B[2]
        movq 16(%rax), %rdx
        mulxq 16(%rcx), %rdx, %r8
        adcxq %r9, %r15
        adoxq %rdx, %r14
        # A[3] * B[3]
        movq 24(%rax), %rdx
        adoxq %r8, %r15
        mulxq 24(%rcx), %r8, %r9
        adoxq %rbp, %rbx
        adcxq %r8, %rbx
        # A[0] * B[3]
        mulxq (%rcx), %rdx, %r8
        adcxq %r9, %rbp
        xorq %r9, %r9
        adcxq %rdx, %r13
        # A[3] * B[0]
        movq 24(%rcx), %rdx
        adcxq %r8, %r14
        mulxq (%rax), %rdx, %r8
        adoxq %rdx, %r13
        adoxq %r8, %r14
        # A[3] * B[2]
        movq 24(%rcx), %rdx
        mulxq 16(%rax), %rdx, %r8
        adcxq %rdx, %r15
        # A[2] * B[3]
        movq 24(%rax), %rdx
        adcxq %r8, %rbx
        mulxq 16(%rcx), %r8, %rdx
        adcxq %r9, %rbp
        adoxq %r8, %r15
        adoxq %rdx, %rbx
        adoxq %r9, %rbp
        /* Reduce the 512-bit product r10..r13 : r14,r15,rbx,rbp.  The high
         * limbs are multiplied by 38 (2^256 = 38 mod 2^255 - 19) and the
         * bits from 255 upwards of the fourth limb by 19 (2^255 = 19). */
        movq $38, %rdx
        mulxq %rbp, %rbp, %r8
        addq %rbp, %r13
        adcq $0x00, %r8
        movq $0x7fffffffffffffff, %r9
        shldq $0x01, %r13, %r8
        imulq $19, %r8, %r8
        andq %r9, %r13
        xorq %r9, %r9
        adoxq %r8, %r10
        mulxq %r14, %r8, %r14
        adcxq %r8, %r10
        adoxq %r14, %r11
        mulxq %r15, %r8, %r15
        adcxq %r8, %r11
        adoxq %r15, %r12
        mulxq %rbx, %r8, %rbx
        adcxq %r8, %r12
        adoxq %rbx, %r13
        adcxq %r9, %r13
        # Store
        movq %r10, (%rdi)
        movq %r11, 8(%rdi)
        movq %r12, 16(%rdi)
        movq %r13, 24(%rdi)
        /* Re-point: %rcx = p, %rax = p+32, %rsi = r+32, %rdi = r. */
        movq %rsi, %rcx
        leaq 32(%rsi), %rax
        leaq -64(%rdi), %rsi
        leaq -96(%rdi), %rdi
        /* r[0] = p[32] + p[0], r[32] = p[32] - p[0].
         * The sum is in r10..r13, the difference in r14,r15,rbx,rbp. */
        # Add-Sub
        # Add
        movq (%rax), %r10
        movq 8(%rax), %r11
        movq 16(%rax), %r12
        movq 24(%rax), %r13
        movq %r10, %r14
        addq (%rcx), %r10
        movq %r11, %r15
        adcq 8(%rcx), %r11
        movq %r12, %rbx
        adcq 16(%rcx), %r12
        movq %r13, %rbp
        adcq 24(%rcx), %r13
        /* movq (not xor) zeroes %rdx so the carry flag survives. */
        movq $0x00, %rdx
        adcq $0x00, %rdx
        /* %rdx = 19 * (2 * carry + bit 255); bit 255 is then cleared. */
        shldq $0x01, %r13, %rdx
        imulq $19, %rdx
        btr $63, %r13
        # Sub modulus (if overflow)
        addq %rdx, %r10
        adcq $0x00, %r11
        adcq $0x00, %r12
        adcq $0x00, %r13
        # Sub
        subq (%rcx), %r14
        sbbq 8(%rcx), %r15
        sbbq 16(%rcx), %rbx
        sbbq 24(%rcx), %rbp
        /* %rdx = 0 or -1 from the borrow, then combined with bit 255. */
        sbbq %rdx, %rdx
        shldq $0x01, %rbp, %rdx
        imulq $-19, %rdx
        btr $63, %rbp
        # Add modulus (if underflow)
        subq %rdx, %r14
        sbbq $0x00, %r15
        sbbq $0x00, %rbx
        sbbq $0x00, %rbp
        movq %r10, (%rdi)
        movq %r11, 8(%rdi)
        movq %r12, 16(%rdi)
        movq %r13, 24(%rdi)
        movq %r14, (%rsi)
        movq %r15, 8(%rsi)
        movq %rbx, 16(%rsi)
        movq %rbp, 24(%rsi)
        /* Reload q from its stack slot. */
        movq 16(%rsp), %rax
        /* r[0] = r[0] * q[0]   (A = r in %rdi, B = q in %rax). */
        # Multiply
        # A[0] * B[0]
        movq (%rax), %rdx
        mulxq (%rdi), %r10, %r11
        # A[2] * B[0]
        mulxq 16(%rdi), %r12, %r13
        # A[1] * B[0]
        mulxq 8(%rdi), %r8, %r9
        xorq %rbp, %rbp
        adcxq %r8, %r11
        # A[3] * B[1]
        movq 8(%rax), %rdx
        mulxq 24(%rdi), %r14, %r15
        adcxq %r9, %r12
        # A[0] * B[1]
        mulxq (%rdi), %r8, %r9
        adoxq %r8, %r11
        # A[2] * B[1]
        mulxq 16(%rdi), %r8, %rbx
        adoxq %r9, %r12
        adcxq %r8, %r13
        # A[1] * B[2]
        movq 16(%rax), %rdx
        mulxq 8(%rdi), %r8, %r9
        adcxq %rbx, %r14
        adoxq %r8, %r13
        adcxq %rbp, %r15
        adoxq %r9, %r14
        # A[0] * B[2]
        mulxq (%rdi), %r8, %r9
        adoxq %rbp, %r15
        xorq %rbx, %rbx
        adcxq %r8, %r12
        # A[1] * B[1]
        movq 8(%rax), %rdx
        mulxq 8(%rdi), %rdx, %r8
        adcxq %r9, %r13
        adoxq %rdx, %r12
        # A[1] * B[3]
        movq 24(%rax), %rdx
        adoxq %r8, %r13
        mulxq 8(%rdi), %r8, %r9
        adcxq %r8, %r14
        # A[2] * B[2]
        movq 16(%rax), %rdx
        mulxq 16(%rdi), %rdx, %r8
        adcxq %r9, %r15
        adoxq %rdx, %r14
        # A[3] * B[3]
        movq 24(%rax), %rdx
        adoxq %r8, %r15
        mulxq 24(%rdi), %r8, %r9
        adoxq %rbp, %rbx
        adcxq %r8, %rbx
        # A[0] * B[3]
        mulxq (%rdi), %rdx, %r8
        adcxq %r9, %rbp
        xorq %r9, %r9
        adcxq %rdx, %r13
        # A[3] * B[0]
        movq 24(%rdi), %rdx
        adcxq %r8, %r14
        mulxq (%rax), %rdx, %r8
        adoxq %rdx, %r13
        adoxq %r8, %r14
        # A[3] * B[2]
        movq 24(%rdi), %rdx
        mulxq 16(%rax), %rdx, %r8
        adcxq %rdx, %r15
        # A[2] * B[3]
        movq 24(%rax), %rdx
        adcxq %r8, %rbx
        mulxq 16(%rdi), %r8, %rdx
        adcxq %r9, %rbp
        adoxq %r8, %r15
        adoxq %rdx, %rbx
        adoxq %r9, %rbp
        /* Same 38 / 19 reduction as in the first multiply. */
        movq $38, %rdx
        mulxq %rbp, %rbp, %r8
        addq %rbp, %r13
        adcq $0x00, %r8
        movq $0x7fffffffffffffff, %r9
        shldq $0x01, %r13, %r8
        imulq $19, %r8, %r8
        andq %r9, %r13
        xorq %r9, %r9
        adoxq %r8, %r10
        mulxq %r14, %r8, %r14
        adcxq %r8, %r10
        adoxq %r14, %r11
        mulxq %r15, %r8, %r15
        adcxq %r8, %r11
        adoxq %r15, %r12
        mulxq %rbx, %r8, %rbx
        adcxq %r8, %r12
        adoxq %rbx, %r13
        adcxq %r9, %r13
        # Store
        movq %r10, (%rdi)
        movq %r11, 8(%rdi)
        movq %r12, 16(%rdi)
        movq %r13, 24(%rdi)
        /* %rax = q+32. */
        leaq 32(%rax), %rax
        /* r[32] = r[32] * q[32]   (A = r+32 in %rsi, B = q+32 in %rax). */
        # Multiply
        # A[0] * B[0]
        movq (%rax), %rdx
        mulxq (%rsi), %r10, %r11
        # A[2] * B[0]
        mulxq 16(%rsi), %r12, %r13
        # A[1] * B[0]
        mulxq 8(%rsi), %r8, %r9
        xorq %rbp, %rbp
        adcxq %r8, %r11
        # A[3] * B[1]
        movq 8(%rax), %rdx
        mulxq 24(%rsi), %r14, %r15
        adcxq %r9, %r12
        # A[0] * B[1]
        mulxq (%rsi), %r8, %r9
        adoxq %r8, %r11
        # A[2] * B[1]
        mulxq 16(%rsi), %r8, %rbx
        adoxq %r9, %r12
        adcxq %r8, %r13
        # A[1] * B[2]
        movq 16(%rax), %rdx
        mulxq 8(%rsi), %r8, %r9
        adcxq %rbx, %r14
        adoxq %r8, %r13
        adcxq %rbp, %r15
        adoxq %r9, %r14
        # A[0] * B[2]
        mulxq (%rsi), %r8, %r9
        adoxq %rbp, %r15
        xorq %rbx, %rbx
        adcxq %r8, %r12
        # A[1] * B[1]
        movq 8(%rax), %rdx
        mulxq 8(%rsi), %rdx, %r8
        adcxq %r9, %r13
        adoxq %rdx, %r12
        # A[1] * B[3]
        movq 24(%rax), %rdx
        adoxq %r8, %r13
        mulxq 8(%rsi), %r8, %r9
        adcxq %r8, %r14
        # A[2] * B[2]
        movq 16(%rax), %rdx
        mulxq 16(%rsi), %rdx, %r8
        adcxq %r9, %r15
        adoxq %rdx, %r14
        # A[3] * B[3]
        movq 24(%rax), %rdx
        adoxq %r8, %r15
        mulxq 24(%rsi), %r8, %r9
        adoxq %rbp, %rbx
        adcxq %r8, %rbx
        # A[0] * B[3]
        mulxq (%rsi), %rdx, %r8
        adcxq %r9, %rbp
        xorq %r9, %r9
        adcxq %rdx, %r13
        # A[3] * B[0]
        movq 24(%rsi), %rdx
        adcxq %r8, %r14
        mulxq (%rax), %rdx, %r8
        adoxq %rdx, %r13
        adoxq %r8, %r14
        # A[3] * B[2]
        movq 24(%rsi), %rdx
        mulxq 16(%rax), %rdx, %r8
        adcxq %rdx, %r15
        # A[2] * B[3]
        movq 24(%rax), %rdx
        adcxq %r8, %rbx
        mulxq 16(%rsi), %r8, %rdx
        adcxq %r9, %rbp
        adoxq %r8, %r15
        adoxq %rdx, %rbx
        adoxq %r9, %rbp
        /* Same 38 / 19 reduction as in the first multiply. */
        movq $38, %rdx
        mulxq %rbp, %rbp, %r8
        addq %rbp, %r13
        adcq $0x00, %r8
        movq $0x7fffffffffffffff, %r9
        shldq $0x01, %r13, %r8
        imulq $19, %r8, %r8
        andq %r9, %r13
        xorq %r9, %r9
        adoxq %r8, %r10
        mulxq %r14, %r8, %r14
        adcxq %r8, %r10
        adoxq %r14, %r11
        mulxq %r15, %r8, %r15
        adcxq %r8, %r11
        adoxq %r15, %r12
        mulxq %rbx, %r8, %rbx
        adcxq %r8, %r12
        adoxq %rbx, %r13
        adcxq %r9, %r13
        # Store
        movq %r10, (%rsi)
        movq %r11, 8(%rsi)
        movq %r12, 16(%rsi)
        movq %r13, 24(%rsi)
        /* %rcx = p+64, %rax = q+64. */
        leaq 64(%rcx), %rcx
        leaq 32(%rax), %rax
        /* r[64] = 2 * (p[64] * q[64]); the product stays in r10..r13 and is
         * doubled before it is stored. */
        # Multiply
        # A[0] * B[0]
        movq (%rax), %rdx
        mulxq (%rcx), %r10, %r11
        # A[2] * B[0]
        mulxq 16(%rcx), %r12, %r13
        # A[1] * B[0]
        mulxq 8(%rcx), %r8, %r9
        xorq %rbp, %rbp
        adcxq %r8, %r11
        # A[3] * B[1]
        movq 8(%rax), %rdx
        mulxq 24(%rcx), %r14, %r15
        adcxq %r9, %r12
        # A[0] * B[1]
        mulxq (%rcx), %r8, %r9
        adoxq %r8, %r11
        # A[2] * B[1]
        mulxq 16(%rcx), %r8, %rbx
        adoxq %r9, %r12
        adcxq %r8, %r13
        # A[1] * B[2]
        movq 16(%rax), %rdx
        mulxq 8(%rcx), %r8, %r9
        adcxq %rbx, %r14
        adoxq %r8, %r13
        adcxq %rbp, %r15
        adoxq %r9, %r14
        # A[0] * B[2]
        mulxq (%rcx), %r8, %r9
        adoxq %rbp, %r15
        xorq %rbx, %rbx
        adcxq %r8, %r12
        # A[1] * B[1]
        movq 8(%rax), %rdx
        mulxq 8(%rcx), %rdx, %r8
        adcxq %r9, %r13
        adoxq %rdx, %r12
        # A[1] * B[3]
        movq 24(%rax), %rdx
        adoxq %r8, %r13
        mulxq 8(%rcx), %r8, %r9
        adcxq %r8, %r14
        # A[2] * B[2]
        movq 16(%rax), %rdx
        mulxq 16(%rcx), %rdx, %r8
        adcxq %r9, %r15
        adoxq %rdx, %r14
        # A[3] * B[3]
        movq 24(%rax), %rdx
        adoxq %r8, %r15
        mulxq 24(%rcx), %r8, %r9
        adoxq %rbp, %rbx
        adcxq %r8, %rbx
        # A[0] * B[3]
        mulxq (%rcx), %rdx, %r8
        adcxq %r9, %rbp
        xorq %r9, %r9
        adcxq %rdx, %r13
        # A[3] * B[0]
        movq 24(%rcx), %rdx
        adcxq %r8, %r14
        mulxq (%rax), %rdx, %r8
        adoxq %rdx, %r13
        adoxq %r8, %r14
        # A[3] * B[2]
        movq 24(%rcx), %rdx
        mulxq 16(%rax), %rdx, %r8
        adcxq %rdx, %r15
        # A[2] * B[3]
        movq 24(%rax), %rdx
        adcxq %r8, %rbx
        mulxq 16(%rcx), %r8, %rdx
        adcxq %r9, %rbp
        adoxq %r8, %r15
        adoxq %rdx, %rbx
        adoxq %r9, %rbp
        /* Same 38 / 19 reduction as in the first multiply. */
        movq $38, %rdx
        mulxq %rbp, %rbp, %r8
        addq %rbp, %r13
        adcq $0x00, %r8
        movq $0x7fffffffffffffff, %r9
        shldq $0x01, %r13, %r8
        imulq $19, %r8, %r8
        andq %r9, %r13
        xorq %r9, %r9
        adoxq %r8, %r10
        mulxq %r14, %r8, %r14
        adcxq %r8, %r10
        adoxq %r14, %r11
        mulxq %r15, %r8, %r15
        adcxq %r8, %r11
        adoxq %r15, %r12
        mulxq %rbx, %r8, %rbx
        adcxq %r8, %r12
        adoxq %rbx, %r13
        adcxq %r9, %r13
        # Store
        /* %rdi = r+64, the destination of the doubled product. */
        leaq 64(%rdi), %rdi
        # Double
        addq %r10, %r10
        adcq %r11, %r11
        adcq %r12, %r12
        adcq %r13, %r13
        movq $0x00, %rdx
        adcq $0x00, %rdx
        shldq $0x01, %r13, %rdx
        imulq $19, %rdx
        btr $63, %r13
        # Sub modulus (if overflow)
        addq %rdx, %r10
        adcq $0x00, %r11
        adcq $0x00, %r12
        adcq $0x00, %r13
        movq %r10, (%rdi)
        movq %r11, 8(%rdi)
        movq %r12, 16(%rdi)
        movq %r13, 24(%rdi)
        /* Back to %rdi = r; %rsi is still r+32. */
        leaq -64(%rdi), %rdi
        /* r[32] = r[0] + r[32], r[0] = r[0] - r[32]. */
        # Add-Sub
        # Add
        movq (%rdi), %r10
        movq 8(%rdi), %r11
        movq 16(%rdi), %r12
        movq 24(%rdi), %r13
        movq %r10, %r14
        addq (%rsi), %r10
        movq %r11, %r15
        adcq 8(%rsi), %r11
        movq %r12, %rbx
        adcq 16(%rsi), %r12
        movq %r13, %rbp
        adcq 24(%rsi), %r13
        movq $0x00, %rdx
        adcq $0x00, %rdx
        shldq $0x01, %r13, %rdx
        imulq $19, %rdx
        btr $63, %r13
        # Sub modulus (if overflow)
        addq %rdx, %r10
        adcq $0x00, %r11
        adcq $0x00, %r12
        adcq $0x00, %r13
        # Sub
        subq (%rsi), %r14
        sbbq 8(%rsi), %r15
        sbbq 16(%rsi), %rbx
        sbbq 24(%rsi), %rbp
        sbbq %rdx, %rdx
        shldq $0x01, %rbp, %rdx
        imulq $-19, %rdx
        btr $63, %rbp
        # Add modulus (if underflow)
        subq %rdx, %r14
        sbbq $0x00, %r15
        sbbq $0x00, %rbx
        sbbq $0x00, %rbp
        movq %r10, (%rsi)
        movq %r11, 8(%rsi)
        movq %r12, 16(%rsi)
        movq %r13, 24(%rsi)
        movq %r14, (%rdi)
        movq %r15, 8(%rdi)
        movq %rbx, 16(%rdi)
        movq %rbp, 24(%rdi)
        /* %rsi = r+96, %rdi = r+64. */
        leaq 96(%rdi), %rsi
        leaq 64(%rdi), %rdi
        /* r[64] = r[64] + r[96], r[96] = r[64] - r[96]. */
        # Add-Sub
        # Add
        movq (%rdi), %r10
        movq 8(%rdi), %r11
        movq 16(%rdi), %r12
        movq 24(%rdi), %r13
        movq %r10, %r14
        addq (%rsi), %r10
        movq %r11, %r15
        adcq 8(%rsi), %r11
        movq %r12, %rbx
        adcq 16(%rsi), %r12
        movq %r13, %rbp
        adcq 24(%rsi), %r13
        movq $0x00, %rdx
        adcq $0x00, %rdx
        shldq $0x01, %r13, %rdx
        imulq $19, %rdx
        btr $63, %r13
        # Sub modulus (if overflow)
        addq %rdx, %r10
        adcq $0x00, %r11
        adcq $0x00, %r12
        adcq $0x00, %r13
        # Sub
        subq (%rsi), %r14
        sbbq 8(%rsi), %r15
        sbbq 16(%rsi), %rbx
        sbbq 24(%rsi), %rbp
        sbbq %rdx, %rdx
        shldq $0x01, %rbp, %rdx
        imulq $-19, %rdx
        btr $63, %rbp
        # Add modulus (if underflow)
        subq %rdx, %r14
        sbbq $0x00, %r15
        sbbq $0x00, %rbx
        sbbq $0x00, %rbp
        movq %r10, (%rdi)
        movq %r11, 8(%rdi)
        movq %r12, 16(%rdi)
        movq %r13, 24(%rdi)
        movq %r14, (%rsi)
        movq %r15, 8(%rsi)
        movq %rbx, 16(%rsi)
        movq %rbp, 24(%rsi)
        addq $24, %rsp
        popq %rbp
        popq %rbx
        popq %r15
        popq %r14
        popq %r13
        popq %r12
        repz retq
#ifndef __APPLE__
.size ge_add_avx2,.-ge_add_avx2
#endif /* __APPLE__ */
/* ge_sub_avx2(r, p, q)
 *
 * Same register interface and 4 x 32-byte layout as ge_add_avx2:
 * %rdi = r, %rsi = p, %rdx = q (copied to %rax).  It differs only in which
 * q values feed the second and third multiply and in which of r[64] / r[96]
 * receives the final sum:
 *   r[96] = p[96] * q[96]
 *   r[0]  = p[32] + p[0]        r[32] = p[32] - p[0]
 *   r[0]  = r[0]  * q[32]
 *   r[32] = r[32] * q[0]
 *   r[64] = 2 * (p[64] * q[64])
 *   r[0], r[32]  = r[0] - r[32],  r[0] + r[32]
 *   r[64], r[96] = r[64] - r[96], r[64] + r[96]
 * The body continues past the end of this block.
 */
#ifndef __APPLE__
.text
.globl ge_sub_avx2
.type ge_sub_avx2,@function
.align 16
ge_sub_avx2:
#else
.section __TEXT,__text
.globl _ge_sub_avx2
.p2align 4
_ge_sub_avx2:
#endif /* __APPLE__ */
        pushq %r12
        pushq %r13
        pushq %r14
        pushq %r15
        pushq %rbx
        pushq %rbp
        movq %rdx, %rax
        subq $24, %rsp
        movq %rdi, (%rsp)
        movq %rsi, 8(%rsp)
        movq %rax, 16(%rsp)
        leaq 96(%rsi), %rcx
        leaq 96(%rax), %rax
        leaq 96(%rdi), %rdi
        # Multiply
        # A[0] * B[0]
        movq (%rax), %rdx
        mulxq (%rcx), %r10, %r11
        # A[2] * B[0]
        mulxq 16(%rcx), %r12, %r13
        # A[1] * B[0]
        mulxq 8(%rcx), %r8, %r9
        xorq %rbp, %rbp
        adcxq %r8, %r11
        # A[3] * B[1]
        movq 8(%rax), %rdx
        mulxq 24(%rcx), %r14, %r15
        adcxq %r9, %r12
        # A[0] * B[1]
        mulxq (%rcx), %r8, %r9
        adoxq %r8, %r11
        # A[2] * B[1]
        mulxq 16(%rcx), %r8, 
%rbx adoxq %r9, %r12 adcxq %r8, %r13 # A[1] * B[2] movq 16(%rax), %rdx mulxq 8(%rcx), %r8, %r9 adcxq %rbx, %r14 adoxq %r8, %r13 adcxq %rbp, %r15 adoxq %r9, %r14 # A[0] * B[2] mulxq (%rcx), %r8, %r9 adoxq %rbp, %r15 xorq %rbx, %rbx adcxq %r8, %r12 # A[1] * B[1] movq 8(%rax), %rdx mulxq 8(%rcx), %rdx, %r8 adcxq %r9, %r13 adoxq %rdx, %r12 # A[1] * B[3] movq 24(%rax), %rdx adoxq %r8, %r13 mulxq 8(%rcx), %r8, %r9 adcxq %r8, %r14 # A[2] * B[2] movq 16(%rax), %rdx mulxq 16(%rcx), %rdx, %r8 adcxq %r9, %r15 adoxq %rdx, %r14 # A[3] * B[3] movq 24(%rax), %rdx adoxq %r8, %r15 mulxq 24(%rcx), %r8, %r9 adoxq %rbp, %rbx adcxq %r8, %rbx # A[0] * B[3] mulxq (%rcx), %rdx, %r8 adcxq %r9, %rbp xorq %r9, %r9 adcxq %rdx, %r13 # A[3] * B[0] movq 24(%rcx), %rdx adcxq %r8, %r14 mulxq (%rax), %rdx, %r8 adoxq %rdx, %r13 adoxq %r8, %r14 # A[3] * B[2] movq 24(%rcx), %rdx mulxq 16(%rax), %rdx, %r8 adcxq %rdx, %r15 # A[2] * B[3] movq 24(%rax), %rdx adcxq %r8, %rbx mulxq 16(%rcx), %r8, %rdx adcxq %r9, %rbp adoxq %r8, %r15 adoxq %rdx, %rbx adoxq %r9, %rbp movq $38, %rdx mulxq %rbp, %rbp, %r8 addq %rbp, %r13 adcq $0x00, %r8 movq $0x7fffffffffffffff, %r9 shldq $0x01, %r13, %r8 imulq $19, %r8, %r8 andq %r9, %r13 xorq %r9, %r9 adoxq %r8, %r10 mulxq %r14, %r8, %r14 adcxq %r8, %r10 adoxq %r14, %r11 mulxq %r15, %r8, %r15 adcxq %r8, %r11 adoxq %r15, %r12 mulxq %rbx, %r8, %rbx adcxq %r8, %r12 adoxq %rbx, %r13 adcxq %r9, %r13 # Store movq %r10, (%rdi) movq %r11, 8(%rdi) movq %r12, 16(%rdi) movq %r13, 24(%rdi) movq %rsi, %rcx leaq 32(%rsi), %rax leaq -64(%rdi), %rsi leaq -96(%rdi), %rdi # Add-Sub # Add movq (%rax), %r10 movq 8(%rax), %r11 movq 16(%rax), %r12 movq 24(%rax), %r13 movq %r10, %r14 addq (%rcx), %r10 movq %r11, %r15 adcq 8(%rcx), %r11 movq %r12, %rbx adcq 16(%rcx), %r12 movq %r13, %rbp adcq 24(%rcx), %r13 movq $0x00, %rdx adcq $0x00, %rdx shldq $0x01, %r13, %rdx imulq $19, %rdx btr $63, %r13 # Sub modulus (if overflow) addq %rdx, %r10 adcq $0x00, %r11 adcq $0x00, %r12 adcq $0x00, %r13 # Sub subq 
(%rcx), %r14 sbbq 8(%rcx), %r15 sbbq 16(%rcx), %rbx sbbq 24(%rcx), %rbp sbbq %rdx, %rdx shldq $0x01, %rbp, %rdx imulq $-19, %rdx btr $63, %rbp # Add modulus (if underflow) subq %rdx, %r14 sbbq $0x00, %r15 sbbq $0x00, %rbx sbbq $0x00, %rbp movq %r10, (%rdi) movq %r11, 8(%rdi) movq %r12, 16(%rdi) movq %r13, 24(%rdi) movq %r14, (%rsi) movq %r15, 8(%rsi) movq %rbx, 16(%rsi) movq %rbp, 24(%rsi) movq 16(%rsp), %rax leaq 32(%rax), %rax # Multiply # A[0] * B[0] movq (%rax), %rdx mulxq (%rdi), %r10, %r11 # A[2] * B[0] mulxq 16(%rdi), %r12, %r13 # A[1] * B[0] mulxq 8(%rdi), %r8, %r9 xorq %rbp, %rbp adcxq %r8, %r11 # A[3] * B[1] movq 8(%rax), %rdx mulxq 24(%rdi), %r14, %r15 adcxq %r9, %r12 # A[0] * B[1] mulxq (%rdi), %r8, %r9 adoxq %r8, %r11 # A[2] * B[1] mulxq 16(%rdi), %r8, %rbx adoxq %r9, %r12 adcxq %r8, %r13 # A[1] * B[2] movq 16(%rax), %rdx mulxq 8(%rdi), %r8, %r9 adcxq %rbx, %r14 adoxq %r8, %r13 adcxq %rbp, %r15 adoxq %r9, %r14 # A[0] * B[2] mulxq (%rdi), %r8, %r9 adoxq %rbp, %r15 xorq %rbx, %rbx adcxq %r8, %r12 # A[1] * B[1] movq 8(%rax), %rdx mulxq 8(%rdi), %rdx, %r8 adcxq %r9, %r13 adoxq %rdx, %r12 # A[1] * B[3] movq 24(%rax), %rdx adoxq %r8, %r13 mulxq 8(%rdi), %r8, %r9 adcxq %r8, %r14 # A[2] * B[2] movq 16(%rax), %rdx mulxq 16(%rdi), %rdx, %r8 adcxq %r9, %r15 adoxq %rdx, %r14 # A[3] * B[3] movq 24(%rax), %rdx adoxq %r8, %r15 mulxq 24(%rdi), %r8, %r9 adoxq %rbp, %rbx adcxq %r8, %rbx # A[0] * B[3] mulxq (%rdi), %rdx, %r8 adcxq %r9, %rbp xorq %r9, %r9 adcxq %rdx, %r13 # A[3] * B[0] movq 24(%rdi), %rdx adcxq %r8, %r14 mulxq (%rax), %rdx, %r8 adoxq %rdx, %r13 adoxq %r8, %r14 # A[3] * B[2] movq 24(%rdi), %rdx mulxq 16(%rax), %rdx, %r8 adcxq %rdx, %r15 # A[2] * B[3] movq 24(%rax), %rdx adcxq %r8, %rbx mulxq 16(%rdi), %r8, %rdx adcxq %r9, %rbp adoxq %r8, %r15 adoxq %rdx, %rbx adoxq %r9, %rbp movq $38, %rdx mulxq %rbp, %rbp, %r8 addq %rbp, %r13 adcq $0x00, %r8 movq $0x7fffffffffffffff, %r9 shldq $0x01, %r13, %r8 imulq $19, %r8, %r8 andq %r9, %r13 xorq %r9, %r9 adoxq %r8, 
%r10 mulxq %r14, %r8, %r14 adcxq %r8, %r10 adoxq %r14, %r11 mulxq %r15, %r8, %r15 adcxq %r8, %r11 adoxq %r15, %r12 mulxq %rbx, %r8, %rbx adcxq %r8, %r12 adoxq %rbx, %r13 adcxq %r9, %r13 # Store movq %r10, (%rdi) movq %r11, 8(%rdi) movq %r12, 16(%rdi) movq %r13, 24(%rdi) leaq -32(%rax), %rax # Multiply # A[0] * B[0] movq (%rax), %rdx mulxq (%rsi), %r10, %r11 # A[2] * B[0] mulxq 16(%rsi), %r12, %r13 # A[1] * B[0] mulxq 8(%rsi), %r8, %r9 xorq %rbp, %rbp adcxq %r8, %r11 # A[3] * B[1] movq 8(%rax), %rdx mulxq 24(%rsi), %r14, %r15 adcxq %r9, %r12 # A[0] * B[1] mulxq (%rsi), %r8, %r9 adoxq %r8, %r11 # A[2] * B[1] mulxq 16(%rsi), %r8, %rbx adoxq %r9, %r12 adcxq %r8, %r13 # A[1] * B[2] movq 16(%rax), %rdx mulxq 8(%rsi), %r8, %r9 adcxq %rbx, %r14 adoxq %r8, %r13 adcxq %rbp, %r15 adoxq %r9, %r14 # A[0] * B[2] mulxq (%rsi), %r8, %r9 adoxq %rbp, %r15 xorq %rbx, %rbx adcxq %r8, %r12 # A[1] * B[1] movq 8(%rax), %rdx mulxq 8(%rsi), %rdx, %r8 adcxq %r9, %r13 adoxq %rdx, %r12 # A[1] * B[3] movq 24(%rax), %rdx adoxq %r8, %r13 mulxq 8(%rsi), %r8, %r9 adcxq %r8, %r14 # A[2] * B[2] movq 16(%rax), %rdx mulxq 16(%rsi), %rdx, %r8 adcxq %r9, %r15 adoxq %rdx, %r14 # A[3] * B[3] movq 24(%rax), %rdx adoxq %r8, %r15 mulxq 24(%rsi), %r8, %r9 adoxq %rbp, %rbx adcxq %r8, %rbx # A[0] * B[3] mulxq (%rsi), %rdx, %r8 adcxq %r9, %rbp xorq %r9, %r9 adcxq %rdx, %r13 # A[3] * B[0] movq 24(%rsi), %rdx adcxq %r8, %r14 mulxq (%rax), %rdx, %r8 adoxq %rdx, %r13 adoxq %r8, %r14 # A[3] * B[2] movq 24(%rsi), %rdx mulxq 16(%rax), %rdx, %r8 adcxq %rdx, %r15 # A[2] * B[3] movq 24(%rax), %rdx adcxq %r8, %rbx mulxq 16(%rsi), %r8, %rdx adcxq %r9, %rbp adoxq %r8, %r15 adoxq %rdx, %rbx adoxq %r9, %rbp movq $38, %rdx mulxq %rbp, %rbp, %r8 addq %rbp, %r13 adcq $0x00, %r8 movq $0x7fffffffffffffff, %r9 shldq $0x01, %r13, %r8 imulq $19, %r8, %r8 andq %r9, %r13 xorq %r9, %r9 adoxq %r8, %r10 mulxq %r14, %r8, %r14 adcxq %r8, %r10 adoxq %r14, %r11 mulxq %r15, %r8, %r15 adcxq %r8, %r11 adoxq %r15, %r12 mulxq %rbx, %r8, %rbx adcxq 
%r8, %r12 adoxq %rbx, %r13 adcxq %r9, %r13 # Store movq %r10, (%rsi) movq %r11, 8(%rsi) movq %r12, 16(%rsi) movq %r13, 24(%rsi) leaq 64(%rcx), %rcx leaq 64(%rax), %rax # Multiply # A[0] * B[0] movq (%rax), %rdx mulxq (%rcx), %r10, %r11 # A[2] * B[0] mulxq 16(%rcx), %r12, %r13 # A[1] * B[0] mulxq 8(%rcx), %r8, %r9 xorq %rbp, %rbp adcxq %r8, %r11 # A[3] * B[1] movq 8(%rax), %rdx mulxq 24(%rcx), %r14, %r15 adcxq %r9, %r12 # A[0] * B[1] mulxq (%rcx), %r8, %r9 adoxq %r8, %r11 # A[2] * B[1] mulxq 16(%rcx), %r8, %rbx adoxq %r9, %r12 adcxq %r8, %r13 # A[1] * B[2] movq 16(%rax), %rdx mulxq 8(%rcx), %r8, %r9 adcxq %rbx, %r14 adoxq %r8, %r13 adcxq %rbp, %r15 adoxq %r9, %r14 # A[0] * B[2] mulxq (%rcx), %r8, %r9 adoxq %rbp, %r15 xorq %rbx, %rbx adcxq %r8, %r12 # A[1] * B[1] movq 8(%rax), %rdx mulxq 8(%rcx), %rdx, %r8 adcxq %r9, %r13 adoxq %rdx, %r12 # A[1] * B[3] movq 24(%rax), %rdx adoxq %r8, %r13 mulxq 8(%rcx), %r8, %r9 adcxq %r8, %r14 # A[2] * B[2] movq 16(%rax), %rdx mulxq 16(%rcx), %rdx, %r8 adcxq %r9, %r15 adoxq %rdx, %r14 # A[3] * B[3] movq 24(%rax), %rdx adoxq %r8, %r15 mulxq 24(%rcx), %r8, %r9 adoxq %rbp, %rbx adcxq %r8, %rbx # A[0] * B[3] mulxq (%rcx), %rdx, %r8 adcxq %r9, %rbp xorq %r9, %r9 adcxq %rdx, %r13 # A[3] * B[0] movq 24(%rcx), %rdx adcxq %r8, %r14 mulxq (%rax), %rdx, %r8 adoxq %rdx, %r13 adoxq %r8, %r14 # A[3] * B[2] movq 24(%rcx), %rdx mulxq 16(%rax), %rdx, %r8 adcxq %rdx, %r15 # A[2] * B[3] movq 24(%rax), %rdx adcxq %r8, %rbx mulxq 16(%rcx), %r8, %rdx adcxq %r9, %rbp adoxq %r8, %r15 adoxq %rdx, %rbx adoxq %r9, %rbp movq $38, %rdx mulxq %rbp, %rbp, %r8 addq %rbp, %r13 adcq $0x00, %r8 movq $0x7fffffffffffffff, %r9 shldq $0x01, %r13, %r8 imulq $19, %r8, %r8 andq %r9, %r13 xorq %r9, %r9 adoxq %r8, %r10 mulxq %r14, %r8, %r14 adcxq %r8, %r10 adoxq %r14, %r11 mulxq %r15, %r8, %r15 adcxq %r8, %r11 adoxq %r15, %r12 mulxq %rbx, %r8, %rbx adcxq %r8, %r12 adoxq %rbx, %r13 adcxq %r9, %r13 # Store leaq 64(%rdi), %rdi # Double addq %r10, %r10 adcq %r11, %r11 adcq %r12, 
%r12 adcq %r13, %r13 movq $0x00, %rdx adcq $0x00, %rdx shldq $0x01, %r13, %rdx imulq $19, %rdx btr $63, %r13 # Sub modulus (if overflow) addq %rdx, %r10 adcq $0x00, %r11 adcq $0x00, %r12 adcq $0x00, %r13 movq %r10, (%rdi) movq %r11, 8(%rdi) movq %r12, 16(%rdi) movq %r13, 24(%rdi) leaq -64(%rdi), %rdi # Add-Sub # Add movq (%rdi), %r10 movq 8(%rdi), %r11 movq 16(%rdi), %r12 movq 24(%rdi), %r13 movq %r10, %r14 addq (%rsi), %r10 movq %r11, %r15 adcq 8(%rsi), %r11 movq %r12, %rbx adcq 16(%rsi), %r12 movq %r13, %rbp adcq 24(%rsi), %r13 movq $0x00, %rdx adcq $0x00, %rdx shldq $0x01, %r13, %rdx imulq $19, %rdx btr $63, %r13 # Sub modulus (if overflow) addq %rdx, %r10 adcq $0x00, %r11 adcq $0x00, %r12 adcq $0x00, %r13 # Sub subq (%rsi), %r14 sbbq 8(%rsi), %r15 sbbq 16(%rsi), %rbx sbbq 24(%rsi), %rbp sbbq %rdx, %rdx shldq $0x01, %rbp, %rdx imulq $-19, %rdx btr $63, %rbp # Add modulus (if underflow) subq %rdx, %r14 sbbq $0x00, %r15 sbbq $0x00, %rbx sbbq $0x00, %rbp movq %r10, (%rsi) movq %r11, 8(%rsi) movq %r12, 16(%rsi) movq %r13, 24(%rsi) movq %r14, (%rdi) movq %r15, 8(%rdi) movq %rbx, 16(%rdi) movq %rbp, 24(%rdi) leaq 64(%rdi), %rsi leaq 96(%rdi), %rdi # Add-Sub # Add movq (%rsi), %r10 movq 8(%rsi), %r11 movq 16(%rsi), %r12 movq 24(%rsi), %r13 movq %r10, %r14 addq (%rdi), %r10 movq %r11, %r15 adcq 8(%rdi), %r11 movq %r12, %rbx adcq 16(%rdi), %r12 movq %r13, %rbp adcq 24(%rdi), %r13 movq $0x00, %rdx adcq $0x00, %rdx shldq $0x01, %r13, %rdx imulq $19, %rdx btr $63, %r13 # Sub modulus (if overflow) addq %rdx, %r10 adcq $0x00, %r11 adcq $0x00, %r12 adcq $0x00, %r13 # Sub subq (%rdi), %r14 sbbq 8(%rdi), %r15 sbbq 16(%rdi), %rbx sbbq 24(%rdi), %rbp sbbq %rdx, %rdx shldq $0x01, %rbp, %rdx imulq $-19, %rdx btr $63, %rbp # Add modulus (if underflow) subq %rdx, %r14 sbbq $0x00, %r15 sbbq $0x00, %rbx sbbq $0x00, %rbp movq %r10, (%rdi) movq %r11, 8(%rdi) movq %r12, 16(%rdi) movq %r13, 24(%rdi) movq %r14, (%rsi) movq %r15, 8(%rsi) movq %rbx, 16(%rsi) movq %rbp, 24(%rsi) addq $24, %rsp 
popq %rbp
        /* Remainder of the ge_sub_avx2 epilogue (function body is above). */
        popq %rbx
        popq %r15
        popq %r14
        popq %r13
        popq %r12
        repz retq
#ifndef __APPLE__
.size ge_sub_avx2,.-ge_sub_avx2
#endif /* __APPLE__ */
#ifdef HAVE_ED25519
/* fe_sq2_avx2(r, a): r = 2 * a^2, congruent modulo 2^255 - 19.
 *
 * In:  %rdi = r (four 64-bit limbs written)
 *      %rsi = a (four 64-bit limbs read)
 * Uses mulx with the two carry chains adcx (CF) and adox (OF).
 * Saves and restores %rbx and %r12-%r15; uses no stack locals.
 */
#ifndef __APPLE__
.text
.globl fe_sq2_avx2
.type fe_sq2_avx2,@function
.align 16
fe_sq2_avx2:
#else
.section __TEXT,__text
.globl _fe_sq2_avx2
.p2align 4
_fe_sq2_avx2:
#endif /* __APPLE__ */
        pushq %rbx
        pushq %r12
        pushq %r13
        pushq %r14
        pushq %r15
        # Square * 2
        /* A[0] is kept in %r15 and A[1] in %rax while the cross products
         * A[i] * A[j] (i != j) are accumulated in r9..r14. */
        movq (%rsi), %rdx
        movq 8(%rsi), %rax
        # A[0] * A[1]
        movq %rdx, %r15
        mulxq %rax, %r9, %r10
        # A[0] * A[3]
        mulxq 24(%rsi), %r11, %r12
        # A[2] * A[1]
        movq 16(%rsi), %rdx
        mulxq %rax, %rcx, %rbx
        xorq %r8, %r8
        adoxq %rcx, %r11
        # A[2] * A[3]
        mulxq 24(%rsi), %r13, %r14
        adoxq %rbx, %r12
        # A[2] * A[0]
        mulxq %r15, %rcx, %rbx
        adoxq %r8, %r13
        adcxq %rcx, %r10
        adoxq %r8, %r14
        # A[1] * A[3]
        movq %rax, %rdx
        mulxq 24(%rsi), %rcx, %rdx
        adcxq %rbx, %r11
        adcxq %rcx, %r12
        adcxq %rdx, %r13
        adcxq %r8, %r14
        /* Double the cross products (adcx reg,reg on the CF chain) while
         * adding the squares A[i]^2 on the OF chain; result in r8..r15. */
        # A[0] * A[0]
        movq %r15, %rdx
        mulxq %rdx, %r8, %rcx
        xorq %r15, %r15
        adcxq %r9, %r9
        # A[1] * A[1]
        movq %rax, %rdx
        adoxq %rcx, %r9
        mulxq %rdx, %rcx, %rbx
        adcxq %r10, %r10
        adoxq %rcx, %r10
        adcxq %r11, %r11
        # A[2] * A[2]
        movq 16(%rsi), %rdx
        adoxq %rbx, %r11
        mulxq %rdx, %rbx, %rcx
        adcxq %r12, %r12
        adoxq %rbx, %r12
        adcxq %r13, %r13
        # A[3] * A[3]
        movq 24(%rsi), %rdx
        adoxq %rcx, %r13
        mulxq %rdx, %rcx, %rbx
        adcxq %r14, %r14
        adoxq %rcx, %r14
        adcxq %r15, %r15
        adoxq %rbx, %r15
        /* Reduce r12..r15 into r8..r11: high limbs times 38
         * (2^256 = 38 mod 2^255 - 19), bits from 255 upwards times 19. */
        movq $38, %rdx
        mulxq %r15, %r15, %rax
        addq %r15, %r11
        adcq $0x00, %rax
        movq $0x7fffffffffffffff, %rcx
        shldq $0x01, %r11, %rax
        imulq $19, %rax, %rax
        andq %rcx, %r11
        xorq %rcx, %rcx
        adoxq %rax, %r8
        mulxq %r12, %rax, %r12
        adcxq %rax, %r8
        adoxq %r12, %r9
        mulxq %r13, %rax, %r13
        adcxq %rax, %r9
        adoxq %r13, %r10
        mulxq %r14, %rax, %r14
        adcxq %rax, %r10
        adoxq %r14, %r11
        adcxq %rcx, %r11
        /* Multiply by 2: shift r11:r10:r9:r8 left by one bit.  %rax keeps
         * the top two bits of the old r11, i.e. everything that ends up at
         * bit 255 or above after the shift, and is added back as 19 * rax. */
        movq %r11, %rax
        shldq $0x01, %r10, %r11
        shldq $0x01, %r9, %r10
        shldq $0x01, %r8, %r9
        shlq $1, %r8
        movq $0x7fffffffffffffff, %rcx
        shrq $62, %rax
        andq %rcx, %r11
        imulq $19, %rax, %rax
        addq %rax, %r8
        adcq $0x00, %r9
        adcq $0x00, %r10
        adcq $0x00, %r11
        # Store
        movq %r8, (%rdi)
        movq %r9, 8(%rdi)
        movq %r10, 16(%rdi)
        movq %r11, 24(%rdi)
        popq %r15
        popq %r14
        popq %r13
        popq %r12
        popq %rbx
        repz retq
#ifndef __APPLE__
.size fe_sq2_avx2,.-fe_sq2_avx2
#endif /* __APPLE__ */
/* sc_reduce_avx2(s): reduce the 512-bit value at s in place.
 *
 * In/out: %rdi = s.  Eight 64-bit words are read from 0..56(%rdi) and a
 *         four-word result is written back to 0..24(%rdi).
 * The modulus, called "order" in the step comments, is
 *   2^252 + 0x14def9dea2f79cd65812631a5cf5d3ed
 * as can be read from the masks of the final "Conditional sub order" step.
 * The multipliers 0xa7ed9ce5a30a2c13 and 0xeb2106215d086329 are the negated
 * low words of the order, written -5812631a5cf5d3ed and -14def9dea2f79cd7
 * in the step comments.
 * Saves and restores %r12-%r15, %rbx and %rbp; uses no stack locals.
 */
#ifndef __APPLE__
.text
.globl sc_reduce_avx2
.type sc_reduce_avx2,@function
.align 16
sc_reduce_avx2:
#else
.section __TEXT,__text
.globl _sc_reduce_avx2
.p2align 4
_sc_reduce_avx2:
#endif /* __APPLE__ */
        pushq %r12
        pushq %r13
        pushq %r14
        pushq %r15
        pushq %rbx
        pushq %rbp
        /* Load the eight input words into r8..r15. */
        movq (%rdi), %r8
        movq 8(%rdi), %r9
        movq 16(%rdi), %r10
        movq 24(%rdi), %r11
        movq 32(%rdi), %r12
        movq 40(%rdi), %r13
        movq 48(%rdi), %r14
        movq 56(%rdi), %r15
        /* %rax = top byte of the input.  The value is then split at bit
         * 252: r8..r11 keep the low 252 bits and r12..r15 hold the bits
         * above, shifted up by 4 with the top byte masked off. */
        movq %r15, %rax
        movq $0xfffffffffffffff, %rcx
        shrq $56, %rax
        shldq $4, %r14, %r15
        shldq $4, %r13, %r14
        shldq $4, %r12, %r13
        shldq $4, %r11, %r12
        andq %rcx, %r11
        andq %rcx, %r15
        # Add order times bits 504..511
        subq %rax, %r14
        sbbq $0x00, %r15
        movq $0xeb2106215d086329, %rdx
        mulxq %rax, %rsi, %rcx
        movq $0xa7ed9ce5a30a2c13, %rdx
        addq %rsi, %r13
        mulxq %rax, %rsi, %rbx
        adcq $0x00, %rcx
        addq %rsi, %r12
        adcq %rbx, %r13
        adcq %rcx, %r14
        adcq $0x00, %r15
        # Sub product of top 4 words and order
        movq $0xa7ed9ce5a30a2c13, %rdx
        mulxq %r12, %rcx, %rax
        addq %rcx, %r8
        adcq %rax, %r9
        mulxq %r14, %rcx, %rax
        adcq %rcx, %r10
        adcq %rax, %r11
        /* movq (not xor) zeroes the carry collector so CF survives. */
        movq $0x00, %rsi
        adcq $0x00, %rsi
        mulxq %r13, %rcx, %rax
        addq %rcx, %r9
        adcq %rax, %r10
        mulxq %r15, %rcx, %rax
        adcq %rcx, %r11
        adcq %rax, %rsi
        movq $0xeb2106215d086329, %rdx
        mulxq %r12, %rcx, %rax
        addq %rcx, %r9
        adcq %rax, %r10
        mulxq %r14, %rcx, %rax
        adcq %rcx, %r11
        adcq %rax, %rsi
        movq $0x00, %rbx
        adcq $0x00, %rbx
        mulxq %r13, %rcx, %rax
        addq %rcx, %r10
        adcq %rax, %r11
        mulxq %r15, %rcx, %rax
        adcq %rcx, %rsi
        adcq %rax, %rbx
        subq %r12, %r10
        movq %rsi, %r12
        sbbq %r13, %r11
        movq %rbx, %r13
        sbbq %r14, %r12
        sbbq %r15, %r13
        /* %rax = all ones when the intermediate is negative, else zero;
         * it masks the constants added in the next step. */
        movq %r13, %rax
        sarq $57, %rax
        # Conditionally subtract order starting at bit 125
        movq $0xa000000000000000, %rsi
        movq $0xcb024c634b9eba7d, %rbx
        movq $0x29bdf3bd45ef39a, %rbp
        movq $0x200000000000000, %rcx
        andq %rax, %rsi
        andq %rax, %rbx
        andq %rax, %rbp
        andq %rax, %rcx
        addq %rsi, %r9
        adcq %rbx, %r10
        adcq %rbp, %r11
        adcq $0x00, %r12
        adcq %rcx, %r13
        # Move bits 252-376 to own registers
        movq $0xfffffffffffffff, %rax
        shldq $4, %r12, %r13
        shldq $4, %r11, %r12
        andq %rax, %r11
        # Sub product of top 2 words and order
        # * -5812631a5cf5d3ed
        movq $0xa7ed9ce5a30a2c13, %rdx
        mulxq %r12, %rbp, %rax
        movq $0x00, %rsi
        addq %rbp, %r8
        adcq %rax, %r9
        mulxq %r13, %rbp, %rax
        adcq $0x00, %rsi
        addq %rbp, %r9
        adcq %rax, %rsi
        # * -14def9dea2f79cd7
        movq $0xeb2106215d086329, %rdx
        mulxq %r12, %rbp, %rax
        movq $0x00, %rbx
        addq %rbp, %r9
        adcq %rax, %r10
        mulxq %r13, %rbp, %rax
        adcq $0x00, %rbx
        addq %rbp, %r10
        adcq %rax, %rbx
        # Add overflows at 2 * 64
        movq $0xfffffffffffffff, %rcx
        andq %rcx, %r11
        addq %rsi, %r10
        adcq %rbx, %r11
        # Subtract top at 2 * 64
        subq %r12, %r10
        sbbq %r13, %r11
        /* %rcx = all ones on borrow, else zero. */
        sbbq %rcx, %rcx
        # Conditional sub order
        movq $0x5812631a5cf5d3ed, %rsi
        movq $0x14def9dea2f79cd6, %rbx
        movq $0x1000000000000000, %rbp
        andq %rcx, %rsi
        andq %rcx, %rbx
        andq %rcx, %rbp
        addq %rsi, %r8
        movq $0xfffffffffffffff, %rsi
        adcq %rbx, %r9
        adcq $0x00, %r10
        adcq %rbp, %r11
        andq %rsi, %r11
        # Store result
        movq %r8, (%rdi)
        movq %r9, 8(%rdi)
        movq %r10, 16(%rdi)
        movq %r11, 24(%rdi)
        popq %rbp
        popq %rbx
        popq %r15
        popq %r14
        popq %r13
        popq %r12
        repz retq
#ifndef __APPLE__
.size sc_reduce_avx2,.-sc_reduce_avx2
#endif /* __APPLE__ */
/* sc_muladd_avx2(r, a, b, c): r = a * b + c, reduced with the same order
 * constants as sc_reduce_avx2.
 *
 * In:  %rdi = r (four 64-bit words written)
 *      %rsi = a, %rdx = b, %rcx = c (four 64-bit words each)
 * b and c are moved to %r8 and %r9 because mulx needs %rdx and the multiply
 * uses %rcx as scratch.  In the step comments A[i] is a word of a and B[j]
 * a word of b.  The body continues past the end of this block.
 */
#ifndef __APPLE__
.text
.globl sc_muladd_avx2
.type sc_muladd_avx2,@function
.align 16
sc_muladd_avx2:
#else
.section __TEXT,__text
.globl _sc_muladd_avx2
.p2align 4
_sc_muladd_avx2:
#endif /* __APPLE__ */
        pushq %r12
        pushq %r13
        pushq %r14
        pushq %r15
        pushq %rbx
        pushq %rbp
        movq %rdx, %r8
        movq %rcx, %r9
        # Multiply
        # A[0] * B[0]
        movq (%r8), %rdx
        mulxq (%rsi), %r10, %r11
        # A[2] * B[0]
        mulxq 16(%rsi), %r12, %r13
        # A[1] * B[0]
        mulxq 8(%rsi), %rax, %rcx
        xorq %rbp, %rbp
        adcxq %rax, %r11
        # A[3] * B[1]
        movq 8(%r8), %rdx
        mulxq 24(%rsi), %r14, %r15
        adcxq %rcx, %r12
        # A[0] * B[1]
        mulxq (%rsi), 
%rax, %rcx adoxq %rax, %r11 # A[2] * B[1] mulxq 16(%rsi), %rax, %rbx adoxq %rcx, %r12 adcxq %rax, %r13 # A[1] * B[2] movq 16(%r8), %rdx mulxq 8(%rsi), %rax, %rcx adcxq %rbx, %r14 adoxq %rax, %r13 adcxq %rbp, %r15 adoxq %rcx, %r14 # A[0] * B[2] mulxq (%rsi), %rax, %rcx adoxq %rbp, %r15 xorq %rbx, %rbx adcxq %rax, %r12 # A[1] * B[1] movq 8(%r8), %rdx mulxq 8(%rsi), %rdx, %rax adcxq %rcx, %r13 adoxq %rdx, %r12 # A[1] * B[3] movq 24(%r8), %rdx adoxq %rax, %r13 mulxq 8(%rsi), %rax, %rcx adcxq %rax, %r14 # A[2] * B[2] movq 16(%r8), %rdx mulxq 16(%rsi), %rdx, %rax adcxq %rcx, %r15 adoxq %rdx, %r14 # A[3] * B[3] movq 24(%r8), %rdx adoxq %rax, %r15 mulxq 24(%rsi), %rax, %rcx adoxq %rbp, %rbx adcxq %rax, %rbx # A[0] * B[3] mulxq (%rsi), %rdx, %rax adcxq %rcx, %rbp xorq %rcx, %rcx adcxq %rdx, %r13 # A[3] * B[0] movq 24(%rsi), %rdx adcxq %rax, %r14 mulxq (%r8), %rdx, %rax adoxq %rdx, %r13 adoxq %rax, %r14 # A[3] * B[2] movq 24(%rsi), %rdx mulxq 16(%r8), %rdx, %rax adcxq %rdx, %r15 # A[2] * B[3] movq 24(%r8), %rdx adcxq %rax, %rbx mulxq 16(%rsi), %rax, %rdx adcxq %rcx, %rbp adoxq %rax, %r15 adoxq %rdx, %rbx adoxq %rcx, %rbp # Add c to a * b addq (%r9), %r10 adcq 8(%r9), %r11 adcq 16(%r9), %r12 adcq 24(%r9), %r13 adcq $0x00, %r14 adcq $0x00, %r15 adcq $0x00, %rbx adcq $0x00, %rbp movq %rbp, %rax movq $0xfffffffffffffff, %rcx shrq $56, %rax shldq $4, %rbx, %rbp shldq $4, %r15, %rbx shldq $4, %r14, %r15 shldq $4, %r13, %r14 andq %rcx, %r13 andq %rcx, %rbp # Add order times bits 504..507 subq %rax, %rbx sbbq $0x00, %rbp movq $0xeb2106215d086329, %rdx mulxq %rax, %rsi, %rcx movq $0xa7ed9ce5a30a2c13, %rdx addq %rsi, %r15 mulxq %rax, %rsi, %r8 adcq $0x00, %rcx addq %rsi, %r14 adcq %r8, %r15 adcq %rcx, %rbx adcq $0x00, %rbp # Sub product of top 4 words and order movq $0xa7ed9ce5a30a2c13, %rdx mulxq %r14, %rcx, %rax addq %rcx, %r10 adcq %rax, %r11 mulxq %rbx, %rcx, %rax adcq %rcx, %r12 adcq %rax, %r13 movq $0x00, %rsi adcq $0x00, %rsi mulxq %r15, %rcx, %rax addq %rcx, %r11 adcq %rax, 
%r12 mulxq %rbp, %rcx, %rax adcq %rcx, %r13 adcq %rax, %rsi movq $0xeb2106215d086329, %rdx mulxq %r14, %rcx, %rax addq %rcx, %r11 adcq %rax, %r12 mulxq %rbx, %rcx, %rax adcq %rcx, %r13 adcq %rax, %rsi movq $0x00, %r8 adcq $0x00, %r8 mulxq %r15, %rcx, %rax addq %rcx, %r12 adcq %rax, %r13 mulxq %rbp, %rcx, %rax adcq %rcx, %rsi adcq %rax, %r8 subq %r14, %r12 movq %rsi, %r14 sbbq %r15, %r13 movq %r8, %r15 sbbq %rbx, %r14 sbbq %rbp, %r15 movq %r15, %rax sarq $57, %rax # Conditionally subtract order starting at bit 125 movq $0xa000000000000000, %rsi movq $0xcb024c634b9eba7d, %r8 movq $0x29bdf3bd45ef39a, %r9 movq $0x200000000000000, %rcx andq %rax, %rsi andq %rax, %r8 andq %rax, %r9 andq %rax, %rcx addq %rsi, %r11 adcq %r8, %r12 adcq %r9, %r13 adcq $0x00, %r14 adcq %rcx, %r15 # Move bits 252-376 to own registers movq $0xfffffffffffffff, %rax shldq $4, %r14, %r15 shldq $4, %r13, %r14 andq %rax, %r13 # Sub product of top 2 words and order # * -5812631a5cf5d3ed movq $0xa7ed9ce5a30a2c13, %rdx mulxq %r14, %r9, %rax movq $0x00, %rsi addq %r9, %r10 adcq %rax, %r11 mulxq %r15, %r9, %rax adcq $0x00, %rsi addq %r9, %r11 adcq %rax, %rsi # * -14def9dea2f79cd7 movq $0xeb2106215d086329, %rdx mulxq %r14, %r9, %rax movq $0x00, %r8 addq %r9, %r11 adcq %rax, %r12 mulxq %r15, %r9, %rax adcq $0x00, %r8 addq %r9, %r12 adcq %rax, %r8 # Add overflows at 2 * 64 movq $0xfffffffffffffff, %rcx andq %rcx, %r13 addq %rsi, %r12 adcq %r8, %r13 # Subtract top at 2 * 64 subq %r14, %r12 sbbq %r15, %r13 sbbq %rcx, %rcx # Conditional sub order movq $0x5812631a5cf5d3ed, %rsi movq $0x14def9dea2f79cd6, %r8 movq $0x1000000000000000, %r9 andq %rcx, %rsi andq %rcx, %r8 andq %rcx, %r9 addq %rsi, %r10 movq $0xfffffffffffffff, %rsi adcq %r8, %r11 adcq $0x00, %r12 adcq %r9, %r13 andq %rsi, %r13 # Store result movq %r10, (%rdi) movq %r11, 8(%rdi) movq %r12, 16(%rdi) movq %r13, 24(%rdi) popq %rbp popq %rbx popq %r15 popq %r14 popq %r13 popq %r12 repz retq #ifndef __APPLE__ .size sc_muladd_avx2,.-sc_muladd_avx2 #endif 
/* __APPLE__ */
/* Constant tables used by fe_invert_nct_avx2.
 *
 * The routine keeps each coefficient as ten radix-2^26 limbs held in signed
 * 32-bit lanes, split across two vectors: one vector carries limbs 0,2,4,6,8
 * in dwords 0..4 and the other carries limbs 1,3,5,7,9 in dwords 0..4.
 * Dwords 5..7 of each vector are padding.
 */
/* The prime 2^255 - 19 in that layout: the first 32 bytes are the even limbs
 * (0x3ffffed, then 0x3ffffff four times), the next 32 bytes are the odd limbs
 * (0x3ffffff four times, then the 21-bit top limb 0x1fffff).
 */
#ifndef __APPLE__
.data
#else
.section __DATA,__data
#endif /* __APPLE__ */
#ifndef __APPLE__
.align 16
#else
.p2align 4
#endif /* __APPLE__ */
L_fe_invert_nct_avx2_prime:
.long 0x03ffffed,0x03ffffff,0x03ffffff,0x03ffffff
.long 0x03ffffff,0x00000000,0x00000000,0x00000000
.long 0x03ffffff,0x03ffffff,0x03ffffff,0x03ffffff
.long 0x001fffff,0x00000000,0x00000000,0x00000000
/* The value 1 in dword 0 only. Used both as the initial even-limb vector of
 * the coefficient that tracks v and as the vptest mask for bit 0 of limb 0.
 */
#ifndef __APPLE__
.data
#else
.section __DATA,__data
#endif /* __APPLE__ */
#ifndef __APPLE__
.align 32
#else
.p2align 5
#endif /* __APPLE__ */
L_fe_invert_nct_avx2_one:
.quad 0x0000000000000001,0x0000000000000000
.quad 0x0000000000000000,0x0000000000000000
/* A 1 in every dword: selects the low bit of each lane of an odd-limb vector. */
#ifndef __APPLE__
.data
#else
.section __DATA,__data
#endif /* __APPLE__ */
#ifndef __APPLE__
.align 16
#else
.p2align 4
#endif /* __APPLE__ */
L_fe_invert_nct_avx2_all_one:
.long 0x00000001,0x00000001,0x00000001,0x00000001
.long 0x00000001,0x00000001,0x00000001,0x00000001
/* A 1 in dwords 1..4: selects the low bit of even limbs 2,4,6,8 (limb 0 is
 * excluded, its low bit is simply shifted out when halving).
 */
#ifndef __APPLE__
.data
#else
.section __DATA,__data
#endif /* __APPLE__ */
#ifndef __APPLE__
.align 16
#else
.p2align 4
#endif /* __APPLE__ */
L_fe_invert_nct_avx2_mask01111:
.long 0x00000000,0x00000001,0x00000001,0x00000001
.long 0x00000001,0x00000000,0x00000000,0x00000000
/* vpermd index vector: destination dword i takes source dword i + 1
 * (the last entry repeats index 7).
 */
#ifndef __APPLE__
.data
#else
.section __DATA,__data
#endif /* __APPLE__ */
#ifndef __APPLE__
.align 16
#else
.p2align 4
#endif /* __APPLE__ */
L_fe_invert_nct_avx2_down_one_dword:
.long 0x00000001,0x00000002,0x00000003,0x00000004
.long 0x00000005,0x00000006,0x00000007,0x00000007
/* Sign bit of dword 4 only. Tested against an odd-limb vector, i.e. against
 * the sign of limb 9 (the most significant limb).
 */
#ifndef __APPLE__
.data
#else
.section __DATA,__data
#endif /* __APPLE__ */
#ifndef __APPLE__
.align 16
#else
.p2align 4
#endif /* __APPLE__ */
L_fe_invert_nct_avx2_neg:
.long 0x00000000,0x00000000,0x00000000,0x00000000
.long 0x80000000,0x00000000,0x00000000,0x00000000
/* Index vector 7,0,1,2,3,7,7,7. It is loaded into ymm13 by
 * fe_invert_nct_avx2 but ymm13 is not referenced again in that function.
 */
#ifndef __APPLE__
.data
#else
.section __DATA,__data
#endif /* __APPLE__ */
#ifndef __APPLE__
.align 16
#else
.p2align 4
#endif /* __APPLE__ */
L_fe_invert_nct_avx2_up_one_dword:
.long 0x00000007,0x00000000,0x00000001,0x00000002
.long 0x00000003,0x00000007,0x00000007,0x00000007
/* 26-bit mask in dwords 0..4. It is loaded into ymm14 by
 * fe_invert_nct_avx2 but ymm14 is not referenced again in that function.
 */
#ifndef __APPLE__
.data
#else
.section __DATA,__data
#endif /* __APPLE__ */
#ifndef __APPLE__
.align 16
#else
.p2align 4
#endif /* __APPLE__ */
L_fe_invert_nct_avx2_mask26:
.long 0x03ffffff,0x03ffffff,0x03ffffff,0x03ffffff
.long 0x03ffffff,0x00000000,0x00000000,0x00000000
/* Non-constant time modular inversion modulo 2^255 - 19.
 *
 * Shift-and-subtract (binary GCD style) inversion. u starts as the prime and
 * v as a. Each has a coefficient that is updated in lock step with it:
 * subtracted when u or v is subtracted, halved modulo the prime when u or v
 * is halved. The loop stops when u or v equals 1 and the matching
 * coefficient is converted and stored. Branches depend on the operand
 * values, so execution time depends on a.
 *
 * @param  [out]  r   Resulting number. Four 64-bit words are stored at (%rdi).
 * @param  [in]   a   Number to invert. Four 64-bit words are loaded from
 *                    (%rsi).
 *
 * The modulus is fixed: the general registers are seeded with 2^255 - 19 and
 * the vector copy comes from L_fe_invert_nct_avx2_prime. No third argument
 * is read and no status value is placed in %rax before returning (%rax holds
 * the low result word at that point).
 *
 * Register use:
 *   rax, rcx, r8, r9     u, least significant word first
 *   r10, r11, r12, r13   v, least significant word first
 *   ymm0 / ymm1          coefficient of u (even limbs / odd limbs)
 *   ymm2 / ymm3          coefficient of v (even limbs / odd limbs)
 *   ymm4, ymm5           scratch for the bits moved between limbs
 *   ymm6 / ymm7          prime (even limbs / odd limbs)
 *   ymm8..ymm12          one, mask01111, all_one, down_one_dword, neg
 *   rbx                  table address scratch, rdx scratch
 * rbx and r12-r15 are saved and restored; vzeroupper is issued before return.
 *
 * NOTE(review): when a is all zero, v stays even forever and the first
 * halving loop never exits - confirm callers never pass zero (or a multiple
 * of the prime).
 */
#ifndef __APPLE__
.text
.globl fe_invert_nct_avx2
.type fe_invert_nct_avx2,@function
.align 16
fe_invert_nct_avx2:
#else
.section __TEXT,__text
.globl _fe_invert_nct_avx2
.p2align 4
_fe_invert_nct_avx2:
#endif /* __APPLE__ */
        # Save the callee-saved registers this routine overwrites
        pushq %r12
        pushq %r13
        pushq %r14
        pushq %r15
        pushq %rbx
        # Set u = 2^255 - 19
        movq $-19, %rax
        movq $-1, %rcx
        movq $-1, %r8
        movq $0x7fffffffffffffff, %r9
        # Set v = a
        movq (%rsi), %r10
        movq 8(%rsi), %r11
        movq 16(%rsi), %r12
        movq 24(%rsi), %r13
        # Load the constant vectors (unaligned loads, rbx is address scratch)
        leaq L_fe_invert_nct_avx2_prime(%rip), %rbx
        vmovupd (%rbx), %ymm6
        vmovupd 32(%rbx), %ymm7
        leaq L_fe_invert_nct_avx2_one(%rip), %rbx
        vmovupd (%rbx), %ymm8
        leaq L_fe_invert_nct_avx2_mask01111(%rip), %rbx
        vmovupd (%rbx), %ymm9
        leaq L_fe_invert_nct_avx2_all_one(%rip), %rbx
        vmovupd (%rbx), %ymm10
        leaq L_fe_invert_nct_avx2_down_one_dword(%rip), %rbx
        vmovupd (%rbx), %ymm11
        leaq L_fe_invert_nct_avx2_neg(%rip), %rbx
        vmovupd (%rbx), %ymm12
        # NOTE(review): ymm13 and ymm14 are not used after these loads
        leaq L_fe_invert_nct_avx2_up_one_dword(%rip), %rbx
        vmovupd (%rbx), %ymm13
        leaq L_fe_invert_nct_avx2_mask26(%rip), %rbx
        vmovupd (%rbx), %ymm14
        # Coefficient of u = 0, coefficient of v = 1
        vpxor %xmm0, %xmm0, %xmm0
        vpxor %xmm1, %xmm1, %xmm1
        vmovdqu %ymm8, %ymm2
        vpxor %xmm3, %xmm3, %xmm3
        # Make v odd before entering the main loop
        testb $0x01, %r10b
        jnz L_fe_invert_nct_avx2_v_even_end
L_fe_invert_nct_avx2_v_even_start:
        # Shift v right by one bit across its four words
        shrdq $1, %r11, %r10
        shrdq $1, %r12, %r11
        shrdq $1, %r13, %r12
        shrq $1, %r13
        # Halve the coefficient of v modulo the prime: when bit 0 of limb 0
        # is set, add the prime first so the value becomes even
        vptest %ymm8, %ymm2
        jz L_fe_invert_nct_avx2_v_even_shr1
        vpaddd %ymm6, %ymm2, %ymm2
        vpaddd %ymm7, %ymm3, %ymm3
L_fe_invert_nct_avx2_v_even_shr1:
        # Pick up the low bit of even limbs 2,4,6,8 (ymm4) and of the odd
        # limbs (ymm5) before they are shifted out
        vpand %ymm9, %ymm2, %ymm4
        vpand %ymm10, %ymm3, %ymm5
        # Move the even-limb bits down one dword so that the bit of limb
        # 2i+2 lines up with odd limb 2i+1
        vpermd %ymm4, %ymm11, %ymm4
        # Arithmetic shift keeps the sign of each limb
        vpsrad $0x01, %ymm2, %ymm2
        vpsrad $0x01, %ymm3, %ymm3
        # Re-insert the saved bits at bit 25 of the limb below. Only dwords
        # 0..3 of ymm4 are needed, the xmm form clears the upper half
        vpslld $25, %ymm5, %ymm5
        vpslld $25, %xmm4, %xmm4
        vpaddd %ymm5, %ymm2, %ymm2
        vpaddd %ymm4, %ymm3, %ymm3
        testb $0x01, %r10b
        jz L_fe_invert_nct_avx2_v_even_start
L_fe_invert_nct_avx2_v_even_end:
L_fe_invert_nct_avx2_uv_start:
        # Unsigned compare of u against v, most significant word first.
        # Branch to uv_v when u is below v, to uv_u when u is above v
        cmpq %r13, %r9
        jb L_fe_invert_nct_avx2_uv_v
        ja L_fe_invert_nct_avx2_uv_u
        cmpq %r12, %r8
        jb L_fe_invert_nct_avx2_uv_v
        ja L_fe_invert_nct_avx2_uv_u
        cmpq %r11, %rcx
        jb L_fe_invert_nct_avx2_uv_v
        ja L_fe_invert_nct_avx2_uv_u
        cmpq %r10, %rax
        jb L_fe_invert_nct_avx2_uv_v
        # Equal values fall through into the u path as well
L_fe_invert_nct_avx2_uv_u:
        # Compute u -= v together with its coefficient. The vector subtracts
        # sit between the sbb instructions and leave the borrow flag alone
        subq %r10, %rax
        sbbq %r11, %rcx
        vpsubd %ymm2, %ymm0, %ymm0
        sbbq %r12, %r8
        vpsubd %ymm3, %ymm1, %ymm1
        sbbq %r13, %r9
        # Add the prime back when limb 9 of the coefficient went negative
        vptest %ymm12, %ymm1
        jz L_fe_invert_nct_avx2_usubv_done_neg
        vpaddd %ymm6, %ymm0, %ymm0
        vpaddd %ymm7, %ymm1, %ymm1
L_fe_invert_nct_avx2_usubv_done_neg:
L_fe_invert_nct_avx2_usubv_shr1:
        # Halve u, repeated below until u is odd
        shrdq $1, %rcx, %rax
        shrdq $1, %r8, %rcx
        shrdq $1, %r9, %r8
        shrq $1, %r9
        # Halve the coefficient of u modulo the prime (same steps as for v
        # in the v_even loop)
        vptest %ymm8, %ymm0
        jz L_fe_invert_nct_avx2_usubv_sub_shr1
        vpaddd %ymm6, %ymm0, %ymm0
        vpaddd %ymm7, %ymm1, %ymm1
L_fe_invert_nct_avx2_usubv_sub_shr1:
        vpand %ymm9, %ymm0, %ymm4
        vpand %ymm10, %ymm1, %ymm5
        vpermd %ymm4, %ymm11, %ymm4
        vpsrad $0x01, %ymm0, %ymm0
        vpsrad $0x01, %ymm1, %ymm1
        vpslld $25, %ymm5, %ymm5
        vpslld $25, %xmm4, %xmm4
        vpaddd %ymm5, %ymm0, %ymm0
        vpaddd %ymm4, %ymm1, %ymm1
        testb $0x01, %al
        jz L_fe_invert_nct_avx2_usubv_shr1
        # Finished only when u == 1: low word is 1 and the upper three
        # words are all zero. Otherwise go round the main loop again
        cmpq $0x01, %rax
        jne L_fe_invert_nct_avx2_uv_start
        movq %rcx, %rdx
        orq %r8, %rdx
        jne L_fe_invert_nct_avx2_uv_start
        orq %r9, %rdx
        jne L_fe_invert_nct_avx2_uv_start
        # Pull the ten limbs of the coefficient of u into general registers:
        # limbs 0..9 land in eax, ecx, r8d, r9d, r10d, r11d, r12d, r13d,
        # r14d, r15d (u and v are no longer needed)
        vpextrd $0x00, %xmm0, %eax
        vpextrd $0x01, %xmm0, %r8d
        vpextrd $2, %xmm0, %r10d
        vpextrd $3, %xmm0, %r12d
        vpextrd $0x00, %xmm1, %ecx
        vpextrd $0x01, %xmm1, %r9d
        vpextrd $2, %xmm1, %r11d
        vpextrd $3, %xmm1, %r13d
        vextracti128 $0x01, %ymm0, %xmm0
        vextracti128 $0x01, %ymm1, %xmm1
        vpextrd $0x00, %xmm0, %r14d
        vpextrd $0x00, %xmm1, %r15d
        jmp L_fe_invert_nct_avx2_store_done
L_fe_invert_nct_avx2_uv_v:
        # Mirror image of the u path: v -= u together with its coefficient
        subq %rax, %r10
        sbbq %rcx, %r11
        vpsubd %ymm0, %ymm2, %ymm2
        sbbq %r8, %r12
        vpsubd %ymm1, %ymm3, %ymm3
        sbbq %r9, %r13
        # Add the prime back when limb 9 of the coefficient went negative
        vptest %ymm12, %ymm3
        jz L_fe_invert_nct_avx2_vsubu_done_neg
        vpaddd %ymm6, %ymm2, %ymm2
        vpaddd %ymm7, %ymm3, %ymm3
L_fe_invert_nct_avx2_vsubu_done_neg:
L_fe_invert_nct_avx2_vsubu_shr1:
        # Halve v, repeated below until v is odd
        shrdq $1, %r11, %r10
        shrdq $1, %r12, %r11
        shrdq $1, %r13, %r12
        shrq $1, %r13
        # Halve the coefficient of v modulo the prime
        vptest %ymm8, %ymm2
        jz L_fe_invert_nct_avx2_vsubu_sub_shr1
        vpaddd %ymm6, %ymm2, %ymm2
        vpaddd %ymm7, %ymm3, %ymm3
L_fe_invert_nct_avx2_vsubu_sub_shr1:
        vpand %ymm9, %ymm2, %ymm4
        vpand %ymm10, %ymm3, %ymm5
        vpermd %ymm4, %ymm11, %ymm4
        vpsrad $0x01, %ymm2, %ymm2
        vpsrad $0x01, %ymm3, %ymm3
        vpslld $25, %ymm5, %ymm5
        vpslld $25, %xmm4, %xmm4
        vpaddd %ymm5, %ymm2, %ymm2
        vpaddd %ymm4, %ymm3, %ymm3
        testb $0x01, %r10b
        jz L_fe_invert_nct_avx2_vsubu_shr1
        # Finished only when v == 1, otherwise go round the main loop again
        cmpq $0x01, %r10
        jne L_fe_invert_nct_avx2_uv_start
        movq %r11, %rdx
        orq %r12, %rdx
        jne L_fe_invert_nct_avx2_uv_start
        orq %r13, %rdx
        jne L_fe_invert_nct_avx2_uv_start
        # Pull the ten limbs of the coefficient of v into general registers,
        # same register assignment as on the u path
        vpextrd $0x00, %xmm2, %eax
        vpextrd $0x01, %xmm2, %r8d
        vpextrd $2, %xmm2, %r10d
        vpextrd $3, %xmm2, %r12d
        vpextrd $0x00, %xmm3, %ecx
        vpextrd $0x01, %xmm3, %r9d
        vpextrd $2, %xmm3, %r11d
        vpextrd $3, %xmm3, %r13d
        vextracti128 $0x01, %ymm2, %xmm2
        vextracti128 $0x01, %ymm3, %xmm3
        vpextrd $0x00, %xmm2, %r14d
        vpextrd $0x00, %xmm3, %r15d
L_fe_invert_nct_avx2_store_done:
        # Carry-propagate through the limbs: keep the low 26 bits of each
        # limb and add the arithmetically shifted remainder to the next one.
        # Limb 9 (r15d) receives the final carry and is left unmasked, so it
        # carries the sign of the whole value
        movl %eax, %edx
        andl $0x3ffffff, %eax
        sarl $26, %edx
        addl %edx, %ecx
        movl %ecx, %edx
        andl $0x3ffffff, %ecx
        sarl $26, %edx
        addl %edx, %r8d
        movl %r8d, %edx
        andl $0x3ffffff, %r8d
        sarl $26, %edx
        addl %edx, %r9d
        movl %r9d, %edx
        andl $0x3ffffff, %r9d
        sarl $26, %edx
        addl %edx, %r10d
        movl %r10d, %edx
        andl $0x3ffffff, %r10d
        sarl $26, %edx
        addl %edx, %r11d
        movl %r11d, %edx
        andl $0x3ffffff, %r11d
        sarl $26, %edx
        addl %edx, %r12d
        movl %r12d, %edx
        andl $0x3ffffff, %r12d
        sarl $26, %edx
        addl %edx, %r13d
        movl %r13d, %edx
        andl $0x3ffffff, %r13d
        sarl $26, %edx
        addl %edx, %r14d
        movl %r14d, %edx
        andl $0x3ffffff, %r14d
        sarl $26, %edx
        addl %edx, %r15d
        # Merge limb pairs into five radix-2^52 words held in rax, r8, r10,
        # r12, r14: word k = limb 2k + (limb 2k+1 << 26), sign-extended
        movslq %ecx, %rcx
        movslq %r9d, %r9
        movslq %r11d, %r11
        movslq %r13d, %r13
        movslq %r15d, %r15
        shlq $26, %rcx
        shlq $26, %r9
        shlq $26, %r11
        shlq $26, %r13
        shlq $26, %r15
        # Limbs 0..8 were masked to 26 bits, so the first four sums stay
        # below 2^52 and hand no carry to the adcq that follows them
        movslq %eax, %rax
        addq %rcx, %rax
        movslq %r8d, %r8
        adcq %r9, %r8
        movslq %r10d, %r10
        adcq %r11, %r10
        movslq %r12d, %r12
        adcq %r13, %r12
        movslq %r14d, %r14
        adcq %r15, %r14
        # Flags come from the top word: skip the correction when it is not
        # negative as a signed value
        jge L_fe_invert_nct_avx2_uv_start_no_add_prime
        # Negative result: add 2^255 - 19 expressed as radix-2^52 words
        movq $0xfffffffffffed, %rcx
        movq $0xfffffffffffff, %r9
        movq $0xfffffffffffff, %r11
        movq $0xfffffffffffff, %r13
        movq $0x7fffffffffff, %r15
        addq %rcx, %rax
        addq %r9, %r8
        addq %r11, %r10
        addq %r13, %r12
        addq %r15, %r14
        # Carry-propagate between the 52-bit words (rdx = 52-bit mask)
        movq $0xfffffffffffff, %rdx
        movq %rax, %rcx
        andq %rdx, %rax
        sarq $52, %rcx
        addq %rcx, %r8
        movq %r8, %r9
        andq %rdx, %r8
        sarq $52, %r9
        addq %r9, %r10
        movq %r10, %r11
        andq %rdx, %r10
        sarq $52, %r11
        addq %r11, %r12
        movq %r12, %r13
        andq %rdx, %r12
        sarq $52, %r13
        addq %r13, %r14
L_fe_invert_nct_avx2_uv_start_no_add_prime:
        # Repack five 52-bit words into four 64-bit words. Word k starts at
        # bit 52k, so words 1..4 contribute at bit offsets 52, 40, 28 and 16
        # of output words 0..3, with their upper parts shifted down to seed
        # the next output word
        movq %r8, %rcx
        movq %r10, %r9
        movq %r12, %r11
        shlq $52, %rcx
        sarq $12, %r8
        shlq $40, %r9
        sarq $24, %r10
        shlq $28, %r11
        sarq $36, %r12
        shlq $16, %r14
        addq %rcx, %rax
        adcq %r9, %r8
        adcq %r11, %r10
        adcq %r14, %r12
        # Store result
        movq %rax, (%rdi)
        movq %r8, 8(%rdi)
        movq %r10, 16(%rdi)
        movq %r12, 24(%rdi)
        # Clear the upper halves of the ymm registers before returning
        vzeroupper
        popq %rbx
        popq %r15
        popq %r14
        popq %r13
        popq %r12
        repz retq
#ifndef __APPLE__
.size fe_invert_nct_avx2,.-fe_invert_nct_avx2
#endif /* __APPLE__ */
#endif /* HAVE_ED25519 */
#endif /* HAVE_INTEL_AVX2 */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif