/* x86_vector_register_glue.c -- glue logic to save and restore vector registers * on x86 * * Copyright (C) 2006-2026 wolfSSL Inc. * * This file is part of wolfSSL. * * wolfSSL is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3 of the License, or * (at your option) any later version. * * wolfSSL is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA */ /* included by linuxkm/module_hooks.c */ #ifndef WC_SKIP_INCLUDED_C_FILES #if !defined(WOLFSSL_USE_SAVE_VECTOR_REGISTERS) || !defined(CONFIG_X86) #error x86_vector_register_glue.c included in non-vectorized/non-x86 project. #endif #ifdef WOLFSSL_LINUXKM_VERBOSE_DEBUG #define VRG_PR_ERR_X pr_err #define VRG_PR_WARN_X pr_warn #else #define VRG_PR_ERR_X pr_err_once #define VRG_PR_WARN_X pr_warn_once #endif static unsigned int wc_linuxkm_fpu_states_n_tracked = 0; struct wc_thread_fpu_count_ent { volatile pid_t pid; unsigned int fpu_state; }; struct wc_thread_fpu_count_ent *wc_linuxkm_fpu_states = NULL; #define WC_FPU_COUNT_MASK 0x3fffffffU #define WC_FPU_INHIBITED_FLAG 0x40000000U WARN_UNUSED_RESULT int allocate_wolfcrypt_linuxkm_fpu_states(void) { if (wc_linuxkm_fpu_states != NULL) { #ifdef HAVE_FIPS /* see note below in wc_linuxkm_fpu_state_assoc_unlikely(). */ return 0; #else static int warned_for_repeat_alloc = 0; if (! warned_for_repeat_alloc) { pr_err("BUG: attempt at repeat allocation" " in allocate_wolfcrypt_linuxkm_fpu_states.\n"); warned_for_repeat_alloc = 1; } return BAD_STATE_E; #endif } wc_linuxkm_fpu_states_n_tracked = nr_cpu_ids; wc_linuxkm_fpu_states = (struct wc_thread_fpu_count_ent *)malloc( wc_linuxkm_fpu_states_n_tracked * sizeof(wc_linuxkm_fpu_states[0])); if (! wc_linuxkm_fpu_states) { pr_err("ERROR: allocation of %lu bytes for " "wc_linuxkm_fpu_states failed.\n", nr_cpu_ids * sizeof(struct fpu_state *)); return MEMORY_E; } memset(wc_linuxkm_fpu_states, 0, wc_linuxkm_fpu_states_n_tracked * sizeof(wc_linuxkm_fpu_states[0])); return 0; } void free_wolfcrypt_linuxkm_fpu_states(void) { struct wc_thread_fpu_count_ent *i, *i_endptr; pid_t i_pid; if (wc_linuxkm_fpu_states == NULL) return; for (i = wc_linuxkm_fpu_states, i_endptr = &wc_linuxkm_fpu_states[wc_linuxkm_fpu_states_n_tracked]; i < i_endptr; ++i) { i_pid = __atomic_load_n(&i->pid, __ATOMIC_CONSUME); if (i_pid == 0) continue; if (i->fpu_state != 0) { pr_err("ERROR: free_wolfcrypt_linuxkm_fpu_states called" " with nonzero state 0x%x for PID %d.\n", i->fpu_state, i_pid); i->fpu_state = 0; } } free(wc_linuxkm_fpu_states); wc_linuxkm_fpu_states = NULL; } /* lock-free O(1)-lookup CPU-local storage facility for tracking recursive fpu * pushing/popping. * * caller must have already locked itself on its CPU before entering this, or * entering the streamlined inline version of it below. */ static struct wc_thread_fpu_count_ent *wc_linuxkm_fpu_state_assoc_unlikely(int create_p) { int my_cpu = raw_smp_processor_id(); pid_t my_pid = task_pid_nr(current), slot_pid; struct wc_thread_fpu_count_ent *slot; { static int _warned_on_null = 0; if (wc_linuxkm_fpu_states == NULL) { #ifdef HAVE_FIPS /* FIPS needs to use SHA256 for the core verify HMAC, before * reaching the regular wolfCrypt_Init() logic. to break the * dependency loop on intelasm builds, we allocate here. * this is not thread-safe and doesn't need to be. */ int ret = allocate_wolfcrypt_linuxkm_fpu_states(); if (ret != 0) #endif { if (_warned_on_null == 0) { pr_err("BUG: wc_linuxkm_fpu_state_assoc called by PID %d" " before allocate_wolfcrypt_linuxkm_fpu_states.\n", my_pid); _warned_on_null = 1; } return NULL; } } } slot = &wc_linuxkm_fpu_states[my_cpu]; slot_pid = __atomic_load_n(&slot->pid, __ATOMIC_CONSUME); if (slot_pid == my_pid) { if (create_p) { static int _warned_on_redundant_create_p = 0; if (_warned_on_redundant_create_p < 10) { pr_err("BUG: wc_linuxkm_fpu_state_assoc called with create_p=1 by" " PID %d on CPU %d with CPU slot already reserved by" " said PID.\n", my_pid, my_cpu); ++_warned_on_redundant_create_p; } } return slot; } if (create_p) { if (slot_pid == 0) { __atomic_store_n(&slot->pid, my_pid, __ATOMIC_RELEASE); return slot; } else { struct pid *slot_pid_struct; /* if the slot is already occupied, that can be benign-ish due to a * unwanted migration, or due to a process crashing in kernel mode. * it will require fixup either here, or by the thread that owns the * slot, which will happen when it releases its lock. */ slot_pid_struct = find_get_pid(slot_pid); if (slot_pid_struct == NULL) { if (__atomic_compare_exchange_n(&slot->pid, &slot_pid, my_pid, 0, __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE)) { pr_warn("WARNING: wc_linuxkm_fpu_state_assoc_unlikely fixed up orphaned slot on CPU %d owned by dead PID %d.\n", my_cpu, slot_pid); return slot; } } else { /* drop the refcount bumped by find_get_pid(). */ put_pid(slot_pid_struct); } { static int _warned_on_mismatched_pid = 0; if (_warned_on_mismatched_pid < 10) { pr_warn("WARNING: wc_linuxkm_fpu_state_assoc called by pid %d on CPU %d" " but CPU slot already reserved by pid %d.\n", my_pid, my_cpu, slot_pid); ++_warned_on_mismatched_pid; } } return NULL; } } else { /* check for migration. this can happen despite our best efforts if any * I/O occurred while locked, e.g. kernel messages like "uninitialized * urandom read". since we're locked now, we can safely migrate the * entry in wc_linuxkm_fpu_states[], freeing up the slot on the previous * cpu. */ unsigned int cpu_i; for (cpu_i = 0; cpu_i < wc_linuxkm_fpu_states_n_tracked; ++cpu_i) { if (__atomic_load_n( &wc_linuxkm_fpu_states[cpu_i].pid, __ATOMIC_CONSUME) == my_pid) { wc_linuxkm_fpu_states[my_cpu] = wc_linuxkm_fpu_states[cpu_i]; __atomic_store_n(&wc_linuxkm_fpu_states[cpu_i].fpu_state, 0, __ATOMIC_RELEASE); __atomic_store_n(&wc_linuxkm_fpu_states[cpu_i].pid, 0, __ATOMIC_RELEASE); return &wc_linuxkm_fpu_states[my_cpu]; } } return NULL; } } static inline struct wc_thread_fpu_count_ent *wc_linuxkm_fpu_state_assoc( int create_p, int assume_fpu_began) { int my_cpu = raw_smp_processor_id(); /* my_cpu is only trustworthy if we're * already nonpreemptible -- we'll * determine that soon enough by * checking if the pid matches or, * failing that, if create_p. */ pid_t my_pid, slot_pid; struct wc_thread_fpu_count_ent *slot; if (unlikely(wc_linuxkm_fpu_states == NULL)) { if (! assume_fpu_began) { /* this was just a quick check for whether we're in a recursive * wc_save_vector_registers_x86(). we're not. */ return NULL; } else return wc_linuxkm_fpu_state_assoc_unlikely(create_p); } my_pid = task_pid_nr(current); slot = &wc_linuxkm_fpu_states[my_cpu]; slot_pid = __atomic_load_n(&slot->pid, __ATOMIC_CONSUME); if (slot_pid == my_pid) { if (unlikely(create_p)) return wc_linuxkm_fpu_state_assoc_unlikely(create_p); else return slot; } if (! assume_fpu_began) { /* this was just a quick check for whether we're in a recursive * wc_save_vector_registers_x86(). we're not. * * if we're in a softirq context, we'll always wind up here, because * processes with entries in wc_linuxkm_fpu_states[] always have * softirqs inhibited. */ return NULL; } if (likely(create_p)) { if (likely(slot_pid == 0)) { __atomic_store_n(&slot->pid, my_pid, __ATOMIC_RELEASE); return slot; } else { return wc_linuxkm_fpu_state_assoc_unlikely(create_p); } } else { return wc_linuxkm_fpu_state_assoc_unlikely(create_p); } } static void wc_linuxkm_fpu_state_release_unlikely( struct wc_thread_fpu_count_ent *ent) { if (ent->fpu_state != 0) { static int warned_nonzero_fpu_state = 0; if (! warned_nonzero_fpu_state) { VRG_PR_ERR_X("ERROR: wc_linuxkm_fpu_state_free for pid %d on CPU %d" " with nonzero fpu_state 0x%x.\n", ent->pid, raw_smp_processor_id(), ent->fpu_state); warned_nonzero_fpu_state = 1; } ent->fpu_state = 0; } __atomic_store_n(&ent->pid, 0, __ATOMIC_RELEASE); } static inline void wc_linuxkm_fpu_state_release( struct wc_thread_fpu_count_ent *ent) { if (unlikely(ent->fpu_state != 0)) return wc_linuxkm_fpu_state_release_unlikely(ent); __atomic_store_n(&ent->pid, 0, __ATOMIC_RELEASE); } WARN_UNUSED_RESULT int wc_can_save_vector_registers_x86(void) { struct wc_thread_fpu_count_ent *pstate; /* check for hard interrupt context (unusable current->pid) preemptively. * if we're in a softirq context we'll catch that below with * a second preempt_count() check. */ if (((preempt_count() & (NMI_MASK | HARDIRQ_MASK)) != 0) || (task_pid_nr(current) == 0)) return 0; /* Check if we're already saved, per wc_linuxkm_fpu_states. */ pstate = wc_linuxkm_fpu_state_assoc(0, 0); if ((pstate != NULL) && (pstate->fpu_state != 0U)) { if (unlikely(pstate->fpu_state & WC_FPU_INHIBITED_FLAG)) return 0; if (unlikely((pstate->fpu_state & WC_FPU_COUNT_MASK) == WC_FPU_COUNT_MASK)) { /* would overflow */ return 0; } else { return 1; } } if ((preempt_count() == 0) || may_use_simd()) return 1; else return 0; } WARN_UNUSED_RESULT int wc_save_vector_registers_x86(enum wc_svr_flags flags) { struct wc_thread_fpu_count_ent *pstate; /* check for hard interrupt context (unusable current->pid) preemptively. * if we're in a softirq context we'll catch that below with * a second look at preempt_count(). */ if (((preempt_count() & (NMI_MASK | HARDIRQ_MASK)) != 0) || (task_pid_nr(current) == 0)) { if (! (flags & WC_SVR_FLAG_INHIBIT)) VRG_PR_WARN_X("WARNING: wc_save_vector_registers_x86(0x%x) called with preempt_count 0x%x and pid %d on CPU %d.\n", (unsigned)flags, preempt_count(), task_pid_nr(current), raw_smp_processor_id()); return WC_ACCEL_INHIBIT_E; } pstate = wc_linuxkm_fpu_state_assoc(0, 0); /* allow for nested calls */ if (pstate && (pstate->fpu_state != 0U)) { if (pstate->fpu_state & WC_FPU_INHIBITED_FLAG) { /* don't allow recursive inhibit calls when already inhibited -- * it would add no functionality and require keeping a separate * count of inhibit recursions. */ return WC_ACCEL_INHIBIT_E; } if (unlikely((pstate->fpu_state & WC_FPU_COUNT_MASK) == WC_FPU_COUNT_MASK)) { pr_err("ERROR: wc_save_vector_registers_x86 recursion register overflow for " "pid %d on CPU %d.\n", pstate->pid, raw_smp_processor_id()); return BAD_STATE_E; } if (flags & WC_SVR_FLAG_INHIBIT) { ++pstate->fpu_state; pstate->fpu_state |= WC_FPU_INHIBITED_FLAG; return 0; } else { ++pstate->fpu_state; return 0; } __builtin_unreachable(); } { int ret = WC_CHECK_FOR_INTR_SIGNALS(); if (ret) return ret; } WC_RELAX_LONG_LOOP(); if (flags & WC_SVR_FLAG_INHIBIT) { if ((preempt_count() != 0) && !may_use_simd()) return WC_ACCEL_INHIBIT_E; /* not an error here, just a * short-circuit result. */ /* we need to inhibit migration and softirqs here to assure that we can * support recursive calls safely, i.e. without mistaking a softirq * context for a recursion. */ #if defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_COUNT) && \ (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 7, 0)) migrate_disable(); #endif local_bh_disable(); if (preempt_count() == 0) { VRG_PR_ERR_X("BUG: wc_save_vector_registers_x86(): zero preempt_count after local_bh_disable() on CPU %d.\n", raw_smp_processor_id()); #if defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_COUNT) && \ (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 7, 0)) migrate_enable(); #endif local_bh_enable(); return WC_ACCEL_INHIBIT_E; } pstate = wc_linuxkm_fpu_state_assoc(1, 1); if (pstate == NULL) { #if defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_COUNT) && \ (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 7, 0)) migrate_enable(); #endif local_bh_enable(); return BAD_STATE_E; } pstate->fpu_state = WC_FPU_INHIBITED_FLAG + 1U; return 0; } if ((preempt_count() == 0) || may_use_simd()) { /* fpregs_lock() calls either local_bh_disable() or preempt_disable() * depending on CONFIG_PREEMPT_RT -- we call both, explicitly. * * empirically, on some kernels, kernel_fpu_begin() doesn't reliably * disable softirqs, indeed doesn't make preempt_count() nonzero, which * breaks our locking algorithm. we sidestep this completely by * explicitly disabling softirq's, preemption, and migration. * helpfully, the calls to do that are all guaranteed recursion-safe. */ #if defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_COUNT) && \ (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 7, 0)) migrate_disable(); #endif local_bh_disable(); #if IS_ENABLED(CONFIG_PREEMPT_RT) preempt_disable(); #endif kernel_fpu_begin(); pstate = wc_linuxkm_fpu_state_assoc(1, 1); if (pstate == NULL) { kernel_fpu_end(); #if IS_ENABLED(CONFIG_PREEMPT_RT) preempt_enable(); #endif local_bh_enable(); #if defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_COUNT) && \ (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 7, 0)) migrate_enable(); #endif return BAD_STATE_E; } /* set msb to 0 to trigger kernel_fpu_end() at cleanup. */ pstate->fpu_state = 1U; if (preempt_count() == 0) { VRG_PR_ERR_X("BUG: wc_save_vector_registers_x86(): zero preempt_count after kernel_fpu_begin() on CPU %d.\n", raw_smp_processor_id()); } return 0; } else { VRG_PR_WARN_X("WARNING: wc_save_vector_registers_x86 called with no saved state and nonzero preempt_count 0x%x on CPU %d.\n", preempt_count(), raw_smp_processor_id()); #ifdef WOLFSSL_LINUXKM_VERBOSE_DEBUG dump_stack(); #endif return WC_ACCEL_INHIBIT_E; } __builtin_unreachable(); } void wc_restore_vector_registers_x86(enum wc_svr_flags flags) { struct wc_thread_fpu_count_ent *pstate; if (((preempt_count() & (NMI_MASK | HARDIRQ_MASK)) != 0) || (task_pid_nr(current) == 0)) { VRG_PR_WARN_X("BUG: wc_restore_vector_registers_x86() called from interrupt handler on CPU %d.\n", raw_smp_processor_id()); return; } pstate = wc_linuxkm_fpu_state_assoc(0, 1); if (unlikely(pstate == NULL)) { VRG_PR_WARN_X("BUG: wc_restore_vector_registers_x86() called by pid %d on CPU %d " "with no saved state.\n", task_pid_nr(current), raw_smp_processor_id()); return; } if ((--pstate->fpu_state & WC_FPU_COUNT_MASK) > 0U) { if (flags & WC_SVR_FLAG_INHIBIT) { if (pstate->fpu_state & WC_FPU_INHIBITED_FLAG) pstate->fpu_state &= ~WC_FPU_INHIBITED_FLAG; else VRG_PR_WARN_X("BUG: wc_restore_vector_registers_x86() called by pid %d on CPU %d " "with _INHIBIT flag but saved state isn't _INHIBITED_.\n", task_pid_nr(current), raw_smp_processor_id()); } return; } if (pstate->fpu_state == 0U) { wc_linuxkm_fpu_state_release(pstate); kernel_fpu_end(); #if IS_ENABLED(CONFIG_PREEMPT_RT) preempt_enable(); #endif local_bh_enable(); } else if (unlikely(pstate->fpu_state & WC_FPU_INHIBITED_FLAG)) { if (unlikely(! (flags & WC_SVR_FLAG_INHIBIT))) VRG_PR_WARN_X("BUG: wc_restore_vector_registers_x86() called by pid %d on CPU %d " "without _INHIBIT flag but saved state is _INHIBITED_.\n", task_pid_nr(current), raw_smp_processor_id()); pstate->fpu_state = 0U; wc_linuxkm_fpu_state_release(pstate); local_bh_enable(); } #if defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_COUNT) && \ (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 7, 0)) migrate_enable(); #endif WC_RELAX_LONG_LOOP(); return; } #endif /* !WC_SKIP_INCLUDED_C_FILES */