1#include "ggml-backend-impl.h"
2
3#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
4
5#ifdef _MSC_VER
6#include <intrin.h>
7#endif
8
9#include <cstring>
10#include <vector>
11#include <bitset>
12#include <array>
13#include <string>
14
15// ref: https://cdrdv2-public.intel.com/782156/325383-sdm-vol-2abcd.pdf
// Snapshot of the x86 CPUID feature flags of the CPU executing the constructor.
//
// Constructing an instance runs the CPUID instruction for the leaves listed
// below and caches the raw registers in 32-bit bitsets; every accessor is a
// plain bit test on that cached data:
//   f_1_*    - leaf 0x00000001            (ECX, EDX)
//   f_7_*    - leaf 0x00000007, sub-leaf 0 (EBX, ECX, EDX)
//   f_7_1_*  - leaf 0x00000007, sub-leaf 1 (EAX)
//   f_81_*   - leaf 0x80000001            (ECX, EDX)
//
// The accessors only report what CPUID advertises; they do not tell whether
// the operating system has enabled the corresponding register state.
struct cpuid_x86 {
    // leaf 0x00000001, ECX
    bool SSE3()        const { return f_1_ecx[0]; }
    bool PCLMULQDQ()   const { return f_1_ecx[1]; }
    bool MONITOR()     const { return f_1_ecx[3]; }
    bool SSSE3()       const { return f_1_ecx[9]; }
    bool FMA()         const { return f_1_ecx[12]; }
    bool CMPXCHG16B()  const { return f_1_ecx[13]; }
    bool SSE41()       const { return f_1_ecx[19]; }
    bool SSE42()       const { return f_1_ecx[20]; }
    bool MOVBE()       const { return f_1_ecx[22]; }
    bool POPCNT()      const { return f_1_ecx[23]; }
    bool AES()         const { return f_1_ecx[25]; }
    bool XSAVE()       const { return f_1_ecx[26]; }
    bool OSXSAVE()     const { return f_1_ecx[27]; }
    bool AVX()         const { return f_1_ecx[28]; }
    bool F16C()        const { return f_1_ecx[29]; }
    bool RDRAND()      const { return f_1_ecx[30]; }

    // leaf 0x00000001, EDX
    bool MSR()         const { return f_1_edx[5]; }
    bool CX8()         const { return f_1_edx[8]; }
    bool SEP()         const { return f_1_edx[11]; }
    bool CMOV()        const { return f_1_edx[15]; }
    bool CLFSH()       const { return f_1_edx[19]; }
    bool MMX()         const { return f_1_edx[23]; }
    bool FXSR()        const { return f_1_edx[24]; }
    bool SSE()         const { return f_1_edx[25]; }
    bool SSE2()        const { return f_1_edx[26]; }

    // leaf 0x00000007 sub-leaf 0, EBX (HLE and RTM are only reported for Intel CPUs)
    bool FSGSBASE()    const { return f_7_ebx[0]; }
    bool BMI1()        const { return f_7_ebx[3]; }
    bool HLE()         const { return is_intel && f_7_ebx[4]; }
    bool AVX2()        const { return f_7_ebx[5]; }
    bool BMI2()        const { return f_7_ebx[8]; }
    bool ERMS()        const { return f_7_ebx[9]; }
    bool INVPCID()     const { return f_7_ebx[10]; }
    bool RTM()         const { return is_intel && f_7_ebx[11]; }
    bool AVX512F()     const { return f_7_ebx[16]; }
    bool AVX512DQ()    const { return f_7_ebx[17]; }
    bool RDSEED()      const { return f_7_ebx[18]; }
    bool ADX()         const { return f_7_ebx[19]; }
    bool AVX512PF()    const { return f_7_ebx[26]; }
    bool AVX512ER()    const { return f_7_ebx[27]; }
    bool AVX512CD()    const { return f_7_ebx[28]; }
    bool SHA()         const { return f_7_ebx[29]; }
    bool AVX512BW()    const { return f_7_ebx[30]; }
    bool AVX512VL()    const { return f_7_ebx[31]; }

    // leaf 0x00000007 sub-leaf 0, ECX
    bool PREFETCHWT1() const { return f_7_ecx[0]; }

    // leaf 0x80000001, ECX
    // Bit 5 is exposed as LZCNT on Intel and as ABM on AMD; the remaining
    // vendor-gated bits are only reported for the vendor named in the check.
    bool LAHF()        const { return f_81_ecx[0]; }
    bool LZCNT()       const { return is_intel && f_81_ecx[5]; }
    bool ABM()         const { return is_amd && f_81_ecx[5]; }
    bool SSE4a()       const { return is_amd && f_81_ecx[6]; }
    bool XOP()         const { return is_amd && f_81_ecx[11]; }
    bool TBM()         const { return is_amd && f_81_ecx[21]; }

    // leaf 0x80000001, EDX
    // NOTE(review): SYSCALL and RDTSCP are gated on is_intel, so they always
    // read false on AMD parts - confirm this is intended before relying on them.
    bool SYSCALL()     const { return is_intel && f_81_edx[11]; }
    bool MMXEXT()      const { return is_amd && f_81_edx[22]; }
    bool RDTSCP()      const { return is_intel && f_81_edx[27]; }
    bool _3DNOWEXT()   const { return is_amd && f_81_edx[30]; }
    bool _3DNOW()      const { return is_amd && f_81_edx[31]; }

    // AVX-512 / AVX extensions spread over leaf 7 sub-leaf 0 (ECX, EDX) and sub-leaf 1 (EAX)
    bool AVX512_VBMI() const { return f_7_ecx[1]; }
    bool AVX512_VNNI() const { return f_7_ecx[11]; }
    bool AVX512_FP16() const { return f_7_edx[23]; }
    bool AVX512_BF16() const { return f_7_1_eax[5]; }
    bool AVX_VNNI()    const { return f_7_1_eax[4]; }

    // AMX: leaf 7 sub-leaf 0 (EDX) and sub-leaf 1 (EAX)
    bool AMX_TILE()    const { return f_7_edx[24]; }
    bool AMX_INT8()    const { return f_7_edx[25]; }
    bool AMX_FP16()    const { return f_7_1_eax[21]; }
    bool AMX_BF16()    const { return f_7_edx[22]; }

    // Thin wrappers around the CPUID instruction.
    // cpu_info receives EAX, EBX, ECX, EDX (in that order); `eax` selects the
    // leaf and `ecx` the sub-leaf (cpuid() always uses sub-leaf 0 on non-MSVC builds).
#ifdef _MSC_VER
    static void cpuid(int cpu_info[4], int eax) {
        __cpuid(cpu_info, eax);
    }
    static void cpuidex(int cpu_info[4], int eax, int ecx) {
        __cpuidex(cpu_info, eax, ecx);
    }
#else
    static void cpuid(int cpu_info[4], int eax) {
        __asm__ __volatile__(
            "cpuid"
            : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
            : "a"(eax), "c"(0));
    }
    static void cpuidex(int cpu_info[4], int eax, int ecx) {
        __asm__ __volatile__(
            "cpuid"
            : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
            : "a"(eax), "c"(ecx));
    }
#endif

    // Queries CPUID and fills in vendor, brand and all feature bitsets.
    // Only the leaves that are actually consumed are queried.
    cpuid_x86() {
        std::array<int, 4> regs = {};

        // leaf 0: EAX = highest supported basic leaf, EBX:EDX:ECX = vendor string
        cpuid(regs.data(), 0);
        const int max_leaf = regs[0];

        // copy the registers byte-wise: writing through an int* into a char
        // buffer would violate the aliasing/alignment rules
        char vendor_buf[3 * sizeof(int) + 1] = {};
        std::memcpy(vendor_buf + 0 * sizeof(int), &regs[1], sizeof(int));
        std::memcpy(vendor_buf + 1 * sizeof(int), &regs[3], sizeof(int));
        std::memcpy(vendor_buf + 2 * sizeof(int), &regs[2], sizeof(int));
        vendor = vendor_buf;

        if (vendor == "GenuineIntel") {
            is_intel = true;
        } else if (vendor == "AuthenticAMD") {
            is_amd = true;
        }

        // leaf 0x00000001
        if (max_leaf >= 1) {
            cpuidex(regs.data(), 1, 0);
            f_1_ecx = regs[2];
            f_1_edx = regs[3];
        }

        // leaf 0x00000007
        if (max_leaf >= 7) {
            cpuidex(regs.data(), 7, 0);
            // sub-leaf 0 reports the highest valid sub-leaf index in EAX
            const unsigned int max_subleaf = static_cast<unsigned int>(regs[0]);
            f_7_ebx = regs[1];
            f_7_ecx = regs[2];
            f_7_edx = regs[3];

            // only read sub-leaf 1 when the CPU declares it valid
            if (max_subleaf >= 1) {
                cpuidex(regs.data(), 7, 1);
                f_7_1_eax = regs[0];
            }
        }

        // leaf 0x80000000: EAX = highest supported extended leaf
        cpuid(regs.data(), static_cast<int>(0x80000000u));
        const unsigned int max_ext_leaf = static_cast<unsigned int>(regs[0]);

        // leaf 0x80000001
        if (max_ext_leaf >= 0x80000001u) {
            cpuidex(regs.data(), static_cast<int>(0x80000001u), 0);
            f_81_ecx = regs[2];
            f_81_edx = regs[3];
        }

        // leaves 0x80000002..0x80000004: 48-byte CPU brand string, 16 bytes per leaf
        if (max_ext_leaf >= 0x80000004u) {
            char brand_buf[3 * 4 * sizeof(int) + 1] = {};
            for (unsigned int part = 0; part < 3; ++part) {
                cpuidex(regs.data(), static_cast<int>(0x80000002u + part), 0);
                std::memcpy(brand_buf + part * 4 * sizeof(int), regs.data(), 4 * sizeof(int));
            }
            brand = brand_buf;
        }
    }

    bool is_intel = false;      // vendor == "GenuineIntel"
    bool is_amd   = false;      // vendor == "AuthenticAMD"
    std::string vendor;         // 12-character vendor id from leaf 0
    std::string brand;          // brand string, empty if the CPU does not report it
    std::bitset<32> f_1_ecx;
    std::bitset<32> f_1_edx;
    std::bitset<32> f_7_ebx;
    std::bitset<32> f_7_ecx;
    std::bitset<32> f_7_edx;
    std::bitset<32> f_7_1_eax;
    std::bitset<32> f_81_ecx;
    std::bitset<32> f_81_edx;
};
193
194#if 0
195void test_x86_is() {
196 cpuid_x86 is;
197 printf("CPU Vendor: %s\n", is.vendor.c_str());
198 printf("Brand: %s\n", is.brand.c_str());
199 printf("is_intel: %d\n", is.is_intel);
200 printf("is_amd: %d\n", is.is_amd);
201 printf("sse3: %d\n", is.SSE3());
202 printf("pclmulqdq: %d\n", is.PCLMULQDQ());
203 printf("ssse3: %d\n", is.SSSE3());
204 printf("fma: %d\n", is.FMA());
205 printf("cmpxchg16b: %d\n", is.CMPXCHG16B());
206 printf("sse41: %d\n", is.SSE41());
207 printf("sse42: %d\n", is.SSE42());
208 printf("movbe: %d\n", is.MOVBE());
209 printf("popcnt: %d\n", is.POPCNT());
210 printf("aes: %d\n", is.AES());
211 printf("xsave: %d\n", is.XSAVE());
212 printf("osxsave: %d\n", is.OSXSAVE());
213 printf("avx: %d\n", is.AVX());
214 printf("f16c: %d\n", is.F16C());
215 printf("rdrand: %d\n", is.RDRAND());
216 printf("msr: %d\n", is.MSR());
217 printf("cx8: %d\n", is.CX8());
218 printf("sep: %d\n", is.SEP());
219 printf("cmov: %d\n", is.CMOV());
220 printf("clflush: %d\n", is.CLFSH());
221 printf("mmx: %d\n", is.MMX());
222 printf("fxsr: %d\n", is.FXSR());
223 printf("sse: %d\n", is.SSE());
224 printf("sse2: %d\n", is.SSE2());
225 printf("fsgsbase: %d\n", is.FSGSBASE());
226 printf("bmi1: %d\n", is.BMI1());
227 printf("hle: %d\n", is.HLE());
228 printf("avx2: %d\n", is.AVX2());
229 printf("bmi2: %d\n", is.BMI2());
230 printf("erms: %d\n", is.ERMS());
231 printf("invpcid: %d\n", is.INVPCID());
232 printf("rtm: %d\n", is.RTM());
233 printf("avx512f: %d\n", is.AVX512F());
234 printf("rdseed: %d\n", is.RDSEED());
235 printf("adx: %d\n", is.ADX());
236 printf("avx512pf: %d\n", is.AVX512PF());
237 printf("avx512er: %d\n", is.AVX512ER());
238 printf("avx512cd: %d\n", is.AVX512CD());
239 printf("sha: %d\n", is.SHA());
240 printf("prefetchwt1: %d\n", is.PREFETCHWT1());
241 printf("lahf: %d\n", is.LAHF());
242 printf("lzcnt: %d\n", is.LZCNT());
243 printf("abm: %d\n", is.ABM());
244 printf("sse4a: %d\n", is.SSE4a());
245 printf("xop: %d\n", is.XOP());
246 printf("tbm: %d\n", is.TBM());
247 printf("syscall: %d\n", is.SYSCALL());
248 printf("mmxext: %d\n", is.MMXEXT());
249 printf("rdtscp: %d\n", is.RDTSCP());
250 printf("3dnowext: %d\n", is._3DNOWEXT());
251 printf("3dnow: %d\n", is._3DNOW());
252 printf("avx512_vbmi: %d\n", is.AVX512_VBMI());
253 printf("avx512_vnni: %d\n", is.AVX512_VNNI());
254 printf("avx512_fp16: %d\n", is.AVX512_FP16());
255 printf("avx512_bf16: %d\n", is.AVX512_BF16());
256 printf("amx_tile: %d\n", is.AMX_TILE());
257 printf("amx_int8: %d\n", is.AMX_INT8());
258 printf("amx_fp16: %d\n", is.AMX_FP16());
259 printf("amx_bf16: %d\n", is.AMX_BF16());
260}
261#endif
262
263static int ggml_backend_cpu_x86_score() {
264 // FIXME: this does not check for OS support
265
266 int score = 1;
267 cpuid_x86 is;
268
269#ifdef GGML_FMA
270 if (!is.FMA()) { return 0; }
271 score += 1;
272#endif
273#ifdef GGML_F16C
274 if (!is.F16C()) { return 0; }
275 score += 1<<1;
276#endif
277#ifdef GGML_SSE42
278 if (!is.SSE42()) { return 0; }
279 score += 1<<2;
280#endif
281#ifdef GGML_BMI2
282 if (!is.BMI2()) { return 0; }
283 score += 1<<3;
284#endif
285#ifdef GGML_AVX
286 if (!is.AVX()) { return 0; }
287 score += 1<<4;
288#endif
289#ifdef GGML_AVX2
290 if (!is.AVX2()) { return 0; }
291 score += 1<<5;
292#endif
293#ifdef GGML_AVX_VNNI
294 if (!is.AVX_VNNI()) { return 0; }
295 score += 1<<6;
296#endif
297#ifdef GGML_AVX512
298 if (!is.AVX512F()) { return 0; }
299 if (!is.AVX512CD()) { return 0; }
300 if (!is.AVX512VL()) { return 0; }
301 if (!is.AVX512DQ()) { return 0; }
302 if (!is.AVX512BW()) { return 0; }
303 score += 1<<7;
304#endif
305#ifdef GGML_AVX512_VBMI
306 if (!is.AVX512_VBMI()) { return 0; }
307 score += 1<<8;
308#endif
309#ifdef GGML_AVX512_BF16
310 if (!is.AVX512_BF16()) { return 0; }
311 score += 1<<9;
312#endif
313#ifdef GGML_AVX512_VNNI
314 if (!is.AVX512_VNNI()) { return 0; }
315 score += 1<<10;
316#endif
317#ifdef GGML_AMX_INT8
318 if (!is.AMX_INT8()) { return 0; }
319 score += 1<<11;
320#endif
321
322 return score;
323}
324
// Hand the scoring function to GGML_BACKEND_DL_SCORE_IMPL.
// NOTE(review): the macro is not defined in this file; presumably it comes from
// ggml-backend-impl.h and exposes the score to the dynamic backend loader - confirm.
GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_x86_score)
326
327#endif // defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))