/* Initialize x86 cache info.
   Copyright (C) 2020-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

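/* CPUID leaf 0x2 reports cache and TLB information as one-byte
   descriptors packed into EAX, EBX, ECX and EDX.  The table below maps
   the Intel-documented cache descriptors to the cache level and type
   (stored as an offset relative to _SC_LEVEL1_ICACHE_SIZE), the
   associativity, the line size and the total size.  Descriptors not
   listed here (e.g. TLB descriptors) are simply ignored.  */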
static const struct intel_02_cache_info
{
  unsigned char idx;
  unsigned char assoc;
  unsigned char linesize;
  unsigned char rel_name;
  unsigned int size;
} intel_02_known [] =
  {
#define M(sc) ((sc) - _SC_LEVEL1_ICACHE_SIZE)
    { 0x06,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),    8192 },
    { 0x08,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),   16384 },
    { 0x09,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),   32768 },
    { 0x0a,  2, 32, M(_SC_LEVEL1_DCACHE_SIZE),    8192 },
    { 0x0c,  4, 32, M(_SC_LEVEL1_DCACHE_SIZE),   16384 },
    { 0x0d,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),   16384 },
    { 0x0e,  6, 64, M(_SC_LEVEL1_DCACHE_SIZE),   24576 },
    { 0x21,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x22,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),   524288 },
    { 0x23,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  1048576 },
    { 0x25,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
    { 0x29,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
    { 0x2c,  8, 64, M(_SC_LEVEL1_DCACHE_SIZE),   32768 },
    { 0x30,  8, 64, M(_SC_LEVEL1_ICACHE_SIZE),   32768 },
    { 0x39,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),   131072 },
    { 0x3a,  6, 64, M(_SC_LEVEL2_CACHE_SIZE),   196608 },
    { 0x3b,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),   131072 },
    { 0x3c,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x3d,  6, 64, M(_SC_LEVEL2_CACHE_SIZE),   393216 },
    { 0x3e,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x3f,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x41,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),   131072 },
    { 0x42,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x43,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x44,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
    { 0x45,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),  2097152 },
    { 0x46,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
    { 0x47,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  8388608 },
    { 0x48, 12, 64, M(_SC_LEVEL2_CACHE_SIZE),  3145728 },
    { 0x49, 16, 64, M(_SC_LEVEL2_CACHE_SIZE),  4194304 },
    { 0x4a, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  6291456 },
    { 0x4b, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  8388608 },
    { 0x4c, 12, 64, M(_SC_LEVEL3_CACHE_SIZE), 12582912 },
    { 0x4d, 16, 64, M(_SC_LEVEL3_CACHE_SIZE), 16777216 },
    { 0x4e, 24, 64, M(_SC_LEVEL2_CACHE_SIZE),  6291456 },
    { 0x60,  8, 64, M(_SC_LEVEL1_DCACHE_SIZE),   16384 },
    { 0x66,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),    8192 },
    { 0x67,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),   16384 },
    { 0x68,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),   32768 },
    { 0x78,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
    { 0x79,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   131072 },
    { 0x7a,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x7b,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x7c,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
    { 0x7d,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),  2097152 },
    { 0x7f,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x80,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x82,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x83,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x84,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
    { 0x85,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),  2097152 },
    { 0x86,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x87,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
    { 0xd0,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),   524288 },
    { 0xd1,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),  1048576 },
    { 0xd2,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
    { 0xd6,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  1048576 },
    { 0xd7,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
    { 0xd8,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
    { 0xdc, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
    { 0xdd, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
    { 0xde, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  8388608 },
    { 0xe2, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
    { 0xe3, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
    { 0xe4, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  8388608 },
    { 0xea, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 12582912 },
    { 0xeb, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 18874368 },
    { 0xec, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 25165824 },
  };

#define nintel_02_known (sizeof (intel_02_known) / sizeof (intel_02_known [0]))

static int
intel_02_known_compare (const void *p1, const void *p2)
{
  const struct intel_02_cache_info *i1;
  const struct intel_02_cache_info *i2;

  i1 = (const struct intel_02_cache_info *) p1;
  i2 = (const struct intel_02_cache_info *) p2;

  if (i1->idx == i2->idx)
    return 0;

  return i1->idx < i2->idx ? -1 : 1;
}

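/* Decode one register's worth of CPUID leaf 0x2 descriptor bytes and
   return the requested cache parameter if a matching descriptor is
   found, zero otherwise.  HAS_LEVEL_2 and NO_LEVEL_2_OR_3 are updated
   as a side effect so the caller can tell whether the CPU reported an
   L2/L3 cache at all.  */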
static long int
__attribute__ ((noinline))
intel_check_word (int name, unsigned int value, bool *has_level_2,
                  bool *no_level_2_or_3,
                  const struct cpu_features *cpu_features)
{
  if ((value & 0x80000000) != 0)
    /* The register value is reserved.  */
    return 0;

  /* Fold the name.  The _SC_ constants are always in the order SIZE,
     ASSOC, LINESIZE.  */
  int folded_rel_name = (M(name) / 3) * 3;

  while (value != 0)
    {
      unsigned int byte = value & 0xff;

      if (byte == 0x40)
        {
          *no_level_2_or_3 = true;

          if (folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
            /* No need to look further.  */
            break;
        }
      else if (byte == 0xff)
        {
          /* CPUID leaf 0x4 contains all the information.  We need to
             iterate over it.  */
          unsigned int eax;
          unsigned int ebx;
          unsigned int ecx;
          unsigned int edx;

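          /* Each round of CPUID leaf 0x4 describes one cache:
             EAX[4:0] is the cache type, EAX[7:5] the cache level,
             EBX[31:22] the ways of associativity - 1, EBX[21:12] the
             physical line partitions - 1, EBX[11:0] the line size - 1
             and ECX the number of sets - 1.  The total cache size is
             the product of these four (incremented) values.  */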
          unsigned int round = 0;
          while (1)
            {
              __cpuid_count (4, round, eax, ebx, ecx, edx);

              enum { null = 0, data = 1, inst = 2, uni = 3 } type = eax & 0x1f;
              if (type == null)
                /* That was the end.  */
                break;

              unsigned int level = (eax >> 5) & 0x7;

              if ((level == 1 && type == data
                   && folded_rel_name == M(_SC_LEVEL1_DCACHE_SIZE))
                  || (level == 1 && type == inst
                      && folded_rel_name == M(_SC_LEVEL1_ICACHE_SIZE))
                  || (level == 2 && folded_rel_name == M(_SC_LEVEL2_CACHE_SIZE))
                  || (level == 3 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
                  || (level == 4 && folded_rel_name == M(_SC_LEVEL4_CACHE_SIZE)))
                {
                  unsigned int offset = M(name) - folded_rel_name;

                  if (offset == 0)
                    /* Cache size.  */
                    return (((ebx >> 22) + 1)
                            * (((ebx >> 12) & 0x3ff) + 1)
                            * ((ebx & 0xfff) + 1)
                            * (ecx + 1));
                  if (offset == 1)
                    return (ebx >> 22) + 1;

                  assert (offset == 2);
                  return (ebx & 0xfff) + 1;
                }

              ++round;
            }
          /* There is no other cache information anywhere else.  */
          break;
        }
      else
        {
          if (byte == 0x49 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
            {
              /* Intel reused this value.  For family 15, model 6 it
                 specifies the 3rd level cache.  Otherwise the 2nd
                 level cache.  */
              unsigned int family = cpu_features->basic.family;
              unsigned int model = cpu_features->basic.model;

              if (family == 15 && model == 6)
                {
                  /* The level 3 cache is encoded for this model like
                     the level 2 cache is for other models.  Pretend
                     the caller asked for the level 2 cache.  */
                  name = (_SC_LEVEL2_CACHE_SIZE
                          + (name - _SC_LEVEL3_CACHE_SIZE));
                  folded_rel_name = M(_SC_LEVEL2_CACHE_SIZE);
                }
            }

          struct intel_02_cache_info *found;
          struct intel_02_cache_info search;

          search.idx = byte;
          found = bsearch (&search, intel_02_known, nintel_02_known,
                           sizeof (intel_02_known[0]), intel_02_known_compare);
          if (found != NULL)
            {
              if (found->rel_name == folded_rel_name)
                {
                  unsigned int offset = M(name) - folded_rel_name;

                  if (offset == 0)
                    /* Cache size.  */
                    return found->size;
                  if (offset == 1)
                    return found->assoc;

                  assert (offset == 2);
                  return found->linesize;
                }

              if (found->rel_name == M(_SC_LEVEL2_CACHE_SIZE))
                *has_level_2 = true;
            }
        }

      /* Next byte for the next round.  */
      value >>= 8;
    }

  /* Nothing found.  */
  return 0;
}

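/* Query the cache information for Intel CPUs: execute CPUID leaf 0x2
   the number of times indicated by the low byte of EAX in the first
   round and decode every register with intel_check_word.  Returns the
   value of the requested _SC_* parameter, 0 if it cannot be
   determined, or -1 if the CPU predates CPUID leaf 0x2 or reports that
   it has no L2/L3 cache.  */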
static long int __attribute__ ((noinline))
handle_intel (int name, const struct cpu_features *cpu_features)
{
  unsigned int maxidx = cpu_features->basic.max_cpuid;

  /* Return -1 for older CPUs.  */
  if (maxidx < 2)
    return -1;

  /* OK, we can use the CPUID instruction to get all info about the
     caches.  */
  unsigned int cnt = 0;
  unsigned int max = 1;
  long int result = 0;
  bool no_level_2_or_3 = false;
  bool has_level_2 = false;

  while (cnt++ < max)
    {
      unsigned int eax;
      unsigned int ebx;
      unsigned int ecx;
      unsigned int edx;
      __cpuid (2, eax, ebx, ecx, edx);

      /* The low byte of EAX in the first round contains the number of
         rounds we have to make.  At least one, the one we are already
         doing.  */
      if (cnt == 1)
        {
          max = eax & 0xff;
          eax &= 0xffffff00;
        }

      /* Process the individual registers' value.  */
      result = intel_check_word (name, eax, &has_level_2,
                                 &no_level_2_or_3, cpu_features);
      if (result != 0)
        return result;

      result = intel_check_word (name, ebx, &has_level_2,
                                 &no_level_2_or_3, cpu_features);
      if (result != 0)
        return result;

      result = intel_check_word (name, ecx, &has_level_2,
                                 &no_level_2_or_3, cpu_features);
      if (result != 0)
        return result;

      result = intel_check_word (name, edx, &has_level_2,
                                 &no_level_2_or_3, cpu_features);
      if (result != 0)
        return result;
    }

  if (name >= _SC_LEVEL2_CACHE_SIZE && name <= _SC_LEVEL3_CACHE_LINESIZE
      && no_level_2_or_3)
    return -1;

  return 0;
}

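/* Query the cache information for AMD CPUs via CPUID leaf 0x8000001D
   (cache properties), whose output registers mirror the layout of
   Intel's leaf 0x4.  Returns the requested _SC_* parameter or 0 if the
   selected cache is not present.  */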
static long int __attribute__ ((noinline))
handle_amd (int name)
{
  unsigned int eax;
  unsigned int ebx;
  unsigned int ecx;
  unsigned int edx;
  unsigned int count = 0x1;

  /* No level 4 cache (yet).  */
  if (name > _SC_LEVEL3_CACHE_LINESIZE)
    return 0;

  if (name >= _SC_LEVEL3_CACHE_SIZE)
    count = 0x3;
  else if (name >= _SC_LEVEL2_CACHE_SIZE)
    count = 0x2;
  else if (name >= _SC_LEVEL1_DCACHE_SIZE)
    count = 0x0;

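  /* The selection of COUNT above assumes that, on the AMD CPUs which
     implement this leaf, ECX index 0 enumerates the L1 data cache, 1
     the L1 instruction cache, 2 the L2 cache and 3 the L3 cache.  As
     with Intel leaf 0x4, EBX[31:22] holds the ways of associativity -
     1, EBX[11:0] the line size - 1 and ECX the number of sets - 1; the
     code below treats ECX == 0 (no sets reported) as the cache not
     being present.  */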
  __cpuid_count (0x8000001D, count, eax, ebx, ecx, edx);

  switch (name)
    {
    case _SC_LEVEL1_ICACHE_ASSOC:
    case _SC_LEVEL1_DCACHE_ASSOC:
    case _SC_LEVEL2_CACHE_ASSOC:
    case _SC_LEVEL3_CACHE_ASSOC:
      return ecx ? ((ebx >> 22) & 0x3ff) + 1 : 0;
    case _SC_LEVEL1_ICACHE_LINESIZE:
    case _SC_LEVEL1_DCACHE_LINESIZE:
    case _SC_LEVEL2_CACHE_LINESIZE:
    case _SC_LEVEL3_CACHE_LINESIZE:
      return ecx ? (ebx & 0xfff) + 1 : 0;
    case _SC_LEVEL1_ICACHE_SIZE:
    case _SC_LEVEL1_DCACHE_SIZE:
    case _SC_LEVEL2_CACHE_SIZE:
    case _SC_LEVEL3_CACHE_SIZE:
      return ecx
             ? (((ebx >> 22) & 0x3ff) + 1) * ((ebx & 0xfff) + 1) * (ecx + 1)
             : 0;
    default:
      __builtin_unreachable ();
    }
  return -1;
}

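/* Query the cache information for Zhaoxin CPUs, which implement the
   Intel-compatible deterministic cache parameters leaf (CPUID leaf
   0x4).  Returns the requested _SC_* parameter or 0 if nothing
   matching is found.  */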
static long int __attribute__ ((noinline))
handle_zhaoxin (int name)
{
  unsigned int eax;
  unsigned int ebx;
  unsigned int ecx;
  unsigned int edx;

  int folded_rel_name = (M(name) / 3) * 3;

  unsigned int round = 0;
  while (1)
    {
      __cpuid_count (4, round, eax, ebx, ecx, edx);

      enum { null = 0, data = 1, inst = 2, uni = 3 } type = eax & 0x1f;
      if (type == null)
        break;

      unsigned int level = (eax >> 5) & 0x7;

      if ((level == 1 && type == data
           && folded_rel_name == M(_SC_LEVEL1_DCACHE_SIZE))
          || (level == 1 && type == inst
              && folded_rel_name == M(_SC_LEVEL1_ICACHE_SIZE))
          || (level == 2 && folded_rel_name == M(_SC_LEVEL2_CACHE_SIZE))
          || (level == 3 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE)))
        {
          unsigned int offset = M(name) - folded_rel_name;

          if (offset == 0)
            /* Cache size.  */
            return (((ebx >> 22) + 1)
                    * (((ebx >> 12) & 0x3ff) + 1)
                    * ((ebx & 0xfff) + 1)
                    * (ecx + 1));
          if (offset == 1)
            return (ebx >> 22) + 1;

          assert (offset == 2);
          return (ebx & 0xfff) + 1;
        }

      ++round;
    }

  /* Nothing found.  */
  return 0;
}

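/* Determine how many logical processors share the highest cache level
   and adjust *SHARED_PTR, *SHARED_PER_THREAD_PTR and *THREADS_PTR
   accordingly, using CPUID leaves 0x4 and 0xb.  CORE is the L2 cache
   size; it is added back in for CPUs whose L2 and L3 caches are not
   inclusive.  */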
static void
get_common_cache_info (long int *shared_ptr,
                       long int *shared_per_thread_ptr,
                       unsigned int *threads_ptr, long int core)
{
  unsigned int eax;
  unsigned int ebx;
  unsigned int ecx;
  unsigned int edx;

  /* Number of logical processors sharing L2 cache.  */
  int threads_l2;

  /* Number of logical processors sharing L3 cache.  */
  int threads_l3;

  const struct cpu_features *cpu_features = __get_cpu_features ();
  int max_cpuid = cpu_features->basic.max_cpuid;
  unsigned int family = cpu_features->basic.family;
  unsigned int model = cpu_features->basic.model;
  long int shared = *shared_ptr;
  long int shared_per_thread = *shared_per_thread_ptr;
  unsigned int threads = *threads_ptr;
  bool inclusive_cache = true;
  bool support_count_mask = true;

  /* Try L3 first.  */
  unsigned int level = 3;

  if (cpu_features->basic.kind == arch_kind_zhaoxin && family == 6)
    support_count_mask = false;

  if (shared <= 0)
    {
      /* Try L2 otherwise.  */
      level = 2;
      shared = core;
      shared_per_thread = core;
      threads_l2 = 0;
      threads_l3 = -1;
    }
  else
    {
      threads_l2 = 0;
      threads_l3 = 0;
    }

  /* A value of 0 for the HTT bit indicates there is only a single
     logical processor.  */
  if (HAS_CPU_FEATURE (HTT))
    {
      /* Figure out the number of logical threads that share the
         highest cache level.  */
      if (max_cpuid >= 4)
        {
          int i = 0;

          /* Query until cache level 2 and 3 are enumerated.  */
          int check = 0x1 | (threads_l3 == 0) << 1;
          do
            {
              __cpuid_count (4, i++, eax, ebx, ecx, edx);

              /* There seems to be a bug in at least some Pentium Ds
                 which sometimes fail to iterate all cache parameters.
                 Do not loop indefinitely here, stop in this case and
                 assume there is no such information.  */
              if (cpu_features->basic.kind == arch_kind_intel
                  && (eax & 0x1f) == 0)
                goto intel_bug_no_cache_info;

              switch ((eax >> 5) & 0x7)
                {
                default:
                  break;
                case 2:
                  if ((check & 0x1))
                    {
                      /* Get maximum number of logical processors
                         sharing L2 cache.  */
                      threads_l2 = (eax >> 14) & 0x3ff;
                      check &= ~0x1;
                    }
                  break;
                case 3:
                  if ((check & (0x1 << 1)))
                    {
                      /* Get maximum number of logical processors
                         sharing L3 cache.  */
                      threads_l3 = (eax >> 14) & 0x3ff;

                      /* Check if L2 and L3 caches are inclusive.  */
                      inclusive_cache = (edx & 0x2) != 0;
                      check &= ~(0x1 << 1);
                    }
                  break;
                }
            }
          while (check);

          /* If max_cpuid >= 11, THREADS_L2/THREADS_L3 are the maximum
             numbers of addressable IDs for logical processors sharing
             the cache, instead of the maximum number of threads
             sharing the cache.  */
          if (max_cpuid >= 11 && support_count_mask)
            {
              /* Find the number of logical processors shipped in
                 one core and apply count mask.  */
              i = 0;

              /* Count SMT only if there is L3 cache.  Always count
                 core if there is no L3 cache.  */
              int count = ((threads_l2 > 0 && level == 3)
                           | ((threads_l3 > 0
                               || (threads_l2 > 0 && level == 2)) << 1));

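              /* CPUID leaf 0xb enumerates the processor topology one
                 level per round: EBX[15:0] is the number of logical
                 processors at the current level and ECX[15:8] is the
                 level type (1 for SMT, 2 for core).  The loop below
                 only looks at the low byte of EBX, which is enough for
                 the SMT and core counts it needs.  */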
              while (count)
                {
                  __cpuid_count (11, i++, eax, ebx, ecx, edx);

                  int shipped = ebx & 0xff;
                  int type = ecx & 0xff00;
                  if (shipped == 0 || type == 0)
                    break;
                  else if (type == 0x100)
                    {
                      /* Count SMT.  */
                      if ((count & 0x1))
                        {
                          int count_mask;

                          /* Compute count mask.  */
                          asm ("bsr %1, %0"
                               : "=r" (count_mask) : "g" (threads_l2));
                          count_mask = ~(-1 << (count_mask + 1));
                          threads_l2 = (shipped - 1) & count_mask;
                          count &= ~0x1;
                        }
                    }
                  else if (type == 0x200)
                    {
                      /* Count core.  */
                      if ((count & (0x1 << 1)))
                        {
                          int count_mask;
                          int threads_core
                            = (level == 2 ? threads_l2 : threads_l3);

                          /* Compute count mask.  */
                          asm ("bsr %1, %0"
                               : "=r" (count_mask) : "g" (threads_core));
                          count_mask = ~(-1 << (count_mask + 1));
                          threads_core = (shipped - 1) & count_mask;
                          if (level == 2)
                            threads_l2 = threads_core;
                          else
                            threads_l3 = threads_core;
                          count &= ~(0x1 << 1);
                        }
                    }
                }
            }
          if (threads_l2 > 0)
            threads_l2 += 1;
          if (threads_l3 > 0)
            threads_l3 += 1;
          if (level == 2)
            {
              if (threads_l2)
                {
                  threads = threads_l2;
                  if (cpu_features->basic.kind == arch_kind_intel
                      && threads > 2
                      && family == 6)
                    switch (model)
                      {
                      case 0x37:
                      case 0x4a:
                      case 0x4d:
                      case 0x5a:
                      case 0x5d:
                        /* Silvermont has L2 cache shared by 2 cores.  */
                        threads = 2;
                        break;
                      default:
                        break;
                      }
                }
            }
          else if (threads_l3)
            threads = threads_l3;
        }
      else
        {
        intel_bug_no_cache_info:
          /* Assume that all logical threads share the highest cache
             level.  */
          threads = ((cpu_features->features[CPUID_INDEX_1].cpuid.ebx >> 16)
                     & 0xff);

          /* Get per-thread size of highest level cache.  */
          if (shared_per_thread > 0 && threads > 0)
            shared_per_thread /= threads;
        }
    }

  /* Account for non-inclusive L2 and L3 caches.  */
  if (!inclusive_cache)
    {
      long int core_per_thread = threads_l2 > 0 ? (core / threads_l2) : core;
      shared_per_thread += core_per_thread;
      shared += core;
    }

  *shared_ptr = shared;
  *shared_per_thread_ptr = shared_per_thread;
  *threads_ptr = threads;
}

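/* Fill in the cache geometry fields of CPU_FEATURES and derive the
   thresholds (non-temporal store threshold, REP MOVSB/STOSB
   thresholds) used by the string and memory functions, honoring the
   corresponding tunables.  */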
static void
dl_init_cacheinfo (struct cpu_features *cpu_features)
{
  /* Find out what brand of processor this is.  */
  long int data = -1;
  long int shared = -1;
  long int shared_per_thread = -1;
  long int core = -1;
  unsigned int threads = 0;
  unsigned long int level1_icache_size = -1;
  unsigned long int level1_icache_linesize = -1;
  unsigned long int level1_dcache_size = -1;
  unsigned long int level1_dcache_assoc = -1;
  unsigned long int level1_dcache_linesize = -1;
  unsigned long int level2_cache_size = -1;
  unsigned long int level2_cache_assoc = -1;
  unsigned long int level2_cache_linesize = -1;
  unsigned long int level3_cache_size = -1;
  unsigned long int level3_cache_assoc = -1;
  unsigned long int level3_cache_linesize = -1;
  unsigned long int level4_cache_size = -1;

  if (cpu_features->basic.kind == arch_kind_intel)
    {
      data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, cpu_features);
      core = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
      shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features);
      shared_per_thread = shared;

      level1_icache_size
        = handle_intel (_SC_LEVEL1_ICACHE_SIZE, cpu_features);
      level1_icache_linesize
        = handle_intel (_SC_LEVEL1_ICACHE_LINESIZE, cpu_features);
      level1_dcache_size = data;
      level1_dcache_assoc
        = handle_intel (_SC_LEVEL1_DCACHE_ASSOC, cpu_features);
      level1_dcache_linesize
        = handle_intel (_SC_LEVEL1_DCACHE_LINESIZE, cpu_features);
      level2_cache_size = core;
      level2_cache_assoc
        = handle_intel (_SC_LEVEL2_CACHE_ASSOC, cpu_features);
      level2_cache_linesize
        = handle_intel (_SC_LEVEL2_CACHE_LINESIZE, cpu_features);
      level3_cache_size = shared;
      level3_cache_assoc
        = handle_intel (_SC_LEVEL3_CACHE_ASSOC, cpu_features);
      level3_cache_linesize
        = handle_intel (_SC_LEVEL3_CACHE_LINESIZE, cpu_features);
      level4_cache_size
        = handle_intel (_SC_LEVEL4_CACHE_SIZE, cpu_features);

      get_common_cache_info (&shared, &shared_per_thread, &threads, core);
    }
  else if (cpu_features->basic.kind == arch_kind_zhaoxin)
    {
      data = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE);
      core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
      shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE);
      shared_per_thread = shared;

      level1_icache_size = handle_zhaoxin (_SC_LEVEL1_ICACHE_SIZE);
      level1_icache_linesize = handle_zhaoxin (_SC_LEVEL1_ICACHE_LINESIZE);
      level1_dcache_size = data;
      level1_dcache_assoc = handle_zhaoxin (_SC_LEVEL1_DCACHE_ASSOC);
      level1_dcache_linesize = handle_zhaoxin (_SC_LEVEL1_DCACHE_LINESIZE);
      level2_cache_size = core;
      level2_cache_assoc = handle_zhaoxin (_SC_LEVEL2_CACHE_ASSOC);
      level2_cache_linesize = handle_zhaoxin (_SC_LEVEL2_CACHE_LINESIZE);
      level3_cache_size = shared;
      level3_cache_assoc = handle_zhaoxin (_SC_LEVEL3_CACHE_ASSOC);
      level3_cache_linesize = handle_zhaoxin (_SC_LEVEL3_CACHE_LINESIZE);

      get_common_cache_info (&shared, &shared_per_thread, &threads, core);
    }
  else if (cpu_features->basic.kind == arch_kind_amd)
    {
      data = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
      core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
      shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);
      shared_per_thread = shared;

      level1_icache_size = handle_amd (_SC_LEVEL1_ICACHE_SIZE);
      level1_icache_linesize = handle_amd (_SC_LEVEL1_ICACHE_LINESIZE);
      level1_dcache_size = data;
      level1_dcache_assoc = handle_amd (_SC_LEVEL1_DCACHE_ASSOC);
      level1_dcache_linesize = handle_amd (_SC_LEVEL1_DCACHE_LINESIZE);
      level2_cache_size = core;
      level2_cache_assoc = handle_amd (_SC_LEVEL2_CACHE_ASSOC);
      level2_cache_linesize = handle_amd (_SC_LEVEL2_CACHE_LINESIZE);
      level3_cache_size = shared;
      level3_cache_assoc = handle_amd (_SC_LEVEL3_CACHE_ASSOC);
      level3_cache_linesize = handle_amd (_SC_LEVEL3_CACHE_LINESIZE);

      if (shared <= 0)
        /* No shared L3 cache.  All we have is the L2 cache.  */
        shared = core;

      if (shared_per_thread <= 0)
        shared_per_thread = shared;
    }

  cpu_features->level1_icache_size = level1_icache_size;
  cpu_features->level1_icache_linesize = level1_icache_linesize;
  cpu_features->level1_dcache_size = level1_dcache_size;
  cpu_features->level1_dcache_assoc = level1_dcache_assoc;
  cpu_features->level1_dcache_linesize = level1_dcache_linesize;
  cpu_features->level2_cache_size = level2_cache_size;
  cpu_features->level2_cache_assoc = level2_cache_assoc;
  cpu_features->level2_cache_linesize = level2_cache_linesize;
  cpu_features->level3_cache_size = level3_cache_size;
  cpu_features->level3_cache_assoc = level3_cache_assoc;
  cpu_features->level3_cache_linesize = level3_cache_linesize;
  cpu_features->level4_cache_size = level4_cache_size;

  unsigned long int cachesize_non_temporal_divisor
    = cpu_features->cachesize_non_temporal_divisor;
  if (cachesize_non_temporal_divisor <= 0)
    cachesize_non_temporal_divisor = 4;

  /* The default setting for the non_temporal threshold is [1/8, 1/2] of the
     size of the chip's cache (depending on `cachesize_non_temporal_divisor`,
     which is microarch specific; the default is 1/4).  For most Intel
     processors with an initial release date between 2017 and 2023, a
     thread's typical share of the cache is from 18-64 MB.  Using a
     reasonable size fraction of L3 is meant to estimate the point where
     non-temporal stores begin out-competing REP MOVSB, as well as the point
     where the fact that non-temporal stores are forced back to main memory
     would already have occurred for the majority of the lines in the copy.
     Note, concerns about the entire L3 cache being evicted by the copy are
     mostly alleviated by the fact that modern HW detects streaming patterns
     and provides proper LRU hints so that the maximum thrashing is capped
     at 1/associativity.  */
  unsigned long int non_temporal_threshold
    = shared / cachesize_non_temporal_divisor;
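  /* For example, on a hypothetical part with a 32 MiB shared L3 and the
     default divisor of 4, this starts the threshold out at 8 MiB before
     the adjustments below are applied.  */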

  /* If the computed non_temporal_threshold <= 3/4 * per-thread L3, we most
     likely have incorrect/incomplete cache info, in which case default to
     3/4 * per-thread L3 to avoid regressions.  */
  unsigned long int non_temporal_threshold_lowbound
    = shared_per_thread * 3 / 4;
  if (non_temporal_threshold < non_temporal_threshold_lowbound)
    non_temporal_threshold = non_temporal_threshold_lowbound;

  /* If no ERMS, we use the per-thread L3 chunking.  Normal cacheable stores
     run a higher risk of actually thrashing the cache as they don't have a
     HW LRU hint.  As well, their performance in highly parallel situations
     is noticeably worse.  */
  if (!CPU_FEATURE_USABLE_P (cpu_features, ERMS))
    non_temporal_threshold = non_temporal_threshold_lowbound;
  /* SIZE_MAX >> 4 because memmove-vec-unaligned-erms right-shifts the value
     of 'x86_non_temporal_threshold' by `LOG_4X_MEMCPY_THRESH` (4) and it is
     best if that operation cannot overflow.  Minimum of 0x4040 (16448)
     because the L(large_memset_4x) loops need 64 bytes to cache-align and
     enough space for at least 1 iteration of the 4x PAGE_SIZE unrolled loop.
     Both values are reflected in the manual.  */
  unsigned long int maximum_non_temporal_threshold = SIZE_MAX >> 4;
  unsigned long int minimum_non_temporal_threshold = 0x4040;

  /* If `non_temporal_threshold` is less than
     `minimum_non_temporal_threshold`, it most likely means we failed to
     detect the cache info.  We don't want to default to
     `minimum_non_temporal_threshold` as such a small value, while correct,
     has bad performance.  We default to 64 MB as a reasonable default
     bound.  64 MB is likely conservative in that most/all systems would
     choose a lower value, so it should never force non-temporal stores when
     they otherwise wouldn't be used.  */
  if (non_temporal_threshold < minimum_non_temporal_threshold)
    non_temporal_threshold = 64 * 1024 * 1024;
  else if (non_temporal_threshold > maximum_non_temporal_threshold)
    non_temporal_threshold = maximum_non_temporal_threshold;

  /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8.  */
  unsigned int minimum_rep_movsb_threshold;
  /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16) for
     VEC_SIZE == 64 or 32.  For VEC_SIZE == 16, the default REP MOVSB
     threshold is 2048 * (VEC_SIZE / 16).  */
  unsigned int rep_movsb_threshold;
  if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
      && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
    {
      rep_movsb_threshold = 4096 * (64 / 16);
      minimum_rep_movsb_threshold = 64 * 8;
    }
  else if (CPU_FEATURE_PREFERRED_P (cpu_features,
                                    AVX_Fast_Unaligned_Load))
    {
      rep_movsb_threshold = 4096 * (32 / 16);
      minimum_rep_movsb_threshold = 32 * 8;
    }
  else
    {
      rep_movsb_threshold = 2048 * (16 / 16);
      minimum_rep_movsb_threshold = 16 * 8;
    }
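  /* With the values above this works out to a default REP MOVSB threshold
     of 16 KiB when AVX-512 is used, 8 KiB for AVX and 2 KiB for SSE2, with
     corresponding lower bounds of 512, 256 and 128 bytes (VEC_SIZE * 8).  */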
  /* NB: The default REP MOVSB threshold is 2112 on processors with fast
     short REP MOVSB (FSRM).  */
  if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
    rep_movsb_threshold = 2112;

  /* The default threshold to use Enhanced REP STOSB.  */
  unsigned long int rep_stosb_threshold = 2048;

  long int tunable_size;

  tunable_size = TUNABLE_GET (x86_data_cache_size, long int, NULL);
  /* NB: Ignore the default value 0.  */
  if (tunable_size != 0)
    data = tunable_size;

  tunable_size = TUNABLE_GET (x86_shared_cache_size, long int, NULL);
  /* NB: Ignore the default value 0.  */
  if (tunable_size != 0)
    shared = tunable_size;

  tunable_size = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL);
  if (tunable_size > minimum_non_temporal_threshold
      && tunable_size <= maximum_non_temporal_threshold)
    non_temporal_threshold = tunable_size;

  tunable_size = TUNABLE_GET (x86_rep_movsb_threshold, long int, NULL);
  if (tunable_size > minimum_rep_movsb_threshold)
    rep_movsb_threshold = tunable_size;

  /* NB: The default value of the x86_rep_stosb_threshold tunable is the
     same as the default value of __x86_rep_stosb_threshold and the
     minimum value is fixed.  */
  rep_stosb_threshold = TUNABLE_GET (x86_rep_stosb_threshold,
                                     long int, NULL);

  TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX);
  TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX);
  TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
                           minimum_non_temporal_threshold,
                           maximum_non_temporal_threshold);
  TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
                           minimum_rep_movsb_threshold, SIZE_MAX);
  TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
                           SIZE_MAX);

  unsigned long int rep_movsb_stop_threshold;
  /* The ERMS feature is implemented from the AMD Zen 3 architecture onwards
     and performs poorly for data above the L2 cache size.  Hence, add an
     upper bound threshold to limit the use of Enhanced REP MOVSB operations
     and set its value to the L2 cache size.  */
  if (cpu_features->basic.kind == arch_kind_amd)
    rep_movsb_stop_threshold = core;
  /* Set the upper bound of ERMS to the computed value of the non-temporal
     threshold for architectures other than AMD.  */
  else
    rep_movsb_stop_threshold = non_temporal_threshold;

  cpu_features->data_cache_size = data;
  cpu_features->shared_cache_size = shared;
  cpu_features->non_temporal_threshold = non_temporal_threshold;
  cpu_features->rep_movsb_threshold = rep_movsb_threshold;
  cpu_features->rep_stosb_threshold = rep_stosb_threshold;
  cpu_features->rep_movsb_stop_threshold = rep_movsb_stop_threshold;
}