/* x86_64 cache info.
   Copyright (C) 2003-2020 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

#include <assert.h>
#include <stdbool.h>
#include <stdlib.h>
#include <unistd.h>
#include <cpuid.h>
#include <init-arch.h>

static const struct intel_02_cache_info
{
  unsigned char idx;
  unsigned char assoc;
  unsigned char linesize;
  unsigned char rel_name;
  unsigned int size;
} intel_02_known [] =
  {
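    /* Descriptor bytes from CPUID leaf 2 and the cache parameters
       they denote.  REL_NAME is stored relative to
       _SC_LEVEL1_ICACHE_SIZE so that it fits into an unsigned char.
       The entries must remain sorted by IDX because the table is
       searched with bsearch below.  */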
#define M(sc) ((sc) - _SC_LEVEL1_ICACHE_SIZE)
    { 0x06,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),     8192 },
    { 0x08,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),    16384 },
    { 0x09,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),    32768 },
    { 0x0a,  2, 32, M(_SC_LEVEL1_DCACHE_SIZE),     8192 },
    { 0x0c,  4, 32, M(_SC_LEVEL1_DCACHE_SIZE),    16384 },
    { 0x0d,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),    16384 },
    { 0x0e,  6, 64, M(_SC_LEVEL1_DCACHE_SIZE),    24576 },
    { 0x21,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),    262144 },
    { 0x22,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),    524288 },
    { 0x23,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),   1048576 },
    { 0x25,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),   2097152 },
    { 0x29,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),   4194304 },
    { 0x2c,  8, 64, M(_SC_LEVEL1_DCACHE_SIZE),    32768 },
    { 0x30,  8, 64, M(_SC_LEVEL1_ICACHE_SIZE),    32768 },
    { 0x39,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),    131072 },
    { 0x3a,  6, 64, M(_SC_LEVEL2_CACHE_SIZE),    196608 },
    { 0x3b,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),    131072 },
    { 0x3c,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),    262144 },
    { 0x3d,  6, 64, M(_SC_LEVEL2_CACHE_SIZE),    393216 },
    { 0x3e,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),    524288 },
    { 0x3f,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),    262144 },
    { 0x41,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),    131072 },
    { 0x42,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),    262144 },
    { 0x43,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),    524288 },
    { 0x44,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),   1048576 },
    { 0x45,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),   2097152 },
    { 0x46,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),   4194304 },
    { 0x47,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),   8388608 },
    { 0x48, 12, 64, M(_SC_LEVEL2_CACHE_SIZE),   3145728 },
    { 0x49, 16, 64, M(_SC_LEVEL2_CACHE_SIZE),   4194304 },
    { 0x4a, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),   6291456 },
    { 0x4b, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),   8388608 },
    { 0x4c, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  12582912 },
    { 0x4d, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  16777216 },
    { 0x4e, 24, 64, M(_SC_LEVEL2_CACHE_SIZE),   6291456 },
    { 0x60,  8, 64, M(_SC_LEVEL1_DCACHE_SIZE),    16384 },
    { 0x66,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),     8192 },
    { 0x67,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),    16384 },
    { 0x68,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),    32768 },
    { 0x78,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   1048576 },
    { 0x79,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),    131072 },
    { 0x7a,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),    262144 },
    { 0x7b,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),    524288 },
    { 0x7c,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   1048576 },
    { 0x7d,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   2097152 },
    { 0x7f,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),    524288 },
    { 0x80,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),    524288 },
    { 0x82,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),    262144 },
    { 0x83,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),    524288 },
    { 0x84,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),   1048576 },
    { 0x85,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),   2097152 },
    { 0x86,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),    524288 },
    { 0x87,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   1048576 },
    { 0xd0,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),    524288 },
    { 0xd1,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),   1048576 },
    { 0xd2,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),   2097152 },
    { 0xd6,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),   1048576 },
    { 0xd7,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),   2097152 },
    { 0xd8,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),   4194304 },
    { 0xdc, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),   2097152 },
    { 0xdd, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),   4194304 },
    { 0xde, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),   8388608 },
    { 0xe2, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),   2097152 },
    { 0xe3, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),   4194304 },
    { 0xe4, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),   8388608 },
    { 0xea, 24, 64, M(_SC_LEVEL3_CACHE_SIZE),  12582912 },
    { 0xeb, 24, 64, M(_SC_LEVEL3_CACHE_SIZE),  18874368 },
    { 0xec, 24, 64, M(_SC_LEVEL3_CACHE_SIZE),  25165824 },
  };

#define nintel_02_known (sizeof (intel_02_known) / sizeof (intel_02_known [0]))

static int
intel_02_known_compare (const void *p1, const void *p2)
{
  const struct intel_02_cache_info *i1;
  const struct intel_02_cache_info *i2;

  i1 = (const struct intel_02_cache_info *) p1;
  i2 = (const struct intel_02_cache_info *) p2;

  if (i1->idx == i2->idx)
    return 0;

  return i1->idx < i2->idx ? -1 : 1;
}


static long int
__attribute__ ((noinline))
intel_check_word (int name, unsigned int value, bool *has_level_2,
                  bool *no_level_2_or_3,
                  const struct cpu_features *cpu_features)
{
  if ((value & 0x80000000) != 0)
    /* The register value is reserved.  */
    return 0;

  /* Fold the name.  The _SC_ constants are always in the order SIZE,
     ASSOC, LINESIZE.  */
  int folded_rel_name = (M(name) / 3) * 3;
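  /* For example, if NAME is _SC_LEVEL1_DCACHE_LINESIZE, the folded
     name is the index of _SC_LEVEL1_DCACHE_SIZE, and the offset
     computed from it below selects the line size.  */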

  while (value != 0)
    {
      unsigned int byte = value & 0xff;

      if (byte == 0x40)
        {
          *no_level_2_or_3 = true;

          if (folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
            /* No need to look further.  */
            break;
        }
      else if (byte == 0xff)
        {
          /* CPUID leaf 0x4 contains all the information.  We need to
             iterate over it.  */
          unsigned int eax;
          unsigned int ebx;
          unsigned int ecx;
          unsigned int edx;

          unsigned int round = 0;
          while (1)
            {
              __cpuid_count (4, round, eax, ebx, ecx, edx);

              enum { null = 0, data = 1, inst = 2, uni = 3 } type = eax & 0x1f;
              if (type == null)
                /* That was the end.  */
                break;

              unsigned int level = (eax >> 5) & 0x7;

              if ((level == 1 && type == data
                   && folded_rel_name == M(_SC_LEVEL1_DCACHE_SIZE))
                  || (level == 1 && type == inst
                      && folded_rel_name == M(_SC_LEVEL1_ICACHE_SIZE))
                  || (level == 2 && folded_rel_name == M(_SC_LEVEL2_CACHE_SIZE))
                  || (level == 3 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
                  || (level == 4 && folded_rel_name == M(_SC_LEVEL4_CACHE_SIZE)))
                {
                  unsigned int offset = M(name) - folded_rel_name;

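                  /* CPUID leaf 4 encodes each field minus one: EBX
                     bits 31:22 are the ways of associativity, bits
                     21:12 the physical line partitions, bits 11:0
                     the line size, and ECX is the number of sets.  */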
                  if (offset == 0)
                    /* Cache size.  */
                    return (((ebx >> 22) + 1)
                            * (((ebx >> 12) & 0x3ff) + 1)
                            * ((ebx & 0xfff) + 1)
                            * (ecx + 1));
                  if (offset == 1)
                    return (ebx >> 22) + 1;

                  assert (offset == 2);
                  return (ebx & 0xfff) + 1;
                }

              ++round;
            }
          /* There is no other cache information anywhere else.  */
          break;
        }
      else
        {
          if (byte == 0x49 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
            {
              /* Intel reused this value.  For family 15, model 6 it
                 specifies the 3rd level cache.  Otherwise the 2nd
                 level cache.  */
              unsigned int family = cpu_features->basic.family;
              unsigned int model = cpu_features->basic.model;

              if (family == 15 && model == 6)
                {
                  /* The level 3 cache is encoded for this model like
                     the level 2 cache is for other models.  Pretend
                     the caller asked for the level 2 cache.  */
                  name = (_SC_LEVEL2_CACHE_SIZE
                          + (name - _SC_LEVEL3_CACHE_SIZE));
                  folded_rel_name = M(_SC_LEVEL2_CACHE_SIZE);
                }
            }

          struct intel_02_cache_info *found;
          struct intel_02_cache_info search;

          search.idx = byte;
          found = bsearch (&search, intel_02_known, nintel_02_known,
                           sizeof (intel_02_known[0]), intel_02_known_compare);
          if (found != NULL)
            {
              if (found->rel_name == folded_rel_name)
                {
                  unsigned int offset = M(name) - folded_rel_name;

                  if (offset == 0)
                    /* Cache size.  */
                    return found->size;
                  if (offset == 1)
                    return found->assoc;

                  assert (offset == 2);
                  return found->linesize;
                }

              if (found->rel_name == M(_SC_LEVEL2_CACHE_SIZE))
                *has_level_2 = true;
            }
        }

      /* Next byte for the next round.  */
      value >>= 8;
    }

  /* Nothing found.  */
  return 0;
}


static long int __attribute__ ((noinline))
handle_intel (int name, const struct cpu_features *cpu_features)
{
  unsigned int maxidx = cpu_features->basic.max_cpuid;

  /* Return -1 for older CPUs.  */
  if (maxidx < 2)
    return -1;

  /* OK, we can use the CPUID instruction to get all info about the
     caches.  */
  unsigned int cnt = 0;
  unsigned int max = 1;
  long int result = 0;
  bool no_level_2_or_3 = false;
  bool has_level_2 = false;

  while (cnt++ < max)
    {
      unsigned int eax;
      unsigned int ebx;
      unsigned int ecx;
      unsigned int edx;
      __cpuid (2, eax, ebx, ecx, edx);

      /* The low byte of EAX in the first round contains the number of
         rounds we have to make.  At least one, the one we are already
         doing.  */
      if (cnt == 1)
        {
          max = eax & 0xff;
          eax &= 0xffffff00;
        }

      /* Process the individual registers' values.  */
      result = intel_check_word (name, eax, &has_level_2,
                                 &no_level_2_or_3, cpu_features);
      if (result != 0)
        return result;

      result = intel_check_word (name, ebx, &has_level_2,
                                 &no_level_2_or_3, cpu_features);
      if (result != 0)
        return result;

      result = intel_check_word (name, ecx, &has_level_2,
                                 &no_level_2_or_3, cpu_features);
      if (result != 0)
        return result;

      result = intel_check_word (name, edx, &has_level_2,
                                 &no_level_2_or_3, cpu_features);
      if (result != 0)
        return result;
    }

  if (name >= _SC_LEVEL2_CACHE_SIZE && name <= _SC_LEVEL3_CACHE_LINESIZE
      && no_level_2_or_3)
    return -1;

  return 0;
}


static long int __attribute__ ((noinline))
handle_amd (int name)
{
  unsigned int eax;
  unsigned int ebx;
  unsigned int ecx;
  unsigned int edx;
  __cpuid (0x80000000, eax, ebx, ecx, edx);

  /* No level 4 cache (yet).  */
  if (name > _SC_LEVEL3_CACHE_LINESIZE)
    return 0;

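  /* Leaf 0x80000005 reports the L1 caches, leaf 0x80000006 the L2
     and L3 caches; pick the leaf from the requested NAME.  */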
  unsigned int fn = 0x80000005 + (name >= _SC_LEVEL2_CACHE_SIZE);
  if (eax < fn)
    return 0;

  __cpuid (fn, eax, ebx, ecx, edx);

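  /* Leaf 0x80000005 reports the L1 instruction cache in EDX with the
     same layout as the L1 data cache in ECX, so map the ICACHE names
     onto the DCACHE cases below and decode from EDX instead.  */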
  if (name < _SC_LEVEL1_DCACHE_SIZE)
    {
      name += _SC_LEVEL1_DCACHE_SIZE - _SC_LEVEL1_ICACHE_SIZE;
      ecx = edx;
    }

  switch (name)
    {
    case _SC_LEVEL1_DCACHE_SIZE:
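      /* ECX bits 31:24 hold the L1D size in KiB;
         (ecx >> 14) & 0x3fc00 equals ((ecx >> 24) & 0xff) << 10,
         i.e. the size converted to bytes.  */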
      return (ecx >> 14) & 0x3fc00;

    case _SC_LEVEL1_DCACHE_ASSOC:
      ecx >>= 16;
      if ((ecx & 0xff) == 0xff)
        /* Fully associative.  */
        return (ecx << 2) & 0x3fc00;
      return ecx & 0xff;

    case _SC_LEVEL1_DCACHE_LINESIZE:
      return ecx & 0xff;

    case _SC_LEVEL2_CACHE_SIZE:
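      /* Leaf 0x80000006: ECX bits 31:16 hold the L2 size in KiB and
         bits 15:12 the associativity code; a code of zero means the
         L2 cache is disabled.  */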
      return (ecx & 0xf000) == 0 ? 0 : (ecx >> 6) & 0x3fffc00;

    case _SC_LEVEL2_CACHE_ASSOC:
      switch ((ecx >> 12) & 0xf)
        {
        case 0:
        case 1:
        case 2:
        case 4:
          return (ecx >> 12) & 0xf;
        case 6:
          return 8;
        case 8:
          return 16;
        case 10:
          return 32;
        case 11:
          return 48;
        case 12:
          return 64;
        case 13:
          return 96;
        case 14:
          return 128;
        case 15:
          return ((ecx >> 6) & 0x3fffc00) / (ecx & 0xff);
        default:
          return 0;
        }
      /* NOTREACHED */

    case _SC_LEVEL2_CACHE_LINESIZE:
      return (ecx & 0xf000) == 0 ? 0 : ecx & 0xff;

    case _SC_LEVEL3_CACHE_SIZE:
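      /* EDX bits 31:18 hold the L3 size in 512 KiB units;
         (edx & 0x3ffc0000) << 1 scales the unit count to bytes (bit
         18, one unit, becomes bit 19, i.e. 512 KiB).  */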
      return (edx & 0xf000) == 0 ? 0 : (edx & 0x3ffc0000) << 1;

    case _SC_LEVEL3_CACHE_ASSOC:
      switch ((edx >> 12) & 0xf)
        {
        case 0:
        case 1:
        case 2:
        case 4:
          return (edx >> 12) & 0xf;
        case 6:
          return 8;
        case 8:
          return 16;
        case 10:
          return 32;
        case 11:
          return 48;
        case 12:
          return 64;
        case 13:
          return 96;
        case 14:
          return 128;
        case 15:
          return ((edx & 0x3ffc0000) << 1) / (edx & 0xff);
        default:
          return 0;
        }
      /* NOTREACHED */

    case _SC_LEVEL3_CACHE_LINESIZE:
      return (edx & 0xf000) == 0 ? 0 : edx & 0xff;

    default:
      assert (! "cannot happen");
    }
  return -1;
}


static long int __attribute__ ((noinline))
handle_zhaoxin (int name)
{
  unsigned int eax;
  unsigned int ebx;
  unsigned int ecx;
  unsigned int edx;

  int folded_rel_name = (M(name) / 3) * 3;

  unsigned int round = 0;
  while (1)
    {
      __cpuid_count (4, round, eax, ebx, ecx, edx);

      enum { null = 0, data = 1, inst = 2, uni = 3 } type = eax & 0x1f;
      if (type == null)
        break;

      unsigned int level = (eax >> 5) & 0x7;

      if ((level == 1 && type == data
           && folded_rel_name == M(_SC_LEVEL1_DCACHE_SIZE))
          || (level == 1 && type == inst
              && folded_rel_name == M(_SC_LEVEL1_ICACHE_SIZE))
          || (level == 2 && folded_rel_name == M(_SC_LEVEL2_CACHE_SIZE))
          || (level == 3 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE)))
        {
          unsigned int offset = M(name) - folded_rel_name;

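          /* Same CPUID leaf 4 decoding as in intel_check_word above;
             every field is stored minus one.  */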
          if (offset == 0)
            /* Cache size.  */
            return (((ebx >> 22) + 1)
                    * (((ebx >> 12) & 0x3ff) + 1)
                    * ((ebx & 0xfff) + 1)
                    * (ecx + 1));
          if (offset == 1)
            return (ebx >> 22) + 1;

          assert (offset == 2);
          return (ebx & 0xfff) + 1;
        }

      ++round;
    }

  /* Nothing found.  */
  return 0;
}


/* Get the value of the system variable NAME.  */
long int
attribute_hidden
__cache_sysconf (int name)
{
  const struct cpu_features *cpu_features = __get_cpu_features ();

  if (cpu_features->basic.kind == arch_kind_intel)
    return handle_intel (name, cpu_features);

  if (cpu_features->basic.kind == arch_kind_amd)
    return handle_amd (name);

  if (cpu_features->basic.kind == arch_kind_zhaoxin)
    return handle_zhaoxin (name);

  // XXX Fill in more vendors.

  /* CPU not known, we have no information.  */
  return 0;
}


/* Data cache size for use in memory and string routines, typically
   L1 size, rounded to multiple of 256 bytes.  */
long int __x86_data_cache_size_half attribute_hidden = 32 * 1024 / 2;
long int __x86_data_cache_size attribute_hidden = 32 * 1024;
/* Similar to __x86_data_cache_size_half, but not rounded.  */
long int __x86_raw_data_cache_size_half attribute_hidden = 32 * 1024 / 2;
/* Similar to __x86_data_cache_size, but not rounded.  */
long int __x86_raw_data_cache_size attribute_hidden = 32 * 1024;
/* Shared cache size for use in memory and string routines, typically
   L2 or L3 size, rounded to multiple of 256 bytes.  */
long int __x86_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2;
long int __x86_shared_cache_size attribute_hidden = 1024 * 1024;
/* Similar to __x86_shared_cache_size_half, but not rounded.  */
long int __x86_raw_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2;
/* Similar to __x86_shared_cache_size, but not rounded.  */
long int __x86_raw_shared_cache_size attribute_hidden = 1024 * 1024;

/* Threshold to use non temporal store.  */
long int __x86_shared_non_temporal_threshold attribute_hidden;

/* Threshold to use Enhanced REP MOVSB.  */
long int __x86_rep_movsb_threshold attribute_hidden = 2048;

/* Threshold to use Enhanced REP STOSB.  */
long int __x86_rep_stosb_threshold attribute_hidden = 2048;


static void
get_common_cache_info (long int *shared_ptr, unsigned int *threads_ptr,
                       long int core)
{
  unsigned int eax;
  unsigned int ebx;
  unsigned int ecx;
  unsigned int edx;

  /* Number of logical processors sharing L2 cache.  */
  int threads_l2;

  /* Number of logical processors sharing L3 cache.  */
  int threads_l3;

  const struct cpu_features *cpu_features = __get_cpu_features ();
  int max_cpuid = cpu_features->basic.max_cpuid;
  unsigned int family = cpu_features->basic.family;
  unsigned int model = cpu_features->basic.model;
  long int shared = *shared_ptr;
  unsigned int threads = *threads_ptr;
  bool inclusive_cache = true;
  bool support_count_mask = true;

  /* Try L3 first.  */
  unsigned int level = 3;

  if (cpu_features->basic.kind == arch_kind_zhaoxin && family == 6)
    support_count_mask = false;

  if (shared <= 0)
    {
      /* Try L2 otherwise.  */
      level = 2;
      shared = core;
      threads_l2 = 0;
      threads_l3 = -1;
    }
  else
    {
      threads_l2 = 0;
      threads_l3 = 0;
    }

  /* A value of 0 for the HTT bit indicates there is only a single
     logical processor.  */
  if (CPU_FEATURE_USABLE (HTT))
    {
      /* Figure out the number of logical threads that share the
         highest cache level.  */
      if (max_cpuid >= 4)
        {
          int i = 0;

          /* Query until cache level 2 and 3 are enumerated.  */
          int check = 0x1 | (threads_l3 == 0) << 1;
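          /* Bit 0 of CHECK tracks the pending L2 query and bit 1 the
             pending L3 query; the latter is skipped when THREADS_L3
             starts out as -1, i.e. when no L3 cache was found.  */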
          do
            {
              __cpuid_count (4, i++, eax, ebx, ecx, edx);

              /* There seems to be a bug in at least some Pentium Ds
                 which sometimes fail to iterate all cache parameters.
                 Do not loop indefinitely here, stop in this case and
                 assume there is no such information.  */
              if (cpu_features->basic.kind == arch_kind_intel
                  && (eax & 0x1f) == 0)
                goto intel_bug_no_cache_info;

              switch ((eax >> 5) & 0x7)
                {
                default:
                  break;
                case 2:
                  if ((check & 0x1))
                    {
                      /* Get maximum number of logical processors
                         sharing L2 cache.  */
                      threads_l2 = (eax >> 14) & 0x3ff;
                      check &= ~0x1;
                    }
                  break;
                case 3:
                  if ((check & (0x1 << 1)))
                    {
                      /* Get maximum number of logical processors
                         sharing L3 cache.  */
                      threads_l3 = (eax >> 14) & 0x3ff;

                      /* Check if L2 and L3 caches are inclusive.  */
                      inclusive_cache = (edx & 0x2) != 0;
                      check &= ~(0x1 << 1);
                    }
                  break;
                }
            }
          while (check);

          /* If max_cpuid >= 11, THREADS_L2/THREADS_L3 are the maximum
             numbers of addressable IDs for logical processors sharing
             the cache, instead of the maximum number of threads
             sharing the cache.  */
          if (max_cpuid >= 11 && support_count_mask)
            {
              /* Find the number of logical processors shipped in
                 one core and apply count mask.  */
              i = 0;

              /* Count SMT only if there is L3 cache.  Always count
                 core if there is no L3 cache.  */
              int count = ((threads_l2 > 0 && level == 3)
                           | ((threads_l3 > 0
                               || (threads_l2 > 0 && level == 2)) << 1));

              while (count)
                {
                  __cpuid_count (11, i++, eax, ebx, ecx, edx);

                  int shipped = ebx & 0xff;
                  int type = ecx & 0xff00;
                  if (shipped == 0 || type == 0)
                    break;
                  else if (type == 0x100)
                    {
                      /* Count SMT.  */
                      if ((count & 0x1))
                        {
                          int count_mask;

                          /* Compute count mask.  */
                          asm ("bsr %1, %0"
                               : "=r" (count_mask) : "g" (threads_l2));
                          count_mask = ~(-1 << (count_mask + 1));
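                          /* BSR yields the highest set bit, so
                             COUNT_MASK keeps just enough low bits to
                             cover THREADS_L2; masking SHIPPED - 1
                             with it extracts the SMT count within
                             one core.  */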
                          threads_l2 = (shipped - 1) & count_mask;
                          count &= ~0x1;
                        }
                    }
                  else if (type == 0x200)
                    {
                      /* Count core.  */
                      if ((count & (0x1 << 1)))
                        {
                          int count_mask;
                          int threads_core
                            = (level == 2 ? threads_l2 : threads_l3);

                          /* Compute count mask.  */
                          asm ("bsr %1, %0"
                               : "=r" (count_mask) : "g" (threads_core));
                          count_mask = ~(-1 << (count_mask + 1));
                          threads_core = (shipped - 1) & count_mask;
                          if (level == 2)
                            threads_l2 = threads_core;
                          else
                            threads_l3 = threads_core;
                          count &= ~(0x1 << 1);
                        }
                    }
                }
            }
          if (threads_l2 > 0)
            threads_l2 += 1;
          if (threads_l3 > 0)
            threads_l3 += 1;
          if (level == 2)
            {
              if (threads_l2)
                {
                  threads = threads_l2;
                  if (cpu_features->basic.kind == arch_kind_intel
                      && threads > 2
                      && family == 6)
                    switch (model)
                      {
                      case 0x37:
                      case 0x4a:
                      case 0x4d:
                      case 0x5a:
                      case 0x5d:
                        /* Silvermont has L2 cache shared by 2 cores.  */
                        threads = 2;
                        break;
                      default:
                        break;
                      }
                }
            }
          else if (threads_l3)
            threads = threads_l3;
        }
      else
        {
intel_bug_no_cache_info:
          /* Assume that all logical threads share the highest cache
             level.  */
          threads
            = ((cpu_features->features[COMMON_CPUID_INDEX_1].cpuid.ebx
                >> 16) & 0xff);
        }

      /* Cap usage of highest cache level to the number of supported
         threads.  */
      if (shared > 0 && threads > 0)
        shared /= threads;
    }

  /* Account for non-inclusive L2 and L3 caches.  */
  if (!inclusive_cache)
    {
      if (threads_l2 > 0)
        core /= threads_l2;
      shared += core;
    }

  *shared_ptr = shared;
  *threads_ptr = threads;
}


static void
__attribute__ ((constructor))
init_cacheinfo (void)
{
  /* Find out what brand of processor we have.  */
  unsigned int ebx;
  unsigned int ecx;
  unsigned int edx;
  int max_cpuid_ex;
  long int data = -1;
  long int shared = -1;
  long int core;
  unsigned int threads = 0;
  const struct cpu_features *cpu_features = __get_cpu_features ();

  if (cpu_features->basic.kind == arch_kind_intel)
    {
      data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, cpu_features);
      core = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
      shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features);

      get_common_cache_info (&shared, &threads, core);
    }
  else if (cpu_features->basic.kind == arch_kind_zhaoxin)
    {
      data = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE);
      core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
      shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE);

      get_common_cache_info (&shared, &threads, core);
    }
  else if (cpu_features->basic.kind == arch_kind_amd)
    {
      data = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
      core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
      shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);

      /* Get maximum extended function.  */
      __cpuid (0x80000000, max_cpuid_ex, ebx, ecx, edx);

      if (shared <= 0)
        /* No shared L3 cache.  All we have is the L2 cache.  */
        shared = core;
      else
        {
          /* Figure out the number of logical threads that share L3.  */
          if (max_cpuid_ex >= 0x80000008)
            {
              /* Get width of APIC ID.  */
              __cpuid (0x80000008, max_cpuid_ex, ebx, ecx, edx);
              threads = 1 << ((ecx >> 12) & 0x0f);
            }

          if (threads == 0)
            {
              /* If APIC ID width is not available, use logical
                 processor count.  */
              __cpuid (0x00000001, max_cpuid_ex, ebx, ecx, edx);

              if ((edx & (1 << 28)) != 0)
                threads = (ebx >> 16) & 0xff;
            }

          /* Cap usage of highest cache level to the number of
             supported threads.  */
          if (threads > 0)
            shared /= threads;

          /* Account for exclusive L2 and L3 caches.  */
          shared += core;
        }
    }

  if (cpu_features->data_cache_size != 0)
    data = cpu_features->data_cache_size;

  if (data > 0)
    {
      __x86_raw_data_cache_size_half = data / 2;
      __x86_raw_data_cache_size = data;
      /* Round data cache size to multiple of 256 bytes.  */
      data = data & ~255L;
      __x86_data_cache_size_half = data / 2;
      __x86_data_cache_size = data;
    }

  if (cpu_features->shared_cache_size != 0)
    shared = cpu_features->shared_cache_size;

  if (shared > 0)
    {
      __x86_raw_shared_cache_size_half = shared / 2;
      __x86_raw_shared_cache_size = shared;
      /* Round shared cache size to multiple of 256 bytes.  */
      shared = shared & ~255L;
      __x86_shared_cache_size_half = shared / 2;
      __x86_shared_cache_size = shared;
    }

  /* The large memcpy micro benchmark in glibc shows that 6 times the
     per-thread shared cache size is the approximate value above which
     non-temporal stores become faster on an 8-core processor.  This
     corresponds to 3/4 of the total shared cache size.  */
  __x86_shared_non_temporal_threshold
    = (cpu_features->non_temporal_threshold != 0
       ? cpu_features->non_temporal_threshold
       : __x86_shared_cache_size * threads * 3 / 4);
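  /* For example, with a per-thread shared cache size of 1 MiB and 8
     threads the default threshold is 1 MiB * 8 * 3 / 4 = 6 MiB.  */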

  /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8.  */
  unsigned int minimum_rep_movsb_threshold;
  /* NB: The default REP MOVSB threshold is 2048 * (VEC_SIZE / 16).  */
  unsigned int rep_movsb_threshold;
  if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
      && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
    {
      rep_movsb_threshold = 2048 * (64 / 16);
      minimum_rep_movsb_threshold = 64 * 8;
    }
  else if (CPU_FEATURE_PREFERRED_P (cpu_features,
                                    AVX_Fast_Unaligned_Load))
    {
      rep_movsb_threshold = 2048 * (32 / 16);
      minimum_rep_movsb_threshold = 32 * 8;
    }
  else
    {
      rep_movsb_threshold = 2048 * (16 / 16);
      minimum_rep_movsb_threshold = 16 * 8;
    }
  if (cpu_features->rep_movsb_threshold > minimum_rep_movsb_threshold)
    __x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold;
  else
    __x86_rep_movsb_threshold = rep_movsb_threshold;

# if HAVE_TUNABLES
  __x86_rep_stosb_threshold = cpu_features->rep_stosb_threshold;
# endif
}

#endif