dl-cacheinfo.h source code [glibc/sysdeps/x86/dl-cacheinfo.h]

/* Initialize x86 cache info.
   Copyright (C) 2020-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
18
/* Table of the one-byte cache descriptors reported by CPUID leaf 2 that
   this file knows how to decode.

   Fields:
     idx       the descriptor byte;
     assoc     value returned for an *_ASSOC query;
     linesize  value returned for a *_LINESIZE query;
     rel_name  which cache the descriptor describes, stored as the
               matching _SC_*_SIZE constant relative to
               _SC_LEVEL1_ICACHE_SIZE (see M below);
     size      value returned for a *_SIZE query.

   The entries MUST stay sorted by strictly increasing idx: the table is
   searched with bsearch using intel_02_known_compare.  */
static const struct intel_02_cache_info
{
  unsigned char idx;
  unsigned char assoc;
  unsigned char linesize;
  unsigned char rel_name;
  unsigned int size;
} intel_02_known [] =
  {
/* Express an _SC_ cache constant relative to _SC_LEVEL1_ICACHE_SIZE so
   that it fits in an unsigned char.  */
#define M(sc) ((sc) - _SC_LEVEL1_ICACHE_SIZE)
    { 0x06,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),     8192 },
    { 0x08,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),    16384 },
    { 0x09,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),    32768 },
    { 0x0a,  2, 32, M(_SC_LEVEL1_DCACHE_SIZE),     8192 },
    { 0x0c,  4, 32, M(_SC_LEVEL1_DCACHE_SIZE),    16384 },
    { 0x0d,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),    16384 },
    { 0x0e,  6, 64, M(_SC_LEVEL1_DCACHE_SIZE),    24576 },
    { 0x21,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),    262144 },
    { 0x22,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),    524288 },
    { 0x23,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),   1048576 },
    { 0x25,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),   2097152 },
    { 0x29,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),   4194304 },
    { 0x2c,  8, 64, M(_SC_LEVEL1_DCACHE_SIZE),    32768 },
    { 0x30,  8, 64, M(_SC_LEVEL1_ICACHE_SIZE),    32768 },
    { 0x39,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),    131072 },
    { 0x3a,  6, 64, M(_SC_LEVEL2_CACHE_SIZE),    196608 },
    { 0x3b,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),    131072 },
    { 0x3c,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),    262144 },
    { 0x3d,  6, 64, M(_SC_LEVEL2_CACHE_SIZE),    393216 },
    { 0x3e,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),    524288 },
    { 0x3f,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),    262144 },
    { 0x41,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),    131072 },
    { 0x42,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),    262144 },
    { 0x43,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),    524288 },
    { 0x44,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),   1048576 },
    { 0x45,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),   2097152 },
    { 0x46,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),   4194304 },
    { 0x47,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),   8388608 },
    { 0x48, 12, 64, M(_SC_LEVEL2_CACHE_SIZE),   3145728 },
    { 0x49, 16, 64, M(_SC_LEVEL2_CACHE_SIZE),   4194304 },
    { 0x4a, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),   6291456 },
    { 0x4b, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),   8388608 },
    { 0x4c, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  12582912 },
    { 0x4d, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  16777216 },
    { 0x4e, 24, 64, M(_SC_LEVEL2_CACHE_SIZE),   6291456 },
    { 0x60,  8, 64, M(_SC_LEVEL1_DCACHE_SIZE),    16384 },
    { 0x66,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),     8192 },
    { 0x67,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),    16384 },
    { 0x68,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),    32768 },
    { 0x78,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   1048576 },
    { 0x79,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),    131072 },
    { 0x7a,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),    262144 },
    { 0x7b,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),    524288 },
    { 0x7c,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   1048576 },
    { 0x7d,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   2097152 },
    { 0x7f,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),    524288 },
    { 0x80,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),    524288 },
    { 0x82,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),    262144 },
    { 0x83,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),    524288 },
    { 0x84,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),   1048576 },
    { 0x85,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),   2097152 },
    { 0x86,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),    524288 },
    { 0x87,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   1048576 },
    { 0xd0,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),    524288 },
    { 0xd1,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),   1048576 },
    { 0xd2,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),   2097152 },
    { 0xd6,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),   1048576 },
    { 0xd7,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),   2097152 },
    { 0xd8,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),   4194304 },
    { 0xdc, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),   2097152 },
    { 0xdd, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),   4194304 },
    { 0xde, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),   8388608 },
    { 0xe2, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),   2097152 },
    { 0xe3, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),   4194304 },
    { 0xe4, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),   8388608 },
    { 0xea, 24, 64, M(_SC_LEVEL3_CACHE_SIZE),  12582912 },
    { 0xeb, 24, 64, M(_SC_LEVEL3_CACHE_SIZE),  18874368 },
    { 0xec, 24, 64, M(_SC_LEVEL3_CACHE_SIZE),  25165824 },
  };

/* Number of entries in intel_02_known.  */
#define nintel_02_known (sizeof (intel_02_known) / sizeof (intel_02_known [0]))
100
101	static int
102	intel_02_known_compare (const void p1, const* void *p2)
103	{
104	const struct intel_02_cache_info *i1;
105	const struct intel_02_cache_info *i2;
106
107	i1 = (const struct intel_02_cache_info *) p1;
108	i2 = (const struct intel_02_cache_info *) p2;
109
110	if (i1->idx == i2->idx)
111	return `0`;
112
113	return i1->idx < i2->idx ? -`1` : `1`;
114	}
115
116
117	static long int
118	__attribute__ ((noinline))
119	intel_check_word (int name, unsigned int value, bool *has_level_2,
120	bool *no_level_2_or_3,
121	const struct cpu_features *cpu_features)
122	{
123	if ((value & `0x80000000`) != `0`)
124	/ The register value is reserved. /
125	return `0`;
126
127	/ Fold the name. The _SC_ constants are always in the order SIZE,*
128	ASSOC, LINESIZE. /*
129	int folded_rel_name = (M(name) / `3`) * `3`;
130
131	while (value != `0`)
132	{
133	unsigned int byte = value & `0xff`;
134
135	if (byte == `0x40`)
136	{
137	*no_level_2_or_3 = true;
138
139	if (folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
140	/ No need to look further. /
141	break;
142	}
143	else if (byte == `0xff`)
144	{
145	/ CPUID leaf 0x4 contains all the information. We need to*
146	iterate over it. /*
147	unsigned int eax;
148	unsigned int ebx;
149	unsigned int ecx;
150	unsigned int edx;
151
152	unsigned int round = `0`;
153	while (`1`)
154	{
155	__cpuid_count (`4`, round, eax, ebx, ecx, edx);
156
157	enum { null = `0`, data = `1`, inst = `2`, uni = `3` } type = eax & `0x1f`;
158	if (type == null)
159	/ That was the end. /
160	break;
161
162	unsigned int level = (eax >> `5`) & `0x7`;
163
164	if ((level == `1` && type == data
165	&& folded_rel_name == M(_SC_LEVEL1_DCACHE_SIZE))
166	\|\| (level == `1` && type == inst
167	&& folded_rel_name == M(_SC_LEVEL1_ICACHE_SIZE))
168	\|\| (level == `2` && folded_rel_name == M(_SC_LEVEL2_CACHE_SIZE))
169	\|\| (level == `3` && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
170	\|\| (level == `4` && folded_rel_name == M(_SC_LEVEL4_CACHE_SIZE)))
171	{
172	unsigned int offset = M(name) - folded_rel_name;
173
174	if (offset == `0`)
175	/ Cache size. /
176	return (((ebx >> `22`) + `1`)
177	* (((ebx >> `12`) & `0x3ff`) + `1`)
178	* ((ebx & `0xfff`) + `1`)
179	* (ecx + `1`));
180	if (offset == `1`)
181	return (ebx >> `22`) + `1`;
182
183	assert (offset == `2`);
184	return (ebx & `0xfff`) + `1`;
185	}
186
187	++round;
188	}
189	/ There is no other cache information anywhere else. /
190	break;
191	}
192	else
193	{
194	if (byte == `0x49` && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
195	{
196	/ Intel reused this value. For family 15, model 6 it*
197	specifies the 3rd level cache. Otherwise the 2nd
198	level cache. /*
199	unsigned int family = cpu_features->basic.family;
200	unsigned int model = cpu_features->basic.model;
201
202	if (family == `15` && model == `6`)
203	{
204	/ The level 3 cache is encoded for this model like*
205	the level 2 cache is for other models. Pretend
206	the caller asked for the level 2 cache. /*
207	name = (_SC_LEVEL2_CACHE_SIZE
208	+ (name - _SC_LEVEL3_CACHE_SIZE));
209	folded_rel_name = M(_SC_LEVEL2_CACHE_SIZE);
210	}
211	}
212
213	struct intel_02_cache_info *found;
214	struct intel_02_cache_info search;
215
216	search.idx = byte;
217	found = bsearch (&search, intel_02_known, nintel_02_known,
218	sizeof (intel_02_known[`0`]), intel_02_known_compare);
219	if (found != NULL)
220	{
221	if (found->rel_name == folded_rel_name)
222	{
223	unsigned int offset = M(name) - folded_rel_name;
224
225	if (offset == `0`)
226	/ Cache size. /
227	return found->size;
228	if (offset == `1`)
229	return found->assoc;
230
231	assert (offset == `2`);
232	return found->linesize;
233	}
234
235	if (found->rel_name == M(_SC_LEVEL2_CACHE_SIZE))
236	*has_level_2 = true;
237	}
238	}
239
240	/ Next byte for the next round. /
241	value >>= `8`;
242	}
243
244	/ Nothing found. /
245	return `0`;
246	}
247
248
249	static long int __attribute__ ((noinline))
250	handle_intel (int name, const struct cpu_features *cpu_features)
251	{
252	unsigned int maxidx = cpu_features->basic.max_cpuid;
253
254	/ Return -1 for older CPUs. /
255	if (maxidx < `2`)
256	return -`1`;
257
258	/ OK, we can use the CPUID instruction to get all info about the*
259	caches. /*
260	unsigned int cnt = `0`;
261	unsigned int max = `1`;
262	long int result = `0`;
263	bool no_level_2_or_3 = false;
264	bool has_level_2 = false;
265
266	while (cnt++ < max)
267	{
268	unsigned int eax;
269	unsigned int ebx;
270	unsigned int ecx;
271	unsigned int edx;
272	__cpuid (`2`, eax, ebx, ecx, edx);
273
274	/ The low byte of EAX in the first round contain the number of*
275	rounds we have to make. At least one, the one we are already
276	doing. /*
277	if (cnt == `1`)
278	{
279	max = eax & `0xff`;
280	eax &= `0xffffff00`;
281	}
282
283	/ Process the individual registers' value. /
284	result = intel_check_word (name, eax, &has_level_2,
285	&no_level_2_or_3, cpu_features);
286	if (result != `0`)
287	return result;
288
289	result = intel_check_word (name, ebx, &has_level_2,
290	&no_level_2_or_3, cpu_features);
291	if (result != `0`)
292	return result;
293
294	result = intel_check_word (name, ecx, &has_level_2,
295	&no_level_2_or_3, cpu_features);
296	if (result != `0`)
297	return result;
298
299	result = intel_check_word (name, edx, &has_level_2,
300	&no_level_2_or_3, cpu_features);
301	if (result != `0`)
302	return result;
303	}
304
305	if (name >= _SC_LEVEL2_CACHE_SIZE && name <= _SC_LEVEL3_CACHE_LINESIZE
306	&& no_level_2_or_3)
307	return -`1`;
308
309	return `0`;
310	}
311
312
/* Query the cache property NAME (an _SC_LEVEL*_CACHE_* constant) on an
   AMD CPU through CPUID leaf 0x8000001D.

   The sub-leaf is chosen from NAME: 3 for level 3, 2 for level 2, 0 for
   the level 1 data cache and 1 otherwise (level 1 instruction cache).
   Any NAME above _SC_LEVEL3_CACHE_LINESIZE yields 0 without executing
   CPUID.  A zero ECX from CPUID is treated as "no information" and
   yields 0 as well.

   NAME must be one of the _SC_LEVEL1..3 cache constants handled by the
   switch below; other smaller values reach __builtin_unreachable.  */
static long int __attribute__ ((noinline))
handle_amd (int name)
{
  unsigned int eax;
  unsigned int ebx;
  unsigned int ecx;
  unsigned int edx;
  unsigned int count = 0x1;

  /* No level 4 cache (yet).  */
  if (name > _SC_LEVEL3_CACHE_LINESIZE)
    return 0;

  if (name >= _SC_LEVEL3_CACHE_SIZE)
    count = 0x3;
  else if (name >= _SC_LEVEL2_CACHE_SIZE)
    count = 0x2;
  else if (name >= _SC_LEVEL1_DCACHE_SIZE)
    count = 0x0;

  __cpuid_count (0x8000001D, count, eax, ebx, ecx, edx);

  switch (name)
    {
    case _SC_LEVEL1_ICACHE_ASSOC:
    case _SC_LEVEL1_DCACHE_ASSOC:
    case _SC_LEVEL2_CACHE_ASSOC:
    case _SC_LEVEL3_CACHE_ASSOC:
      /* EBX[31:22] + 1.  */
      return ecx ? ((ebx >> 22) & 0x3ff) + 1 : 0;
    case _SC_LEVEL1_ICACHE_LINESIZE:
    case _SC_LEVEL1_DCACHE_LINESIZE:
    case _SC_LEVEL2_CACHE_LINESIZE:
    case _SC_LEVEL3_CACHE_LINESIZE:
      /* EBX[11:0] + 1.  */
      return ecx ? (ebx & 0xfff) + 1 : 0;
    case _SC_LEVEL1_ICACHE_SIZE:
    case _SC_LEVEL1_DCACHE_SIZE:
    case _SC_LEVEL2_CACHE_SIZE:
    case _SC_LEVEL3_CACHE_SIZE:
      /* (EBX[31:22] + 1) * (EBX[11:0] + 1) * (ECX + 1).  */
      return ecx ? (((ebx >> 22) & 0x3ff) + 1) * ((ebx & 0xfff) + 1) * (ecx + 1): 0;
    default:
      __builtin_unreachable ();
    }
  return -1;
}
357
358
/* Query the cache property NAME (an _SC_LEVEL*_CACHE_* constant) on a
   Zhaoxin CPU by enumerating the sub-leaves of CPUID leaf 4.

   Enumeration stops at the first sub-leaf whose cache type field
   (EAX[4:0]) is 0.  Only levels 1 to 3 are matched.  Returns the
   requested size, associativity or line size, or 0 if no sub-leaf
   matches.  */
static long int __attribute__ ((noinline))
handle_zhaoxin (int name)
{
  unsigned int eax;
  unsigned int ebx;
  unsigned int ecx;
  unsigned int edx;

  /* The _SC_ constants come in SIZE, ASSOC, LINESIZE triples; round
     down to the SIZE member of NAME's triple.  */
  int folded_rel_name = (M(name) / 3) * 3;

  unsigned int round = 0;
  while (1)
    {
      __cpuid_count (4, round, eax, ebx, ecx, edx);

      enum { null = 0, data = 1, inst = 2, uni = 3 } type = eax & 0x1f;
      if (type == null)
        break;

      unsigned int level = (eax >> 5) & 0x7;

      if ((level == 1 && type == data
           && folded_rel_name == M(_SC_LEVEL1_DCACHE_SIZE))
          || (level == 1 && type == inst
              && folded_rel_name == M(_SC_LEVEL1_ICACHE_SIZE))
          || (level == 2 && folded_rel_name == M(_SC_LEVEL2_CACHE_SIZE))
          || (level == 3 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE)))
        {
          /* 0 = SIZE, 1 = ASSOC, 2 = LINESIZE.  */
          unsigned int offset = M(name) - folded_rel_name;

          if (offset == 0)
            /* Cache size.  */
            return (((ebx >> 22) + 1)
                    * (((ebx >> 12) & 0x3ff) + 1)
                    * ((ebx & 0xfff) + 1)
                    * (ecx + 1));
          if (offset == 1)
            return (ebx >> 22) + 1;

          assert (offset == 2);
          return (ebx & 0xfff) + 1;
        }

      ++round;
    }

  /* Nothing found.  */
  return 0;
}
408
409	static void
410	get_common_cache_info (long int shared_ptr, long* int * shared_per_thread_ptr, unsigned int *threads_ptr,
411	long int core)
412	{
413	unsigned int eax;
414	unsigned int ebx;
415	unsigned int ecx;
416	unsigned int edx;
417
418	/ Number of logical processors sharing L2 cache. /
419	int threads_l2;
420
421	/ Number of logical processors sharing L3 cache. /
422	int threads_l3;
423
424	const struct cpu_features *cpu_features = __get_cpu_features ();
425	int max_cpuid = cpu_features->basic.max_cpuid;
426	unsigned int family = cpu_features->basic.family;
427	unsigned int model = cpu_features->basic.model;
428	long int shared = *shared_ptr;
429	long int shared_per_thread = *shared_per_thread_ptr;
430	unsigned int threads = *threads_ptr;
431	bool inclusive_cache = true;
432	bool support_count_mask = true;
433
434	/ Try L3 first. /
435	unsigned int level = `3`;
436
437	if (cpu_features->basic.kind == arch_kind_zhaoxin && family == `6`)
438	support_count_mask = false;
439
440	if (shared <= `0`)
441	{
442	/ Try L2 otherwise. /
443	level = `2`;
444	shared = core;
445	shared_per_thread = core;
446	threads_l2 = `0`;
447	threads_l3 = -`1`;
448	}
449	else
450	{
451	threads_l2 = `0`;
452	threads_l3 = `0`;
453	}
454
455	/ A value of 0 for the HTT bit indicates there is only a single*
456	logical processor. /*
457	if (HAS_CPU_FEATURE (HTT))
458	{
459	/ Figure out the number of logical threads that share the*
460	highest cache level. /*
461	if (max_cpuid >= `4`)
462	{
463	int i = `0`;
464
465	/ Query until cache level 2 and 3 are enumerated. /
466	int check = `0x1` \| (threads_l3 == `0`) << `1`;
467	do
468	{
469	__cpuid_count (`4`, i++, eax, ebx, ecx, edx);
470
471	/ There seems to be a bug in at least some Pentium Ds*
472	which sometimes fail to iterate all cache parameters.
473	Do not loop indefinitely here, stop in this case and
474	assume there is no such information. /*
475	if (cpu_features->basic.kind == arch_kind_intel
476	&& (eax & `0x1f`) == `0` )
477	goto intel_bug_no_cache_info;
478
479	switch ((eax >> `5`) & `0x7`)
480	{
481	default:
482	break;
483	case `2`:
484	if ((check & `0x1`))
485	{
486	/ Get maximum number of logical processors*
487	sharing L2 cache. /*
488	threads_l2 = (eax >> `14`) & `0x3ff`;
489	check &= ~`0x1`;
490	}
491	break;
492	case `3`:
493	if ((check & (`0x1` << `1`)))
494	{
495	/ Get maximum number of logical processors*
496	sharing L3 cache. /*
497	threads_l3 = (eax >> `14`) & `0x3ff`;
498
499	/ Check if L2 and L3 caches are inclusive. /
500	inclusive_cache = (edx & `0x2`) != `0`;
501	check &= ~(`0x1` << `1`);
502	}
503	break;
504	}
505	}
506	while (check);
507
508	/ If max_cpuid >= 11, THREADS_L2/THREADS_L3 are the maximum*
509	numbers of addressable IDs for logical processors sharing
510	the cache, instead of the maximum number of threads
511	sharing the cache. /*
512	if (max_cpuid >= `11` && support_count_mask)
513	{
514	/ Find the number of logical processors shipped in*
515	one core and apply count mask. /*
516	i = `0`;
517
518	/ Count SMT only if there is L3 cache. Always count*
519	core if there is no L3 cache. /*
520	int count = ((threads_l2 > `0` && level == `3`)
521	\| ((threads_l3 > `0`
522	\|\| (threads_l2 > `0` && level == `2`)) << `1`));
523
524	while (count)
525	{
526	__cpuid_count (`11`, i++, eax, ebx, ecx, edx);
527
528	int shipped = ebx & `0xff`;
529	int type = ecx & `0xff00`;
530	if (shipped == `0` \|\| type == `0`)
531	break;
532	else if (type == `0x100`)
533	{
534	/ Count SMT. /
535	if ((count & `0x1`))
536	{
537	int count_mask;
538
539	/ Compute count mask. /
540	asm ("bsr %1, %0"
541	: "=r" (count_mask) : "g" (threads_l2));
542	count_mask = ~(-`1` << (count_mask + `1`));
543	threads_l2 = (shipped - `1`) & count_mask;
544	count &= ~`0x1`;
545	}
546	}
547	else if (type == `0x200`)
548	{
549	/ Count core. /
550	if ((count & (`0x1` << `1`)))
551	{
552	int count_mask;
553	int threads_core
554	= (level == `2` ? threads_l2 : threads_l3);
555
556	/ Compute count mask. /
557	asm ("bsr %1, %0"
558	: "=r" (count_mask) : "g" (threads_core));
559	count_mask = ~(-`1` << (count_mask + `1`));
560	threads_core = (shipped - `1`) & count_mask;
561	if (level == `2`)
562	threads_l2 = threads_core;
563	else
564	threads_l3 = threads_core;
565	count &= ~(`0x1` << `1`);
566	}
567	}
568	}
569	}
570	if (threads_l2 > `0`)
571	threads_l2 += `1`;
572	if (threads_l3 > `0`)
573	threads_l3 += `1`;
574	if (level == `2`)
575	{
576	if (threads_l2)
577	{
578	threads = threads_l2;
579	if (cpu_features->basic.kind == arch_kind_intel
580	&& threads > `2`
581	&& family == `6`)
582	switch (model)
583	{
584	case `0x37`:
585	case `0x4a`:
586	case `0x4d`:
587	case `0x5a`:
588	case `0x5d`:
589	/ Silvermont has L2 cache shared by 2 cores. /
590	threads = `2`;
591	break;
592	default:
593	break;
594	}
595	}
596	}
597	else if (threads_l3)
598	threads = threads_l3;
599	}
600	else
601	{
602	intel_bug_no_cache_info:
603	/ Assume that all logical threads share the highest cache*
604	level. /*
605	threads = ((cpu_features->features[CPUID_INDEX_1].cpuid.ebx >> `16`)
606	& `0xff`);
607
608	/ Get per-thread size of highest level cache. /
609	if (shared_per_thread > `0` && threads > `0`)
610	shared_per_thread /= threads;
611	}
612	}
613
614	/ Account for non-inclusive L2 and L3 caches. /
615	if (!inclusive_cache)
616	{
617	long int core_per_thread = threads_l2 > `0` ? (core / threads_l2) : core;
618	shared_per_thread += core_per_thread;
619	shared += core;
620	}
621
622	*shared_ptr = shared;
623	*shared_per_thread_ptr = shared_per_thread;
624	*threads_ptr = threads;
625	}
626
627	static void
628	dl_init_cacheinfo (struct cpu_features *cpu_features)
629	{
630	/ Find out what brand of processor. /
631	long int data = -`1`;
632	long int shared = -`1`;
633	long int shared_per_thread = -`1`;
634	long int core = -`1`;
635	unsigned int threads = `0`;
636	unsigned long int level1_icache_size = -`1`;
637	unsigned long int level1_icache_linesize = -`1`;
638	unsigned long int level1_dcache_size = -`1`;
639	unsigned long int level1_dcache_assoc = -`1`;
640	unsigned long int level1_dcache_linesize = -`1`;
641	unsigned long int level2_cache_size = -`1`;
642	unsigned long int level2_cache_assoc = -`1`;
643	unsigned long int level2_cache_linesize = -`1`;
644	unsigned long int level3_cache_size = -`1`;
645	unsigned long int level3_cache_assoc = -`1`;
646	unsigned long int level3_cache_linesize = -`1`;
647	unsigned long int level4_cache_size = -`1`;
648
649	if (cpu_features->basic.kind == arch_kind_intel)
650	{
651	data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, cpu_features);
652	core = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
653	shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features);
654	shared_per_thread = shared;
655
656	level1_icache_size
657	= handle_intel (_SC_LEVEL1_ICACHE_SIZE, cpu_features);
658	level1_icache_linesize
659	= handle_intel (_SC_LEVEL1_ICACHE_LINESIZE, cpu_features);
660	level1_dcache_size = data;
661	level1_dcache_assoc
662	= handle_intel (_SC_LEVEL1_DCACHE_ASSOC, cpu_features);
663	level1_dcache_linesize
664	= handle_intel (_SC_LEVEL1_DCACHE_LINESIZE, cpu_features);
665	level2_cache_size = core;
666	level2_cache_assoc
667	= handle_intel (_SC_LEVEL2_CACHE_ASSOC, cpu_features);
668	level2_cache_linesize
669	= handle_intel (_SC_LEVEL2_CACHE_LINESIZE, cpu_features);
670	level3_cache_size = shared;
671	level3_cache_assoc
672	= handle_intel (_SC_LEVEL3_CACHE_ASSOC, cpu_features);
673	level3_cache_linesize
674	= handle_intel (_SC_LEVEL3_CACHE_LINESIZE, cpu_features);
675	level4_cache_size
676	= handle_intel (_SC_LEVEL4_CACHE_SIZE, cpu_features);
677
678	get_common_cache_info (&shared, &shared_per_thread, &threads, core);
679	}
680	else if (cpu_features->basic.kind == arch_kind_zhaoxin)
681	{
682	data = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE);
683	core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
684	shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE);
685	shared_per_thread = shared;
686
687	level1_icache_size = handle_zhaoxin (_SC_LEVEL1_ICACHE_SIZE);
688	level1_icache_linesize = handle_zhaoxin (_SC_LEVEL1_ICACHE_LINESIZE);
689	level1_dcache_size = data;
690	level1_dcache_assoc = handle_zhaoxin (_SC_LEVEL1_DCACHE_ASSOC);
691	level1_dcache_linesize = handle_zhaoxin (_SC_LEVEL1_DCACHE_LINESIZE);
692	level2_cache_size = core;
693	level2_cache_assoc = handle_zhaoxin (_SC_LEVEL2_CACHE_ASSOC);
694	level2_cache_linesize = handle_zhaoxin (_SC_LEVEL2_CACHE_LINESIZE);
695	level3_cache_size = shared;
696	level3_cache_assoc = handle_zhaoxin (_SC_LEVEL3_CACHE_ASSOC);
697	level3_cache_linesize = handle_zhaoxin (_SC_LEVEL3_CACHE_LINESIZE);
698
699	get_common_cache_info (&shared, &shared_per_thread, &threads, core);
700	}
701	else if (cpu_features->basic.kind == arch_kind_amd)
702	{
703	data = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
704	core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
705	shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);
706	shared_per_thread = shared;
707
708	level1_icache_size = handle_amd (_SC_LEVEL1_ICACHE_SIZE);
709	level1_icache_linesize = handle_amd (_SC_LEVEL1_ICACHE_LINESIZE);
710	level1_dcache_size = data;
711	level1_dcache_assoc = handle_amd (_SC_LEVEL1_DCACHE_ASSOC);
712	level1_dcache_linesize = handle_amd (_SC_LEVEL1_DCACHE_LINESIZE);
713	level2_cache_size = core;
714	level2_cache_assoc = handle_amd (_SC_LEVEL2_CACHE_ASSOC);
715	level2_cache_linesize = handle_amd (_SC_LEVEL2_CACHE_LINESIZE);
716	level3_cache_size = shared;
717	level3_cache_assoc = handle_amd (_SC_LEVEL3_CACHE_ASSOC);
718	level3_cache_linesize = handle_amd (_SC_LEVEL3_CACHE_LINESIZE);
719
720	if (shared <= `0`)
721	/ No shared L3 cache. All we have is the L2 cache. /
722	shared = core;
723
724	if (shared_per_thread <= `0`)
725	shared_per_thread = shared;
726	}
727
728	cpu_features->level1_icache_size = level1_icache_size;
729	cpu_features->level1_icache_linesize = level1_icache_linesize;
730	cpu_features->level1_dcache_size = level1_dcache_size;
731	cpu_features->level1_dcache_assoc = level1_dcache_assoc;
732	cpu_features->level1_dcache_linesize = level1_dcache_linesize;
733	cpu_features->level2_cache_size = level2_cache_size;
734	cpu_features->level2_cache_assoc = level2_cache_assoc;
735	cpu_features->level2_cache_linesize = level2_cache_linesize;
736	cpu_features->level3_cache_size = level3_cache_size;
737	cpu_features->level3_cache_assoc = level3_cache_assoc;
738	cpu_features->level3_cache_linesize = level3_cache_linesize;
739	cpu_features->level4_cache_size = level4_cache_size;
740
741	unsigned long int cachesize_non_temporal_divisor
742	= cpu_features->cachesize_non_temporal_divisor;
743	if (cachesize_non_temporal_divisor <= `0`)
744	cachesize_non_temporal_divisor = `4`;
745
746	/ The default setting for the non_temporal threshold is [1/8, 1/2] of size*
747	of the chip's cache (depending on `cachesize_non_temporal_divisor` which
748	is microarch specific. The default is 1/4). For most Intel processors
749	with an initial release date between 2017 and 2023, a thread's
750	typical share of the cache is from 18-64MB. Using a reasonable size
751	fraction of L3 is meant to estimate the point where non-temporal stores
752	begin out-competing REP MOVSB. As well the point where the fact that
753	non-temporal stores are forced back to main memory would already occurred
754	to the majority of the lines in the copy. Note, concerns about the entire
755	L3 cache being evicted by the copy are mostly alleviated by the fact that
756	modern HW detects streaming patterns and provides proper LRU hints so that
757	the maximum thrashing capped at 1/associativity. /*
758	unsigned long int non_temporal_threshold
759	= shared / cachesize_non_temporal_divisor;
760
761	/ If the computed non_temporal_threshold <= 3/4 * per-thread L3, we most*
762	likely have incorrect/incomplete cache info in which case, default to
763	3/4 per-thread L3 to avoid regressions. /
764	unsigned long int non_temporal_threshold_lowbound
765	= shared_per_thread * `3` / `4`;
766	if (non_temporal_threshold < non_temporal_threshold_lowbound)
767	non_temporal_threshold = non_temporal_threshold_lowbound;
768
769	/ If no ERMS, we use the per-thread L3 chunking. Normal cacheable stores run*
770	a higher risk of actually thrashing the cache as they don't have a HW LRU
771	hint. As well, their performance in highly parallel situations is
772	noticeably worse. /*
773	if (!CPU_FEATURE_USABLE_P (cpu_features, ERMS))
774	non_temporal_threshold = non_temporal_threshold_lowbound;
775	/ SIZE_MAX >> 4 because memmove-vec-unaligned-erms right-shifts the value of*
776	'x86_non_temporal_threshold' by `LOG_4X_MEMCPY_THRESH` (4) and it is best
777	if that operation cannot overflow. Minimum of 0x4040 (16448) because the
778	L(large_memset_4x) loops need 64-byte to cache align and enough space for
779	at least 1 iteration of 4x PAGE_SIZE unrolled loop. Both values are
780	reflected in the manual. /*
781	unsigned long int maximum_non_temporal_threshold = SIZE_MAX >> `4`;
782	unsigned long int minimum_non_temporal_threshold = `0x4040`;
783
784	/ If `non_temporal_threshold` less than `minimum_non_temporal_threshold`*
785	it most likely means we failed to detect the cache info. We don't want
786	to default to `minimum_non_temporal_threshold` as such a small value,
787	while correct, has bad performance. We default to 64MB as reasonable
788	default bound. 64MB is likely conservative in that most/all systems would
789	choose a lower value so it should never forcing non-temporal stores when
790	they otherwise wouldn't be used. /*
791	if (non_temporal_threshold < minimum_non_temporal_threshold)
792	non_temporal_threshold = `64` * `1024` * `1024`;
793	else if (non_temporal_threshold > maximum_non_temporal_threshold)
794	non_temporal_threshold = maximum_non_temporal_threshold;
795
796	/ NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8. /
797	unsigned int minimum_rep_movsb_threshold;
798	/ NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16) for*
799	VEC_SIZE == 64 or 32. For VEC_SIZE == 16, the default REP MOVSB
800	threshold is 2048 (VEC_SIZE / 16). /
801	unsigned int rep_movsb_threshold;
802	if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
803	&& !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
804	{
805	rep_movsb_threshold = `4096` * (`64` / `16`);
806	minimum_rep_movsb_threshold = `64` * `8`;
807	}
808	else if (CPU_FEATURE_PREFERRED_P (cpu_features,
809	AVX_Fast_Unaligned_Load))
810	{
811	rep_movsb_threshold = `4096` * (`32` / `16`);
812	minimum_rep_movsb_threshold = `32` * `8`;
813	}
814	else
815	{
816	rep_movsb_threshold = `2048` * (`16` / `16`);
817	minimum_rep_movsb_threshold = `16` * `8`;
818	}
819	/ NB: The default REP MOVSB threshold is 2112 on processors with fast*
820	short REP MOVSB (FSRM). /*
821	if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
822	rep_movsb_threshold = `2112`;
823
824	/ The default threshold to use Enhanced REP STOSB. /
825	unsigned long int rep_stosb_threshold = `2048`;
826
827	long int tunable_size;
828
829	tunable_size = TUNABLE_GET (x86_data_cache_size, long int, NULL);
830	/ NB: Ignore the default value 0. /
831	if (tunable_size != `0`)
832	data = tunable_size;
833
834	tunable_size = TUNABLE_GET (x86_shared_cache_size, long int, NULL);
835	/ NB: Ignore the default value 0. /
836	if (tunable_size != `0`)
837	shared = tunable_size;
838
839	tunable_size = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL);
840	if (tunable_size > minimum_non_temporal_threshold
841	&& tunable_size <= maximum_non_temporal_threshold)
842	non_temporal_threshold = tunable_size;
843
844	tunable_size = TUNABLE_GET (x86_rep_movsb_threshold, long int, NULL);
845	if (tunable_size > minimum_rep_movsb_threshold)
846	rep_movsb_threshold = tunable_size;
847
848	/ NB: The default value of the x86_rep_stosb_threshold tunable is the*
849	same as the default value of __x86_rep_stosb_threshold and the
850	minimum value is fixed. /*
851	rep_stosb_threshold = TUNABLE_GET (x86_rep_stosb_threshold,
852	long int, NULL);
853
854	TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, `0`, SIZE_MAX);
855	TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, `0`, SIZE_MAX);
856	TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
857	minimum_non_temporal_threshold,
858	maximum_non_temporal_threshold);
859	TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
860	minimum_rep_movsb_threshold, SIZE_MAX);
861	TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, `1`,
862	SIZE_MAX);
863
864	unsigned long int rep_movsb_stop_threshold;
865	/ ERMS feature is implemented from AMD Zen3 architecture and it is*
866	performing poorly for data above L2 cache size. Henceforth, adding
867	an upper bound threshold parameter to limit the usage of Enhanced
868	REP MOVSB operations and setting its value to L2 cache size. /*
869	if (cpu_features->basic.kind == arch_kind_amd)
870	rep_movsb_stop_threshold = core;
871	/ Setting the upper bound of ERMS to the computed value of*
872	non-temporal threshold for architectures other than AMD. /*
873	else
874	rep_movsb_stop_threshold = non_temporal_threshold;
875
876	cpu_features->data_cache_size = data;
877	cpu_features->shared_cache_size = shared;
878	cpu_features->non_temporal_threshold = non_temporal_threshold;
879	cpu_features->rep_movsb_threshold = rep_movsb_threshold;
880	cpu_features->rep_stosb_threshold = rep_stosb_threshold;
881	cpu_features->rep_movsb_stop_threshold = rep_movsb_stop_threshold;
882	}
883

Browse the source code of glibc/sysdeps/x86/dl-cacheinfo.h