dl-cacheinfo.h source code [glibc/sysdeps/x86/dl-cacheinfo.h]

/* Initialize x86 cache info.
   Copyright (C) 2020-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
18
/* Table of the byte values that intel_check_word looks up with bsearch.
   The entries must stay sorted by ascending IDX for that lookup to
   work.  */
static const struct intel_02_cache_info
{
  /* Byte value this entry describes; search key of the table.  */
  unsigned char idx;
  /* Value reported for the ..._ASSOC query of the described cache.  */
  unsigned char assoc;
  /* Value reported for the ..._LINESIZE query of the described cache.  */
  unsigned char linesize;
  /* M () of the ..._SIZE constant of the cache this entry describes.  */
  unsigned char rel_name;
  /* Value reported for the ..._SIZE query of the described cache.  */
  unsigned int size;
} intel_02_known [] =
  {
/* Offset of the sysconf constant SC from _SC_LEVEL1_ICACHE_SIZE.  */
#define M(sc) ((sc) - _SC_LEVEL1_ICACHE_SIZE)
    { 0x06,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),    8192 },
    { 0x08,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),   16384 },
    { 0x09,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),   32768 },
    { 0x0a,  2, 32, M(_SC_LEVEL1_DCACHE_SIZE),    8192 },
    { 0x0c,  4, 32, M(_SC_LEVEL1_DCACHE_SIZE),   16384 },
    { 0x0d,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),   16384 },
    { 0x0e,  6, 64, M(_SC_LEVEL1_DCACHE_SIZE),   24576 },
    { 0x21,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x22,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),   524288 },
    { 0x23,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  1048576 },
    { 0x25,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
    { 0x29,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
    { 0x2c,  8, 64, M(_SC_LEVEL1_DCACHE_SIZE),   32768 },
    { 0x30,  8, 64, M(_SC_LEVEL1_ICACHE_SIZE),   32768 },
    { 0x39,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),   131072 },
    { 0x3a,  6, 64, M(_SC_LEVEL2_CACHE_SIZE),   196608 },
    { 0x3b,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),   131072 },
    { 0x3c,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x3d,  6, 64, M(_SC_LEVEL2_CACHE_SIZE),   393216 },
    { 0x3e,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x3f,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x41,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),   131072 },
    { 0x42,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x43,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x44,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
    { 0x45,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),  2097152 },
    { 0x46,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
    { 0x47,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  8388608 },
    { 0x48, 12, 64, M(_SC_LEVEL2_CACHE_SIZE),  3145728 },
    { 0x49, 16, 64, M(_SC_LEVEL2_CACHE_SIZE),  4194304 },
    { 0x4a, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  6291456 },
    { 0x4b, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  8388608 },
    { 0x4c, 12, 64, M(_SC_LEVEL3_CACHE_SIZE), 12582912 },
    { 0x4d, 16, 64, M(_SC_LEVEL3_CACHE_SIZE), 16777216 },
    { 0x4e, 24, 64, M(_SC_LEVEL2_CACHE_SIZE),  6291456 },
    { 0x60,  8, 64, M(_SC_LEVEL1_DCACHE_SIZE),   16384 },
    { 0x66,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),    8192 },
    { 0x67,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),   16384 },
    { 0x68,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),   32768 },
    { 0x78,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
    { 0x79,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   131072 },
    { 0x7a,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x7b,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x7c,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
    { 0x7d,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),  2097152 },
    { 0x7f,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x80,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x82,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x83,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x84,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
    { 0x85,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),  2097152 },
    { 0x86,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x87,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
    { 0xd0,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),   524288 },
    { 0xd1,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),  1048576 },
    { 0xd2,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
    { 0xd6,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  1048576 },
    { 0xd7,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
    { 0xd8,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
    { 0xdc, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
    { 0xdd, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
    { 0xde, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  8388608 },
    { 0xe2, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
    { 0xe3, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
    { 0xe4, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  8388608 },
    { 0xea, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 12582912 },
    { 0xeb, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 18874368 },
    { 0xec, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 25165824 },
  };

/* Number of entries in intel_02_known.  */
#define nintel_02_known (sizeof (intel_02_known) / sizeof (intel_02_known [0]))
100
101	static int
102	intel_02_known_compare (const void p1, const* void *p2)
103	{
104	const struct intel_02_cache_info *i1;
105	const struct intel_02_cache_info *i2;
106
107	i1 = (const struct intel_02_cache_info *) p1;
108	i2 = (const struct intel_02_cache_info *) p2;
109
110	if (i1->idx == i2->idx)
111	return `0`;
112
113	return i1->idx < i2->idx ? -`1` : `1`;
114	}
115
116
117	static long int
118	__attribute__ ((noinline))
119	intel_check_word (int name, unsigned int value, bool *has_level_2,
120	bool *no_level_2_or_3,
121	const struct cpu_features *cpu_features)
122	{
123	if ((value & `0x80000000`) != `0`)
124	/ The register value is reserved. /
125	return `0`;
126
127	/ Fold the name. The _SC_ constants are always in the order SIZE,*
128	ASSOC, LINESIZE. /*
129	int folded_rel_name = (M(name) / `3`) * `3`;
130
131	while (value != `0`)
132	{
133	unsigned int byte = value & `0xff`;
134
135	if (byte == `0x40`)
136	{
137	*no_level_2_or_3 = true;
138
139	if (folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
140	/ No need to look further. /
141	break;
142	}
143	else if (byte == `0xff`)
144	{
145	/ CPUID leaf 0x4 contains all the information. We need to*
146	iterate over it. /*
147	unsigned int eax;
148	unsigned int ebx;
149	unsigned int ecx;
150	unsigned int edx;
151
152	unsigned int round = `0`;
153	while (`1`)
154	{
155	__cpuid_count (`4`, round, eax, ebx, ecx, edx);
156
157	enum { null = `0`, data = `1`, inst = `2`, uni = `3` } type = eax & `0x1f`;
158	if (type == null)
159	/ That was the end. /
160	break;
161
162	unsigned int level = (eax >> `5`) & `0x7`;
163
164	if ((level == `1` && type == data
165	&& folded_rel_name == M(_SC_LEVEL1_DCACHE_SIZE))
166	\|\| (level == `1` && type == inst
167	&& folded_rel_name == M(_SC_LEVEL1_ICACHE_SIZE))
168	\|\| (level == `2` && folded_rel_name == M(_SC_LEVEL2_CACHE_SIZE))
169	\|\| (level == `3` && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
170	\|\| (level == `4` && folded_rel_name == M(_SC_LEVEL4_CACHE_SIZE)))
171	{
172	unsigned int offset = M(name) - folded_rel_name;
173
174	if (offset == `0`)
175	/ Cache size. /
176	return (((ebx >> `22`) + `1`)
177	* (((ebx >> `12`) & `0x3ff`) + `1`)
178	* ((ebx & `0xfff`) + `1`)
179	* (ecx + `1`));
180	if (offset == `1`)
181	return (ebx >> `22`) + `1`;
182
183	assert (offset == `2`);
184	return (ebx & `0xfff`) + `1`;
185	}
186
187	++round;
188	}
189	/ There is no other cache information anywhere else. /
190	break;
191	}
192	else
193	{
194	if (byte == `0x49` && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
195	{
196	/ Intel reused this value. For family 15, model 6 it*
197	specifies the 3rd level cache. Otherwise the 2nd
198	level cache. /*
199	unsigned int family = cpu_features->basic.family;
200	unsigned int model = cpu_features->basic.model;
201
202	if (family == `15` && model == `6`)
203	{
204	/ The level 3 cache is encoded for this model like*
205	the level 2 cache is for other models. Pretend
206	the caller asked for the level 2 cache. /*
207	name = (_SC_LEVEL2_CACHE_SIZE
208	+ (name - _SC_LEVEL3_CACHE_SIZE));
209	folded_rel_name = M(_SC_LEVEL2_CACHE_SIZE);
210	}
211	}
212
213	struct intel_02_cache_info *found;
214	struct intel_02_cache_info search;
215
216	search.idx = byte;
217	found = bsearch (&search, intel_02_known, nintel_02_known,
218	sizeof (intel_02_known[`0`]), intel_02_known_compare);
219	if (found != NULL)
220	{
221	if (found->rel_name == folded_rel_name)
222	{
223	unsigned int offset = M(name) - folded_rel_name;
224
225	if (offset == `0`)
226	/ Cache size. /
227	return found->size;
228	if (offset == `1`)
229	return found->assoc;
230
231	assert (offset == `2`);
232	return found->linesize;
233	}
234
235	if (found->rel_name == M(_SC_LEVEL2_CACHE_SIZE))
236	*has_level_2 = true;
237	}
238	}
239
240	/ Next byte for the next round. /
241	value >>= `8`;
242	}
243
244	/ Nothing found. /
245	return `0`;
246	}
247
248
249	static long int __attribute__ ((noinline))
250	handle_intel (int name, const struct cpu_features *cpu_features)
251	{
252	unsigned int maxidx = cpu_features->basic.max_cpuid;
253
254	/ Return -1 for older CPUs. /
255	if (maxidx < `2`)
256	return -`1`;
257
258	/ OK, we can use the CPUID instruction to get all info about the*
259	caches. /*
260	unsigned int cnt = `0`;
261	unsigned int max = `1`;
262	long int result = `0`;
263	bool no_level_2_or_3 = false;
264	bool has_level_2 = false;
265
266	while (cnt++ < max)
267	{
268	unsigned int eax;
269	unsigned int ebx;
270	unsigned int ecx;
271	unsigned int edx;
272	__cpuid (`2`, eax, ebx, ecx, edx);
273
274	/ The low byte of EAX in the first round contain the number of*
275	rounds we have to make. At least one, the one we are already
276	doing. /*
277	if (cnt == `1`)
278	{
279	max = eax & `0xff`;
280	eax &= `0xffffff00`;
281	}
282
283	/ Process the individual registers' value. /
284	result = intel_check_word (name, eax, &has_level_2,
285	&no_level_2_or_3, cpu_features);
286	if (result != `0`)
287	return result;
288
289	result = intel_check_word (name, ebx, &has_level_2,
290	&no_level_2_or_3, cpu_features);
291	if (result != `0`)
292	return result;
293
294	result = intel_check_word (name, ecx, &has_level_2,
295	&no_level_2_or_3, cpu_features);
296	if (result != `0`)
297	return result;
298
299	result = intel_check_word (name, edx, &has_level_2,
300	&no_level_2_or_3, cpu_features);
301	if (result != `0`)
302	return result;
303	}
304
305	if (name >= _SC_LEVEL2_CACHE_SIZE && name <= _SC_LEVEL3_CACHE_LINESIZE
306	&& no_level_2_or_3)
307	return -`1`;
308
309	return `0`;
310	}
311
312
/* Return the cache property NAME (one of the _SC_LEVEL*_CACHE_*
   constants) from CPUID leaf 0x8000001D.

   The subleaf is chosen from NAME: 1 for the level 1 instruction cache,
   0 for the level 1 data cache, 2 for level 2 and 3 for level 3.

   Returns 0 for any NAME above _SC_LEVEL3_CACHE_LINESIZE and whenever
   the leaf reports ECX == 0.  The trailing -1 is only reachable if NAME
   matches no case label and the assertion is compiled out.

   CPU_FEATURES is not consulted here.  */
static long int __attribute__ ((noinline))
handle_amd (int name, const struct cpu_features *cpu_features)
{
  unsigned int eax;
  unsigned int ebx;
  unsigned int ecx;
  unsigned int edx;
  unsigned int count = 0x1;

  /* No level 4 cache (yet).  */
  if (name > _SC_LEVEL3_CACHE_LINESIZE)
    return 0;

  if (name >= _SC_LEVEL3_CACHE_SIZE)
    count = 0x3;
  else if (name >= _SC_LEVEL2_CACHE_SIZE)
    count = 0x2;
  else if (name >= _SC_LEVEL1_DCACHE_SIZE)
    count = 0x0;

  __cpuid_count (0x8000001D, count, eax, ebx, ecx, edx);

  switch (name)
    {
    case _SC_LEVEL1_ICACHE_ASSOC:
    case _SC_LEVEL1_DCACHE_ASSOC:
    case _SC_LEVEL2_CACHE_ASSOC:
    case _SC_LEVEL3_CACHE_ASSOC:
      return ecx ? ((ebx >> 22) & 0x3ff) + 1 : 0;
    case _SC_LEVEL1_ICACHE_LINESIZE:
    case _SC_LEVEL1_DCACHE_LINESIZE:
    case _SC_LEVEL2_CACHE_LINESIZE:
    case _SC_LEVEL3_CACHE_LINESIZE:
      return ecx ? (ebx & 0xfff) + 1 : 0;
    case _SC_LEVEL1_ICACHE_SIZE:
    case _SC_LEVEL1_DCACHE_SIZE:
    case _SC_LEVEL2_CACHE_SIZE:
    case _SC_LEVEL3_CACHE_SIZE:
      /* Product of the ASSOC value, the LINESIZE value and ECX + 1.  */
      return ecx ? (((ebx >> 22) & 0x3ff) + 1) * ((ebx & 0xfff) + 1)
                   * (ecx + 1) : 0;
    default:
      assert (! "cannot happen");
    }
  return -1;
}
358
359
/* Return the cache property NAME (one of the _SC_LEVEL*_CACHE_*
   constants for level 1 to 3) by iterating over the subleaves of CPUID
   leaf 4 until one matches the requested level and type, or until a
   subleaf with type 0 ends the enumeration.  Returns 0 if no subleaf
   matched.  */
static long int __attribute__ ((noinline))
handle_zhaoxin (int name)
{
  unsigned int eax;
  unsigned int ebx;
  unsigned int ecx;
  unsigned int edx;

  /* M () of the ..._SIZE constant belonging to NAME; the constants come
     in groups of three per cache.  */
  int folded_rel_name = (M(name) / 3) * 3;

  unsigned int round = 0;
  while (1)
    {
      __cpuid_count (4, round, eax, ebx, ecx, edx);

      enum { null = 0, data = 1, inst = 2, uni = 3 } type = eax & 0x1f;
      if (type == null)
        break;

      unsigned int level = (eax >> 5) & 0x7;

      if ((level == 1 && type == data
           && folded_rel_name == M(_SC_LEVEL1_DCACHE_SIZE))
          || (level == 1 && type == inst
              && folded_rel_name == M(_SC_LEVEL1_ICACHE_SIZE))
          || (level == 2 && folded_rel_name == M(_SC_LEVEL2_CACHE_SIZE))
          || (level == 3 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE)))
        {
          /* 0 selects SIZE, 1 ASSOC and 2 LINESIZE.  */
          unsigned int offset = M(name) - folded_rel_name;

          if (offset == 0)
            /* Cache size.  */
            return (((ebx >> 22) + 1)
                    * (((ebx >> 12) & 0x3ff) + 1)
                    * ((ebx & 0xfff) + 1)
                    * (ecx + 1));
          if (offset == 1)
            return (ebx >> 22) + 1;

          assert (offset == 2);
          return (ebx & 0xfff) + 1;
        }

      ++round;
    }

  /* Nothing found.  */
  return 0;
}
409
410	static void
411	get_common_cache_info (long int shared_ptr, unsigned* int *threads_ptr,
412	long int core)
413	{
414	unsigned int eax;
415	unsigned int ebx;
416	unsigned int ecx;
417	unsigned int edx;
418
419	/ Number of logical processors sharing L2 cache. /
420	int threads_l2;
421
422	/ Number of logical processors sharing L3 cache. /
423	int threads_l3;
424
425	const struct cpu_features *cpu_features = __get_cpu_features ();
426	int max_cpuid = cpu_features->basic.max_cpuid;
427	unsigned int family = cpu_features->basic.family;
428	unsigned int model = cpu_features->basic.model;
429	long int shared = *shared_ptr;
430	unsigned int threads = *threads_ptr;
431	bool inclusive_cache = true;
432	bool support_count_mask = true;
433
434	/ Try L3 first. /
435	unsigned int level = `3`;
436
437	if (cpu_features->basic.kind == arch_kind_zhaoxin && family == `6`)
438	support_count_mask = false;
439
440	if (shared <= `0`)
441	{
442	/ Try L2 otherwise. /
443	level = `2`;
444	shared = core;
445	threads_l2 = `0`;
446	threads_l3 = -`1`;
447	}
448	else
449	{
450	threads_l2 = `0`;
451	threads_l3 = `0`;
452	}
453
454	/ A value of 0 for the HTT bit indicates there is only a single*
455	logical processor. /*
456	if (HAS_CPU_FEATURE (HTT))
457	{
458	/ Figure out the number of logical threads that share the*
459	highest cache level. /*
460	if (max_cpuid >= `4`)
461	{
462	int i = `0`;
463
464	/ Query until cache level 2 and 3 are enumerated. /
465	int check = `0x1` \| (threads_l3 == `0`) << `1`;
466	do
467	{
468	__cpuid_count (`4`, i++, eax, ebx, ecx, edx);
469
470	/ There seems to be a bug in at least some Pentium Ds*
471	which sometimes fail to iterate all cache parameters.
472	Do not loop indefinitely here, stop in this case and
473	assume there is no such information. /*
474	if (cpu_features->basic.kind == arch_kind_intel
475	&& (eax & `0x1f`) == `0` )
476	goto intel_bug_no_cache_info;
477
478	switch ((eax >> `5`) & `0x7`)
479	{
480	default:
481	break;
482	case `2`:
483	if ((check & `0x1`))
484	{
485	/ Get maximum number of logical processors*
486	sharing L2 cache. /*
487	threads_l2 = (eax >> `14`) & `0x3ff`;
488	check &= ~`0x1`;
489	}
490	break;
491	case `3`:
492	if ((check & (`0x1` << `1`)))
493	{
494	/ Get maximum number of logical processors*
495	sharing L3 cache. /*
496	threads_l3 = (eax >> `14`) & `0x3ff`;
497
498	/ Check if L2 and L3 caches are inclusive. /
499	inclusive_cache = (edx & `0x2`) != `0`;
500	check &= ~(`0x1` << `1`);
501	}
502	break;
503	}
504	}
505	while (check);
506
507	/ If max_cpuid >= 11, THREADS_L2/THREADS_L3 are the maximum*
508	numbers of addressable IDs for logical processors sharing
509	the cache, instead of the maximum number of threads
510	sharing the cache. /*
511	if (max_cpuid >= `11` && support_count_mask)
512	{
513	/ Find the number of logical processors shipped in*
514	one core and apply count mask. /*
515	i = `0`;
516
517	/ Count SMT only if there is L3 cache. Always count*
518	core if there is no L3 cache. /*
519	int count = ((threads_l2 > `0` && level == `3`)
520	\| ((threads_l3 > `0`
521	\|\| (threads_l2 > `0` && level == `2`)) << `1`));
522
523	while (count)
524	{
525	__cpuid_count (`11`, i++, eax, ebx, ecx, edx);
526
527	int shipped = ebx & `0xff`;
528	int type = ecx & `0xff00`;
529	if (shipped == `0` \|\| type == `0`)
530	break;
531	else if (type == `0x100`)
532	{
533	/ Count SMT. /
534	if ((count & `0x1`))
535	{
536	int count_mask;
537
538	/ Compute count mask. /
539	asm ("bsr %1, %0"
540	: "=r" (count_mask) : "g" (threads_l2));
541	count_mask = ~(-`1` << (count_mask + `1`));
542	threads_l2 = (shipped - `1`) & count_mask;
543	count &= ~`0x1`;
544	}
545	}
546	else if (type == `0x200`)
547	{
548	/ Count core. /
549	if ((count & (`0x1` << `1`)))
550	{
551	int count_mask;
552	int threads_core
553	= (level == `2` ? threads_l2 : threads_l3);
554
555	/ Compute count mask. /
556	asm ("bsr %1, %0"
557	: "=r" (count_mask) : "g" (threads_core));
558	count_mask = ~(-`1` << (count_mask + `1`));
559	threads_core = (shipped - `1`) & count_mask;
560	if (level == `2`)
561	threads_l2 = threads_core;
562	else
563	threads_l3 = threads_core;
564	count &= ~(`0x1` << `1`);
565	}
566	}
567	}
568	}
569	if (threads_l2 > `0`)
570	threads_l2 += `1`;
571	if (threads_l3 > `0`)
572	threads_l3 += `1`;
573	if (level == `2`)
574	{
575	if (threads_l2)
576	{
577	threads = threads_l2;
578	if (cpu_features->basic.kind == arch_kind_intel
579	&& threads > `2`
580	&& family == `6`)
581	switch (model)
582	{
583	case `0x37`:
584	case `0x4a`:
585	case `0x4d`:
586	case `0x5a`:
587	case `0x5d`:
588	/ Silvermont has L2 cache shared by 2 cores. /
589	threads = `2`;
590	break;
591	default:
592	break;
593	}
594	}
595	}
596	else if (threads_l3)
597	threads = threads_l3;
598	}
599	else
600	{
601	intel_bug_no_cache_info:
602	/ Assume that all logical threads share the highest cache*
603	level. /*
604	threads
605	= ((cpu_features->features[CPUID_INDEX_1].cpuid.ebx >> `16`)
606	& `0xff`);
607	}
608
609	/ Cap usage of highest cache level to the number of supported*
610	threads. /*
611	if (shared > `0` && threads > `0`)
612	shared /= threads;
613	}
614
615	/ Account for non-inclusive L2 and L3 caches. /
616	if (!inclusive_cache)
617	{
618	if (threads_l2 > `0`)
619	core /= threads_l2;
620	shared += core;
621	}
622
623	*shared_ptr = shared;
624	*threads_ptr = threads;
625	}
626
627	static void
628	dl_init_cacheinfo (struct cpu_features *cpu_features)
629	{
630	/ Find out what brand of processor. /
631	long int data = -`1`;
632	long int shared = -`1`;
633	long int core = -`1`;
634	unsigned int threads = `0`;
635	unsigned long int level1_icache_size = -`1`;
636	unsigned long int level1_icache_linesize = -`1`;
637	unsigned long int level1_dcache_size = -`1`;
638	unsigned long int level1_dcache_assoc = -`1`;
639	unsigned long int level1_dcache_linesize = -`1`;
640	unsigned long int level2_cache_size = -`1`;
641	unsigned long int level2_cache_assoc = -`1`;
642	unsigned long int level2_cache_linesize = -`1`;
643	unsigned long int level3_cache_size = -`1`;
644	unsigned long int level3_cache_assoc = -`1`;
645	unsigned long int level3_cache_linesize = -`1`;
646	unsigned long int level4_cache_size = -`1`;
647
648	if (cpu_features->basic.kind == arch_kind_intel)
649	{
650	data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, cpu_features);
651	core = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
652	shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features);
653
654	level1_icache_size
655	= handle_intel (_SC_LEVEL1_ICACHE_SIZE, cpu_features);
656	level1_icache_linesize
657	= handle_intel (_SC_LEVEL1_ICACHE_LINESIZE, cpu_features);
658	level1_dcache_size = data;
659	level1_dcache_assoc
660	= handle_intel (_SC_LEVEL1_DCACHE_ASSOC, cpu_features);
661	level1_dcache_linesize
662	= handle_intel (_SC_LEVEL1_DCACHE_LINESIZE, cpu_features);
663	level2_cache_size = core;
664	level2_cache_assoc
665	= handle_intel (_SC_LEVEL2_CACHE_ASSOC, cpu_features);
666	level2_cache_linesize
667	= handle_intel (_SC_LEVEL2_CACHE_LINESIZE, cpu_features);
668	level3_cache_size = shared;
669	level3_cache_assoc
670	= handle_intel (_SC_LEVEL3_CACHE_ASSOC, cpu_features);
671	level3_cache_linesize
672	= handle_intel (_SC_LEVEL3_CACHE_LINESIZE, cpu_features);
673	level4_cache_size
674	= handle_intel (_SC_LEVEL4_CACHE_SIZE, cpu_features);
675
676	get_common_cache_info (&shared, &threads, core);
677	}
678	else if (cpu_features->basic.kind == arch_kind_zhaoxin)
679	{
680	data = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE);
681	core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
682	shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE);
683
684	level1_icache_size = handle_zhaoxin (_SC_LEVEL1_ICACHE_SIZE);
685	level1_icache_linesize = handle_zhaoxin (_SC_LEVEL1_ICACHE_LINESIZE);
686	level1_dcache_size = data;
687	level1_dcache_assoc = handle_zhaoxin (_SC_LEVEL1_DCACHE_ASSOC);
688	level1_dcache_linesize = handle_zhaoxin (_SC_LEVEL1_DCACHE_LINESIZE);
689	level2_cache_size = core;
690	level2_cache_assoc = handle_zhaoxin (_SC_LEVEL2_CACHE_ASSOC);
691	level2_cache_linesize = handle_zhaoxin (_SC_LEVEL2_CACHE_LINESIZE);
692	level3_cache_size = shared;
693	level3_cache_assoc = handle_zhaoxin (_SC_LEVEL3_CACHE_ASSOC);
694	level3_cache_linesize = handle_zhaoxin (_SC_LEVEL3_CACHE_LINESIZE);
695
696	get_common_cache_info (&shared, &threads, core);
697	}
698	else if (cpu_features->basic.kind == arch_kind_amd)
699	{
700	data = handle_amd (_SC_LEVEL1_DCACHE_SIZE, cpu_features);
701	core = handle_amd (_SC_LEVEL2_CACHE_SIZE, cpu_features);
702	shared = handle_amd (_SC_LEVEL3_CACHE_SIZE, cpu_features);
703
704	level1_icache_size = handle_amd (_SC_LEVEL1_ICACHE_SIZE, cpu_features);
705	level1_icache_linesize
706	= handle_amd (_SC_LEVEL1_ICACHE_LINESIZE, cpu_features);
707	level1_dcache_size = data;
708	level1_dcache_assoc
709	= handle_amd (_SC_LEVEL1_DCACHE_ASSOC, cpu_features);
710	level1_dcache_linesize
711	= handle_amd (_SC_LEVEL1_DCACHE_LINESIZE, cpu_features);
712	level2_cache_size = core;
713	level2_cache_assoc = handle_amd (_SC_LEVEL2_CACHE_ASSOC, cpu_features);
714	level2_cache_linesize
715	= handle_amd (_SC_LEVEL2_CACHE_LINESIZE, cpu_features);
716	level3_cache_size = shared;
717	level3_cache_assoc = handle_amd (_SC_LEVEL3_CACHE_ASSOC, cpu_features);
718	level3_cache_linesize
719	= handle_amd (_SC_LEVEL3_CACHE_LINESIZE, cpu_features);
720
721	if (shared <= `0`)
722	/ No shared L3 cache. All we have is the L2 cache. /
723	shared = core;
724	}
725
726	cpu_features->level1_icache_size = level1_icache_size;
727	cpu_features->level1_icache_linesize = level1_icache_linesize;
728	cpu_features->level1_dcache_size = level1_dcache_size;
729	cpu_features->level1_dcache_assoc = level1_dcache_assoc;
730	cpu_features->level1_dcache_linesize = level1_dcache_linesize;
731	cpu_features->level2_cache_size = level2_cache_size;
732	cpu_features->level2_cache_assoc = level2_cache_assoc;
733	cpu_features->level2_cache_linesize = level2_cache_linesize;
734	cpu_features->level3_cache_size = level3_cache_size;
735	cpu_features->level3_cache_assoc = level3_cache_assoc;
736	cpu_features->level3_cache_linesize = level3_cache_linesize;
737	cpu_features->level4_cache_size = level4_cache_size;
738
739	/ The default setting for the non_temporal threshold is 3/4 of one*
740	thread's share of the chip's cache. For most Intel and AMD processors
741	with an initial release date between 2017 and 2020, a thread's typical
742	share of the cache is from 500 KBytes to 2 MBytes. Using the 3/4
743	threshold leaves 125 KBytes to 500 KBytes of the thread's data
744	in cache after a maximum temporal copy, which will maintain
745	in cache a reasonable portion of the thread's stack and other
746	active data. If the threshold is set higher than one thread's
747	share of the cache, it has a substantial risk of negatively
748	impacting the performance of other threads running on the chip. /*
749	unsigned long int non_temporal_threshold = shared * `3` / `4`;
750	/ SIZE_MAX >> 4 because memmove-vec-unaligned-erms right-shifts the value of*
751	'x86_non_temporal_threshold' by `LOG_4X_MEMCPY_THRESH` (4) and it is best
752	if that operation cannot overflow. Minimum of 0x4040 (16448) because the
753	L(large_memset_4x) loops need 64-byte to cache align and enough space for
754	at least 1 iteration of 4x PAGE_SIZE unrolled loop. Both values are
755	reflected in the manual. /*
756	unsigned long int maximum_non_temporal_threshold = SIZE_MAX >> `4`;
757	unsigned long int minimum_non_temporal_threshold = `0x4040`;
758	if (non_temporal_threshold < minimum_non_temporal_threshold)
759	non_temporal_threshold = minimum_non_temporal_threshold;
760	else if (non_temporal_threshold > maximum_non_temporal_threshold)
761	non_temporal_threshold = maximum_non_temporal_threshold;
762
763	#if HAVE_TUNABLES
764	/ NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8. /
765	unsigned int minimum_rep_movsb_threshold;
766	#endif
767	/ NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16) for*
768	VEC_SIZE == 64 or 32. For VEC_SIZE == 16, the default REP MOVSB
769	threshold is 2048 (VEC_SIZE / 16). /
770	unsigned int rep_movsb_threshold;
771	if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
772	&& !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
773	{
774	rep_movsb_threshold = `4096` * (`64` / `16`);
775	#if HAVE_TUNABLES
776	minimum_rep_movsb_threshold = `64` * `8`;
777	#endif
778	}
779	else if (CPU_FEATURE_PREFERRED_P (cpu_features,
780	AVX_Fast_Unaligned_Load))
781	{
782	rep_movsb_threshold = `4096` * (`32` / `16`);
783	#if HAVE_TUNABLES
784	minimum_rep_movsb_threshold = `32` * `8`;
785	#endif
786	}
787	else
788	{
789	rep_movsb_threshold = `2048` * (`16` / `16`);
790	#if HAVE_TUNABLES
791	minimum_rep_movsb_threshold = `16` * `8`;
792	#endif
793	}
794	/ NB: The default REP MOVSB threshold is 2112 on processors with fast*
795	short REP MOVSB (FSRM). /*
796	if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
797	rep_movsb_threshold = `2112`;
798
799	/ The default threshold to use Enhanced REP STOSB. /
800	unsigned long int rep_stosb_threshold = `2048`;
801
802	#if HAVE_TUNABLES
803	long int tunable_size;
804
805	tunable_size = TUNABLE_GET (x86_data_cache_size, long int, NULL);
806	/ NB: Ignore the default value 0. /
807	if (tunable_size != `0`)
808	data = tunable_size;
809
810	tunable_size = TUNABLE_GET (x86_shared_cache_size, long int, NULL);
811	/ NB: Ignore the default value 0. /
812	if (tunable_size != `0`)
813	shared = tunable_size;
814
815	tunable_size = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL);
816	if (tunable_size > minimum_non_temporal_threshold
817	&& tunable_size <= maximum_non_temporal_threshold)
818	non_temporal_threshold = tunable_size;
819
820	tunable_size = TUNABLE_GET (x86_rep_movsb_threshold, long int, NULL);
821	if (tunable_size > minimum_rep_movsb_threshold)
822	rep_movsb_threshold = tunable_size;
823
824	/ NB: The default value of the x86_rep_stosb_threshold tunable is the*
825	same as the default value of __x86_rep_stosb_threshold and the
826	minimum value is fixed. /*
827	rep_stosb_threshold = TUNABLE_GET (x86_rep_stosb_threshold,
828	long int, NULL);
829
830	TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, `0`, SIZE_MAX);
831	TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, `0`, SIZE_MAX);
832	TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
833	minimum_non_temporal_threshold,
834	maximum_non_temporal_threshold);
835	TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
836	minimum_rep_movsb_threshold, SIZE_MAX);
837	TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, `1`,
838	SIZE_MAX);
839	#endif
840
841	unsigned long int rep_movsb_stop_threshold;
842	/ ERMS feature is implemented from AMD Zen3 architecture and it is*
843	performing poorly for data above L2 cache size. Henceforth, adding
844	an upper bound threshold parameter to limit the usage of Enhanced
845	REP MOVSB operations and setting its value to L2 cache size. /*
846	if (cpu_features->basic.kind == arch_kind_amd)
847	rep_movsb_stop_threshold = core;
848	/ Setting the upper bound of ERMS to the computed value of*
849	non-temporal threshold for architectures other than AMD. /*
850	else
851	rep_movsb_stop_threshold = non_temporal_threshold;
852
853	cpu_features->data_cache_size = data;
854	cpu_features->shared_cache_size = shared;
855	cpu_features->non_temporal_threshold = non_temporal_threshold;
856	cpu_features->rep_movsb_threshold = rep_movsb_threshold;
857	cpu_features->rep_stosb_threshold = rep_stosb_threshold;
858	cpu_features->rep_movsb_stop_threshold = rep_movsb_stop_threshold;
859	}
860

Browse the source code of glibc/sysdeps/x86/dl-cacheinfo.h