allocatestack.c source code [glibc/nptl/allocatestack.c]

/* Copyright (C) 2002-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
17
18	#include <assert.h>
19	#include <errno.h>
20	#include <signal.h>
21	#include <stdint.h>
22	#include <string.h>
23	#include <unistd.h>
24	#include <sys/mman.h>
25	#include <sys/param.h>
26	#include <dl-sysdep.h>
27	#include <dl-tls.h>
28	#include <tls.h>
29	#include <list.h>
30	#include <lowlevellock.h>
31	#include <futex-internal.h>
32	#include <kernel-features.h>
33	#include <nptl-stack.h>
34	#include <libc-lock.h>
35	#include <tls-internal.h>
36
/* Alignment used for thread stacks unless it was already defined
   before this point.  */
#if !defined STACK_ALIGN
# define STACK_ALIGN __alignof__ (long double)
#endif

/* Least amount of stack that has to remain once the thread descriptor
   and the guard area have been accounted for.  */
#if !defined MINIMAL_REST_STACK
# define MINIMAL_REST_STACK 4096
#endif


/* Newer kernels offer MAP_STACK to mark a mapping as backing a stack.
   When the headers do not provide it, substitute a flag value that
   has no effect.  */
#if !defined MAP_STACK
# define MAP_STACK 0
#endif
54
55	/ Get a stack frame from the cache. We have to match by size since*
56	some blocks might be too small or far too large. /*
57	static struct pthread *
58	get_cached_stack (size_t sizep, void* **memp)
59	{
60	size_t size = *sizep;
61	struct pthread *result = NULL;
62	list_t *entry;
63
64	lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
65
66	/ Search the cache for a matching entry. We search for the*
67	smallest stack which has at least the required size. Note that
68	in normal situations the size of all allocated stacks is the
69	same. As the very least there are only a few different sizes.
70	Therefore this loop will exit early most of the time with an
71	exact match. /*
72	list_for_each (entry, &GL (dl_stack_cache))
73	{
74	struct pthread *curr;
75
76	curr = list_entry (entry, struct pthread, list);
77	if (__nptl_stack_in_use (curr) && curr->stackblock_size >= size)
78	{
79	if (curr->stackblock_size == size)
80	{
81	result = curr;
82	break;
83	}
84
85	if (result == NULL
86	\|\| result->stackblock_size > curr->stackblock_size)
87	result = curr;
88	}
89	}
90
91	if (__builtin_expect (result == NULL, `0`)
92	/ Make sure the size difference is not too excessive. In that*
93	case we do not use the block. /*
94	\|\| __builtin_expect (result->stackblock_size > `4` * size, `0`))
95	{
96	/ Release the lock. /
97	lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
98
99	return NULL;
100	}
101
102	/ Don't allow setxid until cloned. /
103	result->setxid_futex = -`1`;
104
105	/ Dequeue the entry. /
106	__nptl_stack_list_del (&result->list);
107
108	/ And add to the list of stacks in use. /
109	__nptl_stack_list_add (&result->list, &GL (dl_stack_used));
110
111	/ And decrease the cache size. /
112	GL (dl_stack_cache_actsize) -= result->stackblock_size;
113
114	/ Release the lock early. /
115	lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
116
117	/ Report size and location of the stack to the caller. /
118	*sizep = result->stackblock_size;
119	*memp = result->stackblock;
120
121	/ Cancellation handling is back to the default. /
122	result->cancelhandling = `0`;
123	result->cleanup = NULL;
124	result->setup_failed = `0`;
125
126	/ No pending event. /
127	result->nextevent = NULL;
128
129	result->exiting = false;
130	__libc_lock_init (result->exit_lock);
131	memset (&result->tls_state, `0`, sizeof result->tls_state);
132
133	/ Clear the DTV. /
134	dtv_t *dtv = GET_DTV (TLS_TPADJ (result));
135	for (size_t cnt = `0`; cnt < dtv[-`1`].counter; ++cnt)
136	free (dtv[`1` + cnt].pointer.to_free);
137	memset (dtv, `'\0'`, (dtv[-`1`].counter + `1`) * sizeof (dtv_t));
138
139	/ Re-initialize the TLS. /
140	_dl_allocate_tls_init (TLS_TPADJ (result), true);
141
142	return result;
143	}
144
/* Compute the start address of the guard area inside a stack block.

   MEM and SIZE describe the whole block, GUARDSIZE is the size of the
   guard area, PD is the thread descriptor stored in the block and
   PAGESIZE_M1 is the page size minus one (used as a rounding mask).
   Which placement applies is selected at compile time.  */
static inline char *
__attribute ((always_inline))
guard_position (void *mem, size_t size, size_t guardsize, struct pthread *pd,
                size_t pagesize_m1)
{
#ifdef NEED_SEPARATE_REGISTER_STACK
  /* Guard starts at half of the non-guard space, rounded down to a
     page boundary.  */
  const size_t guard_offset = ((size - guardsize) / 2) & ~pagesize_m1;
  return (char *) mem + guard_offset;
#elif _STACK_GROWS_DOWN
  /* Guard occupies the lowest addresses of the block.  */
  return (char *) mem;
#elif _STACK_GROWS_UP
  /* Guard ends just below the descriptor, page aligned downwards.  */
  const uintptr_t below_pd = (uintptr_t) pd - guardsize;
  return (char *) (below_pd & ~pagesize_m1);
#endif
}
159
/* Give the non-guard parts of a stack block the protection PROT.

   The block [MEM, MEM + SIZE) is expected to have been mapped with
   PROT_NONE; GUARD and GUARDSIZE describe the guard area, which is
   left untouched.  Returns 0 on success or the errno value of the
   failing __mprotect call.  */
static inline int
setup_stack_prot (char *mem, size_t size, char *guard, size_t guardsize,
                  const int prot)
{
  char *const after_guard = guard + guardsize;
#if _STACK_GROWS_DOWN && !defined(NEED_SEPARATE_REGISTER_STACK)
  /* guard_position puts the guard at the very start of the block for
     downward-growing stacks, so a single region after it remains.  */
  return __mprotect (after_guard, size - guardsize, prot) == 0 ? 0 : errno;
#else
  /* The guard may sit inside the block: open up the part before it
     and the part after it.  The second call is only made when the
     first one succeeded.  */
  const size_t before_len = (uintptr_t) guard - (uintptr_t) mem;
  const size_t after_len = ((uintptr_t) mem + size) - (uintptr_t) after_guard;

  if (__mprotect (mem, before_len, prot) != 0
      || __mprotect (after_guard, after_len, prot) != 0)
    return errno;

  return 0;
#endif
}
182
183	/ Mark the memory of the stack as usable to the kernel. It frees everything*
184	except for the space used for the TCB itself. /*
185	static __always_inline void
186	advise_stack_range (void *mem, size_t size, uintptr_t pd, size_t guardsize)
187	{
188	uintptr_t sp = (uintptr_t) CURRENT_STACK_FRAME;
189	size_t pagesize_m1 = __getpagesize () - `1`;
190	#if _STACK_GROWS_DOWN && !defined(NEED_SEPARATE_REGISTER_STACK)
191	size_t freesize = (sp - (uintptr_t) mem) & ~pagesize_m1;
192	assert (freesize < size);
193	if (freesize > PTHREAD_STACK_MIN)
194	__madvise (mem, freesize - PTHREAD_STACK_MIN, MADV_DONTNEED);
195	#else
196	/ Page aligned start of memory to free (higher than or equal*
197	to current sp plus the minimum stack size). /*
198	uintptr_t freeblock = (sp + PTHREAD_STACK_MIN + pagesize_m1) & ~pagesize_m1;
199	uintptr_t free_end = (pd - guardsize) & ~pagesize_m1;
200	if (free_end > freeblock)
201	{
202	size_t freesize = free_end - freeblock;
203	assert (freesize < size);
204	__madvise ((void*) freeblock, freesize, MADV_DONTNEED);
205	}
206	#endif
207	}
208
209	/ Returns a usable stack for a new thread either by allocating a*
210	new stack or reusing a cached stack of sufficient size.
211	ATTR must be non-NULL and point to a valid pthread_attr.
212	PDP must be non-NULL. /*
213	static int
214	allocate_stack (const struct pthread_attr attr, struct* pthread **pdp,
215	void *stack, size_t stacksize)
216	{
217	struct pthread *pd;
218	size_t size;
219	size_t pagesize_m1 = __getpagesize () - `1`;
220	size_t tls_static_size_for_stack = __nptl_tls_static_size_for_stack ();
221	size_t tls_static_align_m1 = GLRO (dl_tls_static_align) - `1`;
222
223	assert (powerof2 (pagesize_m1 + `1`));
224	assert (TCB_ALIGNMENT >= STACK_ALIGN);
225
226	/ Get the stack size from the attribute if it is set. Otherwise we*
227	use the default we determined at start time. /*
228	if (attr->stacksize != `0`)
229	size = attr->stacksize;
230	else
231	{
232	lll_lock (__default_pthread_attr_lock, LLL_PRIVATE);
233	size = __default_pthread_attr.internal.stacksize;
234	lll_unlock (__default_pthread_attr_lock, LLL_PRIVATE);
235	}
236
237	/ Get memory for the stack. /
238	if (__glibc_unlikely (attr->flags & ATTR_FLAG_STACKADDR))
239	{
240	uintptr_t adj;
241	char stackaddr = (char* *) attr->stackaddr;
242
243	/ Assume the same layout as the _STACK_GROWS_DOWN case, with struct*
244	pthread at the top of the stack block. Later we adjust the guard
245	location and stack address to match the _STACK_GROWS_UP case. /*
246	if (_STACK_GROWS_UP)
247	stackaddr += attr->stacksize;
248
249	/ If the user also specified the size of the stack make sure it*
250	is large enough. /*
251	if (attr->stacksize != `0`
252	&& attr->stacksize < (tls_static_size_for_stack
253	+ MINIMAL_REST_STACK))
254	return EINVAL;
255
256	/ Adjust stack size for alignment of the TLS block. /
257	#if TLS_TCB_AT_TP
258	adj = ((uintptr_t) stackaddr - TLS_TCB_SIZE)
259	& tls_static_align_m1;
260	assert (size > adj + TLS_TCB_SIZE);
261	#elif TLS_DTV_AT_TP
262	adj = ((uintptr_t) stackaddr - tls_static_size_for_stack)
263	& tls_static_align_m1;
264	assert (size > adj);
265	#endif
266
267	/ The user provided some memory. Let's hope it matches the*
268	size... We do not allocate guard pages if the user provided
269	the stack. It is the user's responsibility to do this if it
270	is wanted. /*
271	#if TLS_TCB_AT_TP
272	pd = (struct pthread *) ((uintptr_t) stackaddr
273	- TLS_TCB_SIZE - adj);
274	#elif TLS_DTV_AT_TP
275	pd = (struct pthread *) (((uintptr_t) stackaddr
276	- tls_static_size_for_stack - adj)
277	- TLS_PRE_TCB_SIZE);
278	#endif
279
280	/ The user provided stack memory needs to be cleared. /
281	memset (pd, `'\0'`, sizeof (struct pthread));
282
283	/ The first TSD block is included in the TCB. /
284	pd->specific[`0`] = pd->specific_1stblock;
285
286	/ Remember the stack-related values. /
287	pd->stackblock = (char *) stackaddr - size;
288	pd->stackblock_size = size;
289
290	/ This is a user-provided stack. It will not be queued in the*
291	stack cache nor will the memory (except the TLS memory) be freed. /*
292	pd->user_stack = true;
293
294	/ This is at least the second thread. /
295	pd->header.multiple_threads = `1`;
296
297	#ifdef NEED_DL_SYSINFO
298	SETUP_THREAD_SYSINFO (pd);
299	#endif
300
301	/ Don't allow setxid until cloned. /
302	pd->setxid_futex = -`1`;
303
304	/ Allocate the DTV for this thread. /
305	if (_dl_allocate_tls (TLS_TPADJ (pd)) == NULL)
306	{
307	/ Something went wrong. /
308	assert (errno == ENOMEM);
309	return errno;
310	}
311
312
313	/ Prepare to modify global data. /
314	lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
315
316	/ And add to the list of stacks in use. /
317	list_add (&pd->list, &GL (dl_stack_user));
318
319	lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
320	}
321	else
322	{
323	/ Allocate some anonymous memory. If possible use the cache. /
324	size_t guardsize;
325	size_t reported_guardsize;
326	size_t reqsize;
327	void *mem;
328	const int prot = (PROT_READ \| PROT_WRITE
329	\| ((GL(dl_stack_flags) & PF_X) ? PROT_EXEC : `0`));
330
331	/ Adjust the stack size for alignment. /
332	size &= ~tls_static_align_m1;
333	assert (size != `0`);
334
335	/ Make sure the size of the stack is enough for the guard and*
336	eventually the thread descriptor. On some targets there is
337	a minimum guard size requirement, ARCH_MIN_GUARD_SIZE, so
338	internally enforce it (unless the guard was disabled), but
339	report the original guard size for backward compatibility:
340	before POSIX 2008 the guardsize was specified to be one page
341	by default which is observable via pthread_attr_getguardsize
342	and pthread_getattr_np. /*
343	guardsize = (attr->guardsize + pagesize_m1) & ~pagesize_m1;
344	reported_guardsize = guardsize;
345	if (guardsize > `0` && guardsize < ARCH_MIN_GUARD_SIZE)
346	guardsize = ARCH_MIN_GUARD_SIZE;
347	if (guardsize < attr->guardsize \|\| size + guardsize < guardsize)
348	/ Arithmetic overflow. /
349	return EINVAL;
350	size += guardsize;
351	if (__builtin_expect (size < ((guardsize + tls_static_size_for_stack
352	+ MINIMAL_REST_STACK + pagesize_m1)
353	& ~pagesize_m1),
354	`0`))
355	/ The stack is too small (or the guard too large). /
356	return EINVAL;
357
358	/ Try to get a stack from the cache. /
359	reqsize = size;
360	pd = get_cached_stack (&size, &mem);
361	if (pd == NULL)
362	{
363	/ If a guard page is required, avoid committing memory by first*
364	allocate with PROT_NONE and then reserve with required permission
365	excluding the guard page. /*
366	mem = __mmap (NULL, size, (guardsize == `0`) ? prot : PROT_NONE,
367	MAP_PRIVATE \| MAP_ANONYMOUS \| MAP_STACK, -`1`, `0`);
368
369	if (__glibc_unlikely (mem == MAP_FAILED))
370	return errno;
371
372	/ Do madvise in case the tunable glibc.pthread.stack_hugetlb is*
373	set to 0, disabling hugetlb. /*
374	if (__glibc_unlikely (__nptl_stack_hugetlb == `0`)
375	&& __madvise (mem, size, MADV_NOHUGEPAGE) != `0`)
376	return errno;
377
378	/ SIZE is guaranteed to be greater than zero.*
379	So we can never get a null pointer back from mmap. /*
380	assert (mem != NULL);
381
382	/ Place the thread descriptor at the end of the stack. /
383	#if TLS_TCB_AT_TP
384	pd = (struct pthread *) ((((uintptr_t) mem + size)
385	- TLS_TCB_SIZE)
386	& ~tls_static_align_m1);
387	#elif TLS_DTV_AT_TP
388	pd = (struct pthread *) ((((uintptr_t) mem + size
389	- tls_static_size_for_stack)
390	& ~tls_static_align_m1)
391	- TLS_PRE_TCB_SIZE);
392	#endif
393
394	/ Now mprotect the required region excluding the guard area. /
395	if (__glibc_likely (guardsize > `0`))
396	{
397	char *guard = guard_position (mem, size, guardsize, pd,
398	pagesize_m1);
399	if (setup_stack_prot (mem, size, guard, guardsize, prot) != `0`)
400	{
401	__munmap (mem, size);
402	return errno;
403	}
404	}
405
406	/ Remember the stack-related values. /
407	pd->stackblock = mem;
408	pd->stackblock_size = size;
409	/ Update guardsize for newly allocated guardsize to avoid*
410	an mprotect in guard resize below. /*
411	pd->guardsize = guardsize;
412
413	/ We allocated the first block thread-specific data array.*
414	This address will not change for the lifetime of this
415	descriptor. /*
416	pd->specific[`0`] = pd->specific_1stblock;
417
418	/ This is at least the second thread. /
419	pd->header.multiple_threads = `1`;
420
421	#ifdef NEED_DL_SYSINFO
422	SETUP_THREAD_SYSINFO (pd);
423	#endif
424
425	/ Don't allow setxid until cloned. /
426	pd->setxid_futex = -`1`;
427
428	/ Allocate the DTV for this thread. /
429	if (_dl_allocate_tls (TLS_TPADJ (pd)) == NULL)
430	{
431	/ Something went wrong. /
432	assert (errno == ENOMEM);
433
434	/ Free the stack memory we just allocated. /
435	(void) __munmap (mem, size);
436
437	return errno;
438	}
439
440
441	/ Prepare to modify global data. /
442	lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
443
444	/ And add to the list of stacks in use. /
445	__nptl_stack_list_add (&pd->list, &GL (dl_stack_used));
446
447	lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
448
449
450	/ There might have been a race. Another thread might have*
451	caused the stacks to get exec permission while this new
452	stack was prepared. Detect if this was possible and
453	change the permission if necessary. /*
454	if (__builtin_expect ((GL(dl_stack_flags) & PF_X) != `0`
455	&& (prot & PROT_EXEC) == `0`, `0`))
456	{
457	int err = __nptl_change_stack_perm (pd);
458	if (err != `0`)
459	{
460	/ Free the stack memory we just allocated. /
461	(void) __munmap (mem, size);
462
463	return err;
464	}
465	}
466
467
468	/ Note that all of the stack and the thread descriptor is*
469	zeroed. This means we do not have to initialize fields
470	with initial value zero. This is specifically true for
471	the 'tid' field which is always set back to zero once the
472	stack is not used anymore and for the 'guardsize' field
473	which will be read next. /*
474	}
475
476	/ Create or resize the guard area if necessary. /
477	if (__glibc_unlikely (guardsize > pd->guardsize))
478	{
479	char *guard = guard_position (mem, size, guardsize, pd,
480	pagesize_m1);
481	if (__mprotect (guard, guardsize, PROT_NONE) != `0`)
482	{
483	mprot_error:
484	lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
485
486	/ Remove the thread from the list. /
487	__nptl_stack_list_del (&pd->list);
488
489	lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
490
491	/ Get rid of the TLS block we allocated. /
492	_dl_deallocate_tls (TLS_TPADJ (pd), false);
493
494	/ Free the stack memory regardless of whether the size*
495	of the cache is over the limit or not. If this piece
496	of memory caused problems we better do not use it
497	anymore. Uh, and we ignore possible errors. There
498	is nothing we could do. /*
499	(void) __munmap (mem, size);
500
501	return errno;
502	}
503
504	pd->guardsize = guardsize;
505	}
506	else if (__builtin_expect (pd->guardsize - guardsize > size - reqsize,
507	`0`))
508	{
509	/ The old guard area is too large. /
510
511	#ifdef NEED_SEPARATE_REGISTER_STACK
512	char *guard = mem + (((size - guardsize) / `2`) & ~pagesize_m1);
513	char *oldguard = mem + (((size - pd->guardsize) / `2`) & ~pagesize_m1);
514
515	if (oldguard < guard
516	&& __mprotect (oldguard, guard - oldguard, prot) != `0`)
517	goto mprot_error;
518
519	if (__mprotect (guard + guardsize,
520	oldguard + pd->guardsize - guard - guardsize,
521	prot) != `0`)
522	goto mprot_error;
523	#elif _STACK_GROWS_DOWN
524	if (__mprotect ((char *) mem + guardsize, pd->guardsize - guardsize,
525	prot) != `0`)
526	goto mprot_error;
527	#elif _STACK_GROWS_UP
528	char new_guard = (char* *)(((uintptr_t) pd - guardsize)
529	& ~pagesize_m1);
530	char old_guard = (char* *)(((uintptr_t) pd - pd->guardsize)
531	& ~pagesize_m1);
532	/ The guard size difference might be > 0, but once rounded*
533	to the nearest page the size difference might be zero. /*
534	if (new_guard > old_guard
535	&& __mprotect (old_guard, new_guard - old_guard, prot) != `0`)
536	goto mprot_error;
537	#endif
538
539	pd->guardsize = guardsize;
540	}
541	/ The pthread_getattr_np() calls need to get passed the size*
542	requested in the attribute, regardless of how large the
543	actually used guardsize is. /*
544	pd->reported_guardsize = reported_guardsize;
545	}
546
547	/ Initialize the lock. We have to do this unconditionally since the*
548	stillborn thread could be canceled while the lock is taken. /*
549	pd->lock = LLL_LOCK_INITIALIZER;
550
551	/ The robust mutex lists also need to be initialized*
552	unconditionally because the cleanup for the previous stack owner
553	might have happened in the kernel. /*
554	pd->robust_head.futex_offset = (offsetof (pthread_mutex_t, __data.__lock)
555	- offsetof (pthread_mutex_t,
556	__data.__list.__next));
557	pd->robust_head.list_op_pending = NULL;
558	#if __PTHREAD_MUTEX_HAVE_PREV
559	pd->robust_prev = &pd->robust_head;
560	#endif
561	pd->robust_head.list = &pd->robust_head;
562
563	/ We place the thread descriptor at the end of the stack. /
564	*pdp = pd;
565
566	void *stacktop;
567
568	#if TLS_TCB_AT_TP
569	/ The stack begins before the TCB and the static TLS block. /
570	stacktop = ((char *) (pd + `1`) - tls_static_size_for_stack);
571	#elif TLS_DTV_AT_TP
572	stacktop = (char *) (pd - `1`);
573	#endif
574
575	*stacksize = stacktop - pd->stackblock;
576	*stack = pd->stackblock;
577
578	return `0`;
579	}
580

Browse the source code of glibc/nptl/allocatestack.c