/* Copyright (C) 2002-2020 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <assert.h>
#include <errno.h>
#include <signal.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/param.h>
#include <dl-sysdep.h>
#include <dl-tls.h>
#include <tls.h>
#include <list.h>
#include <lowlevellock.h>
#include <futex-internal.h>
#include <kernel-features.h>
#include <stack-aliasing.h>


#ifndef NEED_SEPARATE_REGISTER_STACK

/* Most architectures have exactly one stack pointer.  Some have more.  */
# define STACK_VARIABLES void *stackaddr = NULL

/* How to pass the values to the 'create_thread' function.  */
# define STACK_VARIABLES_ARGS stackaddr

/* How to declare a function which takes these parameters.  */
# define STACK_VARIABLES_PARMS void *stackaddr

/* How to declare allocate_stack.  */
# define ALLOCATE_STACK_PARMS void **stack

/* This is how the function is called.  We do it this way to allow
   other variants of the function to have more parameters.  */
# define ALLOCATE_STACK(attr, pd) allocate_stack (attr, pd, &stackaddr)
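
/* Illustrative usage (a sketch, not code from this file): pthread_create is
   expected to set up the variables and invoke the macro along the lines of

     STACK_VARIABLES;
     ...
     int err = ALLOCATE_STACK (iattr, &pd);

   which, with the single-stack definitions above, expands to
   allocate_stack (iattr, &pd, &stackaddr).  */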

#else

/* We need two stacks.  The kernel will place them but we have to tell
   the kernel about the size of the reserved address space.  */
# define STACK_VARIABLES void *stackaddr = NULL; size_t stacksize = 0

/* How to pass the values to the 'create_thread' function.  */
# define STACK_VARIABLES_ARGS stackaddr, stacksize

/* How to declare a function which takes these parameters.  */
# define STACK_VARIABLES_PARMS void *stackaddr, size_t stacksize

/* How to declare allocate_stack.  */
# define ALLOCATE_STACK_PARMS void **stack, size_t *stacksize

/* This is how the function is called.  We do it this way to allow
   other variants of the function to have more parameters.  */
# define ALLOCATE_STACK(attr, pd) \
  allocate_stack (attr, pd, &stackaddr, &stacksize)

#endif


/* Default alignment of stack.  */
#ifndef STACK_ALIGN
# define STACK_ALIGN __alignof__ (long double)
#endif

/* Default value for minimal stack size after allocating thread
   descriptor and guard.  */
#ifndef MINIMAL_REST_STACK
# define MINIMAL_REST_STACK 4096
#endif


/* Newer kernels have the MAP_STACK flag to indicate a mapping is used for
   a stack.  Use it when possible.  */
#ifndef MAP_STACK
# define MAP_STACK 0
#endif

/* This yields the pointer that TLS support code calls the thread pointer.  */
#if TLS_TCB_AT_TP
# define TLS_TPADJ(pd) (pd)
#elif TLS_DTV_AT_TP
# define TLS_TPADJ(pd) ((struct pthread *)((char *) (pd) + TLS_PRE_TCB_SIZE))
#endif
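
/* Informal note, based on the definitions above: with TLS_TCB_AT_TP the
   struct pthread itself serves as the TCB, so no adjustment is needed;
   with TLS_DTV_AT_TP the TCB lives TLS_PRE_TCB_SIZE bytes above the start
   of struct pthread, so the descriptor pointer has to be advanced by that
   amount before it is handed to the TLS support code.  */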

/* Cache handling for not-yet free stacks.  */

/* Maximum size of the stack cache, in bytes.  */
static size_t stack_cache_maxsize = 40 * 1024 * 1024; /* 40 MiB by default.  */
static size_t stack_cache_actsize;

/* Lock protecting the stack cache size and the stack lists below.  */
static int stack_cache_lock = LLL_LOCK_INITIALIZER;

/* List of cached stacks available for reuse.  */
static LIST_HEAD (stack_cache);

/* List of the stacks in use.  */
static LIST_HEAD (stack_used);

/* We need to record what list operations we are going to do so that,
   in case of an asynchronous interruption due to a fork() call, we
   can fix up the lists afterwards.  */
static uintptr_t in_flight_stack;
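
/* Encoding note (derived from stack_list_add, stack_list_del and
   __reclaim_stacks below): the low bit of in_flight_stack is set while an
   add operation is in flight and clear for a delete; the remaining bits
   hold the address of the list element being manipulated.  */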

/* List of the threads with user provided stacks in use.  No need to
   initialize this, since it's done in __pthread_initialize_minimal.  */
list_t __stack_user __attribute__ ((nocommon));
hidden_data_def (__stack_user)


/* Check whether the stack is still used or not.  */
#define FREE_P(descr) ((descr)->tid <= 0)


static void
stack_list_del (list_t *elem)
{
  in_flight_stack = (uintptr_t) elem;

  atomic_write_barrier ();

  list_del (elem);

  atomic_write_barrier ();

  in_flight_stack = 0;
}


static void
stack_list_add (list_t *elem, list_t *list)
{
  in_flight_stack = (uintptr_t) elem | 1;

  atomic_write_barrier ();

  list_add (elem, list);

  atomic_write_barrier ();

  in_flight_stack = 0;
}

/* We create a doubly-linked list of all cache entries.  Doubly linked
   because this allows removing entries from the end.  */


/* Get a stack from the cache.  We have to match by size since
   some blocks might be too small or far too large.  */
static struct pthread *
get_cached_stack (size_t *sizep, void **memp)
{
  size_t size = *sizep;
  struct pthread *result = NULL;
  list_t *entry;

  lll_lock (stack_cache_lock, LLL_PRIVATE);

  /* Search the cache for a matching entry.  We search for the
     smallest stack which has at least the required size.  Note that
     in normal situations the size of all allocated stacks is the
     same.  At the very least there are only a few different sizes.
     Therefore this loop will exit early most of the time with an
     exact match.  */
  list_for_each (entry, &stack_cache)
    {
      struct pthread *curr;

      curr = list_entry (entry, struct pthread, list);
      if (FREE_P (curr) && curr->stackblock_size >= size)
        {
          if (curr->stackblock_size == size)
            {
              result = curr;
              break;
            }

          if (result == NULL
              || result->stackblock_size > curr->stackblock_size)
            result = curr;
        }
    }

  if (__builtin_expect (result == NULL, 0)
      /* Make sure the size difference is not too excessive.  In that
         case we do not use the block.  */
      || __builtin_expect (result->stackblock_size > 4 * size, 0))
    {
      /* Release the lock.  */
      lll_unlock (stack_cache_lock, LLL_PRIVATE);

      return NULL;
    }

  /* Don't allow setxid until cloned.  */
  result->setxid_futex = -1;

  /* Dequeue the entry.  */
  stack_list_del (&result->list);

  /* And add to the list of stacks in use.  */
  stack_list_add (&result->list, &stack_used);

  /* And decrease the cache size.  */
  stack_cache_actsize -= result->stackblock_size;

  /* Release the lock early.  */
  lll_unlock (stack_cache_lock, LLL_PRIVATE);

  /* Report size and location of the stack to the caller.  */
  *sizep = result->stackblock_size;
  *memp = result->stackblock;

  /* Cancellation handling is back to the default.  */
  result->cancelhandling = 0;
  result->cleanup = NULL;

  /* No pending event.  */
  result->nextevent = NULL;

  result->tls_state = (struct tls_internal_t) { 0 };

  /* Clear the DTV.  */
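  /* Note: dtv[-1].counter records how many DTV slots were allocated for
     this thread, so the loop below walks every slot (dtv[1] onward) and
     releases any dynamically allocated TLS block before the array is
     wiped.  */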
  dtv_t *dtv = GET_DTV (TLS_TPADJ (result));
  for (size_t cnt = 0; cnt < dtv[-1].counter; ++cnt)
    free (dtv[1 + cnt].pointer.to_free);
  memset (dtv, '\0', (dtv[-1].counter + 1) * sizeof (dtv_t));

  /* Re-initialize the TLS.  */
  _dl_allocate_tls_init (TLS_TPADJ (result));

  return result;
}


/* Free stacks until cache size is lower than LIMIT.  */
static void
free_stacks (size_t limit)
{
  /* We reduce the size of the cache.  Remove the last entries until
     the size is below the limit.  */
  list_t *entry;
  list_t *prev;

  /* Search from the end of the list.  */
  list_for_each_prev_safe (entry, prev, &stack_cache)
    {
      struct pthread *curr;

      curr = list_entry (entry, struct pthread, list);
      if (FREE_P (curr))
        {
          /* Unlink the block.  */
          stack_list_del (entry);

          /* Account for the freed memory.  */
          stack_cache_actsize -= curr->stackblock_size;

          /* Free the memory associated with the ELF TLS.  */
          _dl_deallocate_tls (TLS_TPADJ (curr), false);

          /* Remove this block.  This should never fail.  If it does
             something is really wrong.  */
          if (__munmap (curr->stackblock, curr->stackblock_size) != 0)
            abort ();

          /* Maybe we have freed enough.  */
          if (stack_cache_actsize <= limit)
            break;
        }
    }
}

/* Free all the stacks on cleanup.  */
void
__nptl_stacks_freeres (void)
{
  free_stacks (0);
}

/* Add a stack which is not used anymore to the cache.  Must be
   called with the cache lock held.  */
static inline void
__attribute ((always_inline))
queue_stack (struct pthread *stack)
{
  /* We unconditionally add the stack to the list.  The memory may
     still be in use but it will not be reused until the kernel marks
     the stack as not used anymore.  */
  stack_list_add (&stack->list, &stack_cache);

  stack_cache_actsize += stack->stackblock_size;
  if (__glibc_unlikely (stack_cache_actsize > stack_cache_maxsize))
    free_stacks (stack_cache_maxsize);
}


static int
change_stack_perm (struct pthread *pd
#ifdef NEED_SEPARATE_REGISTER_STACK
                   , size_t pagemask
#endif
                   )
{
#ifdef NEED_SEPARATE_REGISTER_STACK
  void *stack = (pd->stackblock
                 + (((((pd->stackblock_size - pd->guardsize) / 2)
                      & pagemask) + pd->guardsize) & pagemask));
  size_t len = pd->stackblock + pd->stackblock_size - stack;
#elif _STACK_GROWS_DOWN
  void *stack = pd->stackblock + pd->guardsize;
  size_t len = pd->stackblock_size - pd->guardsize;
#elif _STACK_GROWS_UP
  void *stack = pd->stackblock;
  size_t len = (uintptr_t) pd - pd->guardsize - (uintptr_t) pd->stackblock;
#else
# error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
#endif
  if (__mprotect (stack, len, PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
    return errno;

  return 0;
}

/* Return the guard page position on the allocated stack.  */
static inline char *
__attribute ((always_inline))
guard_position (void *mem, size_t size, size_t guardsize, struct pthread *pd,
                size_t pagesize_m1)
{
#ifdef NEED_SEPARATE_REGISTER_STACK
  return mem + (((size - guardsize) / 2) & ~pagesize_m1);
#elif _STACK_GROWS_DOWN
  return mem;
#elif _STACK_GROWS_UP
  return (char *) (((uintptr_t) pd - guardsize) & ~pagesize_m1);
#endif
}

/* Based on a stack allocated with PROT_NONE, set up the required portions
   with 'prot' flags based on the guard page position.  */
static inline int
setup_stack_prot (char *mem, size_t size, char *guard, size_t guardsize,
                  const int prot)
{
  char *guardend = guard + guardsize;
#if _STACK_GROWS_DOWN && !defined(NEED_SEPARATE_REGISTER_STACK)
  /* As defined in guard_position, for architectures where the stack grows
     downward the guard page is always at the start of the allocated area.  */
  if (__mprotect (guardend, size - guardsize, prot) != 0)
    return errno;
#else
  size_t mprots1 = (uintptr_t) guard - (uintptr_t) mem;
  if (__mprotect (mem, mprots1, prot) != 0)
    return errno;
  size_t mprots2 = ((uintptr_t) mem + size) - (uintptr_t) guardend;
  if (__mprotect (guardend, mprots2, prot) != 0)
    return errno;
#endif
  return 0;
}

/* Mark the memory of the stack as usable to the kernel.  It frees everything
   except for the space used for the TCB itself.  */
static __always_inline void
advise_stack_range (void *mem, size_t size, uintptr_t pd, size_t guardsize)
{
  uintptr_t sp = (uintptr_t) CURRENT_STACK_FRAME;
  size_t pagesize_m1 = __getpagesize () - 1;
#if _STACK_GROWS_DOWN && !defined(NEED_SEPARATE_REGISTER_STACK)
  size_t freesize = (sp - (uintptr_t) mem) & ~pagesize_m1;
  assert (freesize < size);
  if (freesize > PTHREAD_STACK_MIN)
    __madvise (mem, freesize - PTHREAD_STACK_MIN, MADV_DONTNEED);
#else
  /* Page aligned start of memory to free (higher than or equal
     to current sp plus the minimum stack size).  */
  uintptr_t freeblock = (sp + PTHREAD_STACK_MIN + pagesize_m1) & ~pagesize_m1;
  uintptr_t free_end = (pd - guardsize) & ~pagesize_m1;
  if (free_end > freeblock)
    {
      size_t freesize = free_end - freeblock;
      assert (freesize < size);
      __madvise ((void*) freeblock, freesize, MADV_DONTNEED);
    }
#endif
}

/* Returns a usable stack for a new thread either by allocating a
   new stack or reusing a cached stack of sufficient size.
   ATTR must be non-NULL and point to a valid pthread_attr.
   PDP must be non-NULL.  */
static int
allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
                ALLOCATE_STACK_PARMS)
{
  struct pthread *pd;
  size_t size;
  size_t pagesize_m1 = __getpagesize () - 1;

  assert (powerof2 (pagesize_m1 + 1));
  assert (TCB_ALIGNMENT >= STACK_ALIGN);

  /* Get the stack size from the attribute if it is set.  Otherwise we
     use the default we determined at start time.  */
  if (attr->stacksize != 0)
    size = attr->stacksize;
  else
    {
      lll_lock (__default_pthread_attr_lock, LLL_PRIVATE);
      size = __default_pthread_attr.internal.stacksize;
      lll_unlock (__default_pthread_attr_lock, LLL_PRIVATE);
    }

  /* Get memory for the stack.  */
  if (__glibc_unlikely (attr->flags & ATTR_FLAG_STACKADDR))
    {
      uintptr_t adj;
      char *stackaddr = (char *) attr->stackaddr;

      /* Assume the same layout as the _STACK_GROWS_DOWN case, with struct
         pthread at the top of the stack block.  Later we adjust the guard
         location and stack address to match the _STACK_GROWS_UP case.  */
      if (_STACK_GROWS_UP)
        stackaddr += attr->stacksize;

      /* If the user also specified the size of the stack make sure it
         is large enough.  */
      if (attr->stacksize != 0
          && attr->stacksize < (__static_tls_size + MINIMAL_REST_STACK))
        return EINVAL;

      /* Adjust stack size for alignment of the TLS block.  */
#if TLS_TCB_AT_TP
      adj = ((uintptr_t) stackaddr - TLS_TCB_SIZE)
            & __static_tls_align_m1;
      assert (size > adj + TLS_TCB_SIZE);
#elif TLS_DTV_AT_TP
      adj = ((uintptr_t) stackaddr - __static_tls_size)
            & __static_tls_align_m1;
      assert (size > adj);
#endif
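
      /* Worked example with illustrative values (not taken from any
         particular target): with TLS_TCB_AT_TP, TLS_TCB_SIZE == 0x70 and
         __static_tls_align_m1 == 63, a user stackaddr of 0x7f0000001000
         gives adj = (0x7f0000001000 - 0x70) & 63 = 0x10, so the descriptor
         computed below lands at 0x7f0000000f80, a 64-byte-aligned
         address.  */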

      /* The user provided some memory.  Let's hope it matches the
         size...  We do not allocate guard pages if the user provided
         the stack.  It is the user's responsibility to do this if it
         is wanted.  */
#if TLS_TCB_AT_TP
      pd = (struct pthread *) ((uintptr_t) stackaddr
                               - TLS_TCB_SIZE - adj);
#elif TLS_DTV_AT_TP
      pd = (struct pthread *) (((uintptr_t) stackaddr
                                - __static_tls_size - adj)
                               - TLS_PRE_TCB_SIZE);
#endif

      /* The user provided stack memory needs to be cleared.  */
      memset (pd, '\0', sizeof (struct pthread));

      /* The first TSD block is included in the TCB.  */
      pd->specific[0] = pd->specific_1stblock;

      /* Remember the stack-related values.  */
      pd->stackblock = (char *) stackaddr - size;
      pd->stackblock_size = size;

      /* This is a user-provided stack.  It will not be queued in the
         stack cache nor will the memory (except the TLS memory) be freed.  */
      pd->user_stack = true;

      /* This is at least the second thread.  */
      pd->header.multiple_threads = 1;
#ifndef TLS_MULTIPLE_THREADS_IN_TCB
      __pthread_multiple_threads = *__libc_multiple_threads_ptr = 1;
#endif

#ifdef NEED_DL_SYSINFO
      SETUP_THREAD_SYSINFO (pd);
#endif

      /* Don't allow setxid until cloned.  */
      pd->setxid_futex = -1;

      /* Allocate the DTV for this thread.  */
      if (_dl_allocate_tls (TLS_TPADJ (pd)) == NULL)
        {
          /* Something went wrong.  */
          assert (errno == ENOMEM);
          return errno;
        }


      /* Prepare to modify global data.  */
      lll_lock (stack_cache_lock, LLL_PRIVATE);

      /* And add to the list of stacks in use.  */
      list_add (&pd->list, &__stack_user);

      lll_unlock (stack_cache_lock, LLL_PRIVATE);
    }
  else
    {
      /* Allocate some anonymous memory.  If possible use the cache.  */
      size_t guardsize;
      size_t reqsize;
      void *mem;
      const int prot = (PROT_READ | PROT_WRITE
                        | ((GL(dl_stack_flags) & PF_X) ? PROT_EXEC : 0));

      /* Adjust the stack size for alignment.  */
      size &= ~__static_tls_align_m1;
      assert (size != 0);

      /* Make sure the size of the stack is enough for the guard and,
         if needed, the thread descriptor.  */
      guardsize = (attr->guardsize + pagesize_m1) & ~pagesize_m1;
      if (guardsize < attr->guardsize || size + guardsize < guardsize)
        /* Arithmetic overflow.  */
        return EINVAL;
      size += guardsize;
      if (__builtin_expect (size < ((guardsize + __static_tls_size
                                     + MINIMAL_REST_STACK + pagesize_m1)
                                    & ~pagesize_m1),
                            0))
        /* The stack is too small (or the guard too large).  */
        return EINVAL;

      /* Try to get a stack from the cache.  */
      reqsize = size;
      pd = get_cached_stack (&size, &mem);
      if (pd == NULL)
        {
          /* To avoid aliasing effects on a larger scale than pages we
             adjust the allocated stack size if necessary.  This way
             allocations directly following each other will not have
             aliasing problems.  */
#if MULTI_PAGE_ALIASING != 0
          if ((size % MULTI_PAGE_ALIASING) == 0)
            size += pagesize_m1 + 1;
#endif

          /* If a guard page is required, avoid committing memory by first
             allocating the whole region with PROT_NONE and then setting the
             required permissions on everything except the guard page.  */
          mem = __mmap (NULL, size, (guardsize == 0) ? prot : PROT_NONE,
                        MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);

          if (__glibc_unlikely (mem == MAP_FAILED))
            return errno;

          /* SIZE is guaranteed to be greater than zero.
             So we can never get a null pointer back from mmap.  */
          assert (mem != NULL);

          /* Place the thread descriptor at the end of the stack.  */
#if TLS_TCB_AT_TP
          pd = (struct pthread *) ((((uintptr_t) mem + size)
                                    - TLS_TCB_SIZE)
                                   & ~__static_tls_align_m1);
#elif TLS_DTV_AT_TP
          pd = (struct pthread *) ((((uintptr_t) mem + size
                                    - __static_tls_size)
                                    & ~__static_tls_align_m1)
                                   - TLS_PRE_TCB_SIZE);
#endif

          /* Now mprotect the required region excluding the guard area.  */
          if (__glibc_likely (guardsize > 0))
            {
              char *guard = guard_position (mem, size, guardsize, pd,
                                            pagesize_m1);
              if (setup_stack_prot (mem, size, guard, guardsize, prot) != 0)
                {
                  __munmap (mem, size);
                  return errno;
                }
            }

          /* Remember the stack-related values.  */
          pd->stackblock = mem;
          pd->stackblock_size = size;
          /* Record the just-established guard size so the guard-resize
             code below does not issue a redundant mprotect.  */
          pd->guardsize = guardsize;

          /* We allocated the first block of the thread-specific data
             array.  This address will not change for the lifetime of
             this descriptor.  */
          pd->specific[0] = pd->specific_1stblock;

          /* This is at least the second thread.  */
          pd->header.multiple_threads = 1;
#ifndef TLS_MULTIPLE_THREADS_IN_TCB
          __pthread_multiple_threads = *__libc_multiple_threads_ptr = 1;
#endif

#ifdef NEED_DL_SYSINFO
          SETUP_THREAD_SYSINFO (pd);
#endif

          /* Don't allow setxid until cloned.  */
          pd->setxid_futex = -1;

          /* Allocate the DTV for this thread.  */
          if (_dl_allocate_tls (TLS_TPADJ (pd)) == NULL)
            {
              /* Something went wrong.  */
              assert (errno == ENOMEM);

              /* Free the stack memory we just allocated.  */
              (void) __munmap (mem, size);

              return errno;
            }


          /* Prepare to modify global data.  */
          lll_lock (stack_cache_lock, LLL_PRIVATE);

          /* And add to the list of stacks in use.  */
          stack_list_add (&pd->list, &stack_used);

          lll_unlock (stack_cache_lock, LLL_PRIVATE);


          /* There might have been a race.  Another thread might have
             caused the stacks to get exec permission while this new
             stack was prepared.  Detect if this was possible and
             change the permission if necessary.  */
          if (__builtin_expect ((GL(dl_stack_flags) & PF_X) != 0
                                && (prot & PROT_EXEC) == 0, 0))
            {
              int err = change_stack_perm (pd
#ifdef NEED_SEPARATE_REGISTER_STACK
                                           , ~pagesize_m1
#endif
                                           );
              if (err != 0)
                {
                  /* Free the stack memory we just allocated.  */
                  (void) __munmap (mem, size);

                  return err;
                }
            }


          /* Note that all of the stack and the thread descriptor is
             zeroed.  This means we do not have to initialize fields
             with initial value zero.  This is specifically true for
             the 'tid' field which is always set back to zero once the
             stack is not used anymore and for the 'guardsize' field
             which will be read next.  */
        }

      /* Create or resize the guard area if necessary.  */
      if (__glibc_unlikely (guardsize > pd->guardsize))
        {
          char *guard = guard_position (mem, size, guardsize, pd,
                                        pagesize_m1);
          if (__mprotect (guard, guardsize, PROT_NONE) != 0)
            {
            mprot_error:
              lll_lock (stack_cache_lock, LLL_PRIVATE);

              /* Remove the thread from the list.  */
              stack_list_del (&pd->list);

              lll_unlock (stack_cache_lock, LLL_PRIVATE);

              /* Get rid of the TLS block we allocated.  */
              _dl_deallocate_tls (TLS_TPADJ (pd), false);

              /* Free the stack memory regardless of whether the size
                 of the cache is over the limit or not.  If this piece
                 of memory caused problems we had better not use it
                 anymore.  Possible errors are ignored; there is
                 nothing we could do about them.  */
              (void) __munmap (mem, size);

              return errno;
            }

          pd->guardsize = guardsize;
        }
      else if (__builtin_expect (pd->guardsize - guardsize > size - reqsize,
                                 0))
        {
          /* The old guard area is too large.  */

#ifdef NEED_SEPARATE_REGISTER_STACK
          char *guard = mem + (((size - guardsize) / 2) & ~pagesize_m1);
          char *oldguard = mem + (((size - pd->guardsize) / 2) & ~pagesize_m1);

          if (oldguard < guard
              && __mprotect (oldguard, guard - oldguard, prot) != 0)
            goto mprot_error;

          if (__mprotect (guard + guardsize,
                          oldguard + pd->guardsize - guard - guardsize,
                          prot) != 0)
            goto mprot_error;
#elif _STACK_GROWS_DOWN
          if (__mprotect ((char *) mem + guardsize, pd->guardsize - guardsize,
                          prot) != 0)
            goto mprot_error;
#elif _STACK_GROWS_UP
          char *new_guard = (char *)(((uintptr_t) pd - guardsize)
                                     & ~pagesize_m1);
          char *old_guard = (char *)(((uintptr_t) pd - pd->guardsize)
                                     & ~pagesize_m1);
          /* The guard size difference might be > 0, but once rounded
             to the nearest page the size difference might be zero.  */
          if (new_guard > old_guard
              && __mprotect (old_guard, new_guard - old_guard, prot) != 0)
            goto mprot_error;
#endif

          pd->guardsize = guardsize;
        }
      /* pthread_getattr_np() must report the guard size that was
         requested in the attribute, regardless of how large the guard
         actually in use is.  */
      pd->reported_guardsize = guardsize;
    }

  /* Initialize the lock.  We have to do this unconditionally since the
     stillborn thread could be canceled while the lock is taken.  */
  pd->lock = LLL_LOCK_INITIALIZER;

  /* The robust mutex lists also need to be initialized
     unconditionally because the cleanup for the previous stack owner
     might have happened in the kernel.  */
  pd->robust_head.futex_offset = (offsetof (pthread_mutex_t, __data.__lock)
                                  - offsetof (pthread_mutex_t,
                                              __data.__list.__next));
  pd->robust_head.list_op_pending = NULL;
#if __PTHREAD_MUTEX_HAVE_PREV
  pd->robust_prev = &pd->robust_head;
#endif
  pd->robust_head.list = &pd->robust_head;

  /* We place the thread descriptor at the end of the stack.  */
  *pdp = pd;

#if _STACK_GROWS_DOWN
  void *stacktop;

# if TLS_TCB_AT_TP
  /* The stack begins before the TCB and the static TLS block.  */
  stacktop = ((char *) (pd + 1) - __static_tls_size);
# elif TLS_DTV_AT_TP
  stacktop = (char *) (pd - 1);
# endif

# ifdef NEED_SEPARATE_REGISTER_STACK
  *stack = pd->stackblock;
  *stacksize = stacktop - *stack;
# else
  *stack = stacktop;
# endif
#else
  *stack = pd->stackblock;
#endif

  return 0;
}


void
__deallocate_stack (struct pthread *pd)
{
  lll_lock (stack_cache_lock, LLL_PRIVATE);

  /* Remove the thread from the list of threads with user defined
     stacks.  */
  stack_list_del (&pd->list);

  /* Not much to do.  Just free the mmap()ed memory.  Note that we do
     not reset the 'used' flag in the 'tid' field.  This is done by
     the kernel.  If no thread has been created yet this field is
     still zero.  */
  if (__glibc_likely (! pd->user_stack))
    (void) queue_stack (pd);
  else
    /* Free the memory associated with the ELF TLS.  */
    _dl_deallocate_tls (TLS_TPADJ (pd), false);

  lll_unlock (stack_cache_lock, LLL_PRIVATE);
}


int
__make_stacks_executable (void **stack_endp)
{
  /* First the main thread's stack.  */
  int err = _dl_make_stack_executable (stack_endp);
  if (err != 0)
    return err;

#ifdef NEED_SEPARATE_REGISTER_STACK
  const size_t pagemask = ~(__getpagesize () - 1);
#endif

  lll_lock (stack_cache_lock, LLL_PRIVATE);

  list_t *runp;
  list_for_each (runp, &stack_used)
    {
      err = change_stack_perm (list_entry (runp, struct pthread, list)
#ifdef NEED_SEPARATE_REGISTER_STACK
                               , pagemask
#endif
                               );
      if (err != 0)
        break;
    }

  /* Also change the permission for the currently unused stacks.  This
     might be wasted time but better spend it here than adding a check
     in the fast path.  */
  if (err == 0)
    list_for_each (runp, &stack_cache)
      {
        err = change_stack_perm (list_entry (runp, struct pthread, list)
#ifdef NEED_SEPARATE_REGISTER_STACK
                                 , pagemask
#endif
                                 );
        if (err != 0)
          break;
      }

  lll_unlock (stack_cache_lock, LLL_PRIVATE);

  return err;
}


/* In case of a fork() call the memory allocation in the child will be
   the same but only one thread is running.  All stacks except that of
   the one running thread are not used anymore.  We have to recycle
   them.  */
void
__reclaim_stacks (void)
{
  struct pthread *self = (struct pthread *) THREAD_SELF;

  /* No locking necessary.  The calling thread is the only one running.
     But we have to be aware that we might have interrupted a list
     operation.  */

  if (in_flight_stack != 0)
    {
      bool add_p = in_flight_stack & 1;
      list_t *elem = (list_t *) (in_flight_stack & ~(uintptr_t) 1);

      if (add_p)
        {
          /* We always add at the beginning of the list.  So in this case we
             only need to check the beginning of these lists to see if the
             pointers at the head of the list are inconsistent.  */
          list_t *l = NULL;

          if (stack_used.next->prev != &stack_used)
            l = &stack_used;
          else if (stack_cache.next->prev != &stack_cache)
            l = &stack_cache;

          if (l != NULL)
            {
              assert (l->next->prev == elem);
              elem->next = l->next;
              elem->prev = l;
              l->next = elem;
            }
        }
      else
        {
          /* We can simply always replay the delete operation.  */
          elem->next->prev = elem->prev;
          elem->prev->next = elem->next;
        }
    }

  /* Mark all stacks except the still running one as free.  */
  list_t *runp;
  list_for_each (runp, &stack_used)
    {
      struct pthread *curp = list_entry (runp, struct pthread, list);
      if (curp != self)
        {
          /* This marks the stack as free.  */
          curp->tid = 0;

          /* Account for the size of the stack.  */
          stack_cache_actsize += curp->stackblock_size;

          if (curp->specific_used)
            {
              /* Clear the thread-specific data.  */
              memset (curp->specific_1stblock, '\0',
                      sizeof (curp->specific_1stblock));

              curp->specific_used = false;

              for (size_t cnt = 1; cnt < PTHREAD_KEY_1STLEVEL_SIZE; ++cnt)
                if (curp->specific[cnt] != NULL)
                  {
                    memset (curp->specific[cnt], '\0',
                            sizeof (curp->specific_1stblock));

                    /* We have allocated the block which we do not
                       free here so re-set the bit.  */
                    curp->specific_used = true;
                  }
            }
        }
    }

  /* Add the stacks of all running threads to the cache.  */
  list_splice (&stack_used, &stack_cache);

  /* Remove the entry for the current thread from the cache list
     and add it to the list of running threads.  Which of the two
     lists it goes on is decided by the user_stack flag.  */
  stack_list_del (&self->list);

  /* Re-initialize the lists for all the threads.  */
  INIT_LIST_HEAD (&stack_used);
  INIT_LIST_HEAD (&__stack_user);

  if (__glibc_unlikely (THREAD_GETMEM (self, user_stack)))
    list_add (&self->list, &__stack_user);
  else
    list_add (&self->list, &stack_used);

  /* There is one thread running.  */
  __nptl_nthreads = 1;

  in_flight_stack = 0;

  /* Initialize locks.  */
  stack_cache_lock = LLL_LOCK_INITIALIZER;
  __default_pthread_attr_lock = LLL_LOCK_INITIALIZER;
}


static void
setxid_mark_thread (struct xid_command *cmdp, struct pthread *t)
{
  int ch;

  /* Wait until this thread is cloned.  */
  if (t->setxid_futex == -1
      && ! atomic_compare_and_exchange_bool_acq (&t->setxid_futex, -2, -1))
    do
      futex_wait_simple (&t->setxid_futex, -2, FUTEX_PRIVATE);
    while (t->setxid_futex == -2);

  /* Don't let the thread exit before the setxid handler runs.  */
  t->setxid_futex = 0;

  do
    {
      ch = t->cancelhandling;

      /* If the thread is exiting right now, ignore it.  */
      if ((ch & EXITING_BITMASK) != 0)
        {
          /* Release the futex if there is no other setxid in
             progress.  */
          if ((ch & SETXID_BITMASK) == 0)
            {
              t->setxid_futex = 1;
              futex_wake (&t->setxid_futex, 1, FUTEX_PRIVATE);
            }
          return;
        }
    }
  while (atomic_compare_and_exchange_bool_acq (&t->cancelhandling,
                                               ch | SETXID_BITMASK, ch));
}


static void
setxid_unmark_thread (struct xid_command *cmdp, struct pthread *t)
{
  int ch;

  do
    {
      ch = t->cancelhandling;
      if ((ch & SETXID_BITMASK) == 0)
        return;
    }
  while (atomic_compare_and_exchange_bool_acq (&t->cancelhandling,
                                               ch & ~SETXID_BITMASK, ch));

  /* Release the futex just in case.  */
  t->setxid_futex = 1;
  futex_wake (&t->setxid_futex, 1, FUTEX_PRIVATE);
}


static int
setxid_signal_thread (struct xid_command *cmdp, struct pthread *t)
{
  if ((t->cancelhandling & SETXID_BITMASK) == 0)
    return 0;

  int val;
  pid_t pid = __getpid ();
  val = INTERNAL_SYSCALL_CALL (tgkill, pid, t->tid, SIGSETXID);

  /* If this failed, the thread is either not started yet or has
     already exited.  */
  if (!INTERNAL_SYSCALL_ERROR_P (val))
    {
      atomic_increment (&cmdp->cntr);
      return 1;
    }
  else
    return 0;
}

/* Check for consistency across set*id system call results.  The abort
   should not happen as long as all privilege changes happen through
   the glibc wrappers.  ERROR must be 0 (no error) or an errno
   code.  */
void
attribute_hidden
__nptl_setxid_error (struct xid_command *cmdp, int error)
{
  do
    {
      int olderror = cmdp->error;
      if (olderror == error)
        break;
      if (olderror != -1)
        {
          /* Mismatch between current and previous results.  Save the
             error value to memory so that it is not clobbered by the
             abort function and is preserved in core dumps.  */
          volatile int xid_err __attribute__((unused)) = error;
          abort ();
        }
    }
  while (atomic_compare_and_exchange_bool_acq (&cmdp->error, error, -1));
}

int
attribute_hidden
__nptl_setxid (struct xid_command *cmdp)
{
  int signalled;
  int result;
  lll_lock (stack_cache_lock, LLL_PRIVATE);

  __xidcmd = cmdp;
  cmdp->cntr = 0;
  cmdp->error = -1;

  struct pthread *self = THREAD_SELF;

  /* Iterate over the list with system-allocated threads first.  */
  list_t *runp;
  list_for_each (runp, &stack_used)
    {
      struct pthread *t = list_entry (runp, struct pthread, list);
      if (t == self)
        continue;

      setxid_mark_thread (cmdp, t);
    }

  /* Now the list with threads using user-allocated stacks.  */
  list_for_each (runp, &__stack_user)
    {
      struct pthread *t = list_entry (runp, struct pthread, list);
      if (t == self)
        continue;

      setxid_mark_thread (cmdp, t);
    }

  /* Iterate until we don't succeed in signalling anyone.  That means
     we have gotten all running threads, and their children will be
     automatically correct once started.  */
  do
    {
      signalled = 0;

      list_for_each (runp, &stack_used)
        {
          struct pthread *t = list_entry (runp, struct pthread, list);
          if (t == self)
            continue;

          signalled += setxid_signal_thread (cmdp, t);
        }

      list_for_each (runp, &__stack_user)
        {
          struct pthread *t = list_entry (runp, struct pthread, list);
          if (t == self)
            continue;

          signalled += setxid_signal_thread (cmdp, t);
        }

      int cur = cmdp->cntr;
      while (cur != 0)
        {
          futex_wait_simple ((unsigned int *) &cmdp->cntr, cur,
                             FUTEX_PRIVATE);
          cur = cmdp->cntr;
        }
    }
  while (signalled != 0);

  /* Clean up flags, so that no thread blocks during exit waiting
     for a signal which will never come.  */
  list_for_each (runp, &stack_used)
    {
      struct pthread *t = list_entry (runp, struct pthread, list);
      if (t == self)
        continue;

      setxid_unmark_thread (cmdp, t);
    }

  list_for_each (runp, &__stack_user)
    {
      struct pthread *t = list_entry (runp, struct pthread, list);
      if (t == self)
        continue;

      setxid_unmark_thread (cmdp, t);
    }

  /* This must be last, otherwise the current thread might not have
     permission to send the SIGSETXID signal to the other threads.  */
  result = INTERNAL_SYSCALL_NCS (cmdp->syscall_no, 3,
                                 cmdp->id[0], cmdp->id[1], cmdp->id[2]);
  int error = 0;
  if (__glibc_unlikely (INTERNAL_SYSCALL_ERROR_P (result)))
    {
      error = INTERNAL_SYSCALL_ERRNO (result);
      __set_errno (error);
      result = -1;
    }
  __nptl_setxid_error (cmdp, error);

  lll_unlock (stack_cache_lock, LLL_PRIVATE);
  return result;
}

static inline void __attribute__((always_inline))
init_one_static_tls (struct pthread *curp, struct link_map *map)
{
# if TLS_TCB_AT_TP
  void *dest = (char *) curp - map->l_tls_offset;
# elif TLS_DTV_AT_TP
  void *dest = (char *) curp + map->l_tls_offset + TLS_PRE_TCB_SIZE;
# else
# error "Either TLS_TCB_AT_TP or TLS_DTV_AT_TP must be defined"
# endif

  /* Initialize the memory.  */
  memset (__mempcpy (dest, map->l_tls_initimage, map->l_tls_initimage_size),
          '\0', map->l_tls_blocksize - map->l_tls_initimage_size);
}

void
attribute_hidden
__pthread_init_static_tls (struct link_map *map)
{
  lll_lock (stack_cache_lock, LLL_PRIVATE);

  /* Iterate over the list with system-allocated threads first.  */
  list_t *runp;
  list_for_each (runp, &stack_used)
    init_one_static_tls (list_entry (runp, struct pthread, list), map);

  /* Now the list with threads using user-allocated stacks.  */
  list_for_each (runp, &__stack_user)
    init_one_static_tls (list_entry (runp, struct pthread, list), map);

  lll_unlock (stack_cache_lock, LLL_PRIVATE);
}


void
attribute_hidden
__wait_lookup_done (void)
{
  lll_lock (stack_cache_lock, LLL_PRIVATE);

  struct pthread *self = THREAD_SELF;

  /* Iterate over the list with system-allocated threads first.  */
  list_t *runp;
  list_for_each (runp, &stack_used)
    {
      struct pthread *t = list_entry (runp, struct pthread, list);
      if (t == self || t->header.gscope_flag == THREAD_GSCOPE_FLAG_UNUSED)
        continue;

      int *const gscope_flagp = &t->header.gscope_flag;

      /* We have to wait until this thread is done with the global
         scope.  First tell the thread that we are waiting and
         possibly have to be woken.  */
      if (atomic_compare_and_exchange_bool_acq (gscope_flagp,
                                                THREAD_GSCOPE_FLAG_WAIT,
                                                THREAD_GSCOPE_FLAG_USED))
        continue;

      do
        futex_wait_simple ((unsigned int *) gscope_flagp,
                           THREAD_GSCOPE_FLAG_WAIT, FUTEX_PRIVATE);
      while (*gscope_flagp == THREAD_GSCOPE_FLAG_WAIT);
    }

  /* Now the list with threads using user-allocated stacks.  */
  list_for_each (runp, &__stack_user)
    {
      struct pthread *t = list_entry (runp, struct pthread, list);
      if (t == self || t->header.gscope_flag == THREAD_GSCOPE_FLAG_UNUSED)
        continue;

      int *const gscope_flagp = &t->header.gscope_flag;

      /* We have to wait until this thread is done with the global
         scope.  First tell the thread that we are waiting and
         possibly have to be woken.  */
      if (atomic_compare_and_exchange_bool_acq (gscope_flagp,
                                                THREAD_GSCOPE_FLAG_WAIT,
                                                THREAD_GSCOPE_FLAG_USED))
        continue;

      do
        futex_wait_simple ((unsigned int *) gscope_flagp,
                           THREAD_GSCOPE_FLAG_WAIT, FUTEX_PRIVATE);
      while (*gscope_flagp == THREAD_GSCOPE_FLAG_WAIT);
    }

  lll_unlock (stack_cache_lock, LLL_PRIVATE);
}