1 | /* Copyright (C) 2003-2023 Free Software Foundation, Inc. |
2 | This file is part of the GNU C Library. |
3 | |
4 | The GNU C Library is free software; you can redistribute it and/or |
5 | modify it under the terms of the GNU Lesser General Public |
6 | License as published by the Free Software Foundation; either |
7 | version 2.1 of the License, or (at your option) any later version. |
8 | |
9 | The GNU C Library is distributed in the hope that it will be useful, |
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
12 | Lesser General Public License for more details. |
13 | |
14 | You should have received a copy of the GNU Lesser General Public |
15 | License along with the GNU C Library; if not, see |
16 | <https://www.gnu.org/licenses/>. */ |
17 | |
18 | #include <errno.h> |
19 | #include <sysdep.h> |
20 | #include <futex-internal.h> |
21 | #include <pthreadP.h> |
22 | #include <shlib-compat.h> |
23 | |
24 | |
25 | /* Wait on the barrier. |
26 | |
27 | In each round, we wait for a fixed number of threads to enter the barrier |
28 | (COUNT). Once that has happened, exactly these threads are allowed to |
29 | leave the barrier. Note that POSIX does not require that only COUNT |
30 | threads can attempt to block using the barrier concurrently. |
31 | |
32 | We count the number of threads that have entered (IN). Each thread |
33 | increments IN when entering, thus getting a position in the sequence of |
34 | threads that are or have been waiting (starting with 1, so the position |
35 | is the number of threads that have entered so far including the current |
36 | thread). |
   CURRENT_ROUND designates the position of the most recent thread whose
   round has been detected as complete.  When a thread detects that enough
   threads have entered to make a round complete, it finishes this round by
   effectively adding COUNT to CURRENT_ROUND atomically.  Threads that
   believe that their round is not complete yet wait until CURRENT_ROUND is
   at least as large as their position.
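
   As a concrete example (illustrative numbers): with COUNT == 2 and
   CURRENT_ROUND == 0, threads A, B, and C enter and get positions 1, 2,
   and 3.  Any thread whose position is at least 2 can detect that the
   first round is complete and advance CURRENT_ROUND to 2, which releases
   A and B; C, at position 3, keeps waiting until CURRENT_ROUND reaches at
   least 3 (i.e., becomes 4, the end of the second round).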
43 | |
44 | A barrier can be destroyed as soon as no threads are blocked on the |
45 | barrier. This is already the case if just one thread from the last round |
   has stopped waiting and returned to the caller; the assumption is that
   all threads from the round are unblocked atomically, even though they may
   return at different times from their respective calls to
   pthread_barrier_wait.  Thus, a valid call to pthread_barrier_destroy can
50 | be concurrent with other threads still figuring out that their round has |
51 | been completed. Therefore, threads need to confirm that they have left |
52 | the barrier by incrementing OUT, and pthread_barrier_destroy needs to wait |
53 | until OUT equals IN. |
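
   For example (hypothetical schedule): with COUNT == 2, once CURRENT_ROUND
   reaches 2, the threads at positions 1 and 2 are both logically unblocked.
   If thread 2 returns first and the program immediately calls
   pthread_barrier_destroy, thread 1 may still be executing inside
   pthread_barrier_wait; destruction (and reuse of the memory) is therefore
   delayed until thread 1 has incremented OUT as well.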
54 | |
   To avoid an ABA issue for futex_wait on CURRENT_ROUND and for
   architectures with only 32-bit atomics, we additionally reset the barrier
   when IN reaches a threshold, which avoids overflow.  We assume that the
   total number of threads is less than UINT_MAX/2, and set the threshold
   accordingly so that we can use a simple atomic_fetch_add on IN instead of
   a CAS when entering.  The threshold is always set to the end of a round,
   so every thread that has entered is either a pre-reset thread (position
   at most the threshold) or a post-reset thread (position larger than the
   threshold).
63 | Pre-reset threads just run the algorithm explained above. Post-reset |
64 | threads wait until IN is reset to a pre-threshold value. |
65 | When the last pre-reset thread leaves the barrier (i.e., OUT equals the |
66 | threshold), it resets the barrier to its initial state. Other (post-reset) |
67 | threads wait for the reset to have finished by waiting until IN is less |
68 | than the threshold and then restart by trying to enter the barrier again. |
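
   For example (illustrative numbers): suppose COUNT == 3 and the effective
   reset point is position 12.  Threads with positions 13 and up are
   post-reset and block on IN.  The thread whose departure makes OUT reach
   12 resets CURRENT_ROUND and OUT to 0, then stores 0 to IN and wakes the
   blocked threads, which retry entering and obtain positions starting from
   1 again.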
69 | |
   We reuse the reset mechanism in pthread_barrier_destroy to get notified
   when all threads have left the barrier: we trigger an artificial reset
   and wait for the last pre-reset thread to finish the reset, which
   notifies the thread that is about to destroy the barrier.
74 | |
75 | Blocking using futexes is straightforward: pre-reset threads wait for |
76 | completion of their round using CURRENT_ROUND as futex word, and post-reset |
77 | threads and pthread_barrier_destroy use IN as futex word. |
78 | |
79 | Further notes: |
80 | * It is not simple to let some of the post-reset threads help with the |
81 | reset because of the ABA issues that arise; therefore, we simply make |
82 | the last thread to leave responsible for the reset. |
83 | * POSIX leaves it unspecified whether a signal handler running in a thread |
84 | that has been unblocked (because its round is complete) can stall all |
85 | other threads and prevent them from returning from the barrier. In this |
86 | implementation, other threads will return. However, |
87 | pthread_barrier_destroy will of course wait for the signal handler thread |
88 | to confirm that it left the barrier. |
89 | |
90 | TODO We should add spinning with back-off. Once we do that, we could also |
91 | try to avoid the futex_wake syscall when a round is detected as finished. |
92 | If we do not spin, it is quite likely that at least some other threads will |
93 | have called futex_wait already. */ |
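
/* For illustration only: a minimal, hypothetical application-side use of
   the function below (worker, NTHREADS, and the compute_* and
   combine_results calls are made-up names; the barrier is assumed to have
   been initialized once with pthread_barrier_init (&b, NULL, NTHREADS)):

     static pthread_barrier_t b;

     static void *
     worker (void *arg)
     {
       compute_phase_one (arg);
       if (pthread_barrier_wait (&b) == PTHREAD_BARRIER_SERIAL_THREAD)
         combine_results ();
       compute_phase_two (arg);
       return arg;
     }

   Exactly one thread per round gets PTHREAD_BARRIER_SERIAL_THREAD (see the
   return statement at the end of the function).  All pre-wait effects of
   all threads in a round happen before any thread of that round returns
   from pthread_barrier_wait; combine_results, however, is not ordered
   before the other threads' phase two without further synchronization.  */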
94 | int |
95 | ___pthread_barrier_wait (pthread_barrier_t *barrier) |
96 | { |
97 | struct pthread_barrier *bar = (struct pthread_barrier *) barrier; |
98 | |
  /* How many threads have entered so far, including ourselves.  */
100 | unsigned int i; |
101 | |
102 | reset_restart: |
  /* Try to enter the barrier.  We need acquire MO to (1) ensure that if we
     observe that our round can be completed (see below for our attempt to do
     so), all pre-barrier-entry effects of all threads in our round happen
     before we complete the round, and (2) make our use of the barrier
     happen after a potential reset.  We need release MO to make sure that
     our pre-barrier-entry effects happen before threads in this round leave
     the barrier.  */
110 | i = atomic_fetch_add_acq_rel (&bar->in, 1) + 1; |
111 | /* These loads are after the fetch_add so that we're less likely to first |
112 | pull in the cache line as shared. */ |
113 | unsigned int count = bar->count; |
114 | /* This is the number of threads that can enter before we need to reset. |
115 | Always at the end of a round. */ |
116 | unsigned int max_in_before_reset = BARRIER_IN_THRESHOLD |
117 | - BARRIER_IN_THRESHOLD % count; |
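  /* Illustrative arithmetic (with a hypothetical threshold of 1000 and
     COUNT == 3): MAX_IN_BEFORE_RESET would be 1000 - 1000 % 3 == 999, the
     largest multiple of COUNT not exceeding the threshold.  */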
118 | |
119 | if (i > max_in_before_reset) |
120 | { |
121 | /* We're in a reset round. Just wait for a reset to finish; do not |
122 | help finishing previous rounds because this could happen |
123 | concurrently with a reset. */ |
124 | while (i > max_in_before_reset) |
125 | { |
126 | futex_wait_simple (&bar->in, i, bar->shared); |
127 | /* Relaxed MO is fine here because we just need an indication for |
128 | when we should retry to enter (which will use acquire MO, see |
129 | above). */ |
130 | i = atomic_load_relaxed (&bar->in); |
131 | } |
132 | goto reset_restart; |
133 | } |
134 | |
  /* Look at the current round.  At this point, we are just interested in
     whether we can complete rounds, based on the information we obtained
     through our acquire-MO fetch_add of IN.  Nonetheless, if we notice that
     our round has been completed using this load, we use the acquire-MO
     fence below to make sure that all pre-barrier-entry effects of all
     threads in our round happen before we leave the barrier.  Therefore,
     relaxed MO is sufficient.  */
  unsigned int cr = atomic_load_relaxed (&bar->current_round);
143 | |
  /* Try to finish previous rounds and/or the current round.  We only
     consider our own position here and do not try to do the work of threads
     that entered more recently.  */
147 | while (cr + count <= i) |
148 | { |
149 | /* Calculate the new current round based on how many threads entered. |
150 | NEWCR must be larger than CR because CR+COUNT ends a round. */ |
151 | unsigned int newcr = i - i % count; |
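      /* For example: with COUNT == 4 and I == 9, NEWCR == 8; this completes
         the round of positions 5..8 but not our own round (9..12), so after
         a successful CAS we would break out below and wait for CURRENT_ROUND
         to reach at least 9.  */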
152 | /* Try to complete previous and/or the current round. We need release |
153 | MO to propagate the happens-before that we observed through reading |
154 | with acquire MO from IN to other threads. If the CAS fails, it |
155 | is like the relaxed-MO load of CURRENT_ROUND above. */ |
156 | if (atomic_compare_exchange_weak_release (&bar->current_round, &cr, |
157 | newcr)) |
158 | { |
159 | /* Update CR with the modification we just did. */ |
160 | cr = newcr; |
161 | /* Wake threads belonging to the rounds we just finished. We may |
162 | wake more threads than necessary if more than COUNT threads try |
163 | to block concurrently on the barrier, but this is not a typical |
164 | use of barriers. |
165 | Note that we can still access SHARED because we haven't yet |
166 | confirmed to have left the barrier. */ |
167 | futex_wake (&bar->current_round, INT_MAX, bar->shared); |
          /* We did as much as we could based on our position.  If we advanced
             the current round to a round sufficient for us, do not wait for
             that to happen and skip the acquire fence (we already
             synchronize-with all other threads in our round through the
             initial acquire-MO fetch_add of IN).  */
173 | if (i <= cr) |
174 | goto ready_to_leave; |
175 | else |
176 | break; |
177 | } |
178 | } |
179 | |
  /* Wait until our round has been completed, i.e., until CURRENT_ROUND is
     at least as large as our position.  */
181 | while (i > cr) |
182 | { |
183 | /* Wait for the current round to finish. */ |
184 | futex_wait_simple (&bar->current_round, cr, bar->shared); |
185 | /* See the fence below. */ |
186 | cr = atomic_load_relaxed (&bar->current_round); |
187 | } |
188 | |
189 | /* Our round finished. Use the acquire MO fence to synchronize-with the |
190 | thread that finished the round, either through the initial load of |
191 | CURRENT_ROUND above or a failed CAS in the loop above. */ |
192 | atomic_thread_fence_acquire (); |
193 | |
194 | /* Now signal that we left. */ |
195 | unsigned int o; |
196 | ready_to_leave: |
197 | /* We need release MO here so that our use of the barrier happens before |
198 | reset or memory reuse after pthread_barrier_destroy. */ |
199 | o = atomic_fetch_add_release (&bar->out, 1) + 1; |
200 | if (o == max_in_before_reset) |
201 | { |
202 | /* Perform a reset if we are the last pre-reset thread leaving. All |
203 | other threads accessing the barrier are post-reset threads and are |
204 | incrementing or spinning on IN. Thus, resetting IN as the last step |
205 | of reset ensures that the reset is not concurrent with actual use of |
206 | the barrier. We need the acquire MO fence so that the reset happens |
207 | after use of the barrier by all earlier pre-reset threads. */ |
208 | atomic_thread_fence_acquire (); |
209 | atomic_store_relaxed (&bar->current_round, 0); |
210 | atomic_store_relaxed (&bar->out, 0); |
211 | /* When destroying the barrier, we wait for a reset to happen. Thus, |
212 | we must load SHARED now so that this happens before the barrier is |
213 | destroyed. */ |
214 | int shared = bar->shared; |
215 | atomic_store_release (&bar->in, 0); |
      futex_wake (&bar->in, INT_MAX, shared);
    }
219 | |
  /* Return a special value for exactly one thread per round: the thread
     whose position is a multiple of COUNT, i.e., the last to enter in its
     round.  */
221 | return i % count == 0 ? PTHREAD_BARRIER_SERIAL_THREAD : 0; |
222 | } |
223 | versioned_symbol (libc, ___pthread_barrier_wait, pthread_barrier_wait, |
224 | GLIBC_2_34); |
225 | libc_hidden_ver (___pthread_barrier_wait, __pthread_barrier_wait) |
226 | #ifndef SHARED |
227 | strong_alias (___pthread_barrier_wait, __pthread_barrier_wait) |
228 | #endif |
229 | |
230 | #if OTHER_SHLIB_COMPAT (libpthread, GLIBC_2_2, GLIBC_2_34) |
231 | compat_symbol (libpthread, ___pthread_barrier_wait, pthread_barrier_wait, |
232 | GLIBC_2_2); |
233 | #endif |
234 | |