/* Copyright (C) 2003-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Martin Schwidefsky <schwidefsky@de.ibm.com>, 2003.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <errno.h>
#include <sysdep.h>
#include <futex-internal.h>
#include <pthreadP.h>
#include <shlib-compat.h>


/* Wait on the barrier.

   In each round, we wait for a fixed number of threads to enter the barrier
   (COUNT).  Once that has happened, exactly these threads are allowed to
   leave the barrier.  Note that POSIX does not require that only COUNT
   threads can attempt to block using the barrier concurrently.

   We count the number of threads that have entered (IN).  Each thread
   increments IN when entering, thus getting a position in the sequence of
   threads that are or have been waiting (starting with 1, so the position
   is the number of threads that have entered so far including the current
   thread).
   CURRENT_ROUND designates the position of the most recent thread whose
   round has been detected as complete.  When a thread detects that enough
   threads have entered to make a round complete, it finishes this round by
   effectively adding COUNT to CURRENT_ROUND atomically.  Threads that
   believe their round is not yet complete wait until CURRENT_ROUND is at
   least as large as their position.
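
   For example, with COUNT == 3: the first three threads to enter get
   positions 1, 2, and 3; the thread at position 3 completes the first round
   by advancing CURRENT_ROUND from 0 to 3, which releases all three.
   Positions 4 to 6 form the second round, which is complete once
   CURRENT_ROUND reaches 6, and so on.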

   A barrier can be destroyed as soon as no threads are blocked on the
   barrier.  This is already the case if just one thread from the last round
   has stopped waiting and returned to the caller (the assumption is that
   all threads from the round are unblocked atomically, even though they may
   return at different times from the respective calls to
   pthread_barrier_wait).  Thus, a valid call to pthread_barrier_destroy can
   be concurrent with other threads still figuring out that their round has
   been completed.  Therefore, threads need to confirm that they have left
   the barrier by incrementing OUT, and pthread_barrier_destroy needs to wait
   until OUT equals IN.

   To avoid an ABA issue for futex_wait on CURRENT_ROUND and for archs with
   32b-only atomics, we additionally reset the barrier when IN reaches
   a threshold to avoid overflow.  We assume that the total number of threads
   is less than UINT_MAX/2, and set the threshold accordingly so that we can
   use a simple atomic_fetch_add on IN instead of a CAS when entering.  The
   threshold is always set to the end of a round, so all threads that have
   entered are either pre-reset threads or post-reset threads (i.e., have a
   position larger than the threshold).
   Pre-reset threads just run the algorithm explained above.  Post-reset
   threads wait until IN is reset to a pre-threshold value.
   When the last pre-reset thread leaves the barrier (i.e., OUT equals the
   threshold), it resets the barrier to its initial state.  Other (post-reset)
   threads wait for the reset to have finished by waiting until IN is less
   than the threshold and then restart by trying to enter the barrier again.
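
   For example, if the threshold were 10 with COUNT == 3 (hypothetical small
   numbers; the real threshold is derived from UINT_MAX), rounds would be
   cut off at position 9, the end of the third round: threads at positions 1
   to 9 are pre-reset threads, and a thread that gets position 10 or higher
   blocks until the reset has stored 0 to IN and then re-enters.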

   We reuse the reset mechanism in pthread_barrier_destroy to get notified
   when all threads have left the barrier: We trigger an artificial reset and
   wait for the last pre-reset thread to finish reset, thus notifying the
   thread that is about to destroy the barrier.

   Blocking using futexes is straightforward: pre-reset threads wait for
   completion of their round using CURRENT_ROUND as futex word, and post-reset
   threads and pthread_barrier_destroy use IN as futex word.

   Further notes:
   * It is not simple to let some of the post-reset threads help with the
     reset because of the ABA issues that arise; therefore, we simply make
     the last thread to leave responsible for the reset.
   * POSIX leaves it unspecified whether a signal handler running in a thread
     that has been unblocked (because its round is complete) can stall all
     other threads and prevent them from returning from the barrier.  In this
     implementation, other threads will return.  However,
     pthread_barrier_destroy will of course wait for the signal handler thread
     to confirm that it left the barrier.

   TODO We should add spinning with back-off.  Once we do that, we could also
   try to avoid the futex_wake syscall when a round is detected as finished.
   If we do not spin, it is quite likely that at least some other threads will
   have called futex_wait already.  */
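
/* An illustrative usage sketch (not part of this file): a program can use
   the PTHREAD_BARRIER_SERIAL_THREAD return value to elect exactly one
   thread per round for cleanup.  As explained above, destroying the barrier
   right after one's own round completed is valid here because
   pthread_barrier_destroy waits until OUT equals IN:

     static pthread_barrier_t b;  // initialized with
                                  // pthread_barrier_init (&b, NULL, nthreads)

     static void *
     worker (void *arg)
     {
       // ... per-thread work ...
       if (pthread_barrier_wait (&b) == PTHREAD_BARRIER_SERIAL_THREAD)
         pthread_barrier_destroy (&b);
       return NULL;
     }  */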
int
___pthread_barrier_wait (pthread_barrier_t *barrier)
{
  struct pthread_barrier *bar = (struct pthread_barrier *) barrier;

  /* How many threads entered so far, including ourselves.  */
  unsigned int i;

 reset_restart:
  /* Try to enter the barrier.  We need acquire MO (1) to ensure that if we
     observe that our round can be completed (see below for our attempt to do
     so), all pre-barrier-entry effects of all threads in our round happen
     before us completing the round, and (2) to make our use of the barrier
     happen after a potential reset.  We need release MO to make sure that our
     pre-barrier-entry effects happen before threads in this round leaving the
     barrier.  */
  i = atomic_fetch_add_acq_rel (&bar->in, 1) + 1;
  /* These loads are after the fetch_add so that we're less likely to first
     pull in the cache line as shared.  */
  unsigned int count = bar->count;
  /* This is the number of threads that can enter before we need to reset.
     It is always the end of a round.  */
  unsigned int max_in_before_reset = BARRIER_IN_THRESHOLD
                                     - BARRIER_IN_THRESHOLD % count;
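  /* For example, with a hypothetical BARRIER_IN_THRESHOLD of 1000 and
     COUNT == 7, MAX_IN_BEFORE_RESET is 1000 - 1000 % 7 == 994, the end of
     round 142.  */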

  if (i > max_in_before_reset)
    {
      /* We're in a reset round.  Just wait for a reset to finish; do not
         help finishing previous rounds because this could happen
         concurrently with a reset.  */
      while (i > max_in_before_reset)
        {
          futex_wait_simple (&bar->in, i, bar->shared);
          /* Relaxed MO is fine here because we just need an indication for
             when we should retry to enter (which will use acquire MO, see
             above).  */
          i = atomic_load_relaxed (&bar->in);
        }
      goto reset_restart;
    }

  /* Look at the current round.  At this point, we are just interested in
     whether we can complete rounds, based on the information we obtained
     through our acquire-MO load of IN.  Nonetheless, if we notice that
     our round has been completed using this load, we use the acquire-MO
     fence below to make sure that all pre-barrier-entry effects of all
     threads in our round happen before us leaving the barrier.  Therefore,
     relaxed MO is sufficient.  */
  unsigned int cr = atomic_load_relaxed (&bar->current_round);

  /* Try to finish previous rounds and/or the current round.  We simply
     consider just our position here and do not try to do the work of threads
     that entered more recently.  */
  while (cr + count <= i)
    {
      /* Calculate the new current round based on how many threads entered.
         NEWCR must be larger than CR because CR+COUNT ends a round.  */
      unsigned int newcr = i - i % count;
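      /* For example, with COUNT == 3 and I == 8, NEWCR == 6: the rounds
         ending at positions 3 and 6 are then complete.  */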
      /* Try to complete previous and/or the current round.  We need release
         MO to propagate the happens-before that we observed through reading
         with acquire MO from IN to other threads.  If the CAS fails, it
         is like the relaxed-MO load of CURRENT_ROUND above.  */
      if (atomic_compare_exchange_weak_release (&bar->current_round, &cr,
                                                newcr))
        {
          /* Update CR with the modification we just did.  */
          cr = newcr;
          /* Wake threads belonging to the rounds we just finished.  We may
             wake more threads than necessary if more than COUNT threads try
             to block concurrently on the barrier, but this is not a typical
             use of barriers.
             Note that we can still access SHARED because we haven't yet
             confirmed to have left the barrier.  */
          futex_wake (&bar->current_round, INT_MAX, bar->shared);
          /* We did as much as we could based on our position.  If we advanced
             the current round to a round sufficient for us, do not wait for
             that to happen and skip the acquire fence (we already
             synchronize-with all other threads in our round through the
             initial acquire MO fetch_add of IN).  */
          if (i <= cr)
            goto ready_to_leave;
          else
            break;
        }
    }
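
  /* Example trace (COUNT == 3): a thread with position I == 7 that read
     CR == 3 observes 3 + 3 <= 7, advances CURRENT_ROUND to NEWCR == 6, and
     wakes the waiters of that round; because 7 > 6, its own round is not
     complete yet, so it continues below and waits.  */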

  /* Wait until the current round is more recent than the round we are in.  */
  while (i > cr)
    {
      /* Wait for the current round to finish.  */
      futex_wait_simple (&bar->current_round, cr, bar->shared);
      /* See the fence below.  */
      cr = atomic_load_relaxed (&bar->current_round);
    }
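  /* (A single futex_wait is not enough: it can also return because an
     intermediate round completed and woke all waiters, or spuriously, which
     is why we recheck CURRENT_ROUND in a loop.)  */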

  /* Our round finished.  Use the acquire MO fence to synchronize-with the
     thread that finished the round, either through the initial load of
     CURRENT_ROUND above or a failed CAS in the loop above.  */
  atomic_thread_fence_acquire ();

  /* Now signal that we left.  */
  unsigned int o;
 ready_to_leave:
  /* We need release MO here so that our use of the barrier happens before
     reset or memory reuse after pthread_barrier_destroy.  */
  o = atomic_fetch_add_release (&bar->out, 1) + 1;
  if (o == max_in_before_reset)
    {
      /* Perform a reset if we are the last pre-reset thread leaving.  All
         other threads accessing the barrier are post-reset threads and are
         incrementing or spinning on IN.  Thus, resetting IN as the last step
         of reset ensures that the reset is not concurrent with actual use of
         the barrier.  We need the acquire MO fence so that the reset happens
         after use of the barrier by all earlier pre-reset threads.  */
      atomic_thread_fence_acquire ();
      atomic_store_relaxed (&bar->current_round, 0);
      atomic_store_relaxed (&bar->out, 0);
      /* When destroying the barrier, we wait for a reset to happen.  Thus,
         we must load SHARED now so that this happens before the barrier is
         destroyed.  */
      int shared = bar->shared;
      atomic_store_release (&bar->in, 0);
      futex_wake (&bar->in, INT_MAX, shared);
    }

  /* Return a special value for exactly one thread per round.  */
  return i % count == 0 ? PTHREAD_BARRIER_SERIAL_THREAD : 0;
}
versioned_symbol (libc, ___pthread_barrier_wait, pthread_barrier_wait,
                  GLIBC_2_34);
libc_hidden_ver (___pthread_barrier_wait, __pthread_barrier_wait)
#ifndef SHARED
strong_alias (___pthread_barrier_wait, __pthread_barrier_wait)
#endif

#if OTHER_SHLIB_COMPAT (libpthread, GLIBC_2_2, GLIBC_2_34)
compat_symbol (libpthread, ___pthread_barrier_wait, pthread_barrier_wait,
               GLIBC_2_2);
#endif