/* Copyright (C) 2003-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Martin Schwidefsky <schwidefsky@de.ibm.com>, 2003.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <errno.h>
#include <sysdep.h>
#include <futex-internal.h>
#include <pthreadP.h>
#include <shlib-compat.h>


/* Wait on the barrier.

   In each round, we wait for a fixed number of threads to enter the barrier
   (COUNT).  Once that has happened, exactly these threads are allowed to
   leave the barrier.  Note that POSIX does not require that only COUNT
   threads can attempt to block using the barrier concurrently.

   We count the number of threads that have entered (IN).  Each thread
   increments IN when entering, thus getting a position in the sequence of
   threads that are or have been waiting (starting with 1, so the position
   is the number of threads that have entered so far including the current
   thread).
   CURRENT_ROUND designates the position of the most recent thread whose
   round has been detected as complete.  When a thread detects that enough
   threads have entered to make a round complete, it finishes this round by
   effectively adding COUNT to CURRENT_ROUND atomically.  Threads that
   believe that their round is not complete yet wait until CURRENT_ROUND is
   not smaller than their position anymore.

   A barrier can be destroyed as soon as no threads are blocked on the
   barrier.  This is already the case if just one thread from the last round
   has stopped waiting and returned to the caller (the assumption is that
   all threads from the round are unblocked atomically, even though they may
   return at different times from the respective calls to
   pthread_barrier_wait).  Thus, a valid call to pthread_barrier_destroy can
   be concurrent with other threads still figuring out that their round has
   been completed.  Therefore, threads need to confirm that they have left
   the barrier by incrementing OUT, and pthread_barrier_destroy needs to wait
   until OUT equals IN.

   To avoid an ABA issue for futex_wait on CURRENT_ROUND and for archs with
   32-bit-only atomics, we additionally reset the barrier when IN reaches
   a threshold to avoid overflow.  We assume that the total number of threads
   is less than UINT_MAX/2, and set the threshold accordingly so that we can
   use a simple atomic_fetch_add on IN instead of a CAS when entering.  The
   threshold is always set to the end of a round, so all threads that have
   entered are either pre-reset threads or post-reset threads (i.e., have a
   position larger than the threshold).
   Pre-reset threads just run the algorithm explained above.  Post-reset
   threads wait until IN is reset to a pre-threshold value.
   When the last pre-reset thread leaves the barrier (i.e., OUT equals the
   threshold), it resets the barrier to its initial state.  Other (post-reset)
   threads wait for the reset to have finished by waiting until IN is less
   than the threshold and then restart by trying to enter the barrier again.

   We reuse the reset mechanism in pthread_barrier_destroy to get notified
   when all threads have left the barrier: We trigger an artificial reset and
   wait for the last pre-reset thread to finish the reset, thus notifying the
   thread that is about to destroy the barrier.

   Blocking using futexes is straightforward: pre-reset threads wait for
   completion of their round using CURRENT_ROUND as futex word, and post-reset
   threads and pthread_barrier_destroy use IN as futex word.

   Further notes:
   * It is not simple to let some of the post-reset threads help with the
     reset because of the ABA issues that arise; therefore, we simply make
     the last thread to leave responsible for the reset.
   * POSIX leaves it unspecified whether a signal handler running in a thread
     that has been unblocked (because its round is complete) can stall all
     other threads and prevent them from returning from the barrier.  In this
     implementation, other threads will return.  However,
     pthread_barrier_destroy will of course wait for the signal handler thread
     to confirm that it left the barrier.

   TODO We should add spinning with back-off.  Once we do that, we could also
   try to avoid the futex_wake syscall when a round is detected as finished.
   If we do not spin, it is quite likely that at least some other threads will
   have called futex_wait already.  */
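
/* A minimal, hypothetical usage sketch of the caller-visible behavior
   described above (excluded from compilation; COUNT and the init call are
   assumed to be provided elsewhere): COUNT threads meet at the barrier, and
   exactly one thread per round observes PTHREAD_BARRIER_SERIAL_THREAD.  */
#if 0
#include <pthread.h>

static pthread_barrier_t example_barrier;
/* Initialized elsewhere with
   pthread_barrier_init (&example_barrier, NULL, COUNT).  */

static void *
example_worker (void *arg)
{
  /* Blocks until COUNT threads (including this one) have entered.  */
  int r = pthread_barrier_wait (&example_barrier);
  if (r == PTHREAD_BARRIER_SERIAL_THREAD)
    {
      /* Exactly one thread per round gets the serial return value and
	 could, e.g., perform per-round cleanup here.  */
    }
  return NULL;
}
#endif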
int
___pthread_barrier_wait (pthread_barrier_t *barrier)
{
  struct pthread_barrier *bar = (struct pthread_barrier *) barrier;

  /* How many threads have entered so far, including ourselves.  */
  unsigned int i;

 reset_restart:
  /* Try to enter the barrier.  We need acquire MO to (1) ensure that if we
     observe that our round can be completed (see below for our attempt to do
     so), all pre-barrier-entry effects of all threads in our round happen
     before us completing the round, and (2) to make our use of the barrier
     happen after a potential reset.  We need release MO to make sure that our
     pre-barrier-entry effects happen before threads in this round leaving the
     barrier.  */
  i = atomic_fetch_add_acq_rel (&bar->in, 1) + 1;
  /* These loads are after the fetch_add so that we're less likely to first
     pull in the cache line as shared.  */
  unsigned int count = bar->count;
  /* This is the number of threads that can enter before we need to reset.
     It is always at the end of a round.  */
  unsigned int max_in_before_reset = BARRIER_IN_THRESHOLD
				     - BARRIER_IN_THRESHOLD % count;
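  /* Worked example (with an artificially small threshold for illustration):
     if COUNT were 3 and BARRIER_IN_THRESHOLD were 31, MAX_IN_BEFORE_RESET
     would be 31 - 31 % 3 == 30, i.e., the end of the tenth round.  */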

  if (i > max_in_before_reset)
    {
      /* We're in a reset round.  Just wait for a reset to finish; do not
	 help finishing previous rounds because this could happen
	 concurrently with a reset.  */
      while (i > max_in_before_reset)
	{
	  futex_wait_simple (&bar->in, i, bar->shared);
	  /* Relaxed MO is fine here because we just need an indication for
	     when we should retry to enter (which will use acquire MO, see
	     above).  */
	  i = atomic_load_relaxed (&bar->in);
	}
      goto reset_restart;
    }

  /* Look at the current round.  At this point, we are just interested in
     whether we can complete rounds, based on the information we obtained
     through our acquire-MO load of IN.  Nonetheless, if we notice that
     our round has been completed using this load, we use the acquire-MO
     fence below to make sure that all pre-barrier-entry effects of all
     threads in our round happen before us leaving the barrier.  Therefore,
     relaxed MO is sufficient.  */
  unsigned int cr = atomic_load_relaxed (&bar->current_round);

  /* Try to finish previous rounds and/or the current round.  We simply
     consider just our position here and do not try to do the work of threads
     that entered more recently.  */
  while (cr + count <= i)
    {
      /* Calculate the new current round based on how many threads entered.
	 NEWCR must be larger than CR because CR+COUNT ends a round.  */
      unsigned int newcr = i - i % count;
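      /* For example, with COUNT == 4, a thread at position I == 9 computes
	 NEWCR == 8, which finishes all rounds up to and including the one
	 ending at position 8.  */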
      /* Try to complete previous and/or the current round.  We need release
	 MO to propagate the happens-before that we observed through reading
	 with acquire MO from IN to other threads.  If the CAS fails, it
	 is like the relaxed-MO load of CURRENT_ROUND above.  */
      if (atomic_compare_exchange_weak_release (&bar->current_round, &cr,
						newcr))
	{
	  /* Update CR with the modification we just did.  */
	  cr = newcr;
	  /* Wake threads belonging to the rounds we just finished.  We may
	     wake more threads than necessary if more than COUNT threads try
	     to block concurrently on the barrier, but this is not a typical
	     use of barriers.
	     Note that we can still access SHARED because we haven't yet
	     confirmed to have left the barrier.  */
	  futex_wake (&bar->current_round, INT_MAX, bar->shared);
	  /* We did as much as we could based on our position.  If we advanced
	     the current round to a round sufficient for us, do not wait for
	     that to happen and skip the acquire fence (we already
	     synchronize-with all other threads in our round through the
	     initial acquire MO fetch_add of IN).  */
	  if (i <= cr)
	    goto ready_to_leave;
	  else
	    break;
	}
    }

  /* Wait until the current round is more recent than the round we are in.  */
  while (i > cr)
    {
      /* Wait for the current round to finish.  */
      futex_wait_simple (&bar->current_round, cr, bar->shared);
      /* See the fence below.  */
      cr = atomic_load_relaxed (&bar->current_round);
    }

  /* Our round finished.  Use the acquire MO fence to synchronize-with the
     thread that finished the round, either through the initial load of
     CURRENT_ROUND above or a failed CAS in the loop above.  */
  atomic_thread_fence_acquire ();

  /* Now signal that we left.  */
  unsigned int o;
 ready_to_leave:
  /* We need release MO here so that our use of the barrier happens before
     reset or memory reuse after pthread_barrier_destroy.  */
  o = atomic_fetch_add_release (&bar->out, 1) + 1;
  if (o == max_in_before_reset)
    {
      /* Perform a reset if we are the last pre-reset thread leaving.  All
	 other threads accessing the barrier are post-reset threads and are
	 incrementing or spinning on IN.  Thus, resetting IN as the last step
	 of the reset ensures that the reset is not concurrent with actual
	 use of the barrier.  We need the acquire MO fence so that the reset
	 happens after use of the barrier by all earlier pre-reset
	 threads.  */
      atomic_thread_fence_acquire ();
      atomic_store_relaxed (&bar->current_round, 0);
      atomic_store_relaxed (&bar->out, 0);
      /* When destroying the barrier, we wait for a reset to happen.  Thus,
	 we must load SHARED now so that this happens before the barrier is
	 destroyed.  */
      int shared = bar->shared;
      atomic_store_release (&bar->in, 0);
      futex_wake (&bar->in, INT_MAX, shared);
    }

  /* Return a special value for exactly one thread per round.  */
  return i % count == 0 ? PTHREAD_BARRIER_SERIAL_THREAD : 0;
}
versioned_symbol (libc, ___pthread_barrier_wait, pthread_barrier_wait,
		  GLIBC_2_34);
libc_hidden_ver (___pthread_barrier_wait, __pthread_barrier_wait)
#ifndef SHARED
strong_alias (___pthread_barrier_wait, __pthread_barrier_wait)
#endif

#if OTHER_SHLIB_COMPAT (libpthread, GLIBC_2_2, GLIBC_2_34)
compat_symbol (libpthread, ___pthread_barrier_wait, pthread_barrier_wait,
	       GLIBC_2_2);
#endif