/* pthread_cond_common -- shared code for condition variable.
   Copyright (C) 2016-2017 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <atomic.h>
#include <stdint.h>
#include <pthread.h>
#include <libc-internal.h>

/* We need 3 least-significant bits on __wrefs for something else.  */
#define __PTHREAD_COND_MAX_GROUP_SIZE ((unsigned) 1 << 29)

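/* For illustration: if __wrefs keeps its reference count in the bits above
   the 3 flag bits (i.e., roughly as count << 3), then a 32-bit __wrefs word
   can track at most 1 << 29 references, which is where this group size limit
   comes from.  */
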
#if __HAVE_64B_ATOMICS == 1

static uint64_t __attribute__ ((unused))
__condvar_load_wseq_relaxed (pthread_cond_t *cond)
{
  return atomic_load_relaxed (&cond->__data.__wseq);
}

static uint64_t __attribute__ ((unused))
__condvar_fetch_add_wseq_acquire (pthread_cond_t *cond, unsigned int val)
{
  return atomic_fetch_add_acquire (&cond->__data.__wseq, val);
}

static uint64_t __attribute__ ((unused))
__condvar_fetch_xor_wseq_release (pthread_cond_t *cond, unsigned int val)
{
  return atomic_fetch_xor_release (&cond->__data.__wseq, val);
}

static uint64_t __attribute__ ((unused))
__condvar_load_g1_start_relaxed (pthread_cond_t *cond)
{
  return atomic_load_relaxed (&cond->__data.__g1_start);
}

static void __attribute__ ((unused))
__condvar_add_g1_start_relaxed (pthread_cond_t *cond, unsigned int val)
{
  atomic_store_relaxed (&cond->__data.__g1_start,
                        atomic_load_relaxed (&cond->__data.__g1_start) + val);
}

#else

/* We use two 64b counters: __wseq and __g1_start.  They are monotonically
   increasing and single-writer-multiple-readers counters, so we can implement
   load, fetch-and-add, and fetch-and-xor operations even when we just have
   32b atomics.  Values we add or xor are less than or equal to 1<<31 (*),
   so we only have to make overflow-and-addition atomic with respect to
   concurrent load operations and xor operations.  To do that, we split each
   counter into two 32b values of which we reserve the MSB of each to
   represent an overflow from the lower-order half to the higher-order half.

   In the common case, the state is (higher-order / lower-order half, and . is
   basically concatenation of the bits):
   0.h / 0.l  = h.l

   When we add a value of x that overflows (i.e., 0.l + x == 1.L), we run the
   following steps S1-S4 (the values these represent are on the right-hand
   side):
   S1:  0.h     / 1.L == (h+1).L
   S2:  1.(h+1) / 1.L == (h+1).L
   S3:  1.(h+1) / 0.L == (h+1).L
   S4:  0.(h+1) / 0.L == (h+1).L
   If the LSB of the higher-order half is set, readers will ignore the
   overflow bit in the lower-order half.

   To get an atomic snapshot in load operations, we exploit that the
   higher-order half is monotonically increasing; if we load a value V from
   it, then read the lower-order half, and then read the higher-order half
   again and see the same value V, we know that both halves have existed in
   the sequence of values the full counter had.  This is similar to the
   validated reads in the time-based STMs in GCC's libitm (e.g.,
   method_ml_wt).

   The xor operation needs to be an atomic read-modify-write.  The write
   itself is not an issue as it affects just the lower-order half but not bits
   used in the add operation.  To make the full fetch-and-xor atomic, we
   exploit that concurrently, the value can increase by at most 1<<31 (*): The
   xor operation is only called while having acquired the lock, so not more
   than __PTHREAD_COND_MAX_GROUP_SIZE waiters can enter concurrently and thus
   increment __wseq.  Therefore, if the xor operation observes a value of
   __wseq, then the value it applies the modification to later on can be
   derived (see below).

   One benefit of this scheme is that it makes load operations
   obstruction-free: unlike a scheme that simply locks the counter, readers
   can almost always interpret a snapshot of the two halves.  Readers can be
   forced to read a new snapshot only when the read is concurrent with an
   overflow.  However, overflows will happen infrequently, so load operations
   are practically lock-free.

   (*) The highest value we add is __PTHREAD_COND_MAX_GROUP_SIZE << 2 to
   __g1_start (the two extra bits are for the lock in the two LSBs of
   __g1_start).  */

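/* A concrete example of the scheme above (illustrative values only):
   assume the logical 64b value is V == 5 * 2^31 + 0x7fffffff, stored as
   high == 5 and low == 0x7fffffff, and we fetch-and-add 1:
     S1: low becomes 0x80000000; a reader that sees this treats the overflow
         bit as +2^31 and thus reads V + 1 == 6 * 2^31.
     S2: high becomes 0x80000006 (overflow flag set, value 6); readers now
         ignore the overflow bit in low.
     S3: low becomes 0x00000000.
     S4: high becomes 6.
   At every step, a validated snapshot yields the same logical value
   V + 1 == 6 * 2^31.  */
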
typedef struct
{
  unsigned int low;
  unsigned int high;
} _condvar_lohi;

static uint64_t
__condvar_fetch_add_64_relaxed (_condvar_lohi *lh, unsigned int op)
{
  /* S1.  Note that this is an atomic read-modify-write so it extends the
     release sequence of release MO store at S3.  */
  unsigned int l = atomic_fetch_add_relaxed (&lh->low, op);
  unsigned int h = atomic_load_relaxed (&lh->high);
  uint64_t result = ((uint64_t) h << 31) | l;
  l += op;
  if ((l >> 31) > 0)
    {
      /* Overflow.  Need to increment higher-order half.  Note that all
         add operations are ordered in happens-before.  */
      h++;
      /* S2.  Release MO to synchronize with the loads of the higher-order
         half in the load operation.  See __condvar_load_64_relaxed.  */
      atomic_store_release (&lh->high, h | ((unsigned int) 1 << 31));
      l ^= (unsigned int) 1 << 31;
      /* S3.  See __condvar_load_64_relaxed.  */
      atomic_store_release (&lh->low, l);
      /* S4.  Likewise.  */
      atomic_store_release (&lh->high, h);
    }
  return result;
}

static uint64_t
__condvar_load_64_relaxed (_condvar_lohi *lh)
{
  unsigned int h, l, h2;
  do
    {
      /* This load and the second one below to the same location read from the
         stores in the overflow handling of the add operation or the
         initializing stores (which is a simple special case because
         initialization always completely happens before further use).
         Because no two stores to the higher-order half write the same value,
         the loop ensures that if we continue to use the snapshot, this load
         and the second one read from the same store operation.  All candidate
         store operations have release MO.
         If we read from S2 in the first load, then we will see the value of
         S1 on the next load (because we synchronize with S2), or a value
         later in modification order.  We correctly ignore the lower-half's
         overflow bit in this case.  If we read from S4, then we will see the
         value of S3 in the next load (or a later value), which does not have
         the overflow bit set anymore.  */
      h = atomic_load_acquire (&lh->high);
      /* This will read from the release sequence of S3 (i.e., either the S3
         store or the read-modify-writes at S1 following S3 in modification
         order).  Thus, the read synchronizes with S3, and the following load
         of the higher-order half will read from the matching S2 (or a later
         value).
         Thus, if we read a lower-half value here that already overflowed and
         belongs to an increased higher-order half value, we will see the
         latter and h and h2 will not be equal.  */
      l = atomic_load_acquire (&lh->low);
      /* See above.  */
      h2 = atomic_load_relaxed (&lh->high);
    }
  while (h != h2);
  if (((l >> 31) > 0) && ((h >> 31) > 0))
    l ^= (unsigned int) 1 << 31;
  return ((uint64_t) (h & ~((unsigned int) 1 << 31)) << 31) + l;
}

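/* Continuing the example above: a reader that loads high == 5, is then
   preempted while the writer performs S2-S4, and finally reads
   low == 0x00000000 and high == 6 will see h != h2 and retry; the second
   attempt reads a consistent pair (6, 0) and returns 6 * 2^31.  Such a retry
   can only be forced while an overflow is in progress, which is why loads
   are practically lock-free.  */
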
static uint64_t __attribute__ ((unused))
__condvar_load_wseq_relaxed (pthread_cond_t *cond)
{
  return __condvar_load_64_relaxed ((_condvar_lohi *) &cond->__data.__wseq32);
}

static uint64_t __attribute__ ((unused))
__condvar_fetch_add_wseq_acquire (pthread_cond_t *cond, unsigned int val)
{
  uint64_t r = __condvar_fetch_add_64_relaxed
      ((_condvar_lohi *) &cond->__data.__wseq32, val);
  atomic_thread_fence_acquire ();
  return r;
}

static uint64_t __attribute__ ((unused))
__condvar_fetch_xor_wseq_release (pthread_cond_t *cond, unsigned int val)
{
  _condvar_lohi *lh = (_condvar_lohi *) &cond->__data.__wseq32;
  /* First, get the current value.  See __condvar_load_64_relaxed.  */
  unsigned int h, l, h2;
  do
    {
      h = atomic_load_acquire (&lh->high);
      l = atomic_load_acquire (&lh->low);
      h2 = atomic_load_relaxed (&lh->high);
    }
  while (h != h2);
  if (((l >> 31) > 0) && ((h >> 31) == 0))
    h++;
  h &= ~((unsigned int) 1 << 31);
  l &= ~((unsigned int) 1 << 31);

  /* Now modify.  Due to the coherence rules, the prior load will read a value
     earlier in modification order than the following fetch-xor.
     This uses release MO to make the full operation have release semantics
     (all other operations access the lower-order half).  */
  unsigned int l2 = atomic_fetch_xor_release (&lh->low, val)
      & ~((unsigned int) 1 << 31);
  if (l2 < l)
    /* The lower-order half overflowed in the meantime.  This happened exactly
       once due to the limit on concurrent waiters (see above).  */
    h++;
  return ((uint64_t) h << 31) + l2;
}

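/* Example of the recovery step above (illustrative values only): the snapshot
   reads l == 0x7ffffff0, then concurrent waiters add 0x20 so the lower-order
   half overflows before our fetch-xor runs.  The fetch-xor then observes
   0x00000010, so l2 < l and we increment h to reconstruct the higher-order
   half that the observed lower-order half belongs to.  At most one such
   overflow can happen because concurrent additions are bounded by
   __PTHREAD_COND_MAX_GROUP_SIZE.  */
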
static uint64_t __attribute__ ((unused))
__condvar_load_g1_start_relaxed (pthread_cond_t *cond)
{
  return __condvar_load_64_relaxed
      ((_condvar_lohi *) &cond->__data.__g1_start32);
}

static void __attribute__ ((unused))
__condvar_add_g1_start_relaxed (pthread_cond_t *cond, unsigned int val)
{
  ignore_value (__condvar_fetch_add_64_relaxed
      ((_condvar_lohi *) &cond->__data.__g1_start32, val));
}

#endif /* !__HAVE_64B_ATOMICS  */


/* The lock that signalers use.  See pthread_cond_wait_common for uses.
   The lock is our normal three-state lock: not acquired (0) / acquired (1) /
   acquired-with-futex_wake-request (2).  However, we need to preserve the
   other bits in the unsigned int used for the lock, and therefore it is a
   little more complex.  */
static void __attribute__ ((unused))
__condvar_acquire_lock (pthread_cond_t *cond, int private)
{
  unsigned int s = atomic_load_relaxed (&cond->__data.__g1_orig_size);
  while ((s & 3) == 0)
    {
      if (atomic_compare_exchange_weak_acquire (&cond->__data.__g1_orig_size,
                                                &s, s | 1))
        return;
      /* TODO Spinning and back-off.  */
    }
  /* The lock is currently acquired by someone else, so we cannot simply
     change it from not acquired to acquired.  Instead, try to change it to
     acquired-with-futex-wake-request and do a futex wait; if the CAS happens
     to observe the not-acquired state after all, we have acquired the
     lock.  */
  while (1)
    {
      while ((s & 3) != 2)
        {
          if (atomic_compare_exchange_weak_acquire
              (&cond->__data.__g1_orig_size, &s, (s & ~(unsigned int) 3) | 2))
            {
              if ((s & 3) == 0)
                return;
              break;
            }
          /* TODO Back off.  */
        }
      futex_wait_simple (&cond->__data.__g1_orig_size,
                         (s & ~(unsigned int) 3) | 2, private);
      /* Reload so we see a recent value.  */
      s = atomic_load_relaxed (&cond->__data.__g1_orig_size);
    }
}

/* See __condvar_acquire_lock.  */
static void __attribute__ ((unused))
__condvar_release_lock (pthread_cond_t *cond, int private)
{
  if ((atomic_fetch_and_release (&cond->__data.__g1_orig_size,
                                 ~(unsigned int) 3) & 3)
      == 2)
    futex_wake (&cond->__data.__g1_orig_size, 1, private);
}

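/* To make the encoding concrete (illustrative only): __g1_orig_size packs the
   size of the old G1 into the bits above the two LSBs and the lock state into
   the two LSBs, i.e., roughly (orig_size << 2) | lock with lock in {0, 1, 2}.
   __condvar_release_lock clears the two lock bits and wakes a blocked
   signaler only if it observed state 2 (a wake request).  */
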
/* Only use this when having acquired the lock.  */
static unsigned int __attribute__ ((unused))
__condvar_get_orig_size (pthread_cond_t *cond)
{
  return atomic_load_relaxed (&cond->__data.__g1_orig_size) >> 2;
}

/* Only use this when having acquired the lock.  */
static void __attribute__ ((unused))
__condvar_set_orig_size (pthread_cond_t *cond, unsigned int size)
{
  /* We have acquired the lock, but might get one concurrent update due to a
     lock state change from acquired to acquired-with-futex_wake-request.
     The store with relaxed MO is fine because there will be no further
     changes to the lock bits nor the size, and we will subsequently release
     the lock with release MO.  */
  unsigned int s;
  s = (atomic_load_relaxed (&cond->__data.__g1_orig_size) & 3)
      | (size << 2);
  if ((atomic_exchange_relaxed (&cond->__data.__g1_orig_size, s) & 3)
      != (s & 3))
    atomic_store_relaxed (&cond->__data.__g1_orig_size, (size << 2) | 2);
}

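/* Example of the race handled above (illustrative values only): we hold the
   lock (lock bits == 1) and compute s == (size << 2) | 1.  If a waiter
   concurrently upgrades the lock to acquired-with-futex_wake-request, the
   exchange returns a value whose lock bits are 2, so we store
   (size << 2) | 2 again to make sure the wake request is not lost.  */
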
/* Returns FUTEX_SHARED or FUTEX_PRIVATE based on the provided __wrefs
   value.  */
static int __attribute__ ((unused))
__condvar_get_private (int flags)
{
  if ((flags & __PTHREAD_COND_SHARED_MASK) == 0)
    return FUTEX_PRIVATE;
  else
    return FUTEX_SHARED;
}

/* This closes G1 (whose index is in G1INDEX), waits for all futex waiters to
   leave G1, converts G1 into a fresh G2, and then switches group roles so that
   the former G2 becomes the new G1 ending at the current __wseq value when we
   eventually make the switch (WSEQ is just an observation of __wseq by the
   signaler).
   If G2 is empty, it will not switch groups because then it would create an
   empty G1 which would require switching groups again on the next signal.
   Returns false iff groups were not switched because G2 was empty.  */
static bool __attribute__ ((unused))
__condvar_quiesce_and_switch_g1 (pthread_cond_t *cond, uint64_t wseq,
                                 unsigned int *g1index, int private)
{
  const unsigned int maxspin = 0;
  unsigned int g1 = *g1index;

  /* If there is no waiter in G2, we don't do anything.  The expression may
     look odd but remember that __g_size might hold a negative value, so
     putting the expression this way avoids relying on implementation-defined
     behavior.
     Note that this works correctly for a zero-initialized condvar too.  */
  unsigned int old_orig_size = __condvar_get_orig_size (cond);
  uint64_t old_g1_start = __condvar_load_g1_start_relaxed (cond) >> 1;
  if (((unsigned) (wseq - old_g1_start - old_orig_size)
       + cond->__data.__g_size[g1 ^ 1]) == 0)
    return false;

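  /* As an example of the check above (illustrative numbers only): with
     old_g1_start == 100 and old_orig_size == 10, a wseq of 112 means that two
     waiters have entered G2 since it became G2.  __g_size[g1 ^ 1] is zero or
     negative and accounts for waiters that already left G2 (e.g., through
     cancellation), so it would be -2 if both of them are gone, making the sum
     zero and causing us to skip the group switch.  */
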
  /* Now try to close and quiesce G1.  We have to consider the following kinds
     of waiters:
     * Waiters from less recent groups than G1 are not affected because
       nothing will change for them apart from __g1_start getting larger.
     * New waiters arriving concurrently with the group switching will all go
       into G2 until we atomically make the switch.  Waiters existing in G2
       are not affected.
     * Waiters in G1 will be closed out immediately by setting a flag in
       __g_signals, which prevents them from blocking using a futex on
       __g_signals and also notifies them that the group is closed.  As a
       result, they will eventually remove their group reference, allowing us
       to switch group roles.  */

  /* First, set the closed flag on __g_signals.  This tells waiters that are
     about to wait that they shouldn't do that anymore.  This basically
     serves as an advance notification of the upcoming change to __g1_start;
     waiters interpret it as if __g1_start was larger than their waiter
     sequence position.  This allows us to change __g1_start after waiting
     for all existing waiters with group references to leave, which in turn
     makes recovery after stealing a signal simpler because it then can be
     skipped if __g1_start indicates that the group is closed (otherwise,
     we would always have to recover because waiters don't know how big their
     groups are).  Relaxed MO is fine.  */
  atomic_fetch_or_relaxed (cond->__data.__g_signals + g1, 1);

  /* Wait until there are no group references anymore.  The fetch-or operation
     injects us into the modification order of __g_refs; release MO ensures
     that waiters incrementing __g_refs after our fetch-or see the previous
     changes to __g_signals and to __g1_start that had to happen before we can
     switch this G1 and alias with an older group (we have two groups, so
     aliasing requires switching group roles twice).  Note that nobody else
     can have set the wake-request flag, so we do not have to act upon it.

     Also note that it is harmless if older waiters or waiters from this G1
     get a group reference after we have quiesced the group because it will
     remain closed for them either because of the closed flag in __g_signals
     or the later update to __g1_start.  New waiters will never arrive here
     but instead continue to go into the still current G2.  */
  unsigned r = atomic_fetch_or_release (cond->__data.__g_refs + g1, 0);
  while ((r >> 1) > 0)
    {
      for (unsigned int spin = maxspin; ((r >> 1) > 0) && (spin > 0); spin--)
        {
          /* TODO Back off.  */
          r = atomic_load_relaxed (cond->__data.__g_refs + g1);
        }
      if ((r >> 1) > 0)
        {
          /* There is still a waiter after spinning.  Set the wake-request
             flag and block.  Relaxed MO is fine because this is just about
             this futex word.  */
          r = atomic_fetch_or_relaxed (cond->__data.__g_refs + g1, 1);

          if ((r >> 1) > 0)
            futex_wait_simple (cond->__data.__g_refs + g1, r, private);
          /* Reload here so we eventually see the most recent value even if we
             do not spin.  */
          r = atomic_load_relaxed (cond->__data.__g_refs + g1);
        }
    }
  /* Acquire MO so that we synchronize with the release operation that waiters
     use to decrement __g_refs and thus happen after the waiters we waited
     for.  */
  atomic_thread_fence_acquire ();

  /* Update __g1_start, which finishes closing this group.  The value we add
     will never be negative because old_orig_size can only be zero when we
     switch groups the first time after a condvar was initialized, in which
     case G1 will be at index 1 and we will add a value of 1.  See above for
     why this takes place after waiting for quiescence of the group.
     Relaxed MO is fine because the change comes with no additional
     constraints that others would have to observe.  */
  __condvar_add_g1_start_relaxed (cond,
      (old_orig_size << 1) + (g1 == 1 ? 1 : - 1));

  /* Now reopen the group, thus enabling waiters to again block using the
     futex controlled by __g_signals.  Release MO so that observers that see
     no signals (and thus can block) also see the write to __g1_start and thus
     that this is now a new group (see __pthread_cond_wait_common for the
     matching acquire MO loads).  */
  atomic_store_release (cond->__data.__g_signals + g1, 0);

  /* At this point, the old G1 is now a valid new G2 (but not in use yet).
     No old waiter can either grab a signal or acquire a reference without
     noticing that __g1_start is larger.
     We can now publish the group switch by flipping the G2 index in __wseq.
     Release MO so that this synchronizes with the acquire MO operation
     waiters use to obtain a position in the waiter sequence.  */
  wseq = __condvar_fetch_xor_wseq_release (cond, 1) >> 1;
  g1 ^= 1;
  *g1index ^= 1;

  /* These values are just observed by signalers, and thus protected by the
     lock.  */
  unsigned int orig_size = wseq - (old_g1_start + old_orig_size);
  __condvar_set_orig_size (cond, orig_size);
  /* Use an addition so that we do not lose track of cancellations in what
     was previously G2.  */
  cond->__data.__g_size[g1] += orig_size;

  /* The new G1's size may be zero because of cancellations during its time
     as G2.  If this happens, there are no waiters that have to receive a
     signal, so we do not need to add any and return false.  */
  if (cond->__data.__g_size[g1] == 0)
    return false;

  return true;
}
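
/* For intuition, a sketch of the bookkeeping above with illustrative numbers:
   if the old G1 started at waiter sequence position 100 with 10 waiters
   (old_g1_start == 100, old_orig_size == 10) and the fetch-xor observes
   wseq == 115, then 115 - (100 + 10) == 5 waiters entered the old G2, so the
   new G1 gets an original size of 5.  Adding (rather than assigning) that
   value to __g_size[g1] preserves any negative adjustments made by waiters
   that were cancelled while the group was still G2; if all 5 were cancelled,
   __g_size[g1] ends up as 0 and we return false just as if G2 had been
   empty.  */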