1 | /* POSIX reader--writer lock: core parts. |
2 | Copyright (C) 2016-2017 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <http://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <errno.h> |
20 | #include <sysdep.h> |
21 | #include <pthread.h> |
22 | #include <pthreadP.h> |
23 | #include <sys/time.h> |
24 | #include <stap-probe.h> |
25 | #include <atomic.h> |
26 | #include <futex-internal.h> |
27 | |
28 | |
29 | /* A reader--writer lock that fulfills the POSIX requirements (but operations |
30 | on this lock are not necessarily full barriers, as one may interpret the |
31 | POSIX requirement about "synchronizing memory"). All critical sections are |
32 | in a total order, writers synchronize with prior writers and readers, and |
33 | readers synchronize with prior writers. |
34 | |
35 | A thread is allowed to acquire a read lock recursively (i.e., have rdlock |
36 | critical sections that overlap in sequenced-before) unless the kind of the |
37 | rwlock is set to PTHREAD_RWLOCK_PREFER_WRITERS_NONRECURSIVE_NP. |
38 | |
39 | This lock is built so that workloads of mostly readers can be executed with |
40 | low runtime overheads. This matches that the default kind of the lock is |
41 | PTHREAD_RWLOCK_PREFER_READER_NP. Acquiring a read lock requires a single |
42 | atomic addition if the lock is or was previously acquired by other |
43 | readers; releasing the lock is a single CAS if there are no concurrent |
44 | writers. |
45 | Workloads consisting of mostly writers are of secondary importance. |
46 | An uncontended write lock acquisition is as fast as for a normal |
47 | exclusive mutex but writer contention is somewhat more costly due to |
48 | keeping track of the exact number of writers. If the rwlock kind requests |
49 | writers to be preferred (i.e., PTHREAD_RWLOCK_PREFER_WRITERS_NP or the |
50 | no-recursive-readers variant of it), then writer-to-writer lock ownership |
51 | hand-over is fairly fast and bypasses lock acquisition attempts by readers. |
52 | The costs of lock ownership transfer between readers and writers vary. If |
53 | the program asserts that there are no recursive readers and writers are |
54 | preferred, then write lock acquisition attempts will block subsequent read |
55 | lock acquisition attempts, so that new incoming readers do not prolong a |
56 | phase in which readers have acquired the lock. |
57 | |
58 | |
59 | The main components of the rwlock are a writer-only lock that allows only |
60 | one of the concurrent writers to be the primary writer, and a |
61 | single-writer-multiple-readers lock that decides between read phases, in |
62 | which readers have acquired the rwlock, and write phases in which a primary |
63 | writer or a sequence of different primary writers have acquired the rwlock. |
64 | |
65 | The single-writer-multiple-readers lock is the central piece of state |
66 | describing the rwlock and is encoded in the __readers field (see below for |
67 | a detailed explanation): |
68 | |
69 | State WP WL R RW Notes |
70 | --------------------------- |
71 | #1 0 0 0 0 Lock is idle (and in a read phase). |
72 | #2 0 0 >0 0 Readers have acquired the lock. |
73 | #3 0 1 0 0 Lock is not acquired; a writer is waiting for a write |
74 | phase to start or will try to start one. |
75 | #4 0 1 >0 0 Readers have acquired the lock; a writer is waiting |
76 | and explicit hand-over to the writer is required. |
77 | #4a 0 1 >0 1 Same as #4 except that there are further readers |
78 | waiting because the writer is to be preferred. |
79 | #5 1 0 0 0 Lock is idle (and in a write phase). |
80 | #6 1 0 >0 0 Write phase; readers are waiting for a read phase to |
81 | start or will try to start one. |
82 | #7 1 1 0 0 Lock is acquired by a writer. |
83 | #8 1 1 >0 0 Lock acquired by a writer and readers are waiting; |
84 | explicit hand-over to the readers is required. |
85 | |
86 | WP (PTHREAD_RWLOCK_WRPHASE) is true if the lock is in a write phase, so |
87 | potentially acquired by a primary writer. |
88 | WL (PTHREAD_RWLOCK_WRLOCKED) is true if there is a primary writer (i.e., |
89 | the thread that was able to set this bit from false to true). |
90 | R (all bits in __readers except the number of least-significant bits |
91 | denoted in PTHREAD_RWLOCK_READER_SHIFT) is the number of readers that have |
92 | or are trying to acquire the lock. There may be more readers waiting if |
93 | writers are preferred and there will be no recursive readers, in which |
94 | case RW (PTHREAD_RWLOCK_RWAITING) is true in state #4a. |
95 | |
96 | We want to block using futexes but using __readers as a futex word directly |
97 | is not a good solution. First, we want to wait on different conditions |
98 | such as waiting for a phase change vs. waiting for the primary writer to |
99 | release the writer-only lock. Second, the number of readers could change |
100 | frequently, which would make it likely that a writer's futex_wait fails |
101 | frequently too because the expected value does not match the value of |
102 | __readers anymore. |
103 | Therefore, we split out the futex words into the __wrphase_futex and |
104 | __writers_futex fields. The former tracks the value of the WP bit and is |
105 | changed after changing WP by the thread that changes WP. However, because |
106 | of the POSIX requirements regarding mutex/rwlock destruction (i.e., that |
107 | destroying a rwlock is allowed as soon as no thread has acquired or will |
108 | acquire the lock), we have to be careful and hand over lock ownership (via |
109 | a phase change) carefully to those threads waiting. Specifically, we must |
110 | prevent a situation in which we are not quite sure whether we still have |
111 | to unblock another thread through a change to memory (executing a |
112 | futex_wake on a former futex word that is now used for something else is |
113 | fine). |
114 | The scheme we use for __wrphase_futex is that waiting threads that may |
115 | use the futex word to block now all have to use the futex word to block; it |
116 | is not allowed to take the short-cut and spin-wait on __readers because |
117 | then the waking thread cannot just make one final change to memory to |
118 | unblock all potentially waiting threads. If, for example, a reader |
119 | increments R in states #7 or #8, it has to then block until __wrphase_futex |
120 | is 0 and it can confirm that the value of 0 was stored by the primary |
121 | writer; in turn, the primary writer has to change to a read phase too when |
122 | releasing WL (i.e., to state #2), and it must change __wrphase_futex to 0 |
123 | as the next step. This ensures that the waiting reader will not be able to |
124 | acquire, release, and then destroy the lock concurrently with the pending |
125 | futex unblock operations by the former primary writer. This scheme is |
126 | called explicit hand-over in what follows. |
127 | Note that waiting threads can cancel waiting only if explicit hand-over has |
128 | not yet started (e.g., if __readers is still in states #7 or #8 in the |
129 | example above). |
130 | |
131 | Writers determine the primary writer through WL. Blocking using futexes |
132 | is performed using __writers_futex as a futex word; primary writers will |
133 | enable waiting on this futex by setting it to 1 after they acquired the WL |
134 | bit and will disable waiting by setting it to 0 before they release WL. |
135 | This leaves small windows where blocking using futexes is not possible |
136 | although a primary writer exists, but in turn decreases complexity of the |
137 | writer--writer synchronization and does not affect correctness. |
138 | If writers are preferred, writers can hand over WL directly to other |
139 | waiting writers that registered by incrementing __writers: If the primary |
140 | writer can CAS __writers from a non-zero value to the same value with the |
141 | PTHREAD_RWLOCK_WRHANDOVER bit set, it effectively transfers WL ownership |
142 | to one of the registered waiting writers and does not reset WL; in turn, |
143 | a registered writer that can clear PTHREAD_RWLOCK_WRHANDOVER using a CAS |
144 | then takes over WL. Note that registered waiting writers can cancel |
145 | waiting by decrementing __writers, but the last writer to unregister must |
146 | become the primary writer if PTHREAD_RWLOCK_WRHANDOVER is set. |
147 | Also note that adding another state/bit to signal potential writer--writer |
148 | contention (e.g., as done in the normal mutex algorithm) would not be |
149 | helpful because we would have to conservatively assume that there is in |
150 | fact no other writer, and wake up readers too. |
151 | |
152 | To avoid having to call futex_wake when no thread uses __wrphase_futex or |
153 | __writers_futex, threads will set the PTHREAD_RWLOCK_FUTEX_USED bit in the |
154 | respective futex words before waiting on it (using a CAS so it will only be |
155 | set if in a state in which waiting would be possible). In the case of |
156 | __writers_futex, we wake only one thread but several threads may share |
157 | PTHREAD_RWLOCK_FUTEX_USED, so we must assume that there are still others. |
158 | This is similar to what we do in pthread_mutex_lock. We do not need to |
159 | do this for __wrphase_futex because there, we always wake all waiting |
160 | threads. |
161 | |
162 | Blocking in the state #4a simply uses __readers as futex word. This |
163 | simplifies the algorithm but suffers from some of the drawbacks discussed |
164 | before, though not to the same extent because R can only decrease in this |
165 | state, so the number of potentially failing futex_wait attempts will be |
166 | bounded. All threads moving from state #4a to another state must wake |
167 | up threads blocked on the __readers futex. |
168 | |
169 | The ordering invariants that we have to take care of in the implementation |
170 | are primarily those necessary for a reader--writer lock; this is rather |
171 | straightforward and happens during write/read phase switching (potentially |
172 | through explicit hand-over), and between writers through synchronization |
173 | involving the PTHREAD_RWLOCK_WRLOCKED or PTHREAD_RWLOCK_WRHANDOVER bits. |
174 | Additionally, we need to take care that modifications of __writers_futex |
175 | and __wrphase_futex (e.g., by otherwise unordered readers) take place in |
176 | the writer critical sections or read/write phases, respectively, and that |
177 | explicit hand-over observes stores from the previous phase. How this is |
178 | done is explained in more detail in comments in the code. |
179 | |
180 | Many of the accesses to the futex words just need relaxed MO. This is |
181 | possible because we essentially drive both the core rwlock synchronization |
182 | and the futex synchronization in parallel. For example, an unlock will |
183 | unlock the rwlock and take part in the futex synchronization (using |
184 | PTHREAD_RWLOCK_FUTEX_USED, see above); even if they are not tightly |
185 | ordered in some way, the futex synchronization ensures that there are no |
186 | lost wake-ups, and woken threads will then eventually see the most recent |
187 | state of the rwlock. IOW, waiting threads will always be woken up, while |
188 | not being able to wait using futexes (which can happen) is harmless; in |
189 | turn, this means that waiting threads don't need special ordering wrt. |
190 | waking threads. |
191 | |
192 | The futex synchronization consists of the three-state futex word: |
193 | (1) cannot block on it, (2) can block on it, and (3) there might be a |
194 | thread blocked on it (i.e., with PTHREAD_RWLOCK_FUTEX_USED set). |
195 | Relaxed-MO atomic read-modify-write operations are sufficient to maintain |
196 | this (e.g., using a CAS to go from (2) to (3) but not from (1) to (3)), |
197 | but we need ordering of the futex word modifications by the waking threads |
198 | so that they collectively make correct state changes between (1)-(3). |
199 | The futex-internal synchronization (i.e., the conceptual critical sections |
200 | around futex operations in the kernel) then ensures that even an |
201 | unconstrained load (i.e., relaxed MO) inside of futex_wait will not lead to |
202 | lost wake-ups because either the waiting thread will see the change from |
203 | (3) to (1) when a futex_wake came first, or this futex_wake will wake this |
204 | waiting thread because the waiting thread came first. |
205 | |
206 | |
207 | POSIX allows but does not require rwlock acquisitions to be a cancellation |
208 | point. We do not support cancellation. |
209 | |
210 | TODO We do not try to elide any read or write lock acquisitions currently. |
211 | While this would be possible, it is unclear whether HTM performance is |
212 | currently predictable enough and our runtime tuning is good enough at |
213 | deciding when to use elision so that enabling it would lead to consistently |
214 | better performance. */ |
215 | |
216 | |
217 | static int |
218 | __pthread_rwlock_get_private (pthread_rwlock_t *rwlock) |
219 | { |
220 | return rwlock->__data.__shared != 0 ? FUTEX_SHARED : FUTEX_PRIVATE; |
221 | } |
222 | |
/* Release a read lock: decrement the reader count in __readers and, if we
   were the last reader while a primary writer is registered (WRLOCKED set),
   start a write phase and hand it over explicitly through __wrphase_futex.
   Readers blocked in state #4a (RWAITING set) are woken once the last reader
   leaves.  See the state table in the overview comment at the top of this
   file for the encoding of __readers.  */
static __always_inline void
__pthread_rwlock_rdunlock (pthread_rwlock_t *rwlock)
{
  int private = __pthread_rwlock_get_private (rwlock);
  /* We decrease the number of readers, and if we are the last reader and
     there is a primary writer, we start a write phase.  We use a CAS to
     make this atomic so that it is clear whether we must hand over ownership
     explicitly.  */
  unsigned int r = atomic_load_relaxed (&rwlock->__data.__readers);
  unsigned int rnew;
  for (;;)
    {
      /* Tentative new value: our reader registration removed.  */
      rnew = r - (1 << PTHREAD_RWLOCK_READER_SHIFT);
      /* If we are the last reader, we also need to unblock any readers
	 that are waiting for a writer to go first (PTHREAD_RWLOCK_RWAITING)
	 so that they can register while the writer is active.  */
      if ((rnew >> PTHREAD_RWLOCK_READER_SHIFT) == 0)
	{
	  /* Last reader out: if a primary writer is waiting, start the
	     write phase on its behalf (state #4 -> #7 hand-over).  */
	  if ((rnew & PTHREAD_RWLOCK_WRLOCKED) != 0)
	    rnew |= PTHREAD_RWLOCK_WRPHASE;
	  rnew &= ~(unsigned int) PTHREAD_RWLOCK_RWAITING;
	}
      /* We need release MO here for three reasons.  First, so that we
	 synchronize with subsequent writers.  Second, we might have been the
	 first reader and set __wrphase_futex to 0, so we need to synchronize
	 with the last reader that will set it to 1 (note that we will always
	 change __readers before the last reader, or we are the last reader).
	 Third, a writer that takes part in explicit hand-over needs to see
	 the first reader's store to __wrphase_futex (or a later value) if
	 the writer observes that a write phase has been started.  */
      if (atomic_compare_exchange_weak_release (&rwlock->__data.__readers,
						&r, rnew))
	break;
      /* TODO Back-off.  */
    }
  if ((rnew & PTHREAD_RWLOCK_WRPHASE) != 0)
    {
      /* We need to do explicit hand-over.  We need the acquire MO fence so
	 that our modification of __wrphase_futex happens after a store by
	 another reader that started a read phase.  Relaxed MO is sufficient
	 for the modification of __wrphase_futex because it is just used
	 to delay acquisition by a writer until all threads are unblocked
	 irrespective of whether they are looking at __readers or
	 __wrphase_futex; any other synchronizes-with relations that are
	 necessary are established through __readers.  */
      atomic_thread_fence_acquire ();
      /* Only issue the (comparatively expensive) futex_wake if a waiter
	 announced itself through PTHREAD_RWLOCK_FUTEX_USED.  */
      if ((atomic_exchange_relaxed (&rwlock->__data.__wrphase_futex, 1)
	   & PTHREAD_RWLOCK_FUTEX_USED) != 0)
	futex_wake (&rwlock->__data.__wrphase_futex, INT_MAX, private);
    }
  /* Also wake up waiting readers if we did reset the RWAITING flag.  */
  if ((r & PTHREAD_RWLOCK_RWAITING) != (rnew & PTHREAD_RWLOCK_RWAITING))
    futex_wake (&rwlock->__data.__readers, INT_MAX, private);
}
277 | |
278 | |
279 | static __always_inline int |
280 | __pthread_rwlock_rdlock_full (pthread_rwlock_t *rwlock, |
281 | const struct timespec *abstime) |
282 | { |
283 | unsigned int r; |
284 | |
285 | /* Make sure we are not holding the rwlock as a writer. This is a deadlock |
286 | situation we recognize and report. */ |
287 | if (__glibc_unlikely (atomic_load_relaxed (&rwlock->__data.__cur_writer) |
288 | == THREAD_GETMEM (THREAD_SELF, tid))) |
289 | return EDEADLK; |
290 | |
291 | /* If we prefer writers, recursive rdlock is disallowed, we are in a read |
292 | phase, and there are other readers present, we try to wait without |
293 | extending the read phase. We will be unblocked by either one of the |
294 | other active readers, or if the writer gives up WRLOCKED (e.g., on |
295 | timeout). |
296 | If there are no other readers, we simply race with any existing primary |
297 | writer; it would have been a race anyway, and changing the odds slightly |
298 | will likely not make a big difference. */ |
299 | if (rwlock->__data.__flags == PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP) |
300 | { |
301 | r = atomic_load_relaxed (&rwlock->__data.__readers); |
302 | while (((r & PTHREAD_RWLOCK_WRPHASE) == 0) |
303 | && ((r & PTHREAD_RWLOCK_WRLOCKED) != 0) |
304 | && ((r >> PTHREAD_RWLOCK_READER_SHIFT) > 0)) |
305 | { |
306 | /* TODO Spin first. */ |
307 | /* Try setting the flag signaling that we are waiting without having |
308 | incremented the number of readers. Relaxed MO is fine because |
309 | this is just about waiting for a state change in __readers. */ |
310 | if (atomic_compare_exchange_weak_relaxed |
311 | (&rwlock->__data.__readers, &r, r | PTHREAD_RWLOCK_RWAITING)) |
312 | { |
313 | /* Wait for as long as the flag is set. An ABA situation is |
314 | harmless because the flag is just about the state of |
315 | __readers, and all threads set the flag under the same |
316 | conditions. */ |
317 | while ((atomic_load_relaxed (&rwlock->__data.__readers) |
318 | & PTHREAD_RWLOCK_RWAITING) != 0) |
319 | { |
320 | int private = __pthread_rwlock_get_private (rwlock); |
321 | int err = futex_abstimed_wait (&rwlock->__data.__readers, |
322 | r, abstime, private); |
323 | /* We ignore EAGAIN and EINTR. On time-outs, we can just |
324 | return because we don't need to clean up anything. */ |
325 | if (err == ETIMEDOUT) |
326 | return err; |
327 | } |
328 | /* It makes sense to not break out of the outer loop here |
329 | because we might be in the same situation again. */ |
330 | } |
331 | else |
332 | { |
333 | /* TODO Back-off. */ |
334 | } |
335 | } |
336 | } |
337 | /* Register as a reader, using an add-and-fetch so that R can be used as |
338 | expected value for future operations. Acquire MO so we synchronize with |
339 | prior writers as well as the last reader of the previous read phase (see |
340 | below). */ |
341 | r = atomic_fetch_add_acquire (&rwlock->__data.__readers, |
342 | (1 << PTHREAD_RWLOCK_READER_SHIFT)) + (1 << PTHREAD_RWLOCK_READER_SHIFT); |
343 | |
344 | /* Check whether there is an overflow in the number of readers. We assume |
345 | that the total number of threads is less than half the maximum number |
346 | of readers that we have bits for in __readers (i.e., with 32-bit int and |
347 | PTHREAD_RWLOCK_READER_SHIFT of 3, we assume there are less than |
348 | 1 << (32-3-1) concurrent threads). |
349 | If there is an overflow, we use a CAS to try to decrement the number of |
350 | readers if there still is an overflow situation. If so, we return |
351 | EAGAIN; if not, we are not a thread causing an overflow situation, and so |
352 | we just continue. Using a fetch-add instead of the CAS isn't possible |
353 | because other readers might release the lock concurrently, which could |
354 | make us the last reader and thus responsible for handing ownership over |
355 | to writers (which requires a CAS too to make the decrement and ownership |
356 | transfer indivisible). */ |
357 | while (__glibc_unlikely (r >= PTHREAD_RWLOCK_READER_OVERFLOW)) |
358 | { |
359 | /* Relaxed MO is okay because we just want to undo our registration and |
360 | cannot have changed the rwlock state substantially if the CAS |
361 | succeeds. */ |
362 | if (atomic_compare_exchange_weak_relaxed (&rwlock->__data.__readers, &r, |
363 | r - (1 << PTHREAD_RWLOCK_READER_SHIFT))) |
364 | return EAGAIN; |
365 | } |
366 | |
367 | /* We have registered as a reader, so if we are in a read phase, we have |
368 | acquired a read lock. This is also the reader--reader fast-path. |
369 | Even if there is a primary writer, we just return. If writers are to |
370 | be preferred and we are the only active reader, we could try to enter a |
371 | write phase to let the writer proceed. This would be okay because we |
372 | cannot have acquired the lock previously as a reader (which could result |
373 | in deadlock if we would wait for the primary writer to run). However, |
374 | this seems to be a corner case and handling it specially not be worth the |
375 | complexity. */ |
376 | if (__glibc_likely ((r & PTHREAD_RWLOCK_WRPHASE) == 0)) |
377 | return 0; |
378 | |
379 | /* If there is no primary writer but we are in a write phase, we can try |
380 | to install a read phase ourself. */ |
381 | while (((r & PTHREAD_RWLOCK_WRPHASE) != 0) |
382 | && ((r & PTHREAD_RWLOCK_WRLOCKED) == 0)) |
383 | { |
384 | /* Try to enter a read phase: If the CAS below succeeds, we have |
385 | ownership; if it fails, we will simply retry and reassess the |
386 | situation. |
387 | Acquire MO so we synchronize with prior writers. */ |
388 | if (atomic_compare_exchange_weak_acquire (&rwlock->__data.__readers, &r, |
389 | r ^ PTHREAD_RWLOCK_WRPHASE)) |
390 | { |
391 | /* We started the read phase, so we are also responsible for |
392 | updating the write-phase futex. Relaxed MO is sufficient. |
393 | Note that there can be no other reader that we have to wake |
394 | because all other readers will see the read phase started by us |
395 | (or they will try to start it themselves); if a writer started |
396 | the read phase, we cannot have started it. Furthermore, we |
397 | cannot discard a PTHREAD_RWLOCK_FUTEX_USED flag because we will |
398 | overwrite the value set by the most recent writer (or the readers |
399 | before it in case of explicit hand-over) and we know that there |
400 | are no waiting readers. */ |
401 | atomic_store_relaxed (&rwlock->__data.__wrphase_futex, 0); |
402 | return 0; |
403 | } |
404 | else |
405 | { |
406 | /* TODO Back off before retrying. Also see above. */ |
407 | } |
408 | } |
409 | |
410 | if ((r & PTHREAD_RWLOCK_WRPHASE) != 0) |
411 | { |
412 | /* We are in a write phase, and there must be a primary writer because |
413 | of the previous loop. Block until the primary writer gives up the |
414 | write phase. This case requires explicit hand-over using |
415 | __wrphase_futex. |
416 | However, __wrphase_futex might not have been set to 1 yet (either |
417 | because explicit hand-over to the writer is still ongoing, or because |
418 | the writer has started the write phase but does not yet have updated |
419 | __wrphase_futex). The least recent value of __wrphase_futex we can |
420 | read from here is the modification of the last read phase (because |
421 | we synchronize with the last reader in this read phase through |
422 | __readers; see the use of acquire MO on the fetch_add above). |
423 | Therefore, if we observe a value of 0 for __wrphase_futex, we need |
424 | to subsequently check that __readers now indicates a read phase; we |
425 | need to use acquire MO for this so that if we observe a read phase, |
426 | we will also see the modification of __wrphase_futex by the previous |
427 | writer. We then need to load __wrphase_futex again and continue to |
428 | wait if it is not 0, so that we do not skip explicit hand-over. |
429 | Relaxed MO is sufficient for the load from __wrphase_futex because |
430 | we just use it as an indicator for when we can proceed; we use |
431 | __readers and the acquire MO accesses to it to eventually read from |
432 | the proper stores to __wrphase_futex. */ |
433 | unsigned int wpf; |
434 | bool ready = false; |
435 | for (;;) |
436 | { |
437 | while (((wpf = atomic_load_relaxed (&rwlock->__data.__wrphase_futex)) |
438 | | PTHREAD_RWLOCK_FUTEX_USED) == (1 | PTHREAD_RWLOCK_FUTEX_USED)) |
439 | { |
440 | int private = __pthread_rwlock_get_private (rwlock); |
441 | if (((wpf & PTHREAD_RWLOCK_FUTEX_USED) == 0) |
442 | && !atomic_compare_exchange_weak_relaxed |
443 | (&rwlock->__data.__wrphase_futex, |
444 | &wpf, wpf | PTHREAD_RWLOCK_FUTEX_USED)) |
445 | continue; |
446 | int err = futex_abstimed_wait (&rwlock->__data.__wrphase_futex, |
447 | 1 | PTHREAD_RWLOCK_FUTEX_USED, abstime, private); |
448 | if (err == ETIMEDOUT) |
449 | { |
450 | /* If we timed out, we need to unregister. If no read phase |
451 | has been installed while we waited, we can just decrement |
452 | the number of readers. Otherwise, we just acquire the |
453 | lock, which is allowed because we give no precise timing |
454 | guarantees, and because the timeout is only required to |
455 | be in effect if we would have had to wait for other |
456 | threads (e.g., if futex_wait would time-out immediately |
457 | because the given absolute time is in the past). */ |
458 | r = atomic_load_relaxed (&rwlock->__data.__readers); |
459 | while ((r & PTHREAD_RWLOCK_WRPHASE) != 0) |
460 | { |
461 | /* We don't need to make anything else visible to |
462 | others besides unregistering, so relaxed MO is |
463 | sufficient. */ |
464 | if (atomic_compare_exchange_weak_relaxed |
465 | (&rwlock->__data.__readers, &r, |
466 | r - (1 << PTHREAD_RWLOCK_READER_SHIFT))) |
467 | return ETIMEDOUT; |
468 | /* TODO Back-off. */ |
469 | } |
470 | /* Use the acquire MO fence to mirror the steps taken in the |
471 | non-timeout case. Note that the read can happen both |
472 | in the atomic_load above as well as in the failure case |
473 | of the CAS operation. */ |
474 | atomic_thread_fence_acquire (); |
475 | /* We still need to wait for explicit hand-over, but we must |
476 | not use futex_wait anymore because we would just time out |
477 | in this case and thus make the spin-waiting we need |
478 | unnecessarily expensive. */ |
479 | while ((atomic_load_relaxed (&rwlock->__data.__wrphase_futex) |
480 | | PTHREAD_RWLOCK_FUTEX_USED) |
481 | == (1 | PTHREAD_RWLOCK_FUTEX_USED)) |
482 | { |
483 | /* TODO Back-off? */ |
484 | } |
485 | ready = true; |
486 | break; |
487 | } |
488 | /* If we got interrupted (EINTR) or the futex word does not have the |
489 | expected value (EAGAIN), retry. */ |
490 | } |
491 | if (ready) |
492 | /* See below. */ |
493 | break; |
494 | /* We need acquire MO here so that we synchronize with the lock |
495 | release of the writer, and so that we observe a recent value of |
496 | __wrphase_futex (see below). */ |
497 | if ((atomic_load_acquire (&rwlock->__data.__readers) |
498 | & PTHREAD_RWLOCK_WRPHASE) == 0) |
499 | /* We are in a read phase now, so the least recent modification of |
500 | __wrphase_futex we can read from is the store by the writer |
501 | with value 1. Thus, only now we can assume that if we observe |
502 | a value of 0, explicit hand-over is finished. Retry the loop |
503 | above one more time. */ |
504 | ready = true; |
505 | } |
506 | } |
507 | |
508 | return 0; |
509 | } |
510 | |
511 | |
/* Release a write lock: clear __cur_writer, disable writer futex waiting,
   then either hand WRLOCKED over directly to a registered waiting writer
   (if the rwlock kind prefers writers and __writers is nonzero), or release
   WRLOCKED in __readers -- staying in a write phase if no readers are
   registered, or starting a read phase with explicit hand-over through
   __wrphase_futex if readers are waiting.  See the state table in the
   overview comment at the top of this file.  */
static __always_inline void
__pthread_rwlock_wrunlock (pthread_rwlock_t *rwlock)
{
  int private = __pthread_rwlock_get_private (rwlock);

  /* We no longer own the lock; clear the deadlock-detection TID.  */
  atomic_store_relaxed (&rwlock->__data.__cur_writer, 0);
  /* Disable waiting by writers.  We will wake up after we decided how to
     proceed.  */
  bool wake_writers = ((atomic_exchange_relaxed
      (&rwlock->__data.__writers_futex, 0) & PTHREAD_RWLOCK_FUTEX_USED) != 0);

  if (rwlock->__data.__flags != PTHREAD_RWLOCK_PREFER_READER_NP)
    {
      /* First, try to hand over to another writer.  */
      unsigned int w = atomic_load_relaxed (&rwlock->__data.__writers);
      while (w != 0)
	{
	  /* Release MO so that another writer that gets WRLOCKED from us will
	     synchronize with us and thus can take over our view of
	     __readers (including, for example, whether we are in a write
	     phase or not).  */
	  if (atomic_compare_exchange_weak_release (&rwlock->__data.__writers,
	      &w, w | PTHREAD_RWLOCK_WRHANDOVER))
	    /* Another writer will take over.  */
	    goto done;
	  /* TODO Back-off.  */
	}
    }

  /* We have done everything we needed to do to prefer writers, so now we
     either hand over explicitly to readers if there are any, or we simply
     stay in a write phase.  See pthread_rwlock_rdunlock for more details.  */
  unsigned int r = atomic_load_relaxed (&rwlock->__data.__readers);
  /* Release MO so that subsequent readers or writers synchronize with us.  */
  while (!atomic_compare_exchange_weak_release
      (&rwlock->__data.__readers, &r, (r ^ PTHREAD_RWLOCK_WRLOCKED)
       ^ ((r >> PTHREAD_RWLOCK_READER_SHIFT) == 0 ? 0
	  : PTHREAD_RWLOCK_WRPHASE)))
    {
      /* TODO Back-off.  */
    }
  if ((r >> PTHREAD_RWLOCK_READER_SHIFT) != 0)
    {
      /* We must hand over explicitly through __wrphase_futex.  Relaxed MO is
	 sufficient because it is just used to delay acquisition by a writer;
	 any other synchronizes-with relations that are necessary are
	 established through __readers.  */
      if ((atomic_exchange_relaxed (&rwlock->__data.__wrphase_futex, 0)
	   & PTHREAD_RWLOCK_FUTEX_USED) != 0)
	futex_wake (&rwlock->__data.__wrphase_futex, INT_MAX, private);
    }

 done:
  /* We released WRLOCKED in some way, so wake a writer.  */
  if (wake_writers)
    futex_wake (&rwlock->__data.__writers_futex, 1, private);
}
569 | |
570 | |
/* Acquire RWLOCK for writing.  ABSTIME is either NULL (block until the lock
   has been acquired) or an absolute timeout for the blocking futex waits.
   Returns 0 on success, EDEADLK if the calling thread is already recorded as
   the current writer, or ETIMEDOUT if the timeout expired before the lock
   could be acquired.  On success, the calling thread's TID is stored in
   __cur_writer.

   Overview of the acquisition protocol implemented below:
   (1) Become the "primary writer" by setting WRLOCKED in __readers, either
       directly (fetch_or / CAS) or by taking over WRHANDOVER in __writers
       from a timing-out primary writer; otherwise block on __writers_futex.
   (2) Once primary writer, move the lock into a write phase: either flip
       WRPHASE directly when no readers are present, or wait for an explicit
       hand-over from the last reader through __wrphase_futex.  */
static __always_inline int
__pthread_rwlock_wrlock_full (pthread_rwlock_t *rwlock,
    const struct timespec *abstime)
{
  /* Make sure we are not holding the rwlock as a writer.  This is a deadlock
     situation we recognize and report.  */
  if (__glibc_unlikely (atomic_load_relaxed (&rwlock->__data.__cur_writer)
      == THREAD_GETMEM (THREAD_SELF, tid)))
    return EDEADLK;

  /* First we try to acquire the role of primary writer by setting WRLOCKED;
     if it was set before, there already is a primary writer.  Acquire MO so
     that we synchronize with previous primary writers.

     We do not try to change to a write phase right away using a fetch_or
     because we would have to reset it again and wake readers if there are
     readers present (some readers could try to acquire the lock more than
     once, so setting a write phase in the middle of this could cause
     deadlock).  Changing to a write phase eagerly would only speed up the
     transition from a read phase to a write phase in the uncontended case,
     but it would slow down the contended case if readers are preferred (which
     is the default).
     We could try to CAS from a state with no readers to a write phase, but
     this could be less scalable if readers arrive and leave frequently.  */
  bool may_share_futex_used_flag = false;
  unsigned int r = atomic_fetch_or_acquire (&rwlock->__data.__readers,
      PTHREAD_RWLOCK_WRLOCKED);
  if (__glibc_unlikely ((r & PTHREAD_RWLOCK_WRLOCKED) != 0))
    {
      /* There is another primary writer.  */
      bool prefer_writer =
	  (rwlock->__data.__flags != PTHREAD_RWLOCK_PREFER_READER_NP);
      if (prefer_writer)
	{
	  /* We register as a waiting writer, so that we can make use of
	     writer--writer hand-over.  Relaxed MO is fine because we just
	     want to register.  We assume that the maximum number of threads
	     is less than the capacity in __writers.  */
	  atomic_fetch_add_relaxed (&rwlock->__data.__writers, 1);
	}
      for (;;)
	{
	  /* TODO Spin until WRLOCKED is 0 before trying the CAS below.
	     But pay attention to not delay trying writer--writer hand-over
	     for too long (which we must try eventually anyway).  */
	  if ((r & PTHREAD_RWLOCK_WRLOCKED) == 0)
	    {
	      /* Try to become the primary writer or retry.  Acquire MO as in
		 the fetch_or above.  */
	      if (atomic_compare_exchange_weak_acquire
		  (&rwlock->__data.__readers, &r,
		      r | PTHREAD_RWLOCK_WRLOCKED))
		{
		  if (prefer_writer)
		    {
		      /* Unregister as a waiting writer.  Note that because we
			 acquired WRLOCKED, WRHANDOVER will not be set.
			 Acquire MO on the CAS above ensures that
			 unregistering happens after the previous writer;
			 this sorts the accesses to __writers by all
			 primary writers in a useful way (e.g., any other
			 primary writer acquiring after us or getting it from
			 us through WRHANDOVER will see both our changes to
			 __writers).
			 ??? Perhaps this is not strictly necessary for
			 reasons we do not yet know of.  */
		      atomic_fetch_add_relaxed (&rwlock->__data.__writers,
			  -1);
		    }
		  break;
		}
	      /* Retry if the CAS fails (r will have been updated).  */
	      continue;
	    }
	  /* If writer--writer hand-over is available, try to become the
	     primary writer this way by grabbing the WRHANDOVER token.  If we
	     succeed, we own WRLOCKED.  */
	  if (prefer_writer)
	    {
	      unsigned int w = atomic_load_relaxed
		  (&rwlock->__data.__writers);
	      if ((w & PTHREAD_RWLOCK_WRHANDOVER) != 0)
		{
		  /* Acquire MO is required here so that we synchronize with
		     the writer that handed over WRLOCKED.  We also need this
		     for the reload of __readers below because our view of
		     __readers must be at least as recent as the view of the
		     writer that handed over WRLOCKED; we must avoid an ABA
		     through WRHANDOVER, which could, for example, lead to us
		     assuming we are still in a write phase when in fact we
		     are not.  */
		  if (atomic_compare_exchange_weak_acquire
		      (&rwlock->__data.__writers,
		       &w, (w - PTHREAD_RWLOCK_WRHANDOVER - 1)))
		    {
		      /* Reload so our view is consistent with the view of
			 the previous owner of WRLOCKED.  See above.  */
		      r = atomic_load_relaxed (&rwlock->__data.__readers);
		      break;
		    }
		  /* We do not need to reload __readers here.  We should try
		     to perform writer--writer hand-over if possible; if it
		     is not possible anymore, we will reload __readers
		     elsewhere in this loop.  */
		  continue;
		}
	    }
	  /* We did not acquire WRLOCKED nor were able to use writer--writer
	     hand-over, so we block on __writers_futex.  */
	  int private = __pthread_rwlock_get_private (rwlock);
	  unsigned int wf = atomic_load_relaxed
	      (&rwlock->__data.__writers_futex);
	  if (((wf & ~(unsigned int) PTHREAD_RWLOCK_FUTEX_USED) != 1)
	      || ((wf != (1 | PTHREAD_RWLOCK_FUTEX_USED))
		  && !atomic_compare_exchange_weak_relaxed
		      (&rwlock->__data.__writers_futex, &wf,
		       1 | PTHREAD_RWLOCK_FUTEX_USED)))
	    {
	      /* If we cannot block on __writers_futex because there is no
		 primary writer, or we cannot set PTHREAD_RWLOCK_FUTEX_USED,
		 we retry.  We must reload __readers here in case we cannot
		 block on __writers_futex so that we can become the primary
		 writer and are not stuck in a loop that just continuously
		 fails to block on __writers_futex.  */
	      r = atomic_load_relaxed (&rwlock->__data.__readers);
	      continue;
	    }
	  /* We set the flag that signals that the futex is used, or we could
	     have set it if we had been faster than other waiters.  As a
	     result, we may share the flag with an unknown number of other
	     writers.  Therefore, we must keep this flag set when we acquire
	     the lock.  We do not need to do this when we do not reach this
	     point here because then we are not part of the group that may
	     share the flag, and another writer will wake one of the writers
	     in this group.  */
	  may_share_futex_used_flag = true;
	  int err = futex_abstimed_wait (&rwlock->__data.__writers_futex,
	      1 | PTHREAD_RWLOCK_FUTEX_USED, abstime, private);
	  if (err == ETIMEDOUT)
	    {
	      if (prefer_writer)
		{
		  /* We need to unregister as a waiting writer.  If we are the
		     last writer and writer--writer hand-over is available,
		     we must make use of it because nobody else will reset
		     WRLOCKED otherwise.  (If we use it, we simply pretend
		     that this happened before the timeout; see
		     pthread_rwlock_rdlock_full for the full reasoning.)
		     Also see the similar code above.  */
		  unsigned int w = atomic_load_relaxed
		      (&rwlock->__data.__writers);
		  while (!atomic_compare_exchange_weak_acquire
		      (&rwlock->__data.__writers, &w,
		       (w == PTHREAD_RWLOCK_WRHANDOVER + 1 ? 0 : w - 1)))
		    {
		      /* TODO Back-off.  */
		    }
		  if (w == PTHREAD_RWLOCK_WRHANDOVER + 1)
		    {
		      /* We must continue as primary writer.  See above.  */
		      r = atomic_load_relaxed (&rwlock->__data.__readers);
		      break;
		    }
		}
	      /* We cleaned up and cannot have stolen another waiting writer's
		 futex wake-up, so just return.  */
	      return ETIMEDOUT;
	    }
	  /* If we got interrupted (EINTR) or the futex word does not have the
	     expected value (EAGAIN), retry after reloading __readers.  */
	  r = atomic_load_relaxed (&rwlock->__data.__readers);
	}
      /* Our snapshot of __readers is up-to-date at this point because we
	 either set WRLOCKED using a CAS or were handed over WRLOCKED from
	 another writer whose snapshot of __readers we inherit.  */
    }

  /* If we are in a read phase and there are no readers, try to start a write
     phase.  */
  while (((r & PTHREAD_RWLOCK_WRPHASE) == 0)
      && ((r >> PTHREAD_RWLOCK_READER_SHIFT) == 0))
    {
      /* Acquire MO so that we synchronize with prior writers and do
	 not interfere with their updates to __writers_futex, as well
	 as regarding prior readers and their updates to __wrphase_futex,
	 respectively.  */
      if (atomic_compare_exchange_weak_acquire (&rwlock->__data.__readers,
	  &r, r | PTHREAD_RWLOCK_WRPHASE))
	{
	  /* We have started a write phase, so need to enable readers to wait.
	     See the similar case in __pthread_rwlock_rdlock_full.  */
	  atomic_store_relaxed (&rwlock->__data.__wrphase_futex, 1);
	  /* Make sure we fall through to the end of the function.  */
	  r |= PTHREAD_RWLOCK_WRPHASE;
	  break;
	}
      /* TODO Back-off.  */
    }

  /* We are the primary writer; enable blocking on __writers_futex.  Relaxed
     MO is sufficient for futex words; acquire MO on the previous
     modifications of __readers ensures that this store happens after the
     store of value 0 by the previous primary writer.  */
  atomic_store_relaxed (&rwlock->__data.__writers_futex,
      1 | (may_share_futex_used_flag ? PTHREAD_RWLOCK_FUTEX_USED : 0));

  if (__glibc_unlikely ((r & PTHREAD_RWLOCK_WRPHASE) == 0))
    {
      /* We are still in a read phase and there are readers (because of the
	 previous loop).  Thus, we have to wait for explicit hand-over from
	 one of these readers.
	 We basically do the same steps as for the similar case in
	 __pthread_rwlock_rdlock_full, except that we additionally might try
	 to directly hand over to another writer and need to wake up
	 other writers or waiting readers (i.e., PTHREAD_RWLOCK_RWAITING).  */
      unsigned int wpf;
      bool ready = false;
      for (;;)
	{
	  /* Spin/wait while __wrphase_futex is 0 or just FUTEX_USED, i.e.,
	     while the read phase has not yet been handed over to us.  */
	  while (((wpf = atomic_load_relaxed (&rwlock->__data.__wrphase_futex))
	      | PTHREAD_RWLOCK_FUTEX_USED) == PTHREAD_RWLOCK_FUTEX_USED)
	    {
	      int private = __pthread_rwlock_get_private (rwlock);
	      if (((wpf & PTHREAD_RWLOCK_FUTEX_USED) == 0)
		  && !atomic_compare_exchange_weak_relaxed
		      (&rwlock->__data.__wrphase_futex, &wpf,
		       PTHREAD_RWLOCK_FUTEX_USED))
		continue;
	      int err = futex_abstimed_wait (&rwlock->__data.__wrphase_futex,
		  PTHREAD_RWLOCK_FUTEX_USED, abstime, private);
	      if (err == ETIMEDOUT)
		{
		  if (rwlock->__data.__flags
		      != PTHREAD_RWLOCK_PREFER_READER_NP)
		    {
		      /* We try writer--writer hand-over.  */
		      unsigned int w = atomic_load_relaxed
			  (&rwlock->__data.__writers);
		      if (w != 0)
			{
			  /* We are about to hand over WRLOCKED, so we must
			     release __writers_futex too; otherwise, we'd have
			     a pending store, which could at least prevent
			     other threads from waiting using the futex
			     because it could interleave with the stores
			     by subsequent writers.  In turn, this means that
			     we have to clean up when we do not hand over
			     WRLOCKED.
			     Release MO so that another writer that gets
			     WRLOCKED from us can take over our view of
			     __readers.  */
			  unsigned int wf = atomic_exchange_relaxed
			      (&rwlock->__data.__writers_futex, 0);
			  while (w != 0)
			    {
			      if (atomic_compare_exchange_weak_release
				  (&rwlock->__data.__writers, &w,
				   w | PTHREAD_RWLOCK_WRHANDOVER))
				{
				  /* Wake other writers.  */
				  if ((wf & PTHREAD_RWLOCK_FUTEX_USED) != 0)
				    futex_wake
					(&rwlock->__data.__writers_futex, 1,
					 private);
				  return ETIMEDOUT;
				}
			      /* TODO Back-off.  */
			    }
			  /* We still own WRLOCKED and someone else might set
			     a write phase concurrently, so enable waiting
			     again.  Make sure we don't lose the flag that
			     signals whether there are threads waiting on
			     this futex.  */
			  atomic_store_relaxed
			      (&rwlock->__data.__writers_futex, wf);
			}
		    }
		  /* If we timed out and we are not in a write phase, we can
		     just stop being a primary writer.  Otherwise, we just
		     acquire the lock.  */
		  r = atomic_load_relaxed (&rwlock->__data.__readers);
		  if ((r & PTHREAD_RWLOCK_WRPHASE) == 0)
		    {
		      /* We are about to release WRLOCKED, so we must release
			 __writers_futex too; see the handling of
			 writer--writer hand-over above.  */
		      unsigned int wf = atomic_exchange_relaxed
			  (&rwlock->__data.__writers_futex, 0);
		      while ((r & PTHREAD_RWLOCK_WRPHASE) == 0)
			{
			  /* While we don't need to make anything from a
			     caller's critical section visible to other
			     threads, we need to ensure that our changes to
			     __writers_futex are properly ordered.
			     Therefore, use release MO to synchronize with
			     subsequent primary writers.  Also wake up any
			     waiting readers as they are waiting because of
			     us.  */
			  if (atomic_compare_exchange_weak_release
			      (&rwlock->__data.__readers, &r,
			       (r ^ PTHREAD_RWLOCK_WRLOCKED)
			       & ~(unsigned int) PTHREAD_RWLOCK_RWAITING))
			    {
			      /* Wake other writers.  */
			      if ((wf & PTHREAD_RWLOCK_FUTEX_USED) != 0)
				futex_wake (&rwlock->__data.__writers_futex,
				    1, private);
			      /* Wake waiting readers.  */
			      if ((r & PTHREAD_RWLOCK_RWAITING) != 0)
				futex_wake (&rwlock->__data.__readers,
				    INT_MAX, private);
			      return ETIMEDOUT;
			    }
			}
		      /* We still own WRLOCKED and someone else might set a
			 write phase concurrently, so enable waiting again.
			 Make sure we don't lose the flag that signals
			 whether there are threads waiting on this futex.  */
		      atomic_store_relaxed (&rwlock->__data.__writers_futex,
			  wf);
		    }
		  /* Use the acquire MO fence to mirror the steps taken in the
		     non-timeout case.  Note that the read can happen both
		     in the atomic_load above as well as in the failure case
		     of the CAS operation.  */
		  atomic_thread_fence_acquire ();
		  /* We still need to wait for explicit hand-over, but we must
		     not use futex_wait anymore.  */
		  while ((atomic_load_relaxed
		      (&rwlock->__data.__wrphase_futex)
		       | PTHREAD_RWLOCK_FUTEX_USED)
		      == PTHREAD_RWLOCK_FUTEX_USED)
		    {
		      /* TODO Back-off.  */
		    }
		  ready = true;
		  break;
		}
	      /* If we got interrupted (EINTR) or the futex word does not have
		 the expected value (EAGAIN), retry.  */
	    }
	  /* See pthread_rwlock_rdlock_full.  */
	  if (ready)
	    break;
	  if ((atomic_load_acquire (&rwlock->__data.__readers)
	      & PTHREAD_RWLOCK_WRPHASE) != 0)
	    ready = true;
	}
    }

  /* Record ourselves as the current writer; the deadlock check at the top of
     this function reads this field.  */
  atomic_store_relaxed (&rwlock->__data.__cur_writer,
      THREAD_GETMEM (THREAD_SELF, tid));
  return 0;
}
925 | |