locks_i386.c source code [codebrowser/osfmk/i386/locks_i386.c]

1	/*
2	* Copyright (c) 2000-2012 Apple Inc. All rights reserved.
3	*
4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5	*
6	* This file contains Original Code and/or Modifications of Original Code
7	* as defined in and that are subject to the Apple Public Source License
8	* Version 2.0 (the 'License'). You may not use this file except in
9	* compliance with the License. The rights granted to you under the License
10	* may not be used to create, or enable the creation or redistribution of,
11	* unlawful or unlicensed copies of an Apple operating system, or to
12	* circumvent, violate, or enable the circumvention or violation of, any
13	* terms of an Apple operating system software license agreement.
14	*
15	* Please obtain a copy of the License at
16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
17	*
18	* The Original Code and all software distributed under the License are
19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23	* Please see the License for the specific language governing rights and
24	* limitations under the License.
25	*
26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27	*/
28	/*
29	* @OSF_COPYRIGHT@
30	*/
31	/*
32	* Mach Operating System
33	* Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34	* All Rights Reserved.
35	*
36	* Permission to use, copy, modify and distribute this software and its
37	* documentation is hereby granted, provided that both the copyright
38	* notice and this permission notice appear in all copies of the
39	* software, derivative works or modified versions, and any portions
40	* thereof, and that both notices appear in supporting documentation.
41	*
42	* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43	* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44	* ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45	*
46	* Carnegie Mellon requests users of this software to return to
47	*
48	* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49	* School of Computer Science
50	* Carnegie Mellon University
51	* Pittsburgh PA 15213-3890
52	*
53	* any improvements or extensions that they make and grant Carnegie Mellon
54	* the rights to redistribute these changes.
55	*/
56	/*
57	* File: kern/lock.c
58	* Author: Avadis Tevanian, Jr., Michael Wayne Young
59	* Date: 1985
60	*
61	* Locking primitives implementation
62	*/
63
64	#define ATOMIC_PRIVATE 1
65	#define LOCK_PRIVATE 1
66
67	#include <mach_ldebug.h>
68
69	#include <kern/locks.h>
70	#include <kern/kalloc.h>
71	#include <kern/misc_protos.h>
72	#include <kern/thread.h>
73	#include <kern/processor.h>
74	#include <kern/cpu_data.h>
75	#include <kern/cpu_number.h>
76	#include <kern/sched_prim.h>
77	#include <kern/xpr.h>
78	#include <kern/debug.h>
79	#include <string.h>
80
81	#include <i386/machine_routines.h> /* machine_timeout_suspended() */
82	#include <machine/atomic.h>
83	#include <machine/machine_cpu.h>
84	#include <i386/mp.h>
85	#include <machine/atomic.h>
86	#include <sys/kdebug.h>
87	#include <i386/locks_i386_inlines.h>
88
89	/*
90	* We need only enough declarations from the BSD-side to be able to
91	* test if our probe is active, and to call __dtrace_probe(). Setting
92	* NEED_DTRACE_DEFS gets a local copy of those definitions pulled in.
93	*/
94	#if CONFIG_DTRACE
95	#define NEED_DTRACE_DEFS
96	#include <../bsd/sys/lockstat.h>
97
/* Reader/writer mode flags (only defined when CONFIG_DTRACE is set). */
#define DTRACE_RW_SHARED	0x0	/* reader */
#define DTRACE_RW_EXCL		0x1	/* writer */
#define DTRACE_NO_FLAG		0x0	/* not applicable */
101
102	#endif
103
/*
 * Trace codes for read/write lock activity.  In this file they are combined
 * with DBG_MACH_LOCKS through MACHDBG_CODE() at the KERNEL_DEBUG trace points.
 */

/* Lock operations. */
#define LCK_RW_LCK_EXCLUSIVE_CODE	0x100
#define LCK_RW_LCK_EXCLUSIVE1_CODE	0x101
#define LCK_RW_LCK_SHARED_CODE		0x102
#define LCK_RW_LCK_SH_TO_EX_CODE	0x103
#define LCK_RW_LCK_SH_TO_EX1_CODE	0x104
#define LCK_RW_LCK_EX_TO_SH_CODE	0x105

/* Spin and wait phases of the operations above. */
#define LCK_RW_LCK_EX_WRITER_SPIN_CODE	0x106
#define LCK_RW_LCK_EX_WRITER_WAIT_CODE	0x107
#define LCK_RW_LCK_EX_READER_SPIN_CODE	0x108
#define LCK_RW_LCK_EX_READER_WAIT_CODE	0x109
#define LCK_RW_LCK_SHARED_SPIN_CODE	0x110
#define LCK_RW_LCK_SHARED_WAIT_CODE	0x111
#define LCK_RW_LCK_SH_TO_EX_SPIN_CODE	0x112
#define LCK_RW_LCK_SH_TO_EX_WAIT_CODE	0x113
119
120
/* True when at least one of the lock debugging facilities is compiled in. */
#define ANY_LOCK_DEBUG	(USLOCK_DEBUG || LOCK_DEBUG || MUTEX_DEBUG)

/* Lock option word; starts out as zero. */
unsigned int	LcksOpts = 0;

#if DEVELOPMENT || DEBUG
/* Only present in DEVELOPMENT/DEBUG builds; starts out as zero. */
unsigned int	LckDisablePreemptCheck = 0;
#endif
128
/* Forwards */
130
#if	USLOCK_DEBUG
/*
 *	Perform simple lock checks.
 */
int	uslock_check = 1;		/* consulted by USLOCK_CHECKING() and usld_lock_init() */
int	max_lock_loops = 100000000;
decl_simple_lock_data(extern , printf_lock)
decl_simple_lock_data(extern , panic_lock)
#endif	/* USLOCK_DEBUG */
140
141	extern unsigned int not_in_kdp;
142
/*
 *	We often want to know the addresses of the callers
 *	of the various lock routines.  However, this information
 *	is only used for debugging and statistics.
 *
 *	DECL_PC() declares a local to hold the caller's pc and OBTAIN_PC()
 *	fills it in; both expand to nothing unless lock debugging is enabled.
 */
typedef void	*pc_t;
#define	INVALID_PC	((void *) VM_MAX_KERNEL_ADDRESS)
#define	INVALID_THREAD	((void *) VM_MAX_KERNEL_ADDRESS)
#if	ANY_LOCK_DEBUG
#define	OBTAIN_PC(pc)	((pc) = GET_RETURN_PC())
#define	DECL_PC(pc)	pc_t pc;
#else	/* ANY_LOCK_DEBUG */
#define	DECL_PC(pc)
#ifdef	lint
/*
 *	Eliminate lint complaints about unused local pc variables.
 */
#define	OBTAIN_PC(pc)	++(pc)
#else	/* lint */
#define	OBTAIN_PC(pc)
#endif	/* lint */
#endif	/* ANY_LOCK_DEBUG */
165
166	/*
167	* atomic exchange API is a low level abstraction of the operations
168	* to atomically read, modify, and write a pointer. This abstraction works
169	* for both Intel and ARMv8.1 compare and exchange atomic instructions as
170	* well as the ARM exclusive instructions.
171	*
172	* atomic_exchange_begin() - begin exchange and retrieve current value
173	* atomic_exchange_complete() - conclude an exchange
174	* atomic_exchange_abort() - cancel an exchange started with atomic_exchange_begin()
175	*/
176	static uint32_t
177	atomic_exchange_begin32(uint32_t target, uint32_t previous, enum memory_order ord)
178	{
179	uint32_t val;
180
181	(void)ord; // Memory order not used
182	val = __c11_atomic_load((_Atomic uint32_t *)target, memory_order_relaxed);
183	*previous = val;
184	return val;
185	}
186
187	static boolean_t
188	atomic_exchange_complete32(uint32_t target, uint32_t previous, uint32_t newval, enum* memory_order ord)
189	{
190	return __c11_atomic_compare_exchange_strong((_Atomic uint32_t *)target, &previous, newval, ord, memory_order_relaxed);
191	}
192
/*
 * atomic_exchange_abort: cancel an exchange that was begun but will not be
 * completed.  This implementation has nothing to undo, so the body is empty.
 */
static void
atomic_exchange_abort(void)
{
}
195
196	static boolean_t
197	atomic_test_and_set32(uint32_t target, uint32_t test_mask, uint32_t set_mask, enum* memory_order ord, boolean_t wait)
198	{
199	uint32_t value, prev;
200
201	for ( ; ; ) {
202	value = atomic_exchange_begin32(target, &prev, ord);
203	if (value & test_mask) {
204	if (wait)
205	cpu_pause();
206	else
207	atomic_exchange_abort();
208	return FALSE;
209	}
210	value \|= set_mask;
211	if (atomic_exchange_complete32(target, prev, value, ord))
212	return TRUE;
213	}
214	}
215
/*
 *	Portable lock package implementation of usimple_locks.
 *
 *	USLDBG(stmt) expands to stmt when USLOCK_DEBUG is enabled and to
 *	nothing otherwise, so the usld_* debug hooks vanish from normal builds.
 */

#if	USLOCK_DEBUG
#define	USLDBG(stmt)	stmt
void		usld_lock_init(usimple_lock_t l, unsigned short tag);
void		usld_lock_pre(usimple_lock_t l, pc_t pc);
void		usld_lock_post(usimple_lock_t l, pc_t pc);
void		usld_unlock(usimple_lock_t l, pc_t pc);
void		usld_lock_try_pre(usimple_lock_t l, pc_t pc);
void		usld_lock_try_post(usimple_lock_t l, pc_t pc);
int		usld_lock_common_checks(usimple_lock_t l, char *caller);
#else	/* USLOCK_DEBUG */
#define	USLDBG(stmt)
#endif	/* USLOCK_DEBUG */
232
233	/*
234	* Forward definitions
235	*/
236
237	static void lck_rw_lock_shared_gen(lck_rw_t *lck);
238	static void lck_rw_lock_exclusive_gen(lck_rw_t *lck);
239	static boolean_t lck_rw_lock_shared_to_exclusive_success(lck_rw_t *lck);
240	static boolean_t lck_rw_lock_shared_to_exclusive_failure(lck_rw_t *lck, uint32_t prior_lock_state);
241	static void lck_rw_lock_exclusive_to_shared_gen(lck_rw_t *lck, uint32_t prior_lock_state);
242	static lck_rw_type_t lck_rw_done_gen(lck_rw_t *lck, uint32_t prior_lock_state);
243	void lck_rw_clear_promotions_x86(thread_t thread);
244	static boolean_t lck_rw_held_read_or_upgrade(lck_rw_t *lock);
245	static boolean_t lck_rw_grab_want(lck_rw_t *lock);
246	static boolean_t lck_rw_grab_shared(lck_rw_t *lock);
247	static void lck_mtx_unlock_wakeup_tail(lck_mtx_t mutex, int* prior_lock_state, boolean_t indirect);
248	static void lck_mtx_interlock_lock(lck_mtx_t mutex, uint32_t new_state);
249	static void lck_mtx_interlock_lock_clear_flags(lck_mtx_t mutex, uint32_t and_flags, uint32_t new_state);
250	static int lck_mtx_interlock_try_lock(lck_mtx_t mutex, uint32_t new_state);
251	static int lck_mtx_interlock_try_lock_set_flags(lck_mtx_t mutex, uint32_t or_flags, uint32_t new_state);
252	static boolean_t lck_mtx_lock_wait_interlock_to_clear(lck_mtx_t lock, uint32_t new_state);
253	static boolean_t lck_mtx_try_lock_wait_interlock_to_clear(lck_mtx_t lock, uint32_t new_state);
254
255
256	/*
257	* Routine: lck_spin_alloc_init
258	*/
259	lck_spin_t *
260	lck_spin_alloc_init(
261	lck_grp_t *grp,
262	lck_attr_t *attr)
263	{
264	lck_spin_t *lck;
265
266	if ((lck = (lck_spin_t )kalloc(sizeof*(lck_spin_t))) != `0`)
267	lck_spin_init(lck, grp, attr);
268
269	return(lck);
270	}
271
272	/*
273	* Routine: lck_spin_free
274	*/
275	void
276	lck_spin_free(
277	lck_spin_t *lck,
278	lck_grp_t *grp)
279	{
280	lck_spin_destroy(lck, grp);
281	kfree(lck, sizeof(lck_spin_t));
282	}
283
284	/*
285	* Routine: lck_spin_init
286	*/
287	void
288	lck_spin_init(
289	lck_spin_t *lck,
290	lck_grp_t *grp,
291	__unused lck_attr_t *attr)
292	{
293	usimple_lock_init((usimple_lock_t) lck, `0`);
294	lck_grp_reference(grp);
295	lck_grp_lckcnt_incr(grp, LCK_TYPE_SPIN);
296	}
297
298	/*
299	* Routine: lck_spin_destroy
300	*/
301	void
302	lck_spin_destroy(
303	lck_spin_t *lck,
304	lck_grp_t *grp)
305	{
306	if (lck->interlock == LCK_SPIN_TAG_DESTROYED)
307	return;
308	lck->interlock = LCK_SPIN_TAG_DESTROYED;
309	lck_grp_lckcnt_decr(grp, LCK_TYPE_SPIN);
310	lck_grp_deallocate(grp);
311	return;
312	}
313
314	/*
315	* Routine: lck_spin_lock
316	*/
317	void
318	lck_spin_lock(
319	lck_spin_t *lck)
320	{
321	usimple_lock((usimple_lock_t) lck);
322	}
323
324	/*
325	* Routine: lck_spin_unlock
326	*/
327	void
328	lck_spin_unlock(
329	lck_spin_t *lck)
330	{
331	usimple_unlock((usimple_lock_t) lck);
332	}
333
334
335	/*
336	* Routine: lck_spin_try_lock
337	*/
338	boolean_t
339	lck_spin_try_lock(
340	lck_spin_t *lck)
341	{
342	boolean_t lrval = (boolean_t)usimple_lock_try((usimple_lock_t) lck);
343	#if DEVELOPMENT \|\| DEBUG
344	if (lrval) {
345	pltrace(FALSE);
346	}
347	#endif
348	return(lrval);
349	}
350
351	/*
352	* Routine: lck_spin_assert
353	*/
354	void
355	lck_spin_assert(lck_spin_t lock, unsigned* int type)
356	{
357	thread_t thread, holder;
358	uintptr_t state;
359
360	if (__improbable(type != LCK_ASSERT_OWNED && type != LCK_ASSERT_NOTOWNED)) {
361	panic("lck_spin_assert(): invalid arg (%u)", type);
362	}
363
364	state = lock->interlock;
365	holder = (thread_t)state;
366	thread = current_thread();
367	if (type == LCK_ASSERT_OWNED) {
368	if (__improbable(holder == THREAD_NULL)) {
369	panic("Lock not owned %p = %lx", lock, state);
370	}
371	if (__improbable(holder != thread)) {
372	panic("Lock not owned by current thread %p = %lx", lock, state);
373	}
374	} else if (type == LCK_ASSERT_NOTOWNED) {
375	if (__improbable(holder != THREAD_NULL)) {
376	if (holder == thread) {
377	panic("Lock owned by current thread %p = %lx", lock, state);
378	} else {
379	panic("Lock %p owned by thread %p", lock, holder);
380	}
381	}
382	}
383	}
384
385	/*
386	* Routine: kdp_lck_spin_is_acquired
387	* NOT SAFE: To be used only by kernel debugger to avoid deadlock.
388	* Returns: TRUE if lock is acquired.
389	*/
390	boolean_t
391	kdp_lck_spin_is_acquired(lck_spin_t *lck) {
392	if (not_in_kdp) {
393	panic("panic: spinlock acquired check done outside of kernel debugger");
394	}
395	return (lck->interlock != `0`)? TRUE : FALSE;
396	}
397
398	/*
399	* Initialize a usimple_lock.
400	*
401	* No change in preemption state.
402	*/
403	void
404	usimple_lock_init(
405	usimple_lock_t l,
406	__unused unsigned short tag)
407	{
408	#ifndef MACHINE_SIMPLE_LOCK
409	USLDBG(usld_lock_init(l, tag));
410	hw_lock_init(&l->interlock);
411	#else
412	simple_lock_init((simple_lock_t)l,tag);
413	#endif
414	}
415
416	volatile uint32_t spinlock_owner_cpu = ~`0`;
417	volatile usimple_lock_t spinlock_timed_out;
418
419	uint32_t spinlock_timeout_NMI(uintptr_t thread_addr) {
420	uint32_t i;
421
422	for (i = `0`; i < real_ncpus; i++) {
423	if ((cpu_data_ptr[i] != NULL) && ((uintptr_t)cpu_data_ptr[i]->cpu_active_thread == thread_addr)) {
424	spinlock_owner_cpu = i;
425	if ((uint32_t) cpu_number() != i) {
426	/ Cause NMI and panic on the owner's cpu /
427	NMIPI_panic(cpu_to_cpumask(i), SPINLOCK_TIMEOUT);
428	}
429	break;
430	}
431	}
432
433	return spinlock_owner_cpu;
434	}
435
436	/*
437	* Acquire a usimple_lock.
438	*
439	* Returns with preemption disabled. Note
440	* that the hw_lock routines are responsible for
441	* maintaining preemption state.
442	*/
443	void
444	usimple_lock(
445	usimple_lock_t l)
446	{
447	#ifndef MACHINE_SIMPLE_LOCK
448	DECL_PC(pc);
449
450	OBTAIN_PC(pc);
451	USLDBG(usld_lock_pre(l, pc));
452
453	if(__improbable(hw_lock_to(&l->interlock, LockTimeOutTSC) == `0`)) {
454	boolean_t uslock_acquired = FALSE;
455	while (machine_timeout_suspended()) {
456	enable_preemption();
457	if ((uslock_acquired = hw_lock_to(&l->interlock, LockTimeOutTSC)))
458	break;
459	}
460
461	if (uslock_acquired == FALSE) {
462	uint32_t lock_cpu;
463	uintptr_t lowner = (uintptr_t)l->interlock.lock_data;
464	spinlock_timed_out = l;
465	lock_cpu = spinlock_timeout_NMI(lowner);
466	panic("Spinlock acquisition timed out: lock=%p, lock owner thread=0x%lx, current_thread: %p, lock owner active on CPU 0x%x, current owner: 0x%lx, time: %llu",
467	l, lowner, current_thread(), lock_cpu, (uintptr_t)l->interlock.lock_data, mach_absolute_time());
468	}
469	}
470	#if DEVELOPMENT \|\| DEBUG
471	pltrace(FALSE);
472	#endif
473
474	USLDBG(usld_lock_post(l, pc));
475	#else
476	simple_lock((simple_lock_t)l);
477	#endif
478	#if CONFIG_DTRACE
479	LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, l, `0`);
480	#endif
481	}
482
483
484	/*
485	* Release a usimple_lock.
486	*
487	* Returns with preemption enabled. Note
488	* that the hw_lock routines are responsible for
489	* maintaining preemption state.
490	*/
491	void
492	usimple_unlock(
493	usimple_lock_t l)
494	{
495	#ifndef MACHINE_SIMPLE_LOCK
496	DECL_PC(pc);
497
498	OBTAIN_PC(pc);
499	USLDBG(usld_unlock(l, pc));
500	#if DEVELOPMENT \|\| DEBUG
501	pltrace(TRUE);
502	#endif
503	hw_lock_unlock(&l->interlock);
504	#else
505	simple_unlock_rwmb((simple_lock_t)l);
506	#endif
507	}
508
509
510	/*
511	* Conditionally acquire a usimple_lock.
512	*
513	* On success, returns with preemption disabled.
514	* On failure, returns with preemption in the same state
515	* as when first invoked. Note that the hw_lock routines
516	* are responsible for maintaining preemption state.
517	*
518	* XXX No stats are gathered on a miss; I preserved this
519	* behavior from the original assembly-language code, but
520	* doesn't it make sense to log misses? XXX
521	*/
522	unsigned int
523	usimple_lock_try(
524	usimple_lock_t l)
525	{
526	#ifndef MACHINE_SIMPLE_LOCK
527	unsigned int success;
528	DECL_PC(pc);
529
530	OBTAIN_PC(pc);
531	USLDBG(usld_lock_try_pre(l, pc));
532	if ((success = hw_lock_try(&l->interlock))) {
533	#if DEVELOPMENT \|\| DEBUG
534	pltrace(FALSE);
535	#endif
536	USLDBG(usld_lock_try_post(l, pc));
537	}
538	return success;
539	#else
540	return(simple_lock_try((simple_lock_t)l));
541	#endif
542	}
543
544	/*
545	* Acquire a usimple_lock while polling for pending TLB flushes
546	* and spinning on a lock.
547	*
548	*/
549	void
550	usimple_lock_try_lock_loop(usimple_lock_t l)
551	{
552	boolean_t istate = ml_get_interrupts_enabled();
553	while (!simple_lock_try((l))) {
554	if (!istate)
555	handle_pending_TLB_flushes();
556	cpu_pause();
557	}
558	}
559
560	#if USLOCK_DEBUG
561	/*
562	* States of a usimple_lock. The default when initializing
563	* a usimple_lock is setting it up for debug checking.
564	*/
565	#define USLOCK_CHECKED 0x0001 /* lock is being checked */
566	#define USLOCK_TAKEN 0x0002 /* lock has been taken */
567	#define USLOCK_INIT 0xBAA0 /* lock has been initialized */
568	#define USLOCK_INITIALIZED (USLOCK_INIT\|USLOCK_CHECKED)
569	#define USLOCK_CHECKING(l) (uslock_check && \
570	((l)->debug.state & USLOCK_CHECKED))
571
572	/*
573	* Trace activities of a particularly interesting lock.
574	*/
575	void usl_trace(usimple_lock_t, int, pc_t, const char *);
576
577
578	/*
579	* Initialize the debugging information contained
580	* in a usimple_lock.
581	*/
582	void
583	usld_lock_init(
584	usimple_lock_t l,
585	__unused unsigned short tag)
586	{
587	if (l == USIMPLE_LOCK_NULL)
588	panic("lock initialization: null lock pointer");
589	l->lock_type = USLOCK_TAG;
590	l->debug.state = uslock_check ? USLOCK_INITIALIZED : `0`;
591	l->debug.lock_cpu = l->debug.unlock_cpu = `0`;
592	l->debug.lock_pc = l->debug.unlock_pc = INVALID_PC;
593	l->debug.lock_thread = l->debug.unlock_thread = INVALID_THREAD;
594	l->debug.duration[`0`] = l->debug.duration[`1`] = `0`;
595	l->debug.unlock_cpu = l->debug.unlock_cpu = `0`;
596	l->debug.unlock_pc = l->debug.unlock_pc = INVALID_PC;
597	l->debug.unlock_thread = l->debug.unlock_thread = INVALID_THREAD;
598	}
599
600
601	/*
602	* These checks apply to all usimple_locks, not just
603	* those with USLOCK_CHECKED turned on.
604	*/
605	int
606	usld_lock_common_checks(
607	usimple_lock_t l,
608	char *caller)
609	{
610	if (l == USIMPLE_LOCK_NULL)
611	panic("%s: null lock pointer", caller);
612	if (l->lock_type != USLOCK_TAG)
613	panic("%s: %p is not a usimple lock, 0x%x", caller, l, l->lock_type);
614	if (!(l->debug.state & USLOCK_INIT))
615	panic("%s: %p is not an initialized lock, 0x%x", caller, l, l->debug.state);
616	return USLOCK_CHECKING(l);
617	}
618
619
620	/*
621	* Debug checks on a usimple_lock just before attempting
622	* to acquire it.
623	*/
624	/ ARGSUSED /
625	void
626	usld_lock_pre(
627	usimple_lock_t l,
628	pc_t pc)
629	{
630	char caller[] = "usimple_lock";
631
632
633	if (!usld_lock_common_checks(l, caller))
634	return;
635
636	/*
637	* Note that we have a weird case where we are getting a lock when we are]
638	* in the process of putting the system to sleep. We are running with no
639	* current threads, therefore we can't tell if we are trying to retake a lock
640	* we have or someone on the other processor has it. Therefore we just
641	* ignore this test if the locking thread is 0.
642	*/
643
644	if ((l->debug.state & USLOCK_TAKEN) && l->debug.lock_thread &&
645	l->debug.lock_thread == (void *) current_thread()) {
646	printf("%s: lock %p already locked (at %p) by",
647	caller, l, l->debug.lock_pc);
648	printf(" current thread %p (new attempt at pc %p)\n",
649	l->debug.lock_thread, pc);
650	panic("%s", caller);
651	}
652	mp_disable_preemption();
653	usl_trace(l, cpu_number(), pc, caller);
654	mp_enable_preemption();
655	}
656
657
658	/*
659	* Debug checks on a usimple_lock just after acquiring it.
660	*
661	* Pre-emption has been disabled at this point,
662	* so we are safe in using cpu_number.
663	*/
664	void
665	usld_lock_post(
666	usimple_lock_t l,
667	pc_t pc)
668	{
669	int mycpu;
670	char caller[] = "successful usimple_lock";
671
672
673	if (!usld_lock_common_checks(l, caller))
674	return;
675
676	if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED))
677	panic("%s: lock %p became uninitialized",
678	caller, l);
679	if ((l->debug.state & USLOCK_TAKEN))
680	panic("%s: lock 0x%p became TAKEN by someone else",
681	caller, l);
682
683	mycpu = cpu_number();
684	l->debug.lock_thread = (void *)current_thread();
685	l->debug.state \|= USLOCK_TAKEN;
686	l->debug.lock_pc = pc;
687	l->debug.lock_cpu = mycpu;
688
689	usl_trace(l, mycpu, pc, caller);
690	}
691
692
693	/*
694	* Debug checks on a usimple_lock just before
695	* releasing it. Note that the caller has not
696	* yet released the hardware lock.
697	*
698	* Preemption is still disabled, so there's
699	* no problem using cpu_number.
700	*/
701	void
702	usld_unlock(
703	usimple_lock_t l,
704	pc_t pc)
705	{
706	int mycpu;
707	char caller[] = "usimple_unlock";
708
709
710	if (!usld_lock_common_checks(l, caller))
711	return;
712
713	mycpu = cpu_number();
714
715	if (!(l->debug.state & USLOCK_TAKEN))
716	panic("%s: lock 0x%p hasn't been taken",
717	caller, l);
718	if (l->debug.lock_thread != (void *) current_thread())
719	panic("%s: unlocking lock 0x%p, owned by thread %p",
720	caller, l, l->debug.lock_thread);
721	if (l->debug.lock_cpu != mycpu) {
722	printf("%s: unlocking lock 0x%p on cpu 0x%x",
723	caller, l, mycpu);
724	printf(" (acquired on cpu 0x%x)\n", l->debug.lock_cpu);
725	panic("%s", caller);
726	}
727	usl_trace(l, mycpu, pc, caller);
728
729	l->debug.unlock_thread = l->debug.lock_thread;
730	l->debug.lock_thread = INVALID_PC;
731	l->debug.state &= ~USLOCK_TAKEN;
732	l->debug.unlock_pc = pc;
733	l->debug.unlock_cpu = mycpu;
734	}
735
736
737	/*
738	* Debug checks on a usimple_lock just before
739	* attempting to acquire it.
740	*
741	* Preemption isn't guaranteed to be disabled.
742	*/
743	void
744	usld_lock_try_pre(
745	usimple_lock_t l,
746	pc_t pc)
747	{
748	char caller[] = "usimple_lock_try";
749
750	if (!usld_lock_common_checks(l, caller))
751	return;
752	mp_disable_preemption();
753	usl_trace(l, cpu_number(), pc, caller);
754	mp_enable_preemption();
755	}
756
757
758	/*
759	* Debug checks on a usimple_lock just after
760	* successfully attempting to acquire it.
761	*
762	* Preemption has been disabled by the
763	* lock acquisition attempt, so it's safe
764	* to use cpu_number.
765	*/
766	void
767	usld_lock_try_post(
768	usimple_lock_t l,
769	pc_t pc)
770	{
771	int mycpu;
772	char caller[] = "successful usimple_lock_try";
773
774	if (!usld_lock_common_checks(l, caller))
775	return;
776
777	if (!((l->debug.state & ~USLOCK_TAKEN) == USLOCK_INITIALIZED))
778	panic("%s: lock 0x%p became uninitialized",
779	caller, l);
780	if ((l->debug.state & USLOCK_TAKEN))
781	panic("%s: lock 0x%p became TAKEN by someone else",
782	caller, l);
783
784	mycpu = cpu_number();
785	l->debug.lock_thread = (void *) current_thread();
786	l->debug.state \|= USLOCK_TAKEN;
787	l->debug.lock_pc = pc;
788	l->debug.lock_cpu = mycpu;
789
790	usl_trace(l, mycpu, pc, caller);
791	}
792
793
794	/*
795	* For very special cases, set traced_lock to point to a
796	* specific lock of interest. The result is a series of
797	* XPRs showing lock operations on that lock. The lock_seq
798	* value is used to show the order of those operations.
799	*/
800	usimple_lock_t traced_lock;
801	unsigned int lock_seq;
802
803	void
804	usl_trace(
805	usimple_lock_t l,
806	int mycpu,
807	pc_t pc,
808	const char * op_name)
809	{
810	if (traced_lock == l) {
811	XPR(XPR_SLOCK,
812	"seq %d, cpu %d, %s @ %x\n",
813	(uintptr_t) lock_seq, (uintptr_t) mycpu,
814	(uintptr_t) op_name, (uintptr_t) pc, `0`);
815	lock_seq++;
816	}
817	}
818
819
820	#endif /* USLOCK_DEBUG */
821
822	/*
823	* Routine: lck_rw_alloc_init
824	*/
825	lck_rw_t *
826	lck_rw_alloc_init(
827	lck_grp_t *grp,
828	lck_attr_t *attr) {
829	lck_rw_t *lck;
830
831	if ((lck = (lck_rw_t )kalloc(sizeof*(lck_rw_t))) != `0`) {
832	bzero(lck, sizeof(lck_rw_t));
833	lck_rw_init(lck, grp, attr);
834	}
835
836	return(lck);
837	}
838
839	/*
840	* Routine: lck_rw_free
841	*/
842	void
843	lck_rw_free(
844	lck_rw_t *lck,
845	lck_grp_t *grp) {
846	lck_rw_destroy(lck, grp);
847	kfree(lck, sizeof(lck_rw_t));
848	}
849
850	/*
851	* Routine: lck_rw_init
852	*/
853	void
854	lck_rw_init(
855	lck_rw_t *lck,
856	lck_grp_t *grp,
857	lck_attr_t *attr)
858	{
859	lck_attr_t *lck_attr = (attr != LCK_ATTR_NULL) ?
860	attr : &LockDefaultLckAttr;
861
862	hw_lock_byte_init(&lck->lck_rw_interlock);
863	lck->lck_rw_want_write = FALSE;
864	lck->lck_rw_want_upgrade = FALSE;
865	lck->lck_rw_shared_count = `0`;
866	lck->lck_rw_can_sleep = TRUE;
867	lck->lck_r_waiting = lck->lck_w_waiting = `0`;
868	lck->lck_rw_tag = `0`;
869	lck->lck_rw_priv_excl = ((lck_attr->lck_attr_val &
870	LCK_ATTR_RW_SHARED_PRIORITY) == `0`);
871
872	lck_grp_reference(grp);
873	lck_grp_lckcnt_incr(grp, LCK_TYPE_RW);
874	}
875
876	/*
877	* Routine: lck_rw_destroy
878	*/
879	void
880	lck_rw_destroy(
881	lck_rw_t *lck,
882	lck_grp_t *grp)
883	{
884	if (lck->lck_rw_tag == LCK_RW_TAG_DESTROYED)
885	return;
886	#if MACH_LDEBUG
887	lck_rw_assert(lck, LCK_RW_ASSERT_NOTHELD);
888	#endif
889	lck->lck_rw_tag = LCK_RW_TAG_DESTROYED;
890	lck_grp_lckcnt_decr(grp, LCK_TYPE_RW);
891	lck_grp_deallocate(grp);
892	return;
893	}
894
/*
 *	Sleep locks.
 *
 *	They share the data structure and algorithm of the spin locks, except
 *	that the process sleeps while it waits for the lock.  These work on
 *	uniprocessor systems.
 */

#define	DECREMENTER_TIMEOUT	1000000
902
903	/*
904	* We disable interrupts while holding the RW interlock to prevent an
905	* interrupt from exacerbating hold time.
906	* Hence, local helper functions lck_interlock_lock()/lck_interlock_unlock().
907	*/
908	static inline boolean_t
909	lck_interlock_lock(lck_rw_t *lck)
910	{
911	boolean_t istate;
912
913	istate = ml_set_interrupts_enabled(FALSE);
914	hw_lock_byte_lock(&lck->lck_rw_interlock);
915	return istate;
916	}
917
918	static inline void
919	lck_interlock_unlock(lck_rw_t *lck, boolean_t istate)
920	{
921	hw_lock_byte_unlock(&lck->lck_rw_interlock);
922	ml_set_interrupts_enabled(istate);
923	}
924
925	/*
926	* This inline is used when busy-waiting for an rw lock.
927	* If interrupts were disabled when the lock primitive was called,
928	* we poll the IPI handler for pending tlb flushes.
929	* XXX This is a hack to avoid deadlocking on the pmap_system_lock.
930	*/
931	static inline void
932	lck_rw_lock_pause(boolean_t interrupts_enabled)
933	{
934	if (!interrupts_enabled)
935	handle_pending_TLB_flushes();
936	cpu_pause();
937	}
938
939	static inline boolean_t
940	lck_rw_held_read_or_upgrade(lck_rw_t *lock)
941	{
942	if (ordered_load(&lock->data) & (LCK_RW_SHARED_MASK \| LCK_RW_INTERLOCK \| LCK_RW_WANT_UPGRADE))
943	return TRUE;
944	return FALSE;
945	}
946
947	/*
948	* compute the deadline to spin against when
949	* waiting for a change of state on a lck_rw_t
950	*/
951	static inline uint64_t
952	lck_rw_deadline_for_spin(lck_rw_t *lck)
953	{
954	if (lck->lck_rw_can_sleep) {
955	if (lck->lck_r_waiting \|\| lck->lck_w_waiting \|\| lck->lck_rw_shared_count > machine_info.max_cpus) {
956	/*
957	* there are already threads waiting on this lock... this
958	* implies that they have spun beyond their deadlines waiting for
959	* the desired state to show up so we will not bother spinning at this time...
960	* or
961	* the current number of threads sharing this lock exceeds our capacity to run them
962	* concurrently and since all states we're going to spin for require the rw_shared_count
963	* to be at 0, we'll not bother spinning since the latency for this to happen is
964	* unpredictable...
965	*/
966	return (mach_absolute_time());
967	}
968	return (mach_absolute_time() + MutexSpin);
969	} else
970	return (mach_absolute_time() + (`100000LL` * `1000000000LL`));
971	}
972
973
974	/*
975	* Spin while interlock is held.
976	*/
977
978	static inline void
979	lck_rw_interlock_spin(lck_rw_t *lock)
980	{
981	while (ordered_load(&lock->data) & LCK_RW_INTERLOCK) {
982	cpu_pause();
983	}
984	}
985
986	static boolean_t
987	lck_rw_grab_want(lck_rw_t *lock)
988	{
989	uint32_t data, prev;
990
991	for ( ; ; ) {
992	data = atomic_exchange_begin32(&lock->data, &prev, memory_order_relaxed);
993	if ((data & LCK_RW_INTERLOCK) == `0`)
994	break;
995	atomic_exchange_abort();
996	lck_rw_interlock_spin(lock);
997	}
998	if (data & LCK_RW_WANT_WRITE) {
999	atomic_exchange_abort();
1000	return FALSE;
1001	}
1002	data \|= LCK_RW_WANT_WRITE;
1003	return atomic_exchange_complete32(&lock->data, prev, data, memory_order_relaxed);
1004	}
1005
1006	static boolean_t
1007	lck_rw_grab_shared(lck_rw_t *lock)
1008	{
1009	uint32_t data, prev;
1010
1011	for ( ; ; ) {
1012	data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1013	if ((data & LCK_RW_INTERLOCK) == `0`)
1014	break;
1015	atomic_exchange_abort();
1016	lck_rw_interlock_spin(lock);
1017	}
1018	if (data & (LCK_RW_WANT_WRITE \| LCK_RW_WANT_UPGRADE)) {
1019	if (((data & LCK_RW_SHARED_MASK) == `0`) \|\| (data & LCK_RW_PRIV_EXCL)) {
1020	atomic_exchange_abort();
1021	return FALSE;
1022	}
1023	}
1024	data += LCK_RW_SHARED_READER;
1025	return atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp);
1026	}
1027
1028	/*
1029	* Routine: lck_rw_lock_exclusive
1030	*/
1031	static void
1032	lck_rw_lock_exclusive_gen(
1033	lck_rw_t *lck)
1034	{
1035	__kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
1036	uint64_t deadline = `0`;
1037	int slept = `0`;
1038	int gotlock = `0`;
1039	int lockheld = `0`;
1040	wait_result_t res = `0`;
1041	boolean_t istate = -`1`;
1042
1043	#if CONFIG_DTRACE
1044	boolean_t dtrace_ls_initialized = FALSE;
1045	boolean_t dtrace_rwl_excl_spin, dtrace_rwl_excl_block, dtrace_ls_enabled= FALSE;
1046	uint64_t wait_interval = `0`;
1047	int readers_at_sleep = `0`;
1048	#endif
1049
1050	/*
1051	* Try to acquire the lck_rw_want_write bit.
1052	*/
1053	while ( !lck_rw_grab_want(lck)) {
1054
1055	#if CONFIG_DTRACE
1056	if (dtrace_ls_initialized == FALSE) {
1057	dtrace_ls_initialized = TRUE;
1058	dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != `0`);
1059	dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != `0`);
1060	dtrace_ls_enabled = dtrace_rwl_excl_spin \|\| dtrace_rwl_excl_block;
1061	if (dtrace_ls_enabled) {
1062	/*
1063	* Either sleeping or spinning is happening,
1064	* start a timing of our delay interval now.
1065	*/
1066	readers_at_sleep = lck->lck_rw_shared_count;
1067	wait_interval = mach_absolute_time();
1068	}
1069	}
1070	#endif
1071	if (istate == -`1`)
1072	istate = ml_get_interrupts_enabled();
1073
1074	deadline = lck_rw_deadline_for_spin(lck);
1075
1076	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) \| DBG_FUNC_START, trace_lck, `0`, `0`, `0`, `0`);
1077
1078	while (((gotlock = lck_rw_grab_want(lck)) == `0`) && mach_absolute_time() < deadline)
1079	lck_rw_lock_pause(istate);
1080
1081	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_SPIN_CODE) \| DBG_FUNC_END, trace_lck, `0`, `0`, gotlock, `0`);
1082
1083	if (gotlock)
1084	break;
1085	/*
1086	* if we get here, the deadline has expired w/o us
1087	* being able to grab the lock exclusively
1088	* check to see if we're allowed to do a thread_block
1089	*/
1090	if (lck->lck_rw_can_sleep) {
1091
1092	istate = lck_interlock_lock(lck);
1093
1094	if (lck->lck_rw_want_write) {
1095
1096	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) \| DBG_FUNC_START, trace_lck, `0`, `0`, `0`, `0`);
1097
1098	lck->lck_w_waiting = TRUE;
1099
1100	thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
1101	res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
1102	THREAD_UNINT \| THREAD_WAIT_NOREPORT_USER);
1103	lck_interlock_unlock(lck, istate);
1104
1105	if (res == THREAD_WAITING) {
1106	res = thread_block(THREAD_CONTINUE_NULL);
1107	slept++;
1108	}
1109	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_WRITER_WAIT_CODE) \| DBG_FUNC_END, trace_lck, res, slept, `0`, `0`);
1110	} else {
1111	lck->lck_rw_want_write = TRUE;
1112	lck_interlock_unlock(lck, istate);
1113	break;
1114	}
1115	}
1116	}
1117	/*
1118	* Wait for readers (and upgrades) to finish...
1119	* the test for these conditions must be done simultaneously with
1120	* a check of the interlock not being held since
1121	* the rw_shared_count will drop to 0 first and then want_upgrade
1122	* will be set to 1 in the shared_to_exclusive scenario... those
1123	* adjustments are done behind the interlock and represent an
1124	* atomic change in state and must be considered as such
1125	* however, once we see the read count at 0, the want_upgrade not set
1126	* and the interlock not held, we are safe to proceed
1127	*/
1128	while (lck_rw_held_read_or_upgrade(lck)) {
1129
1130	#if CONFIG_DTRACE
1131	/*
1132	* Either sleeping or spinning is happening, start
1133	* a timing of our delay interval now. If we set it
1134	* to -1 we don't have accurate data so we cannot later
1135	* decide to record a dtrace spin or sleep event.
1136	*/
1137	if (dtrace_ls_initialized == FALSE) {
1138	dtrace_ls_initialized = TRUE;
1139	dtrace_rwl_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_SPIN] != `0`);
1140	dtrace_rwl_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_EXCL_BLOCK] != `0`);
1141	dtrace_ls_enabled = dtrace_rwl_excl_spin \|\| dtrace_rwl_excl_block;
1142	if (dtrace_ls_enabled) {
1143	/*
1144	* Either sleeping or spinning is happening,
1145	* start a timing of our delay interval now.
1146	*/
1147	readers_at_sleep = lck->lck_rw_shared_count;
1148	wait_interval = mach_absolute_time();
1149	}
1150	}
1151	#endif
1152	if (istate == -`1`)
1153	istate = ml_get_interrupts_enabled();
1154
1155	deadline = lck_rw_deadline_for_spin(lck);
1156
1157	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) \| DBG_FUNC_START, trace_lck, `0`, `0`, `0`, `0`);
1158
1159	while ((lockheld = lck_rw_held_read_or_upgrade(lck)) && mach_absolute_time() < deadline)
1160	lck_rw_lock_pause(istate);
1161
1162	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_SPIN_CODE) \| DBG_FUNC_END, trace_lck, `0`, `0`, lockheld, `0`);
1163
1164	if ( !lockheld)
1165	break;
1166	/*
1167	* if we get here, the deadline has expired w/o us
1168	* being able to grab the lock exclusively
1169	* check to see if we're allowed to do a thread_block
1170	*/
1171	if (lck->lck_rw_can_sleep) {
1172
1173	istate = lck_interlock_lock(lck);
1174
1175	if (lck->lck_rw_shared_count != `0` \|\| lck->lck_rw_want_upgrade) {
1176	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) \| DBG_FUNC_START, trace_lck, `0`, `0`, `0`, `0`);
1177
1178	lck->lck_w_waiting = TRUE;
1179
1180	thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
1181	res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
1182	THREAD_UNINT \| THREAD_WAIT_NOREPORT_USER);
1183	lck_interlock_unlock(lck, istate);
1184
1185	if (res == THREAD_WAITING) {
1186	res = thread_block(THREAD_CONTINUE_NULL);
1187	slept++;
1188	}
1189	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_READER_WAIT_CODE) \| DBG_FUNC_END, trace_lck, res, slept, `0`, `0`);
1190	} else {
1191	lck_interlock_unlock(lck, istate);
1192	/*
1193	* must own the lock now, since we checked for
1194	* readers or upgrade owner behind the interlock
1195	* no need for a call to 'lck_rw_held_read_or_upgrade'
1196	*/
1197	break;
1198	}
1199	}
1200	}
1201
1202	#if CONFIG_DTRACE
1203	/*
1204	* Decide what latencies we suffered that are Dtrace events.
1205	* If we have set wait_interval, then we either spun or slept.
1206	* At least we get out from under the interlock before we record
1207	* which is the best we can do here to minimize the impact
1208	* of the tracing.
1209	* If we have set wait_interval to -1, then dtrace was not enabled when we
1210	* started sleeping/spinning so we don't record this event.
1211	*/
1212	if (dtrace_ls_enabled == TRUE) {
1213	if (slept == `0`) {
1214	LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_EXCL_SPIN, lck,
1215	mach_absolute_time() - wait_interval, `1`);
1216	} else {
1217	/*
1218	* For the blocking case, we also record if when we blocked
1219	* it was held for read or write, and how many readers.
1220	* Notice that above we recorded this before we dropped
1221	* the interlock so the count is accurate.
1222	*/
1223	LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_EXCL_BLOCK, lck,
1224	mach_absolute_time() - wait_interval, `1`,
1225	(readers_at_sleep == `0` ? `1` : `0`), readers_at_sleep);
1226	}
1227	}
1228	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lck, `1`);
1229	#endif
1230	}
1231
1232	/*
1233	* Routine: lck_rw_done
1234	*/
1235
1236	lck_rw_type_t lck_rw_done(lck_rw_t *lock)
1237	{
1238	uint32_t data, prev;
1239
1240	for ( ; ; ) {
1241	data = atomic_exchange_begin32(&lock->data, &prev, memory_order_release_smp);
1242	if (data & LCK_RW_INTERLOCK) { / wait for interlock to clear /
1243	atomic_exchange_abort();
1244	lck_rw_interlock_spin(lock);
1245	continue;
1246	}
1247	if (data & LCK_RW_SHARED_MASK) {
1248	data -= LCK_RW_SHARED_READER;
1249	if ((data & LCK_RW_SHARED_MASK) == `0`) / if reader count has now gone to 0, check for waiters /
1250	goto check_waiters;
1251	} else { / if reader count == 0, must be exclusive lock /
1252	if (data & LCK_RW_WANT_UPGRADE) {
1253	data &= ~(LCK_RW_WANT_UPGRADE);
1254	} else {
1255	if (data & LCK_RW_WANT_WRITE)
1256	data &= ~(LCK_RW_WANT_EXCL);
1257	else / lock is not 'owned', panic /
1258	panic("Releasing non-exclusive RW lock without a reader refcount!");
1259	}
1260	check_waiters:
1261	if (prev & LCK_RW_W_WAITING) {
1262	data &= ~(LCK_RW_W_WAITING);
1263	if ((prev & LCK_RW_PRIV_EXCL) == `0`)
1264	data &= ~(LCK_RW_R_WAITING);
1265	} else
1266	data &= ~(LCK_RW_R_WAITING);
1267	}
1268	if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_release_smp))
1269	break;
1270	cpu_pause();
1271	}
1272	return lck_rw_done_gen(lock, prev);
1273	}
1274
1275	/*
1276	* Routine: lck_rw_done_gen
1277	*
1278	* called from lck_rw_done()
1279	* prior_lock_state is the value in the 1st
1280	* word of the lock at the time of a successful
1281	* atomic compare and exchange with the new value...
1282	* it represents the state of the lock before we
1283	* decremented the rw_shared_count or cleared either
1284	* rw_want_upgrade or rw_want_write and
1285	* the lck_x_waiting bits... since the wrapper
1286	* routine has already changed the state atomically,
1287	* we just need to decide if we should
1288	* wake up anyone and what value to return... we do
1289	* this by examining the state of the lock before
1290	* we changed it
1291	*/
1292	static lck_rw_type_t
1293	lck_rw_done_gen(
1294	lck_rw_t *lck,
1295	uint32_t prior_lock_state)
1296	{
1297	lck_rw_t *fake_lck;
1298	lck_rw_type_t lock_type;
1299	thread_t thread;
1300	uint32_t rwlock_count;
1301
1302	/*
1303	* prior_lock state is a snapshot of the 1st word of the
1304	* lock in question... we'll fake up a pointer to it
1305	* and carefully not access anything beyond whats defined
1306	* in the first word of a lck_rw_t
1307	*/
1308	fake_lck = (lck_rw_t *)&prior_lock_state;
1309
1310	if (fake_lck->lck_rw_shared_count <= `1`) {
1311	if (fake_lck->lck_w_waiting)
1312	thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
1313
1314	if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting)
1315	thread_wakeup(RW_LOCK_READER_EVENT(lck));
1316	}
1317	if (fake_lck->lck_rw_shared_count)
1318	lock_type = LCK_RW_TYPE_SHARED;
1319	else
1320	lock_type = LCK_RW_TYPE_EXCLUSIVE;
1321
1322	/ Check if dropping the lock means that we need to unpromote /
1323	thread = current_thread();
1324	rwlock_count = thread->rwlock_count--;
1325	#if MACH_LDEBUG
1326	if (rwlock_count == `0`) {
1327	panic("rw lock count underflow for thread %p", thread);
1328	}
1329	#endif
1330	if ((rwlock_count == `1` / field now 0 /) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1331	/ sched_flags checked without lock, but will be rechecked while clearing /
1332	lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
1333	}
1334
1335	#if CONFIG_DTRACE
1336	LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? `0` : `1`);
1337	#endif
1338
1339	return(lock_type);
1340	}
1341
1342
1343	/*
1344	* Routine: lck_rw_unlock
1345	*/
1346	void
1347	lck_rw_unlock(
1348	lck_rw_t *lck,
1349	lck_rw_type_t lck_rw_type)
1350	{
1351	if (lck_rw_type == LCK_RW_TYPE_SHARED)
1352	lck_rw_unlock_shared(lck);
1353	else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1354	lck_rw_unlock_exclusive(lck);
1355	else
1356	panic("lck_rw_unlock(): Invalid RW lock type: %d\n", lck_rw_type);
1357	}
1358
1359
1360	/*
1361	* Routine: lck_rw_unlock_shared
1362	*/
1363	void
1364	lck_rw_unlock_shared(
1365	lck_rw_t *lck)
1366	{
1367	lck_rw_type_t ret;
1368
1369	assertf(lck->lck_rw_shared_count > `0`, "lck %p has shared_count=0x%x", lck, lck->lck_rw_shared_count);
1370	ret = lck_rw_done(lck);
1371
1372	if (ret != LCK_RW_TYPE_SHARED)
1373	panic("lck_rw_unlock_shared(): lock %p held in mode: %d\n", lck, ret);
1374	}
1375
1376
1377	/*
1378	* Routine: lck_rw_unlock_exclusive
1379	*/
1380	void
1381	lck_rw_unlock_exclusive(
1382	lck_rw_t *lck)
1383	{
1384	lck_rw_type_t ret;
1385
1386	ret = lck_rw_done(lck);
1387
1388	if (ret != LCK_RW_TYPE_EXCLUSIVE)
1389	panic("lck_rw_unlock_exclusive(): lock held in mode: %d\n", ret);
1390	}
1391
1392
1393	/*
1394	* Routine: lck_rw_lock
1395	*/
1396	void
1397	lck_rw_lock(
1398	lck_rw_t *lck,
1399	lck_rw_type_t lck_rw_type)
1400	{
1401	if (lck_rw_type == LCK_RW_TYPE_SHARED)
1402	lck_rw_lock_shared(lck);
1403	else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1404	lck_rw_lock_exclusive(lck);
1405	else
1406	panic("lck_rw_lock(): Invalid RW lock type: %x\n", lck_rw_type);
1407	}
1408
1409	/*
1410	* Routine: lck_rw_lock_shared
1411	*/
1412	void
1413	lck_rw_lock_shared(lck_rw_t *lock)
1414	{
1415	uint32_t data, prev;
1416
1417	current_thread()->rwlock_count++;
1418	for ( ; ; ) {
1419	data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1420	if (data & (LCK_RW_WANT_EXCL \| LCK_RW_WANT_UPGRADE \| LCK_RW_INTERLOCK)) {
1421	atomic_exchange_abort();
1422	lck_rw_lock_shared_gen(lock);
1423	break;
1424	}
1425	data += LCK_RW_SHARED_READER;
1426	if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp))
1427	break;
1428	cpu_pause();
1429	}
1430	#if CONFIG_DTRACE
1431	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
1432	#endif /* CONFIG_DTRACE */
1433	return;
1434	}
1435
1436	/*
1437	* Routine: lck_rw_lock_shared_gen
1438	* Function:
1439	* assembly fast path code has determined that this lock
1440	* is held exclusively... this is where we spin/block
1441	* until we can acquire the lock in the shared mode
1442	*/
1443	static void
1444	lck_rw_lock_shared_gen(
1445	lck_rw_t *lck)
1446	{
1447	__kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
1448	uint64_t deadline = `0`;
1449	int gotlock = `0`;
1450	int slept = `0`;
1451	wait_result_t res = `0`;
1452	boolean_t istate = -`1`;
1453
1454	#if CONFIG_DTRACE
1455	uint64_t wait_interval = `0`;
1456	int readers_at_sleep = `0`;
1457	boolean_t dtrace_ls_initialized = FALSE;
1458	boolean_t dtrace_rwl_shared_spin, dtrace_rwl_shared_block, dtrace_ls_enabled = FALSE;
1459	#endif
1460
1461	while ( !lck_rw_grab_shared(lck)) {
1462
1463	#if CONFIG_DTRACE
1464	if (dtrace_ls_initialized == FALSE) {
1465	dtrace_ls_initialized = TRUE;
1466	dtrace_rwl_shared_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_SPIN] != `0`);
1467	dtrace_rwl_shared_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_BLOCK] != `0`);
1468	dtrace_ls_enabled = dtrace_rwl_shared_spin \|\| dtrace_rwl_shared_block;
1469	if (dtrace_ls_enabled) {
1470	/*
1471	* Either sleeping or spinning is happening,
1472	* start a timing of our delay interval now.
1473	*/
1474	readers_at_sleep = lck->lck_rw_shared_count;
1475	wait_interval = mach_absolute_time();
1476	}
1477	}
1478	#endif
1479	if (istate == -`1`)
1480	istate = ml_get_interrupts_enabled();
1481
1482	deadline = lck_rw_deadline_for_spin(lck);
1483
1484	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) \| DBG_FUNC_START,
1485	trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, `0`, `0`);
1486
1487	while (((gotlock = lck_rw_grab_shared(lck)) == `0`) && mach_absolute_time() < deadline)
1488	lck_rw_lock_pause(istate);
1489
1490	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_SPIN_CODE) \| DBG_FUNC_END,
1491	trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, gotlock, `0`);
1492
1493	if (gotlock)
1494	break;
1495	/*
1496	* if we get here, the deadline has expired w/o us
1497	* being able to grab the lock for read
1498	* check to see if we're allowed to do a thread_block
1499	*/
1500	if (lck->lck_rw_can_sleep) {
1501
1502	istate = lck_interlock_lock(lck);
1503
1504	if ((lck->lck_rw_want_write \|\| lck->lck_rw_want_upgrade) &&
1505	((lck->lck_rw_shared_count == `0`) \|\| lck->lck_rw_priv_excl)) {
1506
1507	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) \| DBG_FUNC_START,
1508	trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, `0`, `0`);
1509
1510	lck->lck_r_waiting = TRUE;
1511
1512	thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockRead);
1513	res = assert_wait(RW_LOCK_READER_EVENT(lck),
1514	THREAD_UNINT \| THREAD_WAIT_NOREPORT_USER);
1515	lck_interlock_unlock(lck, istate);
1516
1517	if (res == THREAD_WAITING) {
1518	res = thread_block(THREAD_CONTINUE_NULL);
1519	slept++;
1520	}
1521	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SHARED_WAIT_CODE) \| DBG_FUNC_END,
1522	trace_lck, res, slept, `0`, `0`);
1523	} else {
1524	lck->lck_rw_shared_count++;
1525	lck_interlock_unlock(lck, istate);
1526	break;
1527	}
1528	}
1529	}
1530
1531	#if CONFIG_DTRACE
1532	if (dtrace_ls_enabled == TRUE) {
1533	if (slept == `0`) {
1534	LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_SPIN, lck, mach_absolute_time() - wait_interval, `0`);
1535	} else {
1536	LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_BLOCK, lck,
1537	mach_absolute_time() - wait_interval, `0`,
1538	(readers_at_sleep == `0` ? `1` : `0`), readers_at_sleep);
1539	}
1540	}
1541	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_ACQUIRE, lck, `0`);
1542	#endif
1543	}
1544
1545
1546	/*
1547	* Routine: lck_rw_lock_exclusive
1548	*/
1549
1550	void
1551	lck_rw_lock_exclusive(lck_rw_t *lock)
1552	{
1553	current_thread()->rwlock_count++;
1554	if (atomic_test_and_set32(&lock->data,
1555	(LCK_RW_SHARED_MASK \| LCK_RW_WANT_EXCL \| LCK_RW_WANT_UPGRADE \| LCK_RW_INTERLOCK),
1556	LCK_RW_WANT_EXCL, memory_order_acquire_smp, FALSE)) {
1557	#if CONFIG_DTRACE
1558	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
1559	#endif /* CONFIG_DTRACE */
1560	} else
1561	lck_rw_lock_exclusive_gen(lock);
1562	}
1563
1564
1565	/*
1566	* Routine: lck_rw_lock_shared_to_exclusive
1567	*/
1568
1569	boolean_t
1570	lck_rw_lock_shared_to_exclusive(lck_rw_t *lock)
1571	{
1572	uint32_t data, prev;
1573
1574	for ( ; ; ) {
1575	data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1576	if (data & LCK_RW_INTERLOCK) {
1577	atomic_exchange_abort();
1578	lck_rw_interlock_spin(lock);
1579	continue;
1580	}
1581	if (data & LCK_RW_WANT_UPGRADE) {
1582	data -= LCK_RW_SHARED_READER;
1583	if ((data & LCK_RW_SHARED_MASK) == `0`) / we were the last reader /
1584	data &= ~(LCK_RW_W_WAITING); / so clear the wait indicator /
1585	if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp))
1586	return lck_rw_lock_shared_to_exclusive_failure(lock, prev);
1587	} else {
1588	data \|= LCK_RW_WANT_UPGRADE; / ask for WANT_UPGRADE /
1589	data -= LCK_RW_SHARED_READER; / and shed our read count /
1590	if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp))
1591	break;
1592	}
1593	cpu_pause();
1594	}
1595	/ we now own the WANT_UPGRADE /
1596	if (data & LCK_RW_SHARED_MASK) / check to see if all of the readers are drained /
1597	lck_rw_lock_shared_to_exclusive_success(lock); / if not, we need to go wait /
1598	#if CONFIG_DTRACE
1599	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lock, `0`);
1600	#endif
1601	return TRUE;
1602	}
1603
1604
1605	/*
1606	* Routine: lck_rw_lock_shared_to_exclusive_failure
1607	* Function:
1608	* assembly fast path code has already dropped our read
1609	* count and determined that someone else owns 'lck_rw_want_upgrade'
1610	* if 'lck_rw_shared_count' == 0, its also already dropped 'lck_w_waiting'
1611	* all we need to do here is determine if a wakeup is needed
1612	*/
1613	static boolean_t
1614	lck_rw_lock_shared_to_exclusive_failure(
1615	lck_rw_t *lck,
1616	uint32_t prior_lock_state)
1617	{
1618	lck_rw_t *fake_lck;
1619	thread_t thread = current_thread();
1620	uint32_t rwlock_count;
1621
1622	/ Check if dropping the lock means that we need to unpromote /
1623	rwlock_count = thread->rwlock_count--;
1624	#if MACH_LDEBUG
1625	if (rwlock_count == `0`) {
1626	panic("rw lock count underflow for thread %p", thread);
1627	}
1628	#endif
1629	fake_lck = (lck_rw_t *)&prior_lock_state;
1630
1631	if (fake_lck->lck_w_waiting && fake_lck->lck_rw_shared_count == `1`) {
1632	/*
1633	* Someone else has requested upgrade.
1634	* Since we've released the read lock, wake
1635	* him up if he's blocked waiting
1636	*/
1637	thread_wakeup(RW_LOCK_WRITER_EVENT(lck));
1638	}
1639
1640	if ((rwlock_count == `1` / field now 0 /) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
1641	/ sched_flags checked without lock, but will be rechecked while clearing /
1642	lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
1643	}
1644
1645	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) \| DBG_FUNC_NONE,
1646	VM_KERNEL_UNSLIDE_OR_PERM(lck), lck->lck_rw_shared_count, lck->lck_rw_want_upgrade, `0`, `0`);
1647
1648	return (FALSE);
1649	}
1650
1651
1652	/*
1653	* Routine: lck_rw_lock_shared_to_exclusive_failure
1654	* Function:
1655	* assembly fast path code has already dropped our read
1656	* count and successfully acquired 'lck_rw_want_upgrade'
1657	* we just need to wait for the rest of the readers to drain
1658	* and then we can return as the exclusive holder of this lock
1659	*/
1660	static boolean_t
1661	lck_rw_lock_shared_to_exclusive_success(
1662	lck_rw_t *lck)
1663	{
1664	__kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
1665	uint64_t deadline = `0`;
1666	int slept = `0`;
1667	int still_shared = `0`;
1668	wait_result_t res;
1669	boolean_t istate = -`1`;
1670
1671	#if CONFIG_DTRACE
1672	uint64_t wait_interval = `0`;
1673	int readers_at_sleep = `0`;
1674	boolean_t dtrace_ls_initialized = FALSE;
1675	boolean_t dtrace_rwl_shared_to_excl_spin, dtrace_rwl_shared_to_excl_block, dtrace_ls_enabled = FALSE;
1676	#endif
1677
1678	while (lck->lck_rw_shared_count != `0`) {
1679
1680	#if CONFIG_DTRACE
1681	if (dtrace_ls_initialized == FALSE) {
1682	dtrace_ls_initialized = TRUE;
1683	dtrace_rwl_shared_to_excl_spin = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN] != `0`);
1684	dtrace_rwl_shared_to_excl_block = (lockstat_probemap[LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK] != `0`);
1685	dtrace_ls_enabled = dtrace_rwl_shared_to_excl_spin \|\| dtrace_rwl_shared_to_excl_block;
1686	if (dtrace_ls_enabled) {
1687	/*
1688	* Either sleeping or spinning is happening,
1689	* start a timing of our delay interval now.
1690	*/
1691	readers_at_sleep = lck->lck_rw_shared_count;
1692	wait_interval = mach_absolute_time();
1693	}
1694	}
1695	#endif
1696	if (istate == -`1`)
1697	istate = ml_get_interrupts_enabled();
1698
1699	deadline = lck_rw_deadline_for_spin(lck);
1700
1701	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) \| DBG_FUNC_START,
1702	trace_lck, lck->lck_rw_shared_count, `0`, `0`, `0`);
1703
1704	while ((still_shared = lck->lck_rw_shared_count) && mach_absolute_time() < deadline)
1705	lck_rw_lock_pause(istate);
1706
1707	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_SPIN_CODE) \| DBG_FUNC_END,
1708	trace_lck, lck->lck_rw_shared_count, `0`, `0`, `0`);
1709
1710	if ( !still_shared)
1711	break;
1712	/*
1713	* if we get here, the deadline has expired w/o
1714	* the rw_shared_count having drained to 0
1715	* check to see if we're allowed to do a thread_block
1716	*/
1717	if (lck->lck_rw_can_sleep) {
1718
1719	istate = lck_interlock_lock(lck);
1720
1721	if (lck->lck_rw_shared_count != `0`) {
1722	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) \| DBG_FUNC_START,
1723	trace_lck, lck->lck_rw_shared_count, `0`, `0`, `0`);
1724
1725	lck->lck_w_waiting = TRUE;
1726
1727	thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockUpgrade);
1728	res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
1729	THREAD_UNINT \| THREAD_WAIT_NOREPORT_USER);
1730	lck_interlock_unlock(lck, istate);
1731
1732	if (res == THREAD_WAITING) {
1733	res = thread_block(THREAD_CONTINUE_NULL);
1734	slept++;
1735	}
1736	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_WAIT_CODE) \| DBG_FUNC_END,
1737	trace_lck, res, slept, `0`, `0`);
1738	} else {
1739	lck_interlock_unlock(lck, istate);
1740	break;
1741	}
1742	}
1743	}
1744	#if CONFIG_DTRACE
1745	/*
1746	* We infer whether we took the sleep/spin path above by checking readers_at_sleep.
1747	*/
1748	if (dtrace_ls_enabled == TRUE) {
1749	if (slept == `0`) {
1750	LOCKSTAT_RECORD2(LS_LCK_RW_LOCK_SHARED_TO_EXCL_SPIN, lck, mach_absolute_time() - wait_interval, `0`);
1751	} else {
1752	LOCKSTAT_RECORD4(LS_LCK_RW_LOCK_SHARED_TO_EXCL_BLOCK, lck,
1753	mach_absolute_time() - wait_interval, `1`,
1754	(readers_at_sleep == `0` ? `1` : `0`), readers_at_sleep);
1755	}
1756	}
1757	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_SHARED_TO_EXCL_UPGRADE, lck, `1`);
1758	#endif
1759	return (TRUE);
1760	}
1761
1762	/*
1763	* Routine: lck_rw_lock_exclusive_to_shared
1764	*/
1765
1766	void lck_rw_lock_exclusive_to_shared(lck_rw_t *lock)
1767	{
1768	uint32_t data, prev;
1769
1770	for ( ; ; ) {
1771	data = atomic_exchange_begin32(&lock->data, &prev, memory_order_release_smp);
1772	if (data & LCK_RW_INTERLOCK) {
1773	atomic_exchange_abort();
1774	lck_rw_interlock_spin(lock); / wait for interlock to clear /
1775	continue;
1776	}
1777	data += LCK_RW_SHARED_READER;
1778	if (data & LCK_RW_WANT_UPGRADE)
1779	data &= ~(LCK_RW_WANT_UPGRADE);
1780	else
1781	data &= ~(LCK_RW_WANT_EXCL);
1782	if (!((prev & LCK_RW_W_WAITING) && (prev & LCK_RW_PRIV_EXCL)))
1783	data &= ~(LCK_RW_W_WAITING);
1784	if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_release_smp))
1785	break;
1786	cpu_pause();
1787	}
1788	return lck_rw_lock_exclusive_to_shared_gen(lock, prev);
1789	}
1790
1791
1792	/*
1793	* Routine: lck_rw_lock_exclusive_to_shared_gen
1794	* Function:
1795	* assembly fast path has already dropped
1796	* our exclusive state and bumped lck_rw_shared_count
1797	* all we need to do here is determine if anyone
1798	* needs to be awakened.
1799	*/
1800	static void
1801	lck_rw_lock_exclusive_to_shared_gen(
1802	lck_rw_t *lck,
1803	uint32_t prior_lock_state)
1804	{
1805	__kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
1806	lck_rw_t *fake_lck;
1807
1808	fake_lck = (lck_rw_t *)&prior_lock_state;
1809
1810	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) \| DBG_FUNC_START,
1811	trace_lck, fake_lck->lck_rw_want_write, fake_lck->lck_rw_want_upgrade, `0`, `0`);
1812
1813	/*
1814	* don't wake up anyone waiting to take the lock exclusively
1815	* since we hold a read count... when the read count drops to 0,
1816	* the writers will be woken.
1817	*
1818	* wake up any waiting readers if we don't have any writers waiting,
1819	* or the lock is NOT marked as rw_priv_excl (writers have privilege)
1820	*/
1821	if (!(fake_lck->lck_rw_priv_excl && fake_lck->lck_w_waiting) && fake_lck->lck_r_waiting)
1822	thread_wakeup(RW_LOCK_READER_EVENT(lck));
1823
1824	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_EX_TO_SH_CODE) \| DBG_FUNC_END,
1825	trace_lck, lck->lck_rw_want_write, lck->lck_rw_want_upgrade, lck->lck_rw_shared_count, `0`);
1826
1827	#if CONFIG_DTRACE
1828	LOCKSTAT_RECORD(LS_LCK_RW_LOCK_EXCL_TO_SHARED_DOWNGRADE, lck, `0`);
1829	#endif
1830	}
1831
1832
1833	/*
1834	* Routine: lck_rw_try_lock
1835	*/
1836	boolean_t
1837	lck_rw_try_lock(
1838	lck_rw_t *lck,
1839	lck_rw_type_t lck_rw_type)
1840	{
1841	if (lck_rw_type == LCK_RW_TYPE_SHARED)
1842	return(lck_rw_try_lock_shared(lck));
1843	else if (lck_rw_type == LCK_RW_TYPE_EXCLUSIVE)
1844	return(lck_rw_try_lock_exclusive(lck));
1845	else
1846	panic("lck_rw_try_lock(): Invalid rw lock type: %x\n", lck_rw_type);
1847	return(FALSE);
1848	}
1849
1850	/*
1851	* Routine: lck_rw_try_lock_shared
1852	*/
1853
1854	boolean_t lck_rw_try_lock_shared(lck_rw_t *lock)
1855	{
1856	uint32_t data, prev;
1857
1858	for ( ; ; ) {
1859	data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1860	if (data & LCK_RW_INTERLOCK) {
1861	atomic_exchange_abort();
1862	lck_rw_interlock_spin(lock);
1863	continue;
1864	}
1865	if (data & (LCK_RW_WANT_EXCL \| LCK_RW_WANT_UPGRADE)) {
1866	atomic_exchange_abort();
1867	return FALSE; / lock is busy /
1868	}
1869	data += LCK_RW_SHARED_READER; / Increment reader refcount /
1870	if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp))
1871	break;
1872	cpu_pause();
1873	}
1874	current_thread()->rwlock_count++;
1875	/ There is a 3 instr window where preemption may not notice rwlock_count after cmpxchg /
1876	#if CONFIG_DTRACE
1877	LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_SHARED_ACQUIRE, lock, DTRACE_RW_SHARED);
1878	#endif /* CONFIG_DTRACE */
1879	return TRUE;
1880	}
1881
1882
1883	/*
1884	* Routine: lck_rw_try_lock_exclusive
1885	*/
1886
1887	boolean_t lck_rw_try_lock_exclusive(lck_rw_t *lock)
1888	{
1889	uint32_t data, prev;
1890
1891	for ( ; ; ) {
1892	data = atomic_exchange_begin32(&lock->data, &prev, memory_order_acquire_smp);
1893	if (data & LCK_RW_INTERLOCK) {
1894	atomic_exchange_abort();
1895	lck_rw_interlock_spin(lock);
1896	continue;
1897	}
1898	if (data & (LCK_RW_SHARED_MASK \| LCK_RW_WANT_EXCL \| LCK_RW_WANT_UPGRADE)) {
1899	atomic_exchange_abort();
1900	return FALSE; / can't get it /
1901	}
1902	data \|= LCK_RW_WANT_EXCL;
1903	if (atomic_exchange_complete32(&lock->data, prev, data, memory_order_acquire_smp))
1904	break;
1905	cpu_pause();
1906	}
1907
1908	current_thread()->rwlock_count++;
1909	#if CONFIG_DTRACE
1910	LOCKSTAT_RECORD(LS_LCK_RW_TRY_LOCK_EXCL_ACQUIRE, lock, DTRACE_RW_EXCL);
1911	#endif /* CONFIG_DTRACE */
1912	return TRUE;
1913	}
1914
1915
1916	void
1917	lck_rw_assert(
1918	lck_rw_t *lck,
1919	unsigned int type)
1920	{
1921	switch (type) {
1922	case LCK_RW_ASSERT_SHARED:
1923	if (lck->lck_rw_shared_count != `0`) {
1924	return;
1925	}
1926	break;
1927	case LCK_RW_ASSERT_EXCLUSIVE:
1928	if ((lck->lck_rw_want_write \|\|
1929	lck->lck_rw_want_upgrade) &&
1930	lck->lck_rw_shared_count == `0`) {
1931	return;
1932	}
1933	break;
1934	case LCK_RW_ASSERT_HELD:
1935	if (lck->lck_rw_want_write \|\|
1936	lck->lck_rw_want_upgrade \|\|
1937	lck->lck_rw_shared_count != `0`) {
1938	return;
1939	}
1940	break;
1941	case LCK_RW_ASSERT_NOTHELD:
1942	if (!(lck->lck_rw_want_write \|\|
1943	lck->lck_rw_want_upgrade \|\|
1944	lck->lck_rw_shared_count != `0`)) {
1945	return;
1946	}
1947	break;
1948	default:
1949	break;
1950	}
1951
1952	panic("rw lock (%p)%s held (mode=%u), first word %08x\n", lck, (type == LCK_RW_ASSERT_NOTHELD ? "" : " not"), type, (uint32_t )lck);
1953	}
1954
1955	/ On return to userspace, this routine is called if the rwlock_count is somehow imbalanced /
1956	void
1957	lck_rw_clear_promotions_x86(thread_t thread)
1958	{
1959	#if MACH_LDEBUG
1960	/ It's fatal to leave a RW lock locked and return to userspace /
1961	panic("%u rw lock(s) held on return to userspace for thread %p", thread->rwlock_count, thread);
1962	#else
1963	/ Paper over the issue /
1964	thread->rwlock_count = `0`;
1965	lck_rw_clear_promotion(thread, `0`);
1966	#endif
1967	}
1968
1969	boolean_t
1970	lck_rw_lock_yield_shared(lck_rw_t *lck, boolean_t force_yield)
1971	{
1972	lck_rw_assert(lck, LCK_RW_ASSERT_SHARED);
1973
1974	if (lck->lck_rw_want_write \|\| lck->lck_rw_want_upgrade \|\| force_yield) {
1975	lck_rw_unlock_shared(lck);
1976	mutex_pause(`2`);
1977	lck_rw_lock_shared(lck);
1978	return TRUE;
1979	}
1980
1981	return FALSE;
1982	}
1983
1984	/*
1985	* Routine: kdp_lck_rw_lock_is_acquired_exclusive
1986	* NOT SAFE: To be used only by kernel debugger to avoid deadlock.
1987	*/
1988	boolean_t
1989	kdp_lck_rw_lock_is_acquired_exclusive(lck_rw_t *lck) {
1990	if (not_in_kdp) {
1991	panic("panic: rw lock exclusive check done outside of kernel debugger");
1992	}
1993	return ((lck->lck_rw_want_upgrade \|\| lck->lck_rw_want_write) && (lck->lck_rw_shared_count == `0`)) ? TRUE : FALSE;
1994	}
1995
/*
 * Slow path routines for lck_mtx locking and unlocking functions.
 *
 * These functions were previously implemented in x86 assembly,
 * and some optimizations are in place in this C code to obtain compiled code
 * as performant and compact as the assembly version.
 *
 * To avoid inlining these functions on the fast path, all functions directly called by
 * the fast paths have __attribute__((noinline)) specified. They are also all implemented
 * in such a way that the fast path can tail call into them. In this way the return address
 * does not need to be pushed on the caller stack and stack optimization can happen on the caller.
 *
 * Slow path code is structured in such a way that there are no calls to functions that will return
 * in the context of the caller function, i.e. all functions called are either tail call functions
 * or inline functions. The number of arguments of the tail call functions is less than six,
 * so that they can be passed in registers and do not need to be pushed on the stack.
 * This allows the compiler to not create a stack frame for the functions.
 *
 * __improbable and __probable are used to compile the slow path code in such a way that
 * the fast path case will be on a sequence of instructions with as few jumps as possible,
 * to make this case the most optimized even if falling through the slow path.
 */
2018
2019	/*
2020	* Intel lock invariants:
2021	*
2022	* lck_mtx_waiters: contains the count of threads currently in the mutex waitqueue
2023	* lck_mtx_pri: contains the max priority of all waiters during a contention period
2024	* not cleared on last unlock, but stomped over on next first contention
2025	* lck_mtx_promoted: set when the current lock owner has been promoted
2026	* cleared when lock owner unlocks, set on acquire or wait.
2027	*
2028	* The lock owner is promoted to the max priority of all its waiters only if it
2029	* was a lower priority when it acquired or was an owner when a waiter waited.
2030	* Max priority is capped at MAXPRI_PROMOTE.
2031	*
2032	* The last waiter will not be promoted as it is woken up, but the last
2033	* lock owner may not have been the last thread to have been woken up depending on the
2034	* luck of the draw. Therefore a last-owner may still have the promoted-on-wakeup
2035	* flag set.
2036	*
2037	* TODO: Figure out an algorithm for stopping a lock holder which is already at the right
2038	* priority from dropping priority in the future without having to take thread lock
2039	* on acquire.
2040	*/
2041
2042	#ifdef MUTEX_ZONE
2043	extern zone_t lck_mtx_zone;
2044	#endif
2045
2046	/*
2047	* N.B.: On x86, statistics are currently recorded for all indirect mutexes.
2048	* Also, only the acquire attempt count (GRP_MTX_STAT_UTIL) is maintained
2049	* as a 64-bit quantity (the new x86 specific statistics are also maintained
2050	* as 32-bit quantities).
2051	*
2052	*
2053	* Enable this preprocessor define to record the first miss alone
2054	* By default, we count every miss, hence multiple misses may be
2055	* recorded for a single lock acquire attempt via lck_mtx_lock
2056	*/
2057	#undef LOG_FIRST_MISS_ALONE
2058
2059	/*
2060	* This preprocessor define controls whether the R-M-W update of the
2061	* per-group statistics elements are atomic (LOCK-prefixed)
2062	* Enabled by default.
2063	*/
2064	#define ATOMIC_STAT_UPDATES 1
2065
2066
2067	/*
2068	* Routine: lck_mtx_alloc_init
2069	*/
2070	lck_mtx_t *
2071	lck_mtx_alloc_init(
2072	lck_grp_t *grp,
2073	lck_attr_t *attr)
2074	{
2075	lck_mtx_t *lck;
2076	#ifdef MUTEX_ZONE
2077	if ((lck = (lck_mtx_t *)zalloc(lck_mtx_zone)) != `0`)
2078	lck_mtx_init(lck, grp, attr);
2079	#else
2080	if ((lck = (lck_mtx_t )kalloc(sizeof*(lck_mtx_t))) != `0`)
2081	lck_mtx_init(lck, grp, attr);
2082	#endif
2083	return(lck);
2084	}
2085
2086	/*
2087	* Routine: lck_mtx_free
2088	*/
2089	void
2090	lck_mtx_free(
2091	lck_mtx_t *lck,
2092	lck_grp_t *grp)
2093	{
2094	lck_mtx_destroy(lck, grp);
2095	#ifdef MUTEX_ZONE
2096	zfree(lck_mtx_zone, lck);
2097	#else
2098	kfree(lck, sizeof(lck_mtx_t));
2099	#endif
2100	}
2101
2102	/*
2103	* Routine: lck_mtx_ext_init
2104	*/
2105	static void
2106	lck_mtx_ext_init(
2107	lck_mtx_ext_t *lck,
2108	lck_grp_t *grp,
2109	lck_attr_t *attr)
2110	{
2111	bzero((void )lck, sizeof*(lck_mtx_ext_t));
2112
2113	if ((attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2114	lck->lck_mtx_deb.type = MUTEX_TAG;
2115	lck->lck_mtx_attr \|= LCK_MTX_ATTR_DEBUG;
2116	}
2117
2118	lck->lck_mtx_grp = grp;
2119
2120	if (grp->lck_grp_attr & LCK_GRP_ATTR_STAT)
2121	lck->lck_mtx_attr \|= LCK_MTX_ATTR_STAT;
2122
2123	lck->lck_mtx.lck_mtx_is_ext = `1`;
2124	lck->lck_mtx.lck_mtx_pad32 = `0xFFFFFFFF`;
2125	}
2126
2127	/*
2128	* Routine: lck_mtx_init
2129	*/
2130	void
2131	lck_mtx_init(
2132	lck_mtx_t *lck,
2133	lck_grp_t *grp,
2134	lck_attr_t *attr)
2135	{
2136	lck_mtx_ext_t *lck_ext;
2137	lck_attr_t *lck_attr;
2138
2139	if (attr != LCK_ATTR_NULL)
2140	lck_attr = attr;
2141	else
2142	lck_attr = &LockDefaultLckAttr;
2143
2144	if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2145	if ((lck_ext = (lck_mtx_ext_t )kalloc(sizeof*(lck_mtx_ext_t))) != `0`) {
2146	lck_mtx_ext_init(lck_ext, grp, lck_attr);
2147	lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
2148	lck->lck_mtx_ptr = lck_ext;
2149	}
2150	} else {
2151	lck->lck_mtx_owner = `0`;
2152	lck->lck_mtx_state = `0`;
2153	}
2154	lck->lck_mtx_pad32 = `0xFFFFFFFF`;
2155	lck_grp_reference(grp);
2156	lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
2157	}
2158
2159	/*
2160	* Routine: lck_mtx_init_ext
2161	*/
2162	void
2163	lck_mtx_init_ext(
2164	lck_mtx_t *lck,
2165	lck_mtx_ext_t *lck_ext,
2166	lck_grp_t *grp,
2167	lck_attr_t *attr)
2168	{
2169	lck_attr_t *lck_attr;
2170
2171	if (attr != LCK_ATTR_NULL)
2172	lck_attr = attr;
2173	else
2174	lck_attr = &LockDefaultLckAttr;
2175
2176	if ((lck_attr->lck_attr_val) & LCK_ATTR_DEBUG) {
2177	lck_mtx_ext_init(lck_ext, grp, lck_attr);
2178	lck->lck_mtx_tag = LCK_MTX_TAG_INDIRECT;
2179	lck->lck_mtx_ptr = lck_ext;
2180	} else {
2181	lck->lck_mtx_owner = `0`;
2182	lck->lck_mtx_state = `0`;
2183	}
2184	lck->lck_mtx_pad32 = `0xFFFFFFFF`;
2185
2186	lck_grp_reference(grp);
2187	lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
2188	}
2189
2190	static void
2191	lck_mtx_lock_mark_destroyed(
2192	lck_mtx_t *mutex,
2193	boolean_t indirect)
2194	{
2195	uint32_t state;
2196
2197	if (indirect) {
2198	/ convert to destroyed state /
2199	ordered_store_mtx_state_release(mutex, LCK_MTX_TAG_DESTROYED);
2200	return;
2201	}
2202
2203	state = ordered_load_mtx_state(mutex);
2204	lck_mtx_interlock_lock(mutex, &state);
2205
2206	ordered_store_mtx_state_release(mutex, LCK_MTX_TAG_DESTROYED);
2207
2208	enable_preemption();
2209	}
2210
2211	/*
2212	* Routine: lck_mtx_destroy
2213	*/
2214	void
2215	lck_mtx_destroy(
2216	lck_mtx_t *lck,
2217	lck_grp_t *grp)
2218	{
2219	boolean_t indirect;
2220
2221	if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED)
2222	return;
2223	#if MACH_LDEBUG
2224	lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED);
2225	#endif
2226	indirect = (lck->lck_mtx_tag == LCK_MTX_TAG_INDIRECT);
2227
2228	lck_mtx_lock_mark_destroyed(lck, indirect);
2229
2230	if (indirect)
2231	kfree(lck->lck_mtx_ptr, sizeof(lck_mtx_ext_t));
2232	lck_grp_lckcnt_decr(grp, LCK_TYPE_MTX);
2233	lck_grp_deallocate(grp);
2234	return;
2235	}
2236
2237
#if DEVELOPMENT | DEBUG
/*
 * Panics reporting that an unlock was attempted by a thread that does not
 * own 'lock'.  Kept out of line so the check in the caller stays small.
 */
__attribute__((noinline))
void
lck_mtx_owner_check_panic(
	lck_mtx_t	*lock)
{
	thread_t owner = (thread_t)lock->lck_mtx_owner;
	panic("Mutex unlock attempted from non-owner thread. Owner=%p lock=%p", owner, lock);
}
#endif
2248
2249	__attribute__((always_inline))
2250	static boolean_t
2251	get_indirect_mutex(
2252	lck_mtx_t **lock,
2253	uint32_t *state)
2254	{
2255	lock = &((lock)->lck_mtx_ptr->lck_mtx);
2256	state = ordered_load_mtx_state(lock);
2257	return TRUE;
2258	}
2259
2260	/*
2261	* Routine: lck_mtx_unlock_slow
2262	*
2263	* Unlocks a mutex held by current thread.
2264	*
2265	* It will wake up waiters if necessary and
2266	* drop promotions.
2267	*
2268	* Interlock can be held.
2269	*/
2270	__attribute__((noinline))
2271	void
2272	lck_mtx_unlock_slow(
2273	lck_mtx_t *lock)
2274	{
2275	thread_t thread;
2276	uint32_t state, prev;
2277	boolean_t indirect = FALSE;
2278
2279	state = ordered_load_mtx_state(lock);
2280
2281	/ Is this an indirect mutex? /
2282	if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
2283	indirect = get_indirect_mutex(&lock, &state);
2284	}
2285
2286	thread = current_thread();
2287
2288	#if DEVELOPMENT \| DEBUG
2289	thread_t owner = (thread_t)lock->lck_mtx_owner;
2290	if(__improbable(owner != thread))
2291	return lck_mtx_owner_check_panic(lock);
2292	#endif
2293
2294	/ check if it is held as a spinlock /
2295	if (__improbable((state & LCK_MTX_MLOCKED_MSK) == `0`))
2296	goto unlock;
2297
2298	lck_mtx_interlock_lock_clear_flags(lock, LCK_MTX_MLOCKED_MSK, &state);
2299
2300	unlock:
2301	/ preemption disabled, interlock held and mutex not held /
2302
2303	/ clear owner /
2304	ordered_store_mtx_owner(lock, `0`);
2305	/ keep original state in prev for later evaluation /
2306	prev = state;
2307	/ release interlock, promotion and clear spin flag /
2308	state &= (~(LCK_MTX_ILOCKED_MSK \| LCK_MTX_SPIN_MSK \| LCK_MTX_PROMOTED_MSK));
2309	if ((state & LCK_MTX_WAITERS_MSK))
2310	state -= LCK_MTX_WAITER; / decrement waiter count /
2311	ordered_store_mtx_state_release(lock, state); / since I own the interlock, I don't need an atomic update /
2312
2313	#if MACH_LDEBUG
2314	/ perform lock statistics after drop to prevent delay /
2315	if (thread)
2316	thread->mutex_count--; / lock statistic /
2317	#endif /* MACH_LDEBUG */
2318
2319	/ check if there are waiters to wake up or priority to drop /
2320	if ((prev & (LCK_MTX_PROMOTED_MSK \| LCK_MTX_WAITERS_MSK)))
2321	return lck_mtx_unlock_wakeup_tail(lock, prev, indirect);
2322
2323	/ re-enable preemption /
2324	lck_mtx_unlock_finish_inline(lock, FALSE);
2325
2326	return;
2327	}
2328
2329	#define LCK_MTX_LCK_WAIT_CODE 0x20
2330	#define LCK_MTX_LCK_WAKEUP_CODE 0x21
2331	#define LCK_MTX_LCK_SPIN_CODE 0x22
2332	#define LCK_MTX_LCK_ACQUIRE_CODE 0x23
2333	#define LCK_MTX_LCK_DEMOTE_CODE 0x24
2334
2335	/*
2336	* Routine: lck_mtx_unlock_wakeup_tail
2337	*
2338	* Invoked on unlock when there is
2339	* contention, i.e. the assembly routine sees
2340	* that mutex->lck_mtx_waiters != 0 or
2341	* that mutex->lck_mtx_promoted != 0
2342	*
2343	* neither the mutex or interlock is held
2344	*
2345	* Note that this routine might not be called if there are pending
2346	* waiters which have previously been woken up, and they didn't
2347	* end up boosting the old owner.
2348	*
2349	* assembly routine previously did the following to mutex:
2350	* (after saving the state in prior_lock_state)
2351	* cleared lck_mtx_promoted
2352	* decremented lck_mtx_waiters if nonzero
2353	*
2354	* This function needs to be called as a tail call
2355	* to optimize the compiled code.
2356	*/
2357	__attribute__((noinline))
2358	static void
2359	lck_mtx_unlock_wakeup_tail (
2360	lck_mtx_t *mutex,
2361	int prior_lock_state,
2362	boolean_t indirect)
2363	{
2364	__kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
2365	lck_mtx_t fake_lck;
2366
2367	/*
2368	* prior_lock state is a snapshot of the 2nd word of the
2369	* lock in question... we'll fake up a lock with the bits
2370	* copied into place and carefully not access anything
2371	* beyond whats defined in the second word of a lck_mtx_t
2372	*/
2373	fake_lck.lck_mtx_state = prior_lock_state;
2374
2375	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) \| DBG_FUNC_START,
2376	trace_lck, fake_lck.lck_mtx_promoted, fake_lck.lck_mtx_waiters, fake_lck.lck_mtx_pri, `0`);
2377
2378	if (__probable(fake_lck.lck_mtx_waiters)) {
2379	kern_return_t did_wake;
2380
2381	if (fake_lck.lck_mtx_waiters > `1`)
2382	did_wake = thread_wakeup_one_with_pri(LCK_MTX_EVENT(mutex), fake_lck.lck_mtx_pri);
2383	else
2384	did_wake = thread_wakeup_one(LCK_MTX_EVENT(mutex));
2385	/*
2386	* The waiters count always precisely matches the number of threads on the waitqueue.
2387	* i.e. we should never see ret == KERN_NOT_WAITING.
2388	*/
2389	assert(did_wake == KERN_SUCCESS);
2390	}
2391
2392	/ When lck_mtx_promoted was set, then I as the owner definitely have a promotion /
2393	if (__improbable(fake_lck.lck_mtx_promoted)) {
2394	thread_t thread = current_thread();
2395
2396	spl_t s = splsched();
2397	thread_lock(thread);
2398
2399	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_DEMOTE_CODE) \| DBG_FUNC_NONE,
2400	thread_tid(thread), thread->promotions, thread->sched_flags & TH_SFLAG_PROMOTED, `0`, `0`);
2401	assert(thread->was_promoted_on_wakeup == `0`);
2402	assert(thread->promotions > `0`);
2403
2404	assert_promotions_invariant(thread);
2405
2406	if (--thread->promotions == `0`)
2407	sched_thread_unpromote(thread, trace_lck);
2408
2409	assert_promotions_invariant(thread);
2410
2411	thread_unlock(thread);
2412	splx(s);
2413	}
2414
2415	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) \| DBG_FUNC_END,
2416	trace_lck, `0`, mutex->lck_mtx_waiters, `0`, `0`);
2417
2418	lck_mtx_unlock_finish_inline(mutex, indirect);
2419	}
2420
2421	/*
2422	* Routine: lck_mtx_lock_acquire_x86
2423	*
2424	* Invoked on acquiring the mutex when there is
2425	* contention (i.e. the assembly routine sees that
2426	* that mutex->lck_mtx_waiters != 0 or
2427	* thread->was_promoted_on_wakeup != 0)...
2428	*
2429	* mutex is owned... interlock is held... preemption is disabled
2430	*/
2431	__attribute__((always_inline))
2432	static void
2433	lck_mtx_lock_acquire_inline(
2434	lck_mtx_t *mutex)
2435	{
2436	__kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
2437	integer_t priority;
2438
2439	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) \| DBG_FUNC_START,
2440	trace_lck, thread->was_promoted_on_wakeup, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, `0`);
2441
2442	if (mutex->lck_mtx_waiters)
2443	priority = mutex->lck_mtx_pri;
2444	else
2445	priority = `0`; / not worth resetting lck_mtx_pri here, it will be reset by next waiter /
2446
2447	/ the priority must have been set correctly by wait /
2448	assert(priority <= MAXPRI_PROMOTE);
2449	assert(priority == `0` \|\| priority >= BASEPRI_DEFAULT);
2450
2451	/ if the mutex wasn't owned, then the owner wasn't promoted /
2452	assert(mutex->lck_mtx_promoted == `0`);
2453
2454	thread_t thread = (thread_t)mutex->lck_mtx_owner; / faster than current_thread() /
2455
2456	if (thread->sched_pri < priority \|\| thread->was_promoted_on_wakeup) {
2457	spl_t s = splsched();
2458	thread_lock(thread);
2459
2460	if (thread->was_promoted_on_wakeup)
2461	assert(thread->promotions > `0`);
2462
2463	/ Intel only promotes if priority goes up /
2464	if (thread->sched_pri < priority && thread->promotion_priority < priority) {
2465	/ Remember that I need to drop this promotion on unlock /
2466	mutex->lck_mtx_promoted = `1`;
2467
2468	if (thread->promotions++ == `0`) {
2469	/ This is the first promotion for the owner /
2470	sched_thread_promote_to_pri(thread, priority, trace_lck);
2471	} else {
2472	/*
2473	* Holder was previously promoted due to a different mutex,
2474	* raise to match this one.
2475	* Or, this thread was promoted on wakeup but someone else
2476	* later contended on mutex at higher priority before we got here
2477	*/
2478	sched_thread_update_promotion_to_pri(thread, priority, trace_lck);
2479	}
2480	}
2481
2482	if (thread->was_promoted_on_wakeup) {
2483	thread->was_promoted_on_wakeup = `0`;
2484	if (--thread->promotions == `0`)
2485	sched_thread_unpromote(thread, trace_lck);
2486	}
2487
2488	thread_unlock(thread);
2489	splx(s);
2490	}
2491	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) \| DBG_FUNC_END,
2492	trace_lck, `0`, mutex->lck_mtx_waiters, `0`, `0`);
2493	}
2494
2495	void
2496	lck_mtx_lock_acquire_x86(
2497	lck_mtx_t *mutex)
2498	{
2499	return lck_mtx_lock_acquire_inline(mutex);
2500	}
2501
2502	/*
2503	* Tail call helpers for lock functions that perform
2504	* lck_mtx_lock_acquire followed by the caller's finish routine, to optimize
2505	* the caller's compiled code.
2506	*/
2507
2508	__attribute__((noinline))
2509	static void
2510	lck_mtx_lock_acquire_tail(
2511	lck_mtx_t *mutex,
2512	boolean_t indirect)
2513	{
2514	lck_mtx_lock_acquire_inline(mutex);
2515	lck_mtx_lock_finish_inline(mutex, ordered_load_mtx_state(mutex), indirect);
2516	}
2517
2518	__attribute__((noinline))
2519	static boolean_t
2520	lck_mtx_try_lock_acquire_tail(
2521	lck_mtx_t *mutex)
2522	{
2523	lck_mtx_lock_acquire_inline(mutex);
2524	lck_mtx_try_lock_finish_inline(mutex, ordered_load_mtx_state(mutex));
2525
2526	return TRUE;
2527	}
2528
2529	__attribute__((noinline))
2530	static void
2531	lck_mtx_convert_spin_acquire_tail(
2532	lck_mtx_t *mutex)
2533	{
2534	lck_mtx_lock_acquire_inline(mutex);
2535	lck_mtx_convert_spin_finish_inline(mutex, ordered_load_mtx_state(mutex));
2536	}
2537
2538	boolean_t
2539	lck_mtx_ilk_unlock(
2540	lck_mtx_t *mutex)
2541	{
2542	lck_mtx_ilk_unlock_inline(mutex, ordered_load_mtx_state(mutex));
2543	return TRUE;
2544	}
2545
2546	static inline void
2547	lck_mtx_interlock_lock_set_and_clear_flags(
2548	lck_mtx_t *mutex,
2549	uint32_t xor_flags,
2550	uint32_t and_flags,
2551	uint32_t *new_state)
2552	{
2553	uint32_t state, prev;
2554	state = *new_state;
2555
2556	for ( ; ; ) {
2557	/ have to wait for interlock to clear /
2558	while (__improbable(state & (LCK_MTX_ILOCKED_MSK \| xor_flags))) {
2559	cpu_pause();
2560	state = ordered_load_mtx_state(mutex);
2561	}
2562	prev = state; / prev contains snapshot for exchange /
2563	state \|= LCK_MTX_ILOCKED_MSK \| xor_flags; / pick up interlock /
2564	state &= ~and_flags; / clear flags /
2565
2566	disable_preemption();
2567	if (atomic_compare_exchange32(&mutex->lck_mtx_state, prev, state, memory_order_acquire_smp, FALSE))
2568	break;
2569	enable_preemption();
2570	cpu_pause();
2571	state = ordered_load_mtx_state(mutex);
2572	}
2573	*new_state = state;
2574	return;
2575	}
2576
2577	static inline void
2578	lck_mtx_interlock_lock_clear_flags(
2579	lck_mtx_t *mutex,
2580	uint32_t and_flags,
2581	uint32_t *new_state)
2582	{
2583	return lck_mtx_interlock_lock_set_and_clear_flags(mutex, `0`, and_flags, new_state);
2584	}
2585
2586	static inline void
2587	lck_mtx_interlock_lock(
2588	lck_mtx_t *mutex,
2589	uint32_t *new_state)
2590	{
2591	return lck_mtx_interlock_lock_set_and_clear_flags(mutex, `0`, `0`, new_state);
2592	}
2593
2594	static inline int
2595	lck_mtx_interlock_try_lock_set_flags(
2596	lck_mtx_t *mutex,
2597	uint32_t or_flags,
2598	uint32_t *new_state)
2599	{
2600	uint32_t state, prev;
2601	state = *new_state;
2602
2603	/ have to wait for interlock to clear /
2604	if (state & (LCK_MTX_ILOCKED_MSK \| or_flags)) {
2605	return `0`;
2606	}
2607	prev = state; / prev contains snapshot for exchange /
2608	state \|= LCK_MTX_ILOCKED_MSK \| or_flags; / pick up interlock /
2609	disable_preemption();
2610	if (atomic_compare_exchange32(&mutex->lck_mtx_state, prev, state, memory_order_acquire_smp, FALSE)) {
2611	*new_state = state;
2612	return `1`;
2613	}
2614
2615	enable_preemption();
2616	return `0`;
2617	}
2618
2619	static inline int
2620	lck_mtx_interlock_try_lock(
2621	lck_mtx_t *mutex,
2622	uint32_t *new_state)
2623	{
2624	return lck_mtx_interlock_try_lock_set_flags(mutex, `0`, new_state);
2625	}
2626
2627	static inline int
2628	lck_mtx_interlock_try_lock_disable_interrupts(
2629	lck_mtx_t *mutex,
2630	boolean_t *istate)
2631	{
2632	uint32_t state;
2633
2634	*istate = ml_set_interrupts_enabled(FALSE);
2635	state = ordered_load_mtx_state(mutex);
2636
2637	if (lck_mtx_interlock_try_lock(mutex, &state)) {
2638	return `1`;
2639	} else {
2640	ml_set_interrupts_enabled(*istate);
2641	return `0`;
2642	}
2643	}
2644
2645	static inline void
2646	lck_mtx_interlock_unlock_enable_interrupts(
2647	lck_mtx_t *mutex,
2648	boolean_t istate)
2649	{
2650	lck_mtx_ilk_unlock(mutex);
2651	ml_set_interrupts_enabled(istate);
2652	}
2653
/*
 * Increments the statistics counter pointed to by 'stat'.
 *
 * With ATOMIC_STAT_UPDATES enabled the increment is a relaxed atomic
 * operation; otherwise it is a plain read-modify-write.
 */
static void __inline__
lck_mtx_inc_stats(
	uint64_t* stat)
{
#if ATOMIC_STAT_UPDATES
	os_atomic_inc(stat, relaxed);
#else
	/*
	 * Plain increment.  Assigning the result of a post-increment back to
	 * the same object would modify it twice without sequencing.
	 */
	(*stat)++;
#endif
}
2664
2665	static void __inline__
2666	lck_mtx_update_miss(
2667	struct _lck_mtx_ext_ *lock,
2668	int *first_miss)
2669	{
2670	#if LOG_FIRST_MISS_ALONE
2671	if ((*first_miss & `1`) == `0`) {
2672	#else
2673	#pragma unused(first_miss)
2674	#endif
2675	uint64_t* stat = &lock->lck_mtx_grp->lck_grp_miss;
2676	lck_mtx_inc_stats(stat);
2677
2678	#if LOG_FIRST_MISS_ALONE
2679	*first_miss \|= `1`;
2680	}
2681	#endif
2682	}
2683
2684	static void __inline__
2685	lck_mtx_update_direct_wait(
2686	struct _lck_mtx_ext_ *lock)
2687	{
2688	uint64_t* stat = &lock->lck_mtx_grp->lck_grp_direct_wait;
2689	lck_mtx_inc_stats(stat);
2690	}
2691
2692	static void __inline__
2693	lck_mtx_update_wait(
2694	struct _lck_mtx_ext_ *lock,
2695	int *first_miss)
2696	{
2697	#if LOG_FIRST_MISS_ALONE
2698	if ((*first_miss & `2`) == `0`) {
2699	#else
2700	#pragma unused(first_miss)
2701	#endif
2702	uint64_t* stat = &lock->lck_mtx_grp->lck_grp_wait;
2703	lck_mtx_inc_stats(stat);
2704
2705	#if LOG_FIRST_MISS_ALONE
2706	*first_miss \|= `2`;
2707	}
2708	#endif
2709	}
2710
2711	static void __inline__
2712	lck_mtx_update_util(
2713	struct _lck_mtx_ext_ *lock)
2714	{
2715	uint64_t* stat = &lock->lck_mtx_grp->lck_grp_util;
2716	lck_mtx_inc_stats(stat);
2717	}
2718
2719	__attribute__((noinline))
2720	static void
2721	lck_mtx_lock_contended(
2722	lck_mtx_t *lock,
2723	boolean_t indirect,
2724	boolean_t *first_miss)
2725	{
2726	lck_mtx_spinwait_ret_type_t ret;
2727	uint32_t state;
2728	thread_t thread;
2729
2730	try_again:
2731
2732	if (indirect) {
2733	lck_mtx_update_miss((struct _lck_mtx_ext_*)lock, first_miss);
2734	}
2735
2736	ret = lck_mtx_lock_spinwait_x86(lock);
2737	state = ordered_load_mtx_state(lock);
2738	switch (ret) {
2739	case LCK_MTX_SPINWAIT_NO_SPIN:
2740	/*
2741	* owner not on core, lck_mtx_lock_spinwait_x86 didn't even
2742	* try to spin.
2743	*/
2744	if (indirect) {
2745	lck_mtx_update_direct_wait((struct _lck_mtx_ext_*)lock);
2746	}
2747
2748	/ just fall through case LCK_MTX_SPINWAIT_SPUN /
2749	case LCK_MTX_SPINWAIT_SPUN:
2750	/*
2751	* mutex not acquired but lck_mtx_lock_spinwait_x86 tried to spin
2752	* interlock not held
2753	*/
2754	lck_mtx_interlock_lock(lock, &state);
2755	assert(state & LCK_MTX_ILOCKED_MSK);
2756
2757	if (state & LCK_MTX_MLOCKED_MSK) {
2758	if (indirect) {
2759	lck_mtx_update_wait((struct _lck_mtx_ext_*)lock, first_miss);
2760	}
2761	lck_mtx_lock_wait_x86(lock);
2762	/*
2763	* interlock is not held here.
2764	*/
2765	goto try_again;
2766	} else {
2767
2768	/ grab the mutex /
2769	state \|= LCK_MTX_MLOCKED_MSK;
2770	ordered_store_mtx_state_release(lock, state);
2771	thread = current_thread();
2772	ordered_store_mtx_owner(lock, (uintptr_t)thread);
2773	#if MACH_LDEBUG
2774	if (thread) {
2775	thread->mutex_count++;
2776	}
2777	#endif /* MACH_LDEBUG */
2778	}
2779
2780	break;
2781	case LCK_MTX_SPINWAIT_ACQUIRED:
2782	/*
2783	* mutex has been acquired by lck_mtx_lock_spinwait_x86
2784	* interlock is held and preemption disabled
2785	* owner is set and mutex marked as locked
2786	* statistics updated too
2787	*/
2788	break;
2789	default:
2790	panic("lck_mtx_lock_spinwait_x86 returned %d for mutex %p\n", ret, lock);
2791	}
2792
2793	/*
2794	* interlock is already acquired here
2795	*/
2796
2797	/ mutex has been acquired /
2798	thread = (thread_t)lock->lck_mtx_owner;
2799	if (state & LCK_MTX_WAITERS_MSK \|\| thread->was_promoted_on_wakeup) {
2800	return lck_mtx_lock_acquire_tail(lock, indirect);
2801	}
2802
2803	/ release the interlock /
2804	lck_mtx_lock_finish_inline(lock, ordered_load_mtx_state(lock), indirect);
2805	}
2806
2807	/*
2808	* Helper noinline functions for calling
2809	* panic to optimize compiled code.
2810	*/
2811
2812	__attribute__((noinline))
2813	static void
2814	lck_mtx_destroyed(
2815	lck_mtx_t *lock)
2816	{
2817	panic("trying to interlock destroyed mutex (%p)", lock);
2818	}
2819
2820	__attribute__((noinline))
2821	static boolean_t
2822	lck_mtx_try_destroyed(
2823	lck_mtx_t *lock)
2824	{
2825	panic("trying to interlock destroyed mutex (%p)", lock);
2826	return FALSE;
2827	}
2828
2829	__attribute__((always_inline))
2830	static boolean_t
2831	lck_mtx_lock_wait_interlock_to_clear(
2832	lck_mtx_t *lock,
2833	uint32_t* new_state)
2834	{
2835	uint32_t state;
2836
2837	for ( ; ; ) {
2838	cpu_pause();
2839	state = ordered_load_mtx_state(lock);
2840	if (!(state & (LCK_MTX_ILOCKED_MSK \| LCK_MTX_MLOCKED_MSK))) {
2841	*new_state = state;
2842	return TRUE;
2843	}
2844	if (state & LCK_MTX_MLOCKED_MSK) {
2845	/ if it is held as mutex, just fail /
2846	return FALSE;
2847	}
2848	}
2849	}
2850
2851	__attribute__((always_inline))
2852	static boolean_t
2853	lck_mtx_try_lock_wait_interlock_to_clear(
2854	lck_mtx_t *lock,
2855	uint32_t* new_state)
2856	{
2857	uint32_t state;
2858
2859	for ( ; ; ) {
2860	cpu_pause();
2861	state = ordered_load_mtx_state(lock);
2862	if (state & (LCK_MTX_MLOCKED_MSK \| LCK_MTX_SPIN_MSK)) {
2863	/ if it is held as mutex or spin, just fail /
2864	return FALSE;
2865	}
2866	if (!(state & LCK_MTX_ILOCKED_MSK)) {
2867	*new_state = state;
2868	return TRUE;
2869	}
2870	}
2871	}
2872
2873	/*
2874	* Routine: lck_mtx_lock_slow
2875	*
2876	* Locks a mutex for current thread.
2877	* If the lock is contended this function might
2878	* sleep.
2879	*
2880	* Called with interlock not held.
2881	*/
2882	__attribute__((noinline))
2883	void
2884	lck_mtx_lock_slow(
2885	lck_mtx_t *lock)
2886	{
2887	boolean_t indirect = FALSE;
2888	uint32_t state;
2889	int first_miss = `0`;
2890
2891	state = ordered_load_mtx_state(lock);
2892
2893	/ is the interlock or mutex held /
2894	if (__improbable(state & ((LCK_MTX_ILOCKED_MSK \| LCK_MTX_MLOCKED_MSK)))) {
2895	/*
2896	* Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
2897	* have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
2898	* set in state (state == lck_mtx_tag)
2899	*/
2900
2901
2902	/ is the mutex already held and not indirect /
2903	if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))){
2904	/ no, must have been the mutex /
2905	return lck_mtx_lock_contended(lock, indirect, &first_miss);
2906	}
2907
2908	/ check to see if it is marked destroyed /
2909	if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
2910	return lck_mtx_destroyed(lock);
2911	}
2912
2913	/ Is this an indirect mutex? /
2914	if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
2915	indirect = get_indirect_mutex(&lock, &state);
2916
2917	first_miss = `0`;
2918	lck_mtx_update_util((struct _lck_mtx_ext_*)lock);
2919
2920	if (state & LCK_MTX_SPIN_MSK) {
2921	/ M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present /
2922	assert(state & LCK_MTX_ILOCKED_MSK);
2923	lck_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
2924	}
2925	}
2926
2927	if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
2928	return lck_mtx_lock_contended(lock, indirect, &first_miss);
2929	}
2930	}
2931
2932	/ no - can't be INDIRECT, DESTROYED or locked /
2933	while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state))) {
2934	if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
2935	return lck_mtx_lock_contended(lock, indirect, &first_miss);
2936	}
2937	}
2938
2939	/ lock and interlock acquired /
2940
2941	thread_t thread = current_thread();
2942	/ record owner of mutex /
2943	ordered_store_mtx_owner(lock, (uintptr_t)thread);
2944
2945	#if MACH_LDEBUG
2946	if (thread) {
2947	thread->mutex_count++; / lock statistic /
2948	}
2949	#endif
2950	/*
2951	* Check if there are waiters to
2952	* inherit their priority.
2953	*/
2954	if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
2955	return lck_mtx_lock_acquire_tail(lock, indirect);
2956	}
2957
2958	/ release the interlock /
2959	lck_mtx_lock_finish_inline(lock, ordered_load_mtx_state(lock), indirect);
2960
2961	return;
2962	}
2963
2964	__attribute__((noinline))
2965	boolean_t
2966	lck_mtx_try_lock_slow(
2967	lck_mtx_t *lock)
2968	{
2969	boolean_t indirect = FALSE;
2970	uint32_t state;
2971	int first_miss = `0`;
2972
2973	state = ordered_load_mtx_state(lock);
2974
2975	/ is the interlock or mutex held /
2976	if (__improbable(state & ((LCK_MTX_ILOCKED_MSK \| LCK_MTX_MLOCKED_MSK)))) {
2977	/*
2978	* Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
2979	* have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
2980	* set in state (state == lck_mtx_tag)
2981	*/
2982
2983	/ is the mutex already held and not indirect /
2984	if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))){
2985	return FALSE;
2986	}
2987
2988	/ check to see if it is marked destroyed /
2989	if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
2990	return lck_mtx_try_destroyed(lock);
2991	}
2992
2993	/ Is this an indirect mutex? /
2994	if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
2995	indirect = get_indirect_mutex(&lock, &state);
2996
2997	first_miss = `0`;
2998	lck_mtx_update_util((struct _lck_mtx_ext_*)lock);
2999	}
3000
3001	if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
3002	if (indirect)
3003	lck_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
3004	return FALSE;
3005	}
3006	}
3007
3008	/ no - can't be INDIRECT, DESTROYED or locked /
3009	while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state))) {
3010	if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
3011	if (indirect)
3012	lck_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
3013	return FALSE;
3014	}
3015	}
3016
3017	/ lock and interlock acquired /
3018
3019	thread_t thread = current_thread();
3020	/ record owner of mutex /
3021	ordered_store_mtx_owner(lock, (uintptr_t)thread);
3022
3023	#if MACH_LDEBUG
3024	if (thread) {
3025	thread->mutex_count++; / lock statistic /
3026	}
3027	#endif
3028	/*
3029	* Check if there are waiters to
3030	* inherit their priority.
3031	*/
3032	if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
3033	return lck_mtx_try_lock_acquire_tail(lock);
3034	}
3035
3036	/ release the interlock /
3037	lck_mtx_try_lock_finish_inline(lock, ordered_load_mtx_state(lock));
3038
3039	return TRUE;
3040
3041	}
3042
3043	__attribute__((noinline))
3044	void
3045	lck_mtx_lock_spin_slow(
3046	lck_mtx_t *lock)
3047	{
3048	boolean_t indirect = FALSE;
3049	uint32_t state;
3050	int first_miss = `0`;
3051
3052	state = ordered_load_mtx_state(lock);
3053
3054	/ is the interlock or mutex held /
3055	if (__improbable(state & ((LCK_MTX_ILOCKED_MSK \| LCK_MTX_MLOCKED_MSK)))) {
3056	/*
3057	* Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
3058	* have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
3059	* set in state (state == lck_mtx_tag)
3060	*/
3061
3062
3063	/ is the mutex already held and not indirect /
3064	if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))){
3065	/ no, must have been the mutex /
3066	return lck_mtx_lock_contended(lock, indirect, &first_miss);
3067	}
3068
3069	/ check to see if it is marked destroyed /
3070	if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
3071	return lck_mtx_destroyed(lock);
3072	}
3073
3074	/ Is this an indirect mutex? /
3075	if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
3076	indirect = get_indirect_mutex(&lock, &state);
3077
3078	first_miss = `0`;
3079	lck_mtx_update_util((struct _lck_mtx_ext_*)lock);
3080
3081	if (state & LCK_MTX_SPIN_MSK) {
3082	/ M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present /
3083	assert(state & LCK_MTX_ILOCKED_MSK);
3084	lck_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
3085	}
3086	}
3087
3088	if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
3089	return lck_mtx_lock_contended(lock, indirect, &first_miss);
3090	}
3091	}
3092
3093	/ no - can't be INDIRECT, DESTROYED or locked /
3094	while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state) )) {
3095	if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
3096	return lck_mtx_lock_contended(lock, indirect, &first_miss);
3097	}
3098	}
3099
3100	/ lock as spinlock and interlock acquired /
3101
3102	thread_t thread = current_thread();
3103	/ record owner of mutex /
3104	ordered_store_mtx_owner(lock, (uintptr_t)thread);
3105
3106	#if MACH_LDEBUG
3107	if (thread) {
3108	thread->mutex_count++; / lock statistic /
3109	}
3110	#endif
3111
3112	#if CONFIG_DTRACE
3113	LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN_ACQUIRE, lock, `0`);
3114	#endif
3115	/ return with the interlock held and preemption disabled /
3116	return;
3117	}
3118
3119	__attribute__((noinline))
3120	boolean_t
3121	lck_mtx_try_lock_spin_slow(
3122	lck_mtx_t *lock)
3123	{
3124	boolean_t indirect = FALSE;
3125	uint32_t state;
3126	int first_miss = `0`;
3127
3128	state = ordered_load_mtx_state(lock);
3129
3130	/ is the interlock or mutex held /
3131	if (__improbable(state & ((LCK_MTX_ILOCKED_MSK \| LCK_MTX_MLOCKED_MSK)))) {
3132	/*
3133	* Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
3134	* have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
3135	* set in state (state == lck_mtx_tag)
3136	*/
3137
3138	/ is the mutex already held and not indirect /
3139	if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))){
3140	return FALSE;
3141	}
3142
3143	/ check to see if it is marked destroyed /
3144	if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
3145	return lck_mtx_try_destroyed(lock);
3146	}
3147
3148	/ Is this an indirect mutex? /
3149	if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
3150	indirect = get_indirect_mutex(&lock, &state);
3151
3152	first_miss = `0`;
3153	lck_mtx_update_util((struct _lck_mtx_ext_*)lock);
3154	}
3155
3156	if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
3157	if (indirect)
3158	lck_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
3159	return FALSE;
3160	}
3161	}
3162
3163	/ no - can't be INDIRECT, DESTROYED or locked /
3164	while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state))) {
3165	if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
3166	if (indirect)
3167	lck_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
3168	return FALSE;
3169	}
3170	}
3171
3172	/ lock and interlock acquired /
3173
3174	thread_t thread = current_thread();
3175	/ record owner of mutex /
3176	ordered_store_mtx_owner(lock, (uintptr_t)thread);
3177
3178	#if MACH_LDEBUG
3179	if (thread) {
3180	thread->mutex_count++; / lock statistic /
3181	}
3182	#endif
3183
3184	#if CONFIG_DTRACE
3185	LOCKSTAT_RECORD(LS_LCK_MTX_TRY_SPIN_LOCK_ACQUIRE, lock, `0`);
3186	#endif
3187	return TRUE;
3188
3189	}
3190
3191	__attribute__((noinline))
3192	void
3193	lck_mtx_convert_spin(
3194	lck_mtx_t *lock)
3195	{
3196	uint32_t state;
3197
3198	state = ordered_load_mtx_state(lock);
3199
3200	/ Is this an indirect mutex? /
3201	if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
3202	/ If so, take indirection /
3203	get_indirect_mutex(&lock, &state);
3204	}
3205
3206	assertf((thread_t)lock->lck_mtx_owner == current_thread(), "lock %p not owned by thread %p (current owner %p)", lock, current_thread(), (thread_t)lock->lck_mtx_owner );
3207
3208	if (__improbable(state & LCK_MTX_MLOCKED_MSK)) {
3209	/ already owned as a mutex, just return /
3210	return;
3211	}
3212
3213	assert(get_preemption_level() > `0`);
3214	assert(state & LCK_MTX_ILOCKED_MSK);
3215	assert(state & LCK_MTX_SPIN_MSK);
3216
3217	/*
3218	* Check if there are waiters to
3219	* inherit their priority.
3220	*/
3221	if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
3222	return lck_mtx_convert_spin_acquire_tail(lock);
3223	}
3224
3225	lck_mtx_convert_spin_finish_inline(lock, ordered_load_mtx_state(lock));
3226
3227	return;
3228	}
3229
3230	static inline boolean_t
3231	lck_mtx_lock_grab_mutex(
3232	lck_mtx_t *lock)
3233	{
3234	uint32_t state;
3235
3236	state = ordered_load_mtx_state(lock);
3237
3238	if (!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state)) {
3239	return FALSE;
3240	}
3241
3242	/ lock and interlock acquired /
3243
3244	thread_t thread = current_thread();
3245	/ record owner of mutex /
3246	ordered_store_mtx_owner(lock, (uintptr_t)thread);
3247
3248	#if MACH_LDEBUG
3249	if (thread) {
3250	thread->mutex_count++; / lock statistic /
3251	}
3252	#endif
3253	return TRUE;
3254	}
3255
3256	__attribute__((noinline))
3257	void
3258	lck_mtx_assert(
3259	lck_mtx_t *lock,
3260	unsigned int type)
3261	{
3262	thread_t thread, owner;
3263	uint32_t state;
3264
3265	thread = current_thread();
3266	state = ordered_load_mtx_state(lock);
3267
3268	if (state == LCK_MTX_TAG_INDIRECT) {
3269	get_indirect_mutex(&lock, &state);
3270	}
3271
3272	owner = (thread_t)lock->lck_mtx_owner;
3273
3274	if (type == LCK_MTX_ASSERT_OWNED) {
3275	if (owner != thread \|\| !(state & (LCK_MTX_ILOCKED_MSK \| LCK_MTX_MLOCKED_MSK)))
3276	panic("mutex (%p) not owned\n", lock);
3277	} else {
3278	assert (type == LCK_MTX_ASSERT_NOTOWNED);
3279	if (owner == thread)
3280	panic("mutex (%p) owned\n", lock);
3281	}
3282	}
3283
3284	/*
3285	* Routine: lck_mtx_lock_spinwait_x86
3286	*
3287	* Invoked trying to acquire a mutex when there is contention but
3288	* the holder is running on another processor. We spin for up to a maximum
3289	* time waiting for the lock to be released.
3290	*
3291	* Called with the interlock unlocked.
3292	* returns LCK_MTX_SPINWAIT_ACQUIRED if mutex acquired
3293	* returns LCK_MTX_SPINWAIT_SPUN if we spun
3294	* returns LCK_MTX_SPINWAIT_NO_SPIN if we didn't spin due to the holder not running
3295	*/
3296	__attribute__((noinline))
3297	lck_mtx_spinwait_ret_type_t
3298	lck_mtx_lock_spinwait_x86(
3299	lck_mtx_t *mutex)
3300	{
3301	__kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
3302	thread_t holder;
3303	uint64_t overall_deadline;
3304	uint64_t check_owner_deadline;
3305	uint64_t cur_time;
3306	lck_mtx_spinwait_ret_type_t retval = LCK_MTX_SPINWAIT_SPUN;
3307	int loopcount = `0`;
3308
3309	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) \| DBG_FUNC_START,
3310	trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, `0`, `0`);
3311
3312	cur_time = mach_absolute_time();
3313	overall_deadline = cur_time + MutexSpin;
3314	check_owner_deadline = cur_time;
3315
3316	/*
3317	* Spin while:
3318	* - mutex is locked, and
3319	* - its locked as a spin lock, and
3320	* - owner is running on another processor, and
3321	* - owner (processor) is not idling, and
3322	* - we haven't spun for long enough.
3323	*/
3324	do {
3325	if (__probable(lck_mtx_lock_grab_mutex(mutex))) {
3326	retval = LCK_MTX_SPINWAIT_ACQUIRED;
3327	break;
3328	}
3329	cur_time = mach_absolute_time();
3330
3331	if (cur_time >= overall_deadline)
3332	break;
3333
3334	if (cur_time >= check_owner_deadline && mutex->lck_mtx_owner) {
3335	boolean_t istate;
3336
3337	/*
3338	* We will repeatedly peek at the state of the lock while spinning,
3339	* and we will acquire the interlock to do so.
3340	* The thread that will unlock the mutex will also need to acquire
3341	* the interlock, and we want to avoid to slow it down.
3342	* To avoid to get an interrupt while holding the interlock
3343	* and increase the time we are holding it, we
3344	* will try to acquire the interlock with interrupts disabled.
3345	* This is safe because it is a "try_lock", if we can't acquire
3346	* the interlock we re-enable the interrupts and fail, so it is
3347	* ok to call it even if the interlock was already held.
3348	*/
3349	if (lck_mtx_interlock_try_lock_disable_interrupts(mutex, &istate)) {
3350
3351	if ((holder = (thread_t) mutex->lck_mtx_owner) != NULL) {
3352
3353	if ( !(holder->machine.specFlags & OnProc) \|\|
3354	(holder->state & TH_IDLE)) {
3355
3356	lck_mtx_interlock_unlock_enable_interrupts(mutex, istate);
3357
3358	if (loopcount == `0`)
3359	retval = LCK_MTX_SPINWAIT_NO_SPIN;
3360	break;
3361	}
3362	}
3363	lck_mtx_interlock_unlock_enable_interrupts(mutex, istate);
3364
3365	check_owner_deadline = cur_time + (MutexSpin / `4`);
3366	}
3367	}
3368	cpu_pause();
3369
3370	loopcount++;
3371
3372	} while (TRUE);
3373
3374	#if CONFIG_DTRACE
3375	/*
3376	* We've already kept a count via overall_deadline of how long we spun.
3377	* If dtrace is active, then we compute backwards to decide how
3378	* long we spun.
3379	*
3380	* Note that we record a different probe id depending on whether
3381	* this is a direct or indirect mutex. This allows us to
3382	* penalize only lock groups that have debug/stats enabled
3383	* with dtrace processing if desired.
3384	*/
3385	if (__probable(mutex->lck_mtx_is_ext == `0`)) {
3386	LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN, mutex,
3387	mach_absolute_time() - (overall_deadline - MutexSpin));
3388	} else {
3389	LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_SPIN, mutex,
3390	mach_absolute_time() - (overall_deadline - MutexSpin));
3391	}
3392	/ The lockstat acquire event is recorded by the assembly code beneath us. /
3393	#endif
3394
3395	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) \| DBG_FUNC_END,
3396	trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, retval, `0`);
3397
3398	return retval;
3399	}
3400
3401
3402
3403	/*
3404	* Routine: lck_mtx_lock_wait_x86
3405	*
3406	* Invoked in order to wait on contention.
3407	*
3408	* Called with the interlock locked and
3409	* preemption disabled...
3410	* returns it unlocked and with preemption enabled
3411	*
3412	* lck_mtx_waiters is 1:1 with a wakeup needing to occur.
3413	* A runnable waiter can exist between wait and acquire
3414	* without a waiters count being set.
3415	* This allows us to never make a spurious wakeup call.
3416	*
3417	* Priority:
3418	* This avoids taking the thread lock if the owning thread is the same priority.
3419	* This optimizes the case of same-priority threads contending on a lock.
3420	* However, that allows the owning thread to drop in priority while holding the lock,
3421	* because there is no state that the priority change can notice that
3422	* says that the targeted thread holds a contended mutex.
3423	*
3424	* One possible solution: priority changes could look for some atomic tag
3425	* on the thread saying 'holding contended lock', and then set up a promotion.
3426	* Needs a story for dropping that promotion - the last contended unlock
3427	* has to notice that this has happened.
3428	*/
3429	__attribute__((noinline))
3430	void
3431	lck_mtx_lock_wait_x86 (
3432	lck_mtx_t *mutex)
3433	{
3434	#if CONFIG_DTRACE
3435	uint64_t sleep_start = `0`;
3436
3437	if (lockstat_probemap[LS_LCK_MTX_LOCK_BLOCK] \|\| lockstat_probemap[LS_LCK_MTX_EXT_LOCK_BLOCK]) {
3438	sleep_start = mach_absolute_time();
3439	}
3440	#endif
3441	thread_t self = current_thread();
3442	assert(self->waiting_for_mutex == NULL);
3443
3444	self->waiting_for_mutex = mutex;
3445
3446	__kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
3447
3448	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) \| DBG_FUNC_START,
3449	trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner),
3450	mutex->lck_mtx_waiters, mutex->lck_mtx_pri, `0`);
3451
3452	integer_t waiter_pri = self->sched_pri;
3453	waiter_pri = MAX(waiter_pri, self->base_pri);
3454	waiter_pri = MAX(waiter_pri, BASEPRI_DEFAULT);
3455	waiter_pri = MIN(waiter_pri, MAXPRI_PROMOTE);
3456
3457	assert(mutex->lck_mtx_pri <= MAXPRI_PROMOTE);
3458
3459	/ Re-initialize lck_mtx_pri if this is the first contention /
3460	if (mutex->lck_mtx_waiters == `0` \|\| mutex->lck_mtx_pri <= waiter_pri)
3461	mutex->lck_mtx_pri = waiter_pri;
3462
3463	thread_t holder = (thread_t)mutex->lck_mtx_owner;
3464
3465	assert(holder != NULL);
3466
3467	/*
3468	* Intel only causes a promotion when priority needs to change,
3469	* reducing thread lock holds but leaving us vulnerable to the holder
3470	* dropping priority.
3471	*/
3472	if (holder->sched_pri < mutex->lck_mtx_pri) {
3473	int promote_pri = mutex->lck_mtx_pri;
3474
3475	spl_t s = splsched();
3476	thread_lock(holder);
3477
3478	/ Check again in case sched_pri changed /
3479	if (holder->sched_pri < promote_pri && holder->promotion_priority < promote_pri) {
3480	if (mutex->lck_mtx_promoted == `0`) {
3481	/ This is the first promotion for this mutex /
3482	mutex->lck_mtx_promoted = `1`;
3483
3484	if (holder->promotions++ == `0`) {
3485	/ This is the first promotion for holder /
3486	sched_thread_promote_to_pri(holder, promote_pri, trace_lck);
3487	} else {
3488	/*
3489	* Holder was previously promoted due to a different mutex,
3490	* check if it needs to raise to match this one
3491	*/
3492	sched_thread_update_promotion_to_pri(holder, promote_pri,
3493	trace_lck);
3494	}
3495	} else {
3496	/*
3497	* Holder was previously promoted due to this mutex,
3498	* check if the pri needs to go up
3499	*/
3500	sched_thread_update_promotion_to_pri(holder, promote_pri, trace_lck);
3501	}
3502	}
3503
3504	thread_unlock(holder);
3505	splx(s);
3506	}
3507
3508	mutex->lck_mtx_waiters++;
3509
3510	thread_set_pending_block_hint(self, kThreadWaitKernelMutex);
3511	assert_wait(LCK_MTX_EVENT(mutex), THREAD_UNINT \| THREAD_WAIT_NOREPORT_USER);
3512
3513	lck_mtx_ilk_unlock(mutex);
3514
3515	thread_block(THREAD_CONTINUE_NULL);
3516
3517	self->waiting_for_mutex = NULL;
3518
3519	KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) \| DBG_FUNC_END,
3520	trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner),
3521	mutex->lck_mtx_waiters, mutex->lck_mtx_pri, `0`);
3522
3523	#if CONFIG_DTRACE
3524	/*
3525	* Record the Dtrace lockstat probe for blocking, block time
3526	* measured from when we were entered.
3527	*/
3528	if (sleep_start) {
3529	if (mutex->lck_mtx_is_ext == `0`) {
3530	LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_BLOCK, mutex,
3531	mach_absolute_time() - sleep_start);
3532	} else {
3533	LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_BLOCK, mutex,
3534	mach_absolute_time() - sleep_start);
3535	}
3536	}
3537	#endif
3538	}
3539
3540	/*
3541	* Routine: kdp_lck_mtx_lock_spin_is_acquired
3542	* NOT SAFE: To be used only by kernel debugger to avoid deadlock.
3543	* Returns: TRUE if lock is acquired.
3544	*/
3545	boolean_t
3546	kdp_lck_mtx_lock_spin_is_acquired(lck_mtx_t *lck)
3547	{
3548	if (not_in_kdp) {
3549	panic("panic: kdp_lck_mtx_lock_spin_is_acquired called outside of kernel debugger");
3550	}
3551
3552	if (lck->lck_mtx_ilocked \|\| lck->lck_mtx_mlocked) {
3553	return TRUE;
3554	}
3555
3556	return FALSE;
3557	}
3558
3559	void
3560	kdp_lck_mtx_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
3561	{
3562	lck_mtx_t * mutex = LCK_EVENT_TO_MUTEX(event);
3563	waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
3564	thread_t holder = (thread_t)mutex->lck_mtx_owner;
3565	waitinfo->owner = thread_tid(holder);
3566	}
3567
3568	void
3569	kdp_rwlck_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
3570	{
3571	lck_rw_t *rwlck = NULL;
3572	switch(waitinfo->wait_type) {
3573	case kThreadWaitKernelRWLockRead:
3574	rwlck = READ_EVENT_TO_RWLOCK(event);
3575	break;
3576	case kThreadWaitKernelRWLockWrite:
3577	case kThreadWaitKernelRWLockUpgrade:
3578	rwlck = WRITE_EVENT_TO_RWLOCK(event);
3579	break;
3580	default:
3581	panic("%s was called with an invalid blocking type", __FUNCTION__);
3582	break;
3583	}
3584	waitinfo->context = VM_KERNEL_UNSLIDE_OR_PERM(rwlck);
3585	waitinfo->owner = `0`;
3586	}
3587

Browse the source code of codebrowser/osfmk/i386/locks_i386.c