/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Portions Copyright (c) 2013, 2016, Joyent, Inc. All rights reserved.
 * Portions Copyright (c) 2013 by Delphix. All rights reserved.
 */

/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/* #pragma ident "@(#)dtrace.c 1.65 08/07/02 SMI" */

/*
 * DTrace - Dynamic Tracing for Solaris
 *
 * This is the implementation of the Solaris Dynamic Tracing framework
 * (DTrace). The user-visible interface to DTrace is described at length in
 * the "Solaris Dynamic Tracing Guide". The interfaces between the libdtrace
 * library, the in-kernel DTrace framework, and the DTrace providers are
 * described in the block comments in the <sys/dtrace.h> header file. The
 * internal architecture of DTrace is described in the block comments in the
 * <sys/dtrace_impl.h> header file. The comments contained within the DTrace
 * implementation very much assume mastery of all of these sources; if one has
 * an unanswered question about the implementation, one should consult them
 * first.
 *
 * The functions here are ordered roughly as follows:
 *
 *   - Probe context functions
 *   - Probe hashing functions
 *   - Non-probe context utility functions
 *   - Matching functions
 *   - Provider-to-Framework API functions
 *   - Probe management functions
 *   - DIF object functions
 *   - Format functions
 *   - Predicate functions
 *   - ECB functions
 *   - Buffer functions
 *   - Enabling functions
 *   - DOF functions
 *   - Anonymous enabling functions
 *   - Process functions
 *   - Consumer state functions
 *   - Helper functions
 *   - Hook functions
 *   - Driver cookbook functions
 *
 * Each group of functions begins with a block comment labelled the "DTrace
 * [Group] Functions", allowing one to find each block by searching forward
 * on capital-f functions.
 */
#include <sys/errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/conf.h>
#include <sys/systm.h>
#include <sys/dtrace_impl.h>
#include <sys/param.h>
#include <sys/proc_internal.h>
#include <sys/ioctl.h>
#include <sys/fcntl.h>
#include <miscfs/devfs/devfs.h>
#include <sys/malloc.h>
#include <sys/kernel_types.h>
#include <sys/proc_internal.h>
#include <sys/uio_internal.h>
#include <sys/kauth.h>
#include <vm/pmap.h>
#include <sys/user.h>
#include <mach/exception_types.h>
#include <sys/signalvar.h>
#include <mach/task.h>
#include <kern/zalloc.h>
#include <kern/ast.h>
#include <kern/sched_prim.h>
#include <kern/task.h>
#include <netinet/in.h>
#include <libkern/sysctl.h>
#include <sys/kdebug.h>

#if MONOTONIC
#include <kern/monotonic.h>
#include <machine/monotonic.h>
#endif /* MONOTONIC */

#include <IOKit/IOPlatformExpert.h>

#include <kern/cpu_data.h>
extern uint32_t pmap_find_phys(void *, uint64_t);
extern boolean_t pmap_valid_page(uint32_t);
extern void OSKextRegisterKextsWithDTrace(void);
extern kmod_info_t g_kernel_kmod_info;

/* Solaris proc_t is the struct. Darwin's proc_t is a pointer to it. */
#define proc_t struct proc /* Steer clear of the Darwin typedef for proc_t */

#define t_predcache t_dtrace_predcache /* Cosmetic. Helps readability of thread.h */

extern void dtrace_suspend(void);
extern void dtrace_resume(void);
extern void dtrace_early_init(void);
extern int dtrace_keep_kernel_symbols(void);
extern void dtrace_init(void);
extern void helper_init(void);
extern void fasttrap_init(void);

static int dtrace_lazy_dofs_duplicate(proc_t *, proc_t *);
extern void dtrace_lazy_dofs_destroy(proc_t *);
extern void dtrace_postinit(void);

extern void dtrace_proc_fork(proc_t*, proc_t*, int);
extern void dtrace_proc_exec(proc_t*);
extern void dtrace_proc_exit(proc_t*);

/*
 * DTrace Tunable Variables
 *
 * The following variables may be dynamically tuned by using sysctl(8), the
 * variables being stored in the kern.dtrace namespace. For example:
 *	sysctl kern.dtrace.dof_maxsize = 1048575	# 1M
 *
 * In general, the only variables that one should be tuning this way are those
 * that affect system-wide DTrace behavior, and for which the default behavior
 * is undesirable. Most of these variables are tunable on a per-consumer
 * basis using DTrace options, and need not be tuned on a system-wide basis.
 * When tuning these variables, avoid pathological values; while some attempt
 * is made to verify the integrity of these variables, they are not considered
 * part of the supported interface to DTrace, and they are therefore not
 * checked comprehensively.
 */
uint64_t dtrace_buffer_memory_maxsize = 0; /* initialized in dtrace_init */
uint64_t dtrace_buffer_memory_inuse = 0;
int dtrace_destructive_disallow = 0;
dtrace_optval_t dtrace_nonroot_maxsize = (16 * 1024 * 1024);
size_t dtrace_difo_maxsize = (256 * 1024);
dtrace_optval_t dtrace_dof_maxsize = (512 * 1024);
dtrace_optval_t dtrace_statvar_maxsize = (16 * 1024);
dtrace_optval_t dtrace_statvar_maxsize_max = (16 * 10 * 1024);
size_t dtrace_actions_max = (16 * 1024);
size_t dtrace_retain_max = 1024;
dtrace_optval_t dtrace_helper_actions_max = 32;
dtrace_optval_t dtrace_helper_providers_max = 64;
dtrace_optval_t dtrace_dstate_defsize = (1 * 1024 * 1024);
size_t dtrace_strsize_default = 256;
dtrace_optval_t dtrace_strsize_min = 8;
dtrace_optval_t dtrace_strsize_max = 65536;
dtrace_optval_t dtrace_cleanrate_default = 990099000;		/* 1.1 hz */
dtrace_optval_t dtrace_cleanrate_min = 20000000;		/* 50 hz */
dtrace_optval_t dtrace_cleanrate_max = (uint64_t)60 * NANOSEC;	/* 1/minute */
dtrace_optval_t dtrace_aggrate_default = NANOSEC;		/* 1 hz */
dtrace_optval_t dtrace_statusrate_default = NANOSEC;		/* 1 hz */
dtrace_optval_t dtrace_statusrate_max = (hrtime_t)10 * NANOSEC;	/* 6/minute */
dtrace_optval_t dtrace_switchrate_default = NANOSEC;		/* 1 hz */
dtrace_optval_t dtrace_nspec_default = 1;
dtrace_optval_t dtrace_specsize_default = 32 * 1024;
dtrace_optval_t dtrace_stackframes_default = 20;
dtrace_optval_t dtrace_ustackframes_default = 20;
dtrace_optval_t dtrace_jstackframes_default = 50;
dtrace_optval_t dtrace_jstackstrsize_default = 512;
dtrace_optval_t dtrace_buflimit_default = 75;
dtrace_optval_t dtrace_buflimit_min = 1;
dtrace_optval_t dtrace_buflimit_max = 99;
int dtrace_msgdsize_max = 128;
hrtime_t dtrace_chill_max = 500 * (NANOSEC / MILLISEC);		/* 500 ms */
hrtime_t dtrace_chill_interval = NANOSEC;			/* 1000 ms */
int dtrace_devdepth_max = 32;
int dtrace_err_verbose;
int dtrace_provide_private_probes = 0;
hrtime_t dtrace_deadman_interval = NANOSEC;
hrtime_t dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC;
hrtime_t dtrace_deadman_user = (hrtime_t)30 * NANOSEC;

/*
 * DTrace External Variables
 *
 * As dtrace(7D) is a kernel module, any DTrace variables are obviously
 * available to DTrace consumers via the backtick (`) syntax. One of these,
 * dtrace_zero, is made deliberately so: it is provided as a source of
 * well-known, zero-filled memory. While this variable is not documented,
 * it is used by some translators as an implementation detail.
 */
const char dtrace_zero[256] = { 0 };	/* zero-filled memory */
unsigned int dtrace_max_cpus = 0;	/* number of enabled cpus */
/*
 * DTrace Internal Variables
 */
static dev_info_t *dtrace_devi;			/* device info */
static vmem_t *dtrace_arena;			/* probe ID arena */
static dtrace_probe_t **dtrace_probes;		/* array of all probes */
static int dtrace_nprobes;			/* number of probes */
static dtrace_provider_t *dtrace_provider;	/* provider list */
static dtrace_meta_t *dtrace_meta_pid;		/* user-land meta provider */
static int dtrace_opens;			/* number of opens */
static int dtrace_helpers;			/* number of helpers */
static dtrace_hash_t *dtrace_strings;
static dtrace_hash_t *dtrace_byprov;		/* probes hashed by provider */
static dtrace_hash_t *dtrace_bymod;		/* probes hashed by module */
static dtrace_hash_t *dtrace_byfunc;		/* probes hashed by function */
static dtrace_hash_t *dtrace_byname;		/* probes hashed by name */
static dtrace_toxrange_t *dtrace_toxrange;	/* toxic range array */
static int dtrace_toxranges;			/* number of toxic ranges */
static int dtrace_toxranges_max;		/* size of toxic range array */
static dtrace_anon_t dtrace_anon;		/* anonymous enabling */
static kmem_cache_t *dtrace_state_cache;	/* cache for dynamic state */
static uint64_t dtrace_vtime_references;	/* number of vtimestamp refs */
static kthread_t *dtrace_panicked;		/* panicking thread */
static dtrace_ecb_t *dtrace_ecb_create_cache;	/* cached created ECB */
static dtrace_genid_t dtrace_probegen;		/* current probe generation */
static dtrace_helpers_t *dtrace_deferred_pid;	/* deferred helper list */
static dtrace_enabling_t *dtrace_retained;	/* list of retained enablings */
static dtrace_genid_t dtrace_retained_gen;	/* current retained enab gen */
static dtrace_dynvar_t dtrace_dynhash_sink;	/* end of dynamic hash chains */

static int dtrace_dof_mode;	/* See dtrace_impl.h for a description of Darwin's dof modes. */

/*
 * This doesn't quite fit as an internal variable, as it must be accessed in
 * fbt_provide and sdt_provide. It's clearly not a dtrace tunable variable either...
 */
int dtrace_kernel_symbol_mode;	/* See dtrace_impl.h for a description of Darwin's kernel symbol modes. */
static uint32_t dtrace_wake_clients;
static uint8_t dtrace_kerneluuid[16];	/* the 128-bit uuid */

/*
 * To save memory, some common memory allocations are given a
 * unique zone. For example, dtrace_probe_t is 72 bytes in size,
 * which means it would fall into the kalloc.128 bucket. With
 * 20k elements allocated, the space saved is substantial.
 */

struct zone *dtrace_probe_t_zone;

static int dtrace_module_unloaded(struct kmod_info *kmod);

/*
 * DTrace Locking
 * DTrace is protected by three (relatively coarse-grained) locks:
 *
 * (1) dtrace_lock is required to manipulate essentially any DTrace state,
 *     including enabling state, probes, ECBs, consumer state, helper state,
 *     etc. Importantly, dtrace_lock is _not_ required when in probe context;
 *     probe context is lock-free -- synchronization is handled via the
 *     dtrace_sync() cross call mechanism.
 *
 * (2) dtrace_provider_lock is required when manipulating provider state, or
 *     when provider state must be held constant.
 *
 * (3) dtrace_meta_lock is required when manipulating meta provider state, or
 *     when meta provider state must be held constant.
 *
 * The lock ordering between these three locks is dtrace_meta_lock before
 * dtrace_provider_lock before dtrace_lock. (In particular, there are
 * several places where dtrace_provider_lock is held by the framework as it
 * calls into the providers -- which then call back into the framework,
 * grabbing dtrace_lock.)
 *
 * There are two other locks in the mix: mod_lock and cpu_lock. With respect
 * to dtrace_provider_lock and dtrace_lock, cpu_lock continues its historical
 * role as a coarse-grained lock; it is acquired before both of these locks.
 * With respect to dtrace_meta_lock, its behavior is stranger: cpu_lock must
 * be acquired _between_ dtrace_meta_lock and any other DTrace locks.
 * mod_lock is similar with respect to dtrace_provider_lock in that it must be
 * acquired _between_ dtrace_provider_lock and dtrace_lock.
 */
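
/*
 * Taken together, the rules above imply a single total ordering when all
 * five locks must be acquired (a summary sketch, derived from the block
 * comment above rather than from any one call path):
 *
 *	dtrace_meta_lock -> cpu_lock -> dtrace_provider_lock ->
 *	    mod_lock -> dtrace_lock
 */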

/*
 * APPLE NOTE:
 *
 * For porting purposes, all kmutex_t vars have been changed
 * to lck_mtx_t, which require explicit initialization.
 *
 * kmutex_t becomes lck_mtx_t
 * mutex_enter() becomes lck_mtx_lock()
 * mutex_exit() becomes lck_mtx_unlock()
 *
 * Lock asserts are changed like this:
 *
 * ASSERT(MUTEX_HELD(&cpu_lock));
 * becomes:
 * LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
 *
 */
static lck_mtx_t dtrace_lock;		/* probe state lock */
static lck_mtx_t dtrace_provider_lock;	/* provider state lock */
static lck_mtx_t dtrace_meta_lock;	/* meta-provider state lock */
static lck_rw_t dtrace_dof_mode_lock;	/* dof mode lock */

/*
 * DTrace Provider Variables
 *
 * These are the variables relating to DTrace as a provider (that is, the
 * provider of the BEGIN, END, and ERROR probes).
 */
static dtrace_pattr_t dtrace_provider_attr = {
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
};

static void
dtrace_nullop(void)
{}

static int
dtrace_enable_nullop(void)
{
	return (0);
}

static dtrace_pops_t dtrace_provider_ops = {
	.dtps_provide = (void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop,
	.dtps_provide_module = (void (*)(void *, struct modctl *))dtrace_nullop,
	.dtps_enable = (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop,
	.dtps_disable = (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
	.dtps_suspend = (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
	.dtps_resume = (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
	.dtps_getargdesc = NULL,
	.dtps_getargval = NULL,
	.dtps_usermode = NULL,
	.dtps_destroy = (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
};

static dtrace_id_t dtrace_probeid_begin;	/* special BEGIN probe */
static dtrace_id_t dtrace_probeid_end;		/* special END probe */
dtrace_id_t dtrace_probeid_error;		/* special ERROR probe */

/*
 * DTrace Helper Tracing Variables
 */
uint32_t dtrace_helptrace_next = 0;
uint32_t dtrace_helptrace_nlocals;
char *dtrace_helptrace_buffer;
size_t dtrace_helptrace_bufsize = 512 * 1024;

#if DEBUG
int dtrace_helptrace_enabled = 1;
#else
int dtrace_helptrace_enabled = 0;
#endif

#if defined (__arm64__)
/*
 * The ioctl for adding helper DOF is based on the
 * size of a user_addr_t. We need to recognize both
 * U32 and U64 as the same action.
 */
#define DTRACEHIOC_ADDDOF_U32	_IOW('h', 4, user32_addr_t)
#define DTRACEHIOC_ADDDOF_U64	_IOW('h', 4, user64_addr_t)
#endif /* __arm64__ */

/*
 * DTrace Error Hashing
 *
 * On DEBUG kernels, DTrace will track the errors that it has seen in a hash
 * table. This is very useful for checking coverage of tests that are
 * expected to induce DIF or DOF processing errors, and may be useful for
 * debugging problems in the DIF code generator or in DOF generation. The
 * error hash may be examined with the ::dtrace_errhash MDB dcmd.
 */
#if DEBUG
static dtrace_errhash_t dtrace_errhash[DTRACE_ERRHASHSZ];
static const char *dtrace_errlast;
static kthread_t *dtrace_errthread;
static lck_mtx_t dtrace_errlock;
#endif

/*
 * DTrace Macros and Constants
 *
 * These are various macros that are useful in various spots in the
 * implementation, along with a few random constants that have no meaning
 * outside of the implementation. There is no real structure to this cpp
 * mishmash -- but is there ever?
 */

#define DTRACE_GETSTR(hash, elm)	\
	(hash->dth_getstr(elm, hash->dth_stroffs))

#define DTRACE_HASHSTR(hash, elm)	\
	dtrace_hash_str(DTRACE_GETSTR(hash, elm))

#define DTRACE_HASHNEXT(hash, elm)	\
	(void**)((uintptr_t)(elm) + (hash)->dth_nextoffs)

#define DTRACE_HASHPREV(hash, elm)	\
	(void**)((uintptr_t)(elm) + (hash)->dth_prevoffs)

#define DTRACE_HASHEQ(hash, lhs, rhs)	\
	(strcmp(DTRACE_GETSTR(hash, lhs), \
	    DTRACE_GETSTR(hash, rhs)) == 0)

#define DTRACE_AGGHASHSIZE_SLEW		17

#define DTRACE_V4MAPPED_OFFSET		(sizeof (uint32_t) * 3)

/*
 * The key for a thread-local variable consists of the lower 61 bits of the
 * current_thread(), plus the 3 bits of the highest active interrupt above LOCK_LEVEL.
 * We add DIF_VARIABLE_MAX to t_did to assure that the thread key is never
 * equal to a variable identifier. This is necessary (but not sufficient) to
 * assure that global associative arrays never collide with thread-local
 * variables. To guarantee that they cannot collide, we must also define the
 * order for keying dynamic variables. That order is:
 *
 *   [ key0 ] ... [ keyn ] [ variable-key ] [ tls-key ]
 *
 * Because the variable-key and the tls-key are in orthogonal spaces, there is
 * no way for a global variable key signature to match a thread-local key
 * signature.
 */
#if defined (__x86_64__)
/* FIXME: two function calls!! */
#define DTRACE_TLS_THRKEY(where) { \
	uint_t intr = ml_at_interrupt_context(); /* Note: just one measly bit */ \
	uint64_t thr = (uintptr_t)current_thread(); \
	ASSERT(intr < (1 << 3)); \
	(where) = ((thr + DIF_VARIABLE_MAX) & \
	    (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
}
#elif defined(__arm__)
/* FIXME: three function calls!!! */
#define DTRACE_TLS_THRKEY(where) { \
	uint_t intr = ml_at_interrupt_context(); /* Note: just one measly bit */ \
	uint64_t thr = (uintptr_t)current_thread(); \
	uint_t pid = (uint_t)dtrace_proc_selfpid(); \
	ASSERT(intr < (1 << 3)); \
	(where) = (((thr << 32 | pid) + DIF_VARIABLE_MAX) & \
	    (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
}
#elif defined (__arm64__)
/* FIXME: two function calls!! */
#define DTRACE_TLS_THRKEY(where) { \
	uint_t intr = ml_at_interrupt_context(); /* Note: just one measly bit */ \
	uint64_t thr = (uintptr_t)current_thread(); \
	ASSERT(intr < (1 << 3)); \
	(where) = ((thr + DIF_VARIABLE_MAX) & \
	    (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
}
#else
#error Unknown architecture
#endif
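
/*
 * An illustrative sketch of the resulting key layout (derived from the
 * macros above; not a definitive specification):
 *
 *	 63    61 60                                               0
 *	+--------+-------------------------------------------------+
 *	|  intr  |  (thread key + DIF_VARIABLE_MAX) mod 2^61       |
 *	+--------+-------------------------------------------------+
 */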

#define DT_BSWAP_8(x)	((x) & 0xff)
#define DT_BSWAP_16(x)	((DT_BSWAP_8(x) << 8) | DT_BSWAP_8((x) >> 8))
#define DT_BSWAP_32(x)	((DT_BSWAP_16(x) << 16) | DT_BSWAP_16((x) >> 16))
#define DT_BSWAP_64(x)	((DT_BSWAP_32(x) << 32) | DT_BSWAP_32((x) >> 32))
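
/*
 * A worked expansion (illustrative): DT_BSWAP_32(0x12345678) evaluates to
 * 0x78563412. Each DT_BSWAP macro byte-swaps its two halves recursively via
 * the next-smaller DT_BSWAP macro and then exchanges them.
 */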

#define DT_MASK_LO 0x00000000FFFFFFFFULL

#define DTRACE_STORE(type, tomax, offset, what) \
	*((type *)((uintptr_t)(tomax) + (uintptr_t)offset)) = (type)(what);

#define DTRACE_ALIGNCHECK(addr, size, flags)				\
	if (addr & (MIN(size,4) - 1)) {					\
		*flags |= CPU_DTRACE_BADALIGN;				\
		cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr;	\
		return (0);						\
	}

#define DTRACE_RANGE_REMAIN(remp, addr, baseaddr, basesz)		\
do {									\
	if ((remp) != NULL) {						\
		*(remp) = (uintptr_t)(baseaddr) + (basesz) - (addr);	\
	}								\
} while (0)

/*
 * Test whether a range of memory starting at testaddr of size testsz falls
 * within the range of memory described by addr, sz. We take care to avoid
 * problems with overflow and underflow of the unsigned quantities, and
 * disallow all negative sizes. Ranges of size 0 are allowed.
 */
#define DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \
	((testaddr) - (baseaddr) < (basesz) && \
	(testaddr) + (testsz) - (baseaddr) <= (basesz) && \
	(testaddr) + (testsz) >= (testaddr))
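
/*
 * A worked example (illustrative): with baseaddr = 0x1000 and basesz = 0x100,
 * testaddr = 0x1080 with testsz = 0x80 satisfies all three clauses, while
 * testsz = 0x81 fails the second clause. A wrapping range such as
 * testaddr = (uintptr_t)-1 with testsz = 2 is caught by the third clause.
 */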

/*
 * Test whether alloc_sz bytes will fit in the scratch region. We isolate
 * alloc_sz on the righthand side of the comparison in order to avoid overflow
 * or underflow in the comparison with it. This is simpler than the INRANGE
 * check above, because we know that the dtms_scratch_ptr is valid in the
 * range. Allocations of size zero are allowed.
 */
#define DTRACE_INSCRATCH(mstate, alloc_sz) \
	((mstate)->dtms_scratch_base + (mstate)->dtms_scratch_size - \
	(mstate)->dtms_scratch_ptr >= (alloc_sz))

#define RECOVER_LABEL(bits) dtraceLoadRecover##bits:

#if defined (__x86_64__) || (defined (__arm__) || defined (__arm64__))
#define DTRACE_LOADFUNC(bits)						\
/*CSTYLED*/								\
uint##bits##_t dtrace_load##bits(uintptr_t addr);			\
									\
uint##bits##_t								\
dtrace_load##bits(uintptr_t addr)					\
{									\
	size_t size = bits / NBBY;					\
	/*CSTYLED*/							\
	uint##bits##_t rval = 0;					\
	int i;								\
	volatile uint16_t *flags = (volatile uint16_t *)		\
	    &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;			\
									\
	DTRACE_ALIGNCHECK(addr, size, flags);				\
									\
	for (i = 0; i < dtrace_toxranges; i++) {			\
		if (addr >= dtrace_toxrange[i].dtt_limit)		\
			continue;					\
									\
		if (addr + size <= dtrace_toxrange[i].dtt_base)		\
			continue;					\
									\
		/*							\
		 * This address falls within a toxic region; return 0.	\
		 */							\
		*flags |= CPU_DTRACE_BADADDR;				\
		cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr;	\
		return (0);						\
	}								\
									\
	{								\
	volatile vm_offset_t recover = (vm_offset_t)&&dtraceLoadRecover##bits;	\
	*flags |= CPU_DTRACE_NOFAULT;					\
	recover = dtrace_set_thread_recover(current_thread(), recover);	\
	/*CSTYLED*/							\
	/*								\
	 * PR6394061 - avoid device memory that is unpredictably	\
	 * mapped and unmapped						\
	 */								\
	if (pmap_valid_page(pmap_find_phys(kernel_pmap, addr)))	\
		rval = *((volatile uint##bits##_t *)addr);		\
	else {								\
		*flags |= CPU_DTRACE_BADADDR;				\
		cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr;	\
		return (0);						\
	}								\
									\
	RECOVER_LABEL(bits);						\
	(void)dtrace_set_thread_recover(current_thread(), recover);	\
	*flags &= ~CPU_DTRACE_NOFAULT;					\
	}								\
									\
	return (rval);							\
}
#else /* all other architectures */
#error Unknown Architecture
#endif

#ifdef __LP64__
#define dtrace_loadptr	dtrace_load64
#else
#define dtrace_loadptr	dtrace_load32
#endif

#define DTRACE_DYNHASH_FREE	0
#define DTRACE_DYNHASH_SINK	1
#define DTRACE_DYNHASH_VALID	2

#define DTRACE_MATCH_FAIL	-1
#define DTRACE_MATCH_NEXT	0
#define DTRACE_MATCH_DONE	1
#define DTRACE_ANCHORED(probe)	((probe)->dtpr_func[0] != '\0')
#define DTRACE_STATE_ALIGN	64

#define DTRACE_FLAGS2FLT(flags)						\
	(((flags) & CPU_DTRACE_BADADDR) ? DTRACEFLT_BADADDR :		\
	((flags) & CPU_DTRACE_ILLOP) ? DTRACEFLT_ILLOP :		\
	((flags) & CPU_DTRACE_DIVZERO) ? DTRACEFLT_DIVZERO :		\
	((flags) & CPU_DTRACE_KPRIV) ? DTRACEFLT_KPRIV :		\
	((flags) & CPU_DTRACE_UPRIV) ? DTRACEFLT_UPRIV :		\
	((flags) & CPU_DTRACE_TUPOFLOW) ? DTRACEFLT_TUPOFLOW :		\
	((flags) & CPU_DTRACE_BADALIGN) ? DTRACEFLT_BADALIGN :		\
	((flags) & CPU_DTRACE_NOSCRATCH) ? DTRACEFLT_NOSCRATCH :	\
	((flags) & CPU_DTRACE_BADSTACK) ? DTRACEFLT_BADSTACK :		\
	DTRACEFLT_UNKNOWN)

#define DTRACEACT_ISSTRING(act)						\
	((act)->dta_kind == DTRACEACT_DIFEXPR &&			\
	(act)->dta_difo->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING)

static size_t dtrace_strlen(const char *, size_t);
static dtrace_probe_t *dtrace_probe_lookup_id(dtrace_id_t id);
static void dtrace_enabling_provide(dtrace_provider_t *);
static int dtrace_enabling_match(dtrace_enabling_t *, int *, dtrace_match_cond_t *cond);
static void dtrace_enabling_matchall_with_cond(dtrace_match_cond_t *cond);
static void dtrace_enabling_matchall(void);
static dtrace_state_t *dtrace_anon_grab(void);
static uint64_t dtrace_helper(int, dtrace_mstate_t *,
    dtrace_state_t *, uint64_t, uint64_t);
static dtrace_helpers_t *dtrace_helpers_create(proc_t *);
static void dtrace_buffer_drop(dtrace_buffer_t *);
static intptr_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t,
    dtrace_state_t *, dtrace_mstate_t *);
static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t,
    dtrace_optval_t);
static int dtrace_ecb_create_enable(dtrace_probe_t *, void *, void *);
static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *);
static int dtrace_canload_remains(uint64_t, size_t, size_t *,
    dtrace_mstate_t *, dtrace_vstate_t *);
static int dtrace_canstore_remains(uint64_t, size_t, size_t *,
    dtrace_mstate_t *, dtrace_vstate_t *);

/*
 * DTrace sysctl handlers
 *
 * These declarations and functions are used for deeper DTrace configuration.
 * Most of these settings are not applied on a per-consumer basis and may
 * impact other DTrace consumers. Not every value is validated
 * comprehensively, so be careful about the values you use.
 */

SYSCTL_DECL(_kern_dtrace);
SYSCTL_NODE(_kern, OID_AUTO, dtrace, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "dtrace");

static int
sysctl_dtrace_err_verbose SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg2)
	int changed, error;
	int value = *(int *) arg1;

	error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
	if (error || !changed)
		return (error);

	if (value != 0 && value != 1)
		return (ERANGE);

	lck_mtx_lock(&dtrace_lock);
	dtrace_err_verbose = value;
	lck_mtx_unlock(&dtrace_lock);

	return (0);
}

/*
 * kern.dtrace.err_verbose
 *
 * Set DTrace verbosity when an error occurs (0 = disabled, 1 = enabled).
 * Errors are reported when a DIFO or a DOF has been rejected by the kernel.
 */
SYSCTL_PROC(_kern_dtrace, OID_AUTO, err_verbose,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &dtrace_err_verbose, 0,
    sysctl_dtrace_err_verbose, "I", "dtrace error verbose");
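
/*
 * For example, this tunable may be toggled from the command line
 * (an illustrative invocation):
 *
 *	sysctl kern.dtrace.err_verbose=1
 */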

static int
sysctl_dtrace_buffer_memory_maxsize SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg2, req)
	int changed, error;
	uint64_t value = *(uint64_t *) arg1;

	error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
	if (error || !changed)
		return (error);

	if (value <= dtrace_buffer_memory_inuse)
		return (ERANGE);

	lck_mtx_lock(&dtrace_lock);
	dtrace_buffer_memory_maxsize = value;
	lck_mtx_unlock(&dtrace_lock);

	return (0);
}

/*
 * kern.dtrace.buffer_memory_maxsize
 *
 * Set the maximum size, in bytes, of memory that may be used by all the
 * consumers' state buffers. By default the limit is PHYS_MEM / 3 for *all*
 * consumers. Attempting to set a zero or negative value, or a value less
 * than or equal to dtrace_buffer_memory_inuse, will result in a failure.
 */
SYSCTL_PROC(_kern_dtrace, OID_AUTO, buffer_memory_maxsize,
    CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
    &dtrace_buffer_memory_maxsize, 0,
    sysctl_dtrace_buffer_memory_maxsize, "Q", "dtrace state buffer memory maxsize");

/*
 * kern.dtrace.buffer_memory_inuse
 *
 * Current state buffer memory used, in bytes, by all the DTrace consumers.
 * This value is read-only.
 */
SYSCTL_QUAD(_kern_dtrace, OID_AUTO, buffer_memory_inuse, CTLFLAG_RD | CTLFLAG_LOCKED,
    &dtrace_buffer_memory_inuse, "dtrace state buffer memory in-use");

static int
sysctl_dtrace_difo_maxsize SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg2, req)
	int changed, error;
	size_t value = *(size_t*) arg1;

	error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
	if (error || !changed)
		return (error);

	if (value <= 0)
		return (ERANGE);

	lck_mtx_lock(&dtrace_lock);
	dtrace_difo_maxsize = value;
	lck_mtx_unlock(&dtrace_lock);

	return (0);
}

/*
 * kern.dtrace.difo_maxsize
 *
 * Set the DIFO maximum size, in bytes; see the definition of
 * dtrace_difo_maxsize for the default value. Attempting to set a zero or
 * negative size will result in a failure.
 */
SYSCTL_PROC(_kern_dtrace, OID_AUTO, difo_maxsize,
    CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
    &dtrace_difo_maxsize, 0,
    sysctl_dtrace_difo_maxsize, "Q", "dtrace difo maxsize");

static int
sysctl_dtrace_dof_maxsize SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg2, req)
	int changed, error;
	dtrace_optval_t value = *(dtrace_optval_t *) arg1;

	error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
	if (error || !changed)
		return (error);

	if (value <= 0)
		return (ERANGE);

	if (value >= dtrace_copy_maxsize())
		return (ERANGE);

	lck_mtx_lock(&dtrace_lock);
	dtrace_dof_maxsize = value;
	lck_mtx_unlock(&dtrace_lock);

	return (0);
}

/*
 * kern.dtrace.dof_maxsize
 *
 * Set the DOF maximum size, in bytes; see the definition of
 * dtrace_dof_maxsize for the default value. Attempting to set a zero or
 * negative size will result in a failure.
 */
SYSCTL_PROC(_kern_dtrace, OID_AUTO, dof_maxsize,
    CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
    &dtrace_dof_maxsize, 0,
    sysctl_dtrace_dof_maxsize, "Q", "dtrace dof maxsize");

static int
sysctl_dtrace_statvar_maxsize SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg2, req)
	int changed, error;
	dtrace_optval_t value = *(dtrace_optval_t*) arg1;

	error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
	if (error || !changed)
		return (error);

	if (value <= 0)
		return (ERANGE);
	if (value > dtrace_statvar_maxsize_max)
		return (ERANGE);

	lck_mtx_lock(&dtrace_lock);
	dtrace_statvar_maxsize = value;
	lck_mtx_unlock(&dtrace_lock);

	return (0);
}

/*
 * kern.dtrace.global_maxsize
 *
 * Set the global variable maximum size, in bytes; see the definition of
 * dtrace_statvar_maxsize for the default value. Attempting to set a zero or
 * negative size, or a size greater than dtrace_statvar_maxsize_max, will
 * result in a failure.
 */
SYSCTL_PROC(_kern_dtrace, OID_AUTO, global_maxsize,
    CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
    &dtrace_statvar_maxsize, 0,
    sysctl_dtrace_statvar_maxsize, "Q", "dtrace statvar maxsize");

static int
sysctl_dtrace_provide_private_probes SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg2)
	int error;
	int value = *(int *) arg1;

	error = sysctl_io_number(req, value, sizeof(value), &value, NULL);
	if (error)
		return (error);

	if (req->newptr) {
		if (value != 0 && value != 1)
			return (ERANGE);

		/*
		 * We do not allow changing this back to zero, as private probes
		 * would still be left registered
		 */
		if (value != 1)
			return (EPERM);

		lck_mtx_lock(&dtrace_lock);
		dtrace_provide_private_probes = value;
		lck_mtx_unlock(&dtrace_lock);
	}
	return (0);
}

/*
 * kern.dtrace.provide_private_probes
 *
 * Set whether the providers must provide the private probes. This is
 * mainly used by the FBT provider to request probes for the private/static
 * symbols.
 */
SYSCTL_PROC(_kern_dtrace, OID_AUTO, provide_private_probes,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &dtrace_provide_private_probes, 0,
    sysctl_dtrace_provide_private_probes, "I", "provider must provide the private probes");

/*
 * kern.dtrace.dof_mode
 *
 * Returns the current DOF mode.
 * This value is read-only.
 */
SYSCTL_INT(_kern_dtrace, OID_AUTO, dof_mode, CTLFLAG_RD | CTLFLAG_LOCKED,
    &dtrace_dof_mode, 0, "dtrace dof mode");

/*
 * DTrace Probe Context Functions
 *
 * These functions are called from probe context. Because probe context is
 * any context in which C may be called, arbitrary locks may be held,
 * interrupts may be disabled, we may be in arbitrary dispatched state, etc.
 * As a result, functions called from probe context may only call other DTrace
 * support functions -- they may not interact at all with the system at large.
 * (Note that the ASSERT macro is made probe-context safe by redefining it in
 * terms of dtrace_assfail(), a probe-context safe function.) If arbitrary
 * loads are to be performed from probe context, they _must_ be in terms of
 * the safe dtrace_load*() variants.
 *
 * Some functions in this block are not actually called from probe context;
 * for these functions, there will be a comment above the function reading
 * "Note: not called from probe context."
 */

int
dtrace_assfail(const char *a, const char *f, int l)
{
	panic("dtrace: assertion failed: %s, file: %s, line: %d", a, f, l);

	/*
	 * We just need something here that even the most clever compiler
	 * cannot optimize away.
	 */
	return (a[(uintptr_t)f]);
}

/*
 * Atomically increment a specified error counter from probe context.
 */
static void
dtrace_error(uint32_t *counter)
{
	/*
	 * Most counters stored to in probe context are per-CPU counters.
	 * However, there are some error conditions that are sufficiently
	 * arcane that they don't merit per-CPU storage. If these counters
	 * are incremented concurrently on different CPUs, scalability will be
	 * adversely affected -- but we don't expect them to be white-hot in a
	 * correctly constructed enabling...
	 */
	uint32_t oval, nval;

	do {
		oval = *counter;

		if ((nval = oval + 1) == 0) {
			/*
			 * If the counter would wrap, set it to 1 -- assuring
			 * that the counter is never zero when we have seen
			 * errors. (The counter must be 32-bits because we
			 * aren't guaranteed a 64-bit compare&swap operation.)
			 * To save this code both the infamy of being fingered
			 * by a priggish news story and the indignity of being
			 * the target of a neo-puritan witch trial, we're
			 * carefully avoiding any colorful description of the
			 * likelihood of this condition -- but suffice it to
			 * say that it is only slightly more likely than the
			 * overflow of predicate cache IDs, as discussed in
			 * dtrace_predicate_create().
			 */
			nval = 1;
		}
	} while (dtrace_cas32(counter, oval, nval) != oval);
}

/*
 * Use the DTRACE_LOADFUNC macro to define functions for each of loading a
 * uint8_t, a uint16_t, a uint32_t and a uint64_t.
 */
DTRACE_LOADFUNC(8)
DTRACE_LOADFUNC(16)
DTRACE_LOADFUNC(32)
DTRACE_LOADFUNC(64)
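
/*
 * For example, DTRACE_LOADFUNC(8) above expands (in sketch form) to:
 *
 *	uint8_t dtrace_load8(uintptr_t addr) { ... }
 *
 * with size = 1, the toxic-range scan, and the recover label
 * dtraceLoadRecover8 all instantiated for the 8-bit width.
 */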

static int
dtrace_inscratch(uintptr_t dest, size_t size, dtrace_mstate_t *mstate)
{
	if (dest < mstate->dtms_scratch_base)
		return (0);

	if (dest + size < dest)
		return (0);

	if (dest + size > mstate->dtms_scratch_ptr)
		return (0);

	return (1);
}

static int
dtrace_canstore_statvar(uint64_t addr, size_t sz, size_t *remain,
    dtrace_statvar_t **svars, int nsvars)
{
	int i;

	size_t maxglobalsize, maxlocalsize;

	maxglobalsize = dtrace_statvar_maxsize + sizeof (uint64_t);
	maxlocalsize = (maxglobalsize) * NCPU;

	if (nsvars == 0)
		return (0);

	for (i = 0; i < nsvars; i++) {
		dtrace_statvar_t *svar = svars[i];
		uint8_t scope;
		size_t size;

		if (svar == NULL || (size = svar->dtsv_size) == 0)
			continue;

		scope = svar->dtsv_var.dtdv_scope;

		/*
		 * We verify that our size is valid in the spirit of providing
		 * defense in depth: we want to prevent attackers from using
		 * DTrace to escalate an orthogonal kernel heap corruption bug
		 * into the ability to store to arbitrary locations in memory.
		 */
		VERIFY((scope == DIFV_SCOPE_GLOBAL && size <= maxglobalsize) ||
		    (scope == DIFV_SCOPE_LOCAL && size <= maxlocalsize));

		if (DTRACE_INRANGE(addr, sz, svar->dtsv_data, svar->dtsv_size)) {
			DTRACE_RANGE_REMAIN(remain, addr, svar->dtsv_data,
			    svar->dtsv_size);
			return (1);
		}
	}

	return (0);
}

/*
 * Check to see if the address is within a memory region to which a store may
 * be issued. This includes the DTrace scratch areas, and any DTrace variable
 * region. The caller of dtrace_canstore() is responsible for performing any
 * alignment checks that are needed before stores are actually executed.
 */
static int
dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
    dtrace_vstate_t *vstate)
{
	return (dtrace_canstore_remains(addr, sz, NULL, mstate, vstate));
}

/*
 * Implementation of dtrace_canstore which communicates the upper bound of the
 * allowed memory region.
 */
static int
dtrace_canstore_remains(uint64_t addr, size_t sz, size_t *remain,
    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
{
	/*
	 * First, check to see if the address is in scratch space...
	 */
	if (DTRACE_INRANGE(addr, sz, mstate->dtms_scratch_base,
	    mstate->dtms_scratch_size)) {
		DTRACE_RANGE_REMAIN(remain, addr, mstate->dtms_scratch_base,
		    mstate->dtms_scratch_size);
		return (1);
	}
	/*
	 * Now check to see if it's a dynamic variable. This check will pick
	 * up both thread-local variables and any global dynamically-allocated
	 * variables.
	 */
	if (DTRACE_INRANGE(addr, sz, (uintptr_t)vstate->dtvs_dynvars.dtds_base,
	    vstate->dtvs_dynvars.dtds_size)) {
		dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
		uintptr_t base = (uintptr_t)dstate->dtds_base +
		    (dstate->dtds_hashsize * sizeof (dtrace_dynhash_t));
		uintptr_t chunkoffs;
		dtrace_dynvar_t *dvar;

		/*
		 * Before we assume that we can store here, we need to make
		 * sure that it isn't in our metadata -- storing to our
		 * dynamic variable metadata would corrupt our state. For
		 * the range to not include any dynamic variable metadata,
		 * it must:
		 *
		 *	(1) Start above the hash table that is at the base of
		 *	the dynamic variable space
		 *
		 *	(2) Have a starting chunk offset that is beyond the
		 *	dtrace_dynvar_t that is at the base of every chunk
		 *
		 *	(3) Not span a chunk boundary
		 *
		 *	(4) Not be in the tuple space of a dynamic variable
		 *
		 */
		if (addr < base)
			return (0);

		chunkoffs = (addr - base) % dstate->dtds_chunksize;

		if (chunkoffs < sizeof (dtrace_dynvar_t))
			return (0);

		if (chunkoffs + sz > dstate->dtds_chunksize)
			return (0);

		dvar = (dtrace_dynvar_t *)((uintptr_t)addr - chunkoffs);

		if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE)
			return (0);

		if (chunkoffs < sizeof (dtrace_dynvar_t) +
		    ((dvar->dtdv_tuple.dtt_nkeys - 1) * sizeof (dtrace_key_t)))
			return (0);

		return (1);
	}

	/*
	 * Finally, check the static local and global variables. These checks
	 * take the longest, so we perform them last.
	 */
	if (dtrace_canstore_statvar(addr, sz, remain,
	    vstate->dtvs_locals, vstate->dtvs_nlocals))
		return (1);

	if (dtrace_canstore_statvar(addr, sz, remain,
	    vstate->dtvs_globals, vstate->dtvs_nglobals))
		return (1);

	return (0);
}
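
/*
 * An illustrative sketch of the dynamic variable space consulted above
 * (derived from checks (1)-(4) in dtrace_canstore_remains()):
 *
 *	dtds_base ->	+---------------------------------+
 *			| hash table                      |  never storable
 *	     base ->	+---------------------------------+
 *			| chunk: dtrace_dynvar_t header   |  never storable
 *			|        tuple key space          |  never storable
 *			|        variable data            |  storable
 *			+---------------------------------+
 *			| chunk: ...                      |
 */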

/*
 * Convenience routine to check to see if the address is within a memory
 * region in which a load may be issued given the user's privilege level;
 * if not, it sets the appropriate error flags and loads 'addr' into the
 * illegal value slot.
 *
 * DTrace subroutines (DIF_SUBR_*) should use this helper to implement
 * appropriate memory access protection.
 */
int
dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
    dtrace_vstate_t *vstate)
{
	return (dtrace_canload_remains(addr, sz, NULL, mstate, vstate));
}

/*
 * Implementation of dtrace_canload which communicates the upper bound of the
 * allowed memory region.
 */
static int
dtrace_canload_remains(uint64_t addr, size_t sz, size_t *remain,
    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
{
	volatile uint64_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;

	/*
	 * If we hold the privilege to read from kernel memory, then
	 * everything is readable.
	 */
	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
		DTRACE_RANGE_REMAIN(remain, addr, addr, sz);
		return (1);
	}

	/*
	 * You can obviously read that which you can store.
	 */
	if (dtrace_canstore_remains(addr, sz, remain, mstate, vstate))
		return (1);

	/*
	 * We're allowed to read from our own string table.
	 */
	if (DTRACE_INRANGE(addr, sz, (uintptr_t)mstate->dtms_difo->dtdo_strtab,
	    mstate->dtms_difo->dtdo_strlen)) {
		DTRACE_RANGE_REMAIN(remain, addr,
		    mstate->dtms_difo->dtdo_strtab,
		    mstate->dtms_difo->dtdo_strlen);
		return (1);
	}

	DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
	*illval = addr;
	return (0);
}

/*
 * Convenience routine to check to see if a given string is within a memory
 * region in which a load may be issued given the user's privilege level;
 * this exists so that we don't need to issue unnecessary dtrace_strlen()
 * calls in the event that the user has all privileges.
 */
static int
dtrace_strcanload(uint64_t addr, size_t sz, size_t *remain,
    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
{
	size_t rsize;

	/*
	 * If we hold the privilege to read from kernel memory, then
	 * everything is readable.
	 */
	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
		DTRACE_RANGE_REMAIN(remain, addr, addr, sz);
		return (1);
	}

	/*
	 * Even if the caller is uninterested in querying the remaining valid
	 * range, it is required to ensure that the access is allowed.
	 */
	if (remain == NULL) {
		remain = &rsize;
	}
	if (dtrace_canload_remains(addr, 0, remain, mstate, vstate)) {
		size_t strsz;
		/*
		 * Perform the strlen after determining the length of the
		 * memory region which is accessible. This prevents timing
		 * information from being used to find NULs in memory which is
		 * not accessible to the caller.
		 */
		strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr,
		    MIN(sz, *remain));
		if (strsz <= *remain) {
			return (1);
		}
	}

	return (0);
}

/*
 * Convenience routine to check to see if a given variable is within a memory
 * region in which a load may be issued given the user's privilege level.
 */
static int
dtrace_vcanload(void *src, dtrace_diftype_t *type, size_t *remain,
    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
{
	size_t sz;
	ASSERT(type->dtdt_flags & DIF_TF_BYREF);

	/*
	 * Calculate the max size before performing any checks since even
	 * DTRACE_ACCESS_KERNEL-credentialed callers expect that this function
	 * return the max length via 'remain'.
	 */
	if (type->dtdt_kind == DIF_TYPE_STRING) {
		dtrace_state_t *state = vstate->dtvs_state;

		if (state != NULL) {
			sz = state->dts_options[DTRACEOPT_STRSIZE];
		} else {
			/*
			 * In helper context, we have a NULL state; fall back
			 * to using the system-wide default for the string size
			 * in this case.
			 */
			sz = dtrace_strsize_default;
		}
	} else {
		sz = type->dtdt_size;
	}

	/*
	 * If we hold the privilege to read from kernel memory, then
	 * everything is readable.
	 */
	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
		DTRACE_RANGE_REMAIN(remain, (uintptr_t)src, src, sz);
		return (1);
	}

	if (type->dtdt_kind == DIF_TYPE_STRING) {
		return (dtrace_strcanload((uintptr_t)src, sz, remain, mstate,
		    vstate));
	}
	return (dtrace_canload_remains((uintptr_t)src, sz, remain, mstate,
	    vstate));
}

/*
 * Compare two strings using safe loads.
 */
static int
dtrace_strncmp(char *s1, char *s2, size_t limit)
{
	uint8_t c1, c2;
	volatile uint16_t *flags;

	if (s1 == s2 || limit == 0)
		return (0);

	flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;

	do {
		if (s1 == NULL) {
			c1 = '\0';
		} else {
			c1 = dtrace_load8((uintptr_t)s1++);
		}

		if (s2 == NULL) {
			c2 = '\0';
		} else {
			c2 = dtrace_load8((uintptr_t)s2++);
		}

		if (c1 != c2)
			return (c1 - c2);
	} while (--limit && c1 != '\0' && !(*flags & CPU_DTRACE_FAULT));

	return (0);
}

/*
 * Compute strlen(s) for a string using safe memory accesses. The additional
 * len parameter is used to specify a maximum length to ensure completion.
 */
static size_t
dtrace_strlen(const char *s, size_t lim)
{
	uint_t len;

	for (len = 0; len != lim; len++) {
		if (dtrace_load8((uintptr_t)s++) == '\0')
			break;
	}

	return (len);
}

/*
 * Check if an address falls within a toxic region.
 */
static int
dtrace_istoxic(uintptr_t kaddr, size_t size)
{
	uintptr_t taddr, tsize;
	int i;

	for (i = 0; i < dtrace_toxranges; i++) {
		taddr = dtrace_toxrange[i].dtt_base;
		tsize = dtrace_toxrange[i].dtt_limit - taddr;

		if (kaddr - taddr < tsize) {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
			cpu_core[CPU->cpu_id].cpuc_dtrace_illval = kaddr;
			return (1);
		}

		if (taddr - kaddr < size) {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
			cpu_core[CPU->cpu_id].cpuc_dtrace_illval = taddr;
			return (1);
		}
	}

	return (0);
}

/*
 * Copy src to dst using safe memory accesses. The src is assumed to be unsafe
 * memory specified by the DIF program. The dst is assumed to be safe memory
 * that we can store to directly because it is managed by DTrace. As with
 * standard bcopy, overlapping copies are handled properly.
 */
static void
dtrace_bcopy(const void *src, void *dst, size_t len)
{
	if (len != 0) {
		uint8_t *s1 = dst;
		const uint8_t *s2 = src;

		if (s1 <= s2) {
			do {
				*s1++ = dtrace_load8((uintptr_t)s2++);
			} while (--len != 0);
		} else {
			s2 += len;
			s1 += len;

			do {
				*--s1 = dtrace_load8((uintptr_t)--s2);
			} while (--len != 0);
		}
	}
}

/*
 * Copy src to dst using safe memory accesses, up to either the specified
 * length, or the point that a nul byte is encountered. The src is assumed to
 * be unsafe memory specified by the DIF program. The dst is assumed to be
 * safe memory that we can store to directly because it is managed by DTrace.
 * Unlike dtrace_bcopy(), overlapping regions are not handled.
 */
static void
dtrace_strcpy(const void *src, void *dst, size_t len)
{
	if (len != 0) {
		uint8_t *s1 = dst, c;
		const uint8_t *s2 = src;

		do {
			*s1++ = c = dtrace_load8((uintptr_t)s2++);
		} while (--len != 0 && c != '\0');
	}
}

/*
 * Copy src to dst, deriving the size and type from the specified (BYREF)
 * variable type. The src is assumed to be unsafe memory specified by the DIF
 * program. The dst is assumed to be DTrace variable memory that is of the
 * specified type; we assume that we can store to directly.
 */
static void
dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type, size_t limit)
{
	ASSERT(type->dtdt_flags & DIF_TF_BYREF);

	if (type->dtdt_kind == DIF_TYPE_STRING) {
		dtrace_strcpy(src, dst, MIN(type->dtdt_size, limit));
	} else {
		dtrace_bcopy(src, dst, MIN(type->dtdt_size, limit));
	}
}

/*
 * Compare s1 to s2 using safe memory accesses. The s1 data is assumed to be
 * unsafe memory specified by the DIF program. The s2 data is assumed to be
 * safe memory that we can access directly because it is managed by DTrace.
 */
static int
dtrace_bcmp(const void *s1, const void *s2, size_t len)
{
	volatile uint16_t *flags;

	flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;

	if (s1 == s2)
		return (0);

	if (s1 == NULL || s2 == NULL)
		return (1);

	if (s1 != s2 && len != 0) {
		const uint8_t *ps1 = s1;
		const uint8_t *ps2 = s2;

		do {
			if (dtrace_load8((uintptr_t)ps1++) != *ps2++)
				return (1);
		} while (--len != 0 && !(*flags & CPU_DTRACE_FAULT));
	}
	return (0);
}

/*
 * Zero the specified region using a simple byte-by-byte loop. Note that this
 * is for safe DTrace-managed memory only.
 */
static void
dtrace_bzero(void *dst, size_t len)
{
	uchar_t *cp;

	for (cp = dst; len != 0; len--)
		*cp++ = 0;
}

static void
dtrace_add_128(uint64_t *addend1, uint64_t *addend2, uint64_t *sum)
{
	uint64_t result[2];

	result[0] = addend1[0] + addend2[0];
	result[1] = addend1[1] + addend2[1] +
	    (result[0] < addend1[0] || result[0] < addend2[0] ? 1 : 0);

	sum[0] = result[0];
	sum[1] = result[1];
}
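
/*
 * The carry detection above relies on unsigned wraparound. For example
 * (illustrative): adding low words 0xffffffffffffffff and 1 yields 0, which
 * is less than either addend, so a carry of 1 propagates to the high word.
 */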

/*
 * Shift the 128-bit value in a by b. If b is positive, shift left.
 * If b is negative, shift right.
 */
static void
dtrace_shift_128(uint64_t *a, int b)
{
	uint64_t mask;

	if (b == 0)
		return;

	if (b < 0) {
		b = -b;
		if (b >= 64) {
			a[0] = a[1] >> (b - 64);
			a[1] = 0;
		} else {
			a[0] >>= b;
			mask = 1LL << (64 - b);
			mask -= 1;
			a[0] |= ((a[1] & mask) << (64 - b));
			a[1] >>= b;
		}
	} else {
		if (b >= 64) {
			a[1] = a[0] << (b - 64);
			a[0] = 0;
		} else {
			a[1] <<= b;
			mask = a[0] >> (64 - b);
			a[1] |= mask;
			a[0] <<= b;
		}
	}
}

/*
 * The basic idea is to break the 2 64-bit values into 4 32-bit values,
 * use native multiplication on those, and then re-combine into the
 * resulting 128-bit value.
 *
 * (hi1 << 32 + lo1) * (hi2 << 32 + lo2) =
 *     hi1 * hi2 << 64 +
 *     hi1 * lo2 << 32 +
 *     hi2 * lo1 << 32 +
 *     lo1 * lo2
 */
static void
dtrace_multiply_128(uint64_t factor1, uint64_t factor2, uint64_t *product)
{
	uint64_t hi1, hi2, lo1, lo2;
	uint64_t tmp[2];

	hi1 = factor1 >> 32;
	hi2 = factor2 >> 32;

	lo1 = factor1 & DT_MASK_LO;
	lo2 = factor2 & DT_MASK_LO;

	product[0] = lo1 * lo2;
	product[1] = hi1 * hi2;

	tmp[0] = hi1 * lo2;
	tmp[1] = 0;
	dtrace_shift_128(tmp, 32);
	dtrace_add_128(product, tmp, product);

	tmp[0] = hi2 * lo1;
	tmp[1] = 0;
	dtrace_shift_128(tmp, 32);
	dtrace_add_128(product, tmp, product);
}
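
/*
 * A small worked example (illustrative): multiplying 2^32 by 2^32 gives
 * hi1 = hi2 = 1 and lo1 = lo2 = 0, so all the partial products vanish except
 * hi1 * hi2, leaving product[1] = 1 and product[0] = 0 -- that is, 2^64.
 */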
1537
1538/*
1539 * This privilege check should be used by actions and subroutines to
1540 * verify that the user credentials of the process that enabled the
1541 * invoking ECB match the target credentials
1542 */
1543static int
1544dtrace_priv_proc_common_user(dtrace_state_t *state)
1545{
1546 cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1547
1548 /*
1549 * We should always have a non-NULL state cred here, since if cred
1550 * is null (anonymous tracing), we fast-path bypass this routine.
1551 */
1552 ASSERT(s_cr != NULL);
1553
1554 if ((cr = dtrace_CRED()) != NULL &&
1555 posix_cred_get(s_cr)->cr_uid == posix_cred_get(cr)->cr_uid &&
1556 posix_cred_get(s_cr)->cr_uid == posix_cred_get(cr)->cr_ruid &&
1557 posix_cred_get(s_cr)->cr_uid == posix_cred_get(cr)->cr_suid &&
1558 posix_cred_get(s_cr)->cr_gid == posix_cred_get(cr)->cr_gid &&
1559 posix_cred_get(s_cr)->cr_gid == posix_cred_get(cr)->cr_rgid &&
1560 posix_cred_get(s_cr)->cr_gid == posix_cred_get(cr)->cr_sgid)
1561 return (1);
1562
1563 return (0);
1564}
1565
1566/*
1567 * This privilege check should be used by actions and subroutines to
1568 * verify that the zone of the process that enabled the invoking ECB
1569 * matches the target credentials
1570 */
1571static int
1572dtrace_priv_proc_common_zone(dtrace_state_t *state)
1573{
1574 cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1575#pragma unused(cr, s_cr, state) /* __APPLE__ */
1576
1577 /*
1578 * We should always have a non-NULL state cred here, since if cred
1579 * is null (anonymous tracing), we fast-path bypass this routine.
1580 */
1581 ASSERT(s_cr != NULL);
1582
1583 return 1; /* APPLE NOTE: Darwin doesn't do zones. */
1584}
1585
1586/*
1587 * This privilege check should be used by actions and subroutines to
1588 * verify that the process has not setuid or changed credentials.
1589 */
1590static int
1591dtrace_priv_proc_common_nocd(void)
1592{
1593 return 1; /* Darwin omits "No Core Dump" flag. */
1594}
1595
1596static int
1597dtrace_priv_proc_destructive(dtrace_state_t *state)
1598{
1599 int action = state->dts_cred.dcr_action;
1600
1601 if (ISSET(current_proc()->p_lflag, P_LNOATTACH))
1602 goto bad;
1603
1604 if (dtrace_is_restricted() && !dtrace_can_attach_to_proc(current_proc()))
1605 goto bad;
1606
1607 if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE) == 0) &&
1608 dtrace_priv_proc_common_zone(state) == 0)
1609 goto bad;
1610
1611 if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER) == 0) &&
1612 dtrace_priv_proc_common_user(state) == 0)
1613 goto bad;
1614
1615 if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG) == 0) &&
1616 dtrace_priv_proc_common_nocd() == 0)
1617 goto bad;
1618
1619 return (1);
1620
1621bad:
1622 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1623
1624 return (0);
1625}
1626
1627static int
1628dtrace_priv_proc_control(dtrace_state_t *state)
1629{
1630 if (ISSET(current_proc()->p_lflag, P_LNOATTACH))
1631 goto bad;
1632
1633 if (dtrace_is_restricted() && !dtrace_can_attach_to_proc(current_proc()))
1634 goto bad;
1635
1636 if (state->dts_cred.dcr_action & DTRACE_CRA_PROC_CONTROL)
1637 return (1);
1638
1639 if (dtrace_priv_proc_common_zone(state) &&
1640 dtrace_priv_proc_common_user(state) &&
1641 dtrace_priv_proc_common_nocd())
1642 return (1);
1643
1644bad:
1645 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1646
1647 return (0);
1648}
1649
1650static int
1651dtrace_priv_proc(dtrace_state_t *state)
1652{
1653 if (ISSET(current_proc()->p_lflag, P_LNOATTACH))
1654 goto bad;
1655
1656 if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed() && !dtrace_can_attach_to_proc(current_proc()))
1657 goto bad;
1658
1659 if (state->dts_cred.dcr_action & DTRACE_CRA_PROC)
1660 return (1);
1661
1662bad:
1663 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1664
1665 return (0);
1666}
1667
1668/*
1669 * The P_LNOATTACH check is an Apple specific check.
1670 * We need a version of dtrace_priv_proc() that omits
1671 * that check for PID and EXECNAME accesses
1672 */
1673static int
1674dtrace_priv_proc_relaxed(dtrace_state_t *state)
1675{
1676
1677 if (state->dts_cred.dcr_action & DTRACE_CRA_PROC)
1678 return (1);
1679
1680 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1681
1682 return (0);
1683}
1684
1685static int
1686dtrace_priv_kernel(dtrace_state_t *state)
1687{
1688 if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed())
1689 goto bad;
1690
1691 if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL)
1692 return (1);
1693
1694bad:
1695 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1696
1697 return (0);
1698}
1699
1700static int
1701dtrace_priv_kernel_destructive(dtrace_state_t *state)
1702{
1703 if (dtrace_is_restricted())
1704 goto bad;
1705
1706 if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL_DESTRUCTIVE)
1707 return (1);
1708
1709bad:
1710 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1711
1712 return (0);
1713}

/*
 * Note: not called from probe context. This function is called
 * asynchronously (and at a regular interval) from outside of probe context to
 * clean the dirty dynamic variable lists on all CPUs. Dynamic variable
 * cleaning is explained in detail in <sys/dtrace_impl.h>.
 */
static void
dtrace_dynvar_clean(dtrace_dstate_t *dstate)
{
	dtrace_dynvar_t *dirty;
	dtrace_dstate_percpu_t *dcpu;
	int i, work = 0;

	for (i = 0; i < (int)NCPU; i++) {
		dcpu = &dstate->dtds_percpu[i];

		ASSERT(dcpu->dtdsc_rinsing == NULL);

		/*
		 * If the dirty list is NULL, there is no dirty work to do.
		 */
		if (dcpu->dtdsc_dirty == NULL)
			continue;

		/*
		 * If the clean list is non-NULL, then we're not going to do
		 * any work for this CPU -- it means that there has not been
		 * a dtrace_dynvar() allocation on this CPU (or from this CPU)
		 * since the last time we cleaned house.
		 */
		if (dcpu->dtdsc_clean != NULL)
			continue;

		work = 1;

		/*
		 * Atomically move the dirty list aside.
		 */
		do {
			dirty = dcpu->dtdsc_dirty;

			/*
			 * Before we zap the dirty list, set the rinsing list.
			 * (This allows for a potential assertion in
			 * dtrace_dynvar(): if a free dynamic variable appears
			 * on a hash chain, either the dirty list or the
			 * rinsing list for some CPU must be non-NULL.)
			 */
			dcpu->dtdsc_rinsing = dirty;
			dtrace_membar_producer();
		} while (dtrace_casptr(&dcpu->dtdsc_dirty,
		    dirty, NULL) != dirty);
	}

	if (!work) {
		/*
		 * We have no work to do; we can simply return.
		 */
		return;
	}

	dtrace_sync();

	for (i = 0; i < (int)NCPU; i++) {
		dcpu = &dstate->dtds_percpu[i];

		if (dcpu->dtdsc_rinsing == NULL)
			continue;

		/*
		 * We are now guaranteed that no hash chain contains a pointer
		 * into this dirty list; we can make it clean.
		 */
		ASSERT(dcpu->dtdsc_clean == NULL);
		dcpu->dtdsc_clean = dcpu->dtdsc_rinsing;
		dcpu->dtdsc_rinsing = NULL;
	}

	/*
	 * Before we actually set the state to be DTRACE_DSTATE_CLEAN, make
	 * sure that all CPUs have seen all of the dtdsc_clean pointers.
	 * This prevents a race whereby a CPU incorrectly decides that
	 * the state should be something other than DTRACE_DSTATE_CLEAN
	 * after dtrace_dynvar_clean() has completed.
	 */
	dtrace_sync();

	dstate->dtds_state = DTRACE_DSTATE_CLEAN;
}
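
/*
 * The detach in the loop above is an instance of a general lock-free
 * idiom: atomically swing a singly-linked list head aside while keeping
 * ownership of the old chain. A user-space sketch in C11 atomics
 * (illustrative only; the kernel uses dtrace_casptr() and explicit
 * memory barriers rather than <stdatomic.h>):
 */
#if 0
#include <stdatomic.h>
#include <stddef.h>

struct node { struct node *next; };

static struct node *
detach_all(struct node *_Atomic *headp)
{
	struct node *head;

	/* Retry until we swap the head to NULL without interference. */
	do {
		head = atomic_load(headp);
	} while (!atomic_compare_exchange_weak(headp, &head, NULL));

	return (head);	/* caller now owns the detached chain */
}
#endif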

/*
 * Depending on the value of the op parameter, this function looks up,
 * allocates or deallocates an arbitrarily-keyed dynamic variable. If an
 * allocation is requested, this function will return a pointer to a
 * dtrace_dynvar_t corresponding to the allocated variable -- or NULL if no
 * variable can be allocated. If NULL is returned, the appropriate counter
 * will be incremented.
 */
static dtrace_dynvar_t *
dtrace_dynvar(dtrace_dstate_t *dstate, uint_t nkeys,
    dtrace_key_t *key, size_t dsize, dtrace_dynvar_op_t op,
    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
{
	uint64_t hashval = DTRACE_DYNHASH_VALID;
	dtrace_dynhash_t *hash = dstate->dtds_hash;
	dtrace_dynvar_t *free, *new_free, *next, *dvar, *start, *prev = NULL;
	processorid_t me = CPU->cpu_id, cpu = me;
	dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[me];
	size_t bucket, ksize;
	size_t chunksize = dstate->dtds_chunksize;
	uintptr_t kdata, lock, nstate;
	uint_t i;

	ASSERT(nkeys != 0);

	/*
	 * Hash the key. As with aggregations, we use Jenkins' "One-at-a-time"
	 * algorithm. For the by-value portions, we perform the algorithm in
	 * 16-bit chunks (as opposed to 8-bit chunks). This speeds things up a
	 * bit, and seems to have only a minute effect on distribution. For
	 * the by-reference data, we perform "One-at-a-time" iterating (safely)
	 * over each referenced byte. It's painful to do this, but it's much
	 * better than pathological hash distribution. The efficacy of the
	 * hashing algorithm (and a comparison with other algorithms) may be
	 * found by running the ::dtrace_dynstat MDB dcmd.
	 */
	for (i = 0; i < nkeys; i++) {
		if (key[i].dttk_size == 0) {
			uint64_t val = key[i].dttk_value;

			hashval += (val >> 48) & 0xffff;
			hashval += (hashval << 10);
			hashval ^= (hashval >> 6);

			hashval += (val >> 32) & 0xffff;
			hashval += (hashval << 10);
			hashval ^= (hashval >> 6);

			hashval += (val >> 16) & 0xffff;
			hashval += (hashval << 10);
			hashval ^= (hashval >> 6);

			hashval += val & 0xffff;
			hashval += (hashval << 10);
			hashval ^= (hashval >> 6);
		} else {
			/*
			 * This is incredibly painful, but it beats the hell
			 * out of the alternative.
			 */
			uint64_t j, size = key[i].dttk_size;
			uintptr_t base = (uintptr_t)key[i].dttk_value;

			if (!dtrace_canload(base, size, mstate, vstate))
				break;

			for (j = 0; j < size; j++) {
				hashval += dtrace_load8(base + j);
				hashval += (hashval << 10);
				hashval ^= (hashval >> 6);
			}
		}
	}

	if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT))
		return (NULL);

	hashval += (hashval << 3);
	hashval ^= (hashval >> 11);
	hashval += (hashval << 15);

	/*
	 * There is a remote chance (ideally, 1 in 2^31) that our hashval
	 * comes out to be one of our two sentinel hash values. If this
	 * actually happens, we set the hashval to be a value known to be a
	 * non-sentinel value.
	 */
	if (hashval == DTRACE_DYNHASH_FREE || hashval == DTRACE_DYNHASH_SINK)
		hashval = DTRACE_DYNHASH_VALID;

	/*
	 * Yes, it's painful to do a divide here. If the cycle count becomes
	 * important here, tricks can be pulled to reduce it. (However, it's
	 * critical that hash collisions be kept to an absolute minimum;
	 * they're much more painful than a divide.) It's better to have a
	 * solution that generates few collisions and still keeps things
	 * relatively simple.
	 */
	bucket = hashval % dstate->dtds_hashsize;

	if (op == DTRACE_DYNVAR_DEALLOC) {
		volatile uintptr_t *lockp = &hash[bucket].dtdh_lock;

		for (;;) {
			while ((lock = *lockp) & 1)
				continue;

			if (dtrace_casptr((void *)(uintptr_t)lockp,
			    (void *)lock, (void *)(lock + 1)) == (void *)lock)
				break;
		}

		dtrace_membar_producer();
	}

top:
	prev = NULL;
	lock = hash[bucket].dtdh_lock;

	dtrace_membar_consumer();

	start = hash[bucket].dtdh_chain;
	ASSERT(start != NULL && (start->dtdv_hashval == DTRACE_DYNHASH_SINK ||
	    start->dtdv_hashval != DTRACE_DYNHASH_FREE ||
	    op != DTRACE_DYNVAR_DEALLOC));

	for (dvar = start; dvar != NULL; dvar = dvar->dtdv_next) {
		dtrace_tuple_t *dtuple = &dvar->dtdv_tuple;
		dtrace_key_t *dkey = &dtuple->dtt_key[0];

		if (dvar->dtdv_hashval != hashval) {
			if (dvar->dtdv_hashval == DTRACE_DYNHASH_SINK) {
				/*
				 * We've reached the sink, and therefore the
				 * end of the hash chain; we can kick out of
				 * the loop knowing that we have seen a valid
				 * snapshot of state.
				 */
				ASSERT(dvar->dtdv_next == NULL);
				ASSERT(dvar == &dtrace_dynhash_sink);
				break;
			}

			if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE) {
				/*
				 * We've gone off the rails: somewhere along
				 * the line, one of the members of this hash
				 * chain was deleted. Note that we could also
				 * detect this by simply letting this loop run
				 * to completion, as we would eventually hit
				 * the end of the dirty list. However, we
				 * want to avoid running the length of the
				 * dirty list unnecessarily (it might be quite
				 * long), so we catch this as early as
				 * possible by detecting the hash marker. In
				 * this case, we simply set dvar to NULL and
				 * break; the conditional after the loop will
				 * send us back to top.
				 */
				dvar = NULL;
				break;
			}

			goto next;
		}

		if (dtuple->dtt_nkeys != nkeys)
			goto next;

		for (i = 0; i < nkeys; i++, dkey++) {
			if (dkey->dttk_size != key[i].dttk_size)
				goto next; /* size or type mismatch */

			if (dkey->dttk_size != 0) {
				if (dtrace_bcmp(
				    (void *)(uintptr_t)key[i].dttk_value,
				    (void *)(uintptr_t)dkey->dttk_value,
				    dkey->dttk_size))
					goto next;
			} else {
				if (dkey->dttk_value != key[i].dttk_value)
					goto next;
			}
		}

		if (op != DTRACE_DYNVAR_DEALLOC)
			return (dvar);

		ASSERT(dvar->dtdv_next == NULL ||
		    dvar->dtdv_next->dtdv_hashval != DTRACE_DYNHASH_FREE);

		if (prev != NULL) {
			ASSERT(hash[bucket].dtdh_chain != dvar);
			ASSERT(start != dvar);
			ASSERT(prev->dtdv_next == dvar);
			prev->dtdv_next = dvar->dtdv_next;
		} else {
			if (dtrace_casptr(&hash[bucket].dtdh_chain,
			    start, dvar->dtdv_next) != start) {
				/*
				 * We have failed to atomically swing the
				 * hash table head pointer, presumably because
				 * of a conflicting allocation on another CPU.
				 * We need to reread the hash chain and try
				 * again.
				 */
				goto top;
			}
		}

		dtrace_membar_producer();

		/*
		 * Now set the hash value to indicate that it's free.
		 */
		ASSERT(hash[bucket].dtdh_chain != dvar);
		dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;

		dtrace_membar_producer();

		/*
		 * Set the next pointer to point at the dirty list, and
		 * atomically swing the dirty pointer to the newly freed dvar.
		 */
		do {
			next = dcpu->dtdsc_dirty;
			dvar->dtdv_next = next;
		} while (dtrace_casptr(&dcpu->dtdsc_dirty, next, dvar) != next);

		/*
		 * Finally, unlock this hash bucket.
		 */
		ASSERT(hash[bucket].dtdh_lock == lock);
		ASSERT(lock & 1);
		hash[bucket].dtdh_lock++;

		return (NULL);
next:
		prev = dvar;
		continue;
	}

	if (dvar == NULL) {
		/*
		 * If dvar is NULL, it is because we went off the rails:
		 * one of the elements that we traversed in the hash chain
		 * was deleted while we were traversing it. In this case,
		 * we assert that we aren't doing a dealloc (deallocs lock
		 * the hash bucket to prevent themselves from racing with
		 * one another), and retry the hash chain traversal.
		 */
		ASSERT(op != DTRACE_DYNVAR_DEALLOC);
		goto top;
	}

	if (op != DTRACE_DYNVAR_ALLOC) {
		/*
		 * If we are not to allocate a new variable, we want to
		 * return NULL now. Before we return, check that the value
		 * of the lock word hasn't changed. If it has, we may have
		 * seen an inconsistent snapshot.
		 */
		if (op == DTRACE_DYNVAR_NOALLOC) {
			if (hash[bucket].dtdh_lock != lock)
				goto top;
		} else {
			ASSERT(op == DTRACE_DYNVAR_DEALLOC);
			ASSERT(hash[bucket].dtdh_lock == lock);
			ASSERT(lock & 1);
			hash[bucket].dtdh_lock++;
		}

		return (NULL);
	}

	/*
	 * We need to allocate a new dynamic variable. The size we need is the
	 * size of dtrace_dynvar plus the size of nkeys dtrace_key_t's plus the
	 * size of any auxiliary key data (rounded up to 8-byte alignment) plus
	 * the size of any referred-to data (dsize). We then round the final
	 * size up to the chunksize for allocation.
	 */
	for (ksize = 0, i = 0; i < nkeys; i++)
		ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));

	/*
	 * This should be pretty much impossible, but could happen if, say,
	 * strange DIF specified the tuple. Ideally, this should be an
	 * assertion and not an error condition -- but that requires that the
	 * chunksize calculation in dtrace_difo_chunksize() be absolutely
	 * bullet-proof. (That is, it must not be able to be fooled by
	 * malicious DIF.) Given the lack of backwards branches in DIF,
	 * solving this would presumably not amount to solving the Halting
	 * Problem -- but it still seems awfully hard.
	 */
	if (sizeof (dtrace_dynvar_t) + sizeof (dtrace_key_t) * (nkeys - 1) +
	    ksize + dsize > chunksize) {
		dcpu->dtdsc_drops++;
		return (NULL);
	}

	nstate = DTRACE_DSTATE_EMPTY;

	do {
retry:
		free = dcpu->dtdsc_free;

		if (free == NULL) {
			dtrace_dynvar_t *clean = dcpu->dtdsc_clean;
			void *rval;

			if (clean == NULL) {
				/*
				 * We're out of dynamic variable space on
				 * this CPU. Unless we have tried all CPUs,
				 * we'll try to allocate from a different
				 * CPU.
				 */
				switch (dstate->dtds_state) {
				case DTRACE_DSTATE_CLEAN: {
					void *sp = &dstate->dtds_state;

					if (++cpu >= (int)NCPU)
						cpu = 0;

					if (dcpu->dtdsc_dirty != NULL &&
					    nstate == DTRACE_DSTATE_EMPTY)
						nstate = DTRACE_DSTATE_DIRTY;

					if (dcpu->dtdsc_rinsing != NULL)
						nstate = DTRACE_DSTATE_RINSING;

					dcpu = &dstate->dtds_percpu[cpu];

					if (cpu != me)
						goto retry;

					(void) dtrace_cas32(sp,
					    DTRACE_DSTATE_CLEAN, nstate);

					/*
					 * To increment the correct bean
					 * counter, take another lap.
					 */
					goto retry;
				}

				case DTRACE_DSTATE_DIRTY:
					dcpu->dtdsc_dirty_drops++;
					break;

				case DTRACE_DSTATE_RINSING:
					dcpu->dtdsc_rinsing_drops++;
					break;

				case DTRACE_DSTATE_EMPTY:
					dcpu->dtdsc_drops++;
					break;
				}

				DTRACE_CPUFLAG_SET(CPU_DTRACE_DROP);
				return (NULL);
			}

			/*
			 * The clean list appears to be non-empty. We want to
			 * move the clean list to the free list; we start by
			 * moving the clean pointer aside.
			 */
			if (dtrace_casptr(&dcpu->dtdsc_clean,
			    clean, NULL) != clean) {
				/*
				 * We are in one of two situations:
				 *
				 * (a)	The clean list was switched to the
				 *	free list by another CPU.
				 *
				 * (b)	The clean list was added to by the
				 *	cleansing cyclic.
				 *
				 * In either of these situations, we can
				 * just reattempt the free list allocation.
				 */
				goto retry;
			}

			ASSERT(clean->dtdv_hashval == DTRACE_DYNHASH_FREE);

			/*
			 * Now we'll move the clean list to the free list.
			 * It's impossible for this to fail: the only way
			 * the free list can be updated is through this
			 * code path, and only one CPU can own the clean list.
			 * Thus, it would only be possible for this to fail if
			 * this code were racing with dtrace_dynvar_clean().
			 * (That is, if dtrace_dynvar_clean() updated the clean
			 * list, and we ended up racing to update the free
			 * list.) This race is prevented by the dtrace_sync()
			 * in dtrace_dynvar_clean() -- which flushes the
			 * owners of the clean lists out before resetting
			 * the clean lists.
			 */
			rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean);
			ASSERT(rval == NULL);
			goto retry;
		}

		dvar = free;
		new_free = dvar->dtdv_next;
	} while (dtrace_casptr(&dcpu->dtdsc_free, free, new_free) != free);

	/*
	 * We have now allocated a new chunk. We copy the tuple keys into the
	 * tuple array and copy any referenced key data into the data space
	 * following the tuple array. As we do this, we relocate dttk_value
	 * in the final tuple to point to the key data address in the chunk.
	 */
	kdata = (uintptr_t)&dvar->dtdv_tuple.dtt_key[nkeys];
	dvar->dtdv_data = (void *)(kdata + ksize);
	dvar->dtdv_tuple.dtt_nkeys = nkeys;

	for (i = 0; i < nkeys; i++) {
		dtrace_key_t *dkey = &dvar->dtdv_tuple.dtt_key[i];
		size_t kesize = key[i].dttk_size;

		if (kesize != 0) {
			dtrace_bcopy(
			    (const void *)(uintptr_t)key[i].dttk_value,
			    (void *)kdata, kesize);
			dkey->dttk_value = kdata;
			kdata += P2ROUNDUP(kesize, sizeof (uint64_t));
		} else {
			dkey->dttk_value = key[i].dttk_value;
		}

		dkey->dttk_size = kesize;
	}

	ASSERT(dvar->dtdv_hashval == DTRACE_DYNHASH_FREE);
	dvar->dtdv_hashval = hashval;
	dvar->dtdv_next = start;

	if (dtrace_casptr(&hash[bucket].dtdh_chain, start, dvar) == start)
		return (dvar);

	/*
	 * The cas has failed. Either another CPU is adding an element to
	 * this hash chain, or another CPU is deleting an element from this
	 * hash chain. The simplest way to deal with both of these cases
	 * (though not necessarily the most efficient) is to free our
	 * allocated block and tail-call ourselves. Note that the free is
	 * to the dirty list and _not_ to the free list. This is to prevent
	 * races with allocators, above.
	 */
	dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;

	dtrace_membar_producer();

	do {
		free = dcpu->dtdsc_dirty;
		dvar->dtdv_next = free;
	} while (dtrace_casptr(&dcpu->dtdsc_dirty, free, dvar) != free);

	return (dtrace_dynvar(dstate, nkeys, key, dsize, op, mstate, vstate));
}
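
/*
 * The "One-at-a-time" hash used above (and again by dtrace_aggregate(),
 * below) is easy to study in isolation. A self-contained, user-space
 * rendering of the byte-wise variant together with the final avalanche
 * (illustrative only, not kernel code):
 */
#if 0
#include <stddef.h>
#include <stdint.h>

static uint64_t
jenkins_one_at_a_time(const uint8_t *data, size_t len)
{
	uint64_t hashval = 0;
	size_t i;

	/* Mix in one byte at a time. */
	for (i = 0; i < len; i++) {
		hashval += data[i];
		hashval += (hashval << 10);
		hashval ^= (hashval >> 6);
	}

	/* Final avalanche, as performed after the key loop above. */
	hashval += (hashval << 3);
	hashval ^= (hashval >> 11);
	hashval += (hashval << 15);

	return (hashval);
}
#endif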

/*ARGSUSED*/
static void
dtrace_aggregate_min(uint64_t *oval, uint64_t nval, uint64_t arg)
{
#pragma unused(arg) /* __APPLE__ */
	if ((int64_t)nval < (int64_t)*oval)
		*oval = nval;
}

/*ARGSUSED*/
static void
dtrace_aggregate_max(uint64_t *oval, uint64_t nval, uint64_t arg)
{
#pragma unused(arg) /* __APPLE__ */
	if ((int64_t)nval > (int64_t)*oval)
		*oval = nval;
}

static void
dtrace_aggregate_quantize(uint64_t *quanta, uint64_t nval, uint64_t incr)
{
	int i, zero = DTRACE_QUANTIZE_ZEROBUCKET;
	int64_t val = (int64_t)nval;

	if (val < 0) {
		for (i = 0; i < zero; i++) {
			if (val <= DTRACE_QUANTIZE_BUCKETVAL(i)) {
				quanta[i] += incr;
				return;
			}
		}
	} else {
		for (i = zero + 1; i < DTRACE_QUANTIZE_NBUCKETS; i++) {
			if (val < DTRACE_QUANTIZE_BUCKETVAL(i)) {
				quanta[i - 1] += incr;
				return;
			}
		}

		quanta[DTRACE_QUANTIZE_NBUCKETS - 1] += incr;
		return;
	}

	ASSERT(0);
}
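
/*
 * Worked example (bucket values assumed from <sys/dtrace.h>): the
 * positive bucket labels above are successive powers of two, so a value
 * of 37 is first exceeded by the label 64 and is therefore credited to
 * the preceding bucket -- the one labelled 32, covering [32, 64).
 * Negative values walk the mirror-image buckets below the zero bucket
 * in the same fashion.
 */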

static void
dtrace_aggregate_lquantize(uint64_t *lquanta, uint64_t nval, uint64_t incr)
{
	uint64_t arg = *lquanta++;
	int32_t base = DTRACE_LQUANTIZE_BASE(arg);
	uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
	uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
	int32_t val = (int32_t)nval, level;

	ASSERT(step != 0);
	ASSERT(levels != 0);

	if (val < base) {
		/*
		 * This is an underflow.
		 */
		lquanta[0] += incr;
		return;
	}

	level = (val - base) / step;

	if (level < levels) {
		lquanta[level + 1] += incr;
		return;
	}

	/*
	 * This is an overflow.
	 */
	lquanta[levels + 1] += incr;
}
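
/*
 * Worked example (parameters assumed for illustration): for
 * lquantize(x, 0, 100, 10) -- base 0, step 10, ten levels -- a value
 * of 37 yields level (37 - 0) / 10 = 3 and so increments lquanta[4],
 * the bucket covering [30, 40). Values below 0 land in the underflow
 * bucket lquanta[0], and values of 100 or more in the overflow bucket
 * lquanta[levels + 1].
 */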

static int
dtrace_aggregate_llquantize_bucket(int16_t factor, int16_t low, int16_t high,
    int16_t nsteps, int64_t value)
{
	int64_t this = 1, last, next;
	int base = 1, order;

	for (order = 0; order < low; ++order)
		this *= factor;

	/*
	 * If our value is less than our factor taken to the power of the
	 * low order of magnitude, it goes into the zeroth bucket.
	 */
	if (value < this)
		return 0;
	else
		last = this;

	for (this *= factor; order <= high; ++order) {
		int nbuckets = this > nsteps ? nsteps : this;

		/*
		 * We should not generally get log/linear quantizations
		 * with a high magnitude that allows 64-bits to
		 * overflow, but we nonetheless protect against this
		 * by explicitly checking for overflow, and clamping
		 * our value accordingly.
		 */
		next = this * factor;
		if (next < this) {
			value = this - 1;
		}

		/*
		 * If our value lies within this order of magnitude,
		 * determine its position by taking the offset within
		 * the order of magnitude, dividing by the bucket
		 * width, and adding to our (accumulated) base.
		 */
		if (value < this) {
			return (base + (value - last) / (this / nbuckets));
		}

		base += nbuckets - (nbuckets / factor);
		last = this;
		this = next;
	}

	/*
	 * Our value is greater than or equal to our factor taken to the
	 * power of one plus the high magnitude -- return the top bucket.
	 */
	return base;
}
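
/*
 * Worked example (parameters assumed for illustration): with factor 10,
 * low magnitude 0, high magnitude 2 and 10 steps per order, bucket 0
 * holds values below 1; buckets 1-9 cover [1, 10) in steps of 1;
 * buckets 10-18 cover [10, 100) in steps of 10; buckets 19-27 cover
 * [100, 1000) in steps of 100; and everything at or above 1000 (the
 * factor raised to high + 1) lands in the top bucket. A value of 37
 * thus returns base 10 plus (37 - 10) / 10, i.e. bucket 12, which
 * covers [30, 40).
 */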

static void
dtrace_aggregate_llquantize(uint64_t *llquanta, uint64_t nval, uint64_t incr)
{
	uint64_t arg = *llquanta++;
	uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(arg);
	uint16_t low = DTRACE_LLQUANTIZE_LOW(arg);
	uint16_t high = DTRACE_LLQUANTIZE_HIGH(arg);
	uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(arg);

	llquanta[dtrace_aggregate_llquantize_bucket(factor, low, high, nsteps, nval)] += incr;
}

/*ARGSUSED*/
static void
dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg)
{
#pragma unused(arg) /* __APPLE__ */
	data[0]++;
	data[1] += nval;
}

/*ARGSUSED*/
static void
dtrace_aggregate_stddev(uint64_t *data, uint64_t nval, uint64_t arg)
{
#pragma unused(arg) /* __APPLE__ */
	int64_t snval = (int64_t)nval;
	uint64_t tmp[2];

	data[0]++;
	data[1] += nval;

	/*
	 * What we want to say here is:
	 *
	 * data[2] += nval * nval;
	 *
	 * But given that nval is 64-bit, we could easily overflow, so
	 * we do this as 128-bit arithmetic.
	 */
	if (snval < 0)
		snval = -snval;

	dtrace_multiply_128((uint64_t)snval, (uint64_t)snval, tmp);
	dtrace_add_128(data + 2, tmp, data + 2);
}
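
/*
 * On compilers that provide a 128-bit integer type, the overflow-safe
 * square-and-accumulate above can be written directly. A sketch under
 * that assumption (the kernel instead uses dtrace_multiply_128() and
 * dtrace_add_128() so as not to depend on such a type; the low-word-first
 * layout of data[2] and data[3] is assumed here for illustration):
 */
#if 0
static void
stddev_accumulate(uint64_t *data, uint64_t nval)
{
	int64_t snval = (int64_t)nval;
	unsigned __int128 sq;

	data[0]++;		/* count of samples */
	data[1] += nval;	/* running sum */

	if (snval < 0)
		snval = -snval;

	/* 128-bit sum of squares, assumed to be stored low word first. */
	sq = (unsigned __int128)snval * (unsigned __int128)snval;
	sq += ((unsigned __int128)data[3] << 64) | data[2];
	data[2] = (uint64_t)sq;
	data[3] = (uint64_t)(sq >> 64);
}
#endif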

/*ARGSUSED*/
static void
dtrace_aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg)
{
#pragma unused(nval, arg) /* __APPLE__ */
	*oval = *oval + 1;
}

/*ARGSUSED*/
static void
dtrace_aggregate_sum(uint64_t *oval, uint64_t nval, uint64_t arg)
{
#pragma unused(arg) /* __APPLE__ */
	*oval += nval;
}

/*
 * Aggregate given the tuple in the principal data buffer, and the aggregating
 * action denoted by the specified dtrace_aggregation_t. The aggregation
 * buffer is specified as the buf parameter. This routine does not return
 * failure; if there is no space in the aggregation buffer, the data will be
 * dropped, and a corresponding counter incremented.
 */
static void
dtrace_aggregate(dtrace_aggregation_t *agg, dtrace_buffer_t *dbuf,
    intptr_t offset, dtrace_buffer_t *buf, uint64_t expr, uint64_t arg)
{
#pragma unused(arg)
	dtrace_recdesc_t *rec = &agg->dtag_action.dta_rec;
	uint32_t i, ndx, size, fsize;
	uint32_t align = sizeof (uint64_t) - 1;
	dtrace_aggbuffer_t *agb;
	dtrace_aggkey_t *key;
	uint32_t hashval = 0, limit, isstr;
	caddr_t tomax, data, kdata;
	dtrace_actkind_t action;
	dtrace_action_t *act;
	uintptr_t offs;

	if (buf == NULL)
		return;

	if (!agg->dtag_hasarg) {
		/*
		 * Currently, only quantize() and lquantize() take additional
		 * arguments, and they have the same semantics: an increment
		 * value that defaults to 1 when not present. If additional
		 * aggregating actions take arguments, the setting of the
		 * default argument value will presumably have to become more
		 * sophisticated...
		 */
		arg = 1;
	}

	action = agg->dtag_action.dta_kind - DTRACEACT_AGGREGATION;
	size = rec->dtrd_offset - agg->dtag_base;
	fsize = size + rec->dtrd_size;

	ASSERT(dbuf->dtb_tomax != NULL);
	data = dbuf->dtb_tomax + offset + agg->dtag_base;

	if ((tomax = buf->dtb_tomax) == NULL) {
		dtrace_buffer_drop(buf);
		return;
	}

	/*
	 * The metastructure is always at the bottom of the buffer.
	 */
	agb = (dtrace_aggbuffer_t *)(tomax + buf->dtb_size -
	    sizeof (dtrace_aggbuffer_t));

	if (buf->dtb_offset == 0) {
		/*
		 * We just kludge up approximately 1/8th of the size to be
		 * buckets. If this guess ends up being routinely
		 * off-the-mark, we may need to dynamically readjust this
		 * based on past performance.
		 */
		uintptr_t hashsize = (buf->dtb_size >> 3) / sizeof (uintptr_t);

		if ((uintptr_t)agb - hashsize * sizeof (dtrace_aggkey_t *) <
		    (uintptr_t)tomax || hashsize == 0) {
			/*
			 * We've been given a ludicrously small buffer;
			 * increment our drop count and leave.
			 */
			dtrace_buffer_drop(buf);
			return;
		}

		/*
		 * And now, a pathetic attempt to try to get an odd (or
		 * perchance, a prime) hash size for better hash distribution.
		 */
		if (hashsize > (DTRACE_AGGHASHSIZE_SLEW << 3))
			hashsize -= DTRACE_AGGHASHSIZE_SLEW;

		agb->dtagb_hashsize = hashsize;
		agb->dtagb_hash = (dtrace_aggkey_t **)((uintptr_t)agb -
		    agb->dtagb_hashsize * sizeof (dtrace_aggkey_t *));
		agb->dtagb_free = (uintptr_t)agb->dtagb_hash;

		for (i = 0; i < agb->dtagb_hashsize; i++)
			agb->dtagb_hash[i] = NULL;
	}

	ASSERT(agg->dtag_first != NULL);
	ASSERT(agg->dtag_first->dta_intuple);

	/*
	 * Calculate the hash value based on the key. Note that we _don't_
	 * include the aggid in the hashing (but we will store it as part of
	 * the key). The hashing algorithm is Bob Jenkins' "One-at-a-time"
	 * algorithm: a simple, quick algorithm that has no known funnels, and
	 * gets good distribution in practice. The efficacy of the hashing
	 * algorithm (and a comparison with other algorithms) may be found by
	 * running the ::dtrace_aggstat MDB dcmd.
	 */
	for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
		i = act->dta_rec.dtrd_offset - agg->dtag_base;
		limit = i + act->dta_rec.dtrd_size;
		ASSERT(limit <= size);
		isstr = DTRACEACT_ISSTRING(act);

		for (; i < limit; i++) {
			hashval += data[i];
			hashval += (hashval << 10);
			hashval ^= (hashval >> 6);

			if (isstr && data[i] == '\0')
				break;
		}
	}

	hashval += (hashval << 3);
	hashval ^= (hashval >> 11);
	hashval += (hashval << 15);

	/*
	 * Yes, the divide here is expensive -- but it's generally the least
	 * of the performance issues given the amount of data that we iterate
	 * over to compute hash values, compare data, etc.
	 */
	ndx = hashval % agb->dtagb_hashsize;

	for (key = agb->dtagb_hash[ndx]; key != NULL; key = key->dtak_next) {
		ASSERT((caddr_t)key >= tomax);
		ASSERT((caddr_t)key < tomax + buf->dtb_size);

		if (hashval != key->dtak_hashval || key->dtak_size != size)
			continue;

		kdata = key->dtak_data;
		ASSERT(kdata >= tomax && kdata < tomax + buf->dtb_size);

		for (act = agg->dtag_first; act->dta_intuple;
		    act = act->dta_next) {
			i = act->dta_rec.dtrd_offset - agg->dtag_base;
			limit = i + act->dta_rec.dtrd_size;
			ASSERT(limit <= size);
			isstr = DTRACEACT_ISSTRING(act);

			for (; i < limit; i++) {
				if (kdata[i] != data[i])
					goto next;

				if (isstr && data[i] == '\0')
					break;
			}
		}

		if (action != key->dtak_action) {
			/*
			 * We are aggregating on the same value in the same
			 * aggregation with two different aggregating actions.
			 * (This should have been picked up in the compiler,
			 * so we may be dealing with errant or devious DIF.)
			 * This is an error condition; we indicate as much,
			 * and return.
			 */
			DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
			return;
		}

		/*
		 * This is a hit: we need to apply the aggregator to
		 * the value at this key.
		 */
		agg->dtag_aggregate((uint64_t *)(kdata + size), expr, arg);
		return;
next:
		continue;
	}

	/*
	 * We didn't find it. We need to allocate some zero-filled space,
	 * link it into the hash table appropriately, and apply the aggregator
	 * to the (zero-filled) value.
	 */
	offs = buf->dtb_offset;
	while (offs & (align - 1))
		offs += sizeof (uint32_t);

	/*
	 * If we don't have enough room to both allocate a new key _and_
	 * its associated data, increment the drop count and return.
	 */
	if ((uintptr_t)tomax + offs + fsize >
	    agb->dtagb_free - sizeof (dtrace_aggkey_t)) {
		dtrace_buffer_drop(buf);
		return;
	}

	/*CONSTCOND*/
	ASSERT(!(sizeof (dtrace_aggkey_t) & (sizeof (uintptr_t) - 1)));
	key = (dtrace_aggkey_t *)(agb->dtagb_free - sizeof (dtrace_aggkey_t));
	agb->dtagb_free -= sizeof (dtrace_aggkey_t);

	key->dtak_data = kdata = tomax + offs;
	buf->dtb_offset = offs + fsize;

	/*
	 * Now copy the data across.
	 */
	*((dtrace_aggid_t *)kdata) = agg->dtag_id;

	for (i = sizeof (dtrace_aggid_t); i < size; i++)
		kdata[i] = data[i];

	/*
	 * Because strings are not zeroed out by default, we need to iterate
	 * looking for actions that store strings, and we need to explicitly
	 * pad these strings out with zeroes.
	 */
	for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
		int nul;

		if (!DTRACEACT_ISSTRING(act))
			continue;

		i = act->dta_rec.dtrd_offset - agg->dtag_base;
		limit = i + act->dta_rec.dtrd_size;
		ASSERT(limit <= size);

		for (nul = 0; i < limit; i++) {
			if (nul) {
				kdata[i] = '\0';
				continue;
			}

			if (data[i] != '\0')
				continue;

			nul = 1;
		}
	}

	for (i = size; i < fsize; i++)
		kdata[i] = 0;

	key->dtak_hashval = hashval;
	key->dtak_size = size;
	key->dtak_action = action;
	key->dtak_next = agb->dtagb_hash[ndx];
	agb->dtagb_hash[ndx] = key;

	/*
	 * Finally, apply the aggregator.
	 */
	*((uint64_t *)(key->dtak_data + size)) = agg->dtag_initial;
	agg->dtag_aggregate((uint64_t *)(key->dtak_data + size), expr, arg);
}
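
/*
 * To summarize the buffer discipline above: key/data records grow up
 * from the bottom of the aggregation buffer, while the metastructure,
 * the hash bucket array and the dtrace_aggkey_t entries are carved
 * downward from the top; dtagb_free marks the descending frontier, and
 * an allocation that would cross it is accounted as a drop instead.
 * Schematically:
 *
 *	+----------------------------+  <- dtb_tomax + dtb_size
 *	| dtrace_aggbuffer_t         |
 *	| hash buckets               |
 *	| dtrace_aggkey_t's (down)   |
 *	+----------------------------+  <- dtagb_free
 *	|         (unused)           |
 *	+----------------------------+  <- dtb_tomax + dtb_offset
 *	| key data + values (up)     |
 *	+----------------------------+  <- dtb_tomax
 */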

/*
 * Given consumer state, this routine finds a speculation in the INACTIVE
 * state and transitions it into the ACTIVE state. If there is no speculation
 * in the INACTIVE state, 0 is returned. In this case, no error counter is
 * incremented -- it is up to the caller to take appropriate action.
 */
static int
dtrace_speculation(dtrace_state_t *state)
{
	int i = 0;
	dtrace_speculation_state_t current;
	uint32_t *stat = &state->dts_speculations_unavail, count;

	while (i < state->dts_nspeculations) {
		dtrace_speculation_t *spec = &state->dts_speculations[i];

		current = spec->dtsp_state;

		if (current != DTRACESPEC_INACTIVE) {
			if (current == DTRACESPEC_COMMITTINGMANY ||
			    current == DTRACESPEC_COMMITTING ||
			    current == DTRACESPEC_DISCARDING)
				stat = &state->dts_speculations_busy;
			i++;
			continue;
		}

		if (dtrace_cas32((uint32_t *)&spec->dtsp_state,
		    current, DTRACESPEC_ACTIVE) == current)
			return (i + 1);
	}

	/*
	 * We couldn't find a speculation. If we found as much as a single
	 * busy speculation buffer, we'll attribute this failure as "busy"
	 * instead of "unavail".
	 */
	do {
		count = *stat;
	} while (dtrace_cas32(stat, count, count + 1) != count);

	return (0);
}

/*
 * This routine commits an active speculation. If the specified speculation
 * is not in a valid state to perform a commit(), this routine will silently do
 * nothing. The state of the specified speculation is transitioned according
 * to the state transition diagram outlined in <sys/dtrace_impl.h>.
 */
static void
dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu,
    dtrace_specid_t which)
{
	dtrace_speculation_t *spec;
	dtrace_buffer_t *src, *dest;
	uintptr_t daddr, saddr, dlimit, slimit;
	dtrace_speculation_state_t current, new = DTRACESPEC_INACTIVE;
	intptr_t offs;
	uint64_t timestamp;

	if (which == 0)
		return;

	if (which > (dtrace_specid_t)state->dts_nspeculations) {
		cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
		return;
	}

	spec = &state->dts_speculations[which - 1];
	src = &spec->dtsp_buffer[cpu];
	dest = &state->dts_buffer[cpu];

	do {
		current = spec->dtsp_state;

		if (current == DTRACESPEC_COMMITTINGMANY)
			break;

		switch (current) {
		case DTRACESPEC_INACTIVE:
		case DTRACESPEC_DISCARDING:
			return;

		case DTRACESPEC_COMMITTING:
			/*
			 * This is only possible if we are (a) commit()'ing
			 * without having done a prior speculate() on this CPU
			 * and (b) racing with another commit() on a different
			 * CPU. There's nothing to do -- we just assert that
			 * our offset is 0.
			 */
			ASSERT(src->dtb_offset == 0);
			return;

		case DTRACESPEC_ACTIVE:
			new = DTRACESPEC_COMMITTING;
			break;

		case DTRACESPEC_ACTIVEONE:
			/*
			 * This speculation is active on one CPU. If our
			 * buffer offset is non-zero, we know that the one CPU
			 * must be us. Otherwise, we are committing on a
			 * different CPU from the speculate(), and we must
			 * rely on being asynchronously cleaned.
			 */
			if (src->dtb_offset != 0) {
				new = DTRACESPEC_COMMITTING;
				break;
			}
			/*FALLTHROUGH*/

		case DTRACESPEC_ACTIVEMANY:
			new = DTRACESPEC_COMMITTINGMANY;
			break;

		default:
			ASSERT(0);
		}
	} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
	    current, new) != current);

	/*
	 * We have set the state to indicate that we are committing this
	 * speculation. Now reserve the necessary space in the destination
	 * buffer.
	 */
	if ((offs = dtrace_buffer_reserve(dest, src->dtb_offset,
	    sizeof (uint64_t), state, NULL)) < 0) {
		dtrace_buffer_drop(dest);
		goto out;
	}

	/*
	 * We have sufficient space to copy the speculative buffer into the
	 * primary buffer. First, modify the speculative buffer, filling
	 * in the timestamp of all entries with the current time. The data
	 * must have the commit() time rather than the time it was traced,
	 * so that all entries in the primary buffer are in timestamp order.
	 */
	timestamp = dtrace_gethrtime();
	saddr = (uintptr_t)src->dtb_tomax;
	slimit = saddr + src->dtb_offset;
	while (saddr < slimit) {
		size_t size;
		dtrace_rechdr_t *dtrh = (dtrace_rechdr_t *)saddr;

		if (dtrh->dtrh_epid == DTRACE_EPIDNONE) {
			saddr += sizeof (dtrace_epid_t);
			continue;
		}

		ASSERT(dtrh->dtrh_epid <= ((dtrace_epid_t) state->dts_necbs));
		size = state->dts_ecbs[dtrh->dtrh_epid - 1]->dte_size;

		ASSERT(saddr + size <= slimit);
		ASSERT(size >= sizeof(dtrace_rechdr_t));
		ASSERT(DTRACE_RECORD_LOAD_TIMESTAMP(dtrh) == UINT64_MAX);

		DTRACE_RECORD_STORE_TIMESTAMP(dtrh, timestamp);

		saddr += size;
	}

	/*
	 * Copy the buffer across. (Note that this is a
	 * highly suboptimal bcopy(); in the unlikely event that this becomes
	 * a serious performance issue, a high-performance DTrace-specific
	 * bcopy() should obviously be invented.)
	 */
	daddr = (uintptr_t)dest->dtb_tomax + offs;
	dlimit = daddr + src->dtb_offset;
	saddr = (uintptr_t)src->dtb_tomax;

	/*
	 * First, the aligned portion.
	 */
	while (dlimit - daddr >= sizeof (uint64_t)) {
		*((uint64_t *)daddr) = *((uint64_t *)saddr);

		daddr += sizeof (uint64_t);
		saddr += sizeof (uint64_t);
	}

	/*
	 * Now any left-over bit...
	 */
	while (dlimit - daddr)
		*((uint8_t *)daddr++) = *((uint8_t *)saddr++);

	/*
	 * Finally, commit the reserved space in the destination buffer.
	 */
	dest->dtb_offset = offs + src->dtb_offset;

out:
	/*
	 * If we're lucky enough to be the only active CPU on this speculation
	 * buffer, we can just set the state back to DTRACESPEC_INACTIVE.
	 */
	if (current == DTRACESPEC_ACTIVE ||
	    (current == DTRACESPEC_ACTIVEONE && new == DTRACESPEC_COMMITTING)) {
		uint32_t rval = dtrace_cas32((uint32_t *)&spec->dtsp_state,
		    DTRACESPEC_COMMITTING, DTRACESPEC_INACTIVE);
#pragma unused(rval) /* __APPLE__ */

		ASSERT(rval == DTRACESPEC_COMMITTING);
	}

	src->dtb_offset = 0;
	src->dtb_xamot_drops += src->dtb_drops;
	src->dtb_drops = 0;
}

/*
 * This routine discards an active speculation. If the specified speculation
 * is not in a valid state to perform a discard(), this routine will silently
 * do nothing. The state of the specified speculation is transitioned
 * according to the state transition diagram outlined in <sys/dtrace_impl.h>.
 */
static void
dtrace_speculation_discard(dtrace_state_t *state, processorid_t cpu,
    dtrace_specid_t which)
{
	dtrace_speculation_t *spec;
	dtrace_speculation_state_t current, new = DTRACESPEC_INACTIVE;
	dtrace_buffer_t *buf;

	if (which == 0)
		return;

	if (which > (dtrace_specid_t)state->dts_nspeculations) {
		cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
		return;
	}

	spec = &state->dts_speculations[which - 1];
	buf = &spec->dtsp_buffer[cpu];

	do {
		current = spec->dtsp_state;

		switch (current) {
		case DTRACESPEC_INACTIVE:
		case DTRACESPEC_COMMITTINGMANY:
		case DTRACESPEC_COMMITTING:
		case DTRACESPEC_DISCARDING:
			return;

		case DTRACESPEC_ACTIVE:
		case DTRACESPEC_ACTIVEMANY:
			new = DTRACESPEC_DISCARDING;
			break;

		case DTRACESPEC_ACTIVEONE:
			if (buf->dtb_offset != 0) {
				new = DTRACESPEC_INACTIVE;
			} else {
				new = DTRACESPEC_DISCARDING;
			}
			break;

		default:
			ASSERT(0);
		}
	} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
	    current, new) != current);

	buf->dtb_offset = 0;
	buf->dtb_drops = 0;
}

/*
 * Note: not called from probe context. This function is called
 * asynchronously from cross call context to clean any speculations that are
 * in the COMMITTINGMANY or DISCARDING states. These speculations may not be
 * transitioned back to the INACTIVE state until all CPUs have cleaned the
 * speculation.
 */
static void
dtrace_speculation_clean_here(dtrace_state_t *state)
{
	dtrace_icookie_t cookie;
	processorid_t cpu = CPU->cpu_id;
	dtrace_buffer_t *dest = &state->dts_buffer[cpu];
	dtrace_specid_t i;

	cookie = dtrace_interrupt_disable();

	if (dest->dtb_tomax == NULL) {
		dtrace_interrupt_enable(cookie);
		return;
	}

	for (i = 0; i < (dtrace_specid_t)state->dts_nspeculations; i++) {
		dtrace_speculation_t *spec = &state->dts_speculations[i];
		dtrace_buffer_t *src = &spec->dtsp_buffer[cpu];

		if (src->dtb_tomax == NULL)
			continue;

		if (spec->dtsp_state == DTRACESPEC_DISCARDING) {
			src->dtb_offset = 0;
			continue;
		}

		if (spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
			continue;

		if (src->dtb_offset == 0)
			continue;

		dtrace_speculation_commit(state, cpu, i + 1);
	}

	dtrace_interrupt_enable(cookie);
}

/*
 * Note: not called from probe context. This function is called
 * asynchronously (and at a regular interval) to clean any speculations that
 * are in the COMMITTINGMANY or DISCARDING states. If it discovers that there
 * is work to be done, it cross calls all CPUs to perform that work;
 * COMMITTINGMANY and DISCARDING speculations may not be transitioned back to
 * the INACTIVE state until they have been cleaned by all CPUs.
 */
static void
dtrace_speculation_clean(dtrace_state_t *state)
{
	int work = 0;
	uint32_t rv;
	dtrace_specid_t i;

	for (i = 0; i < (dtrace_specid_t)state->dts_nspeculations; i++) {
		dtrace_speculation_t *spec = &state->dts_speculations[i];

		ASSERT(!spec->dtsp_cleaning);

		if (spec->dtsp_state != DTRACESPEC_DISCARDING &&
		    spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
			continue;

		work++;
		spec->dtsp_cleaning = 1;
	}

	if (!work)
		return;

	dtrace_xcall(DTRACE_CPUALL,
	    (dtrace_xcall_t)dtrace_speculation_clean_here, state);

	/*
	 * We now know that all CPUs have committed or discarded their
	 * speculation buffers, as appropriate. We can now set the state
	 * to inactive.
	 */
	for (i = 0; i < (dtrace_specid_t)state->dts_nspeculations; i++) {
		dtrace_speculation_t *spec = &state->dts_speculations[i];
		dtrace_speculation_state_t current, new;

		if (!spec->dtsp_cleaning)
			continue;

		current = spec->dtsp_state;
		ASSERT(current == DTRACESPEC_DISCARDING ||
		    current == DTRACESPEC_COMMITTINGMANY);

		new = DTRACESPEC_INACTIVE;

		rv = dtrace_cas32((uint32_t *)&spec->dtsp_state, current, new);
		ASSERT(rv == current);
		spec->dtsp_cleaning = 0;
	}
}
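
/*
 * For reference, the transitions effected by the routines above (the
 * authoritative diagram lives in <sys/dtrace_impl.h>):
 *
 *	dtrace_speculation()		INACTIVE -> ACTIVE
 *	dtrace_speculation_buffer()	ACTIVE -> ACTIVEONE -> ACTIVEMANY
 *	dtrace_speculation_commit()	ACTIVE/ACTIVEONE -> COMMITTING
 *					    (-> INACTIVE on the same CPU);
 *					ACTIVEONE/ACTIVEMANY -> COMMITTINGMANY
 *	dtrace_speculation_discard()	ACTIVE/ACTIVEONE/ACTIVEMANY ->
 *					DISCARDING (or directly INACTIVE)
 *	dtrace_speculation_clean()	COMMITTINGMANY/DISCARDING -> INACTIVE
 */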

/*
 * Called as part of a speculate() to get the speculative buffer associated
 * with a given speculation. Returns NULL if the specified speculation is not
 * in an ACTIVE state. If the speculation is in the ACTIVEONE state -- and
 * the active CPU is not the specified CPU -- the speculation will be
 * atomically transitioned into the ACTIVEMANY state.
 */
static dtrace_buffer_t *
dtrace_speculation_buffer(dtrace_state_t *state, processorid_t cpuid,
    dtrace_specid_t which)
{
	dtrace_speculation_t *spec;
	dtrace_speculation_state_t current, new = DTRACESPEC_INACTIVE;
	dtrace_buffer_t *buf;

	if (which == 0)
		return (NULL);

	if (which > (dtrace_specid_t)state->dts_nspeculations) {
		cpu_core[cpuid].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
		return (NULL);
	}

	spec = &state->dts_speculations[which - 1];
	buf = &spec->dtsp_buffer[cpuid];

	do {
		current = spec->dtsp_state;

		switch (current) {
		case DTRACESPEC_INACTIVE:
		case DTRACESPEC_COMMITTINGMANY:
		case DTRACESPEC_DISCARDING:
			return (NULL);

		case DTRACESPEC_COMMITTING:
			ASSERT(buf->dtb_offset == 0);
			return (NULL);

		case DTRACESPEC_ACTIVEONE:
			/*
			 * This speculation is currently active on one CPU.
			 * Check the offset in the buffer; if it's non-zero,
			 * that CPU must be us (and we leave the state alone).
			 * If it's zero, assume that we're starting on a new
			 * CPU -- and change the state to indicate that the
			 * speculation is active on more than one CPU.
			 */
			if (buf->dtb_offset != 0)
				return (buf);

			new = DTRACESPEC_ACTIVEMANY;
			break;

		case DTRACESPEC_ACTIVEMANY:
			return (buf);

		case DTRACESPEC_ACTIVE:
			new = DTRACESPEC_ACTIVEONE;
			break;

		default:
			ASSERT(0);
		}
	} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
	    current, new) != current);

	ASSERT(new == DTRACESPEC_ACTIVEONE || new == DTRACESPEC_ACTIVEMANY);
	return (buf);
}

/*
 * Return a string. In the event that the user lacks the privilege to access
 * arbitrary kernel memory, we copy the string out to scratch memory so that we
 * don't fail access checking.
 *
 * dtrace_dif_variable() uses this routine as a helper for various
 * builtin values such as 'execname' and 'probefunc.'
 */
static uintptr_t
dtrace_dif_varstr(uintptr_t addr, dtrace_state_t *state,
    dtrace_mstate_t *mstate)
{
	uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
	uintptr_t ret;
	size_t strsz;

	/*
	 * The easy case: this probe is allowed to read all of memory, so
	 * we can just return this as a vanilla pointer.
	 */
	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
		return (addr);

	/*
	 * This is the tougher case: we copy the string in question from
	 * kernel memory into scratch memory and return it that way: this
	 * ensures that we won't trip up when access checking tests the
	 * BYREF return value.
	 */
	strsz = dtrace_strlen((char *)addr, size) + 1;

	if (mstate->dtms_scratch_ptr + strsz >
	    mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
		return (0);
	}

	dtrace_strcpy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
	    strsz);
	ret = mstate->dtms_scratch_ptr;
	mstate->dtms_scratch_ptr += strsz;
	return (ret);
}

/*
 * This function implements the DIF emulator's variable lookups. The emulator
 * passes a reserved variable identifier and optional built-in array index.
 */
static uint64_t
dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v,
    uint64_t ndx)
{
	/*
	 * If we're accessing one of the uncached arguments, we'll turn this
	 * into a reference in the args array.
	 */
	if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9) {
		ndx = v - DIF_VAR_ARG0;
		v = DIF_VAR_ARGS;
	}

	switch (v) {
	case DIF_VAR_ARGS:
		ASSERT(mstate->dtms_present & DTRACE_MSTATE_ARGS);
		if (ndx >= sizeof (mstate->dtms_arg) /
		    sizeof (mstate->dtms_arg[0])) {
			/*
			 * APPLE NOTE: Account for introduction of __dtrace_probe()
			 */
			int aframes = mstate->dtms_probe->dtpr_aframes + 3;
			dtrace_vstate_t *vstate = &state->dts_vstate;
			dtrace_provider_t *pv;
			uint64_t val;

			pv = mstate->dtms_probe->dtpr_provider;
			if (pv->dtpv_pops.dtps_getargval != NULL)
				val = pv->dtpv_pops.dtps_getargval(pv->dtpv_arg,
				    mstate->dtms_probe->dtpr_id,
				    mstate->dtms_probe->dtpr_arg, ndx, aframes);
			/* Special case access of arg5 as passed to dtrace_probe_error() (which see.) */
			else if (mstate->dtms_probe->dtpr_id == dtrace_probeid_error && ndx == 5) {
				return ((dtrace_state_t *)(uintptr_t)(mstate->dtms_arg[0]))->dts_arg_error_illval;
			}
			else
				val = dtrace_getarg(ndx, aframes, mstate, vstate);

			/*
			 * This is regrettably required to keep the compiler
			 * from tail-optimizing the call to dtrace_getarg().
			 * The condition always evaluates to true, but the
			 * compiler has no way of figuring that out a priori.
			 * (None of this would be necessary if the compiler
			 * could be relied upon to _always_ tail-optimize
			 * the call to dtrace_getarg() -- but it can't.)
			 */
			if (mstate->dtms_probe != NULL)
				return (val);

			ASSERT(0);
		}

		return (mstate->dtms_arg[ndx]);

	case DIF_VAR_UREGS: {
		thread_t thread;

		if (!dtrace_priv_proc(state))
			return (0);

		if ((thread = current_thread()) == NULL) {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
			cpu_core[CPU->cpu_id].cpuc_dtrace_illval = 0;
			return (0);
		}

		return (dtrace_getreg(find_user_regs(thread), ndx));
	}

	case DIF_VAR_CURTHREAD:
		if (!dtrace_priv_kernel(state))
			return (0);

		return ((uint64_t)(uintptr_t)current_thread());

	case DIF_VAR_TIMESTAMP:
		if (!(mstate->dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
			mstate->dtms_timestamp = dtrace_gethrtime();
			mstate->dtms_present |= DTRACE_MSTATE_TIMESTAMP;
		}
		return (mstate->dtms_timestamp);

	case DIF_VAR_VTIMESTAMP:
		ASSERT(dtrace_vtime_references != 0);
		return (dtrace_get_thread_vtime(current_thread()));

	case DIF_VAR_WALLTIMESTAMP:
		if (!(mstate->dtms_present & DTRACE_MSTATE_WALLTIMESTAMP)) {
			mstate->dtms_walltimestamp = dtrace_gethrestime();
			mstate->dtms_present |= DTRACE_MSTATE_WALLTIMESTAMP;
		}
		return (mstate->dtms_walltimestamp);

	case DIF_VAR_MACHTIMESTAMP:
		if (!(mstate->dtms_present & DTRACE_MSTATE_MACHTIMESTAMP)) {
			mstate->dtms_machtimestamp = mach_absolute_time();
			mstate->dtms_present |= DTRACE_MSTATE_MACHTIMESTAMP;
		}
		return (mstate->dtms_machtimestamp);

	case DIF_VAR_CPU:
		return ((uint64_t) dtrace_get_thread_last_cpu_id(current_thread()));

	case DIF_VAR_IPL:
		if (!dtrace_priv_kernel(state))
			return (0);
		if (!(mstate->dtms_present & DTRACE_MSTATE_IPL)) {
			mstate->dtms_ipl = dtrace_getipl();
			mstate->dtms_present |= DTRACE_MSTATE_IPL;
		}
		return (mstate->dtms_ipl);

	case DIF_VAR_EPID:
		ASSERT(mstate->dtms_present & DTRACE_MSTATE_EPID);
		return (mstate->dtms_epid);

	case DIF_VAR_ID:
		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
		return (mstate->dtms_probe->dtpr_id);

	case DIF_VAR_STACKDEPTH:
		if (!dtrace_priv_kernel(state))
			return (0);
		if (!(mstate->dtms_present & DTRACE_MSTATE_STACKDEPTH)) {
			/*
			 * APPLE NOTE: Account for introduction of __dtrace_probe()
			 */
			int aframes = mstate->dtms_probe->dtpr_aframes + 3;

			mstate->dtms_stackdepth = dtrace_getstackdepth(aframes);
			mstate->dtms_present |= DTRACE_MSTATE_STACKDEPTH;
		}
		return (mstate->dtms_stackdepth);

	case DIF_VAR_USTACKDEPTH:
		if (!dtrace_priv_proc(state))
			return (0);
		if (!(mstate->dtms_present & DTRACE_MSTATE_USTACKDEPTH)) {
			/*
			 * See comment in DIF_VAR_PID.
			 */
			if (DTRACE_ANCHORED(mstate->dtms_probe) &&
			    CPU_ON_INTR(CPU)) {
				mstate->dtms_ustackdepth = 0;
			} else {
				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
				mstate->dtms_ustackdepth =
				    dtrace_getustackdepth();
				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
			}
			mstate->dtms_present |= DTRACE_MSTATE_USTACKDEPTH;
		}
		return (mstate->dtms_ustackdepth);

	case DIF_VAR_CALLER:
		if (!dtrace_priv_kernel(state))
			return (0);
		if (!(mstate->dtms_present & DTRACE_MSTATE_CALLER)) {
			/*
			 * APPLE NOTE: Account for introduction of __dtrace_probe()
			 */
			int aframes = mstate->dtms_probe->dtpr_aframes + 3;

			if (!DTRACE_ANCHORED(mstate->dtms_probe)) {
				/*
				 * If this is an unanchored probe, we are
				 * required to go through the slow path:
				 * dtrace_caller() only guarantees correct
				 * results for anchored probes.
				 */
				pc_t caller[2];

				dtrace_getpcstack(caller, 2, aframes,
				    (uint32_t *)(uintptr_t)mstate->dtms_arg[0]);
				mstate->dtms_caller = caller[1];
			} else if ((mstate->dtms_caller =
			    dtrace_caller(aframes)) == (uintptr_t)-1) {
				/*
				 * We have failed to do this the quick way;
				 * we must resort to the slower approach of
				 * calling dtrace_getpcstack().
				 */
				pc_t caller;

				dtrace_getpcstack(&caller, 1, aframes, NULL);
				mstate->dtms_caller = caller;
			}

			mstate->dtms_present |= DTRACE_MSTATE_CALLER;
		}
		return (mstate->dtms_caller);

	case DIF_VAR_UCALLER:
		if (!dtrace_priv_proc(state))
			return (0);

		if (!(mstate->dtms_present & DTRACE_MSTATE_UCALLER)) {
			uint64_t ustack[3];

			/*
			 * dtrace_getupcstack() fills in the first uint64_t
			 * with the current PID. The second uint64_t will
			 * be the program counter at user-level. The third
			 * uint64_t will contain the caller, which is what
			 * we're after.
			 */
			ustack[2] = 0;
			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
			dtrace_getupcstack(ustack, 3);
			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
			mstate->dtms_ucaller = ustack[2];
			mstate->dtms_present |= DTRACE_MSTATE_UCALLER;
		}

		return (mstate->dtms_ucaller);

	case DIF_VAR_PROBEPROV:
		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
		return (dtrace_dif_varstr(
		    (uintptr_t)mstate->dtms_probe->dtpr_provider->dtpv_name,
		    state, mstate));

	case DIF_VAR_PROBEMOD:
		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
		return (dtrace_dif_varstr(
		    (uintptr_t)mstate->dtms_probe->dtpr_mod,
		    state, mstate));

	case DIF_VAR_PROBEFUNC:
		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
		return (dtrace_dif_varstr(
		    (uintptr_t)mstate->dtms_probe->dtpr_func,
		    state, mstate));

	case DIF_VAR_PROBENAME:
		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
		return (dtrace_dif_varstr(
		    (uintptr_t)mstate->dtms_probe->dtpr_name,
		    state, mstate));

	case DIF_VAR_PID:
		if (!dtrace_priv_proc_relaxed(state))
			return (0);

		/*
		 * Note that we are assuming that an unanchored probe is
		 * always due to a high-level interrupt. (And we're assuming
		 * that there is only a single high-level interrupt.)
		 */
		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
			/* Anchored probe that fires while on an interrupt accrues to process 0 */
			return 0;

		return ((uint64_t)dtrace_proc_selfpid());

	case DIF_VAR_PPID:
		if (!dtrace_priv_proc_relaxed(state))
			return (0);

		/*
		 * See comment in DIF_VAR_PID.
		 */
		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
			return (0);

		return ((uint64_t)dtrace_proc_selfppid());

	case DIF_VAR_TID:
		/* We do not need to check for null current_thread() */
		return thread_tid(current_thread()); /* globally unique */

	case DIF_VAR_PTHREAD_SELF:
		if (!dtrace_priv_proc(state))
			return (0);

		/*
		 * Not currently supported, but we should be able to delta
		 * the dispatchqaddr and dispatchqoffset to get pthread_self.
		 */
		return 0;

	case DIF_VAR_DISPATCHQADDR:
		if (!dtrace_priv_proc(state))
			return (0);

		/* We do not need to check for null current_thread() */
		return thread_dispatchqaddr(current_thread());

	case DIF_VAR_EXECNAME:
	{
		char *xname = (char *)mstate->dtms_scratch_ptr;
		size_t scratch_size = MAXCOMLEN+1;

		/* The scratch allocation's lifetime is that of the clause. */
		if (!DTRACE_INSCRATCH(mstate, scratch_size)) {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
			return 0;
		}

		if (!dtrace_priv_proc_relaxed(state))
			return (0);

		mstate->dtms_scratch_ptr += scratch_size;
		proc_selfname(xname, scratch_size);

		return ((uint64_t)(uintptr_t)xname);
	}

	case DIF_VAR_ZONENAME:
	{
		/* scratch_size is equal to length('global') + 1 for the null-terminator. */
		char *zname = (char *)mstate->dtms_scratch_ptr;
		size_t scratch_size = 6 + 1;

		if (!dtrace_priv_proc(state))
			return (0);

		/* The scratch allocation's lifetime is that of the clause. */
		if (!DTRACE_INSCRATCH(mstate, scratch_size)) {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
			return 0;
		}

		mstate->dtms_scratch_ptr += scratch_size;

		/* The kernel does not provide zonename; it will always return 'global'. */
		strlcpy(zname, "global", scratch_size);

		return ((uint64_t)(uintptr_t)zname);
	}

#if MONOTONIC
	case DIF_VAR_CPUINSTRS:
		return mt_cur_cpu_instrs();

	case DIF_VAR_CPUCYCLES:
		return mt_cur_cpu_cycles();

	case DIF_VAR_VINSTRS:
		return mt_cur_thread_instrs();

	case DIF_VAR_VCYCLES:
		return mt_cur_thread_cycles();
#else /* MONOTONIC */
	case DIF_VAR_CPUINSTRS: /* FALLTHROUGH */
	case DIF_VAR_CPUCYCLES: /* FALLTHROUGH */
	case DIF_VAR_VINSTRS: /* FALLTHROUGH */
	case DIF_VAR_VCYCLES: /* FALLTHROUGH */
		return 0;
#endif /* !MONOTONIC */

	case DIF_VAR_UID:
		if (!dtrace_priv_proc_relaxed(state))
			return (0);

		/*
		 * See comment in DIF_VAR_PID.
		 */
		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
			return (0);

		return ((uint64_t) dtrace_proc_selfruid());

	case DIF_VAR_GID:
		if (!dtrace_priv_proc(state))
			return (0);

		/*
		 * See comment in DIF_VAR_PID.
		 */
		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
			return (0);

		if (dtrace_CRED() != NULL)
			/* Credential does not require lazy initialization. */
			return ((uint64_t)kauth_getgid());
		else {
			/* proc_lock would be taken under kauth_cred_proc_ref() in kauth_cred_get(). */
			DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
			return -1ULL;
		}

	case DIF_VAR_ERRNO: {
		uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
		if (!dtrace_priv_proc(state))
			return (0);

		/*
		 * See comment in DIF_VAR_PID.
		 */
		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
			return (0);

		if (uthread)
			return (uint64_t)uthread->t_dtrace_errno;
		else {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
			return -1ULL;
		}
	}

	default:
		DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
		return (0);
	}
}
3631
/*
 * Emulate the execution of DTrace DIF subroutines invoked by the call
 * opcode. Notice that we don't bother validating the proper number of
 * arguments or their types in the tuple stack. This isn't needed because
 * all argument interpretation is safe by virtue of our load safety -- the
 * worst that can happen is that a bogus program can obtain bogus results.
 */
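/*
 * Concretely, "load safety" means that reads of probe-specified memory go
 * through dtrace_load*()/dtrace_fuword*() (or a copyin performed under
 * CPU_DTRACE_NOFAULT), so a bad address sets a per-CPU fault flag that the
 * caller checks, rather than faulting the kernel.
 */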
3639static void
3640dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs,
3641 dtrace_key_t *tupregs, int nargs,
3642 dtrace_mstate_t *mstate, dtrace_state_t *state)
3643{
3644 volatile uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
3645 volatile uint64_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
3646 dtrace_vstate_t *vstate = &state->dts_vstate;
3647
3648#if !defined(__APPLE__)
3649 union {
3650 mutex_impl_t mi;
3651 uint64_t mx;
3652 } m;
3653
3654 union {
3655 krwlock_t ri;
3656 uintptr_t rw;
3657 } r;
3658#else
3659/* FIXME: awaits lock/mutex work */
3660#endif /* __APPLE__ */
3661
3662 switch (subr) {
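	/*
	 * rand() is only a weak source of pseudo-random numbers: the
	 * expression below applies a single linear congruential step,
	 * (t * 2416 + 374441) % 1771875, to the high-resolution
	 * timestamp t.
	 */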
3663 case DIF_SUBR_RAND:
3664 regs[rd] = (dtrace_gethrtime() * 2416 + 374441) % 1771875;
3665 break;
3666
3667#if !defined(__APPLE__)
3668 case DIF_SUBR_MUTEX_OWNED:
3669 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3670 mstate, vstate)) {
3671 regs[rd] = 0;
3672 break;
3673 }
3674
3675 m.mx = dtrace_load64(tupregs[0].dttk_value);
3676 if (MUTEX_TYPE_ADAPTIVE(&m.mi))
3677 regs[rd] = MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER;
3678 else
3679 regs[rd] = LOCK_HELD(&m.mi.m_spin.m_spinlock);
3680 break;
3681
3682 case DIF_SUBR_MUTEX_OWNER:
3683 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3684 mstate, vstate)) {
3685 regs[rd] = 0;
3686 break;
3687 }
3688
3689 m.mx = dtrace_load64(tupregs[0].dttk_value);
3690 if (MUTEX_TYPE_ADAPTIVE(&m.mi) &&
3691 MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER)
3692 regs[rd] = (uintptr_t)MUTEX_OWNER(&m.mi);
3693 else
3694 regs[rd] = 0;
3695 break;
3696
3697 case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
3698 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3699 mstate, vstate)) {
3700 regs[rd] = 0;
3701 break;
3702 }
3703
3704 m.mx = dtrace_load64(tupregs[0].dttk_value);
3705 regs[rd] = MUTEX_TYPE_ADAPTIVE(&m.mi);
3706 break;
3707
3708 case DIF_SUBR_MUTEX_TYPE_SPIN:
3709 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3710 mstate, vstate)) {
3711 regs[rd] = 0;
3712 break;
3713 }
3714
3715 m.mx = dtrace_load64(tupregs[0].dttk_value);
3716 regs[rd] = MUTEX_TYPE_SPIN(&m.mi);
3717 break;
3718
3719 case DIF_SUBR_RW_READ_HELD: {
3720 uintptr_t tmp;
3721
3722 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
3723 mstate, vstate)) {
3724 regs[rd] = 0;
3725 break;
3726 }
3727
3728 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
3729 regs[rd] = _RW_READ_HELD(&r.ri, tmp);
3730 break;
3731 }
3732
3733 case DIF_SUBR_RW_WRITE_HELD:
3734 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
3735 mstate, vstate)) {
3736 regs[rd] = 0;
3737 break;
3738 }
3739
3740 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
3741 regs[rd] = _RW_WRITE_HELD(&r.ri);
3742 break;
3743
3744 case DIF_SUBR_RW_ISWRITER:
3745 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
3746 mstate, vstate)) {
3747 regs[rd] = 0;
3748 break;
3749 }
3750
3751 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
3752 regs[rd] = _RW_ISWRITER(&r.ri);
3753 break;
3754#else
3755/* FIXME: awaits lock/mutex work */
3756#endif /* __APPLE__ */
3757
3758 case DIF_SUBR_BCOPY: {
3759 /*
3760 * We need to be sure that the destination is in the scratch
3761 * region -- no other region is allowed.
3762 */
3763 uintptr_t src = tupregs[0].dttk_value;
3764 uintptr_t dest = tupregs[1].dttk_value;
3765 size_t size = tupregs[2].dttk_value;
3766
3767 if (!dtrace_inscratch(dest, size, mstate)) {
3768 *flags |= CPU_DTRACE_BADADDR;
3769 *illval = regs[rd];
3770 break;
3771 }
3772
3773 if (!dtrace_canload(src, size, mstate, vstate)) {
3774 regs[rd] = 0;
3775 break;
3776 }
3777
3778 dtrace_bcopy((void *)src, (void *)dest, size);
3779 break;
3780 }
3781
3782 case DIF_SUBR_ALLOCA:
3783 case DIF_SUBR_COPYIN: {
3784 uintptr_t dest = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
3785 uint64_t size =
3786 tupregs[subr == DIF_SUBR_ALLOCA ? 0 : 1].dttk_value;
3787 size_t scratch_size = (dest - mstate->dtms_scratch_ptr) + size;
3788
3789 /*
3790 * Check whether the user can access kernel memory
3791 */
3792 if (dtrace_priv_kernel(state) == 0) {
3793 DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
3794 regs[rd] = 0;
3795 break;
3796 }
3797 /*
3798 * This action doesn't require any credential checks since
3799 * probes will not activate in user contexts to which the
3800 * enabling user does not have permissions.
3801 */
3802
3803 /*
3804 * Rounding up the user allocation size could have overflowed
3805 * a large, bogus allocation (like -1ULL) to 0.
3806 */
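		/*
		 * For illustration: if size were -1ULL and the scratch
		 * pointer needed, say, 4 bytes of alignment padding, then
		 * scratch_size would wrap around to 3, and it is the
		 * "scratch_size < size" test below that catches it.
		 */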
3807 if (scratch_size < size ||
3808 !DTRACE_INSCRATCH(mstate, scratch_size)) {
3809 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3810 regs[rd] = 0;
3811 break;
3812 }
3813
3814 if (subr == DIF_SUBR_COPYIN) {
3815 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3816 if (dtrace_priv_proc(state))
3817 dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
3818 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3819 }
3820
3821 mstate->dtms_scratch_ptr += scratch_size;
3822 regs[rd] = dest;
3823 break;
3824 }
3825
3826 case DIF_SUBR_COPYINTO: {
3827 uint64_t size = tupregs[1].dttk_value;
3828 uintptr_t dest = tupregs[2].dttk_value;
3829
3830 /*
3831 * This action doesn't require any credential checks since
3832 * probes will not activate in user contexts to which the
3833 * enabling user does not have permissions.
3834 */
3835 if (!dtrace_inscratch(dest, size, mstate)) {
3836 *flags |= CPU_DTRACE_BADADDR;
3837 *illval = regs[rd];
3838 break;
3839 }
3840
3841 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3842 if (dtrace_priv_proc(state))
3843 dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
3844 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3845 break;
3846 }
3847
3848 case DIF_SUBR_COPYINSTR: {
3849 uintptr_t dest = mstate->dtms_scratch_ptr;
3850 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3851
3852 if (nargs > 1 && tupregs[1].dttk_value < size)
3853 size = tupregs[1].dttk_value + 1;
3854
3855 /*
3856 * This action doesn't require any credential checks since
3857 * probes will not activate in user contexts to which the
3858 * enabling user does not have permissions.
3859 */
3860 if (!DTRACE_INSCRATCH(mstate, size)) {
3861 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3862 regs[rd] = 0;
3863 break;
3864 }
3865
3866 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3867 if (dtrace_priv_proc(state))
3868 dtrace_copyinstr(tupregs[0].dttk_value, dest, size, flags);
3869 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3870
3871 ((char *)dest)[size - 1] = '\0';
3872 mstate->dtms_scratch_ptr += size;
3873 regs[rd] = dest;
3874 break;
3875 }
3876
3877 case DIF_SUBR_MSGSIZE:
3878 case DIF_SUBR_MSGDSIZE: {
		/* Darwin does not implement SysV STREAMS messages. */
3880 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3881 regs[rd] = 0;
3882 break;
3883 }
3884
3885 case DIF_SUBR_PROGENYOF: {
3886 pid_t pid = tupregs[0].dttk_value;
3887 struct proc *p = current_proc();
3888 int rval = 0, lim = nprocs;
3889
		while (p && (lim-- > 0)) {
3891 pid_t ppid;
3892
3893 ppid = (pid_t)dtrace_load32((uintptr_t)&(p->p_pid));
3894 if (*flags & CPU_DTRACE_FAULT)
3895 break;
3896
3897 if (ppid == pid) {
3898 rval = 1;
3899 break;
3900 }
3901
3902 if (ppid == 0)
3903 break; /* Can't climb process tree any further. */
3904
3905 p = (struct proc *)dtrace_loadptr((uintptr_t)&(p->p_pptr));
3906 if (*flags & CPU_DTRACE_FAULT)
3907 break;
3908 }
3909
3910 regs[rd] = rval;
3911 break;
3912 }
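	/*
	 * The walk above is what services D predicates such as
	 * /progenyof($target)/; it terminates at pid 0 or after nprocs
	 * parent links, whichever comes first.
	 */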
3913
3914 case DIF_SUBR_SPECULATION:
3915 regs[rd] = dtrace_speculation(state);
3916 break;
3917
3918
3919 case DIF_SUBR_COPYOUT: {
3920 uintptr_t kaddr = tupregs[0].dttk_value;
3921 user_addr_t uaddr = tupregs[1].dttk_value;
3922 uint64_t size = tupregs[2].dttk_value;
3923
3924 if (!dtrace_destructive_disallow &&
3925 dtrace_priv_proc_control(state) &&
3926 !dtrace_istoxic(kaddr, size) &&
3927 dtrace_canload(kaddr, size, mstate, vstate)) {
3928 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3929 dtrace_copyout(kaddr, uaddr, size, flags);
3930 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3931 }
3932 break;
3933 }
3934
3935 case DIF_SUBR_COPYOUTSTR: {
3936 uintptr_t kaddr = tupregs[0].dttk_value;
3937 user_addr_t uaddr = tupregs[1].dttk_value;
3938 uint64_t size = tupregs[2].dttk_value;
3939 size_t lim;
3940
3941 if (!dtrace_destructive_disallow &&
3942 dtrace_priv_proc_control(state) &&
3943 !dtrace_istoxic(kaddr, size) &&
3944 dtrace_strcanload(kaddr, size, &lim, mstate, vstate)) {
3945 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3946 dtrace_copyoutstr(kaddr, uaddr, lim, flags);
3947 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3948 }
3949 break;
3950 }
3951
3952 case DIF_SUBR_STRLEN: {
3953 size_t size = state->dts_options[DTRACEOPT_STRSIZE];
3954 uintptr_t addr = (uintptr_t)tupregs[0].dttk_value;
3955 size_t lim;
3956
3957 if (!dtrace_strcanload(addr, size, &lim, mstate, vstate)) {
3958 regs[rd] = 0;
3959 break;
3960 }
3961
3962 regs[rd] = dtrace_strlen((char *)addr, lim);
3963
3964 break;
3965 }
3966
3967 case DIF_SUBR_STRCHR:
3968 case DIF_SUBR_STRRCHR: {
3969 /*
3970 * We're going to iterate over the string looking for the
3971 * specified character. We will iterate until we have reached
3972 * the string length or we have found the character. If this
3973 * is DIF_SUBR_STRRCHR, we will look for the last occurrence
3974 * of the specified character instead of the first.
3975 */
3976 uintptr_t addr = tupregs[0].dttk_value;
3977 uintptr_t addr_limit;
3978 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3979 size_t lim;
3980 char c, target = (char)tupregs[1].dttk_value;
3981
3982 if (!dtrace_strcanload(addr, size, &lim, mstate, vstate)) {
3983 regs[rd] = 0;
3984 break;
3985 }
3986 addr_limit = addr + lim;
3987
3988 for (regs[rd] = 0; addr < addr_limit; addr++) {
3989 if ((c = dtrace_load8(addr)) == target) {
3990 regs[rd] = addr;
3991
3992 if (subr == DIF_SUBR_STRCHR)
3993 break;
3994 }
3995
3996 if (c == '\0')
3997 break;
3998 }
3999
4000 break;
4001 }
4002
4003 case DIF_SUBR_STRSTR:
4004 case DIF_SUBR_INDEX:
4005 case DIF_SUBR_RINDEX: {
4006 /*
4007 * We're going to iterate over the string looking for the
4008 * specified string. We will iterate until we have reached
4009 * the string length or we have found the string. (Yes, this
4010 * is done in the most naive way possible -- but considering
4011 * that the string we're searching for is likely to be
4012 * relatively short, the complexity of Rabin-Karp or similar
4013 * hardly seems merited.)
4014 */
4015 char *addr = (char *)(uintptr_t)tupregs[0].dttk_value;
4016 char *substr = (char *)(uintptr_t)tupregs[1].dttk_value;
4017 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4018 size_t len = dtrace_strlen(addr, size);
4019 size_t sublen = dtrace_strlen(substr, size);
4020 char *limit = addr + len, *orig = addr;
4021 int notfound = subr == DIF_SUBR_STRSTR ? 0 : -1;
4022 int inc = 1;
4023
4024 regs[rd] = notfound;
4025
4026 if (!dtrace_canload((uintptr_t)addr, len + 1, mstate, vstate)) {
4027 regs[rd] = 0;
4028 break;
4029 }
4030
4031 if (!dtrace_canload((uintptr_t)substr, sublen + 1, mstate,
4032 vstate)) {
4033 regs[rd] = 0;
4034 break;
4035 }
4036
4037 /*
4038 * strstr() and index()/rindex() have similar semantics if
4039 * both strings are the empty string: strstr() returns a
4040 * pointer to the (empty) string, and index() and rindex()
4041 * both return index 0 (regardless of any position argument).
4042 */
4043 if (sublen == 0 && len == 0) {
4044 if (subr == DIF_SUBR_STRSTR)
4045 regs[rd] = (uintptr_t)addr;
4046 else
4047 regs[rd] = 0;
4048 break;
4049 }
4050
4051 if (subr != DIF_SUBR_STRSTR) {
4052 if (subr == DIF_SUBR_RINDEX) {
4053 limit = orig - 1;
4054 addr += len;
4055 inc = -1;
4056 }
4057
4058 /*
4059 * Both index() and rindex() take an optional position
4060 * argument that denotes the starting position.
4061 */
4062 if (nargs == 3) {
4063 int64_t pos = (int64_t)tupregs[2].dttk_value;
4064
4065 /*
4066 * If the position argument to index() is
4067 * negative, Perl implicitly clamps it at
4068 * zero. This semantic is a little surprising
4069 * given the special meaning of negative
4070 * positions to similar Perl functions like
4071 * substr(), but it appears to reflect a
4072 * notion that index() can start from a
4073 * negative index and increment its way up to
4074 * the string. Given this notion, Perl's
4075 * rindex() is at least self-consistent in
4076 * that it implicitly clamps positions greater
4077 * than the string length to be the string
4078 * length. Where Perl completely loses
4079 * coherence, however, is when the specified
4080 * substring is the empty string (""). In
4081 * this case, even if the position is
4082 * negative, rindex() returns 0 -- and even if
4083 * the position is greater than the length,
4084 * index() returns the string length. These
4085 * semantics violate the notion that index()
4086 * should never return a value less than the
4087 * specified position and that rindex() should
4088 * never return a value greater than the
4089 * specified position. (One assumes that
4090 * these semantics are artifacts of Perl's
4091 * implementation and not the results of
4092 * deliberate design -- it beggars belief that
4093 * even Larry Wall could desire such oddness.)
4094 * While in the abstract one would wish for
4095 * consistent position semantics across
4096 * substr(), index() and rindex() -- or at the
4097 * very least self-consistent position
4098 * semantics for index() and rindex() -- we
4099 * instead opt to keep with the extant Perl
4100 * semantics, in all their broken glory. (Do
4101 * we have more desire to maintain Perl's
4102 * semantics than Perl does? Probably.)
4103 */
4104 if (subr == DIF_SUBR_RINDEX) {
4105 if (pos < 0) {
4106 if (sublen == 0)
4107 regs[rd] = 0;
4108 break;
4109 }
4110
4111 if ((size_t)pos > len)
4112 pos = len;
4113 } else {
4114 if (pos < 0)
4115 pos = 0;
4116
4117 if ((size_t)pos >= len) {
4118 if (sublen == 0)
4119 regs[rd] = len;
4120 break;
4121 }
4122 }
4123
4124 addr = orig + pos;
4125 }
4126 }
4127
4128 for (regs[rd] = notfound; addr != limit; addr += inc) {
4129 if (dtrace_strncmp(addr, substr, sublen) == 0) {
4130 if (subr != DIF_SUBR_STRSTR) {
4131 /*
4132 * As D index() and rindex() are
4133 * modeled on Perl (and not on awk),
4134 * we return a zero-based (and not a
4135 * one-based) index. (For you Perl
4136 * weenies: no, we're not going to add
4137 * $[ -- and shouldn't you be at a con
4138 * or something?)
4139 */
4140 regs[rd] = (uintptr_t)(addr - orig);
4141 break;
4142 }
4143
4144 ASSERT(subr == DIF_SUBR_STRSTR);
4145 regs[rd] = (uintptr_t)addr;
4146 break;
4147 }
4148 }
4149
4150 break;
4151 }
4152
4153 case DIF_SUBR_STRTOK: {
4154 uintptr_t addr = tupregs[0].dttk_value;
4155 uintptr_t tokaddr = tupregs[1].dttk_value;
4156 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4157 uintptr_t limit, toklimit;
4158 size_t clim;
4159 char *dest = (char *)mstate->dtms_scratch_ptr;
		uint8_t c = '\0', tokmap[32];	/* 256 / 8 */
4161 uint64_t i = 0;
4162
4163 /*
4164 * Check both the token buffer and (later) the input buffer,
4165 * since both could be non-scratch addresses.
4166 */
4167 if (!dtrace_strcanload(tokaddr, size, &clim, mstate, vstate)) {
4168 regs[rd] = 0;
4169 break;
4170 }
4171 toklimit = tokaddr + clim;
4172
4173 if (!DTRACE_INSCRATCH(mstate, size)) {
4174 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4175 regs[rd] = 0;
4176 break;
4177 }
4178
4179 if (addr == 0) {
4180 /*
4181 * If the address specified is NULL, we use our saved
4182 * strtok pointer from the mstate. Note that this
4183 * means that the saved strtok pointer is _only_
4184 * valid within multiple enablings of the same probe --
4185 * it behaves like an implicit clause-local variable.
4186 */
4187 addr = mstate->dtms_strtok;
4188 limit = mstate->dtms_strtok_limit;
4189 } else {
4190 /*
4191 * If the user-specified address is non-NULL we must
4192 * access check it. This is the only time we have
4193 * a chance to do so, since this address may reside
4194 * in the string table of this clause-- future calls
4195 * (when we fetch addr from mstate->dtms_strtok)
4196 * would fail this access check.
4197 */
4198 if (!dtrace_strcanload(addr, size, &clim, mstate,
4199 vstate)) {
4200 regs[rd] = 0;
4201 break;
4202 }
4203 limit = addr + clim;
4204 }
4205
4206 /*
4207 * First, zero the token map, and then process the token
4208 * string -- setting a bit in the map for every character
4209 * found in the token string.
4210 */
4211 for (i = 0; i < (int)sizeof (tokmap); i++)
4212 tokmap[i] = 0;
4213
4214 for (; tokaddr < toklimit; tokaddr++) {
4215 if ((c = dtrace_load8(tokaddr)) == '\0')
4216 break;
4217
4218 ASSERT((c >> 3) < sizeof (tokmap));
4219 tokmap[c >> 3] |= (1 << (c & 0x7));
4220 }
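		/*
		 * tokmap is thus a 256-bit set with one bit per byte value;
		 * membership of c is tokmap[c >> 3] & (1 << (c & 0x7)).
		 * For example, '/' (0x2f) maps to bit 7 of tokmap[5].
		 */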
4221
4222 for (; addr < limit; addr++) {
4223 /*
4224 * We're looking for a character that is _not_
4225 * contained in the token string.
4226 */
4227 if ((c = dtrace_load8(addr)) == '\0')
4228 break;
4229
4230 if (!(tokmap[c >> 3] & (1 << (c & 0x7))))
4231 break;
4232 }
4233
4234 if (c == '\0') {
4235 /*
4236 * We reached the end of the string without finding
4237 * any character that was not in the token string.
4238 * We return NULL in this case, and we set the saved
4239 * address to NULL as well.
4240 */
4241 regs[rd] = 0;
4242 mstate->dtms_strtok = 0;
4243 mstate->dtms_strtok_limit = 0;
4244 break;
4245 }
4246
4247 /*
4248 * From here on, we're copying into the destination string.
4249 */
4250 for (i = 0; addr < limit && i < size - 1; addr++) {
4251 if ((c = dtrace_load8(addr)) == '\0')
4252 break;
4253
4254 if (tokmap[c >> 3] & (1 << (c & 0x7)))
4255 break;
4256
4257 ASSERT(i < size);
4258 dest[i++] = c;
4259 }
4260
4261 ASSERT(i < size);
4262 dest[i] = '\0';
4263 regs[rd] = (uintptr_t)dest;
4264 mstate->dtms_scratch_ptr += size;
4265 mstate->dtms_strtok = addr;
4266 mstate->dtms_strtok_limit = limit;
4267 break;
4268 }
4269
4270 case DIF_SUBR_SUBSTR: {
4271 uintptr_t s = tupregs[0].dttk_value;
4272 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4273 char *d = (char *)mstate->dtms_scratch_ptr;
4274 int64_t index = (int64_t)tupregs[1].dttk_value;
4275 int64_t remaining = (int64_t)tupregs[2].dttk_value;
4276 size_t len = dtrace_strlen((char *)s, size);
4277 int64_t i = 0;
4278
4279 if (!dtrace_canload(s, len + 1, mstate, vstate)) {
4280 regs[rd] = 0;
4281 break;
4282 }
4283
4284 if (!DTRACE_INSCRATCH(mstate, size)) {
4285 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4286 regs[rd] = 0;
4287 break;
4288 }
4289
4290 if (nargs <= 2)
4291 remaining = (int64_t)size;
4292
4293 if (index < 0) {
4294 index += len;
4295
4296 if (index < 0 && index + remaining > 0) {
4297 remaining += index;
4298 index = 0;
4299 }
4300 }
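		/*
		 * As in D's substr(), a negative index counts back from the
		 * end of the string: e.g. for s = "coconut", substr(s, -3)
		 * maps index -3 to 4 above and so yields "nut".
		 */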
4301
4302 if ((size_t)index >= len || index < 0) {
4303 remaining = 0;
4304 } else if (remaining < 0) {
4305 remaining += len - index;
4306 } else if ((uint64_t)index + (uint64_t)remaining > size) {
4307 remaining = size - index;
4308 }
4309
4310 for (i = 0; i < remaining; i++) {
4311 if ((d[i] = dtrace_load8(s + index + i)) == '\0')
4312 break;
4313 }
4314
4315 d[i] = '\0';
4316
4317 mstate->dtms_scratch_ptr += size;
4318 regs[rd] = (uintptr_t)d;
4319 break;
4320 }
4321
4322 case DIF_SUBR_GETMAJOR:
4323 regs[rd] = (uintptr_t)major( (dev_t)tupregs[0].dttk_value );
4324 break;
4325
4326 case DIF_SUBR_GETMINOR:
4327 regs[rd] = (uintptr_t)minor( (dev_t)tupregs[0].dttk_value );
4328 break;
4329
4330 case DIF_SUBR_DDI_PATHNAME: {
4331 /* APPLE NOTE: currently unsupported on Darwin */
4332 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
4333 regs[rd] = 0;
4334 break;
4335 }
4336
4337 case DIF_SUBR_STRJOIN: {
4338 char *d = (char *)mstate->dtms_scratch_ptr;
4339 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4340 uintptr_t s1 = tupregs[0].dttk_value;
4341 uintptr_t s2 = tupregs[1].dttk_value;
4342 uint64_t i = 0, j = 0;
4343 size_t lim1, lim2;
4344 char c;
4345
4346 if (!dtrace_strcanload(s1, size, &lim1, mstate, vstate) ||
4347 !dtrace_strcanload(s2, size, &lim2, mstate, vstate)) {
4348 regs[rd] = 0;
4349 break;
4350 }
4351
4352 if (!DTRACE_INSCRATCH(mstate, size)) {
4353 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4354 regs[rd] = 0;
4355 break;
4356 }
4357
4358 for (;;) {
4359 if (i >= size) {
4360 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4361 regs[rd] = 0;
4362 break;
4363 }
4364 c = (i >= lim1) ? '\0' : dtrace_load8(s1++);
4365 if ((d[i++] = c) == '\0') {
4366 i--;
4367 break;
4368 }
4369 }
4370
4371 for (;;) {
4372 if (i >= size) {
4373 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4374 regs[rd] = 0;
4375 break;
4376 }
4377 c = (j++ >= lim2) ? '\0' : dtrace_load8(s2++);
4378 if ((d[i++] = c) == '\0')
4379 break;
4380 }
4381
4382 if (i < size) {
4383 mstate->dtms_scratch_ptr += i;
4384 regs[rd] = (uintptr_t)d;
4385 }
4386
4387 break;
4388 }
4389
4390 case DIF_SUBR_LLTOSTR: {
4391 int64_t i = (int64_t)tupregs[0].dttk_value;
4392 uint64_t val, digit;
4393 uint64_t size = 65; /* enough room for 2^64 in binary */
4394 char *end = (char *)mstate->dtms_scratch_ptr + size - 1;
4395 int base = 10;
4396
4397 if (nargs > 1) {
4398 if ((base = tupregs[1].dttk_value) <= 1 ||
4399 base > ('z' - 'a' + 1) + ('9' - '0' + 1)) {
4400 *flags |= CPU_DTRACE_ILLOP;
4401 break;
4402 }
4403 }
4404
4405 val = (base == 10 && i < 0) ? i * -1 : i;
4406
4407 if (!DTRACE_INSCRATCH(mstate, size)) {
4408 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4409 regs[rd] = 0;
4410 break;
4411 }
4412
4413 for (*end-- = '\0'; val; val /= base) {
4414 if ((digit = val % base) <= '9' - '0') {
4415 *end-- = '0' + digit;
4416 } else {
4417 *end-- = 'a' + (digit - ('9' - '0') - 1);
4418 }
4419 }
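		/*
		 * Digits are generated least-significant first into the tail
		 * of the buffer: e.g. lltostr(-10) stores '0', then '1', then
		 * (below) the '-', and returns a pointer to "-10".
		 */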
4420
4421 if (i == 0 && base == 16)
4422 *end-- = '0';
4423
4424 if (base == 16)
4425 *end-- = 'x';
4426
4427 if (i == 0 || base == 8 || base == 16)
4428 *end-- = '0';
4429
4430 if (i < 0 && base == 10)
4431 *end-- = '-';
4432
4433 regs[rd] = (uintptr_t)end + 1;
4434 mstate->dtms_scratch_ptr += size;
4435 break;
4436 }
4437
4438 case DIF_SUBR_HTONS:
4439 case DIF_SUBR_NTOHS:
4440#ifdef _BIG_ENDIAN
4441 regs[rd] = (uint16_t)tupregs[0].dttk_value;
4442#else
4443 regs[rd] = DT_BSWAP_16((uint16_t)tupregs[0].dttk_value);
4444#endif
4445 break;
4446
4447
4448 case DIF_SUBR_HTONL:
4449 case DIF_SUBR_NTOHL:
4450#ifdef _BIG_ENDIAN
4451 regs[rd] = (uint32_t)tupregs[0].dttk_value;
4452#else
4453 regs[rd] = DT_BSWAP_32((uint32_t)tupregs[0].dttk_value);
4454#endif
4455 break;
4456
4457
4458 case DIF_SUBR_HTONLL:
4459 case DIF_SUBR_NTOHLL:
4460#ifdef _BIG_ENDIAN
4461 regs[rd] = (uint64_t)tupregs[0].dttk_value;
4462#else
4463 regs[rd] = DT_BSWAP_64((uint64_t)tupregs[0].dttk_value);
4464#endif
4465 break;
4466
4467
4468 case DIF_SUBR_DIRNAME:
4469 case DIF_SUBR_BASENAME: {
4470 char *dest = (char *)mstate->dtms_scratch_ptr;
4471 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4472 uintptr_t src = tupregs[0].dttk_value;
4473 int i, j, len = dtrace_strlen((char *)src, size);
4474 int lastbase = -1, firstbase = -1, lastdir = -1;
4475 int start, end;
4476
4477 if (!dtrace_canload(src, len + 1, mstate, vstate)) {
4478 regs[rd] = 0;
4479 break;
4480 }
4481
4482 if (!DTRACE_INSCRATCH(mstate, size)) {
4483 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4484 regs[rd] = 0;
4485 break;
4486 }
4487
4488 /*
		 * The basename and dirname of a zero-length string are both
		 * defined to be ".".
4491 */
4492 if (len == 0) {
4493 len = 1;
4494 src = (uintptr_t)".";
4495 }
4496
4497 /*
4498 * Start from the back of the string, moving back toward the
4499 * front until we see a character that isn't a slash. That
4500 * character is the last character in the basename.
4501 */
4502 for (i = len - 1; i >= 0; i--) {
4503 if (dtrace_load8(src + i) != '/')
4504 break;
4505 }
4506
4507 if (i >= 0)
4508 lastbase = i;
4509
4510 /*
4511 * Starting from the last character in the basename, move
4512 * towards the front until we find a slash. The character
4513 * that we processed immediately before that is the first
4514 * character in the basename.
4515 */
4516 for (; i >= 0; i--) {
4517 if (dtrace_load8(src + i) == '/')
4518 break;
4519 }
4520
4521 if (i >= 0)
4522 firstbase = i + 1;
4523
4524 /*
4525 * Now keep going until we find a non-slash character. That
4526 * character is the last character in the dirname.
4527 */
4528 for (; i >= 0; i--) {
4529 if (dtrace_load8(src + i) != '/')
4530 break;
4531 }
4532
4533 if (i >= 0)
4534 lastdir = i;
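		/*
		 * For example, with src = "/usr/lib/", the loops above leave
		 * lastbase = 7 ('b'), firstbase = 5 ('l') and lastdir = 3
		 * ('r'): the basename is "lib" and the dirname is "/usr".
		 */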
4535
4536 ASSERT(!(lastbase == -1 && firstbase != -1));
4537 ASSERT(!(firstbase == -1 && lastdir != -1));
4538
4539 if (lastbase == -1) {
4540 /*
4541 * We didn't find a non-slash character. We know that
4542 * the length is non-zero, so the whole string must be
4543 * slashes. In either the dirname or the basename
4544 * case, we return '/'.
4545 */
4546 ASSERT(firstbase == -1);
4547 firstbase = lastbase = lastdir = 0;
4548 }
4549
4550 if (firstbase == -1) {
4551 /*
4552 * The entire string consists only of a basename
4553 * component. If we're looking for dirname, we need
4554 * to change our string to be just "."; if we're
4555 * looking for a basename, we'll just set the first
4556 * character of the basename to be 0.
4557 */
4558 if (subr == DIF_SUBR_DIRNAME) {
4559 ASSERT(lastdir == -1);
4560 src = (uintptr_t)".";
4561 lastdir = 0;
4562 } else {
4563 firstbase = 0;
4564 }
4565 }
4566
4567 if (subr == DIF_SUBR_DIRNAME) {
4568 if (lastdir == -1) {
4569 /*
4570 * We know that we have a slash in the name --
4571 * or lastdir would be set to 0, above. And
4572 * because lastdir is -1, we know that this
4573 * slash must be the first character. (That
4574 * is, the full string must be of the form
4575 * "/basename".) In this case, the last
4576 * character of the directory name is 0.
4577 */
4578 lastdir = 0;
4579 }
4580
4581 start = 0;
4582 end = lastdir;
4583 } else {
4584 ASSERT(subr == DIF_SUBR_BASENAME);
4585 ASSERT(firstbase != -1 && lastbase != -1);
4586 start = firstbase;
4587 end = lastbase;
4588 }
4589
4590 for (i = start, j = 0; i <= end && (uint64_t)j < size - 1; i++, j++)
4591 dest[j] = dtrace_load8(src + i);
4592
4593 dest[j] = '\0';
4594 regs[rd] = (uintptr_t)dest;
4595 mstate->dtms_scratch_ptr += size;
4596 break;
4597 }
4598
4599 case DIF_SUBR_CLEANPATH: {
4600 char *dest = (char *)mstate->dtms_scratch_ptr, c;
4601 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4602 uintptr_t src = tupregs[0].dttk_value;
4603 size_t lim;
4604 size_t i = 0, j = 0;
4605
4606 if (!dtrace_strcanload(src, size, &lim, mstate, vstate)) {
4607 regs[rd] = 0;
4608 break;
4609 }
4610
4611 if (!DTRACE_INSCRATCH(mstate, size)) {
4612 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4613 regs[rd] = 0;
4614 break;
4615 }
4616
4617 /*
4618 * Move forward, loading each character.
4619 */
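		/*
		 * For example, "/usr/./lib/../bin" has its "/./" component
		 * dropped and its "/../" backed out below, leaving "/usr/bin"
		 * in the destination buffer.
		 */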
4620 do {
4621 c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
4622next:
			if ((uint64_t)(j + 5) >= size)	/* 5 = sizeof ("/..c"), the most we may yet append */
4624 break;
4625
4626 if (c != '/') {
4627 dest[j++] = c;
4628 continue;
4629 }
4630
4631 c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
4632
4633 if (c == '/') {
4634 /*
4635 * We have two slashes -- we can just advance
4636 * to the next character.
4637 */
4638 goto next;
4639 }
4640
4641 if (c != '.') {
4642 /*
4643 * This is not "." and it's not ".." -- we can
4644 * just store the "/" and this character and
4645 * drive on.
4646 */
4647 dest[j++] = '/';
4648 dest[j++] = c;
4649 continue;
4650 }
4651
4652 c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
4653
4654 if (c == '/') {
4655 /*
4656 * This is a "/./" component. We're not going
4657 * to store anything in the destination buffer;
4658 * we're just going to go to the next component.
4659 */
4660 goto next;
4661 }
4662
4663 if (c != '.') {
4664 /*
4665 * This is not ".." -- we can just store the
4666 * "/." and this character and continue
4667 * processing.
4668 */
4669 dest[j++] = '/';
4670 dest[j++] = '.';
4671 dest[j++] = c;
4672 continue;
4673 }
4674
4675 c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
4676
4677 if (c != '/' && c != '\0') {
4678 /*
4679 * This is not ".." -- it's "..[mumble]".
4680 * We'll store the "/.." and this character
4681 * and continue processing.
4682 */
4683 dest[j++] = '/';
4684 dest[j++] = '.';
4685 dest[j++] = '.';
4686 dest[j++] = c;
4687 continue;
4688 }
4689
4690 /*
4691 * This is "/../" or "/..\0". We need to back up
4692 * our destination pointer until we find a "/".
4693 */
4694 i--;
4695 while (j != 0 && dest[--j] != '/')
4696 continue;
4697
4698 if (c == '\0')
4699 dest[++j] = '/';
4700 } while (c != '\0');
4701
4702 dest[j] = '\0';
4703 regs[rd] = (uintptr_t)dest;
4704 mstate->dtms_scratch_ptr += size;
4705 break;
4706 }
4707
4708 case DIF_SUBR_INET_NTOA:
4709 case DIF_SUBR_INET_NTOA6:
4710 case DIF_SUBR_INET_NTOP: {
4711 size_t size;
4712 int af, argi, i;
4713 char *base, *end;
4714
4715 if (subr == DIF_SUBR_INET_NTOP) {
4716 af = (int)tupregs[0].dttk_value;
4717 argi = 1;
4718 } else {
4719 af = subr == DIF_SUBR_INET_NTOA ? AF_INET: AF_INET6;
4720 argi = 0;
4721 }
4722
4723 if (af == AF_INET) {
4724#if !defined(__APPLE__)
4725 ipaddr_t ip4;
4726#else
4727 uint32_t ip4;
4728#endif /* __APPLE__ */
4729 uint8_t *ptr8, val;
4730
4731 /*
4732 * Safely load the IPv4 address.
4733 */
4734#if !defined(__APPLE__)
4735 ip4 = dtrace_load32(tupregs[argi].dttk_value);
4736#else
4737 if (!dtrace_canload(tupregs[argi].dttk_value, sizeof(ip4),
4738 mstate, vstate)) {
4739 regs[rd] = 0;
4740 break;
4741 }
4742
4743 dtrace_bcopy(
4744 (void *)(uintptr_t)tupregs[argi].dttk_value,
4745 (void *)(uintptr_t)&ip4, sizeof (ip4));
4746#endif /* __APPLE__ */
4747 /*
4748 * Check an IPv4 string will fit in scratch.
4749 */
4750#if !defined(__APPLE__)
4751 size = INET_ADDRSTRLEN;
4752#else
4753 size = MAX_IPv4_STR_LEN;
4754#endif /* __APPLE__ */
4755 if (!DTRACE_INSCRATCH(mstate, size)) {
4756 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4757 regs[rd] = 0;
4758 break;
4759 }
4760 base = (char *)mstate->dtms_scratch_ptr;
4761 end = (char *)mstate->dtms_scratch_ptr + size - 1;
4762
4763 /*
4764 * Stringify as a dotted decimal quad.
4765 */
4766 *end-- = '\0';
4767 ptr8 = (uint8_t *)&ip4;
4768 for (i = 3; i >= 0; i--) {
4769 val = ptr8[i];
4770
4771 if (val == 0) {
4772 *end-- = '0';
4773 } else {
4774 for (; val; val /= 10) {
4775 *end-- = '0' + (val % 10);
4776 }
4777 }
4778
4779 if (i > 0)
4780 *end-- = '.';
4781 }
4782 ASSERT(end + 1 >= base);
4783
4784 } else if (af == AF_INET6) {
4785#if defined(__APPLE__)
4786#define _S6_un __u6_addr
4787#define _S6_u8 __u6_addr8
4788#endif /* __APPLE__ */
4789 struct in6_addr ip6;
4790 int firstzero, tryzero, numzero, v6end;
4791 uint16_t val;
4792 const char digits[] = "0123456789abcdef";
4793
4794 /*
			 * Stringify using RFC 1884 convention 2, i.e. 16-bit
			 * hexadecimal values with the single longest zero run
			 * compressed as "::", in lower case, e.g.
			 * fe80::214:4fff:fe0b:76c8. The IPv4 embedded form is
			 * returned for inet_ntop(); just the IPv4 string is
			 * returned for inet_ntoa6().
4801 */
4802
4803 if (!dtrace_canload(tupregs[argi].dttk_value,
4804 sizeof(struct in6_addr), mstate, vstate)) {
4805 regs[rd] = 0;
4806 break;
4807 }
4808
4809 /*
4810 * Safely load the IPv6 address.
4811 */
4812 dtrace_bcopy(
4813 (void *)(uintptr_t)tupregs[argi].dttk_value,
4814 (void *)(uintptr_t)&ip6, sizeof (struct in6_addr));
4815
4816 /*
4817 * Check an IPv6 string will fit in scratch.
4818 */
4819 size = INET6_ADDRSTRLEN;
4820 if (!DTRACE_INSCRATCH(mstate, size)) {
4821 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4822 regs[rd] = 0;
4823 break;
4824 }
4825 base = (char *)mstate->dtms_scratch_ptr;
4826 end = (char *)mstate->dtms_scratch_ptr + size - 1;
4827 *end-- = '\0';
4828
4829 /*
4830 * Find the longest run of 16 bit zero values
4831 * for the single allowed zero compression - "::".
4832 */
4833 firstzero = -1;
4834 tryzero = -1;
4835 numzero = 1;
4836 for (i = 0; i < (int)sizeof (struct in6_addr); i++) {
4837 if (ip6._S6_un._S6_u8[i] == 0 &&
4838 tryzero == -1 && i % 2 == 0) {
4839 tryzero = i;
4840 continue;
4841 }
4842
4843 if (tryzero != -1 &&
4844 (ip6._S6_un._S6_u8[i] != 0 ||
4845 i == sizeof (struct in6_addr) - 1)) {
4846
4847 if (i - tryzero <= numzero) {
4848 tryzero = -1;
4849 continue;
4850 }
4851
4852 firstzero = tryzero;
4853 numzero = i - i % 2 - tryzero;
4854 tryzero = -1;
4855
4856 if (ip6._S6_un._S6_u8[i] == 0 &&
4857 i == sizeof (struct in6_addr) - 1)
4858 numzero += 2;
4859 }
4860 }
4861 ASSERT(firstzero + numzero <= (int)sizeof (struct in6_addr));
4862
4863 /*
4864 * Check for an IPv4 embedded address.
4865 */
4866 v6end = sizeof (struct in6_addr) - 2;
4867 if (IN6_IS_ADDR_V4MAPPED(&ip6) ||
4868 IN6_IS_ADDR_V4COMPAT(&ip6)) {
4869 for (i = sizeof (struct in6_addr) - 1;
4870 i >= (int)DTRACE_V4MAPPED_OFFSET; i--) {
4871 ASSERT(end >= base);
4872
4873 val = ip6._S6_un._S6_u8[i];
4874
4875 if (val == 0) {
4876 *end-- = '0';
4877 } else {
4878 for (; val; val /= 10) {
4879 *end-- = '0' + val % 10;
4880 }
4881 }
4882
4883 if (i > (int)DTRACE_V4MAPPED_OFFSET)
4884 *end-- = '.';
4885 }
4886
4887 if (subr == DIF_SUBR_INET_NTOA6)
4888 goto inetout;
4889
4890 /*
4891 * Set v6end to skip the IPv4 address that
4892 * we have already stringified.
4893 */
4894 v6end = 10;
4895 }
4896
4897 /*
4898 * Build the IPv6 string by working through the
4899 * address in reverse.
4900 */
4901 for (i = v6end; i >= 0; i -= 2) {
4902 ASSERT(end >= base);
4903
4904 if (i == firstzero + numzero - 2) {
4905 *end-- = ':';
4906 *end-- = ':';
4907 i -= numzero - 2;
4908 continue;
4909 }
4910
4911 if (i < 14 && i != firstzero - 2)
4912 *end-- = ':';
4913
4914 val = (ip6._S6_un._S6_u8[i] << 8) +
4915 ip6._S6_un._S6_u8[i + 1];
4916
4917 if (val == 0) {
4918 *end-- = '0';
4919 } else {
4920 for (; val; val /= 16) {
4921 *end-- = digits[val % 16];
4922 }
4923 }
4924 }
4925 ASSERT(end + 1 >= base);
4926
4927#if defined(__APPLE__)
4928#undef _S6_un
4929#undef _S6_u8
4930#endif /* __APPLE__ */
4931 } else {
4932 /*
			 * The user didn't use AF_INET or AF_INET6.
4934 */
4935 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
4936 regs[rd] = 0;
4937 break;
4938 }
4939
4940inetout: regs[rd] = (uintptr_t)end + 1;
4941 mstate->dtms_scratch_ptr += size;
4942 break;
4943 }
4944
4945 case DIF_SUBR_TOUPPER:
4946 case DIF_SUBR_TOLOWER: {
4947 uintptr_t src = tupregs[0].dttk_value;
4948 char *dest = (char *)mstate->dtms_scratch_ptr;
4949 char lower, upper, base, c;
4950 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4951 size_t len = dtrace_strlen((char*) src, size);
4952 size_t i = 0;
4953
4954 lower = (subr == DIF_SUBR_TOUPPER) ? 'a' : 'A';
4955 upper = (subr == DIF_SUBR_TOUPPER) ? 'z' : 'Z';
4956 base = (subr == DIF_SUBR_TOUPPER) ? 'A' : 'a';
4957
4958 if (!dtrace_canload(src, len + 1, mstate, vstate)) {
4959 regs[rd] = 0;
4960 break;
4961 }
4962
4963 if (!DTRACE_INSCRATCH(mstate, size)) {
4964 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4965 regs[rd] = 0;
4966 break;
4967 }
4968
4969 for (i = 0; i < size - 1; ++i) {
4970 if ((c = dtrace_load8(src + i)) == '\0')
4971 break;
4972 if (c >= lower && c <= upper)
4973 c = base + (c - lower);
4974 dest[i] = c;
4975 }
4976
4977 ASSERT(i < size);
4978
4979 dest[i] = '\0';
4980 regs[rd] = (uintptr_t) dest;
4981 mstate->dtms_scratch_ptr += size;
4982
4983 break;
4984 }
4985
4986#if defined(__APPLE__)
4987 case DIF_SUBR_VM_KERNEL_ADDRPERM: {
4988 if (!dtrace_priv_kernel(state)) {
4989 regs[rd] = 0;
4990 } else {
4991 regs[rd] = VM_KERNEL_ADDRPERM((vm_offset_t) tupregs[0].dttk_value);
4992 }
4993
4994 break;
4995 }
4996
4997 case DIF_SUBR_KDEBUG_TRACE: {
4998 uint32_t debugid;
4999 uintptr_t args[4] = {0};
5000 int i;
5001
5002 if (nargs < 2 || nargs > 5) {
5003 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5004 break;
5005 }
5006
5007 if (dtrace_destructive_disallow)
5008 return;
5009
5010 debugid = tupregs[0].dttk_value;
5011 for (i = 0; i < nargs - 1; i++)
5012 args[i] = tupregs[i + 1].dttk_value;
5013
5014 kernel_debug(debugid, args[0], args[1], args[2], args[3], 0);
5015
5016 break;
5017 }
5018
5019 case DIF_SUBR_KDEBUG_TRACE_STRING: {
5020 if (nargs != 3) {
5021 break;
5022 }
5023
5024 if (dtrace_destructive_disallow)
5025 return;
5026
5027 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5028 uint32_t debugid = tupregs[0].dttk_value;
5029 uint64_t str_id = tupregs[1].dttk_value;
5030 uintptr_t src = tupregs[2].dttk_value;
5031 size_t lim;
5032 char buf[size];
5033 char* str = NULL;
5034
5035 if (src != (uintptr_t)0) {
5036 str = buf;
5037 if (!dtrace_strcanload(src, size, &lim, mstate, vstate)) {
5038 break;
5039 }
5040 dtrace_strcpy((void*)src, buf, size);
5041 }
5042
5043 (void)kernel_debug_string(debugid, &str_id, str);
5044 regs[rd] = str_id;
5045
5046 break;
5047 }
5048#endif
5049
5050 }
5051}
5052
5053/*
5054 * Emulate the execution of DTrace IR instructions specified by the given
5055 * DIF object. This function is deliberately void of assertions as all of
5056 * the necessary checks are handled by a call to dtrace_difo_validate().
5057 */
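/*
 * Each DIF instruction is a single 32-bit word: the DIF_INSTR_OP,
 * DIF_INSTR_R1, DIF_INSTR_R2 and DIF_INSTR_RD macros used below unpack the
 * opcode and register operands, and DIF_INSTR_LABEL, DIF_INSTR_VAR,
 * DIF_INSTR_INTEGER, DIF_INSTR_STRING and DIF_INSTR_SUBR unpack the
 * wide-immediate forms (see <sys/dtrace.h> for the encoding).
 */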
5058static uint64_t
5059dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate,
5060 dtrace_vstate_t *vstate, dtrace_state_t *state)
5061{
5062 const dif_instr_t *text = difo->dtdo_buf;
5063 const uint_t textlen = difo->dtdo_len;
5064 const char *strtab = difo->dtdo_strtab;
5065 const uint64_t *inttab = difo->dtdo_inttab;
5066
5067 uint64_t rval = 0;
5068 dtrace_statvar_t *svar;
5069 dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
5070 dtrace_difv_t *v;
5071 volatile uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
5072 volatile uint64_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
5073
5074 dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
5075 uint64_t regs[DIF_DIR_NREGS];
5076 uint64_t *tmp;
5077
5078 uint8_t cc_n = 0, cc_z = 0, cc_v = 0, cc_c = 0;
5079 int64_t cc_r;
5080 uint_t pc = 0, id, opc = 0;
5081 uint8_t ttop = 0;
5082 dif_instr_t instr;
5083 uint_t r1, r2, rd;
5084
5085 /*
5086 * We stash the current DIF object into the machine state: we need it
5087 * for subsequent access checking.
5088 */
5089 mstate->dtms_difo = difo;
5090
5091 regs[DIF_REG_R0] = 0; /* %r0 is fixed at zero */
5092
5093 while (pc < textlen && !(*flags & CPU_DTRACE_FAULT)) {
5094 opc = pc;
5095
5096 instr = text[pc++];
5097 r1 = DIF_INSTR_R1(instr);
5098 r2 = DIF_INSTR_R2(instr);
5099 rd = DIF_INSTR_RD(instr);
5100
5101 switch (DIF_INSTR_OP(instr)) {
5102 case DIF_OP_OR:
5103 regs[rd] = regs[r1] | regs[r2];
5104 break;
5105 case DIF_OP_XOR:
5106 regs[rd] = regs[r1] ^ regs[r2];
5107 break;
5108 case DIF_OP_AND:
5109 regs[rd] = regs[r1] & regs[r2];
5110 break;
5111 case DIF_OP_SLL:
5112 regs[rd] = regs[r1] << regs[r2];
5113 break;
5114 case DIF_OP_SRL:
5115 regs[rd] = regs[r1] >> regs[r2];
5116 break;
5117 case DIF_OP_SUB:
5118 regs[rd] = regs[r1] - regs[r2];
5119 break;
5120 case DIF_OP_ADD:
5121 regs[rd] = regs[r1] + regs[r2];
5122 break;
5123 case DIF_OP_MUL:
5124 regs[rd] = regs[r1] * regs[r2];
5125 break;
5126 case DIF_OP_SDIV:
5127 if (regs[r2] == 0) {
5128 regs[rd] = 0;
5129 *flags |= CPU_DTRACE_DIVZERO;
5130 } else {
5131 regs[rd] = (int64_t)regs[r1] /
5132 (int64_t)regs[r2];
5133 }
5134 break;
5135
5136 case DIF_OP_UDIV:
5137 if (regs[r2] == 0) {
5138 regs[rd] = 0;
5139 *flags |= CPU_DTRACE_DIVZERO;
5140 } else {
5141 regs[rd] = regs[r1] / regs[r2];
5142 }
5143 break;
5144
5145 case DIF_OP_SREM:
5146 if (regs[r2] == 0) {
5147 regs[rd] = 0;
5148 *flags |= CPU_DTRACE_DIVZERO;
5149 } else {
5150 regs[rd] = (int64_t)regs[r1] %
5151 (int64_t)regs[r2];
5152 }
5153 break;
5154
5155 case DIF_OP_UREM:
5156 if (regs[r2] == 0) {
5157 regs[rd] = 0;
5158 *flags |= CPU_DTRACE_DIVZERO;
5159 } else {
5160 regs[rd] = regs[r1] % regs[r2];
5161 }
5162 break;
5163
5164 case DIF_OP_NOT:
5165 regs[rd] = ~regs[r1];
5166 break;
5167 case DIF_OP_MOV:
5168 regs[rd] = regs[r1];
5169 break;
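		/*
		 * cmp and tst set processor-style N/Z/V/C condition codes,
		 * which the branches below combine: e.g. the signed "bge"
		 * tests (cc_n ^ cc_v) == 0, while the unsigned "bgeu" tests
		 * cc_c == 0.
		 */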
5170 case DIF_OP_CMP:
5171 cc_r = regs[r1] - regs[r2];
5172 cc_n = cc_r < 0;
5173 cc_z = cc_r == 0;
5174 cc_v = 0;
5175 cc_c = regs[r1] < regs[r2];
5176 break;
5177 case DIF_OP_TST:
5178 cc_n = cc_v = cc_c = 0;
5179 cc_z = regs[r1] == 0;
5180 break;
5181 case DIF_OP_BA:
5182 pc = DIF_INSTR_LABEL(instr);
5183 break;
5184 case DIF_OP_BE:
5185 if (cc_z)
5186 pc = DIF_INSTR_LABEL(instr);
5187 break;
5188 case DIF_OP_BNE:
5189 if (cc_z == 0)
5190 pc = DIF_INSTR_LABEL(instr);
5191 break;
5192 case DIF_OP_BG:
5193 if ((cc_z | (cc_n ^ cc_v)) == 0)
5194 pc = DIF_INSTR_LABEL(instr);
5195 break;
5196 case DIF_OP_BGU:
5197 if ((cc_c | cc_z) == 0)
5198 pc = DIF_INSTR_LABEL(instr);
5199 break;
5200 case DIF_OP_BGE:
5201 if ((cc_n ^ cc_v) == 0)
5202 pc = DIF_INSTR_LABEL(instr);
5203 break;
5204 case DIF_OP_BGEU:
5205 if (cc_c == 0)
5206 pc = DIF_INSTR_LABEL(instr);
5207 break;
5208 case DIF_OP_BL:
5209 if (cc_n ^ cc_v)
5210 pc = DIF_INSTR_LABEL(instr);
5211 break;
5212 case DIF_OP_BLU:
5213 if (cc_c)
5214 pc = DIF_INSTR_LABEL(instr);
5215 break;
5216 case DIF_OP_BLE:
5217 if (cc_z | (cc_n ^ cc_v))
5218 pc = DIF_INSTR_LABEL(instr);
5219 break;
5220 case DIF_OP_BLEU:
5221 if (cc_c | cc_z)
5222 pc = DIF_INSTR_LABEL(instr);
5223 break;
5224 case DIF_OP_RLDSB:
5225 if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
5226 *flags |= CPU_DTRACE_KPRIV;
5227 *illval = regs[r1];
5228 break;
5229 }
5230 /*FALLTHROUGH*/
5231 case DIF_OP_LDSB:
5232 regs[rd] = (int8_t)dtrace_load8(regs[r1]);
5233 break;
5234 case DIF_OP_RLDSH:
5235 if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
5236 *flags |= CPU_DTRACE_KPRIV;
5237 *illval = regs[r1];
5238 break;
5239 }
5240 /*FALLTHROUGH*/
5241 case DIF_OP_LDSH:
5242 regs[rd] = (int16_t)dtrace_load16(regs[r1]);
5243 break;
5244 case DIF_OP_RLDSW:
5245 if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
5246 *flags |= CPU_DTRACE_KPRIV;
5247 *illval = regs[r1];
5248 break;
5249 }
5250 /*FALLTHROUGH*/
5251 case DIF_OP_LDSW:
5252 regs[rd] = (int32_t)dtrace_load32(regs[r1]);
5253 break;
5254 case DIF_OP_RLDUB:
5255 if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
5256 *flags |= CPU_DTRACE_KPRIV;
5257 *illval = regs[r1];
5258 break;
5259 }
5260 /*FALLTHROUGH*/
5261 case DIF_OP_LDUB:
5262 regs[rd] = dtrace_load8(regs[r1]);
5263 break;
5264 case DIF_OP_RLDUH:
5265 if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
5266 *flags |= CPU_DTRACE_KPRIV;
5267 *illval = regs[r1];
5268 break;
5269 }
5270 /*FALLTHROUGH*/
5271 case DIF_OP_LDUH:
5272 regs[rd] = dtrace_load16(regs[r1]);
5273 break;
5274 case DIF_OP_RLDUW:
5275 if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
5276 *flags |= CPU_DTRACE_KPRIV;
5277 *illval = regs[r1];
5278 break;
5279 }
5280 /*FALLTHROUGH*/
5281 case DIF_OP_LDUW:
5282 regs[rd] = dtrace_load32(regs[r1]);
5283 break;
5284 case DIF_OP_RLDX:
5285 if (!dtrace_canstore(regs[r1], 8, mstate, vstate)) {
5286 *flags |= CPU_DTRACE_KPRIV;
5287 *illval = regs[r1];
5288 break;
5289 }
5290 /*FALLTHROUGH*/
5291 case DIF_OP_LDX:
5292 regs[rd] = dtrace_load64(regs[r1]);
5293 break;
/*
 * A 32-bit Darwin kernel may fetch from a 64-bit user address space, so
 * the user-load opcodes below -- DIF_OP_ULDSB, DIF_OP_ULDSH, DIF_OP_ULDSW,
 * DIF_OP_ULDUB, DIF_OP_ULDUH, DIF_OP_ULDUW and DIF_OP_ULDX -- must not
 * cast regs to uintptr_t.
 */
5300 case DIF_OP_ULDSB:
5301 regs[rd] = (int8_t)
5302 dtrace_fuword8(regs[r1]);
5303 break;
5304 case DIF_OP_ULDSH:
5305 regs[rd] = (int16_t)
5306 dtrace_fuword16(regs[r1]);
5307 break;
5308 case DIF_OP_ULDSW:
5309 regs[rd] = (int32_t)
5310 dtrace_fuword32(regs[r1]);
5311 break;
5312 case DIF_OP_ULDUB:
5313 regs[rd] =
5314 dtrace_fuword8(regs[r1]);
5315 break;
5316 case DIF_OP_ULDUH:
5317 regs[rd] =
5318 dtrace_fuword16(regs[r1]);
5319 break;
5320 case DIF_OP_ULDUW:
5321 regs[rd] =
5322 dtrace_fuword32(regs[r1]);
5323 break;
5324 case DIF_OP_ULDX:
5325 regs[rd] =
5326 dtrace_fuword64(regs[r1]);
5327 break;
5328 case DIF_OP_RET:
5329 rval = regs[rd];
5330 pc = textlen;
5331 break;
5332 case DIF_OP_NOP:
5333 break;
5334 case DIF_OP_SETX:
5335 regs[rd] = inttab[DIF_INSTR_INTEGER(instr)];
5336 break;
5337 case DIF_OP_SETS:
5338 regs[rd] = (uint64_t)(uintptr_t)
5339 (strtab + DIF_INSTR_STRING(instr));
5340 break;
5341 case DIF_OP_SCMP: {
5342 size_t sz = state->dts_options[DTRACEOPT_STRSIZE];
5343 uintptr_t s1 = regs[r1];
5344 uintptr_t s2 = regs[r2];
5345 size_t lim1 = sz, lim2 = sz;
5346
5347 if (s1 != 0 &&
5348 !dtrace_strcanload(s1, sz, &lim1, mstate, vstate))
5349 break;
5350 if (s2 != 0 &&
5351 !dtrace_strcanload(s2, sz, &lim2, mstate, vstate))
5352 break;
5353
5354 cc_r = dtrace_strncmp((char *)s1, (char *)s2,
5355 MIN(lim1, lim2));
5356
5357 cc_n = cc_r < 0;
5358 cc_z = cc_r == 0;
5359 cc_v = cc_c = 0;
5360 break;
5361 }
5362 case DIF_OP_LDGA:
5363 regs[rd] = dtrace_dif_variable(mstate, state,
5364 r1, regs[r2]);
5365 break;
5366 case DIF_OP_LDGS:
5367 id = DIF_INSTR_VAR(instr);
5368
5369 if (id >= DIF_VAR_OTHER_UBASE) {
5370 uintptr_t a;
5371
5372 id -= DIF_VAR_OTHER_UBASE;
5373 svar = vstate->dtvs_globals[id];
5374 ASSERT(svar != NULL);
5375 v = &svar->dtsv_var;
5376
5377 if (!(v->dtdv_type.dtdt_flags & DIF_TF_BYREF)) {
5378 regs[rd] = svar->dtsv_data;
5379 break;
5380 }
5381
5382 a = (uintptr_t)svar->dtsv_data;
5383
5384 if (*(uint8_t *)a == UINT8_MAX) {
5385 /*
5386 * If the 0th byte is set to UINT8_MAX
5387 * then this is to be treated as a
5388 * reference to a NULL variable.
5389 */
5390 regs[rd] = 0;
5391 } else {
5392 regs[rd] = a + sizeof (uint64_t);
5393 }
5394
5395 break;
5396 }
5397
5398 regs[rd] = dtrace_dif_variable(mstate, state, id, 0);
5399 break;
5400
5401 case DIF_OP_STGS:
5402 id = DIF_INSTR_VAR(instr);
5403
5404 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5405 id -= DIF_VAR_OTHER_UBASE;
5406
5407 VERIFY(id < (uint_t)vstate->dtvs_nglobals);
5408 svar = vstate->dtvs_globals[id];
5409 ASSERT(svar != NULL);
5410 v = &svar->dtsv_var;
5411
5412 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5413 uintptr_t a = (uintptr_t)svar->dtsv_data;
5414 size_t lim;
5415
5416 ASSERT(a != 0);
5417 ASSERT(svar->dtsv_size != 0);
5418
5419 if (regs[rd] == 0) {
5420 *(uint8_t *)a = UINT8_MAX;
5421 break;
5422 } else {
5423 *(uint8_t *)a = 0;
5424 a += sizeof (uint64_t);
5425 }
5426 if (!dtrace_vcanload(
5427 (void *)(uintptr_t)regs[rd], &v->dtdv_type,
5428 &lim, mstate, vstate))
5429 break;
5430
5431 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5432 (void *)a, &v->dtdv_type, lim);
5433 break;
5434 }
5435
5436 svar->dtsv_data = regs[rd];
5437 break;
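		/*
		 * Note on the by-ref layout used above (and by the
		 * local-variable cases below): a by-ref variable is stored as
		 * an 8-byte header followed by the payload, where byte 0 of
		 * the header is a NULL tag -- UINT8_MAX means the variable
		 * holds NULL -- which is why loads and stores step past
		 * sizeof (uint64_t) to reach the data.
		 */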
5438
5439 case DIF_OP_LDTA:
5440 /*
5441 * There are no DTrace built-in thread-local arrays at
5442 * present. This opcode is saved for future work.
5443 */
5444 *flags |= CPU_DTRACE_ILLOP;
5445 regs[rd] = 0;
5446 break;
5447
5448 case DIF_OP_LDLS:
5449 id = DIF_INSTR_VAR(instr);
5450
5451 if (id < DIF_VAR_OTHER_UBASE) {
5452 /*
5453 * For now, this has no meaning.
5454 */
5455 regs[rd] = 0;
5456 break;
5457 }
5458
5459 id -= DIF_VAR_OTHER_UBASE;
5460
5461 ASSERT(id < (uint_t)vstate->dtvs_nlocals);
5462 ASSERT(vstate->dtvs_locals != NULL);
5463 svar = vstate->dtvs_locals[id];
5464 ASSERT(svar != NULL);
5465 v = &svar->dtsv_var;
5466
5467 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5468 uintptr_t a = (uintptr_t)svar->dtsv_data;
5469 size_t sz = v->dtdv_type.dtdt_size;
5470
5471 sz += sizeof (uint64_t);
5472 ASSERT(svar->dtsv_size == (int)NCPU * sz);
5473 a += CPU->cpu_id * sz;
5474
5475 if (*(uint8_t *)a == UINT8_MAX) {
5476 /*
5477 * If the 0th byte is set to UINT8_MAX
5478 * then this is to be treated as a
5479 * reference to a NULL variable.
5480 */
5481 regs[rd] = 0;
5482 } else {
5483 regs[rd] = a + sizeof (uint64_t);
5484 }
5485
5486 break;
5487 }
5488
5489 ASSERT(svar->dtsv_size == (int)NCPU * sizeof (uint64_t));
5490 tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
5491 regs[rd] = tmp[CPU->cpu_id];
5492 break;
5493
5494 case DIF_OP_STLS:
5495 id = DIF_INSTR_VAR(instr);
5496
5497 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5498 id -= DIF_VAR_OTHER_UBASE;
5499 VERIFY(id < (uint_t)vstate->dtvs_nlocals);
5500 ASSERT(vstate->dtvs_locals != NULL);
5501 svar = vstate->dtvs_locals[id];
5502 ASSERT(svar != NULL);
5503 v = &svar->dtsv_var;
5504
5505 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5506 uintptr_t a = (uintptr_t)svar->dtsv_data;
5507 size_t sz = v->dtdv_type.dtdt_size;
5508 size_t lim;
5509
5510 sz += sizeof (uint64_t);
5511 ASSERT(svar->dtsv_size == (int)NCPU * sz);
5512 a += CPU->cpu_id * sz;
5513
5514 if (regs[rd] == 0) {
5515 *(uint8_t *)a = UINT8_MAX;
5516 break;
5517 } else {
5518 *(uint8_t *)a = 0;
5519 a += sizeof (uint64_t);
5520 }
5521
5522 if (!dtrace_vcanload(
5523 (void *)(uintptr_t)regs[rd], &v->dtdv_type,
5524 &lim, mstate, vstate))
5525 break;
5526
5527 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5528 (void *)a, &v->dtdv_type, lim);
5529 break;
5530 }
5531
5532 ASSERT(svar->dtsv_size == (int)NCPU * sizeof (uint64_t));
5533 tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
5534 tmp[CPU->cpu_id] = regs[rd];
5535 break;
5536
5537 case DIF_OP_LDTS: {
5538 dtrace_dynvar_t *dvar;
5539 dtrace_key_t *key;
5540
5541 id = DIF_INSTR_VAR(instr);
5542 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5543 id -= DIF_VAR_OTHER_UBASE;
5544 v = &vstate->dtvs_tlocals[id];
5545
5546 key = &tupregs[DIF_DTR_NREGS];
5547 key[0].dttk_value = (uint64_t)id;
5548 key[0].dttk_size = 0;
5549 DTRACE_TLS_THRKEY(key[1].dttk_value);
5550 key[1].dttk_size = 0;
5551
5552 dvar = dtrace_dynvar(dstate, 2, key,
5553 sizeof (uint64_t), DTRACE_DYNVAR_NOALLOC,
5554 mstate, vstate);
5555
5556 if (dvar == NULL) {
5557 regs[rd] = 0;
5558 break;
5559 }
5560
5561 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5562 regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
5563 } else {
5564 regs[rd] = *((uint64_t *)dvar->dtdv_data);
5565 }
5566
5567 break;
5568 }
5569
5570 case DIF_OP_STTS: {
5571 dtrace_dynvar_t *dvar;
5572 dtrace_key_t *key;
5573
5574 id = DIF_INSTR_VAR(instr);
5575 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5576 id -= DIF_VAR_OTHER_UBASE;
5577 VERIFY(id < (uint_t)vstate->dtvs_ntlocals);
5578
5579 key = &tupregs[DIF_DTR_NREGS];
5580 key[0].dttk_value = (uint64_t)id;
5581 key[0].dttk_size = 0;
5582 DTRACE_TLS_THRKEY(key[1].dttk_value);
5583 key[1].dttk_size = 0;
5584 v = &vstate->dtvs_tlocals[id];
5585
5586 dvar = dtrace_dynvar(dstate, 2, key,
5587 v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
5588 v->dtdv_type.dtdt_size : sizeof (uint64_t),
5589 regs[rd] ? DTRACE_DYNVAR_ALLOC :
5590 DTRACE_DYNVAR_DEALLOC, mstate, vstate);
5591
5592 /*
5593 * Given that we're storing to thread-local data,
5594 * we need to flush our predicate cache.
5595 */
5596 dtrace_set_thread_predcache(current_thread(), 0);
5597
5598 if (dvar == NULL)
5599 break;
5600
5601 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5602 size_t lim;
5603
5604 if (!dtrace_vcanload(
5605 (void *)(uintptr_t)regs[rd],
5606 &v->dtdv_type, &lim, mstate, vstate))
5607 break;
5608
5609 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5610 dvar->dtdv_data, &v->dtdv_type, lim);
5611 } else {
5612 *((uint64_t *)dvar->dtdv_data) = regs[rd];
5613 }
5614
5615 break;
5616 }
5617
5618 case DIF_OP_SRA:
5619 regs[rd] = (int64_t)regs[r1] >> regs[r2];
5620 break;
5621
5622 case DIF_OP_CALL:
5623 dtrace_dif_subr(DIF_INSTR_SUBR(instr), rd,
5624 regs, tupregs, ttop, mstate, state);
5625 break;
5626
5627 case DIF_OP_PUSHTR:
5628 if (ttop == DIF_DTR_NREGS) {
5629 *flags |= CPU_DTRACE_TUPOFLOW;
5630 break;
5631 }
5632
5633 if (r1 == DIF_TYPE_STRING) {
5634 /*
5635 * If this is a string type and the size is 0,
5636 * we'll use the system-wide default string
5637 * size. Note that we are _not_ looking at
5638 * the value of the DTRACEOPT_STRSIZE option;
5639 * had this been set, we would expect to have
5640 * a non-zero size value in the "pushtr".
5641 */
5642 tupregs[ttop].dttk_size =
5643 dtrace_strlen((char *)(uintptr_t)regs[rd],
5644 regs[r2] ? regs[r2] :
5645 dtrace_strsize_default) + 1;
5646 } else {
5647 if (regs[r2] > LONG_MAX) {
5648 *flags |= CPU_DTRACE_ILLOP;
5649 break;
5650 }
5651 tupregs[ttop].dttk_size = regs[r2];
5652 }
5653
5654 tupregs[ttop++].dttk_value = regs[rd];
5655 break;
5656
5657 case DIF_OP_PUSHTV:
5658 if (ttop == DIF_DTR_NREGS) {
5659 *flags |= CPU_DTRACE_TUPOFLOW;
5660 break;
5661 }
5662
5663 tupregs[ttop].dttk_value = regs[rd];
5664 tupregs[ttop++].dttk_size = 0;
5665 break;
5666
5667 case DIF_OP_POPTS:
5668 if (ttop != 0)
5669 ttop--;
5670 break;
5671
5672 case DIF_OP_FLUSHTS:
5673 ttop = 0;
5674 break;
5675
5676 case DIF_OP_LDGAA:
5677 case DIF_OP_LDTAA: {
5678 dtrace_dynvar_t *dvar;
5679 dtrace_key_t *key = tupregs;
5680 uint_t nkeys = ttop;
5681
5682 id = DIF_INSTR_VAR(instr);
5683 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5684 id -= DIF_VAR_OTHER_UBASE;
5685
5686 key[nkeys].dttk_value = (uint64_t)id;
5687 key[nkeys++].dttk_size = 0;
5688
5689 if (DIF_INSTR_OP(instr) == DIF_OP_LDTAA) {
5690 DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
5691 key[nkeys++].dttk_size = 0;
5692 VERIFY(id < (uint_t)vstate->dtvs_ntlocals);
5693 v = &vstate->dtvs_tlocals[id];
5694 } else {
5695 VERIFY(id < (uint_t)vstate->dtvs_nglobals);
5696 v = &vstate->dtvs_globals[id]->dtsv_var;
5697 }
5698
5699 dvar = dtrace_dynvar(dstate, nkeys, key,
5700 v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
5701 v->dtdv_type.dtdt_size : sizeof (uint64_t),
5702 DTRACE_DYNVAR_NOALLOC, mstate, vstate);
5703
5704 if (dvar == NULL) {
5705 regs[rd] = 0;
5706 break;
5707 }
5708
5709 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5710 regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
5711 } else {
5712 regs[rd] = *((uint64_t *)dvar->dtdv_data);
5713 }
5714
5715 break;
5716 }
5717
5718 case DIF_OP_STGAA:
5719 case DIF_OP_STTAA: {
5720 dtrace_dynvar_t *dvar;
5721 dtrace_key_t *key = tupregs;
5722 uint_t nkeys = ttop;
5723
5724 id = DIF_INSTR_VAR(instr);
5725 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5726 id -= DIF_VAR_OTHER_UBASE;
5727
5728 key[nkeys].dttk_value = (uint64_t)id;
5729 key[nkeys++].dttk_size = 0;
5730
5731 if (DIF_INSTR_OP(instr) == DIF_OP_STTAA) {
5732 DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
5733 key[nkeys++].dttk_size = 0;
5734 VERIFY(id < (uint_t)vstate->dtvs_ntlocals);
5735 v = &vstate->dtvs_tlocals[id];
5736 } else {
5737 VERIFY(id < (uint_t)vstate->dtvs_nglobals);
5738 v = &vstate->dtvs_globals[id]->dtsv_var;
5739 }
5740
5741 dvar = dtrace_dynvar(dstate, nkeys, key,
5742 v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
5743 v->dtdv_type.dtdt_size : sizeof (uint64_t),
5744 regs[rd] ? DTRACE_DYNVAR_ALLOC :
5745 DTRACE_DYNVAR_DEALLOC, mstate, vstate);
5746
5747 if (dvar == NULL)
5748 break;
5749
5750 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5751 size_t lim;
5752
5753 if (!dtrace_vcanload(
5754 (void *)(uintptr_t)regs[rd], &v->dtdv_type,
5755 &lim, mstate, vstate))
5756 break;
5757
5758 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5759 dvar->dtdv_data, &v->dtdv_type, lim);
5760 } else {
5761 *((uint64_t *)dvar->dtdv_data) = regs[rd];
5762 }
5763
5764 break;
5765 }
5766
5767 case DIF_OP_ALLOCS: {
5768 uintptr_t ptr = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
5769 size_t size = ptr - mstate->dtms_scratch_ptr + regs[r1];
5770
5771 /*
5772 * Rounding up the user allocation size could have
5773 * overflowed large, bogus allocations (like -1ULL) to
5774 * 0.
5775 */
5776 if (size < regs[r1] ||
5777 !DTRACE_INSCRATCH(mstate, size)) {
5778 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5779 regs[rd] = 0;
5780 break;
5781 }
5782
5783 dtrace_bzero((void *) mstate->dtms_scratch_ptr, size);
5784 mstate->dtms_scratch_ptr += size;
5785 regs[rd] = ptr;
5786 break;
5787 }
5788
5789 case DIF_OP_COPYS:
5790 if (!dtrace_canstore(regs[rd], regs[r2],
5791 mstate, vstate)) {
5792 *flags |= CPU_DTRACE_BADADDR;
5793 *illval = regs[rd];
5794 break;
5795 }
5796
5797 if (!dtrace_canload(regs[r1], regs[r2], mstate, vstate))
5798 break;
5799
5800 dtrace_bcopy((void *)(uintptr_t)regs[r1],
5801 (void *)(uintptr_t)regs[rd], (size_t)regs[r2]);
5802 break;
5803
5804 case DIF_OP_STB:
5805 if (!dtrace_canstore(regs[rd], 1, mstate, vstate)) {
5806 *flags |= CPU_DTRACE_BADADDR;
5807 *illval = regs[rd];
5808 break;
5809 }
5810 *((uint8_t *)(uintptr_t)regs[rd]) = (uint8_t)regs[r1];
5811 break;
5812
5813 case DIF_OP_STH:
5814 if (!dtrace_canstore(regs[rd], 2, mstate, vstate)) {
5815 *flags |= CPU_DTRACE_BADADDR;
5816 *illval = regs[rd];
5817 break;
5818 }
5819 if (regs[rd] & 1) {
5820 *flags |= CPU_DTRACE_BADALIGN;
5821 *illval = regs[rd];
5822 break;
5823 }
5824 *((uint16_t *)(uintptr_t)regs[rd]) = (uint16_t)regs[r1];
5825 break;
5826
5827 case DIF_OP_STW:
5828 if (!dtrace_canstore(regs[rd], 4, mstate, vstate)) {
5829 *flags |= CPU_DTRACE_BADADDR;
5830 *illval = regs[rd];
5831 break;
5832 }
5833 if (regs[rd] & 3) {
5834 *flags |= CPU_DTRACE_BADALIGN;
5835 *illval = regs[rd];
5836 break;
5837 }
5838 *((uint32_t *)(uintptr_t)regs[rd]) = (uint32_t)regs[r1];
5839 break;
5840
5841 case DIF_OP_STX:
5842 if (!dtrace_canstore(regs[rd], 8, mstate, vstate)) {
5843 *flags |= CPU_DTRACE_BADADDR;
5844 *illval = regs[rd];
5845 break;
5846 }
5847
5848 /*
			 * Darwin: memory allocated by kmem_zalloc() from
			 * dtrace_difo_init() is only guaranteed to be 4-byte
			 * aligned, so check 4-byte rather than 8-byte alignment.
5851 */
5852 if (regs[rd] & 3) {
5853 *flags |= CPU_DTRACE_BADALIGN;
5854 *illval = regs[rd];
5855 break;
5856 }
5857 *((uint64_t *)(uintptr_t)regs[rd]) = regs[r1];
5858 break;
5859 }
5860 }
5861
5862 if (!(*flags & CPU_DTRACE_FAULT))
5863 return (rval);
5864
5865 mstate->dtms_fltoffs = opc * sizeof (dif_instr_t);
5866 mstate->dtms_present |= DTRACE_MSTATE_FLTOFFS;
5867
5868 return (0);
5869}
5870
5871static void
5872dtrace_action_breakpoint(dtrace_ecb_t *ecb)
5873{
5874 dtrace_probe_t *probe = ecb->dte_probe;
5875 dtrace_provider_t *prov = probe->dtpr_provider;
5876 char c[DTRACE_FULLNAMELEN + 80], *str;
5877 const char *msg = "dtrace: breakpoint action at probe ";
5878 const char *ecbmsg = " (ecb ";
5879 uintptr_t mask = (0xf << (sizeof (uintptr_t) * NBBY / 4));
5880 uintptr_t val = (uintptr_t)ecb;
5881 int shift = (sizeof (uintptr_t) * NBBY) - 4, i = 0;
5882
5883 if (dtrace_destructive_disallow)
5884 return;
5885
5886 /*
5887 * It's impossible to be taking action on the NULL probe.
5888 */
5889 ASSERT(probe != NULL);
5890
5891 /*
5892 * This is a poor man's (destitute man's?) sprintf(): we want to
5893 * print the provider name, module name, function name and name of
5894 * the probe, along with the hex address of the ECB with the breakpoint
5895 * action -- all of which we must place in the character buffer by
5896 * hand.
5897 */
5898 while (*msg != '\0')
5899 c[i++] = *msg++;
5900
5901 for (str = prov->dtpv_name; *str != '\0'; str++)
5902 c[i++] = *str;
5903 c[i++] = ':';
5904
5905 for (str = probe->dtpr_mod; *str != '\0'; str++)
5906 c[i++] = *str;
5907 c[i++] = ':';
5908
5909 for (str = probe->dtpr_func; *str != '\0'; str++)
5910 c[i++] = *str;
5911 c[i++] = ':';
5912
5913 for (str = probe->dtpr_name; *str != '\0'; str++)
5914 c[i++] = *str;
5915
5916 while (*ecbmsg != '\0')
5917 c[i++] = *ecbmsg++;
5918
5919 while (shift >= 0) {
5920 mask = (uintptr_t)0xf << shift;
5921
5922 if (val >= ((uintptr_t)1 << shift))
5923 c[i++] = "0123456789abcdef"[(val & mask) >> shift];
5924 shift -= 4;
5925 }
5926
5927 c[i++] = ')';
5928 c[i] = '\0';
5929
5930 debug_enter(c);
5931}
5932
5933static void
5934dtrace_action_panic(dtrace_ecb_t *ecb)
5935{
5936 dtrace_probe_t *probe = ecb->dte_probe;
5937
5938 /*
5939 * It's impossible to be taking action on the NULL probe.
5940 */
5941 ASSERT(probe != NULL);
5942
5943 if (dtrace_destructive_disallow)
5944 return;
5945
5946 if (dtrace_panicked != NULL)
5947 return;
5948
5949 if (dtrace_casptr(&dtrace_panicked, NULL, current_thread()) != NULL)
5950 return;
5951
5952 /*
5953 * We won the right to panic. (We want to be sure that only one
5954 * thread calls panic() from dtrace_probe(), and that panic() is
5955 * called exactly once.)
5956 */
5957 panic("dtrace: panic action at probe %s:%s:%s:%s (ecb %p)",
5958 probe->dtpr_provider->dtpv_name, probe->dtpr_mod,
5959 probe->dtpr_func, probe->dtpr_name, (void *)ecb);
5960
5961 /*
5962 * APPLE NOTE: this was for an old Mac OS X debug feature
5963 * allowing a return from panic(). Revisit someday.
5964 */
5965 dtrace_panicked = NULL;
5966}
5967
5968static void
5969dtrace_action_raise(uint64_t sig)
5970{
5971 if (dtrace_destructive_disallow)
5972 return;
5973
5974 if (sig >= NSIG) {
5975 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5976 return;
5977 }
5978
5979 /*
5980 * raise() has a queue depth of 1 -- we ignore all subsequent
5981 * invocations of the raise() action.
5982 */
5983
5984 uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
5985
5986 if (uthread && uthread->t_dtrace_sig == 0) {
5987 uthread->t_dtrace_sig = sig;
5988 act_set_astbsd(current_thread());
5989 }
5990}
5991
5992static void
5993dtrace_action_stop(void)
5994{
5995 if (dtrace_destructive_disallow)
5996 return;
5997
5998 uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
5999 if (uthread) {
6000 /*
		 * The currently running process will be task_suspend()ed
		 * when it next leaves the kernel.
6003 */
6004 uthread->t_dtrace_stop = 1;
6005 act_set_astbsd(current_thread());
6006 }
6007}
6008
6009
6010/*
6011 * APPLE NOTE: pidresume works in conjunction with the dtrace stop action.
6012 * Both activate only when the currently running process next leaves the
6013 * kernel.
6014 */
6015static void
6016dtrace_action_pidresume(uint64_t pid)
6017{
6018 if (dtrace_destructive_disallow)
6019 return;
6020
6021 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
6022 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
6023 return;
6024 }
6025 uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
6026
6027 /*
6028 * When the currently running process leaves the kernel, it attempts to
6029 * task_resume the process (denoted by pid), if that pid appears to have
6030 * been stopped by dtrace_action_stop().
6031 * The currently running process has a pidresume() queue depth of 1 --
6032 * subsequent invocations of the pidresume() action are ignored.
6033 */
6034
6035 if (pid != 0 && uthread && uthread->t_dtrace_resumepid == 0) {
6036 uthread->t_dtrace_resumepid = pid;
6037 act_set_astbsd(current_thread());
6038 }
6039}
6040
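/*
 * The chill() action spins on-CPU for the requested number of nanoseconds.
 * To bound the damage, the total time chilled is capped per interval (at
 * most dtrace_chill_max per dtrace_chill_interval); requests that would
 * exceed the cap -- or overflow the accounting -- fail with CPU_DTRACE_ILLOP.
 */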
6041static void
6042dtrace_action_chill(dtrace_mstate_t *mstate, hrtime_t val)
6043{
6044 hrtime_t now;
6045 volatile uint16_t *flags;
6046 dtrace_cpu_t *cpu = CPU;
6047
6048 if (dtrace_destructive_disallow)
6049 return;
6050
6051 flags = (volatile uint16_t *)&cpu_core[cpu->cpu_id].cpuc_dtrace_flags;
6052
6053 now = dtrace_gethrtime();
6054
6055 if (now - cpu->cpu_dtrace_chillmark > dtrace_chill_interval) {
6056 /*
6057 * We need to advance the mark to the current time.
6058 */
6059 cpu->cpu_dtrace_chillmark = now;
6060 cpu->cpu_dtrace_chilled = 0;
6061 }
6062
6063 /*
6064 * Now check to see if the requested chill time would take us over
6065 * the maximum amount of time allowed in the chill interval. (Or
6066 * worse, if the calculation itself induces overflow.)
6067 */
6068 if (cpu->cpu_dtrace_chilled + val > dtrace_chill_max ||
6069 cpu->cpu_dtrace_chilled + val < cpu->cpu_dtrace_chilled) {
6070 *flags |= CPU_DTRACE_ILLOP;
6071 return;
6072 }
6073
6074 while (dtrace_gethrtime() - now < val)
6075 continue;
6076
6077 /*
6078 * Normally, we assure that the value of the variable "timestamp" does
6079 * not change within an ECB. The presence of chill() represents an
6080 * exception to this rule, however.
6081 */
6082 mstate->dtms_present &= ~DTRACE_MSTATE_TIMESTAMP;
6083 cpu->cpu_dtrace_chilled += val;
6084}
6085
6086static void
6087dtrace_action_ustack(dtrace_mstate_t *mstate, dtrace_state_t *state,
6088 uint64_t *buf, uint64_t arg)
6089{
6090 int nframes = DTRACE_USTACK_NFRAMES(arg);
6091 int strsize = DTRACE_USTACK_STRSIZE(arg);
6092 uint64_t *pcs = &buf[1], *fps;
6093 char *str = (char *)&pcs[nframes];
6094 int size, offs = 0, i, j;
6095 uintptr_t old = mstate->dtms_scratch_ptr, saved;
6096 uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
6097 char *sym;
6098
6099 /*
6100 * Should be taking a faster path if string space has not been
6101 * allocated.
6102 */
6103 ASSERT(strsize != 0);
6104
6105 /*
6106 * We will first allocate some temporary space for the frame pointers.
6107 */
6108 fps = (uint64_t *)P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
6109 size = (uintptr_t)fps - mstate->dtms_scratch_ptr +
6110 (nframes * sizeof (uint64_t));
6111
6112 if (!DTRACE_INSCRATCH(mstate, (uintptr_t)size)) {
6113 /*
6114 * Not enough room for our frame pointers -- need to indicate
6115 * that we ran out of scratch space.
6116 */
6117 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
6118 return;
6119 }
6120
6121 mstate->dtms_scratch_ptr += size;
6122 saved = mstate->dtms_scratch_ptr;
6123
6124 /*
6125 * Now get a stack with both program counters and frame pointers.
6126 */
6127 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6128 dtrace_getufpstack(buf, fps, nframes + 1);
6129 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6130
6131 /*
6132 * If that faulted, we're cooked.
6133 */
6134 if (*flags & CPU_DTRACE_FAULT)
6135 goto out;
6136
6137 /*
6138 * Now we want to walk up the stack, calling the USTACK helper. For
6139 * each iteration, we restore the scratch pointer.
6140 */
6141 for (i = 0; i < nframes; i++) {
6142 mstate->dtms_scratch_ptr = saved;
6143
6144 if (offs >= strsize)
6145 break;
6146
6147 sym = (char *)(uintptr_t)dtrace_helper(
6148 DTRACE_HELPER_ACTION_USTACK,
6149 mstate, state, pcs[i], fps[i]);
6150
6151 /*
6152 * If we faulted while running the helper, we're going to
6153 * clear the fault and null out the corresponding string.
6154 */
6155 if (*flags & CPU_DTRACE_FAULT) {
6156 *flags &= ~CPU_DTRACE_FAULT;
6157 str[offs++] = '\0';
6158 continue;
6159 }
6160
6161 if (sym == NULL) {
6162 str[offs++] = '\0';
6163 continue;
6164 }
6165
6166 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6167
6168 /*
6169 * Now copy in the string that the helper returned to us.
6170 */
6171 for (j = 0; offs + j < strsize; j++) {
6172 if ((str[offs + j] = sym[j]) == '\0')
6173 break;
6174 }
6175
6176 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6177
6178 offs += j + 1;
6179 }
6180
6181 if (offs >= strsize) {
6182 /*
6183 * If we didn't have room for all of the strings, we don't
6184 * abort processing -- this needn't be a fatal error -- but we
6185 * still want to increment a counter (dts_stkstroverflows) to
6186 * allow this condition to be warned about. (If this is from
6187 * a jstack() action, it is easily tuned via jstackstrsize.)
6188 */
6189 dtrace_error(&state->dts_stkstroverflows);
6190 }
6191
6192 while (offs < strsize)
6193 str[offs++] = '\0';
6194
6195out:
6196 mstate->dtms_scratch_ptr = old;
6197}
6198
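/*
 * Copy a by-reference value -- kernel (DIF_TF_BYREF) or user (DIF_TF_BYUREF)
 * -- into the principal buffer one byte at a time. For string types, loading
 * stops at the terminating NUL, after which the remainder of the record is
 * zero-filled (or, if we're storing into a tuple, the copy stops early).
 */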
6199static void
6200dtrace_store_by_ref(dtrace_difo_t *dp, caddr_t tomax, size_t size,
6201 size_t *valoffsp, uint64_t *valp, uint64_t end, int intuple, int dtkind)
6202{
6203 volatile uint16_t *flags;
6204 uint64_t val = *valp;
6205 size_t valoffs = *valoffsp;
6206
6207 flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
6208 ASSERT(dtkind == DIF_TF_BYREF || dtkind == DIF_TF_BYUREF);
6209
6210 /*
6211 * If this is a string, we're going to only load until we find the zero
6212 * byte -- after which we'll store zero bytes.
6213 */
6214 if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
6215 char c = '\0' + 1;
6216 size_t s;
6217
6218 for (s = 0; s < size; s++) {
6219 if (c != '\0' && dtkind == DIF_TF_BYREF) {
6220 c = dtrace_load8(val++);
6221 } else if (c != '\0' && dtkind == DIF_TF_BYUREF) {
6222 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6223 c = dtrace_fuword8((user_addr_t)(uintptr_t)val++);
6224 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6225 if (*flags & CPU_DTRACE_FAULT)
6226 break;
6227 }
6228
6229 DTRACE_STORE(uint8_t, tomax, valoffs++, c);
6230
6231 if (c == '\0' && intuple)
6232 break;
6233 }
6234 } else {
6235 uint8_t c;
6236 while (valoffs < end) {
6237 if (dtkind == DIF_TF_BYREF) {
6238 c = dtrace_load8(val++);
6239 } else if (dtkind == DIF_TF_BYUREF) {
6240 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6241 c = dtrace_fuword8((user_addr_t)(uintptr_t)val++);
6242 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6243 if (*flags & CPU_DTRACE_FAULT)
6244 break;
6245 }
6246
6247 DTRACE_STORE(uint8_t, tomax,
6248 valoffs++, c);
6249 }
6250 }
6251
6252 *valp = val;
6253 *valoffsp = valoffs;
6254}
6255
6256/*
6257 * If you're looking for the epicenter of DTrace, you just found it. This
6258 * is the function called by the provider to fire a probe -- from which all
6259 * subsequent probe-context DTrace activity emanates.
6260 */
6261static void
6262__dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1,
6263 uint64_t arg2, uint64_t arg3, uint64_t arg4)
6264{
6265 processorid_t cpuid;
6266 dtrace_icookie_t cookie;
6267 dtrace_probe_t *probe;
6268 dtrace_mstate_t mstate;
6269 dtrace_ecb_t *ecb;
6270 dtrace_action_t *act;
6271 intptr_t offs;
6272 size_t size;
6273 int vtime, onintr;
6274 volatile uint16_t *flags;
6275 hrtime_t now;
6276
6277 cookie = dtrace_interrupt_disable();
6278 probe = dtrace_probes[id - 1];
6279 cpuid = CPU->cpu_id;
6280 onintr = CPU_ON_INTR(CPU);
6281
6282 if (!onintr && probe->dtpr_predcache != DTRACE_CACHEIDNONE &&
6283 probe->dtpr_predcache == dtrace_get_thread_predcache(current_thread())) {
6284 /*
6285 * We have hit in the predicate cache; we know that
6286 * this predicate would evaluate to be false.
6287 */
6288 dtrace_interrupt_enable(cookie);
6289 return;
6290 }
6291
6292 if (panic_quiesce) {
6293 /*
6294 * We don't trace anything if we're panicking.
6295 */
6296 dtrace_interrupt_enable(cookie);
6297 return;
6298 }
6299
6300#if !defined(__APPLE__)
6301 now = dtrace_gethrtime();
6302 vtime = dtrace_vtime_references != 0;
6303
6304 if (vtime && curthread->t_dtrace_start)
6305 curthread->t_dtrace_vtime += now - curthread->t_dtrace_start;
6306#else
6307 /*
6308 * APPLE NOTE: The time spent entering DTrace and arriving
	 * at this point is attributed to the current thread.
6310 * Instead it should accrue to DTrace. FIXME
6311 */
6312 vtime = dtrace_vtime_references != 0;
6313
	if (vtime) {
6316 int64_t dtrace_accum_time, recent_vtime;
6317 thread_t thread = current_thread();
6318
6319 dtrace_accum_time = dtrace_get_thread_tracing(thread); /* Time spent inside DTrace so far (nanoseconds) */
6320
6321 if (dtrace_accum_time >= 0) {
6322 recent_vtime = dtrace_abs_to_nano(dtrace_calc_thread_recent_vtime(thread)); /* up to the moment thread vtime */
6323
6324 recent_vtime = recent_vtime - dtrace_accum_time; /* Time without DTrace contribution */
6325
6326 dtrace_set_thread_vtime(thread, recent_vtime);
6327 }
6328 }
6329
6330 now = dtrace_gethrtime(); /* must not precede dtrace_calc_thread_recent_vtime() call! */
6331#endif /* __APPLE__ */
6332
6333 /*
6334 * APPLE NOTE: A provider may call dtrace_probe_error() in lieu of
6335 * dtrace_probe() in some circumstances. See, e.g. fasttrap_isa.c.
6336 * However the provider has no access to ECB context, so passes
6337 * 0 through "arg0" and the probe_id of the overridden probe as arg1.
6338 * Detect that here and cons up a viable state (from the probe_id).
6339 */
6340 if (dtrace_probeid_error == id && 0 == arg0) {
6341 dtrace_id_t ftp_id = (dtrace_id_t)arg1;
6342 dtrace_probe_t *ftp_probe = dtrace_probes[ftp_id - 1];
6343 dtrace_ecb_t *ftp_ecb = ftp_probe->dtpr_ecb;
6344
6345 if (NULL != ftp_ecb) {
6346 dtrace_state_t *ftp_state = ftp_ecb->dte_state;
6347
6348 arg0 = (uint64_t)(uintptr_t)ftp_state;
6349 arg1 = ftp_ecb->dte_epid;
6350 /*
6351 * args[2-4] established by caller.
6352 */
6353 ftp_state->dts_arg_error_illval = -1; /* arg5 */
6354 }
6355 }
6356
6357 mstate.dtms_difo = NULL;
6358 mstate.dtms_probe = probe;
6359 mstate.dtms_strtok = 0;
6360 mstate.dtms_arg[0] = arg0;
6361 mstate.dtms_arg[1] = arg1;
6362 mstate.dtms_arg[2] = arg2;
6363 mstate.dtms_arg[3] = arg3;
6364 mstate.dtms_arg[4] = arg4;
6365
6366 flags = (volatile uint16_t *)&cpu_core[cpuid].cpuc_dtrace_flags;
6367
6368 for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
6369 dtrace_predicate_t *pred = ecb->dte_predicate;
6370 dtrace_state_t *state = ecb->dte_state;
6371 dtrace_buffer_t *buf = &state->dts_buffer[cpuid];
6372 dtrace_buffer_t *aggbuf = &state->dts_aggbuffer[cpuid];
6373 dtrace_vstate_t *vstate = &state->dts_vstate;
6374 dtrace_provider_t *prov = probe->dtpr_provider;
6375 uint64_t tracememsize = 0;
6376 int committed = 0;
6377 caddr_t tomax;
6378
6379 /*
6380 * A little subtlety with the following (seemingly innocuous)
6381 * declaration of the automatic 'val': by looking at the
6382 * code, you might think that it could be declared in the
6383 * action processing loop, below. (That is, it's only used in
6384 * the action processing loop.) However, it must be declared
6385 * out of that scope because in the case of DIF expression
6386 * arguments to aggregating actions, one iteration of the
6387 * action loop will use the last iteration's value.
6388 */
		uint64_t val = 0;
6394
6395 mstate.dtms_present = DTRACE_MSTATE_ARGS | DTRACE_MSTATE_PROBE;
6396 *flags &= ~CPU_DTRACE_ERROR;
6397
6398 if (prov == dtrace_provider) {
6399 /*
6400 * If dtrace itself is the provider of this probe,
6401 * we're only going to continue processing the ECB if
6402 * arg0 (the dtrace_state_t) is equal to the ECB's
6403 * creating state. (This prevents disjoint consumers
6404 * from seeing one another's metaprobes.)
6405 */
6406 if (arg0 != (uint64_t)(uintptr_t)state)
6407 continue;
6408 }
6409
6410 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE) {
6411 /*
6412 * We're not currently active. If our provider isn't
6413 * the dtrace pseudo provider, we're not interested.
6414 */
6415 if (prov != dtrace_provider)
6416 continue;
6417
6418 /*
6419 * Now we must further check if we are in the BEGIN
6420 * probe. If we are, we will only continue processing
6421 * if we're still in WARMUP -- if one BEGIN enabling
6422 * has invoked the exit() action, we don't want to
6423 * evaluate subsequent BEGIN enablings.
6424 */
6425 if (probe->dtpr_id == dtrace_probeid_begin &&
6426 state->dts_activity != DTRACE_ACTIVITY_WARMUP) {
6427 ASSERT(state->dts_activity ==
6428 DTRACE_ACTIVITY_DRAINING);
6429 continue;
6430 }
6431 }
6432
6433 if (ecb->dte_cond) {
6434 /*
6435 * If the dte_cond bits indicate that this
6436 * consumer is only allowed to see user-mode firings
6437 * of this probe, call the provider's dtps_usermode()
6438 * entry point to check that the probe was fired
6439 * while in a user context. Skip this ECB if that's
6440 * not the case.
6441 */
6442 if ((ecb->dte_cond & DTRACE_COND_USERMODE) &&
6443 prov->dtpv_pops.dtps_usermode &&
6444 prov->dtpv_pops.dtps_usermode(prov->dtpv_arg,
6445 probe->dtpr_id, probe->dtpr_arg) == 0)
6446 continue;
6447
6448 /*
6449 * This is more subtle than it looks. We have to be
6450 * absolutely certain that CRED() isn't going to
6451 * change out from under us so it's only legit to
6452 * examine that structure if we're in constrained
			 * situations. Currently, the only time we'll perform
			 * this check is when a non-super-user has enabled the
6455 * profile or syscall providers -- providers that
6456 * allow visibility of all processes. For the
6457 * profile case, the check above will ensure that
6458 * we're examining a user context.
6459 */
6460 if (ecb->dte_cond & DTRACE_COND_OWNER) {
6461 cred_t *cr;
6462 cred_t *s_cr =
6463 ecb->dte_state->dts_cred.dcr_cred;
6464 proc_t *proc;
6465#pragma unused(proc) /* __APPLE__ */
6466
6467 ASSERT(s_cr != NULL);
6468
6469 /*
6470 * XXX this is hackish, but so is setting a variable
6471 * XXX in a McCarthy OR...
6472 */
6473 if ((cr = dtrace_CRED()) == NULL ||
6474 posix_cred_get(s_cr)->cr_uid != posix_cred_get(cr)->cr_uid ||
6475 posix_cred_get(s_cr)->cr_uid != posix_cred_get(cr)->cr_ruid ||
6476 posix_cred_get(s_cr)->cr_uid != posix_cred_get(cr)->cr_suid ||
6477 posix_cred_get(s_cr)->cr_gid != posix_cred_get(cr)->cr_gid ||
6478 posix_cred_get(s_cr)->cr_gid != posix_cred_get(cr)->cr_rgid ||
6479 posix_cred_get(s_cr)->cr_gid != posix_cred_get(cr)->cr_sgid ||
6480#if !defined(__APPLE__)
6481 (proc = ttoproc(curthread)) == NULL ||
6482 (proc->p_flag & SNOCD))
6483#else
6484 1) /* APPLE NOTE: Darwin omits "No Core Dump" flag */
6485#endif /* __APPLE__ */
6486 continue;
6487 }
6488
6489 if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
6490 cred_t *cr;
6491 cred_t *s_cr =
6492 ecb->dte_state->dts_cred.dcr_cred;
6493#pragma unused(cr, s_cr) /* __APPLE__ */
6494
6495 ASSERT(s_cr != NULL);
6496
6497#if !defined(__APPLE__)
6498 if ((cr = CRED()) == NULL ||
6499 s_cr->cr_zone->zone_id !=
6500 cr->cr_zone->zone_id)
6501 continue;
6502#else
6503 /* APPLE NOTE: Darwin doesn't do zones. */
6504#endif /* __APPLE__ */
6505 }
6506 }
6507
6508 if (now - state->dts_alive > dtrace_deadman_timeout) {
6509 /*
6510 * We seem to be dead. Unless we (a) have kernel
			 * destructive permissions, (b) have explicitly enabled
			 * destructive actions, and (c) destructive actions have
6513 * not been disabled, we're going to transition into
6514 * the KILLED state, from which no further processing
6515 * on this state will be performed.
6516 */
6517 if (!dtrace_priv_kernel_destructive(state) ||
6518 !state->dts_cred.dcr_destructive ||
6519 dtrace_destructive_disallow) {
6520 void *activity = &state->dts_activity;
6521 dtrace_activity_t current;
6522
6523 do {
6524 current = state->dts_activity;
6525 } while (dtrace_cas32(activity, current,
6526 DTRACE_ACTIVITY_KILLED) != current);
6527
6528 continue;
6529 }
6530 }
6531
6532 if ((offs = dtrace_buffer_reserve(buf, ecb->dte_needed,
6533 ecb->dte_alignment, state, &mstate)) < 0)
6534 continue;
6535
6536 tomax = buf->dtb_tomax;
6537 ASSERT(tomax != NULL);
6538
6539 /*
6540 * Build and store the record header corresponding to the ECB.
6541 */
6542 if (ecb->dte_size != 0) {
6543 dtrace_rechdr_t dtrh;
6544
6545 if (!(mstate.dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
6546 mstate.dtms_timestamp = dtrace_gethrtime();
6547 mstate.dtms_present |= DTRACE_MSTATE_TIMESTAMP;
6548 }
6549
6550 ASSERT(ecb->dte_size >= sizeof(dtrace_rechdr_t));
6551
6552 dtrh.dtrh_epid = ecb->dte_epid;
6553 DTRACE_RECORD_STORE_TIMESTAMP(&dtrh, mstate.dtms_timestamp);
6554 DTRACE_STORE(dtrace_rechdr_t, tomax, offs, dtrh);
6555 }
6556
6557 mstate.dtms_epid = ecb->dte_epid;
6558 mstate.dtms_present |= DTRACE_MSTATE_EPID;
6559
6560 if (state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)
6561 mstate.dtms_access = DTRACE_ACCESS_KERNEL;
6562 else
6563 mstate.dtms_access = 0;
6564
6565 if (pred != NULL) {
6566 dtrace_difo_t *dp = pred->dtp_difo;
6567 uint64_t rval;
6568
6569 rval = dtrace_dif_emulate(dp, &mstate, vstate, state);
6570
6571 if (!(*flags & CPU_DTRACE_ERROR) && !rval) {
6572 dtrace_cacheid_t cid = probe->dtpr_predcache;
6573
6574 if (cid != DTRACE_CACHEIDNONE && !onintr) {
6575 /*
6576 * Update the predicate cache...
6577 */
6578 ASSERT(cid == pred->dtp_cacheid);
6579
6580 dtrace_set_thread_predcache(current_thread(), cid);
6581 }
6582
6583 continue;
6584 }
6585 }
6586
6587 for (act = ecb->dte_action; !(*flags & CPU_DTRACE_ERROR) &&
6588 act != NULL; act = act->dta_next) {
6589 size_t valoffs;
6590 dtrace_difo_t *dp;
6591 dtrace_recdesc_t *rec = &act->dta_rec;
6592
6593 size = rec->dtrd_size;
6594 valoffs = offs + rec->dtrd_offset;
6595
6596 if (DTRACEACT_ISAGG(act->dta_kind)) {
6597 uint64_t v = 0xbad;
6598 dtrace_aggregation_t *agg;
6599
6600 agg = (dtrace_aggregation_t *)act;
6601
6602 if ((dp = act->dta_difo) != NULL)
6603 v = dtrace_dif_emulate(dp,
6604 &mstate, vstate, state);
6605
6606 if (*flags & CPU_DTRACE_ERROR)
6607 continue;
6608
6609 /*
6610 * Note that we always pass the expression
6611 * value from the previous iteration of the
6612 * action loop. This value will only be used
6613 * if there is an expression argument to the
6614 * aggregating action, denoted by the
6615 * dtag_hasarg field.
6616 */
6617 dtrace_aggregate(agg, buf,
6618 offs, aggbuf, v, val);
6619 continue;
6620 }
6621
6622 switch (act->dta_kind) {
6623 case DTRACEACT_STOP:
6624 if (dtrace_priv_proc_destructive(state))
6625 dtrace_action_stop();
6626 continue;
6627
6628 case DTRACEACT_BREAKPOINT:
6629 if (dtrace_priv_kernel_destructive(state))
6630 dtrace_action_breakpoint(ecb);
6631 continue;
6632
6633 case DTRACEACT_PANIC:
6634 if (dtrace_priv_kernel_destructive(state))
6635 dtrace_action_panic(ecb);
6636 continue;
6637
6638 case DTRACEACT_STACK:
6639 if (!dtrace_priv_kernel(state))
6640 continue;
6641
6642 dtrace_getpcstack((pc_t *)(tomax + valoffs),
6643 size / sizeof (pc_t), probe->dtpr_aframes,
6644 DTRACE_ANCHORED(probe) ? NULL :
6645 (uint32_t *)(uintptr_t)arg0);
6646 continue;
6647
6648 case DTRACEACT_JSTACK:
6649 case DTRACEACT_USTACK:
6650 if (!dtrace_priv_proc(state))
6651 continue;
6652
6653 /*
6654 * See comment in DIF_VAR_PID.
6655 */
6656 if (DTRACE_ANCHORED(mstate.dtms_probe) &&
6657 CPU_ON_INTR(CPU)) {
6658 int depth = DTRACE_USTACK_NFRAMES(
6659 rec->dtrd_arg) + 1;
6660
6661 dtrace_bzero((void *)(tomax + valoffs),
6662 DTRACE_USTACK_STRSIZE(rec->dtrd_arg)
6663 + depth * sizeof (uint64_t));
6664
6665 continue;
6666 }
6667
6668 if (DTRACE_USTACK_STRSIZE(rec->dtrd_arg) != 0 &&
6669 curproc->p_dtrace_helpers != NULL) {
6670 /*
6671 * This is the slow path -- we have
6672 * allocated string space, and we're
6673 * getting the stack of a process that
6674 * has helpers. Call into a separate
6675 * routine to perform this processing.
6676 */
6677 dtrace_action_ustack(&mstate, state,
6678 (uint64_t *)(tomax + valoffs),
6679 rec->dtrd_arg);
6680 continue;
6681 }
6682
6683 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6684 dtrace_getupcstack((uint64_t *)
6685 (tomax + valoffs),
6686 DTRACE_USTACK_NFRAMES(rec->dtrd_arg) + 1);
6687 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6688 continue;
6689
6690 default:
6691 break;
6692 }
6693
6694 dp = act->dta_difo;
6695 ASSERT(dp != NULL);
6696
6697 val = dtrace_dif_emulate(dp, &mstate, vstate, state);
6698
6699 if (*flags & CPU_DTRACE_ERROR)
6700 continue;
6701
6702 switch (act->dta_kind) {
6703 case DTRACEACT_SPECULATE: {
6704 dtrace_rechdr_t *dtrh = NULL;
6705
6706 ASSERT(buf == &state->dts_buffer[cpuid]);
6707 buf = dtrace_speculation_buffer(state,
6708 cpuid, val);
6709
6710 if (buf == NULL) {
6711 *flags |= CPU_DTRACE_DROP;
6712 continue;
6713 }
6714
6715 offs = dtrace_buffer_reserve(buf,
6716 ecb->dte_needed, ecb->dte_alignment,
6717 state, NULL);
6718
6719 if (offs < 0) {
6720 *flags |= CPU_DTRACE_DROP;
6721 continue;
6722 }
6723
6724 tomax = buf->dtb_tomax;
6725 ASSERT(tomax != NULL);
6726
6727 if (ecb->dte_size == 0)
6728 continue;
6729
6730 ASSERT(ecb->dte_size >= sizeof(dtrace_rechdr_t));
6731 dtrh = ((void *)(tomax + offs));
6732 dtrh->dtrh_epid = ecb->dte_epid;
6733
6734 /*
6735 * When the speculation is committed, all of
6736 * the records in the speculative buffer will
6737 * have their timestamps set to the commit
6738 * time. Until then, it is set to a sentinel
				 * value, for debuggability.
6740 */
6741 DTRACE_RECORD_STORE_TIMESTAMP(dtrh, UINT64_MAX);
6742
6743 continue;
6744 }
6745
6746 case DTRACEACT_CHILL:
6747 if (dtrace_priv_kernel_destructive(state))
6748 dtrace_action_chill(&mstate, val);
6749 continue;
6750
6751 case DTRACEACT_RAISE:
6752 if (dtrace_priv_proc_destructive(state))
6753 dtrace_action_raise(val);
6754 continue;
6755
6756 case DTRACEACT_PIDRESUME: /* __APPLE__ */
6757 if (dtrace_priv_proc_destructive(state))
6758 dtrace_action_pidresume(val);
6759 continue;
6760
6761 case DTRACEACT_COMMIT:
6762 ASSERT(!committed);
6763
6764 /*
6765 * We need to commit our buffer state.
6766 */
6767 if (ecb->dte_size)
6768 buf->dtb_offset = offs + ecb->dte_size;
6769 buf = &state->dts_buffer[cpuid];
6770 dtrace_speculation_commit(state, cpuid, val);
6771 committed = 1;
6772 continue;
6773
6774 case DTRACEACT_DISCARD:
6775 dtrace_speculation_discard(state, cpuid, val);
6776 continue;
6777
6778 case DTRACEACT_DIFEXPR:
6779 case DTRACEACT_LIBACT:
6780 case DTRACEACT_PRINTF:
6781 case DTRACEACT_PRINTA:
6782 case DTRACEACT_SYSTEM:
6783 case DTRACEACT_FREOPEN:
6784 case DTRACEACT_APPLEBINARY: /* __APPLE__ */
6785 case DTRACEACT_TRACEMEM:
6786 break;
6787
6788 case DTRACEACT_TRACEMEM_DYNSIZE:
6789 tracememsize = val;
6790 break;
6791
6792 case DTRACEACT_SYM:
6793 case DTRACEACT_MOD:
6794 if (!dtrace_priv_kernel(state))
6795 continue;
6796 break;
6797
6798 case DTRACEACT_USYM:
6799 case DTRACEACT_UMOD:
6800 case DTRACEACT_UADDR: {
6801 if (!dtrace_priv_proc(state))
6802 continue;
6803
6804 DTRACE_STORE(uint64_t, tomax,
6805 valoffs, (uint64_t)dtrace_proc_selfpid());
6806 DTRACE_STORE(uint64_t, tomax,
6807 valoffs + sizeof (uint64_t), val);
6808
6809 continue;
6810 }
6811
6812 case DTRACEACT_EXIT: {
6813 /*
6814 * For the exit action, we are going to attempt
6815 * to atomically set our activity to be
6816 * draining. If this fails (either because
6817 * another CPU has beat us to the exit action,
6818 * or because our current activity is something
6819 * other than ACTIVE or WARMUP), we will
6820 * continue. This assures that the exit action
6821 * can be successfully recorded at most once
6822 * when we're in the ACTIVE state. If we're
6823 * encountering the exit() action while in
6824 * COOLDOWN, however, we want to honor the new
6825 * status code. (We know that we're the only
6826 * thread in COOLDOWN, so there is no race.)
6827 */
6828 void *activity = &state->dts_activity;
6829 dtrace_activity_t current = state->dts_activity;
6830
6831 if (current == DTRACE_ACTIVITY_COOLDOWN)
6832 break;
6833
6834 if (current != DTRACE_ACTIVITY_WARMUP)
6835 current = DTRACE_ACTIVITY_ACTIVE;
6836
6837 if (dtrace_cas32(activity, current,
6838 DTRACE_ACTIVITY_DRAINING) != current) {
6839 *flags |= CPU_DTRACE_DROP;
6840 continue;
6841 }
6842
6843 break;
6844 }
6845
6846 default:
6847 ASSERT(0);
6848 }
6849
6850 if (dp->dtdo_rtype.dtdt_flags & (DIF_TF_BYREF | DIF_TF_BYUREF)) {
6851 uintptr_t end = valoffs + size;
6852
				if (tracememsize != 0 &&
				    valoffs + tracememsize < end) {
6856 end = valoffs + tracememsize;
6857 tracememsize = 0;
6858 }
6859
				if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF &&
				    !dtrace_vcanload((void *)(uintptr_t)val,
				    &dp->dtdo_rtype, NULL, &mstate, vstate)) {
6864 continue;
6865 }
6866
6867 dtrace_store_by_ref(dp, tomax, size, &valoffs,
6868 &val, end, act->dta_intuple,
6869 dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF ?
6870 DIF_TF_BYREF: DIF_TF_BYUREF);
6871
6872 continue;
6873 }
6874
6875 switch (size) {
6876 case 0:
6877 break;
6878
6879 case sizeof (uint8_t):
6880 DTRACE_STORE(uint8_t, tomax, valoffs, val);
6881 break;
6882 case sizeof (uint16_t):
6883 DTRACE_STORE(uint16_t, tomax, valoffs, val);
6884 break;
6885 case sizeof (uint32_t):
6886 DTRACE_STORE(uint32_t, tomax, valoffs, val);
6887 break;
6888 case sizeof (uint64_t):
6889 DTRACE_STORE(uint64_t, tomax, valoffs, val);
6890 break;
6891 default:
6892 /*
6893 * Any other size should have been returned by
6894 * reference, not by value.
6895 */
6896 ASSERT(0);
6897 break;
6898 }
6899 }
6900
6901 if (*flags & CPU_DTRACE_DROP)
6902 continue;
6903
6904 if (*flags & CPU_DTRACE_FAULT) {
6905 int ndx;
6906 dtrace_action_t *err;
6907
6908 buf->dtb_errors++;
6909
6910 if (probe->dtpr_id == dtrace_probeid_error) {
6911 /*
6912 * There's nothing we can do -- we had an
6913 * error on the error probe. We bump an
6914 * error counter to at least indicate that
6915 * this condition happened.
6916 */
6917 dtrace_error(&state->dts_dblerrors);
6918 continue;
6919 }
6920
6921 if (vtime) {
6922 /*
6923 * Before recursing on dtrace_probe(), we
6924 * need to explicitly clear out our start
6925 * time to prevent it from being accumulated
6926 * into t_dtrace_vtime.
6927 */
6928
6929 /*
6930 * Darwin sets the sign bit on t_dtrace_tracing
6931 * to suspend accumulation to it.
6932 */
6933 dtrace_set_thread_tracing(current_thread(),
6934 (1ULL<<63) | dtrace_get_thread_tracing(current_thread()));
6935
6936 }
6937
6938 /*
6939 * Iterate over the actions to figure out which action
6940 * we were processing when we experienced the error.
6941 * Note that act points _past_ the faulting action; if
6942 * act is ecb->dte_action, the fault was in the
6943 * predicate, if it's ecb->dte_action->dta_next it's
6944 * in action #1, and so on.
6945 */
6946 for (err = ecb->dte_action, ndx = 0;
6947 err != act; err = err->dta_next, ndx++)
6948 continue;
6949
6950 dtrace_probe_error(state, ecb->dte_epid, ndx,
6951 (mstate.dtms_present & DTRACE_MSTATE_FLTOFFS) ?
6952 mstate.dtms_fltoffs : -1, DTRACE_FLAGS2FLT(*flags),
6953 cpu_core[cpuid].cpuc_dtrace_illval);
6954
6955 continue;
6956 }
6957
6958 if (!committed)
6959 buf->dtb_offset = offs + ecb->dte_size;
6960 }
6961
	/*
	 * FIXME: On Darwin the time spent leaving DTrace from this point to
	 * the rti is attributed to the current thread. Instead it should
	 * accrue to DTrace.
	 */
6964 if (vtime) {
6965 thread_t thread = current_thread();
6966 int64_t t = dtrace_get_thread_tracing(thread);
6967
6968 if (t >= 0) {
6969 /* Usual case, accumulate time spent here into t_dtrace_tracing */
6970 dtrace_set_thread_tracing(thread, t + (dtrace_gethrtime() - now));
6971 } else {
6972 /* Return from error recursion. No accumulation, just clear the sign bit on t_dtrace_tracing. */
6973 dtrace_set_thread_tracing(thread, (~(1ULL<<63)) & t);
6974 }
6975 }
6976
6977 dtrace_interrupt_enable(cookie);
6978}
6979
6980/*
6981 * APPLE NOTE: Don't allow a thread to re-enter dtrace_probe().
6982 * This could occur if a probe is encountered on some function in the
6983 * transitive closure of the call to dtrace_probe().
6984 * Solaris has some strong guarantees that this won't happen.
6985 * The Darwin implementation is not so mature as to make those guarantees.
6986 * Hence, the introduction of __dtrace_probe() on xnu.
6987 */
6988
6989void
6990dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1,
6991 uint64_t arg2, uint64_t arg3, uint64_t arg4)
6992{
6993 thread_t thread = current_thread();
6994 disable_preemption();
6995 if (id == dtrace_probeid_error) {
6996 __dtrace_probe(id, arg0, arg1, arg2, arg3, arg4);
6997 dtrace_getipl(); /* Defeat tail-call optimization of __dtrace_probe() */
6998 } else if (!dtrace_get_thread_reentering(thread)) {
6999 dtrace_set_thread_reentering(thread, TRUE);
7000 __dtrace_probe(id, arg0, arg1, arg2, arg3, arg4);
7001 dtrace_set_thread_reentering(thread, FALSE);
7002 }
7003#if DEBUG
7004 else __dtrace_probe(dtrace_probeid_error, 0, id, 1, -1, DTRACEFLT_UNKNOWN);
7005#endif
7006 enable_preemption();
7007}
7008
7009/*
7010 * DTrace Probe Hashing Functions
7011 *
7012 * The functions in this section (and indeed, the functions in remaining
7013 * sections) are not _called_ from probe context. (Any exceptions to this are
7014 * marked with a "Note:".) Rather, they are called from elsewhere in the
 * DTrace framework to look up probes in, add probes to, and remove probes from
7016 * the DTrace probe hashes. (Each probe is hashed by each element of the
7017 * probe tuple -- allowing for fast lookups, regardless of what was
7018 * specified.)
7019 */
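/*
 * A variant of the classic PJW/ELF string hash: each byte is folded in and
 * the high nibble is XORed back into the low bits so that long strings
 * distribute well.
 */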
7020static uint_t
7021dtrace_hash_str(const char *p)
7022{
7023 unsigned int g;
7024 uint_t hval = 0;
7025
7026 while (*p) {
7027 hval = (hval << 4) + *p++;
7028 if ((g = (hval & 0xf0000000)) != 0)
7029 hval ^= g >> 24;
7030 hval &= ~g;
7031 }
7032 return (hval);
7033}
7034
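/*
 * The dtrace_strkey_* functions below are dth_getstr callbacks used by the
 * probe hashes to extract an element's string key: from its provider name,
 * from a string embedded at a fixed offset within the element, or by
 * dereferencing a string pointer stored at a fixed offset.
 */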
7035static const char*
7036dtrace_strkey_probe_provider(void *elm, uintptr_t offs)
7037{
7038#pragma unused(offs)
7039 dtrace_probe_t *probe = (dtrace_probe_t*)elm;
7040 return probe->dtpr_provider->dtpv_name;
7041}
7042
7043static const char*
7044dtrace_strkey_offset(void *elm, uintptr_t offs)
7045{
7046 return ((char *)((uintptr_t)(elm) + offs));
7047}
7048
7049static const char*
7050dtrace_strkey_deref_offset(void *elm, uintptr_t offs)
7051{
7052 return *((char **)((uintptr_t)(elm) + offs));
7053}
7054
7055static dtrace_hash_t *
7056dtrace_hash_create(dtrace_strkey_f func, uintptr_t arg, uintptr_t nextoffs, uintptr_t prevoffs)
7057{
7058 dtrace_hash_t *hash = kmem_zalloc(sizeof (dtrace_hash_t), KM_SLEEP);
7059
7060 hash->dth_getstr = func;
7061 hash->dth_stroffs = arg;
7062 hash->dth_nextoffs = nextoffs;
7063 hash->dth_prevoffs = prevoffs;
7064
7065 hash->dth_size = 1;
7066 hash->dth_mask = hash->dth_size - 1;
7067
7068 hash->dth_tab = kmem_zalloc(hash->dth_size *
7069 sizeof (dtrace_hashbucket_t *), KM_SLEEP);
7070
7071 return (hash);
7072}
7073
7074/*
7075 * APPLE NOTE: dtrace_hash_destroy is not used.
7076 * It is called by dtrace_detach which is not
7077 * currently implemented. Revisit someday.
7078 */
7079#if !defined(__APPLE__)
7080static void
7081dtrace_hash_destroy(dtrace_hash_t *hash)
7082{
7083#if DEBUG
7084 int i;
7085
7086 for (i = 0; i < hash->dth_size; i++)
7087 ASSERT(hash->dth_tab[i] == NULL);
7088#endif
7089
7090 kmem_free(hash->dth_tab,
7091 hash->dth_size * sizeof (dtrace_hashbucket_t *));
7092 kmem_free(hash, sizeof (dtrace_hash_t));
7093}
7094#endif /* __APPLE__ */
7095
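/*
 * Double the hash table size and redistribute the existing buckets. Only
 * the buckets move; the chain of identically-keyed elements hanging off
 * each bucket is carried along untouched.
 */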
7096static void
7097dtrace_hash_resize(dtrace_hash_t *hash)
7098{
7099 int size = hash->dth_size, i, ndx;
7100 int new_size = hash->dth_size << 1;
7101 int new_mask = new_size - 1;
7102 dtrace_hashbucket_t **new_tab, *bucket, *next;
7103
7104 ASSERT((new_size & new_mask) == 0);
7105
7106 new_tab = kmem_zalloc(new_size * sizeof (void *), KM_SLEEP);
7107
7108 for (i = 0; i < size; i++) {
7109 for (bucket = hash->dth_tab[i]; bucket != NULL; bucket = next) {
7110 void *elm = bucket->dthb_chain;
7111
7112 ASSERT(elm != NULL);
7113 ndx = DTRACE_HASHSTR(hash, elm) & new_mask;
7114
7115 next = bucket->dthb_next;
7116 bucket->dthb_next = new_tab[ndx];
7117 new_tab[ndx] = bucket;
7118 }
7119 }
7120
7121 kmem_free(hash->dth_tab, hash->dth_size * sizeof (void *));
7122 hash->dth_tab = new_tab;
7123 hash->dth_size = new_size;
7124 hash->dth_mask = new_mask;
7125}
7126
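/*
 * Insert an element, chaining it onto an existing bucket if one already
 * holds elements with an equal key. The table is resized once the number
 * of distinct keys (buckets) exceeds twice the table size.
 */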
7127static void
7128dtrace_hash_add(dtrace_hash_t *hash, void *new)
7129{
7130 int hashval = DTRACE_HASHSTR(hash, new);
7131 int ndx = hashval & hash->dth_mask;
7132 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7133 void **nextp, **prevp;
7134
7135 for (; bucket != NULL; bucket = bucket->dthb_next) {
7136 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, new))
7137 goto add;
7138 }
7139
7140 if ((hash->dth_nbuckets >> 1) > hash->dth_size) {
7141 dtrace_hash_resize(hash);
7142 dtrace_hash_add(hash, new);
7143 return;
7144 }
7145
7146 bucket = kmem_zalloc(sizeof (dtrace_hashbucket_t), KM_SLEEP);
7147 bucket->dthb_next = hash->dth_tab[ndx];
7148 hash->dth_tab[ndx] = bucket;
7149 hash->dth_nbuckets++;
7150
7151add:
7152 nextp = DTRACE_HASHNEXT(hash, new);
7153 ASSERT(*nextp == NULL && *(DTRACE_HASHPREV(hash, new)) == NULL);
7154 *nextp = bucket->dthb_chain;
7155
7156 if (bucket->dthb_chain != NULL) {
7157 prevp = DTRACE_HASHPREV(hash, bucket->dthb_chain);
7158 ASSERT(*prevp == NULL);
7159 *prevp = new;
7160 }
7161
7162 bucket->dthb_chain = new;
7163 bucket->dthb_len++;
7164}
7165
7166static void *
7167dtrace_hash_lookup_string(dtrace_hash_t *hash, const char *str)
7168{
7169 int hashval = dtrace_hash_str(str);
7170 int ndx = hashval & hash->dth_mask;
7171 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7172
7173 for (; bucket != NULL; bucket = bucket->dthb_next) {
7174 if (strcmp(str, DTRACE_GETSTR(hash, bucket->dthb_chain)) == 0)
7175 return (bucket->dthb_chain);
7176 }
7177
7178 return (NULL);
7179}
7180
7181static dtrace_probe_t *
7182dtrace_hash_lookup(dtrace_hash_t *hash, void *template)
7183{
7184 return dtrace_hash_lookup_string(hash, DTRACE_GETSTR(hash, template));
7185}
7186
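/*
 * Return the length of the chain sharing the template's key -- that is, the
 * number of elements that would have to be walked to search under this key.
 * dtrace_match() uses this to select the most selective hash table.
 */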
7187static int
7188dtrace_hash_collisions(dtrace_hash_t *hash, void *template)
7189{
7190 int hashval = DTRACE_HASHSTR(hash, template);
7191 int ndx = hashval & hash->dth_mask;
7192 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7193
7194 for (; bucket != NULL; bucket = bucket->dthb_next) {
7195 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
7196 return (bucket->dthb_len);
7197 }
7198
7199 return (0);
7200}
7201
7202static void
7203dtrace_hash_remove(dtrace_hash_t *hash, void *elm)
7204{
7205 int ndx = DTRACE_HASHSTR(hash, elm) & hash->dth_mask;
7206 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7207
7208 void **prevp = DTRACE_HASHPREV(hash, elm);
7209 void **nextp = DTRACE_HASHNEXT(hash, elm);
7210
7211 /*
7212 * Find the bucket that we're removing this elm from.
7213 */
7214 for (; bucket != NULL; bucket = bucket->dthb_next) {
7215 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, elm))
7216 break;
7217 }
7218
7219 ASSERT(bucket != NULL);
7220
7221 if (*prevp == NULL) {
7222 if (*nextp == NULL) {
7223 /*
7224 * The removed element was the only element on this
7225 * bucket; we need to remove the bucket.
7226 */
7227 dtrace_hashbucket_t *b = hash->dth_tab[ndx];
7228
7229 ASSERT(bucket->dthb_chain == elm);
7230 ASSERT(b != NULL);
7231
7232 if (b == bucket) {
7233 hash->dth_tab[ndx] = bucket->dthb_next;
7234 } else {
7235 while (b->dthb_next != bucket)
7236 b = b->dthb_next;
7237 b->dthb_next = bucket->dthb_next;
7238 }
7239
7240 ASSERT(hash->dth_nbuckets > 0);
7241 hash->dth_nbuckets--;
7242 kmem_free(bucket, sizeof (dtrace_hashbucket_t));
7243 return;
7244 }
7245
7246 bucket->dthb_chain = *nextp;
7247 } else {
7248 *(DTRACE_HASHNEXT(hash, *prevp)) = *nextp;
7249 }
7250
7251 if (*nextp != NULL)
7252 *(DTRACE_HASHPREV(hash, *nextp)) = *prevp;
7253}
7254
7255/*
7256 * DTrace Utility Functions
7257 *
7258 * These are random utility functions that are _not_ called from probe context.
7259 */
7260static int
7261dtrace_badattr(const dtrace_attribute_t *a)
7262{
7263 return (a->dtat_name > DTRACE_STABILITY_MAX ||
7264 a->dtat_data > DTRACE_STABILITY_MAX ||
7265 a->dtat_class > DTRACE_CLASS_MAX);
7266}
7267
7268/*
7269 * Returns a dtrace-managed copy of a string, and will
7270 * deduplicate copies of the same string.
 * If the specified string is NULL, returns an empty string.
7272 */
7273static char *
7274dtrace_strref(const char *str)
7275{
7276 dtrace_string_t *s = NULL;
7277 size_t bufsize = (str != NULL ? strlen(str) : 0) + 1;
7278
7279 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
7280
7281 if (str == NULL)
7282 str = "";
7283
7284 for (s = dtrace_hash_lookup_string(dtrace_strings, str); s != NULL;
7285 s = *(DTRACE_HASHNEXT(dtrace_strings, s))) {
7286 if (strncmp(str, s->dtst_str, bufsize) != 0) {
7287 continue;
7288 }
7289 ASSERT(s->dtst_refcount != UINT32_MAX);
7290 s->dtst_refcount++;
7291 return s->dtst_str;
7292 }
7293
7294 s = kmem_zalloc(sizeof(dtrace_string_t) + bufsize, KM_SLEEP);
7295 s->dtst_refcount = 1;
7296 (void) strlcpy(s->dtst_str, str, bufsize);
7297
7298 dtrace_hash_add(dtrace_strings, s);
7299
7300 return s->dtst_str;
7301}
7302
7303static void
7304dtrace_strunref(const char *str)
7305{
7306 ASSERT(str != NULL);
7307 dtrace_string_t *s = NULL;
7308 size_t bufsize = strlen(str) + 1;
7309
7310 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
7311
7312 for (s = dtrace_hash_lookup_string(dtrace_strings, str); s != NULL;
7313 s = *(DTRACE_HASHNEXT(dtrace_strings, s))) {
7314 if (strncmp(str, s->dtst_str, bufsize) != 0) {
7315 continue;
7316 }
7317 ASSERT(s->dtst_refcount != 0);
7318 s->dtst_refcount--;
7319 if (s->dtst_refcount == 0) {
7320 dtrace_hash_remove(dtrace_strings, s);
7321 kmem_free(s, sizeof(dtrace_string_t) + bufsize);
7322 }
7323 return;
7324 }
7325 panic("attempt to unref non-existent string %s", str);
7326}
7327
7328#define DTRACE_ISALPHA(c) \
7329 (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))
7330
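/*
 * A name is bad unless it begins with an alphabetic character, '-', '_' or
 * '.', and consists solely of alphanumerics plus '-', '_', '.' and '`'.
 * (NULL and empty names are not considered bad.)
 */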
7331static int
7332dtrace_badname(const char *s)
7333{
7334 char c;
7335
7336 if (s == NULL || (c = *s++) == '\0')
7337 return (0);
7338
7339 if (!DTRACE_ISALPHA(c) && c != '-' && c != '_' && c != '.')
7340 return (1);
7341
7342 while ((c = *s++) != '\0') {
7343 if (!DTRACE_ISALPHA(c) && (c < '0' || c > '9') &&
7344 c != '-' && c != '_' && c != '.' && c != '`')
7345 return (1);
7346 }
7347
7348 return (0);
7349}
7350
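/*
 * Derive the privilege bits, uid and zoneid used for probe matching from the
 * given credential. An all-privileged (or NULL) credential is granted
 * DTRACE_PRIV_ALL -- unless restrictions are in force, in which case it is
 * limited to user/proc/owner visibility.
 */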
7351static void
7352dtrace_cred2priv(cred_t *cr, uint32_t *privp, uid_t *uidp, zoneid_t *zoneidp)
7353{
7354 uint32_t priv;
7355
7356 if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
7357 if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed()) {
7358 priv = DTRACE_PRIV_USER | DTRACE_PRIV_PROC | DTRACE_PRIV_OWNER;
7359 }
7360 else {
7361 priv = DTRACE_PRIV_ALL;
7362 }
7363 *uidp = 0;
7364 *zoneidp = 0;
7365 } else {
7366 *uidp = crgetuid(cr);
7367 *zoneidp = crgetzoneid(cr);
7368
7369 priv = 0;
7370 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE))
7371 priv |= DTRACE_PRIV_KERNEL | DTRACE_PRIV_USER;
7372 else if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE))
7373 priv |= DTRACE_PRIV_USER;
7374 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE))
7375 priv |= DTRACE_PRIV_PROC;
7376 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
7377 priv |= DTRACE_PRIV_OWNER;
7378 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
7379 priv |= DTRACE_PRIV_ZONEOWNER;
7380 }
7381
7382 *privp = priv;
7383}
7384
7385#ifdef DTRACE_ERRDEBUG
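/*
 * Record a DIF error message in a small open-addressed hash table (linear
 * probing), bumping a per-message count; panics if the table fills up.
 */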
7386static void
7387dtrace_errdebug(const char *str)
7388{
7389 int hval = dtrace_hash_str(str) % DTRACE_ERRHASHSZ;
7390 int occupied = 0;
7391
7392 lck_mtx_lock(&dtrace_errlock);
7393 dtrace_errlast = str;
7394 dtrace_errthread = (kthread_t *)current_thread();
7395
7396 while (occupied++ < DTRACE_ERRHASHSZ) {
7397 if (dtrace_errhash[hval].dter_msg == str) {
7398 dtrace_errhash[hval].dter_count++;
7399 goto out;
7400 }
7401
7402 if (dtrace_errhash[hval].dter_msg != NULL) {
7403 hval = (hval + 1) % DTRACE_ERRHASHSZ;
7404 continue;
7405 }
7406
7407 dtrace_errhash[hval].dter_msg = str;
7408 dtrace_errhash[hval].dter_count = 1;
7409 goto out;
7410 }
7411
7412 panic("dtrace: undersized error hash");
7413out:
7414 lck_mtx_unlock(&dtrace_errlock);
7415}
7416#endif
7417
7418/*
7419 * DTrace Matching Functions
7420 *
7421 * These functions are used to match groups of probes, given some elements of
7422 * a probe tuple, or some globbed expressions for elements of a probe tuple.
7423 */
7424static int
7425dtrace_match_priv(const dtrace_probe_t *prp, uint32_t priv, uid_t uid,
7426 zoneid_t zoneid)
7427{
7428 if (priv != DTRACE_PRIV_ALL) {
7429 uint32_t ppriv = prp->dtpr_provider->dtpv_priv.dtpp_flags;
7430 uint32_t match = priv & ppriv;
7431
7432 /*
7433 * No PRIV_DTRACE_* privileges...
7434 */
7435 if ((priv & (DTRACE_PRIV_PROC | DTRACE_PRIV_USER |
7436 DTRACE_PRIV_KERNEL)) == 0)
7437 return (0);
7438
7439 /*
7440 * No matching bits, but there were bits to match...
7441 */
7442 if (match == 0 && ppriv != 0)
7443 return (0);
7444
7445 /*
7446 * Need to have permissions to the process, but don't...
7447 */
7448 if (((ppriv & ~match) & DTRACE_PRIV_OWNER) != 0 &&
7449 uid != prp->dtpr_provider->dtpv_priv.dtpp_uid) {
7450 return (0);
7451 }
7452
7453 /*
7454 * Need to be in the same zone unless we possess the
7455 * privilege to examine all zones.
7456 */
7457 if (((ppriv & ~match) & DTRACE_PRIV_ZONEOWNER) != 0 &&
7458 zoneid != prp->dtpr_provider->dtpv_priv.dtpp_zoneid) {
7459 return (0);
7460 }
7461 }
7462
7463 return (1);
7464}
7465
7466/*
7467 * dtrace_match_probe compares a dtrace_probe_t to a pre-compiled key, which
7468 * consists of input pattern strings and an ops-vector to evaluate them.
7469 * This function returns >0 for match, 0 for no match, and <0 for error.
7470 */
7471static int
7472dtrace_match_probe(const dtrace_probe_t *prp, const dtrace_probekey_t *pkp,
7473 uint32_t priv, uid_t uid, zoneid_t zoneid)
7474{
7475 dtrace_provider_t *pvp = prp->dtpr_provider;
7476 int rv;
7477
7478 if (pvp->dtpv_defunct)
7479 return (0);
7480
7481 if ((rv = pkp->dtpk_pmatch(pvp->dtpv_name, pkp->dtpk_prov, 0)) <= 0)
7482 return (rv);
7483
7484 if ((rv = pkp->dtpk_mmatch(prp->dtpr_mod, pkp->dtpk_mod, 0)) <= 0)
7485 return (rv);
7486
7487 if ((rv = pkp->dtpk_fmatch(prp->dtpr_func, pkp->dtpk_func, 0)) <= 0)
7488 return (rv);
7489
7490 if ((rv = pkp->dtpk_nmatch(prp->dtpr_name, pkp->dtpk_name, 0)) <= 0)
7491 return (rv);
7492
7493 if (dtrace_match_priv(prp, priv, uid, zoneid) == 0)
7494 return (0);
7495
7496 return (rv);
7497}
7498
7499/*
7500 * dtrace_match_glob() is a safe kernel implementation of the gmatch(3GEN)
7501 * interface for matching a glob pattern 'p' to an input string 's'. Unlike
7502 * libc's version, the kernel version only applies to 8-bit ASCII strings.
7503 * In addition, all of the recursion cases except for '*' matching have been
7504 * unwound. For '*', we still implement recursive evaluation, but a depth
7505 * counter is maintained and matching is aborted if we recurse too deep.
7506 * The function returns 0 if no match, >0 if match, and <0 if recursion error.
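 *
 * For example: dtrace_match_glob("read", "re?d", 0) and
 * dtrace_match_glob("read", "r*", 0) both return 1, while
 * dtrace_match_glob("read", "w*", 0) returns 0.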
7507 */
7508static int
7509dtrace_match_glob(const char *s, const char *p, int depth)
7510{
7511 const char *olds;
7512 char s1, c;
7513 int gs;
7514
7515 if (depth > DTRACE_PROBEKEY_MAXDEPTH)
7516 return (-1);
7517
7518 if (s == NULL)
7519 s = ""; /* treat NULL as empty string */
7520
7521top:
7522 olds = s;
7523 s1 = *s++;
7524
7525 if (p == NULL)
7526 return (0);
7527
7528 if ((c = *p++) == '\0')
7529 return (s1 == '\0');
7530
7531 switch (c) {
7532 case '[': {
7533 int ok = 0, notflag = 0;
7534 char lc = '\0';
7535
7536 if (s1 == '\0')
7537 return (0);
7538
7539 if (*p == '!') {
7540 notflag = 1;
7541 p++;
7542 }
7543
7544 if ((c = *p++) == '\0')
7545 return (0);
7546
7547 do {
7548 if (c == '-' && lc != '\0' && *p != ']') {
7549 if ((c = *p++) == '\0')
7550 return (0);
7551 if (c == '\\' && (c = *p++) == '\0')
7552 return (0);
7553
7554 if (notflag) {
7555 if (s1 < lc || s1 > c)
7556 ok++;
7557 else
7558 return (0);
7559 } else if (lc <= s1 && s1 <= c)
7560 ok++;
7561
7562 } else if (c == '\\' && (c = *p++) == '\0')
7563 return (0);
7564
7565 lc = c; /* save left-hand 'c' for next iteration */
7566
7567 if (notflag) {
7568 if (s1 != c)
7569 ok++;
7570 else
7571 return (0);
7572 } else if (s1 == c)
7573 ok++;
7574
7575 if ((c = *p++) == '\0')
7576 return (0);
7577
7578 } while (c != ']');
7579
7580 if (ok)
7581 goto top;
7582
7583 return (0);
7584 }
7585
7586 case '\\':
7587 if ((c = *p++) == '\0')
7588 return (0);
7589 /*FALLTHRU*/
7590
7591 default:
7592 if (c != s1)
7593 return (0);
7594 /*FALLTHRU*/
7595
7596 case '?':
7597 if (s1 != '\0')
7598 goto top;
7599 return (0);
7600
7601 case '*':
7602 while (*p == '*')
7603 p++; /* consecutive *'s are identical to a single one */
7604
7605 if (*p == '\0')
7606 return (1);
7607
7608 for (s = olds; *s != '\0'; s++) {
7609 if ((gs = dtrace_match_glob(s, p, depth + 1)) != 0)
7610 return (gs);
7611 }
7612
7613 return (0);
7614 }
7615}
7616
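/*
 * APPLE NOTE: probe and key strings are interned via dtrace_strref(), so an
 * exact string match can be tested with pointer equality alone.
 */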
7617/*ARGSUSED*/
7618static int
7619dtrace_match_string(const char *s, const char *p, int depth)
7620{
7621#pragma unused(depth) /* __APPLE__ */
7622 return (s != NULL && s == p);
7623}
7624
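/*
 * Match the pattern as a prefix of the module name, where the prefix must
 * end at a '.' boundary or at the end of the string -- e.g. the pattern
 * "com.apple.driver" matches the module "com.apple.driver.Foo".
 */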
7625/*ARGSUSED*/
7626static int
7627dtrace_match_module(const char *s, const char *p, int depth)
7628{
7629#pragma unused(depth) /* __APPLE__ */
7630 size_t len;
7631 if (s == NULL || p == NULL)
7632 return (0);
7633
7634 len = strlen(p);
7635
7636 if (strncmp(p, s, len) != 0)
7637 return (0);
7638
7639 if (s[len] == '.' || s[len] == '\0')
7640 return (1);
7641
7642 return (0);
7643}
7644
7645/*ARGSUSED*/
7646static int
7647dtrace_match_nul(const char *s, const char *p, int depth)
7648{
7649#pragma unused(s, p, depth) /* __APPLE__ */
7650 return (1); /* always match the empty pattern */
7651}
7652
7653/*ARGSUSED*/
7654static int
7655dtrace_match_nonzero(const char *s, const char *p, int depth)
7656{
7657#pragma unused(p, depth) /* __APPLE__ */
7658 return (s != NULL && s[0] != '\0');
7659}
7660
7661static int
7662dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid,
7663 zoneid_t zoneid, int (*matched)(dtrace_probe_t *, void *, void *), void *arg1, void *arg2)
7664{
7665 dtrace_probe_t *probe;
7666 dtrace_provider_t prov_template = {
7667 .dtpv_name = (char *)(uintptr_t)pkp->dtpk_prov
7668 };
7669
7670 dtrace_probe_t template = {
7671 .dtpr_provider = &prov_template,
7672 .dtpr_mod = (char *)(uintptr_t)pkp->dtpk_mod,
7673 .dtpr_func = (char *)(uintptr_t)pkp->dtpk_func,
7674 .dtpr_name = (char *)(uintptr_t)pkp->dtpk_name
7675 };
7676
7677 dtrace_hash_t *hash = NULL;
7678 int len, rc, best = INT_MAX, nmatched = 0;
7679 dtrace_id_t i;
7680
7681 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
7682
7683 /*
7684 * If the probe ID is specified in the key, just lookup by ID and
7685 * invoke the match callback once if a matching probe is found.
7686 */
7687 if (pkp->dtpk_id != DTRACE_IDNONE) {
7688 if ((probe = dtrace_probe_lookup_id(pkp->dtpk_id)) != NULL &&
7689 dtrace_match_probe(probe, pkp, priv, uid, zoneid) > 0) {
7690 if ((*matched)(probe, arg1, arg2) == DTRACE_MATCH_FAIL)
7691 return (DTRACE_MATCH_FAIL);
7692 nmatched++;
7693 }
7694 return (nmatched);
7695 }
7696
7697 /*
7698 * We want to find the most distinct of the provider name, module name,
7699 * function name, and name. So for each one that is not a glob
7700 * pattern or empty string, we perform a lookup in the corresponding
7701 * hash and use the hash table with the fewest collisions to do our
7702 * search.
7703 */
7704 if (pkp->dtpk_pmatch == &dtrace_match_string &&
7705 (len = dtrace_hash_collisions(dtrace_byprov, &template)) < best) {
7706 best = len;
7707 hash = dtrace_byprov;
7708 }
7709
7710 if (pkp->dtpk_mmatch == &dtrace_match_string &&
7711 (len = dtrace_hash_collisions(dtrace_bymod, &template)) < best) {
7712 best = len;
7713 hash = dtrace_bymod;
7714 }
7715
7716 if (pkp->dtpk_fmatch == &dtrace_match_string &&
7717 (len = dtrace_hash_collisions(dtrace_byfunc, &template)) < best) {
7718 best = len;
7719 hash = dtrace_byfunc;
7720 }
7721
7722 if (pkp->dtpk_nmatch == &dtrace_match_string &&
7723 (len = dtrace_hash_collisions(dtrace_byname, &template)) < best) {
7724 best = len;
7725 hash = dtrace_byname;
7726 }
7727
7728 /*
7729 * If we did not select a hash table, iterate over every probe and
7730 * invoke our callback for each one that matches our input probe key.
7731 */
7732 if (hash == NULL) {
7733 for (i = 0; i < (dtrace_id_t)dtrace_nprobes; i++) {
7734 if ((probe = dtrace_probes[i]) == NULL ||
7735 dtrace_match_probe(probe, pkp, priv, uid,
7736 zoneid) <= 0)
7737 continue;
7738
7739 nmatched++;
7740
7741 if ((rc = (*matched)(probe, arg1, arg2)) != DTRACE_MATCH_NEXT) {
7742 if (rc == DTRACE_MATCH_FAIL)
7743 return (DTRACE_MATCH_FAIL);
7744 break;
7745 }
7746 }
7747
7748 return (nmatched);
7749 }
7750
7751 /*
7752 * If we selected a hash table, iterate over each probe of the same key
7753 * name and invoke the callback for every probe that matches the other
7754 * attributes of our input probe key.
7755 */
7756 for (probe = dtrace_hash_lookup(hash, &template); probe != NULL;
7757 probe = *(DTRACE_HASHNEXT(hash, probe))) {
7758
7759 if (dtrace_match_probe(probe, pkp, priv, uid, zoneid) <= 0)
7760 continue;
7761
7762 nmatched++;
7763
7764 if ((rc = (*matched)(probe, arg1, arg2)) != DTRACE_MATCH_NEXT) {
7765 if (rc == DTRACE_MATCH_FAIL)
7766 return (DTRACE_MATCH_FAIL);
7767 break;
7768 }
7769 }
7770
7771 return (nmatched);
7772}
7773
7774/*
7775 * Return the function pointer dtrace_probecmp() should use to compare the
7776 * specified pattern with a string. For NULL or empty patterns, we select
7777 * dtrace_match_nul(). For glob pattern strings, we use dtrace_match_glob().
7778 * For non-empty non-glob strings, we use dtrace_match_string().
7779 */
7780static dtrace_probekey_f *
7781dtrace_probekey_func(const char *p)
7782{
7783 char c;
7784
7785 if (p == NULL || *p == '\0')
7786 return (&dtrace_match_nul);
7787
7788 while ((c = *p++) != '\0') {
7789 if (c == '[' || c == '?' || c == '*' || c == '\\')
7790 return (&dtrace_match_glob);
7791 }
7792
7793 return (&dtrace_match_string);
7794}
7795
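/*
 * As dtrace_probekey_func(), but for the module element of a probe key: if
 * the pattern is a plain string under which no probe is hashed, fall back to
 * prefix matching via dtrace_match_module().
 */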
7796static dtrace_probekey_f *
7797dtrace_probekey_module_func(const char *p)
7798{
7799 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
7800
7801 dtrace_probekey_f *f = dtrace_probekey_func(p);
7802 if (f == &dtrace_match_string) {
7803 dtrace_probe_t template = {
7804 .dtpr_mod = (char *)(uintptr_t)p,
7805 };
7806 if (dtrace_hash_lookup(dtrace_bymod, &template) == NULL) {
7807 return (&dtrace_match_module);
7808 }
7809 return (&dtrace_match_string);
7810 }
7811 return f;
7812}
7813
7814/*
7815 * Build a probe comparison key for use with dtrace_match_probe() from the
7816 * given probe description. By convention, a null key only matches anchored
7817 * probes: if each field is the empty string, reset dtpk_fmatch to
7818 * dtrace_match_nonzero().
7819 */
7820static void
7821dtrace_probekey(const dtrace_probedesc_t *pdp, dtrace_probekey_t *pkp)
7822{
7823
7824 pkp->dtpk_prov = dtrace_strref(pdp->dtpd_provider);
7825 pkp->dtpk_pmatch = dtrace_probekey_func(pdp->dtpd_provider);
7826
7827 pkp->dtpk_mod = dtrace_strref(pdp->dtpd_mod);
7828 pkp->dtpk_mmatch = dtrace_probekey_module_func(pdp->dtpd_mod);
7829
7830 pkp->dtpk_func = dtrace_strref(pdp->dtpd_func);
7831 pkp->dtpk_fmatch = dtrace_probekey_func(pdp->dtpd_func);
7832
7833 pkp->dtpk_name = dtrace_strref(pdp->dtpd_name);
7834 pkp->dtpk_nmatch = dtrace_probekey_func(pdp->dtpd_name);
7835
7836 pkp->dtpk_id = pdp->dtpd_id;
7837
7838 if (pkp->dtpk_id == DTRACE_IDNONE &&
7839 pkp->dtpk_pmatch == &dtrace_match_nul &&
7840 pkp->dtpk_mmatch == &dtrace_match_nul &&
7841 pkp->dtpk_fmatch == &dtrace_match_nul &&
7842 pkp->dtpk_nmatch == &dtrace_match_nul)
7843 pkp->dtpk_fmatch = &dtrace_match_nonzero;
7844}
7845
7846static void
7847dtrace_probekey_release(dtrace_probekey_t *pkp)
7848{
7849 dtrace_strunref(pkp->dtpk_prov);
7850 dtrace_strunref(pkp->dtpk_mod);
7851 dtrace_strunref(pkp->dtpk_func);
7852 dtrace_strunref(pkp->dtpk_name);
7853}
7854
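/*
 * Evaluate whether the provider name in the given probe description matches
 * the provider name passed in 'data', honoring any glob pattern in the
 * description. A NULL description matches unconditionally.
 */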
7855static int
7856dtrace_cond_provider_match(dtrace_probedesc_t *desc, void *data)
7857{
7858 if (desc == NULL)
7859 return 1;
7860
7861 dtrace_probekey_f *func = dtrace_probekey_func(desc->dtpd_provider);
7862
7863 return func((char*)data, desc->dtpd_provider, 0);
7864}
7865
7866/*
7867 * DTrace Provider-to-Framework API Functions
7868 *
7869 * These functions implement much of the Provider-to-Framework API, as
7870 * described in <sys/dtrace.h>. The parts of the API not in this section are
7871 * the functions in the API for probe management (found below), and
7872 * dtrace_probe() itself (found above).
7873 */
7874
7875/*
7876 * Register the calling provider with the DTrace framework. This should
7877 * generally be called by DTrace providers in their attach(9E) entry point.
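 *
 * A minimal sketch of a caller (the "foo" provider, foo_attr, foo_pops and
 * foo_attach() below are hypothetical, for illustration only):
 *
 *	static dtrace_provider_id_t foo_id;
 *
 *	static int
 *	foo_attach(void)
 *	{
 *		return (dtrace_register("foo", &foo_attr, DTRACE_PRIV_KERNEL,
 *		    NULL, &foo_pops, NULL, &foo_id));
 *	}
 *
 * The identifier returned through idp is later handed back to
 * dtrace_unregister() from the provider's detach path.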
7878 */
7879int
7880dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv,
7881 cred_t *cr, const dtrace_pops_t *pops, void *arg, dtrace_provider_id_t *idp)
7882{
7883 dtrace_provider_t *provider;
7884
7885 if (name == NULL || pap == NULL || pops == NULL || idp == NULL) {
7886 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7887 "arguments", name ? name : "<NULL>");
7888 return (EINVAL);
7889 }
7890
7891 if (name[0] == '\0' || dtrace_badname(name)) {
7892 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7893 "provider name", name);
7894 return (EINVAL);
7895 }
7896
7897 if ((pops->dtps_provide == NULL && pops->dtps_provide_module == NULL) ||
7898 pops->dtps_enable == NULL || pops->dtps_disable == NULL ||
7899 pops->dtps_destroy == NULL ||
7900 ((pops->dtps_resume == NULL) != (pops->dtps_suspend == NULL))) {
7901 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7902 "provider ops", name);
7903 return (EINVAL);
7904 }
7905
7906 if (dtrace_badattr(&pap->dtpa_provider) ||
7907 dtrace_badattr(&pap->dtpa_mod) ||
7908 dtrace_badattr(&pap->dtpa_func) ||
7909 dtrace_badattr(&pap->dtpa_name) ||
7910 dtrace_badattr(&pap->dtpa_args)) {
7911 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7912 "provider attributes", name);
7913 return (EINVAL);
7914 }
7915
7916 if (priv & ~DTRACE_PRIV_ALL) {
7917 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7918 "privilege attributes", name);
7919 return (EINVAL);
7920 }
7921
7922 if ((priv & DTRACE_PRIV_KERNEL) &&
7923 (priv & (DTRACE_PRIV_USER | DTRACE_PRIV_OWNER)) &&
7924 pops->dtps_usermode == NULL) {
7925 cmn_err(CE_WARN, "failed to register provider '%s': need "
7926 "dtps_usermode() op for given privilege attributes", name);
7927 return (EINVAL);
7928 }
7929
7930 provider = kmem_zalloc(sizeof (dtrace_provider_t), KM_SLEEP);
7931
7932 provider->dtpv_attr = *pap;
7933 provider->dtpv_priv.dtpp_flags = priv;
7934 if (cr != NULL) {
7935 provider->dtpv_priv.dtpp_uid = crgetuid(cr);
7936 provider->dtpv_priv.dtpp_zoneid = crgetzoneid(cr);
7937 }
7938 provider->dtpv_pops = *pops;
7939
7940 if (pops->dtps_provide == NULL) {
7941 ASSERT(pops->dtps_provide_module != NULL);
7942 provider->dtpv_pops.dtps_provide =
7943 (void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop;
7944 }
7945
7946 if (pops->dtps_provide_module == NULL) {
7947 ASSERT(pops->dtps_provide != NULL);
7948 provider->dtpv_pops.dtps_provide_module =
7949 (void (*)(void *, struct modctl *))dtrace_nullop;
7950 }
7951
7952 if (pops->dtps_suspend == NULL) {
7953 ASSERT(pops->dtps_resume == NULL);
7954 provider->dtpv_pops.dtps_suspend =
7955 (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
7956 provider->dtpv_pops.dtps_resume =
7957 (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
7958 }
7959
7960 provider->dtpv_arg = arg;
7961 *idp = (dtrace_provider_id_t)provider;
7962
7963 if (pops == &dtrace_provider_ops) {
7964 LCK_MTX_ASSERT(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
7965 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
7966
7967 provider->dtpv_name = dtrace_strref(name);
7968
7969 ASSERT(dtrace_anon.dta_enabling == NULL);
7970
7971 /*
7972 * We make sure that the DTrace provider is at the head of
7973 * the provider chain.
7974 */
7975 provider->dtpv_next = dtrace_provider;
7976 dtrace_provider = provider;
7977 return (0);
7978 }
7979
7980 lck_mtx_lock(&dtrace_provider_lock);
7981 lck_mtx_lock(&dtrace_lock);
7982
7983 provider->dtpv_name = dtrace_strref(name);
7984
7985 /*
7986 * If there is at least one provider registered, we'll add this
7987 * provider after the first provider.
7988 */
7989 if (dtrace_provider != NULL) {
7990 provider->dtpv_next = dtrace_provider->dtpv_next;
7991 dtrace_provider->dtpv_next = provider;
7992 } else {
7993 dtrace_provider = provider;
7994 }
7995
7996 if (dtrace_retained != NULL) {
7997 dtrace_enabling_provide(provider);
7998
		/*
		 * Now we need to call dtrace_enabling_matchall_with_cond(),
		 * with a condition matching the provider name we just added;
		 * it will acquire cpu_lock and dtrace_lock, so we must drop
		 * all of our locks before calling into it...
		 */
8005 lck_mtx_unlock(&dtrace_lock);
8006 lck_mtx_unlock(&dtrace_provider_lock);
8007
8008 dtrace_match_cond_t cond = {dtrace_cond_provider_match, provider->dtpv_name};
8009 dtrace_enabling_matchall_with_cond(&cond);
8010
8011 return (0);
8012 }
8013
8014 lck_mtx_unlock(&dtrace_lock);
8015 lck_mtx_unlock(&dtrace_provider_lock);
8016
8017 return (0);
8018}
8019
8020/*
8021 * Unregister the specified provider from the DTrace framework. This should
8022 * generally be called by DTrace providers in their detach(9E) entry point.
8023 */
8024int
8025dtrace_unregister(dtrace_provider_id_t id)
8026{
8027 dtrace_provider_t *old = (dtrace_provider_t *)id;
8028 dtrace_provider_t *prev = NULL;
8029 int self = 0;
8030 dtrace_probe_t *probe, *first = NULL, *next = NULL;
8031 dtrace_probe_t template = {
8032 .dtpr_provider = old
8033 };
8034
8035 if (old->dtpv_pops.dtps_enable ==
8036 (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop) {
8037 /*
8038 * If DTrace itself is the provider, we're called with locks
8039 * already held.
8040 */
8041 ASSERT(old == dtrace_provider);
8042 ASSERT(dtrace_devi != NULL);
8043 LCK_MTX_ASSERT(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
8044 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8045 self = 1;
8046
8047 if (dtrace_provider->dtpv_next != NULL) {
8048 /*
8049 * There's another provider here; return failure.
8050 */
8051 return (EBUSY);
8052 }
8053 } else {
8054 lck_mtx_lock(&dtrace_provider_lock);
8055 lck_mtx_lock(&mod_lock);
8056 lck_mtx_lock(&dtrace_lock);
8057 }
8058
8059 /*
8060 * If anyone has /dev/dtrace open, or if there are anonymous enabled
8061 * probes, we refuse to let providers slither away, unless this
8062 * provider has already been explicitly invalidated.
8063 */
8064 if (!old->dtpv_defunct &&
8065 (dtrace_opens || (dtrace_anon.dta_state != NULL &&
8066 dtrace_anon.dta_state->dts_necbs > 0))) {
8067 if (!self) {
8068 lck_mtx_unlock(&dtrace_lock);
8069 lck_mtx_unlock(&mod_lock);
8070 lck_mtx_unlock(&dtrace_provider_lock);
8071 }
8072 return (EBUSY);
8073 }
8074
8075 /*
8076 * Attempt to destroy the probes associated with this provider.
8077 */
	if (old->dtpv_ecb_count != 0) {
8079 /*
8080 * We have at least one ECB; we can't remove this provider.
8081 */
8082 if (!self) {
8083 lck_mtx_unlock(&dtrace_lock);
8084 lck_mtx_unlock(&mod_lock);
8085 lck_mtx_unlock(&dtrace_provider_lock);
8086 }
8087 return (EBUSY);
8088 }
8089
8090 /*
8091 * All of the probes for this provider are disabled; we can safely
8092 * remove all of them from their hash chains and from the probe array.
8093 */
8094 for (probe = dtrace_hash_lookup(dtrace_byprov, &template); probe != NULL;
8095 probe = *(DTRACE_HASHNEXT(dtrace_byprov, probe))) {
8096 if (probe->dtpr_provider != old)
8097 continue;
8098
8099 dtrace_probes[probe->dtpr_id - 1] = NULL;
8100 old->dtpv_probe_count--;
8101
8102 dtrace_hash_remove(dtrace_bymod, probe);
8103 dtrace_hash_remove(dtrace_byfunc, probe);
8104 dtrace_hash_remove(dtrace_byname, probe);
8105
8106 if (first == NULL) {
8107 first = probe;
8108 probe->dtpr_nextmod = NULL;
8109 } else {
8110 /*
8111 * Use nextmod as the chain of probes to remove
8112 */
8113 probe->dtpr_nextmod = first;
8114 first = probe;
8115 }
8116 }
8117
8118 for (probe = first; probe != NULL; probe = next) {
8119 next = probe->dtpr_nextmod;
8120 dtrace_hash_remove(dtrace_byprov, probe);
8121 }
8122
8123 /*
8124 * The provider's probes have been removed from the hash chains and
8125 * from the probe array. Now issue a dtrace_sync() to be sure that
8126 * everyone has cleared out from any probe array processing.
8127 */
8128 dtrace_sync();
8129
8130 for (probe = first; probe != NULL; probe = next) {
8131 next = probe->dtpr_nextmod;
8132
8133 old->dtpv_pops.dtps_destroy(old->dtpv_arg, probe->dtpr_id,
8134 probe->dtpr_arg);
8135 dtrace_strunref(probe->dtpr_mod);
8136 dtrace_strunref(probe->dtpr_func);
8137 dtrace_strunref(probe->dtpr_name);
8138 vmem_free(dtrace_arena, (void *)(uintptr_t)(probe->dtpr_id), 1);
8139 zfree(dtrace_probe_t_zone, probe);
8140 }
8141
8142 if ((prev = dtrace_provider) == old) {
8143 ASSERT(self || dtrace_devi == NULL);
8144 ASSERT(old->dtpv_next == NULL || dtrace_devi == NULL);
8145 dtrace_provider = old->dtpv_next;
8146 } else {
8147 while (prev != NULL && prev->dtpv_next != old)
8148 prev = prev->dtpv_next;
8149
8150 if (prev == NULL) {
8151 panic("attempt to unregister non-existent "
8152 "dtrace provider %p\n", (void *)id);
8153 }
8154
8155 prev->dtpv_next = old->dtpv_next;
8156 }
8157
8158 dtrace_strunref(old->dtpv_name);
8159
8160 if (!self) {
8161 lck_mtx_unlock(&dtrace_lock);
8162 lck_mtx_unlock(&mod_lock);
8163 lck_mtx_unlock(&dtrace_provider_lock);
8164 }
8165
8166 kmem_free(old, sizeof (dtrace_provider_t));
8167
8168 return (0);
8169}
8170
8171/*
8172 * Invalidate the specified provider. All subsequent probe lookups for the
8173 * specified provider will fail, but its probes will not be removed.
8174 */
8175void
8176dtrace_invalidate(dtrace_provider_id_t id)
8177{
8178 dtrace_provider_t *pvp = (dtrace_provider_t *)id;
8179
8180 ASSERT(pvp->dtpv_pops.dtps_enable !=
8181 (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop);
8182
8183 lck_mtx_lock(&dtrace_provider_lock);
8184 lck_mtx_lock(&dtrace_lock);
8185
8186 pvp->dtpv_defunct = 1;
8187
8188 lck_mtx_unlock(&dtrace_lock);
8189 lck_mtx_unlock(&dtrace_provider_lock);
8190}
8191
8192/*
8193 * Indicate whether or not DTrace has attached.
8194 */
8195int
8196dtrace_attached(void)
8197{
8198 /*
8199 * dtrace_provider will be non-NULL iff the DTrace driver has
8200 * attached. (It's non-NULL because DTrace is always itself a
8201 * provider.)
8202 */
8203 return (dtrace_provider != NULL);
8204}
8205
8206/*
8207 * Remove all the unenabled probes for the given provider. This function is
8208 * not unlike dtrace_unregister(), except that it doesn't remove the provider
8209 * -- just as many of its associated probes as it can.
8210 */
8211int
8212dtrace_condense(dtrace_provider_id_t id)
8213{
8214 dtrace_provider_t *prov = (dtrace_provider_t *)id;
8215 dtrace_probe_t *probe, *first = NULL;
8216 dtrace_probe_t template = {
8217 .dtpr_provider = prov
8218 };
8219
8220 /*
8221 * Make sure this isn't the dtrace provider itself.
8222 */
8223 ASSERT(prov->dtpv_pops.dtps_enable !=
8224 (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop);
8225
8226 lck_mtx_lock(&dtrace_provider_lock);
8227 lck_mtx_lock(&dtrace_lock);
8228
8229 /*
8230 * Attempt to destroy the probes associated with this provider.
8231 */
8232 for (probe = dtrace_hash_lookup(dtrace_byprov, &template); probe != NULL;
8233 probe = *(DTRACE_HASHNEXT(dtrace_byprov, probe))) {
8234
8235 if (probe->dtpr_provider != prov)
8236 continue;
8237
8238 if (probe->dtpr_ecb != NULL)
8239 continue;
8240
8241 dtrace_probes[probe->dtpr_id - 1] = NULL;
8242 prov->dtpv_probe_count--;
8243
8244 dtrace_hash_remove(dtrace_bymod, probe);
8245 dtrace_hash_remove(dtrace_byfunc, probe);
8246 dtrace_hash_remove(dtrace_byname, probe);
8247
8248 prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id,
8249 probe->dtpr_arg);
8250 dtrace_strunref(probe->dtpr_mod);
8251 dtrace_strunref(probe->dtpr_func);
8252 dtrace_strunref(probe->dtpr_name);
8253 if (first == NULL) {
8254 first = probe;
8255 probe->dtpr_nextmod = NULL;
8256 } else {
8257 /*
8258 * Use nextmod as the chain of probes to remove
8259 */
8260 probe->dtpr_nextmod = first;
8261 first = probe;
8262 }
8263 }
8264
8265 for (probe = first; probe != NULL; probe = first) {
8266 first = probe->dtpr_nextmod;
8267 dtrace_hash_remove(dtrace_byprov, probe);
8268 vmem_free(dtrace_arena, (void *)((uintptr_t)probe->dtpr_id), 1);
8269 zfree(dtrace_probe_t_zone, probe);
8270 }
8271
8272 lck_mtx_unlock(&dtrace_lock);
8273 lck_mtx_unlock(&dtrace_provider_lock);
8274
8275 return (0);
8276}
8277
8278/*
8279 * DTrace Probe Management Functions
8280 *
8281 * The functions in this section perform the DTrace probe management,
8282 * including functions to create probes, look-up probes, and call into the
8283 * providers to request that probes be provided. Some of these functions are
8284 * in the Provider-to-Framework API; these functions can be identified by the
8285 * fact that they are not declared "static".
8286 */
8287
8288/*
8289 * Create a probe with the specified module name, function name, and name.
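 *
 * For instance (hypothetical names), a provider registered as foo_id might
 * create an "entry" probe on function "foo_func" in module "foo_mod", with
 * no artificial frames and no probe-specific argument:
 *
 *	id = dtrace_probe_create(foo_id, "foo_mod", "foo_func", "entry",
 *	    0, NULL);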
8290 */
8291dtrace_id_t
8292dtrace_probe_create(dtrace_provider_id_t prov, const char *mod,
8293 const char *func, const char *name, int aframes, void *arg)
8294{
8295 dtrace_probe_t *probe, **probes;
8296 dtrace_provider_t *provider = (dtrace_provider_t *)prov;
8297 dtrace_id_t id;
8298
8299 if (provider == dtrace_provider) {
8300 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8301 } else {
8302 lck_mtx_lock(&dtrace_lock);
8303 }
8304
8305 id = (dtrace_id_t)(uintptr_t)vmem_alloc(dtrace_arena, 1,
8306 VM_BESTFIT | VM_SLEEP);
8307
8308 probe = zalloc(dtrace_probe_t_zone);
8309 bzero(probe, sizeof (dtrace_probe_t));
8310
8311 probe->dtpr_id = id;
8312 probe->dtpr_gen = dtrace_probegen++;
8313 probe->dtpr_mod = dtrace_strref(mod);
8314 probe->dtpr_func = dtrace_strref(func);
8315 probe->dtpr_name = dtrace_strref(name);
8316 probe->dtpr_arg = arg;
8317 probe->dtpr_aframes = aframes;
8318 probe->dtpr_provider = provider;
8319
8320 dtrace_hash_add(dtrace_byprov, probe);
8321 dtrace_hash_add(dtrace_bymod, probe);
8322 dtrace_hash_add(dtrace_byfunc, probe);
8323 dtrace_hash_add(dtrace_byname, probe);
8324
8325 if (id - 1 >= (dtrace_id_t)dtrace_nprobes) {
8326 size_t osize = dtrace_nprobes * sizeof (dtrace_probe_t *);
8327 size_t nsize = osize << 1;
8328
8329 if (nsize == 0) {
8330 ASSERT(osize == 0);
8331 ASSERT(dtrace_probes == NULL);
8332 nsize = sizeof (dtrace_probe_t *);
8333 }
8334
8335 probes = kmem_zalloc(nsize, KM_SLEEP);
8336
8337 if (dtrace_probes == NULL) {
8338 ASSERT(osize == 0);
8339 dtrace_probes = probes;
8340 dtrace_nprobes = 1;
8341 } else {
8342 dtrace_probe_t **oprobes = dtrace_probes;
8343
8344 bcopy(oprobes, probes, osize);
8345 dtrace_membar_producer();
8346 dtrace_probes = probes;
8347
8348 dtrace_sync();
8349
8350 /*
8351 * All CPUs are now seeing the new probes array; we can
8352 * safely free the old array.
8353 */
8354 kmem_free(oprobes, osize);
8355 dtrace_nprobes <<= 1;
8356 }
8357
8358 ASSERT(id - 1 < (dtrace_id_t)dtrace_nprobes);
8359 }
8360
8361 ASSERT(dtrace_probes[id - 1] == NULL);
8362 dtrace_probes[id - 1] = probe;
8363 provider->dtpv_probe_count++;
8364
8365 if (provider != dtrace_provider)
8366 lck_mtx_unlock(&dtrace_lock);
8367
8368 return (id);
8369}
8370
8371static dtrace_probe_t *
8372dtrace_probe_lookup_id(dtrace_id_t id)
8373{
8374 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8375
8376 if (id == 0 || id > (dtrace_id_t)dtrace_nprobes)
8377 return (NULL);
8378
8379 return (dtrace_probes[id - 1]);
8380}
8381
8382static int
8383dtrace_probe_lookup_match(dtrace_probe_t *probe, void *arg1, void *arg2)
8384{
8385#pragma unused(arg2)
8386 *((dtrace_id_t *)arg1) = probe->dtpr_id;
8387
8388 return (DTRACE_MATCH_DONE);
8389}
8390
8391/*
8392 * Look up a probe based on provider and one or more of module name, function
8393 * name and probe name.
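 *
 * Providers typically call this from their dtps_provide() entry point to
 * avoid creating duplicate probes; a hedged sketch, reusing the
 * hypothetical "foo" names from above:
 *
 *	if (dtrace_probe_lookup(foo_id, "foo_mod", "foo_func", "entry") == 0)
 *		(void) dtrace_probe_create(foo_id, "foo_mod", "foo_func",
 *		    "entry", 0, NULL);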
8394 */
8395dtrace_id_t
8396dtrace_probe_lookup(dtrace_provider_id_t prid, const char *mod,
8397 const char *func, const char *name)
8398{
8399 dtrace_probekey_t pkey;
8400 dtrace_id_t id;
8401 int match;
8402
8403 lck_mtx_lock(&dtrace_lock);
8404
8405 pkey.dtpk_prov = dtrace_strref(((dtrace_provider_t *)prid)->dtpv_name);
8406 pkey.dtpk_pmatch = &dtrace_match_string;
8407 pkey.dtpk_mod = dtrace_strref(mod);
8408 pkey.dtpk_mmatch = mod ? &dtrace_match_string : &dtrace_match_nul;
8409 pkey.dtpk_func = dtrace_strref(func);
8410 pkey.dtpk_fmatch = func ? &dtrace_match_string : &dtrace_match_nul;
8411 pkey.dtpk_name = dtrace_strref(name);
8412 pkey.dtpk_nmatch = name ? &dtrace_match_string : &dtrace_match_nul;
8413 pkey.dtpk_id = DTRACE_IDNONE;
8414
8415 match = dtrace_match(&pkey, DTRACE_PRIV_ALL, 0, 0,
8416 dtrace_probe_lookup_match, &id, NULL);
8417
8418 dtrace_probekey_release(&pkey);
8419
8420 lck_mtx_unlock(&dtrace_lock);
8421
8422 ASSERT(match == 1 || match == 0);
8423 return (match ? id : 0);
8424}
8425
8426/*
8427 * Returns the probe argument associated with the specified probe.
8428 */
8429void *
8430dtrace_probe_arg(dtrace_provider_id_t id, dtrace_id_t pid)
8431{
8432 dtrace_probe_t *probe;
8433 void *rval = NULL;
8434
8435 lck_mtx_lock(&dtrace_lock);
8436
8437 if ((probe = dtrace_probe_lookup_id(pid)) != NULL &&
8438 probe->dtpr_provider == (dtrace_provider_t *)id)
8439 rval = probe->dtpr_arg;
8440
8441 lck_mtx_unlock(&dtrace_lock);
8442
8443 return (rval);
8444}
8445
8446/*
8447 * Copy a probe into a probe description.
8448 */
8449static void
8450dtrace_probe_description(const dtrace_probe_t *prp, dtrace_probedesc_t *pdp)
8451{
8452 bzero(pdp, sizeof (dtrace_probedesc_t));
8453 pdp->dtpd_id = prp->dtpr_id;
8454
	/* APPLE NOTE: Darwin employs size-bounded string operations. */
8456 (void) strlcpy(pdp->dtpd_provider,
8457 prp->dtpr_provider->dtpv_name, DTRACE_PROVNAMELEN);
8458
8459 (void) strlcpy(pdp->dtpd_mod, prp->dtpr_mod, DTRACE_MODNAMELEN);
8460 (void) strlcpy(pdp->dtpd_func, prp->dtpr_func, DTRACE_FUNCNAMELEN);
8461 (void) strlcpy(pdp->dtpd_name, prp->dtpr_name, DTRACE_NAMELEN);
8462}
8463
8464/*
8465 * Called to indicate that a probe -- or probes -- should be provided by a
 * specified provider. If the specified description is NULL, the provider will
8467 * be told to provide all of its probes. (This is done whenever a new
8468 * consumer comes along, or whenever a retained enabling is to be matched.) If
8469 * the specified description is non-NULL, the provider is given the
8470 * opportunity to dynamically provide the specified probe, allowing providers
8471 * to support the creation of probes on-the-fly. (So-called _autocreated_
8472 * probes.) If the provider is NULL, the operations will be applied to all
8473 * providers; if the provider is non-NULL the operations will only be applied
8474 * to the specified provider. The dtrace_provider_lock must be held, and the
8475 * dtrace_lock must _not_ be held -- the provider's dtps_provide() operation
8476 * will need to grab the dtrace_lock when it reenters the framework through
8477 * dtrace_probe_lookup(), dtrace_probe_create(), etc.
8478 */
8479static void
8480dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv)
8481{
8482 struct modctl *ctl;
8483 int all = 0;
8484
8485 LCK_MTX_ASSERT(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
8486
8487 if (prv == NULL) {
8488 all = 1;
8489 prv = dtrace_provider;
8490 }
8491
8492 do {
8493 /*
8494 * First, call the blanket provide operation.
8495 */
8496 prv->dtpv_pops.dtps_provide(prv->dtpv_arg, desc);
8497
8498 /*
8499 * Now call the per-module provide operation. We will grab
8500 * mod_lock to prevent the list from being modified. Note
8501 * that this also prevents the mod_busy bits from changing.
8502 * (mod_busy can only be changed with mod_lock held.)
8503 */
8504 lck_mtx_lock(&mod_lock);
8505
8506 ctl = dtrace_modctl_list;
8507 while (ctl) {
8508 prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
8509 ctl = ctl->mod_next;
8510 }
8511
8512 lck_mtx_unlock(&mod_lock);
8513 } while (all && (prv = prv->dtpv_next) != NULL);
8514}
8515
8516/*
8517 * Iterate over each probe, and call the Framework-to-Provider API function
8518 * denoted by offs.
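 *
 * The offs argument is the byte offset of the desired operation within
 * dtrace_pops_t; a caller wishing to suspend every enabled probe might,
 * illustratively, pass offsetof(dtrace_pops_t, dtps_suspend).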
8519 */
8520static void
8521dtrace_probe_foreach(uintptr_t offs)
8522{
8523 dtrace_provider_t *prov;
8524 void (*func)(void *, dtrace_id_t, void *);
8525 dtrace_probe_t *probe;
8526 dtrace_icookie_t cookie;
8527 int i;
8528
8529 /*
8530 * We disable interrupts to walk through the probe array. This is
8531 * safe -- the dtrace_sync() in dtrace_unregister() assures that we
8532 * won't see stale data.
8533 */
8534 cookie = dtrace_interrupt_disable();
8535
8536 for (i = 0; i < dtrace_nprobes; i++) {
8537 if ((probe = dtrace_probes[i]) == NULL)
8538 continue;
8539
8540 if (probe->dtpr_ecb == NULL) {
8541 /*
8542 * This probe isn't enabled -- don't call the function.
8543 */
8544 continue;
8545 }
8546
8547 prov = probe->dtpr_provider;
8548 func = *((void(**)(void *, dtrace_id_t, void *))
8549 ((uintptr_t)&prov->dtpv_pops + offs));
8550
8551 func(prov->dtpv_arg, i + 1, probe->dtpr_arg);
8552 }
8553
8554 dtrace_interrupt_enable(cookie);
8555}
8556
8557static int
8558dtrace_probe_enable(const dtrace_probedesc_t *desc, dtrace_enabling_t *enab, dtrace_ecbdesc_t *ep)
8559{
8560 dtrace_probekey_t pkey;
8561 uint32_t priv;
8562 uid_t uid;
8563 zoneid_t zoneid;
8564 int err;
8565
8566 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8567
8568 dtrace_ecb_create_cache = NULL;
8569
8570 if (desc == NULL) {
8571 /*
8572 * If we're passed a NULL description, we're being asked to
8573 * create an ECB with a NULL probe.
8574 */
8575 (void) dtrace_ecb_create_enable(NULL, enab, ep);
8576 return (0);
8577 }
8578
8579 dtrace_probekey(desc, &pkey);
8580 dtrace_cred2priv(enab->dten_vstate->dtvs_state->dts_cred.dcr_cred,
8581 &priv, &uid, &zoneid);
8582
8583 err = dtrace_match(&pkey, priv, uid, zoneid, dtrace_ecb_create_enable, enab, ep);
8584
8585 dtrace_probekey_release(&pkey);
8586
8587 return err;
8588}
8589
8590/*
8591 * DTrace Helper Provider Functions
8592 */
8593static void
8594dtrace_dofattr2attr(dtrace_attribute_t *attr, const dof_attr_t dofattr)
8595{
8596 attr->dtat_name = DOF_ATTR_NAME(dofattr);
8597 attr->dtat_data = DOF_ATTR_DATA(dofattr);
8598 attr->dtat_class = DOF_ATTR_CLASS(dofattr);
8599}
8600
8601static void
8602dtrace_dofprov2hprov(dtrace_helper_provdesc_t *hprov,
8603 const dof_provider_t *dofprov, char *strtab)
8604{
8605 hprov->dthpv_provname = strtab + dofprov->dofpv_name;
8606 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_provider,
8607 dofprov->dofpv_provattr);
8608 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_mod,
8609 dofprov->dofpv_modattr);
8610 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_func,
8611 dofprov->dofpv_funcattr);
8612 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_name,
8613 dofprov->dofpv_nameattr);
8614 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_args,
8615 dofprov->dofpv_argsattr);
8616}
8617
8618static void
8619dtrace_helper_provide_one(dof_helper_t *dhp, dof_sec_t *sec, proc_t *p)
8620{
8621 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
8622 dof_hdr_t *dof = (dof_hdr_t *)daddr;
8623 dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
8624 dof_provider_t *provider;
8625 dof_probe_t *probe;
8626 uint32_t *off, *enoff;
8627 uint8_t *arg;
8628 char *strtab;
8629 uint_t i, nprobes;
8630 dtrace_helper_provdesc_t dhpv;
8631 dtrace_helper_probedesc_t dhpb;
8632 dtrace_meta_t *meta = dtrace_meta_pid;
8633 dtrace_mops_t *mops = &meta->dtm_mops;
8634 void *parg;
8635
8636 provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
8637 str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8638 provider->dofpv_strtab * dof->dofh_secsize);
8639 prb_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8640 provider->dofpv_probes * dof->dofh_secsize);
8641 arg_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8642 provider->dofpv_prargs * dof->dofh_secsize);
8643 off_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8644 provider->dofpv_proffs * dof->dofh_secsize);
8645
8646 strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
8647 off = (uint32_t *)(uintptr_t)(daddr + off_sec->dofs_offset);
8648 arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
8649 enoff = NULL;
8650
8651 /*
8652 * See dtrace_helper_provider_validate().
8653 */
8654 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
8655 provider->dofpv_prenoffs != DOF_SECT_NONE) {
8656 enoff_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8657 provider->dofpv_prenoffs * dof->dofh_secsize);
8658 enoff = (uint32_t *)(uintptr_t)(daddr + enoff_sec->dofs_offset);
8659 }
8660
8661 nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
8662
8663 /*
8664 * Create the provider.
8665 */
8666 dtrace_dofprov2hprov(&dhpv, provider, strtab);
8667
8668 if ((parg = mops->dtms_provide_proc(meta->dtm_arg, &dhpv, p)) == NULL)
8669 return;
8670
8671 meta->dtm_count++;
8672
8673 /*
8674 * Create the probes.
8675 */
8676 for (i = 0; i < nprobes; i++) {
8677 probe = (dof_probe_t *)(uintptr_t)(daddr +
8678 prb_sec->dofs_offset + i * prb_sec->dofs_entsize);
8679
8680 dhpb.dthpb_mod = dhp->dofhp_mod;
8681 dhpb.dthpb_func = strtab + probe->dofpr_func;
8682 dhpb.dthpb_name = strtab + probe->dofpr_name;
8683#if !defined(__APPLE__)
8684 dhpb.dthpb_base = probe->dofpr_addr;
8685#else
8686 dhpb.dthpb_base = dhp->dofhp_addr; /* FIXME: James, why? */
8687#endif
8688 dhpb.dthpb_offs = (int32_t *)(off + probe->dofpr_offidx);
8689 dhpb.dthpb_noffs = probe->dofpr_noffs;
8690 if (enoff != NULL) {
8691 dhpb.dthpb_enoffs = (int32_t *)(enoff + probe->dofpr_enoffidx);
8692 dhpb.dthpb_nenoffs = probe->dofpr_nenoffs;
8693 } else {
8694 dhpb.dthpb_enoffs = NULL;
8695 dhpb.dthpb_nenoffs = 0;
8696 }
8697 dhpb.dthpb_args = arg + probe->dofpr_argidx;
8698 dhpb.dthpb_nargc = probe->dofpr_nargc;
8699 dhpb.dthpb_xargc = probe->dofpr_xargc;
8700 dhpb.dthpb_ntypes = strtab + probe->dofpr_nargv;
8701 dhpb.dthpb_xtypes = strtab + probe->dofpr_xargv;
8702
8703 mops->dtms_create_probe(meta->dtm_arg, parg, &dhpb);
8704 }
8705
	/*
	 * Since we just created probes, we need to match our enablings
	 * against them, knowing that we have added probes only from this
	 * provider.
	 */
8711 char *prov_name = mops->dtms_provider_name(parg);
8712 ASSERT(prov_name != NULL);
8713 dtrace_match_cond_t cond = {dtrace_cond_provider_match, (void*)prov_name};
8714
8715 dtrace_enabling_matchall_with_cond(&cond);
8716}
8717
8718static void
8719dtrace_helper_provide(dof_helper_t *dhp, proc_t *p)
8720{
8721 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
8722 dof_hdr_t *dof = (dof_hdr_t *)daddr;
8723 uint32_t i;
8724
8725 LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
8726
8727 for (i = 0; i < dof->dofh_secnum; i++) {
8728 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
8729 dof->dofh_secoff + i * dof->dofh_secsize);
8730
8731 if (sec->dofs_type != DOF_SECT_PROVIDER)
8732 continue;
8733
8734 dtrace_helper_provide_one(dhp, sec, p);
8735 }
8736}
8737
8738static void
8739dtrace_helper_provider_remove_one(dof_helper_t *dhp, dof_sec_t *sec, proc_t *p)
8740{
8741 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
8742 dof_hdr_t *dof = (dof_hdr_t *)daddr;
8743 dof_sec_t *str_sec;
8744 dof_provider_t *provider;
8745 char *strtab;
8746 dtrace_helper_provdesc_t dhpv;
8747 dtrace_meta_t *meta = dtrace_meta_pid;
8748 dtrace_mops_t *mops = &meta->dtm_mops;
8749
8750 provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
8751 str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8752 provider->dofpv_strtab * dof->dofh_secsize);
8753
8754 strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
8755
8756 /*
8757 * Create the provider.
8758 */
8759 dtrace_dofprov2hprov(&dhpv, provider, strtab);
8760
8761 mops->dtms_remove_proc(meta->dtm_arg, &dhpv, p);
8762
8763 meta->dtm_count--;
8764}
8765
8766static void
8767dtrace_helper_provider_remove(dof_helper_t *dhp, proc_t *p)
8768{
8769 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
8770 dof_hdr_t *dof = (dof_hdr_t *)daddr;
8771 uint32_t i;
8772
8773 LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
8774
8775 for (i = 0; i < dof->dofh_secnum; i++) {
8776 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
8777 dof->dofh_secoff + i * dof->dofh_secsize);
8778
8779 if (sec->dofs_type != DOF_SECT_PROVIDER)
8780 continue;
8781
8782 dtrace_helper_provider_remove_one(dhp, sec, p);
8783 }
8784}
8785
8786/*
8787 * DTrace Meta Provider-to-Framework API Functions
8788 *
8789 * These functions implement the Meta Provider-to-Framework API, as described
8790 * in <sys/dtrace.h>.
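 *
 * As a hedged sketch, a user-land meta-provider supplies a dtrace_mops_t
 * whose dtms_create_probe, dtms_provide_proc and dtms_remove_proc
 * operations are all non-NULL and registers it exactly once (the "foo"
 * names are hypothetical):
 *
 *	static dtrace_meta_provider_id_t foo_meta_id;
 *	(void) dtrace_meta_register("foo_meta", &foo_mops, NULL,
 *	    &foo_meta_id);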
8791 */
8792int
8793dtrace_meta_register(const char *name, const dtrace_mops_t *mops, void *arg,
8794 dtrace_meta_provider_id_t *idp)
8795{
8796 dtrace_meta_t *meta;
8797 dtrace_helpers_t *help, *next;
8798 uint_t i;
8799
8800 *idp = DTRACE_METAPROVNONE;
8801
8802 /*
8803 * We strictly don't need the name, but we hold onto it for
8804 * debuggability. All hail error queues!
8805 */
8806 if (name == NULL) {
8807 cmn_err(CE_WARN, "failed to register meta-provider: "
8808 "invalid name");
8809 return (EINVAL);
8810 }
8811
8812 if (mops == NULL ||
8813 mops->dtms_create_probe == NULL ||
8814 mops->dtms_provide_proc == NULL ||
8815 mops->dtms_remove_proc == NULL) {
		cmn_err(CE_WARN, "failed to register meta-provider %s: "
		    "invalid ops", name);
8818 return (EINVAL);
8819 }
8820
8821 meta = kmem_zalloc(sizeof (dtrace_meta_t), KM_SLEEP);
8822 meta->dtm_mops = *mops;
8823 meta->dtm_arg = arg;
8824
8825 lck_mtx_lock(&dtrace_meta_lock);
8826 lck_mtx_lock(&dtrace_lock);
8827
8828 if (dtrace_meta_pid != NULL) {
8829 lck_mtx_unlock(&dtrace_lock);
8830 lck_mtx_unlock(&dtrace_meta_lock);
		cmn_err(CE_WARN, "failed to register meta-provider %s: "
		    "user-land meta-provider exists", name);
8833 kmem_free(meta, sizeof (dtrace_meta_t));
8834 return (EINVAL);
8835 }
8836
8837 meta->dtm_name = dtrace_strref(name);
8838
8839 dtrace_meta_pid = meta;
8840 *idp = (dtrace_meta_provider_id_t)meta;
8841
8842 /*
8843 * If there are providers and probes ready to go, pass them
8844 * off to the new meta provider now.
8845 */
8846
8847 help = dtrace_deferred_pid;
8848 dtrace_deferred_pid = NULL;
8849
8850 lck_mtx_unlock(&dtrace_lock);
8851
8852 while (help != NULL) {
8853 for (i = 0; i < help->dthps_nprovs; i++) {
8854 proc_t *p = proc_find(help->dthps_pid);
8855 if (p == PROC_NULL)
8856 continue;
8857 dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
8858 p);
8859 proc_rele(p);
8860 }
8861
8862 next = help->dthps_next;
8863 help->dthps_next = NULL;
8864 help->dthps_prev = NULL;
8865 help->dthps_deferred = 0;
8866 help = next;
8867 }
8868
8869 lck_mtx_unlock(&dtrace_meta_lock);
8870
8871 return (0);
8872}
8873
8874int
8875dtrace_meta_unregister(dtrace_meta_provider_id_t id)
8876{
8877 dtrace_meta_t **pp, *old = (dtrace_meta_t *)id;
8878
8879 lck_mtx_lock(&dtrace_meta_lock);
8880 lck_mtx_lock(&dtrace_lock);
8881
8882 if (old == dtrace_meta_pid) {
8883 pp = &dtrace_meta_pid;
8884 } else {
8885 panic("attempt to unregister non-existent "
8886 "dtrace meta-provider %p\n", (void *)old);
8887 }
8888
8889 if (old->dtm_count != 0) {
8890 lck_mtx_unlock(&dtrace_lock);
8891 lck_mtx_unlock(&dtrace_meta_lock);
8892 return (EBUSY);
8893 }
8894
8895 *pp = NULL;
8896
8897 dtrace_strunref(old->dtm_name);
8898
8899 lck_mtx_unlock(&dtrace_lock);
8900 lck_mtx_unlock(&dtrace_meta_lock);
8901
8902 kmem_free(old, sizeof (dtrace_meta_t));
8903
8904 return (0);
8905}
8906
8907
8908/*
8909 * DTrace DIF Object Functions
8910 */
8911static int
8912dtrace_difo_err(uint_t pc, const char *format, ...)
8913{
8914 if (dtrace_err_verbose) {
8915 va_list alist;
8916
8917 (void) uprintf("dtrace DIF object error: [%u]: ", pc);
8918 va_start(alist, format);
8919 (void) vuprintf(format, alist);
8920 va_end(alist);
8921 }
8922
8923#ifdef DTRACE_ERRDEBUG
8924 dtrace_errdebug(format);
8925#endif
8926 return (1);
8927}
8928
8929/*
8930 * Validate a DTrace DIF object by checking the IR instructions. The following
8931 * rules are currently enforced by dtrace_difo_validate():
8932 *
8933 * 1. Each instruction must have a valid opcode
8934 * 2. Each register, string, variable, or subroutine reference must be valid
8935 * 3. No instruction can modify register %r0 (must be zero)
8936 * 4. All instruction reserved bits must be set to zero
8937 * 5. The last instruction must be a "ret" instruction
8938 * 6. All branch targets must reference a valid instruction _after_ the branch
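 *
 * As an illustrative aside, rule 5 implies that the smallest well-formed
 * DIFO body is a single return; assuming the DIF_INSTR_RET() constructor
 * macro from <sys/dtrace.h>, something like:
 *
 *	dif_instr_t buf[] = { DIF_INSTR_RET(DIF_REG_R0) };
 *
 * passes, whereas any buffer whose final instruction is not a "ret" is
 * rejected below.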
8939 */
8940static int
8941dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs,
8942 cred_t *cr)
8943{
8944 int err = 0;
8945 uint_t i;
8946
8947 int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
8948 int kcheckload;
8949 uint_t pc;
8950 int maxglobal = -1, maxlocal = -1, maxtlocal = -1;
8951
8952 kcheckload = cr == NULL ||
8953 (vstate->dtvs_state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) == 0;
8954
8955 dp->dtdo_destructive = 0;
8956
8957 for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {
8958 dif_instr_t instr = dp->dtdo_buf[pc];
8959
8960 uint_t r1 = DIF_INSTR_R1(instr);
8961 uint_t r2 = DIF_INSTR_R2(instr);
8962 uint_t rd = DIF_INSTR_RD(instr);
8963 uint_t rs = DIF_INSTR_RS(instr);
8964 uint_t label = DIF_INSTR_LABEL(instr);
8965 uint_t v = DIF_INSTR_VAR(instr);
8966 uint_t subr = DIF_INSTR_SUBR(instr);
8967 uint_t type = DIF_INSTR_TYPE(instr);
8968 uint_t op = DIF_INSTR_OP(instr);
8969
8970 switch (op) {
8971 case DIF_OP_OR:
8972 case DIF_OP_XOR:
8973 case DIF_OP_AND:
8974 case DIF_OP_SLL:
8975 case DIF_OP_SRL:
8976 case DIF_OP_SRA:
8977 case DIF_OP_SUB:
8978 case DIF_OP_ADD:
8979 case DIF_OP_MUL:
8980 case DIF_OP_SDIV:
8981 case DIF_OP_UDIV:
8982 case DIF_OP_SREM:
8983 case DIF_OP_UREM:
8984 case DIF_OP_COPYS:
8985 if (r1 >= nregs)
8986 err += efunc(pc, "invalid register %u\n", r1);
8987 if (r2 >= nregs)
8988 err += efunc(pc, "invalid register %u\n", r2);
8989 if (rd >= nregs)
8990 err += efunc(pc, "invalid register %u\n", rd);
8991 if (rd == 0)
8992 err += efunc(pc, "cannot write to %r0\n");
8993 break;
8994 case DIF_OP_NOT:
8995 case DIF_OP_MOV:
8996 case DIF_OP_ALLOCS:
8997 if (r1 >= nregs)
8998 err += efunc(pc, "invalid register %u\n", r1);
8999 if (r2 != 0)
9000 err += efunc(pc, "non-zero reserved bits\n");
9001 if (rd >= nregs)
9002 err += efunc(pc, "invalid register %u\n", rd);
9003 if (rd == 0)
9004 err += efunc(pc, "cannot write to %r0\n");
9005 break;
9006 case DIF_OP_LDSB:
9007 case DIF_OP_LDSH:
9008 case DIF_OP_LDSW:
9009 case DIF_OP_LDUB:
9010 case DIF_OP_LDUH:
9011 case DIF_OP_LDUW:
9012 case DIF_OP_LDX:
9013 if (r1 >= nregs)
9014 err += efunc(pc, "invalid register %u\n", r1);
9015 if (r2 != 0)
9016 err += efunc(pc, "non-zero reserved bits\n");
9017 if (rd >= nregs)
9018 err += efunc(pc, "invalid register %u\n", rd);
9019 if (rd == 0)
9020 err += efunc(pc, "cannot write to %r0\n");
9021 if (kcheckload)
9022 dp->dtdo_buf[pc] = DIF_INSTR_LOAD(op +
9023 DIF_OP_RLDSB - DIF_OP_LDSB, r1, rd);
9024 break;
9025 case DIF_OP_RLDSB:
9026 case DIF_OP_RLDSH:
9027 case DIF_OP_RLDSW:
9028 case DIF_OP_RLDUB:
9029 case DIF_OP_RLDUH:
9030 case DIF_OP_RLDUW:
9031 case DIF_OP_RLDX:
9032 if (r1 >= nregs)
9033 err += efunc(pc, "invalid register %u\n", r1);
9034 if (r2 != 0)
9035 err += efunc(pc, "non-zero reserved bits\n");
9036 if (rd >= nregs)
9037 err += efunc(pc, "invalid register %u\n", rd);
9038 if (rd == 0)
9039 err += efunc(pc, "cannot write to %r0\n");
9040 break;
9041 case DIF_OP_ULDSB:
9042 case DIF_OP_ULDSH:
9043 case DIF_OP_ULDSW:
9044 case DIF_OP_ULDUB:
9045 case DIF_OP_ULDUH:
9046 case DIF_OP_ULDUW:
9047 case DIF_OP_ULDX:
9048 if (r1 >= nregs)
9049 err += efunc(pc, "invalid register %u\n", r1);
9050 if (r2 != 0)
9051 err += efunc(pc, "non-zero reserved bits\n");
9052 if (rd >= nregs)
9053 err += efunc(pc, "invalid register %u\n", rd);
9054 if (rd == 0)
9055 err += efunc(pc, "cannot write to %r0\n");
9056 break;
9057 case DIF_OP_STB:
9058 case DIF_OP_STH:
9059 case DIF_OP_STW:
9060 case DIF_OP_STX:
9061 if (r1 >= nregs)
9062 err += efunc(pc, "invalid register %u\n", r1);
9063 if (r2 != 0)
9064 err += efunc(pc, "non-zero reserved bits\n");
9065 if (rd >= nregs)
9066 err += efunc(pc, "invalid register %u\n", rd);
9067 if (rd == 0)
9068 err += efunc(pc, "cannot write to 0 address\n");
9069 break;
9070 case DIF_OP_CMP:
9071 case DIF_OP_SCMP:
9072 if (r1 >= nregs)
9073 err += efunc(pc, "invalid register %u\n", r1);
9074 if (r2 >= nregs)
9075 err += efunc(pc, "invalid register %u\n", r2);
9076 if (rd != 0)
9077 err += efunc(pc, "non-zero reserved bits\n");
9078 break;
9079 case DIF_OP_TST:
9080 if (r1 >= nregs)
9081 err += efunc(pc, "invalid register %u\n", r1);
9082 if (r2 != 0 || rd != 0)
9083 err += efunc(pc, "non-zero reserved bits\n");
9084 break;
9085 case DIF_OP_BA:
9086 case DIF_OP_BE:
9087 case DIF_OP_BNE:
9088 case DIF_OP_BG:
9089 case DIF_OP_BGU:
9090 case DIF_OP_BGE:
9091 case DIF_OP_BGEU:
9092 case DIF_OP_BL:
9093 case DIF_OP_BLU:
9094 case DIF_OP_BLE:
9095 case DIF_OP_BLEU:
9096 if (label >= dp->dtdo_len) {
9097 err += efunc(pc, "invalid branch target %u\n",
9098 label);
9099 }
9100 if (label <= pc) {
9101 err += efunc(pc, "backward branch to %u\n",
9102 label);
9103 }
9104 break;
9105 case DIF_OP_RET:
9106 if (r1 != 0 || r2 != 0)
9107 err += efunc(pc, "non-zero reserved bits\n");
9108 if (rd >= nregs)
9109 err += efunc(pc, "invalid register %u\n", rd);
9110 break;
9111 case DIF_OP_NOP:
9112 case DIF_OP_POPTS:
9113 case DIF_OP_FLUSHTS:
9114 if (r1 != 0 || r2 != 0 || rd != 0)
9115 err += efunc(pc, "non-zero reserved bits\n");
9116 break;
9117 case DIF_OP_SETX:
9118 if (DIF_INSTR_INTEGER(instr) >= dp->dtdo_intlen) {
9119 err += efunc(pc, "invalid integer ref %u\n",
9120 DIF_INSTR_INTEGER(instr));
9121 }
9122 if (rd >= nregs)
9123 err += efunc(pc, "invalid register %u\n", rd);
9124 if (rd == 0)
9125 err += efunc(pc, "cannot write to %r0\n");
9126 break;
9127 case DIF_OP_SETS:
9128 if (DIF_INSTR_STRING(instr) >= dp->dtdo_strlen) {
9129 err += efunc(pc, "invalid string ref %u\n",
9130 DIF_INSTR_STRING(instr));
9131 }
9132 if (rd >= nregs)
9133 err += efunc(pc, "invalid register %u\n", rd);
9134 if (rd == 0)
9135 err += efunc(pc, "cannot write to %r0\n");
9136 break;
9137 case DIF_OP_LDGA:
9138 case DIF_OP_LDTA:
9139 if (r1 > DIF_VAR_ARRAY_MAX)
9140 err += efunc(pc, "invalid array %u\n", r1);
9141 if (r2 >= nregs)
9142 err += efunc(pc, "invalid register %u\n", r2);
9143 if (rd >= nregs)
9144 err += efunc(pc, "invalid register %u\n", rd);
9145 if (rd == 0)
9146 err += efunc(pc, "cannot write to %r0\n");
9147 break;
9148 case DIF_OP_LDGS:
9149 case DIF_OP_LDTS:
9150 case DIF_OP_LDLS:
9151 case DIF_OP_LDGAA:
9152 case DIF_OP_LDTAA:
9153 if (v < DIF_VAR_OTHER_MIN || v > DIF_VAR_OTHER_MAX)
9154 err += efunc(pc, "invalid variable %u\n", v);
9155 if (rd >= nregs)
9156 err += efunc(pc, "invalid register %u\n", rd);
9157 if (rd == 0)
9158 err += efunc(pc, "cannot write to %r0\n");
9159 break;
9160 case DIF_OP_STGS:
9161 case DIF_OP_STTS:
9162 case DIF_OP_STLS:
9163 case DIF_OP_STGAA:
9164 case DIF_OP_STTAA:
9165 if (v < DIF_VAR_OTHER_UBASE || v > DIF_VAR_OTHER_MAX)
9166 err += efunc(pc, "invalid variable %u\n", v);
			if (rs >= nregs)
				err += efunc(pc, "invalid register %u\n", rs);
9169 break;
9170 case DIF_OP_CALL:
9171 if (subr > DIF_SUBR_MAX &&
9172 !(subr >= DIF_SUBR_APPLE_MIN && subr <= DIF_SUBR_APPLE_MAX))
9173 err += efunc(pc, "invalid subr %u\n", subr);
9174 if (rd >= nregs)
9175 err += efunc(pc, "invalid register %u\n", rd);
9176 if (rd == 0)
9177 err += efunc(pc, "cannot write to %r0\n");
9178
9179 if (subr == DIF_SUBR_COPYOUT ||
9180 subr == DIF_SUBR_COPYOUTSTR ||
9181 subr == DIF_SUBR_KDEBUG_TRACE ||
9182 subr == DIF_SUBR_KDEBUG_TRACE_STRING) {
9183 dp->dtdo_destructive = 1;
9184 }
9185 break;
9186 case DIF_OP_PUSHTR:
9187 if (type != DIF_TYPE_STRING && type != DIF_TYPE_CTF)
9188 err += efunc(pc, "invalid ref type %u\n", type);
9189 if (r2 >= nregs)
9190 err += efunc(pc, "invalid register %u\n", r2);
9191 if (rs >= nregs)
9192 err += efunc(pc, "invalid register %u\n", rs);
9193 break;
9194 case DIF_OP_PUSHTV:
9195 if (type != DIF_TYPE_CTF)
9196 err += efunc(pc, "invalid val type %u\n", type);
9197 if (r2 >= nregs)
9198 err += efunc(pc, "invalid register %u\n", r2);
9199 if (rs >= nregs)
9200 err += efunc(pc, "invalid register %u\n", rs);
9201 break;
9202 default:
9203 err += efunc(pc, "invalid opcode %u\n",
9204 DIF_INSTR_OP(instr));
9205 }
9206 }
9207
9208 if (dp->dtdo_len != 0 &&
9209 DIF_INSTR_OP(dp->dtdo_buf[dp->dtdo_len - 1]) != DIF_OP_RET) {
9210 err += efunc(dp->dtdo_len - 1,
9211 "expected 'ret' as last DIF instruction\n");
9212 }
9213
9214 if (!(dp->dtdo_rtype.dtdt_flags & (DIF_TF_BYREF | DIF_TF_BYUREF))) {
9215 /*
9216 * If we're not returning by reference, the size must be either
9217 * 0 or the size of one of the base types.
9218 */
9219 switch (dp->dtdo_rtype.dtdt_size) {
9220 case 0:
9221 case sizeof (uint8_t):
9222 case sizeof (uint16_t):
9223 case sizeof (uint32_t):
9224 case sizeof (uint64_t):
9225 break;
9226
9227 default:
9228 err += efunc(dp->dtdo_len - 1, "bad return size\n");
9229 }
9230 }
9231
9232 for (i = 0; i < dp->dtdo_varlen && err == 0; i++) {
9233 dtrace_difv_t *v = &dp->dtdo_vartab[i], *existing = NULL;
9234 dtrace_diftype_t *vt, *et;
9235 uint_t id;
9236 int ndx;
9237
9238 if (v->dtdv_scope != DIFV_SCOPE_GLOBAL &&
9239 v->dtdv_scope != DIFV_SCOPE_THREAD &&
9240 v->dtdv_scope != DIFV_SCOPE_LOCAL) {
9241 err += efunc(i, "unrecognized variable scope %d\n",
9242 v->dtdv_scope);
9243 break;
9244 }
9245
9246 if (v->dtdv_kind != DIFV_KIND_ARRAY &&
9247 v->dtdv_kind != DIFV_KIND_SCALAR) {
9248 err += efunc(i, "unrecognized variable type %d\n",
9249 v->dtdv_kind);
9250 break;
9251 }
9252
9253 if ((id = v->dtdv_id) > DIF_VARIABLE_MAX) {
9254 err += efunc(i, "%d exceeds variable id limit\n", id);
9255 break;
9256 }
9257
9258 if (id < DIF_VAR_OTHER_UBASE)
9259 continue;
9260
9261 /*
9262 * For user-defined variables, we need to check that this
9263 * definition is identical to any previous definition that we
9264 * encountered.
9265 */
9266 ndx = id - DIF_VAR_OTHER_UBASE;
9267
9268 switch (v->dtdv_scope) {
9269 case DIFV_SCOPE_GLOBAL:
9270 if (maxglobal == -1 || ndx > maxglobal)
9271 maxglobal = ndx;
9272
9273 if (ndx < vstate->dtvs_nglobals) {
9274 dtrace_statvar_t *svar;
9275
9276 if ((svar = vstate->dtvs_globals[ndx]) != NULL)
9277 existing = &svar->dtsv_var;
9278 }
9279
9280 break;
9281
9282 case DIFV_SCOPE_THREAD:
9283 if (maxtlocal == -1 || ndx > maxtlocal)
9284 maxtlocal = ndx;
9285
9286 if (ndx < vstate->dtvs_ntlocals)
9287 existing = &vstate->dtvs_tlocals[ndx];
9288 break;
9289
9290 case DIFV_SCOPE_LOCAL:
9291 if (maxlocal == -1 || ndx > maxlocal)
9292 maxlocal = ndx;
9293 if (ndx < vstate->dtvs_nlocals) {
9294 dtrace_statvar_t *svar;
9295
9296 if ((svar = vstate->dtvs_locals[ndx]) != NULL)
9297 existing = &svar->dtsv_var;
9298 }
9299
9300 break;
9301 }
9302
9303 vt = &v->dtdv_type;
9304
9305 if (vt->dtdt_flags & DIF_TF_BYREF) {
9306 if (vt->dtdt_size == 0) {
9307 err += efunc(i, "zero-sized variable\n");
9308 break;
9309 }
9310
9311 if ((v->dtdv_scope == DIFV_SCOPE_GLOBAL ||
9312 v->dtdv_scope == DIFV_SCOPE_LOCAL) &&
9313 vt->dtdt_size > dtrace_statvar_maxsize) {
9314 err += efunc(i, "oversized by-ref static\n");
9315 break;
9316 }
9317 }
9318
9319 if (existing == NULL || existing->dtdv_id == 0)
9320 continue;
9321
9322 ASSERT(existing->dtdv_id == v->dtdv_id);
9323 ASSERT(existing->dtdv_scope == v->dtdv_scope);
9324
9325 if (existing->dtdv_kind != v->dtdv_kind)
9326 err += efunc(i, "%d changed variable kind\n", id);
9327
9328 et = &existing->dtdv_type;
9329
9330 if (vt->dtdt_flags != et->dtdt_flags) {
9331 err += efunc(i, "%d changed variable type flags\n", id);
9332 break;
9333 }
9334
9335 if (vt->dtdt_size != 0 && vt->dtdt_size != et->dtdt_size) {
9336 err += efunc(i, "%d changed variable type size\n", id);
9337 break;
9338 }
9339 }
9340
9341 for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {
9342 dif_instr_t instr = dp->dtdo_buf[pc];
9343
9344 uint_t v = DIF_INSTR_VAR(instr);
9345 uint_t op = DIF_INSTR_OP(instr);
9346
9347 switch (op) {
9348 case DIF_OP_LDGS:
9349 case DIF_OP_LDGAA:
9350 case DIF_OP_STGS:
9351 case DIF_OP_STGAA:
9352 if (v > (uint_t)(DIF_VAR_OTHER_UBASE + maxglobal))
9353 err += efunc(pc, "invalid variable %u\n", v);
9354 break;
9355 case DIF_OP_LDTS:
9356 case DIF_OP_LDTAA:
9357 case DIF_OP_STTS:
9358 case DIF_OP_STTAA:
9359 if (v > (uint_t)(DIF_VAR_OTHER_UBASE + maxtlocal))
9360 err += efunc(pc, "invalid variable %u\n", v);
9361 break;
9362 case DIF_OP_LDLS:
9363 case DIF_OP_STLS:
9364 if (v > (uint_t)(DIF_VAR_OTHER_UBASE + maxlocal))
9365 err += efunc(pc, "invalid variable %u\n", v);
9366 break;
9367 default:
9368 break;
9369 }
9370 }
9371
9372 return (err);
9373}
9374
9375/*
 * Validate a DTrace DIF object that is to be used as a helper. Helpers
9377 * are much more constrained than normal DIFOs. Specifically, they may
9378 * not:
9379 *
9380 * 1. Make calls to subroutines other than copyin(), copyinstr() or
9381 * miscellaneous string routines
9382 * 2. Access DTrace variables other than the args[] array, and the
9383 * curthread, pid, ppid, tid, execname, zonename, uid and gid variables.
9384 * 3. Have thread-local variables.
9385 * 4. Have dynamic variables.
9386 */
9387static int
9388dtrace_difo_validate_helper(dtrace_difo_t *dp)
9389{
9390 int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
9391 int err = 0;
9392 uint_t pc;
9393
9394 for (pc = 0; pc < dp->dtdo_len; pc++) {
9395 dif_instr_t instr = dp->dtdo_buf[pc];
9396
9397 uint_t v = DIF_INSTR_VAR(instr);
9398 uint_t subr = DIF_INSTR_SUBR(instr);
9399 uint_t op = DIF_INSTR_OP(instr);
9400
9401 switch (op) {
9402 case DIF_OP_OR:
9403 case DIF_OP_XOR:
9404 case DIF_OP_AND:
9405 case DIF_OP_SLL:
9406 case DIF_OP_SRL:
9407 case DIF_OP_SRA:
9408 case DIF_OP_SUB:
9409 case DIF_OP_ADD:
9410 case DIF_OP_MUL:
9411 case DIF_OP_SDIV:
9412 case DIF_OP_UDIV:
9413 case DIF_OP_SREM:
9414 case DIF_OP_UREM:
9415 case DIF_OP_COPYS:
9416 case DIF_OP_NOT:
9417 case DIF_OP_MOV:
9418 case DIF_OP_RLDSB:
9419 case DIF_OP_RLDSH:
9420 case DIF_OP_RLDSW:
9421 case DIF_OP_RLDUB:
9422 case DIF_OP_RLDUH:
9423 case DIF_OP_RLDUW:
9424 case DIF_OP_RLDX:
9425 case DIF_OP_ULDSB:
9426 case DIF_OP_ULDSH:
9427 case DIF_OP_ULDSW:
9428 case DIF_OP_ULDUB:
9429 case DIF_OP_ULDUH:
9430 case DIF_OP_ULDUW:
9431 case DIF_OP_ULDX:
9432 case DIF_OP_STB:
9433 case DIF_OP_STH:
9434 case DIF_OP_STW:
9435 case DIF_OP_STX:
9436 case DIF_OP_ALLOCS:
9437 case DIF_OP_CMP:
9438 case DIF_OP_SCMP:
9439 case DIF_OP_TST:
9440 case DIF_OP_BA:
9441 case DIF_OP_BE:
9442 case DIF_OP_BNE:
9443 case DIF_OP_BG:
9444 case DIF_OP_BGU:
9445 case DIF_OP_BGE:
9446 case DIF_OP_BGEU:
9447 case DIF_OP_BL:
9448 case DIF_OP_BLU:
9449 case DIF_OP_BLE:
9450 case DIF_OP_BLEU:
9451 case DIF_OP_RET:
9452 case DIF_OP_NOP:
9453 case DIF_OP_POPTS:
9454 case DIF_OP_FLUSHTS:
9455 case DIF_OP_SETX:
9456 case DIF_OP_SETS:
9457 case DIF_OP_LDGA:
9458 case DIF_OP_LDLS:
9459 case DIF_OP_STGS:
9460 case DIF_OP_STLS:
9461 case DIF_OP_PUSHTR:
9462 case DIF_OP_PUSHTV:
9463 break;
9464
9465 case DIF_OP_LDGS:
9466 if (v >= DIF_VAR_OTHER_UBASE)
9467 break;
9468
9469 if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9)
9470 break;
9471
9472 if (v == DIF_VAR_CURTHREAD || v == DIF_VAR_PID ||
9473 v == DIF_VAR_PPID || v == DIF_VAR_TID ||
9474 v == DIF_VAR_EXECNAME || v == DIF_VAR_ZONENAME ||
9475 v == DIF_VAR_UID || v == DIF_VAR_GID)
9476 break;
9477
9478 err += efunc(pc, "illegal variable %u\n", v);
9479 break;
9480
9481 case DIF_OP_LDTA:
9482 case DIF_OP_LDTS:
9483 case DIF_OP_LDGAA:
9484 case DIF_OP_LDTAA:
9485 err += efunc(pc, "illegal dynamic variable load\n");
9486 break;
9487
9488 case DIF_OP_STTS:
9489 case DIF_OP_STGAA:
9490 case DIF_OP_STTAA:
9491 err += efunc(pc, "illegal dynamic variable store\n");
9492 break;
9493
9494 case DIF_OP_CALL:
9495 if (subr == DIF_SUBR_ALLOCA ||
9496 subr == DIF_SUBR_BCOPY ||
9497 subr == DIF_SUBR_COPYIN ||
9498 subr == DIF_SUBR_COPYINTO ||
9499 subr == DIF_SUBR_COPYINSTR ||
9500 subr == DIF_SUBR_INDEX ||
9501 subr == DIF_SUBR_INET_NTOA ||
9502 subr == DIF_SUBR_INET_NTOA6 ||
9503 subr == DIF_SUBR_INET_NTOP ||
9504 subr == DIF_SUBR_LLTOSTR ||
9505 subr == DIF_SUBR_RINDEX ||
9506 subr == DIF_SUBR_STRCHR ||
9507 subr == DIF_SUBR_STRJOIN ||
9508 subr == DIF_SUBR_STRRCHR ||
9509 subr == DIF_SUBR_STRSTR ||
9510 subr == DIF_SUBR_KDEBUG_TRACE ||
9511 subr == DIF_SUBR_KDEBUG_TRACE_STRING ||
9512 subr == DIF_SUBR_HTONS ||
9513 subr == DIF_SUBR_HTONL ||
9514 subr == DIF_SUBR_HTONLL ||
9515 subr == DIF_SUBR_NTOHS ||
9516 subr == DIF_SUBR_NTOHL ||
9517 subr == DIF_SUBR_NTOHLL)
9518 break;
9519
9520 err += efunc(pc, "invalid subr %u\n", subr);
9521 break;
9522
9523 default:
9524 err += efunc(pc, "invalid opcode %u\n",
9525 DIF_INSTR_OP(instr));
9526 }
9527 }
9528
9529 return (err);
9530}
9531
9532/*
9533 * Returns 1 if the expression in the DIF object can be cached on a per-thread
9534 * basis; 0 if not.
9535 */
9536static int
9537dtrace_difo_cacheable(dtrace_difo_t *dp)
9538{
9539 uint_t i;
9540
9541 if (dp == NULL)
9542 return (0);
9543
9544 for (i = 0; i < dp->dtdo_varlen; i++) {
9545 dtrace_difv_t *v = &dp->dtdo_vartab[i];
9546
9547 if (v->dtdv_scope != DIFV_SCOPE_GLOBAL)
9548 continue;
9549
9550 switch (v->dtdv_id) {
9551 case DIF_VAR_CURTHREAD:
9552 case DIF_VAR_PID:
9553 case DIF_VAR_TID:
9554 case DIF_VAR_EXECNAME:
9555 case DIF_VAR_ZONENAME:
9556 break;
9557
9558 default:
9559 return (0);
9560 }
9561 }
9562
9563 /*
9564 * This DIF object may be cacheable. Now we need to look for any
9565 * array loading instructions, any memory loading instructions, or
9566 * any stores to thread-local variables.
9567 */
9568 for (i = 0; i < dp->dtdo_len; i++) {
9569 uint_t op = DIF_INSTR_OP(dp->dtdo_buf[i]);
9570
9571 if ((op >= DIF_OP_LDSB && op <= DIF_OP_LDX) ||
9572 (op >= DIF_OP_ULDSB && op <= DIF_OP_ULDX) ||
9573 (op >= DIF_OP_RLDSB && op <= DIF_OP_RLDX) ||
9574 op == DIF_OP_LDGA || op == DIF_OP_STTS)
9575 return (0);
9576 }
9577
9578 return (1);
9579}
9580
9581static void
9582dtrace_difo_hold(dtrace_difo_t *dp)
9583{
9584 uint_t i;
9585
9586 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
9587
9588 dp->dtdo_refcnt++;
9589 ASSERT(dp->dtdo_refcnt != 0);
9590
9591 /*
9592 * We need to check this DIF object for references to the variable
9593 * DIF_VAR_VTIMESTAMP.
9594 */
9595 for (i = 0; i < dp->dtdo_varlen; i++) {
9596 dtrace_difv_t *v = &dp->dtdo_vartab[i];
9597
9598 if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
9599 continue;
9600
9601 if (dtrace_vtime_references++ == 0)
9602 dtrace_vtime_enable();
9603 }
9604}
9605
9606/*
9607 * This routine calculates the dynamic variable chunksize for a given DIF
9608 * object. The calculation is not fool-proof, and can probably be tricked by
9609 * malicious DIF -- but it works for all compiler-generated DIF. Because this
9610 * calculation is likely imperfect, dtrace_dynvar() is able to gracefully fail
9611 * if a dynamic variable size exceeds the chunksize.
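 *
 * As a worked illustration: a thread-local store (DIF_OP_STTS, e.g. from
 * an assignment to self->x) uses two zero-sized keys, so ksize is 0 and
 * the chunk must cover sizeof (dtrace_dynvar_t) for the variable itself,
 * sizeof (dtrace_key_t) for the one key beyond the first, plus the
 * variable's own dtdt_size, all rounded up to a multiple of
 * sizeof (uint64_t).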
9612 */
9613static void
9614dtrace_difo_chunksize(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9615{
9616 uint64_t sval = 0;
9617 dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
9618 const dif_instr_t *text = dp->dtdo_buf;
9619 uint_t pc, srd = 0;
9620 uint_t ttop = 0;
9621 size_t size, ksize;
9622 uint_t id, i;
9623
9624 for (pc = 0; pc < dp->dtdo_len; pc++) {
9625 dif_instr_t instr = text[pc];
9626 uint_t op = DIF_INSTR_OP(instr);
9627 uint_t rd = DIF_INSTR_RD(instr);
9628 uint_t r1 = DIF_INSTR_R1(instr);
9629 uint_t nkeys = 0;
9630 uchar_t scope;
9631
9632 dtrace_key_t *key = tupregs;
9633
9634 switch (op) {
9635 case DIF_OP_SETX:
9636 sval = dp->dtdo_inttab[DIF_INSTR_INTEGER(instr)];
9637 srd = rd;
9638 continue;
9639
9640 case DIF_OP_STTS:
9641 key = &tupregs[DIF_DTR_NREGS];
9642 key[0].dttk_size = 0;
9643 key[1].dttk_size = 0;
9644 nkeys = 2;
9645 scope = DIFV_SCOPE_THREAD;
9646 break;
9647
9648 case DIF_OP_STGAA:
9649 case DIF_OP_STTAA:
9650 nkeys = ttop;
9651
9652 if (DIF_INSTR_OP(instr) == DIF_OP_STTAA)
9653 key[nkeys++].dttk_size = 0;
9654
9655 key[nkeys++].dttk_size = 0;
9656
9657 if (op == DIF_OP_STTAA) {
9658 scope = DIFV_SCOPE_THREAD;
9659 } else {
9660 scope = DIFV_SCOPE_GLOBAL;
9661 }
9662
9663 break;
9664
9665 case DIF_OP_PUSHTR:
9666 if (ttop == DIF_DTR_NREGS)
9667 return;
9668
9669 if ((srd == 0 || sval == 0) && r1 == DIF_TYPE_STRING) {
9670 /*
9671 * If the register for the size of the "pushtr"
9672 * is %r0 (or the value is 0) and the type is
9673 * a string, we'll use the system-wide default
9674 * string size.
9675 */
9676 tupregs[ttop++].dttk_size =
9677 dtrace_strsize_default;
9678 } else {
9679 if (srd == 0)
9680 return;
9681
9682 if (sval > LONG_MAX)
9683 return;
9684
9685 tupregs[ttop++].dttk_size = sval;
9686 }
9687
9688 break;
9689
9690 case DIF_OP_PUSHTV:
9691 if (ttop == DIF_DTR_NREGS)
9692 return;
9693
9694 tupregs[ttop++].dttk_size = 0;
9695 break;
9696
9697 case DIF_OP_FLUSHTS:
9698 ttop = 0;
9699 break;
9700
9701 case DIF_OP_POPTS:
9702 if (ttop != 0)
9703 ttop--;
9704 break;
9705 }
9706
9707 sval = 0;
9708 srd = 0;
9709
9710 if (nkeys == 0)
9711 continue;
9712
9713 /*
9714 * We have a dynamic variable allocation; calculate its size.
9715 */
9716 for (ksize = 0, i = 0; i < nkeys; i++)
9717 ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
9718
9719 size = sizeof (dtrace_dynvar_t);
9720 size += sizeof (dtrace_key_t) * (nkeys - 1);
9721 size += ksize;
9722
9723 /*
9724 * Now we need to determine the size of the stored data.
9725 */
9726 id = DIF_INSTR_VAR(instr);
9727
9728 for (i = 0; i < dp->dtdo_varlen; i++) {
9729 dtrace_difv_t *v = &dp->dtdo_vartab[i];
9730
9731 if (v->dtdv_id == id && v->dtdv_scope == scope) {
9732 size += v->dtdv_type.dtdt_size;
9733 break;
9734 }
9735 }
9736
9737 if (i == dp->dtdo_varlen)
9738 return;
9739
9740 /*
9741 * We have the size. If this is larger than the chunk size
9742 * for our dynamic variable state, reset the chunk size.
9743 */
9744 size = P2ROUNDUP(size, sizeof (uint64_t));
9745
9746 /*
9747 * Before setting the chunk size, check that we're not going
9748 * to set it to a negative value...
9749 */
9750 if (size > LONG_MAX)
9751 return;
9752
9753 /*
9754 * ...and make certain that we didn't badly overflow.
9755 */
9756 if (size < ksize || size < sizeof (dtrace_dynvar_t))
9757 return;
9758
9759 if (size > vstate->dtvs_dynvars.dtds_chunksize)
9760 vstate->dtvs_dynvars.dtds_chunksize = size;
9761 }
9762}
9763
9764static void
9765dtrace_difo_init(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9766{
9767 int oldsvars, osz, nsz, otlocals, ntlocals;
9768 uint_t i, id;
9769
9770 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
9771 ASSERT(dp->dtdo_buf != NULL && dp->dtdo_len != 0);
9772
9773 for (i = 0; i < dp->dtdo_varlen; i++) {
9774 dtrace_difv_t *v = &dp->dtdo_vartab[i];
9775 dtrace_statvar_t *svar;
9776 dtrace_statvar_t ***svarp = NULL;
9777 size_t dsize = 0;
9778 uint8_t scope = v->dtdv_scope;
		int *np = NULL;
9780
9781 if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
9782 continue;
9783
9784 id -= DIF_VAR_OTHER_UBASE;
9785
9786 switch (scope) {
9787 case DIFV_SCOPE_THREAD:
9788 while (id >= (uint_t)(otlocals = vstate->dtvs_ntlocals)) {
9789 dtrace_difv_t *tlocals;
9790
9791 if ((ntlocals = (otlocals << 1)) == 0)
9792 ntlocals = 1;
9793
9794 osz = otlocals * sizeof (dtrace_difv_t);
9795 nsz = ntlocals * sizeof (dtrace_difv_t);
9796
9797 tlocals = kmem_zalloc(nsz, KM_SLEEP);
9798
9799 if (osz != 0) {
9800 bcopy(vstate->dtvs_tlocals,
9801 tlocals, osz);
9802 kmem_free(vstate->dtvs_tlocals, osz);
9803 }
9804
9805 vstate->dtvs_tlocals = tlocals;
9806 vstate->dtvs_ntlocals = ntlocals;
9807 }
9808
9809 vstate->dtvs_tlocals[id] = *v;
9810 continue;
9811
9812 case DIFV_SCOPE_LOCAL:
9813 np = &vstate->dtvs_nlocals;
9814 svarp = &vstate->dtvs_locals;
9815
9816 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
9817 dsize = (int)NCPU * (v->dtdv_type.dtdt_size +
9818 sizeof (uint64_t));
9819 else
9820 dsize = (int)NCPU * sizeof (uint64_t);
9821
9822 break;
9823
9824 case DIFV_SCOPE_GLOBAL:
9825 np = &vstate->dtvs_nglobals;
9826 svarp = &vstate->dtvs_globals;
9827
9828 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
9829 dsize = v->dtdv_type.dtdt_size +
9830 sizeof (uint64_t);
9831
9832 break;
9833
9834 default:
9835 ASSERT(0);
9836 }
9837
9838 while (id >= (uint_t)(oldsvars = *np)) {
9839 dtrace_statvar_t **statics;
9840 int newsvars, oldsize, newsize;
9841
9842 if ((newsvars = (oldsvars << 1)) == 0)
9843 newsvars = 1;
9844
9845 oldsize = oldsvars * sizeof (dtrace_statvar_t *);
9846 newsize = newsvars * sizeof (dtrace_statvar_t *);
9847
9848 statics = kmem_zalloc(newsize, KM_SLEEP);
9849
9850 if (oldsize != 0) {
9851 bcopy(*svarp, statics, oldsize);
9852 kmem_free(*svarp, oldsize);
9853 }
9854
9855 *svarp = statics;
9856 *np = newsvars;
9857 }
9858
9859 if ((svar = (*svarp)[id]) == NULL) {
9860 svar = kmem_zalloc(sizeof (dtrace_statvar_t), KM_SLEEP);
9861 svar->dtsv_var = *v;
9862
9863 if ((svar->dtsv_size = dsize) != 0) {
9864 svar->dtsv_data = (uint64_t)(uintptr_t)
9865 kmem_zalloc(dsize, KM_SLEEP);
9866 }
9867
9868 (*svarp)[id] = svar;
9869 }
9870
9871 svar->dtsv_refcnt++;
9872 }
9873
9874 dtrace_difo_chunksize(dp, vstate);
9875 dtrace_difo_hold(dp);
9876}
9877
9878static dtrace_difo_t *
9879dtrace_difo_duplicate(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9880{
9881 dtrace_difo_t *new;
9882 size_t sz;
9883
9884 ASSERT(dp->dtdo_buf != NULL);
9885 ASSERT(dp->dtdo_refcnt != 0);
9886
9887 new = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
9888
9889 ASSERT(dp->dtdo_buf != NULL);
9890 sz = dp->dtdo_len * sizeof (dif_instr_t);
9891 new->dtdo_buf = kmem_alloc(sz, KM_SLEEP);
9892 bcopy(dp->dtdo_buf, new->dtdo_buf, sz);
9893 new->dtdo_len = dp->dtdo_len;
9894
9895 if (dp->dtdo_strtab != NULL) {
9896 ASSERT(dp->dtdo_strlen != 0);
9897 new->dtdo_strtab = kmem_alloc(dp->dtdo_strlen, KM_SLEEP);
9898 bcopy(dp->dtdo_strtab, new->dtdo_strtab, dp->dtdo_strlen);
9899 new->dtdo_strlen = dp->dtdo_strlen;
9900 }
9901
9902 if (dp->dtdo_inttab != NULL) {
9903 ASSERT(dp->dtdo_intlen != 0);
9904 sz = dp->dtdo_intlen * sizeof (uint64_t);
9905 new->dtdo_inttab = kmem_alloc(sz, KM_SLEEP);
9906 bcopy(dp->dtdo_inttab, new->dtdo_inttab, sz);
9907 new->dtdo_intlen = dp->dtdo_intlen;
9908 }
9909
9910 if (dp->dtdo_vartab != NULL) {
9911 ASSERT(dp->dtdo_varlen != 0);
9912 sz = dp->dtdo_varlen * sizeof (dtrace_difv_t);
9913 new->dtdo_vartab = kmem_alloc(sz, KM_SLEEP);
9914 bcopy(dp->dtdo_vartab, new->dtdo_vartab, sz);
9915 new->dtdo_varlen = dp->dtdo_varlen;
9916 }
9917
9918 dtrace_difo_init(new, vstate);
9919 return (new);
9920}
9921
9922static void
9923dtrace_difo_destroy(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9924{
9925 uint_t i;
9926
9927 ASSERT(dp->dtdo_refcnt == 0);
9928
9929 for (i = 0; i < dp->dtdo_varlen; i++) {
9930 dtrace_difv_t *v = &dp->dtdo_vartab[i];
9931 dtrace_statvar_t *svar;
9932 dtrace_statvar_t **svarp = NULL;
9933 uint_t id;
9934 uint8_t scope = v->dtdv_scope;
9935 int *np = NULL;
9936
9937 switch (scope) {
9938 case DIFV_SCOPE_THREAD:
9939 continue;
9940
9941 case DIFV_SCOPE_LOCAL:
9942 np = &vstate->dtvs_nlocals;
9943 svarp = vstate->dtvs_locals;
9944 break;
9945
9946 case DIFV_SCOPE_GLOBAL:
9947 np = &vstate->dtvs_nglobals;
9948 svarp = vstate->dtvs_globals;
9949 break;
9950
9951 default:
9952 ASSERT(0);
9953 }
9954
9955 if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
9956 continue;
9957
9958 id -= DIF_VAR_OTHER_UBASE;
9959
9960 ASSERT(id < (uint_t)*np);
9961
9962 svar = svarp[id];
9963 ASSERT(svar != NULL);
9964 ASSERT(svar->dtsv_refcnt > 0);
9965
9966 if (--svar->dtsv_refcnt > 0)
9967 continue;
9968
9969 if (svar->dtsv_size != 0) {
9970 ASSERT(svar->dtsv_data != 0);
9971 kmem_free((void *)(uintptr_t)svar->dtsv_data,
9972 svar->dtsv_size);
9973 }
9974
9975 kmem_free(svar, sizeof (dtrace_statvar_t));
9976 svarp[id] = NULL;
9977 }
9978
9979 kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
9980 kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
9981 kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
9982 kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
9983
9984 kmem_free(dp, sizeof (dtrace_difo_t));
9985}
9986
9987static void
9988dtrace_difo_release(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9989{
9990 uint_t i;
9991
9992 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
9993 ASSERT(dp->dtdo_refcnt != 0);
9994
9995 for (i = 0; i < dp->dtdo_varlen; i++) {
9996 dtrace_difv_t *v = &dp->dtdo_vartab[i];
9997
9998 if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
9999 continue;
10000
10001 ASSERT(dtrace_vtime_references > 0);
10002 if (--dtrace_vtime_references == 0)
10003 dtrace_vtime_disable();
10004 }
10005
10006 if (--dp->dtdo_refcnt == 0)
10007 dtrace_difo_destroy(dp, vstate);
10008}
10009
10010/*
10011 * DTrace Format Functions
10012 */
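/*
 * Format indices handed out to consumers are one-based so that zero can
 * denote the absence of a format: dtrace_format_add() returns ndx + 1 on
 * success and zero on failure, and dtrace_format_remove() subtracts one to
 * index into dts_formats.
 */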
10013static uint16_t
10014dtrace_format_add(dtrace_state_t *state, char *str)
10015{
10016 char *fmt, **new;
10017 uint16_t ndx, len = strlen(str) + 1;
10018
10019 fmt = kmem_zalloc(len, KM_SLEEP);
10020 bcopy(str, fmt, len);
10021
10022 for (ndx = 0; ndx < state->dts_nformats; ndx++) {
10023 if (state->dts_formats[ndx] == NULL) {
10024 state->dts_formats[ndx] = fmt;
10025 return (ndx + 1);
10026 }
10027 }
10028
10029 if (state->dts_nformats == USHRT_MAX) {
10030 /*
10031 * This is only likely if a denial-of-service attack is being
10032 * attempted. As such, it's okay to fail silently here.
10033 */
10034 kmem_free(fmt, len);
10035 return (0);
10036 }
10037
10038 /*
10039 * For simplicity, we always resize the formats array to be exactly the
10040 * number of formats.
10041 */
10042 ndx = state->dts_nformats++;
10043 new = kmem_alloc((ndx + 1) * sizeof (char *), KM_SLEEP);
10044
10045 if (state->dts_formats != NULL) {
10046 ASSERT(ndx != 0);
10047 bcopy(state->dts_formats, new, ndx * sizeof (char *));
10048 kmem_free(state->dts_formats, ndx * sizeof (char *));
10049 }
10050
10051 state->dts_formats = new;
10052 state->dts_formats[ndx] = fmt;
10053
10054 return (ndx + 1);
10055}
10056
10057static void
10058dtrace_format_remove(dtrace_state_t *state, uint16_t format)
10059{
10060 char *fmt;
10061
10062 ASSERT(state->dts_formats != NULL);
10063 ASSERT(format <= state->dts_nformats);
10064 ASSERT(state->dts_formats[format - 1] != NULL);
10065
10066 fmt = state->dts_formats[format - 1];
10067 kmem_free(fmt, strlen(fmt) + 1);
10068 state->dts_formats[format - 1] = NULL;
10069}
10070
10071static void
10072dtrace_format_destroy(dtrace_state_t *state)
10073{
10074 int i;
10075
10076 if (state->dts_nformats == 0) {
10077 ASSERT(state->dts_formats == NULL);
10078 return;
10079 }
10080
10081 ASSERT(state->dts_formats != NULL);
10082
10083 for (i = 0; i < state->dts_nformats; i++) {
10084 char *fmt = state->dts_formats[i];
10085
10086 if (fmt == NULL)
10087 continue;
10088
10089 kmem_free(fmt, strlen(fmt) + 1);
10090 }
10091
10092 kmem_free(state->dts_formats, state->dts_nformats * sizeof (char *));
10093 state->dts_nformats = 0;
10094 state->dts_formats = NULL;
10095}
10096
10097/*
10098 * DTrace Predicate Functions
10099 */
10100static dtrace_predicate_t *
10101dtrace_predicate_create(dtrace_difo_t *dp)
10102{
10103 dtrace_predicate_t *pred;
10104
10105 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10106 ASSERT(dp->dtdo_refcnt != 0);
10107
10108 pred = kmem_zalloc(sizeof (dtrace_predicate_t), KM_SLEEP);
10109 pred->dtp_difo = dp;
10110 pred->dtp_refcnt = 1;
10111
10112 if (!dtrace_difo_cacheable(dp))
10113 return (pred);
10114
10115 if (dtrace_predcache_id == DTRACE_CACHEIDNONE) {
10116 /*
10117 * This is only theoretically possible -- we have had 2^32
10118 * cacheable predicates on this machine. We cannot allow any
10119 * more predicates to become cacheable: as unlikely as it is,
10120 * there may be a thread caching a (now stale) predicate cache
10121 * ID. (N.B.: the temptation is being successfully resisted to
10122 * have this cmn_err() "Holy shit -- we executed this code!")
10123 */
10124 return (pred);
10125 }
10126
10127 pred->dtp_cacheid = dtrace_predcache_id++;
10128
10129 return (pred);
10130}
10131
10132static void
10133dtrace_predicate_hold(dtrace_predicate_t *pred)
10134{
10135 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10136 ASSERT(pred->dtp_difo != NULL && pred->dtp_difo->dtdo_refcnt != 0);
10137 ASSERT(pred->dtp_refcnt > 0);
10138
10139 pred->dtp_refcnt++;
10140}
10141
10142static void
10143dtrace_predicate_release(dtrace_predicate_t *pred, dtrace_vstate_t *vstate)
10144{
10145 dtrace_difo_t *dp = pred->dtp_difo;
10146#pragma unused(dp) /* __APPLE__ */
10147
10148 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10149 ASSERT(dp != NULL && dp->dtdo_refcnt != 0);
10150 ASSERT(pred->dtp_refcnt > 0);
10151
10152 if (--pred->dtp_refcnt == 0) {
10153 dtrace_difo_release(pred->dtp_difo, vstate);
10154 kmem_free(pred, sizeof (dtrace_predicate_t));
10155 }
10156}
10157
10158/*
10159 * DTrace Action Description Functions
10160 */
10161static dtrace_actdesc_t *
10162dtrace_actdesc_create(dtrace_actkind_t kind, uint32_t ntuple,
10163 uint64_t uarg, uint64_t arg)
10164{
10165 dtrace_actdesc_t *act;
10166
10167 ASSERT(!DTRACEACT_ISPRINTFLIKE(kind) || (arg != 0 &&
10168 arg >= KERNELBASE) || (arg == 0 && kind == DTRACEACT_PRINTA));
10169
10170 act = kmem_zalloc(sizeof (dtrace_actdesc_t), KM_SLEEP);
10171 act->dtad_kind = kind;
10172 act->dtad_ntuple = ntuple;
10173 act->dtad_uarg = uarg;
10174 act->dtad_arg = arg;
10175 act->dtad_refcnt = 1;
10176
10177 return (act);
10178}
10179
10180static void
10181dtrace_actdesc_hold(dtrace_actdesc_t *act)
10182{
10183 ASSERT(act->dtad_refcnt >= 1);
10184 act->dtad_refcnt++;
10185}
10186
10187static void
10188dtrace_actdesc_release(dtrace_actdesc_t *act, dtrace_vstate_t *vstate)
10189{
10190 dtrace_actkind_t kind = act->dtad_kind;
10191 dtrace_difo_t *dp;
10192
10193 ASSERT(act->dtad_refcnt >= 1);
10194
10195 if (--act->dtad_refcnt != 0)
10196 return;
10197
10198 if ((dp = act->dtad_difo) != NULL)
10199 dtrace_difo_release(dp, vstate);
10200
10201 if (DTRACEACT_ISPRINTFLIKE(kind)) {
10202 char *str = (char *)(uintptr_t)act->dtad_arg;
10203
10204 ASSERT((str != NULL && (uintptr_t)str >= KERNELBASE) ||
10205 (str == NULL && act->dtad_kind == DTRACEACT_PRINTA));
10206
10207 if (str != NULL)
10208 kmem_free(str, strlen(str) + 1);
10209 }
10210
10211 kmem_free(act, sizeof (dtrace_actdesc_t));
10212}
10213
10214/*
10215 * DTrace ECB Functions
10216 */
10217static dtrace_ecb_t *
10218dtrace_ecb_add(dtrace_state_t *state, dtrace_probe_t *probe)
10219{
10220 dtrace_ecb_t *ecb;
10221 dtrace_epid_t epid;
10222
10223 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10224
10225 ecb = kmem_zalloc(sizeof (dtrace_ecb_t), KM_SLEEP);
10226 ecb->dte_predicate = NULL;
10227 ecb->dte_probe = probe;
10228
10229 /*
10230 * The default size is the size of the default action: recording
10231 * the header.
10232 */
10233 ecb->dte_size = ecb->dte_needed = sizeof (dtrace_rechdr_t);
10234 ecb->dte_alignment = sizeof (dtrace_epid_t);
10235
10236 epid = state->dts_epid++;
10237
10238 if (epid - 1 >= (dtrace_epid_t)state->dts_necbs) {
10239 dtrace_ecb_t **oecbs = state->dts_ecbs, **ecbs;
10240 int necbs = state->dts_necbs << 1;
10241
10242 ASSERT(epid == (dtrace_epid_t)state->dts_necbs + 1);
10243
10244 if (necbs == 0) {
10245 ASSERT(oecbs == NULL);
10246 necbs = 1;
10247 }
10248
10249 ecbs = kmem_zalloc(necbs * sizeof (*ecbs), KM_SLEEP);
10250
10251 if (oecbs != NULL)
10252 bcopy(oecbs, ecbs, state->dts_necbs * sizeof (*ecbs));
10253
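		/*
		 * The producer barriers below publish the (fully copied)
		 * array before the grown ECB count: probe context may index
		 * dts_ecbs without holding dtrace_lock, so it must never
		 * observe a dts_necbs that exceeds the published array.
		 */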
10254 dtrace_membar_producer();
10255 state->dts_ecbs = ecbs;
10256
10257 if (oecbs != NULL) {
10258 /*
10259 * If this state is active, we must dtrace_sync()
10260 * before we can free the old dts_ecbs array: we're
10261 * coming in hot, and there may be active ring
10262 * buffer processing (which indexes into the dts_ecbs
10263 * array) on another CPU.
10264 */
10265 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
10266 dtrace_sync();
10267
10268 kmem_free(oecbs, state->dts_necbs * sizeof (*ecbs));
10269 }
10270
10271 dtrace_membar_producer();
10272 state->dts_necbs = necbs;
10273 }
10274
10275 ecb->dte_state = state;
10276
10277 ASSERT(state->dts_ecbs[epid - 1] == NULL);
10278 dtrace_membar_producer();
10279 state->dts_ecbs[(ecb->dte_epid = epid) - 1] = ecb;
10280
10281 return (ecb);
10282}
10283
10284static int
10285dtrace_ecb_enable(dtrace_ecb_t *ecb)
10286{
10287 dtrace_probe_t *probe = ecb->dte_probe;
10288
10289 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
10290 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10291 ASSERT(ecb->dte_next == NULL);
10292
10293 if (probe == NULL) {
10294 /*
10295 * This is the NULL probe -- there's nothing to do.
10296 */
		return (0);
10298 }
10299
10300 probe->dtpr_provider->dtpv_ecb_count++;
10301 if (probe->dtpr_ecb == NULL) {
10302 dtrace_provider_t *prov = probe->dtpr_provider;
10303
10304 /*
10305 * We're the first ECB on this probe.
10306 */
10307 probe->dtpr_ecb = probe->dtpr_ecb_last = ecb;
10308
10309 if (ecb->dte_predicate != NULL)
10310 probe->dtpr_predcache = ecb->dte_predicate->dtp_cacheid;
10311
10312 return (prov->dtpv_pops.dtps_enable(prov->dtpv_arg,
10313 probe->dtpr_id, probe->dtpr_arg));
10314 } else {
10315 /*
10316 * This probe is already active. Swing the last pointer to
10317 * point to the new ECB, and issue a dtrace_sync() to assure
10318 * that all CPUs have seen the change.
10319 */
10320 ASSERT(probe->dtpr_ecb_last != NULL);
10321 probe->dtpr_ecb_last->dte_next = ecb;
10322 probe->dtpr_ecb_last = ecb;
10323 probe->dtpr_predcache = 0;
10324
10325 dtrace_sync();
		return (0);
10327 }
10328}
10329
10330static int
10331dtrace_ecb_resize(dtrace_ecb_t *ecb)
10332{
10333 dtrace_action_t *act;
10334 uint32_t curneeded = UINT32_MAX;
10335 uint32_t aggbase = UINT32_MAX;
10336
10337 /*
10338 * If we record anything, we always record the dtrace_rechdr_t. (And
10339 * we always record it first.)
10340 */
10341 ecb->dte_size = sizeof (dtrace_rechdr_t);
10342 ecb->dte_alignment = sizeof (dtrace_epid_t);
10343
10344 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
10345 dtrace_recdesc_t *rec = &act->dta_rec;
10346 ASSERT(rec->dtrd_size > 0 || rec->dtrd_alignment == 1);
10347
10348 ecb->dte_alignment = MAX(ecb->dte_alignment, rec->dtrd_alignment);
10349
10350 if (DTRACEACT_ISAGG(act->dta_kind)) {
10351 dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
10352
10353 ASSERT(rec->dtrd_size != 0);
10354 ASSERT(agg->dtag_first != NULL);
10355 ASSERT(act->dta_prev->dta_intuple);
10356 ASSERT(aggbase != UINT32_MAX);
10357 ASSERT(curneeded != UINT32_MAX);
10358
10359 agg->dtag_base = aggbase;
10360 curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
10361 rec->dtrd_offset = curneeded;
10362 if (curneeded + rec->dtrd_size < curneeded)
10363 return (EINVAL);
10364 curneeded += rec->dtrd_size;
10365 ecb->dte_needed = MAX(ecb->dte_needed, curneeded);
10366
10367 aggbase = UINT32_MAX;
10368 curneeded = UINT32_MAX;
10369 } else if (act->dta_intuple) {
10370 if (curneeded == UINT32_MAX) {
10371 /*
10372 * This is the first record in a tuple. Align
10373 * curneeded to be at offset 4 in an 8-byte
10374 * aligned block.
10375 */
10376 ASSERT(act->dta_prev == NULL || !act->dta_prev->dta_intuple);
10377 ASSERT(aggbase == UINT32_MAX);
10378
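				/*
				 * A worked example, assuming a four-byte
				 * dtrace_aggid_t: a dte_size of 24 is
				 * phased up to 28 (offset 4 within an
				 * 8-byte aligned block), leaving an 8-byte
				 * aligned aggbase of 24 in which to store
				 * the aggregation ID.
				 */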
10379 curneeded = P2PHASEUP(ecb->dte_size,
10380 sizeof (uint64_t), sizeof (dtrace_aggid_t));
10381
10382 aggbase = curneeded - sizeof (dtrace_aggid_t);
10383 ASSERT(IS_P2ALIGNED(aggbase,
10384 sizeof (uint64_t)));
10385 }
10386
			curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
			rec->dtrd_offset = curneeded;
			if (curneeded + rec->dtrd_size < curneeded)
				return (EINVAL);
			curneeded += rec->dtrd_size;
10392 } else {
10393 /* tuples must be followed by an aggregation */
10394 ASSERT(act->dta_prev == NULL || !act->dta_prev->dta_intuple);
10395 ecb->dte_size = P2ROUNDUP(ecb->dte_size, rec->dtrd_alignment);
10396 rec->dtrd_offset = ecb->dte_size;
10397 if (ecb->dte_size + rec->dtrd_size < ecb->dte_size)
10398 return (EINVAL);
10399 ecb->dte_size += rec->dtrd_size;
10400 ecb->dte_needed = MAX(ecb->dte_needed, ecb->dte_size);
10401 }
10402 }
10403
10404 if ((act = ecb->dte_action) != NULL &&
10405 !(act->dta_kind == DTRACEACT_SPECULATE && act->dta_next == NULL) &&
10406 ecb->dte_size == sizeof (dtrace_rechdr_t)) {
10407 /*
10408 * If the size is still sizeof (dtrace_rechdr_t), then all
10409 * actions store no data; set the size to 0.
10410 */
10411 ecb->dte_size = 0;
10412 }
10413
10414 ecb->dte_size = P2ROUNDUP(ecb->dte_size, sizeof (dtrace_epid_t));
10415 ecb->dte_needed = P2ROUNDUP(ecb->dte_needed, (sizeof (dtrace_epid_t)));
10416 ecb->dte_state->dts_needed = MAX(ecb->dte_state->dts_needed, ecb->dte_needed);
10417 return (0);
10418}
10419
10420static dtrace_action_t *
10421dtrace_ecb_aggregation_create(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
10422{
10423 dtrace_aggregation_t *agg;
10424 size_t size = sizeof (uint64_t);
10425 int ntuple = desc->dtad_ntuple;
10426 dtrace_action_t *act;
10427 dtrace_recdesc_t *frec;
10428 dtrace_aggid_t aggid;
10429 dtrace_state_t *state = ecb->dte_state;
10430
10431 agg = kmem_zalloc(sizeof (dtrace_aggregation_t), KM_SLEEP);
10432 agg->dtag_ecb = ecb;
10433
10434 ASSERT(DTRACEACT_ISAGG(desc->dtad_kind));
10435
10436 switch (desc->dtad_kind) {
10437 case DTRACEAGG_MIN:
10438 agg->dtag_initial = INT64_MAX;
10439 agg->dtag_aggregate = dtrace_aggregate_min;
10440 break;
10441
10442 case DTRACEAGG_MAX:
10443 agg->dtag_initial = INT64_MIN;
10444 agg->dtag_aggregate = dtrace_aggregate_max;
10445 break;
10446
10447 case DTRACEAGG_COUNT:
10448 agg->dtag_aggregate = dtrace_aggregate_count;
10449 break;
10450
10451 case DTRACEAGG_QUANTIZE:
10452 agg->dtag_aggregate = dtrace_aggregate_quantize;
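		/*
		 * With 64-bit values there are 63 power-of-two buckets in
		 * each direction plus one for zero: 127 buckets, or 1016
		 * bytes, per quantize() aggregation.
		 */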
10453 size = (((sizeof (uint64_t) * NBBY) - 1) * 2 + 1) *
10454 sizeof (uint64_t);
10455 break;
10456
10457 case DTRACEAGG_LQUANTIZE: {
10458 uint16_t step = DTRACE_LQUANTIZE_STEP(desc->dtad_arg);
10459 uint16_t levels = DTRACE_LQUANTIZE_LEVELS(desc->dtad_arg);
10460
10461 agg->dtag_initial = desc->dtad_arg;
10462 agg->dtag_aggregate = dtrace_aggregate_lquantize;
10463
10464 if (step == 0 || levels == 0)
10465 goto err;
10466
10467 size = levels * sizeof (uint64_t) + 3 * sizeof (uint64_t);
10468 break;
10469 }
10470
10471 case DTRACEAGG_LLQUANTIZE: {
10472 uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(desc->dtad_arg);
10473 uint16_t low = DTRACE_LLQUANTIZE_LOW(desc->dtad_arg);
10474 uint16_t high = DTRACE_LLQUANTIZE_HIGH(desc->dtad_arg);
10475 uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(desc->dtad_arg);
10476 int64_t v;
10477
10478 agg->dtag_initial = desc->dtad_arg;
10479 agg->dtag_aggregate = dtrace_aggregate_llquantize;
10480
10481 if (factor < 2 || low >= high || nsteps < factor)
10482 goto err;
10483
10484 /*
10485 * Now check that the number of steps evenly divides a power
10486 * of the factor. (This assures both integer bucket size and
10487 * linearity within each magnitude.)
10488 */
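		/*
		 * For example, a factor of 10 with 20 steps passes (100 is
		 * divisible by 20, and 20 by 10), whereas a factor of 10
		 * with 30 steps is rejected: 100 % 30 != 0, so its buckets
		 * could not all be integral.
		 */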
10489 for (v = factor; v < nsteps; v *= factor)
10490 continue;
10491
10492 if ((v % nsteps) || (nsteps % factor))
10493 goto err;
10494
10495 size = (dtrace_aggregate_llquantize_bucket(factor, low, high, nsteps, INT64_MAX) + 2) * sizeof (uint64_t);
10496 break;
10497 }
10498
10499 case DTRACEAGG_AVG:
10500 agg->dtag_aggregate = dtrace_aggregate_avg;
10501 size = sizeof (uint64_t) * 2;
10502 break;
10503
10504 case DTRACEAGG_STDDEV:
10505 agg->dtag_aggregate = dtrace_aggregate_stddev;
10506 size = sizeof (uint64_t) * 4;
10507 break;
10508
10509 case DTRACEAGG_SUM:
10510 agg->dtag_aggregate = dtrace_aggregate_sum;
10511 break;
10512
10513 default:
10514 goto err;
10515 }
10516
10517 agg->dtag_action.dta_rec.dtrd_size = size;
10518
10519 if (ntuple == 0)
10520 goto err;
10521
10522 /*
10523 * We must make sure that we have enough actions for the n-tuple.
10524 */
10525 for (act = ecb->dte_action_last; act != NULL; act = act->dta_prev) {
10526 if (DTRACEACT_ISAGG(act->dta_kind))
10527 break;
10528
10529 if (--ntuple == 0) {
10530 /*
10531 * This is the action with which our n-tuple begins.
10532 */
10533 agg->dtag_first = act;
10534 goto success;
10535 }
10536 }
10537
10538 /*
10539 * This n-tuple is short by ntuple elements. Return failure.
10540 */
10541 ASSERT(ntuple != 0);
10542err:
10543 kmem_free(agg, sizeof (dtrace_aggregation_t));
10544 return (NULL);
10545
10546success:
10547 /*
10548 * If the last action in the tuple has a size of zero, it's actually
10549 * an expression argument for the aggregating action.
10550 */
10551 ASSERT(ecb->dte_action_last != NULL);
10552 act = ecb->dte_action_last;
10553
10554 if (act->dta_kind == DTRACEACT_DIFEXPR) {
10555 ASSERT(act->dta_difo != NULL);
10556
10557 if (act->dta_difo->dtdo_rtype.dtdt_size == 0)
10558 agg->dtag_hasarg = 1;
10559 }
10560
10561 /*
10562 * We need to allocate an id for this aggregation.
10563 */
10564 aggid = (dtrace_aggid_t)(uintptr_t)vmem_alloc(state->dts_aggid_arena, 1,
10565 VM_BESTFIT | VM_SLEEP);
10566
10567 if (aggid - 1 >= (dtrace_aggid_t)state->dts_naggregations) {
10568 dtrace_aggregation_t **oaggs = state->dts_aggregations;
10569 dtrace_aggregation_t **aggs;
10570 int naggs = state->dts_naggregations << 1;
10571 int onaggs = state->dts_naggregations;
10572
10573 ASSERT(aggid == (dtrace_aggid_t)state->dts_naggregations + 1);
10574
10575 if (naggs == 0) {
10576 ASSERT(oaggs == NULL);
10577 naggs = 1;
10578 }
10579
10580 aggs = kmem_zalloc(naggs * sizeof (*aggs), KM_SLEEP);
10581
10582 if (oaggs != NULL) {
10583 bcopy(oaggs, aggs, onaggs * sizeof (*aggs));
10584 kmem_free(oaggs, onaggs * sizeof (*aggs));
10585 }
10586
10587 state->dts_aggregations = aggs;
10588 state->dts_naggregations = naggs;
10589 }
10590
10591 ASSERT(state->dts_aggregations[aggid - 1] == NULL);
10592 state->dts_aggregations[(agg->dtag_id = aggid) - 1] = agg;
10593
10594 frec = &agg->dtag_first->dta_rec;
10595 if (frec->dtrd_alignment < sizeof (dtrace_aggid_t))
10596 frec->dtrd_alignment = sizeof (dtrace_aggid_t);
10597
10598 for (act = agg->dtag_first; act != NULL; act = act->dta_next) {
10599 ASSERT(!act->dta_intuple);
10600 act->dta_intuple = 1;
10601 }
10602
10603 return (&agg->dtag_action);
10604}
10605
10606static void
10607dtrace_ecb_aggregation_destroy(dtrace_ecb_t *ecb, dtrace_action_t *act)
10608{
10609 dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
10610 dtrace_state_t *state = ecb->dte_state;
10611 dtrace_aggid_t aggid = agg->dtag_id;
10612
10613 ASSERT(DTRACEACT_ISAGG(act->dta_kind));
10614 vmem_free(state->dts_aggid_arena, (void *)(uintptr_t)aggid, 1);
10615
10616 ASSERT(state->dts_aggregations[aggid - 1] == agg);
10617 state->dts_aggregations[aggid - 1] = NULL;
10618
10619 kmem_free(agg, sizeof (dtrace_aggregation_t));
10620}
10621
10622static int
10623dtrace_ecb_action_add(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
10624{
10625 dtrace_action_t *action, *last;
10626 dtrace_difo_t *dp = desc->dtad_difo;
10627 uint32_t size = 0, align = sizeof (uint8_t), mask;
10628 uint16_t format = 0;
10629 dtrace_recdesc_t *rec;
10630 dtrace_state_t *state = ecb->dte_state;
10631 dtrace_optval_t *opt = state->dts_options;
	dtrace_optval_t nframes = 0, strsize;
10633 uint64_t arg = desc->dtad_arg;
10634
10635 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10636 ASSERT(ecb->dte_action == NULL || ecb->dte_action->dta_refcnt == 1);
10637
10638 if (DTRACEACT_ISAGG(desc->dtad_kind)) {
10639 /*
10640 * If this is an aggregating action, there must be neither
10641 * a speculate nor a commit on the action chain.
10642 */
10643 dtrace_action_t *act;
10644
10645 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
10646 if (act->dta_kind == DTRACEACT_COMMIT)
10647 return (EINVAL);
10648
10649 if (act->dta_kind == DTRACEACT_SPECULATE)
10650 return (EINVAL);
10651 }
10652
10653 action = dtrace_ecb_aggregation_create(ecb, desc);
10654
10655 if (action == NULL)
10656 return (EINVAL);
10657 } else {
10658 if (DTRACEACT_ISDESTRUCTIVE(desc->dtad_kind) ||
10659 (desc->dtad_kind == DTRACEACT_DIFEXPR &&
10660 dp != NULL && dp->dtdo_destructive)) {
10661 state->dts_destructive = 1;
10662 }
10663
10664 switch (desc->dtad_kind) {
10665 case DTRACEACT_PRINTF:
10666 case DTRACEACT_PRINTA:
10667 case DTRACEACT_SYSTEM:
10668 case DTRACEACT_FREOPEN:
10669 case DTRACEACT_DIFEXPR:
10670 /*
10671 * We know that our arg is a string -- turn it into a
10672 * format.
10673 */
10674 if (arg == 0) {
10675 ASSERT(desc->dtad_kind == DTRACEACT_PRINTA ||
10676 desc->dtad_kind == DTRACEACT_DIFEXPR);
10677 format = 0;
10678 } else {
10679 ASSERT(arg != 0);
10680 ASSERT(arg > KERNELBASE);
10681 format = dtrace_format_add(state,
10682 (char *)(uintptr_t)arg);
10683 }
10684
10685 /*FALLTHROUGH*/
10686 case DTRACEACT_LIBACT:
10687 case DTRACEACT_TRACEMEM:
10688 case DTRACEACT_TRACEMEM_DYNSIZE:
10689 case DTRACEACT_APPLEBINARY: /* __APPLE__ */
10690 if (dp == NULL)
10691 return (EINVAL);
10692
10693 if ((size = dp->dtdo_rtype.dtdt_size) != 0)
10694 break;
10695
10696 if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
10697 if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
10698 return (EINVAL);
10699
10700 size = opt[DTRACEOPT_STRSIZE];
10701 }
10702
10703 break;
10704
10705 case DTRACEACT_STACK:
10706 if ((nframes = arg) == 0) {
10707 nframes = opt[DTRACEOPT_STACKFRAMES];
10708 ASSERT(nframes > 0);
10709 arg = nframes;
10710 }
10711
10712 size = nframes * sizeof (pc_t);
10713 break;
10714
10715 case DTRACEACT_JSTACK:
10716 if ((strsize = DTRACE_USTACK_STRSIZE(arg)) == 0)
10717 strsize = opt[DTRACEOPT_JSTACKSTRSIZE];
10718
10719 if ((nframes = DTRACE_USTACK_NFRAMES(arg)) == 0)
10720 nframes = opt[DTRACEOPT_JSTACKFRAMES];
10721
10722 arg = DTRACE_USTACK_ARG(nframes, strsize);
10723
10724 /*FALLTHROUGH*/
10725 case DTRACEACT_USTACK:
10726 if (desc->dtad_kind != DTRACEACT_JSTACK &&
10727 (nframes = DTRACE_USTACK_NFRAMES(arg)) == 0) {
10728 strsize = DTRACE_USTACK_STRSIZE(arg);
10729 nframes = opt[DTRACEOPT_USTACKFRAMES];
10730 ASSERT(nframes > 0);
10731 arg = DTRACE_USTACK_ARG(nframes, strsize);
10732 }
10733
10734 /*
10735 * Save a slot for the pid.
10736 */
10737 size = (nframes + 1) * sizeof (uint64_t);
10738 size += DTRACE_USTACK_STRSIZE(arg);
10739 size = P2ROUNDUP(size, (uint32_t)(sizeof (uintptr_t)));
10740
10741 break;
10742
10743 case DTRACEACT_SYM:
10744 case DTRACEACT_MOD:
10745 if (dp == NULL || ((size = dp->dtdo_rtype.dtdt_size) !=
10746 sizeof (uint64_t)) ||
10747 (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
10748 return (EINVAL);
10749 break;
10750
10751 case DTRACEACT_USYM:
10752 case DTRACEACT_UMOD:
10753 case DTRACEACT_UADDR:
10754 if (dp == NULL ||
10755 (dp->dtdo_rtype.dtdt_size != sizeof (uint64_t)) ||
10756 (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
10757 return (EINVAL);
10758
10759 /*
10760 * We have a slot for the pid, plus a slot for the
10761 * argument. To keep things simple (aligned with
10762 * bitness-neutral sizing), we store each as a 64-bit
10763 * quantity.
10764 */
10765 size = 2 * sizeof (uint64_t);
10766 break;
10767
10768 case DTRACEACT_STOP:
10769 case DTRACEACT_BREAKPOINT:
10770 case DTRACEACT_PANIC:
10771 break;
10772
10773 case DTRACEACT_CHILL:
10774 case DTRACEACT_DISCARD:
10775 case DTRACEACT_RAISE:
10776 case DTRACEACT_PIDRESUME: /* __APPLE__ */
10777 if (dp == NULL)
10778 return (EINVAL);
10779 break;
10780
10781 case DTRACEACT_EXIT:
10782 if (dp == NULL ||
10783 (size = dp->dtdo_rtype.dtdt_size) != sizeof (int) ||
10784 (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
10785 return (EINVAL);
10786 break;
10787
10788 case DTRACEACT_SPECULATE:
10789 if (ecb->dte_size > sizeof (dtrace_rechdr_t))
10790 return (EINVAL);
10791
10792 if (dp == NULL)
10793 return (EINVAL);
10794
10795 state->dts_speculates = 1;
10796 break;
10797
10798 case DTRACEACT_COMMIT: {
10799 dtrace_action_t *act = ecb->dte_action;
10800
10801 for (; act != NULL; act = act->dta_next) {
10802 if (act->dta_kind == DTRACEACT_COMMIT)
10803 return (EINVAL);
10804 }
10805
10806 if (dp == NULL)
10807 return (EINVAL);
10808 break;
10809 }
10810
10811 default:
10812 return (EINVAL);
10813 }
10814
10815 if (size != 0 || desc->dtad_kind == DTRACEACT_SPECULATE) {
10816 /*
10817 * If this is a data-storing action or a speculate,
10818 * we must be sure that there isn't a commit on the
10819 * action chain.
10820 */
10821 dtrace_action_t *act = ecb->dte_action;
10822
10823 for (; act != NULL; act = act->dta_next) {
10824 if (act->dta_kind == DTRACEACT_COMMIT)
10825 return (EINVAL);
10826 }
10827 }
10828
10829 action = kmem_zalloc(sizeof (dtrace_action_t), KM_SLEEP);
10830 action->dta_rec.dtrd_size = size;
10831 }
10832
10833 action->dta_refcnt = 1;
10834 rec = &action->dta_rec;
10835 size = rec->dtrd_size;
10836
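	/*
	 * Derive the record alignment from its size: the largest power of
	 * two (capped at eight) that divides the size. A 16-byte record is
	 * thus 8-byte aligned, a 12-byte record 4-byte aligned, and an
	 * odd-sized record byte-aligned.
	 */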
10837 for (mask = sizeof (uint64_t) - 1; size != 0 && mask > 0; mask >>= 1) {
10838 if (!(size & mask)) {
10839 align = mask + 1;
10840 break;
10841 }
10842 }
10843
10844 action->dta_kind = desc->dtad_kind;
10845
10846 if ((action->dta_difo = dp) != NULL)
10847 dtrace_difo_hold(dp);
10848
10849 rec->dtrd_action = action->dta_kind;
10850 rec->dtrd_arg = arg;
10851 rec->dtrd_uarg = desc->dtad_uarg;
10852 rec->dtrd_alignment = (uint16_t)align;
10853 rec->dtrd_format = format;
10854
10855 if ((last = ecb->dte_action_last) != NULL) {
10856 ASSERT(ecb->dte_action != NULL);
10857 action->dta_prev = last;
10858 last->dta_next = action;
10859 } else {
10860 ASSERT(ecb->dte_action == NULL);
10861 ecb->dte_action = action;
10862 }
10863
10864 ecb->dte_action_last = action;
10865
10866 return (0);
10867}
10868
10869static void
10870dtrace_ecb_action_remove(dtrace_ecb_t *ecb)
10871{
10872 dtrace_action_t *act = ecb->dte_action, *next;
10873 dtrace_vstate_t *vstate = &ecb->dte_state->dts_vstate;
10874 dtrace_difo_t *dp;
10875 uint16_t format;
10876
10877 if (act != NULL && act->dta_refcnt > 1) {
10878 ASSERT(act->dta_next == NULL || act->dta_next->dta_refcnt == 1);
10879 act->dta_refcnt--;
10880 } else {
10881 for (; act != NULL; act = next) {
10882 next = act->dta_next;
10883 ASSERT(next != NULL || act == ecb->dte_action_last);
10884 ASSERT(act->dta_refcnt == 1);
10885
10886 if ((format = act->dta_rec.dtrd_format) != 0)
10887 dtrace_format_remove(ecb->dte_state, format);
10888
10889 if ((dp = act->dta_difo) != NULL)
10890 dtrace_difo_release(dp, vstate);
10891
10892 if (DTRACEACT_ISAGG(act->dta_kind)) {
10893 dtrace_ecb_aggregation_destroy(ecb, act);
10894 } else {
10895 kmem_free(act, sizeof (dtrace_action_t));
10896 }
10897 }
10898 }
10899
10900 ecb->dte_action = NULL;
10901 ecb->dte_action_last = NULL;
10902 ecb->dte_size = 0;
10903}
10904
10905static void
10906dtrace_ecb_disable(dtrace_ecb_t *ecb)
10907{
10908 /*
10909 * We disable the ECB by removing it from its probe.
10910 */
10911 dtrace_ecb_t *pecb, *prev = NULL;
10912 dtrace_probe_t *probe = ecb->dte_probe;
10913
10914 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10915
10916 if (probe == NULL) {
10917 /*
10918 * This is the NULL probe; there is nothing to disable.
10919 */
10920 return;
10921 }
10922
10923 for (pecb = probe->dtpr_ecb; pecb != NULL; pecb = pecb->dte_next) {
10924 if (pecb == ecb)
10925 break;
10926 prev = pecb;
10927 }
10928
10929 ASSERT(pecb != NULL);
10930
10931 if (prev == NULL) {
10932 probe->dtpr_ecb = ecb->dte_next;
10933 } else {
10934 prev->dte_next = ecb->dte_next;
10935 }
10936
10937 if (ecb == probe->dtpr_ecb_last) {
10938 ASSERT(ecb->dte_next == NULL);
10939 probe->dtpr_ecb_last = prev;
10940 }
10941
10942 probe->dtpr_provider->dtpv_ecb_count--;
10943 /*
10944 * The ECB has been disconnected from the probe; now sync to assure
10945 * that all CPUs have seen the change before returning.
10946 */
10947 dtrace_sync();
10948
10949 if (probe->dtpr_ecb == NULL) {
10950 /*
10951 * That was the last ECB on the probe; clear the predicate
10952 * cache ID for the probe, disable it and sync one more time
10953 * to assure that we'll never hit it again.
10954 */
10955 dtrace_provider_t *prov = probe->dtpr_provider;
10956
10957 ASSERT(ecb->dte_next == NULL);
10958 ASSERT(probe->dtpr_ecb_last == NULL);
10959 probe->dtpr_predcache = DTRACE_CACHEIDNONE;
10960 prov->dtpv_pops.dtps_disable(prov->dtpv_arg,
10961 probe->dtpr_id, probe->dtpr_arg);
10962 dtrace_sync();
10963 } else {
10964 /*
10965 * There is at least one ECB remaining on the probe. If there
10966 * is _exactly_ one, set the probe's predicate cache ID to be
10967 * the predicate cache ID of the remaining ECB.
10968 */
10969 ASSERT(probe->dtpr_ecb_last != NULL);
10970 ASSERT(probe->dtpr_predcache == DTRACE_CACHEIDNONE);
10971
10972 if (probe->dtpr_ecb == probe->dtpr_ecb_last) {
10973 dtrace_predicate_t *p = probe->dtpr_ecb->dte_predicate;
10974
10975 ASSERT(probe->dtpr_ecb->dte_next == NULL);
10976
10977 if (p != NULL)
10978 probe->dtpr_predcache = p->dtp_cacheid;
10979 }
10980
10981 ecb->dte_next = NULL;
10982 }
10983}
10984
10985static void
10986dtrace_ecb_destroy(dtrace_ecb_t *ecb)
10987{
10988 dtrace_state_t *state = ecb->dte_state;
10989 dtrace_vstate_t *vstate = &state->dts_vstate;
10990 dtrace_predicate_t *pred;
10991 dtrace_epid_t epid = ecb->dte_epid;
10992
10993 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10994 ASSERT(ecb->dte_next == NULL);
10995 ASSERT(ecb->dte_probe == NULL || ecb->dte_probe->dtpr_ecb != ecb);
10996
10997 if ((pred = ecb->dte_predicate) != NULL)
10998 dtrace_predicate_release(pred, vstate);
10999
11000 dtrace_ecb_action_remove(ecb);
11001
11002 ASSERT(state->dts_ecbs[epid - 1] == ecb);
11003 state->dts_ecbs[epid - 1] = NULL;
11004
11005 kmem_free(ecb, sizeof (dtrace_ecb_t));
11006}
11007
11008static dtrace_ecb_t *
11009dtrace_ecb_create(dtrace_state_t *state, dtrace_probe_t *probe,
11010 dtrace_enabling_t *enab)
11011{
11012 dtrace_ecb_t *ecb;
11013 dtrace_predicate_t *pred;
11014 dtrace_actdesc_t *act;
11015 dtrace_provider_t *prov;
11016 dtrace_ecbdesc_t *desc = enab->dten_current;
11017
11018 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11019 ASSERT(state != NULL);
11020
11021 ecb = dtrace_ecb_add(state, probe);
11022 ecb->dte_uarg = desc->dted_uarg;
11023
11024 if ((pred = desc->dted_pred.dtpdd_predicate) != NULL) {
11025 dtrace_predicate_hold(pred);
11026 ecb->dte_predicate = pred;
11027 }
11028
11029 if (probe != NULL) {
11030 /*
11031 * If the provider shows more leg than the consumer is old
11032 * enough to see, we need to enable the appropriate implicit
11033 * predicate bits to prevent the ecb from activating at
11034 * revealing times.
11035 *
11036 * Providers specifying DTRACE_PRIV_USER at register time
11037 * are stating that they need the /proc-style privilege
11038 * model to be enforced, and this is what DTRACE_COND_OWNER
11039 * and DTRACE_COND_ZONEOWNER will then do at probe time.
11040 */
11041 prov = probe->dtpr_provider;
11042 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLPROC) &&
11043 (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
11044 ecb->dte_cond |= DTRACE_COND_OWNER;
11045
11046 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLZONE) &&
11047 (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
11048 ecb->dte_cond |= DTRACE_COND_ZONEOWNER;
11049
11050 /*
11051 * If the provider shows us kernel innards and the user
11052 * is lacking sufficient privilege, enable the
11053 * DTRACE_COND_USERMODE implicit predicate.
11054 */
11055 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) &&
11056 (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_KERNEL))
11057 ecb->dte_cond |= DTRACE_COND_USERMODE;
11058 }
11059
11060 if (dtrace_ecb_create_cache != NULL) {
11061 /*
11062 * If we have a cached ecb, we'll use its action list instead
11063 * of creating our own (saving both time and space).
11064 */
11065 dtrace_ecb_t *cached = dtrace_ecb_create_cache;
11066 dtrace_action_t *act_if = cached->dte_action;
11067
11068 if (act_if != NULL) {
11069 ASSERT(act_if->dta_refcnt > 0);
11070 act_if->dta_refcnt++;
11071 ecb->dte_action = act_if;
11072 ecb->dte_action_last = cached->dte_action_last;
11073 ecb->dte_needed = cached->dte_needed;
11074 ecb->dte_size = cached->dte_size;
11075 ecb->dte_alignment = cached->dte_alignment;
11076 }
11077
11078 return (ecb);
11079 }
11080
11081 for (act = desc->dted_action; act != NULL; act = act->dtad_next) {
11082 if ((enab->dten_error = dtrace_ecb_action_add(ecb, act)) != 0) {
11083 dtrace_ecb_destroy(ecb);
11084 return (NULL);
11085 }
11086 }
11087
11088 if ((enab->dten_error = dtrace_ecb_resize(ecb)) != 0) {
11089 dtrace_ecb_destroy(ecb);
11090 return (NULL);
11091 }
11092
11093 return (dtrace_ecb_create_cache = ecb);
11094}
11095
11096static int
11097dtrace_ecb_create_enable(dtrace_probe_t *probe, void *arg1, void *arg2)
11098{
11099 dtrace_ecb_t *ecb;
11100 dtrace_enabling_t *enab = arg1;
11101 dtrace_ecbdesc_t *ep = arg2;
11102 dtrace_state_t *state = enab->dten_vstate->dtvs_state;
11103
11104 ASSERT(state != NULL);
11105
11106 if (probe != NULL && ep != NULL && probe->dtpr_gen < ep->dted_probegen) {
11107 /*
11108 * This probe was created in a generation for which this
11109 * enabling has previously created ECBs; we don't want to
11110 * enable it again, so just kick out.
11111 */
11112 return (DTRACE_MATCH_NEXT);
11113 }
11114
11115 if ((ecb = dtrace_ecb_create(state, probe, enab)) == NULL)
11116 return (DTRACE_MATCH_DONE);
11117
11118 if (dtrace_ecb_enable(ecb) < 0)
11119 return (DTRACE_MATCH_FAIL);
11120
11121 return (DTRACE_MATCH_NEXT);
11122}
11123
11124static dtrace_ecb_t *
11125dtrace_epid2ecb(dtrace_state_t *state, dtrace_epid_t id)
11126{
11127 dtrace_ecb_t *ecb;
11128#pragma unused(ecb) /* __APPLE__ */
11129
11130 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11131
11132 if (id == 0 || id > (dtrace_epid_t)state->dts_necbs)
11133 return (NULL);
11134
11135 ASSERT(state->dts_necbs > 0 && state->dts_ecbs != NULL);
11136 ASSERT((ecb = state->dts_ecbs[id - 1]) == NULL || ecb->dte_epid == id);
11137
11138 return (state->dts_ecbs[id - 1]);
11139}
11140
11141static dtrace_aggregation_t *
11142dtrace_aggid2agg(dtrace_state_t *state, dtrace_aggid_t id)
11143{
11144 dtrace_aggregation_t *agg;
11145#pragma unused(agg) /* __APPLE__ */
11146
11147 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11148
11149 if (id == 0 || id > (dtrace_aggid_t)state->dts_naggregations)
11150 return (NULL);
11151
11152 ASSERT(state->dts_naggregations > 0 && state->dts_aggregations != NULL);
11153 ASSERT((agg = state->dts_aggregations[id - 1]) == NULL ||
11154 agg->dtag_id == id);
11155
11156 return (state->dts_aggregations[id - 1]);
11157}
11158
11159/*
11160 * DTrace Buffer Functions
11161 *
11162 * The following functions manipulate DTrace buffers. Most of these functions
11163 * are called in the context of establishing or processing consumer state;
11164 * exceptions are explicitly noted.
11165 */
11166
11167/*
11168 * Note: called from cross call context. This function switches the two
11169 * buffers on a given CPU. The atomicity of this operation is assured by
11170 * disabling interrupts while the actual switch takes place; the disabling of
11171 * interrupts serializes the execution with any execution of dtrace_probe() on
11172 * the same CPU.
11173 */
11174static void
11175dtrace_buffer_switch(dtrace_buffer_t *buf)
11176{
11177 caddr_t tomax = buf->dtb_tomax;
11178 caddr_t xamot = buf->dtb_xamot;
11179 dtrace_icookie_t cookie;
11180 hrtime_t now;
11181
11182 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
11183 ASSERT(!(buf->dtb_flags & DTRACEBUF_RING));
11184
11185 cookie = dtrace_interrupt_disable();
11186 now = dtrace_gethrtime();
11187 buf->dtb_tomax = xamot;
11188 buf->dtb_xamot = tomax;
11189 buf->dtb_xamot_drops = buf->dtb_drops;
11190 buf->dtb_xamot_offset = buf->dtb_offset;
11191 buf->dtb_xamot_errors = buf->dtb_errors;
11192 buf->dtb_xamot_flags = buf->dtb_flags;
11193 buf->dtb_offset = 0;
11194 buf->dtb_drops = 0;
11195 buf->dtb_errors = 0;
11196 buf->dtb_flags &= ~(DTRACEBUF_ERROR | DTRACEBUF_DROPPED);
11197 buf->dtb_interval = now - buf->dtb_switched;
11198 buf->dtb_switched = now;
11199 buf->dtb_cur_limit = buf->dtb_limit;
11200
11201 dtrace_interrupt_enable(cookie);
11202}
11203
11204/*
11205 * Note: called from cross call context. This function activates a buffer
11206 * on a CPU. As with dtrace_buffer_switch(), the atomicity of the operation
11207 * is guaranteed by the disabling of interrupts.
11208 */
11209static void
11210dtrace_buffer_activate(dtrace_state_t *state)
11211{
11212 dtrace_buffer_t *buf;
11213 dtrace_icookie_t cookie = dtrace_interrupt_disable();
11214
11215 buf = &state->dts_buffer[CPU->cpu_id];
11216
11217 if (buf->dtb_tomax != NULL) {
11218 /*
11219 * We might like to assert that the buffer is marked inactive,
		 * but this isn't necessarily true: the CPU that processes
		 * the BEGIN probe has its buffer activated manually. In
		 * this case, we take the (harmless) action of re-clearing
		 * the INACTIVE bit.
11224 */
11225 buf->dtb_flags &= ~DTRACEBUF_INACTIVE;
11226 }
11227
11228 dtrace_interrupt_enable(cookie);
11229}
11230
11231static int
11232dtrace_buffer_canalloc(size_t size)
11233{
11234 if (size > (UINT64_MAX - dtrace_buffer_memory_inuse))
11235 return (B_FALSE);
11236 if ((size + dtrace_buffer_memory_inuse) > dtrace_buffer_memory_maxsize)
11237 return (B_FALSE);
11238
11239 return (B_TRUE);
11240}
11241
11242static int
11243dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t limit, size_t size, int flags,
11244 processorid_t cpu)
11245{
11246 dtrace_cpu_t *cp;
11247 dtrace_buffer_t *buf;
11248 size_t size_before_alloc = dtrace_buffer_memory_inuse;
11249
11250 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
11251 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11252
11253 if (size > (size_t)dtrace_nonroot_maxsize &&
11254 !PRIV_POLICY_CHOICE(CRED(), PRIV_ALL, B_FALSE))
11255 return (EFBIG);
11256
11257 cp = cpu_list;
11258
11259 do {
11260 if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
11261 continue;
11262
11263 buf = &bufs[cp->cpu_id];
11264
11265 /*
11266 * If there is already a buffer allocated for this CPU, it
11267 * is only possible that this is a DR event. In this case,
11268 * the buffer size must match our specified size.
11269 */
11270 if (buf->dtb_tomax != NULL) {
11271 ASSERT(buf->dtb_size == size);
11272 continue;
11273 }
11274
11275 ASSERT(buf->dtb_xamot == NULL);
11276
11277 /* DTrace, please do not eat all the memory. */
11278 if (dtrace_buffer_canalloc(size) == B_FALSE)
11279 goto err;
11280 if ((buf->dtb_tomax = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
11281 goto err;
11282 dtrace_buffer_memory_inuse += size;
11283
		/* Ensure that the limit is always strictly lower than the size */
11285 limit = limit == size ? limit - 1 : limit;
11286 buf->dtb_cur_limit = limit;
11287 buf->dtb_limit = limit;
11288 buf->dtb_size = size;
11289 buf->dtb_flags = flags;
11290 buf->dtb_offset = 0;
11291 buf->dtb_drops = 0;
11292
11293 if (flags & DTRACEBUF_NOSWITCH)
11294 continue;
11295
11296 /* DTrace, please do not eat all the memory. */
11297 if (dtrace_buffer_canalloc(size) == B_FALSE)
11298 goto err;
11299 if ((buf->dtb_xamot = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
11300 goto err;
11301 dtrace_buffer_memory_inuse += size;
11302 } while ((cp = cp->cpu_next) != cpu_list);
11303
11304 ASSERT(dtrace_buffer_memory_inuse <= dtrace_buffer_memory_maxsize);
11305
11306 return (0);
11307
11308err:
11309 cp = cpu_list;
11310
11311 do {
11312 if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
11313 continue;
11314
11315 buf = &bufs[cp->cpu_id];
11316
11317 if (buf->dtb_xamot != NULL) {
11318 ASSERT(buf->dtb_tomax != NULL);
11319 ASSERT(buf->dtb_size == size);
11320 kmem_free(buf->dtb_xamot, size);
11321 }
11322
11323 if (buf->dtb_tomax != NULL) {
11324 ASSERT(buf->dtb_size == size);
11325 kmem_free(buf->dtb_tomax, size);
11326 }
11327
11328 buf->dtb_tomax = NULL;
11329 buf->dtb_xamot = NULL;
11330 buf->dtb_size = 0;
11331 } while ((cp = cp->cpu_next) != cpu_list);
11332
11333 /* Restore the size saved before allocating memory */
11334 dtrace_buffer_memory_inuse = size_before_alloc;
11335
11336 return (ENOMEM);
11337}
11338
11339/*
11340 * Note: called from probe context. This function just increments the drop
11341 * count on a buffer. It has been made a function to allow for the
11342 * possibility of understanding the source of mysterious drop counts. (A
11343 * problem for which one may be particularly disappointed that DTrace cannot
11344 * be used to understand DTrace.)
11345 */
11346static void
11347dtrace_buffer_drop(dtrace_buffer_t *buf)
11348{
11349 buf->dtb_drops++;
11350}
11351
11352/*
11353 * Note: called from probe context. This function is called to reserve space
11354 * in a buffer. If mstate is non-NULL, sets the scratch base and size in the
11355 * mstate. Returns the new offset in the buffer, or a negative value if an
11356 * error has occurred.
11357 */
11358static intptr_t
11359dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align,
11360 dtrace_state_t *state, dtrace_mstate_t *mstate)
11361{
11362 intptr_t offs = buf->dtb_offset, soffs;
11363 intptr_t woffs;
11364 caddr_t tomax;
11365 size_t total_off;
11366
11367 if (buf->dtb_flags & DTRACEBUF_INACTIVE)
11368 return (-1);
11369
11370 if ((tomax = buf->dtb_tomax) == NULL) {
11371 dtrace_buffer_drop(buf);
11372 return (-1);
11373 }
11374
11375 if (!(buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL))) {
11376 while (offs & (align - 1)) {
11377 /*
11378 * Assert that our alignment is off by a number which
11379 * is itself sizeof (uint32_t) aligned.
11380 */
11381 ASSERT(!((align - (offs & (align - 1))) &
11382 (sizeof (uint32_t) - 1)));
11383 DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
11384 offs += sizeof (uint32_t);
11385 }
11386
11387 if ((uint64_t)(soffs = offs + needed) > buf->dtb_cur_limit) {
11388 if (buf->dtb_cur_limit == buf->dtb_limit) {
11389 buf->dtb_cur_limit = buf->dtb_size;
11390
11391 atomic_add_32(&state->dts_buf_over_limit, 1);
			/*
			 * Set an AST on the current processor so that we
			 * can wake up the process outside of probe
			 * context, when we know it is safe to do so.
			 */
11398 minor_t minor = getminor(state->dts_dev);
11399 ASSERT(minor < 32);
11400
11401 atomic_or_32(&dtrace_wake_clients, 1 << minor);
11402 ast_dtrace_on();
11403 }
11404 if ((uint64_t)soffs > buf->dtb_size) {
11405 dtrace_buffer_drop(buf);
11406 return (-1);
11407 }
11408 }
11409
11410 if (mstate == NULL)
11411 return (offs);
11412
11413 mstate->dtms_scratch_base = (uintptr_t)tomax + soffs;
11414 mstate->dtms_scratch_size = buf->dtb_size - soffs;
11415 mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
11416
11417 return (offs);
11418 }
11419
11420 if (buf->dtb_flags & DTRACEBUF_FILL) {
11421 if (state->dts_activity != DTRACE_ACTIVITY_COOLDOWN &&
11422 (buf->dtb_flags & DTRACEBUF_FULL))
11423 return (-1);
11424 goto out;
11425 }
11426
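	/*
	 * total_off is the reservation plus worst-case alignment padding;
	 * e.g., 24 bytes needed at offset 12 with 8-byte alignment yields
	 * a total_off of 28.
	 */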
11427 total_off = needed + (offs & (align - 1));
11428
11429 /*
11430 * For a ring buffer, life is quite a bit more complicated. Before
11431 * we can store any padding, we need to adjust our wrapping offset.
11432 * (If we've never before wrapped or we're not about to, no adjustment
11433 * is required.)
11434 */
11435 if ((buf->dtb_flags & DTRACEBUF_WRAPPED) ||
11436 offs + total_off > buf->dtb_size) {
11437 woffs = buf->dtb_xamot_offset;
11438
11439 if (offs + total_off > buf->dtb_size) {
11440 /*
11441 * We can't fit in the end of the buffer. First, a
11442 * sanity check that we can fit in the buffer at all.
11443 */
11444 if (total_off > buf->dtb_size) {
11445 dtrace_buffer_drop(buf);
11446 return (-1);
11447 }
11448
11449 /*
11450 * We're going to be storing at the top of the buffer,
11451 * so now we need to deal with the wrapped offset. We
11452 * only reset our wrapped offset to 0 if it is
11453 * currently greater than the current offset. If it
11454 * is less than the current offset, it is because a
11455 * previous allocation induced a wrap -- but the
11456 * allocation didn't subsequently take the space due
11457 * to an error or false predicate evaluation. In this
11458 * case, we'll just leave the wrapped offset alone: if
11459 * the wrapped offset hasn't been advanced far enough
11460 * for this allocation, it will be adjusted in the
11461 * lower loop.
11462 */
11463 if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
11464 if (woffs >= offs)
11465 woffs = 0;
11466 } else {
11467 woffs = 0;
11468 }
11469
11470 /*
11471 * Now we know that we're going to be storing to the
11472 * top of the buffer and that there is room for us
11473 * there. We need to clear the buffer from the current
11474 * offset to the end (there may be old gunk there).
11475 */
11476 while ((uint64_t)offs < buf->dtb_size)
11477 tomax[offs++] = 0;
11478
11479 /*
11480 * We need to set our offset to zero. And because we
11481 * are wrapping, we need to set the bit indicating as
11482 * much. We can also adjust our needed space back
11483 * down to the space required by the ECB -- we know
11484 * that the top of the buffer is aligned.
11485 */
11486 offs = 0;
11487 total_off = needed;
11488 buf->dtb_flags |= DTRACEBUF_WRAPPED;
11489 } else {
11490 /*
11491 * There is room for us in the buffer, so we simply
11492 * need to check the wrapped offset.
11493 */
11494 if (woffs < offs) {
11495 /*
11496 * The wrapped offset is less than the offset.
11497 * This can happen if we allocated buffer space
11498 * that induced a wrap, but then we didn't
11499 * subsequently take the space due to an error
11500 * or false predicate evaluation. This is
11501 * okay; we know that _this_ allocation isn't
11502 * going to induce a wrap. We still can't
11503 * reset the wrapped offset to be zero,
11504 * however: the space may have been trashed in
11505 * the previous failed probe attempt. But at
11506 * least the wrapped offset doesn't need to
11507 * be adjusted at all...
11508 */
11509 goto out;
11510 }
11511 }
11512
11513 while (offs + total_off > (size_t)woffs) {
11514 dtrace_epid_t epid = *(uint32_t *)(tomax + woffs);
11515 size_t size;
11516
11517 if (epid == DTRACE_EPIDNONE) {
11518 size = sizeof (uint32_t);
11519 } else {
11520 ASSERT(epid <= (dtrace_epid_t)state->dts_necbs);
11521 ASSERT(state->dts_ecbs[epid - 1] != NULL);
11522
11523 size = state->dts_ecbs[epid - 1]->dte_size;
11524 }
11525
11526 ASSERT(woffs + size <= buf->dtb_size);
11527 ASSERT(size != 0);
11528
11529 if (woffs + size == buf->dtb_size) {
11530 /*
11531 * We've reached the end of the buffer; we want
11532 * to set the wrapped offset to 0 and break
11533 * out. However, if the offs is 0, then we're
11534 * in a strange edge-condition: the amount of
11535 * space that we want to reserve plus the size
11536 * of the record that we're overwriting is
11537 * greater than the size of the buffer. This
11538 * is problematic because if we reserve the
11539 * space but subsequently don't consume it (due
11540 * to a failed predicate or error) the wrapped
11541 * offset will be 0 -- yet the EPID at offset 0
11542 * will not be committed. This situation is
11543 * relatively easy to deal with: if we're in
11544 * this case, the buffer is indistinguishable
11545 * from one that hasn't wrapped; we need only
11546 * finish the job by clearing the wrapped bit,
11547 * explicitly setting the offset to be 0, and
11548 * zero'ing out the old data in the buffer.
11549 */
11550 if (offs == 0) {
11551 buf->dtb_flags &= ~DTRACEBUF_WRAPPED;
11552 buf->dtb_offset = 0;
11553 woffs = total_off;
11554
11555 while ((uint64_t)woffs < buf->dtb_size)
11556 tomax[woffs++] = 0;
11557 }
11558
11559 woffs = 0;
11560 break;
11561 }
11562
11563 woffs += size;
11564 }
11565
11566 /*
11567 * We have a wrapped offset. It may be that the wrapped offset
11568 * has become zero -- that's okay.
11569 */
11570 buf->dtb_xamot_offset = woffs;
11571 }
11572
11573out:
11574 /*
11575 * Now we can plow the buffer with any necessary padding.
11576 */
11577 while (offs & (align - 1)) {
11578 /*
11579 * Assert that our alignment is off by a number which
11580 * is itself sizeof (uint32_t) aligned.
11581 */
11582 ASSERT(!((align - (offs & (align - 1))) &
11583 (sizeof (uint32_t) - 1)));
11584 DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
11585 offs += sizeof (uint32_t);
11586 }
11587
11588 if (buf->dtb_flags & DTRACEBUF_FILL) {
11589 if (offs + needed > buf->dtb_size - state->dts_reserve) {
11590 buf->dtb_flags |= DTRACEBUF_FULL;
11591 return (-1);
11592 }
11593 }
11594
11595 if (mstate == NULL)
11596 return (offs);
11597
11598 /*
11599 * For ring buffers and fill buffers, the scratch space is always
11600 * the inactive buffer.
11601 */
11602 mstate->dtms_scratch_base = (uintptr_t)buf->dtb_xamot;
11603 mstate->dtms_scratch_size = buf->dtb_size;
11604 mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
11605
11606 return (offs);
11607}
11608
11609static void
11610dtrace_buffer_polish(dtrace_buffer_t *buf)
11611{
11612 ASSERT(buf->dtb_flags & DTRACEBUF_RING);
11613 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11614
11615 if (!(buf->dtb_flags & DTRACEBUF_WRAPPED))
11616 return;
11617
11618 /*
11619 * We need to polish the ring buffer. There are three cases:
11620 *
11621 * - The first (and presumably most common) is that there is no gap
11622 * between the buffer offset and the wrapped offset. In this case,
11623 * there is nothing in the buffer that isn't valid data; we can
11624 * mark the buffer as polished and return.
11625 *
11626 * - The second (less common than the first but still more common
11627 * than the third) is that there is a gap between the buffer offset
11628 * and the wrapped offset, and the wrapped offset is larger than the
11629 * buffer offset. This can happen because of an alignment issue, or
11630 * can happen because of a call to dtrace_buffer_reserve() that
11631 * didn't subsequently consume the buffer space. In this case,
11632 * we need to zero the data from the buffer offset to the wrapped
11633 * offset.
11634 *
11635 * - The third (and least common) is that there is a gap between the
11636 * buffer offset and the wrapped offset, but the wrapped offset is
11637 * _less_ than the buffer offset. This can only happen because a
11638 * call to dtrace_buffer_reserve() induced a wrap, but the space
11639 * was not subsequently consumed. In this case, we need to zero the
11640 * space from the offset to the end of the buffer _and_ from the
11641 * top of the buffer to the wrapped offset.
11642 */
11643 if (buf->dtb_offset < buf->dtb_xamot_offset) {
11644 bzero(buf->dtb_tomax + buf->dtb_offset,
11645 buf->dtb_xamot_offset - buf->dtb_offset);
11646 }
11647
11648 if (buf->dtb_offset > buf->dtb_xamot_offset) {
11649 bzero(buf->dtb_tomax + buf->dtb_offset,
11650 buf->dtb_size - buf->dtb_offset);
11651 bzero(buf->dtb_tomax, buf->dtb_xamot_offset);
11652 }
11653}
11654
11655static void
11656dtrace_buffer_free(dtrace_buffer_t *bufs)
11657{
11658 int i;
11659
11660 for (i = 0; i < (int)NCPU; i++) {
11661 dtrace_buffer_t *buf = &bufs[i];
11662
11663 if (buf->dtb_tomax == NULL) {
11664 ASSERT(buf->dtb_xamot == NULL);
11665 ASSERT(buf->dtb_size == 0);
11666 continue;
11667 }
11668
11669 if (buf->dtb_xamot != NULL) {
11670 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
11671 kmem_free(buf->dtb_xamot, buf->dtb_size);
11672
11673 ASSERT(dtrace_buffer_memory_inuse >= buf->dtb_size);
11674 dtrace_buffer_memory_inuse -= buf->dtb_size;
11675 }
11676
11677 kmem_free(buf->dtb_tomax, buf->dtb_size);
11678 ASSERT(dtrace_buffer_memory_inuse >= buf->dtb_size);
11679 dtrace_buffer_memory_inuse -= buf->dtb_size;
11680
11681 buf->dtb_size = 0;
11682 buf->dtb_tomax = NULL;
11683 buf->dtb_xamot = NULL;
11684 }
11685}
11686
11687/*
11688 * DTrace Enabling Functions
11689 */
11690static dtrace_enabling_t *
11691dtrace_enabling_create(dtrace_vstate_t *vstate)
11692{
11693 dtrace_enabling_t *enab;
11694
11695 enab = kmem_zalloc(sizeof (dtrace_enabling_t), KM_SLEEP);
11696 enab->dten_vstate = vstate;
11697
11698 return (enab);
11699}
11700
11701static void
11702dtrace_enabling_add(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb)
11703{
11704 dtrace_ecbdesc_t **ndesc;
11705 size_t osize, nsize;
11706
11707 /*
11708 * We can't add to enablings after we've enabled them, or after we've
11709 * retained them.
11710 */
11711 ASSERT(enab->dten_probegen == 0);
11712 ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
11713
	/* APPLE NOTE: this protects against a gcc 4.0 codegen botch on x86 */
	if (ecb == NULL)
		return;
11716
11717 if (enab->dten_ndesc < enab->dten_maxdesc) {
11718 enab->dten_desc[enab->dten_ndesc++] = ecb;
11719 return;
11720 }
11721
	osize = enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *);
11723
11724 if (enab->dten_maxdesc == 0) {
11725 enab->dten_maxdesc = 1;
11726 } else {
11727 enab->dten_maxdesc <<= 1;
11728 }
11729
11730 ASSERT(enab->dten_ndesc < enab->dten_maxdesc);
11731
	nsize = enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *);
11733 ndesc = kmem_zalloc(nsize, KM_SLEEP);
11734 bcopy(enab->dten_desc, ndesc, osize);
11735 kmem_free(enab->dten_desc, osize);
11736
11737 enab->dten_desc = ndesc;
11738 enab->dten_desc[enab->dten_ndesc++] = ecb;
11739}
11740
11741static void
11742dtrace_enabling_addlike(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb,
11743 dtrace_probedesc_t *pd)
11744{
11745 dtrace_ecbdesc_t *new;
11746 dtrace_predicate_t *pred;
11747 dtrace_actdesc_t *act;
11748
11749 /*
11750 * We're going to create a new ECB description that matches the
11751 * specified ECB in every way, but has the specified probe description.
11752 */
11753 new = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
11754
11755 if ((pred = ecb->dted_pred.dtpdd_predicate) != NULL)
11756 dtrace_predicate_hold(pred);
11757
11758 for (act = ecb->dted_action; act != NULL; act = act->dtad_next)
11759 dtrace_actdesc_hold(act);
11760
11761 new->dted_action = ecb->dted_action;
11762 new->dted_pred = ecb->dted_pred;
11763 new->dted_probe = *pd;
11764 new->dted_uarg = ecb->dted_uarg;
11765
11766 dtrace_enabling_add(enab, new);
11767}
11768
11769static void
11770dtrace_enabling_dump(dtrace_enabling_t *enab)
11771{
11772 int i;
11773
11774 for (i = 0; i < enab->dten_ndesc; i++) {
11775 dtrace_probedesc_t *desc = &enab->dten_desc[i]->dted_probe;
11776
11777 cmn_err(CE_NOTE, "enabling probe %d (%s:%s:%s:%s)", i,
11778 desc->dtpd_provider, desc->dtpd_mod,
11779 desc->dtpd_func, desc->dtpd_name);
11780 }
11781}
11782
11783static void
11784dtrace_enabling_destroy(dtrace_enabling_t *enab)
11785{
11786 int i;
11787 dtrace_ecbdesc_t *ep;
11788 dtrace_vstate_t *vstate = enab->dten_vstate;
11789
11790 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11791
11792 for (i = 0; i < enab->dten_ndesc; i++) {
11793 dtrace_actdesc_t *act, *next;
11794 dtrace_predicate_t *pred;
11795
11796 ep = enab->dten_desc[i];
11797
11798 if ((pred = ep->dted_pred.dtpdd_predicate) != NULL)
11799 dtrace_predicate_release(pred, vstate);
11800
11801 for (act = ep->dted_action; act != NULL; act = next) {
11802 next = act->dtad_next;
11803 dtrace_actdesc_release(act, vstate);
11804 }
11805
11806 kmem_free(ep, sizeof (dtrace_ecbdesc_t));
11807 }
11808
	kmem_free(enab->dten_desc,
	    enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *));
11811
11812 /*
11813 * If this was a retained enabling, decrement the dts_nretained count
11814 * and take it off of the dtrace_retained list.
11815 */
11816 if (enab->dten_prev != NULL || enab->dten_next != NULL ||
11817 dtrace_retained == enab) {
11818 ASSERT(enab->dten_vstate->dtvs_state != NULL);
11819 ASSERT(enab->dten_vstate->dtvs_state->dts_nretained > 0);
11820 enab->dten_vstate->dtvs_state->dts_nretained--;
11821 dtrace_retained_gen++;
11822 }
11823
11824 if (enab->dten_prev == NULL) {
11825 if (dtrace_retained == enab) {
11826 dtrace_retained = enab->dten_next;
11827
11828 if (dtrace_retained != NULL)
11829 dtrace_retained->dten_prev = NULL;
11830 }
11831 } else {
11832 ASSERT(enab != dtrace_retained);
11833 ASSERT(dtrace_retained != NULL);
11834 enab->dten_prev->dten_next = enab->dten_next;
11835 }
11836
11837 if (enab->dten_next != NULL) {
11838 ASSERT(dtrace_retained != NULL);
11839 enab->dten_next->dten_prev = enab->dten_prev;
11840 }
11841
11842 kmem_free(enab, sizeof (dtrace_enabling_t));
11843}
11844
11845static int
11846dtrace_enabling_retain(dtrace_enabling_t *enab)
11847{
11848 dtrace_state_t *state;
11849
11850 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11851 ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
11852 ASSERT(enab->dten_vstate != NULL);
11853
11854 state = enab->dten_vstate->dtvs_state;
11855 ASSERT(state != NULL);
11856
11857 /*
11858 * We only allow each state to retain dtrace_retain_max enablings.
11859 */
11860 if (state->dts_nretained >= dtrace_retain_max)
11861 return (ENOSPC);
11862
11863 state->dts_nretained++;
11864 dtrace_retained_gen++;
11865
11866 if (dtrace_retained == NULL) {
11867 dtrace_retained = enab;
11868 return (0);
11869 }
11870
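	/*
	 * Push this enabling onto the front of the doubly-linked
	 * dtrace_retained list.
	 */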
11871 enab->dten_next = dtrace_retained;
11872 dtrace_retained->dten_prev = enab;
11873 dtrace_retained = enab;
11874
11875 return (0);
11876}
11877
11878static int
11879dtrace_enabling_replicate(dtrace_state_t *state, dtrace_probedesc_t *match,
11880 dtrace_probedesc_t *create)
11881{
11882 dtrace_enabling_t *new, *enab;
11883 int found = 0, err = ENOENT;
11884
11885 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11886 ASSERT(strlen(match->dtpd_provider) < DTRACE_PROVNAMELEN);
11887 ASSERT(strlen(match->dtpd_mod) < DTRACE_MODNAMELEN);
11888 ASSERT(strlen(match->dtpd_func) < DTRACE_FUNCNAMELEN);
11889 ASSERT(strlen(match->dtpd_name) < DTRACE_NAMELEN);
11890
11891 new = dtrace_enabling_create(&state->dts_vstate);
11892
11893 /*
11894 * Iterate over all retained enablings, looking for enablings that
11895 * match the specified state.
11896 */
11897 for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
11898 int i;
11899
11900 /*
11901 * dtvs_state can only be NULL for helper enablings -- and
11902 * helper enablings can't be retained.
11903 */
11904 ASSERT(enab->dten_vstate->dtvs_state != NULL);
11905
11906 if (enab->dten_vstate->dtvs_state != state)
11907 continue;
11908
11909 /*
11910 * Now iterate over each probe description; we're looking for
11911 * an exact match to the specified probe description.
11912 */
11913 for (i = 0; i < enab->dten_ndesc; i++) {
11914 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
11915 dtrace_probedesc_t *pd = &ep->dted_probe;
11916
			/* APPLE NOTE: Darwin employs size-bounded string operations. */
11918 if (strncmp(pd->dtpd_provider, match->dtpd_provider, DTRACE_PROVNAMELEN))
11919 continue;
11920
11921 if (strncmp(pd->dtpd_mod, match->dtpd_mod, DTRACE_MODNAMELEN))
11922 continue;
11923
11924 if (strncmp(pd->dtpd_func, match->dtpd_func, DTRACE_FUNCNAMELEN))
11925 continue;
11926
11927 if (strncmp(pd->dtpd_name, match->dtpd_name, DTRACE_NAMELEN))
11928 continue;
11929
11930 /*
11931 * We have a winning probe! Add it to our growing
11932 * enabling.
11933 */
11934 found = 1;
11935 dtrace_enabling_addlike(new, ep, create);
11936 }
11937 }
11938
11939 if (!found || (err = dtrace_enabling_retain(new)) != 0) {
11940 dtrace_enabling_destroy(new);
11941 return (err);
11942 }
11943
11944 return (0);
11945}
11946
11947static void
11948dtrace_enabling_retract(dtrace_state_t *state)
11949{
11950 dtrace_enabling_t *enab, *next;
11951
11952 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11953
11954 /*
11955 * Iterate over all retained enablings, destroy the enablings retained
11956 * for the specified state.
11957 */
11958 for (enab = dtrace_retained; enab != NULL; enab = next) {
11959 next = enab->dten_next;
11960
11961 /*
11962 * dtvs_state can only be NULL for helper enablings -- and
11963 * helper enablings can't be retained.
11964 */
11965 ASSERT(enab->dten_vstate->dtvs_state != NULL);
11966
11967 if (enab->dten_vstate->dtvs_state == state) {
11968 ASSERT(state->dts_nretained > 0);
11969 dtrace_enabling_destroy(enab);
11970 }
11971 }
11972
11973 ASSERT(state->dts_nretained == 0);
11974}
11975
11976static int
11977dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched, dtrace_match_cond_t *cond)
11978{
11979 int i = 0;
11980 int total_matched = 0, matched = 0;
11981
11982 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
11983 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11984
11985 for (i = 0; i < enab->dten_ndesc; i++) {
11986 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
11987
11988 enab->dten_current = ep;
11989 enab->dten_error = 0;
11990
		/*
		 * Before doing a dtrace_probe_enable(), which is really
		 * expensive, check that this enabling matches the matching
		 * precondition, if we have one.
		 */
11996 if (cond && (cond->dmc_func(&ep->dted_probe, cond->dmc_data) == 0)) {
11997 continue;
11998 }
11999 /*
		 * If a provider fails to enable a probe, get out and
		 * let the consumer know we failed.
12002 */
12003 if ((matched = dtrace_probe_enable(&ep->dted_probe, enab, ep)) < 0)
12004 return (EBUSY);
12005
12006 total_matched += matched;
12007
12008 if (enab->dten_error != 0) {
12009 /*
12010 * If we get an error half-way through enabling the
12011 * probes, we kick out -- perhaps with some number of
12012 * them enabled. Leaving enabled probes enabled may
12013 * be slightly confusing for user-level, but we expect
12014 * that no one will attempt to actually drive on in
12015 * the face of such errors. If this is an anonymous
12016 * enabling (indicated with a NULL nmatched pointer),
12017 * we cmn_err() a message. We aren't expecting to
			 * get such an error -- insofar as it can exist at all,
12019 * it would be a result of corrupted DOF in the driver
12020 * properties.
12021 */
12022 if (nmatched == NULL) {
12023 cmn_err(CE_WARN, "dtrace_enabling_match() "
12024 "error on %p: %d", (void *)ep,
12025 enab->dten_error);
12026 }
12027
12028 return (enab->dten_error);
12029 }
12030
12031 ep->dted_probegen = dtrace_probegen;
12032 }
12033
12034 if (nmatched != NULL)
12035 *nmatched = total_matched;
12036
12037 return (0);
12038}
12039
12040static void
12041dtrace_enabling_matchall_with_cond(dtrace_match_cond_t *cond)
12042{
12043 dtrace_enabling_t *enab;
12044
12045 lck_mtx_lock(&cpu_lock);
12046 lck_mtx_lock(&dtrace_lock);
12047
12048 /*
12049 * Iterate over all retained enablings to see if any probes match
12050 * against them. We only perform this operation on enablings for which
12051 * we have sufficient permissions by virtue of being in the global zone
12052 * or in the same zone as the DTrace client. Because we can be called
12053 * after dtrace_detach() has been called, we cannot assert that there
12054 * are retained enablings. We can safely load from dtrace_retained,
12055 * however: the taskq_destroy() at the end of dtrace_detach() will
12056 * block pending our completion.
12057 */
12058
12059 /*
12060 * Darwin doesn't do zones.
	 * Behave as if always in the "global" zone.
12062 */
12063 for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12064 (void) dtrace_enabling_match(enab, NULL, cond);
12065 }
12066
12067 lck_mtx_unlock(&dtrace_lock);
	lck_mtx_unlock(&cpu_lock);
}
12071
12072static void
12073dtrace_enabling_matchall(void)
12074{
12075 dtrace_enabling_matchall_with_cond(NULL);
}

12080/*
12081 * If an enabling is to be enabled without having matched probes (that is, if
12082 * dtrace_state_go() is to be called on the underlying dtrace_state_t), the
12083 * enabling must be _primed_ by creating an ECB for every ECB description.
 * This must be done to ensure that we know the number of speculations, the
 * number of aggregations, the minimum buffer size needed, etc. before we
 * transition out of DTRACE_ACTIVITY_INACTIVE. To do this without actually
 * enabling any probes, we create ECBs for every ECB description, but with a
12088 * NULL probe -- which is exactly what this function does.
12089 */
12090static void
12091dtrace_enabling_prime(dtrace_state_t *state)
12092{
12093 dtrace_enabling_t *enab;
12094 int i;
12095
12096 for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12097 ASSERT(enab->dten_vstate->dtvs_state != NULL);
12098
12099 if (enab->dten_vstate->dtvs_state != state)
12100 continue;
12101
12102 /*
12103 * We don't want to prime an enabling more than once, lest
12104 * we allow a malicious user to induce resource exhaustion.
12105 * (The ECBs that result from priming an enabling aren't
12106 * leaked -- but they also aren't deallocated until the
12107 * consumer state is destroyed.)
12108 */
12109 if (enab->dten_primed)
12110 continue;
12111
12112 for (i = 0; i < enab->dten_ndesc; i++) {
12113 enab->dten_current = enab->dten_desc[i];
12114 (void) dtrace_probe_enable(NULL, enab, NULL);
12115 }
12116
12117 enab->dten_primed = 1;
12118 }
12119}
12120
12121/*
12122 * Called to indicate that probes should be provided due to retained
12123 * enablings. This is implemented in terms of dtrace_probe_provide(), but it
12124 * must take an initial lap through the enabling calling the dtps_provide()
12125 * entry point explicitly to allow for autocreated probes.
12126 */
12127static void
12128dtrace_enabling_provide(dtrace_provider_t *prv)
12129{
12130 int i, all = 0;
12131 dtrace_probedesc_t desc;
12132 dtrace_genid_t gen;
12133
12134 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12135 LCK_MTX_ASSERT(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
12136
12137 if (prv == NULL) {
12138 all = 1;
12139 prv = dtrace_provider;
12140 }
12141
12142 do {
12143 dtrace_enabling_t *enab;
12144 void *parg = prv->dtpv_arg;
12145
12146retry:
12147 gen = dtrace_retained_gen;
12148 for (enab = dtrace_retained; enab != NULL;
12149 enab = enab->dten_next) {
12150 for (i = 0; i < enab->dten_ndesc; i++) {
12151 desc = enab->dten_desc[i]->dted_probe;
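				/*
				 * Drop dtrace_lock across the dtps_provide()
				 * call: the provider may call back into the
				 * framework (dtrace_probe_create(), for
				 * example, takes dtrace_lock itself).
				 */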
12152 lck_mtx_unlock(&dtrace_lock);
12153 prv->dtpv_pops.dtps_provide(parg, &desc);
12154 lck_mtx_lock(&dtrace_lock);
12155 /*
12156 * Process the retained enablings again if
12157 * they have changed while we weren't holding
12158 * dtrace_lock.
12159 */
12160 if (gen != dtrace_retained_gen)
12161 goto retry;
12162 }
12163 }
12164 } while (all && (prv = prv->dtpv_next) != NULL);
12165
12166 lck_mtx_unlock(&dtrace_lock);
12167 dtrace_probe_provide(NULL, all ? NULL : prv);
12168 lck_mtx_lock(&dtrace_lock);
12169}
12170
12171/*
12172 * DTrace DOF Functions
12173 */
12174/*ARGSUSED*/
12175static void
12176dtrace_dof_error(dof_hdr_t *dof, const char *str)
12177{
12178#pragma unused(dof) /* __APPLE__ */
12179 if (dtrace_err_verbose)
12180 cmn_err(CE_WARN, "failed to process DOF: %s", str);
12181
12182#ifdef DTRACE_ERRDEBUG
12183 dtrace_errdebug(str);
12184#endif
12185}
12186
12187/*
12188 * Create DOF out of a currently enabled state. Right now, we only create
12189 * DOF containing the run-time options -- but this could be expanded to create
12190 * complete DOF representing the enabled state.
12191 */
12192static dof_hdr_t *
12193dtrace_dof_create(dtrace_state_t *state)
12194{
12195 dof_hdr_t *dof;
12196 dof_sec_t *sec;
12197 dof_optdesc_t *opt;
12198 int i, len = sizeof (dof_hdr_t) +
12199 roundup(sizeof (dof_sec_t), sizeof (uint64_t)) +
12200 sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
12201
12202 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12203
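	/*
	 * The generated DOF is laid out as a header, a single section
	 * header (padded to 8-byte alignment), and the option descriptions
	 * themselves:
	 *
	 *	+-----------+-----------+------------------------------+
	 *	| dof_hdr_t | dof_sec_t | dof_optdesc_t[DTRACEOPT_MAX] |
	 *	+-----------+-----------+------------------------------+
	 */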
12204 dof = kmem_zalloc_aligned(len, 8, KM_SLEEP);
12205 dof->dofh_ident[DOF_ID_MAG0] = DOF_MAG_MAG0;
12206 dof->dofh_ident[DOF_ID_MAG1] = DOF_MAG_MAG1;
12207 dof->dofh_ident[DOF_ID_MAG2] = DOF_MAG_MAG2;
12208 dof->dofh_ident[DOF_ID_MAG3] = DOF_MAG_MAG3;
12209
12210 dof->dofh_ident[DOF_ID_MODEL] = DOF_MODEL_NATIVE;
12211 dof->dofh_ident[DOF_ID_ENCODING] = DOF_ENCODE_NATIVE;
12212 dof->dofh_ident[DOF_ID_VERSION] = DOF_VERSION;
12213 dof->dofh_ident[DOF_ID_DIFVERS] = DIF_VERSION;
12214 dof->dofh_ident[DOF_ID_DIFIREG] = DIF_DIR_NREGS;
12215 dof->dofh_ident[DOF_ID_DIFTREG] = DIF_DTR_NREGS;
12216
12217 dof->dofh_flags = 0;
12218 dof->dofh_hdrsize = sizeof (dof_hdr_t);
12219 dof->dofh_secsize = sizeof (dof_sec_t);
12220 dof->dofh_secnum = 1; /* only DOF_SECT_OPTDESC */
12221 dof->dofh_secoff = sizeof (dof_hdr_t);
12222 dof->dofh_loadsz = len;
12223 dof->dofh_filesz = len;
12224 dof->dofh_pad = 0;
12225
12226 /*
12227 * Fill in the option section header...
12228 */
12229 sec = (dof_sec_t *)((uintptr_t)dof + sizeof (dof_hdr_t));
12230 sec->dofs_type = DOF_SECT_OPTDESC;
12231 sec->dofs_align = sizeof (uint64_t);
12232 sec->dofs_flags = DOF_SECF_LOAD;
12233 sec->dofs_entsize = sizeof (dof_optdesc_t);
12234
12235 opt = (dof_optdesc_t *)((uintptr_t)sec +
12236 roundup(sizeof (dof_sec_t), sizeof (uint64_t)));
12237
12238 sec->dofs_offset = (uintptr_t)opt - (uintptr_t)dof;
12239 sec->dofs_size = sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
12240
12241 for (i = 0; i < DTRACEOPT_MAX; i++) {
12242 opt[i].dofo_option = i;
12243 opt[i].dofo_strtab = DOF_SECIDX_NONE;
12244 opt[i].dofo_value = state->dts_options[i];
12245 }
12246
12247 return (dof);
12248}
12249
12250static dof_hdr_t *
12251dtrace_dof_copyin(user_addr_t uarg, int *errp)
12252{
12253 dof_hdr_t hdr, *dof;
12254
12255 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
12256
12257 /*
12258 * First, we're going to copyin() the sizeof (dof_hdr_t).
12259 */
12260 if (copyin(uarg, &hdr, sizeof (hdr)) != 0) {
12261 dtrace_dof_error(NULL, "failed to copyin DOF header");
12262 *errp = EFAULT;
12263 return (NULL);
12264 }
12265
12266 /*
12267 * Now we'll allocate the entire DOF and copy it in -- provided
12268 * that the length isn't outrageous.
12269 */
12270 if (hdr.dofh_loadsz >= (uint64_t)dtrace_dof_maxsize) {
12271 dtrace_dof_error(&hdr, "load size exceeds maximum");
12272 *errp = E2BIG;
12273 return (NULL);
12274 }
12275
12276 if (hdr.dofh_loadsz < sizeof (hdr)) {
12277 dtrace_dof_error(&hdr, "invalid load size");
12278 *errp = EINVAL;
12279 return (NULL);
12280 }
12281
12282 dof = kmem_alloc_aligned(hdr.dofh_loadsz, 8, KM_SLEEP);
12283
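	/*
	 * Note that dofh_loadsz is checked again after the full copyin():
	 * the header lives in user memory and may have changed between the
	 * two reads, so DOF whose load size no longer matches the
	 * allocation is refused.
	 */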
12284 if (copyin(uarg, dof, hdr.dofh_loadsz) != 0 ||
12285 dof->dofh_loadsz != hdr.dofh_loadsz) {
12286 kmem_free_aligned(dof, hdr.dofh_loadsz);
12287 *errp = EFAULT;
12288 return (NULL);
12289 }
12290
12291 return (dof);
12292}
12293
12294static dof_hdr_t *
12295dtrace_dof_copyin_from_proc(proc_t* p, user_addr_t uarg, int *errp)
12296{
12297 dof_hdr_t hdr, *dof;
12298
12299 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
12300
12301 /*
12302 * First, we're going to copyin() the sizeof (dof_hdr_t).
12303 */
12304 if (uread(p, &hdr, sizeof(hdr), uarg) != KERN_SUCCESS) {
12305 dtrace_dof_error(NULL, "failed to copyin DOF header");
12306 *errp = EFAULT;
12307 return (NULL);
12308 }
12309
12310 /*
12311 * Now we'll allocate the entire DOF and copy it in -- provided
12312 * that the length isn't outrageous.
12313 */
12314 if (hdr.dofh_loadsz >= (uint64_t)dtrace_dof_maxsize) {
12315 dtrace_dof_error(&hdr, "load size exceeds maximum");
12316 *errp = E2BIG;
12317 return (NULL);
12318 }
12319
12320 if (hdr.dofh_loadsz < sizeof (hdr)) {
12321 dtrace_dof_error(&hdr, "invalid load size");
12322 *errp = EINVAL;
12323 return (NULL);
12324 }
12325
12326 dof = kmem_alloc_aligned(hdr.dofh_loadsz, 8, KM_SLEEP);
12327
12328 if (uread(p, dof, hdr.dofh_loadsz, uarg) != KERN_SUCCESS) {
12329 kmem_free_aligned(dof, hdr.dofh_loadsz);
12330 *errp = EFAULT;
12331 return (NULL);
12332 }
12333
12334 return (dof);
12335}
12336
12337static void
12338dtrace_dof_destroy(dof_hdr_t *dof)
12339{
12340 kmem_free_aligned(dof, dof->dofh_loadsz);
12341}
12342
12343static dof_hdr_t *
12344dtrace_dof_property(const char *name)
12345{
12346 unsigned int len = 0;
12347 dof_hdr_t *dof;
12348
12349 if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed()) {
12350 return NULL;
12351 }
12352
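	/*
	 * The first PEReadNVRAMProperty() call, made with a NULL buffer,
	 * evidently serves only to discover the property's length; the
	 * second call below copies the DOF itself out of NVRAM.
	 */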
12353 if (!PEReadNVRAMProperty(name, NULL, &len)) {
12354 return NULL;
12355 }
12356
12357 dof = kmem_alloc_aligned(len, 8, KM_SLEEP);
12358
12359 if (!PEReadNVRAMProperty(name, dof, &len)) {
12360 dtrace_dof_destroy(dof);
12361 dtrace_dof_error(NULL, "unreadable DOF");
12362 return NULL;
12363 }
12364
12365 if (len < sizeof (dof_hdr_t)) {
12366 dtrace_dof_destroy(dof);
12367 dtrace_dof_error(NULL, "truncated header");
12368 return (NULL);
12369 }
12370
12371 if (len < dof->dofh_loadsz) {
12372 dtrace_dof_destroy(dof);
12373 dtrace_dof_error(NULL, "truncated DOF");
12374 return (NULL);
12375 }
12376
12377 if (len != dof->dofh_loadsz) {
12378 dtrace_dof_destroy(dof);
12379 dtrace_dof_error(NULL, "invalid DOF size");
12380 return (NULL);
12381 }
12382
12383 if (dof->dofh_loadsz >= (uint64_t)dtrace_dof_maxsize) {
12384 dtrace_dof_destroy(dof);
12385 dtrace_dof_error(NULL, "oversized DOF");
12386 return (NULL);
12387 }
12388
12389 return (dof);
12390}
12391
12392/*
12393 * Return the dof_sec_t pointer corresponding to a given section index. If the
12394 * index is not valid, dtrace_dof_error() is called and NULL is returned. If
12395 * a type other than DOF_SECT_NONE is specified, the header is checked against
12396 * this type and NULL is returned if the types do not match.
12397 */
12398static dof_sec_t *
12399dtrace_dof_sect(dof_hdr_t *dof, uint32_t type, dof_secidx_t i)
12400{
12401 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)
12402 ((uintptr_t)dof + dof->dofh_secoff + i * dof->dofh_secsize);
12403
12404 if (i >= dof->dofh_secnum) {
12405 dtrace_dof_error(dof, "referenced section index is invalid");
12406 return (NULL);
12407 }
12408
12409 if (!(sec->dofs_flags & DOF_SECF_LOAD)) {
12410 dtrace_dof_error(dof, "referenced section is not loadable");
12411 return (NULL);
12412 }
12413
12414 if (type != DOF_SECT_NONE && type != sec->dofs_type) {
12415 dtrace_dof_error(dof, "referenced section is the wrong type");
12416 return (NULL);
12417 }
12418
12419 return (sec);
12420}
12421
12422static dtrace_probedesc_t *
12423dtrace_dof_probedesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_probedesc_t *desc)
12424{
12425 dof_probedesc_t *probe;
12426 dof_sec_t *strtab;
12427 uintptr_t daddr = (uintptr_t)dof;
12428 uintptr_t str;
12429 size_t size;
12430
12431 if (sec->dofs_type != DOF_SECT_PROBEDESC) {
12432 dtrace_dof_error(dof, "invalid probe section");
12433 return (NULL);
12434 }
12435
12436 if (sec->dofs_align != sizeof (dof_secidx_t)) {
12437 dtrace_dof_error(dof, "bad alignment in probe description");
12438 return (NULL);
12439 }
12440
12441 if (sec->dofs_offset + sizeof (dof_probedesc_t) > dof->dofh_loadsz) {
12442 dtrace_dof_error(dof, "truncated probe description");
12443 return (NULL);
12444 }
12445
12446 probe = (dof_probedesc_t *)(uintptr_t)(daddr + sec->dofs_offset);
12447 strtab = dtrace_dof_sect(dof, DOF_SECT_STRTAB, probe->dofp_strtab);
12448
12449 if (strtab == NULL)
12450 return (NULL);
12451
12452 str = daddr + strtab->dofs_offset;
12453 size = strtab->dofs_size;
12454
12455 if (probe->dofp_provider >= strtab->dofs_size) {
12456 dtrace_dof_error(dof, "corrupt probe provider");
12457 return (NULL);
12458 }
12459
12460 (void) strncpy(desc->dtpd_provider,
12461 (char *)(str + probe->dofp_provider),
12462 MIN(DTRACE_PROVNAMELEN - 1, size - probe->dofp_provider));
12463
	/* APPLE NOTE: Darwin employs size-bounded string operations. */
12465 desc->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
12466
12467 if (probe->dofp_mod >= strtab->dofs_size) {
12468 dtrace_dof_error(dof, "corrupt probe module");
12469 return (NULL);
12470 }
12471
12472 (void) strncpy(desc->dtpd_mod, (char *)(str + probe->dofp_mod),
12473 MIN(DTRACE_MODNAMELEN - 1, size - probe->dofp_mod));
12474
	/* APPLE NOTE: Darwin employs size-bounded string operations. */
12476 desc->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
12477
12478 if (probe->dofp_func >= strtab->dofs_size) {
12479 dtrace_dof_error(dof, "corrupt probe function");
12480 return (NULL);
12481 }
12482
12483 (void) strncpy(desc->dtpd_func, (char *)(str + probe->dofp_func),
12484 MIN(DTRACE_FUNCNAMELEN - 1, size - probe->dofp_func));
12485
	/* APPLE NOTE: Darwin employs size-bounded string operations. */
12487 desc->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
12488
12489 if (probe->dofp_name >= strtab->dofs_size) {
12490 dtrace_dof_error(dof, "corrupt probe name");
12491 return (NULL);
12492 }
12493
12494 (void) strncpy(desc->dtpd_name, (char *)(str + probe->dofp_name),
12495 MIN(DTRACE_NAMELEN - 1, size - probe->dofp_name));
12496
	/* APPLE NOTE: Darwin employs size-bounded string operations. */
12498 desc->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
12499
12500 return (desc);
12501}
12502
12503static dtrace_difo_t *
12504dtrace_dof_difo(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
12505 cred_t *cr)
12506{
12507 dtrace_difo_t *dp;
12508 size_t ttl = 0;
12509 dof_difohdr_t *dofd;
12510 uintptr_t daddr = (uintptr_t)dof;
12511 size_t max_size = dtrace_difo_maxsize;
12512 uint_t i;
	int l, n;

12516 static const struct {
12517 int section;
12518 int bufoffs;
12519 int lenoffs;
12520 int entsize;
12521 int align;
12522 const char *msg;
12523 } difo[] = {
12524 { DOF_SECT_DIF, offsetof(dtrace_difo_t, dtdo_buf),
12525 offsetof(dtrace_difo_t, dtdo_len), sizeof (dif_instr_t),
12526 sizeof (dif_instr_t), "multiple DIF sections" },
12527
12528 { DOF_SECT_INTTAB, offsetof(dtrace_difo_t, dtdo_inttab),
12529 offsetof(dtrace_difo_t, dtdo_intlen), sizeof (uint64_t),
12530 sizeof (uint64_t), "multiple integer tables" },
12531
12532 { DOF_SECT_STRTAB, offsetof(dtrace_difo_t, dtdo_strtab),
12533 offsetof(dtrace_difo_t, dtdo_strlen), 0,
12534 sizeof (char), "multiple string tables" },
12535
12536 { DOF_SECT_VARTAB, offsetof(dtrace_difo_t, dtdo_vartab),
12537 offsetof(dtrace_difo_t, dtdo_varlen), sizeof (dtrace_difv_t),
12538 sizeof (uint_t), "multiple variable tables" },
12539
12540 { DOF_SECT_NONE, 0, 0, 0, 0, NULL }
12541 };
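
	/*
	 * Each row of the table above binds a DOF section type to the
	 * dtrace_difo_t members it populates (expressed as offsetof()
	 * byte offsets), so that the single loop below can validate and
	 * load the DIF text, integer, string, and variable tables
	 * uniformly.
	 */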
12542
12543 if (sec->dofs_type != DOF_SECT_DIFOHDR) {
12544 dtrace_dof_error(dof, "invalid DIFO header section");
12545 return (NULL);
12546 }
12547
12548 if (sec->dofs_align != sizeof (dof_secidx_t)) {
12549 dtrace_dof_error(dof, "bad alignment in DIFO header");
12550 return (NULL);
12551 }
12552
12553 if (sec->dofs_size < sizeof (dof_difohdr_t) ||
12554 sec->dofs_size % sizeof (dof_secidx_t)) {
12555 dtrace_dof_error(dof, "bad size in DIFO header");
12556 return (NULL);
12557 }
12558
12559 dofd = (dof_difohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
12560 n = (sec->dofs_size - sizeof (*dofd)) / sizeof (dof_secidx_t) + 1;
12561
12562 dp = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
12563 dp->dtdo_rtype = dofd->dofd_rtype;
12564
12565 for (l = 0; l < n; l++) {
12566 dof_sec_t *subsec;
12567 void **bufp;
12568 uint32_t *lenp;
12569
12570 if ((subsec = dtrace_dof_sect(dof, DOF_SECT_NONE,
12571 dofd->dofd_links[l])) == NULL)
12572 goto err; /* invalid section link */
12573
12574 if (ttl + subsec->dofs_size > max_size) {
12575 dtrace_dof_error(dof, "exceeds maximum size");
12576 goto err;
12577 }
12578
12579 ttl += subsec->dofs_size;
12580
		for (i = 0; difo[i].section != DOF_SECT_NONE; i++) {
			if (subsec->dofs_type != (uint32_t)difo[i].section)
12584 continue;
12585
12586 if (!(subsec->dofs_flags & DOF_SECF_LOAD)) {
12587 dtrace_dof_error(dof, "section not loaded");
12588 goto err;
12589 }
12590
12591 if (subsec->dofs_align != (uint32_t)difo[i].align) {
12592 dtrace_dof_error(dof, "bad alignment");
12593 goto err;
12594 }
12595
12596 bufp = (void **)((uintptr_t)dp + difo[i].bufoffs);
12597 lenp = (uint32_t *)((uintptr_t)dp + difo[i].lenoffs);
12598
12599 if (*bufp != NULL) {
12600 dtrace_dof_error(dof, difo[i].msg);
12601 goto err;
12602 }
12603
12604 if ((uint32_t)difo[i].entsize != subsec->dofs_entsize) {
12605 dtrace_dof_error(dof, "entry size mismatch");
12606 goto err;
12607 }
12608
12609 if (subsec->dofs_entsize != 0 &&
12610 (subsec->dofs_size % subsec->dofs_entsize) != 0) {
12611 dtrace_dof_error(dof, "corrupt entry size");
12612 goto err;
12613 }
12614
12615 *lenp = subsec->dofs_size;
12616 *bufp = kmem_alloc(subsec->dofs_size, KM_SLEEP);
12617 bcopy((char *)(uintptr_t)(daddr + subsec->dofs_offset),
12618 *bufp, subsec->dofs_size);
12619
12620 if (subsec->dofs_entsize != 0)
12621 *lenp /= subsec->dofs_entsize;
12622
12623 break;
12624 }
12625
12626 /*
12627 * If we encounter a loadable DIFO sub-section that is not
12628 * known to us, assume this is a broken program and fail.
12629 */
12630 if (difo[i].section == DOF_SECT_NONE &&
12631 (subsec->dofs_flags & DOF_SECF_LOAD)) {
12632 dtrace_dof_error(dof, "unrecognized DIFO subsection");
12633 goto err;
12634 }
12635 }
12636
12637 if (dp->dtdo_buf == NULL) {
12638 /*
12639 * We can't have a DIF object without DIF text.
12640 */
12641 dtrace_dof_error(dof, "missing DIF text");
12642 goto err;
12643 }
12644
12645 /*
12646 * Before we validate the DIF object, run through the variable table
	 * looking for string variables -- if any of their sizes are zero,
	 * we'll set them to the system-wide default string size. Note that
12649 * this should _not_ happen if the "strsize" option has been set --
12650 * in this case, the compiler should have set the size to reflect the
12651 * setting of the option.
12652 */
12653 for (i = 0; i < dp->dtdo_varlen; i++) {
12654 dtrace_difv_t *v = &dp->dtdo_vartab[i];
12655 dtrace_diftype_t *t = &v->dtdv_type;
12656
12657 if (v->dtdv_id < DIF_VAR_OTHER_UBASE)
12658 continue;
12659
12660 if (t->dtdt_kind == DIF_TYPE_STRING && t->dtdt_size == 0)
12661 t->dtdt_size = dtrace_strsize_default;
12662 }
12663
12664 if (dtrace_difo_validate(dp, vstate, DIF_DIR_NREGS, cr) != 0)
12665 goto err;
12666
12667 dtrace_difo_init(dp, vstate);
12668 return (dp);
12669
12670err:
12671 kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
12672 kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
12673 kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
12674 kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
12675
12676 kmem_free(dp, sizeof (dtrace_difo_t));
12677 return (NULL);
12678}
12679
12680static dtrace_predicate_t *
12681dtrace_dof_predicate(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
12682 cred_t *cr)
12683{
12684 dtrace_difo_t *dp;
12685
12686 if ((dp = dtrace_dof_difo(dof, sec, vstate, cr)) == NULL)
12687 return (NULL);
12688
12689 return (dtrace_predicate_create(dp));
12690}
12691
12692static dtrace_actdesc_t *
12693dtrace_dof_actdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
12694 cred_t *cr)
12695{
12696 dtrace_actdesc_t *act, *first = NULL, *last = NULL, *next;
12697 dof_actdesc_t *desc;
12698 dof_sec_t *difosec;
12699 size_t offs;
12700 uintptr_t daddr = (uintptr_t)dof;
12701 uint64_t arg;
12702 dtrace_actkind_t kind;
12703
12704 if (sec->dofs_type != DOF_SECT_ACTDESC) {
12705 dtrace_dof_error(dof, "invalid action section");
12706 return (NULL);
12707 }
12708
12709 if (sec->dofs_offset + sizeof (dof_actdesc_t) > dof->dofh_loadsz) {
12710 dtrace_dof_error(dof, "truncated action description");
12711 return (NULL);
12712 }
12713
12714 if (sec->dofs_align != sizeof (uint64_t)) {
12715 dtrace_dof_error(dof, "bad alignment in action description");
12716 return (NULL);
12717 }
12718
12719 if (sec->dofs_size < sec->dofs_entsize) {
12720 dtrace_dof_error(dof, "section entry size exceeds total size");
12721 return (NULL);
12722 }
12723
12724 if (sec->dofs_entsize != sizeof (dof_actdesc_t)) {
12725 dtrace_dof_error(dof, "bad entry size in action description");
12726 return (NULL);
12727 }
12728
12729 if (sec->dofs_size / sec->dofs_entsize > dtrace_actions_max) {
12730 dtrace_dof_error(dof, "actions exceed dtrace_actions_max");
12731 return (NULL);
12732 }
12733
12734 for (offs = 0; offs < sec->dofs_size; offs += sec->dofs_entsize) {
12735 desc = (dof_actdesc_t *)(daddr +
12736 (uintptr_t)sec->dofs_offset + offs);
12737 kind = (dtrace_actkind_t)desc->dofa_kind;
12738
12739 if ((DTRACEACT_ISPRINTFLIKE(kind) &&
12740 (kind != DTRACEACT_PRINTA || desc->dofa_strtab != DOF_SECIDX_NONE)) ||
12741 (kind == DTRACEACT_DIFEXPR && desc->dofa_strtab != DOF_SECIDX_NONE))
12742 {
12743 dof_sec_t *strtab;
12744 char *str, *fmt;
12745 uint64_t i;
12746
12747 /*
12748 * The argument to these actions is an index into the
12749 * DOF string table. For printf()-like actions, this
12750 * is the format string. For print(), this is the
12751 * CTF type of the expression result.
12752 */
12753 if ((strtab = dtrace_dof_sect(dof,
12754 DOF_SECT_STRTAB, desc->dofa_strtab)) == NULL)
12755 goto err;
12756
12757 str = (char *)((uintptr_t)dof +
12758 (uintptr_t)strtab->dofs_offset);
12759
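			/*
			 * Scan from dofa_arg for the string's NUL
			 * terminator, rejecting strings that run off the
			 * end of the string table (or, below, that are
			 * empty).
			 */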
12760 for (i = desc->dofa_arg; i < strtab->dofs_size; i++) {
12761 if (str[i] == '\0')
12762 break;
12763 }
12764
12765 if (i >= strtab->dofs_size) {
12766 dtrace_dof_error(dof, "bogus format string");
12767 goto err;
12768 }
12769
12770 if (i == desc->dofa_arg) {
12771 dtrace_dof_error(dof, "empty format string");
12772 goto err;
12773 }
12774
12775 i -= desc->dofa_arg;
12776 fmt = kmem_alloc(i + 1, KM_SLEEP);
12777 bcopy(&str[desc->dofa_arg], fmt, i + 1);
12778 arg = (uint64_t)(uintptr_t)fmt;
12779 } else {
12780 if (kind == DTRACEACT_PRINTA) {
12781 ASSERT(desc->dofa_strtab == DOF_SECIDX_NONE);
12782 arg = 0;
12783 } else {
12784 arg = desc->dofa_arg;
12785 }
12786 }
12787
12788 act = dtrace_actdesc_create(kind, desc->dofa_ntuple,
12789 desc->dofa_uarg, arg);
12790
12791 if (last != NULL) {
12792 last->dtad_next = act;
12793 } else {
12794 first = act;
12795 }
12796
12797 last = act;
12798
12799 if (desc->dofa_difo == DOF_SECIDX_NONE)
12800 continue;
12801
12802 if ((difosec = dtrace_dof_sect(dof,
12803 DOF_SECT_DIFOHDR, desc->dofa_difo)) == NULL)
12804 goto err;
12805
12806 act->dtad_difo = dtrace_dof_difo(dof, difosec, vstate, cr);
12807
12808 if (act->dtad_difo == NULL)
12809 goto err;
12810 }
12811
12812 ASSERT(first != NULL);
12813 return (first);
12814
12815err:
12816 for (act = first; act != NULL; act = next) {
12817 next = act->dtad_next;
12818 dtrace_actdesc_release(act, vstate);
12819 }
12820
12821 return (NULL);
12822}
12823
12824static dtrace_ecbdesc_t *
12825dtrace_dof_ecbdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
12826 cred_t *cr)
12827{
12828 dtrace_ecbdesc_t *ep;
12829 dof_ecbdesc_t *ecb;
12830 dtrace_probedesc_t *desc;
12831 dtrace_predicate_t *pred = NULL;
12832
12833 if (sec->dofs_size < sizeof (dof_ecbdesc_t)) {
12834 dtrace_dof_error(dof, "truncated ECB description");
12835 return (NULL);
12836 }
12837
12838 if (sec->dofs_align != sizeof (uint64_t)) {
12839 dtrace_dof_error(dof, "bad alignment in ECB description");
12840 return (NULL);
12841 }
12842
12843 ecb = (dof_ecbdesc_t *)((uintptr_t)dof + (uintptr_t)sec->dofs_offset);
12844 sec = dtrace_dof_sect(dof, DOF_SECT_PROBEDESC, ecb->dofe_probes);
12845
12846 if (sec == NULL)
12847 return (NULL);
12848
12849 ep = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
12850 ep->dted_uarg = ecb->dofe_uarg;
12851 desc = &ep->dted_probe;
12852
12853 if (dtrace_dof_probedesc(dof, sec, desc) == NULL)
12854 goto err;
12855
12856 if (ecb->dofe_pred != DOF_SECIDX_NONE) {
12857 if ((sec = dtrace_dof_sect(dof,
12858 DOF_SECT_DIFOHDR, ecb->dofe_pred)) == NULL)
12859 goto err;
12860
12861 if ((pred = dtrace_dof_predicate(dof, sec, vstate, cr)) == NULL)
12862 goto err;
12863
12864 ep->dted_pred.dtpdd_predicate = pred;
12865 }
12866
12867 if (ecb->dofe_actions != DOF_SECIDX_NONE) {
12868 if ((sec = dtrace_dof_sect(dof,
12869 DOF_SECT_ACTDESC, ecb->dofe_actions)) == NULL)
12870 goto err;
12871
12872 ep->dted_action = dtrace_dof_actdesc(dof, sec, vstate, cr);
12873
12874 if (ep->dted_action == NULL)
12875 goto err;
12876 }
12877
12878 return (ep);
12879
12880err:
12881 if (pred != NULL)
12882 dtrace_predicate_release(pred, vstate);
12883 kmem_free(ep, sizeof (dtrace_ecbdesc_t));
12884 return (NULL);
12885}
12886
12887/*
12888 * APPLE NOTE: dyld handles dof relocation.
12889 * Darwin does not need dtrace_dof_relocate()
12890 */
12891
12892/*
12893 * The dof_hdr_t passed to dtrace_dof_slurp() should be a partially validated
12894 * header: it should be at the front of a memory region that is at least
12895 * sizeof (dof_hdr_t) in size -- and then at least dof_hdr.dofh_loadsz in
12896 * size. It need not be validated in any other way.
12897 */
12898static int
12899dtrace_dof_slurp(dof_hdr_t *dof, dtrace_vstate_t *vstate, cred_t *cr,
12900 dtrace_enabling_t **enabp, uint64_t ubase, int noprobes)
12901{
12902#pragma unused(ubase) /* __APPLE__ */
12903 uint64_t len = dof->dofh_loadsz, seclen;
12904 uintptr_t daddr = (uintptr_t)dof;
12905 dtrace_ecbdesc_t *ep;
12906 dtrace_enabling_t *enab;
12907 uint_t i;
12908
12909 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12910 ASSERT(dof->dofh_loadsz >= sizeof (dof_hdr_t));
12911
12912 /*
12913 * Check the DOF header identification bytes. In addition to checking
12914 * valid settings, we also verify that unused bits/bytes are zeroed so
12915 * we can use them later without fear of regressing existing binaries.
12916 */
12917 if (bcmp(&dof->dofh_ident[DOF_ID_MAG0],
12918 DOF_MAG_STRING, DOF_MAG_STRLEN) != 0) {
12919 dtrace_dof_error(dof, "DOF magic string mismatch");
12920 return (-1);
12921 }
12922
12923 if (dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_ILP32 &&
12924 dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_LP64) {
12925 dtrace_dof_error(dof, "DOF has invalid data model");
12926 return (-1);
12927 }
12928
12929 if (dof->dofh_ident[DOF_ID_ENCODING] != DOF_ENCODE_NATIVE) {
12930 dtrace_dof_error(dof, "DOF encoding mismatch");
12931 return (-1);
12932 }
12933
12934 /*
12935 * APPLE NOTE: Darwin only supports DOF_VERSION_3 for now.
12936 */
12937 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_3) {
12938 dtrace_dof_error(dof, "DOF version mismatch");
12939 return (-1);
12940 }
12941
12942 if (dof->dofh_ident[DOF_ID_DIFVERS] != DIF_VERSION_2) {
12943 dtrace_dof_error(dof, "DOF uses unsupported instruction set");
12944 return (-1);
12945 }
12946
12947 if (dof->dofh_ident[DOF_ID_DIFIREG] > DIF_DIR_NREGS) {
12948 dtrace_dof_error(dof, "DOF uses too many integer registers");
12949 return (-1);
12950 }
12951
12952 if (dof->dofh_ident[DOF_ID_DIFTREG] > DIF_DTR_NREGS) {
12953 dtrace_dof_error(dof, "DOF uses too many tuple registers");
12954 return (-1);
12955 }
12956
12957 for (i = DOF_ID_PAD; i < DOF_ID_SIZE; i++) {
12958 if (dof->dofh_ident[i] != 0) {
12959 dtrace_dof_error(dof, "DOF has invalid ident byte set");
12960 return (-1);
12961 }
12962 }
12963
12964 if (dof->dofh_flags & ~DOF_FL_VALID) {
12965 dtrace_dof_error(dof, "DOF has invalid flag bits set");
12966 return (-1);
12967 }
12968
12969 if (dof->dofh_secsize < sizeof(dof_sec_t)) {
12970 dtrace_dof_error(dof, "invalid section header size");
12971 return (-1);
12972 }
12973
12974 /*
12975 * Check that the section headers don't exceed the amount of DOF
12976 * data. Note that we cast the section size and number of sections
12977 * to uint64_t's to prevent possible overflow in the multiplication.
12978 */
12979 seclen = (uint64_t)dof->dofh_secnum * (uint64_t)dof->dofh_secsize;
12980
12981 if (dof->dofh_secoff > len || seclen > len ||
12982 dof->dofh_secoff + seclen > len) {
12983 dtrace_dof_error(dof, "truncated section headers");
12984 return (-1);
12985 }
12986
12987 if (!IS_P2ALIGNED(dof->dofh_secoff, sizeof (uint64_t))) {
12988 dtrace_dof_error(dof, "misaligned section headers");
12989 return (-1);
12990 }
12991
12992 if (!IS_P2ALIGNED(dof->dofh_secsize, sizeof (uint64_t))) {
12993 dtrace_dof_error(dof, "misaligned section size");
12994 return (-1);
12995 }
12996
12997 /*
12998 * Take an initial pass through the section headers to be sure that
12999 * the headers don't have stray offsets. If the 'noprobes' flag is
13000 * set, do not permit sections relating to providers, probes, or args.
13001 */
13002 for (i = 0; i < dof->dofh_secnum; i++) {
13003 dof_sec_t *sec = (dof_sec_t *)(daddr +
13004 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13005
13006 if (noprobes) {
13007 switch (sec->dofs_type) {
13008 case DOF_SECT_PROVIDER:
13009 case DOF_SECT_PROBES:
13010 case DOF_SECT_PRARGS:
13011 case DOF_SECT_PROFFS:
13012 dtrace_dof_error(dof, "illegal sections "
13013 "for enabling");
13014 return (-1);
13015 }
13016 }
13017
13018 if (!(sec->dofs_flags & DOF_SECF_LOAD))
13019 continue; /* just ignore non-loadable sections */
13020
13021 if (sec->dofs_align & (sec->dofs_align - 1)) {
13022 dtrace_dof_error(dof, "bad section alignment");
13023 return (-1);
13024 }
13025
13026 if (sec->dofs_offset & (sec->dofs_align - 1)) {
13027 dtrace_dof_error(dof, "misaligned section");
13028 return (-1);
13029 }
13030
13031 if (sec->dofs_offset > len || sec->dofs_size > len ||
13032 sec->dofs_offset + sec->dofs_size > len) {
13033 dtrace_dof_error(dof, "corrupt section header");
13034 return (-1);
13035 }
13036
13037 if (sec->dofs_type == DOF_SECT_STRTAB && *((char *)daddr +
13038 sec->dofs_offset + sec->dofs_size - 1) != '\0') {
13039 dtrace_dof_error(dof, "non-terminating string table");
13040 return (-1);
13041 }
13042 }
13043
13044 /*
13045 * APPLE NOTE: We have no further relocation to perform.
	 * All DOF values are relative offsets.
13047 */
13048
13049 if ((enab = *enabp) == NULL)
13050 enab = *enabp = dtrace_enabling_create(vstate);
13051
13052 for (i = 0; i < dof->dofh_secnum; i++) {
13053 dof_sec_t *sec = (dof_sec_t *)(daddr +
13054 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13055
13056 if (sec->dofs_type != DOF_SECT_ECBDESC)
13057 continue;
13058
13059 /*
		 * APPLE NOTE: Defend against a gcc 4.0 botch on x86:
		 * not all paths out of the inlined dtrace_dof_ecbdesc()
		 * are checked for a NULL return value.
		 * Check for NULL explicitly here.
13064 */
13065 ep = dtrace_dof_ecbdesc(dof, sec, vstate, cr);
13066 if (ep == NULL) {
13067 dtrace_enabling_destroy(enab);
13068 *enabp = NULL;
13069 return (-1);
13070 }
13071
13072 dtrace_enabling_add(enab, ep);
13073 }
13074
13075 return (0);
13076}
13077
13078/*
13079 * Process DOF for any options. This routine assumes that the DOF has been
13080 * at least processed by dtrace_dof_slurp().
13081 */
13082static int
13083dtrace_dof_options(dof_hdr_t *dof, dtrace_state_t *state)
13084{
13085 uint_t i;
13086 int rval;
13087 uint32_t entsize;
13088 size_t offs;
13089 dof_optdesc_t *desc;
13090
13091 for (i = 0; i < dof->dofh_secnum; i++) {
13092 dof_sec_t *sec = (dof_sec_t *)((uintptr_t)dof +
13093 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13094
13095 if (sec->dofs_type != DOF_SECT_OPTDESC)
13096 continue;
13097
13098 if (sec->dofs_align != sizeof (uint64_t)) {
13099 dtrace_dof_error(dof, "bad alignment in "
13100 "option description");
13101 return (EINVAL);
13102 }
13103
13104 if ((entsize = sec->dofs_entsize) == 0) {
13105 dtrace_dof_error(dof, "zeroed option entry size");
13106 return (EINVAL);
13107 }
13108
13109 if (entsize < sizeof (dof_optdesc_t)) {
13110 dtrace_dof_error(dof, "bad option entry size");
13111 return (EINVAL);
13112 }
13113
13114 for (offs = 0; offs < sec->dofs_size; offs += entsize) {
13115 desc = (dof_optdesc_t *)((uintptr_t)dof +
13116 (uintptr_t)sec->dofs_offset + offs);
13117
13118 if (desc->dofo_strtab != DOF_SECIDX_NONE) {
13119 dtrace_dof_error(dof, "non-zero option string");
13120 return (EINVAL);
13121 }
13122
13123 if (desc->dofo_value == (uint64_t)DTRACEOPT_UNSET) {
13124 dtrace_dof_error(dof, "unset option");
13125 return (EINVAL);
13126 }
13127
13128 if ((rval = dtrace_state_option(state,
13129 desc->dofo_option, desc->dofo_value)) != 0) {
13130 dtrace_dof_error(dof, "rejected option");
13131 return (rval);
13132 }
13133 }
13134 }
13135
13136 return (0);
13137}
13138
13139/*
13140 * DTrace Consumer State Functions
13141 */
13142static int
13143dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size)
13144{
13145 size_t hashsize, maxper, min_size, chunksize = dstate->dtds_chunksize;
13146 void *base;
13147 uintptr_t limit;
13148 dtrace_dynvar_t *dvar, *next, *start;
13149 size_t i;
13150
13151 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
13152 ASSERT(dstate->dtds_base == NULL && dstate->dtds_percpu == NULL);
13153
13154 bzero(dstate, sizeof (dtrace_dstate_t));
13155
13156 if ((dstate->dtds_chunksize = chunksize) == 0)
13157 dstate->dtds_chunksize = DTRACE_DYNVAR_CHUNKSIZE;
13158
13159 VERIFY(dstate->dtds_chunksize < (LONG_MAX - sizeof (dtrace_dynhash_t)));
13160
13161 if (size < (min_size = dstate->dtds_chunksize + sizeof (dtrace_dynhash_t)))
13162 size = min_size;
13163
13164 if ((base = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
13165 return (ENOMEM);
13166
13167 dstate->dtds_size = size;
13168 dstate->dtds_base = base;
13169 dstate->dtds_percpu = kmem_cache_alloc(dtrace_state_cache, KM_SLEEP);
13170 bzero(dstate->dtds_percpu, (int)NCPU * sizeof (dtrace_dstate_percpu_t));
13171
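	/*
	 * The dynamic variable space is carved into a hash table at the
	 * front of the region, followed by the fixed-size chunks that are
	 * strung onto the per-CPU free lists below:
	 *
	 *	base                                           base + size
	 *	+---------------------+---------------------------------+
	 *	| dtds_hash[hashsize] | dtds_chunksize'd dynamic chunks |
	 *	+---------------------+---------------------------------+
	 */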
13172 hashsize = size / (dstate->dtds_chunksize + sizeof (dtrace_dynhash_t));
13173
13174 if (hashsize != 1 && (hashsize & 1))
13175 hashsize--;
13176
13177 dstate->dtds_hashsize = hashsize;
13178 dstate->dtds_hash = dstate->dtds_base;
13179
13180 /*
13181 * Set all of our hash buckets to point to the single sink, and (if
13182 * it hasn't already been set), set the sink's hash value to be the
13183 * sink sentinel value. The sink is needed for dynamic variable
13184 * lookups to know that they have iterated over an entire, valid hash
13185 * chain.
13186 */
13187 for (i = 0; i < hashsize; i++)
13188 dstate->dtds_hash[i].dtdh_chain = &dtrace_dynhash_sink;
13189
13190 if (dtrace_dynhash_sink.dtdv_hashval != DTRACE_DYNHASH_SINK)
13191 dtrace_dynhash_sink.dtdv_hashval = DTRACE_DYNHASH_SINK;
13192
13193 /*
	 * Divide the free list of dynamic-variable chunks evenly among
	 * the NCPU per-CPU free lists.
13196 */
13197 start = (dtrace_dynvar_t *)
13198 ((uintptr_t)base + hashsize * sizeof (dtrace_dynhash_t));
13199 limit = (uintptr_t)base + size;
13200
13201 VERIFY((uintptr_t)start < limit);
13202 VERIFY((uintptr_t)start >= (uintptr_t)base);
13203
13204 maxper = (limit - (uintptr_t)start) / (int)NCPU;
13205 maxper = (maxper / dstate->dtds_chunksize) * dstate->dtds_chunksize;
13206
13207 for (i = 0; i < NCPU; i++) {
13208 dstate->dtds_percpu[i].dtdsc_free = dvar = start;
13209
13210 /*
13211 * If we don't even have enough chunks to make it once through
13212 * NCPUs, we're just going to allocate everything to the first
13213 * CPU. And if we're on the last CPU, we're going to allocate
13214 * whatever is left over. In either case, we set the limit to
13215 * be the limit of the dynamic variable space.
13216 */
13217 if (maxper == 0 || i == NCPU - 1) {
13218 limit = (uintptr_t)base + size;
13219 start = NULL;
13220 } else {
13221 limit = (uintptr_t)start + maxper;
13222 start = (dtrace_dynvar_t *)limit;
13223 }
13224
13225 VERIFY(limit <= (uintptr_t)base + size);
13226
13227 for (;;) {
13228 next = (dtrace_dynvar_t *)((uintptr_t)dvar +
13229 dstate->dtds_chunksize);
13230
13231 if ((uintptr_t)next + dstate->dtds_chunksize >= limit)
13232 break;
13233
13234 VERIFY((uintptr_t)dvar >= (uintptr_t)base &&
13235 (uintptr_t)dvar <= (uintptr_t)base + size);
13236 dvar->dtdv_next = next;
13237 dvar = next;
13238 }
13239
13240 if (maxper == 0)
13241 break;
13242 }
13243
13244 return (0);
13245}
13246
13247static void
13248dtrace_dstate_fini(dtrace_dstate_t *dstate)
13249{
13250 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
13251
13252 if (dstate->dtds_base == NULL)
13253 return;
13254
13255 kmem_free(dstate->dtds_base, dstate->dtds_size);
13256 kmem_cache_free(dtrace_state_cache, dstate->dtds_percpu);
13257}
13258
13259static void
13260dtrace_vstate_fini(dtrace_vstate_t *vstate)
13261{
13262 /*
13263 * Logical XOR, where are you?
13264 */
13265 ASSERT((vstate->dtvs_nglobals == 0) ^ (vstate->dtvs_globals != NULL));
13266
13267 if (vstate->dtvs_nglobals > 0) {
13268 kmem_free(vstate->dtvs_globals, vstate->dtvs_nglobals *
13269 sizeof (dtrace_statvar_t *));
13270 }
13271
13272 if (vstate->dtvs_ntlocals > 0) {
13273 kmem_free(vstate->dtvs_tlocals, vstate->dtvs_ntlocals *
13274 sizeof (dtrace_difv_t));
13275 }
13276
13277 ASSERT((vstate->dtvs_nlocals == 0) ^ (vstate->dtvs_locals != NULL));
13278
13279 if (vstate->dtvs_nlocals > 0) {
13280 kmem_free(vstate->dtvs_locals, vstate->dtvs_nlocals *
13281 sizeof (dtrace_statvar_t *));
13282 }
13283}
13284
13285static void
13286dtrace_state_clean(dtrace_state_t *state)
13287{
13288 if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
13289 return;
13290
13291 dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
13292 dtrace_speculation_clean(state);
13293}
13294
13295static void
13296dtrace_state_deadman(dtrace_state_t *state)
13297{
13298 hrtime_t now;
13299
13300 dtrace_sync();
13301
13302 now = dtrace_gethrtime();
13303
13304 if (state != dtrace_anon.dta_state &&
13305 now - state->dts_laststatus >= dtrace_deadman_user)
13306 return;
13307
13308 /*
13309 * We must be sure that dts_alive never appears to be less than the
13310 * value upon entry to dtrace_state_deadman(), and because we lack a
13311 * dtrace_cas64(), we cannot store to it atomically. We thus instead
13312 * store INT64_MAX to it, followed by a memory barrier, followed by
13313 * the new value. This assures that dts_alive never appears to be
13314 * less than its true value, regardless of the order in which the
13315 * stores to the underlying storage are issued.
13316 */
13317 state->dts_alive = INT64_MAX;
13318 dtrace_membar_producer();
13319 state->dts_alive = now;
13320}
13321
13322static int
13323dtrace_state_create(dev_t *devp, cred_t *cr, dtrace_state_t **new_state)
13324{
13325 minor_t minor;
13326 major_t major;
13327 char c[30];
13328 dtrace_state_t *state;
13329 dtrace_optval_t *opt;
13330 int bufsize = (int)NCPU * sizeof (dtrace_buffer_t), i;
13331
13332 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
13333 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
13334
13335 /* Cause restart */
13336 *new_state = NULL;
13337
13338 if (devp != NULL) {
13339 minor = getminor(*devp);
	} else {
13342 minor = DTRACE_NCLIENTS - 1;
13343 }
13344
13345 state = dtrace_state_allocate(minor);
13346 if (NULL == state) {
13347 printf("dtrace_open: couldn't acquire minor number %d. This usually means that too many DTrace clients are in use at the moment", minor);
13348 return (ERESTART); /* can't reacquire */
13349 }
13350
13351 state->dts_epid = DTRACE_EPIDNONE + 1;
13352
13353 (void) snprintf(c, sizeof (c), "dtrace_aggid_%d", minor);
13354 state->dts_aggid_arena = vmem_create(c, (void *)1, UINT32_MAX, 1,
13355 NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
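	/*
	 * The arena created above exists only to hand out unique
	 * aggregation IDs; basing it at (void *)1 keeps ID 0 free to
	 * serve as a sentinel.
	 */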
13356
13357 if (devp != NULL) {
13358 major = getemajor(*devp);
13359 } else {
13360 major = ddi_driver_major(dtrace_devi);
13361 }
13362
13363 state->dts_dev = makedev(major, minor);
13364
13365 if (devp != NULL)
13366 *devp = state->dts_dev;
13367
13368 /*
13369 * We allocate NCPU buffers. On the one hand, this can be quite
13370 * a bit of memory per instance (nearly 36K on a Starcat). On the
13371 * other hand, it saves an additional memory reference in the probe
13372 * path.
13373 */
13374 state->dts_buffer = kmem_zalloc(bufsize, KM_SLEEP);
13375 state->dts_aggbuffer = kmem_zalloc(bufsize, KM_SLEEP);
13376 state->dts_buf_over_limit = 0;
13377 state->dts_cleaner = CYCLIC_NONE;
13378 state->dts_deadman = CYCLIC_NONE;
13379 state->dts_vstate.dtvs_state = state;
13380
13381 for (i = 0; i < DTRACEOPT_MAX; i++)
13382 state->dts_options[i] = DTRACEOPT_UNSET;
13383
13384 /*
13385 * Set the default options.
13386 */
13387 opt = state->dts_options;
13388 opt[DTRACEOPT_BUFPOLICY] = DTRACEOPT_BUFPOLICY_SWITCH;
13389 opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_AUTO;
13390 opt[DTRACEOPT_NSPEC] = dtrace_nspec_default;
13391 opt[DTRACEOPT_SPECSIZE] = dtrace_specsize_default;
13392 opt[DTRACEOPT_CPU] = (dtrace_optval_t)DTRACE_CPUALL;
13393 opt[DTRACEOPT_STRSIZE] = dtrace_strsize_default;
13394 opt[DTRACEOPT_STACKFRAMES] = dtrace_stackframes_default;
13395 opt[DTRACEOPT_USTACKFRAMES] = dtrace_ustackframes_default;
13396 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_default;
13397 opt[DTRACEOPT_AGGRATE] = dtrace_aggrate_default;
13398 opt[DTRACEOPT_SWITCHRATE] = dtrace_switchrate_default;
13399 opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_default;
13400 opt[DTRACEOPT_JSTACKFRAMES] = dtrace_jstackframes_default;
13401 opt[DTRACEOPT_JSTACKSTRSIZE] = dtrace_jstackstrsize_default;
13402 opt[DTRACEOPT_BUFLIMIT] = dtrace_buflimit_default;
13403
13404 /*
13405 * Depending on the user credentials, we set flag bits which alter probe
13406 * visibility or the amount of destructiveness allowed. In the case of
13407 * actual anonymous tracing, or the possession of all privileges, all of
13408 * the normal checks are bypassed.
13409 */
13410#if defined(__APPLE__)
13411 if (cr != NULL) {
13412 kauth_cred_ref(cr);
13413 state->dts_cred.dcr_cred = cr;
13414 }
13415 if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
13416 if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed()) {
13417 /*
13418 * Allow only proc credentials when DTrace is
13419 * restricted by the current security policy
13420 */
13421 state->dts_cred.dcr_visible = DTRACE_CRV_ALLPROC;
13422 state->dts_cred.dcr_action = DTRACE_CRA_PROC | DTRACE_CRA_PROC_CONTROL | DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
		} else {
13425 state->dts_cred.dcr_visible = DTRACE_CRV_ALL;
13426 state->dts_cred.dcr_action = DTRACE_CRA_ALL;
13427 }
13428 }
13429
13430#else
13431 if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
13432 state->dts_cred.dcr_visible = DTRACE_CRV_ALL;
13433 state->dts_cred.dcr_action = DTRACE_CRA_ALL;
	} else {
13436 /*
13437 * Set up the credentials for this instantiation. We take a
13438 * hold on the credential to prevent it from disappearing on
13439 * us; this in turn prevents the zone_t referenced by this
13440 * credential from disappearing. This means that we can
13441 * examine the credential and the zone from probe context.
13442 */
13443 crhold(cr);
13444 state->dts_cred.dcr_cred = cr;
13445
13446 /*
13447 * CRA_PROC means "we have *some* privilege for dtrace" and
13448 * unlocks the use of variables like pid, zonename, etc.
13449 */
13450 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE) ||
13451 PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
13452 state->dts_cred.dcr_action |= DTRACE_CRA_PROC;
13453 }
13454
13455 /*
13456 * dtrace_user allows use of syscall and profile providers.
13457 * If the user also has proc_owner and/or proc_zone, we
13458 * extend the scope to include additional visibility and
13459 * destructive power.
13460 */
13461 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE)) {
13462 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE)) {
13463 state->dts_cred.dcr_visible |=
13464 DTRACE_CRV_ALLPROC;
13465
13466 state->dts_cred.dcr_action |=
13467 DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
13468 }
13469
13470 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE)) {
13471 state->dts_cred.dcr_visible |=
13472 DTRACE_CRV_ALLZONE;
13473
13474 state->dts_cred.dcr_action |=
13475 DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
13476 }
13477
13478 /*
13479 * If we have all privs in whatever zone this is,
13480 * we can do destructive things to processes which
13481 * have altered credentials.
13482 *
13483 * APPLE NOTE: Darwin doesn't do zones.
13484 * Behave as if zone always has destructive privs.
13485 */
13486
13487 state->dts_cred.dcr_action |=
13488 DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
13489 }
13490
13491 /*
13492 * Holding the dtrace_kernel privilege also implies that
13493 * the user has the dtrace_user privilege from a visibility
13494 * perspective. But without further privileges, some
13495 * destructive actions are not available.
13496 */
13497 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE)) {
13498 /*
13499 * Make all probes in all zones visible. However,
13500 * this doesn't mean that all actions become available
13501 * to all zones.
13502 */
13503 state->dts_cred.dcr_visible |= DTRACE_CRV_KERNEL |
13504 DTRACE_CRV_ALLPROC | DTRACE_CRV_ALLZONE;
13505
13506 state->dts_cred.dcr_action |= DTRACE_CRA_KERNEL |
13507 DTRACE_CRA_PROC;
13508 /*
13509 * Holding proc_owner means that destructive actions
13510 * for *this* zone are allowed.
13511 */
13512 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
13513 state->dts_cred.dcr_action |=
13514 DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
13515
13516 /*
13517 * Holding proc_zone means that destructive actions
13518 * for this user/group ID in all zones is allowed.
13519 */
13520 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
13521 state->dts_cred.dcr_action |=
13522 DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
13523
13524 /*
13525 * If we have all privs in whatever zone this is,
13526 * we can do destructive things to processes which
13527 * have altered credentials.
13528 *
13529 * APPLE NOTE: Darwin doesn't do zones.
13530 * Behave as if zone always has destructive privs.
13531 */
13532 state->dts_cred.dcr_action |=
13533 DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
13534 }
13535
13536 /*
13537 * Holding the dtrace_proc privilege gives control over fasttrap
13538 * and pid providers. We need to grant wider destructive
13539 * privileges in the event that the user has proc_owner and/or
13540 * proc_zone.
13541 */
13542 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
13543 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
13544 state->dts_cred.dcr_action |=
13545 DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
13546
13547 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
13548 state->dts_cred.dcr_action |=
13549 DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
13550 }
13551 }
13552#endif
13553
13554 *new_state = state;
13555 return(0); /* Success */
13556}
13557
13558static int
13559dtrace_state_buffer(dtrace_state_t *state, dtrace_buffer_t *buf, int which)
13560{
13561 dtrace_optval_t *opt = state->dts_options, size;
13562 processorid_t cpu = 0;
13563 size_t limit = buf->dtb_size;
13564 int flags = 0, rval;
13565
13566 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
13567 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
13568 ASSERT(which < DTRACEOPT_MAX);
13569 ASSERT(state->dts_activity == DTRACE_ACTIVITY_INACTIVE ||
13570 (state == dtrace_anon.dta_state &&
13571 state->dts_activity == DTRACE_ACTIVITY_ACTIVE));
13572
13573 if (opt[which] == DTRACEOPT_UNSET || opt[which] == 0)
13574 return (0);
13575
13576 if (opt[DTRACEOPT_CPU] != DTRACEOPT_UNSET)
13577 cpu = opt[DTRACEOPT_CPU];
13578
13579 if (which == DTRACEOPT_SPECSIZE)
13580 flags |= DTRACEBUF_NOSWITCH;
13581
13582 if (which == DTRACEOPT_BUFSIZE) {
13583 if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_RING)
13584 flags |= DTRACEBUF_RING;
13585
13586 if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_FILL)
13587 flags |= DTRACEBUF_FILL;
13588
13589 if (state != dtrace_anon.dta_state ||
13590 state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
13591 flags |= DTRACEBUF_INACTIVE;
13592 }
13593
13594 for (size = opt[which]; (size_t)size >= sizeof (uint64_t); size >>= 1) {
13595 /*
13596 * The size must be 8-byte aligned. If the size is not 8-byte
13597 * aligned, drop it down by the difference.
13598 */
13599 if (size & (sizeof (uint64_t) - 1))
13600 size -= size & (sizeof (uint64_t) - 1);
13601
13602 if (size < state->dts_reserve) {
13603 /*
			 * Buffers must always be large enough to accommodate
			 * their prereserved space. We return E2BIG instead
			 * of ENOMEM in this case to allow user-level
			 * software to differentiate the cases.
13608 */
13609 return (E2BIG);
13610 }
13611 limit = opt[DTRACEOPT_BUFLIMIT] * size / 100;
13612 rval = dtrace_buffer_alloc(buf, limit, size, flags, cpu);
13613
13614 if (rval != ENOMEM) {
13615 opt[which] = size;
13616 return (rval);
13617 }
13618
13619 if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
13620 return (rval);
13621 }
13622
13623 return (ENOMEM);
13624}
13625
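/*
 * Allocate the principal buffer, the aggregation buffer, and the buffer for
 * each speculation, returning the first error encountered (if any).
 */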
13626static int
13627dtrace_state_buffers(dtrace_state_t *state)
13628{
13629 dtrace_speculation_t *spec = state->dts_speculations;
13630 int rval, i;
13631
13632 if ((rval = dtrace_state_buffer(state, state->dts_buffer,
13633 DTRACEOPT_BUFSIZE)) != 0)
13634 return (rval);
13635
13636 if ((rval = dtrace_state_buffer(state, state->dts_aggbuffer,
13637 DTRACEOPT_AGGSIZE)) != 0)
13638 return (rval);
13639
13640 for (i = 0; i < state->dts_nspeculations; i++) {
13641 if ((rval = dtrace_state_buffer(state,
13642 spec[i].dtsp_buffer, DTRACEOPT_SPECSIZE)) != 0)
13643 return (rval);
13644 }
13645
13646 return (0);
13647}
13648
13649static void
13650dtrace_state_prereserve(dtrace_state_t *state)
13651{
13652 dtrace_ecb_t *ecb;
13653 dtrace_probe_t *probe;
13654
13655 state->dts_reserve = 0;
13656
13657 if (state->dts_options[DTRACEOPT_BUFPOLICY] != DTRACEOPT_BUFPOLICY_FILL)
13658 return;
13659
13660 /*
13661 * If our buffer policy is a "fill" buffer policy, we need to set the
13662 * prereserved space to be the space required by the END probes.
13663 */
13664 probe = dtrace_probes[dtrace_probeid_end - 1];
13665 ASSERT(probe != NULL);
13666
13667 for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
13668 if (ecb->dte_state != state)
13669 continue;
13670
13671 state->dts_reserve += ecb->dte_needed + ecb->dte_alignment;
13672 }
13673}
13674
13675static int
13676dtrace_state_go(dtrace_state_t *state, processorid_t *cpu)
13677{
13678 dtrace_optval_t *opt = state->dts_options, sz, nspec;
13679 dtrace_speculation_t *spec;
13680 dtrace_buffer_t *buf;
13681 cyc_handler_t hdlr;
13682 cyc_time_t when;
13683 int rval = 0, i, bufsize = (int)NCPU * sizeof (dtrace_buffer_t);
13684 dtrace_icookie_t cookie;
13685
13686 lck_mtx_lock(&cpu_lock);
13687 lck_mtx_lock(&dtrace_lock);
13688
13689 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
13690 rval = EBUSY;
13691 goto out;
13692 }
13693
13694 /*
13695 * Before we can perform any checks, we must prime all of the
13696 * retained enablings that correspond to this state.
13697 */
13698 dtrace_enabling_prime(state);
13699
13700 if (state->dts_destructive && !state->dts_cred.dcr_destructive) {
13701 rval = EACCES;
13702 goto out;
13703 }
13704
13705 dtrace_state_prereserve(state);
13706
13707 /*
	 * Now we want to try to allocate our speculations.
13709 * We do not automatically resize the number of speculations; if
13710 * this fails, we will fail the operation.
13711 */
13712 nspec = opt[DTRACEOPT_NSPEC];
13713 ASSERT(nspec != DTRACEOPT_UNSET);
13714
13715 if (nspec > INT_MAX) {
13716 rval = ENOMEM;
13717 goto out;
13718 }
13719
13720 spec = kmem_zalloc(nspec * sizeof (dtrace_speculation_t), KM_NOSLEEP);
13721
13722 if (spec == NULL) {
13723 rval = ENOMEM;
13724 goto out;
13725 }
13726
13727 state->dts_speculations = spec;
13728 state->dts_nspeculations = (int)nspec;
13729
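	/*
	 * Allocate the per-CPU buffer array for each speculation.
	 */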
13730 for (i = 0; i < nspec; i++) {
13731 if ((buf = kmem_zalloc(bufsize, KM_NOSLEEP)) == NULL) {
13732 rval = ENOMEM;
13733 goto err;
13734 }
13735
13736 spec[i].dtsp_buffer = buf;
13737 }
13738
13739 if (opt[DTRACEOPT_GRABANON] != DTRACEOPT_UNSET) {
13740 if (dtrace_anon.dta_state == NULL) {
13741 rval = ENOENT;
13742 goto out;
13743 }
13744
13745 if (state->dts_necbs != 0) {
13746 rval = EALREADY;
13747 goto out;
13748 }
13749
13750 state->dts_anon = dtrace_anon_grab();
13751 ASSERT(state->dts_anon != NULL);
13752 state = state->dts_anon;
13753
13754 /*
13755 * We want "grabanon" to be set in the grabbed state, so we'll
13756 * copy that option value from the grabbing state into the
13757 * grabbed state.
13758 */
13759 state->dts_options[DTRACEOPT_GRABANON] =
13760 opt[DTRACEOPT_GRABANON];
13761
13762 *cpu = dtrace_anon.dta_beganon;
13763
13764 /*
13765 * If the anonymous state is active (as it almost certainly
13766 * is if the anonymous enabling ultimately matched anything),
13767 * we don't allow any further option processing -- but we
13768 * don't return failure.
13769 */
13770 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
13771 goto out;
13772 }
13773
13774 if (opt[DTRACEOPT_AGGSIZE] != DTRACEOPT_UNSET &&
13775 opt[DTRACEOPT_AGGSIZE] != 0) {
13776 if (state->dts_aggregations == NULL) {
13777 /*
13778 * We're not going to create an aggregation buffer
13779 * because we don't have any ECBs that contain
13780 * aggregations -- set this option to 0.
13781 */
13782 opt[DTRACEOPT_AGGSIZE] = 0;
13783 } else {
13784 /*
13785 * If we have an aggregation buffer, we must also have
13786 * a buffer to use as scratch.
13787 */
13788 if (opt[DTRACEOPT_BUFSIZE] == DTRACEOPT_UNSET ||
13789 (size_t)opt[DTRACEOPT_BUFSIZE] < state->dts_needed) {
13790 opt[DTRACEOPT_BUFSIZE] = state->dts_needed;
13791 }
13792 }
13793 }
13794
13795 if (opt[DTRACEOPT_SPECSIZE] != DTRACEOPT_UNSET &&
13796 opt[DTRACEOPT_SPECSIZE] != 0) {
13797 if (!state->dts_speculates) {
13798 /*
13799 * We're not going to create speculation buffers
13800 * because we don't have any ECBs that actually
13801 * speculate -- set the speculation size to 0.
13802 */
13803 opt[DTRACEOPT_SPECSIZE] = 0;
13804 }
13805 }
13806
13807 /*
13808 * The bare minimum size for any buffer that we're actually going to
13809 * do anything to is sizeof (uint64_t).
13810 */
13811 sz = sizeof (uint64_t);
13812
13813 if ((state->dts_needed != 0 && opt[DTRACEOPT_BUFSIZE] < sz) ||
13814 (state->dts_speculates && opt[DTRACEOPT_SPECSIZE] < sz) ||
13815 (state->dts_aggregations != NULL && opt[DTRACEOPT_AGGSIZE] < sz)) {
13816 /*
13817 * A buffer size has been explicitly set to 0 (or to a size
13818 * that will be adjusted to 0) and we need the space -- we
13819 * need to return failure. We return ENOSPC to differentiate
13820 * it from failing to allocate a buffer due to failure to meet
13821 * the reserve (for which we return E2BIG).
13822 */
13823 rval = ENOSPC;
13824 goto out;
13825 }
13826
13827 if ((rval = dtrace_state_buffers(state)) != 0)
13828 goto err;
13829
13830 if ((sz = opt[DTRACEOPT_DYNVARSIZE]) == DTRACEOPT_UNSET)
13831 sz = dtrace_dstate_defsize;
13832
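	/*
	 * As with the principal buffers, if we can't allocate the dynamic
	 * variable space at the requested size, we halve the size and retry --
	 * unless the buffer resizing policy is "manual", in which case we
	 * fail immediately.
	 */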
13833 do {
13834 rval = dtrace_dstate_init(&state->dts_vstate.dtvs_dynvars, sz);
13835
13836 if (rval == 0)
13837 break;
13838
13839 if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
13840 goto err;
13841 } while (sz >>= 1);
13842
13843 opt[DTRACEOPT_DYNVARSIZE] = sz;
13844
13845 if (rval != 0)
13846 goto err;
13847
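	/*
	 * Clamp the status rate, cleaning rate, string size, and buffer limit
	 * options to their permitted ranges.
	 */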
13848 if (opt[DTRACEOPT_STATUSRATE] > dtrace_statusrate_max)
13849 opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_max;
13850
13851 if (opt[DTRACEOPT_CLEANRATE] == 0)
13852 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
13853
13854 if (opt[DTRACEOPT_CLEANRATE] < dtrace_cleanrate_min)
13855 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_min;
13856
13857 if (opt[DTRACEOPT_CLEANRATE] > dtrace_cleanrate_max)
13858 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
13859
13860 if (opt[DTRACEOPT_STRSIZE] > dtrace_strsize_max)
13861 opt[DTRACEOPT_STRSIZE] = dtrace_strsize_max;
13862
13863 if (opt[DTRACEOPT_STRSIZE] < dtrace_strsize_min)
13864 opt[DTRACEOPT_STRSIZE] = dtrace_strsize_min;
13865
13866 if (opt[DTRACEOPT_BUFLIMIT] > dtrace_buflimit_max)
13867 opt[DTRACEOPT_BUFLIMIT] = dtrace_buflimit_max;
13868
13869 if (opt[DTRACEOPT_BUFLIMIT] < dtrace_buflimit_min)
13870 opt[DTRACEOPT_BUFLIMIT] = dtrace_buflimit_min;
13871
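	/*
	 * Schedule the buffer-cleaning cyclic at the configured cleaning
	 * rate, and the deadman cyclic at the deadman interval.
	 */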
13872 hdlr.cyh_func = (cyc_func_t)dtrace_state_clean;
13873 hdlr.cyh_arg = state;
13874 hdlr.cyh_level = CY_LOW_LEVEL;
13875
13876 when.cyt_when = 0;
13877 when.cyt_interval = opt[DTRACEOPT_CLEANRATE];
13878
13879 state->dts_cleaner = cyclic_add(&hdlr, &when);
13880
13881 hdlr.cyh_func = (cyc_func_t)dtrace_state_deadman;
13882 hdlr.cyh_arg = state;
13883 hdlr.cyh_level = CY_LOW_LEVEL;
13884
13885 when.cyt_when = 0;
13886 when.cyt_interval = dtrace_deadman_interval;
13887
13888 state->dts_alive = state->dts_laststatus = dtrace_gethrtime();
13889 state->dts_deadman = cyclic_add(&hdlr, &when);
13890
13891 state->dts_activity = DTRACE_ACTIVITY_WARMUP;
13892
13893 /*
13894 * Now it's time to actually fire the BEGIN probe. We need to disable
13895 * interrupts here both to record the CPU on which we fired the BEGIN
13896 * probe (the data from this CPU will be processed first at user
13897 * level) and to manually activate the buffer for this CPU.
13898 */
13899 cookie = dtrace_interrupt_disable();
13900 *cpu = CPU->cpu_id;
13901 ASSERT(state->dts_buffer[*cpu].dtb_flags & DTRACEBUF_INACTIVE);
13902 state->dts_buffer[*cpu].dtb_flags &= ~DTRACEBUF_INACTIVE;
13903
13904 dtrace_probe(dtrace_probeid_begin,
13905 (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
13906 dtrace_interrupt_enable(cookie);
13907 /*
13908 * We may have had an exit action from a BEGIN probe; only change our
13909 * state to ACTIVE if we're still in WARMUP.
13910 */
13911 ASSERT(state->dts_activity == DTRACE_ACTIVITY_WARMUP ||
13912 state->dts_activity == DTRACE_ACTIVITY_DRAINING);
13913
13914 if (state->dts_activity == DTRACE_ACTIVITY_WARMUP)
13915 state->dts_activity = DTRACE_ACTIVITY_ACTIVE;
13916
13917 /*
	 * Regardless of whether we're now in ACTIVE or DRAINING, we
13919 * want each CPU to transition its principal buffer out of the
13920 * INACTIVE state. Doing this assures that no CPU will suddenly begin
13921 * processing an ECB halfway down a probe's ECB chain; all CPUs will
13922 * atomically transition from processing none of a state's ECBs to
13923 * processing all of them.
13924 */
13925 dtrace_xcall(DTRACE_CPUALL,
13926 (dtrace_xcall_t)dtrace_buffer_activate, state);
13927 goto out;
13928
13929err:
13930 dtrace_buffer_free(state->dts_buffer);
13931 dtrace_buffer_free(state->dts_aggbuffer);
13932
13933 if ((nspec = state->dts_nspeculations) == 0) {
13934 ASSERT(state->dts_speculations == NULL);
13935 goto out;
13936 }
13937
13938 spec = state->dts_speculations;
13939 ASSERT(spec != NULL);
13940
13941 for (i = 0; i < state->dts_nspeculations; i++) {
13942 if ((buf = spec[i].dtsp_buffer) == NULL)
13943 break;
13944
13945 dtrace_buffer_free(buf);
13946 kmem_free(buf, bufsize);
13947 }
13948
13949 kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
13950 state->dts_nspeculations = 0;
13951 state->dts_speculations = NULL;
13952
13953out:
13954 lck_mtx_unlock(&dtrace_lock);
13955 lck_mtx_unlock(&cpu_lock);
13956
13957 return (rval);
13958}
13959
13960static int
13961dtrace_state_stop(dtrace_state_t *state, processorid_t *cpu)
13962{
13963 dtrace_icookie_t cookie;
13964
13965 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
13966
13967 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE &&
13968 state->dts_activity != DTRACE_ACTIVITY_DRAINING)
13969 return (EINVAL);
13970
13971 /*
13972 * We'll set the activity to DTRACE_ACTIVITY_DRAINING, and issue a sync
13973 * to be sure that every CPU has seen it. See below for the details
13974 * on why this is done.
13975 */
13976 state->dts_activity = DTRACE_ACTIVITY_DRAINING;
13977 dtrace_sync();
13978
13979 /*
13980 * By this point, it is impossible for any CPU to be still processing
13981 * with DTRACE_ACTIVITY_ACTIVE. We can thus set our activity to
13982 * DTRACE_ACTIVITY_COOLDOWN and know that we're not racing with any
13983 * other CPU in dtrace_buffer_reserve(). This allows dtrace_probe()
13984 * and callees to know that the activity is DTRACE_ACTIVITY_COOLDOWN
13985 * iff we're in the END probe.
13986 */
13987 state->dts_activity = DTRACE_ACTIVITY_COOLDOWN;
13988 dtrace_sync();
13989 ASSERT(state->dts_activity == DTRACE_ACTIVITY_COOLDOWN);
13990
13991 /*
13992 * Finally, we can release the reserve and call the END probe. We
13993 * disable interrupts across calling the END probe to allow us to
13994 * return the CPU on which we actually called the END probe. This
13995 * allows user-land to be sure that this CPU's principal buffer is
13996 * processed last.
13997 */
13998 state->dts_reserve = 0;
13999
14000 cookie = dtrace_interrupt_disable();
14001 *cpu = CPU->cpu_id;
14002 dtrace_probe(dtrace_probeid_end,
14003 (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
14004 dtrace_interrupt_enable(cookie);
14005
14006 state->dts_activity = DTRACE_ACTIVITY_STOPPED;
14007 dtrace_sync();
14008
14009 return (0);
14010}
14011
14012static int
14013dtrace_state_option(dtrace_state_t *state, dtrace_optid_t option,
14014 dtrace_optval_t val)
14015{
14016 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14017
14018 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
14019 return (EBUSY);
14020
14021 if (option >= DTRACEOPT_MAX)
14022 return (EINVAL);
14023
14024 if (option != DTRACEOPT_CPU && val < 0)
14025 return (EINVAL);
14026
14027 switch (option) {
14028 case DTRACEOPT_DESTRUCTIVE:
14029 /*
14030 * Prevent consumers from enabling destructive actions if DTrace
14031 * is running in a restricted environment, or if actions are
14032 * disallowed.
14033 */
14034 if (dtrace_is_restricted() || dtrace_destructive_disallow)
14035 return (EACCES);
14036
14037 state->dts_cred.dcr_destructive = 1;
14038 break;
14039
14040 case DTRACEOPT_BUFSIZE:
14041 case DTRACEOPT_DYNVARSIZE:
14042 case DTRACEOPT_AGGSIZE:
14043 case DTRACEOPT_SPECSIZE:
14044 case DTRACEOPT_STRSIZE:
14045 if (val < 0)
14046 return (EINVAL);
14047
14048 if (val >= LONG_MAX) {
14049 /*
14050 * If this is an otherwise negative value, set it to
14051 * the highest multiple of 128m less than LONG_MAX.
14052 * Technically, we're adjusting the size without
14053 * regard to the buffer resizing policy, but in fact,
14054 * this has no effect -- if we set the buffer size to
14055 * ~LONG_MAX and the buffer policy is ultimately set to
14056 * be "manual", the buffer allocation is guaranteed to
14057 * fail, if only because the allocation requires two
			 * buffers. (We set the size to the highest
14059 * multiple of 128m because it ensures that the size
14060 * will remain a multiple of a megabyte when
14061 * repeatedly halved -- all the way down to 15m.)
14062 */
14063 val = LONG_MAX - (1 << 27) + 1;
14064 }
14065 }
14066
14067 state->dts_options[option] = val;
14068
14069 return (0);
14070}
14071
14072static void
14073dtrace_state_destroy(dtrace_state_t *state)
14074{
14075 dtrace_ecb_t *ecb;
14076 dtrace_vstate_t *vstate = &state->dts_vstate;
14077 minor_t minor = getminor(state->dts_dev);
14078 int i, bufsize = (int)NCPU * sizeof (dtrace_buffer_t);
14079 dtrace_speculation_t *spec = state->dts_speculations;
14080 int nspec = state->dts_nspeculations;
14081 uint32_t match;
14082
14083 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14084 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
14085
14086 /*
14087 * First, retract any retained enablings for this state.
14088 */
14089 dtrace_enabling_retract(state);
14090 ASSERT(state->dts_nretained == 0);
14091
14092 if (state->dts_activity == DTRACE_ACTIVITY_ACTIVE ||
14093 state->dts_activity == DTRACE_ACTIVITY_DRAINING) {
14094 /*
14095 * We have managed to come into dtrace_state_destroy() on a
14096 * hot enabling -- almost certainly because of a disorderly
14097 * shutdown of a consumer. (That is, a consumer that is
14098 * exiting without having called dtrace_stop().) In this case,
14099 * we're going to set our activity to be KILLED, and then
14100 * issue a sync to be sure that everyone is out of probe
14101 * context before we start blowing away ECBs.
14102 */
14103 state->dts_activity = DTRACE_ACTIVITY_KILLED;
14104 dtrace_sync();
14105 }
14106
14107 /*
14108 * Release the credential hold we took in dtrace_state_create().
14109 */
14110 if (state->dts_cred.dcr_cred != NULL)
14111 kauth_cred_unref(&state->dts_cred.dcr_cred);
14112
14113 /*
14114 * Now we can safely disable and destroy any enabled probes. Because
14115 * any DTRACE_PRIV_KERNEL probes may actually be slowing our progress
14116 * (especially if they're all enabled), we take two passes through the
14117 * ECBs: in the first, we disable just DTRACE_PRIV_KERNEL probes, and
14118 * in the second we disable whatever is left over.
14119 */
14120 for (match = DTRACE_PRIV_KERNEL; ; match = 0) {
14121 for (i = 0; i < state->dts_necbs; i++) {
14122 if ((ecb = state->dts_ecbs[i]) == NULL)
14123 continue;
14124
14125 if (match && ecb->dte_probe != NULL) {
14126 dtrace_probe_t *probe = ecb->dte_probe;
14127 dtrace_provider_t *prov = probe->dtpr_provider;
14128
14129 if (!(prov->dtpv_priv.dtpp_flags & match))
14130 continue;
14131 }
14132
14133 dtrace_ecb_disable(ecb);
14134 dtrace_ecb_destroy(ecb);
14135 }
14136
14137 if (!match)
14138 break;
14139 }
14140
14141 /*
14142 * Before we free the buffers, perform one more sync to assure that
14143 * every CPU is out of probe context.
14144 */
14145 dtrace_sync();
14146
14147 dtrace_buffer_free(state->dts_buffer);
14148 dtrace_buffer_free(state->dts_aggbuffer);
14149
14150 for (i = 0; i < nspec; i++)
14151 dtrace_buffer_free(spec[i].dtsp_buffer);
14152
14153 if (state->dts_cleaner != CYCLIC_NONE)
14154 cyclic_remove(state->dts_cleaner);
14155
14156 if (state->dts_deadman != CYCLIC_NONE)
14157 cyclic_remove(state->dts_deadman);
14158
14159 dtrace_dstate_fini(&vstate->dtvs_dynvars);
14160 dtrace_vstate_fini(vstate);
14161 kmem_free(state->dts_ecbs, state->dts_necbs * sizeof (dtrace_ecb_t *));
14162
14163 if (state->dts_aggregations != NULL) {
14164#if DEBUG
14165 for (i = 0; i < state->dts_naggregations; i++)
14166 ASSERT(state->dts_aggregations[i] == NULL);
14167#endif
14168 ASSERT(state->dts_naggregations > 0);
14169 kmem_free(state->dts_aggregations,
14170 state->dts_naggregations * sizeof (dtrace_aggregation_t *));
14171 }
14172
14173 kmem_free(state->dts_buffer, bufsize);
14174 kmem_free(state->dts_aggbuffer, bufsize);
14175
14176 for (i = 0; i < nspec; i++)
14177 kmem_free(spec[i].dtsp_buffer, bufsize);
14178
14179 kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
14180
14181 dtrace_format_destroy(state);
14182
14183 vmem_destroy(state->dts_aggid_arena);
14184 dtrace_state_free(minor);
14185}
14186
14187/*
14188 * DTrace Anonymous Enabling Functions
14189 */
14190
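/*
 * Determine whether kernel symbols should be retained: never in a restricted
 * environment (unless restrictions have been relaxed), and otherwise only
 * when the kernel symbol mode demands that symbols always come from the
 * kernel.
 */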
14191int
14192dtrace_keep_kernel_symbols(void)
14193{
14194 if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed()) {
14195 return 0;
14196 }
14197
14198 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL)
14199 return 1;
14200
14201 return 0;
14202}
14203
14204static dtrace_state_t *
14205dtrace_anon_grab(void)
14206{
14207 dtrace_state_t *state;
14208
14209 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14210
14211 if ((state = dtrace_anon.dta_state) == NULL) {
14212 ASSERT(dtrace_anon.dta_enabling == NULL);
14213 return (NULL);
14214 }
14215
14216 ASSERT(dtrace_anon.dta_enabling != NULL);
14217 ASSERT(dtrace_retained != NULL);
14218
14219 dtrace_enabling_destroy(dtrace_anon.dta_enabling);
14220 dtrace_anon.dta_enabling = NULL;
14221 dtrace_anon.dta_state = NULL;
14222
14223 return (state);
14224}
14225
14226static void
14227dtrace_anon_property(void)
14228{
14229 int i, rv;
14230 dtrace_state_t *state;
14231 dof_hdr_t *dof;
14232 char c[32]; /* enough for "dof-data-" + digits */
14233
14234 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14235 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
14236
14237 for (i = 0; ; i++) {
14238 (void) snprintf(c, sizeof (c), "dof-data-%d", i);
14239
14240 dtrace_err_verbose = 1;
14241
14242 if ((dof = dtrace_dof_property(c)) == NULL) {
14243 dtrace_err_verbose = 0;
14244 break;
14245 }
14246
14247#ifdef illumos
14248 /*
14249 * We want to create anonymous state, so we need to transition
14250 * the kernel debugger to indicate that DTrace is active. If
14251 * this fails (e.g. because the debugger has modified text in
14252 * some way), we won't continue with the processing.
14253 */
14254 if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
14255 cmn_err(CE_NOTE, "kernel debugger active; anonymous "
14256 "enabling ignored.");
14257 dtrace_dof_destroy(dof);
14258 break;
14259 }
14260#endif
14261
14262 /*
14263 * If we haven't allocated an anonymous state, we'll do so now.
14264 */
14265 if ((state = dtrace_anon.dta_state) == NULL) {
14266 rv = dtrace_state_create(NULL, NULL, &state);
14267 dtrace_anon.dta_state = state;
14268 if (rv != 0 || state == NULL) {
14269 /*
14270 * This basically shouldn't happen: the only
14271 * failure mode from dtrace_state_create() is a
14272 * failure of ddi_soft_state_zalloc() that
14273 * itself should never happen. Still, the
14274 * interface allows for a failure mode, and
14275 * we want to fail as gracefully as possible:
14276 * we'll emit an error message and cease
14277 * processing anonymous state in this case.
14278 */
14279 cmn_err(CE_WARN, "failed to create "
14280 "anonymous state");
14281 dtrace_dof_destroy(dof);
14282 break;
14283 }
14284 }
14285
14286 rv = dtrace_dof_slurp(dof, &state->dts_vstate, CRED(),
14287 &dtrace_anon.dta_enabling, 0, B_TRUE);
14288
14289 if (rv == 0)
14290 rv = dtrace_dof_options(dof, state);
14291
14292 dtrace_err_verbose = 0;
14293 dtrace_dof_destroy(dof);
14294
14295 if (rv != 0) {
14296 /*
14297 * This is malformed DOF; chuck any anonymous state
14298 * that we created.
14299 */
14300 ASSERT(dtrace_anon.dta_enabling == NULL);
14301 dtrace_state_destroy(state);
14302 dtrace_anon.dta_state = NULL;
14303 break;
14304 }
14305
14306 ASSERT(dtrace_anon.dta_enabling != NULL);
14307 }
14308
14309 if (dtrace_anon.dta_enabling != NULL) {
14310 int rval;
14311
14312 /*
14313 * dtrace_enabling_retain() can only fail because we are
14314 * trying to retain more enablings than are allowed -- but
14315 * we only have one anonymous enabling, and we are guaranteed
14316 * to be allowed at least one retained enabling; we assert
14317 * that dtrace_enabling_retain() returns success.
14318 */
14319 rval = dtrace_enabling_retain(dtrace_anon.dta_enabling);
14320 ASSERT(rval == 0);
14321
14322 dtrace_enabling_dump(dtrace_anon.dta_enabling);
14323 }
14324}
14325
14326/*
14327 * DTrace Helper Functions
14328 */
14329static void
14330dtrace_helper_trace(dtrace_helper_action_t *helper,
14331 dtrace_mstate_t *mstate, dtrace_vstate_t *vstate, int where)
14332{
14333 uint32_t size, next, nnext;
14334 int i;
14335 dtrace_helptrace_t *ent;
14336 uint16_t flags = cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
14337
14338 if (!dtrace_helptrace_enabled)
14339 return;
14340
14341 ASSERT((uint32_t)vstate->dtvs_nlocals <= dtrace_helptrace_nlocals);
14342
14343 /*
14344 * What would a tracing framework be without its own tracing
14345 * framework? (Well, a hell of a lot simpler, for starters...)
14346 */
14347 size = sizeof (dtrace_helptrace_t) + dtrace_helptrace_nlocals *
14348 sizeof (uint64_t) - sizeof (uint64_t);
14349
14350 /*
14351 * Iterate until we can allocate a slot in the trace buffer.
14352 */
14353 do {
14354 next = dtrace_helptrace_next;
14355
14356 if (next + size < dtrace_helptrace_bufsize) {
14357 nnext = next + size;
14358 } else {
14359 nnext = size;
14360 }
14361 } while (dtrace_cas32(&dtrace_helptrace_next, next, nnext) != next);
14362
14363 /*
14364 * We have our slot; fill it in.
14365 */
14366 if (nnext == size)
14367 next = 0;
14368
14369 ent = (dtrace_helptrace_t *)&dtrace_helptrace_buffer[next];
14370 ent->dtht_helper = helper;
14371 ent->dtht_where = where;
14372 ent->dtht_nlocals = vstate->dtvs_nlocals;
14373
14374 ent->dtht_fltoffs = (mstate->dtms_present & DTRACE_MSTATE_FLTOFFS) ?
14375 mstate->dtms_fltoffs : -1;
14376 ent->dtht_fault = DTRACE_FLAGS2FLT(flags);
14377 ent->dtht_illval = cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
14378
14379 for (i = 0; i < vstate->dtvs_nlocals; i++) {
14380 dtrace_statvar_t *svar;
14381
14382 if ((svar = vstate->dtvs_locals[i]) == NULL)
14383 continue;
14384
14385 ASSERT(svar->dtsv_size >= (int)NCPU * sizeof (uint64_t));
14386 ent->dtht_locals[i] =
14387 ((uint64_t *)(uintptr_t)svar->dtsv_data)[CPU->cpu_id];
14388 }
14389}
14390
14391static uint64_t
14392dtrace_helper(int which, dtrace_mstate_t *mstate,
14393 dtrace_state_t *state, uint64_t arg0, uint64_t arg1)
14394{
14395 uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
14396 uint64_t sarg0 = mstate->dtms_arg[0];
14397 uint64_t sarg1 = mstate->dtms_arg[1];
14398 uint64_t rval = 0;
14399 dtrace_helpers_t *helpers = curproc->p_dtrace_helpers;
14400 dtrace_helper_action_t *helper;
14401 dtrace_vstate_t *vstate;
14402 dtrace_difo_t *pred;
14403 int i, trace = dtrace_helptrace_enabled;
14404
14405 ASSERT(which >= 0 && which < DTRACE_NHELPER_ACTIONS);
14406
14407 if (helpers == NULL)
14408 return (0);
14409
14410 if ((helper = helpers->dthps_actions[which]) == NULL)
14411 return (0);
14412
14413 vstate = &helpers->dthps_vstate;
14414 mstate->dtms_arg[0] = arg0;
14415 mstate->dtms_arg[1] = arg1;
14416
14417 /*
14418 * Now iterate over each helper. If its predicate evaluates to 'true',
14419 * we'll call the corresponding actions. Note that the below calls
14420 * to dtrace_dif_emulate() may set faults in machine state. This is
	 * okay: our caller (the outer dtrace_dif_emulate()) will simply plow
	 * over the stored DIF offset with its own (which is the desired
	 * behavior).
14423 * Also, note the calls to dtrace_dif_emulate() may allocate scratch
14424 * from machine state; this is okay, too.
14425 */
14426 for (; helper != NULL; helper = helper->dtha_next) {
14427 if ((pred = helper->dtha_predicate) != NULL) {
14428 if (trace)
14429 dtrace_helper_trace(helper, mstate, vstate, 0);
14430
14431 if (!dtrace_dif_emulate(pred, mstate, vstate, state))
14432 goto next;
14433
14434 if (*flags & CPU_DTRACE_FAULT)
14435 goto err;
14436 }
14437
14438 for (i = 0; i < helper->dtha_nactions; i++) {
14439 if (trace)
14440 dtrace_helper_trace(helper,
14441 mstate, vstate, i + 1);
14442
14443 rval = dtrace_dif_emulate(helper->dtha_actions[i],
14444 mstate, vstate, state);
14445
14446 if (*flags & CPU_DTRACE_FAULT)
14447 goto err;
14448 }
14449
14450next:
14451 if (trace)
14452 dtrace_helper_trace(helper, mstate, vstate,
14453 DTRACE_HELPTRACE_NEXT);
14454 }
14455
14456 if (trace)
14457 dtrace_helper_trace(helper, mstate, vstate,
14458 DTRACE_HELPTRACE_DONE);
14459
14460 /*
14461 * Restore the arg0 that we saved upon entry.
14462 */
14463 mstate->dtms_arg[0] = sarg0;
14464 mstate->dtms_arg[1] = sarg1;
14465
14466 return (rval);
14467
14468err:
14469 if (trace)
14470 dtrace_helper_trace(helper, mstate, vstate,
14471 DTRACE_HELPTRACE_ERR);
14472
14473 /*
14474 * Restore the arg0 that we saved upon entry.
14475 */
14476 mstate->dtms_arg[0] = sarg0;
14477 mstate->dtms_arg[1] = sarg1;
14478
14479 return (0);
14480}
14481
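/*
 * Release the helper's predicate and action DIFOs, then free the action
 * array and the helper action itself.
 */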
14482static void
14483dtrace_helper_action_destroy(dtrace_helper_action_t *helper,
14484 dtrace_vstate_t *vstate)
14485{
14486 int i;
14487
14488 if (helper->dtha_predicate != NULL)
14489 dtrace_difo_release(helper->dtha_predicate, vstate);
14490
14491 for (i = 0; i < helper->dtha_nactions; i++) {
14492 ASSERT(helper->dtha_actions[i] != NULL);
14493 dtrace_difo_release(helper->dtha_actions[i], vstate);
14494 }
14495
14496 kmem_free(helper->dtha_actions,
14497 helper->dtha_nactions * sizeof (dtrace_difo_t *));
14498 kmem_free(helper, sizeof (dtrace_helper_action_t));
14499}
14500
14501static int
14502dtrace_helper_destroygen(proc_t* p, int gen)
14503{
14504 dtrace_helpers_t *help = p->p_dtrace_helpers;
14505 dtrace_vstate_t *vstate;
14506 uint_t i;
14507
14508 LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
14509 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14510
14511 if (help == NULL || gen > help->dthps_generation)
14512 return (EINVAL);
14513
14514 vstate = &help->dthps_vstate;
14515
14516 for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
14517 dtrace_helper_action_t *last = NULL, *h, *next;
14518
14519 for (h = help->dthps_actions[i]; h != NULL; h = next) {
14520 next = h->dtha_next;
14521
14522 if (h->dtha_generation == gen) {
14523 if (last != NULL) {
14524 last->dtha_next = next;
14525 } else {
14526 help->dthps_actions[i] = next;
14527 }
14528
14529 dtrace_helper_action_destroy(h, vstate);
14530 } else {
14531 last = h;
14532 }
14533 }
14534 }
14535
14536 /*
	 * Iterate until we've cleared out all helper providers with the
14538 * given generation number.
14539 */
14540 for (;;) {
14541 dtrace_helper_provider_t *prov = NULL;
14542
14543 /*
14544 * Look for a helper provider with the right generation. We
14545 * have to start back at the beginning of the list each time
14546 * because we drop dtrace_lock. It's unlikely that we'll make
14547 * more than two passes.
14548 */
14549 for (i = 0; i < help->dthps_nprovs; i++) {
14550 prov = help->dthps_provs[i];
14551
14552 if (prov->dthp_generation == gen)
14553 break;
14554 }
14555
14556 /*
14557 * If there were no matches, we're done.
14558 */
14559 if (i == help->dthps_nprovs)
14560 break;
14561
14562 /*
14563 * Move the last helper provider into this slot.
14564 */
14565 help->dthps_nprovs--;
14566 help->dthps_provs[i] = help->dthps_provs[help->dthps_nprovs];
14567 help->dthps_provs[help->dthps_nprovs] = NULL;
14568
14569 lck_mtx_unlock(&dtrace_lock);
14570
14571 /*
14572 * If we have a meta provider, remove this helper provider.
14573 */
14574 if (dtrace_meta_pid != NULL) {
14575 ASSERT(dtrace_deferred_pid == NULL);
14576 dtrace_helper_provider_remove(&prov->dthp_prov,
14577 p);
14578 }
14579
14580 dtrace_helper_provider_destroy(prov);
14581
14582 lck_mtx_lock(&dtrace_lock);
14583 }
14584
14585 return (0);
14586}
14587
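/*
 * Validate the helper's predicate and each of its action DIFOs; returns
 * non-zero iff everything validates.
 */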
14588static int
14589dtrace_helper_validate(dtrace_helper_action_t *helper)
14590{
14591 int err = 0, i;
14592 dtrace_difo_t *dp;
14593
14594 if ((dp = helper->dtha_predicate) != NULL)
14595 err += dtrace_difo_validate_helper(dp);
14596
14597 for (i = 0; i < helper->dtha_nactions; i++)
14598 err += dtrace_difo_validate_helper(helper->dtha_actions[i]);
14599
14600 return (err == 0);
14601}
14602
14603static int
14604dtrace_helper_action_add(proc_t* p, int which, dtrace_ecbdesc_t *ep)
14605{
14606 dtrace_helpers_t *help;
14607 dtrace_helper_action_t *helper, *last;
14608 dtrace_actdesc_t *act;
14609 dtrace_vstate_t *vstate;
14610 dtrace_predicate_t *pred;
14611 int count = 0, nactions = 0, i;
14612
14613 if (which < 0 || which >= DTRACE_NHELPER_ACTIONS)
14614 return (EINVAL);
14615
14616 help = p->p_dtrace_helpers;
14617 last = help->dthps_actions[which];
14618 vstate = &help->dthps_vstate;
14619
14620 for (count = 0; last != NULL; last = last->dtha_next) {
14621 count++;
14622 if (last->dtha_next == NULL)
14623 break;
14624 }
14625
14626 /*
14627 * If we already have dtrace_helper_actions_max helper actions for this
14628 * helper action type, we'll refuse to add a new one.
14629 */
14630 if (count >= dtrace_helper_actions_max)
14631 return (ENOSPC);
14632
14633 helper = kmem_zalloc(sizeof (dtrace_helper_action_t), KM_SLEEP);
14634 helper->dtha_generation = help->dthps_generation;
14635
14636 if ((pred = ep->dted_pred.dtpdd_predicate) != NULL) {
14637 ASSERT(pred->dtp_difo != NULL);
14638 dtrace_difo_hold(pred->dtp_difo);
14639 helper->dtha_predicate = pred->dtp_difo;
14640 }
14641
14642 for (act = ep->dted_action; act != NULL; act = act->dtad_next) {
14643 if (act->dtad_kind != DTRACEACT_DIFEXPR)
14644 goto err;
14645
14646 if (act->dtad_difo == NULL)
14647 goto err;
14648
14649 nactions++;
14650 }
14651
14652 helper->dtha_actions = kmem_zalloc(sizeof (dtrace_difo_t *) *
14653 (helper->dtha_nactions = nactions), KM_SLEEP);
14654
14655 for (act = ep->dted_action, i = 0; act != NULL; act = act->dtad_next) {
14656 dtrace_difo_hold(act->dtad_difo);
14657 helper->dtha_actions[i++] = act->dtad_difo;
14658 }
14659
14660 if (!dtrace_helper_validate(helper))
14661 goto err;
14662
14663 if (last == NULL) {
14664 help->dthps_actions[which] = helper;
14665 } else {
14666 last->dtha_next = helper;
14667 }
14668
14669 if ((uint32_t)vstate->dtvs_nlocals > dtrace_helptrace_nlocals) {
14670 dtrace_helptrace_nlocals = vstate->dtvs_nlocals;
14671 dtrace_helptrace_next = 0;
14672 }
14673
14674 return (0);
14675err:
14676 dtrace_helper_action_destroy(helper, vstate);
14677 return (EINVAL);
14678}
14679
14680static void
14681dtrace_helper_provider_register(proc_t *p, dtrace_helpers_t *help,
14682 dof_helper_t *dofhp)
14683{
14684 LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
14685 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
14686
14687 lck_mtx_lock(&dtrace_lock);
14688
14689 if (!dtrace_attached() || dtrace_meta_pid == NULL) {
14690 /*
14691 * If the dtrace module is loaded but not attached, or if
		 * there isn't a meta provider registered to deal with
14693 * these provider descriptions, we need to postpone creating
14694 * the actual providers until later.
14695 */
14696
14697 if (help->dthps_next == NULL && help->dthps_prev == NULL &&
14698 dtrace_deferred_pid != help) {
14699 help->dthps_deferred = 1;
14700 help->dthps_pid = p->p_pid;
14701 help->dthps_next = dtrace_deferred_pid;
14702 help->dthps_prev = NULL;
14703 if (dtrace_deferred_pid != NULL)
14704 dtrace_deferred_pid->dthps_prev = help;
14705 dtrace_deferred_pid = help;
14706 }
14707
14708 lck_mtx_unlock(&dtrace_lock);
14709
14710 } else if (dofhp != NULL) {
14711 /*
14712 * If the dtrace module is loaded and we have a particular
14713 * helper provider description, pass that off to the
14714 * meta provider.
14715 */
14716
14717 lck_mtx_unlock(&dtrace_lock);
14718
14719 dtrace_helper_provide(dofhp, p);
14720
14721 } else {
14722 /*
14723 * Otherwise, just pass all the helper provider descriptions
14724 * off to the meta provider.
14725 */
14726
14727 uint_t i;
14728 lck_mtx_unlock(&dtrace_lock);
14729
14730 for (i = 0; i < help->dthps_nprovs; i++) {
14731 dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
14732 p);
14733 }
14734 }
14735}
14736
14737static int
14738dtrace_helper_provider_add(proc_t* p, dof_helper_t *dofhp, int gen)
14739{
14740 dtrace_helpers_t *help;
14741 dtrace_helper_provider_t *hprov, **tmp_provs;
14742 uint_t tmp_maxprovs, i;
14743
14744 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14745 help = p->p_dtrace_helpers;
14746 ASSERT(help != NULL);
14747
14748 /*
14749 * If we already have dtrace_helper_providers_max helper providers,
	 * we refuse to add a new one.
14751 */
14752 if (help->dthps_nprovs >= dtrace_helper_providers_max)
14753 return (ENOSPC);
14754
14755 /*
14756 * Check to make sure this isn't a duplicate.
14757 */
14758 for (i = 0; i < help->dthps_nprovs; i++) {
14759 if (dofhp->dofhp_addr ==
14760 help->dthps_provs[i]->dthp_prov.dofhp_addr)
14761 return (EALREADY);
14762 }
14763
14764 hprov = kmem_zalloc(sizeof (dtrace_helper_provider_t), KM_SLEEP);
14765 hprov->dthp_prov = *dofhp;
14766 hprov->dthp_ref = 1;
14767 hprov->dthp_generation = gen;
14768
14769 /*
14770 * Allocate a bigger table for helper providers if it's already full.
14771 */
14772 if (help->dthps_maxprovs == help->dthps_nprovs) {
14773 tmp_maxprovs = help->dthps_maxprovs;
14774 tmp_provs = help->dthps_provs;
14775
14776 if (help->dthps_maxprovs == 0)
14777 help->dthps_maxprovs = 2;
14778 else
14779 help->dthps_maxprovs *= 2;
14780 if (help->dthps_maxprovs > dtrace_helper_providers_max)
14781 help->dthps_maxprovs = dtrace_helper_providers_max;
14782
14783 ASSERT(tmp_maxprovs < help->dthps_maxprovs);
14784
14785 help->dthps_provs = kmem_zalloc(help->dthps_maxprovs *
14786 sizeof (dtrace_helper_provider_t *), KM_SLEEP);
14787
14788 if (tmp_provs != NULL) {
14789 bcopy(tmp_provs, help->dthps_provs, tmp_maxprovs *
14790 sizeof (dtrace_helper_provider_t *));
14791 kmem_free(tmp_provs, tmp_maxprovs *
14792 sizeof (dtrace_helper_provider_t *));
14793 }
14794 }
14795
14796 help->dthps_provs[help->dthps_nprovs] = hprov;
14797 help->dthps_nprovs++;
14798
14799 return (0);
14800}
14801
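/*
 * Drop a reference on the helper provider; on the last reference, destroy
 * the provider's DOF and free the provider itself.
 */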
14802static void
14803dtrace_helper_provider_destroy(dtrace_helper_provider_t *hprov)
14804{
14805 lck_mtx_lock(&dtrace_lock);
14806
14807 if (--hprov->dthp_ref == 0) {
14808 dof_hdr_t *dof;
14809 lck_mtx_unlock(&dtrace_lock);
14810 dof = (dof_hdr_t *)(uintptr_t)hprov->dthp_prov.dofhp_dof;
14811 dtrace_dof_destroy(dof);
14812 kmem_free(hprov, sizeof (dtrace_helper_provider_t));
14813 } else {
14814 lck_mtx_unlock(&dtrace_lock);
14815 }
14816}
14817
14818static int
14819dtrace_helper_provider_validate(dof_hdr_t *dof, dof_sec_t *sec)
14820{
14821 uintptr_t daddr = (uintptr_t)dof;
14822 dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
14823 dof_provider_t *provider;
14824 dof_probe_t *probe;
14825 uint8_t *arg;
14826 char *strtab, *typestr;
14827 dof_stridx_t typeidx;
14828 size_t typesz;
14829 uint_t nprobes, j, k;
14830
14831 ASSERT(sec->dofs_type == DOF_SECT_PROVIDER);
14832
14833 if (sec->dofs_offset & (sizeof (uint_t) - 1)) {
14834 dtrace_dof_error(dof, "misaligned section offset");
14835 return (-1);
14836 }
14837
14838 /*
14839 * The section needs to be large enough to contain the DOF provider
14840 * structure appropriate for the given version.
14841 */
14842 if (sec->dofs_size <
14843 ((dof->dofh_ident[DOF_ID_VERSION] == DOF_VERSION_1) ?
14844 offsetof(dof_provider_t, dofpv_prenoffs) :
14845 sizeof (dof_provider_t))) {
14846 dtrace_dof_error(dof, "provider section too small");
14847 return (-1);
14848 }
14849
14850 provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
14851 str_sec = dtrace_dof_sect(dof, DOF_SECT_STRTAB, provider->dofpv_strtab);
14852 prb_sec = dtrace_dof_sect(dof, DOF_SECT_PROBES, provider->dofpv_probes);
14853 arg_sec = dtrace_dof_sect(dof, DOF_SECT_PRARGS, provider->dofpv_prargs);
14854 off_sec = dtrace_dof_sect(dof, DOF_SECT_PROFFS, provider->dofpv_proffs);
14855
14856 if (str_sec == NULL || prb_sec == NULL ||
14857 arg_sec == NULL || off_sec == NULL)
14858 return (-1);
14859
14860 enoff_sec = NULL;
14861
14862 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
14863 provider->dofpv_prenoffs != DOF_SECT_NONE &&
14864 (enoff_sec = dtrace_dof_sect(dof, DOF_SECT_PRENOFFS,
14865 provider->dofpv_prenoffs)) == NULL)
14866 return (-1);
14867
14868 strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
14869
14870 if (provider->dofpv_name >= str_sec->dofs_size ||
14871 strlen(strtab + provider->dofpv_name) >= DTRACE_PROVNAMELEN) {
14872 dtrace_dof_error(dof, "invalid provider name");
14873 return (-1);
14874 }
14875
14876 if (prb_sec->dofs_entsize == 0 ||
14877 prb_sec->dofs_entsize > prb_sec->dofs_size) {
14878 dtrace_dof_error(dof, "invalid entry size");
14879 return (-1);
14880 }
14881
14882 if (prb_sec->dofs_entsize & (sizeof (uintptr_t) - 1)) {
14883 dtrace_dof_error(dof, "misaligned entry size");
14884 return (-1);
14885 }
14886
14887 if (off_sec->dofs_entsize != sizeof (uint32_t)) {
14888 dtrace_dof_error(dof, "invalid entry size");
14889 return (-1);
14890 }
14891
14892 if (off_sec->dofs_offset & (sizeof (uint32_t) - 1)) {
14893 dtrace_dof_error(dof, "misaligned section offset");
14894 return (-1);
14895 }
14896
14897 if (arg_sec->dofs_entsize != sizeof (uint8_t)) {
14898 dtrace_dof_error(dof, "invalid entry size");
14899 return (-1);
14900 }
14901
14902 arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
14903
14904 nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
14905
14906 /*
14907 * Take a pass through the probes to check for errors.
14908 */
14909 for (j = 0; j < nprobes; j++) {
14910 probe = (dof_probe_t *)(uintptr_t)(daddr +
14911 prb_sec->dofs_offset + j * prb_sec->dofs_entsize);
14912
14913 if (probe->dofpr_func >= str_sec->dofs_size) {
14914 dtrace_dof_error(dof, "invalid function name");
14915 return (-1);
14916 }
14917
14918 if (strlen(strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN) {
14919 dtrace_dof_error(dof, "function name too long");
14920 return (-1);
14921 }
14922
14923 if (probe->dofpr_name >= str_sec->dofs_size ||
14924 strlen(strtab + probe->dofpr_name) >= DTRACE_NAMELEN) {
14925 dtrace_dof_error(dof, "invalid probe name");
14926 return (-1);
14927 }
14928
14929 /*
14930 * The offset count must not wrap the index, and the offsets
14931 * must also not overflow the section's data.
14932 */
14933 if (probe->dofpr_offidx + probe->dofpr_noffs <
14934 probe->dofpr_offidx ||
14935 (probe->dofpr_offidx + probe->dofpr_noffs) *
14936 off_sec->dofs_entsize > off_sec->dofs_size) {
14937 dtrace_dof_error(dof, "invalid probe offset");
14938 return (-1);
14939 }
14940
14941 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1) {
14942 /*
14943 * If there's no is-enabled offset section, make sure
14944 * there aren't any is-enabled offsets. Otherwise
14945 * perform the same checks as for probe offsets
14946 * (immediately above).
14947 */
14948 if (enoff_sec == NULL) {
14949 if (probe->dofpr_enoffidx != 0 ||
14950 probe->dofpr_nenoffs != 0) {
14951 dtrace_dof_error(dof, "is-enabled "
14952 "offsets with null section");
14953 return (-1);
14954 }
14955 } else if (probe->dofpr_enoffidx +
14956 probe->dofpr_nenoffs < probe->dofpr_enoffidx ||
14957 (probe->dofpr_enoffidx + probe->dofpr_nenoffs) *
14958 enoff_sec->dofs_entsize > enoff_sec->dofs_size) {
14959 dtrace_dof_error(dof, "invalid is-enabled "
14960 "offset");
14961 return (-1);
14962 }
14963
14964 if (probe->dofpr_noffs + probe->dofpr_nenoffs == 0) {
14965 dtrace_dof_error(dof, "zero probe and "
14966 "is-enabled offsets");
14967 return (-1);
14968 }
14969 } else if (probe->dofpr_noffs == 0) {
14970 dtrace_dof_error(dof, "zero probe offsets");
14971 return (-1);
14972 }
14973
14974 if (probe->dofpr_argidx + probe->dofpr_xargc <
14975 probe->dofpr_argidx ||
14976 (probe->dofpr_argidx + probe->dofpr_xargc) *
14977 arg_sec->dofs_entsize > arg_sec->dofs_size) {
14978 dtrace_dof_error(dof, "invalid args");
14979 return (-1);
14980 }
14981
14982 typeidx = probe->dofpr_nargv;
14983 typestr = strtab + probe->dofpr_nargv;
14984 for (k = 0; k < probe->dofpr_nargc; k++) {
14985 if (typeidx >= str_sec->dofs_size) {
14986 dtrace_dof_error(dof, "bad "
14987 "native argument type");
14988 return (-1);
14989 }
14990
14991 typesz = strlen(typestr) + 1;
14992 if (typesz > DTRACE_ARGTYPELEN) {
14993 dtrace_dof_error(dof, "native "
14994 "argument type too long");
14995 return (-1);
14996 }
14997 typeidx += typesz;
14998 typestr += typesz;
14999 }
15000
15001 typeidx = probe->dofpr_xargv;
15002 typestr = strtab + probe->dofpr_xargv;
15003 for (k = 0; k < probe->dofpr_xargc; k++) {
15004 if (arg[probe->dofpr_argidx + k] > probe->dofpr_nargc) {
15005 dtrace_dof_error(dof, "bad "
15006 "native argument index");
15007 return (-1);
15008 }
15009
15010 if (typeidx >= str_sec->dofs_size) {
15011 dtrace_dof_error(dof, "bad "
15012 "translated argument type");
15013 return (-1);
15014 }
15015
15016 typesz = strlen(typestr) + 1;
15017 if (typesz > DTRACE_ARGTYPELEN) {
15018 dtrace_dof_error(dof, "translated argument "
15019 "type too long");
15020 return (-1);
15021 }
15022
15023 typeidx += typesz;
15024 typestr += typesz;
15025 }
15026 }
15027
15028 return (0);
15029}
15030
15031static int
15032dtrace_helper_slurp(proc_t* p, dof_hdr_t *dof, dof_helper_t *dhp)
15033{
15034 dtrace_helpers_t *help;
15035 dtrace_vstate_t *vstate;
15036 dtrace_enabling_t *enab = NULL;
15037 int i, gen, rv, nhelpers = 0, nprovs = 0, destroy = 1;
15038 uintptr_t daddr = (uintptr_t)dof;
15039
15040 LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
15041 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
15042
15043 if ((help = p->p_dtrace_helpers) == NULL)
15044 help = dtrace_helpers_create(p);
15045
15046 vstate = &help->dthps_vstate;
15047
15048 if ((rv = dtrace_dof_slurp(dof, vstate, NULL, &enab,
15049 dhp != NULL ? dhp->dofhp_addr : 0, B_FALSE)) != 0) {
15050 dtrace_dof_destroy(dof);
15051 return (rv);
15052 }
15053
15054 /*
15055 * Look for helper providers and validate their descriptions.
15056 */
15057 if (dhp != NULL) {
15058 for (i = 0; (uint32_t)i < dof->dofh_secnum; i++) {
15059 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
15060 dof->dofh_secoff + i * dof->dofh_secsize);
15061
15062 if (sec->dofs_type != DOF_SECT_PROVIDER)
15063 continue;
15064
15065 if (dtrace_helper_provider_validate(dof, sec) != 0) {
15066 dtrace_enabling_destroy(enab);
15067 dtrace_dof_destroy(dof);
15068 return (-1);
15069 }
15070
15071 nprovs++;
15072 }
15073 }
15074
15075 /*
15076 * Now we need to walk through the ECB descriptions in the enabling.
15077 */
15078 for (i = 0; i < enab->dten_ndesc; i++) {
15079 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
15080 dtrace_probedesc_t *desc = &ep->dted_probe;
15081
		/* APPLE NOTE: Darwin employs size-bounded string operations. */
15083 if (!LIT_STRNEQL(desc->dtpd_provider, "dtrace"))
15084 continue;
15085
15086 if (!LIT_STRNEQL(desc->dtpd_mod, "helper"))
15087 continue;
15088
15089 if (!LIT_STRNEQL(desc->dtpd_func, "ustack"))
15090 continue;
15091
15092 if ((rv = dtrace_helper_action_add(p, DTRACE_HELPER_ACTION_USTACK,
15093 ep)) != 0) {
15094 /*
15095 * Adding this helper action failed -- we are now going
15096 * to rip out the entire generation and return failure.
15097 */
15098 (void) dtrace_helper_destroygen(p, help->dthps_generation);
15099 dtrace_enabling_destroy(enab);
15100 dtrace_dof_destroy(dof);
15101 return (-1);
15102 }
15103
15104 nhelpers++;
15105 }
15106
15107 if (nhelpers < enab->dten_ndesc)
15108 dtrace_dof_error(dof, "unmatched helpers");
15109
15110 gen = help->dthps_generation++;
15111 dtrace_enabling_destroy(enab);
15112
15113 if (dhp != NULL && nprovs > 0) {
15114 dhp->dofhp_dof = (uint64_t)(uintptr_t)dof;
15115 if (dtrace_helper_provider_add(p, dhp, gen) == 0) {
15116 lck_mtx_unlock(&dtrace_lock);
15117 dtrace_helper_provider_register(p, help, dhp);
15118 lck_mtx_lock(&dtrace_lock);
15119
15120 destroy = 0;
15121 }
15122 }
15123
15124 if (destroy)
15125 dtrace_dof_destroy(dof);
15126
15127 return (gen);
15128}
15129
15130/*
15131 * APPLE NOTE: DTrace lazy dof implementation
15132 *
15133 * DTrace user static probes (USDT probes) and helper actions are loaded
 * in a process by processing dof sections. The dof sections are passed
15135 * into the kernel by dyld, in a dof_ioctl_data_t block. It is rather
15136 * expensive to process dof for a process that will never use it. There
15137 * is a memory cost (allocating the providers/probes), and a cpu cost
15138 * (creating the providers/probes).
15139 *
 * To reduce this cost, we use "lazy dof". The normal procedure for
15141 * dof processing is to copyin the dof(s) pointed to by the dof_ioctl_data_t
15142 * block, and invoke dof_slurp_helper() on them. When "lazy dof" is
15143 * used, each process retains the dof_ioctl_data_t block, instead of
15144 * copying in the data it points to.
15145 *
15146 * The dof_ioctl_data_t blocks are managed as if they were the actual
15147 * processed dof; on fork the block is copied to the child, on exec and
15148 * exit the block is freed.
15149 *
 * If the process loads libraries containing additional dof, the
15151 * new dof_ioctl_data_t is merged with the existing block.
15152 *
15153 * There are a few catches that make this slightly more difficult.
15154 * When dyld registers dof_ioctl_data_t blocks, it expects a unique
15155 * identifier value for each dof in the block. In non-lazy dof terms,
15156 * this is the generation that dof was loaded in. If we hand back
15157 * a UID for a lazy dof, that same UID must be able to unload the
15158 * dof once it has become non-lazy. To meet this requirement, the
 * code that loads lazy dof requires that the UIDs for the dofs in
 * the lazy dof block be sorted in ascending order. It is okay to skip
 * UIDs, e.g., 1 -> 5 -> 6 is legal.
15162 *
15163 * Once a process has become non-lazy, it will stay non-lazy. All
15164 * future dof operations for that process will be non-lazy, even
15165 * if the dof mode transitions back to lazy.
15166 *
 * Always do lazy dof checks before non-lazy (i.e., in fork, exit, exec).
15168 * That way if the lazy check fails due to transitioning to non-lazy, the
15169 * right thing is done with the newly faulted in dof.
15170 */
15171
15172/*
15173 * This method is a bit squicky. It must handle:
15174 *
15175 * dof should not be lazy.
15176 * dof should have been handled lazily, but there was an error
15177 * dof was handled lazily, and needs to be freed.
15178 * dof was handled lazily, and must not be freed.
15179 *
15180 *
15181 * Returns EACCESS if dof should be handled non-lazily.
15182 *
15183 * KERN_SUCCESS and all other return codes indicate lazy handling of dof.
15184 *
15185 * If the dofs data is claimed by this method, dofs_claimed will be set.
15186 * Callers should not free claimed dofs.
15187 */
15188static int
15189dtrace_lazy_dofs_add(proc_t *p, dof_ioctl_data_t* incoming_dofs, int *dofs_claimed)
15190{
15191 ASSERT(p);
15192 ASSERT(incoming_dofs && incoming_dofs->dofiod_count > 0);
15193
15194 int rval = 0;
15195 *dofs_claimed = 0;
15196
15197 lck_rw_lock_shared(&dtrace_dof_mode_lock);
15198
15199 ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
15200 ASSERT(dtrace_dof_mode != DTRACE_DOF_MODE_NEVER);
15201
15202 /*
15203 * Any existing helpers force non-lazy behavior.
15204 */
15205 if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON && (p->p_dtrace_helpers == NULL)) {
15206 dtrace_sprlock(p);
15207
15208 dof_ioctl_data_t* existing_dofs = p->p_dtrace_lazy_dofs;
15209 unsigned int existing_dofs_count = (existing_dofs) ? existing_dofs->dofiod_count : 0;
15210 unsigned int i, merged_dofs_count = incoming_dofs->dofiod_count + existing_dofs_count;
15211
15212 /*
15213 * Range check...
15214 */
15215 if (merged_dofs_count == 0 || merged_dofs_count > 1024) {
15216 dtrace_dof_error(NULL, "lazy_dofs_add merged_dofs_count out of range");
15217 rval = EINVAL;
15218 goto unlock;
15219 }
15220
15221 /*
15222 * Each dof being added must be assigned a unique generation.
15223 */
15224 uint64_t generation = (existing_dofs) ? existing_dofs->dofiod_helpers[existing_dofs_count - 1].dofhp_dof + 1 : 1;
15225 for (i=0; i<incoming_dofs->dofiod_count; i++) {
15226 /*
15227 * We rely on these being the same so we can overwrite dofhp_dof and not lose info.
15228 */
15229 ASSERT(incoming_dofs->dofiod_helpers[i].dofhp_dof == incoming_dofs->dofiod_helpers[i].dofhp_addr);
15230 incoming_dofs->dofiod_helpers[i].dofhp_dof = generation++;
15231 }
15232
15233
15234 if (existing_dofs) {
15235 /*
15236 * Merge the existing and incoming dofs
15237 */
15238 size_t merged_dofs_size = DOF_IOCTL_DATA_T_SIZE(merged_dofs_count);
15239 dof_ioctl_data_t* merged_dofs = kmem_alloc(merged_dofs_size, KM_SLEEP);
15240
15241 bcopy(&existing_dofs->dofiod_helpers[0],
15242 &merged_dofs->dofiod_helpers[0],
15243 sizeof(dof_helper_t) * existing_dofs_count);
15244 bcopy(&incoming_dofs->dofiod_helpers[0],
15245 &merged_dofs->dofiod_helpers[existing_dofs_count],
15246 sizeof(dof_helper_t) * incoming_dofs->dofiod_count);
15247
15248 merged_dofs->dofiod_count = merged_dofs_count;
15249
15250 kmem_free(existing_dofs, DOF_IOCTL_DATA_T_SIZE(existing_dofs_count));
15251
15252 p->p_dtrace_lazy_dofs = merged_dofs;
15253 } else {
15254 /*
15255 * Claim the incoming dofs
15256 */
15257 *dofs_claimed = 1;
15258 p->p_dtrace_lazy_dofs = incoming_dofs;
15259 }
15260
15261#if DEBUG
15262 dof_ioctl_data_t* all_dofs = p->p_dtrace_lazy_dofs;
15263 for (i=0; i<all_dofs->dofiod_count-1; i++) {
15264 ASSERT(all_dofs->dofiod_helpers[i].dofhp_dof < all_dofs->dofiod_helpers[i+1].dofhp_dof);
15265 }
15266#endif /* DEBUG */
15267
15268unlock:
15269 dtrace_sprunlock(p);
15270 } else {
15271 rval = EACCES;
15272 }
15273
15274 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
15275
15276 return rval;
15277}
15278
15279/*
15280 * Returns:
15281 *
15282 * EINVAL: lazy dof is enabled, but the requested generation was not found.
15283 * EACCES: This removal needs to be handled non-lazily.
15284 */
15285static int
15286dtrace_lazy_dofs_remove(proc_t *p, int generation)
15287{
15288 int rval = EINVAL;
15289
15290 lck_rw_lock_shared(&dtrace_dof_mode_lock);
15291
15292 ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
15293 ASSERT(dtrace_dof_mode != DTRACE_DOF_MODE_NEVER);
15294
15295 /*
15296 * Any existing helpers force non-lazy behavior.
15297 */
15298 if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON && (p->p_dtrace_helpers == NULL)) {
15299 dtrace_sprlock(p);
15300
15301 dof_ioctl_data_t* existing_dofs = p->p_dtrace_lazy_dofs;
15302
15303 if (existing_dofs) {
15304 int index, existing_dofs_count = existing_dofs->dofiod_count;
15305 for (index=0; index<existing_dofs_count; index++) {
15306 if ((int)existing_dofs->dofiod_helpers[index].dofhp_dof == generation) {
15307 dof_ioctl_data_t* removed_dofs = NULL;
15308
15309 /*
					 * If this is the only dof, we'll free it and swap in
					 * NULL; otherwise we copy the surviving entries into
					 * a smaller block.
15311 */
15312 if (existing_dofs_count > 1) {
15313 int removed_dofs_count = existing_dofs_count - 1;
15314 size_t removed_dofs_size = DOF_IOCTL_DATA_T_SIZE(removed_dofs_count);
15315
15316 removed_dofs = kmem_alloc(removed_dofs_size, KM_SLEEP);
15317 removed_dofs->dofiod_count = removed_dofs_count;
15318
15319 /*
15320 * copy the remaining data.
15321 */
15322 if (index > 0) {
15323 bcopy(&existing_dofs->dofiod_helpers[0],
15324 &removed_dofs->dofiod_helpers[0],
15325 index * sizeof(dof_helper_t));
15326 }
15327
15328 if (index < existing_dofs_count-1) {
15329 bcopy(&existing_dofs->dofiod_helpers[index+1],
15330 &removed_dofs->dofiod_helpers[index],
15331 (existing_dofs_count - index - 1) * sizeof(dof_helper_t));
15332 }
15333 }
15334
15335 kmem_free(existing_dofs, DOF_IOCTL_DATA_T_SIZE(existing_dofs_count));
15336
15337 p->p_dtrace_lazy_dofs = removed_dofs;
15338
15339 rval = KERN_SUCCESS;
15340
15341 break;
15342 }
15343 }
15344
15345#if DEBUG
15346 dof_ioctl_data_t* all_dofs = p->p_dtrace_lazy_dofs;
15347 if (all_dofs) {
15348 unsigned int i;
15349 for (i=0; i<all_dofs->dofiod_count-1; i++) {
15350 ASSERT(all_dofs->dofiod_helpers[i].dofhp_dof < all_dofs->dofiod_helpers[i+1].dofhp_dof);
15351 }
15352 }
15353#endif
15354
15355 }
15356 dtrace_sprunlock(p);
15357 } else {
15358 rval = EACCES;
15359 }
15360
15361 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
15362
15363 return rval;
15364}
15365
15366void
15367dtrace_lazy_dofs_destroy(proc_t *p)
15368{
15369 lck_rw_lock_shared(&dtrace_dof_mode_lock);
15370 dtrace_sprlock(p);
15371
15372 ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
15373
15374 dof_ioctl_data_t* lazy_dofs = p->p_dtrace_lazy_dofs;
15375 p->p_dtrace_lazy_dofs = NULL;
15376
15377 dtrace_sprunlock(p);
15378 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
15379
15380 if (lazy_dofs) {
15381 kmem_free(lazy_dofs, DOF_IOCTL_DATA_T_SIZE(lazy_dofs->dofiod_count));
15382 }
15383}
15384
15385static int
15386dtrace_lazy_dofs_proc_iterate_filter(proc_t *p, void* ignored)
15387{
15388#pragma unused(ignored)
15389 /*
15390 * Okay to NULL test without taking the sprlock.
15391 */
15392 return p->p_dtrace_lazy_dofs != NULL;
15393}
15394
15395static void
dtrace_lazy_dofs_process(proc_t *p)
{
15397 /*
15398 * It is possible this process may exit during our attempt to
15399 * fault in the dof. We could fix this by holding locks longer,
15400 * but the errors are benign.
15401 */
	dtrace_sprlock(p);

15405 ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
15406 ASSERT(dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_OFF);
15407
15408 dof_ioctl_data_t* lazy_dofs = p->p_dtrace_lazy_dofs;
15409 p->p_dtrace_lazy_dofs = NULL;
15410
15411 dtrace_sprunlock(p);
15412 lck_mtx_lock(&dtrace_meta_lock);
15413 /*
15414 * Process each dof_helper_t
15415 */
15416 if (lazy_dofs != NULL) {
15417 unsigned int i;
15418 int rval;
15419
15420 for (i=0; i<lazy_dofs->dofiod_count; i++) {
15421 /*
15422 * When loading lazy dof, we depend on the generations being sorted in ascending order.
15423 */
15424 ASSERT(i >= (lazy_dofs->dofiod_count - 1) || lazy_dofs->dofiod_helpers[i].dofhp_dof < lazy_dofs->dofiod_helpers[i+1].dofhp_dof);
15425
15426 dof_helper_t *dhp = &lazy_dofs->dofiod_helpers[i];
15427
15428 /*
15429 * We stored the generation in dofhp_dof. Save it, and restore the original value.
15430 */
15431 int generation = dhp->dofhp_dof;
15432 dhp->dofhp_dof = dhp->dofhp_addr;
15433
15434 dof_hdr_t *dof = dtrace_dof_copyin_from_proc(p, dhp->dofhp_dof, &rval);
15435
15436 if (dof != NULL) {
15437 dtrace_helpers_t *help;
15438
15439 lck_mtx_lock(&dtrace_lock);
15440
15441 /*
15442 * This must be done with the dtrace_lock held
15443 */
15444 if ((help = p->p_dtrace_helpers) == NULL)
15445 help = dtrace_helpers_create(p);
15446
15447				/*
15448				 * If the generation value has been bumped past ours,
15449				 * someone snuck in while we had dropped the dtrace lock.
15450				 * We must dump this generation; there is no safe way to
15451				 * load it.
15452				 */
15452 if (help->dthps_generation <= generation) {
15453 help->dthps_generation = generation;
15454
15455 /*
15456 * dtrace_helper_slurp() takes responsibility for the dof --
15457 * it may free it now or it may save it and free it later.
15458 */
15459 if ((rval = dtrace_helper_slurp(p, dof, dhp)) != generation) {
15460 dtrace_dof_error(NULL, "returned value did not match expected generation");
15461 }
15462 }
15463
15464 lck_mtx_unlock(&dtrace_lock);
15465 }
15466 }
15467 lck_mtx_unlock(&dtrace_meta_lock);
15468 kmem_free(lazy_dofs, DOF_IOCTL_DATA_T_SIZE(lazy_dofs->dofiod_count));
15469 } else {
15470 lck_mtx_unlock(&dtrace_meta_lock);
15471 }
15472}
15473
15474static int
15475dtrace_lazy_dofs_proc_iterate_doit(proc_t *p, void* ignored)
15476{
15477#pragma unused(ignored)
15478
15479 dtrace_lazy_dofs_process(p);
15480
15481 return PROC_RETURNED;
15482}
15483
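/*
 * Distinguished return value of dtrace_lazy_dofs_duplicate(), indicating
 * that the child received a copy of its parent's lazy dofs (and therefore
 * must not also receive duplicated helpers).
 */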
15484#define DTRACE_LAZY_DOFS_DUPLICATED 1
15485
15486static int
15487dtrace_lazy_dofs_duplicate(proc_t *parent, proc_t *child)
15488{
15489 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
15490 LCK_MTX_ASSERT(&parent->p_dtrace_sprlock, LCK_MTX_ASSERT_NOTOWNED);
15491 LCK_MTX_ASSERT(&child->p_dtrace_sprlock, LCK_MTX_ASSERT_NOTOWNED);
15492
15493 lck_rw_lock_shared(&dtrace_dof_mode_lock);
15494 dtrace_sprlock(parent);
15495
15496	/*
15497	 * We need to make sure that the transition from lazy dofs to
15498	 * helpers was atomic for our parent.
15499	 */
15500 ASSERT(parent->p_dtrace_lazy_dofs == NULL || parent->p_dtrace_helpers == NULL);
15501 /*
15502 * In theory we should hold the child sprlock, but this is safe...
15503 */
15504 ASSERT(child->p_dtrace_lazy_dofs == NULL && child->p_dtrace_helpers == NULL);
15505
15506 dof_ioctl_data_t* parent_dofs = parent->p_dtrace_lazy_dofs;
15507 dof_ioctl_data_t* child_dofs = NULL;
15508 if (parent_dofs) {
15509 size_t parent_dofs_size = DOF_IOCTL_DATA_T_SIZE(parent_dofs->dofiod_count);
15510 child_dofs = kmem_alloc(parent_dofs_size, KM_SLEEP);
15511 bcopy(parent_dofs, child_dofs, parent_dofs_size);
15512 }
15513
15514 dtrace_sprunlock(parent);
15515
15516 if (child_dofs) {
15517 dtrace_sprlock(child);
15518 child->p_dtrace_lazy_dofs = child_dofs;
15519 dtrace_sprunlock(child);
15520		/*
15521		 * We process the DOF at this point if the mode is set to
15522		 * LAZY_OFF. This can happen if DTrace is still processing the
15523		 * DOF of another process (which can happen because the
15524		 * protected pager can have a huge latency) but has not
15525		 * processed our parent yet.
15526		 */
15527 if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_OFF) {
15528 dtrace_lazy_dofs_process(child);
15529 }
15530 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
15531
15532 return DTRACE_LAZY_DOFS_DUPLICATED;
15533 }
15534 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
15535
15536 return 0;
15537}
15538
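/*
 * Allocate and install the per-process helper state. The caller must hold
 * dtrace_lock, and the process must not already have helpers.
 */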
15539static dtrace_helpers_t *
15540dtrace_helpers_create(proc_t *p)
15541{
15542 dtrace_helpers_t *help;
15543
15544 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
15545 ASSERT(p->p_dtrace_helpers == NULL);
15546
15547 help = kmem_zalloc(sizeof (dtrace_helpers_t), KM_SLEEP);
15548 help->dthps_actions = kmem_zalloc(sizeof (dtrace_helper_action_t *) *
15549 DTRACE_NHELPER_ACTIONS, KM_SLEEP);
15550
15551 p->p_dtrace_helpers = help;
15552 dtrace_helpers++;
15553
15554 return (help);
15555}
15556
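/*
 * Tear down all helper state for a process: helper actions, helper
 * providers (removing them from the meta-provider or from the deferred
 * list), and the helper vstate. Installed as the dtrace_helpers_cleanup
 * hook and invoked at exec and exit.
 */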
15557static void
15558dtrace_helpers_destroy(proc_t* p)
15559{
15560 dtrace_helpers_t *help;
15561 dtrace_vstate_t *vstate;
15562 uint_t i;
15563
15564 lck_mtx_lock(&dtrace_meta_lock);
15565 lck_mtx_lock(&dtrace_lock);
15566
15567 ASSERT(p->p_dtrace_helpers != NULL);
15568 ASSERT(dtrace_helpers > 0);
15569
15570 help = p->p_dtrace_helpers;
15571 vstate = &help->dthps_vstate;
15572
15573 /*
15574 * We're now going to lose the help from this process.
15575 */
15576 p->p_dtrace_helpers = NULL;
15577 dtrace_sync();
15578
15579 /*
15580	 * Destroy the helper actions.
15581 */
15582 for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
15583 dtrace_helper_action_t *h, *next;
15584
15585		for (h = help->dthps_actions[i]; h != NULL; h = next) {
15586			next = h->dtha_next;
15587			dtrace_helper_action_destroy(h, vstate);
15588		}
15590 }
15591
15592 lck_mtx_unlock(&dtrace_lock);
15593
15594 /*
15595 * Destroy the helper providers.
15596 */
15597 if (help->dthps_maxprovs > 0) {
15598 if (dtrace_meta_pid != NULL) {
15599 ASSERT(dtrace_deferred_pid == NULL);
15600
15601 for (i = 0; i < help->dthps_nprovs; i++) {
15602 dtrace_helper_provider_remove(
15603 &help->dthps_provs[i]->dthp_prov, p);
15604 }
15605 } else {
15606 lck_mtx_lock(&dtrace_lock);
15607 ASSERT(help->dthps_deferred == 0 ||
15608 help->dthps_next != NULL ||
15609 help->dthps_prev != NULL ||
15610 help == dtrace_deferred_pid);
15611
15612 /*
15613 * Remove the helper from the deferred list.
15614 */
15615 if (help->dthps_next != NULL)
15616 help->dthps_next->dthps_prev = help->dthps_prev;
15617 if (help->dthps_prev != NULL)
15618 help->dthps_prev->dthps_next = help->dthps_next;
15619 if (dtrace_deferred_pid == help) {
15620 dtrace_deferred_pid = help->dthps_next;
15621 ASSERT(help->dthps_prev == NULL);
15622 }
15623
15624 lck_mtx_unlock(&dtrace_lock);
15625 }
15626
15627
15628 for (i = 0; i < help->dthps_nprovs; i++) {
15629 dtrace_helper_provider_destroy(help->dthps_provs[i]);
15630 }
15631
15632 kmem_free(help->dthps_provs, help->dthps_maxprovs *
15633 sizeof (dtrace_helper_provider_t *));
15634 }
15635
15636 lck_mtx_lock(&dtrace_lock);
15637
15638 dtrace_vstate_fini(&help->dthps_vstate);
15639 kmem_free(help->dthps_actions,
15640 sizeof (dtrace_helper_action_t *) * DTRACE_NHELPER_ACTIONS);
15641 kmem_free(help, sizeof (dtrace_helpers_t));
15642
15643 --dtrace_helpers;
15644 lck_mtx_unlock(&dtrace_lock);
15645 lck_mtx_unlock(&dtrace_meta_lock);
15646}
15647
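/*
 * Deep-copy helper actions and share (by reference count) helper
 * providers from 'from' to 'to'. Installed as the dtrace_helpers_fork
 * hook and invoked from dtrace_proc_fork() when the parent has helpers.
 */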
15648static void
15649dtrace_helpers_duplicate(proc_t *from, proc_t *to)
15650{
15651 dtrace_helpers_t *help, *newhelp;
15652 dtrace_helper_action_t *helper, *new, *last;
15653 dtrace_difo_t *dp;
15654 dtrace_vstate_t *vstate;
15655 uint_t i;
15656 int j, sz, hasprovs = 0;
15657
15658 lck_mtx_lock(&dtrace_meta_lock);
15659 lck_mtx_lock(&dtrace_lock);
15660 ASSERT(from->p_dtrace_helpers != NULL);
15661 ASSERT(dtrace_helpers > 0);
15662
15663 help = from->p_dtrace_helpers;
15664 newhelp = dtrace_helpers_create(to);
15665 ASSERT(to->p_dtrace_helpers != NULL);
15666
15667 newhelp->dthps_generation = help->dthps_generation;
15668 vstate = &newhelp->dthps_vstate;
15669
15670 /*
15671 * Duplicate the helper actions.
15672 */
15673 for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
15674 if ((helper = help->dthps_actions[i]) == NULL)
15675 continue;
15676
15677 for (last = NULL; helper != NULL; helper = helper->dtha_next) {
15678 new = kmem_zalloc(sizeof (dtrace_helper_action_t),
15679 KM_SLEEP);
15680 new->dtha_generation = helper->dtha_generation;
15681
15682 if ((dp = helper->dtha_predicate) != NULL) {
15683 dp = dtrace_difo_duplicate(dp, vstate);
15684 new->dtha_predicate = dp;
15685 }
15686
15687 new->dtha_nactions = helper->dtha_nactions;
15688 sz = sizeof (dtrace_difo_t *) * new->dtha_nactions;
15689 new->dtha_actions = kmem_alloc(sz, KM_SLEEP);
15690
15691 for (j = 0; j < new->dtha_nactions; j++) {
15692 dtrace_difo_t *dpj = helper->dtha_actions[j];
15693
15694 ASSERT(dpj != NULL);
15695 dpj = dtrace_difo_duplicate(dpj, vstate);
15696 new->dtha_actions[j] = dpj;
15697 }
15698
15699 if (last != NULL) {
15700 last->dtha_next = new;
15701 } else {
15702 newhelp->dthps_actions[i] = new;
15703 }
15704
15705 last = new;
15706 }
15707 }
15708
15709 /*
15710 * Duplicate the helper providers and register them with the
15711 * DTrace framework.
15712 */
15713 if (help->dthps_nprovs > 0) {
15714 newhelp->dthps_nprovs = help->dthps_nprovs;
15715 newhelp->dthps_maxprovs = help->dthps_nprovs;
15716 newhelp->dthps_provs = kmem_alloc(newhelp->dthps_nprovs *
15717 sizeof (dtrace_helper_provider_t *), KM_SLEEP);
15718 for (i = 0; i < newhelp->dthps_nprovs; i++) {
15719 newhelp->dthps_provs[i] = help->dthps_provs[i];
15720 newhelp->dthps_provs[i]->dthp_ref++;
15721 }
15722
15723 hasprovs = 1;
15724 }
15725
15726 lck_mtx_unlock(&dtrace_lock);
15727
15728 if (hasprovs)
15729 dtrace_helper_provider_register(to, newhelp, NULL);
15730
15731 lck_mtx_unlock(&dtrace_meta_lock);
15732}
15733
15734/*
15735 * DTrace Process Functions
15736 */
15737
15738void
15739dtrace_proc_fork(proc_t *parent_proc, proc_t *child_proc, int spawn)
15740{
15741 /*
15742 * This code applies to new processes who are copying the task
15743 * and thread state and address spaces of their parent process.
15744 */
15745 if (!spawn) {
15746 /*
15747 * APPLE NOTE: Solaris does a sprlock() and drops the
15748 * proc_lock here. We're cheating a bit and only taking
15749 * the p_dtrace_sprlock lock. A full sprlock would
15750 * task_suspend the parent.
15751 */
15752 dtrace_sprlock(parent_proc);
15753
15754 /*
15755 * Remove all DTrace tracepoints from the child process. We
15756 * need to do this _before_ duplicating USDT providers since
15757 * any associated probes may be immediately enabled.
15758 */
15759 if (parent_proc->p_dtrace_count > 0) {
15760 dtrace_fasttrap_fork(parent_proc, child_proc);
15761 }
15762
15763 dtrace_sprunlock(parent_proc);
15764
15765 /*
15766 * Duplicate any lazy dof(s). This must be done while NOT
15767 * holding the parent sprlock! Lock ordering is
15768 * dtrace_dof_mode_lock, then sprlock. It is imperative we
15769 * always call dtrace_lazy_dofs_duplicate, rather than null
15770 * check and call if !NULL. If we NULL test, during lazy dof
15771 * faulting we can race with the faulting code and proceed
15772 * from here to beyond the helpers copy. The lazy dof
15773 * faulting will then fail to copy the helpers to the child
15774		 * process. We return early if we duplicated lazy dofs: a process
15775		 * may only have one of the two at a time, which avoids a race
15776		 * between a dtrace client and dtrace_proc_fork in which a process
15777		 * would end up with both lazy dofs and helpers.
15778 */
15779 if (dtrace_lazy_dofs_duplicate(parent_proc, child_proc) == DTRACE_LAZY_DOFS_DUPLICATED) {
15780 return;
15781 }
15782
15783 /*
15784 * Duplicate any helper actions and providers if they haven't
15785 * already.
15786 */
15787#if !defined(__APPLE__)
15788	/*
15789	 * The SFORKING flag we set above informs the code that enables
15790	 * USDT probes that sprlock() may fail because the child is
15791	 * being forked.
15792	 */
15793#endif
15794 /*
15795 * APPLE NOTE: As best I can tell, Apple's sprlock() equivalent
15796 * never fails to find the child. We do not set SFORKING.
15797 */
15798 if (parent_proc->p_dtrace_helpers != NULL && dtrace_helpers_fork) {
15799 (*dtrace_helpers_fork)(parent_proc, child_proc);
15800 }
15801 }
15802}
15803
15804void
15805dtrace_proc_exec(proc_t *p)
15806{
15807 /*
15808 * Invalidate any predicate evaluation already cached for this thread by DTrace.
15809 * That's because we've just stored to p_comm and DTrace refers to that when it
15810 * evaluates the "execname" special variable. uid and gid may have changed as well.
15811 */
15812 dtrace_set_thread_predcache(current_thread(), 0);
15813
15814 /*
15815 * Free any outstanding lazy dof entries. It is imperative we
15816 * always call dtrace_lazy_dofs_destroy, rather than null check
15817 * and call if !NULL. If we NULL test, during lazy dof faulting
15818 * we can race with the faulting code and proceed from here to
15819 * beyond the helpers cleanup. The lazy dof faulting will then
15820 * install new helpers which no longer belong to this process!
15821 */
15822 dtrace_lazy_dofs_destroy(p);
15823
15824
15825 /*
15826 * Clean up any DTrace helpers for the process.
15827 */
15828 if (p->p_dtrace_helpers != NULL && dtrace_helpers_cleanup) {
15829 (*dtrace_helpers_cleanup)(p);
15830 }
15831
15832 /*
15833 * Cleanup the DTrace provider associated with this process.
15834 */
15835 proc_lock(p);
15836 if (p->p_dtrace_probes && dtrace_fasttrap_exec_ptr) {
15837 (*dtrace_fasttrap_exec_ptr)(p);
15838 }
15839 proc_unlock(p);
15840}
15841
15842void
15843dtrace_proc_exit(proc_t *p)
15844{
15845 /*
15846 * Free any outstanding lazy dof entries. It is imperative we
15847 * always call dtrace_lazy_dofs_destroy, rather than null check
15848 * and call if !NULL. If we NULL test, during lazy dof faulting
15849 * we can race with the faulting code and proceed from here to
15850 * beyond the helpers cleanup. The lazy dof faulting will then
15851 * install new helpers which will never be cleaned up, and leak.
15852 */
15853 dtrace_lazy_dofs_destroy(p);
15854
15855 /*
15856 * Clean up any DTrace helper actions or probes for the process.
15857 */
15858 if (p->p_dtrace_helpers != NULL) {
15859 (*dtrace_helpers_cleanup)(p);
15860 }
15861
15862 /*
15863 * Clean up any DTrace probes associated with this process.
15864 */
15865 /*
15866 * APPLE NOTE: We release ptss pages/entries in dtrace_fasttrap_exit_ptr(),
15867 * call this after dtrace_helpers_cleanup()
15868 */
15869 proc_lock(p);
15870 if (p->p_dtrace_probes && dtrace_fasttrap_exit_ptr) {
15871 (*dtrace_fasttrap_exit_ptr)(p);
15872 }
15873 proc_unlock(p);
15874}
15875
15876/*
15877 * DTrace Hook Functions
15878 */
15879
15880/*
15881 * APPLE NOTE: dtrace_modctl_* routines for kext support.
15882 * Used to manipulate the modctl list within dtrace xnu.
15883 */
15884
15885modctl_t *dtrace_modctl_list;
15886
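/*
 * Shadow-list layout (a sketch): every loaded kext has one modctl entry
 * on dtrace_modctl_list, linked by mod_next. A kext that was unloaded
 * while its probes were still referenced is parked on the mod_stale
 * chain of the entry created when a kext of the same name reloads:
 *
 *	dtrace_modctl_list -> [A] -> [B] -> [C] -> NULL
 *	                              |
 *	                          mod_stale
 *	                              v
 *	                         [B (stale)]
 */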
15887static void
15888dtrace_modctl_add(struct modctl * newctl)
15889{
15890 struct modctl *nextp, *prevp;
15891
15892 ASSERT(newctl != NULL);
15893 LCK_MTX_ASSERT(&mod_lock, LCK_MTX_ASSERT_OWNED);
15894
15895	// Insert the new module at the front of the list.
15896
15897 newctl->mod_next = dtrace_modctl_list;
15898 dtrace_modctl_list = newctl;
15899
15900 /*
15901 * If a module exists with the same name, then that module
15902 * must have been unloaded with enabled probes. We will move
15903 * the unloaded module to the new module's stale chain and
15904 * then stop traversing the list.
15905 */
15906
15907 prevp = newctl;
15908 nextp = newctl->mod_next;
15909
15910 while (nextp != NULL) {
15911 if (nextp->mod_loaded) {
15912 /* This is a loaded module. Keep traversing. */
15913 prevp = nextp;
15914 nextp = nextp->mod_next;
15915 continue;
15916 }
15917 else {
15918 /* Found an unloaded module */
15919 if (strncmp (newctl->mod_modname, nextp->mod_modname, KMOD_MAX_NAME)) {
15920 /* Names don't match. Keep traversing. */
15921 prevp = nextp;
15922 nextp = nextp->mod_next;
15923 continue;
15924 }
15925 else {
15926 /* We found a stale entry, move it. We're done. */
15927 prevp->mod_next = nextp->mod_next;
15928 newctl->mod_stale = nextp;
15929 nextp->mod_next = NULL;
15930 break;
15931 }
15932 }
15933 }
15934}
15935
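/*
 * Look up the modctl entry for a kmod by id. The caller must hold
 * mod_lock; returns NULL if the kext is not in the shadow list.
 */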
15936static modctl_t *
15937dtrace_modctl_lookup(struct kmod_info * kmod)
15938{
15939 LCK_MTX_ASSERT(&mod_lock, LCK_MTX_ASSERT_OWNED);
15940
15941 struct modctl * ctl;
15942
15943 for (ctl = dtrace_modctl_list; ctl; ctl=ctl->mod_next) {
15944 if (ctl->mod_id == kmod->id)
15945 return(ctl);
15946 }
15947 return (NULL);
15948}
15949
15950/*
15951 * This routine is called from dtrace_module_unloaded().
15952 * It removes a modctl structure and its stale chain
15953 * from the kext shadow list.
15954 */
15955static void
15956dtrace_modctl_remove(struct modctl * ctl)
15957{
15958 ASSERT(ctl != NULL);
15959 LCK_MTX_ASSERT(&mod_lock, LCK_MTX_ASSERT_OWNED);
15960 modctl_t *prevp, *nextp, *curp;
15961
15962 // Remove stale chain first
15963 for (curp=ctl->mod_stale; curp != NULL; curp=nextp) {
15964 nextp = curp->mod_stale;
15965 /* There should NEVER be user symbols allocated at this point */
15966 ASSERT(curp->mod_user_symbols == NULL);
15967 kmem_free(curp, sizeof(modctl_t));
15968 }
15969
15970 prevp = NULL;
15971 curp = dtrace_modctl_list;
15972
15973 while (curp != ctl) {
15974 prevp = curp;
15975 curp = curp->mod_next;
15976 }
15977
15978 if (prevp != NULL) {
15979 prevp->mod_next = ctl->mod_next;
15980 }
15981 else {
15982 dtrace_modctl_list = ctl->mod_next;
15983 }
15984
15985 /* There should NEVER be user symbols allocated at this point */
15986 ASSERT(ctl->mod_user_symbols == NULL);
15987
15988 kmem_free (ctl, sizeof(modctl_t));
15989}
15990
15991/*
15992 * APPLE NOTE: The kext loader will call dtrace_module_loaded
15993 * when the kext is loaded in memory, but before calling the
15994 * kext's start routine.
15995 *
15996 * Return 0 on success
15997 * Return -1 on failure
15998 */
15999
16000static int
16001dtrace_module_loaded(struct kmod_info *kmod, uint32_t flag)
16002{
16003 dtrace_provider_t *prv;
16004
16005 /*
16006	 * If kernel symbols have been disabled, return immediately.
16007	 * DTRACE_KERNEL_SYMBOLS_NEVER is a permanent mode; it is safe to test without holding locks.
16008 */
16009 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER)
16010 return 0;
16011
16012 struct modctl *ctl = NULL;
16013 if (!kmod || kmod->address == 0 || kmod->size == 0)
16014 return(-1);
16015
16016 lck_mtx_lock(&dtrace_provider_lock);
16017 lck_mtx_lock(&mod_lock);
16018
16019 /*
16020 * Have we seen this kext before?
16021 */
16022
16023 ctl = dtrace_modctl_lookup(kmod);
16024
16025 if (ctl != NULL) {
16026 /* bail... we already have this kext in the modctl list */
16027 lck_mtx_unlock(&mod_lock);
16028 lck_mtx_unlock(&dtrace_provider_lock);
16029 if (dtrace_err_verbose)
16030 cmn_err(CE_WARN, "dtrace load module already exists '%s %u' is failing against '%s %u'", kmod->name, (uint_t)kmod->id, ctl->mod_modname, ctl->mod_id);
16031 return(-1);
16032 }
16033 else {
16034 ctl = kmem_alloc(sizeof(struct modctl), KM_SLEEP);
16035 if (ctl == NULL) {
16036 if (dtrace_err_verbose)
16037 cmn_err(CE_WARN, "dtrace module load '%s %u' is failing ", kmod->name, (uint_t)kmod->id);
16038 lck_mtx_unlock(&mod_lock);
16039 lck_mtx_unlock(&dtrace_provider_lock);
16040 return (-1);
16041 }
16042 ctl->mod_next = NULL;
16043 ctl->mod_stale = NULL;
16044 strlcpy (ctl->mod_modname, kmod->name, sizeof(ctl->mod_modname));
16045 ctl->mod_loadcnt = kmod->id;
16046 ctl->mod_nenabled = 0;
16047 ctl->mod_address = kmod->address;
16048 ctl->mod_size = kmod->size;
16049 ctl->mod_id = kmod->id;
16050 ctl->mod_loaded = 1;
16051 ctl->mod_flags = 0;
16052 ctl->mod_user_symbols = NULL;
16053
16054 /*
16055		 * Find the UUID for this module, if it has one, by walking its Mach-O load commands
16056 */
16057 kernel_mach_header_t* header = (kernel_mach_header_t *)ctl->mod_address;
16058 struct load_command* load_cmd = (struct load_command *)&header[1];
16059 uint32_t i;
16060 for (i = 0; i < header->ncmds; i++) {
16061 if (load_cmd->cmd == LC_UUID) {
16062 struct uuid_command* uuid_cmd = (struct uuid_command *)load_cmd;
16063 memcpy(ctl->mod_uuid, uuid_cmd->uuid, sizeof(uuid_cmd->uuid));
16064 ctl->mod_flags |= MODCTL_HAS_UUID;
16065 break;
16066 }
16067 load_cmd = (struct load_command *)((caddr_t)load_cmd + load_cmd->cmdsize);
16068 }
16069
16070 if (ctl->mod_address == g_kernel_kmod_info.address) {
16071 ctl->mod_flags |= MODCTL_IS_MACH_KERNEL;
16072 memcpy(dtrace_kerneluuid, ctl->mod_uuid, sizeof(dtrace_kerneluuid));
16073 }
16074 /*
16075		 * Static kexts have a UUID that is not used for symbolication,
16076		 * as all their symbols are in the kernel.
16077 */
16078 else if ((flag & KMOD_DTRACE_STATIC_KEXT) == KMOD_DTRACE_STATIC_KEXT) {
16079 memcpy(ctl->mod_uuid, dtrace_kerneluuid, sizeof(dtrace_kerneluuid));
16080 ctl->mod_flags |= MODCTL_IS_STATIC_KEXT;
16081 }
16082 }
16083 dtrace_modctl_add(ctl);
16084
16085 /*
16086	 * We must hold the dtrace_lock to safely test the non-permanent dtrace_kernel_symbol_mode values
16087 */
16088 lck_mtx_lock(&dtrace_lock);
16089
16090 /*
16091 * DTrace must decide if it will instrument modules lazily via
16092 * userspace symbols (default mode), or instrument immediately via
16093	 * kernel symbols (non-default mode).
16094 *
16095 * When in default/lazy mode, DTrace will only support modules
16096 * built with a valid UUID.
16097 *
16098 * Overriding the default can be done explicitly in one of
16099 * the following two ways.
16100 *
16101 * A module can force symbols from kernel space using the plist key,
16102 * OSBundleForceDTraceInit (see kmod.h). If this per kext state is set,
16103 * we fall through and instrument this module now.
16104 *
16105 * Or, the boot-arg, dtrace_kernel_symbol_mode, can be set to force symbols
16106 * from kernel space (see dtrace_impl.h). If this system state is set
16107 * to a non-userspace mode, we fall through and instrument the module now.
16108 */
16109
16110 if ((dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE) &&
16111 (!(flag & KMOD_DTRACE_FORCE_INIT)))
16112 {
16113 /* We will instrument the module lazily -- this is the default */
16114 lck_mtx_unlock(&dtrace_lock);
16115 lck_mtx_unlock(&mod_lock);
16116 lck_mtx_unlock(&dtrace_provider_lock);
16117 return 0;
16118 }
16119
16120 /* We will instrument the module immediately using kernel symbols */
16121 ctl->mod_flags |= MODCTL_HAS_KERNEL_SYMBOLS;
16122
16123 lck_mtx_unlock(&dtrace_lock);
16124
16125 /*
16126	 * We're going to call each provider's per-module provide operation
16127 * specifying only this module.
16128 */
16129 for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
16130 prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
16131
16132 /*
16133 * APPLE NOTE: The contract with the kext loader is that once this function
16134 * has completed, it may delete kernel symbols at will.
16135 * We must set this while still holding the mod_lock.
16136 */
16137 ctl->mod_flags &= ~MODCTL_HAS_KERNEL_SYMBOLS;
16138
16139 lck_mtx_unlock(&mod_lock);
16140 lck_mtx_unlock(&dtrace_provider_lock);
16141
16142 /*
16143 * If we have any retained enablings, we need to match against them.
16144 * Enabling probes requires that cpu_lock be held, and we cannot hold
16145 * cpu_lock here -- it is legal for cpu_lock to be held when loading a
16146 * module. (In particular, this happens when loading scheduling
16147 * classes.) So if we have any retained enablings, we need to dispatch
16148 * our task queue to do the match for us.
16149 */
16150 lck_mtx_lock(&dtrace_lock);
16151
16152 if (dtrace_retained == NULL) {
16153 lck_mtx_unlock(&dtrace_lock);
16154 return 0;
16155 }
16156
16157	/* APPLE NOTE!
16158	 *
16159	 * The cpu_lock mentioned above is only held by dtrace code; Apple's xnu
16160	 * never actually holds it otherwise. The comment above is therefore
16161	 * invalid here: we can invoke dtrace_enabling_matchall directly, without
16162	 * jumping through the hoops, and we can avoid the delay call as well.
16163	 */
16164 lck_mtx_unlock(&dtrace_lock);
16165
16166 dtrace_enabling_matchall();
16167
16168 return 0;
16169}
16170
16171/*
16172 * Called with the unloading kmod, or with a NULL kmod (from dtrace_close)
16173 * to sweep every stale modctl entry. Returns 0 on success, -1 on failure.
16174 */
16175static int
16176dtrace_module_unloaded(struct kmod_info *kmod)
16177{
16178 dtrace_probe_t template, *probe, *first, *next;
16179 dtrace_provider_t *prov;
16180 struct modctl *ctl = NULL;
16181 struct modctl *syncctl = NULL;
16182 struct modctl *nextsyncctl = NULL;
16183 int syncmode = 0;
16184
16185 lck_mtx_lock(&dtrace_provider_lock);
16186 lck_mtx_lock(&mod_lock);
16187 lck_mtx_lock(&dtrace_lock);
16188
16189 if (kmod == NULL) {
16190 syncmode = 1;
16191 }
16192 else {
16193 ctl = dtrace_modctl_lookup(kmod);
16194 if (ctl == NULL)
16195 {
16196 lck_mtx_unlock(&dtrace_lock);
16197 lck_mtx_unlock(&mod_lock);
16198 lck_mtx_unlock(&dtrace_provider_lock);
16199 return (-1);
16200 }
16201 ctl->mod_loaded = 0;
16202 ctl->mod_address = 0;
16203 ctl->mod_size = 0;
16204 }
16205
16206 if (dtrace_bymod == NULL) {
16207 /*
16208 * The DTrace module is loaded (obviously) but not attached;
16209 * we don't have any work to do.
16210 */
16211 if (ctl != NULL)
16212			dtrace_modctl_remove(ctl);
16213 lck_mtx_unlock(&dtrace_lock);
16214 lck_mtx_unlock(&mod_lock);
16215 lck_mtx_unlock(&dtrace_provider_lock);
16216 return(0);
16217 }
16218
16219	/* Syncmode set means we target and traverse the entire modctl list. */
16220 if (syncmode)
16221 nextsyncctl = dtrace_modctl_list;
16222
16223syncloop:
16224 if (syncmode)
16225 {
16226 /* find a stale modctl struct */
16227 for (syncctl = nextsyncctl; syncctl != NULL; syncctl=syncctl->mod_next) {
16228 if (syncctl->mod_address == 0)
16229 break;
16230 }
16231 if (syncctl==NULL)
16232 {
16233 /* We have no more work to do */
16234 lck_mtx_unlock(&dtrace_lock);
16235 lck_mtx_unlock(&mod_lock);
16236 lck_mtx_unlock(&dtrace_provider_lock);
16237 return(0);
16238 }
16239 else {
16240 /* keep track of next syncctl in case this one is removed */
16241 nextsyncctl = syncctl->mod_next;
16242 ctl = syncctl;
16243 }
16244 }
16245
16246 template.dtpr_mod = ctl->mod_modname;
16247
16248 for (probe = first = dtrace_hash_lookup(dtrace_bymod, &template);
16249 probe != NULL; probe = probe->dtpr_nextmod) {
16250 if (probe->dtpr_ecb != NULL) {
16251 /*
16252 * This shouldn't _actually_ be possible -- we're
16253 * unloading a module that has an enabled probe in it.
16254 * (It's normally up to the provider to make sure that
16255 * this can't happen.) However, because dtps_enable()
16256 * doesn't have a failure mode, there can be an
16257 * enable/unload race. Upshot: we don't want to
16258 * assert, but we're not going to disable the
16259 * probe, either.
16260 */
16261
16262
16263 if (syncmode) {
16264 /* We're syncing, let's look at next in list */
16265 goto syncloop;
16266 }
16267
16268 lck_mtx_unlock(&dtrace_lock);
16269 lck_mtx_unlock(&mod_lock);
16270 lck_mtx_unlock(&dtrace_provider_lock);
16271
16272 if (dtrace_err_verbose) {
16273 cmn_err(CE_WARN, "unloaded module '%s' had "
16274 "enabled probes", ctl->mod_modname);
16275 }
16276 return(-1);
16277 }
16278 }
16279
16280 probe = first;
16281
16282 for (first = NULL; probe != NULL; probe = next) {
16283 ASSERT(dtrace_probes[probe->dtpr_id - 1] == probe);
16284
16285 dtrace_probes[probe->dtpr_id - 1] = NULL;
16286 probe->dtpr_provider->dtpv_probe_count--;
16287
16288 next = probe->dtpr_nextmod;
16289 dtrace_hash_remove(dtrace_byprov, probe);
16290 dtrace_hash_remove(dtrace_bymod, probe);
16291 dtrace_hash_remove(dtrace_byfunc, probe);
16292 dtrace_hash_remove(dtrace_byname, probe);
16293
16294 if (first == NULL) {
16295 first = probe;
16296 probe->dtpr_nextmod = NULL;
16297 } else {
16298 probe->dtpr_nextmod = first;
16299 first = probe;
16300 }
16301 }
16302
16303 /*
16304 * We've removed all of the module's probes from the hash chains and
16305 * from the probe array. Now issue a dtrace_sync() to be sure that
16306 * everyone has cleared out from any probe array processing.
16307 */
16308 dtrace_sync();
16309
16310 for (probe = first; probe != NULL; probe = first) {
16311 first = probe->dtpr_nextmod;
16312 prov = probe->dtpr_provider;
16313 prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id,
16314 probe->dtpr_arg);
16315 dtrace_strunref(probe->dtpr_mod);
16316 dtrace_strunref(probe->dtpr_func);
16317 dtrace_strunref(probe->dtpr_name);
16318 vmem_free(dtrace_arena, (void *)(uintptr_t)probe->dtpr_id, 1);
16319
16320 zfree(dtrace_probe_t_zone, probe);
16321 }
16322
16323 dtrace_modctl_remove(ctl);
16324
16325 if (syncmode)
16326 goto syncloop;
16327
16328 lck_mtx_unlock(&dtrace_lock);
16329 lck_mtx_unlock(&mod_lock);
16330 lck_mtx_unlock(&dtrace_provider_lock);
16331
16332 return(0);
16333}
16334
16335void
16336dtrace_suspend(void)
16337{
16338 dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_suspend));
16339}
16340
16341void
16342dtrace_resume(void)
16343{
16344 dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_resume));
16345}
16346
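/*
 * CPU hotplug callback, registered with register_cpu_setup_func() in
 * dtrace_attach(). On CPU_CONFIG, buffers for active anonymous state are
 * grown onto the new CPU by briefly forcing a manual, single-CPU buffer
 * resize.
 */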
16347static int
16348dtrace_cpu_setup(cpu_setup_t what, processorid_t cpu)
16349{
16350 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
16351 lck_mtx_lock(&dtrace_lock);
16352
16353 switch (what) {
16354 case CPU_CONFIG: {
16355 dtrace_state_t *state;
16356 dtrace_optval_t *opt, rs, c;
16357
16358 /*
16359 * For now, we only allocate a new buffer for anonymous state.
16360 */
16361 if ((state = dtrace_anon.dta_state) == NULL)
16362 break;
16363
16364 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
16365 break;
16366
16367 opt = state->dts_options;
16368 c = opt[DTRACEOPT_CPU];
16369
16370 if (c != DTRACE_CPUALL && c != DTRACEOPT_UNSET && c != cpu)
16371 break;
16372
16373 /*
16374 * Regardless of what the actual policy is, we're going to
16375 * temporarily set our resize policy to be manual. We're
16376 * also going to temporarily set our CPU option to denote
16377 * the newly configured CPU.
16378 */
16379 rs = opt[DTRACEOPT_BUFRESIZE];
16380 opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_MANUAL;
16381 opt[DTRACEOPT_CPU] = (dtrace_optval_t)cpu;
16382
16383 (void) dtrace_state_buffers(state);
16384
16385 opt[DTRACEOPT_BUFRESIZE] = rs;
16386 opt[DTRACEOPT_CPU] = c;
16387
16388 break;
16389 }
16390
16391 case CPU_UNCONFIG:
16392 /*
16393 * We don't free the buffer in the CPU_UNCONFIG case. (The
16394 * buffer will be freed when the consumer exits.)
16395 */
16396 break;
16397
16398 default:
16399 break;
16400 }
16401
16402 lck_mtx_unlock(&dtrace_lock);
16403 return (0);
16404}
16405
16406static void
16407dtrace_cpu_setup_initial(processorid_t cpu)
16408{
16409 (void) dtrace_cpu_setup(CPU_CONFIG, cpu);
16410}
16411
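/*
 * Record a range of virtual addresses, [base, limit), that DTrace must
 * never dereference. The backing array doubles whenever it fills, and
 * existing entries are copied into the new allocation.
 */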
16412static void
16413dtrace_toxrange_add(uintptr_t base, uintptr_t limit)
16414{
16415 if (dtrace_toxranges >= dtrace_toxranges_max) {
16416 int osize, nsize;
16417 dtrace_toxrange_t *range;
16418
16419 osize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
16420
16421 if (osize == 0) {
16422 ASSERT(dtrace_toxrange == NULL);
16423 ASSERT(dtrace_toxranges_max == 0);
16424 dtrace_toxranges_max = 1;
16425 } else {
16426 dtrace_toxranges_max <<= 1;
16427 }
16428
16429 nsize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
16430 range = kmem_zalloc(nsize, KM_SLEEP);
16431
16432 if (dtrace_toxrange != NULL) {
16433 ASSERT(osize != 0);
16434 bcopy(dtrace_toxrange, range, osize);
16435 kmem_free(dtrace_toxrange, osize);
16436 }
16437
16438 dtrace_toxrange = range;
16439 }
16440
16441 ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_base == 0);
16442 ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_limit == 0);
16443
16444 dtrace_toxrange[dtrace_toxranges].dtt_base = base;
16445 dtrace_toxrange[dtrace_toxranges].dtt_limit = limit;
16446 dtrace_toxranges++;
16447}
16448
16449/*
16450 * DTrace Driver Cookbook Functions
16451 */
16452/*ARGSUSED*/
16453static int
16454dtrace_attach(dev_info_t *devi)
16455{
16456 dtrace_provider_id_t id;
16457 dtrace_state_t *state = NULL;
16458 dtrace_enabling_t *enab;
16459
16460 lck_mtx_lock(&cpu_lock);
16461 lck_mtx_lock(&dtrace_provider_lock);
16462 lck_mtx_lock(&dtrace_lock);
16463
16464	/* Darwin uses the BSD cloning device driver to automagically obtain a minor device number. */
16465 dtrace_devi = devi;
16466
16467 dtrace_modload = dtrace_module_loaded;
16468 dtrace_modunload = dtrace_module_unloaded;
16469 dtrace_cpu_init = dtrace_cpu_setup_initial;
16470 dtrace_helpers_cleanup = dtrace_helpers_destroy;
16471 dtrace_helpers_fork = dtrace_helpers_duplicate;
16472 dtrace_cpustart_init = dtrace_suspend;
16473 dtrace_cpustart_fini = dtrace_resume;
16474 dtrace_debugger_init = dtrace_suspend;
16475 dtrace_debugger_fini = dtrace_resume;
16476
16477 register_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
16478
16479 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
16480
16481 dtrace_arena = vmem_create("dtrace", (void *)1, UINT32_MAX, 1,
16482 NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
16483
16484 dtrace_state_cache = kmem_cache_create("dtrace_state_cache",
16485 sizeof (dtrace_dstate_percpu_t) * (int)NCPU, DTRACE_STATE_ALIGN,
16486 NULL, NULL, NULL, NULL, NULL, 0);
16487
16488 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
16489
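	/*
	 * Create the four probe hash tables; each indexes every probe by one
	 * component of its name (provider, module, function, name) so that
	 * matching can search on any single component.
	 */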
16490 dtrace_byprov = dtrace_hash_create(dtrace_strkey_probe_provider,
16491 0, /* unused */
16492 offsetof(dtrace_probe_t, dtpr_nextprov),
16493 offsetof(dtrace_probe_t, dtpr_prevprov));
16494
16495 dtrace_bymod = dtrace_hash_create(dtrace_strkey_deref_offset,
16496 offsetof(dtrace_probe_t, dtpr_mod),
16497 offsetof(dtrace_probe_t, dtpr_nextmod),
16498 offsetof(dtrace_probe_t, dtpr_prevmod));
16499
16500 dtrace_byfunc = dtrace_hash_create(dtrace_strkey_deref_offset,
16501 offsetof(dtrace_probe_t, dtpr_func),
16502 offsetof(dtrace_probe_t, dtpr_nextfunc),
16503 offsetof(dtrace_probe_t, dtpr_prevfunc));
16504
16505 dtrace_byname = dtrace_hash_create(dtrace_strkey_deref_offset,
16506 offsetof(dtrace_probe_t, dtpr_name),
16507 offsetof(dtrace_probe_t, dtpr_nextname),
16508 offsetof(dtrace_probe_t, dtpr_prevname));
16509
16510 if (dtrace_retain_max < 1) {
16511 cmn_err(CE_WARN, "illegal value (%lu) for dtrace_retain_max; "
16512 "setting to 1", dtrace_retain_max);
16513 dtrace_retain_max = 1;
16514 }
16515
16516 /*
16517 * Now discover our toxic ranges.
16518 */
16519 dtrace_toxic_ranges(dtrace_toxrange_add);
16520
16521 /*
16522 * Before we register ourselves as a provider to our own framework,
16523 * we would like to assert that dtrace_provider is NULL -- but that's
16524 * not true if we were loaded as a dependency of a DTrace provider.
16525 * Once we've registered, we can assert that dtrace_provider is our
16526 * pseudo provider.
16527 */
16528 (void) dtrace_register("dtrace", &dtrace_provider_attr,
16529 DTRACE_PRIV_NONE, 0, &dtrace_provider_ops, NULL, &id);
16530
16531 ASSERT(dtrace_provider != NULL);
16532 ASSERT((dtrace_provider_id_t)dtrace_provider == id);
16533
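	/*
	 * The integer after the probe name below is the aframes count: the
	 * number of artificial stack frames to skip when walking the stack
	 * from these probes. arm/arm64 consume one more frame than x86_64
	 * for each of BEGIN, END and ERROR.
	 */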
16534#if defined (__x86_64__)
16535 dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t)
16536 dtrace_provider, NULL, NULL, "BEGIN", 1, NULL);
16537 dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t)
16538 dtrace_provider, NULL, NULL, "END", 0, NULL);
16539 dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t)
16540 dtrace_provider, NULL, NULL, "ERROR", 3, NULL);
16541#elif (defined(__arm__) || defined(__arm64__))
16542 dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t)
16543 dtrace_provider, NULL, NULL, "BEGIN", 2, NULL);
16544 dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t)
16545 dtrace_provider, NULL, NULL, "END", 1, NULL);
16546 dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t)
16547 dtrace_provider, NULL, NULL, "ERROR", 4, NULL);
16548#else
16549#error Unknown Architecture
16550#endif
16551
16552 dtrace_anon_property();
16553 lck_mtx_unlock(&cpu_lock);
16554
16555 /*
16556 * If DTrace helper tracing is enabled, we need to allocate the
16557 * trace buffer and initialize the values.
16558 */
16559 if (dtrace_helptrace_enabled) {
16560 ASSERT(dtrace_helptrace_buffer == NULL);
16561 dtrace_helptrace_buffer =
16562 kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP);
16563 dtrace_helptrace_next = 0;
16564 }
16565
16566 /*
16567 * If there are already providers, we must ask them to provide their
16568 * probes, and then match any anonymous enabling against them. Note
16569 * that there should be no other retained enablings at this time:
16570 * the only retained enablings at this time should be the anonymous
16571 * enabling.
16572 */
16573 if (dtrace_anon.dta_enabling != NULL) {
16574 ASSERT(dtrace_retained == dtrace_anon.dta_enabling);
16575
16576 /*
16577 * APPLE NOTE: if handling anonymous dof, switch symbol modes.
16578 */
16579 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE) {
16580 dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_KERNEL;
16581 }
16582
16583 dtrace_enabling_provide(NULL);
16584 state = dtrace_anon.dta_state;
16585
16586 /*
16587 * We couldn't hold cpu_lock across the above call to
16588 * dtrace_enabling_provide(), but we must hold it to actually
16589 * enable the probes. We have to drop all of our locks, pick
16590 * up cpu_lock, and regain our locks before matching the
16591 * retained anonymous enabling.
16592 */
16593 lck_mtx_unlock(&dtrace_lock);
16594 lck_mtx_unlock(&dtrace_provider_lock);
16595
16596 lck_mtx_lock(&cpu_lock);
16597 lck_mtx_lock(&dtrace_provider_lock);
16598 lck_mtx_lock(&dtrace_lock);
16599
16600 if ((enab = dtrace_anon.dta_enabling) != NULL)
16601 (void) dtrace_enabling_match(enab, NULL, NULL);
16602
16603 lck_mtx_unlock(&cpu_lock);
16604 }
16605
16606 lck_mtx_unlock(&dtrace_lock);
16607 lck_mtx_unlock(&dtrace_provider_lock);
16608
16609 if (state != NULL) {
16610 /*
16611 * If we created any anonymous state, set it going now.
16612 */
16613 (void) dtrace_state_go(state, &dtrace_anon.dta_beganon);
16614 }
16615
16616 return (DDI_SUCCESS);
16617}
16618
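/*
 * Driver open entry point. Each open creates a fresh consumer state via
 * dtrace_state_create(); an open while in lazy mode also transitions to
 * DTRACE_DOF_MODE_LAZY_OFF and faults in lazy dofs for all existing
 * processes.
 */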
16619/*ARGSUSED*/
16620static int
16621dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
16622{
16623#pragma unused(flag, otyp)
16624 dtrace_state_t *state;
16625 uint32_t priv;
16626 uid_t uid;
16627 zoneid_t zoneid;
16628 int rv;
16629
16630 /* APPLE: Darwin puts Helper on its own major device. */
16631
16632 /*
16633 * If no DTRACE_PRIV_* bits are set in the credential, then the
16634 * caller lacks sufficient permission to do anything with DTrace.
16635 */
16636 dtrace_cred2priv(cred_p, &priv, &uid, &zoneid);
16637 if (priv == DTRACE_PRIV_NONE)
16638 return (EACCES);
16639
16640 /*
16641 * APPLE NOTE: We delay the initialization of fasttrap as late as possible.
16642 * It certainly can't be later than now!
16643 */
16644 fasttrap_init();
16645
16646 /*
16647 * Ask all providers to provide all their probes.
16648 */
16649 lck_mtx_lock(&dtrace_provider_lock);
16650 dtrace_probe_provide(NULL, NULL);
16651 lck_mtx_unlock(&dtrace_provider_lock);
16652
16653 lck_mtx_lock(&cpu_lock);
16654 lck_mtx_lock(&dtrace_lock);
16655 dtrace_opens++;
16656 dtrace_membar_producer();
16657
16658#ifdef illumos
16659 /*
16660 * If the kernel debugger is active (that is, if the kernel debugger
16661 * modified text in some way), we won't allow the open.
16662 */
16663 if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
16664 dtrace_opens--;
16665 lck_mtx_unlock(&dtrace_lock);
16666 lck_mtx_unlock(&cpu_lock);
16667 return (EBUSY);
16668 }
16669#endif
16670
16671 rv = dtrace_state_create(devp, cred_p, &state);
16672 lck_mtx_unlock(&cpu_lock);
16673
16674 if (rv != 0 || state == NULL) {
16675 if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL) {
16676#ifdef illumos
16677 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
16678#endif
16679 }
16680 lck_mtx_unlock(&dtrace_lock);
16681 /* propagate EAGAIN or ERESTART */
16682 return (rv);
16683 }
16684
16685 lck_mtx_unlock(&dtrace_lock);
16686
16687 lck_rw_lock_exclusive(&dtrace_dof_mode_lock);
16688
16689 /*
16690 * If we are currently lazy, transition states.
16691 *
16692 * Unlike dtrace_close, we do not need to check the
16693 * value of dtrace_opens, as any positive value (and
16694 * we count as 1) means we transition states.
16695 */
16696 if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON) {
16697 dtrace_dof_mode = DTRACE_DOF_MODE_LAZY_OFF;
16698 /*
16699		 * We do not need to hold the exclusive lock while processing
16700		 * DOF on processes. We do need to make sure the mode does not
16701		 * get changed back to DTRACE_DOF_MODE_LAZY_ON during that stage
16702		 * (which should not happen anyway, since that only happens in
16703		 * dtrace_close). There is no way incomplete USDT probes can be
16704		 * activated by any DTrace client here, since they all have to
16705		 * call dtrace_open and block on dtrace_dof_mode_lock.
16706 */
16707 lck_rw_lock_exclusive_to_shared(&dtrace_dof_mode_lock);
16708 /*
16709 * Iterate all existing processes and load lazy dofs.
16710 */
16711 proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS,
16712 dtrace_lazy_dofs_proc_iterate_doit,
16713 NULL,
16714 dtrace_lazy_dofs_proc_iterate_filter,
16715 NULL);
16716
16717 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
16718 }
16719 else {
16720 lck_rw_unlock_exclusive(&dtrace_dof_mode_lock);
16721 }
16722
16723
16724 /*
16725 * Update kernel symbol state.
16726 *
16727 * We must own the provider and dtrace locks.
16728 *
16729 * NOTE! It may appear there is a race by setting this value so late
16730 * after dtrace_probe_provide. However, any kext loaded after the
16731 * call to probe provide and before we set LAZY_OFF will be marked as
16732 * eligible for symbols from userspace. The same dtrace that is currently
16733 * calling dtrace_open() (this call!) will get a list of kexts needing
16734 * symbols and fill them in, thus closing the race window.
16735 *
16736	 * We want to set this value only after we are certain it will succeed,
16737	 * as this significantly reduces the complexity of error exits.
16738 */
16739 lck_mtx_lock(&dtrace_lock);
16740 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE) {
16741 dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_KERNEL;
16742 }
16743 lck_mtx_unlock(&dtrace_lock);
16744
16745 return (0);
16746}
16747
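/*
 * Driver close entry point. Destroys the consumer state (and any grabbed
 * anonymous state); on the last close, returns to lazy DOF mode and
 * userspace kernel-symbol mode, then reaps orphaned kext probes via
 * dtrace_module_unloaded(NULL).
 */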
16748/*ARGSUSED*/
16749static int
16750dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
16751{
16752#pragma unused(flag, otyp, cred_p) /* __APPLE__ */
16753 minor_t minor = getminor(dev);
16754 dtrace_state_t *state;
16755
16756 /* APPLE NOTE: Darwin puts Helper on its own major device. */
16757 state = dtrace_state_get(minor);
16758
16759 lck_mtx_lock(&cpu_lock);
16760 lck_mtx_lock(&dtrace_lock);
16761
16762 if (state->dts_anon) {
16763 /*
16764 * There is anonymous state. Destroy that first.
16765 */
16766 ASSERT(dtrace_anon.dta_state == NULL);
16767 dtrace_state_destroy(state->dts_anon);
16768 }
16769
16770 dtrace_state_destroy(state);
16771 ASSERT(dtrace_opens > 0);
16772
16773 /*
16774 * Only relinquish control of the kernel debugger interface when there
16775 * are no consumers and no anonymous enablings.
16776 */
16777 if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL) {
16778#ifdef illumos
16779 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
16780#endif
16781 }
16782
16783 lck_mtx_unlock(&dtrace_lock);
16784 lck_mtx_unlock(&cpu_lock);
16785
16786 /*
16787 * Lock ordering requires the dof mode lock be taken before
16788 * the dtrace_lock.
16789 */
16790 lck_rw_lock_exclusive(&dtrace_dof_mode_lock);
16791 lck_mtx_lock(&dtrace_lock);
16792
16793 if (dtrace_opens == 0) {
16794 /*
16795 * If we are currently lazy-off, and this is the last close, transition to
16796 * lazy state.
16797 */
16798 if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_OFF) {
16799 dtrace_dof_mode = DTRACE_DOF_MODE_LAZY_ON;
16800 }
16801
16802 /*
16803 * If we are the last dtrace client, switch back to lazy (from userspace) symbols
16804 */
16805 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_KERNEL) {
16806 dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE;
16807 }
16808 }
16809
16810 lck_mtx_unlock(&dtrace_lock);
16811 lck_rw_unlock_exclusive(&dtrace_dof_mode_lock);
16812
16813 /*
16814 * Kext probes may be retained past the end of the kext's lifespan. The
16815 * probes are kept until the last reference to them has been removed.
16816 * Since closing an active dtrace context is likely to drop that last reference,
16817 * let's take a shot at cleaning out the orphaned probes now.
16818 */
16819 dtrace_module_unloaded(NULL);
16820
16821 return (0);
16822}
16823
16824/*ARGSUSED*/
16825static int
16826dtrace_ioctl_helper(u_long cmd, caddr_t arg, int *rv)
16827{
16828#pragma unused(rv)
16829 /*
16830 * Safe to check this outside the dof mode lock
16831 */
16832 if (dtrace_dof_mode == DTRACE_DOF_MODE_NEVER)
16833 return KERN_SUCCESS;
16834
16835 switch (cmd) {
16836#if defined (__arm64__)
16837 case DTRACEHIOC_ADDDOF_U32:
16838 case DTRACEHIOC_ADDDOF_U64:
16839#else
16840 case DTRACEHIOC_ADDDOF:
16841#endif /* __arm64__*/
16842 {
16843 dof_helper_t *dhp = NULL;
16844 size_t dof_ioctl_data_size;
16845 dof_ioctl_data_t* multi_dof;
16846 unsigned int i;
16847 int rval = 0;
16848 user_addr_t user_address = *(user_addr_t*)arg;
16849 uint64_t dof_count;
16850 int multi_dof_claimed = 0;
16851 proc_t* p = current_proc();
16852
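		/*
		 * dof_ioctl_data_t is variable-length: a dofiod_count followed by
		 * that many dof_helper_t entries; DOF_IOCTL_DATA_T_SIZE() accounts
		 * for both. The count is copied in first to size the allocation,
		 * then the whole structure is copied and the count re-validated.
		 *
		 * A minimal userland sketch of this call (error handling elided;
		 * note the ioctl argument is the address of the pointer):
		 *
		 *	dof_ioctl_data_t *d = malloc(DOF_IOCTL_DATA_T_SIZE(1));
		 *	d->dofiod_count = 1;
		 *	d->dofiod_helpers[0].dofhp_addr = (uint64_t)(uintptr_t)dof;
		 *	d->dofiod_helpers[0].dofhp_dof  = (uint64_t)(uintptr_t)dof;
		 *	ioctl(fd, DTRACEHIOC_ADDDOF, &d);
		 *	generation = (int)d->dofiod_helpers[0].dofhp_dof;
		 */
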
16853 /*
16854 * If this is a restricted process and dtrace is restricted,
16855 * do not allow DOFs to be registered
16856 */
16857 if (dtrace_is_restricted() &&
16858 !dtrace_are_restrictions_relaxed() &&
16859 !dtrace_can_attach_to_proc(current_proc())) {
16860 return (EACCES);
16861 }
16862
16863 /*
16864		 * Read the number of DOF entries being passed in.
16865 */
16866 if (copyin(user_address + offsetof(dof_ioctl_data_t, dofiod_count),
16867 &dof_count,
16868 sizeof(dof_count))) {
16869 dtrace_dof_error(NULL, "failed to copyin dofiod_count");
16870 return (EFAULT);
16871 }
16872
16873 /*
16874 * Range check the count.
16875 */
16876 if (dof_count == 0 || dof_count > 1024) {
16877 dtrace_dof_error(NULL, "dofiod_count is not valid");
16878 return (EINVAL);
16879 }
16880
16881 /*
16882 * Allocate a correctly sized structure and copyin the data.
16883 */
16884 dof_ioctl_data_size = DOF_IOCTL_DATA_T_SIZE(dof_count);
16885 if ((multi_dof = kmem_alloc(dof_ioctl_data_size, KM_SLEEP)) == NULL)
16886 return (ENOMEM);
16887
16888		/* NOTE! We can no longer exit this function via a direct return */
16889 if (copyin(user_address, multi_dof, dof_ioctl_data_size) != 0) {
16890 dtrace_dof_error(NULL, "failed copyin of dof_ioctl_data_t");
16891 rval = EFAULT;
16892 goto cleanup;
16893 }
16894
16895 /*
16896 * Check that the count didn't change between the first copyin and the second.
16897 */
16898 if (multi_dof->dofiod_count != dof_count) {
16899 rval = EINVAL;
16900 goto cleanup;
16901 }
16902
16903 /*
16904 * Try to process lazily first.
16905 */
16906 rval = dtrace_lazy_dofs_add(p, multi_dof, &multi_dof_claimed);
16907
16908 /*
16909 * If rval is EACCES, we must be non-lazy.
16910 */
16911 if (rval == EACCES) {
16912 rval = 0;
16913 /*
16914 * Process each dof_helper_t
16915 */
16916 i = 0;
16917 do {
16918 dhp = &multi_dof->dofiod_helpers[i];
16919
16920 dof_hdr_t *dof = dtrace_dof_copyin(dhp->dofhp_dof, &rval);
16921
16922 if (dof != NULL) {
16923 lck_mtx_lock(&dtrace_meta_lock);
16924 lck_mtx_lock(&dtrace_lock);
16925
16926 /*
16927 * dtrace_helper_slurp() takes responsibility for the dof --
16928 * it may free it now or it may save it and free it later.
16929 */
16930 if ((dhp->dofhp_dof = (uint64_t)dtrace_helper_slurp(p, dof, dhp)) == -1ULL) {
16931 rval = EINVAL;
16932 }
16933
16934 lck_mtx_unlock(&dtrace_lock);
16935 lck_mtx_unlock(&dtrace_meta_lock);
16936 }
16937 } while (++i < multi_dof->dofiod_count && rval == 0);
16938 }
16939
16940 /*
16941 * We need to copyout the multi_dof struct, because it contains
16942 * the generation (unique id) values needed to call DTRACEHIOC_REMOVE
16943 *
16944 * This could certainly be better optimized.
16945 */
16946 if (copyout(multi_dof, user_address, dof_ioctl_data_size) != 0) {
16947 dtrace_dof_error(NULL, "failed copyout of dof_ioctl_data_t");
16948 /* Don't overwrite pre-existing error code */
16949 if (rval == 0) rval = EFAULT;
16950 }
16951
16952 cleanup:
16953 /*
16954 * If we had to allocate struct memory, free it.
16955 */
16956 if (multi_dof != NULL && !multi_dof_claimed) {
16957 kmem_free(multi_dof, dof_ioctl_data_size);
16958 }
16959
16960 return rval;
16961 }
16962
16963 case DTRACEHIOC_REMOVE: {
16964 int generation = *(int*)arg;
16965 proc_t* p = current_proc();
16966
16967 /*
16968 * Try lazy first.
16969 */
16970 int rval = dtrace_lazy_dofs_remove(p, generation);
16971
16972 /*
16973 * EACCES means non-lazy
16974 */
16975 if (rval == EACCES) {
16976 lck_mtx_lock(&dtrace_meta_lock);
16977 lck_mtx_lock(&dtrace_lock);
16978 rval = dtrace_helper_destroygen(p, generation);
16979 lck_mtx_unlock(&dtrace_lock);
16980 lck_mtx_unlock(&dtrace_meta_lock);
16981 }
16982
16983 return (rval);
16984 }
16985
16986 default:
16987 break;
16988 }
16989
16990 return ENOTTY;
16991}
16992
16993/*ARGSUSED*/
16994static int
16995dtrace_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t *cr, int *rv)
16996{
16997#pragma unused(md)
16998 minor_t minor = getminor(dev);
16999 dtrace_state_t *state;
17000 int rval;
17001
17002 /* Darwin puts Helper on its own major device. */
17003
17004 state = dtrace_state_get(minor);
17005
17006 if (state->dts_anon) {
17007 ASSERT(dtrace_anon.dta_state == NULL);
17008 state = state->dts_anon;
17009 }
17010
17011 switch (cmd) {
17012 case DTRACEIOC_PROVIDER: {
17013 dtrace_providerdesc_t pvd;
17014 dtrace_provider_t *pvp;
17015
17016 if (copyin(arg, &pvd, sizeof (pvd)) != 0)
17017 return (EFAULT);
17018
17019 pvd.dtvd_name[DTRACE_PROVNAMELEN - 1] = '\0';
17020 lck_mtx_lock(&dtrace_provider_lock);
17021
17022 for (pvp = dtrace_provider; pvp != NULL; pvp = pvp->dtpv_next) {
17023 if (strncmp(pvp->dtpv_name, pvd.dtvd_name, DTRACE_PROVNAMELEN) == 0)
17024 break;
17025 }
17026
17027 lck_mtx_unlock(&dtrace_provider_lock);
17028
17029 if (pvp == NULL)
17030 return (ESRCH);
17031
17032 bcopy(&pvp->dtpv_priv, &pvd.dtvd_priv, sizeof (dtrace_ppriv_t));
17033 bcopy(&pvp->dtpv_attr, &pvd.dtvd_attr, sizeof (dtrace_pattr_t));
17034 if (copyout(&pvd, arg, sizeof (pvd)) != 0)
17035 return (EFAULT);
17036
17037 return (0);
17038 }
17039
17040 case DTRACEIOC_EPROBE: {
17041 dtrace_eprobedesc_t epdesc;
17042 dtrace_ecb_t *ecb;
17043 dtrace_action_t *act;
17044 void *buf;
17045 size_t size;
17046 uintptr_t dest;
17047 int nrecs;
17048
17049 if (copyin(arg, &epdesc, sizeof (epdesc)) != 0)
17050 return (EFAULT);
17051
17052 lck_mtx_lock(&dtrace_lock);
17053
17054 if ((ecb = dtrace_epid2ecb(state, epdesc.dtepd_epid)) == NULL) {
17055 lck_mtx_unlock(&dtrace_lock);
17056 return (EINVAL);
17057 }
17058
17059 if (ecb->dte_probe == NULL) {
17060 lck_mtx_unlock(&dtrace_lock);
17061 return (EINVAL);
17062 }
17063
17064 epdesc.dtepd_probeid = ecb->dte_probe->dtpr_id;
17065 epdesc.dtepd_uarg = ecb->dte_uarg;
17066 epdesc.dtepd_size = ecb->dte_size;
17067
17068 nrecs = epdesc.dtepd_nrecs;
17069 epdesc.dtepd_nrecs = 0;
17070 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
17071 if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
17072 continue;
17073
17074 epdesc.dtepd_nrecs++;
17075 }
17076
17077 /*
17078 * Now that we have the size, we need to allocate a temporary
17079 * buffer in which to store the complete description. We need
17080 * the temporary buffer to be able to drop dtrace_lock()
17081 * across the copyout(), below.
17082 */
17083 size = sizeof (dtrace_eprobedesc_t) +
17084 (epdesc.dtepd_nrecs * sizeof (dtrace_recdesc_t));
17085
17086 buf = kmem_alloc(size, KM_SLEEP);
17087 dest = (uintptr_t)buf;
17088
17089 bcopy(&epdesc, (void *)dest, sizeof (epdesc));
17090 dest += offsetof(dtrace_eprobedesc_t, dtepd_rec[0]);
17091
17092 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
17093 if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
17094 continue;
17095
17096 if (nrecs-- == 0)
17097 break;
17098
17099 bcopy(&act->dta_rec, (void *)dest,
17100 sizeof (dtrace_recdesc_t));
17101 dest += sizeof (dtrace_recdesc_t);
17102 }
17103
17104 lck_mtx_unlock(&dtrace_lock);
17105
17106 if (copyout(buf, arg, dest - (uintptr_t)buf) != 0) {
17107 kmem_free(buf, size);
17108 return (EFAULT);
17109 }
17110
17111 kmem_free(buf, size);
17112 return (0);
17113 }
17114
17115 case DTRACEIOC_AGGDESC: {
17116 dtrace_aggdesc_t aggdesc;
17117 dtrace_action_t *act;
17118 dtrace_aggregation_t *agg;
17119 int nrecs;
17120 uint32_t offs;
17121 dtrace_recdesc_t *lrec;
17122 void *buf;
17123 size_t size;
17124 uintptr_t dest;
17125
17126 if (copyin(arg, &aggdesc, sizeof (aggdesc)) != 0)
17127 return (EFAULT);
17128
17129 lck_mtx_lock(&dtrace_lock);
17130
17131 if ((agg = dtrace_aggid2agg(state, aggdesc.dtagd_id)) == NULL) {
17132 lck_mtx_unlock(&dtrace_lock);
17133 return (EINVAL);
17134 }
17135
17136 aggdesc.dtagd_epid = agg->dtag_ecb->dte_epid;
17137
17138 nrecs = aggdesc.dtagd_nrecs;
17139 aggdesc.dtagd_nrecs = 0;
17140
17141 offs = agg->dtag_base;
17142 lrec = &agg->dtag_action.dta_rec;
17143 aggdesc.dtagd_size = lrec->dtrd_offset + lrec->dtrd_size - offs;
17144
17145 for (act = agg->dtag_first; ; act = act->dta_next) {
17146 ASSERT(act->dta_intuple ||
17147 DTRACEACT_ISAGG(act->dta_kind));
17148
17149 /*
17150 * If this action has a record size of zero, it
17151 * denotes an argument to the aggregating action.
17152 * Because the presence of this record doesn't (or
17153 * shouldn't) affect the way the data is interpreted,
17154 * we don't copy it out to save user-level the
17155 * confusion of dealing with a zero-length record.
17156 */
17157 if (act->dta_rec.dtrd_size == 0) {
17158 ASSERT(agg->dtag_hasarg);
17159 continue;
17160 }
17161
17162 aggdesc.dtagd_nrecs++;
17163
17164 if (act == &agg->dtag_action)
17165 break;
17166 }
17167
17168 /*
17169 * Now that we have the size, we need to allocate a temporary
17170 * buffer in which to store the complete description. We need
17171 * the temporary buffer to be able to drop dtrace_lock()
17172 * across the copyout(), below.
17173 */
17174 size = sizeof (dtrace_aggdesc_t) +
17175 (aggdesc.dtagd_nrecs * sizeof (dtrace_recdesc_t));
17176
17177 buf = kmem_alloc(size, KM_SLEEP);
17178 dest = (uintptr_t)buf;
17179
17180 bcopy(&aggdesc, (void *)dest, sizeof (aggdesc));
17181 dest += offsetof(dtrace_aggdesc_t, dtagd_rec[0]);
17182
17183 for (act = agg->dtag_first; ; act = act->dta_next) {
17184 dtrace_recdesc_t rec = act->dta_rec;
17185
17186 /*
17187 * See the comment in the above loop for why we pass
17188 * over zero-length records.
17189 */
17190 if (rec.dtrd_size == 0) {
17191 ASSERT(agg->dtag_hasarg);
17192 continue;
17193 }
17194
17195 if (nrecs-- == 0)
17196 break;
17197
17198 rec.dtrd_offset -= offs;
17199 bcopy(&rec, (void *)dest, sizeof (rec));
17200 dest += sizeof (dtrace_recdesc_t);
17201
17202 if (act == &agg->dtag_action)
17203 break;
17204 }
17205
17206 lck_mtx_unlock(&dtrace_lock);
17207
17208 if (copyout(buf, arg, dest - (uintptr_t)buf) != 0) {
17209 kmem_free(buf, size);
17210 return (EFAULT);
17211 }
17212
17213 kmem_free(buf, size);
17214 return (0);
17215 }
17216
17217 case DTRACEIOC_ENABLE: {
17218 dof_hdr_t *dof;
17219 dtrace_enabling_t *enab = NULL;
17220 dtrace_vstate_t *vstate;
17221 int err = 0;
17222
17223 *rv = 0;
17224
17225 /*
17226 * If a NULL argument has been passed, we take this as our
17227 * cue to reevaluate our enablings.
17228 */
17229 if (arg == 0) {
17230 dtrace_enabling_matchall();
17231
17232 return (0);
17233 }
17234
17235 if ((dof = dtrace_dof_copyin(arg, &rval)) == NULL)
17236 return (rval);
17237
17238 lck_mtx_lock(&cpu_lock);
17239 lck_mtx_lock(&dtrace_lock);
17240 vstate = &state->dts_vstate;
17241
17242 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
17243 lck_mtx_unlock(&dtrace_lock);
17244 lck_mtx_unlock(&cpu_lock);
17245 dtrace_dof_destroy(dof);
17246 return (EBUSY);
17247 }
17248
17249 if (dtrace_dof_slurp(dof, vstate, cr, &enab, 0, B_TRUE) != 0) {
17250 lck_mtx_unlock(&dtrace_lock);
17251 lck_mtx_unlock(&cpu_lock);
17252 dtrace_dof_destroy(dof);
17253 return (EINVAL);
17254 }
17255
17256 if ((rval = dtrace_dof_options(dof, state)) != 0) {
17257 dtrace_enabling_destroy(enab);
17258 lck_mtx_unlock(&dtrace_lock);
17259 lck_mtx_unlock(&cpu_lock);
17260 dtrace_dof_destroy(dof);
17261 return (rval);
17262 }
17263
17264 if ((err = dtrace_enabling_match(enab, rv, NULL)) == 0) {
17265 err = dtrace_enabling_retain(enab);
17266 } else {
17267 dtrace_enabling_destroy(enab);
17268 }
17269
17270 lck_mtx_unlock(&dtrace_lock);
17271 lck_mtx_unlock(&cpu_lock);
17272 dtrace_dof_destroy(dof);
17273
17274 return (err);
17275 }
17276
17277 case DTRACEIOC_REPLICATE: {
17278 dtrace_repldesc_t desc;
17279 dtrace_probedesc_t *match = &desc.dtrpd_match;
17280 dtrace_probedesc_t *create = &desc.dtrpd_create;
17281 int err;
17282
17283 if (copyin(arg, &desc, sizeof (desc)) != 0)
17284 return (EFAULT);
17285
17286 match->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
17287 match->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
17288 match->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
17289 match->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
17290
17291 create->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
17292 create->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
17293 create->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
17294 create->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
17295
17296 lck_mtx_lock(&dtrace_lock);
17297 err = dtrace_enabling_replicate(state, match, create);
17298 lck_mtx_unlock(&dtrace_lock);
17299
17300 return (err);
17301 }
17302
17303 case DTRACEIOC_PROBEMATCH:
17304 case DTRACEIOC_PROBES: {
17305 dtrace_probe_t *probe = NULL;
17306 dtrace_probedesc_t desc;
17307 dtrace_probekey_t pkey;
17308 dtrace_id_t i;
17309 int m = 0;
17310 uint32_t priv;
17311 uid_t uid;
17312 zoneid_t zoneid;
17313
17314 if (copyin(arg, &desc, sizeof (desc)) != 0)
17315 return (EFAULT);
17316
17317 desc.dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
17318 desc.dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
17319 desc.dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
17320 desc.dtpd_name[DTRACE_NAMELEN - 1] = '\0';
17321
17322 /*
17323 * Before we attempt to match this probe, we want to give
17324 * all providers the opportunity to provide it.
17325 */
17326 if (desc.dtpd_id == DTRACE_IDNONE) {
17327 lck_mtx_lock(&dtrace_provider_lock);
17328 dtrace_probe_provide(&desc, NULL);
17329 lck_mtx_unlock(&dtrace_provider_lock);
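			/*
			 * DTRACE_IDNONE is zero, so bumping dtpd_id here makes
			 * the searches below begin at the first probe ID.
			 */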
17330 desc.dtpd_id++;
17331 }
17332
17333 dtrace_cred2priv(cr, &priv, &uid, &zoneid);
17334
17335 lck_mtx_lock(&dtrace_lock);
17336
17337 if (cmd == DTRACEIOC_PROBEMATCH) {
17338 dtrace_probekey(&desc, &pkey);
17339 pkey.dtpk_id = DTRACE_IDNONE;
17340
17341 /* Quiet compiler warning */
17342 for (i = desc.dtpd_id; i <= (dtrace_id_t)dtrace_nprobes; i++) {
17343 if ((probe = dtrace_probes[i - 1]) != NULL &&
17344 (m = dtrace_match_probe(probe, &pkey,
17345 priv, uid, zoneid)) != 0)
17346 break;
17347 }
17348
17349 if (m < 0) {
17350 lck_mtx_unlock(&dtrace_lock);
17351 return (EINVAL);
17352 }
17353 dtrace_probekey_release(&pkey);
17354
17355 } else {
17356 /* Quiet compiler warning */
17357 for (i = desc.dtpd_id; i <= (dtrace_id_t)dtrace_nprobes; i++) {
17358 if ((probe = dtrace_probes[i - 1]) != NULL &&
17359 dtrace_match_priv(probe, priv, uid, zoneid))
17360 break;
17361 }
17362 }
17363
17364 if (probe == NULL) {
17365 lck_mtx_unlock(&dtrace_lock);
17366 return (ESRCH);
17367 }
17368
17369 dtrace_probe_description(probe, &desc);
17370 lck_mtx_unlock(&dtrace_lock);
17371
17372 if (copyout(&desc, arg, sizeof (desc)) != 0)
17373 return (EFAULT);
17374
17375 return (0);
17376 }
17377
17378 case DTRACEIOC_PROBEARG: {
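		/*
		 * Report the type of a single probe argument by delegating
		 * to the owning provider's dtps_getargdesc() entry point.
		 */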
17379 dtrace_argdesc_t desc;
17380 dtrace_probe_t *probe;
17381 dtrace_provider_t *prov;
17382
17383 if (copyin(arg, &desc, sizeof (desc)) != 0)
17384 return (EFAULT);
17385
17386 if (desc.dtargd_id == DTRACE_IDNONE)
17387 return (EINVAL);
17388
17389 if (desc.dtargd_ndx == DTRACE_ARGNONE)
17390 return (EINVAL);
17391
17392 lck_mtx_lock(&dtrace_provider_lock);
17393 lck_mtx_lock(&mod_lock);
17394 lck_mtx_lock(&dtrace_lock);
17395
17396 /* Quiet compiler warning */
17397 if (desc.dtargd_id > (dtrace_id_t)dtrace_nprobes) {
17398 lck_mtx_unlock(&dtrace_lock);
17399 lck_mtx_unlock(&mod_lock);
17400 lck_mtx_unlock(&dtrace_provider_lock);
17401 return (EINVAL);
17402 }
17403
17404 if ((probe = dtrace_probes[desc.dtargd_id - 1]) == NULL) {
17405 lck_mtx_unlock(&dtrace_lock);
17406 lck_mtx_unlock(&mod_lock);
17407 lck_mtx_unlock(&dtrace_provider_lock);
17408 return (EINVAL);
17409 }
17410
17411 lck_mtx_unlock(&dtrace_lock);
17412
17413 prov = probe->dtpr_provider;
17414
17415 if (prov->dtpv_pops.dtps_getargdesc == NULL) {
17416 /*
17417 * There isn't any typed information for this probe.
17418 * Set the argument number to DTRACE_ARGNONE.
17419 */
17420 desc.dtargd_ndx = DTRACE_ARGNONE;
17421 } else {
17422 desc.dtargd_native[0] = '\0';
17423 desc.dtargd_xlate[0] = '\0';
17424 desc.dtargd_mapping = desc.dtargd_ndx;
17425
17426 prov->dtpv_pops.dtps_getargdesc(prov->dtpv_arg,
17427 probe->dtpr_id, probe->dtpr_arg, &desc);
17428 }
17429
17430 lck_mtx_unlock(&mod_lock);
17431 lck_mtx_unlock(&dtrace_provider_lock);
17432
17433 if (copyout(&desc, arg, sizeof (desc)) != 0)
17434 return (EFAULT);
17435
17436 return (0);
17437 }
17438
17439 case DTRACEIOC_GO: {
17440 processorid_t cpuid;
17441 rval = dtrace_state_go(state, &cpuid);
17442
17443 if (rval != 0)
17444 return (rval);
17445
17446 if (copyout(&cpuid, arg, sizeof (cpuid)) != 0)
17447 return (EFAULT);
17448
17449 return (0);
17450 }
17451
17452 case DTRACEIOC_STOP: {
17453 processorid_t cpuid;
17454
17455 lck_mtx_lock(&dtrace_lock);
17456 rval = dtrace_state_stop(state, &cpuid);
17457 lck_mtx_unlock(&dtrace_lock);
17458
17459 if (rval != 0)
17460 return (rval);
17461
17462 if (copyout(&cpuid, arg, sizeof (cpuid)) != 0)
17463 return (EFAULT);
17464
17465 return (0);
17466 }
17467
17468 case DTRACEIOC_DOFGET: {
17469 dof_hdr_t hdr, *dof;
17470 uint64_t len;
17471
17472 if (copyin(arg, &hdr, sizeof (hdr)) != 0)
17473 return (EFAULT);
17474
17475 lck_mtx_lock(&dtrace_lock);
17476 dof = dtrace_dof_create(state);
17477 lck_mtx_unlock(&dtrace_lock);
17478
17479 len = MIN(hdr.dofh_loadsz, dof->dofh_loadsz);
17480 rval = copyout(dof, arg, len);
17481 dtrace_dof_destroy(dof);
17482
17483 return (rval == 0 ? 0 : EFAULT);
17484 }
17485
17486 case DTRACEIOC_SLEEP: {
17487 int64_t time;
17488 uint64_t abstime;
17489 uint64_t rvalue = DTRACE_WAKE_TIMEOUT;
17490
17491 if (copyin(arg, &time, sizeof(time)) != 0)
17492 return (EFAULT);
17493
17494 nanoseconds_to_absolutetime((uint64_t)time, &abstime);
17495 clock_absolutetime_interval_to_deadline(abstime, &abstime);
17496
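		/*
		 * Block until either the deadline passes or another thread
		 * (dtrace_ast() or DTRACEIOC_SIGNAL) wakes us. If a buffer
		 * is already over its limit, don't block at all: report
		 * DTRACE_WAKE_BUF_LIMIT so the consumer drains it right away.
		 */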
17497 if (assert_wait_deadline(state, THREAD_ABORTSAFE, abstime) == THREAD_WAITING) {
17498 if (state->dts_buf_over_limit > 0) {
17499 clear_wait(current_thread(), THREAD_INTERRUPTED);
17500 rvalue = DTRACE_WAKE_BUF_LIMIT;
17501 } else {
17502 thread_block(THREAD_CONTINUE_NULL);
17503 if (state->dts_buf_over_limit > 0) {
17504 rvalue = DTRACE_WAKE_BUF_LIMIT;
17505 }
17506 }
17507 }
17508
17509 if (copyout(&rvalue, arg, sizeof(rvalue)) != 0)
17510 return (EFAULT);
17511
17512 return (0);
17513 }
17514
17515 case DTRACEIOC_SIGNAL: {
17516 wakeup(state);
17517 return (0);
17518 }
17519
17520 case DTRACEIOC_AGGSNAP:
17521 case DTRACEIOC_BUFSNAP: {
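		/*
		 * Snapshot one per-CPU buffer: DTRACEIOC_BUFSNAP targets the
		 * principal buffers, DTRACEIOC_AGGSNAP the aggregation
		 * buffers. Ring and fill buffers are copied out in place
		 * (once tracing has stopped); switching buffers are
		 * snapshotted by exchanging the active and inactive halves
		 * via a cross call.
		 */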
17522 dtrace_bufdesc_t desc;
17523 caddr_t cached;
17524 boolean_t over_limit;
17525 dtrace_buffer_t *buf;
17526
17527 if (copyin(arg, &desc, sizeof (desc)) != 0)
17528 return (EFAULT);
17529
17530 if ((int)desc.dtbd_cpu < 0 || desc.dtbd_cpu >= NCPU)
17531 return (EINVAL);
17532
17533 lck_mtx_lock(&dtrace_lock);
17534
17535 if (cmd == DTRACEIOC_BUFSNAP) {
17536 buf = &state->dts_buffer[desc.dtbd_cpu];
17537 } else {
17538 buf = &state->dts_aggbuffer[desc.dtbd_cpu];
17539 }
17540
17541 if (buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL)) {
17542 size_t sz = buf->dtb_offset;
17543
17544 if (state->dts_activity != DTRACE_ACTIVITY_STOPPED) {
17545 lck_mtx_unlock(&dtrace_lock);
17546 return (EBUSY);
17547 }
17548
17549 /*
17550 * If this buffer has already been consumed, we're
17551 * going to indicate that there's nothing left here
17552 * to consume.
17553 */
17554 if (buf->dtb_flags & DTRACEBUF_CONSUMED) {
17555 lck_mtx_unlock(&dtrace_lock);
17556
17557 desc.dtbd_size = 0;
17558 desc.dtbd_drops = 0;
17559 desc.dtbd_errors = 0;
17560 desc.dtbd_oldest = 0;
17561 sz = sizeof (desc);
17562
17563 if (copyout(&desc, arg, sz) != 0)
17564 return (EFAULT);
17565
17566 return (0);
17567 }
17568
17569 /*
17570 * If this is a ring buffer that has wrapped, we want
17571 * to copy the whole thing out.
17572 */
17573 if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
17574 dtrace_buffer_polish(buf);
17575 sz = buf->dtb_size;
17576 }
17577
17578 if (copyout(buf->dtb_tomax, (user_addr_t)desc.dtbd_data, sz) != 0) {
17579 lck_mtx_unlock(&dtrace_lock);
17580 return (EFAULT);
17581 }
17582
17583 desc.dtbd_size = sz;
17584 desc.dtbd_drops = buf->dtb_drops;
17585 desc.dtbd_errors = buf->dtb_errors;
17586 desc.dtbd_oldest = buf->dtb_xamot_offset;
17587 desc.dtbd_timestamp = dtrace_gethrtime();
17588
17589 lck_mtx_unlock(&dtrace_lock);
17590
17591 if (copyout(&desc, arg, sizeof (desc)) != 0)
17592 return (EFAULT);
17593
17594 buf->dtb_flags |= DTRACEBUF_CONSUMED;
17595
17596 return (0);
17597 }
17598
17599 if (buf->dtb_tomax == NULL) {
17600 ASSERT(buf->dtb_xamot == NULL);
17601 lck_mtx_unlock(&dtrace_lock);
17602 return (ENOENT);
17603 }
17604
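		/*
		 * Remember the active buffer so we can tell below whether the
		 * switch actually took place, and whether the buffer was at
		 * its full size (the limit is raised to dtb_size once it is
		 * crossed), so that we can adjust the over-limit count after
		 * the switch.
		 */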
17605 cached = buf->dtb_tomax;
17606 over_limit = buf->dtb_cur_limit == buf->dtb_size;
17607
17608 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
17609
17610 dtrace_xcall(desc.dtbd_cpu,
17611 (dtrace_xcall_t)dtrace_buffer_switch, buf);
17612
17613 state->dts_errors += buf->dtb_xamot_errors;
17614
17615 /*
17616 * If the buffers did not actually switch, then the cross call
17617 * did not take place -- presumably because the given CPU is
17618 * not in the ready set. If this is the case, we'll return
17619 * ENOENT.
17620 */
17621 if (buf->dtb_tomax == cached) {
17622 ASSERT(buf->dtb_xamot != cached);
17623 lck_mtx_unlock(&dtrace_lock);
17624 return (ENOENT);
17625 }
17626
17627 ASSERT(cached == buf->dtb_xamot);
		/*
		 * At this point we know the buffers have switched, so we
		 * can decrement the over-limit count if the old buffer was
		 * over its limit. The new buffer might already be over its
		 * limit, but we don't care, since we're guaranteed not to
		 * be checking the over-limit count at this point.
		 */
17635 if (over_limit) {
17636 uint32_t old = atomic_add_32(&state->dts_buf_over_limit, -1);
17637 #pragma unused(old)
17638
17639 /*
17640 * Verify that we didn't underflow the value
17641 */
17642 ASSERT(old != 0);
17643 }
17644
17645 /*
17646 * We have our snapshot; now copy it out.
17647 */
17648 if (dtrace_buffer_copyout(buf->dtb_xamot,
17649 (user_addr_t)desc.dtbd_data,
17650 buf->dtb_xamot_offset) != 0) {
17651 lck_mtx_unlock(&dtrace_lock);
17652 return (EFAULT);
17653 }
17654
17655 desc.dtbd_size = buf->dtb_xamot_offset;
17656 desc.dtbd_drops = buf->dtb_xamot_drops;
17657 desc.dtbd_errors = buf->dtb_xamot_errors;
17658 desc.dtbd_oldest = 0;
17659 desc.dtbd_timestamp = buf->dtb_switched;
17660
17661 lck_mtx_unlock(&dtrace_lock);
17662
17663 /*
17664 * Finally, copy out the buffer description.
17665 */
17666 if (copyout(&desc, arg, sizeof (desc)) != 0)
17667 return (EFAULT);
17668
17669 return (0);
17670 }
17671
17672 case DTRACEIOC_CONF: {
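		/*
		 * Report the framework's static configuration: the DIF
		 * version, the number of integer and tuple registers
		 * available to DIF objects, and the native CTF data model.
		 */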
17673 dtrace_conf_t conf;
17674
17675 bzero(&conf, sizeof (conf));
17676 conf.dtc_difversion = DIF_VERSION;
17677 conf.dtc_difintregs = DIF_DIR_NREGS;
17678 conf.dtc_diftupregs = DIF_DTR_NREGS;
17679 conf.dtc_ctfmodel = CTF_MODEL_NATIVE;
17680
17681 if (copyout(&conf, arg, sizeof (conf)) != 0)
17682 return (EFAULT);
17683
17684 return (0);
17685 }
17686
17687 case DTRACEIOC_STATUS: {
17688 dtrace_status_t stat;
17689 dtrace_dstate_t *dstate;
17690 int i, j;
17691 uint64_t nerrs;
17692
17693 /*
17694 * See the comment in dtrace_state_deadman() for the reason
17695 * for setting dts_laststatus to INT64_MAX before setting
17696 * it to the correct value.
17697 */
17698 state->dts_laststatus = INT64_MAX;
17699 dtrace_membar_producer();
17700 state->dts_laststatus = dtrace_gethrtime();
17701
17702 bzero(&stat, sizeof (stat));
17703
17704 lck_mtx_lock(&dtrace_lock);
17705
17706 if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE) {
17707 lck_mtx_unlock(&dtrace_lock);
17708 return (ENOENT);
17709 }
17710
17711 if (state->dts_activity == DTRACE_ACTIVITY_DRAINING)
17712 stat.dtst_exiting = 1;
17713
17714 nerrs = state->dts_errors;
17715 dstate = &state->dts_vstate.dtvs_dynvars;
17716
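		/*
		 * Sum the dynamic-variable drop counts and buffer error
		 * counts across CPUs, note any principal buffers that have
		 * filled, and gather the per-speculation drop counts.
		 */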
17717 for (i = 0; i < (int)NCPU; i++) {
17718 dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[i];
17719
17720 stat.dtst_dyndrops += dcpu->dtdsc_drops;
17721 stat.dtst_dyndrops_dirty += dcpu->dtdsc_dirty_drops;
17722 stat.dtst_dyndrops_rinsing += dcpu->dtdsc_rinsing_drops;
17723
17724 if (state->dts_buffer[i].dtb_flags & DTRACEBUF_FULL)
17725 stat.dtst_filled++;
17726
17727 nerrs += state->dts_buffer[i].dtb_errors;
17728
17729 for (j = 0; j < state->dts_nspeculations; j++) {
17730 dtrace_speculation_t *spec;
17731 dtrace_buffer_t *buf;
17732
17733 spec = &state->dts_speculations[j];
17734 buf = &spec->dtsp_buffer[i];
17735 stat.dtst_specdrops += buf->dtb_xamot_drops;
17736 }
17737 }
17738
17739 stat.dtst_specdrops_busy = state->dts_speculations_busy;
17740 stat.dtst_specdrops_unavail = state->dts_speculations_unavail;
17741 stat.dtst_stkstroverflows = state->dts_stkstroverflows;
17742 stat.dtst_dblerrors = state->dts_dblerrors;
17743 stat.dtst_killed =
17744 (state->dts_activity == DTRACE_ACTIVITY_KILLED);
17745 stat.dtst_errors = nerrs;
17746
17747 lck_mtx_unlock(&dtrace_lock);
17748
17749 if (copyout(&stat, arg, sizeof (stat)) != 0)
17750 return (EFAULT);
17751
17752 return (0);
17753 }
17754
17755 case DTRACEIOC_FORMAT: {
17756 dtrace_fmtdesc_t fmt;
17757 char *str;
17758 int len;
17759
17760 if (copyin(arg, &fmt, sizeof (fmt)) != 0)
17761 return (EFAULT);
17762
17763 lck_mtx_lock(&dtrace_lock);
17764
17765 if (fmt.dtfd_format == 0 ||
17766 fmt.dtfd_format > state->dts_nformats) {
17767 lck_mtx_unlock(&dtrace_lock);
17768 return (EINVAL);
17769 }
17770
17771 /*
17772 * Format strings are allocated contiguously and they are
17773 * never freed; if a format index is less than the number
17774 * of formats, we can assert that the format map is non-NULL
17775 * and that the format for the specified index is non-NULL.
17776 */
17777 ASSERT(state->dts_formats != NULL);
17778 str = state->dts_formats[fmt.dtfd_format - 1];
17779 ASSERT(str != NULL);
17780
17781 len = strlen(str) + 1;
17782
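		/*
		 * If the user's buffer is too small to hold the format
		 * string, hand back the required length so the consumer can
		 * reallocate and retry.
		 */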
17783 if (len > fmt.dtfd_length) {
17784 fmt.dtfd_length = len;
17785
17786 if (copyout(&fmt, arg, sizeof (fmt)) != 0) {
17787 lck_mtx_unlock(&dtrace_lock);
17788 return (EINVAL);
17789 }
17790 } else {
17791 if (copyout(str, (user_addr_t)fmt.dtfd_string, len) != 0) {
17792 lck_mtx_unlock(&dtrace_lock);
17793 return (EINVAL);
17794 }
17795 }
17796
17797 lck_mtx_unlock(&dtrace_lock);
17798 return (0);
17799 }
17800
17801 case DTRACEIOC_MODUUIDSLIST: {
17802 size_t module_uuids_list_size;
17803 dtrace_module_uuids_list_t* uuids_list;
17804 uint64_t dtmul_count;
17805
		/*
		 * Security restrictions can make this operation illegal;
		 * when they are in effect, DTrace must refuse to provide
		 * any fbt probes.
		 */
17810 if (dtrace_fbt_probes_restricted()) {
17811 cmn_err(CE_WARN, "security restrictions disallow DTRACEIOC_MODUUIDSLIST");
17812 return (EPERM);
17813 }
17814
		/*
		 * Fail if the kernel symbol mode makes this operation illegal.
		 * Both NEVER and ALWAYS_FROM_KERNEL are permanent states; it is
		 * therefore legal to check for them without holding dtrace_lock.
		 */
17820 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER ||
17821 dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL) {
17822 cmn_err(CE_WARN, "dtrace_kernel_symbol_mode of %u disallows DTRACEIOC_MODUUIDSLIST", dtrace_kernel_symbol_mode);
17823 return (EPERM);
17824 }
17825
		/*
		 * Read the number of module UUIDs being passed in.
		 */
17829 if (copyin(arg + offsetof(dtrace_module_uuids_list_t, dtmul_count),
17830 &dtmul_count,
17831 sizeof(dtmul_count))) {
17832 cmn_err(CE_WARN, "failed to copyin dtmul_count");
17833 return (EFAULT);
17834 }
17835
17836 /*
17837 * Range check the count. More than 2k kexts is probably an error.
17838 */
17839 if (dtmul_count > 2048) {
17840 cmn_err(CE_WARN, "dtmul_count is not valid");
17841 return (EINVAL);
17842 }
17843
		/*
		 * For all queries, we return EINVAL when the user-specified
		 * count does not match the actual number of modules we find
		 * available.
		 *
		 * If the user-specified count is zero, then this serves as a
		 * simple query to count the available modules in need of symbols.
		 */
17852
17853 rval = 0;
17854
17855 if (dtmul_count == 0)
17856 {
17857 lck_mtx_lock(&mod_lock);
17858 struct modctl* ctl = dtrace_modctl_list;
17859 while (ctl) {
17860 /* Update the private probes bit */
17861 if (dtrace_provide_private_probes)
17862 ctl->mod_flags |= MODCTL_FBT_PROVIDE_PRIVATE_PROBES;
17863
17864 ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl));
17865 if (!MOD_SYMBOLS_DONE(ctl) && !MOD_IS_STATIC_KEXT(ctl)) {
17866 dtmul_count++;
17867 rval = EINVAL;
17868 }
17869 ctl = ctl->mod_next;
17870 }
17871 lck_mtx_unlock(&mod_lock);
17872
17873 if (copyout(&dtmul_count, arg, sizeof (dtmul_count)) != 0)
17874 return (EFAULT);
17875 else
17876 return (rval);
17877 }
17878
17879 /*
17880 * If we reach this point, then we have a request for full list data.
17881 * Allocate a correctly sized structure and copyin the data.
17882 */
17883 module_uuids_list_size = DTRACE_MODULE_UUIDS_LIST_SIZE(dtmul_count);
17884 if ((uuids_list = kmem_alloc(module_uuids_list_size, KM_SLEEP)) == NULL)
17885 return (ENOMEM);
17886
		/* NOTE! From here on, exit via moduuidslist_cleanup so the allocation is freed; do not return directly. */
17888 if (copyin(arg, uuids_list, module_uuids_list_size) != 0) {
17889 cmn_err(CE_WARN, "failed copyin of dtrace_module_uuids_list_t");
17890 rval = EFAULT;
17891 goto moduuidslist_cleanup;
17892 }
17893
17894 /*
17895 * Check that the count didn't change between the first copyin and the second.
17896 */
17897 if (uuids_list->dtmul_count != dtmul_count) {
17898 rval = EINVAL;
17899 goto moduuidslist_cleanup;
17900 }
17901
		/*
		 * Build the list of UUIDs that need symbols.
		 */
17905 lck_mtx_lock(&mod_lock);
17906
17907 dtmul_count = 0;
17908
17909 struct modctl* ctl = dtrace_modctl_list;
17910 while (ctl) {
17911 /* Update the private probes bit */
17912 if (dtrace_provide_private_probes)
17913 ctl->mod_flags |= MODCTL_FBT_PROVIDE_PRIVATE_PROBES;
17914
			/*
			 * We assume that userspace symbols will be "better" than
			 * kernel-level symbols, as userspace can search for dSYMs and
			 * symbolicated binaries. Even if kernel symbols are available,
			 * add user symbols if the module might use them.
			 */
17920 ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl));
17921 if (!MOD_SYMBOLS_DONE(ctl) && !MOD_IS_STATIC_KEXT(ctl)) {
17922 UUID* uuid = &uuids_list->dtmul_uuid[dtmul_count];
17923 if (dtmul_count++ < uuids_list->dtmul_count) {
17924 memcpy(uuid, ctl->mod_uuid, sizeof(UUID));
17925 }
17926 }
17927 ctl = ctl->mod_next;
17928 }
17929
17930 lck_mtx_unlock(&mod_lock);
17931
17932 if (uuids_list->dtmul_count < dtmul_count)
17933 rval = EINVAL;
17934
17935 uuids_list->dtmul_count = dtmul_count;
17936
17937 /*
17938 * Copyout the symbols list (or at least the count!)
17939 */
17940 if (copyout(uuids_list, arg, module_uuids_list_size) != 0) {
			cmn_err(CE_WARN, "failed copyout of dtrace_module_uuids_list_t");
17942 rval = EFAULT;
17943 }
17944
17945 moduuidslist_cleanup:
17946 /*
17947 * If we had to allocate struct memory, free it.
17948 */
17949 if (uuids_list != NULL) {
17950 kmem_free(uuids_list, module_uuids_list_size);
17951 }
17952
17953 return rval;
17954 }
17955
17956 case DTRACEIOC_PROVMODSYMS: {
17957 size_t module_symbols_size;
17958 dtrace_module_symbols_t* module_symbols;
17959 uint64_t dtmodsyms_count;
17960
		/*
		 * Security restrictions can make this operation illegal;
		 * when they are in effect, DTrace must refuse to provide
		 * any fbt probes.
		 */
17965 if (dtrace_fbt_probes_restricted()) {
			cmn_err(CE_WARN, "security restrictions disallow DTRACEIOC_PROVMODSYMS");
17967 return (EPERM);
17968 }
17969
		/*
		 * Fail if the kernel symbol mode makes this operation illegal.
		 * Both NEVER and ALWAYS_FROM_KERNEL are permanent states; it is
		 * therefore legal to check for them without holding dtrace_lock.
		 */
17975 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER ||
17976 dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL) {
17977 cmn_err(CE_WARN, "dtrace_kernel_symbol_mode of %u disallows DTRACEIOC_PROVMODSYMS", dtrace_kernel_symbol_mode);
17978 return (EPERM);
17979 }
17980
17981 /*
17982 * Read the number of module symbols structs being passed in.
17983 */
17984 if (copyin(arg + offsetof(dtrace_module_symbols_t, dtmodsyms_count),
17985 &dtmodsyms_count,
17986 sizeof(dtmodsyms_count))) {
17987 cmn_err(CE_WARN, "failed to copyin dtmodsyms_count");
17988 return (EFAULT);
17989 }
17990
17991 /*
17992 * Range check the count. How much data can we pass around?
17993 * FIX ME!
17994 */
17995 if (dtmodsyms_count == 0 || (dtmodsyms_count > 100 * 1024)) {
17996 cmn_err(CE_WARN, "dtmodsyms_count is not valid");
17997 return (EINVAL);
17998 }
17999
18000 /*
18001 * Allocate a correctly sized structure and copyin the data.
18002 */
18003 module_symbols_size = DTRACE_MODULE_SYMBOLS_SIZE(dtmodsyms_count);
18004 if ((module_symbols = kmem_alloc(module_symbols_size, KM_SLEEP)) == NULL)
18005 return (ENOMEM);
18006
18007 rval = 0;
18008
		/* NOTE! From here on, exit via module_symbols_cleanup so the allocation is freed; do not return directly. */
18010 if (copyin(arg, module_symbols, module_symbols_size) != 0) {
18011 cmn_err(CE_WARN, "failed copyin of dtrace_module_symbols_t");
18012 rval = EFAULT;
18013 goto module_symbols_cleanup;
18014 }
18015
18016 /*
18017 * Check that the count didn't change between the first copyin and the second.
18018 */
18019 if (module_symbols->dtmodsyms_count != dtmodsyms_count) {
18020 rval = EINVAL;
18021 goto module_symbols_cleanup;
18022 }
18023
18024 /*
18025 * Find the modctl to add symbols to.
18026 */
18027 lck_mtx_lock(&dtrace_provider_lock);
18028 lck_mtx_lock(&mod_lock);
18029
18030 struct modctl* ctl = dtrace_modctl_list;
18031 while (ctl) {
18032 /* Update the private probes bit */
18033 if (dtrace_provide_private_probes)
18034 ctl->mod_flags |= MODCTL_FBT_PROVIDE_PRIVATE_PROBES;
18035
18036 ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl));
18037 if (MOD_HAS_UUID(ctl) && !MOD_SYMBOLS_DONE(ctl) && memcmp(module_symbols->dtmodsyms_uuid, ctl->mod_uuid, sizeof(UUID)) == 0) {
18038 dtrace_provider_t *prv;
18039 ctl->mod_user_symbols = module_symbols;
18040
				/*
				 * We're going to call each provider's per-module provide
				 * operation, specifying only this module.
				 */
18045 for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
18046 prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
				/*
				 * We gave every provider a chance to provide with the user
				 * symbols; go ahead and clear them.
				 */
18050 ctl->mod_user_symbols = NULL; /* MUST reset this to clear HAS_USERSPACE_SYMBOLS */
18051 }
18052 ctl = ctl->mod_next;
18053 }
18054
18055 lck_mtx_unlock(&mod_lock);
18056 lck_mtx_unlock(&dtrace_provider_lock);
18057
18058 module_symbols_cleanup:
18059 /*
18060 * If we had to allocate struct memory, free it.
18061 */
18062 if (module_symbols != NULL) {
18063 kmem_free(module_symbols, module_symbols_size);
18064 }
18065
18066 return rval;
18067 }
18068
18069 case DTRACEIOC_PROCWAITFOR: {
18070 dtrace_procdesc_t pdesc = {
18071 .p_name = {0},
18072 .p_pid = -1
18073 };
18074
18075 if ((rval = copyin(arg, &pdesc, sizeof(pdesc))) != 0)
18076 goto proc_waitfor_error;
18077
18078 if ((rval = dtrace_proc_waitfor(&pdesc)) != 0)
18079 goto proc_waitfor_error;
18080
18081 if ((rval = copyout(&pdesc, arg, sizeof(pdesc))) != 0)
18082 goto proc_waitfor_error;
18083
18084 return 0;
18085
18086 proc_waitfor_error:
		/* The process was suspended; resume it, since the client will not. */
18088 if (pdesc.p_pid != -1) {
18089 proc_t *proc = proc_find(pdesc.p_pid);
18090 if (proc != PROC_NULL) {
18091 task_pidresume(proc->task);
18092 proc_rele(proc);
18093 }
18094 }
18095
18096 return rval;
18097 }
18098
18099 default:
18100 break;
18101 }
18102
18103 return (ENOTTY);
18104}
18105
18106/*
18107 * APPLE NOTE: dtrace_detach not implemented
18108 */
18109#if !defined(__APPLE__)
18110/*ARGSUSED*/
18111static int
18112dtrace_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
18113{
18114 dtrace_state_t *state;
18115
18116 switch (cmd) {
18117 case DDI_DETACH:
18118 break;
18119
18120 case DDI_SUSPEND:
18121 return (DDI_SUCCESS);
18122
18123 default:
18124 return (DDI_FAILURE);
18125 }
18126
18127 lck_mtx_lock(&cpu_lock);
18128 lck_mtx_lock(&dtrace_provider_lock);
18129 lck_mtx_lock(&dtrace_lock);
18130
18131 ASSERT(dtrace_opens == 0);
18132
18133 if (dtrace_helpers > 0) {
18134 lck_mtx_unlock(&dtrace_lock);
18135 lck_mtx_unlock(&dtrace_provider_lock);
18136 lck_mtx_unlock(&cpu_lock);
18137 return (DDI_FAILURE);
18138 }
18139
18140 if (dtrace_unregister((dtrace_provider_id_t)dtrace_provider) != 0) {
18141 lck_mtx_unlock(&dtrace_lock);
18142 lck_mtx_unlock(&dtrace_provider_lock);
18143 lck_mtx_unlock(&cpu_lock);
18144 return (DDI_FAILURE);
18145 }
18146
18147 dtrace_provider = NULL;
18148
18149 if ((state = dtrace_anon_grab()) != NULL) {
		/*
		 * If there were ECBs on this state, the provider should not
		 * have been allowed to detach; assert that there are none.
		 */
18155 ASSERT(state->dts_necbs == 0);
18156 dtrace_state_destroy(state);
18157
18158 /*
18159 * If we're being detached with anonymous state, we need to
18160 * indicate to the kernel debugger that DTrace is now inactive.
18161 */
18162 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
18163 }
18164
18165 bzero(&dtrace_anon, sizeof (dtrace_anon_t));
18166 unregister_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
18167 dtrace_cpu_init = NULL;
18168 dtrace_helpers_cleanup = NULL;
18169 dtrace_helpers_fork = NULL;
18170 dtrace_cpustart_init = NULL;
18171 dtrace_cpustart_fini = NULL;
18172 dtrace_debugger_init = NULL;
18173 dtrace_debugger_fini = NULL;
18174 dtrace_kreloc_init = NULL;
18175 dtrace_kreloc_fini = NULL;
18176 dtrace_modload = NULL;
18177 dtrace_modunload = NULL;
18178
18179 lck_mtx_unlock(&cpu_lock);
18180
18181 if (dtrace_helptrace_enabled) {
18182 kmem_free(dtrace_helptrace_buffer, dtrace_helptrace_bufsize);
18183 dtrace_helptrace_buffer = NULL;
18184 }
18185
18186 kmem_free(dtrace_probes, dtrace_nprobes * sizeof (dtrace_probe_t *));
18187 dtrace_probes = NULL;
18188 dtrace_nprobes = 0;
18189
18190 dtrace_hash_destroy(dtrace_strings);
18191 dtrace_hash_destroy(dtrace_byprov);
18192 dtrace_hash_destroy(dtrace_bymod);
18193 dtrace_hash_destroy(dtrace_byfunc);
18194 dtrace_hash_destroy(dtrace_byname);
18195 dtrace_strings = NULL;
18196 dtrace_byprov = NULL;
18197 dtrace_bymod = NULL;
18198 dtrace_byfunc = NULL;
18199 dtrace_byname = NULL;
18200
18201 kmem_cache_destroy(dtrace_state_cache);
18202 vmem_destroy(dtrace_arena);
18203
18204 if (dtrace_toxrange != NULL) {
18205 kmem_free(dtrace_toxrange,
18206 dtrace_toxranges_max * sizeof (dtrace_toxrange_t));
18207 dtrace_toxrange = NULL;
18208 dtrace_toxranges = 0;
18209 dtrace_toxranges_max = 0;
18210 }
18211
18212 ddi_remove_minor_node(dtrace_devi, NULL);
18213 dtrace_devi = NULL;
18214
18215 ddi_soft_state_fini(&dtrace_softstate);
18216
18217 ASSERT(dtrace_vtime_references == 0);
18218 ASSERT(dtrace_opens == 0);
18219 ASSERT(dtrace_retained == NULL);
18220
18221 lck_mtx_unlock(&dtrace_lock);
18222 lck_mtx_unlock(&dtrace_provider_lock);
18223
18224#ifdef illumos
18225 /*
18226 * We don't destroy the task queue until after we have dropped our
18227 * locks (taskq_destroy() may block on running tasks). To prevent
18228 * attempting to do work after we have effectively detached but before
18229 * the task queue has been destroyed, all tasks dispatched via the
18230 * task queue must check that DTrace is still attached before
18231 * performing any operation.
18232 */
18233 taskq_destroy(dtrace_taskq);
18234 dtrace_taskq = NULL;
18235#endif
18236
18237 return (DDI_SUCCESS);
18238}
18239#endif /* __APPLE__ */
18240
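/*
 * Darwin devfs entry points for the dtrace and helper devices. The
 * _dtrace_* shims adapt the BSD cdevsw calling convention to the
 * Solaris-style dtrace_open()/dtrace_close()/dtrace_ioctl() routines
 * above, supplying the caller's credential via CRED().
 */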
18241d_open_t _dtrace_open, helper_open;
18242d_close_t _dtrace_close, helper_close;
18243d_ioctl_t _dtrace_ioctl, helper_ioctl;
18244
18245int
18246_dtrace_open(dev_t dev, int flags, int devtype, struct proc *p)
18247{
18248#pragma unused(p)
18249 dev_t locdev = dev;
18250
18251 return dtrace_open( &locdev, flags, devtype, CRED());
18252}
18253
18254int
18255helper_open(dev_t dev, int flags, int devtype, struct proc *p)
18256{
18257#pragma unused(dev,flags,devtype,p)
18258 return 0;
18259}
18260
18261int
18262_dtrace_close(dev_t dev, int flags, int devtype, struct proc *p)
18263{
18264#pragma unused(p)
18265 return dtrace_close( dev, flags, devtype, CRED());
18266}
18267
18268int
18269helper_close(dev_t dev, int flags, int devtype, struct proc *p)
18270{
18271#pragma unused(dev,flags,devtype,p)
18272 return 0;
18273}
18274
18275int
18276_dtrace_ioctl(dev_t dev, u_long cmd, caddr_t data, int fflag, struct proc *p)
18277{
18278#pragma unused(p)
18279 int err, rv = 0;
18280 user_addr_t uaddrp;
18281
18282 if (proc_is64bit(p))
18283 uaddrp = *(user_addr_t *)data;
18284 else
18285 uaddrp = (user_addr_t) *(uint32_t *)data;
18286
18287 err = dtrace_ioctl(dev, cmd, uaddrp, fflag, CRED(), &rv);
18288
18289 /* Darwin's BSD ioctls only return -1 or zero. Overload errno to mimic Solaris. 20 bits suffice. */
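	/*
	 * For example: a Solaris-style return value rv == 5 reaches
	 * userspace as errno (5 << 12) == 20480 and is recovered as
	 * errno >> 12, while a plain error such as EBUSY stays below
	 * 4096 and is passed through unchanged.
	 */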
18290 if (err != 0) {
18291 ASSERT( (err & 0xfffff000) == 0 );
18292 return (err & 0xfff); /* ioctl will return -1 and will set errno to an error code < 4096 */
18293 } else if (rv != 0) {
18294 ASSERT( (rv & 0xfff00000) == 0 );
18295 return (((rv & 0xfffff) << 12)); /* ioctl will return -1 and will set errno to a value >= 4096 */
18296 } else
18297 return 0;
18298}
18299
18300int
18301helper_ioctl(dev_t dev, u_long cmd, caddr_t data, int fflag, struct proc *p)
18302{
18303#pragma unused(dev,fflag,p)
18304 int err, rv = 0;
18305
18306 err = dtrace_ioctl_helper(cmd, data, &rv);
18307 /* Darwin's BSD ioctls only return -1 or zero. Overload errno to mimic Solaris. 20 bits suffice. */
18308 if (err != 0) {
18309 ASSERT( (err & 0xfffff000) == 0 );
18310 return (err & 0xfff); /* ioctl will return -1 and will set errno to an error code < 4096 */
18311 } else if (rv != 0) {
18312 ASSERT( (rv & 0xfff00000) == 0 );
18313 return (((rv & 0xfffff) << 12)); /* ioctl will return -1 and will set errno to a value >= 4096 */
18314 } else
18315 return 0;
18316}
18317
18318#define HELPER_MAJOR -24 /* let the kernel pick the device number */
18319
18320/*
18321 * A struct describing which functions will get invoked for certain
18322 * actions.
18323 */
18324static struct cdevsw helper_cdevsw =
18325{
18326 helper_open, /* open */
18327 helper_close, /* close */
18328 eno_rdwrt, /* read */
18329 eno_rdwrt, /* write */
18330 helper_ioctl, /* ioctl */
18331 (stop_fcn_t *)nulldev, /* stop */
18332 (reset_fcn_t *)nulldev, /* reset */
18333 NULL, /* tty's */
18334 eno_select, /* select */
18335 eno_mmap, /* mmap */
18336 eno_strat, /* strategy */
18337 eno_getc, /* getc */
18338 eno_putc, /* putc */
18339 0 /* type */
18340};
18341
18342static int helper_majdevno = 0;
18343
18344static int gDTraceInited = 0;
18345
18346void
18347helper_init( void )
18348{
18349 /*
18350 * Once the "helper" is initialized, it can take ioctl calls that use locks
18351 * and zones initialized in dtrace_init. Make certain dtrace_init was called
18352 * before us.
18353 */
18354
18355 if (!gDTraceInited) {
18356 panic("helper_init before dtrace_init\n");
18357 }
18358
18359 if (0 >= helper_majdevno)
18360 {
18361 helper_majdevno = cdevsw_add(HELPER_MAJOR, &helper_cdevsw);
18362
18363 if (helper_majdevno < 0) {
18364 printf("helper_init: failed to allocate a major number!\n");
18365 return;
18366 }
18367
18368 if (NULL == devfs_make_node( makedev(helper_majdevno, 0), DEVFS_CHAR, UID_ROOT, GID_WHEEL, 0666,
18369 DTRACEMNR_HELPER, 0 )) {
18370 printf("dtrace_init: failed to devfs_make_node for helper!\n");
18371 return;
18372 }
18373 } else
18374 panic("helper_init: called twice!\n");
18375}
18376
18377#undef HELPER_MAJOR
18378
18379static int
18380dtrace_clone_func(dev_t dev, int action)
18381{
18382#pragma unused(dev)
18383
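	/*
	 * Each open of /dev/dtrace is cloned onto its own minor number;
	 * DEVFS_CLONE_ALLOC reserves the next available minor (and with
	 * it a consumer state slot) via dtrace_state_reserve().
	 */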
18384 if (action == DEVFS_CLONE_ALLOC) {
18385 return dtrace_state_reserve();
18386 }
18387 else if (action == DEVFS_CLONE_FREE) {
18388 return 0;
18389 }
18390 else return -1;
18391}
18392
18393void dtrace_ast(void);
18394
18395void
18396dtrace_ast(void)
18397{
18398 int i;
18399 uint32_t clients = atomic_and_32(&dtrace_wake_clients, 0);
18400 if (clients == 0)
18401 return;
	/*
	 * We disable preemption here to be sure that we won't be
	 * preempted by a woken thread of higher priority before we
	 * have issued all of the wakeups.
	 */
18407 disable_preemption();
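	/*
	 * Each set bit in the mask names a client (minor number) whose
	 * consumer should be woken; the atomic_and_32() above fetched
	 * the previous mask and cleared it in a single step.
	 */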
18408 for (i = 0; i < DTRACE_NCLIENTS; i++) {
18409 if (clients & (1 << i)) {
18410 dtrace_state_t *state = dtrace_state_get(i);
18411 if (state) {
18412 wakeup(state);
18413 }
18414
18415 }
18416 }
18417 enable_preemption();
18418}
18419
18420
18421#define DTRACE_MAJOR -24 /* let the kernel pick the device number */
18422
18423static struct cdevsw dtrace_cdevsw =
18424{
18425 _dtrace_open, /* open */
18426 _dtrace_close, /* close */
18427 eno_rdwrt, /* read */
18428 eno_rdwrt, /* write */
18429 _dtrace_ioctl, /* ioctl */
18430 (stop_fcn_t *)nulldev, /* stop */
18431 (reset_fcn_t *)nulldev, /* reset */
18432 NULL, /* tty's */
18433 eno_select, /* select */
18434 eno_mmap, /* mmap */
18435 eno_strat, /* strategy */
18436 eno_getc, /* getc */
18437 eno_putc, /* putc */
18438 0 /* type */
18439};
18440
18441lck_attr_t* dtrace_lck_attr;
18442lck_grp_attr_t* dtrace_lck_grp_attr;
18443lck_grp_t* dtrace_lck_grp;
18444
18445static int gMajDevNo;
18446
18447void dtrace_early_init (void)
18448{
18449 dtrace_restriction_policy_load();
18450
18451 /*
18452 * See dtrace_impl.h for a description of kernel symbol modes.
18453 * The default is to wait for symbols from userspace (lazy symbols).
18454 */
18455 if (!PE_parse_boot_argn("dtrace_kernel_symbol_mode", &dtrace_kernel_symbol_mode, sizeof (dtrace_kernel_symbol_mode))) {
18456 dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE;
18457 }
18458}
18459
18460void
18461dtrace_init( void )
18462{
18463 if (0 == gDTraceInited) {
18464 int i, ncpu;
18465 size_t size = sizeof(dtrace_buffer_memory_maxsize);
18466
		/*
		 * DTrace allocates buffers based on the maximum number
		 * of enabled CPUs. This call avoids any race when finding
		 * that count.
		 */
18472 ASSERT(dtrace_max_cpus == 0);
18473 ncpu = dtrace_max_cpus = ml_get_max_cpus();
18474
		/*
		 * Retrieve the size of the physical memory in order to define
		 * the state buffer memory maximal size. If we cannot retrieve
		 * this value, we'll assume 1 GB of memory per CPU; that's
		 * still better than raising a kernel panic.
		 */
18481 if (0 != kernel_sysctlbyname("hw.memsize", &dtrace_buffer_memory_maxsize,
18482 &size, NULL, 0))
18483 {
18484 dtrace_buffer_memory_maxsize = ncpu * 1024 * 1024 * 1024;
18485 printf("dtrace_init: failed to retrieve the hw.memsize, defaulted to %lld bytes\n",
18486 dtrace_buffer_memory_maxsize);
18487 }
18488
18489 /*
18490 * Finally, divide by three to prevent DTrace from eating too
18491 * much memory.
18492 */
18493 dtrace_buffer_memory_maxsize /= 3;
18494 ASSERT(dtrace_buffer_memory_maxsize > 0);
18495
18496 gMajDevNo = cdevsw_add(DTRACE_MAJOR, &dtrace_cdevsw);
18497
18498 if (gMajDevNo < 0) {
18499 printf("dtrace_init: failed to allocate a major number!\n");
18500 gDTraceInited = 0;
18501 return;
18502 }
18503
18504 if (NULL == devfs_make_node_clone( makedev(gMajDevNo, 0), DEVFS_CHAR, UID_ROOT, GID_WHEEL, 0666,
18505 dtrace_clone_func, DTRACEMNR_DTRACE, 0 )) {
18506 printf("dtrace_init: failed to devfs_make_node_clone for dtrace!\n");
18507 gDTraceInited = 0;
18508 return;
18509 }
18510
18511 /*
18512 * Allocate the dtrace_probe_t zone
18513 */
18514 dtrace_probe_t_zone = zinit(sizeof(dtrace_probe_t),
18515 1024 * sizeof(dtrace_probe_t),
18516 sizeof(dtrace_probe_t),
18517 "dtrace.dtrace_probe_t");
18518
18519 /*
18520 * Create the dtrace lock group and attrs.
18521 */
18522 dtrace_lck_attr = lck_attr_alloc_init();
18523 dtrace_lck_grp_attr= lck_grp_attr_alloc_init();
18524 dtrace_lck_grp = lck_grp_alloc_init("dtrace", dtrace_lck_grp_attr);
18525
18526 /*
18527 * We have to initialize all locks explicitly
18528 */
18529 lck_mtx_init(&dtrace_lock, dtrace_lck_grp, dtrace_lck_attr);
18530 lck_mtx_init(&dtrace_provider_lock, dtrace_lck_grp, dtrace_lck_attr);
18531 lck_mtx_init(&dtrace_meta_lock, dtrace_lck_grp, dtrace_lck_attr);
18532 lck_mtx_init(&dtrace_procwaitfor_lock, dtrace_lck_grp, dtrace_lck_attr);
18533#if DEBUG
18534 lck_mtx_init(&dtrace_errlock, dtrace_lck_grp, dtrace_lck_attr);
18535#endif
18536 lck_rw_init(&dtrace_dof_mode_lock, dtrace_lck_grp, dtrace_lck_attr);
18537
18538 /*
18539 * The cpu_core structure consists of per-CPU state available in any context.
18540 * On some architectures, this may mean that the page(s) containing the
18541 * NCPU-sized array of cpu_core structures must be locked in the TLB -- it
18542 * is up to the platform to assure that this is performed properly. Note that
18543 * the structure is sized to avoid false sharing.
18544 */
18545 lck_mtx_init(&cpu_lock, dtrace_lck_grp, dtrace_lck_attr);
18546 lck_mtx_init(&cyc_lock, dtrace_lck_grp, dtrace_lck_attr);
18547 lck_mtx_init(&mod_lock, dtrace_lck_grp, dtrace_lck_attr);
18548
18549 /*
18550 * Initialize the CPU offline/online hooks.
18551 */
18552 dtrace_install_cpu_hooks();
18553
18554 dtrace_modctl_list = NULL;
18555
18556 cpu_core = (cpu_core_t *)kmem_zalloc( ncpu * sizeof(cpu_core_t), KM_SLEEP );
18557 for (i = 0; i < ncpu; ++i) {
18558 lck_mtx_init(&cpu_core[i].cpuc_pid_lock, dtrace_lck_grp, dtrace_lck_attr);
18559 }
18560
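		/*
		 * The per-CPU entries are linked into a circular list via
		 * cpu_next; each also gets its own cyclic list and fasttrap
		 * lock.
		 */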
18561 cpu_list = (dtrace_cpu_t *)kmem_zalloc( ncpu * sizeof(dtrace_cpu_t), KM_SLEEP );
18562 for (i = 0; i < ncpu; ++i) {
18563 cpu_list[i].cpu_id = (processorid_t)i;
18564 cpu_list[i].cpu_next = &(cpu_list[(i+1) % ncpu]);
18565 LIST_INIT(&cpu_list[i].cpu_cyc_list);
18566 lck_rw_init(&cpu_list[i].cpu_ft_lock, dtrace_lck_grp, dtrace_lck_attr);
18567 }
18568
18569 lck_mtx_lock(&cpu_lock);
18570 for (i = 0; i < ncpu; ++i)
18571 /* FIXME: track CPU configuration */
18572 dtrace_cpu_setup_initial( (processorid_t)i ); /* In lieu of register_cpu_setup_func() callback */
18573 lck_mtx_unlock(&cpu_lock);
18574
18575 (void)dtrace_abs_to_nano(0LL); /* Force once only call to clock_timebase_info (which can take a lock) */
18576
18577 dtrace_strings = dtrace_hash_create(dtrace_strkey_offset,
18578 offsetof(dtrace_string_t, dtst_str),
18579 offsetof(dtrace_string_t, dtst_next),
18580 offsetof(dtrace_string_t, dtst_prev));
18581
18582 dtrace_isa_init();
18583 /*
18584 * See dtrace_impl.h for a description of dof modes.
18585 * The default is lazy dof.
18586 *
18587 * FIXME: Warn if state is LAZY_OFF? It won't break anything, but
18588 * makes no sense...
18589 */
18590 if (!PE_parse_boot_argn("dtrace_dof_mode", &dtrace_dof_mode, sizeof (dtrace_dof_mode))) {
18591#if CONFIG_EMBEDDED
18592 /* Disable DOF mode by default for performance reasons */
18593 dtrace_dof_mode = DTRACE_DOF_MODE_NEVER;
18594#else
18595 dtrace_dof_mode = DTRACE_DOF_MODE_LAZY_ON;
18596#endif
18597 }
18598
18599 /*
18600 * Sanity check of dof mode value.
18601 */
18602 switch (dtrace_dof_mode) {
18603 case DTRACE_DOF_MODE_NEVER:
18604 case DTRACE_DOF_MODE_LAZY_ON:
18605 /* valid modes, but nothing else we need to do */
18606 break;
18607
18608 case DTRACE_DOF_MODE_LAZY_OFF:
18609 case DTRACE_DOF_MODE_NON_LAZY:
18610 /* Cannot wait for a dtrace_open to init fasttrap */
18611 fasttrap_init();
18612 break;
18613
18614 default:
			/* Invalid, clamp to non-lazy */
18616 dtrace_dof_mode = DTRACE_DOF_MODE_NON_LAZY;
18617 fasttrap_init();
18618 break;
18619 }
18620
18621 gDTraceInited = 1;
18622
18623 } else
18624 panic("dtrace_init: called twice!\n");
18625}
18626
18627void
18628dtrace_postinit(void)
18629{
	/*
	 * Called from bsd_init after all providers' *_init() routines have been
	 * run. That way, anonymous DOF enabled under dtrace_attach() is safe
	 * to go.
	 */
18635 dtrace_attach( (dev_info_t *)(uintptr_t)makedev(gMajDevNo, 0)); /* Punning a dev_t to a dev_info_t* */
18636
18637 /*
18638 * Add the mach_kernel to the module list for lazy processing
18639 */
18640 struct kmod_info fake_kernel_kmod;
18641 memset(&fake_kernel_kmod, 0, sizeof(fake_kernel_kmod));
18642
18643 strlcpy(fake_kernel_kmod.name, "mach_kernel", sizeof(fake_kernel_kmod.name));
18644 fake_kernel_kmod.id = 1;
18645 fake_kernel_kmod.address = g_kernel_kmod_info.address;
18646 fake_kernel_kmod.size = g_kernel_kmod_info.size;
18647
18648 if (dtrace_module_loaded(&fake_kernel_kmod, 0) != 0) {
18649 printf("dtrace_postinit: Could not register mach_kernel modctl\n");
18650 }
18651
18652 if (!PE_parse_boot_argn("dtrace_provide_private_probes", &dtrace_provide_private_probes, sizeof (dtrace_provide_private_probes))) {
18653 dtrace_provide_private_probes = 0;
18654 }
18655
18656 (void)OSKextRegisterKextsWithDTrace();
18657}
18658#undef DTRACE_MAJOR
18659
/*
 * Routines used to register interest in CPUs being added to or removed
 * from the system.
 */
18664void
18665register_cpu_setup_func(cpu_setup_func_t *ignore1, void *ignore2)
18666{
18667#pragma unused(ignore1,ignore2)
18668}
18669
18670void
18671unregister_cpu_setup_func(cpu_setup_func_t *ignore1, void *ignore2)
18672{
18673#pragma unused(ignore1,ignore2)
18674}
18675