/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Portions Copyright (c) 2013, 2016, Joyent, Inc. All rights reserved.
 * Portions Copyright (c) 2013 by Delphix. All rights reserved.
 */

/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/* #pragma ident "@(#)dtrace.c 1.65 08/07/02 SMI" */

/*
 * DTrace - Dynamic Tracing for Solaris
 *
 * This is the implementation of the Solaris Dynamic Tracing framework
 * (DTrace). The user-visible interface to DTrace is described at length in
 * the "Solaris Dynamic Tracing Guide". The interfaces between the libdtrace
 * library, the in-kernel DTrace framework, and the DTrace providers are
 * described in the block comments in the <sys/dtrace.h> header file. The
 * internal architecture of DTrace is described in the block comments in the
 * <sys/dtrace_impl.h> header file. The comments contained within the DTrace
 * implementation very much assume mastery of all of these sources; if one has
 * an unanswered question about the implementation, one should consult them
 * first.
 *
 * The functions here are ordered roughly as follows:
 *
 *   - Probe context functions
 *   - Probe hashing functions
 *   - Non-probe context utility functions
 *   - Matching functions
 *   - Provider-to-Framework API functions
 *   - Probe management functions
 *   - DIF object functions
 *   - Format functions
 *   - Predicate functions
 *   - ECB functions
 *   - Buffer functions
 *   - Enabling functions
 *   - DOF functions
 *   - Anonymous enabling functions
 *   - Process functions
 *   - Consumer state functions
 *   - Helper functions
 *   - Hook functions
 *   - Driver cookbook functions
 *
 * Each group of functions begins with a block comment labelled the "DTrace
 * [Group] Functions", allowing one to find each block by searching forward
 * on capital-f functions.
 */
#include <sys/errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/conf.h>
#include <sys/systm.h>
#include <sys/dtrace_impl.h>
#include <sys/param.h>
#include <sys/proc_internal.h>
#include <sys/ioctl.h>
#include <sys/fcntl.h>
#include <miscfs/devfs/devfs.h>
#include <sys/malloc.h>
#include <sys/kernel_types.h>
#include <sys/proc_internal.h>
#include <sys/uio_internal.h>
#include <sys/kauth.h>
#include <vm/pmap.h>
#include <sys/user.h>
#include <mach/exception_types.h>
#include <sys/signalvar.h>
#include <mach/task.h>
#include <kern/zalloc.h>
#include <kern/ast.h>
#include <kern/sched_prim.h>
#include <kern/task.h>
#include <netinet/in.h>
#include <libkern/sysctl.h>
#include <sys/kdebug.h>

#if MONOTONIC
#include <kern/monotonic.h>
#include <machine/monotonic.h>
#endif /* MONOTONIC */

#include <IOKit/IOPlatformExpert.h>

#include <kern/cpu_data.h>
extern uint32_t pmap_find_phys(void *, uint64_t);
extern boolean_t pmap_valid_page(uint32_t);
extern void OSKextRegisterKextsWithDTrace(void);
extern kmod_info_t g_kernel_kmod_info;

/* Solaris proc_t is the struct. Darwin's proc_t is a pointer to it. */
#define proc_t struct proc /* Steer clear of the Darwin typedef for proc_t */

#define t_predcache t_dtrace_predcache /* Cosmetic. Helps readability of thread.h */

extern void dtrace_suspend(void);
extern void dtrace_resume(void);
extern void dtrace_early_init(void);
extern int dtrace_keep_kernel_symbols(void);
extern void dtrace_init(void);
extern void helper_init(void);
extern void fasttrap_init(void);

static int dtrace_lazy_dofs_duplicate(proc_t *, proc_t *);
extern void dtrace_lazy_dofs_destroy(proc_t *);
extern void dtrace_postinit(void);

extern void dtrace_proc_fork(proc_t*, proc_t*, int);
extern void dtrace_proc_exec(proc_t*);
extern void dtrace_proc_exit(proc_t*);

/*
 * DTrace Tunable Variables
 *
 * The following variables may be dynamically tuned by using sysctl(8), the
 * variables being stored in the kern.dtrace namespace. For example:
 *	sysctl kern.dtrace.dof_maxsize = 1048575	# 1M
 *
 * In general, the only variables that one should be tuning this way are those
 * that affect system-wide DTrace behavior, and for which the default behavior
 * is undesirable. Most of these variables are tunable on a per-consumer
 * basis using DTrace options, and need not be tuned on a system-wide basis.
 * When tuning these variables, avoid pathological values; while some attempt
 * is made to verify the integrity of these variables, they are not considered
 * part of the supported interface to DTrace, and they are therefore not
 * checked comprehensively.
 */
uint64_t dtrace_buffer_memory_maxsize = 0; /* initialized in dtrace_init */
uint64_t dtrace_buffer_memory_inuse = 0;
int dtrace_destructive_disallow = 0;
dtrace_optval_t dtrace_nonroot_maxsize = (16 * 1024 * 1024);
size_t dtrace_difo_maxsize = (256 * 1024);
dtrace_optval_t dtrace_dof_maxsize = (512 * 1024);
dtrace_optval_t dtrace_statvar_maxsize = (16 * 1024);
dtrace_optval_t dtrace_statvar_maxsize_max = (16 * 10 * 1024);
size_t dtrace_actions_max = (16 * 1024);
size_t dtrace_retain_max = 1024;
dtrace_optval_t dtrace_helper_actions_max = 32;
dtrace_optval_t dtrace_helper_providers_max = 64;
dtrace_optval_t dtrace_dstate_defsize = (1 * 1024 * 1024);
size_t dtrace_strsize_default = 256;
dtrace_optval_t dtrace_strsize_min = 8;
dtrace_optval_t dtrace_strsize_max = 65536;
dtrace_optval_t dtrace_cleanrate_default = 990099000;		/* 1.1 hz */
dtrace_optval_t dtrace_cleanrate_min = 20000000;		/* 50 hz */
dtrace_optval_t dtrace_cleanrate_max = (uint64_t)60 * NANOSEC;	/* 1/minute */
dtrace_optval_t dtrace_aggrate_default = NANOSEC;		/* 1 hz */
dtrace_optval_t dtrace_statusrate_default = NANOSEC;		/* 1 hz */
dtrace_optval_t dtrace_statusrate_max = (hrtime_t)10 * NANOSEC;	/* 6/minute */
dtrace_optval_t dtrace_switchrate_default = NANOSEC;		/* 1 hz */
dtrace_optval_t dtrace_nspec_default = 1;
dtrace_optval_t dtrace_specsize_default = 32 * 1024;
dtrace_optval_t dtrace_stackframes_default = 20;
dtrace_optval_t dtrace_ustackframes_default = 20;
dtrace_optval_t dtrace_jstackframes_default = 50;
dtrace_optval_t dtrace_jstackstrsize_default = 512;
dtrace_optval_t dtrace_buflimit_default = 75;
dtrace_optval_t dtrace_buflimit_min = 1;
dtrace_optval_t dtrace_buflimit_max = 99;
int dtrace_msgdsize_max = 128;
hrtime_t dtrace_chill_max = 500 * (NANOSEC / MILLISEC);		/* 500 ms */
hrtime_t dtrace_chill_interval = NANOSEC;			/* 1000 ms */
int dtrace_devdepth_max = 32;
int dtrace_err_verbose;
int dtrace_provide_private_probes = 0;
hrtime_t dtrace_deadman_interval = NANOSEC;
hrtime_t dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC;
hrtime_t dtrace_deadman_user = (hrtime_t)30 * NANOSEC;

/*
 * DTrace External Variables
 *
 * As dtrace(7D) is a kernel module, any DTrace variables are obviously
 * available to DTrace consumers via the backtick (`) syntax. One of these,
 * dtrace_zero, is made deliberately so: it is provided as a source of
 * well-known, zero-filled memory. While this variable is not documented,
 * it is used by some translators as an implementation detail.
 */
const char dtrace_zero[256] = { 0 };	/* zero-filled memory */
unsigned int dtrace_max_cpus = 0;	/* number of enabled cpus */
/*
 * DTrace Internal Variables
 */
static dev_info_t *dtrace_devi;			/* device info */
static vmem_t *dtrace_arena;			/* probe ID arena */
static dtrace_probe_t **dtrace_probes;		/* array of all probes */
static int dtrace_nprobes;			/* number of probes */
static dtrace_provider_t *dtrace_provider;	/* provider list */
static dtrace_meta_t *dtrace_meta_pid;		/* user-land meta provider */
static int dtrace_opens;			/* number of opens */
static int dtrace_helpers;			/* number of helpers */
static dtrace_hash_t *dtrace_strings;
static dtrace_hash_t *dtrace_byprov;		/* probes hashed by provider */
static dtrace_hash_t *dtrace_bymod;		/* probes hashed by module */
static dtrace_hash_t *dtrace_byfunc;		/* probes hashed by function */
static dtrace_hash_t *dtrace_byname;		/* probes hashed by name */
static dtrace_toxrange_t *dtrace_toxrange;	/* toxic range array */
static int dtrace_toxranges;			/* number of toxic ranges */
static int dtrace_toxranges_max;		/* size of toxic range array */
static dtrace_anon_t dtrace_anon;		/* anonymous enabling */
static kmem_cache_t *dtrace_state_cache;	/* cache for dynamic state */
static uint64_t dtrace_vtime_references;	/* number of vtimestamp refs */
static kthread_t *dtrace_panicked;		/* panicking thread */
static dtrace_ecb_t *dtrace_ecb_create_cache;	/* cached created ECB */
static dtrace_genid_t dtrace_probegen;		/* current probe generation */
static dtrace_helpers_t *dtrace_deferred_pid;	/* deferred helper list */
static dtrace_enabling_t *dtrace_retained;	/* list of retained enablings */
static dtrace_genid_t dtrace_retained_gen;	/* current retained enab gen */
static dtrace_dynvar_t dtrace_dynhash_sink;	/* end of dynamic hash chains */

static int dtrace_dof_mode;	/* See dtrace_impl.h for a description of Darwin's dof modes. */

/*
 * This doesn't quite fit as an internal variable, as it must be accessed in
 * fbt_provide and sdt_provide. It's clearly not a dtrace tunable variable either...
 */
int dtrace_kernel_symbol_mode;	/* See dtrace_impl.h for a description of Darwin's kernel symbol modes. */
static uint32_t dtrace_wake_clients;
static uint8_t dtrace_kerneluuid[16];	/* the 128-bit uuid */

/*
 * To save memory, some common memory allocations are given a
 * unique zone. For example, dtrace_probe_t is 72 bytes in size,
 * which means it would fall into the kalloc.128 bucket. With
 * 20k elements allocated, the space saved is substantial.
 */

struct zone *dtrace_probe_t_zone;

static int dtrace_module_unloaded(struct kmod_info *kmod);

/*
 * DTrace Locking
 * DTrace is protected by three (relatively coarse-grained) locks:
 *
 * (1) dtrace_lock is required to manipulate essentially any DTrace state,
 *     including enabling state, probes, ECBs, consumer state, helper state,
 *     etc. Importantly, dtrace_lock is _not_ required when in probe context;
 *     probe context is lock-free -- synchronization is handled via the
 *     dtrace_sync() cross call mechanism.
 *
 * (2) dtrace_provider_lock is required when manipulating provider state, or
 *     when provider state must be held constant.
 *
 * (3) dtrace_meta_lock is required when manipulating meta provider state, or
 *     when meta provider state must be held constant.
 *
 * The lock ordering between these three locks is dtrace_meta_lock before
 * dtrace_provider_lock before dtrace_lock. (In particular, there are
 * several places where dtrace_provider_lock is held by the framework as it
 * calls into the providers -- which then call back into the framework,
 * grabbing dtrace_lock.)
 *
 * There are two other locks in the mix: mod_lock and cpu_lock. With respect
 * to dtrace_provider_lock and dtrace_lock, cpu_lock continues its historical
 * role as a coarse-grained lock; it is acquired before both of these locks.
 * With respect to dtrace_meta_lock, its behavior is stranger: cpu_lock must
 * be acquired _between_ dtrace_meta_lock and any other DTrace locks.
 * mod_lock is similar with respect to dtrace_provider_lock in that it must be
 * acquired _between_ dtrace_provider_lock and dtrace_lock.
 */
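
/*
 * Taken together, the rules above imply a single total ordering when all
 * five locks must be acquired (a summary sketch, derived from the block
 * comment above rather than from any one call path):
 *
 *	dtrace_meta_lock -> cpu_lock -> dtrace_provider_lock ->
 *	    mod_lock -> dtrace_lock
 */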

/*
 * APPLE NOTE:
 *
 * For porting purposes, all kmutex_t vars have been changed
 * to lck_mtx_t, which require explicit initialization.
 *
 * kmutex_t becomes lck_mtx_t
 * mutex_enter() becomes lck_mtx_lock()
 * mutex_exit() becomes lck_mtx_unlock()
 *
 * Lock asserts are changed like this:
 *
 * ASSERT(MUTEX_HELD(&cpu_lock));
 * becomes:
 * LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
 *
 */
static lck_mtx_t dtrace_lock;		/* probe state lock */
static lck_mtx_t dtrace_provider_lock;	/* provider state lock */
static lck_mtx_t dtrace_meta_lock;	/* meta-provider state lock */
static lck_rw_t dtrace_dof_mode_lock;	/* dof mode lock */

/*
 * DTrace Provider Variables
 *
 * These are the variables relating to DTrace as a provider (that is, the
 * provider of the BEGIN, END, and ERROR probes).
 */
static dtrace_pattr_t dtrace_provider_attr = {
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
};

static void
dtrace_nullop(void)
{}

static int
dtrace_enable_nullop(void)
{
	return (0);
}

static dtrace_pops_t dtrace_provider_ops = {
	.dtps_provide = (void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop,
	.dtps_provide_module = (void (*)(void *, struct modctl *))dtrace_nullop,
	.dtps_enable = (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop,
	.dtps_disable = (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
	.dtps_suspend = (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
	.dtps_resume = (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
	.dtps_getargdesc = NULL,
	.dtps_getargval = NULL,
	.dtps_usermode = NULL,
	.dtps_destroy = (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
};

static dtrace_id_t dtrace_probeid_begin;	/* special BEGIN probe */
static dtrace_id_t dtrace_probeid_end;		/* special END probe */
dtrace_id_t dtrace_probeid_error;		/* special ERROR probe */

/*
 * DTrace Helper Tracing Variables
 */
uint32_t dtrace_helptrace_next = 0;
uint32_t dtrace_helptrace_nlocals;
char *dtrace_helptrace_buffer;
size_t dtrace_helptrace_bufsize = 512 * 1024;

#if DEBUG
int dtrace_helptrace_enabled = 1;
#else
int dtrace_helptrace_enabled = 0;
#endif

#if defined (__arm64__)
/*
 * The ioctl for adding helper DOF is based on the
 * size of a user_addr_t. We need to recognize both
 * U32 and U64 as the same action.
 */
#define DTRACEHIOC_ADDDOF_U32	_IOW('h', 4, user32_addr_t)
#define DTRACEHIOC_ADDDOF_U64	_IOW('h', 4, user64_addr_t)
#endif /* __arm64__ */

/*
 * DTrace Error Hashing
 *
 * On DEBUG kernels, DTrace will track the errors that it has seen in a hash
 * table. This is very useful for checking coverage of tests that are
 * expected to induce DIF or DOF processing errors, and may be useful for
 * debugging problems in the DIF code generator or in DOF generation. The
 * error hash may be examined with the ::dtrace_errhash MDB dcmd.
 */
#if DEBUG
static dtrace_errhash_t dtrace_errhash[DTRACE_ERRHASHSZ];
static const char *dtrace_errlast;
static kthread_t *dtrace_errthread;
static lck_mtx_t dtrace_errlock;
#endif

/*
 * DTrace Macros and Constants
 *
 * These are various macros that are useful in various spots in the
 * implementation, along with a few random constants that have no meaning
 * outside of the implementation. There is no real structure to this cpp
 * mishmash -- but is there ever?
 */

#define DTRACE_GETSTR(hash, elm)	\
	(hash->dth_getstr(elm, hash->dth_stroffs))

#define DTRACE_HASHSTR(hash, elm)	\
	dtrace_hash_str(DTRACE_GETSTR(hash, elm))

#define DTRACE_HASHNEXT(hash, elm)	\
	(void**)((uintptr_t)(elm) + (hash)->dth_nextoffs)

#define DTRACE_HASHPREV(hash, elm)	\
	(void**)((uintptr_t)(elm) + (hash)->dth_prevoffs)

#define DTRACE_HASHEQ(hash, lhs, rhs)	\
	(strcmp(DTRACE_GETSTR(hash, lhs), \
	    DTRACE_GETSTR(hash, rhs)) == 0)

#define DTRACE_AGGHASHSIZE_SLEW		17

#define DTRACE_V4MAPPED_OFFSET		(sizeof (uint32_t) * 3)

/*
 * The key for a thread-local variable consists of the lower 61 bits of the
 * current_thread(), plus the 3 bits of the highest active interrupt above LOCK_LEVEL.
 * We add DIF_VARIABLE_MAX to t_did to assure that the thread key is never
 * equal to a variable identifier. This is necessary (but not sufficient) to
 * assure that global associative arrays never collide with thread-local
 * variables. To guarantee that they cannot collide, we must also define the
 * order for keying dynamic variables. That order is:
 *
 *   [ key0 ] ... [ keyn ] [ variable-key ] [ tls-key ]
 *
 * Because the variable-key and the tls-key are in orthogonal spaces, there is
 * no way for a global variable key signature to match a thread-local key
 * signature.
 */
#if defined (__x86_64__)
/* FIXME: two function calls!! */
#define DTRACE_TLS_THRKEY(where) { \
	uint_t intr = ml_at_interrupt_context(); /* Note: just one measly bit */ \
	uint64_t thr = (uintptr_t)current_thread(); \
	ASSERT(intr < (1 << 3)); \
	(where) = ((thr + DIF_VARIABLE_MAX) & \
	    (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
}
#elif defined(__arm__)
/* FIXME: three function calls!!! */
#define DTRACE_TLS_THRKEY(where) { \
	uint_t intr = ml_at_interrupt_context(); /* Note: just one measly bit */ \
	uint64_t thr = (uintptr_t)current_thread(); \
	uint_t pid = (uint_t)dtrace_proc_selfpid(); \
	ASSERT(intr < (1 << 3)); \
	(where) = (((thr << 32 | pid) + DIF_VARIABLE_MAX) & \
	    (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
}
#elif defined (__arm64__)
/* FIXME: two function calls!! */
#define DTRACE_TLS_THRKEY(where) { \
	uint_t intr = ml_at_interrupt_context(); /* Note: just one measly bit */ \
	uint64_t thr = (uintptr_t)current_thread(); \
	ASSERT(intr < (1 << 3)); \
	(where) = ((thr + DIF_VARIABLE_MAX) & \
	    (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
}
#else
#error Unknown architecture
#endif
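
/*
 * An illustrative sketch of the resulting key layout (derived from the
 * macros above; not a definitive specification):
 *
 *	 63    61 60                                               0
 *	+--------+-------------------------------------------------+
 *	|  intr  |  (thread key + DIF_VARIABLE_MAX) mod 2^61       |
 *	+--------+-------------------------------------------------+
 */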

#define DT_BSWAP_8(x)	((x) & 0xff)
#define DT_BSWAP_16(x)	((DT_BSWAP_8(x) << 8) | DT_BSWAP_8((x) >> 8))
#define DT_BSWAP_32(x)	((DT_BSWAP_16(x) << 16) | DT_BSWAP_16((x) >> 16))
#define DT_BSWAP_64(x)	((DT_BSWAP_32(x) << 32) | DT_BSWAP_32((x) >> 32))
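
/*
 * A worked expansion (illustrative): DT_BSWAP_32(0x12345678) evaluates to
 * 0x78563412. Each DT_BSWAP macro byte-swaps its two halves recursively via
 * the next-smaller DT_BSWAP macro and then exchanges them.
 */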

#define DT_MASK_LO 0x00000000FFFFFFFFULL

#define DTRACE_STORE(type, tomax, offset, what) \
	*((type *)((uintptr_t)(tomax) + (uintptr_t)offset)) = (type)(what);

#define DTRACE_ALIGNCHECK(addr, size, flags)				\
	if (addr & (MIN(size,4) - 1)) {					\
		*flags |= CPU_DTRACE_BADALIGN;				\
		cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr;	\
		return (0);						\
	}

#define DTRACE_RANGE_REMAIN(remp, addr, baseaddr, basesz)		\
do {									\
	if ((remp) != NULL) {						\
		*(remp) = (uintptr_t)(baseaddr) + (basesz) - (addr);	\
	}								\
} while (0)

/*
 * Test whether a range of memory starting at testaddr of size testsz falls
 * within the range of memory described by addr, sz. We take care to avoid
 * problems with overflow and underflow of the unsigned quantities, and
 * disallow all negative sizes. Ranges of size 0 are allowed.
 */
#define DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \
	((testaddr) - (baseaddr) < (basesz) && \
	(testaddr) + (testsz) - (baseaddr) <= (basesz) && \
	(testaddr) + (testsz) >= (testaddr))
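
/*
 * A worked example (illustrative): with baseaddr = 0x1000 and basesz = 0x100,
 * testaddr = 0x1080 with testsz = 0x80 satisfies all three clauses, while
 * testsz = 0x81 fails the second clause. A wrapping range such as
 * testaddr = (uintptr_t)-1 with testsz = 2 is caught by the third clause.
 */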

/*
 * Test whether alloc_sz bytes will fit in the scratch region. We isolate
 * alloc_sz on the righthand side of the comparison in order to avoid overflow
 * or underflow in the comparison with it. This is simpler than the INRANGE
 * check above, because we know that the dtms_scratch_ptr is valid in the
 * range. Allocations of size zero are allowed.
 */
#define DTRACE_INSCRATCH(mstate, alloc_sz) \
	((mstate)->dtms_scratch_base + (mstate)->dtms_scratch_size - \
	(mstate)->dtms_scratch_ptr >= (alloc_sz))

#define RECOVER_LABEL(bits) dtraceLoadRecover##bits:

#if defined (__x86_64__) || (defined (__arm__) || defined (__arm64__))
#define DTRACE_LOADFUNC(bits)						\
/*CSTYLED*/								\
uint##bits##_t dtrace_load##bits(uintptr_t addr);			\
									\
uint##bits##_t								\
dtrace_load##bits(uintptr_t addr)					\
{									\
	size_t size = bits / NBBY;					\
	/*CSTYLED*/							\
	uint##bits##_t rval = 0;					\
	int i;								\
	volatile uint16_t *flags = (volatile uint16_t *)		\
	    &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;			\
									\
	DTRACE_ALIGNCHECK(addr, size, flags);				\
									\
	for (i = 0; i < dtrace_toxranges; i++) {			\
		if (addr >= dtrace_toxrange[i].dtt_limit)		\
			continue;					\
									\
		if (addr + size <= dtrace_toxrange[i].dtt_base)		\
			continue;					\
									\
		/*							\
		 * This address falls within a toxic region; return 0.	\
		 */							\
		*flags |= CPU_DTRACE_BADADDR;				\
		cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr;	\
		return (0);						\
	}								\
									\
	{								\
	volatile vm_offset_t recover = (vm_offset_t)&&dtraceLoadRecover##bits;	\
	*flags |= CPU_DTRACE_NOFAULT;					\
	recover = dtrace_set_thread_recover(current_thread(), recover);	\
	/*CSTYLED*/							\
	/*								\
	 * PR6394061 - avoid device memory that is unpredictably	\
	 * mapped and unmapped						\
	 */								\
	if (pmap_valid_page(pmap_find_phys(kernel_pmap, addr)))	\
		rval = *((volatile uint##bits##_t *)addr);		\
	else {								\
		*flags |= CPU_DTRACE_BADADDR;				\
		cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr;	\
		return (0);						\
	}								\
									\
	RECOVER_LABEL(bits);						\
	(void)dtrace_set_thread_recover(current_thread(), recover);	\
	*flags &= ~CPU_DTRACE_NOFAULT;					\
	}								\
									\
	return (rval);							\
}
#else /* all other architectures */
#error Unknown Architecture
#endif

#ifdef __LP64__
#define dtrace_loadptr	dtrace_load64
#else
#define dtrace_loadptr	dtrace_load32
#endif

#define DTRACE_DYNHASH_FREE	0
#define DTRACE_DYNHASH_SINK	1
#define DTRACE_DYNHASH_VALID	2

#define DTRACE_MATCH_FAIL	-1
#define DTRACE_MATCH_NEXT	0
#define DTRACE_MATCH_DONE	1
#define DTRACE_ANCHORED(probe)	((probe)->dtpr_func[0] != '\0')
#define DTRACE_STATE_ALIGN	64

#define DTRACE_FLAGS2FLT(flags)						\
	(((flags) & CPU_DTRACE_BADADDR) ? DTRACEFLT_BADADDR :		\
	((flags) & CPU_DTRACE_ILLOP) ? DTRACEFLT_ILLOP :		\
	((flags) & CPU_DTRACE_DIVZERO) ? DTRACEFLT_DIVZERO :		\
	((flags) & CPU_DTRACE_KPRIV) ? DTRACEFLT_KPRIV :		\
	((flags) & CPU_DTRACE_UPRIV) ? DTRACEFLT_UPRIV :		\
	((flags) & CPU_DTRACE_TUPOFLOW) ? DTRACEFLT_TUPOFLOW :		\
	((flags) & CPU_DTRACE_BADALIGN) ? DTRACEFLT_BADALIGN :		\
	((flags) & CPU_DTRACE_NOSCRATCH) ? DTRACEFLT_NOSCRATCH :	\
	((flags) & CPU_DTRACE_BADSTACK) ? DTRACEFLT_BADSTACK :		\
	DTRACEFLT_UNKNOWN)

#define DTRACEACT_ISSTRING(act)						\
	((act)->dta_kind == DTRACEACT_DIFEXPR &&			\
	(act)->dta_difo->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING)

static size_t dtrace_strlen(const char *, size_t);
static dtrace_probe_t *dtrace_probe_lookup_id(dtrace_id_t id);
static void dtrace_enabling_provide(dtrace_provider_t *);
static int dtrace_enabling_match(dtrace_enabling_t *, int *, dtrace_match_cond_t *cond);
static void dtrace_enabling_matchall_with_cond(dtrace_match_cond_t *cond);
static void dtrace_enabling_matchall(void);
static dtrace_state_t *dtrace_anon_grab(void);
static uint64_t dtrace_helper(int, dtrace_mstate_t *,
    dtrace_state_t *, uint64_t, uint64_t);
static dtrace_helpers_t *dtrace_helpers_create(proc_t *);
static void dtrace_buffer_drop(dtrace_buffer_t *);
static intptr_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t,
    dtrace_state_t *, dtrace_mstate_t *);
static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t,
    dtrace_optval_t);
static int dtrace_ecb_create_enable(dtrace_probe_t *, void *, void *);
static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *);
static int dtrace_canload_remains(uint64_t, size_t, size_t *,
    dtrace_mstate_t *, dtrace_vstate_t *);
static int dtrace_canstore_remains(uint64_t, size_t, size_t *,
    dtrace_mstate_t *, dtrace_vstate_t *);

/*
 * DTrace sysctl handlers
 *
 * These declarations and functions are used for deeper DTrace configuration.
 * Most of these settings are not applied on a per-consumer basis and may
 * impact other DTrace consumers. Not every value is validated
 * comprehensively, so be careful about the values you use.
 */

SYSCTL_DECL(_kern_dtrace);
SYSCTL_NODE(_kern, OID_AUTO, dtrace, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "dtrace");

static int
sysctl_dtrace_err_verbose SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg2)
	int changed, error;
	int value = *(int *) arg1;

	error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
	if (error || !changed)
		return (error);

	if (value != 0 && value != 1)
		return (ERANGE);

	lck_mtx_lock(&dtrace_lock);
	dtrace_err_verbose = value;
	lck_mtx_unlock(&dtrace_lock);

	return (0);
}

/*
 * kern.dtrace.err_verbose
 *
 * Set DTrace verbosity when an error occurs (0 = disabled, 1 = enabled).
 * Errors are reported when a DIFO or a DOF has been rejected by the kernel.
 */
SYSCTL_PROC(_kern_dtrace, OID_AUTO, err_verbose,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &dtrace_err_verbose, 0,
    sysctl_dtrace_err_verbose, "I", "dtrace error verbose");
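
/*
 * For example, this tunable may be toggled from the command line
 * (an illustrative invocation):
 *
 *	sysctl kern.dtrace.err_verbose=1
 */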

static int
sysctl_dtrace_buffer_memory_maxsize SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg2, req)
	int changed, error;
	uint64_t value = *(uint64_t *) arg1;

	error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
	if (error || !changed)
		return (error);

	if (value <= dtrace_buffer_memory_inuse)
		return (ERANGE);

	lck_mtx_lock(&dtrace_lock);
	dtrace_buffer_memory_maxsize = value;
	lck_mtx_unlock(&dtrace_lock);

	return (0);
}

/*
 * kern.dtrace.buffer_memory_maxsize
 *
 * Set the maximum size, in bytes, of memory that may be used by all the
 * consumers' state buffers. By default the limit is PHYS_MEM / 3 for *all*
 * consumers. Attempting to set a zero or negative value, or a value less
 * than or equal to dtrace_buffer_memory_inuse, will result in a failure.
 */
SYSCTL_PROC(_kern_dtrace, OID_AUTO, buffer_memory_maxsize,
    CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
    &dtrace_buffer_memory_maxsize, 0,
    sysctl_dtrace_buffer_memory_maxsize, "Q", "dtrace state buffer memory maxsize");

/*
 * kern.dtrace.buffer_memory_inuse
 *
 * Current state buffer memory used, in bytes, by all the DTrace consumers.
 * This value is read-only.
 */
SYSCTL_QUAD(_kern_dtrace, OID_AUTO, buffer_memory_inuse, CTLFLAG_RD | CTLFLAG_LOCKED,
    &dtrace_buffer_memory_inuse, "dtrace state buffer memory in-use");

static int
sysctl_dtrace_difo_maxsize SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg2, req)
	int changed, error;
	size_t value = *(size_t*) arg1;

	error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
	if (error || !changed)
		return (error);

	if (value <= 0)
		return (ERANGE);

	lck_mtx_lock(&dtrace_lock);
	dtrace_difo_maxsize = value;
	lck_mtx_unlock(&dtrace_lock);

	return (0);
}

/*
 * kern.dtrace.difo_maxsize
 *
 * Set the DIFO maximum size, in bytes; see the definition of
 * dtrace_difo_maxsize for the default value. Attempting to set a zero or
 * negative size will result in a failure.
 */
SYSCTL_PROC(_kern_dtrace, OID_AUTO, difo_maxsize,
    CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
    &dtrace_difo_maxsize, 0,
    sysctl_dtrace_difo_maxsize, "Q", "dtrace difo maxsize");

static int
sysctl_dtrace_dof_maxsize SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg2, req)
	int changed, error;
	dtrace_optval_t value = *(dtrace_optval_t *) arg1;

	error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
	if (error || !changed)
		return (error);

	if (value <= 0)
		return (ERANGE);

	if (value >= dtrace_copy_maxsize())
		return (ERANGE);

	lck_mtx_lock(&dtrace_lock);
	dtrace_dof_maxsize = value;
	lck_mtx_unlock(&dtrace_lock);

	return (0);
}

/*
 * kern.dtrace.dof_maxsize
 *
 * Set the DOF maximum size, in bytes; see the definition of
 * dtrace_dof_maxsize for the default value. Attempting to set a zero or
 * negative size will result in a failure.
 */
SYSCTL_PROC(_kern_dtrace, OID_AUTO, dof_maxsize,
    CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
    &dtrace_dof_maxsize, 0,
    sysctl_dtrace_dof_maxsize, "Q", "dtrace dof maxsize");

static int
sysctl_dtrace_statvar_maxsize SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg2, req)
	int changed, error;
	dtrace_optval_t value = *(dtrace_optval_t*) arg1;

	error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
	if (error || !changed)
		return (error);

	if (value <= 0)
		return (ERANGE);
	if (value > dtrace_statvar_maxsize_max)
		return (ERANGE);

	lck_mtx_lock(&dtrace_lock);
	dtrace_statvar_maxsize = value;
	lck_mtx_unlock(&dtrace_lock);

	return (0);
}

/*
 * kern.dtrace.global_maxsize
 *
 * Set the global variable maximum size, in bytes; see the definition of
 * dtrace_statvar_maxsize for the default value. Attempting to set a zero or
 * negative size, or a size greater than dtrace_statvar_maxsize_max, will
 * result in a failure.
 */
SYSCTL_PROC(_kern_dtrace, OID_AUTO, global_maxsize,
    CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
    &dtrace_statvar_maxsize, 0,
    sysctl_dtrace_statvar_maxsize, "Q", "dtrace statvar maxsize");

static int
sysctl_dtrace_provide_private_probes SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg2)
	int error;
	int value = *(int *) arg1;

	error = sysctl_io_number(req, value, sizeof(value), &value, NULL);
	if (error)
		return (error);

	if (req->newptr) {
		if (value != 0 && value != 1)
			return (ERANGE);

		/*
		 * We do not allow changing this back to zero, as private probes
		 * would still be left registered
		 */
		if (value != 1)
			return (EPERM);

		lck_mtx_lock(&dtrace_lock);
		dtrace_provide_private_probes = value;
		lck_mtx_unlock(&dtrace_lock);
	}
	return (0);
}

/*
 * kern.dtrace.provide_private_probes
 *
 * Set whether the providers must provide the private probes. This is
 * mainly used by the FBT provider to request probes for the private/static
 * symbols.
 */
SYSCTL_PROC(_kern_dtrace, OID_AUTO, provide_private_probes,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &dtrace_provide_private_probes, 0,
    sysctl_dtrace_provide_private_probes, "I", "provider must provide the private probes");

/*
 * kern.dtrace.dof_mode
 *
 * Returns the current DOF mode.
 * This value is read-only.
 */
SYSCTL_INT(_kern_dtrace, OID_AUTO, dof_mode, CTLFLAG_RD | CTLFLAG_LOCKED,
    &dtrace_dof_mode, 0, "dtrace dof mode");

/*
 * DTrace Probe Context Functions
 *
 * These functions are called from probe context. Because probe context is
 * any context in which C may be called, arbitrary locks may be held,
 * interrupts may be disabled, we may be in arbitrary dispatched state, etc.
 * As a result, functions called from probe context may only call other DTrace
 * support functions -- they may not interact at all with the system at large.
 * (Note that the ASSERT macro is made probe-context safe by redefining it in
 * terms of dtrace_assfail(), a probe-context safe function.) If arbitrary
 * loads are to be performed from probe context, they _must_ be in terms of
 * the safe dtrace_load*() variants.
 *
 * Some functions in this block are not actually called from probe context;
 * for these functions, there will be a comment above the function reading
 * "Note: not called from probe context."
 */

int
dtrace_assfail(const char *a, const char *f, int l)
{
	panic("dtrace: assertion failed: %s, file: %s, line: %d", a, f, l);

	/*
	 * We just need something here that even the most clever compiler
	 * cannot optimize away.
	 */
	return (a[(uintptr_t)f]);
}

/*
 * Atomically increment a specified error counter from probe context.
 */
static void
dtrace_error(uint32_t *counter)
{
	/*
	 * Most counters stored to in probe context are per-CPU counters.
	 * However, there are some error conditions that are sufficiently
	 * arcane that they don't merit per-CPU storage. If these counters
	 * are incremented concurrently on different CPUs, scalability will be
	 * adversely affected -- but we don't expect them to be white-hot in a
	 * correctly constructed enabling...
	 */
	uint32_t oval, nval;

	do {
		oval = *counter;

		if ((nval = oval + 1) == 0) {
			/*
			 * If the counter would wrap, set it to 1 -- assuring
			 * that the counter is never zero when we have seen
			 * errors. (The counter must be 32-bits because we
			 * aren't guaranteed a 64-bit compare&swap operation.)
			 * To save this code both the infamy of being fingered
			 * by a priggish news story and the indignity of being
			 * the target of a neo-puritan witch trial, we're
			 * carefully avoiding any colorful description of the
			 * likelihood of this condition -- but suffice it to
			 * say that it is only slightly more likely than the
			 * overflow of predicate cache IDs, as discussed in
			 * dtrace_predicate_create().
			 */
			nval = 1;
		}
	} while (dtrace_cas32(counter, oval, nval) != oval);
}

/*
 * Use the DTRACE_LOADFUNC macro to define functions for each of loading a
 * uint8_t, a uint16_t, a uint32_t and a uint64_t.
 */
DTRACE_LOADFUNC(8)
DTRACE_LOADFUNC(16)
DTRACE_LOADFUNC(32)
DTRACE_LOADFUNC(64)
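
/*
 * For example, DTRACE_LOADFUNC(8) above expands (in sketch form) to:
 *
 *	uint8_t dtrace_load8(uintptr_t addr) { ... }
 *
 * with size = 1, the toxic-range scan, and the recover label
 * dtraceLoadRecover8 all instantiated for the 8-bit width.
 */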

static int
dtrace_inscratch(uintptr_t dest, size_t size, dtrace_mstate_t *mstate)
{
	if (dest < mstate->dtms_scratch_base)
		return (0);

	if (dest + size < dest)
		return (0);

	if (dest + size > mstate->dtms_scratch_ptr)
		return (0);

	return (1);
}

static int
dtrace_canstore_statvar(uint64_t addr, size_t sz, size_t *remain,
    dtrace_statvar_t **svars, int nsvars)
{
	int i;

	size_t maxglobalsize, maxlocalsize;

	maxglobalsize = dtrace_statvar_maxsize + sizeof (uint64_t);
	maxlocalsize = (maxglobalsize) * NCPU;

	if (nsvars == 0)
		return (0);

	for (i = 0; i < nsvars; i++) {
		dtrace_statvar_t *svar = svars[i];
		uint8_t scope;
		size_t size;

		if (svar == NULL || (size = svar->dtsv_size) == 0)
			continue;

		scope = svar->dtsv_var.dtdv_scope;

		/*
		 * We verify that our size is valid in the spirit of providing
		 * defense in depth: we want to prevent attackers from using
		 * DTrace to escalate an orthogonal kernel heap corruption bug
		 * into the ability to store to arbitrary locations in memory.
		 */
		VERIFY((scope == DIFV_SCOPE_GLOBAL && size <= maxglobalsize) ||
		    (scope == DIFV_SCOPE_LOCAL && size <= maxlocalsize));

		if (DTRACE_INRANGE(addr, sz, svar->dtsv_data, svar->dtsv_size)) {
			DTRACE_RANGE_REMAIN(remain, addr, svar->dtsv_data,
			    svar->dtsv_size);
			return (1);
		}
	}

	return (0);
}

/*
 * Check to see if the address is within a memory region to which a store may
 * be issued. This includes the DTrace scratch areas, and any DTrace variable
 * region. The caller of dtrace_canstore() is responsible for performing any
 * alignment checks that are needed before stores are actually executed.
 */
static int
dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
    dtrace_vstate_t *vstate)
{
	return (dtrace_canstore_remains(addr, sz, NULL, mstate, vstate));
}

/*
 * Implementation of dtrace_canstore which communicates the upper bound of the
 * allowed memory region.
 */
static int
dtrace_canstore_remains(uint64_t addr, size_t sz, size_t *remain,
    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
{
	/*
	 * First, check to see if the address is in scratch space...
	 */
	if (DTRACE_INRANGE(addr, sz, mstate->dtms_scratch_base,
	    mstate->dtms_scratch_size)) {
		DTRACE_RANGE_REMAIN(remain, addr, mstate->dtms_scratch_base,
		    mstate->dtms_scratch_size);
		return (1);
	}
	/*
	 * Now check to see if it's a dynamic variable. This check will pick
	 * up both thread-local variables and any global dynamically-allocated
	 * variables.
	 */
	if (DTRACE_INRANGE(addr, sz, (uintptr_t)vstate->dtvs_dynvars.dtds_base,
	    vstate->dtvs_dynvars.dtds_size)) {
		dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
		uintptr_t base = (uintptr_t)dstate->dtds_base +
		    (dstate->dtds_hashsize * sizeof (dtrace_dynhash_t));
		uintptr_t chunkoffs;
		dtrace_dynvar_t *dvar;

		/*
		 * Before we assume that we can store here, we need to make
		 * sure that it isn't in our metadata -- storing to our
		 * dynamic variable metadata would corrupt our state. For
		 * the range to not include any dynamic variable metadata,
		 * it must:
		 *
		 *	(1) Start above the hash table that is at the base of
		 *	the dynamic variable space
		 *
		 *	(2) Have a starting chunk offset that is beyond the
		 *	dtrace_dynvar_t that is at the base of every chunk
		 *
		 *	(3) Not span a chunk boundary
		 *
		 *	(4) Not be in the tuple space of a dynamic variable
		 *
		 */
		if (addr < base)
			return (0);

		chunkoffs = (addr - base) % dstate->dtds_chunksize;

		if (chunkoffs < sizeof (dtrace_dynvar_t))
			return (0);

		if (chunkoffs + sz > dstate->dtds_chunksize)
			return (0);

		dvar = (dtrace_dynvar_t *)((uintptr_t)addr - chunkoffs);

		if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE)
			return (0);

		if (chunkoffs < sizeof (dtrace_dynvar_t) +
		    ((dvar->dtdv_tuple.dtt_nkeys - 1) * sizeof (dtrace_key_t)))
			return (0);

		return (1);
	}

	/*
	 * Finally, check the static local and global variables. These checks
	 * take the longest, so we perform them last.
	 */
	if (dtrace_canstore_statvar(addr, sz, remain,
	    vstate->dtvs_locals, vstate->dtvs_nlocals))
		return (1);

	if (dtrace_canstore_statvar(addr, sz, remain,
	    vstate->dtvs_globals, vstate->dtvs_nglobals))
		return (1);

	return (0);
}
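
/*
 * An illustrative sketch of the dynamic variable space consulted above
 * (derived from checks (1)-(4) in dtrace_canstore_remains()):
 *
 *	dtds_base ->	+---------------------------------+
 *			| hash table                      |  never storable
 *	     base ->	+---------------------------------+
 *			| chunk: dtrace_dynvar_t header   |  never storable
 *			|        tuple key space          |  never storable
 *			|        variable data            |  storable
 *			+---------------------------------+
 *			| chunk: ...                      |
 */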

/*
 * Convenience routine to check to see if the address is within a memory
 * region in which a load may be issued given the user's privilege level;
 * if not, it sets the appropriate error flags and loads 'addr' into the
 * illegal value slot.
 *
 * DTrace subroutines (DIF_SUBR_*) should use this helper to implement
 * appropriate memory access protection.
 */
int
dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
    dtrace_vstate_t *vstate)
{
	return (dtrace_canload_remains(addr, sz, NULL, mstate, vstate));
}

/*
 * Implementation of dtrace_canload which communicates the upper bound of the
 * allowed memory region.
 */
static int
dtrace_canload_remains(uint64_t addr, size_t sz, size_t *remain,
    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
{
	volatile uint64_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;

	/*
	 * If we hold the privilege to read from kernel memory, then
	 * everything is readable.
	 */
	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
		DTRACE_RANGE_REMAIN(remain, addr, addr, sz);
		return (1);
	}

	/*
	 * You can obviously read that which you can store.
	 */
	if (dtrace_canstore_remains(addr, sz, remain, mstate, vstate))
		return (1);

	/*
	 * We're allowed to read from our own string table.
	 */
	if (DTRACE_INRANGE(addr, sz, (uintptr_t)mstate->dtms_difo->dtdo_strtab,
	    mstate->dtms_difo->dtdo_strlen)) {
		DTRACE_RANGE_REMAIN(remain, addr,
		    mstate->dtms_difo->dtdo_strtab,
		    mstate->dtms_difo->dtdo_strlen);
		return (1);
	}

	DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
	*illval = addr;
	return (0);
}

/*
 * Convenience routine to check to see if a given string is within a memory
 * region in which a load may be issued given the user's privilege level;
 * this exists so that we don't need to issue unnecessary dtrace_strlen()
 * calls in the event that the user has all privileges.
 */
static int
dtrace_strcanload(uint64_t addr, size_t sz, size_t *remain,
    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
{
	size_t rsize;

	/*
	 * If we hold the privilege to read from kernel memory, then
	 * everything is readable.
	 */
	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
		DTRACE_RANGE_REMAIN(remain, addr, addr, sz);
		return (1);
	}

	/*
	 * Even if the caller is uninterested in querying the remaining valid
	 * range, it is required to ensure that the access is allowed.
	 */
	if (remain == NULL) {
		remain = &rsize;
	}
	if (dtrace_canload_remains(addr, 0, remain, mstate, vstate)) {
		size_t strsz;
		/*
		 * Perform the strlen after determining the length of the
		 * memory region which is accessible. This prevents timing
		 * information from being used to find NULs in memory which is
		 * not accessible to the caller.
		 */
		strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr,
		    MIN(sz, *remain));
		if (strsz <= *remain) {
			return (1);
		}
	}

	return (0);
}

/*
 * Convenience routine to check to see if a given variable is within a memory
 * region in which a load may be issued given the user's privilege level.
 */
static int
dtrace_vcanload(void *src, dtrace_diftype_t *type, size_t *remain,
    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
{
	size_t sz;
	ASSERT(type->dtdt_flags & DIF_TF_BYREF);

	/*
	 * Calculate the max size before performing any checks since even
	 * DTRACE_ACCESS_KERNEL-credentialed callers expect that this function
	 * return the max length via 'remain'.
	 */
	if (type->dtdt_kind == DIF_TYPE_STRING) {
		dtrace_state_t *state = vstate->dtvs_state;

		if (state != NULL) {
			sz = state->dts_options[DTRACEOPT_STRSIZE];
		} else {
			/*
			 * In helper context, we have a NULL state; fall back
			 * to using the system-wide default for the string size
			 * in this case.
			 */
			sz = dtrace_strsize_default;
		}
	} else {
		sz = type->dtdt_size;
	}

	/*
	 * If we hold the privilege to read from kernel memory, then
	 * everything is readable.
	 */
	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
		DTRACE_RANGE_REMAIN(remain, (uintptr_t)src, src, sz);
		return (1);
	}

	if (type->dtdt_kind == DIF_TYPE_STRING) {
		return (dtrace_strcanload((uintptr_t)src, sz, remain, mstate,
		    vstate));
	}
	return (dtrace_canload_remains((uintptr_t)src, sz, remain, mstate,
	    vstate));
}

/*
 * Compare two strings using safe loads.
 */
static int
dtrace_strncmp(char *s1, char *s2, size_t limit)
{
	uint8_t c1, c2;
	volatile uint16_t *flags;

	if (s1 == s2 || limit == 0)
		return (0);

	flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;

	do {
		if (s1 == NULL) {
			c1 = '\0';
		} else {
			c1 = dtrace_load8((uintptr_t)s1++);
		}

		if (s2 == NULL) {
			c2 = '\0';
		} else {
			c2 = dtrace_load8((uintptr_t)s2++);
		}

		if (c1 != c2)
			return (c1 - c2);
	} while (--limit && c1 != '\0' && !(*flags & CPU_DTRACE_FAULT));

	return (0);
}

/*
 * Compute strlen(s) for a string using safe memory accesses. The additional
 * len parameter is used to specify a maximum length to ensure completion.
 */
static size_t
dtrace_strlen(const char *s, size_t lim)
{
	uint_t len;

	for (len = 0; len != lim; len++) {
		if (dtrace_load8((uintptr_t)s++) == '\0')
			break;
	}

	return (len);
}

/*
 * Check if an address falls within a toxic region.
 */
static int
dtrace_istoxic(uintptr_t kaddr, size_t size)
{
	uintptr_t taddr, tsize;
	int i;

	for (i = 0; i < dtrace_toxranges; i++) {
		taddr = dtrace_toxrange[i].dtt_base;
		tsize = dtrace_toxrange[i].dtt_limit - taddr;

		if (kaddr - taddr < tsize) {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
			cpu_core[CPU->cpu_id].cpuc_dtrace_illval = kaddr;
			return (1);
		}

		if (taddr - kaddr < size) {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
			cpu_core[CPU->cpu_id].cpuc_dtrace_illval = taddr;
			return (1);
		}
	}

	return (0);
}

/*
 * Copy src to dst using safe memory accesses. The src is assumed to be unsafe
 * memory specified by the DIF program. The dst is assumed to be safe memory
 * that we can store to directly because it is managed by DTrace. As with
 * standard bcopy, overlapping copies are handled properly.
 */
static void
dtrace_bcopy(const void *src, void *dst, size_t len)
{
	if (len != 0) {
		uint8_t *s1 = dst;
		const uint8_t *s2 = src;

		if (s1 <= s2) {
			do {
				*s1++ = dtrace_load8((uintptr_t)s2++);
			} while (--len != 0);
		} else {
			s2 += len;
			s1 += len;

			do {
				*--s1 = dtrace_load8((uintptr_t)--s2);
			} while (--len != 0);
		}
	}
}

/*
 * Copy src to dst using safe memory accesses, up to either the specified
 * length, or the point that a nul byte is encountered. The src is assumed to
 * be unsafe memory specified by the DIF program. The dst is assumed to be
 * safe memory that we can store to directly because it is managed by DTrace.
 * Unlike dtrace_bcopy(), overlapping regions are not handled.
 */
static void
dtrace_strcpy(const void *src, void *dst, size_t len)
{
	if (len != 0) {
		uint8_t *s1 = dst, c;
		const uint8_t *s2 = src;

		do {
			*s1++ = c = dtrace_load8((uintptr_t)s2++);
		} while (--len != 0 && c != '\0');
	}
}

/*
 * Copy src to dst, deriving the size and type from the specified (BYREF)
 * variable type. The src is assumed to be unsafe memory specified by the DIF
 * program. The dst is assumed to be DTrace variable memory that is of the
 * specified type; we assume that we can store to directly.
 */
static void
dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type, size_t limit)
{
	ASSERT(type->dtdt_flags & DIF_TF_BYREF);

	if (type->dtdt_kind == DIF_TYPE_STRING) {
		dtrace_strcpy(src, dst, MIN(type->dtdt_size, limit));
	} else {
		dtrace_bcopy(src, dst, MIN(type->dtdt_size, limit));
	}
}

/*
 * Compare s1 to s2 using safe memory accesses. The s1 data is assumed to be
 * unsafe memory specified by the DIF program. The s2 data is assumed to be
 * safe memory that we can access directly because it is managed by DTrace.
 */
static int
dtrace_bcmp(const void *s1, const void *s2, size_t len)
{
	volatile uint16_t *flags;

	flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;

	if (s1 == s2)
		return (0);

	if (s1 == NULL || s2 == NULL)
		return (1);

	if (s1 != s2 && len != 0) {
		const uint8_t *ps1 = s1;
		const uint8_t *ps2 = s2;

		do {
			if (dtrace_load8((uintptr_t)ps1++) != *ps2++)
				return (1);
		} while (--len != 0 && !(*flags & CPU_DTRACE_FAULT));
	}
	return (0);
}

/*
 * Zero the specified region using a simple byte-by-byte loop. Note that this
 * is for safe DTrace-managed memory only.
 */
static void
dtrace_bzero(void *dst, size_t len)
{
	uchar_t *cp;

	for (cp = dst; len != 0; len--)
		*cp++ = 0;
}

static void
dtrace_add_128(uint64_t *addend1, uint64_t *addend2, uint64_t *sum)
{
	uint64_t result[2];

	result[0] = addend1[0] + addend2[0];
	result[1] = addend1[1] + addend2[1] +
	    (result[0] < addend1[0] || result[0] < addend2[0] ? 1 : 0);

	sum[0] = result[0];
	sum[1] = result[1];
}
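
/*
 * The carry detection above relies on unsigned wraparound. For example
 * (illustrative): adding low words 0xffffffffffffffff and 1 yields 0, which
 * is less than either addend, so a carry of 1 propagates to the high word.
 */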

/*
 * Shift the 128-bit value in a by b. If b is positive, shift left.
 * If b is negative, shift right.
 */
static void
dtrace_shift_128(uint64_t *a, int b)
{
	uint64_t mask;

	if (b == 0)
		return;

	if (b < 0) {
		b = -b;
		if (b >= 64) {
			a[0] = a[1] >> (b - 64);
			a[1] = 0;
		} else {
			a[0] >>= b;
			mask = 1LL << (64 - b);
			mask -= 1;
			a[0] |= ((a[1] & mask) << (64 - b));
			a[1] >>= b;
		}
	} else {
		if (b >= 64) {
			a[1] = a[0] << (b - 64);
			a[0] = 0;
		} else {
			a[1] <<= b;
			mask = a[0] >> (64 - b);
			a[1] |= mask;
			a[0] <<= b;
		}
	}
}

/*
 * The basic idea is to break the 2 64-bit values into 4 32-bit values,
 * use native multiplication on those, and then re-combine into the
 * resulting 128-bit value.
 *
 * (hi1 << 32 + lo1) * (hi2 << 32 + lo2) =
 *     hi1 * hi2 << 64 +
 *     hi1 * lo2 << 32 +
 *     hi2 * lo1 << 32 +
 *     lo1 * lo2
 */
static void
dtrace_multiply_128(uint64_t factor1, uint64_t factor2, uint64_t *product)
{
	uint64_t hi1, hi2, lo1, lo2;
	uint64_t tmp[2];

	hi1 = factor1 >> 32;
	hi2 = factor2 >> 32;

	lo1 = factor1 & DT_MASK_LO;
	lo2 = factor2 & DT_MASK_LO;

	product[0] = lo1 * lo2;
	product[1] = hi1 * hi2;

	tmp[0] = hi1 * lo2;
	tmp[1] = 0;
	dtrace_shift_128(tmp, 32);
	dtrace_add_128(product, tmp, product);

	tmp[0] = hi2 * lo1;
	tmp[1] = 0;
	dtrace_shift_128(tmp, 32);
	dtrace_add_128(product, tmp, product);
}
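
/*
 * A small worked example (illustrative): multiplying 2^32 by 2^32 gives
 * hi1 = hi2 = 1 and lo1 = lo2 = 0, so all the partial products vanish except
 * hi1 * hi2, leaving product[1] = 1 and product[0] = 0 -- that is, 2^64.
 */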
1537
1538/*
1539 * This privilege check should be used by actions and subroutines to
1540 * verify that the user credentials of the process that enabled the
1541 * invoking ECB match the target credentials
1542 */
1543static int
1544dtrace_priv_proc_common_user(dtrace_state_t *state)
1545{
1546 cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1547
1548 /*
1549 * We should always have a non-NULL state cred here, since if cred
1550 * is null (anonymous tracing), we fast-path bypass this routine.
1551 */
1552 ASSERT(s_cr != NULL);
1553
1554 if ((cr = dtrace_CRED()) != NULL &&
1555 posix_cred_get(s_cr)->cr_uid == posix_cred_get(cr)->cr_uid &&
1556 posix_cred_get(s_cr)->cr_uid == posix_cred_get(cr)->cr_ruid &&
1557 posix_cred_get(s_cr)->cr_uid == posix_cred_get(cr)->cr_suid &&
1558 posix_cred_get(s_cr)->cr_gid == posix_cred_get(cr)->cr_gid &&
1559 posix_cred_get(s_cr)->cr_gid == posix_cred_get(cr)->cr_rgid &&
1560 posix_cred_get(s_cr)->cr_gid == posix_cred_get(cr)->cr_sgid)
1561 return (1);
1562
1563 return (0);
1564}
1565
1566/*
1567 * This privilege check should be used by actions and subroutines to
1568 * verify that the zone of the process that enabled the invoking ECB
1569 * matches the target credentials
1570 */
1571static int
1572dtrace_priv_proc_common_zone(dtrace_state_t *state)
1573{
1574 cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1575#pragma unused(cr, s_cr, state) /* __APPLE__ */
1576
1577 /*
1578 * We should always have a non-NULL state cred here, since if cred
1579 * is null (anonymous tracing), we fast-path bypass this routine.
1580 */
1581 ASSERT(s_cr != NULL);
1582
1583 return 1; /* APPLE NOTE: Darwin doesn't do zones. */
1584}
1585
1586/*
1587 * This privilege check should be used by actions and subroutines to
1588 * verify that the process has not setuid or changed credentials.
1589 */
1590static int
1591dtrace_priv_proc_common_nocd(void)
1592{
1593 return 1; /* Darwin omits "No Core Dump" flag. */
1594}
1595
1596static int
1597dtrace_priv_proc_destructive(dtrace_state_t *state)
1598{
1599 int action = state->dts_cred.dcr_action;
1600
1601 if (ISSET(current_proc()->p_lflag, P_LNOATTACH))
1602 goto bad;
1603
1604 if (dtrace_is_restricted() && !dtrace_can_attach_to_proc(current_proc()))
1605 goto bad;
1606
1607 if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE) == 0) &&
1608 dtrace_priv_proc_common_zone(state) == 0)
1609 goto bad;
1610
1611 if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER) == 0) &&
1612 dtrace_priv_proc_common_user(state) == 0)
1613 goto bad;
1614
1615 if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG) == 0) &&
1616 dtrace_priv_proc_common_nocd() == 0)
1617 goto bad;
1618
1619 return (1);
1620
1621bad:
1622 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1623
1624 return (0);
1625}
1626
1627static int
1628dtrace_priv_proc_control(dtrace_state_t *state)
1629{
1630 if (ISSET(current_proc()->p_lflag, P_LNOATTACH))
1631 goto bad;
1632
1633 if (dtrace_is_restricted() && !dtrace_can_attach_to_proc(current_proc()))
1634 goto bad;
1635
1636 if (state->dts_cred.dcr_action & DTRACE_CRA_PROC_CONTROL)
1637 return (1);
1638
1639 if (dtrace_priv_proc_common_zone(state) &&
1640 dtrace_priv_proc_common_user(state) &&
1641 dtrace_priv_proc_common_nocd())
1642 return (1);
1643
1644bad:
1645 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1646
1647 return (0);
1648}
1649
1650static int
1651dtrace_priv_proc(dtrace_state_t *state)
1652{
1653 if (ISSET(current_proc()->p_lflag, P_LNOATTACH))
1654 goto bad;
1655
1656 if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed() && !dtrace_can_attach_to_proc(current_proc()))
1657 goto bad;
1658
1659 if (state->dts_cred.dcr_action & DTRACE_CRA_PROC)
1660 return (1);
1661
1662bad:
1663 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1664
1665 return (0);
1666}
1667
1668/*
1669 * The P_LNOATTACH check is an Apple specific check.
1670 * We need a version of dtrace_priv_proc() that omits
1671 * that check for PID and EXECNAME accesses
1672 */
1673static int
1674dtrace_priv_proc_relaxed(dtrace_state_t *state)
1675{
1676
1677 if (state->dts_cred.dcr_action & DTRACE_CRA_PROC)
1678 return (1);
1679
1680 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1681
1682 return (0);
1683}
1684
1685static int
1686dtrace_priv_kernel(dtrace_state_t *state)
1687{
1688 if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed())
1689 goto bad;
1690
1691 if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL)
1692 return (1);
1693
1694bad:
1695 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1696
1697 return (0);
1698}
1699
1700static int
1701dtrace_priv_kernel_destructive(dtrace_state_t *state)
1702{
1703 if (dtrace_is_restricted())
1704 goto bad;
1705
1706 if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL_DESTRUCTIVE)
1707 return (1);
1708
1709bad:
1710 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1711
1712 return (0);
1713}

/*
 * Note: not called from probe context. This function is called
 * asynchronously (and at a regular interval) from outside of probe context to
 * clean the dirty dynamic variable lists on all CPUs. Dynamic variable
 * cleaning is explained in detail in <sys/dtrace_impl.h>.
 */
static void
dtrace_dynvar_clean(dtrace_dstate_t *dstate)
{
	dtrace_dynvar_t *dirty;
	dtrace_dstate_percpu_t *dcpu;
	int i, work = 0;

	for (i = 0; i < (int)NCPU; i++) {
		dcpu = &dstate->dtds_percpu[i];

		ASSERT(dcpu->dtdsc_rinsing == NULL);

		/*
		 * If the dirty list is NULL, there is no dirty work to do.
		 */
		if (dcpu->dtdsc_dirty == NULL)
			continue;

		/*
		 * If the clean list is non-NULL, then we're not going to do
		 * any work for this CPU -- it means that there has not been
		 * a dtrace_dynvar() allocation on this CPU (or from this CPU)
		 * since the last time we cleaned house.
		 */
		if (dcpu->dtdsc_clean != NULL)
			continue;

		work = 1;

		/*
		 * Atomically move the dirty list aside.
		 */
		do {
			dirty = dcpu->dtdsc_dirty;

			/*
			 * Before we zap the dirty list, set the rinsing list.
			 * (This allows for a potential assertion in
			 * dtrace_dynvar(): if a free dynamic variable appears
			 * on a hash chain, either the dirty list or the
			 * rinsing list for some CPU must be non-NULL.)
			 */
			dcpu->dtdsc_rinsing = dirty;
			dtrace_membar_producer();
		} while (dtrace_casptr(&dcpu->dtdsc_dirty,
		    dirty, NULL) != dirty);
	}

	if (!work) {
		/*
		 * We have no work to do; we can simply return.
		 */
		return;
	}

	dtrace_sync();

	for (i = 0; i < (int)NCPU; i++) {
		dcpu = &dstate->dtds_percpu[i];

		if (dcpu->dtdsc_rinsing == NULL)
			continue;

		/*
		 * We are now guaranteed that no hash chain contains a pointer
		 * into this dirty list; we can make it clean.
		 */
		ASSERT(dcpu->dtdsc_clean == NULL);
		dcpu->dtdsc_clean = dcpu->dtdsc_rinsing;
		dcpu->dtdsc_rinsing = NULL;
	}

	/*
	 * Before we actually set the state to be DTRACE_DSTATE_CLEAN, make
	 * sure that all CPUs have seen all of the dtdsc_clean pointers.
	 * This prevents a race whereby a CPU incorrectly decides that
	 * the state should be something other than DTRACE_DSTATE_CLEAN
	 * after dtrace_dynvar_clean() has completed.
	 */
	dtrace_sync();

	dstate->dtds_state = DTRACE_DSTATE_CLEAN;
}
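
/*
 * The detach in the loop above is an instance of a general lock-free
 * idiom: atomically swing a singly-linked list head aside while keeping
 * ownership of the old chain. A user-space sketch in C11 atomics
 * (illustrative only; the kernel uses dtrace_casptr() and explicit
 * memory barriers rather than <stdatomic.h>):
 */
#if 0
#include <stdatomic.h>
#include <stddef.h>

struct node { struct node *next; };

static struct node *
detach_all(struct node *_Atomic *headp)
{
	struct node *head;

	/* Retry until we swap the head to NULL without interference. */
	do {
		head = atomic_load(headp);
	} while (!atomic_compare_exchange_weak(headp, &head, NULL));

	return (head);	/* caller now owns the detached chain */
}
#endif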

/*
 * Depending on the value of the op parameter, this function looks up,
 * allocates or deallocates an arbitrarily-keyed dynamic variable. If an
 * allocation is requested, this function will return a pointer to a
 * dtrace_dynvar_t corresponding to the allocated variable -- or NULL if no
 * variable can be allocated. If NULL is returned, the appropriate counter
 * will be incremented.
 */
static dtrace_dynvar_t *
dtrace_dynvar(dtrace_dstate_t *dstate, uint_t nkeys,
    dtrace_key_t *key, size_t dsize, dtrace_dynvar_op_t op,
    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
{
	uint64_t hashval = DTRACE_DYNHASH_VALID;
	dtrace_dynhash_t *hash = dstate->dtds_hash;
	dtrace_dynvar_t *free, *new_free, *next, *dvar, *start, *prev = NULL;
	processorid_t me = CPU->cpu_id, cpu = me;
	dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[me];
	size_t bucket, ksize;
	size_t chunksize = dstate->dtds_chunksize;
	uintptr_t kdata, lock, nstate;
	uint_t i;

	ASSERT(nkeys != 0);

	/*
	 * Hash the key. As with aggregations, we use Jenkins' "One-at-a-time"
	 * algorithm. For the by-value portions, we perform the algorithm in
	 * 16-bit chunks (as opposed to 8-bit chunks). This speeds things up a
	 * bit, and seems to have only a minute effect on distribution. For
	 * the by-reference data, we perform "One-at-a-time" iterating (safely)
	 * over each referenced byte. It's painful to do this, but it's much
	 * better than pathological hash distribution. The efficacy of the
	 * hashing algorithm (and a comparison with other algorithms) may be
	 * found by running the ::dtrace_dynstat MDB dcmd.
	 */
	for (i = 0; i < nkeys; i++) {
		if (key[i].dttk_size == 0) {
			uint64_t val = key[i].dttk_value;

			hashval += (val >> 48) & 0xffff;
			hashval += (hashval << 10);
			hashval ^= (hashval >> 6);

			hashval += (val >> 32) & 0xffff;
			hashval += (hashval << 10);
			hashval ^= (hashval >> 6);

			hashval += (val >> 16) & 0xffff;
			hashval += (hashval << 10);
			hashval ^= (hashval >> 6);

			hashval += val & 0xffff;
			hashval += (hashval << 10);
			hashval ^= (hashval >> 6);
		} else {
			/*
			 * This is incredibly painful, but it beats the hell
			 * out of the alternative.
			 */
			uint64_t j, size = key[i].dttk_size;
			uintptr_t base = (uintptr_t)key[i].dttk_value;

			if (!dtrace_canload(base, size, mstate, vstate))
				break;

			for (j = 0; j < size; j++) {
				hashval += dtrace_load8(base + j);
				hashval += (hashval << 10);
				hashval ^= (hashval >> 6);
			}
		}
	}

	if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT))
		return (NULL);

	hashval += (hashval << 3);
	hashval ^= (hashval >> 11);
	hashval += (hashval << 15);

	/*
	 * There is a remote chance (ideally, 1 in 2^31) that our hashval
	 * comes out to be one of our two sentinel hash values. If this
	 * actually happens, we set the hashval to be a value known to be a
	 * non-sentinel value.
	 */
	if (hashval == DTRACE_DYNHASH_FREE || hashval == DTRACE_DYNHASH_SINK)
		hashval = DTRACE_DYNHASH_VALID;

	/*
	 * Yes, it's painful to do a divide here. If the cycle count becomes
	 * important here, tricks can be pulled to reduce it. (However, it's
	 * critical that hash collisions be kept to an absolute minimum;
	 * they're much more painful than a divide.) It's better to have a
	 * solution that generates few collisions and still keeps things
	 * relatively simple.
	 */
	bucket = hashval % dstate->dtds_hashsize;

	if (op == DTRACE_DYNVAR_DEALLOC) {
		volatile uintptr_t *lockp = &hash[bucket].dtdh_lock;

		for (;;) {
			while ((lock = *lockp) & 1)
				continue;

			if (dtrace_casptr((void *)(uintptr_t)lockp,
			    (void *)lock, (void *)(lock + 1)) == (void *)lock)
				break;
		}

		dtrace_membar_producer();
	}

top:
	prev = NULL;
	lock = hash[bucket].dtdh_lock;

	dtrace_membar_consumer();

	start = hash[bucket].dtdh_chain;
	ASSERT(start != NULL && (start->dtdv_hashval == DTRACE_DYNHASH_SINK ||
	    start->dtdv_hashval != DTRACE_DYNHASH_FREE ||
	    op != DTRACE_DYNVAR_DEALLOC));

	for (dvar = start; dvar != NULL; dvar = dvar->dtdv_next) {
		dtrace_tuple_t *dtuple = &dvar->dtdv_tuple;
		dtrace_key_t *dkey = &dtuple->dtt_key[0];

		if (dvar->dtdv_hashval != hashval) {
			if (dvar->dtdv_hashval == DTRACE_DYNHASH_SINK) {
				/*
				 * We've reached the sink, and therefore the
				 * end of the hash chain; we can kick out of
				 * the loop knowing that we have seen a valid
				 * snapshot of state.
				 */
				ASSERT(dvar->dtdv_next == NULL);
				ASSERT(dvar == &dtrace_dynhash_sink);
				break;
			}

			if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE) {
				/*
				 * We've gone off the rails: somewhere along
				 * the line, one of the members of this hash
				 * chain was deleted. Note that we could also
				 * detect this by simply letting this loop run
				 * to completion, as we would eventually hit
				 * the end of the dirty list. However, we
				 * want to avoid running the length of the
				 * dirty list unnecessarily (it might be quite
				 * long), so we catch this as early as
				 * possible by detecting the hash marker. In
				 * this case, we simply set dvar to NULL and
				 * break; the conditional after the loop will
				 * send us back to top.
				 */
				dvar = NULL;
				break;
			}

			goto next;
		}

		if (dtuple->dtt_nkeys != nkeys)
			goto next;

		for (i = 0; i < nkeys; i++, dkey++) {
			if (dkey->dttk_size != key[i].dttk_size)
				goto next; /* size or type mismatch */

			if (dkey->dttk_size != 0) {
				if (dtrace_bcmp(
				    (void *)(uintptr_t)key[i].dttk_value,
				    (void *)(uintptr_t)dkey->dttk_value,
				    dkey->dttk_size))
					goto next;
			} else {
				if (dkey->dttk_value != key[i].dttk_value)
					goto next;
			}
		}

		if (op != DTRACE_DYNVAR_DEALLOC)
			return (dvar);

		ASSERT(dvar->dtdv_next == NULL ||
		    dvar->dtdv_next->dtdv_hashval != DTRACE_DYNHASH_FREE);

		if (prev != NULL) {
			ASSERT(hash[bucket].dtdh_chain != dvar);
			ASSERT(start != dvar);
			ASSERT(prev->dtdv_next == dvar);
			prev->dtdv_next = dvar->dtdv_next;
		} else {
			if (dtrace_casptr(&hash[bucket].dtdh_chain,
			    start, dvar->dtdv_next) != start) {
				/*
				 * We have failed to atomically swing the
				 * hash table head pointer, presumably because
				 * of a conflicting allocation on another CPU.
				 * We need to reread the hash chain and try
				 * again.
				 */
				goto top;
			}
		}

		dtrace_membar_producer();

		/*
		 * Now set the hash value to indicate that it's free.
		 */
		ASSERT(hash[bucket].dtdh_chain != dvar);
		dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;

		dtrace_membar_producer();

		/*
		 * Set the next pointer to point at the dirty list, and
		 * atomically swing the dirty pointer to the newly freed dvar.
		 */
		do {
			next = dcpu->dtdsc_dirty;
			dvar->dtdv_next = next;
		} while (dtrace_casptr(&dcpu->dtdsc_dirty, next, dvar) != next);

		/*
		 * Finally, unlock this hash bucket.
		 */
		ASSERT(hash[bucket].dtdh_lock == lock);
		ASSERT(lock & 1);
		hash[bucket].dtdh_lock++;

		return (NULL);
next:
		prev = dvar;
		continue;
	}

	if (dvar == NULL) {
		/*
		 * If dvar is NULL, it is because we went off the rails:
		 * one of the elements that we traversed in the hash chain
		 * was deleted while we were traversing it. In this case,
		 * we assert that we aren't doing a dealloc (deallocs lock
		 * the hash bucket to prevent themselves from racing with
		 * one another), and retry the hash chain traversal.
		 */
		ASSERT(op != DTRACE_DYNVAR_DEALLOC);
		goto top;
	}

	if (op != DTRACE_DYNVAR_ALLOC) {
		/*
		 * If we are not to allocate a new variable, we want to
		 * return NULL now. Before we return, check that the value
		 * of the lock word hasn't changed. If it has, we may have
		 * seen an inconsistent snapshot.
		 */
		if (op == DTRACE_DYNVAR_NOALLOC) {
			if (hash[bucket].dtdh_lock != lock)
				goto top;
		} else {
			ASSERT(op == DTRACE_DYNVAR_DEALLOC);
			ASSERT(hash[bucket].dtdh_lock == lock);
			ASSERT(lock & 1);
			hash[bucket].dtdh_lock++;
		}

		return (NULL);
	}

	/*
	 * We need to allocate a new dynamic variable. The size we need is the
	 * size of dtrace_dynvar plus the size of nkeys dtrace_key_t's plus the
	 * size of any auxiliary key data (rounded up to 8-byte alignment) plus
	 * the size of any referred-to data (dsize). We then round the final
	 * size up to the chunksize for allocation.
	 */
	for (ksize = 0, i = 0; i < nkeys; i++)
		ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));

	/*
	 * This should be pretty much impossible, but could happen if, say,
	 * strange DIF specified the tuple. Ideally, this should be an
	 * assertion and not an error condition -- but that requires that the
	 * chunksize calculation in dtrace_difo_chunksize() be absolutely
	 * bullet-proof. (That is, it must not be able to be fooled by
	 * malicious DIF.) Given the lack of backwards branches in DIF,
	 * solving this would presumably not amount to solving the Halting
	 * Problem -- but it still seems awfully hard.
	 */
	if (sizeof (dtrace_dynvar_t) + sizeof (dtrace_key_t) * (nkeys - 1) +
	    ksize + dsize > chunksize) {
		dcpu->dtdsc_drops++;
		return (NULL);
	}

	nstate = DTRACE_DSTATE_EMPTY;

	do {
retry:
		free = dcpu->dtdsc_free;

		if (free == NULL) {
			dtrace_dynvar_t *clean = dcpu->dtdsc_clean;
			void *rval;

			if (clean == NULL) {
				/*
				 * We're out of dynamic variable space on
				 * this CPU. Unless we have tried all CPUs,
				 * we'll try to allocate from a different
				 * CPU.
				 */
				switch (dstate->dtds_state) {
				case DTRACE_DSTATE_CLEAN: {
					void *sp = &dstate->dtds_state;

					if (++cpu >= (int)NCPU)
						cpu = 0;

					if (dcpu->dtdsc_dirty != NULL &&
					    nstate == DTRACE_DSTATE_EMPTY)
						nstate = DTRACE_DSTATE_DIRTY;

					if (dcpu->dtdsc_rinsing != NULL)
						nstate = DTRACE_DSTATE_RINSING;

					dcpu = &dstate->dtds_percpu[cpu];

					if (cpu != me)
						goto retry;

					(void) dtrace_cas32(sp,
					    DTRACE_DSTATE_CLEAN, nstate);

					/*
					 * To increment the correct bean
					 * counter, take another lap.
					 */
					goto retry;
				}

				case DTRACE_DSTATE_DIRTY:
					dcpu->dtdsc_dirty_drops++;
					break;

				case DTRACE_DSTATE_RINSING:
					dcpu->dtdsc_rinsing_drops++;
					break;

				case DTRACE_DSTATE_EMPTY:
					dcpu->dtdsc_drops++;
					break;
				}

				DTRACE_CPUFLAG_SET(CPU_DTRACE_DROP);
				return (NULL);
			}

			/*
			 * The clean list appears to be non-empty. We want to
			 * move the clean list to the free list; we start by
			 * moving the clean pointer aside.
			 */
			if (dtrace_casptr(&dcpu->dtdsc_clean,
			    clean, NULL) != clean) {
				/*
				 * We are in one of two situations:
				 *
				 * (a)	The clean list was switched to the
				 *	free list by another CPU.
				 *
				 * (b)	The clean list was added to by the
				 *	cleansing cyclic.
				 *
				 * In either of these situations, we can
				 * just reattempt the free list allocation.
				 */
				goto retry;
			}

			ASSERT(clean->dtdv_hashval == DTRACE_DYNHASH_FREE);

			/*
			 * Now we'll move the clean list to the free list.
			 * It's impossible for this to fail: the only way
			 * the free list can be updated is through this
			 * code path, and only one CPU can own the clean list.
			 * Thus, it would only be possible for this to fail if
			 * this code were racing with dtrace_dynvar_clean().
			 * (That is, if dtrace_dynvar_clean() updated the clean
			 * list, and we ended up racing to update the free
			 * list.) This race is prevented by the dtrace_sync()
			 * in dtrace_dynvar_clean() -- which flushes the
			 * owners of the clean lists out before resetting
			 * the clean lists.
			 */
			rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean);
			ASSERT(rval == NULL);
			goto retry;
		}

		dvar = free;
		new_free = dvar->dtdv_next;
	} while (dtrace_casptr(&dcpu->dtdsc_free, free, new_free) != free);

	/*
	 * We have now allocated a new chunk. We copy the tuple keys into the
	 * tuple array and copy any referenced key data into the data space
	 * following the tuple array. As we do this, we relocate dttk_value
	 * in the final tuple to point to the key data address in the chunk.
	 */
	kdata = (uintptr_t)&dvar->dtdv_tuple.dtt_key[nkeys];
	dvar->dtdv_data = (void *)(kdata + ksize);
	dvar->dtdv_tuple.dtt_nkeys = nkeys;

	for (i = 0; i < nkeys; i++) {
		dtrace_key_t *dkey = &dvar->dtdv_tuple.dtt_key[i];
		size_t kesize = key[i].dttk_size;

		if (kesize != 0) {
			dtrace_bcopy(
			    (const void *)(uintptr_t)key[i].dttk_value,
			    (void *)kdata, kesize);
			dkey->dttk_value = kdata;
			kdata += P2ROUNDUP(kesize, sizeof (uint64_t));
		} else {
			dkey->dttk_value = key[i].dttk_value;
		}

		dkey->dttk_size = kesize;
	}

	ASSERT(dvar->dtdv_hashval == DTRACE_DYNHASH_FREE);
	dvar->dtdv_hashval = hashval;
	dvar->dtdv_next = start;

	if (dtrace_casptr(&hash[bucket].dtdh_chain, start, dvar) == start)
		return (dvar);

	/*
	 * The cas has failed. Either another CPU is adding an element to
	 * this hash chain, or another CPU is deleting an element from this
	 * hash chain. The simplest way to deal with both of these cases
	 * (though not necessarily the most efficient) is to free our
	 * allocated block and tail-call ourselves. Note that the free is
	 * to the dirty list and _not_ to the free list. This is to prevent
	 * races with allocators, above.
	 */
	dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;

	dtrace_membar_producer();

	do {
		free = dcpu->dtdsc_dirty;
		dvar->dtdv_next = free;
	} while (dtrace_casptr(&dcpu->dtdsc_dirty, free, dvar) != free);

	return (dtrace_dynvar(dstate, nkeys, key, dsize, op, mstate, vstate));
}
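
/*
 * The "One-at-a-time" hash used above (and again by dtrace_aggregate(),
 * below) is easy to study in isolation. A self-contained, user-space
 * rendering of the byte-wise variant together with the final avalanche
 * (illustrative only, not kernel code):
 */
#if 0
#include <stddef.h>
#include <stdint.h>

static uint64_t
jenkins_one_at_a_time(const uint8_t *data, size_t len)
{
	uint64_t hashval = 0;
	size_t i;

	/* Mix in one byte at a time. */
	for (i = 0; i < len; i++) {
		hashval += data[i];
		hashval += (hashval << 10);
		hashval ^= (hashval >> 6);
	}

	/* Final avalanche, as performed after the key loop above. */
	hashval += (hashval << 3);
	hashval ^= (hashval >> 11);
	hashval += (hashval << 15);

	return (hashval);
}
#endif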

/*ARGSUSED*/
static void
dtrace_aggregate_min(uint64_t *oval, uint64_t nval, uint64_t arg)
{
#pragma unused(arg) /* __APPLE__ */
	if ((int64_t)nval < (int64_t)*oval)
		*oval = nval;
}

/*ARGSUSED*/
static void
dtrace_aggregate_max(uint64_t *oval, uint64_t nval, uint64_t arg)
{
#pragma unused(arg) /* __APPLE__ */
	if ((int64_t)nval > (int64_t)*oval)
		*oval = nval;
}

static void
dtrace_aggregate_quantize(uint64_t *quanta, uint64_t nval, uint64_t incr)
{
	int i, zero = DTRACE_QUANTIZE_ZEROBUCKET;
	int64_t val = (int64_t)nval;

	if (val < 0) {
		for (i = 0; i < zero; i++) {
			if (val <= DTRACE_QUANTIZE_BUCKETVAL(i)) {
				quanta[i] += incr;
				return;
			}
		}
	} else {
		for (i = zero + 1; i < DTRACE_QUANTIZE_NBUCKETS; i++) {
			if (val < DTRACE_QUANTIZE_BUCKETVAL(i)) {
				quanta[i - 1] += incr;
				return;
			}
		}

		quanta[DTRACE_QUANTIZE_NBUCKETS - 1] += incr;
		return;
	}

	ASSERT(0);
}
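
/*
 * Worked example (bucket values assumed from <sys/dtrace.h>): the
 * positive bucket labels above are successive powers of two, so a value
 * of 37 is first exceeded by the label 64 and is therefore credited to
 * the preceding bucket -- the one labelled 32, covering [32, 64).
 * Negative values walk the mirror-image buckets below the zero bucket
 * in the same fashion.
 */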

static void
dtrace_aggregate_lquantize(uint64_t *lquanta, uint64_t nval, uint64_t incr)
{
	uint64_t arg = *lquanta++;
	int32_t base = DTRACE_LQUANTIZE_BASE(arg);
	uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
	uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
	int32_t val = (int32_t)nval, level;

	ASSERT(step != 0);
	ASSERT(levels != 0);

	if (val < base) {
		/*
		 * This is an underflow.
		 */
		lquanta[0] += incr;
		return;
	}

	level = (val - base) / step;

	if (level < levels) {
		lquanta[level + 1] += incr;
		return;
	}

	/*
	 * This is an overflow.
	 */
	lquanta[levels + 1] += incr;
}
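
/*
 * Worked example (parameters assumed for illustration): for
 * lquantize(x, 0, 100, 10) -- base 0, step 10, ten levels -- a value
 * of 37 yields level (37 - 0) / 10 = 3 and so increments lquanta[4],
 * the bucket covering [30, 40). Values below 0 land in the underflow
 * bucket lquanta[0], and values of 100 or more in the overflow bucket
 * lquanta[levels + 1].
 */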

static int
dtrace_aggregate_llquantize_bucket(int16_t factor, int16_t low, int16_t high,
    int16_t nsteps, int64_t value)
{
	int64_t this = 1, last, next;
	int base = 1, order;

	for (order = 0; order < low; ++order)
		this *= factor;

	/*
	 * If our value is less than our factor taken to the power of the
	 * low order of magnitude, it goes into the zeroth bucket.
	 */
	if (value < this)
		return 0;
	else
		last = this;

	for (this *= factor; order <= high; ++order) {
		int nbuckets = this > nsteps ? nsteps : this;

		/*
		 * We should not generally get log/linear quantizations
		 * with a high magnitude that allows 64-bits to
		 * overflow, but we nonetheless protect against this
		 * by explicitly checking for overflow, and clamping
		 * our value accordingly.
		 */
		next = this * factor;
		if (next < this) {
			value = this - 1;
		}

		/*
		 * If our value lies within this order of magnitude,
		 * determine its position by taking the offset within
		 * the order of magnitude, dividing by the bucket
		 * width, and adding to our (accumulated) base.
		 */
		if (value < this) {
			return (base + (value - last) / (this / nbuckets));
		}

		base += nbuckets - (nbuckets / factor);
		last = this;
		this = next;
	}

	/*
	 * Our value is greater than or equal to our factor taken to the
	 * power of one plus the high magnitude -- return the top bucket.
	 */
	return base;
}
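
/*
 * Worked example (parameters assumed for illustration): with factor 10,
 * low magnitude 0, high magnitude 2 and 10 steps per order, bucket 0
 * holds values below 1; buckets 1-9 cover [1, 10) in steps of 1;
 * buckets 10-18 cover [10, 100) in steps of 10; buckets 19-27 cover
 * [100, 1000) in steps of 100; and everything at or above 1000 (the
 * factor raised to high + 1) lands in the top bucket. A value of 37
 * thus returns base 10 plus (37 - 10) / 10, i.e. bucket 12, which
 * covers [30, 40).
 */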

static void
dtrace_aggregate_llquantize(uint64_t *llquanta, uint64_t nval, uint64_t incr)
{
	uint64_t arg = *llquanta++;
	uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(arg);
	uint16_t low = DTRACE_LLQUANTIZE_LOW(arg);
	uint16_t high = DTRACE_LLQUANTIZE_HIGH(arg);
	uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(arg);

	llquanta[dtrace_aggregate_llquantize_bucket(factor, low, high, nsteps, nval)] += incr;
}

/*ARGSUSED*/
static void
dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg)
{
#pragma unused(arg) /* __APPLE__ */
	data[0]++;
	data[1] += nval;
}

/*ARGSUSED*/
static void
dtrace_aggregate_stddev(uint64_t *data, uint64_t nval, uint64_t arg)
{
#pragma unused(arg) /* __APPLE__ */
	int64_t snval = (int64_t)nval;
	uint64_t tmp[2];

	data[0]++;
	data[1] += nval;

	/*
	 * What we want to say here is:
	 *
	 * data[2] += nval * nval;
	 *
	 * But given that nval is 64-bit, we could easily overflow, so
	 * we do this as 128-bit arithmetic.
	 */
	if (snval < 0)
		snval = -snval;

	dtrace_multiply_128((uint64_t)snval, (uint64_t)snval, tmp);
	dtrace_add_128(data + 2, tmp, data + 2);
}
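
/*
 * On compilers that provide a 128-bit integer type, the overflow-safe
 * square-and-accumulate above can be written directly. A sketch under
 * that assumption (the kernel instead uses dtrace_multiply_128() and
 * dtrace_add_128() so as not to depend on such a type; the low-word-first
 * layout of data[2] and data[3] is assumed here for illustration):
 */
#if 0
static void
stddev_accumulate(uint64_t *data, uint64_t nval)
{
	int64_t snval = (int64_t)nval;
	unsigned __int128 sq;

	data[0]++;		/* count of samples */
	data[1] += nval;	/* running sum */

	if (snval < 0)
		snval = -snval;

	/* 128-bit sum of squares, assumed to be stored low word first. */
	sq = (unsigned __int128)snval * (unsigned __int128)snval;
	sq += ((unsigned __int128)data[3] << 64) | data[2];
	data[2] = (uint64_t)sq;
	data[3] = (uint64_t)(sq >> 64);
}
#endif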

/*ARGSUSED*/
static void
dtrace_aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg)
{
#pragma unused(nval, arg) /* __APPLE__ */
	*oval = *oval + 1;
}

/*ARGSUSED*/
static void
dtrace_aggregate_sum(uint64_t *oval, uint64_t nval, uint64_t arg)
{
#pragma unused(arg) /* __APPLE__ */
	*oval += nval;
}

/*
 * Aggregate given the tuple in the principal data buffer, and the aggregating
 * action denoted by the specified dtrace_aggregation_t. The aggregation
 * buffer is specified as the buf parameter. This routine does not return
 * failure; if there is no space in the aggregation buffer, the data will be
 * dropped, and a corresponding counter incremented.
 */
static void
dtrace_aggregate(dtrace_aggregation_t *agg, dtrace_buffer_t *dbuf,
    intptr_t offset, dtrace_buffer_t *buf, uint64_t expr, uint64_t arg)
{
#pragma unused(arg)
	dtrace_recdesc_t *rec = &agg->dtag_action.dta_rec;
	uint32_t i, ndx, size, fsize;
	uint32_t align = sizeof (uint64_t) - 1;
	dtrace_aggbuffer_t *agb;
	dtrace_aggkey_t *key;
	uint32_t hashval = 0, limit, isstr;
	caddr_t tomax, data, kdata;
	dtrace_actkind_t action;
	dtrace_action_t *act;
	uintptr_t offs;

	if (buf == NULL)
		return;

	if (!agg->dtag_hasarg) {
		/*
		 * Currently, only quantize() and lquantize() take additional
		 * arguments, and they have the same semantics: an increment
		 * value that defaults to 1 when not present. If additional
		 * aggregating actions take arguments, the setting of the
		 * default argument value will presumably have to become more
		 * sophisticated...
		 */
		arg = 1;
	}

	action = agg->dtag_action.dta_kind - DTRACEACT_AGGREGATION;
	size = rec->dtrd_offset - agg->dtag_base;
	fsize = size + rec->dtrd_size;

	ASSERT(dbuf->dtb_tomax != NULL);
	data = dbuf->dtb_tomax + offset + agg->dtag_base;

	if ((tomax = buf->dtb_tomax) == NULL) {
		dtrace_buffer_drop(buf);
		return;
	}

	/*
	 * The metastructure is always at the bottom of the buffer.
	 */
	agb = (dtrace_aggbuffer_t *)(tomax + buf->dtb_size -
	    sizeof (dtrace_aggbuffer_t));

	if (buf->dtb_offset == 0) {
		/*
		 * We just kludge up approximately 1/8th of the size to be
		 * buckets. If this guess ends up being routinely
		 * off-the-mark, we may need to dynamically readjust this
		 * based on past performance.
		 */
		uintptr_t hashsize = (buf->dtb_size >> 3) / sizeof (uintptr_t);

		if ((uintptr_t)agb - hashsize * sizeof (dtrace_aggkey_t *) <
		    (uintptr_t)tomax || hashsize == 0) {
			/*
			 * We've been given a ludicrously small buffer;
			 * increment our drop count and leave.
			 */
			dtrace_buffer_drop(buf);
			return;
		}

		/*
		 * And now, a pathetic attempt to try to get an odd (or
		 * perchance, a prime) hash size for better hash distribution.
		 */
		if (hashsize > (DTRACE_AGGHASHSIZE_SLEW << 3))
			hashsize -= DTRACE_AGGHASHSIZE_SLEW;

		agb->dtagb_hashsize = hashsize;
		agb->dtagb_hash = (dtrace_aggkey_t **)((uintptr_t)agb -
		    agb->dtagb_hashsize * sizeof (dtrace_aggkey_t *));
		agb->dtagb_free = (uintptr_t)agb->dtagb_hash;

		for (i = 0; i < agb->dtagb_hashsize; i++)
			agb->dtagb_hash[i] = NULL;
	}

	ASSERT(agg->dtag_first != NULL);
	ASSERT(agg->dtag_first->dta_intuple);

	/*
	 * Calculate the hash value based on the key. Note that we _don't_
	 * include the aggid in the hashing (but we will store it as part of
	 * the key). The hashing algorithm is Bob Jenkins' "One-at-a-time"
	 * algorithm: a simple, quick algorithm that has no known funnels, and
	 * gets good distribution in practice. The efficacy of the hashing
	 * algorithm (and a comparison with other algorithms) may be found by
	 * running the ::dtrace_aggstat MDB dcmd.
	 */
	for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
		i = act->dta_rec.dtrd_offset - agg->dtag_base;
		limit = i + act->dta_rec.dtrd_size;
		ASSERT(limit <= size);
		isstr = DTRACEACT_ISSTRING(act);

		for (; i < limit; i++) {
			hashval += data[i];
			hashval += (hashval << 10);
			hashval ^= (hashval >> 6);

			if (isstr && data[i] == '\0')
				break;
		}
	}

	hashval += (hashval << 3);
	hashval ^= (hashval >> 11);
	hashval += (hashval << 15);

	/*
	 * Yes, the divide here is expensive -- but it's generally the least
	 * of the performance issues given the amount of data that we iterate
	 * over to compute hash values, compare data, etc.
	 */
	ndx = hashval % agb->dtagb_hashsize;

	for (key = agb->dtagb_hash[ndx]; key != NULL; key = key->dtak_next) {
		ASSERT((caddr_t)key >= tomax);
		ASSERT((caddr_t)key < tomax + buf->dtb_size);

		if (hashval != key->dtak_hashval || key->dtak_size != size)
			continue;

		kdata = key->dtak_data;
		ASSERT(kdata >= tomax && kdata < tomax + buf->dtb_size);

		for (act = agg->dtag_first; act->dta_intuple;
		    act = act->dta_next) {
			i = act->dta_rec.dtrd_offset - agg->dtag_base;
			limit = i + act->dta_rec.dtrd_size;
			ASSERT(limit <= size);
			isstr = DTRACEACT_ISSTRING(act);

			for (; i < limit; i++) {
				if (kdata[i] != data[i])
					goto next;

				if (isstr && data[i] == '\0')
					break;
			}
		}

		if (action != key->dtak_action) {
			/*
			 * We are aggregating on the same value in the same
			 * aggregation with two different aggregating actions.
			 * (This should have been picked up in the compiler,
			 * so we may be dealing with errant or devious DIF.)
			 * This is an error condition; we indicate as much,
			 * and return.
			 */
			DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
			return;
		}

		/*
		 * This is a hit: we need to apply the aggregator to
		 * the value at this key.
		 */
		agg->dtag_aggregate((uint64_t *)(kdata + size), expr, arg);
		return;
next:
		continue;
	}

	/*
	 * We didn't find it. We need to allocate some zero-filled space,
	 * link it into the hash table appropriately, and apply the aggregator
	 * to the (zero-filled) value.
	 */
	offs = buf->dtb_offset;
	while (offs & (align - 1))
		offs += sizeof (uint32_t);

	/*
	 * If we don't have enough room to both allocate a new key _and_
	 * its associated data, increment the drop count and return.
	 */
	if ((uintptr_t)tomax + offs + fsize >
	    agb->dtagb_free - sizeof (dtrace_aggkey_t)) {
		dtrace_buffer_drop(buf);
		return;
	}

	/*CONSTCOND*/
	ASSERT(!(sizeof (dtrace_aggkey_t) & (sizeof (uintptr_t) - 1)));
	key = (dtrace_aggkey_t *)(agb->dtagb_free - sizeof (dtrace_aggkey_t));
	agb->dtagb_free -= sizeof (dtrace_aggkey_t);

	key->dtak_data = kdata = tomax + offs;
	buf->dtb_offset = offs + fsize;

	/*
	 * Now copy the data across.
	 */
	*((dtrace_aggid_t *)kdata) = agg->dtag_id;

	for (i = sizeof (dtrace_aggid_t); i < size; i++)
		kdata[i] = data[i];

	/*
	 * Because strings are not zeroed out by default, we need to iterate
	 * looking for actions that store strings, and we need to explicitly
	 * pad these strings out with zeroes.
	 */
	for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
		int nul;

		if (!DTRACEACT_ISSTRING(act))
			continue;

		i = act->dta_rec.dtrd_offset - agg->dtag_base;
		limit = i + act->dta_rec.dtrd_size;
		ASSERT(limit <= size);

		for (nul = 0; i < limit; i++) {
			if (nul) {
				kdata[i] = '\0';
				continue;
			}

			if (data[i] != '\0')
				continue;

			nul = 1;
		}
	}

	for (i = size; i < fsize; i++)
		kdata[i] = 0;

	key->dtak_hashval = hashval;
	key->dtak_size = size;
	key->dtak_action = action;
	key->dtak_next = agb->dtagb_hash[ndx];
	agb->dtagb_hash[ndx] = key;

	/*
	 * Finally, apply the aggregator.
	 */
	*((uint64_t *)(key->dtak_data + size)) = agg->dtag_initial;
	agg->dtag_aggregate((uint64_t *)(key->dtak_data + size), expr, arg);
}
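
/*
 * To summarize the buffer discipline above: key/data records grow up
 * from the bottom of the aggregation buffer, while the metastructure,
 * the hash bucket array and the dtrace_aggkey_t entries are carved
 * downward from the top; dtagb_free marks the descending frontier, and
 * an allocation that would cross it is accounted as a drop instead.
 * Schematically:
 *
 *	+----------------------------+  <- dtb_tomax + dtb_size
 *	| dtrace_aggbuffer_t         |
 *	| hash buckets               |
 *	| dtrace_aggkey_t's (down)   |
 *	+----------------------------+  <- dtagb_free
 *	|         (unused)           |
 *	+----------------------------+  <- dtb_tomax + dtb_offset
 *	| key data + values (up)     |
 *	+----------------------------+  <- dtb_tomax
 */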

/*
 * Given consumer state, this routine finds a speculation in the INACTIVE
 * state and transitions it into the ACTIVE state. If there is no speculation
 * in the INACTIVE state, 0 is returned. In this case, no error counter is
 * incremented -- it is up to the caller to take appropriate action.
 */
static int
dtrace_speculation(dtrace_state_t *state)
{
	int i = 0;
	dtrace_speculation_state_t current;
	uint32_t *stat = &state->dts_speculations_unavail, count;

	while (i < state->dts_nspeculations) {
		dtrace_speculation_t *spec = &state->dts_speculations[i];

		current = spec->dtsp_state;

		if (current != DTRACESPEC_INACTIVE) {
			if (current == DTRACESPEC_COMMITTINGMANY ||
			    current == DTRACESPEC_COMMITTING ||
			    current == DTRACESPEC_DISCARDING)
				stat = &state->dts_speculations_busy;
			i++;
			continue;
		}

		if (dtrace_cas32((uint32_t *)&spec->dtsp_state,
		    current, DTRACESPEC_ACTIVE) == current)
			return (i + 1);
	}

	/*
	 * We couldn't find a speculation. If we found as much as a single
	 * busy speculation buffer, we'll attribute this failure as "busy"
	 * instead of "unavail".
	 */
	do {
		count = *stat;
	} while (dtrace_cas32(stat, count, count + 1) != count);

	return (0);
}

/*
 * This routine commits an active speculation. If the specified speculation
 * is not in a valid state to perform a commit(), this routine will silently do
 * nothing. The state of the specified speculation is transitioned according
 * to the state transition diagram outlined in <sys/dtrace_impl.h>.
 */
static void
dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu,
    dtrace_specid_t which)
{
	dtrace_speculation_t *spec;
	dtrace_buffer_t *src, *dest;
	uintptr_t daddr, saddr, dlimit, slimit;
	dtrace_speculation_state_t current, new = DTRACESPEC_INACTIVE;
	intptr_t offs;
	uint64_t timestamp;

	if (which == 0)
		return;

	if (which > (dtrace_specid_t)state->dts_nspeculations) {
		cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
		return;
	}

	spec = &state->dts_speculations[which - 1];
	src = &spec->dtsp_buffer[cpu];
	dest = &state->dts_buffer[cpu];

	do {
		current = spec->dtsp_state;

		if (current == DTRACESPEC_COMMITTINGMANY)
			break;

		switch (current) {
		case DTRACESPEC_INACTIVE:
		case DTRACESPEC_DISCARDING:
			return;

		case DTRACESPEC_COMMITTING:
			/*
			 * This is only possible if we are (a) commit()'ing
			 * without having done a prior speculate() on this CPU
			 * and (b) racing with another commit() on a different
			 * CPU. There's nothing to do -- we just assert that
			 * our offset is 0.
			 */
			ASSERT(src->dtb_offset == 0);
			return;

		case DTRACESPEC_ACTIVE:
			new = DTRACESPEC_COMMITTING;
			break;

		case DTRACESPEC_ACTIVEONE:
			/*
			 * This speculation is active on one CPU. If our
			 * buffer offset is non-zero, we know that the one CPU
			 * must be us. Otherwise, we are committing on a
			 * different CPU from the speculate(), and we must
			 * rely on being asynchronously cleaned.
			 */
			if (src->dtb_offset != 0) {
				new = DTRACESPEC_COMMITTING;
				break;
			}
			/*FALLTHROUGH*/

		case DTRACESPEC_ACTIVEMANY:
			new = DTRACESPEC_COMMITTINGMANY;
			break;

		default:
			ASSERT(0);
		}
	} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
	    current, new) != current);

	/*
	 * We have set the state to indicate that we are committing this
	 * speculation. Now reserve the necessary space in the destination
	 * buffer.
	 */
	if ((offs = dtrace_buffer_reserve(dest, src->dtb_offset,
	    sizeof (uint64_t), state, NULL)) < 0) {
		dtrace_buffer_drop(dest);
		goto out;
	}

	/*
	 * We have sufficient space to copy the speculative buffer into the
	 * primary buffer. First, modify the speculative buffer, filling
	 * in the timestamp of all entries with the current time. The data
	 * must have the commit() time rather than the time it was traced,
	 * so that all entries in the primary buffer are in timestamp order.
	 */
	timestamp = dtrace_gethrtime();
	saddr = (uintptr_t)src->dtb_tomax;
	slimit = saddr + src->dtb_offset;
	while (saddr < slimit) {
		size_t size;
		dtrace_rechdr_t *dtrh = (dtrace_rechdr_t *)saddr;

		if (dtrh->dtrh_epid == DTRACE_EPIDNONE) {
			saddr += sizeof (dtrace_epid_t);
			continue;
		}

		ASSERT(dtrh->dtrh_epid <= ((dtrace_epid_t) state->dts_necbs));
		size = state->dts_ecbs[dtrh->dtrh_epid - 1]->dte_size;

		ASSERT(saddr + size <= slimit);
		ASSERT(size >= sizeof(dtrace_rechdr_t));
		ASSERT(DTRACE_RECORD_LOAD_TIMESTAMP(dtrh) == UINT64_MAX);

		DTRACE_RECORD_STORE_TIMESTAMP(dtrh, timestamp);

		saddr += size;
	}

	/*
	 * Copy the buffer across. (Note that this is a
	 * highly suboptimal bcopy(); in the unlikely event that this becomes
	 * a serious performance issue, a high-performance DTrace-specific
	 * bcopy() should obviously be invented.)
	 */
	daddr = (uintptr_t)dest->dtb_tomax + offs;
	dlimit = daddr + src->dtb_offset;
	saddr = (uintptr_t)src->dtb_tomax;

	/*
	 * First, the aligned portion.
	 */
	while (dlimit - daddr >= sizeof (uint64_t)) {
		*((uint64_t *)daddr) = *((uint64_t *)saddr);

		daddr += sizeof (uint64_t);
		saddr += sizeof (uint64_t);
	}

	/*
	 * Now any left-over bit...
	 */
	while (dlimit - daddr)
		*((uint8_t *)daddr++) = *((uint8_t *)saddr++);

	/*
	 * Finally, commit the reserved space in the destination buffer.
	 */
	dest->dtb_offset = offs + src->dtb_offset;

out:
	/*
	 * If we're lucky enough to be the only active CPU on this speculation
	 * buffer, we can just set the state back to DTRACESPEC_INACTIVE.
	 */
	if (current == DTRACESPEC_ACTIVE ||
	    (current == DTRACESPEC_ACTIVEONE && new == DTRACESPEC_COMMITTING)) {
		uint32_t rval = dtrace_cas32((uint32_t *)&spec->dtsp_state,
		    DTRACESPEC_COMMITTING, DTRACESPEC_INACTIVE);
#pragma unused(rval) /* __APPLE__ */

		ASSERT(rval == DTRACESPEC_COMMITTING);
	}

	src->dtb_offset = 0;
	src->dtb_xamot_drops += src->dtb_drops;
	src->dtb_drops = 0;
}

/*
 * This routine discards an active speculation. If the specified speculation
 * is not in a valid state to perform a discard(), this routine will silently
 * do nothing. The state of the specified speculation is transitioned
 * according to the state transition diagram outlined in <sys/dtrace_impl.h>.
 */
static void
dtrace_speculation_discard(dtrace_state_t *state, processorid_t cpu,
    dtrace_specid_t which)
{
	dtrace_speculation_t *spec;
	dtrace_speculation_state_t current, new = DTRACESPEC_INACTIVE;
	dtrace_buffer_t *buf;

	if (which == 0)
		return;

	if (which > (dtrace_specid_t)state->dts_nspeculations) {
		cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
		return;
	}

	spec = &state->dts_speculations[which - 1];
	buf = &spec->dtsp_buffer[cpu];

	do {
		current = spec->dtsp_state;

		switch (current) {
		case DTRACESPEC_INACTIVE:
		case DTRACESPEC_COMMITTINGMANY:
		case DTRACESPEC_COMMITTING:
		case DTRACESPEC_DISCARDING:
			return;

		case DTRACESPEC_ACTIVE:
		case DTRACESPEC_ACTIVEMANY:
			new = DTRACESPEC_DISCARDING;
			break;

		case DTRACESPEC_ACTIVEONE:
			if (buf->dtb_offset != 0) {
				new = DTRACESPEC_INACTIVE;
			} else {
				new = DTRACESPEC_DISCARDING;
			}
			break;

		default:
			ASSERT(0);
		}
	} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
	    current, new) != current);

	buf->dtb_offset = 0;
	buf->dtb_drops = 0;
}

/*
 * Note: not called from probe context. This function is called
 * asynchronously from cross call context to clean any speculations that are
 * in the COMMITTINGMANY or DISCARDING states. These speculations may not be
 * transitioned back to the INACTIVE state until all CPUs have cleaned the
 * speculation.
 */
static void
dtrace_speculation_clean_here(dtrace_state_t *state)
{
	dtrace_icookie_t cookie;
	processorid_t cpu = CPU->cpu_id;
	dtrace_buffer_t *dest = &state->dts_buffer[cpu];
	dtrace_specid_t i;

	cookie = dtrace_interrupt_disable();

	if (dest->dtb_tomax == NULL) {
		dtrace_interrupt_enable(cookie);
		return;
	}

	for (i = 0; i < (dtrace_specid_t)state->dts_nspeculations; i++) {
		dtrace_speculation_t *spec = &state->dts_speculations[i];
		dtrace_buffer_t *src = &spec->dtsp_buffer[cpu];

		if (src->dtb_tomax == NULL)
			continue;

		if (spec->dtsp_state == DTRACESPEC_DISCARDING) {
			src->dtb_offset = 0;
			continue;
		}

		if (spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
			continue;

		if (src->dtb_offset == 0)
			continue;

		dtrace_speculation_commit(state, cpu, i + 1);
	}

	dtrace_interrupt_enable(cookie);
}

/*
 * Note: not called from probe context. This function is called
 * asynchronously (and at a regular interval) to clean any speculations that
 * are in the COMMITTINGMANY or DISCARDING states. If it discovers that there
 * is work to be done, it cross calls all CPUs to perform that work;
 * COMMITTINGMANY and DISCARDING speculations may not be transitioned back to
 * the INACTIVE state until they have been cleaned by all CPUs.
 */
static void
dtrace_speculation_clean(dtrace_state_t *state)
{
	int work = 0;
	uint32_t rv;
	dtrace_specid_t i;

	for (i = 0; i < (dtrace_specid_t)state->dts_nspeculations; i++) {
		dtrace_speculation_t *spec = &state->dts_speculations[i];

		ASSERT(!spec->dtsp_cleaning);

		if (spec->dtsp_state != DTRACESPEC_DISCARDING &&
		    spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
			continue;

		work++;
		spec->dtsp_cleaning = 1;
	}

	if (!work)
		return;

	dtrace_xcall(DTRACE_CPUALL,
	    (dtrace_xcall_t)dtrace_speculation_clean_here, state);

	/*
	 * We now know that all CPUs have committed or discarded their
	 * speculation buffers, as appropriate. We can now set the state
	 * to inactive.
	 */
	for (i = 0; i < (dtrace_specid_t)state->dts_nspeculations; i++) {
		dtrace_speculation_t *spec = &state->dts_speculations[i];
		dtrace_speculation_state_t current, new;

		if (!spec->dtsp_cleaning)
			continue;

		current = spec->dtsp_state;
		ASSERT(current == DTRACESPEC_DISCARDING ||
		    current == DTRACESPEC_COMMITTINGMANY);

		new = DTRACESPEC_INACTIVE;

		rv = dtrace_cas32((uint32_t *)&spec->dtsp_state, current, new);
		ASSERT(rv == current);
		spec->dtsp_cleaning = 0;
	}
}
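
/*
 * For reference, the transitions effected by the routines above (the
 * authoritative diagram lives in <sys/dtrace_impl.h>):
 *
 *	dtrace_speculation()		INACTIVE -> ACTIVE
 *	dtrace_speculation_buffer()	ACTIVE -> ACTIVEONE -> ACTIVEMANY
 *	dtrace_speculation_commit()	ACTIVE/ACTIVEONE -> COMMITTING
 *					    (-> INACTIVE on the same CPU);
 *					ACTIVEONE/ACTIVEMANY -> COMMITTINGMANY
 *	dtrace_speculation_discard()	ACTIVE/ACTIVEONE/ACTIVEMANY ->
 *					DISCARDING (or directly INACTIVE)
 *	dtrace_speculation_clean()	COMMITTINGMANY/DISCARDING -> INACTIVE
 */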

/*
 * Called as part of a speculate() to get the speculative buffer associated
 * with a given speculation. Returns NULL if the specified speculation is not
 * in an ACTIVE state. If the speculation is in the ACTIVEONE state -- and
 * the active CPU is not the specified CPU -- the speculation will be
 * atomically transitioned into the ACTIVEMANY state.
 */
static dtrace_buffer_t *
dtrace_speculation_buffer(dtrace_state_t *state, processorid_t cpuid,
    dtrace_specid_t which)
{
	dtrace_speculation_t *spec;
	dtrace_speculation_state_t current, new = DTRACESPEC_INACTIVE;
	dtrace_buffer_t *buf;

	if (which == 0)
		return (NULL);

	if (which > (dtrace_specid_t)state->dts_nspeculations) {
		cpu_core[cpuid].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
		return (NULL);
	}

	spec = &state->dts_speculations[which - 1];
	buf = &spec->dtsp_buffer[cpuid];

	do {
		current = spec->dtsp_state;

		switch (current) {
		case DTRACESPEC_INACTIVE:
		case DTRACESPEC_COMMITTINGMANY:
		case DTRACESPEC_DISCARDING:
			return (NULL);

		case DTRACESPEC_COMMITTING:
			ASSERT(buf->dtb_offset == 0);
			return (NULL);

		case DTRACESPEC_ACTIVEONE:
			/*
			 * This speculation is currently active on one CPU.
			 * Check the offset in the buffer; if it's non-zero,
			 * that CPU must be us (and we leave the state alone).
			 * If it's zero, assume that we're starting on a new
			 * CPU -- and change the state to indicate that the
			 * speculation is active on more than one CPU.
			 */
			if (buf->dtb_offset != 0)
				return (buf);

			new = DTRACESPEC_ACTIVEMANY;
			break;

		case DTRACESPEC_ACTIVEMANY:
			return (buf);

		case DTRACESPEC_ACTIVE:
			new = DTRACESPEC_ACTIVEONE;
			break;

		default:
			ASSERT(0);
		}
	} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
	    current, new) != current);

	ASSERT(new == DTRACESPEC_ACTIVEONE || new == DTRACESPEC_ACTIVEMANY);
	return (buf);
}

/*
 * Return a string. In the event that the user lacks the privilege to access
 * arbitrary kernel memory, we copy the string out to scratch memory so that we
 * don't fail access checking.
 *
 * dtrace_dif_variable() uses this routine as a helper for various
 * builtin values such as 'execname' and 'probefunc.'
 */
static uintptr_t
dtrace_dif_varstr(uintptr_t addr, dtrace_state_t *state,
    dtrace_mstate_t *mstate)
{
	uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
	uintptr_t ret;
	size_t strsz;

	/*
	 * The easy case: this probe is allowed to read all of memory, so
	 * we can just return this as a vanilla pointer.
	 */
	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
		return (addr);

	/*
	 * This is the tougher case: we copy the string in question from
	 * kernel memory into scratch memory and return it that way: this
	 * ensures that we won't trip up when access checking tests the
	 * BYREF return value.
	 */
	strsz = dtrace_strlen((char *)addr, size) + 1;

	if (mstate->dtms_scratch_ptr + strsz >
	    mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
		return (0);
	}

	dtrace_strcpy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
	    strsz);
	ret = mstate->dtms_scratch_ptr;
	mstate->dtms_scratch_ptr += strsz;
	return (ret);
}

/*
 * This function implements the DIF emulator's variable lookups. The emulator
 * passes a reserved variable identifier and optional built-in array index.
 */
static uint64_t
dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v,
    uint64_t ndx)
{
	/*
	 * If we're accessing one of the uncached arguments, we'll turn this
	 * into a reference in the args array.
	 */
	if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9) {
		ndx = v - DIF_VAR_ARG0;
		v = DIF_VAR_ARGS;
	}

	switch (v) {
	case DIF_VAR_ARGS:
		ASSERT(mstate->dtms_present & DTRACE_MSTATE_ARGS);
		if (ndx >= sizeof (mstate->dtms_arg) /
		    sizeof (mstate->dtms_arg[0])) {
			/*
			 * APPLE NOTE: Account for introduction of __dtrace_probe()
			 */
			int aframes = mstate->dtms_probe->dtpr_aframes + 3;
			dtrace_vstate_t *vstate = &state->dts_vstate;
			dtrace_provider_t *pv;
			uint64_t val;

			pv = mstate->dtms_probe->dtpr_provider;
			if (pv->dtpv_pops.dtps_getargval != NULL)
				val = pv->dtpv_pops.dtps_getargval(pv->dtpv_arg,
				    mstate->dtms_probe->dtpr_id,
				    mstate->dtms_probe->dtpr_arg, ndx, aframes);
			/* Special case access of arg5 as passed to dtrace_probe_error() (which see.) */
			else if (mstate->dtms_probe->dtpr_id == dtrace_probeid_error && ndx == 5) {
				return ((dtrace_state_t *)(uintptr_t)(mstate->dtms_arg[0]))->dts_arg_error_illval;
			}
			else
				val = dtrace_getarg(ndx, aframes, mstate, vstate);

			/*
			 * This is regrettably required to keep the compiler
			 * from tail-optimizing the call to dtrace_getarg().
			 * The condition always evaluates to true, but the
			 * compiler has no way of figuring that out a priori.
			 * (None of this would be necessary if the compiler
			 * could be relied upon to _always_ tail-optimize
			 * the call to dtrace_getarg() -- but it can't.)
			 */
			if (mstate->dtms_probe != NULL)
				return (val);

			ASSERT(0);
		}

		return (mstate->dtms_arg[ndx]);

	case DIF_VAR_UREGS: {
		thread_t thread;

		if (!dtrace_priv_proc(state))
			return (0);

		if ((thread = current_thread()) == NULL) {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
			cpu_core[CPU->cpu_id].cpuc_dtrace_illval = 0;
			return (0);
		}

		return (dtrace_getreg(find_user_regs(thread), ndx));
	}

	case DIF_VAR_CURTHREAD:
		if (!dtrace_priv_kernel(state))
			return (0);

		return ((uint64_t)(uintptr_t)current_thread());

	case DIF_VAR_TIMESTAMP:
		if (!(mstate->dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
			mstate->dtms_timestamp = dtrace_gethrtime();
			mstate->dtms_present |= DTRACE_MSTATE_TIMESTAMP;
		}
		return (mstate->dtms_timestamp);

	case DIF_VAR_VTIMESTAMP:
		ASSERT(dtrace_vtime_references != 0);
		return (dtrace_get_thread_vtime(current_thread()));

	case DIF_VAR_WALLTIMESTAMP:
		if (!(mstate->dtms_present & DTRACE_MSTATE_WALLTIMESTAMP)) {
			mstate->dtms_walltimestamp = dtrace_gethrestime();
			mstate->dtms_present |= DTRACE_MSTATE_WALLTIMESTAMP;
		}
		return (mstate->dtms_walltimestamp);

	case DIF_VAR_MACHTIMESTAMP:
		if (!(mstate->dtms_present & DTRACE_MSTATE_MACHTIMESTAMP)) {
			mstate->dtms_machtimestamp = mach_absolute_time();
			mstate->dtms_present |= DTRACE_MSTATE_MACHTIMESTAMP;
		}
		return (mstate->dtms_machtimestamp);

	case DIF_VAR_CPU:
		return ((uint64_t) dtrace_get_thread_last_cpu_id(current_thread()));

	case DIF_VAR_IPL:
		if (!dtrace_priv_kernel(state))
			return (0);
		if (!(mstate->dtms_present & DTRACE_MSTATE_IPL)) {
			mstate->dtms_ipl = dtrace_getipl();
			mstate->dtms_present |= DTRACE_MSTATE_IPL;
		}
		return (mstate->dtms_ipl);

	case DIF_VAR_EPID:
		ASSERT(mstate->dtms_present & DTRACE_MSTATE_EPID);
		return (mstate->dtms_epid);

	case DIF_VAR_ID:
		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
		return (mstate->dtms_probe->dtpr_id);

	case DIF_VAR_STACKDEPTH:
		if (!dtrace_priv_kernel(state))
			return (0);
		if (!(mstate->dtms_present & DTRACE_MSTATE_STACKDEPTH)) {
			/*
			 * APPLE NOTE: Account for introduction of __dtrace_probe()
			 */
			int aframes = mstate->dtms_probe->dtpr_aframes + 3;

			mstate->dtms_stackdepth = dtrace_getstackdepth(aframes);
			mstate->dtms_present |= DTRACE_MSTATE_STACKDEPTH;
		}
		return (mstate->dtms_stackdepth);

	case DIF_VAR_USTACKDEPTH:
		if (!dtrace_priv_proc(state))
			return (0);
		if (!(mstate->dtms_present & DTRACE_MSTATE_USTACKDEPTH)) {
			/*
			 * See comment in DIF_VAR_PID.
			 */
			if (DTRACE_ANCHORED(mstate->dtms_probe) &&
			    CPU_ON_INTR(CPU)) {
				mstate->dtms_ustackdepth = 0;
			} else {
				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
				mstate->dtms_ustackdepth =
				    dtrace_getustackdepth();
				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
			}
			mstate->dtms_present |= DTRACE_MSTATE_USTACKDEPTH;
		}
		return (mstate->dtms_ustackdepth);

	case DIF_VAR_CALLER:
		if (!dtrace_priv_kernel(state))
			return (0);
		if (!(mstate->dtms_present & DTRACE_MSTATE_CALLER)) {
			/*
			 * APPLE NOTE: Account for introduction of __dtrace_probe()
			 */
			int aframes = mstate->dtms_probe->dtpr_aframes + 3;

			if (!DTRACE_ANCHORED(mstate->dtms_probe)) {
				/*
				 * If this is an unanchored probe, we are
				 * required to go through the slow path:
				 * dtrace_caller() only guarantees correct
				 * results for anchored probes.
				 */
				pc_t caller[2];

				dtrace_getpcstack(caller, 2, aframes,
				    (uint32_t *)(uintptr_t)mstate->dtms_arg[0]);
				mstate->dtms_caller = caller[1];
			} else if ((mstate->dtms_caller =
			    dtrace_caller(aframes)) == (uintptr_t)-1) {
				/*
				 * We have failed to do this the quick way;
				 * we must resort to the slower approach of
				 * calling dtrace_getpcstack().
				 */
				pc_t caller;

				dtrace_getpcstack(&caller, 1, aframes, NULL);
				mstate->dtms_caller = caller;
			}

			mstate->dtms_present |= DTRACE_MSTATE_CALLER;
		}
		return (mstate->dtms_caller);

	case DIF_VAR_UCALLER:
		if (!dtrace_priv_proc(state))
			return (0);

		if (!(mstate->dtms_present & DTRACE_MSTATE_UCALLER)) {
			uint64_t ustack[3];

			/*
			 * dtrace_getupcstack() fills in the first uint64_t
			 * with the current PID. The second uint64_t will
			 * be the program counter at user-level. The third
			 * uint64_t will contain the caller, which is what
			 * we're after.
			 */
			ustack[2] = 0;
			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
			dtrace_getupcstack(ustack, 3);
			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
			mstate->dtms_ucaller = ustack[2];
			mstate->dtms_present |= DTRACE_MSTATE_UCALLER;
		}

		return (mstate->dtms_ucaller);

	case DIF_VAR_PROBEPROV:
		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
		return (dtrace_dif_varstr(
		    (uintptr_t)mstate->dtms_probe->dtpr_provider->dtpv_name,
		    state, mstate));

	case DIF_VAR_PROBEMOD:
		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
		return (dtrace_dif_varstr(
		    (uintptr_t)mstate->dtms_probe->dtpr_mod,
		    state, mstate));

	case DIF_VAR_PROBEFUNC:
		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
		return (dtrace_dif_varstr(
		    (uintptr_t)mstate->dtms_probe->dtpr_func,
		    state, mstate));

	case DIF_VAR_PROBENAME:
		ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
		return (dtrace_dif_varstr(
		    (uintptr_t)mstate->dtms_probe->dtpr_name,
		    state, mstate));

	case DIF_VAR_PID:
		if (!dtrace_priv_proc_relaxed(state))
			return (0);

		/*
		 * Note that we are assuming that an unanchored probe is
		 * always due to a high-level interrupt. (And we're assuming
		 * that there is only a single high-level interrupt.)
		 */
		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
			/* Anchored probe that fires while on an interrupt accrues to process 0 */
			return 0;

		return ((uint64_t)dtrace_proc_selfpid());

	case DIF_VAR_PPID:
		if (!dtrace_priv_proc_relaxed(state))
			return (0);

		/*
		 * See comment in DIF_VAR_PID.
		 */
		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
			return (0);

		return ((uint64_t)dtrace_proc_selfppid());

	case DIF_VAR_TID:
		/* We do not need to check for null current_thread() */
		return thread_tid(current_thread()); /* globally unique */

	case DIF_VAR_PTHREAD_SELF:
		if (!dtrace_priv_proc(state))
			return (0);

		/*
		 * Not currently supported, but we should be able to delta
		 * the dispatchqaddr and dispatchqoffset to get pthread_self.
		 */
		return 0;

	case DIF_VAR_DISPATCHQADDR:
		if (!dtrace_priv_proc(state))
			return (0);

		/* We do not need to check for null current_thread() */
		return thread_dispatchqaddr(current_thread());

	case DIF_VAR_EXECNAME:
	{
		char *xname = (char *)mstate->dtms_scratch_ptr;
		size_t scratch_size = MAXCOMLEN+1;

		/* The scratch allocation's lifetime is that of the clause. */
		if (!DTRACE_INSCRATCH(mstate, scratch_size)) {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
			return 0;
		}

		if (!dtrace_priv_proc_relaxed(state))
			return (0);

		mstate->dtms_scratch_ptr += scratch_size;
		proc_selfname(xname, scratch_size);

		return ((uint64_t)(uintptr_t)xname);
	}

	case DIF_VAR_ZONENAME:
	{
		/* scratch_size is equal to length('global') + 1 for the null-terminator. */
		char *zname = (char *)mstate->dtms_scratch_ptr;
		size_t scratch_size = 6 + 1;

		if (!dtrace_priv_proc(state))
			return (0);

		/* The scratch allocation's lifetime is that of the clause. */
		if (!DTRACE_INSCRATCH(mstate, scratch_size)) {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
			return 0;
		}

		mstate->dtms_scratch_ptr += scratch_size;

		/* The kernel does not provide zonename; it will always return 'global'. */
		strlcpy(zname, "global", scratch_size);

		return ((uint64_t)(uintptr_t)zname);
	}

#if MONOTONIC
	case DIF_VAR_CPUINSTRS:
		return mt_cur_cpu_instrs();

	case DIF_VAR_CPUCYCLES:
		return mt_cur_cpu_cycles();

	case DIF_VAR_VINSTRS:
		return mt_cur_thread_instrs();

	case DIF_VAR_VCYCLES:
		return mt_cur_thread_cycles();
#else /* MONOTONIC */
	case DIF_VAR_CPUINSTRS: /* FALLTHROUGH */
	case DIF_VAR_CPUCYCLES: /* FALLTHROUGH */
	case DIF_VAR_VINSTRS: /* FALLTHROUGH */
	case DIF_VAR_VCYCLES: /* FALLTHROUGH */
		return 0;
#endif /* !MONOTONIC */

	case DIF_VAR_UID:
		if (!dtrace_priv_proc_relaxed(state))
			return (0);

		/*
		 * See comment in DIF_VAR_PID.
		 */
		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
			return (0);

		return ((uint64_t) dtrace_proc_selfruid());

	case DIF_VAR_GID:
		if (!dtrace_priv_proc(state))
			return (0);

		/*
		 * See comment in DIF_VAR_PID.
		 */
		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
			return (0);

		if (dtrace_CRED() != NULL)
			/* Credential does not require lazy initialization. */
			return ((uint64_t)kauth_getgid());
		else {
			/* proc_lock would be taken under kauth_cred_proc_ref() in kauth_cred_get(). */
			DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
			return -1ULL;
		}

	case DIF_VAR_ERRNO: {
		uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
		if (!dtrace_priv_proc(state))
			return (0);

		/*
		 * See comment in DIF_VAR_PID.
		 */
		if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
			return (0);

		if (uthread)
			return (uint64_t)uthread->t_dtrace_errno;
		else {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
			return -1ULL;
		}
	}

	default:
		DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
		return (0);
	}
}
3631
/*
 * Emulate the execution of DTrace DIF subroutines invoked by the call
 * opcode. Notice that we don't bother validating the proper number of
 * arguments or their types in the tuple stack. This isn't needed because
 * all argument interpretation is safe by virtue of our load safety -- the
 * worst that can happen is that a bogus program can obtain bogus results.
 */
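/*
 * Concretely, "load safety" means that reads of probe-specified memory go
 * through dtrace_load*()/dtrace_fuword*() (or a copyin performed under
 * CPU_DTRACE_NOFAULT), so a bad address sets a per-CPU fault flag that the
 * caller checks, rather than faulting the kernel.
 */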
3639static void
3640dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs,
3641 dtrace_key_t *tupregs, int nargs,
3642 dtrace_mstate_t *mstate, dtrace_state_t *state)
3643{
3644 volatile uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
3645 volatile uint64_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
3646 dtrace_vstate_t *vstate = &state->dts_vstate;
3647
3648#if !defined(__APPLE__)
3649 union {
3650 mutex_impl_t mi;
3651 uint64_t mx;
3652 } m;
3653
3654 union {
3655 krwlock_t ri;
3656 uintptr_t rw;
3657 } r;
3658#else
3659/* FIXME: awaits lock/mutex work */
3660#endif /* __APPLE__ */
3661
3662 switch (subr) {
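	/*
	 * rand() is only a weak source of pseudo-random numbers: the
	 * expression below applies a single linear congruential step,
	 * (t * 2416 + 374441) % 1771875, to the high-resolution
	 * timestamp t.
	 */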
3663 case DIF_SUBR_RAND:
3664 regs[rd] = (dtrace_gethrtime() * 2416 + 374441) % 1771875;
3665 break;
3666
3667#if !defined(__APPLE__)
3668 case DIF_SUBR_MUTEX_OWNED:
3669 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3670 mstate, vstate)) {
3671 regs[rd] = 0;
3672 break;
3673 }
3674
3675 m.mx = dtrace_load64(tupregs[0].dttk_value);
3676 if (MUTEX_TYPE_ADAPTIVE(&m.mi))
3677 regs[rd] = MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER;
3678 else
3679 regs[rd] = LOCK_HELD(&m.mi.m_spin.m_spinlock);
3680 break;
3681
3682 case DIF_SUBR_MUTEX_OWNER:
3683 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3684 mstate, vstate)) {
3685 regs[rd] = 0;
3686 break;
3687 }
3688
3689 m.mx = dtrace_load64(tupregs[0].dttk_value);
3690 if (MUTEX_TYPE_ADAPTIVE(&m.mi) &&
3691 MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER)
3692 regs[rd] = (uintptr_t)MUTEX_OWNER(&m.mi);
3693 else
3694 regs[rd] = 0;
3695 break;
3696
3697 case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
3698 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3699 mstate, vstate)) {
3700 regs[rd] = 0;
3701 break;
3702 }
3703
3704 m.mx = dtrace_load64(tupregs[0].dttk_value);
3705 regs[rd] = MUTEX_TYPE_ADAPTIVE(&m.mi);
3706 break;
3707
3708 case DIF_SUBR_MUTEX_TYPE_SPIN:
3709 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3710 mstate, vstate)) {
3711 regs[rd] = 0;
3712 break;
3713 }
3714
3715 m.mx = dtrace_load64(tupregs[0].dttk_value);
3716 regs[rd] = MUTEX_TYPE_SPIN(&m.mi);
3717 break;
3718
3719 case DIF_SUBR_RW_READ_HELD: {
3720 uintptr_t tmp;
3721
3722 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
3723 mstate, vstate)) {
3724 regs[rd] = 0;
3725 break;
3726 }
3727
3728 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
3729 regs[rd] = _RW_READ_HELD(&r.ri, tmp);
3730 break;
3731 }
3732
3733 case DIF_SUBR_RW_WRITE_HELD:
3734 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
3735 mstate, vstate)) {
3736 regs[rd] = 0;
3737 break;
3738 }
3739
3740 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
3741 regs[rd] = _RW_WRITE_HELD(&r.ri);
3742 break;
3743
3744 case DIF_SUBR_RW_ISWRITER:
3745 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
3746 mstate, vstate)) {
3747 regs[rd] = 0;
3748 break;
3749 }
3750
3751 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
3752 regs[rd] = _RW_ISWRITER(&r.ri);
3753 break;
3754#else
3755/* FIXME: awaits lock/mutex work */
3756#endif /* __APPLE__ */
3757
3758 case DIF_SUBR_BCOPY: {
3759 /*
3760 * We need to be sure that the destination is in the scratch
3761 * region -- no other region is allowed.
3762 */
3763 uintptr_t src = tupregs[0].dttk_value;
3764 uintptr_t dest = tupregs[1].dttk_value;
3765 size_t size = tupregs[2].dttk_value;
3766
3767 if (!dtrace_inscratch(dest, size, mstate)) {
3768 *flags |= CPU_DTRACE_BADADDR;
3769 *illval = regs[rd];
3770 break;
3771 }
3772
3773 if (!dtrace_canload(src, size, mstate, vstate)) {
3774 regs[rd] = 0;
3775 break;
3776 }
3777
3778 dtrace_bcopy((void *)src, (void *)dest, size);
3779 break;
3780 }
3781
3782 case DIF_SUBR_ALLOCA:
3783 case DIF_SUBR_COPYIN: {
3784 uintptr_t dest = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
3785 uint64_t size =
3786 tupregs[subr == DIF_SUBR_ALLOCA ? 0 : 1].dttk_value;
3787 size_t scratch_size = (dest - mstate->dtms_scratch_ptr) + size;
3788
3789 /*
3790 * Check whether the user can access kernel memory
3791 */
3792 if (dtrace_priv_kernel(state) == 0) {
3793 DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
3794 regs[rd] = 0;
3795 break;
3796 }
3797 /*
3798 * This action doesn't require any credential checks since
3799 * probes will not activate in user contexts to which the
3800 * enabling user does not have permissions.
3801 */
3802
3803 /*
3804 * Rounding up the user allocation size could have overflowed
3805 * a large, bogus allocation (like -1ULL) to 0.
3806 */
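		/*
		 * For illustration: if size were -1ULL and the scratch
		 * pointer needed, say, 4 bytes of alignment padding, then
		 * scratch_size would wrap around to 3, and it is the
		 * "scratch_size < size" test below that catches it.
		 */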
3807 if (scratch_size < size ||
3808 !DTRACE_INSCRATCH(mstate, scratch_size)) {
3809 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3810 regs[rd] = 0;
3811 break;
3812 }
3813
3814 if (subr == DIF_SUBR_COPYIN) {
3815 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3816 if (dtrace_priv_proc(state))
3817 dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
3818 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3819 }
3820
3821 mstate->dtms_scratch_ptr += scratch_size;
3822 regs[rd] = dest;
3823 break;
3824 }
3825
3826 case DIF_SUBR_COPYINTO: {
3827 uint64_t size = tupregs[1].dttk_value;
3828 uintptr_t dest = tupregs[2].dttk_value;
3829
3830 /*
3831 * This action doesn't require any credential checks since
3832 * probes will not activate in user contexts to which the
3833 * enabling user does not have permissions.
3834 */
3835 if (!dtrace_inscratch(dest, size, mstate)) {
3836 *flags |= CPU_DTRACE_BADADDR;
3837 *illval = regs[rd];
3838 break;
3839 }
3840
3841 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3842 if (dtrace_priv_proc(state))
3843 dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
3844 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3845 break;
3846 }
3847
3848 case DIF_SUBR_COPYINSTR: {
3849 uintptr_t dest = mstate->dtms_scratch_ptr;
3850 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3851
3852 if (nargs > 1 && tupregs[1].dttk_value < size)
3853 size = tupregs[1].dttk_value + 1;
3854
3855 /*
3856 * This action doesn't require any credential checks since
3857 * probes will not activate in user contexts to which the
3858 * enabling user does not have permissions.
3859 */
3860 if (!DTRACE_INSCRATCH(mstate, size)) {
3861 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3862 regs[rd] = 0;
3863 break;
3864 }
3865
3866 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3867 if (dtrace_priv_proc(state))
3868 dtrace_copyinstr(tupregs[0].dttk_value, dest, size, flags);
3869 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3870
3871 ((char *)dest)[size - 1] = '\0';
3872 mstate->dtms_scratch_ptr += size;
3873 regs[rd] = dest;
3874 break;
3875 }
3876
3877 case DIF_SUBR_MSGSIZE:
3878 case DIF_SUBR_MSGDSIZE: {
		/* Darwin does not implement SysV STREAMS messages. */
3880 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3881 regs[rd] = 0;
3882 break;
3883 }
3884
3885 case DIF_SUBR_PROGENYOF: {
3886 pid_t pid = tupregs[0].dttk_value;
3887 struct proc *p = current_proc();
3888 int rval = 0, lim = nprocs;
3889
		while (p && (lim-- > 0)) {
3891 pid_t ppid;
3892
3893 ppid = (pid_t)dtrace_load32((uintptr_t)&(p->p_pid));
3894 if (*flags & CPU_DTRACE_FAULT)
3895 break;
3896
3897 if (ppid == pid) {
3898 rval = 1;
3899 break;
3900 }
3901
3902 if (ppid == 0)
3903 break; /* Can't climb process tree any further. */
3904
3905 p = (struct proc *)dtrace_loadptr((uintptr_t)&(p->p_pptr));
3906 if (*flags & CPU_DTRACE_FAULT)
3907 break;
3908 }
3909
3910 regs[rd] = rval;
3911 break;
3912 }
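	/*
	 * The walk above is what services D predicates such as
	 * /progenyof($target)/; it terminates at pid 0 or after nprocs
	 * parent links, whichever comes first.
	 */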
3913
3914 case DIF_SUBR_SPECULATION:
3915 regs[rd] = dtrace_speculation(state);
3916 break;
3917
3918
3919 case DIF_SUBR_COPYOUT: {
3920 uintptr_t kaddr = tupregs[0].dttk_value;
3921 user_addr_t uaddr = tupregs[1].dttk_value;
3922 uint64_t size = tupregs[2].dttk_value;
3923
3924 if (!dtrace_destructive_disallow &&
3925 dtrace_priv_proc_control(state) &&
3926 !dtrace_istoxic(kaddr, size) &&
3927 dtrace_canload(kaddr, size, mstate, vstate)) {
3928 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3929 dtrace_copyout(kaddr, uaddr, size, flags);
3930 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3931 }
3932 break;
3933 }
3934
3935 case DIF_SUBR_COPYOUTSTR: {
3936 uintptr_t kaddr = tupregs[0].dttk_value;
3937 user_addr_t uaddr = tupregs[1].dttk_value;
3938 uint64_t size = tupregs[2].dttk_value;
3939 size_t lim;
3940
3941 if (!dtrace_destructive_disallow &&
3942 dtrace_priv_proc_control(state) &&
3943 !dtrace_istoxic(kaddr, size) &&
3944 dtrace_strcanload(kaddr, size, &lim, mstate, vstate)) {
3945 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3946 dtrace_copyoutstr(kaddr, uaddr, lim, flags);
3947 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3948 }
3949 break;
3950 }
3951
3952 case DIF_SUBR_STRLEN: {
3953 size_t size = state->dts_options[DTRACEOPT_STRSIZE];
3954 uintptr_t addr = (uintptr_t)tupregs[0].dttk_value;
3955 size_t lim;
3956
3957 if (!dtrace_strcanload(addr, size, &lim, mstate, vstate)) {
3958 regs[rd] = 0;
3959 break;
3960 }
3961
3962 regs[rd] = dtrace_strlen((char *)addr, lim);
3963
3964 break;
3965 }
3966
3967 case DIF_SUBR_STRCHR:
3968 case DIF_SUBR_STRRCHR: {
3969 /*
3970 * We're going to iterate over the string looking for the
3971 * specified character. We will iterate until we have reached
3972 * the string length or we have found the character. If this
3973 * is DIF_SUBR_STRRCHR, we will look for the last occurrence
3974 * of the specified character instead of the first.
3975 */
3976 uintptr_t addr = tupregs[0].dttk_value;
3977 uintptr_t addr_limit;
3978 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3979 size_t lim;
3980 char c, target = (char)tupregs[1].dttk_value;
3981
3982 if (!dtrace_strcanload(addr, size, &lim, mstate, vstate)) {
3983 regs[rd] = 0;
3984 break;
3985 }
3986 addr_limit = addr + lim;
3987
3988 for (regs[rd] = 0; addr < addr_limit; addr++) {
3989 if ((c = dtrace_load8(addr)) == target) {
3990 regs[rd] = addr;
3991
3992 if (subr == DIF_SUBR_STRCHR)
3993 break;
3994 }
3995
3996 if (c == '\0')
3997 break;
3998 }
3999
4000 break;
4001 }
4002
4003 case DIF_SUBR_STRSTR:
4004 case DIF_SUBR_INDEX:
4005 case DIF_SUBR_RINDEX: {
4006 /*
4007 * We're going to iterate over the string looking for the
4008 * specified string. We will iterate until we have reached
4009 * the string length or we have found the string. (Yes, this
4010 * is done in the most naive way possible -- but considering
4011 * that the string we're searching for is likely to be
4012 * relatively short, the complexity of Rabin-Karp or similar
4013 * hardly seems merited.)
4014 */
4015 char *addr = (char *)(uintptr_t)tupregs[0].dttk_value;
4016 char *substr = (char *)(uintptr_t)tupregs[1].dttk_value;
4017 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4018 size_t len = dtrace_strlen(addr, size);
4019 size_t sublen = dtrace_strlen(substr, size);
4020 char *limit = addr + len, *orig = addr;
4021 int notfound = subr == DIF_SUBR_STRSTR ? 0 : -1;
4022 int inc = 1;
4023
4024 regs[rd] = notfound;
4025
4026 if (!dtrace_canload((uintptr_t)addr, len + 1, mstate, vstate)) {
4027 regs[rd] = 0;
4028 break;
4029 }
4030
4031 if (!dtrace_canload((uintptr_t)substr, sublen + 1, mstate,
4032 vstate)) {
4033 regs[rd] = 0;
4034 break;
4035 }
4036
4037 /*
4038 * strstr() and index()/rindex() have similar semantics if
4039 * both strings are the empty string: strstr() returns a
4040 * pointer to the (empty) string, and index() and rindex()
4041 * both return index 0 (regardless of any position argument).
4042 */
4043 if (sublen == 0 && len == 0) {
4044 if (subr == DIF_SUBR_STRSTR)
4045 regs[rd] = (uintptr_t)addr;
4046 else
4047 regs[rd] = 0;
4048 break;
4049 }
4050
4051 if (subr != DIF_SUBR_STRSTR) {
4052 if (subr == DIF_SUBR_RINDEX) {
4053 limit = orig - 1;
4054 addr += len;
4055 inc = -1;
4056 }
4057
4058 /*
4059 * Both index() and rindex() take an optional position
4060 * argument that denotes the starting position.
4061 */
4062 if (nargs == 3) {
4063 int64_t pos = (int64_t)tupregs[2].dttk_value;
4064
4065 /*
4066 * If the position argument to index() is
4067 * negative, Perl implicitly clamps it at
4068 * zero. This semantic is a little surprising
4069 * given the special meaning of negative
4070 * positions to similar Perl functions like
4071 * substr(), but it appears to reflect a
4072 * notion that index() can start from a
4073 * negative index and increment its way up to
4074 * the string. Given this notion, Perl's
4075 * rindex() is at least self-consistent in
4076 * that it implicitly clamps positions greater
4077 * than the string length to be the string
4078 * length. Where Perl completely loses
4079 * coherence, however, is when the specified
4080 * substring is the empty string (""). In
4081 * this case, even if the position is
4082 * negative, rindex() returns 0 -- and even if
4083 * the position is greater than the length,
4084 * index() returns the string length. These
4085 * semantics violate the notion that index()
4086 * should never return a value less than the
4087 * specified position and that rindex() should
4088 * never return a value greater than the
4089 * specified position. (One assumes that
4090 * these semantics are artifacts of Perl's
4091 * implementation and not the results of
4092 * deliberate design -- it beggars belief that
4093 * even Larry Wall could desire such oddness.)
4094 * While in the abstract one would wish for
4095 * consistent position semantics across
4096 * substr(), index() and rindex() -- or at the
4097 * very least self-consistent position
4098 * semantics for index() and rindex() -- we
4099 * instead opt to keep with the extant Perl
4100 * semantics, in all their broken glory. (Do
4101 * we have more desire to maintain Perl's
4102 * semantics than Perl does? Probably.)
4103 */
4104 if (subr == DIF_SUBR_RINDEX) {
4105 if (pos < 0) {
4106 if (sublen == 0)
4107 regs[rd] = 0;
4108 break;
4109 }
4110
4111 if ((size_t)pos > len)
4112 pos = len;
4113 } else {
4114 if (pos < 0)
4115 pos = 0;
4116
4117 if ((size_t)pos >= len) {
4118 if (sublen == 0)
4119 regs[rd] = len;
4120 break;
4121 }
4122 }
4123
4124 addr = orig + pos;
4125 }
4126 }
4127
4128 for (regs[rd] = notfound; addr != limit; addr += inc) {
4129 if (dtrace_strncmp(addr, substr, sublen) == 0) {
4130 if (subr != DIF_SUBR_STRSTR) {
4131 /*
4132 * As D index() and rindex() are
4133 * modeled on Perl (and not on awk),
4134 * we return a zero-based (and not a
4135 * one-based) index. (For you Perl
4136 * weenies: no, we're not going to add
4137 * $[ -- and shouldn't you be at a con
4138 * or something?)
4139 */
4140 regs[rd] = (uintptr_t)(addr - orig);
4141 break;
4142 }
4143
4144 ASSERT(subr == DIF_SUBR_STRSTR);
4145 regs[rd] = (uintptr_t)addr;
4146 break;
4147 }
4148 }
4149
4150 break;
4151 }
4152
4153 case DIF_SUBR_STRTOK: {
4154 uintptr_t addr = tupregs[0].dttk_value;
4155 uintptr_t tokaddr = tupregs[1].dttk_value;
4156 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4157 uintptr_t limit, toklimit;
4158 size_t clim;
4159 char *dest = (char *)mstate->dtms_scratch_ptr;
		uint8_t c = '\0', tokmap[32];	/* 256 / 8 */
4161 uint64_t i = 0;
4162
4163 /*
4164 * Check both the token buffer and (later) the input buffer,
4165 * since both could be non-scratch addresses.
4166 */
4167 if (!dtrace_strcanload(tokaddr, size, &clim, mstate, vstate)) {
4168 regs[rd] = 0;
4169 break;
4170 }
4171 toklimit = tokaddr + clim;
4172
4173 if (!DTRACE_INSCRATCH(mstate, size)) {
4174 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4175 regs[rd] = 0;
4176 break;
4177 }
4178
4179 if (addr == 0) {
4180 /*
4181 * If the address specified is NULL, we use our saved
4182 * strtok pointer from the mstate. Note that this
4183 * means that the saved strtok pointer is _only_
4184 * valid within multiple enablings of the same probe --
4185 * it behaves like an implicit clause-local variable.
4186 */
4187 addr = mstate->dtms_strtok;
4188 limit = mstate->dtms_strtok_limit;
4189 } else {
4190 /*
4191 * If the user-specified address is non-NULL we must
4192 * access check it. This is the only time we have
4193 * a chance to do so, since this address may reside
4194 * in the string table of this clause-- future calls
4195 * (when we fetch addr from mstate->dtms_strtok)
4196 * would fail this access check.
4197 */
4198 if (!dtrace_strcanload(addr, size, &clim, mstate,
4199 vstate)) {
4200 regs[rd] = 0;
4201 break;
4202 }
4203 limit = addr + clim;
4204 }
4205
4206 /*
4207 * First, zero the token map, and then process the token
4208 * string -- setting a bit in the map for every character
4209 * found in the token string.
4210 */
4211 for (i = 0; i < (int)sizeof (tokmap); i++)
4212 tokmap[i] = 0;
4213
4214 for (; tokaddr < toklimit; tokaddr++) {
4215 if ((c = dtrace_load8(tokaddr)) == '\0')
4216 break;
4217
4218 ASSERT((c >> 3) < sizeof (tokmap));
4219 tokmap[c >> 3] |= (1 << (c & 0x7));
4220 }
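		/*
		 * tokmap is thus a 256-bit set with one bit per byte value;
		 * membership of c is tokmap[c >> 3] & (1 << (c & 0x7)).
		 * For example, '/' (0x2f) maps to bit 7 of tokmap[5].
		 */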
4221
4222 for (; addr < limit; addr++) {
4223 /*
4224 * We're looking for a character that is _not_
4225 * contained in the token string.
4226 */
4227 if ((c = dtrace_load8(addr)) == '\0')
4228 break;
4229
4230 if (!(tokmap[c >> 3] & (1 << (c & 0x7))))
4231 break;
4232 }
4233
4234 if (c == '\0') {
4235 /*
4236 * We reached the end of the string without finding
4237 * any character that was not in the token string.
4238 * We return NULL in this case, and we set the saved
4239 * address to NULL as well.
4240 */
4241 regs[rd] = 0;
4242 mstate->dtms_strtok = 0;
4243 mstate->dtms_strtok_limit = 0;
4244 break;
4245 }
4246
4247 /*
4248 * From here on, we're copying into the destination string.
4249 */
4250 for (i = 0; addr < limit && i < size - 1; addr++) {
4251 if ((c = dtrace_load8(addr)) == '\0')
4252 break;
4253
4254 if (tokmap[c >> 3] & (1 << (c & 0x7)))
4255 break;
4256
4257 ASSERT(i < size);
4258 dest[i++] = c;
4259 }
4260
4261 ASSERT(i < size);
4262 dest[i] = '\0';
4263 regs[rd] = (uintptr_t)dest;
4264 mstate->dtms_scratch_ptr += size;
4265 mstate->dtms_strtok = addr;
4266 mstate->dtms_strtok_limit = limit;
4267 break;
4268 }
4269
4270 case DIF_SUBR_SUBSTR: {
4271 uintptr_t s = tupregs[0].dttk_value;
4272 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4273 char *d = (char *)mstate->dtms_scratch_ptr;
4274 int64_t index = (int64_t)tupregs[1].dttk_value;
4275 int64_t remaining = (int64_t)tupregs[2].dttk_value;
4276 size_t len = dtrace_strlen((char *)s, size);
4277 int64_t i = 0;
4278
4279 if (!dtrace_canload(s, len + 1, mstate, vstate)) {
4280 regs[rd] = 0;
4281 break;
4282 }
4283
4284 if (!DTRACE_INSCRATCH(mstate, size)) {
4285 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4286 regs[rd] = 0;
4287 break;
4288 }
4289
4290 if (nargs <= 2)
4291 remaining = (int64_t)size;
4292
4293 if (index < 0) {
4294 index += len;
4295
4296 if (index < 0 && index + remaining > 0) {
4297 remaining += index;
4298 index = 0;
4299 }
4300 }
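		/*
		 * As in D's substr(), a negative index counts back from the
		 * end of the string: e.g. for s = "coconut", substr(s, -3)
		 * maps index -3 to 4 above and so yields "nut".
		 */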
4301
4302 if ((size_t)index >= len || index < 0) {
4303 remaining = 0;
4304 } else if (remaining < 0) {
4305 remaining += len - index;
4306 } else if ((uint64_t)index + (uint64_t)remaining > size) {
4307 remaining = size - index;
4308 }
4309
4310 for (i = 0; i < remaining; i++) {
4311 if ((d[i] = dtrace_load8(s + index + i)) == '\0')
4312 break;
4313 }
4314
4315 d[i] = '\0';
4316
4317 mstate->dtms_scratch_ptr += size;
4318 regs[rd] = (uintptr_t)d;
4319 break;
4320 }
4321
4322 case DIF_SUBR_GETMAJOR:
4323 regs[rd] = (uintptr_t)major( (dev_t)tupregs[0].dttk_value );
4324 break;
4325
4326 case DIF_SUBR_GETMINOR:
4327 regs[rd] = (uintptr_t)minor( (dev_t)tupregs[0].dttk_value );
4328 break;
4329
4330 case DIF_SUBR_DDI_PATHNAME: {
4331 /* APPLE NOTE: currently unsupported on Darwin */
4332 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
4333 regs[rd] = 0;
4334 break;
4335 }
4336
4337 case DIF_SUBR_STRJOIN: {
4338 char *d = (char *)mstate->dtms_scratch_ptr;
4339 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4340 uintptr_t s1 = tupregs[0].dttk_value;
4341 uintptr_t s2 = tupregs[1].dttk_value;
4342 uint64_t i = 0, j = 0;
4343 size_t lim1, lim2;
4344 char c;
4345
4346 if (!dtrace_strcanload(s1, size, &lim1, mstate, vstate) ||
4347 !dtrace_strcanload(s2, size, &lim2, mstate, vstate)) {
4348 regs[rd] = 0;
4349 break;
4350 }
4351
4352 if (!DTRACE_INSCRATCH(mstate, size)) {
4353 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4354 regs[rd] = 0;
4355 break;
4356 }
4357
4358 for (;;) {
4359 if (i >= size) {
4360 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4361 regs[rd] = 0;
4362 break;
4363 }
4364 c = (i >= lim1) ? '\0' : dtrace_load8(s1++);
4365 if ((d[i++] = c) == '\0') {
4366 i--;
4367 break;
4368 }
4369 }
4370
4371 for (;;) {
4372 if (i >= size) {
4373 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4374 regs[rd] = 0;
4375 break;
4376 }
4377 c = (j++ >= lim2) ? '\0' : dtrace_load8(s2++);
4378 if ((d[i++] = c) == '\0')
4379 break;
4380 }
4381
4382 if (i < size) {
4383 mstate->dtms_scratch_ptr += i;
4384 regs[rd] = (uintptr_t)d;
4385 }
4386
4387 break;
4388 }
4389
4390 case DIF_SUBR_LLTOSTR: {
4391 int64_t i = (int64_t)tupregs[0].dttk_value;
4392 uint64_t val, digit;
4393 uint64_t size = 65; /* enough room for 2^64 in binary */
4394 char *end = (char *)mstate->dtms_scratch_ptr + size - 1;
4395 int base = 10;
4396
4397 if (nargs > 1) {
4398 if ((base = tupregs[1].dttk_value) <= 1 ||
4399 base > ('z' - 'a' + 1) + ('9' - '0' + 1)) {
4400 *flags |= CPU_DTRACE_ILLOP;
4401 break;
4402 }
4403 }
4404
4405 val = (base == 10 && i < 0) ? i * -1 : i;
4406
4407 if (!DTRACE_INSCRATCH(mstate, size)) {
4408 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4409 regs[rd] = 0;
4410 break;
4411 }
4412
4413 for (*end-- = '\0'; val; val /= base) {
4414 if ((digit = val % base) <= '9' - '0') {
4415 *end-- = '0' + digit;
4416 } else {
4417 *end-- = 'a' + (digit - ('9' - '0') - 1);
4418 }
4419 }
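		/*
		 * Digits are generated least-significant first into the tail
		 * of the buffer: e.g. lltostr(-10) stores '0', then '1', then
		 * (below) the '-', and returns a pointer to "-10".
		 */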
4420
4421 if (i == 0 && base == 16)
4422 *end-- = '0';
4423
4424 if (base == 16)
4425 *end-- = 'x';
4426
4427 if (i == 0 || base == 8 || base == 16)
4428 *end-- = '0';
4429
4430 if (i < 0 && base == 10)
4431 *end-- = '-';
4432
4433 regs[rd] = (uintptr_t)end + 1;
4434 mstate->dtms_scratch_ptr += size;
4435 break;
4436 }
4437
4438 case DIF_SUBR_HTONS:
4439 case DIF_SUBR_NTOHS:
4440#ifdef _BIG_ENDIAN
4441 regs[rd] = (uint16_t)tupregs[0].dttk_value;
4442#else
4443 regs[rd] = DT_BSWAP_16((uint16_t)tupregs[0].dttk_value);
4444#endif
4445 break;
4446
4447
4448 case DIF_SUBR_HTONL:
4449 case DIF_SUBR_NTOHL:
4450#ifdef _BIG_ENDIAN
4451 regs[rd] = (uint32_t)tupregs[0].dttk_value;
4452#else
4453 regs[rd] = DT_BSWAP_32((uint32_t)tupregs[0].dttk_value);
4454#endif
4455 break;
4456
4457
4458 case DIF_SUBR_HTONLL:
4459 case DIF_SUBR_NTOHLL:
4460#ifdef _BIG_ENDIAN
4461 regs[rd] = (uint64_t)tupregs[0].dttk_value;
4462#else
4463 regs[rd] = DT_BSWAP_64((uint64_t)tupregs[0].dttk_value);
4464#endif
4465 break;
4466
4467
4468 case DIF_SUBR_DIRNAME:
4469 case DIF_SUBR_BASENAME: {
4470 char *dest = (char *)mstate->dtms_scratch_ptr;
4471 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4472 uintptr_t src = tupregs[0].dttk_value;
4473 int i, j, len = dtrace_strlen((char *)src, size);
4474 int lastbase = -1, firstbase = -1, lastdir = -1;
4475 int start, end;
4476
4477 if (!dtrace_canload(src, len + 1, mstate, vstate)) {
4478 regs[rd] = 0;
4479 break;
4480 }
4481
4482 if (!DTRACE_INSCRATCH(mstate, size)) {
4483 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4484 regs[rd] = 0;
4485 break;
4486 }
4487
4488 /*
		 * The basename and dirname of a zero-length string are both
		 * defined to be ".".
4491 */
4492 if (len == 0) {
4493 len = 1;
4494 src = (uintptr_t)".";
4495 }
4496
4497 /*
4498 * Start from the back of the string, moving back toward the
4499 * front until we see a character that isn't a slash. That
4500 * character is the last character in the basename.
4501 */
4502 for (i = len - 1; i >= 0; i--) {
4503 if (dtrace_load8(src + i) != '/')
4504 break;
4505 }
4506
4507 if (i >= 0)
4508 lastbase = i;
4509
4510 /*
4511 * Starting from the last character in the basename, move
4512 * towards the front until we find a slash. The character
4513 * that we processed immediately before that is the first
4514 * character in the basename.
4515 */
4516 for (; i >= 0; i--) {
4517 if (dtrace_load8(src + i) == '/')
4518 break;
4519 }
4520
4521 if (i >= 0)
4522 firstbase = i + 1;
4523
4524 /*
4525 * Now keep going until we find a non-slash character. That
4526 * character is the last character in the dirname.
4527 */
4528 for (; i >= 0; i--) {
4529 if (dtrace_load8(src + i) != '/')
4530 break;
4531 }
4532
4533 if (i >= 0)
4534 lastdir = i;
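		/*
		 * For example, with src = "/usr/lib/", the loops above leave
		 * lastbase = 7 ('b'), firstbase = 5 ('l') and lastdir = 3
		 * ('r'): the basename is "lib" and the dirname is "/usr".
		 */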
4535
4536 ASSERT(!(lastbase == -1 && firstbase != -1));
4537 ASSERT(!(firstbase == -1 && lastdir != -1));
4538
4539 if (lastbase == -1) {
4540 /*
4541 * We didn't find a non-slash character. We know that
4542 * the length is non-zero, so the whole string must be
4543 * slashes. In either the dirname or the basename
4544 * case, we return '/'.
4545 */
4546 ASSERT(firstbase == -1);
4547 firstbase = lastbase = lastdir = 0;
4548 }
4549
4550 if (firstbase == -1) {
4551 /*
4552 * The entire string consists only of a basename
4553 * component. If we're looking for dirname, we need
4554 * to change our string to be just "."; if we're
4555 * looking for a basename, we'll just set the first
4556 * character of the basename to be 0.
4557 */
4558 if (subr == DIF_SUBR_DIRNAME) {
4559 ASSERT(lastdir == -1);
4560 src = (uintptr_t)".";
4561 lastdir = 0;
4562 } else {
4563 firstbase = 0;
4564 }
4565 }
4566
4567 if (subr == DIF_SUBR_DIRNAME) {
4568 if (lastdir == -1) {
4569 /*
4570 * We know that we have a slash in the name --
4571 * or lastdir would be set to 0, above. And
4572 * because lastdir is -1, we know that this
4573 * slash must be the first character. (That
4574 * is, the full string must be of the form
4575 * "/basename".) In this case, the last
4576 * character of the directory name is 0.
4577 */
4578 lastdir = 0;
4579 }
4580
4581 start = 0;
4582 end = lastdir;
4583 } else {
4584 ASSERT(subr == DIF_SUBR_BASENAME);
4585 ASSERT(firstbase != -1 && lastbase != -1);
4586 start = firstbase;
4587 end = lastbase;
4588 }
4589
4590 for (i = start, j = 0; i <= end && (uint64_t)j < size - 1; i++, j++)
4591 dest[j] = dtrace_load8(src + i);
4592
4593 dest[j] = '\0';
4594 regs[rd] = (uintptr_t)dest;
4595 mstate->dtms_scratch_ptr += size;
4596 break;
4597 }
4598
4599 case DIF_SUBR_CLEANPATH: {
4600 char *dest = (char *)mstate->dtms_scratch_ptr, c;
4601 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4602 uintptr_t src = tupregs[0].dttk_value;
4603 size_t lim;
4604 size_t i = 0, j = 0;
4605
4606 if (!dtrace_strcanload(src, size, &lim, mstate, vstate)) {
4607 regs[rd] = 0;
4608 break;
4609 }
4610
4611 if (!DTRACE_INSCRATCH(mstate, size)) {
4612 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4613 regs[rd] = 0;
4614 break;
4615 }
4616
4617 /*
4618 * Move forward, loading each character.
4619 */
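		/*
		 * For example, "/usr/./lib/../bin" has its "/./" component
		 * dropped and its "/../" backed out below, leaving "/usr/bin"
		 * in the destination buffer.
		 */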
4620 do {
4621 c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
4622next:
			if ((uint64_t)(j + 5) >= size)	/* 5 = sizeof ("/..c"), the most we may yet append */
4624 break;
4625
4626 if (c != '/') {
4627 dest[j++] = c;
4628 continue;
4629 }
4630
4631 c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
4632
4633 if (c == '/') {
4634 /*
4635 * We have two slashes -- we can just advance
4636 * to the next character.
4637 */
4638 goto next;
4639 }
4640
4641 if (c != '.') {
4642 /*
4643 * This is not "." and it's not ".." -- we can
4644 * just store the "/" and this character and
4645 * drive on.
4646 */
4647 dest[j++] = '/';
4648 dest[j++] = c;
4649 continue;
4650 }
4651
4652 c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
4653
4654 if (c == '/') {
4655 /*
4656 * This is a "/./" component. We're not going
4657 * to store anything in the destination buffer;
4658 * we're just going to go to the next component.
4659 */
4660 goto next;
4661 }
4662
4663 if (c != '.') {
4664 /*
4665 * This is not ".." -- we can just store the
4666 * "/." and this character and continue
4667 * processing.
4668 */
4669 dest[j++] = '/';
4670 dest[j++] = '.';
4671 dest[j++] = c;
4672 continue;
4673 }
4674
4675 c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
4676
4677 if (c != '/' && c != '\0') {
4678 /*
4679 * This is not ".." -- it's "..[mumble]".
4680 * We'll store the "/.." and this character
4681 * and continue processing.
4682 */
4683 dest[j++] = '/';
4684 dest[j++] = '.';
4685 dest[j++] = '.';
4686 dest[j++] = c;
4687 continue;
4688 }
4689
4690 /*
4691 * This is "/../" or "/..\0". We need to back up
4692 * our destination pointer until we find a "/".
4693 */
4694 i--;
4695 while (j != 0 && dest[--j] != '/')
4696 continue;
4697
4698 if (c == '\0')
4699 dest[++j] = '/';
4700 } while (c != '\0');
4701
4702 dest[j] = '\0';
4703 regs[rd] = (uintptr_t)dest;
4704 mstate->dtms_scratch_ptr += size;
4705 break;
4706 }
4707
4708 case DIF_SUBR_INET_NTOA:
4709 case DIF_SUBR_INET_NTOA6:
4710 case DIF_SUBR_INET_NTOP: {
4711 size_t size;
4712 int af, argi, i;
4713 char *base, *end;
4714
4715 if (subr == DIF_SUBR_INET_NTOP) {
4716 af = (int)tupregs[0].dttk_value;
4717 argi = 1;
4718 } else {
4719 af = subr == DIF_SUBR_INET_NTOA ? AF_INET: AF_INET6;
4720 argi = 0;
4721 }
4722
4723 if (af == AF_INET) {
4724#if !defined(__APPLE__)
4725 ipaddr_t ip4;
4726#else
4727 uint32_t ip4;
4728#endif /* __APPLE__ */
4729 uint8_t *ptr8, val;
4730
4731 /*
4732 * Safely load the IPv4 address.
4733 */
4734#if !defined(__APPLE__)
4735 ip4 = dtrace_load32(tupregs[argi].dttk_value);
4736#else
4737 if (!dtrace_canload(tupregs[argi].dttk_value, sizeof(ip4),
4738 mstate, vstate)) {
4739 regs[rd] = 0;
4740 break;
4741 }
4742
4743 dtrace_bcopy(
4744 (void *)(uintptr_t)tupregs[argi].dttk_value,
4745 (void *)(uintptr_t)&ip4, sizeof (ip4));
4746#endif /* __APPLE__ */
4747 /*
4748 * Check an IPv4 string will fit in scratch.
4749 */
4750#if !defined(__APPLE__)
4751 size = INET_ADDRSTRLEN;
4752#else
4753 size = MAX_IPv4_STR_LEN;
4754#endif /* __APPLE__ */
4755 if (!DTRACE_INSCRATCH(mstate, size)) {
4756 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4757 regs[rd] = 0;
4758 break;
4759 }
4760 base = (char *)mstate->dtms_scratch_ptr;
4761 end = (char *)mstate->dtms_scratch_ptr + size - 1;
4762
4763 /*
4764 * Stringify as a dotted decimal quad.
4765 */
4766 *end-- = '\0';
4767 ptr8 = (uint8_t *)&ip4;
4768 for (i = 3; i >= 0; i--) {
4769 val = ptr8[i];
4770
4771 if (val == 0) {
4772 *end-- = '0';
4773 } else {
4774 for (; val; val /= 10) {
4775 *end-- = '0' + (val % 10);
4776 }
4777 }
4778
4779 if (i > 0)
4780 *end-- = '.';
4781 }
4782 ASSERT(end + 1 >= base);
4783
4784 } else if (af == AF_INET6) {
4785#if defined(__APPLE__)
4786#define _S6_un __u6_addr
4787#define _S6_u8 __u6_addr8
4788#endif /* __APPLE__ */
4789 struct in6_addr ip6;
4790 int firstzero, tryzero, numzero, v6end;
4791 uint16_t val;
4792 const char digits[] = "0123456789abcdef";
4793
4794 /*
			 * Stringify using RFC 1884 convention 2, i.e. 16-bit
			 * hexadecimal values with the single longest zero run
			 * compressed as "::", in lower case, e.g.
			 * fe80::214:4fff:fe0b:76c8. The IPv4 embedded form is
			 * returned for inet_ntop(); just the IPv4 string is
			 * returned for inet_ntoa6().
4801 */
4802
4803 if (!dtrace_canload(tupregs[argi].dttk_value,
4804 sizeof(struct in6_addr), mstate, vstate)) {
4805 regs[rd] = 0;
4806 break;
4807 }
4808
4809 /*
4810 * Safely load the IPv6 address.
4811 */
4812 dtrace_bcopy(
4813 (void *)(uintptr_t)tupregs[argi].dttk_value,
4814 (void *)(uintptr_t)&ip6, sizeof (struct in6_addr));
4815
4816 /*
4817 * Check an IPv6 string will fit in scratch.
4818 */
4819 size = INET6_ADDRSTRLEN;
4820 if (!DTRACE_INSCRATCH(mstate, size)) {
4821 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4822 regs[rd] = 0;
4823 break;
4824 }
4825 base = (char *)mstate->dtms_scratch_ptr;
4826 end = (char *)mstate->dtms_scratch_ptr + size - 1;
4827 *end-- = '\0';
4828
4829 /*
4830 * Find the longest run of 16 bit zero values
4831 * for the single allowed zero compression - "::".
4832 */
4833 firstzero = -1;
4834 tryzero = -1;
4835 numzero = 1;
4836 for (i = 0; i < (int)sizeof (struct in6_addr); i++) {
4837 if (ip6._S6_un._S6_u8[i] == 0 &&
4838 tryzero == -1 && i % 2 == 0) {
4839 tryzero = i;
4840 continue;
4841 }
4842
4843 if (tryzero != -1 &&
4844 (ip6._S6_un._S6_u8[i] != 0 ||
4845 i == sizeof (struct in6_addr) - 1)) {
4846
4847 if (i - tryzero <= numzero) {
4848 tryzero = -1;
4849 continue;
4850 }
4851
4852 firstzero = tryzero;
4853 numzero = i - i % 2 - tryzero;
4854 tryzero = -1;
4855
4856 if (ip6._S6_un._S6_u8[i] == 0 &&
4857 i == sizeof (struct in6_addr) - 1)
4858 numzero += 2;
4859 }
4860 }
4861 ASSERT(firstzero + numzero <= (int)sizeof (struct in6_addr));
4862
4863 /*
4864 * Check for an IPv4 embedded address.
4865 */
4866 v6end = sizeof (struct in6_addr) - 2;
4867 if (IN6_IS_ADDR_V4MAPPED(&ip6) ||
4868 IN6_IS_ADDR_V4COMPAT(&ip6)) {
4869 for (i = sizeof (struct in6_addr) - 1;
4870 i >= (int)DTRACE_V4MAPPED_OFFSET; i--) {
4871 ASSERT(end >= base);
4872
4873 val = ip6._S6_un._S6_u8[i];
4874
4875 if (val == 0) {
4876 *end-- = '0';
4877 } else {
4878 for (; val; val /= 10) {
4879 *end-- = '0' + val % 10;
4880 }
4881 }
4882
4883 if (i > (int)DTRACE_V4MAPPED_OFFSET)
4884 *end-- = '.';
4885 }
4886
4887 if (subr == DIF_SUBR_INET_NTOA6)
4888 goto inetout;
4889
4890 /*
4891 * Set v6end to skip the IPv4 address that
4892 * we have already stringified.
4893 */
4894 v6end = 10;
4895 }
4896
4897 /*
4898 * Build the IPv6 string by working through the
4899 * address in reverse.
4900 */
4901 for (i = v6end; i >= 0; i -= 2) {
4902 ASSERT(end >= base);
4903
4904 if (i == firstzero + numzero - 2) {
4905 *end-- = ':';
4906 *end-- = ':';
4907 i -= numzero - 2;
4908 continue;
4909 }
4910
4911 if (i < 14 && i != firstzero - 2)
4912 *end-- = ':';
4913
4914 val = (ip6._S6_un._S6_u8[i] << 8) +
4915 ip6._S6_un._S6_u8[i + 1];
4916
4917 if (val == 0) {
4918 *end-- = '0';
4919 } else {
4920 for (; val; val /= 16) {
4921 *end-- = digits[val % 16];
4922 }
4923 }
4924 }
4925 ASSERT(end + 1 >= base);
4926
4927#if defined(__APPLE__)
4928#undef _S6_un
4929#undef _S6_u8
4930#endif /* __APPLE__ */
4931 } else {
4932 /*
			 * The user didn't use AF_INET or AF_INET6.
4934 */
4935 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
4936 regs[rd] = 0;
4937 break;
4938 }
4939
4940inetout: regs[rd] = (uintptr_t)end + 1;
4941 mstate->dtms_scratch_ptr += size;
4942 break;
4943 }
4944
4945 case DIF_SUBR_TOUPPER:
4946 case DIF_SUBR_TOLOWER: {
4947 uintptr_t src = tupregs[0].dttk_value;
4948 char *dest = (char *)mstate->dtms_scratch_ptr;
4949 char lower, upper, base, c;
4950 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4951 size_t len = dtrace_strlen((char*) src, size);
4952 size_t i = 0;
4953
4954 lower = (subr == DIF_SUBR_TOUPPER) ? 'a' : 'A';
4955 upper = (subr == DIF_SUBR_TOUPPER) ? 'z' : 'Z';
4956 base = (subr == DIF_SUBR_TOUPPER) ? 'A' : 'a';
4957
4958 if (!dtrace_canload(src, len + 1, mstate, vstate)) {
4959 regs[rd] = 0;
4960 break;
4961 }
4962
4963 if (!DTRACE_INSCRATCH(mstate, size)) {
4964 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4965 regs[rd] = 0;
4966 break;
4967 }
4968
4969 for (i = 0; i < size - 1; ++i) {
4970 if ((c = dtrace_load8(src + i)) == '\0')
4971 break;
4972 if (c >= lower && c <= upper)
4973 c = base + (c - lower);
4974 dest[i] = c;
4975 }
4976
4977 ASSERT(i < size);
4978
4979 dest[i] = '\0';
4980 regs[rd] = (uintptr_t) dest;
4981 mstate->dtms_scratch_ptr += size;
4982
4983 break;
4984 }
4985
4986#if defined(__APPLE__)
4987 case DIF_SUBR_VM_KERNEL_ADDRPERM: {
4988 if (!dtrace_priv_kernel(state)) {
4989 regs[rd] = 0;
4990 } else {
4991 regs[rd] = VM_KERNEL_ADDRPERM((vm_offset_t) tupregs[0].dttk_value);
4992 }
4993
4994 break;
4995 }
4996
4997 case DIF_SUBR_KDEBUG_TRACE: {
4998 uint32_t debugid;
4999 uintptr_t args[4] = {0};
5000 int i;
5001
5002 if (nargs < 2 || nargs > 5) {
5003 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5004 break;
5005 }
5006
5007 if (dtrace_destructive_disallow)
5008 return;
5009
5010 debugid = tupregs[0].dttk_value;
5011 for (i = 0; i < nargs - 1; i++)
5012 args[i] = tupregs[i + 1].dttk_value;
5013
5014 kernel_debug(debugid, args[0], args[1], args[2], args[3], 0);
5015
5016 break;
5017 }
5018
5019 case DIF_SUBR_KDEBUG_TRACE_STRING: {
5020 if (nargs != 3) {
5021 break;
5022 }
5023
5024 if (dtrace_destructive_disallow)
5025 return;
5026
5027 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5028 uint32_t debugid = tupregs[0].dttk_value;
5029 uint64_t str_id = tupregs[1].dttk_value;
5030 uintptr_t src = tupregs[2].dttk_value;
5031 size_t lim;
5032 char buf[size];
5033 char* str = NULL;
5034
5035 if (src != (uintptr_t)0) {
5036 str = buf;
5037 if (!dtrace_strcanload(src, size, &lim, mstate, vstate)) {
5038 break;
5039 }
5040 dtrace_strcpy((void*)src, buf, size);
5041 }
5042
5043 (void)kernel_debug_string(debugid, &str_id, str);
5044 regs[rd] = str_id;
5045
5046 break;
5047 }
5048#endif
5049
5050 }
5051}
5052
5053/*
5054 * Emulate the execution of DTrace IR instructions specified by the given
5055 * DIF object. This function is deliberately void of assertions as all of
5056 * the necessary checks are handled by a call to dtrace_difo_validate().
5057 */
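/*
 * Each DIF instruction is a single 32-bit word: the DIF_INSTR_OP,
 * DIF_INSTR_R1, DIF_INSTR_R2 and DIF_INSTR_RD macros used below unpack the
 * opcode and register operands, and DIF_INSTR_LABEL, DIF_INSTR_VAR,
 * DIF_INSTR_INTEGER, DIF_INSTR_STRING and DIF_INSTR_SUBR unpack the
 * wide-immediate forms (see <sys/dtrace.h> for the encoding).
 */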
5058static uint64_t
5059dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate,
5060 dtrace_vstate_t *vstate, dtrace_state_t *state)
5061{
5062 const dif_instr_t *text = difo->dtdo_buf;
5063 const uint_t textlen = difo->dtdo_len;
5064 const char *strtab = difo->dtdo_strtab;
5065 const uint64_t *inttab = difo->dtdo_inttab;
5066
5067 uint64_t rval = 0;
5068 dtrace_statvar_t *svar;
5069 dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
5070 dtrace_difv_t *v;
5071 volatile uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
5072 volatile uint64_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
5073
5074 dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
5075 uint64_t regs[DIF_DIR_NREGS];
5076 uint64_t *tmp;
5077
5078 uint8_t cc_n = 0, cc_z = 0, cc_v = 0, cc_c = 0;
5079 int64_t cc_r;
5080 uint_t pc = 0, id, opc = 0;
5081 uint8_t ttop = 0;
5082 dif_instr_t instr;
5083 uint_t r1, r2, rd;
5084
5085 /*
5086 * We stash the current DIF object into the machine state: we need it
5087 * for subsequent access checking.
5088 */
5089 mstate->dtms_difo = difo;
5090
5091 regs[DIF_REG_R0] = 0; /* %r0 is fixed at zero */
5092
5093 while (pc < textlen && !(*flags & CPU_DTRACE_FAULT)) {
5094 opc = pc;
5095
5096 instr = text[pc++];
5097 r1 = DIF_INSTR_R1(instr);
5098 r2 = DIF_INSTR_R2(instr);
5099 rd = DIF_INSTR_RD(instr);
5100
5101 switch (DIF_INSTR_OP(instr)) {
5102 case DIF_OP_OR:
5103 regs[rd] = regs[r1] | regs[r2];
5104 break;
5105 case DIF_OP_XOR:
5106 regs[rd] = regs[r1] ^ regs[r2];
5107 break;
5108 case DIF_OP_AND:
5109 regs[rd] = regs[r1] & regs[r2];
5110 break;
5111 case DIF_OP_SLL:
5112 regs[rd] = regs[r1] << regs[r2];
5113 break;
5114 case DIF_OP_SRL:
5115 regs[rd] = regs[r1] >> regs[r2];
5116 break;
5117 case DIF_OP_SUB:
5118 regs[rd] = regs[r1] - regs[r2];
5119 break;
5120 case DIF_OP_ADD:
5121 regs[rd] = regs[r1] + regs[r2];
5122 break;
5123 case DIF_OP_MUL:
5124 regs[rd] = regs[r1] * regs[r2];
5125 break;
5126 case DIF_OP_SDIV:
5127 if (regs[r2] == 0) {
5128 regs[rd] = 0;
5129 *flags |= CPU_DTRACE_DIVZERO;
5130 } else {
5131 regs[rd] = (int64_t)regs[r1] /
5132 (int64_t)regs[r2];
5133 }
5134 break;
5135
5136 case DIF_OP_UDIV:
5137 if (regs[r2] == 0) {
5138 regs[rd] = 0;
5139 *flags |= CPU_DTRACE_DIVZERO;
5140 } else {
5141 regs[rd] = regs[r1] / regs[r2];
5142 }
5143 break;
5144
5145 case DIF_OP_SREM:
5146 if (regs[r2] == 0) {
5147 regs[rd] = 0;
5148 *flags |= CPU_DTRACE_DIVZERO;
5149 } else {
5150 regs[rd] = (int64_t)regs[r1] %
5151 (int64_t)regs[r2];
5152 }
5153 break;
5154
5155 case DIF_OP_UREM:
5156 if (regs[r2] == 0) {
5157 regs[rd] = 0;
5158 *flags |= CPU_DTRACE_DIVZERO;
5159 } else {
5160 regs[rd] = regs[r1] % regs[r2];
5161 }
5162 break;
5163
5164 case DIF_OP_NOT:
5165 regs[rd] = ~regs[r1];
5166 break;
5167 case DIF_OP_MOV:
5168 regs[rd] = regs[r1];
5169 break;
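		/*
		 * cmp and tst set processor-style N/Z/V/C condition codes,
		 * which the branches below combine: e.g. the signed "bge"
		 * tests (cc_n ^ cc_v) == 0, while the unsigned "bgeu" tests
		 * cc_c == 0.
		 */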
5170 case DIF_OP_CMP:
5171 cc_r = regs[r1] - regs[r2];
5172 cc_n = cc_r < 0;
5173 cc_z = cc_r == 0;
5174 cc_v = 0;
5175 cc_c = regs[r1] < regs[r2];
5176 break;
5177 case DIF_OP_TST:
5178 cc_n = cc_v = cc_c = 0;
5179 cc_z = regs[r1] == 0;
5180 break;
5181 case DIF_OP_BA:
5182 pc = DIF_INSTR_LABEL(instr);
5183 break;
5184 case DIF_OP_BE:
5185 if (cc_z)
5186 pc = DIF_INSTR_LABEL(instr);
5187 break;
5188 case DIF_OP_BNE:
5189 if (cc_z == 0)
5190 pc = DIF_INSTR_LABEL(instr);
5191 break;
5192 case DIF_OP_BG:
5193 if ((cc_z | (cc_n ^ cc_v)) == 0)
5194 pc = DIF_INSTR_LABEL(instr);
5195 break;
5196 case DIF_OP_BGU:
5197 if ((cc_c | cc_z) == 0)
5198 pc = DIF_INSTR_LABEL(instr);
5199 break;
5200 case DIF_OP_BGE:
5201 if ((cc_n ^ cc_v) == 0)
5202 pc = DIF_INSTR_LABEL(instr);
5203 break;
5204 case DIF_OP_BGEU:
5205 if (cc_c == 0)
5206 pc = DIF_INSTR_LABEL(instr);
5207 break;
5208 case DIF_OP_BL:
5209 if (cc_n ^ cc_v)
5210 pc = DIF_INSTR_LABEL(instr);
5211 break;
5212 case DIF_OP_BLU:
5213 if (cc_c)
5214 pc = DIF_INSTR_LABEL(instr);
5215 break;
5216 case DIF_OP_BLE:
5217 if (cc_z | (cc_n ^ cc_v))
5218 pc = DIF_INSTR_LABEL(instr);
5219 break;
5220 case DIF_OP_BLEU:
5221 if (cc_c | cc_z)
5222 pc = DIF_INSTR_LABEL(instr);
5223 break;
5224 case DIF_OP_RLDSB:
5225 if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
5226 *flags |= CPU_DTRACE_KPRIV;
5227 *illval = regs[r1];
5228 break;
5229 }
5230 /*FALLTHROUGH*/
5231 case DIF_OP_LDSB:
5232 regs[rd] = (int8_t)dtrace_load8(regs[r1]);
5233 break;
5234 case DIF_OP_RLDSH:
5235 if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
5236 *flags |= CPU_DTRACE_KPRIV;
5237 *illval = regs[r1];
5238 break;
5239 }
5240 /*FALLTHROUGH*/
5241 case DIF_OP_LDSH:
5242 regs[rd] = (int16_t)dtrace_load16(regs[r1]);
5243 break;
5244 case DIF_OP_RLDSW:
5245 if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
5246 *flags |= CPU_DTRACE_KPRIV;
5247 *illval = regs[r1];
5248 break;
5249 }
5250 /*FALLTHROUGH*/
5251 case DIF_OP_LDSW:
5252 regs[rd] = (int32_t)dtrace_load32(regs[r1]);
5253 break;
5254 case DIF_OP_RLDUB:
5255 if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
5256 *flags |= CPU_DTRACE_KPRIV;
5257 *illval = regs[r1];
5258 break;
5259 }
5260 /*FALLTHROUGH*/
5261 case DIF_OP_LDUB:
5262 regs[rd] = dtrace_load8(regs[r1]);
5263 break;
5264 case DIF_OP_RLDUH:
5265 if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
5266 *flags |= CPU_DTRACE_KPRIV;
5267 *illval = regs[r1];
5268 break;
5269 }
5270 /*FALLTHROUGH*/
5271 case DIF_OP_LDUH:
5272 regs[rd] = dtrace_load16(regs[r1]);
5273 break;
5274 case DIF_OP_RLDUW:
5275 if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
5276 *flags |= CPU_DTRACE_KPRIV;
5277 *illval = regs[r1];
5278 break;
5279 }
5280 /*FALLTHROUGH*/
5281 case DIF_OP_LDUW:
5282 regs[rd] = dtrace_load32(regs[r1]);
5283 break;
5284 case DIF_OP_RLDX:
5285 if (!dtrace_canstore(regs[r1], 8, mstate, vstate)) {
5286 *flags |= CPU_DTRACE_KPRIV;
5287 *illval = regs[r1];
5288 break;
5289 }
5290 /*FALLTHROUGH*/
5291 case DIF_OP_LDX:
5292 regs[rd] = dtrace_load64(regs[r1]);
5293 break;
/*
 * A 32-bit Darwin kernel may fetch from a 64-bit user address space, so
 * the user-load opcodes below -- DIF_OP_ULDSB, DIF_OP_ULDSH, DIF_OP_ULDSW,
 * DIF_OP_ULDUB, DIF_OP_ULDUH, DIF_OP_ULDUW and DIF_OP_ULDX -- must not
 * cast regs to uintptr_t.
 */
5300 case DIF_OP_ULDSB:
5301 regs[rd] = (int8_t)
5302 dtrace_fuword8(regs[r1]);
5303 break;
5304 case DIF_OP_ULDSH:
5305 regs[rd] = (int16_t)
5306 dtrace_fuword16(regs[r1]);
5307 break;
5308 case DIF_OP_ULDSW:
5309 regs[rd] = (int32_t)
5310 dtrace_fuword32(regs[r1]);
5311 break;
5312 case DIF_OP_ULDUB:
5313 regs[rd] =
5314 dtrace_fuword8(regs[r1]);
5315 break;
5316 case DIF_OP_ULDUH:
5317 regs[rd] =
5318 dtrace_fuword16(regs[r1]);
5319 break;
5320 case DIF_OP_ULDUW:
5321 regs[rd] =
5322 dtrace_fuword32(regs[r1]);
5323 break;
5324 case DIF_OP_ULDX:
5325 regs[rd] =
5326 dtrace_fuword64(regs[r1]);
5327 break;
5328 case DIF_OP_RET:
5329 rval = regs[rd];
5330 pc = textlen;
5331 break;
5332 case DIF_OP_NOP:
5333 break;
5334 case DIF_OP_SETX:
5335 regs[rd] = inttab[DIF_INSTR_INTEGER(instr)];
5336 break;
5337 case DIF_OP_SETS:
5338 regs[rd] = (uint64_t)(uintptr_t)
5339 (strtab + DIF_INSTR_STRING(instr));
5340 break;
5341 case DIF_OP_SCMP: {
5342 size_t sz = state->dts_options[DTRACEOPT_STRSIZE];
5343 uintptr_t s1 = regs[r1];
5344 uintptr_t s2 = regs[r2];
5345 size_t lim1 = sz, lim2 = sz;
5346
5347 if (s1 != 0 &&
5348 !dtrace_strcanload(s1, sz, &lim1, mstate, vstate))
5349 break;
5350 if (s2 != 0 &&
5351 !dtrace_strcanload(s2, sz, &lim2, mstate, vstate))
5352 break;
5353
5354 cc_r = dtrace_strncmp((char *)s1, (char *)s2,
5355 MIN(lim1, lim2));
5356
5357 cc_n = cc_r < 0;
5358 cc_z = cc_r == 0;
5359 cc_v = cc_c = 0;
5360 break;
5361 }
5362 case DIF_OP_LDGA:
5363 regs[rd] = dtrace_dif_variable(mstate, state,
5364 r1, regs[r2]);
5365 break;
5366 case DIF_OP_LDGS:
5367 id = DIF_INSTR_VAR(instr);
5368
5369 if (id >= DIF_VAR_OTHER_UBASE) {
5370 uintptr_t a;
5371
5372 id -= DIF_VAR_OTHER_UBASE;
5373 svar = vstate->dtvs_globals[id];
5374 ASSERT(svar != NULL);
5375 v = &svar->dtsv_var;
5376
5377 if (!(v->dtdv_type.dtdt_flags & DIF_TF_BYREF)) {
5378 regs[rd] = svar->dtsv_data;
5379 break;
5380 }
5381
5382 a = (uintptr_t)svar->dtsv_data;
5383
5384 if (*(uint8_t *)a == UINT8_MAX) {
5385 /*
5386 * If the 0th byte is set to UINT8_MAX
5387 * then this is to be treated as a
5388 * reference to a NULL variable.
5389 */
5390 regs[rd] = 0;
5391 } else {
5392 regs[rd] = a + sizeof (uint64_t);
5393 }
5394
5395 break;
5396 }
5397
5398 regs[rd] = dtrace_dif_variable(mstate, state, id, 0);
5399 break;
5400
5401 case DIF_OP_STGS:
5402 id = DIF_INSTR_VAR(instr);
5403
5404 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5405 id -= DIF_VAR_OTHER_UBASE;
5406
5407 VERIFY(id < (uint_t)vstate->dtvs_nglobals);
5408 svar = vstate->dtvs_globals[id];
5409 ASSERT(svar != NULL);
5410 v = &svar->dtsv_var;
5411
5412 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5413 uintptr_t a = (uintptr_t)svar->dtsv_data;
5414 size_t lim;
5415
5416 ASSERT(a != 0);
5417 ASSERT(svar->dtsv_size != 0);
5418
5419 if (regs[rd] == 0) {
5420 *(uint8_t *)a = UINT8_MAX;
5421 break;
5422 } else {
5423 *(uint8_t *)a = 0;
5424 a += sizeof (uint64_t);
5425 }
5426 if (!dtrace_vcanload(
5427 (void *)(uintptr_t)regs[rd], &v->dtdv_type,
5428 &lim, mstate, vstate))
5429 break;
5430
5431 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5432 (void *)a, &v->dtdv_type, lim);
5433 break;
5434 }
5435
5436 svar->dtsv_data = regs[rd];
5437 break;
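		/*
		 * Note on the by-ref layout used above (and by the
		 * local-variable cases below): a by-ref variable is stored as
		 * an 8-byte header followed by the payload, where byte 0 of
		 * the header is a NULL tag -- UINT8_MAX means the variable
		 * holds NULL -- which is why loads and stores step past
		 * sizeof (uint64_t) to reach the data.
		 */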
5438
5439 case DIF_OP_LDTA:
5440 /*
5441 * There are no DTrace built-in thread-local arrays at
5442 * present. This opcode is saved for future work.
5443 */
5444 *flags |= CPU_DTRACE_ILLOP;
5445 regs[rd] = 0;
5446 break;
5447
5448 case DIF_OP_LDLS:
5449 id = DIF_INSTR_VAR(instr);
5450
5451 if (id < DIF_VAR_OTHER_UBASE) {
5452 /*
5453 * For now, this has no meaning.
5454 */
5455 regs[rd] = 0;
5456 break;
5457 }
5458
5459 id -= DIF_VAR_OTHER_UBASE;
5460
5461 ASSERT(id < (uint_t)vstate->dtvs_nlocals);
5462 ASSERT(vstate->dtvs_locals != NULL);
5463 svar = vstate->dtvs_locals[id];
5464 ASSERT(svar != NULL);
5465 v = &svar->dtsv_var;
5466
5467 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5468 uintptr_t a = (uintptr_t)svar->dtsv_data;
5469 size_t sz = v->dtdv_type.dtdt_size;
5470
5471 sz += sizeof (uint64_t);
5472 ASSERT(svar->dtsv_size == (int)NCPU * sz);
5473 a += CPU->cpu_id * sz;
5474
5475 if (*(uint8_t *)a == UINT8_MAX) {
5476 /*
5477 * If the 0th byte is set to UINT8_MAX
5478 * then this is to be treated as a
5479 * reference to a NULL variable.
5480 */
5481 regs[rd] = 0;
5482 } else {
5483 regs[rd] = a + sizeof (uint64_t);
5484 }
5485
5486 break;
5487 }
5488
5489 ASSERT(svar->dtsv_size == (int)NCPU * sizeof (uint64_t));
5490 tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
5491 regs[rd] = tmp[CPU->cpu_id];
5492 break;
5493
5494 case DIF_OP_STLS:
5495 id = DIF_INSTR_VAR(instr);
5496
5497 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5498 id -= DIF_VAR_OTHER_UBASE;
5499 VERIFY(id < (uint_t)vstate->dtvs_nlocals);
5500 ASSERT(vstate->dtvs_locals != NULL);
5501 svar = vstate->dtvs_locals[id];
5502 ASSERT(svar != NULL);
5503 v = &svar->dtsv_var;
5504
5505 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5506 uintptr_t a = (uintptr_t)svar->dtsv_data;
5507 size_t sz = v->dtdv_type.dtdt_size;
5508 size_t lim;
5509
5510 sz += sizeof (uint64_t);
5511 ASSERT(svar->dtsv_size == (int)NCPU * sz);
5512 a += CPU->cpu_id * sz;
5513
5514 if (regs[rd] == 0) {
5515 *(uint8_t *)a = UINT8_MAX;
5516 break;
5517 } else {
5518 *(uint8_t *)a = 0;
5519 a += sizeof (uint64_t);
5520 }
5521
5522 if (!dtrace_vcanload(
5523 (void *)(uintptr_t)regs[rd], &v->dtdv_type,
5524 &lim, mstate, vstate))
5525 break;
5526
5527 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5528 (void *)a, &v->dtdv_type, lim);
5529 break;
5530 }
5531
5532 ASSERT(svar->dtsv_size == (int)NCPU * sizeof (uint64_t));
5533 tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
5534 tmp[CPU->cpu_id] = regs[rd];
5535 break;
5536
5537 case DIF_OP_LDTS: {
5538 dtrace_dynvar_t *dvar;
5539 dtrace_key_t *key;
5540
5541 id = DIF_INSTR_VAR(instr);
5542 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5543 id -= DIF_VAR_OTHER_UBASE;
5544 v = &vstate->dtvs_tlocals[id];
5545
5546 key = &tupregs[DIF_DTR_NREGS];
5547 key[0].dttk_value = (uint64_t)id;
5548 key[0].dttk_size = 0;
5549 DTRACE_TLS_THRKEY(key[1].dttk_value);
5550 key[1].dttk_size = 0;
5551
5552 dvar = dtrace_dynvar(dstate, 2, key,
5553 sizeof (uint64_t), DTRACE_DYNVAR_NOALLOC,
5554 mstate, vstate);
5555
5556 if (dvar == NULL) {
5557 regs[rd] = 0;
5558 break;
5559 }
5560
5561 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5562 regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
5563 } else {
5564 regs[rd] = *((uint64_t *)dvar->dtdv_data);
5565 }
5566
5567 break;
5568 }
5569
5570 case DIF_OP_STTS: {
5571 dtrace_dynvar_t *dvar;
5572 dtrace_key_t *key;
5573
5574 id = DIF_INSTR_VAR(instr);
5575 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5576 id -= DIF_VAR_OTHER_UBASE;
5577 VERIFY(id < (uint_t)vstate->dtvs_ntlocals);
5578
5579 key = &tupregs[DIF_DTR_NREGS];
5580 key[0].dttk_value = (uint64_t)id;
5581 key[0].dttk_size = 0;
5582 DTRACE_TLS_THRKEY(key[1].dttk_value);
5583 key[1].dttk_size = 0;
5584 v = &vstate->dtvs_tlocals[id];
5585
5586 dvar = dtrace_dynvar(dstate, 2, key,
5587 v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
5588 v->dtdv_type.dtdt_size : sizeof (uint64_t),
5589 regs[rd] ? DTRACE_DYNVAR_ALLOC :
5590 DTRACE_DYNVAR_DEALLOC, mstate, vstate);
5591
5592 /*
5593 * Given that we're storing to thread-local data,
5594 * we need to flush our predicate cache.
5595 */
5596 dtrace_set_thread_predcache(current_thread(), 0);
5597
5598 if (dvar == NULL)
5599 break;
5600
5601 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5602 size_t lim;
5603
5604 if (!dtrace_vcanload(
5605 (void *)(uintptr_t)regs[rd],
5606 &v->dtdv_type, &lim, mstate, vstate))
5607 break;
5608
5609 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5610 dvar->dtdv_data, &v->dtdv_type, lim);
5611 } else {
5612 *((uint64_t *)dvar->dtdv_data) = regs[rd];
5613 }
5614
5615 break;
5616 }
5617
5618 case DIF_OP_SRA:
5619 regs[rd] = (int64_t)regs[r1] >> regs[r2];
5620 break;
5621
5622 case DIF_OP_CALL:
5623 dtrace_dif_subr(DIF_INSTR_SUBR(instr), rd,
5624 regs, tupregs, ttop, mstate, state);
5625 break;
5626
5627 case DIF_OP_PUSHTR:
5628 if (ttop == DIF_DTR_NREGS) {
5629 *flags |= CPU_DTRACE_TUPOFLOW;
5630 break;
5631 }
5632
5633 if (r1 == DIF_TYPE_STRING) {
5634 /*
5635 * If this is a string type and the size is 0,
5636 * we'll use the system-wide default string
5637 * size. Note that we are _not_ looking at
5638 * the value of the DTRACEOPT_STRSIZE option;
5639 * had this been set, we would expect to have
5640 * a non-zero size value in the "pushtr".
5641 */
5642 tupregs[ttop].dttk_size =
5643 dtrace_strlen((char *)(uintptr_t)regs[rd],
5644 regs[r2] ? regs[r2] :
5645 dtrace_strsize_default) + 1;
5646 } else {
5647 if (regs[r2] > LONG_MAX) {
5648 *flags |= CPU_DTRACE_ILLOP;
5649 break;
5650 }
5651 tupregs[ttop].dttk_size = regs[r2];
5652 }
5653
5654 tupregs[ttop++].dttk_value = regs[rd];
5655 break;
5656
5657 case DIF_OP_PUSHTV:
5658 if (ttop == DIF_DTR_NREGS) {
5659 *flags |= CPU_DTRACE_TUPOFLOW;
5660 break;
5661 }
5662
5663 tupregs[ttop].dttk_value = regs[rd];
5664 tupregs[ttop++].dttk_size = 0;
5665 break;
5666
5667 case DIF_OP_POPTS:
5668 if (ttop != 0)
5669 ttop--;
5670 break;
5671
5672 case DIF_OP_FLUSHTS:
5673 ttop = 0;
5674 break;
5675
5676 case DIF_OP_LDGAA:
5677 case DIF_OP_LDTAA: {
5678 dtrace_dynvar_t *dvar;
5679 dtrace_key_t *key = tupregs;
5680 uint_t nkeys = ttop;
5681
5682 id = DIF_INSTR_VAR(instr);
5683 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5684 id -= DIF_VAR_OTHER_UBASE;
5685
5686 key[nkeys].dttk_value = (uint64_t)id;
5687 key[nkeys++].dttk_size = 0;
5688
5689 if (DIF_INSTR_OP(instr) == DIF_OP_LDTAA) {
5690 DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
5691 key[nkeys++].dttk_size = 0;
5692 VERIFY(id < (uint_t)vstate->dtvs_ntlocals);
5693 v = &vstate->dtvs_tlocals[id];
5694 } else {
5695 VERIFY(id < (uint_t)vstate->dtvs_nglobals);
5696 v = &vstate->dtvs_globals[id]->dtsv_var;
5697 }
5698
5699 dvar = dtrace_dynvar(dstate, nkeys, key,
5700 v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
5701 v->dtdv_type.dtdt_size : sizeof (uint64_t),
5702 DTRACE_DYNVAR_NOALLOC, mstate, vstate);
5703
5704 if (dvar == NULL) {
5705 regs[rd] = 0;
5706 break;
5707 }
5708
5709 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5710 regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
5711 } else {
5712 regs[rd] = *((uint64_t *)dvar->dtdv_data);
5713 }
5714
5715 break;
5716 }
5717
5718 case DIF_OP_STGAA:
5719 case DIF_OP_STTAA: {
5720 dtrace_dynvar_t *dvar;
5721 dtrace_key_t *key = tupregs;
5722 uint_t nkeys = ttop;
5723
5724 id = DIF_INSTR_VAR(instr);
5725 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5726 id -= DIF_VAR_OTHER_UBASE;
5727
5728 key[nkeys].dttk_value = (uint64_t)id;
5729 key[nkeys++].dttk_size = 0;
5730
5731 if (DIF_INSTR_OP(instr) == DIF_OP_STTAA) {
5732 DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
5733 key[nkeys++].dttk_size = 0;
5734 VERIFY(id < (uint_t)vstate->dtvs_ntlocals);
5735 v = &vstate->dtvs_tlocals[id];
5736 } else {
5737 VERIFY(id < (uint_t)vstate->dtvs_nglobals);
5738 v = &vstate->dtvs_globals[id]->dtsv_var;
5739 }
5740
5741 dvar = dtrace_dynvar(dstate, nkeys, key,
5742 v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
5743 v->dtdv_type.dtdt_size : sizeof (uint64_t),
5744 regs[rd] ? DTRACE_DYNVAR_ALLOC :
5745 DTRACE_DYNVAR_DEALLOC, mstate, vstate);
5746
5747 if (dvar == NULL)
5748 break;
5749
5750 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5751 size_t lim;
5752
5753 if (!dtrace_vcanload(
5754 (void *)(uintptr_t)regs[rd], &v->dtdv_type,
5755 &lim, mstate, vstate))
5756 break;
5757
5758 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5759 dvar->dtdv_data, &v->dtdv_type, lim);
5760 } else {
5761 *((uint64_t *)dvar->dtdv_data) = regs[rd];
5762 }
5763
5764 break;
5765 }
5766
5767 case DIF_OP_ALLOCS: {
5768 uintptr_t ptr = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
5769 size_t size = ptr - mstate->dtms_scratch_ptr + regs[r1];
5770
5771 /*
5772 * Rounding up the user allocation size could have
5773 * overflowed large, bogus allocations (like -1ULL) to
5774 * 0.
5775 */
5776 if (size < regs[r1] ||
5777 !DTRACE_INSCRATCH(mstate, size)) {
5778 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5779 regs[rd] = 0;
5780 break;
5781 }
5782
5783 dtrace_bzero((void *) mstate->dtms_scratch_ptr, size);
5784 mstate->dtms_scratch_ptr += size;
5785 regs[rd] = ptr;
5786 break;
5787 }
5788
5789 case DIF_OP_COPYS:
5790 if (!dtrace_canstore(regs[rd], regs[r2],
5791 mstate, vstate)) {
5792 *flags |= CPU_DTRACE_BADADDR;
5793 *illval = regs[rd];
5794 break;
5795 }
5796
5797 if (!dtrace_canload(regs[r1], regs[r2], mstate, vstate))
5798 break;
5799
5800 dtrace_bcopy((void *)(uintptr_t)regs[r1],
5801 (void *)(uintptr_t)regs[rd], (size_t)regs[r2]);
5802 break;
5803
5804 case DIF_OP_STB:
5805 if (!dtrace_canstore(regs[rd], 1, mstate, vstate)) {
5806 *flags |= CPU_DTRACE_BADADDR;
5807 *illval = regs[rd];
5808 break;
5809 }
5810 *((uint8_t *)(uintptr_t)regs[rd]) = (uint8_t)regs[r1];
5811 break;
5812
5813 case DIF_OP_STH:
5814 if (!dtrace_canstore(regs[rd], 2, mstate, vstate)) {
5815 *flags |= CPU_DTRACE_BADADDR;
5816 *illval = regs[rd];
5817 break;
5818 }
5819 if (regs[rd] & 1) {
5820 *flags |= CPU_DTRACE_BADALIGN;
5821 *illval = regs[rd];
5822 break;
5823 }
5824 *((uint16_t *)(uintptr_t)regs[rd]) = (uint16_t)regs[r1];
5825 break;
5826
5827 case DIF_OP_STW:
5828 if (!dtrace_canstore(regs[rd], 4, mstate, vstate)) {
5829 *flags |= CPU_DTRACE_BADADDR;
5830 *illval = regs[rd];
5831 break;
5832 }
5833 if (regs[rd] & 3) {
5834 *flags |= CPU_DTRACE_BADALIGN;
5835 *illval = regs[rd];
5836 break;
5837 }
5838 *((uint32_t *)(uintptr_t)regs[rd]) = (uint32_t)regs[r1];
5839 break;
5840
5841 case DIF_OP_STX:
5842 if (!dtrace_canstore(regs[rd], 8, mstate, vstate)) {
5843 *flags |= CPU_DTRACE_BADADDR;
5844 *illval = regs[rd];
5845 break;
5846 }
5847
5848 /*
			 * Darwin: memory allocated by kmem_zalloc() from
			 * dtrace_difo_init() is only guaranteed to be 4-byte
			 * aligned, so check 4-byte rather than 8-byte alignment.
5851 */
5852 if (regs[rd] & 3) {
5853 *flags |= CPU_DTRACE_BADALIGN;
5854 *illval = regs[rd];
5855 break;
5856 }
5857 *((uint64_t *)(uintptr_t)regs[rd]) = regs[r1];
5858 break;
5859 }
5860 }
5861
5862 if (!(*flags & CPU_DTRACE_FAULT))
5863 return (rval);
5864
5865 mstate->dtms_fltoffs = opc * sizeof (dif_instr_t);
5866 mstate->dtms_present |= DTRACE_MSTATE_FLTOFFS;
5867
5868 return (0);
5869}
5870
5871static void
5872dtrace_action_breakpoint(dtrace_ecb_t *ecb)
5873{
5874 dtrace_probe_t *probe = ecb->dte_probe;
5875 dtrace_provider_t *prov = probe->dtpr_provider;
5876 char c[DTRACE_FULLNAMELEN + 80], *str;
5877 const char *msg = "dtrace: breakpoint action at probe ";
5878 const char *ecbmsg = " (ecb ";
5879 uintptr_t mask = (0xf << (sizeof (uintptr_t) * NBBY / 4));
5880 uintptr_t val = (uintptr_t)ecb;
5881 int shift = (sizeof (uintptr_t) * NBBY) - 4, i = 0;
5882
5883 if (dtrace_destructive_disallow)
5884 return;
5885
5886 /*
5887 * It's impossible to be taking action on the NULL probe.
5888 */
5889 ASSERT(probe != NULL);
5890
5891 /*
5892 * This is a poor man's (destitute man's?) sprintf(): we want to
5893 * print the provider name, module name, function name and name of
5894 * the probe, along with the hex address of the ECB with the breakpoint
5895 * action -- all of which we must place in the character buffer by
5896 * hand.
5897 */
5898 while (*msg != '\0')
5899 c[i++] = *msg++;
5900
5901 for (str = prov->dtpv_name; *str != '\0'; str++)
5902 c[i++] = *str;
5903 c[i++] = ':';
5904
5905 for (str = probe->dtpr_mod; *str != '\0'; str++)
5906 c[i++] = *str;
5907 c[i++] = ':';
5908
5909 for (str = probe->dtpr_func; *str != '\0'; str++)
5910 c[i++] = *str;
5911 c[i++] = ':';
5912
5913 for (str = probe->dtpr_name; *str != '\0'; str++)
5914 c[i++] = *str;
5915
5916 while (*ecbmsg != '\0')
5917 c[i++] = *ecbmsg++;
5918
5919 while (shift >= 0) {
5920 mask = (uintptr_t)0xf << shift;
5921
5922 if (val >= ((uintptr_t)1 << shift))
5923 c[i++] = "0123456789abcdef"[(val & mask) >> shift];
5924 shift -= 4;
5925 }
5926
5927 c[i++] = ')';
5928 c[i] = '\0';
5929
5930 debug_enter(c);
5931}
5932
5933static void
5934dtrace_action_panic(dtrace_ecb_t *ecb)
5935{
5936 dtrace_probe_t *probe = ecb->dte_probe;
5937
5938 /*
5939 * It's impossible to be taking action on the NULL probe.
5940 */
5941 ASSERT(probe != NULL);
5942
5943 if (dtrace_destructive_disallow)
5944 return;
5945
5946 if (dtrace_panicked != NULL)
5947 return;
5948
5949 if (dtrace_casptr(&dtrace_panicked, NULL, current_thread()) != NULL)
5950 return;
5951
5952 /*
5953 * We won the right to panic. (We want to be sure that only one
5954 * thread calls panic() from dtrace_probe(), and that panic() is
5955 * called exactly once.)
5956 */
5957 panic("dtrace: panic action at probe %s:%s:%s:%s (ecb %p)",
5958 probe->dtpr_provider->dtpv_name, probe->dtpr_mod,
5959 probe->dtpr_func, probe->dtpr_name, (void *)ecb);
5960
5961 /*
5962 * APPLE NOTE: this was for an old Mac OS X debug feature
5963 * allowing a return from panic(). Revisit someday.
5964 */
5965 dtrace_panicked = NULL;
5966}
5967
5968static void
5969dtrace_action_raise(uint64_t sig)
5970{
5971 if (dtrace_destructive_disallow)
5972 return;
5973
5974 if (sig >= NSIG) {
5975 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5976 return;
5977 }
5978
5979 /*
5980 * raise() has a queue depth of 1 -- we ignore all subsequent
5981 * invocations of the raise() action.
5982 */
5983
5984 uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
5985
5986 if (uthread && uthread->t_dtrace_sig == 0) {
5987 uthread->t_dtrace_sig = sig;
5988 act_set_astbsd(current_thread());
5989 }
5990}
5991
5992static void
5993dtrace_action_stop(void)
5994{
5995 if (dtrace_destructive_disallow)
5996 return;
5997
5998 uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
5999 if (uthread) {
6000 /*
		 * The currently running process will be task_suspend()ed
		 * when it next leaves the kernel.
6003 */
6004 uthread->t_dtrace_stop = 1;
6005 act_set_astbsd(current_thread());
6006 }
6007}
6008
6009
6010/*
6011 * APPLE NOTE: pidresume works in conjunction with the dtrace stop action.
6012 * Both activate only when the currently running process next leaves the
6013 * kernel.
6014 */
6015static void
6016dtrace_action_pidresume(uint64_t pid)
6017{
6018 if (dtrace_destructive_disallow)
6019 return;
6020
6021 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
6022 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
6023 return;
6024 }
6025 uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
6026
6027 /*
6028 * When the currently running process leaves the kernel, it attempts to
6029 * task_resume the process (denoted by pid), if that pid appears to have
6030 * been stopped by dtrace_action_stop().
6031 * The currently running process has a pidresume() queue depth of 1 --
6032 * subsequent invocations of the pidresume() action are ignored.
6033 */
6034
6035 if (pid != 0 && uthread && uthread->t_dtrace_resumepid == 0) {
6036 uthread->t_dtrace_resumepid = pid;
6037 act_set_astbsd(current_thread());
6038 }
6039}
6040
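/*
 * The chill() action spins on-CPU for the requested number of nanoseconds.
 * To bound the damage, the total time chilled is capped per interval (at
 * most dtrace_chill_max per dtrace_chill_interval); requests that would
 * exceed the cap -- or overflow the accounting -- fail with CPU_DTRACE_ILLOP.
 */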
6041static void
6042dtrace_action_chill(dtrace_mstate_t *mstate, hrtime_t val)
6043{
6044 hrtime_t now;
6045 volatile uint16_t *flags;
6046 dtrace_cpu_t *cpu = CPU;
6047
6048 if (dtrace_destructive_disallow)
6049 return;
6050
6051 flags = (volatile uint16_t *)&cpu_core[cpu->cpu_id].cpuc_dtrace_flags;
6052
6053 now = dtrace_gethrtime();
6054
6055 if (now - cpu->cpu_dtrace_chillmark > dtrace_chill_interval) {
6056 /*
6057 * We need to advance the mark to the current time.
6058 */
6059 cpu->cpu_dtrace_chillmark = now;
6060 cpu->cpu_dtrace_chilled = 0;
6061 }
6062
6063 /*
6064 * Now check to see if the requested chill time would take us over
6065 * the maximum amount of time allowed in the chill interval. (Or
6066 * worse, if the calculation itself induces overflow.)
6067 */
6068 if (cpu->cpu_dtrace_chilled + val > dtrace_chill_max ||
6069 cpu->cpu_dtrace_chilled + val < cpu->cpu_dtrace_chilled) {
6070 *flags |= CPU_DTRACE_ILLOP;
6071 return;
6072 }
6073
6074 while (dtrace_gethrtime() - now < val)
6075 continue;
6076
6077 /*
6078 * Normally, we assure that the value of the variable "timestamp" does
6079 * not change within an ECB. The presence of chill() represents an
6080 * exception to this rule, however.
6081 */
6082 mstate->dtms_present &= ~DTRACE_MSTATE_TIMESTAMP;
6083 cpu->cpu_dtrace_chilled += val;
6084}
6085
6086static void
6087dtrace_action_ustack(dtrace_mstate_t *mstate, dtrace_state_t *state,
6088 uint64_t *buf, uint64_t arg)
6089{
6090 int nframes = DTRACE_USTACK_NFRAMES(arg);
6091 int strsize = DTRACE_USTACK_STRSIZE(arg);
6092 uint64_t *pcs = &buf[1], *fps;
6093 char *str = (char *)&pcs[nframes];
6094 int size, offs = 0, i, j;
6095 uintptr_t old = mstate->dtms_scratch_ptr, saved;
6096 uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
6097 char *sym;
6098
6099 /*
6100 * Should be taking a faster path if string space has not been
6101 * allocated.
6102 */
6103 ASSERT(strsize != 0);
6104
6105 /*
6106 * We will first allocate some temporary space for the frame pointers.
6107 */
6108 fps = (uint64_t *)P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
6109 size = (uintptr_t)fps - mstate->dtms_scratch_ptr +
6110 (nframes * sizeof (uint64_t));
6111
6112 if (!DTRACE_INSCRATCH(mstate, (uintptr_t)size)) {
6113 /*
6114 * Not enough room for our frame pointers -- need to indicate
6115 * that we ran out of scratch space.
6116 */
6117 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
6118 return;
6119 }
6120
6121 mstate->dtms_scratch_ptr += size;
6122 saved = mstate->dtms_scratch_ptr;
6123
6124 /*
6125 * Now get a stack with both program counters and frame pointers.
6126 */
6127 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6128 dtrace_getufpstack(buf, fps, nframes + 1);
6129 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6130
6131 /*
6132 * If that faulted, we're cooked.
6133 */
6134 if (*flags & CPU_DTRACE_FAULT)
6135 goto out;
6136
6137 /*
6138 * Now we want to walk up the stack, calling the USTACK helper. For
6139 * each iteration, we restore the scratch pointer.
6140 */
6141 for (i = 0; i < nframes; i++) {
6142 mstate->dtms_scratch_ptr = saved;
6143
6144 if (offs >= strsize)
6145 break;
6146
6147 sym = (char *)(uintptr_t)dtrace_helper(
6148 DTRACE_HELPER_ACTION_USTACK,
6149 mstate, state, pcs[i], fps[i]);
6150
6151 /*
6152 * If we faulted while running the helper, we're going to
6153 * clear the fault and null out the corresponding string.
6154 */
6155 if (*flags & CPU_DTRACE_FAULT) {
6156 *flags &= ~CPU_DTRACE_FAULT;
6157 str[offs++] = '\0';
6158 continue;
6159 }
6160
6161 if (sym == NULL) {
6162 str[offs++] = '\0';
6163 continue;
6164 }
6165
6166 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6167
6168 /*
6169 * Now copy in the string that the helper returned to us.
6170 */
6171 for (j = 0; offs + j < strsize; j++) {
6172 if ((str[offs + j] = sym[j]) == '\0')
6173 break;
6174 }
6175
6176 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6177
6178 offs += j + 1;
6179 }
6180
6181 if (offs >= strsize) {
6182 /*
6183 * If we didn't have room for all of the strings, we don't
6184 * abort processing -- this needn't be a fatal error -- but we
6185 * still want to increment a counter (dts_stkstroverflows) to
6186 * allow this condition to be warned about. (If this is from
6187 * a jstack() action, it is easily tuned via jstackstrsize.)
6188 */
6189 dtrace_error(&state->dts_stkstroverflows);
6190 }
6191
6192 while (offs < strsize)
6193 str[offs++] = '\0';
6194
6195out:
6196 mstate->dtms_scratch_ptr = old;
6197}
6198
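/*
 * Copy a by-reference value -- kernel (DIF_TF_BYREF) or user (DIF_TF_BYUREF)
 * -- into the principal buffer one byte at a time. For string types, loading
 * stops at the terminating NUL, after which the remainder of the record is
 * zero-filled (or, if we're storing into a tuple, the copy stops early).
 */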
6199static void
6200dtrace_store_by_ref(dtrace_difo_t *dp, caddr_t tomax, size_t size,
6201 size_t *valoffsp, uint64_t *valp, uint64_t end, int intuple, int dtkind)
6202{
6203 volatile uint16_t *flags;
6204 uint64_t val = *valp;
6205 size_t valoffs = *valoffsp;
6206
6207 flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
6208 ASSERT(dtkind == DIF_TF_BYREF || dtkind == DIF_TF_BYUREF);
6209
6210 /*
6211 * If this is a string, we're going to only load until we find the zero
6212 * byte -- after which we'll store zero bytes.
6213 */
6214 if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
6215 char c = '\0' + 1;
6216 size_t s;
6217
6218 for (s = 0; s < size; s++) {
6219 if (c != '\0' && dtkind == DIF_TF_BYREF) {
6220 c = dtrace_load8(val++);
6221 } else if (c != '\0' && dtkind == DIF_TF_BYUREF) {
6222 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6223 c = dtrace_fuword8((user_addr_t)(uintptr_t)val++);
6224 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6225 if (*flags & CPU_DTRACE_FAULT)
6226 break;
6227 }
6228
6229 DTRACE_STORE(uint8_t, tomax, valoffs++, c);
6230
6231 if (c == '\0' && intuple)
6232 break;
6233 }
6234 } else {
6235 uint8_t c;
6236 while (valoffs < end) {
6237 if (dtkind == DIF_TF_BYREF) {
6238 c = dtrace_load8(val++);
6239 } else if (dtkind == DIF_TF_BYUREF) {
6240 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6241 c = dtrace_fuword8((user_addr_t)(uintptr_t)val++);
6242 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6243 if (*flags & CPU_DTRACE_FAULT)
6244 break;
6245 }
6246
6247 DTRACE_STORE(uint8_t, tomax,
6248 valoffs++, c);
6249 }
6250 }
6251
6252 *valp = val;
6253 *valoffsp = valoffs;
6254}
6255
6256/*
6257 * If you're looking for the epicenter of DTrace, you just found it. This
6258 * is the function called by the provider to fire a probe -- from which all
6259 * subsequent probe-context DTrace activity emanates.
6260 */
6261static void
6262__dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1,
6263 uint64_t arg2, uint64_t arg3, uint64_t arg4)
6264{
6265 processorid_t cpuid;
6266 dtrace_icookie_t cookie;
6267 dtrace_probe_t *probe;
6268 dtrace_mstate_t mstate;
6269 dtrace_ecb_t *ecb;
6270 dtrace_action_t *act;
6271 intptr_t offs;
6272 size_t size;
6273 int vtime, onintr;
6274 volatile uint16_t *flags;
6275 hrtime_t now;
6276
6277 cookie = dtrace_interrupt_disable();
6278 probe = dtrace_probes[id - 1];
6279 cpuid = CPU->cpu_id;
6280 onintr = CPU_ON_INTR(CPU);
6281
6282 if (!onintr && probe->dtpr_predcache != DTRACE_CACHEIDNONE &&
6283 probe->dtpr_predcache == dtrace_get_thread_predcache(current_thread())) {
6284 /*
6285 * We have hit in the predicate cache; we know that
6286 * this predicate would evaluate to be false.
6287 */
6288 dtrace_interrupt_enable(cookie);
6289 return;
6290 }
6291
6292 if (panic_quiesce) {
6293 /*
6294 * We don't trace anything if we're panicking.
6295 */
6296 dtrace_interrupt_enable(cookie);
6297 return;
6298 }
6299
6300#if !defined(__APPLE__)
6301 now = dtrace_gethrtime();
6302 vtime = dtrace_vtime_references != 0;
6303
6304 if (vtime && curthread->t_dtrace_start)
6305 curthread->t_dtrace_vtime += now - curthread->t_dtrace_start;
6306#else
6307 /*
6308 * APPLE NOTE: The time spent entering DTrace and arriving
	 * at this point is attributed to the current thread.
6310 * Instead it should accrue to DTrace. FIXME
6311 */
6312 vtime = dtrace_vtime_references != 0;
6313
	if (vtime) {
6316 int64_t dtrace_accum_time, recent_vtime;
6317 thread_t thread = current_thread();
6318
6319 dtrace_accum_time = dtrace_get_thread_tracing(thread); /* Time spent inside DTrace so far (nanoseconds) */
6320
6321 if (dtrace_accum_time >= 0) {
6322 recent_vtime = dtrace_abs_to_nano(dtrace_calc_thread_recent_vtime(thread)); /* up to the moment thread vtime */
6323
6324 recent_vtime = recent_vtime - dtrace_accum_time; /* Time without DTrace contribution */
6325
6326 dtrace_set_thread_vtime(thread, recent_vtime);
6327 }
6328 }
6329
6330 now = dtrace_gethrtime(); /* must not precede dtrace_calc_thread_recent_vtime() call! */
6331#endif /* __APPLE__ */
6332
6333 /*
6334 * APPLE NOTE: A provider may call dtrace_probe_error() in lieu of
6335 * dtrace_probe() in some circumstances. See, e.g. fasttrap_isa.c.
6336 * However the provider has no access to ECB context, so passes
6337 * 0 through "arg0" and the probe_id of the overridden probe as arg1.
6338 * Detect that here and cons up a viable state (from the probe_id).
6339 */
6340 if (dtrace_probeid_error == id && 0 == arg0) {
6341 dtrace_id_t ftp_id = (dtrace_id_t)arg1;
6342 dtrace_probe_t *ftp_probe = dtrace_probes[ftp_id - 1];
6343 dtrace_ecb_t *ftp_ecb = ftp_probe->dtpr_ecb;
6344
6345 if (NULL != ftp_ecb) {
6346 dtrace_state_t *ftp_state = ftp_ecb->dte_state;
6347
6348 arg0 = (uint64_t)(uintptr_t)ftp_state;
6349 arg1 = ftp_ecb->dte_epid;
6350 /*
6351 * args[2-4] established by caller.
6352 */
6353 ftp_state->dts_arg_error_illval = -1; /* arg5 */
6354 }
6355 }
6356
6357 mstate.dtms_difo = NULL;
6358 mstate.dtms_probe = probe;
6359 mstate.dtms_strtok = 0;
6360 mstate.dtms_arg[0] = arg0;
6361 mstate.dtms_arg[1] = arg1;
6362 mstate.dtms_arg[2] = arg2;
6363 mstate.dtms_arg[3] = arg3;
6364 mstate.dtms_arg[4] = arg4;
6365
6366 flags = (volatile uint16_t *)&cpu_core[cpuid].cpuc_dtrace_flags;
6367
6368 for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
6369 dtrace_predicate_t *pred = ecb->dte_predicate;
6370 dtrace_state_t *state = ecb->dte_state;
6371 dtrace_buffer_t *buf = &state->dts_buffer[cpuid];
6372 dtrace_buffer_t *aggbuf = &state->dts_aggbuffer[cpuid];
6373 dtrace_vstate_t *vstate = &state->dts_vstate;
6374 dtrace_provider_t *prov = probe->dtpr_provider;
6375 uint64_t tracememsize = 0;
6376 int committed = 0;
6377 caddr_t tomax;
6378
6379 /*
6380 * A little subtlety with the following (seemingly innocuous)
6381 * declaration of the automatic 'val': by looking at the
6382 * code, you might think that it could be declared in the
6383 * action processing loop, below. (That is, it's only used in
6384 * the action processing loop.) However, it must be declared
6385 * out of that scope because in the case of DIF expression
6386 * arguments to aggregating actions, one iteration of the
6387 * action loop will use the last iteration's value.
6388 */
		uint64_t val = 0;
6394
6395 mstate.dtms_present = DTRACE_MSTATE_ARGS | DTRACE_MSTATE_PROBE;
6396 *flags &= ~CPU_DTRACE_ERROR;
6397
6398 if (prov == dtrace_provider) {
6399 /*
6400 * If dtrace itself is the provider of this probe,
6401 * we're only going to continue processing the ECB if
6402 * arg0 (the dtrace_state_t) is equal to the ECB's
6403 * creating state. (This prevents disjoint consumers
6404 * from seeing one another's metaprobes.)
6405 */
6406 if (arg0 != (uint64_t)(uintptr_t)state)
6407 continue;
6408 }
6409
6410 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE) {
6411 /*
6412 * We're not currently active. If our provider isn't
6413 * the dtrace pseudo provider, we're not interested.
6414 */
6415 if (prov != dtrace_provider)
6416 continue;
6417
6418 /*
6419 * Now we must further check if we are in the BEGIN
6420 * probe. If we are, we will only continue processing
6421 * if we're still in WARMUP -- if one BEGIN enabling
6422 * has invoked the exit() action, we don't want to
6423 * evaluate subsequent BEGIN enablings.
6424 */
6425 if (probe->dtpr_id == dtrace_probeid_begin &&
6426 state->dts_activity != DTRACE_ACTIVITY_WARMUP) {
6427 ASSERT(state->dts_activity ==
6428 DTRACE_ACTIVITY_DRAINING);
6429 continue;
6430 }
6431 }
6432
6433 if (ecb->dte_cond) {
6434 /*
6435 * If the dte_cond bits indicate that this
6436 * consumer is only allowed to see user-mode firings
6437 * of this probe, call the provider's dtps_usermode()
6438 * entry point to check that the probe was fired
6439 * while in a user context. Skip this ECB if that's
6440 * not the case.
6441 */
6442 if ((ecb->dte_cond & DTRACE_COND_USERMODE) &&
6443 prov->dtpv_pops.dtps_usermode &&
6444 prov->dtpv_pops.dtps_usermode(prov->dtpv_arg,
6445 probe->dtpr_id, probe->dtpr_arg) == 0)
6446 continue;
6447
6448 /*
6449 * This is more subtle than it looks. We have to be
6450 * absolutely certain that CRED() isn't going to
6451 * change out from under us so it's only legit to
6452 * examine that structure if we're in constrained
			 * situations. Currently, the only time we'll perform
			 * this check is when a non-super-user has enabled the
6455 * profile or syscall providers -- providers that
6456 * allow visibility of all processes. For the
6457 * profile case, the check above will ensure that
6458 * we're examining a user context.
6459 */
6460 if (ecb->dte_cond & DTRACE_COND_OWNER) {
6461 cred_t *cr;
6462 cred_t *s_cr =
6463 ecb->dte_state->dts_cred.dcr_cred;
6464 proc_t *proc;
6465#pragma unused(proc) /* __APPLE__ */
6466
6467 ASSERT(s_cr != NULL);
6468
6469 /*
6470 * XXX this is hackish, but so is setting a variable
6471 * XXX in a McCarthy OR...
6472 */
6473 if ((cr = dtrace_CRED()) == NULL ||
6474 posix_cred_get(s_cr)->cr_uid != posix_cred_get(cr)->cr_uid ||
6475 posix_cred_get(s_cr)->cr_uid != posix_cred_get(cr)->cr_ruid ||
6476 posix_cred_get(s_cr)->cr_uid != posix_cred_get(cr)->cr_suid ||
6477 posix_cred_get(s_cr)->cr_gid != posix_cred_get(cr)->cr_gid ||
6478 posix_cred_get(s_cr)->cr_gid != posix_cred_get(cr)->cr_rgid ||
6479 posix_cred_get(s_cr)->cr_gid != posix_cred_get(cr)->cr_sgid ||
6480#if !defined(__APPLE__)
6481 (proc = ttoproc(curthread)) == NULL ||
6482 (proc->p_flag & SNOCD))
6483#else
6484 1) /* APPLE NOTE: Darwin omits "No Core Dump" flag */
6485#endif /* __APPLE__ */
6486 continue;
6487 }
6488
6489 if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
6490 cred_t *cr;
6491 cred_t *s_cr =
6492 ecb->dte_state->dts_cred.dcr_cred;
6493#pragma unused(cr, s_cr) /* __APPLE__ */
6494
6495 ASSERT(s_cr != NULL);
6496
6497#if !defined(__APPLE__)
6498 if ((cr = CRED()) == NULL ||
6499 s_cr->cr_zone->zone_id !=
6500 cr->cr_zone->zone_id)
6501 continue;
6502#else
6503 /* APPLE NOTE: Darwin doesn't do zones. */
6504#endif /* __APPLE__ */
6505 }
6506 }
6507
6508 if (now - state->dts_alive > dtrace_deadman_timeout) {
6509 /*
6510 * We seem to be dead. Unless we (a) have kernel
			 * destructive permissions, (b) have explicitly enabled
			 * destructive actions, and (c) destructive actions have
6513 * not been disabled, we're going to transition into
6514 * the KILLED state, from which no further processing
6515 * on this state will be performed.
6516 */
6517 if (!dtrace_priv_kernel_destructive(state) ||
6518 !state->dts_cred.dcr_destructive ||
6519 dtrace_destructive_disallow) {
6520 void *activity = &state->dts_activity;
6521 dtrace_activity_t current;
6522
6523 do {
6524 current = state->dts_activity;
6525 } while (dtrace_cas32(activity, current,
6526 DTRACE_ACTIVITY_KILLED) != current);
6527
6528 continue;
6529 }
6530 }
6531
6532 if ((offs = dtrace_buffer_reserve(buf, ecb->dte_needed,
6533 ecb->dte_alignment, state, &mstate)) < 0)
6534 continue;
6535
6536 tomax = buf->dtb_tomax;
6537 ASSERT(tomax != NULL);
6538
6539 /*
6540 * Build and store the record header corresponding to the ECB.
6541 */
6542 if (ecb->dte_size != 0) {
6543 dtrace_rechdr_t dtrh;
6544
6545 if (!(mstate.dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
6546 mstate.dtms_timestamp = dtrace_gethrtime();
6547 mstate.dtms_present |= DTRACE_MSTATE_TIMESTAMP;
6548 }
6549
6550 ASSERT(ecb->dte_size >= sizeof(dtrace_rechdr_t));
6551
6552 dtrh.dtrh_epid = ecb->dte_epid;
6553 DTRACE_RECORD_STORE_TIMESTAMP(&dtrh, mstate.dtms_timestamp);
6554 DTRACE_STORE(dtrace_rechdr_t, tomax, offs, dtrh);
6555 }
6556
6557 mstate.dtms_epid = ecb->dte_epid;
6558 mstate.dtms_present |= DTRACE_MSTATE_EPID;
6559
6560 if (state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)
6561 mstate.dtms_access = DTRACE_ACCESS_KERNEL;
6562 else
6563 mstate.dtms_access = 0;
6564
6565 if (pred != NULL) {
6566 dtrace_difo_t *dp = pred->dtp_difo;
6567 uint64_t rval;
6568
6569 rval = dtrace_dif_emulate(dp, &mstate, vstate, state);
6570
6571 if (!(*flags & CPU_DTRACE_ERROR) && !rval) {
6572 dtrace_cacheid_t cid = probe->dtpr_predcache;
6573
6574 if (cid != DTRACE_CACHEIDNONE && !onintr) {
6575 /*
6576 * Update the predicate cache...
6577 */
6578 ASSERT(cid == pred->dtp_cacheid);
6579
6580 dtrace_set_thread_predcache(current_thread(), cid);
6581 }
6582
6583 continue;
6584 }
6585 }
6586
6587 for (act = ecb->dte_action; !(*flags & CPU_DTRACE_ERROR) &&
6588 act != NULL; act = act->dta_next) {
6589 size_t valoffs;
6590 dtrace_difo_t *dp;
6591 dtrace_recdesc_t *rec = &act->dta_rec;
6592
6593 size = rec->dtrd_size;
6594 valoffs = offs + rec->dtrd_offset;
6595
6596 if (DTRACEACT_ISAGG(act->dta_kind)) {
6597 uint64_t v = 0xbad;
6598 dtrace_aggregation_t *agg;
6599
6600 agg = (dtrace_aggregation_t *)act;
6601
6602 if ((dp = act->dta_difo) != NULL)
6603 v = dtrace_dif_emulate(dp,
6604 &mstate, vstate, state);
6605
6606 if (*flags & CPU_DTRACE_ERROR)
6607 continue;
6608
6609 /*
6610 * Note that we always pass the expression
6611 * value from the previous iteration of the
6612 * action loop. This value will only be used
6613 * if there is an expression argument to the
6614 * aggregating action, denoted by the
6615 * dtag_hasarg field.
6616 */
6617 dtrace_aggregate(agg, buf,
6618 offs, aggbuf, v, val);
6619 continue;
6620 }
6621
6622 switch (act->dta_kind) {
6623 case DTRACEACT_STOP:
6624 if (dtrace_priv_proc_destructive(state))
6625 dtrace_action_stop();
6626 continue;
6627
6628 case DTRACEACT_BREAKPOINT:
6629 if (dtrace_priv_kernel_destructive(state))
6630 dtrace_action_breakpoint(ecb);
6631 continue;
6632
6633 case DTRACEACT_PANIC:
6634 if (dtrace_priv_kernel_destructive(state))
6635 dtrace_action_panic(ecb);
6636 continue;
6637
6638 case DTRACEACT_STACK:
6639 if (!dtrace_priv_kernel(state))
6640 continue;
6641
6642 dtrace_getpcstack((pc_t *)(tomax + valoffs),
6643 size / sizeof (pc_t), probe->dtpr_aframes,
6644 DTRACE_ANCHORED(probe) ? NULL :
6645 (uint32_t *)(uintptr_t)arg0);
6646 continue;
6647
6648 case DTRACEACT_JSTACK:
6649 case DTRACEACT_USTACK:
6650 if (!dtrace_priv_proc(state))
6651 continue;
6652
6653 /*
6654 * See comment in DIF_VAR_PID.
6655 */
6656 if (DTRACE_ANCHORED(mstate.dtms_probe) &&
6657 CPU_ON_INTR(CPU)) {
6658 int depth = DTRACE_USTACK_NFRAMES(
6659 rec->dtrd_arg) + 1;
6660
6661 dtrace_bzero((void *)(tomax + valoffs),
6662 DTRACE_USTACK_STRSIZE(rec->dtrd_arg)
6663 + depth * sizeof (uint64_t));
6664
6665 continue;
6666 }
6667
6668 if (DTRACE_USTACK_STRSIZE(rec->dtrd_arg) != 0 &&
6669 curproc->p_dtrace_helpers != NULL) {
6670 /*
6671 * This is the slow path -- we have
6672 * allocated string space, and we're
6673 * getting the stack of a process that
6674 * has helpers. Call into a separate
6675 * routine to perform this processing.
6676 */
6677 dtrace_action_ustack(&mstate, state,
6678 (uint64_t *)(tomax + valoffs),
6679 rec->dtrd_arg);
6680 continue;
6681 }
6682
6683 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6684 dtrace_getupcstack((uint64_t *)
6685 (tomax + valoffs),
6686 DTRACE_USTACK_NFRAMES(rec->dtrd_arg) + 1);
6687 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6688 continue;
6689
6690 default:
6691 break;
6692 }
6693
6694 dp = act->dta_difo;
6695 ASSERT(dp != NULL);
6696
6697 val = dtrace_dif_emulate(dp, &mstate, vstate, state);
6698
6699 if (*flags & CPU_DTRACE_ERROR)
6700 continue;
6701
6702 switch (act->dta_kind) {
6703 case DTRACEACT_SPECULATE: {
6704 dtrace_rechdr_t *dtrh = NULL;
6705
6706 ASSERT(buf == &state->dts_buffer[cpuid]);
6707 buf = dtrace_speculation_buffer(state,
6708 cpuid, val);
6709
6710 if (buf == NULL) {
6711 *flags |= CPU_DTRACE_DROP;
6712 continue;
6713 }
6714
6715 offs = dtrace_buffer_reserve(buf,
6716 ecb->dte_needed, ecb->dte_alignment,
6717 state, NULL);
6718
6719 if (offs < 0) {
6720 *flags |= CPU_DTRACE_DROP;
6721 continue;
6722 }
6723
6724 tomax = buf->dtb_tomax;
6725 ASSERT(tomax != NULL);
6726
6727 if (ecb->dte_size == 0)
6728 continue;
6729
6730 ASSERT(ecb->dte_size >= sizeof(dtrace_rechdr_t));
6731 dtrh = ((void *)(tomax + offs));
6732 dtrh->dtrh_epid = ecb->dte_epid;
6733
6734 /*
6735 * When the speculation is committed, all of
6736 * the records in the speculative buffer will
6737 * have their timestamps set to the commit
6738 * time. Until then, it is set to a sentinel
				 * value, for debuggability.
6740 */
6741 DTRACE_RECORD_STORE_TIMESTAMP(dtrh, UINT64_MAX);
6742
6743 continue;
6744 }
6745
6746 case DTRACEACT_CHILL:
6747 if (dtrace_priv_kernel_destructive(state))
6748 dtrace_action_chill(&mstate, val);
6749 continue;
6750
6751 case DTRACEACT_RAISE:
6752 if (dtrace_priv_proc_destructive(state))
6753 dtrace_action_raise(val);
6754 continue;
6755
6756 case DTRACEACT_PIDRESUME: /* __APPLE__ */
6757 if (dtrace_priv_proc_destructive(state))
6758 dtrace_action_pidresume(val);
6759 continue;
6760
6761 case DTRACEACT_COMMIT:
6762 ASSERT(!committed);
6763
6764 /*
6765 * We need to commit our buffer state.
6766 */
6767 if (ecb->dte_size)
6768 buf->dtb_offset = offs + ecb->dte_size;
6769 buf = &state->dts_buffer[cpuid];
6770 dtrace_speculation_commit(state, cpuid, val);
6771 committed = 1;
6772 continue;
6773
6774 case DTRACEACT_DISCARD:
6775 dtrace_speculation_discard(state, cpuid, val);
6776 continue;
6777
6778 case DTRACEACT_DIFEXPR:
6779 case DTRACEACT_LIBACT:
6780 case DTRACEACT_PRINTF:
6781 case DTRACEACT_PRINTA:
6782 case DTRACEACT_SYSTEM:
6783 case DTRACEACT_FREOPEN:
6784 case DTRACEACT_APPLEBINARY: /* __APPLE__ */
6785 case DTRACEACT_TRACEMEM:
6786 break;
6787
6788 case DTRACEACT_TRACEMEM_DYNSIZE:
6789 tracememsize = val;
6790 break;
6791
6792 case DTRACEACT_SYM:
6793 case DTRACEACT_MOD:
6794 if (!dtrace_priv_kernel(state))
6795 continue;
6796 break;
6797
6798 case DTRACEACT_USYM:
6799 case DTRACEACT_UMOD:
6800 case DTRACEACT_UADDR: {
6801 if (!dtrace_priv_proc(state))
6802 continue;
6803
6804 DTRACE_STORE(uint64_t, tomax,
6805 valoffs, (uint64_t)dtrace_proc_selfpid());
6806 DTRACE_STORE(uint64_t, tomax,
6807 valoffs + sizeof (uint64_t), val);
6808
6809 continue;
6810 }
6811
6812 case DTRACEACT_EXIT: {
6813 /*
6814 * For the exit action, we are going to attempt
6815 * to atomically set our activity to be
6816 * draining. If this fails (either because
6817 * another CPU has beat us to the exit action,
6818 * or because our current activity is something
6819 * other than ACTIVE or WARMUP), we will
6820 * continue. This assures that the exit action
6821 * can be successfully recorded at most once
6822 * when we're in the ACTIVE state. If we're
6823 * encountering the exit() action while in
6824 * COOLDOWN, however, we want to honor the new
6825 * status code. (We know that we're the only
6826 * thread in COOLDOWN, so there is no race.)
6827 */
6828 void *activity = &state->dts_activity;
6829 dtrace_activity_t current = state->dts_activity;
6830
6831 if (current == DTRACE_ACTIVITY_COOLDOWN)
6832 break;
6833
6834 if (current != DTRACE_ACTIVITY_WARMUP)
6835 current = DTRACE_ACTIVITY_ACTIVE;
6836
6837 if (dtrace_cas32(activity, current,
6838 DTRACE_ACTIVITY_DRAINING) != current) {
6839 *flags |= CPU_DTRACE_DROP;
6840 continue;
6841 }
6842
6843 break;
6844 }
6845
6846 default:
6847 ASSERT(0);
6848 }
6849
6850 if (dp->dtdo_rtype.dtdt_flags & (DIF_TF_BYREF | DIF_TF_BYUREF)) {
6851 uintptr_t end = valoffs + size;
6852
				if (tracememsize != 0 &&
				    valoffs + tracememsize < end) {
6856 end = valoffs + tracememsize;
6857 tracememsize = 0;
6858 }
6859
				if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF &&
				    !dtrace_vcanload((void *)(uintptr_t)val,
				    &dp->dtdo_rtype, NULL, &mstate, vstate)) {
6864 continue;
6865 }
6866
6867 dtrace_store_by_ref(dp, tomax, size, &valoffs,
6868 &val, end, act->dta_intuple,
6869 dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF ?
6870 DIF_TF_BYREF: DIF_TF_BYUREF);
6871
6872 continue;
6873 }
6874
6875 switch (size) {
6876 case 0:
6877 break;
6878
6879 case sizeof (uint8_t):
6880 DTRACE_STORE(uint8_t, tomax, valoffs, val);
6881 break;
6882 case sizeof (uint16_t):
6883 DTRACE_STORE(uint16_t, tomax, valoffs, val);
6884 break;
6885 case sizeof (uint32_t):
6886 DTRACE_STORE(uint32_t, tomax, valoffs, val);
6887 break;
6888 case sizeof (uint64_t):
6889 DTRACE_STORE(uint64_t, tomax, valoffs, val);
6890 break;
6891 default:
6892 /*
6893 * Any other size should have been returned by
6894 * reference, not by value.
6895 */
6896 ASSERT(0);
6897 break;
6898 }
6899 }
6900
6901 if (*flags & CPU_DTRACE_DROP)
6902 continue;
6903
6904 if (*flags & CPU_DTRACE_FAULT) {
6905 int ndx;
6906 dtrace_action_t *err;
6907
6908 buf->dtb_errors++;
6909
6910 if (probe->dtpr_id == dtrace_probeid_error) {
6911 /*
6912 * There's nothing we can do -- we had an
6913 * error on the error probe. We bump an
6914 * error counter to at least indicate that
6915 * this condition happened.
6916 */
6917 dtrace_error(&state->dts_dblerrors);
6918 continue;
6919 }
6920
6921 if (vtime) {
6922 /*
6923 * Before recursing on dtrace_probe(), we
6924 * need to explicitly clear out our start
6925 * time to prevent it from being accumulated
6926 * into t_dtrace_vtime.
6927 */
6928
6929 /*
6930 * Darwin sets the sign bit on t_dtrace_tracing
6931 * to suspend accumulation to it.
6932 */
6933 dtrace_set_thread_tracing(current_thread(),
6934 (1ULL<<63) | dtrace_get_thread_tracing(current_thread()));
6935
6936 }
6937
6938 /*
6939 * Iterate over the actions to figure out which action
6940 * we were processing when we experienced the error.
6941 * Note that act points _past_ the faulting action; if
6942 * act is ecb->dte_action, the fault was in the
6943 * predicate, if it's ecb->dte_action->dta_next it's
6944 * in action #1, and so on.
6945 */
6946 for (err = ecb->dte_action, ndx = 0;
6947 err != act; err = err->dta_next, ndx++)
6948 continue;
6949
6950 dtrace_probe_error(state, ecb->dte_epid, ndx,
6951 (mstate.dtms_present & DTRACE_MSTATE_FLTOFFS) ?
6952 mstate.dtms_fltoffs : -1, DTRACE_FLAGS2FLT(*flags),
6953 cpu_core[cpuid].cpuc_dtrace_illval);
6954
6955 continue;
6956 }
6957
6958 if (!committed)
6959 buf->dtb_offset = offs + ecb->dte_size;
6960 }
6961
	/*
	 * FIXME: On Darwin the time spent leaving DTrace from this point to
	 * the rti is attributed to the current thread. Instead it should
	 * accrue to DTrace.
	 */
6964 if (vtime) {
6965 thread_t thread = current_thread();
6966 int64_t t = dtrace_get_thread_tracing(thread);
6967
6968 if (t >= 0) {
6969 /* Usual case, accumulate time spent here into t_dtrace_tracing */
6970 dtrace_set_thread_tracing(thread, t + (dtrace_gethrtime() - now));
6971 } else {
6972 /* Return from error recursion. No accumulation, just clear the sign bit on t_dtrace_tracing. */
6973 dtrace_set_thread_tracing(thread, (~(1ULL<<63)) & t);
6974 }
6975 }
6976
6977 dtrace_interrupt_enable(cookie);
6978}
6979
6980/*
6981 * APPLE NOTE: Don't allow a thread to re-enter dtrace_probe().
6982 * This could occur if a probe is encountered on some function in the
6983 * transitive closure of the call to dtrace_probe().
6984 * Solaris has some strong guarantees that this won't happen.
6985 * The Darwin implementation is not so mature as to make those guarantees.
6986 * Hence, the introduction of __dtrace_probe() on xnu.
6987 */
6988
6989void
6990dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1,
6991 uint64_t arg2, uint64_t arg3, uint64_t arg4)
6992{
6993 thread_t thread = current_thread();
6994 disable_preemption();
6995 if (id == dtrace_probeid_error) {
6996 __dtrace_probe(id, arg0, arg1, arg2, arg3, arg4);
6997 dtrace_getipl(); /* Defeat tail-call optimization of __dtrace_probe() */
6998 } else if (!dtrace_get_thread_reentering(thread)) {
6999 dtrace_set_thread_reentering(thread, TRUE);
7000 __dtrace_probe(id, arg0, arg1, arg2, arg3, arg4);
7001 dtrace_set_thread_reentering(thread, FALSE);
7002 }
7003#if DEBUG
7004 else __dtrace_probe(dtrace_probeid_error, 0, id, 1, -1, DTRACEFLT_UNKNOWN);
7005#endif
7006 enable_preemption();
7007}
7008
7009/*
7010 * DTrace Probe Hashing Functions
7011 *
7012 * The functions in this section (and indeed, the functions in remaining
7013 * sections) are not _called_ from probe context. (Any exceptions to this are
7014 * marked with a "Note:".) Rather, they are called from elsewhere in the
 * DTrace framework to look up probes in, add probes to, and remove probes from
7016 * the DTrace probe hashes. (Each probe is hashed by each element of the
7017 * probe tuple -- allowing for fast lookups, regardless of what was
7018 * specified.)
7019 */
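/*
 * A variant of the classic PJW/ELF string hash: each byte is folded in and
 * the high nibble is XORed back into the low bits so that long strings
 * distribute well.
 */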
7020static uint_t
7021dtrace_hash_str(const char *p)
7022{
7023 unsigned int g;
7024 uint_t hval = 0;
7025
7026 while (*p) {
7027 hval = (hval << 4) + *p++;
7028 if ((g = (hval & 0xf0000000)) != 0)
7029 hval ^= g >> 24;
7030 hval &= ~g;
7031 }
7032 return (hval);
7033}
7034
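/*
 * The dtrace_strkey_* functions below are dth_getstr callbacks used by the
 * probe hashes to extract an element's string key: from its provider name,
 * from a string embedded at a fixed offset within the element, or by
 * dereferencing a string pointer stored at a fixed offset.
 */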
7035static const char*
7036dtrace_strkey_probe_provider(void *elm, uintptr_t offs)
7037{
7038#pragma unused(offs)
7039 dtrace_probe_t *probe = (dtrace_probe_t*)elm;
7040 return probe->dtpr_provider->dtpv_name;
7041}
7042
7043static const char*
7044dtrace_strkey_offset(void *elm, uintptr_t offs)
7045{
7046 return ((char *)((uintptr_t)(elm) + offs));
7047}
7048
7049static const char*
7050dtrace_strkey_deref_offset(void *elm, uintptr_t offs)
7051{
7052 return *((char **)((uintptr_t)(elm) + offs));
7053}
7054
7055static dtrace_hash_t *
7056dtrace_hash_create(dtrace_strkey_f func, uintptr_t arg, uintptr_t nextoffs, uintptr_t prevoffs)
7057{
7058 dtrace_hash_t *hash = kmem_zalloc(sizeof (dtrace_hash_t), KM_SLEEP);
7059
7060 hash->dth_getstr = func;
7061 hash->dth_stroffs = arg;
7062 hash->dth_nextoffs = nextoffs;
7063 hash->dth_prevoffs = prevoffs;
7064
7065 hash->dth_size = 1;
7066 hash->dth_mask = hash->dth_size - 1;
7067
7068 hash->dth_tab = kmem_zalloc(hash->dth_size *
7069 sizeof (dtrace_hashbucket_t *), KM_SLEEP);
7070
7071 return (hash);
7072}
7073
7074/*
7075 * APPLE NOTE: dtrace_hash_destroy is not used.
7076 * It is called by dtrace_detach which is not
7077 * currently implemented. Revisit someday.
7078 */
7079#if !defined(__APPLE__)
7080static void
7081dtrace_hash_destroy(dtrace_hash_t *hash)
7082{
7083#if DEBUG
7084 int i;
7085
7086 for (i = 0; i < hash->dth_size; i++)
7087 ASSERT(hash->dth_tab[i] == NULL);
7088#endif
7089
7090 kmem_free(hash->dth_tab,
7091 hash->dth_size * sizeof (dtrace_hashbucket_t *));
7092 kmem_free(hash, sizeof (dtrace_hash_t));
7093}
7094#endif /* __APPLE__ */
7095
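/*
 * Double the hash table size and redistribute the existing buckets. Only
 * the buckets move; the chain of identically-keyed elements hanging off
 * each bucket is carried along untouched.
 */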
7096static void
7097dtrace_hash_resize(dtrace_hash_t *hash)
7098{
7099 int size = hash->dth_size, i, ndx;
7100 int new_size = hash->dth_size << 1;
7101 int new_mask = new_size - 1;
7102 dtrace_hashbucket_t **new_tab, *bucket, *next;
7103
7104 ASSERT((new_size & new_mask) == 0);
7105
7106 new_tab = kmem_zalloc(new_size * sizeof (void *), KM_SLEEP);
7107
7108 for (i = 0; i < size; i++) {
7109 for (bucket = hash->dth_tab[i]; bucket != NULL; bucket = next) {
7110 void *elm = bucket->dthb_chain;
7111
7112 ASSERT(elm != NULL);
7113 ndx = DTRACE_HASHSTR(hash, elm) & new_mask;
7114
7115 next = bucket->dthb_next;
7116 bucket->dthb_next = new_tab[ndx];
7117 new_tab[ndx] = bucket;
7118 }
7119 }
7120
7121 kmem_free(hash->dth_tab, hash->dth_size * sizeof (void *));
7122 hash->dth_tab = new_tab;
7123 hash->dth_size = new_size;
7124 hash->dth_mask = new_mask;
7125}
7126
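/*
 * Insert an element, chaining it onto an existing bucket if one already
 * holds elements with an equal key. The table is resized once the number
 * of distinct keys (buckets) exceeds twice the table size.
 */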
7127static void
7128dtrace_hash_add(dtrace_hash_t *hash, void *new)
7129{
7130 int hashval = DTRACE_HASHSTR(hash, new);
7131 int ndx = hashval & hash->dth_mask;
7132 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7133 void **nextp, **prevp;
7134
7135 for (; bucket != NULL; bucket = bucket->dthb_next) {
7136 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, new))
7137 goto add;
7138 }
7139
7140 if ((hash->dth_nbuckets >> 1) > hash->dth_size) {
7141 dtrace_hash_resize(hash);
7142 dtrace_hash_add(hash, new);
7143 return;
7144 }
7145
7146 bucket = kmem_zalloc(sizeof (dtrace_hashbucket_t), KM_SLEEP);
7147 bucket->dthb_next = hash->dth_tab[ndx];
7148 hash->dth_tab[ndx] = bucket;
7149 hash->dth_nbuckets++;
7150
7151add:
7152 nextp = DTRACE_HASHNEXT(hash, new);
7153 ASSERT(*nextp == NULL && *(DTRACE_HASHPREV(hash, new)) == NULL);
7154 *nextp = bucket->dthb_chain;
7155
7156 if (bucket->dthb_chain != NULL) {
7157 prevp = DTRACE_HASHPREV(hash, bucket->dthb_chain);
7158 ASSERT(*prevp == NULL);
7159 *prevp = new;
7160 }
7161
7162 bucket->dthb_chain = new;
7163 bucket->dthb_len++;
7164}
7165
7166static void *
7167dtrace_hash_lookup_string(dtrace_hash_t *hash, const char *str)
7168{
7169 int hashval = dtrace_hash_str(str);
7170 int ndx = hashval & hash->dth_mask;
7171 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7172
7173 for (; bucket != NULL; bucket = bucket->dthb_next) {
7174 if (strcmp(str, DTRACE_GETSTR(hash, bucket->dthb_chain)) == 0)
7175 return (bucket->dthb_chain);
7176 }
7177
7178 return (NULL);
7179}
7180
7181static dtrace_probe_t *
7182dtrace_hash_lookup(dtrace_hash_t *hash, void *template)
7183{
7184 return dtrace_hash_lookup_string(hash, DTRACE_GETSTR(hash, template));
7185}
7186
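/*
 * Return the length of the chain sharing the template's key -- that is, the
 * number of elements that would have to be walked to search under this key.
 * dtrace_match() uses this to select the most selective hash table.
 */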
7187static int
7188dtrace_hash_collisions(dtrace_hash_t *hash, void *template)
7189{
7190 int hashval = DTRACE_HASHSTR(hash, template);
7191 int ndx = hashval & hash->dth_mask;
7192 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7193
7194 for (; bucket != NULL; bucket = bucket->dthb_next) {
7195 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
7196 return (bucket->dthb_len);
7197 }
7198
7199 return (0);
7200}
7201
7202static void
7203dtrace_hash_remove(dtrace_hash_t *hash, void *elm)
7204{
7205 int ndx = DTRACE_HASHSTR(hash, elm) & hash->dth_mask;
7206 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7207
7208 void **prevp = DTRACE_HASHPREV(hash, elm);
7209 void **nextp = DTRACE_HASHNEXT(hash, elm);
7210
7211 /*
7212 * Find the bucket that we're removing this elm from.
7213 */
7214 for (; bucket != NULL; bucket = bucket->dthb_next) {
7215 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, elm))
7216 break;
7217 }
7218
7219 ASSERT(bucket != NULL);
7220
7221 if (*prevp == NULL) {
7222 if (*nextp == NULL) {
7223 /*
7224 * The removed element was the only element on this
7225 * bucket; we need to remove the bucket.
7226 */
7227 dtrace_hashbucket_t *b = hash->dth_tab[ndx];
7228
7229 ASSERT(bucket->dthb_chain == elm);
7230 ASSERT(b != NULL);
7231
7232 if (b == bucket) {
7233 hash->dth_tab[ndx] = bucket->dthb_next;
7234 } else {
7235 while (b->dthb_next != bucket)
7236 b = b->dthb_next;
7237 b->dthb_next = bucket->dthb_next;
7238 }
7239
7240 ASSERT(hash->dth_nbuckets > 0);
7241 hash->dth_nbuckets--;
7242 kmem_free(bucket, sizeof (dtrace_hashbucket_t));
7243 return;
7244 }
7245
7246 bucket->dthb_chain = *nextp;
7247 } else {
7248 *(DTRACE_HASHNEXT(hash, *prevp)) = *nextp;
7249 }
7250
7251 if (*nextp != NULL)
7252 *(DTRACE_HASHPREV(hash, *nextp)) = *prevp;
7253}
7254
7255/*
7256 * DTrace Utility Functions
7257 *
7258 * These are random utility functions that are _not_ called from probe context.
7259 */
7260static int
7261dtrace_badattr(const dtrace_attribute_t *a)
7262{
7263 return (a->dtat_name > DTRACE_STABILITY_MAX ||
7264 a->dtat_data > DTRACE_STABILITY_MAX ||
7265 a->dtat_class > DTRACE_CLASS_MAX);
7266}
7267
7268/*
7269 * Returns a dtrace-managed copy of a string, and will
7270 * deduplicate copies of the same string.
 * If the specified string is NULL, returns an empty string.
7272 */
7273static char *
7274dtrace_strref(const char *str)
7275{
7276 dtrace_string_t *s = NULL;
7277 size_t bufsize = (str != NULL ? strlen(str) : 0) + 1;
7278
7279 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
7280
7281 if (str == NULL)
7282 str = "";
7283
7284 for (s = dtrace_hash_lookup_string(dtrace_strings, str); s != NULL;
7285 s = *(DTRACE_HASHNEXT(dtrace_strings, s))) {
7286 if (strncmp(str, s->dtst_str, bufsize) != 0) {
7287 continue;
7288 }
7289 ASSERT(s->dtst_refcount != UINT32_MAX);
7290 s->dtst_refcount++;
7291 return s->dtst_str;
7292 }
7293
7294 s = kmem_zalloc(sizeof(dtrace_string_t) + bufsize, KM_SLEEP);
7295 s->dtst_refcount = 1;
7296 (void) strlcpy(s->dtst_str, str, bufsize);
7297
7298 dtrace_hash_add(dtrace_strings, s);
7299
7300 return s->dtst_str;
7301}
7302
7303static void
7304dtrace_strunref(const char *str)
7305{
7306 ASSERT(str != NULL);
7307 dtrace_string_t *s = NULL;
7308 size_t bufsize = strlen(str) + 1;
7309
7310 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
7311
7312 for (s = dtrace_hash_lookup_string(dtrace_strings, str); s != NULL;
7313 s = *(DTRACE_HASHNEXT(dtrace_strings, s))) {
7314 if (strncmp(str, s->dtst_str, bufsize) != 0) {
7315 continue;
7316 }
7317 ASSERT(s->dtst_refcount != 0);
7318 s->dtst_refcount--;
7319 if (s->dtst_refcount == 0) {
7320 dtrace_hash_remove(dtrace_strings, s);
7321 kmem_free(s, sizeof(dtrace_string_t) + bufsize);
7322 }
7323 return;
7324 }
7325 panic("attempt to unref non-existent string %s", str);
7326}
7327
7328#define DTRACE_ISALPHA(c) \
7329 (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))
7330
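/*
 * A name is bad unless it begins with an alphabetic character, '-', '_' or
 * '.', and consists solely of alphanumerics plus '-', '_', '.' and '`'.
 * (NULL and empty names are not considered bad.)
 */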
7331static int
7332dtrace_badname(const char *s)
7333{
7334 char c;
7335
7336 if (s == NULL || (c = *s++) == '\0')
7337 return (0);
7338
7339 if (!DTRACE_ISALPHA(c) && c != '-' && c != '_' && c != '.')
7340 return (1);
7341
7342 while ((c = *s++) != '\0') {
7343 if (!DTRACE_ISALPHA(c) && (c < '0' || c > '9') &&
7344 c != '-' && c != '_' && c != '.' && c != '`')
7345 return (1);
7346 }
7347
7348 return (0);
7349}
7350
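/*
 * Derive the privilege bits, uid and zoneid used for probe matching from the
 * given credential. An all-privileged (or NULL) credential is granted
 * DTRACE_PRIV_ALL -- unless restrictions are in force, in which case it is
 * limited to user/proc/owner visibility.
 */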
7351static void
7352dtrace_cred2priv(cred_t *cr, uint32_t *privp, uid_t *uidp, zoneid_t *zoneidp)
7353{
7354 uint32_t priv;
7355
7356 if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
7357 if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed()) {
7358 priv = DTRACE_PRIV_USER | DTRACE_PRIV_PROC | DTRACE_PRIV_OWNER;
7359 }
7360 else {
7361 priv = DTRACE_PRIV_ALL;
7362 }
7363 *uidp = 0;
7364 *zoneidp = 0;
7365 } else {
7366 *uidp = crgetuid(cr);
7367 *zoneidp = crgetzoneid(cr);
7368
7369 priv = 0;
7370 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE))
7371 priv |= DTRACE_PRIV_KERNEL | DTRACE_PRIV_USER;
7372 else if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE))
7373 priv |= DTRACE_PRIV_USER;
7374 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE))
7375 priv |= DTRACE_PRIV_PROC;
7376 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
7377 priv |= DTRACE_PRIV_OWNER;
7378 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
7379 priv |= DTRACE_PRIV_ZONEOWNER;
7380 }
7381
7382 *privp = priv;
7383}
7384
7385#ifdef DTRACE_ERRDEBUG
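/*
 * Record a DIF error message in a small open-addressed hash table (linear
 * probing), bumping a per-message count; panics if the table fills up.
 */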
7386static void
7387dtrace_errdebug(const char *str)
7388{
7389 int hval = dtrace_hash_str(str) % DTRACE_ERRHASHSZ;
7390 int occupied = 0;
7391
7392 lck_mtx_lock(&dtrace_errlock);
7393 dtrace_errlast = str;
7394 dtrace_errthread = (kthread_t *)current_thread();
7395
7396 while (occupied++ < DTRACE_ERRHASHSZ) {
7397 if (dtrace_errhash[hval].dter_msg == str) {
7398 dtrace_errhash[hval].dter_count++;
7399 goto out;
7400 }
7401
7402 if (dtrace_errhash[hval].dter_msg != NULL) {
7403 hval = (hval + 1) % DTRACE_ERRHASHSZ;
7404 continue;
7405 }
7406
7407 dtrace_errhash[hval].dter_msg = str;
7408 dtrace_errhash[hval].dter_count = 1;
7409 goto out;
7410 }
7411
7412 panic("dtrace: undersized error hash");
7413out:
7414 lck_mtx_unlock(&dtrace_errlock);
7415}
7416#endif
7417
7418/*
7419 * DTrace Matching Functions
7420 *
7421 * These functions are used to match groups of probes, given some elements of
7422 * a probe tuple, or some globbed expressions for elements of a probe tuple.
7423 */
7424static int
7425dtrace_match_priv(const dtrace_probe_t *prp, uint32_t priv, uid_t uid,
7426 zoneid_t zoneid)
7427{
7428 if (priv != DTRACE_PRIV_ALL) {
7429 uint32_t ppriv = prp->dtpr_provider->dtpv_priv.dtpp_flags;
7430 uint32_t match = priv & ppriv;
7431
7432 /*
7433 * No PRIV_DTRACE_* privileges...
7434 */
7435 if ((priv & (DTRACE_PRIV_PROC | DTRACE_PRIV_USER |
7436 DTRACE_PRIV_KERNEL)) == 0)
7437 return (0);
7438
7439 /*
7440 * No matching bits, but there were bits to match...
7441 */
7442 if (match == 0 && ppriv != 0)
7443 return (0);
7444
7445 /*
7446 * Need to have permissions to the process, but don't...
7447 */
7448 if (((ppriv & ~match) & DTRACE_PRIV_OWNER) != 0 &&
7449 uid != prp->dtpr_provider->dtpv_priv.dtpp_uid) {
7450 return (0);
7451 }
7452
7453 /*
7454 * Need to be in the same zone unless we possess the
7455 * privilege to examine all zones.
7456 */
7457 if (((ppriv & ~match) & DTRACE_PRIV_ZONEOWNER) != 0 &&
7458 zoneid != prp->dtpr_provider->dtpv_priv.dtpp_zoneid) {
7459 return (0);
7460 }
7461 }
7462
7463 return (1);
7464}
7465
7466/*
7467 * dtrace_match_probe compares a dtrace_probe_t to a pre-compiled key, which
7468 * consists of input pattern strings and an ops-vector to evaluate them.
7469 * This function returns >0 for match, 0 for no match, and <0 for error.
7470 */
7471static int
7472dtrace_match_probe(const dtrace_probe_t *prp, const dtrace_probekey_t *pkp,
7473 uint32_t priv, uid_t uid, zoneid_t zoneid)
7474{
7475 dtrace_provider_t *pvp = prp->dtpr_provider;
7476 int rv;
7477
7478 if (pvp->dtpv_defunct)
7479 return (0);
7480
7481 if ((rv = pkp->dtpk_pmatch(pvp->dtpv_name, pkp->dtpk_prov, 0)) <= 0)
7482 return (rv);
7483
7484 if ((rv = pkp->dtpk_mmatch(prp->dtpr_mod, pkp->dtpk_mod, 0)) <= 0)
7485 return (rv);
7486
7487 if ((rv = pkp->dtpk_fmatch(prp->dtpr_func, pkp->dtpk_func, 0)) <= 0)
7488 return (rv);
7489
7490 if ((rv = pkp->dtpk_nmatch(prp->dtpr_name, pkp->dtpk_name, 0)) <= 0)
7491 return (rv);
7492
7493 if (dtrace_match_priv(prp, priv, uid, zoneid) == 0)
7494 return (0);
7495
7496 return (rv);
7497}
7498
7499/*
7500 * dtrace_match_glob() is a safe kernel implementation of the gmatch(3GEN)
7501 * interface for matching a glob pattern 'p' to an input string 's'. Unlike
7502 * libc's version, the kernel version only applies to 8-bit ASCII strings.
7503 * In addition, all of the recursion cases except for '*' matching have been
7504 * unwound. For '*', we still implement recursive evaluation, but a depth
7505 * counter is maintained and matching is aborted if we recurse too deep.
7506 * The function returns 0 if no match, >0 if match, and <0 if recursion error.
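 *
 * For example: dtrace_match_glob("read", "re?d", 0) and
 * dtrace_match_glob("read", "r*", 0) both return 1, while
 * dtrace_match_glob("read", "w*", 0) returns 0.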
7507 */
7508static int
7509dtrace_match_glob(const char *s, const char *p, int depth)
7510{
7511 const char *olds;
7512 char s1, c;
7513 int gs;
7514
7515 if (depth > DTRACE_PROBEKEY_MAXDEPTH)
7516 return (-1);
7517
7518 if (s == NULL)
7519 s = ""; /* treat NULL as empty string */
7520
7521top:
7522 olds = s;
7523 s1 = *s++;
7524
7525 if (p == NULL)
7526 return (0);
7527
7528 if ((c = *p++) == '\0')
7529 return (s1 == '\0');
7530
7531 switch (c) {
7532 case '[': {
7533 int ok = 0, notflag = 0;
7534 char lc = '\0';
7535
7536 if (s1 == '\0')
7537 return (0);
7538
7539 if (*p == '!') {
7540 notflag = 1;
7541 p++;
7542 }
7543
7544 if ((c = *p++) == '\0')
7545 return (0);
7546
7547 do {
7548 if (c == '-' && lc != '\0' && *p != ']') {
7549 if ((c = *p++) == '\0')
7550 return (0);
7551 if (c == '\\' && (c = *p++) == '\0')
7552 return (0);
7553
7554 if (notflag) {
7555 if (s1 < lc || s1 > c)
7556 ok++;
7557 else
7558 return (0);
7559 } else if (lc <= s1 && s1 <= c)
7560 ok++;
7561
7562 } else if (c == '\\' && (c = *p++) == '\0')
7563 return (0);
7564
7565 lc = c; /* save left-hand 'c' for next iteration */
7566
7567 if (notflag) {
7568 if (s1 != c)
7569 ok++;
7570 else
7571 return (0);
7572 } else if (s1 == c)
7573 ok++;
7574
7575 if ((c = *p++) == '\0')
7576 return (0);
7577
7578 } while (c != ']');
7579
7580 if (ok)
7581 goto top;
7582
7583 return (0);
7584 }
7585
7586 case '\\':
7587 if ((c = *p++) == '\0')
7588 return (0);
7589 /*FALLTHRU*/
7590
7591 default:
7592 if (c != s1)
7593 return (0);
7594 /*FALLTHRU*/
7595
7596 case '?':
7597 if (s1 != '\0')
7598 goto top;
7599 return (0);
7600
7601 case '*':
7602 while (*p == '*')
7603 p++; /* consecutive *'s are identical to a single one */
7604
7605 if (*p == '\0')
7606 return (1);
7607
7608 for (s = olds; *s != '\0'; s++) {
7609 if ((gs = dtrace_match_glob(s, p, depth + 1)) != 0)
7610 return (gs);
7611 }
7612
7613 return (0);
7614 }
7615}
7616
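/*
 * APPLE NOTE: probe and key strings are interned via dtrace_strref(), so an
 * exact string match can be tested with pointer equality alone.
 */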
7617/*ARGSUSED*/
7618static int
7619dtrace_match_string(const char *s, const char *p, int depth)
7620{
7621#pragma unused(depth) /* __APPLE__ */
7622 return (s != NULL && s == p);
7623}
7624
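/*
 * Match the pattern as a prefix of the module name, where the prefix must
 * end at a '.' boundary or at the end of the string -- e.g. the pattern
 * "com.apple.driver" matches the module "com.apple.driver.Foo".
 */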
7625/*ARGSUSED*/
7626static int
7627dtrace_match_module(const char *s, const char *p, int depth)
7628{
7629#pragma unused(depth) /* __APPLE__ */
7630 size_t len;
7631 if (s == NULL || p == NULL)
7632 return (0);
7633
7634 len = strlen(p);
7635
7636 if (strncmp(p, s, len) != 0)
7637 return (0);
7638
7639 if (s[len] == '.' || s[len] == '\0')
7640 return (1);
7641
7642 return (0);
7643}
7644
7645/*ARGSUSED*/
7646static int
7647dtrace_match_nul(const char *s, const char *p, int depth)
7648{
7649#pragma unused(s, p, depth) /* __APPLE__ */
7650 return (1); /* always match the empty pattern */
7651}
7652
7653/*ARGSUSED*/
7654static int
7655dtrace_match_nonzero(const char *s, const char *p, int depth)
7656{
7657#pragma unused(p, depth) /* __APPLE__ */
7658 return (s != NULL && s[0] != '\0');
7659}
7660
7661static int
7662dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid,
7663 zoneid_t zoneid, int (*matched)(dtrace_probe_t *, void *, void *), void *arg1, void *arg2)
7664{
7665 dtrace_probe_t *probe;
7666 dtrace_provider_t prov_template = {
7667 .dtpv_name = (char *)(uintptr_t)pkp->dtpk_prov
7668 };
7669
7670 dtrace_probe_t template = {
7671 .dtpr_provider = &prov_template,
7672 .dtpr_mod = (char *)(uintptr_t)pkp->dtpk_mod,
7673 .dtpr_func = (char *)(uintptr_t)pkp->dtpk_func,
7674 .dtpr_name = (char *)(uintptr_t)pkp->dtpk_name
7675 };
7676
7677 dtrace_hash_t *hash = NULL;
7678 int len, rc, best = INT_MAX, nmatched = 0;
7679 dtrace_id_t i;
7680
7681 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
7682
7683 /*
7684 * If the probe ID is specified in the key, just lookup by ID and
7685 * invoke the match callback once if a matching probe is found.
7686 */
7687 if (pkp->dtpk_id != DTRACE_IDNONE) {
7688 if ((probe = dtrace_probe_lookup_id(pkp->dtpk_id)) != NULL &&
7689 dtrace_match_probe(probe, pkp, priv, uid, zoneid) > 0) {
7690 if ((*matched)(probe, arg1, arg2) == DTRACE_MATCH_FAIL)
7691 return (DTRACE_MATCH_FAIL);
7692 nmatched++;
7693 }
7694 return (nmatched);
7695 }
7696
7697 /*
7698 * We want to find the most distinct of the provider name, module name,
7699 * function name, and name. So for each one that is not a glob
7700 * pattern or empty string, we perform a lookup in the corresponding
7701 * hash and use the hash table with the fewest collisions to do our
7702 * search.
7703 */
7704 if (pkp->dtpk_pmatch == &dtrace_match_string &&
7705 (len = dtrace_hash_collisions(dtrace_byprov, &template)) < best) {
7706 best = len;
7707 hash = dtrace_byprov;
7708 }
7709
7710 if (pkp->dtpk_mmatch == &dtrace_match_string &&
7711 (len = dtrace_hash_collisions(dtrace_bymod, &template)) < best) {
7712 best = len;
7713 hash = dtrace_bymod;
7714 }
7715
7716 if (pkp->dtpk_fmatch == &dtrace_match_string &&
7717 (len = dtrace_hash_collisions(dtrace_byfunc, &template)) < best) {
7718 best = len;
7719 hash = dtrace_byfunc;
7720 }
7721
7722 if (pkp->dtpk_nmatch == &dtrace_match_string &&
7723 (len = dtrace_hash_collisions(dtrace_byname, &template)) < best) {
7724 best = len;
7725 hash = dtrace_byname;
7726 }
7727
7728 /*
7729 * If we did not select a hash table, iterate over every probe and
7730 * invoke our callback for each one that matches our input probe key.
7731 */
7732 if (hash == NULL) {
7733 for (i = 0; i < (dtrace_id_t)dtrace_nprobes; i++) {
7734 if ((probe = dtrace_probes[i]) == NULL ||
7735 dtrace_match_probe(probe, pkp, priv, uid,
7736 zoneid) <= 0)
7737 continue;
7738
7739 nmatched++;
7740
7741 if ((rc = (*matched)(probe, arg1, arg2)) != DTRACE_MATCH_NEXT) {
7742 if (rc == DTRACE_MATCH_FAIL)
7743 return (DTRACE_MATCH_FAIL);
7744 break;
7745 }
7746 }
7747
7748 return (nmatched);
7749 }
7750
7751 /*
7752 * If we selected a hash table, iterate over each probe of the same key
7753 * name and invoke the callback for every probe that matches the other
7754 * attributes of our input probe key.
7755 */
7756 for (probe = dtrace_hash_lookup(hash, &template); probe != NULL;
7757 probe = *(DTRACE_HASHNEXT(hash, probe))) {
7758
7759 if (dtrace_match_probe(probe, pkp, priv, uid, zoneid) <= 0)
7760 continue;
7761
7762 nmatched++;
7763
7764 if ((rc = (*matched)(probe, arg1, arg2)) != DTRACE_MATCH_NEXT) {
7765 if (rc == DTRACE_MATCH_FAIL)
7766 return (DTRACE_MATCH_FAIL);
7767 break;
7768 }
7769 }
7770
7771 return (nmatched);
7772}
7773
7774/*
7775 * Return the function pointer dtrace_probecmp() should use to compare the
7776 * specified pattern with a string. For NULL or empty patterns, we select
7777 * dtrace_match_nul(). For glob pattern strings, we use dtrace_match_glob().
7778 * For non-empty non-glob strings, we use dtrace_match_string().
7779 */
7780static dtrace_probekey_f *
7781dtrace_probekey_func(const char *p)
7782{
7783 char c;
7784
7785 if (p == NULL || *p == '\0')
7786 return (&dtrace_match_nul);
7787
7788 while ((c = *p++) != '\0') {
7789 if (c == '[' || c == '?' || c == '*' || c == '\\')
7790 return (&dtrace_match_glob);
7791 }
7792
7793 return (&dtrace_match_string);
7794}
7795
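/*
 * As dtrace_probekey_func(), but for the module element of a probe key: if
 * the pattern is a plain string under which no probe is hashed, fall back to
 * prefix matching via dtrace_match_module().
 */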
7796static dtrace_probekey_f *
7797dtrace_probekey_module_func(const char *p)
7798{
7799 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
7800
7801 dtrace_probekey_f *f = dtrace_probekey_func(p);
7802 if (f == &dtrace_match_string) {
7803 dtrace_probe_t template = {
7804 .dtpr_mod = (char *)(uintptr_t)p,
7805 };
7806 if (dtrace_hash_lookup(dtrace_bymod, &template) == NULL) {
7807 return (&dtrace_match_module);
7808 }
7809 return (&dtrace_match_string);
7810 }
7811 return f;
7812}
7813
7814/*
7815 * Build a probe comparison key for use with dtrace_match_probe() from the
7816 * given probe description. By convention, a null key only matches anchored
7817 * probes: if each field is the empty string, reset dtpk_fmatch to
7818 * dtrace_match_nonzero().
7819 */
7820static void
7821dtrace_probekey(const dtrace_probedesc_t *pdp, dtrace_probekey_t *pkp)
7822{
7823
7824 pkp->dtpk_prov = dtrace_strref(pdp->dtpd_provider);
7825 pkp->dtpk_pmatch = dtrace_probekey_func(pdp->dtpd_provider);
7826
7827 pkp->dtpk_mod = dtrace_strref(pdp->dtpd_mod);
7828 pkp->dtpk_mmatch = dtrace_probekey_module_func(pdp->dtpd_mod);
7829
7830 pkp->dtpk_func = dtrace_strref(pdp->dtpd_func);
7831 pkp->dtpk_fmatch = dtrace_probekey_func(pdp->dtpd_func);
7832
7833 pkp->dtpk_name = dtrace_strref(pdp->dtpd_name);
7834 pkp->dtpk_nmatch = dtrace_probekey_func(pdp->dtpd_name);
7835
7836 pkp->dtpk_id = pdp->dtpd_id;
7837
7838 if (pkp->dtpk_id == DTRACE_IDNONE &&
7839 pkp->dtpk_pmatch == &dtrace_match_nul &&
7840 pkp->dtpk_mmatch == &dtrace_match_nul &&
7841 pkp->dtpk_fmatch == &dtrace_match_nul &&
7842 pkp->dtpk_nmatch == &dtrace_match_nul)
7843 pkp->dtpk_fmatch = &dtrace_match_nonzero;
7844}
7845
7846static void
7847dtrace_probekey_release(dtrace_probekey_t *pkp)
7848{
7849 dtrace_strunref(pkp->dtpk_prov);
7850 dtrace_strunref(pkp->dtpk_mod);
7851 dtrace_strunref(pkp->dtpk_func);
7852 dtrace_strunref(pkp->dtpk_name);
7853}
7854
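/*
 * Evaluate whether the provider name in the given probe description matches
 * the provider name passed in 'data', honoring any glob pattern in the
 * description. A NULL description matches unconditionally.
 */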
7855static int
7856dtrace_cond_provider_match(dtrace_probedesc_t *desc, void *data)
7857{
7858 if (desc == NULL)
7859 return 1;
7860
7861 dtrace_probekey_f *func = dtrace_probekey_func(desc->dtpd_provider);
7862
7863 return func((char*)data, desc->dtpd_provider, 0);
7864}
7865
7866/*
7867 * DTrace Provider-to-Framework API Functions
7868 *
7869 * These functions implement much of the Provider-to-Framework API, as
7870 * described in <sys/dtrace.h>. The parts of the API not in this section are
7871 * the functions in the API for probe management (found below), and
7872 * dtrace_probe() itself (found above).
7873 */
7874
7875/*
7876 * Register the calling provider with the DTrace framework. This should
7877 * generally be called by DTrace providers in their attach(9E) entry point.
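 *
 * A minimal sketch of a caller (the "foo" provider, foo_attr, foo_pops and
 * foo_attach() below are hypothetical, for illustration only):
 *
 *	static dtrace_provider_id_t foo_id;
 *
 *	static int
 *	foo_attach(void)
 *	{
 *		return (dtrace_register("foo", &foo_attr, DTRACE_PRIV_KERNEL,
 *		    NULL, &foo_pops, NULL, &foo_id));
 *	}
 *
 * The identifier returned through idp is later handed back to
 * dtrace_unregister() from the provider's detach path.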
7878 */
7879int
7880dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv,
7881 cred_t *cr, const dtrace_pops_t *pops, void *arg, dtrace_provider_id_t *idp)
7882{
7883 dtrace_provider_t *provider;
7884
7885 if (name == NULL || pap == NULL || pops == NULL || idp == NULL) {
7886 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7887 "arguments", name ? name : "<NULL>");
7888 return (EINVAL);
7889 }
7890
7891 if (name[0] == '\0' || dtrace_badname(name)) {
7892 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7893 "provider name", name);
7894 return (EINVAL);
7895 }
7896
7897 if ((pops->dtps_provide == NULL && pops->dtps_provide_module == NULL) ||
7898 pops->dtps_enable == NULL || pops->dtps_disable == NULL ||
7899 pops->dtps_destroy == NULL ||
7900 ((pops->dtps_resume == NULL) != (pops->dtps_suspend == NULL))) {
7901 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7902 "provider ops", name);
7903 return (EINVAL);
7904 }
7905
7906 if (dtrace_badattr(&pap->dtpa_provider) ||
7907 dtrace_badattr(&pap->dtpa_mod) ||
7908 dtrace_badattr(&pap->dtpa_func) ||
7909 dtrace_badattr(&pap->dtpa_name) ||
7910 dtrace_badattr(&pap->dtpa_args)) {
7911 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7912 "provider attributes", name);
7913 return (EINVAL);
7914 }
7915
7916 if (priv & ~DTRACE_PRIV_ALL) {
7917 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7918 "privilege attributes", name);
7919 return (EINVAL);
7920 }
7921
7922 if ((priv & DTRACE_PRIV_KERNEL) &&
7923 (priv & (DTRACE_PRIV_USER | DTRACE_PRIV_OWNER)) &&
7924 pops->dtps_usermode == NULL) {
7925 cmn_err(CE_WARN, "failed to register provider '%s': need "
7926 "dtps_usermode() op for given privilege attributes", name);
7927 return (EINVAL);
7928 }
7929
7930 provider = kmem_zalloc(sizeof (dtrace_provider_t), KM_SLEEP);
7931
7932 provider->dtpv_attr = *pap;
7933 provider->dtpv_priv.dtpp_flags = priv;
7934 if (cr != NULL) {
7935 provider->dtpv_priv.dtpp_uid = crgetuid(cr);
7936 provider->dtpv_priv.dtpp_zoneid = crgetzoneid(cr);
7937 }
7938 provider->dtpv_pops = *pops;
7939
7940 if (pops->dtps_provide == NULL) {
7941 ASSERT(pops->dtps_provide_module != NULL);
7942 provider->dtpv_pops.dtps_provide =
7943 (void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop;
7944 }
7945
7946 if (pops->dtps_provide_module == NULL) {
7947 ASSERT(pops->dtps_provide != NULL);
7948 provider->dtpv_pops.dtps_provide_module =
7949 (void (*)(void *, struct modctl *))dtrace_nullop;
7950 }
7951
7952 if (pops->dtps_suspend == NULL) {
7953 ASSERT(pops->dtps_resume == NULL);
7954 provider->dtpv_pops.dtps_suspend =
7955 (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
7956 provider->dtpv_pops.dtps_resume =
7957 (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
7958 }
7959
7960 provider->dtpv_arg = arg;
7961 *idp = (dtrace_provider_id_t)provider;
7962
7963 if (pops == &dtrace_provider_ops) {
7964 LCK_MTX_ASSERT(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
7965 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
7966
7967 provider->dtpv_name = dtrace_strref(name);
7968
7969 ASSERT(dtrace_anon.dta_enabling == NULL);
7970
7971 /*
7972 * We make sure that the DTrace provider is at the head of
7973 * the provider chain.
7974 */
7975 provider->dtpv_next = dtrace_provider;
7976 dtrace_provider = provider;
7977 return (0);
7978 }
7979
7980 lck_mtx_lock(&dtrace_provider_lock);
7981 lck_mtx_lock(&dtrace_lock);
7982
7983 provider->dtpv_name = dtrace_strref(name);
7984
7985 /*
7986 * If there is at least one provider registered, we'll add this
7987 * provider after the first provider.
7988 */
7989 if (dtrace_provider != NULL) {
7990 provider->dtpv_next = dtrace_provider->dtpv_next;
7991 dtrace_provider->dtpv_next = provider;
7992 } else {
7993 dtrace_provider = provider;
7994 }
7995
7996 if (dtrace_retained != NULL) {
7997 dtrace_enabling_provide(provider);
7998
		/*
		 * Now we need to call dtrace_enabling_matchall_with_cond(),
		 * with a condition matching the provider name we just added;
		 * it will acquire cpu_lock and dtrace_lock, so we must drop
		 * all of our locks before calling into it...
		 */
8005 lck_mtx_unlock(&dtrace_lock);
8006 lck_mtx_unlock(&dtrace_provider_lock);
8007
8008 dtrace_match_cond_t cond = {dtrace_cond_provider_match, provider->dtpv_name};
8009 dtrace_enabling_matchall_with_cond(&cond);
8010
8011 return (0);
8012 }
8013
8014 lck_mtx_unlock(&dtrace_lock);
8015 lck_mtx_unlock(&dtrace_provider_lock);
8016
8017 return (0);
8018}
8019
8020/*
8021 * Unregister the specified provider from the DTrace framework. This should
8022 * generally be called by DTrace providers in their detach(9E) entry point.
8023 */
8024int
8025dtrace_unregister(dtrace_provider_id_t id)
8026{
8027 dtrace_provider_t *old = (dtrace_provider_t *)id;
8028 dtrace_provider_t *prev = NULL;
8029 int self = 0;
8030 dtrace_probe_t *probe, *first = NULL, *next = NULL;
8031 dtrace_probe_t template = {
8032 .dtpr_provider = old
8033 };
8034
8035 if (old->dtpv_pops.dtps_enable ==
8036 (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop) {
8037 /*
8038 * If DTrace itself is the provider, we're called with locks
8039 * already held.
8040 */
8041 ASSERT(old == dtrace_provider);
8042 ASSERT(dtrace_devi != NULL);
8043 LCK_MTX_ASSERT(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
8044 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8045 self = 1;
8046
8047 if (dtrace_provider->dtpv_next != NULL) {
8048 /*
8049 * There's another provider here; return failure.
8050 */
8051 return (EBUSY);
8052 }
8053 } else {
8054 lck_mtx_lock(&dtrace_provider_lock);
8055 lck_mtx_lock(&mod_lock);
8056 lck_mtx_lock(&dtrace_lock);
8057 }
8058
8059 /*
8060 * If anyone has /dev/dtrace open, or if there are anonymous enabled
8061 * probes, we refuse to let providers slither away, unless this
8062 * provider has already been explicitly invalidated.
8063 */
8064 if (!old->dtpv_defunct &&
8065 (dtrace_opens || (dtrace_anon.dta_state != NULL &&
8066 dtrace_anon.dta_state->dts_necbs > 0))) {
8067 if (!self) {
8068 lck_mtx_unlock(&dtrace_lock);
8069 lck_mtx_unlock(&mod_lock);
8070 lck_mtx_unlock(&dtrace_provider_lock);
8071 }
8072 return (EBUSY);
8073 }
8074
8075 /*
8076 * Attempt to destroy the probes associated with this provider.
8077 */
	if (old->dtpv_ecb_count != 0) {
8079 /*
8080 * We have at least one ECB; we can't remove this provider.
8081 */
8082 if (!self) {
8083 lck_mtx_unlock(&dtrace_lock);
8084 lck_mtx_unlock(&mod_lock);
8085 lck_mtx_unlock(&dtrace_provider_lock);
8086 }
8087 return (EBUSY);
8088 }
8089
8090 /*
8091 * All of the probes for this provider are disabled; we can safely
8092 * remove all of them from their hash chains and from the probe array.
8093 */
8094 for (probe = dtrace_hash_lookup(dtrace_byprov, &template); probe != NULL;
8095 probe = *(DTRACE_HASHNEXT(dtrace_byprov, probe))) {
8096 if (probe->dtpr_provider != old)
8097 continue;
8098
8099 dtrace_probes[probe->dtpr_id - 1] = NULL;
8100 old->dtpv_probe_count--;
8101
8102 dtrace_hash_remove(dtrace_bymod, probe);
8103 dtrace_hash_remove(dtrace_byfunc, probe);
8104 dtrace_hash_remove(dtrace_byname, probe);
8105
8106 if (first == NULL) {
8107 first = probe;
8108 probe->dtpr_nextmod = NULL;
8109 } else {
8110 /*
8111 * Use nextmod as the chain of probes to remove
8112 */
8113 probe->dtpr_nextmod = first;
8114 first = probe;
8115 }
8116 }
8117
8118 for (probe = first; probe != NULL; probe = next) {
8119 next = probe->dtpr_nextmod;
8120 dtrace_hash_remove(dtrace_byprov, probe);
8121 }
8122
8123 /*
8124 * The provider's probes have been removed from the hash chains and
8125 * from the probe array. Now issue a dtrace_sync() to be sure that
8126 * everyone has cleared out from any probe array processing.
8127 */
8128 dtrace_sync();
8129
8130 for (probe = first; probe != NULL; probe = next) {
8131 next = probe->dtpr_nextmod;
8132
8133 old->dtpv_pops.dtps_destroy(old->dtpv_arg, probe->dtpr_id,
8134 probe->dtpr_arg);
8135 dtrace_strunref(probe->dtpr_mod);
8136 dtrace_strunref(probe->dtpr_func);
8137 dtrace_strunref(probe->dtpr_name);
8138 vmem_free(dtrace_arena, (void *)(uintptr_t)(probe->dtpr_id), 1);
8139 zfree(dtrace_probe_t_zone, probe);
8140 }
8141
8142 if ((prev = dtrace_provider) == old) {
8143 ASSERT(self || dtrace_devi == NULL);
8144 ASSERT(old->dtpv_next == NULL || dtrace_devi == NULL);
8145 dtrace_provider = old->dtpv_next;
8146 } else {
8147 while (prev != NULL && prev->dtpv_next != old)
8148 prev = prev->dtpv_next;
8149
8150 if (prev == NULL) {
8151 panic("attempt to unregister non-existent "
8152 "dtrace provider %p\n", (void *)id);
8153 }
8154
8155 prev->dtpv_next = old->dtpv_next;
8156 }
8157
8158 dtrace_strunref(old->dtpv_name);
8159
8160 if (!self) {
8161 lck_mtx_unlock(&dtrace_lock);
8162 lck_mtx_unlock(&mod_lock);
8163 lck_mtx_unlock(&dtrace_provider_lock);
8164 }
8165
8166 kmem_free(old, sizeof (dtrace_provider_t));
8167
8168 return (0);
8169}
8170
8171/*
8172 * Invalidate the specified provider. All subsequent probe lookups for the
8173 * specified provider will fail, but its probes will not be removed.
8174 */
8175void
8176dtrace_invalidate(dtrace_provider_id_t id)
8177{
8178 dtrace_provider_t *pvp = (dtrace_provider_t *)id;
8179
8180 ASSERT(pvp->dtpv_pops.dtps_enable !=
8181 (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop);
8182
8183 lck_mtx_lock(&dtrace_provider_lock);
8184 lck_mtx_lock(&dtrace_lock);
8185
8186 pvp->dtpv_defunct = 1;
8187
8188 lck_mtx_unlock(&dtrace_lock);
8189 lck_mtx_unlock(&dtrace_provider_lock);
8190}
8191
8192/*
8193 * Indicate whether or not DTrace has attached.
8194 */
8195int
8196dtrace_attached(void)
8197{
8198 /*
8199 * dtrace_provider will be non-NULL iff the DTrace driver has
8200 * attached. (It's non-NULL because DTrace is always itself a
8201 * provider.)
8202 */
8203 return (dtrace_provider != NULL);
8204}
8205
8206/*
8207 * Remove all the unenabled probes for the given provider. This function is
8208 * not unlike dtrace_unregister(), except that it doesn't remove the provider
8209 * -- just as many of its associated probes as it can.
8210 */
8211int
8212dtrace_condense(dtrace_provider_id_t id)
8213{
8214 dtrace_provider_t *prov = (dtrace_provider_t *)id;
8215 dtrace_probe_t *probe, *first = NULL;
8216 dtrace_probe_t template = {
8217 .dtpr_provider = prov
8218 };
8219
8220 /*
8221 * Make sure this isn't the dtrace provider itself.
8222 */
8223 ASSERT(prov->dtpv_pops.dtps_enable !=
8224 (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop);
8225
8226 lck_mtx_lock(&dtrace_provider_lock);
8227 lck_mtx_lock(&dtrace_lock);
8228
8229 /*
8230 * Attempt to destroy the probes associated with this provider.
8231 */
8232 for (probe = dtrace_hash_lookup(dtrace_byprov, &template); probe != NULL;
8233 probe = *(DTRACE_HASHNEXT(dtrace_byprov, probe))) {
8234
8235 if (probe->dtpr_provider != prov)
8236 continue;
8237
8238 if (probe->dtpr_ecb != NULL)
8239 continue;
8240
8241 dtrace_probes[probe->dtpr_id - 1] = NULL;
8242 prov->dtpv_probe_count--;
8243
8244 dtrace_hash_remove(dtrace_bymod, probe);
8245 dtrace_hash_remove(dtrace_byfunc, probe);
8246 dtrace_hash_remove(dtrace_byname, probe);
8247
8248 prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id,
8249 probe->dtpr_arg);
8250 dtrace_strunref(probe->dtpr_mod);
8251 dtrace_strunref(probe->dtpr_func);
8252 dtrace_strunref(probe->dtpr_name);
8253 if (first == NULL) {
8254 first = probe;
8255 probe->dtpr_nextmod = NULL;
8256 } else {
8257 /*
8258 * Use nextmod as the chain of probes to remove
8259 */
8260 probe->dtpr_nextmod = first;
8261 first = probe;
8262 }
8263 }
8264
8265 for (probe = first; probe != NULL; probe = first) {
8266 first = probe->dtpr_nextmod;
8267 dtrace_hash_remove(dtrace_byprov, probe);
8268 vmem_free(dtrace_arena, (void *)((uintptr_t)probe->dtpr_id), 1);
8269 zfree(dtrace_probe_t_zone, probe);
8270 }
8271
8272 lck_mtx_unlock(&dtrace_lock);
8273 lck_mtx_unlock(&dtrace_provider_lock);
8274
8275 return (0);
8276}
8277
8278/*
8279 * DTrace Probe Management Functions
8280 *
8281 * The functions in this section perform the DTrace probe management,
8282 * including functions to create probes, look-up probes, and call into the
8283 * providers to request that probes be provided. Some of these functions are
8284 * in the Provider-to-Framework API; these functions can be identified by the
8285 * fact that they are not declared "static".
8286 */
8287
8288/*
8289 * Create a probe with the specified module name, function name, and name.
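 *
 * For instance (hypothetical names), a provider registered as foo_id might
 * create an "entry" probe on function "foo_func" in module "foo_mod", with
 * no artificial frames and no probe-specific argument:
 *
 *	id = dtrace_probe_create(foo_id, "foo_mod", "foo_func", "entry",
 *	    0, NULL);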
8290 */
8291dtrace_id_t
8292dtrace_probe_create(dtrace_provider_id_t prov, const char *mod,
8293 const char *func, const char *name, int aframes, void *arg)
8294{
8295 dtrace_probe_t *probe, **probes;
8296 dtrace_provider_t *provider = (dtrace_provider_t *)prov;
8297 dtrace_id_t id;
8298
8299 if (provider == dtrace_provider) {
8300 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8301 } else {
8302 lck_mtx_lock(&dtrace_lock);
8303 }
8304
8305 id = (dtrace_id_t)(uintptr_t)vmem_alloc(dtrace_arena, 1,
8306 VM_BESTFIT | VM_SLEEP);
8307
8308 probe = zalloc(dtrace_probe_t_zone);
8309 bzero(probe, sizeof (dtrace_probe_t));
8310
8311 probe->dtpr_id = id;
8312 probe->dtpr_gen = dtrace_probegen++;
8313 probe->dtpr_mod = dtrace_strref(mod);
8314 probe->dtpr_func = dtrace_strref(func);
8315 probe->dtpr_name = dtrace_strref(name);
8316 probe->dtpr_arg = arg;
8317 probe->dtpr_aframes = aframes;
8318 probe->dtpr_provider = provider;
8319
8320 dtrace_hash_add(dtrace_byprov, probe);
8321 dtrace_hash_add(dtrace_bymod, probe);
8322 dtrace_hash_add(dtrace_byfunc, probe);
8323 dtrace_hash_add(dtrace_byname, probe);
8324
8325 if (id - 1 >= (dtrace_id_t)dtrace_nprobes) {
8326 size_t osize = dtrace_nprobes * sizeof (dtrace_probe_t *);
8327 size_t nsize = osize << 1;
8328
8329 if (nsize == 0) {
8330 ASSERT(osize == 0);
8331 ASSERT(dtrace_probes == NULL);
8332 nsize = sizeof (dtrace_probe_t *);
8333 }
8334
8335 probes = kmem_zalloc(nsize, KM_SLEEP);
8336
8337 if (dtrace_probes == NULL) {
8338 ASSERT(osize == 0);
8339 dtrace_probes = probes;
8340 dtrace_nprobes = 1;
8341 } else {
8342 dtrace_probe_t **oprobes = dtrace_probes;
8343
8344 bcopy(oprobes, probes, osize);
8345 dtrace_membar_producer();
8346 dtrace_probes = probes;
8347
8348 dtrace_sync();
8349
8350 /*
8351 * All CPUs are now seeing the new probes array; we can
8352 * safely free the old array.
8353 */
8354 kmem_free(oprobes, osize);
8355 dtrace_nprobes <<= 1;
8356 }
8357
8358 ASSERT(id - 1 < (dtrace_id_t)dtrace_nprobes);
8359 }
8360
8361 ASSERT(dtrace_probes[id - 1] == NULL);
8362 dtrace_probes[id - 1] = probe;
8363 provider->dtpv_probe_count++;
8364
8365 if (provider != dtrace_provider)
8366 lck_mtx_unlock(&dtrace_lock);
8367
8368 return (id);
8369}
8370
8371static dtrace_probe_t *
8372dtrace_probe_lookup_id(dtrace_id_t id)
8373{
8374 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8375
8376 if (id == 0 || id > (dtrace_id_t)dtrace_nprobes)
8377 return (NULL);
8378
8379 return (dtrace_probes[id - 1]);
8380}
8381
8382static int
8383dtrace_probe_lookup_match(dtrace_probe_t *probe, void *arg1, void *arg2)
8384{
8385#pragma unused(arg2)
8386 *((dtrace_id_t *)arg1) = probe->dtpr_id;
8387
8388 return (DTRACE_MATCH_DONE);
8389}
8390
8391/*
8392 * Look up a probe based on provider and one or more of module name, function
8393 * name and probe name.
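 *
 * Providers typically call this from their dtps_provide() entry point to
 * avoid creating duplicate probes; a hedged sketch, reusing the
 * hypothetical "foo" names from above:
 *
 *	if (dtrace_probe_lookup(foo_id, "foo_mod", "foo_func", "entry") == 0)
 *		(void) dtrace_probe_create(foo_id, "foo_mod", "foo_func",
 *		    "entry", 0, NULL);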
8394 */
8395dtrace_id_t
8396dtrace_probe_lookup(dtrace_provider_id_t prid, const char *mod,
8397 const char *func, const char *name)
8398{
8399 dtrace_probekey_t pkey;
8400 dtrace_id_t id;
8401 int match;
8402
8403 lck_mtx_lock(&dtrace_lock);
8404
8405 pkey.dtpk_prov = dtrace_strref(((dtrace_provider_t *)prid)->dtpv_name);
8406 pkey.dtpk_pmatch = &dtrace_match_string;
8407 pkey.dtpk_mod = dtrace_strref(mod);
8408 pkey.dtpk_mmatch = mod ? &dtrace_match_string : &dtrace_match_nul;
8409 pkey.dtpk_func = dtrace_strref(func);
8410 pkey.dtpk_fmatch = func ? &dtrace_match_string : &dtrace_match_nul;
8411 pkey.dtpk_name = dtrace_strref(name);
8412 pkey.dtpk_nmatch = name ? &dtrace_match_string : &dtrace_match_nul;
8413 pkey.dtpk_id = DTRACE_IDNONE;
8414
8415 match = dtrace_match(&pkey, DTRACE_PRIV_ALL, 0, 0,
8416 dtrace_probe_lookup_match, &id, NULL);
8417
8418 dtrace_probekey_release(&pkey);
8419
8420 lck_mtx_unlock(&dtrace_lock);
8421
8422 ASSERT(match == 1 || match == 0);
8423 return (match ? id : 0);
8424}
8425
8426/*
8427 * Returns the probe argument associated with the specified probe.
8428 */
8429void *
8430dtrace_probe_arg(dtrace_provider_id_t id, dtrace_id_t pid)
8431{
8432 dtrace_probe_t *probe;
8433 void *rval = NULL;
8434
8435 lck_mtx_lock(&dtrace_lock);
8436
8437 if ((probe = dtrace_probe_lookup_id(pid)) != NULL &&
8438 probe->dtpr_provider == (dtrace_provider_t *)id)
8439 rval = probe->dtpr_arg;
8440
8441 lck_mtx_unlock(&dtrace_lock);
8442
8443 return (rval);
8444}
8445
8446/*
8447 * Copy a probe into a probe description.
8448 */
8449static void
8450dtrace_probe_description(const dtrace_probe_t *prp, dtrace_probedesc_t *pdp)
8451{
8452 bzero(pdp, sizeof (dtrace_probedesc_t));
8453 pdp->dtpd_id = prp->dtpr_id;
8454
	/* APPLE NOTE: Darwin employs size-bounded string operations. */
8456 (void) strlcpy(pdp->dtpd_provider,
8457 prp->dtpr_provider->dtpv_name, DTRACE_PROVNAMELEN);
8458
8459 (void) strlcpy(pdp->dtpd_mod, prp->dtpr_mod, DTRACE_MODNAMELEN);
8460 (void) strlcpy(pdp->dtpd_func, prp->dtpr_func, DTRACE_FUNCNAMELEN);
8461 (void) strlcpy(pdp->dtpd_name, prp->dtpr_name, DTRACE_NAMELEN);
8462}
8463
8464/*
8465 * Called to indicate that a probe -- or probes -- should be provided by a
 * specified provider. If the specified description is NULL, the provider will
8467 * be told to provide all of its probes. (This is done whenever a new
8468 * consumer comes along, or whenever a retained enabling is to be matched.) If
8469 * the specified description is non-NULL, the provider is given the
8470 * opportunity to dynamically provide the specified probe, allowing providers
8471 * to support the creation of probes on-the-fly. (So-called _autocreated_
8472 * probes.) If the provider is NULL, the operations will be applied to all
8473 * providers; if the provider is non-NULL the operations will only be applied
8474 * to the specified provider. The dtrace_provider_lock must be held, and the
8475 * dtrace_lock must _not_ be held -- the provider's dtps_provide() operation
8476 * will need to grab the dtrace_lock when it reenters the framework through
8477 * dtrace_probe_lookup(), dtrace_probe_create(), etc.
8478 */
8479static void
8480dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv)
8481{
8482 struct modctl *ctl;
8483 int all = 0;
8484
8485 LCK_MTX_ASSERT(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
8486
8487 if (prv == NULL) {
8488 all = 1;
8489 prv = dtrace_provider;
8490 }
8491
8492 do {
8493 /*
8494 * First, call the blanket provide operation.
8495 */
8496 prv->dtpv_pops.dtps_provide(prv->dtpv_arg, desc);
8497
8498 /*
8499 * Now call the per-module provide operation. We will grab
8500 * mod_lock to prevent the list from being modified. Note
8501 * that this also prevents the mod_busy bits from changing.
8502 * (mod_busy can only be changed with mod_lock held.)
8503 */
8504 lck_mtx_lock(&mod_lock);
8505
8506 ctl = dtrace_modctl_list;
8507 while (ctl) {
8508 prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
8509 ctl = ctl->mod_next;
8510 }
8511
8512 lck_mtx_unlock(&mod_lock);
8513 } while (all && (prv = prv->dtpv_next) != NULL);
8514}
8515
8516/*
8517 * Iterate over each probe, and call the Framework-to-Provider API function
8518 * denoted by offs.
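 *
 * The offs argument is the byte offset of the desired operation within
 * dtrace_pops_t; a caller wishing to suspend every enabled probe might,
 * illustratively, pass offsetof(dtrace_pops_t, dtps_suspend).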
8519 */
8520static void
8521dtrace_probe_foreach(uintptr_t offs)
8522{
8523 dtrace_provider_t *prov;
8524 void (*func)(void *, dtrace_id_t, void *);
8525 dtrace_probe_t *probe;
8526 dtrace_icookie_t cookie;
8527 int i;
8528
8529 /*
8530 * We disable interrupts to walk through the probe array. This is
8531 * safe -- the dtrace_sync() in dtrace_unregister() assures that we
8532 * won't see stale data.
8533 */
8534 cookie = dtrace_interrupt_disable();
8535
8536 for (i = 0; i < dtrace_nprobes; i++) {
8537 if ((probe = dtrace_probes[i]) == NULL)
8538 continue;
8539
8540 if (probe->dtpr_ecb == NULL) {
8541 /*
8542 * This probe isn't enabled -- don't call the function.
8543 */
8544 continue;
8545 }
8546
8547 prov = probe->dtpr_provider;
8548 func = *((void(**)(void *, dtrace_id_t, void *))
8549 ((uintptr_t)&prov->dtpv_pops + offs));
8550
8551 func(prov->dtpv_arg, i + 1, probe->dtpr_arg);
8552 }
8553
8554 dtrace_interrupt_enable(cookie);
8555}
8556
8557static int
8558dtrace_probe_enable(const dtrace_probedesc_t *desc, dtrace_enabling_t *enab, dtrace_ecbdesc_t *ep)
8559{
8560 dtrace_probekey_t pkey;
8561 uint32_t priv;
8562 uid_t uid;
8563 zoneid_t zoneid;
8564 int err;
8565
8566 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8567
8568 dtrace_ecb_create_cache = NULL;
8569
8570 if (desc == NULL) {
8571 /*
8572 * If we're passed a NULL description, we're being asked to
8573 * create an ECB with a NULL probe.
8574 */
8575 (void) dtrace_ecb_create_enable(NULL, enab, ep);
8576 return (0);
8577 }
8578
8579 dtrace_probekey(desc, &pkey);
8580 dtrace_cred2priv(enab->dten_vstate->dtvs_state->dts_cred.dcr_cred,
8581 &priv, &uid, &zoneid);
8582
8583 err = dtrace_match(&pkey, priv, uid, zoneid, dtrace_ecb_create_enable, enab, ep);
8584
8585 dtrace_probekey_release(&pkey);
8586
8587 return err;
8588}
8589
8590/*
8591 * DTrace Helper Provider Functions
8592 */
8593static void
8594dtrace_dofattr2attr(dtrace_attribute_t *attr, const dof_attr_t dofattr)
8595{
8596 attr->dtat_name = DOF_ATTR_NAME(dofattr);
8597 attr->dtat_data = DOF_ATTR_DATA(dofattr);
8598 attr->dtat_class = DOF_ATTR_CLASS(dofattr);
8599}
8600
8601static void
8602dtrace_dofprov2hprov(dtrace_helper_provdesc_t *hprov,
8603 const dof_provider_t *dofprov, char *strtab)
8604{
8605 hprov->dthpv_provname = strtab + dofprov->dofpv_name;
8606 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_provider,
8607 dofprov->dofpv_provattr);
8608 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_mod,
8609 dofprov->dofpv_modattr);
8610 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_func,
8611 dofprov->dofpv_funcattr);
8612 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_name,
8613 dofprov->dofpv_nameattr);
8614 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_args,
8615 dofprov->dofpv_argsattr);
8616}
8617
8618static void
8619dtrace_helper_provide_one(dof_helper_t *dhp, dof_sec_t *sec, proc_t *p)
8620{
8621 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
8622 dof_hdr_t *dof = (dof_hdr_t *)daddr;
8623 dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
8624 dof_provider_t *provider;
8625 dof_probe_t *probe;
8626 uint32_t *off, *enoff;
8627 uint8_t *arg;
8628 char *strtab;
8629 uint_t i, nprobes;
8630 dtrace_helper_provdesc_t dhpv;
8631 dtrace_helper_probedesc_t dhpb;
8632 dtrace_meta_t *meta = dtrace_meta_pid;
8633 dtrace_mops_t *mops = &meta->dtm_mops;
8634 void *parg;
8635
8636 provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
8637 str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8638 provider->dofpv_strtab * dof->dofh_secsize);
8639 prb_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8640 provider->dofpv_probes * dof->dofh_secsize);
8641 arg_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8642 provider->dofpv_prargs * dof->dofh_secsize);
8643 off_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8644 provider->dofpv_proffs * dof->dofh_secsize);
8645
8646 strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
8647 off = (uint32_t *)(uintptr_t)(daddr + off_sec->dofs_offset);
8648 arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
8649 enoff = NULL;
8650
8651 /*
8652 * See dtrace_helper_provider_validate().
8653 */
8654 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
8655 provider->dofpv_prenoffs != DOF_SECT_NONE) {
8656 enoff_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8657 provider->dofpv_prenoffs * dof->dofh_secsize);
8658 enoff = (uint32_t *)(uintptr_t)(daddr + enoff_sec->dofs_offset);
8659 }
8660
8661 nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
8662
8663 /*
8664 * Create the provider.
8665 */
8666 dtrace_dofprov2hprov(&dhpv, provider, strtab);
8667
8668 if ((parg = mops->dtms_provide_proc(meta->dtm_arg, &dhpv, p)) == NULL)
8669 return;
8670
8671 meta->dtm_count++;
8672
8673 /*
8674 * Create the probes.
8675 */
8676 for (i = 0; i < nprobes; i++) {
8677 probe = (dof_probe_t *)(uintptr_t)(daddr +
8678 prb_sec->dofs_offset + i * prb_sec->dofs_entsize);
8679
8680 dhpb.dthpb_mod = dhp->dofhp_mod;
8681 dhpb.dthpb_func = strtab + probe->dofpr_func;
8682 dhpb.dthpb_name = strtab + probe->dofpr_name;
8683#if !defined(__APPLE__)
8684 dhpb.dthpb_base = probe->dofpr_addr;
8685#else
8686 dhpb.dthpb_base = dhp->dofhp_addr; /* FIXME: James, why? */
8687#endif
8688 dhpb.dthpb_offs = (int32_t *)(off + probe->dofpr_offidx);
8689 dhpb.dthpb_noffs = probe->dofpr_noffs;
8690 if (enoff != NULL) {
8691 dhpb.dthpb_enoffs = (int32_t *)(enoff + probe->dofpr_enoffidx);
8692 dhpb.dthpb_nenoffs = probe->dofpr_nenoffs;
8693 } else {
8694 dhpb.dthpb_enoffs = NULL;
8695 dhpb.dthpb_nenoffs = 0;
8696 }
8697 dhpb.dthpb_args = arg + probe->dofpr_argidx;
8698 dhpb.dthpb_nargc = probe->dofpr_nargc;
8699 dhpb.dthpb_xargc = probe->dofpr_xargc;
8700 dhpb.dthpb_ntypes = strtab + probe->dofpr_nargv;
8701 dhpb.dthpb_xtypes = strtab + probe->dofpr_xargv;
8702
8703 mops->dtms_create_probe(meta->dtm_arg, parg, &dhpb);
8704 }
8705
	/*
	 * Since we just created probes, we need to match our enablings
	 * against them, knowing that we have added probes only from this
	 * provider.
	 */
8711 char *prov_name = mops->dtms_provider_name(parg);
8712 ASSERT(prov_name != NULL);
8713 dtrace_match_cond_t cond = {dtrace_cond_provider_match, (void*)prov_name};
8714
8715 dtrace_enabling_matchall_with_cond(&cond);
8716}
8717
8718static void
8719dtrace_helper_provide(dof_helper_t *dhp, proc_t *p)
8720{
8721 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
8722 dof_hdr_t *dof = (dof_hdr_t *)daddr;
8723 uint32_t i;
8724
8725 LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
8726
8727 for (i = 0; i < dof->dofh_secnum; i++) {
8728 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
8729 dof->dofh_secoff + i * dof->dofh_secsize);
8730
8731 if (sec->dofs_type != DOF_SECT_PROVIDER)
8732 continue;
8733
8734 dtrace_helper_provide_one(dhp, sec, p);
8735 }
8736}
8737
8738static void
8739dtrace_helper_provider_remove_one(dof_helper_t *dhp, dof_sec_t *sec, proc_t *p)
8740{
8741 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
8742 dof_hdr_t *dof = (dof_hdr_t *)daddr;
8743 dof_sec_t *str_sec;
8744 dof_provider_t *provider;
8745 char *strtab;
8746 dtrace_helper_provdesc_t dhpv;
8747 dtrace_meta_t *meta = dtrace_meta_pid;
8748 dtrace_mops_t *mops = &meta->dtm_mops;
8749
8750 provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
8751 str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8752 provider->dofpv_strtab * dof->dofh_secsize);
8753
8754 strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
8755
8756 /*
8757 * Create the provider.
8758 */
8759 dtrace_dofprov2hprov(&dhpv, provider, strtab);
8760
8761 mops->dtms_remove_proc(meta->dtm_arg, &dhpv, p);
8762
8763 meta->dtm_count--;
8764}
8765
8766static void
8767dtrace_helper_provider_remove(dof_helper_t *dhp, proc_t *p)
8768{
8769 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
8770 dof_hdr_t *dof = (dof_hdr_t *)daddr;
8771 uint32_t i;
8772
8773 LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
8774
8775 for (i = 0; i < dof->dofh_secnum; i++) {
8776 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
8777 dof->dofh_secoff + i * dof->dofh_secsize);
8778
8779 if (sec->dofs_type != DOF_SECT_PROVIDER)
8780 continue;
8781
8782 dtrace_helper_provider_remove_one(dhp, sec, p);
8783 }
8784}
8785
8786/*
8787 * DTrace Meta Provider-to-Framework API Functions
8788 *
8789 * These functions implement the Meta Provider-to-Framework API, as described
8790 * in <sys/dtrace.h>.
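 *
 * As a hedged sketch, a user-land meta-provider supplies a dtrace_mops_t
 * whose dtms_create_probe, dtms_provide_proc and dtms_remove_proc
 * operations are all non-NULL and registers it exactly once (the "foo"
 * names are hypothetical):
 *
 *	static dtrace_meta_provider_id_t foo_meta_id;
 *	(void) dtrace_meta_register("foo_meta", &foo_mops, NULL,
 *	    &foo_meta_id);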
8791 */
8792int
8793dtrace_meta_register(const char *name, const dtrace_mops_t *mops, void *arg,
8794 dtrace_meta_provider_id_t *idp)
8795{
8796 dtrace_meta_t *meta;
8797 dtrace_helpers_t *help, *next;
8798 uint_t i;
8799
8800 *idp = DTRACE_METAPROVNONE;
8801
8802 /*
8803 * We strictly don't need the name, but we hold onto it for
8804 * debuggability. All hail error queues!
8805 */
8806 if (name == NULL) {
8807 cmn_err(CE_WARN, "failed to register meta-provider: "
8808 "invalid name");
8809 return (EINVAL);
8810 }
8811
8812 if (mops == NULL ||
8813 mops->dtms_create_probe == NULL ||
8814 mops->dtms_provide_proc == NULL ||
8815 mops->dtms_remove_proc == NULL) {
		cmn_err(CE_WARN, "failed to register meta-provider %s: "
		    "invalid ops", name);
8818 return (EINVAL);
8819 }
8820
8821 meta = kmem_zalloc(sizeof (dtrace_meta_t), KM_SLEEP);
8822 meta->dtm_mops = *mops;
8823 meta->dtm_arg = arg;
8824
8825 lck_mtx_lock(&dtrace_meta_lock);
8826 lck_mtx_lock(&dtrace_lock);
8827
8828 if (dtrace_meta_pid != NULL) {
8829 lck_mtx_unlock(&dtrace_lock);
8830 lck_mtx_unlock(&dtrace_meta_lock);
		cmn_err(CE_WARN, "failed to register meta-provider %s: "
		    "user-land meta-provider exists", name);
8833 kmem_free(meta, sizeof (dtrace_meta_t));
8834 return (EINVAL);
8835 }
8836
8837 meta->dtm_name = dtrace_strref(name);
8838
8839 dtrace_meta_pid = meta;
8840 *idp = (dtrace_meta_provider_id_t)meta;
8841
8842 /*
8843 * If there are providers and probes ready to go, pass them
8844 * off to the new meta provider now.
8845 */
8846
8847 help = dtrace_deferred_pid;
8848 dtrace_deferred_pid = NULL;
8849
8850 lck_mtx_unlock(&dtrace_lock);
8851
8852 while (help != NULL) {
8853 for (i = 0; i < help->dthps_nprovs; i++) {
8854 proc_t *p = proc_find(help->dthps_pid);
8855 if (p == PROC_NULL)
8856 continue;
8857 dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
8858 p);
8859 proc_rele(p);
8860 }
8861
8862 next = help->dthps_next;
8863 help->dthps_next = NULL;
8864 help->dthps_prev = NULL;
8865 help->dthps_deferred = 0;
8866 help = next;
8867 }
8868
8869 lck_mtx_unlock(&dtrace_meta_lock);
8870
8871 return (0);
8872}
8873
8874int
8875dtrace_meta_unregister(dtrace_meta_provider_id_t id)
8876{
8877 dtrace_meta_t **pp, *old = (dtrace_meta_t *)id;
8878
8879 lck_mtx_lock(&dtrace_meta_lock);
8880 lck_mtx_lock(&dtrace_lock);
8881
8882 if (old == dtrace_meta_pid) {
8883 pp = &dtrace_meta_pid;
8884 } else {
8885 panic("attempt to unregister non-existent "
8886 "dtrace meta-provider %p\n", (void *)old);
8887 }
8888
8889 if (old->dtm_count != 0) {
8890 lck_mtx_unlock(&dtrace_lock);
8891 lck_mtx_unlock(&dtrace_meta_lock);
8892 return (EBUSY);
8893 }
8894
8895 *pp = NULL;
8896
8897 dtrace_strunref(old->dtm_name);
8898
8899 lck_mtx_unlock(&dtrace_lock);
8900 lck_mtx_unlock(&dtrace_meta_lock);
8901
8902 kmem_free(old, sizeof (dtrace_meta_t));
8903
8904 return (0);
8905}
8906
8907
8908/*
8909 * DTrace DIF Object Functions
8910 */
8911static int
8912dtrace_difo_err(uint_t pc, const char *format, ...)
8913{
8914 if (dtrace_err_verbose) {
8915 va_list alist;
8916
8917 (void) uprintf("dtrace DIF object error: [%u]: ", pc);
8918 va_start(alist, format);
8919 (void) vuprintf(format, alist);
8920 va_end(alist);
8921 }
8922
8923#ifdef DTRACE_ERRDEBUG
8924 dtrace_errdebug(format);
8925#endif
8926 return (1);
8927}
8928
8929/*
8930 * Validate a DTrace DIF object by checking the IR instructions. The following
8931 * rules are currently enforced by dtrace_difo_validate():
8932 *
8933 * 1. Each instruction must have a valid opcode
8934 * 2. Each register, string, variable, or subroutine reference must be valid
8935 * 3. No instruction can modify register %r0 (must be zero)
8936 * 4. All instruction reserved bits must be set to zero
8937 * 5. The last instruction must be a "ret" instruction
8938 * 6. All branch targets must reference a valid instruction _after_ the branch
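 *
 * As an illustrative aside, rule 5 implies that the smallest well-formed
 * DIFO body is a single return; assuming the DIF_INSTR_RET() constructor
 * macro from <sys/dtrace.h>, something like:
 *
 *	dif_instr_t buf[] = { DIF_INSTR_RET(DIF_REG_R0) };
 *
 * passes, whereas any buffer whose final instruction is not a "ret" is
 * rejected below.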
8939 */
8940static int
8941dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs,
8942 cred_t *cr)
8943{
8944 int err = 0;
8945 uint_t i;
8946
8947 int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
8948 int kcheckload;
8949 uint_t pc;
8950 int maxglobal = -1, maxlocal = -1, maxtlocal = -1;
8951
8952 kcheckload = cr == NULL ||
8953 (vstate->dtvs_state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) == 0;
8954
8955 dp->dtdo_destructive = 0;
8956
8957 for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {
8958 dif_instr_t instr = dp->dtdo_buf[pc];
8959
8960 uint_t r1 = DIF_INSTR_R1(instr);
8961 uint_t r2 = DIF_INSTR_R2(instr);
8962 uint_t rd = DIF_INSTR_RD(instr);
8963 uint_t rs = DIF_INSTR_RS(instr);
8964 uint_t label = DIF_INSTR_LABEL(instr);
8965 uint_t v = DIF_INSTR_VAR(instr);
8966 uint_t subr = DIF_INSTR_SUBR(instr);
8967 uint_t type = DIF_INSTR_TYPE(instr);
8968 uint_t op = DIF_INSTR_OP(instr);
8969
8970 switch (op) {
8971 case DIF_OP_OR:
8972 case DIF_OP_XOR:
8973 case DIF_OP_AND:
8974 case DIF_OP_SLL:
8975 case DIF_OP_SRL:
8976 case DIF_OP_SRA:
8977 case DIF_OP_SUB:
8978 case DIF_OP_ADD:
8979 case DIF_OP_MUL:
8980 case DIF_OP_SDIV:
8981 case DIF_OP_UDIV:
8982 case DIF_OP_SREM:
8983 case DIF_OP_UREM:
8984 case DIF_OP_COPYS:
8985 if (r1 >= nregs)
8986 err += efunc(pc, "invalid register %u\n", r1);
8987 if (r2 >= nregs)
8988 err += efunc(pc, "invalid register %u\n", r2);
8989 if (rd >= nregs)
8990 err += efunc(pc, "invalid register %u\n", rd);
8991 if (rd == 0)
8992 err += efunc(pc, "cannot write to %r0\n");
8993 break;
8994 case DIF_OP_NOT:
8995 case DIF_OP_MOV:
8996 case DIF_OP_ALLOCS:
8997 if (r1 >= nregs)
8998 err += efunc(pc, "invalid register %u\n", r1);
8999 if (r2 != 0)
9000 err += efunc(pc, "non-zero reserved bits\n");
9001 if (rd >= nregs)
9002 err += efunc(pc, "invalid register %u\n", rd);
9003 if (rd == 0)
9004 err += efunc(pc, "cannot write to %r0\n");
9005 break;
9006 case DIF_OP_LDSB:
9007 case DIF_OP_LDSH:
9008 case DIF_OP_LDSW:
9009 case DIF_OP_LDUB:
9010 case DIF_OP_LDUH:
9011 case DIF_OP_LDUW:
9012 case DIF_OP_LDX:
9013 if (r1 >= nregs)
9014 err += efunc(pc, "invalid register %u\n", r1);
9015 if (r2 != 0)
9016 err += efunc(pc, "non-zero reserved bits\n");
9017 if (rd >= nregs)
9018 err += efunc(pc, "invalid register %u\n", rd);
9019 if (rd == 0)
9020 err += efunc(pc, "cannot write to %r0\n");
9021 if (kcheckload)
9022 dp->dtdo_buf[pc] = DIF_INSTR_LOAD(op +
9023 DIF_OP_RLDSB - DIF_OP_LDSB, r1, rd);
9024 break;
9025 case DIF_OP_RLDSB:
9026 case DIF_OP_RLDSH:
9027 case DIF_OP_RLDSW:
9028 case DIF_OP_RLDUB:
9029 case DIF_OP_RLDUH:
9030 case DIF_OP_RLDUW:
9031 case DIF_OP_RLDX:
9032 if (r1 >= nregs)
9033 err += efunc(pc, "invalid register %u\n", r1);
9034 if (r2 != 0)
9035 err += efunc(pc, "non-zero reserved bits\n");
9036 if (rd >= nregs)
9037 err += efunc(pc, "invalid register %u\n", rd);
9038 if (rd == 0)
9039 err += efunc(pc, "cannot write to %r0\n");
9040 break;
9041 case DIF_OP_ULDSB:
9042 case DIF_OP_ULDSH:
9043 case DIF_OP_ULDSW:
9044 case DIF_OP_ULDUB:
9045 case DIF_OP_ULDUH:
9046 case DIF_OP_ULDUW:
9047 case DIF_OP_ULDX:
9048 if (r1 >= nregs)
9049 err += efunc(pc, "invalid register %u\n", r1);
9050 if (r2 != 0)
9051 err += efunc(pc, "non-zero reserved bits\n");
9052 if (rd >= nregs)
9053 err += efunc(pc, "invalid register %u\n", rd);
9054 if (rd == 0)
9055 err += efunc(pc, "cannot write to %r0\n");
9056 break;
9057 case DIF_OP_STB:
9058 case DIF_OP_STH:
9059 case DIF_OP_STW:
9060 case DIF_OP_STX:
9061 if (r1 >= nregs)
9062 err += efunc(pc, "invalid register %u\n", r1);
9063 if (r2 != 0)
9064 err += efunc(pc, "non-zero reserved bits\n");
9065 if (rd >= nregs)
9066 err += efunc(pc, "invalid register %u\n", rd);
9067 if (rd == 0)
9068 err += efunc(pc, "cannot write to 0 address\n");
9069 break;
9070 case DIF_OP_CMP:
9071 case DIF_OP_SCMP:
9072 if (r1 >= nregs)
9073 err += efunc(pc, "invalid register %u\n", r1);
9074 if (r2 >= nregs)
9075 err += efunc(pc, "invalid register %u\n", r2);
9076 if (rd != 0)
9077 err += efunc(pc, "non-zero reserved bits\n");
9078 break;
9079 case DIF_OP_TST:
9080 if (r1 >= nregs)
9081 err += efunc(pc, "invalid register %u\n", r1);
9082 if (r2 != 0 || rd != 0)
9083 err += efunc(pc, "non-zero reserved bits\n");
9084 break;
9085 case DIF_OP_BA:
9086 case DIF_OP_BE:
9087 case DIF_OP_BNE:
9088 case DIF_OP_BG:
9089 case DIF_OP_BGU:
9090 case DIF_OP_BGE:
9091 case DIF_OP_BGEU:
9092 case DIF_OP_BL:
9093 case DIF_OP_BLU:
9094 case DIF_OP_BLE:
9095 case DIF_OP_BLEU:
9096 if (label >= dp->dtdo_len) {
9097 err += efunc(pc, "invalid branch target %u\n",
9098 label);
9099 }
9100 if (label <= pc) {
9101 err += efunc(pc, "backward branch to %u\n",
9102 label);
9103 }
9104 break;
9105 case DIF_OP_RET:
9106 if (r1 != 0 || r2 != 0)
9107 err += efunc(pc, "non-zero reserved bits\n");
9108 if (rd >= nregs)
9109 err += efunc(pc, "invalid register %u\n", rd);
9110 break;
9111 case DIF_OP_NOP:
9112 case DIF_OP_POPTS:
9113 case DIF_OP_FLUSHTS:
9114 if (r1 != 0 || r2 != 0 || rd != 0)
9115 err += efunc(pc, "non-zero reserved bits\n");
9116 break;
9117 case DIF_OP_SETX:
9118 if (DIF_INSTR_INTEGER(instr) >= dp->dtdo_intlen) {
9119 err += efunc(pc, "invalid integer ref %u\n",
9120 DIF_INSTR_INTEGER(instr));
9121 }
9122 if (rd >= nregs)
9123 err += efunc(pc, "invalid register %u\n", rd);
9124 if (rd == 0)
9125 err += efunc(pc, "cannot write to %r0\n");
9126 break;
9127 case DIF_OP_SETS:
9128 if (DIF_INSTR_STRING(instr) >= dp->dtdo_strlen) {
9129 err += efunc(pc, "invalid string ref %u\n",
9130 DIF_INSTR_STRING(instr));
9131 }
9132 if (rd >= nregs)
9133 err += efunc(pc, "invalid register %u\n", rd);
9134 if (rd == 0)
9135 err += efunc(pc, "cannot write to %r0\n");
9136 break;
9137 case DIF_OP_LDGA:
9138 case DIF_OP_LDTA:
9139 if (r1 > DIF_VAR_ARRAY_MAX)
9140 err += efunc(pc, "invalid array %u\n", r1);
9141 if (r2 >= nregs)
9142 err += efunc(pc, "invalid register %u\n", r2);
9143 if (rd >= nregs)
9144 err += efunc(pc, "invalid register %u\n", rd);
9145 if (rd == 0)
9146 err += efunc(pc, "cannot write to %r0\n");
9147 break;
9148 case DIF_OP_LDGS:
9149 case DIF_OP_LDTS:
9150 case DIF_OP_LDLS:
9151 case DIF_OP_LDGAA:
9152 case DIF_OP_LDTAA:
9153 if (v < DIF_VAR_OTHER_MIN || v > DIF_VAR_OTHER_MAX)
9154 err += efunc(pc, "invalid variable %u\n", v);
9155 if (rd >= nregs)
9156 err += efunc(pc, "invalid register %u\n", rd);
9157 if (rd == 0)
9158 err += efunc(pc, "cannot write to %r0\n");
9159 break;
9160 case DIF_OP_STGS:
9161 case DIF_OP_STTS:
9162 case DIF_OP_STLS:
9163 case DIF_OP_STGAA:
9164 case DIF_OP_STTAA:
9165 if (v < DIF_VAR_OTHER_UBASE || v > DIF_VAR_OTHER_MAX)
9166 err += efunc(pc, "invalid variable %u\n", v);
			if (rs >= nregs)
				err += efunc(pc, "invalid register %u\n", rs);
9169 break;
9170 case DIF_OP_CALL:
9171 if (subr > DIF_SUBR_MAX &&
9172 !(subr >= DIF_SUBR_APPLE_MIN && subr <= DIF_SUBR_APPLE_MAX))
9173 err += efunc(pc, "invalid subr %u\n", subr);
9174 if (rd >= nregs)
9175 err += efunc(pc, "invalid register %u\n", rd);
9176 if (rd == 0)
9177 err += efunc(pc, "cannot write to %r0\n");
9178
9179 if (subr == DIF_SUBR_COPYOUT ||
9180 subr == DIF_SUBR_COPYOUTSTR ||
9181 subr == DIF_SUBR_KDEBUG_TRACE ||
9182 subr == DIF_SUBR_KDEBUG_TRACE_STRING) {
9183 dp->dtdo_destructive = 1;
9184 }
9185 break;
9186 case DIF_OP_PUSHTR:
9187 if (type != DIF_TYPE_STRING && type != DIF_TYPE_CTF)
9188 err += efunc(pc, "invalid ref type %u\n", type);
9189 if (r2 >= nregs)
9190 err += efunc(pc, "invalid register %u\n", r2);
9191 if (rs >= nregs)
9192 err += efunc(pc, "invalid register %u\n", rs);
9193 break;
9194 case DIF_OP_PUSHTV:
9195 if (type != DIF_TYPE_CTF)
9196 err += efunc(pc, "invalid val type %u\n", type);
9197 if (r2 >= nregs)
9198 err += efunc(pc, "invalid register %u\n", r2);
9199 if (rs >= nregs)
9200 err += efunc(pc, "invalid register %u\n", rs);
9201 break;
9202 default:
9203 err += efunc(pc, "invalid opcode %u\n",
9204 DIF_INSTR_OP(instr));
9205 }
9206 }
9207
9208 if (dp->dtdo_len != 0 &&
9209 DIF_INSTR_OP(dp->dtdo_buf[dp->dtdo_len - 1]) != DIF_OP_RET) {
9210 err += efunc(dp->dtdo_len - 1,
9211 "expected 'ret' as last DIF instruction\n");
9212 }
9213
9214 if (!(dp->dtdo_rtype.dtdt_flags & (DIF_TF_BYREF | DIF_TF_BYUREF))) {
9215 /*
9216 * If we're not returning by reference, the size must be either
9217 * 0 or the size of one of the base types.
9218 */
9219 switch (dp->dtdo_rtype.dtdt_size) {
9220 case 0:
9221 case sizeof (uint8_t):
9222 case sizeof (uint16_t):
9223 case sizeof (uint32_t):
9224 case sizeof (uint64_t):
9225 break;
9226
9227 default:
9228 err += efunc(dp->dtdo_len - 1, "bad return size\n");
9229 }
9230 }
9231
9232 for (i = 0; i < dp->dtdo_varlen && err == 0; i++) {
9233 dtrace_difv_t *v = &dp->dtdo_vartab[i], *existing = NULL;
9234 dtrace_diftype_t *vt, *et;
9235 uint_t id;
9236 int ndx;
9237
9238 if (v->dtdv_scope != DIFV_SCOPE_GLOBAL &&
9239 v->dtdv_scope != DIFV_SCOPE_THREAD &&
9240 v->dtdv_scope != DIFV_SCOPE_LOCAL) {
9241 err += efunc(i, "unrecognized variable scope %d\n",
9242 v->dtdv_scope);
9243 break;
9244 }
9245
9246 if (v->dtdv_kind != DIFV_KIND_ARRAY &&
9247 v->dtdv_kind != DIFV_KIND_SCALAR) {
9248 err += efunc(i, "unrecognized variable type %d\n",
9249 v->dtdv_kind);
9250 break;
9251 }
9252
9253 if ((id = v->dtdv_id) > DIF_VARIABLE_MAX) {
9254 err += efunc(i, "%d exceeds variable id limit\n", id);
9255 break;
9256 }
9257
9258 if (id < DIF_VAR_OTHER_UBASE)
9259 continue;
9260
9261 /*
9262 * For user-defined variables, we need to check that this
9263 * definition is identical to any previous definition that we
9264 * encountered.
9265 */
9266 ndx = id - DIF_VAR_OTHER_UBASE;
9267
9268 switch (v->dtdv_scope) {
9269 case DIFV_SCOPE_GLOBAL:
9270 if (maxglobal == -1 || ndx > maxglobal)
9271 maxglobal = ndx;
9272
9273 if (ndx < vstate->dtvs_nglobals) {
9274 dtrace_statvar_t *svar;
9275
9276 if ((svar = vstate->dtvs_globals[ndx]) != NULL)
9277 existing = &svar->dtsv_var;
9278 }
9279
9280 break;
9281
9282 case DIFV_SCOPE_THREAD:
9283 if (maxtlocal == -1 || ndx > maxtlocal)
9284 maxtlocal = ndx;
9285
9286 if (ndx < vstate->dtvs_ntlocals)
9287 existing = &vstate->dtvs_tlocals[ndx];
9288 break;
9289
9290 case DIFV_SCOPE_LOCAL:
9291 if (maxlocal == -1 || ndx > maxlocal)
9292 maxlocal = ndx;
9293 if (ndx < vstate->dtvs_nlocals) {
9294 dtrace_statvar_t *svar;
9295
9296 if ((svar = vstate->dtvs_locals[ndx]) != NULL)
9297 existing = &svar->dtsv_var;
9298 }
9299
9300 break;
9301 }
9302
9303 vt = &v->dtdv_type;
9304
9305 if (vt->dtdt_flags & DIF_TF_BYREF) {
9306 if (vt->dtdt_size == 0) {
9307 err += efunc(i, "zero-sized variable\n");
9308 break;
9309 }
9310
9311 if ((v->dtdv_scope == DIFV_SCOPE_GLOBAL ||
9312 v->dtdv_scope == DIFV_SCOPE_LOCAL) &&
9313 vt->dtdt_size > dtrace_statvar_maxsize) {
9314 err += efunc(i, "oversized by-ref static\n");
9315 break;
9316 }
9317 }
9318
9319 if (existing == NULL || existing->dtdv_id == 0)
9320 continue;
9321
9322 ASSERT(existing->dtdv_id == v->dtdv_id);
9323 ASSERT(existing->dtdv_scope == v->dtdv_scope);
9324
9325 if (existing->dtdv_kind != v->dtdv_kind)
9326 err += efunc(i, "%d changed variable kind\n", id);
9327
9328 et = &existing->dtdv_type;
9329
9330 if (vt->dtdt_flags != et->dtdt_flags) {
9331 err += efunc(i, "%d changed variable type flags\n", id);
9332 break;
9333 }
9334
9335 if (vt->dtdt_size != 0 && vt->dtdt_size != et->dtdt_size) {
9336 err += efunc(i, "%d changed variable type size\n", id);
9337 break;
9338 }
9339 }
9340
9341 for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {
9342 dif_instr_t instr = dp->dtdo_buf[pc];
9343
9344 uint_t v = DIF_INSTR_VAR(instr);
9345 uint_t op = DIF_INSTR_OP(instr);
9346
9347 switch (op) {
9348 case DIF_OP_LDGS:
9349 case DIF_OP_LDGAA:
9350 case DIF_OP_STGS:
9351 case DIF_OP_STGAA:
9352 if (v > (uint_t)(DIF_VAR_OTHER_UBASE + maxglobal))
9353 err += efunc(pc, "invalid variable %u\n", v);
9354 break;
9355 case DIF_OP_LDTS:
9356 case DIF_OP_LDTAA:
9357 case DIF_OP_STTS:
9358 case DIF_OP_STTAA:
9359 if (v > (uint_t)(DIF_VAR_OTHER_UBASE + maxtlocal))
9360 err += efunc(pc, "invalid variable %u\n", v);
9361 break;
9362 case DIF_OP_LDLS:
9363 case DIF_OP_STLS:
9364 if (v > (uint_t)(DIF_VAR_OTHER_UBASE + maxlocal))
9365 err += efunc(pc, "invalid variable %u\n", v);
9366 break;
9367 default:
9368 break;
9369 }
9370 }
9371
9372 return (err);
9373}
9374
9375/*
 * Validate a DTrace DIF object that is to be used as a helper. Helpers
9377 * are much more constrained than normal DIFOs. Specifically, they may
9378 * not:
9379 *
9380 * 1. Make calls to subroutines other than copyin(), copyinstr() or
9381 * miscellaneous string routines
9382 * 2. Access DTrace variables other than the args[] array, and the
9383 * curthread, pid, ppid, tid, execname, zonename, uid and gid variables.
9384 * 3. Have thread-local variables.
9385 * 4. Have dynamic variables.
9386 */
9387static int
9388dtrace_difo_validate_helper(dtrace_difo_t *dp)
9389{
9390 int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
9391 int err = 0;
9392 uint_t pc;
9393
9394 for (pc = 0; pc < dp->dtdo_len; pc++) {
9395 dif_instr_t instr = dp->dtdo_buf[pc];
9396
9397 uint_t v = DIF_INSTR_VAR(instr);
9398 uint_t subr = DIF_INSTR_SUBR(instr);
9399 uint_t op = DIF_INSTR_OP(instr);
9400
9401 switch (op) {
9402 case DIF_OP_OR:
9403 case DIF_OP_XOR:
9404 case DIF_OP_AND:
9405 case DIF_OP_SLL:
9406 case DIF_OP_SRL:
9407 case DIF_OP_SRA:
9408 case DIF_OP_SUB:
9409 case DIF_OP_ADD:
9410 case DIF_OP_MUL:
9411 case DIF_OP_SDIV:
9412 case DIF_OP_UDIV:
9413 case DIF_OP_SREM:
9414 case DIF_OP_UREM:
9415 case DIF_OP_COPYS:
9416 case DIF_OP_NOT:
9417 case DIF_OP_MOV:
9418 case DIF_OP_RLDSB:
9419 case DIF_OP_RLDSH:
9420 case DIF_OP_RLDSW:
9421 case DIF_OP_RLDUB:
9422 case DIF_OP_RLDUH:
9423 case DIF_OP_RLDUW:
9424 case DIF_OP_RLDX:
9425 case DIF_OP_ULDSB:
9426 case DIF_OP_ULDSH:
9427 case DIF_OP_ULDSW:
9428 case DIF_OP_ULDUB:
9429 case DIF_OP_ULDUH:
9430 case DIF_OP_ULDUW:
9431 case DIF_OP_ULDX:
9432 case DIF_OP_STB:
9433 case DIF_OP_STH:
9434 case DIF_OP_STW:
9435 case DIF_OP_STX:
9436 case DIF_OP_ALLOCS:
9437 case DIF_OP_CMP:
9438 case DIF_OP_SCMP:
9439 case DIF_OP_TST:
9440 case DIF_OP_BA:
9441 case DIF_OP_BE:
9442 case DIF_OP_BNE:
9443 case DIF_OP_BG:
9444 case DIF_OP_BGU:
9445 case DIF_OP_BGE:
9446 case DIF_OP_BGEU:
9447 case DIF_OP_BL:
9448 case DIF_OP_BLU:
9449 case DIF_OP_BLE:
9450 case DIF_OP_BLEU:
9451 case DIF_OP_RET:
9452 case DIF_OP_NOP:
9453 case DIF_OP_POPTS:
9454 case DIF_OP_FLUSHTS:
9455 case DIF_OP_SETX:
9456 case DIF_OP_SETS:
9457 case DIF_OP_LDGA:
9458 case DIF_OP_LDLS:
9459 case DIF_OP_STGS:
9460 case DIF_OP_STLS:
9461 case DIF_OP_PUSHTR:
9462 case DIF_OP_PUSHTV:
9463 break;
9464
9465 case DIF_OP_LDGS:
9466 if (v >= DIF_VAR_OTHER_UBASE)
9467 break;
9468
9469 if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9)
9470 break;
9471
9472 if (v == DIF_VAR_CURTHREAD || v == DIF_VAR_PID ||
9473 v == DIF_VAR_PPID || v == DIF_VAR_TID ||
9474 v == DIF_VAR_EXECNAME || v == DIF_VAR_ZONENAME ||
9475 v == DIF_VAR_UID || v == DIF_VAR_GID)
9476 break;
9477
9478 err += efunc(pc, "illegal variable %u\n", v);
9479 break;
9480
9481 case DIF_OP_LDTA:
9482 case DIF_OP_LDTS:
9483 case DIF_OP_LDGAA:
9484 case DIF_OP_LDTAA:
9485 err += efunc(pc, "illegal dynamic variable load\n");
9486 break;
9487
9488 case DIF_OP_STTS:
9489 case DIF_OP_STGAA:
9490 case DIF_OP_STTAA:
9491 err += efunc(pc, "illegal dynamic variable store\n");
9492 break;
9493
9494 case DIF_OP_CALL:
9495 if (subr == DIF_SUBR_ALLOCA ||
9496 subr == DIF_SUBR_BCOPY ||
9497 subr == DIF_SUBR_COPYIN ||
9498 subr == DIF_SUBR_COPYINTO ||
9499 subr == DIF_SUBR_COPYINSTR ||
9500 subr == DIF_SUBR_INDEX ||
9501 subr == DIF_SUBR_INET_NTOA ||
9502 subr == DIF_SUBR_INET_NTOA6 ||
9503 subr == DIF_SUBR_INET_NTOP ||
9504 subr == DIF_SUBR_LLTOSTR ||
9505 subr == DIF_SUBR_RINDEX ||
9506 subr == DIF_SUBR_STRCHR ||
9507 subr == DIF_SUBR_STRJOIN ||
9508 subr == DIF_SUBR_STRRCHR ||
9509 subr == DIF_SUBR_STRSTR ||
9510 subr == DIF_SUBR_KDEBUG_TRACE ||
9511 subr == DIF_SUBR_KDEBUG_TRACE_STRING ||
9512 subr == DIF_SUBR_HTONS ||
9513 subr == DIF_SUBR_HTONL ||
9514 subr == DIF_SUBR_HTONLL ||
9515 subr == DIF_SUBR_NTOHS ||
9516 subr == DIF_SUBR_NTOHL ||
9517 subr == DIF_SUBR_NTOHLL)
9518 break;
9519
9520 err += efunc(pc, "invalid subr %u\n", subr);
9521 break;
9522
9523 default:
9524 err += efunc(pc, "invalid opcode %u\n",
9525 DIF_INSTR_OP(instr));
9526 }
9527 }
9528
9529 return (err);
9530}
9531
9532/*
9533 * Returns 1 if the expression in the DIF object can be cached on a per-thread
9534 * basis; 0 if not.
9535 */
9536static int
9537dtrace_difo_cacheable(dtrace_difo_t *dp)
9538{
9539 uint_t i;
9540
9541 if (dp == NULL)
9542 return (0);
9543
9544 for (i = 0; i < dp->dtdo_varlen; i++) {
9545 dtrace_difv_t *v = &dp->dtdo_vartab[i];
9546
9547 if (v->dtdv_scope != DIFV_SCOPE_GLOBAL)
9548 continue;
9549
9550 switch (v->dtdv_id) {
9551 case DIF_VAR_CURTHREAD:
9552 case DIF_VAR_PID:
9553 case DIF_VAR_TID:
9554 case DIF_VAR_EXECNAME:
9555 case DIF_VAR_ZONENAME:
9556 break;
9557
9558 default:
9559 return (0);
9560 }
9561 }
9562
9563 /*
9564 * This DIF object may be cacheable. Now we need to look for any
9565 * array loading instructions, any memory loading instructions, or
9566 * any stores to thread-local variables.
9567 */
9568 for (i = 0; i < dp->dtdo_len; i++) {
9569 uint_t op = DIF_INSTR_OP(dp->dtdo_buf[i]);
9570
9571 if ((op >= DIF_OP_LDSB && op <= DIF_OP_LDX) ||
9572 (op >= DIF_OP_ULDSB && op <= DIF_OP_ULDX) ||
9573 (op >= DIF_OP_RLDSB && op <= DIF_OP_RLDX) ||
9574 op == DIF_OP_LDGA || op == DIF_OP_STTS)
9575 return (0);
9576 }
9577
9578 return (1);
9579}
9580
9581static void
9582dtrace_difo_hold(dtrace_difo_t *dp)
9583{
9584 uint_t i;
9585
9586 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
9587
9588 dp->dtdo_refcnt++;
9589 ASSERT(dp->dtdo_refcnt != 0);
9590
9591 /*
9592 * We need to check this DIF object for references to the variable
9593 * DIF_VAR_VTIMESTAMP.
9594 */
9595 for (i = 0; i < dp->dtdo_varlen; i++) {
9596 dtrace_difv_t *v = &dp->dtdo_vartab[i];
9597
9598 if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
9599 continue;
9600
9601 if (dtrace_vtime_references++ == 0)
9602 dtrace_vtime_enable();
9603 }
9604}
9605
9606/*
9607 * This routine calculates the dynamic variable chunksize for a given DIF
9608 * object. The calculation is not fool-proof, and can probably be tricked by
9609 * malicious DIF -- but it works for all compiler-generated DIF. Because this
9610 * calculation is likely imperfect, dtrace_dynvar() is able to gracefully fail
9611 * if a dynamic variable size exceeds the chunksize.
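 *
 * As a worked illustration: a thread-local store (DIF_OP_STTS, e.g. from
 * an assignment to self->x) uses two zero-sized keys, so ksize is 0 and
 * the chunk must cover sizeof (dtrace_dynvar_t) for the variable itself,
 * sizeof (dtrace_key_t) for the one key beyond the first, plus the
 * variable's own dtdt_size, all rounded up to a multiple of
 * sizeof (uint64_t).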
9612 */
9613static void
9614dtrace_difo_chunksize(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9615{
9616 uint64_t sval = 0;
9617 dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
9618 const dif_instr_t *text = dp->dtdo_buf;
9619 uint_t pc, srd = 0;
9620 uint_t ttop = 0;
9621 size_t size, ksize;
9622 uint_t id, i;
9623
9624 for (pc = 0; pc < dp->dtdo_len; pc++) {
9625 dif_instr_t instr = text[pc];
9626 uint_t op = DIF_INSTR_OP(instr);
9627 uint_t rd = DIF_INSTR_RD(instr);
9628 uint_t r1 = DIF_INSTR_R1(instr);
9629 uint_t nkeys = 0;
9630 uchar_t scope;
9631
9632 dtrace_key_t *key = tupregs;
9633
9634 switch (op) {
9635 case DIF_OP_SETX:
9636 sval = dp->dtdo_inttab[DIF_INSTR_INTEGER(instr)];
9637 srd = rd;
9638 continue;
9639
9640 case DIF_OP_STTS:
9641 key = &tupregs[DIF_DTR_NREGS];
9642 key[0].dttk_size = 0;
9643 key[1].dttk_size = 0;
9644 nkeys = 2;
9645 scope = DIFV_SCOPE_THREAD;
9646 break;
9647
9648 case DIF_OP_STGAA:
9649 case DIF_OP_STTAA:
9650 nkeys = ttop;
9651
9652 if (DIF_INSTR_OP(instr) == DIF_OP_STTAA)
9653 key[nkeys++].dttk_size = 0;
9654
9655 key[nkeys++].dttk_size = 0;
9656
9657 if (op == DIF_OP_STTAA) {
9658 scope = DIFV_SCOPE_THREAD;
9659 } else {
9660 scope = DIFV_SCOPE_GLOBAL;
9661 }
9662
9663 break;
9664
9665 case DIF_OP_PUSHTR:
9666 if (ttop == DIF_DTR_NREGS)
9667 return;
9668
9669 if ((srd == 0 || sval == 0) && r1 == DIF_TYPE_STRING) {
9670 /*
9671 * If the register for the size of the "pushtr"
9672 * is %r0 (or the value is 0) and the type is
9673 * a string, we'll use the system-wide default
9674 * string size.
9675 */
9676 tupregs[ttop++].dttk_size =
9677 dtrace_strsize_default;
9678 } else {
9679 if (srd == 0)
9680 return;
9681
9682 if (sval > LONG_MAX)
9683 return;
9684
9685 tupregs[ttop++].dttk_size = sval;
9686 }
9687
9688 break;
9689
9690 case DIF_OP_PUSHTV:
9691 if (ttop == DIF_DTR_NREGS)
9692 return;
9693
9694 tupregs[ttop++].dttk_size = 0;
9695 break;
9696
9697 case DIF_OP_FLUSHTS:
9698 ttop = 0;
9699 break;
9700
9701 case DIF_OP_POPTS:
9702 if (ttop != 0)
9703 ttop--;
9704 break;
9705 }
9706
9707 sval = 0;
9708 srd = 0;
9709
9710 if (nkeys == 0)
9711 continue;
9712
9713 /*
9714 * We have a dynamic variable allocation; calculate its size.
9715 */
9716 for (ksize = 0, i = 0; i < nkeys; i++)
9717 ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
9718
9719 size = sizeof (dtrace_dynvar_t);
9720 size += sizeof (dtrace_key_t) * (nkeys - 1);
9721 size += ksize;
9722
9723 /*
9724 * Now we need to determine the size of the stored data.
9725 */
9726 id = DIF_INSTR_VAR(instr);
9727
9728 for (i = 0; i < dp->dtdo_varlen; i++) {
9729 dtrace_difv_t *v = &dp->dtdo_vartab[i];
9730
9731 if (v->dtdv_id == id && v->dtdv_scope == scope) {
9732 size += v->dtdv_type.dtdt_size;
9733 break;
9734 }
9735 }
9736
9737 if (i == dp->dtdo_varlen)
9738 return;
9739
9740 /*
9741 * We have the size. If this is larger than the chunk size
9742 * for our dynamic variable state, reset the chunk size.
9743 */
9744 size = P2ROUNDUP(size, sizeof (uint64_t));
9745
9746 /*
9747 * Before setting the chunk size, check that we're not going
9748 * to set it to a negative value...
9749 */
9750 if (size > LONG_MAX)
9751 return;
9752
9753 /*
9754 * ...and make certain that we didn't badly overflow.
9755 */
9756 if (size < ksize || size < sizeof (dtrace_dynvar_t))
9757 return;
9758
9759 if (size > vstate->dtvs_dynvars.dtds_chunksize)
9760 vstate->dtvs_dynvars.dtds_chunksize = size;
9761 }
9762}
9763
9764static void
9765dtrace_difo_init(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9766{
9767 int oldsvars, osz, nsz, otlocals, ntlocals;
9768 uint_t i, id;
9769
9770 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
9771 ASSERT(dp->dtdo_buf != NULL && dp->dtdo_len != 0);
9772
9773 for (i = 0; i < dp->dtdo_varlen; i++) {
9774 dtrace_difv_t *v = &dp->dtdo_vartab[i];
9775 dtrace_statvar_t *svar;
9776 dtrace_statvar_t ***svarp = NULL;
9777 size_t dsize = 0;
9778 uint8_t scope = v->dtdv_scope;
		int *np = NULL;
9780
9781 if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
9782 continue;
9783
9784 id -= DIF_VAR_OTHER_UBASE;
9785
9786 switch (scope) {
9787 case DIFV_SCOPE_THREAD:
9788 while (id >= (uint_t)(otlocals = vstate->dtvs_ntlocals)) {
9789 dtrace_difv_t *tlocals;
9790
9791 if ((ntlocals = (otlocals << 1)) == 0)
9792 ntlocals = 1;
9793
9794 osz = otlocals * sizeof (dtrace_difv_t);
9795 nsz = ntlocals * sizeof (dtrace_difv_t);
9796
9797 tlocals = kmem_zalloc(nsz, KM_SLEEP);
9798
9799 if (osz != 0) {
9800 bcopy(vstate->dtvs_tlocals,
9801 tlocals, osz);
9802 kmem_free(vstate->dtvs_tlocals, osz);
9803 }
9804
9805 vstate->dtvs_tlocals = tlocals;
9806 vstate->dtvs_ntlocals = ntlocals;
9807 }
9808
9809 vstate->dtvs_tlocals[id] = *v;
9810 continue;
9811
9812 case DIFV_SCOPE_LOCAL:
9813 np = &vstate->dtvs_nlocals;
9814 svarp = &vstate->dtvs_locals;
9815
9816 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
9817 dsize = (int)NCPU * (v->dtdv_type.dtdt_size +
9818 sizeof (uint64_t));
9819 else
9820 dsize = (int)NCPU * sizeof (uint64_t);
9821
9822 break;
9823
9824 case DIFV_SCOPE_GLOBAL:
9825 np = &vstate->dtvs_nglobals;
9826 svarp = &vstate->dtvs_globals;
9827
9828 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
9829 dsize = v->dtdv_type.dtdt_size +
9830 sizeof (uint64_t);
9831
9832 break;
9833
9834 default:
9835 ASSERT(0);
9836 }
9837
9838 while (id >= (uint_t)(oldsvars = *np)) {
9839 dtrace_statvar_t **statics;
9840 int newsvars, oldsize, newsize;
9841
9842 if ((newsvars = (oldsvars << 1)) == 0)
9843 newsvars = 1;
9844
9845 oldsize = oldsvars * sizeof (dtrace_statvar_t *);
9846 newsize = newsvars * sizeof (dtrace_statvar_t *);
9847
9848 statics = kmem_zalloc(newsize, KM_SLEEP);
9849
9850 if (oldsize != 0) {
9851 bcopy(*svarp, statics, oldsize);
9852 kmem_free(*svarp, oldsize);
9853 }
9854
9855 *svarp = statics;
9856 *np = newsvars;
9857 }
9858
9859 if ((svar = (*svarp)[id]) == NULL) {
9860 svar = kmem_zalloc(sizeof (dtrace_statvar_t), KM_SLEEP);
9861 svar->dtsv_var = *v;
9862
9863 if ((svar->dtsv_size = dsize) != 0) {
9864 svar->dtsv_data = (uint64_t)(uintptr_t)
9865 kmem_zalloc(dsize, KM_SLEEP);
9866 }
9867
9868 (*svarp)[id] = svar;
9869 }
9870
9871 svar->dtsv_refcnt++;
9872 }
9873
9874 dtrace_difo_chunksize(dp, vstate);
9875 dtrace_difo_hold(dp);
9876}
9877
9878static dtrace_difo_t *
9879dtrace_difo_duplicate(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9880{
9881 dtrace_difo_t *new;
9882 size_t sz;
9883
9884 ASSERT(dp->dtdo_buf != NULL);
9885 ASSERT(dp->dtdo_refcnt != 0);
9886
9887 new = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
9888
9889 ASSERT(dp->dtdo_buf != NULL);
9890 sz = dp->dtdo_len * sizeof (dif_instr_t);
9891 new->dtdo_buf = kmem_alloc(sz, KM_SLEEP);
9892 bcopy(dp->dtdo_buf, new->dtdo_buf, sz);
9893 new->dtdo_len = dp->dtdo_len;
9894
9895 if (dp->dtdo_strtab != NULL) {
9896 ASSERT(dp->dtdo_strlen != 0);
9897 new->dtdo_strtab = kmem_alloc(dp->dtdo_strlen, KM_SLEEP);
9898 bcopy(dp->dtdo_strtab, new->dtdo_strtab, dp->dtdo_strlen);
9899 new->dtdo_strlen = dp->dtdo_strlen;
9900 }
9901
9902 if (dp->dtdo_inttab != NULL) {
9903 ASSERT(dp->dtdo_intlen != 0);
9904 sz = dp->dtdo_intlen * sizeof (uint64_t);
9905 new->dtdo_inttab = kmem_alloc(sz, KM_SLEEP);
9906 bcopy(dp->dtdo_inttab, new->dtdo_inttab, sz);
9907 new->dtdo_intlen = dp->dtdo_intlen;
9908 }
9909
9910 if (dp->dtdo_vartab != NULL) {
9911 ASSERT(dp->dtdo_varlen != 0);
9912 sz = dp->dtdo_varlen * sizeof (dtrace_difv_t);
9913 new->dtdo_vartab = kmem_alloc(sz, KM_SLEEP);
9914 bcopy(dp->dtdo_vartab, new->dtdo_vartab, sz);
9915 new->dtdo_varlen = dp->dtdo_varlen;
9916 }
9917
9918 dtrace_difo_init(new, vstate);
9919 return (new);
9920}
9921
9922static void
9923dtrace_difo_destroy(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9924{
9925 uint_t i;
9926
9927 ASSERT(dp->dtdo_refcnt == 0);
9928
9929 for (i = 0; i < dp->dtdo_varlen; i++) {
9930 dtrace_difv_t *v = &dp->dtdo_vartab[i];
9931 dtrace_statvar_t *svar;
9932 dtrace_statvar_t **svarp = NULL;
9933 uint_t id;
9934 uint8_t scope = v->dtdv_scope;
9935 int *np = NULL;
9936
9937 switch (scope) {
9938 case DIFV_SCOPE_THREAD:
9939 continue;
9940
9941 case DIFV_SCOPE_LOCAL:
9942 np = &vstate->dtvs_nlocals;
9943 svarp = vstate->dtvs_locals;
9944 break;
9945
9946 case DIFV_SCOPE_GLOBAL:
9947 np = &vstate->dtvs_nglobals;
9948 svarp = vstate->dtvs_globals;
9949 break;
9950
9951 default:
9952 ASSERT(0);
9953 }
9954
9955 if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
9956 continue;
9957
9958 id -= DIF_VAR_OTHER_UBASE;
9959
9960 ASSERT(id < (uint_t)*np);
9961
9962 svar = svarp[id];
9963 ASSERT(svar != NULL);
9964 ASSERT(svar->dtsv_refcnt > 0);
9965
9966 if (--svar->dtsv_refcnt > 0)
9967 continue;
9968
9969 if (svar->dtsv_size != 0) {
9970 ASSERT(svar->dtsv_data != 0);
9971 kmem_free((void *)(uintptr_t)svar->dtsv_data,
9972 svar->dtsv_size);
9973 }
9974
9975 kmem_free(svar, sizeof (dtrace_statvar_t));
9976 svarp[id] = NULL;
9977 }
9978
9979 kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
9980 kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
9981 kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
9982 kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
9983
9984 kmem_free(dp, sizeof (dtrace_difo_t));
9985}
9986
9987static void
9988dtrace_difo_release(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9989{
9990 uint_t i;
9991
9992 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
9993 ASSERT(dp->dtdo_refcnt != 0);
9994
9995 for (i = 0; i < dp->dtdo_varlen; i++) {
9996 dtrace_difv_t *v = &dp->dtdo_vartab[i];
9997
9998 if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
9999 continue;
10000
10001 ASSERT(dtrace_vtime_references > 0);
10002 if (--dtrace_vtime_references == 0)
10003 dtrace_vtime_disable();
10004 }
10005
10006 if (--dp->dtdo_refcnt == 0)
10007 dtrace_difo_destroy(dp, vstate);
10008}
10009
10010/*
10011 * DTrace Format Functions
10012 */
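/*
 * Format indices handed out to consumers are one-based so that zero can
 * denote the absence of a format: dtrace_format_add() returns ndx + 1 on
 * success and zero on failure, and dtrace_format_remove() subtracts one to
 * index into dts_formats.
 */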
10013static uint16_t
10014dtrace_format_add(dtrace_state_t *state, char *str)
10015{
10016 char *fmt, **new;
10017 uint16_t ndx, len = strlen(str) + 1;
10018
10019 fmt = kmem_zalloc(len, KM_SLEEP);
10020 bcopy(str, fmt, len);
10021
10022 for (ndx = 0; ndx < state->dts_nformats; ndx++) {
10023 if (state->dts_formats[ndx] == NULL) {
10024 state->dts_formats[ndx] = fmt;
10025 return (ndx + 1);
10026 }
10027 }
10028
10029 if (state->dts_nformats == USHRT_MAX) {
10030 /*
10031 * This is only likely if a denial-of-service attack is being
10032 * attempted. As such, it's okay to fail silently here.
10033 */
10034 kmem_free(fmt, len);
10035 return (0);
10036 }
10037
10038 /*
10039 * For simplicity, we always resize the formats array to be exactly the
10040 * number of formats.
10041 */
10042 ndx = state->dts_nformats++;
10043 new = kmem_alloc((ndx + 1) * sizeof (char *), KM_SLEEP);
10044
10045 if (state->dts_formats != NULL) {
10046 ASSERT(ndx != 0);
10047 bcopy(state->dts_formats, new, ndx * sizeof (char *));
10048 kmem_free(state->dts_formats, ndx * sizeof (char *));
10049 }
10050
10051 state->dts_formats = new;
10052 state->dts_formats[ndx] = fmt;
10053
10054 return (ndx + 1);
10055}
10056
10057static void
10058dtrace_format_remove(dtrace_state_t *state, uint16_t format)
10059{
10060 char *fmt;
10061
10062 ASSERT(state->dts_formats != NULL);
10063 ASSERT(format <= state->dts_nformats);
10064 ASSERT(state->dts_formats[format - 1] != NULL);
10065
10066 fmt = state->dts_formats[format - 1];
10067 kmem_free(fmt, strlen(fmt) + 1);
10068 state->dts_formats[format - 1] = NULL;
10069}
10070
10071static void
10072dtrace_format_destroy(dtrace_state_t *state)
10073{
10074 int i;
10075
10076 if (state->dts_nformats == 0) {
10077 ASSERT(state->dts_formats == NULL);
10078 return;
10079 }
10080
10081 ASSERT(state->dts_formats != NULL);
10082
10083 for (i = 0; i < state->dts_nformats; i++) {
10084 char *fmt = state->dts_formats[i];
10085
10086 if (fmt == NULL)
10087 continue;
10088
10089 kmem_free(fmt, strlen(fmt) + 1);
10090 }
10091
10092 kmem_free(state->dts_formats, state->dts_nformats * sizeof (char *));
10093 state->dts_nformats = 0;
10094 state->dts_formats = NULL;
10095}
10096
10097/*
10098 * DTrace Predicate Functions
10099 */
10100static dtrace_predicate_t *
10101dtrace_predicate_create(dtrace_difo_t *dp)
10102{
10103 dtrace_predicate_t *pred;
10104
10105 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10106 ASSERT(dp->dtdo_refcnt != 0);
10107
10108 pred = kmem_zalloc(sizeof (dtrace_predicate_t), KM_SLEEP);
10109 pred->dtp_difo = dp;
10110 pred->dtp_refcnt = 1;
10111
10112 if (!dtrace_difo_cacheable(dp))
10113 return (pred);
10114
10115 if (dtrace_predcache_id == DTRACE_CACHEIDNONE) {
10116 /*
10117 * This is only theoretically possible -- we have had 2^32
10118 * cacheable predicates on this machine. We cannot allow any
10119 * more predicates to become cacheable: as unlikely as it is,
10120 * there may be a thread caching a (now stale) predicate cache
10121 * ID. (N.B.: the temptation is being successfully resisted to
10122 * have this cmn_err() "Holy shit -- we executed this code!")
10123 */
10124 return (pred);
10125 }
10126
10127 pred->dtp_cacheid = dtrace_predcache_id++;
10128
10129 return (pred);
10130}
10131
10132static void
10133dtrace_predicate_hold(dtrace_predicate_t *pred)
10134{
10135 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10136 ASSERT(pred->dtp_difo != NULL && pred->dtp_difo->dtdo_refcnt != 0);
10137 ASSERT(pred->dtp_refcnt > 0);
10138
10139 pred->dtp_refcnt++;
10140}
10141
10142static void
10143dtrace_predicate_release(dtrace_predicate_t *pred, dtrace_vstate_t *vstate)
10144{
10145 dtrace_difo_t *dp = pred->dtp_difo;
10146#pragma unused(dp) /* __APPLE__ */
10147
10148 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10149 ASSERT(dp != NULL && dp->dtdo_refcnt != 0);
10150 ASSERT(pred->dtp_refcnt > 0);
10151
10152 if (--pred->dtp_refcnt == 0) {
10153 dtrace_difo_release(pred->dtp_difo, vstate);
10154 kmem_free(pred, sizeof (dtrace_predicate_t));
10155 }
10156}
10157
10158/*
10159 * DTrace Action Description Functions
10160 */
10161static dtrace_actdesc_t *
10162dtrace_actdesc_create(dtrace_actkind_t kind, uint32_t ntuple,
10163 uint64_t uarg, uint64_t arg)
10164{
10165 dtrace_actdesc_t *act;
10166
10167 ASSERT(!DTRACEACT_ISPRINTFLIKE(kind) || (arg != 0 &&
10168 arg >= KERNELBASE) || (arg == 0 && kind == DTRACEACT_PRINTA));
10169
10170 act = kmem_zalloc(sizeof (dtrace_actdesc_t), KM_SLEEP);
10171 act->dtad_kind = kind;
10172 act->dtad_ntuple = ntuple;
10173 act->dtad_uarg = uarg;
10174 act->dtad_arg = arg;
10175 act->dtad_refcnt = 1;
10176
10177 return (act);
10178}
10179
10180static void
10181dtrace_actdesc_hold(dtrace_actdesc_t *act)
10182{
10183 ASSERT(act->dtad_refcnt >= 1);
10184 act->dtad_refcnt++;
10185}
10186
10187static void
10188dtrace_actdesc_release(dtrace_actdesc_t *act, dtrace_vstate_t *vstate)
10189{
10190 dtrace_actkind_t kind = act->dtad_kind;
10191 dtrace_difo_t *dp;
10192
10193 ASSERT(act->dtad_refcnt >= 1);
10194
10195 if (--act->dtad_refcnt != 0)
10196 return;
10197
10198 if ((dp = act->dtad_difo) != NULL)
10199 dtrace_difo_release(dp, vstate);
10200
10201 if (DTRACEACT_ISPRINTFLIKE(kind)) {
10202 char *str = (char *)(uintptr_t)act->dtad_arg;
10203
10204 ASSERT((str != NULL && (uintptr_t)str >= KERNELBASE) ||
10205 (str == NULL && act->dtad_kind == DTRACEACT_PRINTA));
10206
10207 if (str != NULL)
10208 kmem_free(str, strlen(str) + 1);
10209 }
10210
10211 kmem_free(act, sizeof (dtrace_actdesc_t));
10212}
10213
10214/*
10215 * DTrace ECB Functions
10216 */
10217static dtrace_ecb_t *
10218dtrace_ecb_add(dtrace_state_t *state, dtrace_probe_t *probe)
10219{
10220 dtrace_ecb_t *ecb;
10221 dtrace_epid_t epid;
10222
10223 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10224
10225 ecb = kmem_zalloc(sizeof (dtrace_ecb_t), KM_SLEEP);
10226 ecb->dte_predicate = NULL;
10227 ecb->dte_probe = probe;
10228
10229 /*
10230 * The default size is the size of the default action: recording
10231 * the header.
10232 */
10233 ecb->dte_size = ecb->dte_needed = sizeof (dtrace_rechdr_t);
10234 ecb->dte_alignment = sizeof (dtrace_epid_t);
10235
10236 epid = state->dts_epid++;
10237
10238 if (epid - 1 >= (dtrace_epid_t)state->dts_necbs) {
10239 dtrace_ecb_t **oecbs = state->dts_ecbs, **ecbs;
10240 int necbs = state->dts_necbs << 1;
10241
10242 ASSERT(epid == (dtrace_epid_t)state->dts_necbs + 1);
10243
10244 if (necbs == 0) {
10245 ASSERT(oecbs == NULL);
10246 necbs = 1;
10247 }
10248
10249 ecbs = kmem_zalloc(necbs * sizeof (*ecbs), KM_SLEEP);
10250
10251 if (oecbs != NULL)
10252 bcopy(oecbs, ecbs, state->dts_necbs * sizeof (*ecbs));
10253
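		/*
		 * The producer barriers below publish the (fully copied)
		 * array before the grown ECB count: probe context may index
		 * dts_ecbs without holding dtrace_lock, so it must never
		 * observe a dts_necbs that exceeds the published array.
		 */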
10254 dtrace_membar_producer();
10255 state->dts_ecbs = ecbs;
10256
10257 if (oecbs != NULL) {
10258 /*
10259 * If this state is active, we must dtrace_sync()
10260 * before we can free the old dts_ecbs array: we're
10261 * coming in hot, and there may be active ring
10262 * buffer processing (which indexes into the dts_ecbs
10263 * array) on another CPU.
10264 */
10265 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
10266 dtrace_sync();
10267
10268 kmem_free(oecbs, state->dts_necbs * sizeof (*ecbs));
10269 }
10270
10271 dtrace_membar_producer();
10272 state->dts_necbs = necbs;
10273 }
10274
10275 ecb->dte_state = state;
10276
10277 ASSERT(state->dts_ecbs[epid - 1] == NULL);
10278 dtrace_membar_producer();
10279 state->dts_ecbs[(ecb->dte_epid = epid) - 1] = ecb;
10280
10281 return (ecb);
10282}
10283
10284static int
10285dtrace_ecb_enable(dtrace_ecb_t *ecb)
10286{
10287 dtrace_probe_t *probe = ecb->dte_probe;
10288
10289 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
10290 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10291 ASSERT(ecb->dte_next == NULL);
10292
10293 if (probe == NULL) {
10294 /*
10295 * This is the NULL probe -- there's nothing to do.
10296 */
		return (0);
10298 }
10299
10300 probe->dtpr_provider->dtpv_ecb_count++;
10301 if (probe->dtpr_ecb == NULL) {
10302 dtrace_provider_t *prov = probe->dtpr_provider;
10303
10304 /*
10305 * We're the first ECB on this probe.
10306 */
10307 probe->dtpr_ecb = probe->dtpr_ecb_last = ecb;
10308
10309 if (ecb->dte_predicate != NULL)
10310 probe->dtpr_predcache = ecb->dte_predicate->dtp_cacheid;
10311
10312 return (prov->dtpv_pops.dtps_enable(prov->dtpv_arg,
10313 probe->dtpr_id, probe->dtpr_arg));
10314 } else {
10315 /*
10316 * This probe is already active. Swing the last pointer to
10317 * point to the new ECB, and issue a dtrace_sync() to assure
10318 * that all CPUs have seen the change.
10319 */
10320 ASSERT(probe->dtpr_ecb_last != NULL);
10321 probe->dtpr_ecb_last->dte_next = ecb;
10322 probe->dtpr_ecb_last = ecb;
10323 probe->dtpr_predcache = 0;
10324
10325 dtrace_sync();
		return (0);
10327 }
10328}
10329
10330static int
10331dtrace_ecb_resize(dtrace_ecb_t *ecb)
10332{
10333 dtrace_action_t *act;
10334 uint32_t curneeded = UINT32_MAX;
10335 uint32_t aggbase = UINT32_MAX;
10336
10337 /*
10338 * If we record anything, we always record the dtrace_rechdr_t. (And
10339 * we always record it first.)
10340 */
10341 ecb->dte_size = sizeof (dtrace_rechdr_t);
10342 ecb->dte_alignment = sizeof (dtrace_epid_t);
10343
10344 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
10345 dtrace_recdesc_t *rec = &act->dta_rec;
10346 ASSERT(rec->dtrd_size > 0 || rec->dtrd_alignment == 1);
10347
10348 ecb->dte_alignment = MAX(ecb->dte_alignment, rec->dtrd_alignment);
10349
10350 if (DTRACEACT_ISAGG(act->dta_kind)) {
10351 dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
10352
10353 ASSERT(rec->dtrd_size != 0);
10354 ASSERT(agg->dtag_first != NULL);
10355 ASSERT(act->dta_prev->dta_intuple);
10356 ASSERT(aggbase != UINT32_MAX);
10357 ASSERT(curneeded != UINT32_MAX);
10358
10359 agg->dtag_base = aggbase;
10360 curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
10361 rec->dtrd_offset = curneeded;
10362 if (curneeded + rec->dtrd_size < curneeded)
10363 return (EINVAL);
10364 curneeded += rec->dtrd_size;
10365 ecb->dte_needed = MAX(ecb->dte_needed, curneeded);
10366
10367 aggbase = UINT32_MAX;
10368 curneeded = UINT32_MAX;
10369 } else if (act->dta_intuple) {
10370 if (curneeded == UINT32_MAX) {
10371 /*
10372 * This is the first record in a tuple. Align
10373 * curneeded to be at offset 4 in an 8-byte
10374 * aligned block.
10375 */
10376 ASSERT(act->dta_prev == NULL || !act->dta_prev->dta_intuple);
10377 ASSERT(aggbase == UINT32_MAX);
10378
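				/*
				 * A worked example, assuming a four-byte
				 * dtrace_aggid_t: a dte_size of 24 is
				 * phased up to 28 (offset 4 within an
				 * 8-byte aligned block), leaving an 8-byte
				 * aligned aggbase of 24 in which to store
				 * the aggregation ID.
				 */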
10379 curneeded = P2PHASEUP(ecb->dte_size,
10380 sizeof (uint64_t), sizeof (dtrace_aggid_t));
10381
10382 aggbase = curneeded - sizeof (dtrace_aggid_t);
10383 ASSERT(IS_P2ALIGNED(aggbase,
10384 sizeof (uint64_t)));
10385 }
10386
			curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
			rec->dtrd_offset = curneeded;
			if (curneeded + rec->dtrd_size < curneeded)
				return (EINVAL);
			curneeded += rec->dtrd_size;
10392 } else {
10393 /* tuples must be followed by an aggregation */
10394 ASSERT(act->dta_prev == NULL || !act->dta_prev->dta_intuple);
10395 ecb->dte_size = P2ROUNDUP(ecb->dte_size, rec->dtrd_alignment);
10396 rec->dtrd_offset = ecb->dte_size;
10397 if (ecb->dte_size + rec->dtrd_size < ecb->dte_size)
10398 return (EINVAL);
10399 ecb->dte_size += rec->dtrd_size;
10400 ecb->dte_needed = MAX(ecb->dte_needed, ecb->dte_size);
10401 }
10402 }
10403
10404 if ((act = ecb->dte_action) != NULL &&
10405 !(act->dta_kind == DTRACEACT_SPECULATE && act->dta_next == NULL) &&
10406 ecb->dte_size == sizeof (dtrace_rechdr_t)) {
10407 /*
10408 * If the size is still sizeof (dtrace_rechdr_t), then all
10409 * actions store no data; set the size to 0.
10410 */
10411 ecb->dte_size = 0;
10412 }
10413
10414 ecb->dte_size = P2ROUNDUP(ecb->dte_size, sizeof (dtrace_epid_t));
10415 ecb->dte_needed = P2ROUNDUP(ecb->dte_needed, (sizeof (dtrace_epid_t)));
10416 ecb->dte_state->dts_needed = MAX(ecb->dte_state->dts_needed, ecb->dte_needed);
10417 return (0);
10418}
10419
10420static dtrace_action_t *
10421dtrace_ecb_aggregation_create(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
10422{
10423 dtrace_aggregation_t *agg;
10424 size_t size = sizeof (uint64_t);
10425 int ntuple = desc->dtad_ntuple;
10426 dtrace_action_t *act;
10427 dtrace_recdesc_t *frec;
10428 dtrace_aggid_t aggid;
10429 dtrace_state_t *state = ecb->dte_state;
10430
10431 agg = kmem_zalloc(sizeof (dtrace_aggregation_t), KM_SLEEP);
10432 agg->dtag_ecb = ecb;
10433
10434 ASSERT(DTRACEACT_ISAGG(desc->dtad_kind));
10435
10436 switch (desc->dtad_kind) {
10437 case DTRACEAGG_MIN:
10438 agg->dtag_initial = INT64_MAX;
10439 agg->dtag_aggregate = dtrace_aggregate_min;
10440 break;
10441
10442 case DTRACEAGG_MAX:
10443 agg->dtag_initial = INT64_MIN;
10444 agg->dtag_aggregate = dtrace_aggregate_max;
10445 break;
10446
10447 case DTRACEAGG_COUNT:
10448 agg->dtag_aggregate = dtrace_aggregate_count;
10449 break;
10450
10451 case DTRACEAGG_QUANTIZE:
10452 agg->dtag_aggregate = dtrace_aggregate_quantize;
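		/*
		 * With 64-bit values there are 63 power-of-two buckets in
		 * each direction plus one for zero: 127 buckets, or 1016
		 * bytes, per quantize() aggregation.
		 */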
10453 size = (((sizeof (uint64_t) * NBBY) - 1) * 2 + 1) *
10454 sizeof (uint64_t);
10455 break;
10456
10457 case DTRACEAGG_LQUANTIZE: {
10458 uint16_t step = DTRACE_LQUANTIZE_STEP(desc->dtad_arg);
10459 uint16_t levels = DTRACE_LQUANTIZE_LEVELS(desc->dtad_arg);
10460
10461 agg->dtag_initial = desc->dtad_arg;
10462 agg->dtag_aggregate = dtrace_aggregate_lquantize;
10463
10464 if (step == 0 || levels == 0)
10465 goto err;
10466
10467 size = levels * sizeof (uint64_t) + 3 * sizeof (uint64_t);
10468 break;
10469 }
10470
10471 case DTRACEAGG_LLQUANTIZE: {
10472 uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(desc->dtad_arg);
10473 uint16_t low = DTRACE_LLQUANTIZE_LOW(desc->dtad_arg);
10474 uint16_t high = DTRACE_LLQUANTIZE_HIGH(desc->dtad_arg);
10475 uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(desc->dtad_arg);
10476 int64_t v;
10477
10478 agg->dtag_initial = desc->dtad_arg;
10479 agg->dtag_aggregate = dtrace_aggregate_llquantize;
10480
10481 if (factor < 2 || low >= high || nsteps < factor)
10482 goto err;
10483
10484 /*
10485 * Now check that the number of steps evenly divides a power
10486 * of the factor. (This assures both integer bucket size and
10487 * linearity within each magnitude.)
10488 */
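		/*
		 * For example, a factor of 10 with 20 steps passes (100 is
		 * divisible by 20, and 20 by 10), whereas a factor of 10
		 * with 30 steps is rejected: 100 % 30 != 0, so its buckets
		 * could not all be integral.
		 */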
10489 for (v = factor; v < nsteps; v *= factor)
10490 continue;
10491
10492 if ((v % nsteps) || (nsteps % factor))
10493 goto err;
10494
10495 size = (dtrace_aggregate_llquantize_bucket(factor, low, high, nsteps, INT64_MAX) + 2) * sizeof (uint64_t);
10496 break;
10497 }
10498
10499 case DTRACEAGG_AVG:
10500 agg->dtag_aggregate = dtrace_aggregate_avg;
10501 size = sizeof (uint64_t) * 2;
10502 break;
10503
10504 case DTRACEAGG_STDDEV:
10505 agg->dtag_aggregate = dtrace_aggregate_stddev;
10506 size = sizeof (uint64_t) * 4;
10507 break;
10508
10509 case DTRACEAGG_SUM:
10510 agg->dtag_aggregate = dtrace_aggregate_sum;
10511 break;
10512
10513 default:
10514 goto err;
10515 }
10516
10517 agg->dtag_action.dta_rec.dtrd_size = size;
10518
10519 if (ntuple == 0)
10520 goto err;
10521
10522 /*
10523 * We must make sure that we have enough actions for the n-tuple.
10524 */
10525 for (act = ecb->dte_action_last; act != NULL; act = act->dta_prev) {
10526 if (DTRACEACT_ISAGG(act->dta_kind))
10527 break;
10528
10529 if (--ntuple == 0) {
10530 /*
10531 * This is the action with which our n-tuple begins.
10532 */
10533 agg->dtag_first = act;
10534 goto success;
10535 }
10536 }
10537
10538 /*
10539 * This n-tuple is short by ntuple elements. Return failure.
10540 */
10541 ASSERT(ntuple != 0);
10542err:
10543 kmem_free(agg, sizeof (dtrace_aggregation_t));
10544 return (NULL);
10545
10546success:
10547 /*
10548 * If the last action in the tuple has a size of zero, it's actually
10549 * an expression argument for the aggregating action.
10550 */
10551 ASSERT(ecb->dte_action_last != NULL);
10552 act = ecb->dte_action_last;
10553
10554 if (act->dta_kind == DTRACEACT_DIFEXPR) {
10555 ASSERT(act->dta_difo != NULL);
10556
10557 if (act->dta_difo->dtdo_rtype.dtdt_size == 0)
10558 agg->dtag_hasarg = 1;
10559 }
10560
10561 /*
10562 * We need to allocate an id for this aggregation.
10563 */
10564 aggid = (dtrace_aggid_t)(uintptr_t)vmem_alloc(state->dts_aggid_arena, 1,
10565 VM_BESTFIT | VM_SLEEP);
10566
10567 if (aggid - 1 >= (dtrace_aggid_t)state->dts_naggregations) {
10568 dtrace_aggregation_t **oaggs = state->dts_aggregations;
10569 dtrace_aggregation_t **aggs;
10570 int naggs = state->dts_naggregations << 1;
10571 int onaggs = state->dts_naggregations;
10572
10573 ASSERT(aggid == (dtrace_aggid_t)state->dts_naggregations + 1);
10574
10575 if (naggs == 0) {
10576 ASSERT(oaggs == NULL);
10577 naggs = 1;
10578 }
10579
10580 aggs = kmem_zalloc(naggs * sizeof (*aggs), KM_SLEEP);
10581
10582 if (oaggs != NULL) {
10583 bcopy(oaggs, aggs, onaggs * sizeof (*aggs));
10584 kmem_free(oaggs, onaggs * sizeof (*aggs));
10585 }
10586
10587 state->dts_aggregations = aggs;
10588 state->dts_naggregations = naggs;
10589 }
10590
10591 ASSERT(state->dts_aggregations[aggid - 1] == NULL);
10592 state->dts_aggregations[(agg->dtag_id = aggid) - 1] = agg;
10593
10594 frec = &agg->dtag_first->dta_rec;
10595 if (frec->dtrd_alignment < sizeof (dtrace_aggid_t))
10596 frec->dtrd_alignment = sizeof (dtrace_aggid_t);
10597
10598 for (act = agg->dtag_first; act != NULL; act = act->dta_next) {
10599 ASSERT(!act->dta_intuple);
10600 act->dta_intuple = 1;
10601 }
10602
10603 return (&agg->dtag_action);
10604}
10605
10606static void
10607dtrace_ecb_aggregation_destroy(dtrace_ecb_t *ecb, dtrace_action_t *act)
10608{
10609 dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
10610 dtrace_state_t *state = ecb->dte_state;
10611 dtrace_aggid_t aggid = agg->dtag_id;
10612
10613 ASSERT(DTRACEACT_ISAGG(act->dta_kind));
10614 vmem_free(state->dts_aggid_arena, (void *)(uintptr_t)aggid, 1);
10615
10616 ASSERT(state->dts_aggregations[aggid - 1] == agg);
10617 state->dts_aggregations[aggid - 1] = NULL;
10618
10619 kmem_free(agg, sizeof (dtrace_aggregation_t));
10620}
10621
10622static int
10623dtrace_ecb_action_add(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
10624{
10625 dtrace_action_t *action, *last;
10626 dtrace_difo_t *dp = desc->dtad_difo;
10627 uint32_t size = 0, align = sizeof (uint8_t), mask;
10628 uint16_t format = 0;
10629 dtrace_recdesc_t *rec;
10630 dtrace_state_t *state = ecb->dte_state;
10631 dtrace_optval_t *opt = state->dts_options;
	dtrace_optval_t nframes = 0, strsize;
10633 uint64_t arg = desc->dtad_arg;
10634
10635 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10636 ASSERT(ecb->dte_action == NULL || ecb->dte_action->dta_refcnt == 1);
10637
10638 if (DTRACEACT_ISAGG(desc->dtad_kind)) {
10639 /*
10640 * If this is an aggregating action, there must be neither
10641 * a speculate nor a commit on the action chain.
10642 */
10643 dtrace_action_t *act;
10644
10645 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
10646 if (act->dta_kind == DTRACEACT_COMMIT)
10647 return (EINVAL);
10648
10649 if (act->dta_kind == DTRACEACT_SPECULATE)
10650 return (EINVAL);
10651 }
10652
10653 action = dtrace_ecb_aggregation_create(ecb, desc);
10654
10655 if (action == NULL)
10656 return (EINVAL);
10657 } else {
10658 if (DTRACEACT_ISDESTRUCTIVE(desc->dtad_kind) ||
10659 (desc->dtad_kind == DTRACEACT_DIFEXPR &&
10660 dp != NULL && dp->dtdo_destructive)) {
10661 state->dts_destructive = 1;
10662 }
10663
10664 switch (desc->dtad_kind) {
10665 case DTRACEACT_PRINTF:
10666 case DTRACEACT_PRINTA:
10667 case DTRACEACT_SYSTEM:
10668 case DTRACEACT_FREOPEN:
10669 case DTRACEACT_DIFEXPR:
10670 /*
10671 * We know that our arg is a string -- turn it into a
10672 * format.
10673 */
10674 if (arg == 0) {
10675 ASSERT(desc->dtad_kind == DTRACEACT_PRINTA ||
10676 desc->dtad_kind == DTRACEACT_DIFEXPR);
10677 format = 0;
10678 } else {
10679 ASSERT(arg != 0);
10680 ASSERT(arg > KERNELBASE);
10681 format = dtrace_format_add(state,
10682 (char *)(uintptr_t)arg);
10683 }
10684
10685 /*FALLTHROUGH*/
10686 case DTRACEACT_LIBACT:
10687 case DTRACEACT_TRACEMEM:
10688 case DTRACEACT_TRACEMEM_DYNSIZE:
10689 case DTRACEACT_APPLEBINARY: /* __APPLE__ */
10690 if (dp == NULL)
10691 return (EINVAL);
10692
10693 if ((size = dp->dtdo_rtype.dtdt_size) != 0)
10694 break;
10695
10696 if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
10697 if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
10698 return (EINVAL);
10699
10700 size = opt[DTRACEOPT_STRSIZE];
10701 }
10702
10703 break;
10704
10705 case DTRACEACT_STACK:
10706 if ((nframes = arg) == 0) {
10707 nframes = opt[DTRACEOPT_STACKFRAMES];
10708 ASSERT(nframes > 0);
10709 arg = nframes;
10710 }
10711
10712 size = nframes * sizeof (pc_t);
10713 break;
10714
10715 case DTRACEACT_JSTACK:
10716 if ((strsize = DTRACE_USTACK_STRSIZE(arg)) == 0)
10717 strsize = opt[DTRACEOPT_JSTACKSTRSIZE];
10718
10719 if ((nframes = DTRACE_USTACK_NFRAMES(arg)) == 0)
10720 nframes = opt[DTRACEOPT_JSTACKFRAMES];
10721
10722 arg = DTRACE_USTACK_ARG(nframes, strsize);
10723
10724 /*FALLTHROUGH*/
10725 case DTRACEACT_USTACK:
10726 if (desc->dtad_kind != DTRACEACT_JSTACK &&
10727 (nframes = DTRACE_USTACK_NFRAMES(arg)) == 0) {
10728 strsize = DTRACE_USTACK_STRSIZE(arg);
10729 nframes = opt[DTRACEOPT_USTACKFRAMES];
10730 ASSERT(nframes > 0);
10731 arg = DTRACE_USTACK_ARG(nframes, strsize);
10732 }
10733
10734 /*
10735 * Save a slot for the pid.
10736 */
10737 size = (nframes + 1) * sizeof (uint64_t);
10738 size += DTRACE_USTACK_STRSIZE(arg);
10739 size = P2ROUNDUP(size, (uint32_t)(sizeof (uintptr_t)));
10740
10741 break;
10742
10743 case DTRACEACT_SYM:
10744 case DTRACEACT_MOD:
10745 if (dp == NULL || ((size = dp->dtdo_rtype.dtdt_size) !=
10746 sizeof (uint64_t)) ||
10747 (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
10748 return (EINVAL);
10749 break;
10750
10751 case DTRACEACT_USYM:
10752 case DTRACEACT_UMOD:
10753 case DTRACEACT_UADDR:
10754 if (dp == NULL ||
10755 (dp->dtdo_rtype.dtdt_size != sizeof (uint64_t)) ||
10756 (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
10757 return (EINVAL);
10758
10759 /*
10760 * We have a slot for the pid, plus a slot for the
10761 * argument. To keep things simple (aligned with
10762 * bitness-neutral sizing), we store each as a 64-bit
10763 * quantity.
10764 */
10765 size = 2 * sizeof (uint64_t);
10766 break;
10767
10768 case DTRACEACT_STOP:
10769 case DTRACEACT_BREAKPOINT:
10770 case DTRACEACT_PANIC:
10771 break;
10772
10773 case DTRACEACT_CHILL:
10774 case DTRACEACT_DISCARD:
10775 case DTRACEACT_RAISE:
10776 case DTRACEACT_PIDRESUME: /* __APPLE__ */
10777 if (dp == NULL)
10778 return (EINVAL);
10779 break;
10780
10781 case DTRACEACT_EXIT:
10782 if (dp == NULL ||
10783 (size = dp->dtdo_rtype.dtdt_size) != sizeof (int) ||
10784 (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
10785 return (EINVAL);
10786 break;
10787
10788 case DTRACEACT_SPECULATE:
10789 if (ecb->dte_size > sizeof (dtrace_rechdr_t))
10790 return (EINVAL);
10791
10792 if (dp == NULL)
10793 return (EINVAL);
10794
10795 state->dts_speculates = 1;
10796 break;
10797
10798 case DTRACEACT_COMMIT: {
10799 dtrace_action_t *act = ecb->dte_action;
10800
10801 for (; act != NULL; act = act->dta_next) {
10802 if (act->dta_kind == DTRACEACT_COMMIT)
10803 return (EINVAL);
10804 }
10805
10806 if (dp == NULL)
10807 return (EINVAL);
10808 break;
10809 }
10810
10811 default:
10812 return (EINVAL);
10813 }
10814
10815 if (size != 0 || desc->dtad_kind == DTRACEACT_SPECULATE) {
10816 /*
10817 * If this is a data-storing action or a speculate,
10818 * we must be sure that there isn't a commit on the
10819 * action chain.
10820 */
10821 dtrace_action_t *act = ecb->dte_action;
10822
10823 for (; act != NULL; act = act->dta_next) {
10824 if (act->dta_kind == DTRACEACT_COMMIT)
10825 return (EINVAL);
10826 }
10827 }
10828
10829 action = kmem_zalloc(sizeof (dtrace_action_t), KM_SLEEP);
10830 action->dta_rec.dtrd_size = size;
10831 }
10832
10833 action->dta_refcnt = 1;
10834 rec = &action->dta_rec;
10835 size = rec->dtrd_size;
10836
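	/*
	 * Derive the record alignment from its size: the largest power of
	 * two (capped at eight) that divides the size. A 16-byte record is
	 * thus 8-byte aligned, a 12-byte record 4-byte aligned, and an
	 * odd-sized record byte-aligned.
	 */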
10837 for (mask = sizeof (uint64_t) - 1; size != 0 && mask > 0; mask >>= 1) {
10838 if (!(size & mask)) {
10839 align = mask + 1;
10840 break;
10841 }
10842 }
10843
10844 action->dta_kind = desc->dtad_kind;
10845
10846 if ((action->dta_difo = dp) != NULL)
10847 dtrace_difo_hold(dp);
10848
10849 rec->dtrd_action = action->dta_kind;
10850 rec->dtrd_arg = arg;
10851 rec->dtrd_uarg = desc->dtad_uarg;
10852 rec->dtrd_alignment = (uint16_t)align;
10853 rec->dtrd_format = format;
10854
10855 if ((last = ecb->dte_action_last) != NULL) {
10856 ASSERT(ecb->dte_action != NULL);
10857 action->dta_prev = last;
10858 last->dta_next = action;
10859 } else {
10860 ASSERT(ecb->dte_action == NULL);
10861 ecb->dte_action = action;
10862 }
10863
10864 ecb->dte_action_last = action;
10865
10866 return (0);
10867}
10868
10869static void
10870dtrace_ecb_action_remove(dtrace_ecb_t *ecb)
10871{
10872 dtrace_action_t *act = ecb->dte_action, *next;
10873 dtrace_vstate_t *vstate = &ecb->dte_state->dts_vstate;
10874 dtrace_difo_t *dp;
10875 uint16_t format;
10876
10877 if (act != NULL && act->dta_refcnt > 1) {
10878 ASSERT(act->dta_next == NULL || act->dta_next->dta_refcnt == 1);
10879 act->dta_refcnt--;
10880 } else {
10881 for (; act != NULL; act = next) {
10882 next = act->dta_next;
10883 ASSERT(next != NULL || act == ecb->dte_action_last);
10884 ASSERT(act->dta_refcnt == 1);
10885
10886 if ((format = act->dta_rec.dtrd_format) != 0)
10887 dtrace_format_remove(ecb->dte_state, format);
10888
10889 if ((dp = act->dta_difo) != NULL)
10890 dtrace_difo_release(dp, vstate);
10891
10892 if (DTRACEACT_ISAGG(act->dta_kind)) {
10893 dtrace_ecb_aggregation_destroy(ecb, act);
10894 } else {
10895 kmem_free(act, sizeof (dtrace_action_t));
10896 }
10897 }
10898 }
10899
10900 ecb->dte_action = NULL;
10901 ecb->dte_action_last = NULL;
10902 ecb->dte_size = 0;
10903}
10904
10905static void
10906dtrace_ecb_disable(dtrace_ecb_t *ecb)
10907{
10908 /*
10909 * We disable the ECB by removing it from its probe.
10910 */
10911 dtrace_ecb_t *pecb, *prev = NULL;
10912 dtrace_probe_t *probe = ecb->dte_probe;
10913
10914 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10915
10916 if (probe == NULL) {
10917 /*
10918 * This is the NULL probe; there is nothing to disable.
10919 */
10920 return;
10921 }
10922
10923 for (pecb = probe->dtpr_ecb; pecb != NULL; pecb = pecb->dte_next) {
10924 if (pecb == ecb)
10925 break;
10926 prev = pecb;
10927 }
10928
10929 ASSERT(pecb != NULL);
10930
10931 if (prev == NULL) {
10932 probe->dtpr_ecb = ecb->dte_next;
10933 } else {
10934 prev->dte_next = ecb->dte_next;
10935 }
10936
10937 if (ecb == probe->dtpr_ecb_last) {
10938 ASSERT(ecb->dte_next == NULL);
10939 probe->dtpr_ecb_last = prev;
10940 }
10941
10942 probe->dtpr_provider->dtpv_ecb_count--;
10943 /*
10944 * The ECB has been disconnected from the probe; now sync to assure
10945 * that all CPUs have seen the change before returning.
10946 */
10947 dtrace_sync();
10948
10949 if (probe->dtpr_ecb == NULL) {
10950 /*
10951 * That was the last ECB on the probe; clear the predicate
10952 * cache ID for the probe, disable it and sync one more time
10953 * to assure that we'll never hit it again.
10954 */
10955 dtrace_provider_t *prov = probe->dtpr_provider;
10956
10957 ASSERT(ecb->dte_next == NULL);
10958 ASSERT(probe->dtpr_ecb_last == NULL);
10959 probe->dtpr_predcache = DTRACE_CACHEIDNONE;
10960 prov->dtpv_pops.dtps_disable(prov->dtpv_arg,
10961 probe->dtpr_id, probe->dtpr_arg);
10962 dtrace_sync();
10963 } else {
10964 /*
10965 * There is at least one ECB remaining on the probe. If there
10966 * is _exactly_ one, set the probe's predicate cache ID to be
10967 * the predicate cache ID of the remaining ECB.
10968 */
10969 ASSERT(probe->dtpr_ecb_last != NULL);
10970 ASSERT(probe->dtpr_predcache == DTRACE_CACHEIDNONE);
10971
10972 if (probe->dtpr_ecb == probe->dtpr_ecb_last) {
10973 dtrace_predicate_t *p = probe->dtpr_ecb->dte_predicate;
10974
10975 ASSERT(probe->dtpr_ecb->dte_next == NULL);
10976
10977 if (p != NULL)
10978 probe->dtpr_predcache = p->dtp_cacheid;
10979 }
10980
10981 ecb->dte_next = NULL;
10982 }
10983}
10984
10985static void
10986dtrace_ecb_destroy(dtrace_ecb_t *ecb)
10987{
10988 dtrace_state_t *state = ecb->dte_state;
10989 dtrace_vstate_t *vstate = &state->dts_vstate;
10990 dtrace_predicate_t *pred;
10991 dtrace_epid_t epid = ecb->dte_epid;
10992
10993 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10994 ASSERT(ecb->dte_next == NULL);
10995 ASSERT(ecb->dte_probe == NULL || ecb->dte_probe->dtpr_ecb != ecb);
10996
10997 if ((pred = ecb->dte_predicate) != NULL)
10998 dtrace_predicate_release(pred, vstate);
10999
11000 dtrace_ecb_action_remove(ecb);
11001
11002 ASSERT(state->dts_ecbs[epid - 1] == ecb);
11003 state->dts_ecbs[epid - 1] = NULL;
11004
11005 kmem_free(ecb, sizeof (dtrace_ecb_t));
11006}
11007
11008static dtrace_ecb_t *
11009dtrace_ecb_create(dtrace_state_t *state, dtrace_probe_t *probe,
11010 dtrace_enabling_t *enab)
11011{
11012 dtrace_ecb_t *ecb;
11013 dtrace_predicate_t *pred;
11014 dtrace_actdesc_t *act;
11015 dtrace_provider_t *prov;
11016 dtrace_ecbdesc_t *desc = enab->dten_current;
11017
11018 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11019 ASSERT(state != NULL);
11020
11021 ecb = dtrace_ecb_add(state, probe);
11022 ecb->dte_uarg = desc->dted_uarg;
11023
11024 if ((pred = desc->dted_pred.dtpdd_predicate) != NULL) {
11025 dtrace_predicate_hold(pred);
11026 ecb->dte_predicate = pred;
11027 }
11028
11029 if (probe != NULL) {
11030 /*
11031 * If the provider shows more leg than the consumer is old
11032 * enough to see, we need to enable the appropriate implicit
11033 * predicate bits to prevent the ecb from activating at
11034 * revealing times.
11035 *
11036 * Providers specifying DTRACE_PRIV_USER at register time
11037 * are stating that they need the /proc-style privilege
11038 * model to be enforced, and this is what DTRACE_COND_OWNER
11039 * and DTRACE_COND_ZONEOWNER will then do at probe time.
11040 */
11041 prov = probe->dtpr_provider;
11042 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLPROC) &&
11043 (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
11044 ecb->dte_cond |= DTRACE_COND_OWNER;
11045
11046 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLZONE) &&
11047 (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
11048 ecb->dte_cond |= DTRACE_COND_ZONEOWNER;
11049
11050 /*
11051 * If the provider shows us kernel innards and the user
11052 * is lacking sufficient privilege, enable the
11053 * DTRACE_COND_USERMODE implicit predicate.
11054 */
11055 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) &&
11056 (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_KERNEL))
11057 ecb->dte_cond |= DTRACE_COND_USERMODE;
11058 }
11059
11060 if (dtrace_ecb_create_cache != NULL) {
11061 /*
11062 * If we have a cached ecb, we'll use its action list instead
11063 * of creating our own (saving both time and space).
11064 */
11065 dtrace_ecb_t *cached = dtrace_ecb_create_cache;
11066 dtrace_action_t *act_if = cached->dte_action;
11067
11068 if (act_if != NULL) {
11069 ASSERT(act_if->dta_refcnt > 0);
11070 act_if->dta_refcnt++;
11071 ecb->dte_action = act_if;
11072 ecb->dte_action_last = cached->dte_action_last;
11073 ecb->dte_needed = cached->dte_needed;
11074 ecb->dte_size = cached->dte_size;
11075 ecb->dte_alignment = cached->dte_alignment;
11076 }
11077
11078 return (ecb);
11079 }
11080
11081 for (act = desc->dted_action; act != NULL; act = act->dtad_next) {
11082 if ((enab->dten_error = dtrace_ecb_action_add(ecb, act)) != 0) {
11083 dtrace_ecb_destroy(ecb);
11084 return (NULL);
11085 }
11086 }
11087
11088 if ((enab->dten_error = dtrace_ecb_resize(ecb)) != 0) {
11089 dtrace_ecb_destroy(ecb);
11090 return (NULL);
11091 }
11092
11093 return (dtrace_ecb_create_cache = ecb);
11094}
11095
11096static int
11097dtrace_ecb_create_enable(dtrace_probe_t *probe, void *arg1, void *arg2)
11098{
11099 dtrace_ecb_t *ecb;
11100 dtrace_enabling_t *enab = arg1;
11101 dtrace_ecbdesc_t *ep = arg2;
11102 dtrace_state_t *state = enab->dten_vstate->dtvs_state;
11103
11104 ASSERT(state != NULL);
11105
11106 if (probe != NULL && ep != NULL && probe->dtpr_gen < ep->dted_probegen) {
11107 /*
11108 * This probe was created in a generation for which this
11109 * enabling has previously created ECBs; we don't want to
11110 * enable it again, so just kick out.
11111 */
11112 return (DTRACE_MATCH_NEXT);
11113 }
11114
11115 if ((ecb = dtrace_ecb_create(state, probe, enab)) == NULL)
11116 return (DTRACE_MATCH_DONE);
11117
11118 if (dtrace_ecb_enable(ecb) < 0)
11119 return (DTRACE_MATCH_FAIL);
11120
11121 return (DTRACE_MATCH_NEXT);
11122}
11123
11124static dtrace_ecb_t *
11125dtrace_epid2ecb(dtrace_state_t *state, dtrace_epid_t id)
11126{
11127 dtrace_ecb_t *ecb;
11128#pragma unused(ecb) /* __APPLE__ */
11129
11130 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11131
11132 if (id == 0 || id > (dtrace_epid_t)state->dts_necbs)
11133 return (NULL);
11134
11135 ASSERT(state->dts_necbs > 0 && state->dts_ecbs != NULL);
11136 ASSERT((ecb = state->dts_ecbs[id - 1]) == NULL || ecb->dte_epid == id);
11137
11138 return (state->dts_ecbs[id - 1]);
11139}
11140
11141static dtrace_aggregation_t *
11142dtrace_aggid2agg(dtrace_state_t *state, dtrace_aggid_t id)
11143{
11144 dtrace_aggregation_t *agg;
11145#pragma unused(agg) /* __APPLE__ */
11146
11147 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11148
11149 if (id == 0 || id > (dtrace_aggid_t)state->dts_naggregations)
11150 return (NULL);
11151
11152 ASSERT(state->dts_naggregations > 0 && state->dts_aggregations != NULL);
11153 ASSERT((agg = state->dts_aggregations[id - 1]) == NULL ||
11154 agg->dtag_id == id);
11155
11156 return (state->dts_aggregations[id - 1]);
11157}
11158
11159/*
11160 * DTrace Buffer Functions
11161 *
11162 * The following functions manipulate DTrace buffers. Most of these functions
11163 * are called in the context of establishing or processing consumer state;
11164 * exceptions are explicitly noted.
11165 */
11166
11167/*
11168 * Note: called from cross call context. This function switches the two
11169 * buffers on a given CPU. The atomicity of this operation is assured by
11170 * disabling interrupts while the actual switch takes place; the disabling of
11171 * interrupts serializes the execution with any execution of dtrace_probe() on
11172 * the same CPU.
11173 */
11174static void
11175dtrace_buffer_switch(dtrace_buffer_t *buf)
11176{
11177 caddr_t tomax = buf->dtb_tomax;
11178 caddr_t xamot = buf->dtb_xamot;
11179 dtrace_icookie_t cookie;
11180 hrtime_t now;
11181
11182 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
11183 ASSERT(!(buf->dtb_flags & DTRACEBUF_RING));
11184
11185 cookie = dtrace_interrupt_disable();
11186 now = dtrace_gethrtime();
11187 buf->dtb_tomax = xamot;
11188 buf->dtb_xamot = tomax;
11189 buf->dtb_xamot_drops = buf->dtb_drops;
11190 buf->dtb_xamot_offset = buf->dtb_offset;
11191 buf->dtb_xamot_errors = buf->dtb_errors;
11192 buf->dtb_xamot_flags = buf->dtb_flags;
11193 buf->dtb_offset = 0;
11194 buf->dtb_drops = 0;
11195 buf->dtb_errors = 0;
11196 buf->dtb_flags &= ~(DTRACEBUF_ERROR | DTRACEBUF_DROPPED);
11197 buf->dtb_interval = now - buf->dtb_switched;
11198 buf->dtb_switched = now;
11199 buf->dtb_cur_limit = buf->dtb_limit;
11200
11201 dtrace_interrupt_enable(cookie);
11202}
11203
11204/*
11205 * Note: called from cross call context. This function activates a buffer
11206 * on a CPU. As with dtrace_buffer_switch(), the atomicity of the operation
11207 * is guaranteed by the disabling of interrupts.
11208 */
11209static void
11210dtrace_buffer_activate(dtrace_state_t *state)
11211{
11212 dtrace_buffer_t *buf;
11213 dtrace_icookie_t cookie = dtrace_interrupt_disable();
11214
11215 buf = &state->dts_buffer[CPU->cpu_id];
11216
11217 if (buf->dtb_tomax != NULL) {
11218 /*
11219 * We might like to assert that the buffer is marked inactive,
		 * but this isn't necessarily true: the CPU that processes
		 * the BEGIN probe has its buffer activated manually. In
		 * this case, we take the (harmless) action of re-clearing
		 * the INACTIVE bit.
11224 */
11225 buf->dtb_flags &= ~DTRACEBUF_INACTIVE;
11226 }
11227
11228 dtrace_interrupt_enable(cookie);
11229}
11230
11231static int
11232dtrace_buffer_canalloc(size_t size)
11233{
11234 if (size > (UINT64_MAX - dtrace_buffer_memory_inuse))
11235 return (B_FALSE);
11236 if ((size + dtrace_buffer_memory_inuse) > dtrace_buffer_memory_maxsize)
11237 return (B_FALSE);
11238
11239 return (B_TRUE);
11240}
11241
11242static int
11243dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t limit, size_t size, int flags,
11244 processorid_t cpu)
11245{
11246 dtrace_cpu_t *cp;
11247 dtrace_buffer_t *buf;
11248 size_t size_before_alloc = dtrace_buffer_memory_inuse;
11249
11250 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
11251 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11252
11253 if (size > (size_t)dtrace_nonroot_maxsize &&
11254 !PRIV_POLICY_CHOICE(CRED(), PRIV_ALL, B_FALSE))
11255 return (EFBIG);
11256
11257 cp = cpu_list;
11258
11259 do {
11260 if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
11261 continue;
11262
11263 buf = &bufs[cp->cpu_id];
11264
11265 /*
11266 * If there is already a buffer allocated for this CPU, it
11267 * is only possible that this is a DR event. In this case,
11268 * the buffer size must match our specified size.
11269 */
11270 if (buf->dtb_tomax != NULL) {
11271 ASSERT(buf->dtb_size == size);
11272 continue;
11273 }
11274
11275 ASSERT(buf->dtb_xamot == NULL);
11276
11277 /* DTrace, please do not eat all the memory. */
11278 if (dtrace_buffer_canalloc(size) == B_FALSE)
11279 goto err;
11280 if ((buf->dtb_tomax = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
11281 goto err;
11282 dtrace_buffer_memory_inuse += size;
11283
		/* Ensure that the limit is always strictly lower than the size */
11285 limit = limit == size ? limit - 1 : limit;
11286 buf->dtb_cur_limit = limit;
11287 buf->dtb_limit = limit;
11288 buf->dtb_size = size;
11289 buf->dtb_flags = flags;
11290 buf->dtb_offset = 0;
11291 buf->dtb_drops = 0;
11292
11293 if (flags & DTRACEBUF_NOSWITCH)
11294 continue;
11295
11296 /* DTrace, please do not eat all the memory. */
11297 if (dtrace_buffer_canalloc(size) == B_FALSE)
11298 goto err;
11299 if ((buf->dtb_xamot = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
11300 goto err;
11301 dtrace_buffer_memory_inuse += size;
11302 } while ((cp = cp->cpu_next) != cpu_list);
11303
11304 ASSERT(dtrace_buffer_memory_inuse <= dtrace_buffer_memory_maxsize);
11305
11306 return (0);
11307
11308err:
11309 cp = cpu_list;
11310
11311 do {
11312 if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
11313 continue;
11314
11315 buf = &bufs[cp->cpu_id];
11316
11317 if (buf->dtb_xamot != NULL) {
11318 ASSERT(buf->dtb_tomax != NULL);
11319 ASSERT(buf->dtb_size == size);
11320 kmem_free(buf->dtb_xamot, size);
11321 }
11322
11323 if (buf->dtb_tomax != NULL) {
11324 ASSERT(buf->dtb_size == size);
11325 kmem_free(buf->dtb_tomax, size);
11326 }
11327
11328 buf->dtb_tomax = NULL;
11329 buf->dtb_xamot = NULL;
11330 buf->dtb_size = 0;
11331 } while ((cp = cp->cpu_next) != cpu_list);
11332
11333 /* Restore the size saved before allocating memory */
11334 dtrace_buffer_memory_inuse = size_before_alloc;
11335
11336 return (ENOMEM);
11337}
11338
11339/*
11340 * Note: called from probe context. This function just increments the drop
11341 * count on a buffer. It has been made a function to allow for the
11342 * possibility of understanding the source of mysterious drop counts. (A
11343 * problem for which one may be particularly disappointed that DTrace cannot
11344 * be used to understand DTrace.)
11345 */
11346static void
11347dtrace_buffer_drop(dtrace_buffer_t *buf)
11348{
11349 buf->dtb_drops++;
11350}
11351
11352/*
11353 * Note: called from probe context. This function is called to reserve space
11354 * in a buffer. If mstate is non-NULL, sets the scratch base and size in the
11355 * mstate. Returns the new offset in the buffer, or a negative value if an
11356 * error has occurred.
11357 */
11358static intptr_t
11359dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align,
11360 dtrace_state_t *state, dtrace_mstate_t *mstate)
11361{
11362 intptr_t offs = buf->dtb_offset, soffs;
11363 intptr_t woffs;
11364 caddr_t tomax;
11365 size_t total_off;
11366
11367 if (buf->dtb_flags & DTRACEBUF_INACTIVE)
11368 return (-1);
11369
11370 if ((tomax = buf->dtb_tomax) == NULL) {
11371 dtrace_buffer_drop(buf);
11372 return (-1);
11373 }
11374
11375 if (!(buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL))) {
11376 while (offs & (align - 1)) {
11377 /*
11378 * Assert that our alignment is off by a number which
11379 * is itself sizeof (uint32_t) aligned.
11380 */
11381 ASSERT(!((align - (offs & (align - 1))) &
11382 (sizeof (uint32_t) - 1)));
11383 DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
11384 offs += sizeof (uint32_t);
11385 }
11386
11387 if ((uint64_t)(soffs = offs + needed) > buf->dtb_cur_limit) {
11388 if (buf->dtb_cur_limit == buf->dtb_limit) {
11389 buf->dtb_cur_limit = buf->dtb_size;
11390
11391 atomic_add_32(&state->dts_buf_over_limit, 1);
			/*
			 * Set an AST on the current processor so that we
			 * can wake up the process outside of probe
			 * context, when we know it is safe to do so.
			 */
11398 minor_t minor = getminor(state->dts_dev);
11399 ASSERT(minor < 32);
11400
11401 atomic_or_32(&dtrace_wake_clients, 1 << minor);
11402 ast_dtrace_on();
11403 }
11404 if ((uint64_t)soffs > buf->dtb_size) {
11405 dtrace_buffer_drop(buf);
11406 return (-1);
11407 }
11408 }
11409
11410 if (mstate == NULL)
11411 return (offs);
11412
11413 mstate->dtms_scratch_base = (uintptr_t)tomax + soffs;
11414 mstate->dtms_scratch_size = buf->dtb_size - soffs;
11415 mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
11416
11417 return (offs);
11418 }
11419
11420 if (buf->dtb_flags & DTRACEBUF_FILL) {
11421 if (state->dts_activity != DTRACE_ACTIVITY_COOLDOWN &&
11422 (buf->dtb_flags & DTRACEBUF_FULL))
11423 return (-1);
11424 goto out;
11425 }
11426
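	/*
	 * total_off is the reservation plus worst-case alignment padding;
	 * e.g., 24 bytes needed at offset 12 with 8-byte alignment yields
	 * a total_off of 28.
	 */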
11427 total_off = needed + (offs & (align - 1));
11428
11429 /*
11430 * For a ring buffer, life is quite a bit more complicated. Before
11431 * we can store any padding, we need to adjust our wrapping offset.
11432 * (If we've never before wrapped or we're not about to, no adjustment
11433 * is required.)
11434 */
11435 if ((buf->dtb_flags & DTRACEBUF_WRAPPED) ||
11436 offs + total_off > buf->dtb_size) {
11437 woffs = buf->dtb_xamot_offset;
11438
11439 if (offs + total_off > buf->dtb_size) {
11440 /*
11441 * We can't fit in the end of the buffer. First, a
11442 * sanity check that we can fit in the buffer at all.
11443 */
11444 if (total_off > buf->dtb_size) {
11445 dtrace_buffer_drop(buf);
11446 return (-1);
11447 }
11448
11449 /*
11450 * We're going to be storing at the top of the buffer,
11451 * so now we need to deal with the wrapped offset. We
11452 * only reset our wrapped offset to 0 if it is
11453 * currently greater than the current offset. If it
11454 * is less than the current offset, it is because a
11455 * previous allocation induced a wrap -- but the
11456 * allocation didn't subsequently take the space due
11457 * to an error or false predicate evaluation. In this
11458 * case, we'll just leave the wrapped offset alone: if
11459 * the wrapped offset hasn't been advanced far enough
11460 * for this allocation, it will be adjusted in the
11461 * lower loop.
11462 */
11463 if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
11464 if (woffs >= offs)
11465 woffs = 0;
11466 } else {
11467 woffs = 0;
11468 }
11469
11470 /*
11471 * Now we know that we're going to be storing to the
11472 * top of the buffer and that there is room for us
11473 * there. We need to clear the buffer from the current
11474 * offset to the end (there may be old gunk there).
11475 */
11476 while ((uint64_t)offs < buf->dtb_size)
11477 tomax[offs++] = 0;
11478
11479 /*
11480 * We need to set our offset to zero. And because we
11481 * are wrapping, we need to set the bit indicating as
11482 * much. We can also adjust our needed space back
11483 * down to the space required by the ECB -- we know
11484 * that the top of the buffer is aligned.
11485 */
11486 offs = 0;
11487 total_off = needed;
11488 buf->dtb_flags |= DTRACEBUF_WRAPPED;
11489 } else {
11490 /*
11491 * There is room for us in the buffer, so we simply
11492 * need to check the wrapped offset.
11493 */
11494 if (woffs < offs) {
11495 /*
11496 * The wrapped offset is less than the offset.
11497 * This can happen if we allocated buffer space
11498 * that induced a wrap, but then we didn't
11499 * subsequently take the space due to an error
11500 * or false predicate evaluation. This is
11501 * okay; we know that _this_ allocation isn't
11502 * going to induce a wrap. We still can't
11503 * reset the wrapped offset to be zero,
11504 * however: the space may have been trashed in
11505 * the previous failed probe attempt. But at
11506 * least the wrapped offset doesn't need to
11507 * be adjusted at all...
11508 */
11509 goto out;
11510 }
11511 }
11512
11513 while (offs + total_off > (size_t)woffs) {
11514 dtrace_epid_t epid = *(uint32_t *)(tomax + woffs);
11515 size_t size;
11516
11517 if (epid == DTRACE_EPIDNONE) {
11518 size = sizeof (uint32_t);
11519 } else {
11520 ASSERT(epid <= (dtrace_epid_t)state->dts_necbs);
11521 ASSERT(state->dts_ecbs[epid - 1] != NULL);
11522
11523 size = state->dts_ecbs[epid - 1]->dte_size;
11524 }
11525
11526 ASSERT(woffs + size <= buf->dtb_size);
11527 ASSERT(size != 0);
11528
11529 if (woffs + size == buf->dtb_size) {
11530 /*
11531 * We've reached the end of the buffer; we want
11532 * to set the wrapped offset to 0 and break
11533 * out. However, if the offs is 0, then we're
11534 * in a strange edge-condition: the amount of
11535 * space that we want to reserve plus the size
11536 * of the record that we're overwriting is
11537 * greater than the size of the buffer. This
11538 * is problematic because if we reserve the
11539 * space but subsequently don't consume it (due
11540 * to a failed predicate or error) the wrapped
11541 * offset will be 0 -- yet the EPID at offset 0
11542 * will not be committed. This situation is
11543 * relatively easy to deal with: if we're in
11544 * this case, the buffer is indistinguishable
11545 * from one that hasn't wrapped; we need only
11546 * finish the job by clearing the wrapped bit,
11547 * explicitly setting the offset to be 0, and
11548 * zero'ing out the old data in the buffer.
11549 */
11550 if (offs == 0) {
11551 buf->dtb_flags &= ~DTRACEBUF_WRAPPED;
11552 buf->dtb_offset = 0;
11553 woffs = total_off;
11554
11555 while ((uint64_t)woffs < buf->dtb_size)
11556 tomax[woffs++] = 0;
11557 }
11558
11559 woffs = 0;
11560 break;
11561 }
11562
11563 woffs += size;
11564 }
11565
11566 /*
11567 * We have a wrapped offset. It may be that the wrapped offset
11568 * has become zero -- that's okay.
11569 */
11570 buf->dtb_xamot_offset = woffs;
11571 }
11572
11573out:
11574 /*
11575 * Now we can plow the buffer with any necessary padding.
11576 */
11577 while (offs & (align - 1)) {
11578 /*
11579 * Assert that our alignment is off by a number which
11580 * is itself sizeof (uint32_t) aligned.
11581 */
11582 ASSERT(!((align - (offs & (align - 1))) &
11583 (sizeof (uint32_t) - 1)));
11584 DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
11585 offs += sizeof (uint32_t);
11586 }
11587
11588 if (buf->dtb_flags & DTRACEBUF_FILL) {
11589 if (offs + needed > buf->dtb_size - state->dts_reserve) {
11590 buf->dtb_flags |= DTRACEBUF_FULL;
11591 return (-1);
11592 }
11593 }
11594
11595 if (mstate == NULL)
11596 return (offs);
11597
11598 /*
11599 * For ring buffers and fill buffers, the scratch space is always
11600 * the inactive buffer.
11601 */
11602 mstate->dtms_scratch_base = (uintptr_t)buf->dtb_xamot;
11603 mstate->dtms_scratch_size = buf->dtb_size;
11604 mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
11605
11606 return (offs);
11607}
11608
11609static void
11610dtrace_buffer_polish(dtrace_buffer_t *buf)
11611{
11612 ASSERT(buf->dtb_flags & DTRACEBUF_RING);
11613 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11614
11615 if (!(buf->dtb_flags & DTRACEBUF_WRAPPED))
11616 return;
11617
11618 /*
11619 * We need to polish the ring buffer. There are three cases:
11620 *
11621 * - The first (and presumably most common) is that there is no gap
11622 * between the buffer offset and the wrapped offset. In this case,
11623 * there is nothing in the buffer that isn't valid data; we can
11624 * mark the buffer as polished and return.
11625 *
11626 * - The second (less common than the first but still more common
11627 * than the third) is that there is a gap between the buffer offset
11628 * and the wrapped offset, and the wrapped offset is larger than the
11629 * buffer offset. This can happen because of an alignment issue, or
11630 * can happen because of a call to dtrace_buffer_reserve() that
11631 * didn't subsequently consume the buffer space. In this case,
11632 * we need to zero the data from the buffer offset to the wrapped
11633 * offset.
11634 *
11635 * - The third (and least common) is that there is a gap between the
11636 * buffer offset and the wrapped offset, but the wrapped offset is
11637 * _less_ than the buffer offset. This can only happen because a
11638 * call to dtrace_buffer_reserve() induced a wrap, but the space
11639 * was not subsequently consumed. In this case, we need to zero the
11640 * space from the offset to the end of the buffer _and_ from the
11641 * top of the buffer to the wrapped offset.
11642 */
11643 if (buf->dtb_offset < buf->dtb_xamot_offset) {
11644 bzero(buf->dtb_tomax + buf->dtb_offset,
11645 buf->dtb_xamot_offset - buf->dtb_offset);
11646 }
11647
11648 if (buf->dtb_offset > buf->dtb_xamot_offset) {
11649 bzero(buf->dtb_tomax + buf->dtb_offset,
11650 buf->dtb_size - buf->dtb_offset);
11651 bzero(buf->dtb_tomax, buf->dtb_xamot_offset);
11652 }
11653}
11654
11655static void
11656dtrace_buffer_free(dtrace_buffer_t *bufs)
11657{
11658 int i;
11659
11660 for (i = 0; i < (int)NCPU; i++) {
11661 dtrace_buffer_t *buf = &bufs[i];
11662
11663 if (buf->dtb_tomax == NULL) {
11664 ASSERT(buf->dtb_xamot == NULL);
11665 ASSERT(buf->dtb_size == 0);
11666 continue;
11667 }
11668
11669 if (buf->dtb_xamot != NULL) {
11670 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
11671 kmem_free(buf->dtb_xamot, buf->dtb_size);
11672
11673 ASSERT(dtrace_buffer_memory_inuse >= buf->dtb_size);
11674 dtrace_buffer_memory_inuse -= buf->dtb_size;
11675 }
11676
11677 kmem_free(buf->dtb_tomax, buf->dtb_size);
11678 ASSERT(dtrace_buffer_memory_inuse >= buf->dtb_size);
11679 dtrace_buffer_memory_inuse -= buf->dtb_size;
11680
11681 buf->dtb_size = 0;
11682 buf->dtb_tomax = NULL;
11683 buf->dtb_xamot = NULL;
11684 }
11685}
11686
11687/*
11688 * DTrace Enabling Functions
11689 */
11690static dtrace_enabling_t *
11691dtrace_enabling_create(dtrace_vstate_t *vstate)
11692{
11693 dtrace_enabling_t *enab;
11694
11695 enab = kmem_zalloc(sizeof (dtrace_enabling_t), KM_SLEEP);
11696 enab->dten_vstate = vstate;
11697
11698 return (enab);
11699}
11700
11701static void
11702dtrace_enabling_add(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb)
11703{
11704 dtrace_ecbdesc_t **ndesc;
11705 size_t osize, nsize;
11706
11707 /*
11708 * We can't add to enablings after we've enabled them, or after we've
11709 * retained them.
11710 */
11711 ASSERT(enab->dten_probegen == 0);
11712 ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
11713
	/* APPLE NOTE: this protects against a gcc 4.0 codegen botch on x86 */
	if (ecb == NULL)
		return;
11716
11717 if (enab->dten_ndesc < enab->dten_maxdesc) {
11718 enab->dten_desc[enab->dten_ndesc++] = ecb;
11719 return;
11720 }
11721
	osize = enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *);
11723
11724 if (enab->dten_maxdesc == 0) {
11725 enab->dten_maxdesc = 1;
11726 } else {
11727 enab->dten_maxdesc <<= 1;
11728 }
11729
11730 ASSERT(enab->dten_ndesc < enab->dten_maxdesc);
11731
	nsize = enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *);
11733 ndesc = kmem_zalloc(nsize, KM_SLEEP);
11734 bcopy(enab->dten_desc, ndesc, osize);
11735 kmem_free(enab->dten_desc, osize);
11736
11737 enab->dten_desc = ndesc;
11738 enab->dten_desc[enab->dten_ndesc++] = ecb;
11739}
11740
11741static void
11742dtrace_enabling_addlike(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb,
11743 dtrace_probedesc_t *pd)
11744{
11745 dtrace_ecbdesc_t *new;
11746 dtrace_predicate_t *pred;
11747 dtrace_actdesc_t *act;
11748
11749 /*
11750 * We're going to create a new ECB description that matches the
11751 * specified ECB in every way, but has the specified probe description.
11752 */
11753 new = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
11754
11755 if ((pred = ecb->dted_pred.dtpdd_predicate) != NULL)
11756 dtrace_predicate_hold(pred);
11757
11758 for (act = ecb->dted_action; act != NULL; act = act->dtad_next)
11759 dtrace_actdesc_hold(act);
11760
11761 new->dted_action = ecb->dted_action;
11762 new->dted_pred = ecb->dted_pred;
11763 new->dted_probe = *pd;
11764 new->dted_uarg = ecb->dted_uarg;
11765
11766 dtrace_enabling_add(enab, new);
11767}
11768
11769static void
11770dtrace_enabling_dump(dtrace_enabling_t *enab)
11771{
11772 int i;
11773
11774 for (i = 0; i < enab->dten_ndesc; i++) {
11775 dtrace_probedesc_t *desc = &enab->dten_desc[i]->dted_probe;
11776
11777 cmn_err(CE_NOTE, "enabling probe %d (%s:%s:%s:%s)", i,
11778 desc->dtpd_provider, desc->dtpd_mod,
11779 desc->dtpd_func, desc->dtpd_name);
11780 }
11781}
11782
11783static void
11784dtrace_enabling_destroy(dtrace_enabling_t *enab)
11785{
11786 int i;
11787 dtrace_ecbdesc_t *ep;
11788 dtrace_vstate_t *vstate = enab->dten_vstate;
11789
11790 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11791
11792 for (i = 0; i < enab->dten_ndesc; i++) {
11793 dtrace_actdesc_t *act, *next;
11794 dtrace_predicate_t *pred;
11795
11796 ep = enab->dten_desc[i];
11797
11798 if ((pred = ep->dted_pred.dtpdd_predicate) != NULL)
11799 dtrace_predicate_release(pred, vstate);
11800
11801 for (act = ep->dted_action; act != NULL; act = next) {
11802 next = act->dtad_next;
11803 dtrace_actdesc_release(act, vstate);
11804 }
11805
11806 kmem_free(ep, sizeof (dtrace_ecbdesc_t));
11807 }
11808
	kmem_free(enab->dten_desc,
	    enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *));
11811
11812 /*
11813 * If this was a retained enabling, decrement the dts_nretained count
11814 * and take it off of the dtrace_retained list.
11815 */
11816 if (enab->dten_prev != NULL || enab->dten_next != NULL ||
11817 dtrace_retained == enab) {
11818 ASSERT(enab->dten_vstate->dtvs_state != NULL);
11819 ASSERT(enab->dten_vstate->dtvs_state->dts_nretained > 0);
11820 enab->dten_vstate->dtvs_state->dts_nretained--;
11821 dtrace_retained_gen++;
11822 }
11823
11824 if (enab->dten_prev == NULL) {
11825 if (dtrace_retained == enab) {
11826 dtrace_retained = enab->dten_next;
11827
11828 if (dtrace_retained != NULL)
11829 dtrace_retained->dten_prev = NULL;
11830 }
11831 } else {
11832 ASSERT(enab != dtrace_retained);
11833 ASSERT(dtrace_retained != NULL);
11834 enab->dten_prev->dten_next = enab->dten_next;
11835 }
11836
11837 if (enab->dten_next != NULL) {
11838 ASSERT(dtrace_retained != NULL);
11839 enab->dten_next->dten_prev = enab->dten_prev;
11840 }
11841
11842 kmem_free(enab, sizeof (dtrace_enabling_t));
11843}
11844
11845static int
11846dtrace_enabling_retain(dtrace_enabling_t *enab)
11847{
11848 dtrace_state_t *state;
11849
11850 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11851 ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
11852 ASSERT(enab->dten_vstate != NULL);
11853
11854 state = enab->dten_vstate->dtvs_state;
11855 ASSERT(state != NULL);
11856
11857 /*
11858 * We only allow each state to retain dtrace_retain_max enablings.
11859 */
11860 if (state->dts_nretained >= dtrace_retain_max)
11861 return (ENOSPC);
11862
11863 state->dts_nretained++;
11864 dtrace_retained_gen++;
11865
11866 if (dtrace_retained == NULL) {
11867 dtrace_retained = enab;
11868 return (0);
11869 }
11870
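	/*
	 * Push this enabling onto the front of the doubly-linked
	 * dtrace_retained list.
	 */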
11871 enab->dten_next = dtrace_retained;
11872 dtrace_retained->dten_prev = enab;
11873 dtrace_retained = enab;
11874
11875 return (0);
11876}
11877
11878static int
11879dtrace_enabling_replicate(dtrace_state_t *state, dtrace_probedesc_t *match,
11880 dtrace_probedesc_t *create)
11881{
11882 dtrace_enabling_t *new, *enab;
11883 int found = 0, err = ENOENT;
11884
11885 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11886 ASSERT(strlen(match->dtpd_provider) < DTRACE_PROVNAMELEN);
11887 ASSERT(strlen(match->dtpd_mod) < DTRACE_MODNAMELEN);
11888 ASSERT(strlen(match->dtpd_func) < DTRACE_FUNCNAMELEN);
11889 ASSERT(strlen(match->dtpd_name) < DTRACE_NAMELEN);
11890
11891 new = dtrace_enabling_create(&state->dts_vstate);
11892
11893 /*
11894 * Iterate over all retained enablings, looking for enablings that
11895 * match the specified state.
11896 */
11897 for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
11898 int i;
11899
11900 /*
11901 * dtvs_state can only be NULL for helper enablings -- and
11902 * helper enablings can't be retained.
11903 */
11904 ASSERT(enab->dten_vstate->dtvs_state != NULL);
11905
11906 if (enab->dten_vstate->dtvs_state != state)
11907 continue;
11908
11909 /*
11910 * Now iterate over each probe description; we're looking for
11911 * an exact match to the specified probe description.
11912 */
11913 for (i = 0; i < enab->dten_ndesc; i++) {
11914 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
11915 dtrace_probedesc_t *pd = &ep->dted_probe;
11916
			/* APPLE NOTE: Darwin employs size-bounded string operations. */
11918 if (strncmp(pd->dtpd_provider, match->dtpd_provider, DTRACE_PROVNAMELEN))
11919 continue;
11920
11921 if (strncmp(pd->dtpd_mod, match->dtpd_mod, DTRACE_MODNAMELEN))
11922 continue;
11923
11924 if (strncmp(pd->dtpd_func, match->dtpd_func, DTRACE_FUNCNAMELEN))
11925 continue;
11926
11927 if (strncmp(pd->dtpd_name, match->dtpd_name, DTRACE_NAMELEN))
11928 continue;
11929
11930 /*
11931 * We have a winning probe! Add it to our growing
11932 * enabling.
11933 */
11934 found = 1;
11935 dtrace_enabling_addlike(new, ep, create);
11936 }
11937 }
11938
11939 if (!found || (err = dtrace_enabling_retain(new)) != 0) {
11940 dtrace_enabling_destroy(new);
11941 return (err);
11942 }
11943
11944 return (0);
11945}
11946
11947static void
11948dtrace_enabling_retract(dtrace_state_t *state)
11949{
11950 dtrace_enabling_t *enab, *next;
11951
11952 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11953
11954 /*
11955 * Iterate over all retained enablings, destroy the enablings retained
11956 * for the specified state.
11957 */
11958 for (enab = dtrace_retained; enab != NULL; enab = next) {
11959 next = enab->dten_next;
11960
11961 /*
11962 * dtvs_state can only be NULL for helper enablings -- and
11963 * helper enablings can't be retained.
11964 */
11965 ASSERT(enab->dten_vstate->dtvs_state != NULL);
11966
11967 if (enab->dten_vstate->dtvs_state == state) {
11968 ASSERT(state->dts_nretained > 0);
11969 dtrace_enabling_destroy(enab);
11970 }
11971 }
11972
11973 ASSERT(state->dts_nretained == 0);
11974}
11975
11976static int
11977dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched, dtrace_match_cond_t *cond)
11978{
11979 int i = 0;
11980 int total_matched = 0, matched = 0;
11981
11982 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
11983 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11984
11985 for (i = 0; i < enab->dten_ndesc; i++) {
11986 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
11987
11988 enab->dten_current = ep;
11989 enab->dten_error = 0;
11990
		/*
		 * Before doing a dtrace_probe_enable(), which is really
		 * expensive, check that this enabling matches the matching
		 * precondition, if we have one.
		 */
11996 if (cond && (cond->dmc_func(&ep->dted_probe, cond->dmc_data) == 0)) {
11997 continue;
11998 }
11999 /*
		 * If a provider fails to enable a probe, get out and
		 * let the consumer know we failed.
12002 */
12003 if ((matched = dtrace_probe_enable(&ep->dted_probe, enab, ep)) < 0)
12004 return (EBUSY);
12005
12006 total_matched += matched;
12007
12008 if (enab->dten_error != 0) {
12009 /*
12010 * If we get an error half-way through enabling the
12011 * probes, we kick out -- perhaps with some number of
12012 * them enabled. Leaving enabled probes enabled may
12013 * be slightly confusing for user-level, but we expect
12014 * that no one will attempt to actually drive on in
12015 * the face of such errors. If this is an anonymous
12016 * enabling (indicated with a NULL nmatched pointer),
12017 * we cmn_err() a message. We aren't expecting to
			 * get such an error -- insofar as it can exist at all,
12019 * it would be a result of corrupted DOF in the driver
12020 * properties.
12021 */
12022 if (nmatched == NULL) {
12023 cmn_err(CE_WARN, "dtrace_enabling_match() "
12024 "error on %p: %d", (void *)ep,
12025 enab->dten_error);
12026 }
12027
12028 return (enab->dten_error);
12029 }
12030
12031 ep->dted_probegen = dtrace_probegen;
12032 }
12033
12034 if (nmatched != NULL)
12035 *nmatched = total_matched;
12036
12037 return (0);
12038}
12039
12040static void
12041dtrace_enabling_matchall_with_cond(dtrace_match_cond_t *cond)
12042{
12043 dtrace_enabling_t *enab;
12044
12045 lck_mtx_lock(&cpu_lock);
12046 lck_mtx_lock(&dtrace_lock);
12047
12048 /*
12049 * Iterate over all retained enablings to see if any probes match
12050 * against them. We only perform this operation on enablings for which
12051 * we have sufficient permissions by virtue of being in the global zone
12052 * or in the same zone as the DTrace client. Because we can be called
12053 * after dtrace_detach() has been called, we cannot assert that there
12054 * are retained enablings. We can safely load from dtrace_retained,
12055 * however: the taskq_destroy() at the end of dtrace_detach() will
12056 * block pending our completion.
12057 */
12058
12059 /*
12060 * Darwin doesn't do zones.
	 * Behave as if always in the "global" zone.
12062 */
12063 for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12064 (void) dtrace_enabling_match(enab, NULL, cond);
12065 }
12066
12067 lck_mtx_unlock(&dtrace_lock);
	lck_mtx_unlock(&cpu_lock);
}
12071
12072static void
12073dtrace_enabling_matchall(void)
12074{
12075 dtrace_enabling_matchall_with_cond(NULL);
}

12080/*
12081 * If an enabling is to be enabled without having matched probes (that is, if
12082 * dtrace_state_go() is to be called on the underlying dtrace_state_t), the
12083 * enabling must be _primed_ by creating an ECB for every ECB description.
 * This must be done to ensure that we know the number of speculations, the
 * number of aggregations, the minimum buffer size needed, etc. before we
 * transition out of DTRACE_ACTIVITY_INACTIVE. To do this without actually
 * enabling any probes, we create ECBs for every ECB description, but with a
12088 * NULL probe -- which is exactly what this function does.
12089 */
12090static void
12091dtrace_enabling_prime(dtrace_state_t *state)
12092{
12093 dtrace_enabling_t *enab;
12094 int i;
12095
12096 for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12097 ASSERT(enab->dten_vstate->dtvs_state != NULL);
12098
12099 if (enab->dten_vstate->dtvs_state != state)
12100 continue;
12101
12102 /*
12103 * We don't want to prime an enabling more than once, lest
12104 * we allow a malicious user to induce resource exhaustion.
12105 * (The ECBs that result from priming an enabling aren't
12106 * leaked -- but they also aren't deallocated until the
12107 * consumer state is destroyed.)
12108 */
12109 if (enab->dten_primed)
12110 continue;
12111
12112 for (i = 0; i < enab->dten_ndesc; i++) {
12113 enab->dten_current = enab->dten_desc[i];
12114 (void) dtrace_probe_enable(NULL, enab, NULL);
12115 }
12116
12117 enab->dten_primed = 1;
12118 }
12119}
12120
12121/*
12122 * Called to indicate that probes should be provided due to retained
12123 * enablings. This is implemented in terms of dtrace_probe_provide(), but it
12124 * must take an initial lap through the enabling calling the dtps_provide()
12125 * entry point explicitly to allow for autocreated probes.
12126 */
12127static void
12128dtrace_enabling_provide(dtrace_provider_t *prv)
12129{
12130 int i, all = 0;
12131 dtrace_probedesc_t desc;
12132 dtrace_genid_t gen;
12133
12134 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12135 LCK_MTX_ASSERT(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
12136
12137 if (prv == NULL) {
12138 all = 1;
12139 prv = dtrace_provider;
12140 }
12141
12142 do {
12143 dtrace_enabling_t *enab;
12144 void *parg = prv->dtpv_arg;
12145
12146retry:
12147 gen = dtrace_retained_gen;
12148 for (enab = dtrace_retained; enab != NULL;
12149 enab = enab->dten_next) {
12150 for (i = 0; i < enab->dten_ndesc; i++) {
12151 desc = enab->dten_desc[i]->dted_probe;
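				/*
				 * Drop dtrace_lock across the dtps_provide()
				 * call: the provider may call back into the
				 * framework (dtrace_probe_create(), for
				 * example, takes dtrace_lock itself).
				 */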
12152 lck_mtx_unlock(&dtrace_lock);
12153 prv->dtpv_pops.dtps_provide(parg, &desc);
12154 lck_mtx_lock(&dtrace_lock);
12155 /*
12156 * Process the retained enablings again if
12157 * they have changed while we weren't holding
12158 * dtrace_lock.
12159 */
12160 if (gen != dtrace_retained_gen)
12161 goto retry;
12162 }
12163 }
12164 } while (all && (prv = prv->dtpv_next) != NULL);
12165
12166 lck_mtx_unlock(&dtrace_lock);
12167 dtrace_probe_provide(NULL, all ? NULL : prv);
12168 lck_mtx_lock(&dtrace_lock);
12169}
12170
12171/*
12172 * DTrace DOF Functions
12173 */
12174/*ARGSUSED*/
12175static void
12176dtrace_dof_error(dof_hdr_t *dof, const char *str)
12177{
12178#pragma unused(dof) /* __APPLE__ */
12179 if (dtrace_err_verbose)
12180 cmn_err(CE_WARN, "failed to process DOF: %s", str);
12181
12182#ifdef DTRACE_ERRDEBUG
12183 dtrace_errdebug(str);
12184#endif
12185}
12186
12187/*
12188 * Create DOF out of a currently enabled state. Right now, we only create
12189 * DOF containing the run-time options -- but this could be expanded to create
12190 * complete DOF representing the enabled state.
12191 */
12192static dof_hdr_t *
12193dtrace_dof_create(dtrace_state_t *state)
12194{
12195 dof_hdr_t *dof;
12196 dof_sec_t *sec;
12197 dof_optdesc_t *opt;
12198 int i, len = sizeof (dof_hdr_t) +
12199 roundup(sizeof (dof_sec_t), sizeof (uint64_t)) +
12200 sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
12201
12202 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12203
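	/*
	 * The generated DOF is laid out as a header, a single section
	 * header (padded to 8-byte alignment), and the option descriptions
	 * themselves:
	 *
	 *	+-----------+-----------+------------------------------+
	 *	| dof_hdr_t | dof_sec_t | dof_optdesc_t[DTRACEOPT_MAX] |
	 *	+-----------+-----------+------------------------------+
	 */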
12204 dof = kmem_zalloc_aligned(len, 8, KM_SLEEP);
12205 dof->dofh_ident[DOF_ID_MAG0] = DOF_MAG_MAG0;
12206 dof->dofh_ident[DOF_ID_MAG1] = DOF_MAG_MAG1;
12207 dof->dofh_ident[DOF_ID_MAG2] = DOF_MAG_MAG2;
12208 dof->dofh_ident[DOF_ID_MAG3] = DOF_MAG_MAG3;
12209
12210 dof->dofh_ident[DOF_ID_MODEL] = DOF_MODEL_NATIVE;
12211 dof->dofh_ident[DOF_ID_ENCODING] = DOF_ENCODE_NATIVE;
12212 dof->dofh_ident[DOF_ID_VERSION] = DOF_VERSION;
12213 dof->dofh_ident[DOF_ID_DIFVERS] = DIF_VERSION;
12214 dof->dofh_ident[DOF_ID_DIFIREG] = DIF_DIR_NREGS;
12215 dof->dofh_ident[DOF_ID_DIFTREG] = DIF_DTR_NREGS;
12216
12217 dof->dofh_flags = 0;
12218 dof->dofh_hdrsize = sizeof (dof_hdr_t);
12219 dof->dofh_secsize = sizeof (dof_sec_t);
12220 dof->dofh_secnum = 1; /* only DOF_SECT_OPTDESC */
12221 dof->dofh_secoff = sizeof (dof_hdr_t);
12222 dof->dofh_loadsz = len;
12223 dof->dofh_filesz = len;
12224 dof->dofh_pad = 0;
12225
12226 /*
12227 * Fill in the option section header...
12228 */
12229 sec = (dof_sec_t *)((uintptr_t)dof + sizeof (dof_hdr_t));
12230 sec->dofs_type = DOF_SECT_OPTDESC;
12231 sec->dofs_align = sizeof (uint64_t);
12232 sec->dofs_flags = DOF_SECF_LOAD;
12233 sec->dofs_entsize = sizeof (dof_optdesc_t);
12234
12235 opt = (dof_optdesc_t *)((uintptr_t)sec +
12236 roundup(sizeof (dof_sec_t), sizeof (uint64_t)));
12237
12238 sec->dofs_offset = (uintptr_t)opt - (uintptr_t)dof;
12239 sec->dofs_size = sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
12240
12241 for (i = 0; i < DTRACEOPT_MAX; i++) {
12242 opt[i].dofo_option = i;
12243 opt[i].dofo_strtab = DOF_SECIDX_NONE;
12244 opt[i].dofo_value = state->dts_options[i];
12245 }
12246
12247 return (dof);
12248}
12249
12250static dof_hdr_t *
12251dtrace_dof_copyin(user_addr_t uarg, int *errp)
12252{
12253 dof_hdr_t hdr, *dof;
12254
12255 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
12256
12257 /*
12258 * First, we're going to copyin() the sizeof (dof_hdr_t).
12259 */
12260 if (copyin(uarg, &hdr, sizeof (hdr)) != 0) {
12261 dtrace_dof_error(NULL, "failed to copyin DOF header");
12262 *errp = EFAULT;
12263 return (NULL);
12264 }
12265
12266 /*
12267 * Now we'll allocate the entire DOF and copy it in -- provided
12268 * that the length isn't outrageous.
12269 */
12270 if (hdr.dofh_loadsz >= (uint64_t)dtrace_dof_maxsize) {
12271 dtrace_dof_error(&hdr, "load size exceeds maximum");
12272 *errp = E2BIG;
12273 return (NULL);
12274 }
12275
12276 if (hdr.dofh_loadsz < sizeof (hdr)) {
12277 dtrace_dof_error(&hdr, "invalid load size");
12278 *errp = EINVAL;
12279 return (NULL);
12280 }
12281
12282 dof = kmem_alloc_aligned(hdr.dofh_loadsz, 8, KM_SLEEP);
12283
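	/*
	 * Note that dofh_loadsz is checked again after the full copyin():
	 * the header lives in user memory and may have changed between the
	 * two reads, so DOF whose load size no longer matches the
	 * allocation is refused.
	 */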
12284 if (copyin(uarg, dof, hdr.dofh_loadsz) != 0 ||
12285 dof->dofh_loadsz != hdr.dofh_loadsz) {
12286 kmem_free_aligned(dof, hdr.dofh_loadsz);
12287 *errp = EFAULT;
12288 return (NULL);
12289 }
12290
12291 return (dof);
12292}
12293
12294static dof_hdr_t *
12295dtrace_dof_copyin_from_proc(proc_t* p, user_addr_t uarg, int *errp)
12296{
12297 dof_hdr_t hdr, *dof;
12298
12299 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
12300
12301 /*
12302 * First, we're going to copyin() the sizeof (dof_hdr_t).
12303 */
12304 if (uread(p, &hdr, sizeof(hdr), uarg) != KERN_SUCCESS) {
12305 dtrace_dof_error(NULL, "failed to copyin DOF header");
12306 *errp = EFAULT;
12307 return (NULL);
12308 }
12309
12310 /*
12311 * Now we'll allocate the entire DOF and copy it in -- provided
12312 * that the length isn't outrageous.
12313 */
12314 if (hdr.dofh_loadsz >= (uint64_t)dtrace_dof_maxsize) {
12315 dtrace_dof_error(&hdr, "load size exceeds maximum");
12316 *errp = E2BIG;
12317 return (NULL);
12318 }
12319
12320 if (hdr.dofh_loadsz < sizeof (hdr)) {
12321 dtrace_dof_error(&hdr, "invalid load size");
12322 *errp = EINVAL;
12323 return (NULL);
12324 }
12325
12326 dof = kmem_alloc_aligned(hdr.dofh_loadsz, 8, KM_SLEEP);
12327
12328 if (uread(p, dof, hdr.dofh_loadsz, uarg) != KERN_SUCCESS) {
12329 kmem_free_aligned(dof, hdr.dofh_loadsz);
12330 *errp = EFAULT;
12331 return (NULL);
12332 }
12333
12334 return (dof);
12335}
12336
12337static void
12338dtrace_dof_destroy(dof_hdr_t *dof)
12339{
12340 kmem_free_aligned(dof, dof->dofh_loadsz);
12341}
12342
12343static dof_hdr_t *
12344dtrace_dof_property(const char *name)
12345{
12346 unsigned int len = 0;
12347 dof_hdr_t *dof;
12348
12349 if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed()) {
12350 return NULL;
12351 }
12352
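	/*
	 * The first PEReadNVRAMProperty() call, made with a NULL buffer,
	 * evidently serves only to discover the property's length; the
	 * second call below copies the DOF itself out of NVRAM.
	 */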
12353 if (!PEReadNVRAMProperty(name, NULL, &len)) {
12354 return NULL;
12355 }
12356
12357 dof = kmem_alloc_aligned(len, 8, KM_SLEEP);
12358
12359 if (!PEReadNVRAMProperty(name, dof, &len)) {
12360 dtrace_dof_destroy(dof);
12361 dtrace_dof_error(NULL, "unreadable DOF");
12362 return NULL;
12363 }
12364
12365 if (len < sizeof (dof_hdr_t)) {
12366 dtrace_dof_destroy(dof);
12367 dtrace_dof_error(NULL, "truncated header");
12368 return (NULL);
12369 }
12370
12371 if (len < dof->dofh_loadsz) {
12372 dtrace_dof_destroy(dof);
12373 dtrace_dof_error(NULL, "truncated DOF");
12374 return (NULL);
12375 }
12376
12377 if (len != dof->dofh_loadsz) {
12378 dtrace_dof_destroy(dof);
12379 dtrace_dof_error(NULL, "invalid DOF size");
12380 return (NULL);
12381 }
12382
12383 if (dof->dofh_loadsz >= (uint64_t)dtrace_dof_maxsize) {
12384 dtrace_dof_destroy(dof);
12385 dtrace_dof_error(NULL, "oversized DOF");
12386 return (NULL);
12387 }
12388
12389 return (dof);
12390}
12391
12392/*
12393 * Return the dof_sec_t pointer corresponding to a given section index. If the
12394 * index is not valid, dtrace_dof_error() is called and NULL is returned. If
12395 * a type other than DOF_SECT_NONE is specified, the header is checked against
12396 * this type and NULL is returned if the types do not match.
12397 */
12398static dof_sec_t *
12399dtrace_dof_sect(dof_hdr_t *dof, uint32_t type, dof_secidx_t i)
12400{
12401 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)
12402 ((uintptr_t)dof + dof->dofh_secoff + i * dof->dofh_secsize);
12403
12404 if (i >= dof->dofh_secnum) {
12405 dtrace_dof_error(dof, "referenced section index is invalid");
12406 return (NULL);
12407 }
12408
12409 if (!(sec->dofs_flags & DOF_SECF_LOAD)) {
12410 dtrace_dof_error(dof, "referenced section is not loadable");
12411 return (NULL);
12412 }
12413
12414 if (type != DOF_SECT_NONE && type != sec->dofs_type) {
12415 dtrace_dof_error(dof, "referenced section is the wrong type");
12416 return (NULL);
12417 }
12418
12419 return (sec);
12420}
12421
12422static dtrace_probedesc_t *
12423dtrace_dof_probedesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_probedesc_t *desc)
12424{
12425 dof_probedesc_t *probe;
12426 dof_sec_t *strtab;
12427 uintptr_t daddr = (uintptr_t)dof;
12428 uintptr_t str;
12429 size_t size;
12430
12431 if (sec->dofs_type != DOF_SECT_PROBEDESC) {
12432 dtrace_dof_error(dof, "invalid probe section");
12433 return (NULL);
12434 }
12435
12436 if (sec->dofs_align != sizeof (dof_secidx_t)) {
12437 dtrace_dof_error(dof, "bad alignment in probe description");
12438 return (NULL);
12439 }
12440
12441 if (sec->dofs_offset + sizeof (dof_probedesc_t) > dof->dofh_loadsz) {
12442 dtrace_dof_error(dof, "truncated probe description");
12443 return (NULL);
12444 }
12445
12446 probe = (dof_probedesc_t *)(uintptr_t)(daddr + sec->dofs_offset);
12447 strtab = dtrace_dof_sect(dof, DOF_SECT_STRTAB, probe->dofp_strtab);
12448
12449 if (strtab == NULL)
12450 return (NULL);
12451
12452 str = daddr + strtab->dofs_offset;
12453 size = strtab->dofs_size;
12454
12455 if (probe->dofp_provider >= strtab->dofs_size) {
12456 dtrace_dof_error(dof, "corrupt probe provider");
12457 return (NULL);
12458 }
12459
12460 (void) strncpy(desc->dtpd_provider,
12461 (char *)(str + probe->dofp_provider),
12462 MIN(DTRACE_PROVNAMELEN - 1, size - probe->dofp_provider));
12463
	/* APPLE NOTE: Darwin employs size-bounded string operations. */
12465 desc->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
12466
12467 if (probe->dofp_mod >= strtab->dofs_size) {
12468 dtrace_dof_error(dof, "corrupt probe module");
12469 return (NULL);
12470 }
12471
12472 (void) strncpy(desc->dtpd_mod, (char *)(str + probe->dofp_mod),
12473 MIN(DTRACE_MODNAMELEN - 1, size - probe->dofp_mod));
12474
	/* APPLE NOTE: Darwin employs size-bounded string operations. */
12476 desc->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
12477
12478 if (probe->dofp_func >= strtab->dofs_size) {
12479 dtrace_dof_error(dof, "corrupt probe function");
12480 return (NULL);
12481 }
12482
12483 (void) strncpy(desc->dtpd_func, (char *)(str + probe->dofp_func),
12484 MIN(DTRACE_FUNCNAMELEN - 1, size - probe->dofp_func));
12485
	/* APPLE NOTE: Darwin employs size-bounded string operations. */
12487 desc->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
12488
12489 if (probe->dofp_name >= strtab->dofs_size) {
12490 dtrace_dof_error(dof, "corrupt probe name");
12491 return (NULL);
12492 }
12493
12494 (void) strncpy(desc->dtpd_name, (char *)(str + probe->dofp_name),
12495 MIN(DTRACE_NAMELEN - 1, size - probe->dofp_name));
12496
	/* APPLE NOTE: Darwin employs size-bounded string operations. */
12498 desc->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
12499
12500 return (desc);
12501}
12502
12503static dtrace_difo_t *
12504dtrace_dof_difo(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
12505 cred_t *cr)
12506{
12507 dtrace_difo_t *dp;
12508 size_t ttl = 0;
12509 dof_difohdr_t *dofd;
12510 uintptr_t daddr = (uintptr_t)dof;
12511 size_t max_size = dtrace_difo_maxsize;
12512 uint_t i;
	int l, n;

12516 static const struct {
12517 int section;
12518 int bufoffs;
12519 int lenoffs;
12520 int entsize;
12521 int align;
12522 const char *msg;
12523 } difo[] = {
12524 { DOF_SECT_DIF, offsetof(dtrace_difo_t, dtdo_buf),
12525 offsetof(dtrace_difo_t, dtdo_len), sizeof (dif_instr_t),
12526 sizeof (dif_instr_t), "multiple DIF sections" },
12527
12528 { DOF_SECT_INTTAB, offsetof(dtrace_difo_t, dtdo_inttab),
12529 offsetof(dtrace_difo_t, dtdo_intlen), sizeof (uint64_t),
12530 sizeof (uint64_t), "multiple integer tables" },
12531
12532 { DOF_SECT_STRTAB, offsetof(dtrace_difo_t, dtdo_strtab),
12533 offsetof(dtrace_difo_t, dtdo_strlen), 0,
12534 sizeof (char), "multiple string tables" },
12535
12536 { DOF_SECT_VARTAB, offsetof(dtrace_difo_t, dtdo_vartab),
12537 offsetof(dtrace_difo_t, dtdo_varlen), sizeof (dtrace_difv_t),
12538 sizeof (uint_t), "multiple variable tables" },
12539
12540 { DOF_SECT_NONE, 0, 0, 0, 0, NULL }
12541 };
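
	/*
	 * Each row of the table above binds a DOF section type to the
	 * dtrace_difo_t members it populates (expressed as offsetof()
	 * byte offsets), so that the single loop below can validate and
	 * load the DIF text, integer, string, and variable tables
	 * uniformly.
	 */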
12542
12543 if (sec->dofs_type != DOF_SECT_DIFOHDR) {
12544 dtrace_dof_error(dof, "invalid DIFO header section");
12545 return (NULL);
12546 }
12547
12548 if (sec->dofs_align != sizeof (dof_secidx_t)) {
12549 dtrace_dof_error(dof, "bad alignment in DIFO header");
12550 return (NULL);
12551 }
12552
12553 if (sec->dofs_size < sizeof (dof_difohdr_t) ||
12554 sec->dofs_size % sizeof (dof_secidx_t)) {
12555 dtrace_dof_error(dof, "bad size in DIFO header");
12556 return (NULL);
12557 }
12558
12559 dofd = (dof_difohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
12560 n = (sec->dofs_size - sizeof (*dofd)) / sizeof (dof_secidx_t) + 1;
12561
12562 dp = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
12563 dp->dtdo_rtype = dofd->dofd_rtype;
12564
12565 for (l = 0; l < n; l++) {
12566 dof_sec_t *subsec;
12567 void **bufp;
12568 uint32_t *lenp;
12569
12570 if ((subsec = dtrace_dof_sect(dof, DOF_SECT_NONE,
12571 dofd->dofd_links[l])) == NULL)
12572 goto err; /* invalid section link */
12573
12574 if (ttl + subsec->dofs_size > max_size) {
12575 dtrace_dof_error(dof, "exceeds maximum size");
12576 goto err;
12577 }
12578
12579 ttl += subsec->dofs_size;
12580
		for (i = 0; difo[i].section != DOF_SECT_NONE; i++) {
			if (subsec->dofs_type != (uint32_t)difo[i].section)
12584 continue;
12585
12586 if (!(subsec->dofs_flags & DOF_SECF_LOAD)) {
12587 dtrace_dof_error(dof, "section not loaded");
12588 goto err;
12589 }
12590
12591 if (subsec->dofs_align != (uint32_t)difo[i].align) {
12592 dtrace_dof_error(dof, "bad alignment");
12593 goto err;
12594 }
12595
12596 bufp = (void **)((uintptr_t)dp + difo[i].bufoffs);
12597 lenp = (uint32_t *)((uintptr_t)dp + difo[i].lenoffs);
12598
12599 if (*bufp != NULL) {
12600 dtrace_dof_error(dof, difo[i].msg);
12601 goto err;
12602 }
12603
12604 if ((uint32_t)difo[i].entsize != subsec->dofs_entsize) {
12605 dtrace_dof_error(dof, "entry size mismatch");
12606 goto err;
12607 }
12608
12609 if (subsec->dofs_entsize != 0 &&
12610 (subsec->dofs_size % subsec->dofs_entsize) != 0) {
12611 dtrace_dof_error(dof, "corrupt entry size");
12612 goto err;
12613 }
12614
12615 *lenp = subsec->dofs_size;
12616 *bufp = kmem_alloc(subsec->dofs_size, KM_SLEEP);
12617 bcopy((char *)(uintptr_t)(daddr + subsec->dofs_offset),
12618 *bufp, subsec->dofs_size);
12619
12620 if (subsec->dofs_entsize != 0)
12621 *lenp /= subsec->dofs_entsize;
12622
12623 break;
12624 }
12625
12626 /*
12627 * If we encounter a loadable DIFO sub-section that is not
12628 * known to us, assume this is a broken program and fail.
12629 */
12630 if (difo[i].section == DOF_SECT_NONE &&
12631 (subsec->dofs_flags & DOF_SECF_LOAD)) {
12632 dtrace_dof_error(dof, "unrecognized DIFO subsection");
12633 goto err;
12634 }
12635 }
12636
12637 if (dp->dtdo_buf == NULL) {
12638 /*
12639 * We can't have a DIF object without DIF text.
12640 */
12641 dtrace_dof_error(dof, "missing DIF text");
12642 goto err;
12643 }
12644
12645 /*
12646 * Before we validate the DIF object, run through the variable table
	 * looking for string variables -- if any of their sizes are zero,
	 * we'll set them to the system-wide default string size. Note that
12649 * this should _not_ happen if the "strsize" option has been set --
12650 * in this case, the compiler should have set the size to reflect the
12651 * setting of the option.
12652 */
12653 for (i = 0; i < dp->dtdo_varlen; i++) {
12654 dtrace_difv_t *v = &dp->dtdo_vartab[i];
12655 dtrace_diftype_t *t = &v->dtdv_type;
12656
12657 if (v->dtdv_id < DIF_VAR_OTHER_UBASE)
12658 continue;
12659
12660 if (t->dtdt_kind == DIF_TYPE_STRING && t->dtdt_size == 0)
12661 t->dtdt_size = dtrace_strsize_default;
12662 }
12663
12664 if (dtrace_difo_validate(dp, vstate, DIF_DIR_NREGS, cr) != 0)
12665 goto err;
12666
12667 dtrace_difo_init(dp, vstate);
12668 return (dp);
12669
12670err:
12671 kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
12672 kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
12673 kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
12674 kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
12675
12676 kmem_free(dp, sizeof (dtrace_difo_t));
12677 return (NULL);
12678}
12679
12680static dtrace_predicate_t *
12681dtrace_dof_predicate(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
12682 cred_t *cr)
12683{
12684 dtrace_difo_t *dp;
12685
12686 if ((dp = dtrace_dof_difo(dof, sec, vstate, cr)) == NULL)
12687 return (NULL);
12688
12689 return (dtrace_predicate_create(dp));
12690}
12691
12692static dtrace_actdesc_t *
12693dtrace_dof_actdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
12694 cred_t *cr)
12695{
12696 dtrace_actdesc_t *act, *first = NULL, *last = NULL, *next;
12697 dof_actdesc_t *desc;
12698 dof_sec_t *difosec;
12699 size_t offs;
12700 uintptr_t daddr = (uintptr_t)dof;
12701 uint64_t arg;
12702 dtrace_actkind_t kind;
12703
12704 if (sec->dofs_type != DOF_SECT_ACTDESC) {
12705 dtrace_dof_error(dof, "invalid action section");
12706 return (NULL);
12707 }
12708
12709 if (sec->dofs_offset + sizeof (dof_actdesc_t) > dof->dofh_loadsz) {
12710 dtrace_dof_error(dof, "truncated action description");
12711 return (NULL);
12712 }
12713
12714 if (sec->dofs_align != sizeof (uint64_t)) {
12715 dtrace_dof_error(dof, "bad alignment in action description");
12716 return (NULL);
12717 }
12718
12719 if (sec->dofs_size < sec->dofs_entsize) {
12720 dtrace_dof_error(dof, "section entry size exceeds total size");
12721 return (NULL);
12722 }
12723
12724 if (sec->dofs_entsize != sizeof (dof_actdesc_t)) {
12725 dtrace_dof_error(dof, "bad entry size in action description");
12726 return (NULL);
12727 }
12728
12729 if (sec->dofs_size / sec->dofs_entsize > dtrace_actions_max) {
12730 dtrace_dof_error(dof, "actions exceed dtrace_actions_max");
12731 return (NULL);
12732 }
12733
12734 for (offs = 0; offs < sec->dofs_size; offs += sec->dofs_entsize) {
12735 desc = (dof_actdesc_t *)(daddr +
12736 (uintptr_t)sec->dofs_offset + offs);
12737 kind = (dtrace_actkind_t)desc->dofa_kind;
12738
12739 if ((DTRACEACT_ISPRINTFLIKE(kind) &&
12740 (kind != DTRACEACT_PRINTA || desc->dofa_strtab != DOF_SECIDX_NONE)) ||
12741 (kind == DTRACEACT_DIFEXPR && desc->dofa_strtab != DOF_SECIDX_NONE))
12742 {
12743 dof_sec_t *strtab;
12744 char *str, *fmt;
12745 uint64_t i;
12746
12747 /*
12748 * The argument to these actions is an index into the
12749 * DOF string table. For printf()-like actions, this
12750 * is the format string. For print(), this is the
12751 * CTF type of the expression result.
12752 */
12753 if ((strtab = dtrace_dof_sect(dof,
12754 DOF_SECT_STRTAB, desc->dofa_strtab)) == NULL)
12755 goto err;
12756
12757 str = (char *)((uintptr_t)dof +
12758 (uintptr_t)strtab->dofs_offset);
12759
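			/*
			 * Scan from dofa_arg for the string's NUL
			 * terminator, rejecting strings that run off the
			 * end of the string table (or, below, that are
			 * empty).
			 */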
12760 for (i = desc->dofa_arg; i < strtab->dofs_size; i++) {
12761 if (str[i] == '\0')
12762 break;
12763 }
12764
12765 if (i >= strtab->dofs_size) {
12766 dtrace_dof_error(dof, "bogus format string");
12767 goto err;
12768 }
12769
12770 if (i == desc->dofa_arg) {
12771 dtrace_dof_error(dof, "empty format string");
12772 goto err;
12773 }
12774
12775 i -= desc->dofa_arg;
12776 fmt = kmem_alloc(i + 1, KM_SLEEP);
12777 bcopy(&str[desc->dofa_arg], fmt, i + 1);
12778 arg = (uint64_t)(uintptr_t)fmt;
12779 } else {
12780 if (kind == DTRACEACT_PRINTA) {
12781 ASSERT(desc->dofa_strtab == DOF_SECIDX_NONE);
12782 arg = 0;
12783 } else {
12784 arg = desc->dofa_arg;
12785 }
12786 }
12787
12788 act = dtrace_actdesc_create(kind, desc->dofa_ntuple,
12789 desc->dofa_uarg, arg);
12790
12791 if (last != NULL) {
12792 last->dtad_next = act;
12793 } else {
12794 first = act;
12795 }
12796
12797 last = act;
12798
12799 if (desc->dofa_difo == DOF_SECIDX_NONE)
12800 continue;
12801
12802 if ((difosec = dtrace_dof_sect(dof,
12803 DOF_SECT_DIFOHDR, desc->dofa_difo)) == NULL)
12804 goto err;
12805
12806 act->dtad_difo = dtrace_dof_difo(dof, difosec, vstate, cr);
12807
12808 if (act->dtad_difo == NULL)
12809 goto err;
12810 }
12811
12812 ASSERT(first != NULL);
12813 return (first);
12814
12815err:
12816 for (act = first; act != NULL; act = next) {
12817 next = act->dtad_next;
12818 dtrace_actdesc_release(act, vstate);
12819 }
12820
12821 return (NULL);
12822}
12823
12824static dtrace_ecbdesc_t *
12825dtrace_dof_ecbdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
12826 cred_t *cr)
12827{
12828 dtrace_ecbdesc_t *ep;
12829 dof_ecbdesc_t *ecb;
12830 dtrace_probedesc_t *desc;
12831 dtrace_predicate_t *pred = NULL;
12832
12833 if (sec->dofs_size < sizeof (dof_ecbdesc_t)) {
12834 dtrace_dof_error(dof, "truncated ECB description");
12835 return (NULL);
12836 }
12837
12838 if (sec->dofs_align != sizeof (uint64_t)) {
12839 dtrace_dof_error(dof, "bad alignment in ECB description");
12840 return (NULL);
12841 }
12842
12843 ecb = (dof_ecbdesc_t *)((uintptr_t)dof + (uintptr_t)sec->dofs_offset);
12844 sec = dtrace_dof_sect(dof, DOF_SECT_PROBEDESC, ecb->dofe_probes);
12845
12846 if (sec == NULL)
12847 return (NULL);
12848
12849 ep = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
12850 ep->dted_uarg = ecb->dofe_uarg;
12851 desc = &ep->dted_probe;
12852
12853 if (dtrace_dof_probedesc(dof, sec, desc) == NULL)
12854 goto err;
12855
12856 if (ecb->dofe_pred != DOF_SECIDX_NONE) {
12857 if ((sec = dtrace_dof_sect(dof,
12858 DOF_SECT_DIFOHDR, ecb->dofe_pred)) == NULL)
12859 goto err;
12860
12861 if ((pred = dtrace_dof_predicate(dof, sec, vstate, cr)) == NULL)
12862 goto err;
12863
12864 ep->dted_pred.dtpdd_predicate = pred;
12865 }
12866
12867 if (ecb->dofe_actions != DOF_SECIDX_NONE) {
12868 if ((sec = dtrace_dof_sect(dof,
12869 DOF_SECT_ACTDESC, ecb->dofe_actions)) == NULL)
12870 goto err;
12871
12872 ep->dted_action = dtrace_dof_actdesc(dof, sec, vstate, cr);
12873
12874 if (ep->dted_action == NULL)
12875 goto err;
12876 }
12877
12878 return (ep);
12879
12880err:
12881 if (pred != NULL)
12882 dtrace_predicate_release(pred, vstate);
12883 kmem_free(ep, sizeof (dtrace_ecbdesc_t));
12884 return (NULL);
12885}
12886
12887/*
12888 * APPLE NOTE: dyld handles dof relocation.
12889 * Darwin does not need dtrace_dof_relocate()
12890 */
12891
12892/*
12893 * The dof_hdr_t passed to dtrace_dof_slurp() should be a partially validated
12894 * header: it should be at the front of a memory region that is at least
12895 * sizeof (dof_hdr_t) in size -- and then at least dof_hdr.dofh_loadsz in
12896 * size. It need not be validated in any other way.
12897 */
12898static int
12899dtrace_dof_slurp(dof_hdr_t *dof, dtrace_vstate_t *vstate, cred_t *cr,
12900 dtrace_enabling_t **enabp, uint64_t ubase, int noprobes)
12901{
12902#pragma unused(ubase) /* __APPLE__ */
12903 uint64_t len = dof->dofh_loadsz, seclen;
12904 uintptr_t daddr = (uintptr_t)dof;
12905 dtrace_ecbdesc_t *ep;
12906 dtrace_enabling_t *enab;
12907 uint_t i;
12908
12909 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12910 ASSERT(dof->dofh_loadsz >= sizeof (dof_hdr_t));
12911
12912 /*
12913 * Check the DOF header identification bytes. In addition to checking
12914 * valid settings, we also verify that unused bits/bytes are zeroed so
12915 * we can use them later without fear of regressing existing binaries.
12916 */
12917 if (bcmp(&dof->dofh_ident[DOF_ID_MAG0],
12918 DOF_MAG_STRING, DOF_MAG_STRLEN) != 0) {
12919 dtrace_dof_error(dof, "DOF magic string mismatch");
12920 return (-1);
12921 }
12922
12923 if (dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_ILP32 &&
12924 dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_LP64) {
12925 dtrace_dof_error(dof, "DOF has invalid data model");
12926 return (-1);
12927 }
12928
12929 if (dof->dofh_ident[DOF_ID_ENCODING] != DOF_ENCODE_NATIVE) {
12930 dtrace_dof_error(dof, "DOF encoding mismatch");
12931 return (-1);
12932 }
12933
12934 /*
12935 * APPLE NOTE: Darwin only supports DOF_VERSION_3 for now.
12936 */
12937 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_3) {
12938 dtrace_dof_error(dof, "DOF version mismatch");
12939 return (-1);
12940 }
12941
12942 if (dof->dofh_ident[DOF_ID_DIFVERS] != DIF_VERSION_2) {
12943 dtrace_dof_error(dof, "DOF uses unsupported instruction set");
12944 return (-1);
12945 }
12946
12947 if (dof->dofh_ident[DOF_ID_DIFIREG] > DIF_DIR_NREGS) {
12948 dtrace_dof_error(dof, "DOF uses too many integer registers");
12949 return (-1);
12950 }
12951
12952 if (dof->dofh_ident[DOF_ID_DIFTREG] > DIF_DTR_NREGS) {
12953 dtrace_dof_error(dof, "DOF uses too many tuple registers");
12954 return (-1);
12955 }
12956
12957 for (i = DOF_ID_PAD; i < DOF_ID_SIZE; i++) {
12958 if (dof->dofh_ident[i] != 0) {
12959 dtrace_dof_error(dof, "DOF has invalid ident byte set");
12960 return (-1);
12961 }
12962 }
12963
12964 if (dof->dofh_flags & ~DOF_FL_VALID) {
12965 dtrace_dof_error(dof, "DOF has invalid flag bits set");
12966 return (-1);
12967 }
12968
12969 if (dof->dofh_secsize < sizeof(dof_sec_t)) {
12970 dtrace_dof_error(dof, "invalid section header size");
12971 return (-1);
12972 }
12973
12974 /*
12975 * Check that the section headers don't exceed the amount of DOF
12976 * data. Note that we cast the section size and number of sections
12977 * to uint64_t's to prevent possible overflow in the multiplication.
12978 */
12979 seclen = (uint64_t)dof->dofh_secnum * (uint64_t)dof->dofh_secsize;
12980
12981 if (dof->dofh_secoff > len || seclen > len ||
12982 dof->dofh_secoff + seclen > len) {
12983 dtrace_dof_error(dof, "truncated section headers");
12984 return (-1);
12985 }
12986
12987 if (!IS_P2ALIGNED(dof->dofh_secoff, sizeof (uint64_t))) {
12988 dtrace_dof_error(dof, "misaligned section headers");
12989 return (-1);
12990 }
12991
12992 if (!IS_P2ALIGNED(dof->dofh_secsize, sizeof (uint64_t))) {
12993 dtrace_dof_error(dof, "misaligned section size");
12994 return (-1);
12995 }
12996
12997 /*
12998 * Take an initial pass through the section headers to be sure that
12999 * the headers don't have stray offsets. If the 'noprobes' flag is
13000 * set, do not permit sections relating to providers, probes, or args.
13001 */
13002 for (i = 0; i < dof->dofh_secnum; i++) {
13003 dof_sec_t *sec = (dof_sec_t *)(daddr +
13004 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13005
13006 if (noprobes) {
13007 switch (sec->dofs_type) {
13008 case DOF_SECT_PROVIDER:
13009 case DOF_SECT_PROBES:
13010 case DOF_SECT_PRARGS:
13011 case DOF_SECT_PROFFS:
13012 dtrace_dof_error(dof, "illegal sections "
13013 "for enabling");
13014 return (-1);
13015 }
13016 }
13017
13018 if (!(sec->dofs_flags & DOF_SECF_LOAD))
13019 continue; /* just ignore non-loadable sections */
13020
13021 if (sec->dofs_align & (sec->dofs_align - 1)) {
13022 dtrace_dof_error(dof, "bad section alignment");
13023 return (-1);
13024 }
13025
13026 if (sec->dofs_offset & (sec->dofs_align - 1)) {
13027 dtrace_dof_error(dof, "misaligned section");
13028 return (-1);
13029 }
13030
13031 if (sec->dofs_offset > len || sec->dofs_size > len ||
13032 sec->dofs_offset + sec->dofs_size > len) {
13033 dtrace_dof_error(dof, "corrupt section header");
13034 return (-1);
13035 }
13036
13037 if (sec->dofs_type == DOF_SECT_STRTAB && *((char *)daddr +
13038 sec->dofs_offset + sec->dofs_size - 1) != '\0') {
13039 dtrace_dof_error(dof, "non-terminating string table");
13040 return (-1);
13041 }
13042 }
13043
13044 /*
13045 * APPLE NOTE: We have no further relocation to perform.
	 * All DOF values are relative offsets.
13047 */
13048
13049 if ((enab = *enabp) == NULL)
13050 enab = *enabp = dtrace_enabling_create(vstate);
13051
13052 for (i = 0; i < dof->dofh_secnum; i++) {
13053 dof_sec_t *sec = (dof_sec_t *)(daddr +
13054 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13055
13056 if (sec->dofs_type != DOF_SECT_ECBDESC)
13057 continue;
13058
13059 /*
		 * APPLE NOTE: Defend against a gcc 4.0 botch on x86:
		 * not all paths out of the inlined dtrace_dof_ecbdesc()
		 * are checked for a NULL return value.
		 * Check for NULL explicitly here.
13064 */
13065 ep = dtrace_dof_ecbdesc(dof, sec, vstate, cr);
13066 if (ep == NULL) {
13067 dtrace_enabling_destroy(enab);
13068 *enabp = NULL;
13069 return (-1);
13070 }
13071
13072 dtrace_enabling_add(enab, ep);
13073 }
13074
13075 return (0);
13076}
13077
13078/*
13079 * Process DOF for any options. This routine assumes that the DOF has been
13080 * at least processed by dtrace_dof_slurp().
13081 */
13082static int
13083dtrace_dof_options(dof_hdr_t *dof, dtrace_state_t *state)
13084{
13085 uint_t i;
13086 int rval;
13087 uint32_t entsize;
13088 size_t offs;
13089 dof_optdesc_t *desc;
13090
13091 for (i = 0; i < dof->dofh_secnum; i++) {
13092 dof_sec_t *sec = (dof_sec_t *)((uintptr_t)dof +
13093 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13094
13095 if (sec->dofs_type != DOF_SECT_OPTDESC)
13096 continue;
13097
13098 if (sec->dofs_align != sizeof (uint64_t)) {
13099 dtrace_dof_error(dof, "bad alignment in "
13100 "option description");
13101 return (EINVAL);
13102 }
13103
13104 if ((entsize = sec->dofs_entsize) == 0) {
13105 dtrace_dof_error(dof, "zeroed option entry size");
13106 return (EINVAL);
13107 }
13108
13109 if (entsize < sizeof (dof_optdesc_t)) {
13110 dtrace_dof_error(dof, "bad option entry size");
13111 return (EINVAL);
13112 }
13113
13114 for (offs = 0; offs < sec->dofs_size; offs += entsize) {
13115 desc = (dof_optdesc_t *)((uintptr_t)dof +
13116 (uintptr_t)sec->dofs_offset + offs);
13117
13118 if (desc->dofo_strtab != DOF_SECIDX_NONE) {
13119 dtrace_dof_error(dof, "non-zero option string");
13120 return (EINVAL);
13121 }
13122
13123 if (desc->dofo_value == (uint64_t)DTRACEOPT_UNSET) {
13124 dtrace_dof_error(dof, "unset option");
13125 return (EINVAL);
13126 }
13127
13128 if ((rval = dtrace_state_option(state,
13129 desc->dofo_option, desc->dofo_value)) != 0) {
13130 dtrace_dof_error(dof, "rejected option");
13131 return (rval);
13132 }
13133 }
13134 }
13135
13136 return (0);
13137}
13138
13139/*
13140 * DTrace Consumer State Functions
13141 */
13142static int
13143dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size)
13144{
13145 size_t hashsize, maxper, min_size, chunksize = dstate->dtds_chunksize;
13146 void *base;
13147 uintptr_t limit;
13148 dtrace_dynvar_t *dvar, *next, *start;
13149 size_t i;
13150
13151 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
13152 ASSERT(dstate->dtds_base == NULL && dstate->dtds_percpu == NULL);
13153
13154 bzero(dstate, sizeof (dtrace_dstate_t));
13155
13156 if ((dstate->dtds_chunksize = chunksize) == 0)
13157 dstate->dtds_chunksize = DTRACE_DYNVAR_CHUNKSIZE;
13158
13159 VERIFY(dstate->dtds_chunksize < (LONG_MAX - sizeof (dtrace_dynhash_t)));
13160
13161 if (size < (min_size = dstate->dtds_chunksize + sizeof (dtrace_dynhash_t)))
13162 size = min_size;
13163
13164 if ((base = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
13165 return (ENOMEM);
13166
13167 dstate->dtds_size = size;
13168 dstate->dtds_base = base;
13169 dstate->dtds_percpu = kmem_cache_alloc(dtrace_state_cache, KM_SLEEP);
13170 bzero(dstate->dtds_percpu, (int)NCPU * sizeof (dtrace_dstate_percpu_t));
13171
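	/*
	 * The dynamic variable space is carved into a hash table at the
	 * front of the region, followed by the fixed-size chunks that are
	 * strung onto the per-CPU free lists below:
	 *
	 *	base                                           base + size
	 *	+---------------------+---------------------------------+
	 *	| dtds_hash[hashsize] | dtds_chunksize'd dynamic chunks |
	 *	+---------------------+---------------------------------+
	 */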
13172 hashsize = size / (dstate->dtds_chunksize + sizeof (dtrace_dynhash_t));
13173
13174 if (hashsize != 1 && (hashsize & 1))
13175 hashsize--;
13176
13177 dstate->dtds_hashsize = hashsize;
13178 dstate->dtds_hash = dstate->dtds_base;
13179
13180 /*
13181 * Set all of our hash buckets to point to the single sink, and (if
13182 * it hasn't already been set), set the sink's hash value to be the
13183 * sink sentinel value. The sink is needed for dynamic variable
13184 * lookups to know that they have iterated over an entire, valid hash
13185 * chain.
13186 */
13187 for (i = 0; i < hashsize; i++)
13188 dstate->dtds_hash[i].dtdh_chain = &dtrace_dynhash_sink;
13189
13190 if (dtrace_dynhash_sink.dtdv_hashval != DTRACE_DYNHASH_SINK)
13191 dtrace_dynhash_sink.dtdv_hashval = DTRACE_DYNHASH_SINK;
13192
13193 /*
	 * Divide the free list of dynamic-variable chunks evenly among
	 * the NCPU per-CPU free lists.
13196 */
13197 start = (dtrace_dynvar_t *)
13198 ((uintptr_t)base + hashsize * sizeof (dtrace_dynhash_t));
13199 limit = (uintptr_t)base + size;
13200
13201 VERIFY((uintptr_t)start < limit);
13202 VERIFY((uintptr_t)start >= (uintptr_t)base);
13203
13204 maxper = (limit - (uintptr_t)start) / (int)NCPU;
13205 maxper = (maxper / dstate->dtds_chunksize) * dstate->dtds_chunksize;
13206
13207 for (i = 0; i < NCPU; i++) {
13208 dstate->dtds_percpu[i].dtdsc_free = dvar = start;
13209
13210 /*
13211 * If we don't even have enough chunks to make it once through
13212 * NCPUs, we're just going to allocate everything to the first
13213 * CPU. And if we're on the last CPU, we're going to allocate
13214 * whatever is left over. In either case, we set the limit to
13215 * be the limit of the dynamic variable space.
13216 */
13217 if (maxper == 0 || i == NCPU - 1) {
13218 limit = (uintptr_t)base + size;
13219 start = NULL;
13220 } else {
13221 limit = (uintptr_t)start + maxper;
13222 start = (dtrace_dynvar_t *)limit;
13223 }
13224
13225 VERIFY(limit <= (uintptr_t)base + size);
13226
13227 for (;;) {
13228 next = (dtrace_dynvar_t *)((uintptr_t)dvar +
13229 dstate->dtds_chunksize);
13230
13231 if ((uintptr_t)next + dstate->dtds_chunksize >= limit)
13232 break;
13233
13234 VERIFY((uintptr_t)dvar >= (uintptr_t)base &&
13235 (uintptr_t)dvar <= (uintptr_t)base + size);
13236 dvar->dtdv_next = next;
13237 dvar = next;
13238 }
13239
13240 if (maxper == 0)
13241 break;
13242 }
13243
13244 return (0);
13245}
13246
13247static void
13248dtrace_dstate_fini(dtrace_dstate_t *dstate)
13249{
13250 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
13251
13252 if (dstate->dtds_base == NULL)
13253 return;
13254
13255 kmem_free(dstate->dtds_base, dstate->dtds_size);
13256 kmem_cache_free(dtrace_state_cache, dstate->dtds_percpu);
13257}
13258
13259static void
13260dtrace_vstate_fini(dtrace_vstate_t *vstate)
13261{
13262 /*
13263 * Logical XOR, where are you?
13264 */
13265 ASSERT((vstate->dtvs_nglobals == 0) ^ (vstate->dtvs_globals != NULL));
13266
13267 if (vstate->dtvs_nglobals > 0) {
13268 kmem_free(vstate->dtvs_globals, vstate->dtvs_nglobals *
13269 sizeof (dtrace_statvar_t *));
13270 }
13271
13272 if (vstate->dtvs_ntlocals > 0) {
13273 kmem_free(vstate->dtvs_tlocals, vstate->dtvs_ntlocals *
13274 sizeof (dtrace_difv_t));
13275 }
13276
13277 ASSERT((vstate->dtvs_nlocals == 0) ^ (vstate->dtvs_locals != NULL));
13278
13279 if (vstate->dtvs_nlocals > 0) {
13280 kmem_free(vstate->dtvs_locals, vstate->dtvs_nlocals *
13281 sizeof (dtrace_statvar_t *));
13282 }
13283}
13284
13285static void
13286dtrace_state_clean(dtrace_state_t *state)
13287{
13288 if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
13289 return;
13290
13291 dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
13292 dtrace_speculation_clean(state);
13293}
13294
13295static void
13296dtrace_state_deadman(dtrace_state_t *state)
13297{
13298 hrtime_t now;
13299
13300 dtrace_sync();
13301
13302 now = dtrace_gethrtime();
13303
13304 if (state != dtrace_anon.dta_state &&
13305 now - state->dts_laststatus >= dtrace_deadman_user)
13306 return;
13307
13308 /*
13309 * We must be sure that dts_alive never appears to be less than the
13310 * value upon entry to dtrace_state_deadman(), and because we lack a
13311 * dtrace_cas64(), we cannot store to it atomically. We thus instead
13312 * store INT64_MAX to it, followed by a memory barrier, followed by
13313 * the new value. This assures that dts_alive never appears to be
13314 * less than its true value, regardless of the order in which the
13315 * stores to the underlying storage are issued.
13316 */
13317 state->dts_alive = INT64_MAX;
13318 dtrace_membar_producer();
13319 state->dts_alive = now;
13320}
13321
13322static int
13323dtrace_state_create(dev_t *devp, cred_t *cr, dtrace_state_t **new_state)
13324{
13325 minor_t minor;
13326 major_t major;
13327 char c[30];
13328 dtrace_state_t *state;
13329 dtrace_optval_t *opt;
13330 int bufsize = (int)NCPU * sizeof (dtrace_buffer_t), i;
13331
13332 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
13333 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
13334
13335 /* Cause restart */
13336 *new_state = NULL;
13337
13338 if (devp != NULL) {
13339 minor = getminor(*devp);
	} else {
13342 minor = DTRACE_NCLIENTS - 1;
13343 }
13344
13345 state = dtrace_state_allocate(minor);
13346 if (NULL == state) {
13347 printf("dtrace_open: couldn't acquire minor number %d. This usually means that too many DTrace clients are in use at the moment", minor);
13348 return (ERESTART); /* can't reacquire */
13349 }
13350
13351 state->dts_epid = DTRACE_EPIDNONE + 1;
13352
13353 (void) snprintf(c, sizeof (c), "dtrace_aggid_%d", minor);
13354 state->dts_aggid_arena = vmem_create(c, (void *)1, UINT32_MAX, 1,
13355 NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
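	/*
	 * The arena created above exists only to hand out unique
	 * aggregation IDs; basing it at (void *)1 keeps ID 0 free to
	 * serve as a sentinel.
	 */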
13356
13357 if (devp != NULL) {
13358 major = getemajor(*devp);
13359 } else {
13360 major = ddi_driver_major(dtrace_devi);
13361 }
13362
13363 state->dts_dev = makedev(major, minor);
13364
13365 if (devp != NULL)
13366 *devp = state->dts_dev;
13367
13368 /*
13369 * We allocate NCPU buffers. On the one hand, this can be quite
13370 * a bit of memory per instance (nearly 36K on a Starcat). On the
13371 * other hand, it saves an additional memory reference in the probe
13372 * path.
13373 */
13374 state->dts_buffer = kmem_zalloc(bufsize, KM_SLEEP);
13375 state->dts_aggbuffer = kmem_zalloc(bufsize, KM_SLEEP);
13376 state->dts_buf_over_limit = 0;
13377 state->dts_cleaner = CYCLIC_NONE;
13378 state->dts_deadman = CYCLIC_NONE;
13379 state->dts_vstate.dtvs_state = state;
13380
13381 for (i = 0; i < DTRACEOPT_MAX; i++)
13382 state->dts_options[i] = DTRACEOPT_UNSET;
13383
13384 /*
13385 * Set the default options.
13386 */
13387 opt = state->dts_options;
13388 opt[DTRACEOPT_BUFPOLICY] = DTRACEOPT_BUFPOLICY_SWITCH;
13389 opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_AUTO;
13390 opt[DTRACEOPT_NSPEC] = dtrace_nspec_default;
13391 opt[DTRACEOPT_SPECSIZE] = dtrace_specsize_default;
13392 opt[DTRACEOPT_CPU] = (dtrace_optval_t)DTRACE_CPUALL;
13393 opt[DTRACEOPT_STRSIZE] = dtrace_strsize_default;
13394 opt[DTRACEOPT_STACKFRAMES] = dtrace_stackframes_default;
13395 opt[DTRACEOPT_USTACKFRAMES] = dtrace_ustackframes_default;
13396 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_default;
13397 opt[DTRACEOPT_AGGRATE] = dtrace_aggrate_default;
13398 opt[DTRACEOPT_SWITCHRATE] = dtrace_switchrate_default;
13399 opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_default;
13400 opt[DTRACEOPT_JSTACKFRAMES] = dtrace_jstackframes_default;
13401 opt[DTRACEOPT_JSTACKSTRSIZE] = dtrace_jstackstrsize_default;
13402 opt[DTRACEOPT_BUFLIMIT] = dtrace_buflimit_default;
13403
13404 /*
13405 * Depending on the user credentials, we set flag bits which alter probe
13406 * visibility or the amount of destructiveness allowed. In the case of
13407 * actual anonymous tracing, or the possession of all privileges, all of
13408 * the normal checks are bypassed.
13409 */
13410#if defined(__APPLE__)
13411 if (cr != NULL) {
13412 kauth_cred_ref(cr);
13413 state->dts_cred.dcr_cred = cr;
13414 }
13415 if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
13416 if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed()) {
13417 /*
13418 * Allow only proc credentials when DTrace is
13419 * restricted by the current security policy
13420 */
13421 state->dts_cred.dcr_visible = DTRACE_CRV_ALLPROC;
13422 state->dts_cred.dcr_action = DTRACE_CRA_PROC | DTRACE_CRA_PROC_CONTROL | DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
		} else {
13425 state->dts_cred.dcr_visible = DTRACE_CRV_ALL;
13426 state->dts_cred.dcr_action = DTRACE_CRA_ALL;
13427 }
13428 }
13429
13430#else
13431 if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
13432 state->dts_cred.dcr_visible = DTRACE_CRV_ALL;
13433 state->dts_cred.dcr_action = DTRACE_CRA_ALL;
	} else {
13436 /*
13437 * Set up the credentials for this instantiation. We take a
13438 * hold on the credential to prevent it from disappearing on
13439 * us; this in turn prevents the zone_t referenced by this
13440 * credential from disappearing. This means that we can
13441 * examine the credential and the zone from probe context.
13442 */
13443 crhold(cr);
13444 state->dts_cred.dcr_cred = cr;
13445
13446 /*
13447 * CRA_PROC means "we have *some* privilege for dtrace" and
13448 * unlocks the use of variables like pid, zonename, etc.
13449 */
13450 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE) ||
13451 PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
13452 state->dts_cred.dcr_action |= DTRACE_CRA_PROC;
13453 }
13454
13455 /*
13456 * dtrace_user allows use of syscall and profile providers.
13457 * If the user also has proc_owner and/or proc_zone, we
13458 * extend the scope to include additional visibility and
13459 * destructive power.
13460 */
13461 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE)) {
13462 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE)) {
13463 state->dts_cred.dcr_visible |=
13464 DTRACE_CRV_ALLPROC;
13465
13466 state->dts_cred.dcr_action |=
13467 DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
13468 }
13469
13470 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE)) {
13471 state->dts_cred.dcr_visible |=
13472 DTRACE_CRV_ALLZONE;
13473
13474 state->dts_cred.dcr_action |=
13475 DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
13476 }
13477
13478 /*
13479 * If we have all privs in whatever zone this is,
13480 * we can do destructive things to processes which
13481 * have altered credentials.
13482 *
13483 * APPLE NOTE: Darwin doesn't do zones.
13484 * Behave as if zone always has destructive privs.
13485 */
13486
13487 state->dts_cred.dcr_action |=
13488 DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
13489 }
13490
13491 /*
13492 * Holding the dtrace_kernel privilege also implies that
13493 * the user has the dtrace_user privilege from a visibility
13494 * perspective. But without further privileges, some
13495 * destructive actions are not available.
13496 */
13497 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE)) {
13498 /*
13499 * Make all probes in all zones visible. However,
13500 * this doesn't mean that all actions become available
13501 * to all zones.
13502 */
13503 state->dts_cred.dcr_visible |= DTRACE_CRV_KERNEL |
13504 DTRACE_CRV_ALLPROC | DTRACE_CRV_ALLZONE;
13505
13506 state->dts_cred.dcr_action |= DTRACE_CRA_KERNEL |
13507 DTRACE_CRA_PROC;
13508 /*
13509 * Holding proc_owner means that destructive actions
13510 * for *this* zone are allowed.
13511 */
13512 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
13513 state->dts_cred.dcr_action |=
13514 DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
13515
13516 /*
13517 * Holding proc_zone means that destructive actions
13518 * for this user/group ID in all zones is allowed.
13519 */
13520 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
13521 state->dts_cred.dcr_action |=
13522 DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
13523
13524 /*
13525 * If we have all privs in whatever zone this is,
13526 * we can do destructive things to processes which
13527 * have altered credentials.
13528 *
13529 * APPLE NOTE: Darwin doesn't do zones.
13530 * Behave as if zone always has destructive privs.
13531 */
13532 state->dts_cred.dcr_action |=
13533 DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
13534 }
13535
13536 /*
13537 * Holding the dtrace_proc privilege gives control over fasttrap
13538 * and pid providers. We need to grant wider destructive
13539 * privileges in the event that the user has proc_owner and/or
13540 * proc_zone.
13541 */
13542 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
13543 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
13544 state->dts_cred.dcr_action |=
13545 DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
13546
13547 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
13548 state->dts_cred.dcr_action |=
13549 DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
13550 }
13551 }
13552#endif
13553
13554 *new_state = state;
13555 return(0); /* Success */
13556}
13557
13558static int
13559dtrace_state_buffer(dtrace_state_t *state, dtrace_buffer_t *buf, int which)
13560{
13561 dtrace_optval_t *opt = state->dts_options, size;
13562 processorid_t cpu = 0;
13563 size_t limit = buf->dtb_size;
13564 int flags = 0, rval;
13565
13566 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
13567 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
13568 ASSERT(which < DTRACEOPT_MAX);
13569 ASSERT(state->dts_activity == DTRACE_ACTIVITY_INACTIVE ||
13570 (state == dtrace_anon.dta_state &&
13571 state->dts_activity == DTRACE_ACTIVITY_ACTIVE));
13572
13573 if (opt[which] == DTRACEOPT_UNSET || opt[which] == 0)
13574 return (0);
13575
13576 if (opt[DTRACEOPT_CPU] != DTRACEOPT_UNSET)
13577 cpu = opt[DTRACEOPT_CPU];
13578
13579 if (which == DTRACEOPT_SPECSIZE)
13580 flags |= DTRACEBUF_NOSWITCH;
13581
13582 if (which == DTRACEOPT_BUFSIZE) {
13583 if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_RING)
13584 flags |= DTRACEBUF_RING;
13585
13586 if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_FILL)
13587 flags |= DTRACEBUF_FILL;
13588
13589 if (state != dtrace_anon.dta_state ||
13590 state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
13591 flags |= DTRACEBUF_INACTIVE;
13592 }
13593
13594 for (size = opt[which]; (size_t)size >= sizeof (uint64_t); size >>= 1) {
13595 /*
13596 * The size must be 8-byte aligned. If the size is not 8-byte
13597 * aligned, drop it down by the difference.
13598 */
13599 if (size & (sizeof (uint64_t) - 1))
13600 size -= size & (sizeof (uint64_t) - 1);
13601
13602 if (size < state->dts_reserve) {
13603 /*
			 * Buffers must always be large enough to accommodate
			 * their prereserved space. We return E2BIG instead
			 * of ENOMEM in this case to allow user-level
			 * software to differentiate the cases.
13608 */
13609 return (E2BIG);
13610 }
13611 limit = opt[DTRACEOPT_BUFLIMIT] * size / 100;
13612 rval = dtrace_buffer_alloc(buf, limit, size, flags, cpu);
13613
13614 if (rval != ENOMEM) {
13615 opt[which] = size;
13616 return (rval);
13617 }
13618
13619 if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
13620 return (rval);
13621 }
13622
13623 return (ENOMEM);
13624}
13625
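/*
 * Allocate the principal buffer, the aggregation buffer, and the buffer for
 * each speculation, returning the first error encountered (if any).
 */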
13626static int
13627dtrace_state_buffers(dtrace_state_t *state)
13628{
13629 dtrace_speculation_t *spec = state->dts_speculations;
13630 int rval, i;
13631
13632 if ((rval = dtrace_state_buffer(state, state->dts_buffer,
13633 DTRACEOPT_BUFSIZE)) != 0)
13634 return (rval);
13635
13636 if ((rval = dtrace_state_buffer(state, state->dts_aggbuffer,
13637 DTRACEOPT_AGGSIZE)) != 0)
13638 return (rval);
13639
13640 for (i = 0; i < state->dts_nspeculations; i++) {
13641 if ((rval = dtrace_state_buffer(state,
13642 spec[i].dtsp_buffer, DTRACEOPT_SPECSIZE)) != 0)
13643 return (rval);
13644 }
13645
13646 return (0);
13647}
13648
13649static void
13650dtrace_state_prereserve(dtrace_state_t *state)
13651{
13652 dtrace_ecb_t *ecb;
13653 dtrace_probe_t *probe;
13654
13655 state->dts_reserve = 0;
13656
13657 if (state->dts_options[DTRACEOPT_BUFPOLICY] != DTRACEOPT_BUFPOLICY_FILL)
13658 return;
13659
13660 /*
13661 * If our buffer policy is a "fill" buffer policy, we need to set the
13662 * prereserved space to be the space required by the END probes.
13663 */
13664 probe = dtrace_probes[dtrace_probeid_end - 1];
13665 ASSERT(probe != NULL);
13666
13667 for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
13668 if (ecb->dte_state != state)
13669 continue;
13670
13671 state->dts_reserve += ecb->dte_needed + ecb->dte_alignment;
13672 }
13673}
13674
13675static int
13676dtrace_state_go(dtrace_state_t *state, processorid_t *cpu)
13677{
13678 dtrace_optval_t *opt = state->dts_options, sz, nspec;
13679 dtrace_speculation_t *spec;
13680 dtrace_buffer_t *buf;
13681 cyc_handler_t hdlr;
13682 cyc_time_t when;
13683 int rval = 0, i, bufsize = (int)NCPU * sizeof (dtrace_buffer_t);
13684 dtrace_icookie_t cookie;
13685
13686 lck_mtx_lock(&cpu_lock);
13687 lck_mtx_lock(&dtrace_lock);
13688
13689 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
13690 rval = EBUSY;
13691 goto out;
13692 }
13693
13694 /*
13695 * Before we can perform any checks, we must prime all of the
13696 * retained enablings that correspond to this state.
13697 */
13698 dtrace_enabling_prime(state);
13699
13700 if (state->dts_destructive && !state->dts_cred.dcr_destructive) {
13701 rval = EACCES;
13702 goto out;
13703 }
13704
13705 dtrace_state_prereserve(state);
13706
13707 /*
	 * Now we want to try to allocate our speculations.
13709 * We do not automatically resize the number of speculations; if
13710 * this fails, we will fail the operation.
13711 */
13712 nspec = opt[DTRACEOPT_NSPEC];
13713 ASSERT(nspec != DTRACEOPT_UNSET);
13714
13715 if (nspec > INT_MAX) {
13716 rval = ENOMEM;
13717 goto out;
13718 }
13719
13720 spec = kmem_zalloc(nspec * sizeof (dtrace_speculation_t), KM_NOSLEEP);
13721
13722 if (spec == NULL) {
13723 rval = ENOMEM;
13724 goto out;
13725 }
13726
13727 state->dts_speculations = spec;
13728 state->dts_nspeculations = (int)nspec;
13729
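	/*
	 * Allocate the per-CPU buffer array for each speculation.
	 */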
13730 for (i = 0; i < nspec; i++) {
13731 if ((buf = kmem_zalloc(bufsize, KM_NOSLEEP)) == NULL) {
13732 rval = ENOMEM;
13733 goto err;
13734 }
13735
13736 spec[i].dtsp_buffer = buf;
13737 }
13738
13739 if (opt[DTRACEOPT_GRABANON] != DTRACEOPT_UNSET) {
13740 if (dtrace_anon.dta_state == NULL) {
13741 rval = ENOENT;
13742 goto out;
13743 }
13744
13745 if (state->dts_necbs != 0) {
13746 rval = EALREADY;
13747 goto out;
13748 }
13749
13750 state->dts_anon = dtrace_anon_grab();
13751 ASSERT(state->dts_anon != NULL);
13752 state = state->dts_anon;
13753
13754 /*
13755 * We want "grabanon" to be set in the grabbed state, so we'll
13756 * copy that option value from the grabbing state into the
13757 * grabbed state.
13758 */
13759 state->dts_options[DTRACEOPT_GRABANON] =
13760 opt[DTRACEOPT_GRABANON];
13761
13762 *cpu = dtrace_anon.dta_beganon;
13763
13764 /*
13765 * If the anonymous state is active (as it almost certainly
13766 * is if the anonymous enabling ultimately matched anything),
13767 * we don't allow any further option processing -- but we
13768 * don't return failure.
13769 */
13770 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
13771 goto out;
13772 }
13773
13774 if (opt[DTRACEOPT_AGGSIZE] != DTRACEOPT_UNSET &&
13775 opt[DTRACEOPT_AGGSIZE] != 0) {
13776 if (state->dts_aggregations == NULL) {
13777 /*
13778 * We're not going to create an aggregation buffer
13779 * because we don't have any ECBs that contain
13780 * aggregations -- set this option to 0.
13781 */
13782 opt[DTRACEOPT_AGGSIZE] = 0;
13783 } else {
13784 /*
13785 * If we have an aggregation buffer, we must also have
13786 * a buffer to use as scratch.
13787 */
13788 if (opt[DTRACEOPT_BUFSIZE] == DTRACEOPT_UNSET ||
13789 (size_t)opt[DTRACEOPT_BUFSIZE] < state->dts_needed) {
13790 opt[DTRACEOPT_BUFSIZE] = state->dts_needed;
13791 }
13792 }
13793 }
13794
13795 if (opt[DTRACEOPT_SPECSIZE] != DTRACEOPT_UNSET &&
13796 opt[DTRACEOPT_SPECSIZE] != 0) {
13797 if (!state->dts_speculates) {
13798 /*
13799 * We're not going to create speculation buffers
13800 * because we don't have any ECBs that actually
13801 * speculate -- set the speculation size to 0.
13802 */
13803 opt[DTRACEOPT_SPECSIZE] = 0;
13804 }
13805 }
13806
13807 /*
13808 * The bare minimum size for any buffer that we're actually going to
13809 * do anything to is sizeof (uint64_t).
13810 */
13811 sz = sizeof (uint64_t);
13812
13813 if ((state->dts_needed != 0 && opt[DTRACEOPT_BUFSIZE] < sz) ||
13814 (state->dts_speculates && opt[DTRACEOPT_SPECSIZE] < sz) ||
13815 (state->dts_aggregations != NULL && opt[DTRACEOPT_AGGSIZE] < sz)) {
13816 /*
13817 * A buffer size has been explicitly set to 0 (or to a size
13818 * that will be adjusted to 0) and we need the space -- we
13819 * need to return failure. We return ENOSPC to differentiate
13820 * it from failing to allocate a buffer due to failure to meet
13821 * the reserve (for which we return E2BIG).
13822 */
13823 rval = ENOSPC;
13824 goto out;
13825 }
13826
13827 if ((rval = dtrace_state_buffers(state)) != 0)
13828 goto err;
13829
13830 if ((sz = opt[DTRACEOPT_DYNVARSIZE]) == DTRACEOPT_UNSET)
13831 sz = dtrace_dstate_defsize;
13832
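	/*
	 * As with the principal buffers, if we can't allocate the dynamic
	 * variable space at the requested size, we halve the size and retry --
	 * unless the buffer resizing policy is "manual", in which case we
	 * fail immediately.
	 */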
13833 do {
13834 rval = dtrace_dstate_init(&state->dts_vstate.dtvs_dynvars, sz);
13835
13836 if (rval == 0)
13837 break;
13838
13839 if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
13840 goto err;
13841 } while (sz >>= 1);
13842
13843 opt[DTRACEOPT_DYNVARSIZE] = sz;
13844
13845 if (rval != 0)
13846 goto err;
13847
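	/*
	 * Clamp the status rate, cleaning rate, string size, and buffer limit
	 * options to their permitted ranges.
	 */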
13848 if (opt[DTRACEOPT_STATUSRATE] > dtrace_statusrate_max)
13849 opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_max;
13850
13851 if (opt[DTRACEOPT_CLEANRATE] == 0)
13852 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
13853
13854 if (opt[DTRACEOPT_CLEANRATE] < dtrace_cleanrate_min)
13855 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_min;
13856
13857 if (opt[DTRACEOPT_CLEANRATE] > dtrace_cleanrate_max)
13858 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
13859
13860 if (opt[DTRACEOPT_STRSIZE] > dtrace_strsize_max)
13861 opt[DTRACEOPT_STRSIZE] = dtrace_strsize_max;
13862
13863 if (opt[DTRACEOPT_STRSIZE] < dtrace_strsize_min)
13864 opt[DTRACEOPT_STRSIZE] = dtrace_strsize_min;
13865
13866 if (opt[DTRACEOPT_BUFLIMIT] > dtrace_buflimit_max)
13867 opt[DTRACEOPT_BUFLIMIT] = dtrace_buflimit_max;
13868
13869 if (opt[DTRACEOPT_BUFLIMIT] < dtrace_buflimit_min)
13870 opt[DTRACEOPT_BUFLIMIT] = dtrace_buflimit_min;
13871
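	/*
	 * Schedule the buffer-cleaning cyclic at the configured cleaning
	 * rate, and the deadman cyclic at the deadman interval.
	 */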
13872 hdlr.cyh_func = (cyc_func_t)dtrace_state_clean;
13873 hdlr.cyh_arg = state;
13874 hdlr.cyh_level = CY_LOW_LEVEL;
13875
13876 when.cyt_when = 0;
13877 when.cyt_interval = opt[DTRACEOPT_CLEANRATE];
13878
13879 state->dts_cleaner = cyclic_add(&hdlr, &when);
13880
13881 hdlr.cyh_func = (cyc_func_t)dtrace_state_deadman;
13882 hdlr.cyh_arg = state;
13883 hdlr.cyh_level = CY_LOW_LEVEL;
13884
13885 when.cyt_when = 0;
13886 when.cyt_interval = dtrace_deadman_interval;
13887
13888 state->dts_alive = state->dts_laststatus = dtrace_gethrtime();
13889 state->dts_deadman = cyclic_add(&hdlr, &when);
13890
13891 state->dts_activity = DTRACE_ACTIVITY_WARMUP;
13892
13893 /*
13894 * Now it's time to actually fire the BEGIN probe. We need to disable
13895 * interrupts here both to record the CPU on which we fired the BEGIN
13896 * probe (the data from this CPU will be processed first at user
13897 * level) and to manually activate the buffer for this CPU.
13898 */
13899 cookie = dtrace_interrupt_disable();
13900 *cpu = CPU->cpu_id;
13901 ASSERT(state->dts_buffer[*cpu].dtb_flags & DTRACEBUF_INACTIVE);
13902 state->dts_buffer[*cpu].dtb_flags &= ~DTRACEBUF_INACTIVE;
13903
13904 dtrace_probe(dtrace_probeid_begin,
13905 (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
13906 dtrace_interrupt_enable(cookie);
13907 /*
13908 * We may have had an exit action from a BEGIN probe; only change our
13909 * state to ACTIVE if we're still in WARMUP.
13910 */
13911 ASSERT(state->dts_activity == DTRACE_ACTIVITY_WARMUP ||
13912 state->dts_activity == DTRACE_ACTIVITY_DRAINING);
13913
13914 if (state->dts_activity == DTRACE_ACTIVITY_WARMUP)
13915 state->dts_activity = DTRACE_ACTIVITY_ACTIVE;
13916
13917 /*
	 * Regardless of whether we're now in ACTIVE or DRAINING, we
13919 * want each CPU to transition its principal buffer out of the
13920 * INACTIVE state. Doing this assures that no CPU will suddenly begin
13921 * processing an ECB halfway down a probe's ECB chain; all CPUs will
13922 * atomically transition from processing none of a state's ECBs to
13923 * processing all of them.
13924 */
13925 dtrace_xcall(DTRACE_CPUALL,
13926 (dtrace_xcall_t)dtrace_buffer_activate, state);
13927 goto out;
13928
13929err:
13930 dtrace_buffer_free(state->dts_buffer);
13931 dtrace_buffer_free(state->dts_aggbuffer);
13932
13933 if ((nspec = state->dts_nspeculations) == 0) {
13934 ASSERT(state->dts_speculations == NULL);
13935 goto out;
13936 }
13937
13938 spec = state->dts_speculations;
13939 ASSERT(spec != NULL);
13940
13941 for (i = 0; i < state->dts_nspeculations; i++) {
13942 if ((buf = spec[i].dtsp_buffer) == NULL)
13943 break;
13944
13945 dtrace_buffer_free(buf);
13946 kmem_free(buf, bufsize);
13947 }
13948
13949 kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
13950 state->dts_nspeculations = 0;
13951 state->dts_speculations = NULL;
13952
13953out:
13954 lck_mtx_unlock(&dtrace_lock);
13955 lck_mtx_unlock(&cpu_lock);
13956
13957 return (rval);
13958}
13959
13960static int
13961dtrace_state_stop(dtrace_state_t *state, processorid_t *cpu)
13962{
13963 dtrace_icookie_t cookie;
13964
13965 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
13966
13967 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE &&
13968 state->dts_activity != DTRACE_ACTIVITY_DRAINING)
13969 return (EINVAL);
13970
13971 /*
13972 * We'll set the activity to DTRACE_ACTIVITY_DRAINING, and issue a sync
13973 * to be sure that every CPU has seen it. See below for the details
13974 * on why this is done.
13975 */
13976 state->dts_activity = DTRACE_ACTIVITY_DRAINING;
13977 dtrace_sync();
13978
13979 /*
13980 * By this point, it is impossible for any CPU to be still processing
13981 * with DTRACE_ACTIVITY_ACTIVE. We can thus set our activity to
13982 * DTRACE_ACTIVITY_COOLDOWN and know that we're not racing with any
13983 * other CPU in dtrace_buffer_reserve(). This allows dtrace_probe()
13984 * and callees to know that the activity is DTRACE_ACTIVITY_COOLDOWN
13985 * iff we're in the END probe.
13986 */
13987 state->dts_activity = DTRACE_ACTIVITY_COOLDOWN;
13988 dtrace_sync();
13989 ASSERT(state->dts_activity == DTRACE_ACTIVITY_COOLDOWN);
13990
13991 /*
13992 * Finally, we can release the reserve and call the END probe. We
13993 * disable interrupts across calling the END probe to allow us to
13994 * return the CPU on which we actually called the END probe. This
13995 * allows user-land to be sure that this CPU's principal buffer is
13996 * processed last.
13997 */
13998 state->dts_reserve = 0;
13999
14000 cookie = dtrace_interrupt_disable();
14001 *cpu = CPU->cpu_id;
14002 dtrace_probe(dtrace_probeid_end,
14003 (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
14004 dtrace_interrupt_enable(cookie);
14005
14006 state->dts_activity = DTRACE_ACTIVITY_STOPPED;
14007 dtrace_sync();
14008
14009 return (0);
14010}
14011
14012static int
14013dtrace_state_option(dtrace_state_t *state, dtrace_optid_t option,
14014 dtrace_optval_t val)
14015{
14016 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14017
14018 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
14019 return (EBUSY);
14020
14021 if (option >= DTRACEOPT_MAX)
14022 return (EINVAL);
14023
14024 if (option != DTRACEOPT_CPU && val < 0)
14025 return (EINVAL);
14026
14027 switch (option) {
14028 case DTRACEOPT_DESTRUCTIVE:
14029 /*
14030 * Prevent consumers from enabling destructive actions if DTrace
14031 * is running in a restricted environment, or if actions are
14032 * disallowed.
14033 */
14034 if (dtrace_is_restricted() || dtrace_destructive_disallow)
14035 return (EACCES);
14036
14037 state->dts_cred.dcr_destructive = 1;
14038 break;
14039
14040 case DTRACEOPT_BUFSIZE:
14041 case DTRACEOPT_DYNVARSIZE:
14042 case DTRACEOPT_AGGSIZE:
14043 case DTRACEOPT_SPECSIZE:
14044 case DTRACEOPT_STRSIZE:
14045 if (val < 0)
14046 return (EINVAL);
14047
14048 if (val >= LONG_MAX) {
14049 /*
14050 * If this is an otherwise negative value, set it to
14051 * the highest multiple of 128m less than LONG_MAX.
14052 * Technically, we're adjusting the size without
14053 * regard to the buffer resizing policy, but in fact,
14054 * this has no effect -- if we set the buffer size to
14055 * ~LONG_MAX and the buffer policy is ultimately set to
14056 * be "manual", the buffer allocation is guaranteed to
14057 * fail, if only because the allocation requires two
			 * buffers. (We set the size to the highest
14059 * multiple of 128m because it ensures that the size
14060 * will remain a multiple of a megabyte when
14061 * repeatedly halved -- all the way down to 15m.)
14062 */
14063 val = LONG_MAX - (1 << 27) + 1;
14064 }
14065 }
14066
14067 state->dts_options[option] = val;
14068
14069 return (0);
14070}
14071
14072static void
14073dtrace_state_destroy(dtrace_state_t *state)
14074{
14075 dtrace_ecb_t *ecb;
14076 dtrace_vstate_t *vstate = &state->dts_vstate;
14077 minor_t minor = getminor(state->dts_dev);
14078 int i, bufsize = (int)NCPU * sizeof (dtrace_buffer_t);
14079 dtrace_speculation_t *spec = state->dts_speculations;
14080 int nspec = state->dts_nspeculations;
14081 uint32_t match;
14082
14083 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14084 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
14085
14086 /*
14087 * First, retract any retained enablings for this state.
14088 */
14089 dtrace_enabling_retract(state);
14090 ASSERT(state->dts_nretained == 0);
14091
14092 if (state->dts_activity == DTRACE_ACTIVITY_ACTIVE ||
14093 state->dts_activity == DTRACE_ACTIVITY_DRAINING) {
14094 /*
14095 * We have managed to come into dtrace_state_destroy() on a
14096 * hot enabling -- almost certainly because of a disorderly
14097 * shutdown of a consumer. (That is, a consumer that is
14098 * exiting without having called dtrace_stop().) In this case,
14099 * we're going to set our activity to be KILLED, and then
14100 * issue a sync to be sure that everyone is out of probe
14101 * context before we start blowing away ECBs.
14102 */
14103 state->dts_activity = DTRACE_ACTIVITY_KILLED;
14104 dtrace_sync();
14105 }
14106
14107 /*
14108 * Release the credential hold we took in dtrace_state_create().
14109 */
14110 if (state->dts_cred.dcr_cred != NULL)
14111 kauth_cred_unref(&state->dts_cred.dcr_cred);
14112
14113 /*
14114 * Now we can safely disable and destroy any enabled probes. Because
14115 * any DTRACE_PRIV_KERNEL probes may actually be slowing our progress
14116 * (especially if they're all enabled), we take two passes through the
14117 * ECBs: in the first, we disable just DTRACE_PRIV_KERNEL probes, and
14118 * in the second we disable whatever is left over.
14119 */
14120 for (match = DTRACE_PRIV_KERNEL; ; match = 0) {
14121 for (i = 0; i < state->dts_necbs; i++) {
14122 if ((ecb = state->dts_ecbs[i]) == NULL)
14123 continue;
14124
14125 if (match && ecb->dte_probe != NULL) {
14126 dtrace_probe_t *probe = ecb->dte_probe;
14127 dtrace_provider_t *prov = probe->dtpr_provider;
14128
14129 if (!(prov->dtpv_priv.dtpp_flags & match))
14130 continue;
14131 }
14132
14133 dtrace_ecb_disable(ecb);
14134 dtrace_ecb_destroy(ecb);
14135 }
14136
14137 if (!match)
14138 break;
14139 }
14140
14141 /*
14142 * Before we free the buffers, perform one more sync to assure that
14143 * every CPU is out of probe context.
14144 */
14145 dtrace_sync();
14146
14147 dtrace_buffer_free(state->dts_buffer);
14148 dtrace_buffer_free(state->dts_aggbuffer);
14149
14150 for (i = 0; i < nspec; i++)
14151 dtrace_buffer_free(spec[i].dtsp_buffer);
14152
14153 if (state->dts_cleaner != CYCLIC_NONE)
14154 cyclic_remove(state->dts_cleaner);
14155
14156 if (state->dts_deadman != CYCLIC_NONE)
14157 cyclic_remove(state->dts_deadman);
14158
14159 dtrace_dstate_fini(&vstate->dtvs_dynvars);
14160 dtrace_vstate_fini(vstate);
14161 kmem_free(state->dts_ecbs, state->dts_necbs * sizeof (dtrace_ecb_t *));
14162
14163 if (state->dts_aggregations != NULL) {
14164#if DEBUG
14165 for (i = 0; i < state->dts_naggregations; i++)
14166 ASSERT(state->dts_aggregations[i] == NULL);
14167#endif
14168 ASSERT(state->dts_naggregations > 0);
14169 kmem_free(state->dts_aggregations,
14170 state->dts_naggregations * sizeof (dtrace_aggregation_t *));
14171 }
14172
14173 kmem_free(state->dts_buffer, bufsize);
14174 kmem_free(state->dts_aggbuffer, bufsize);
14175
14176 for (i = 0; i < nspec; i++)
14177 kmem_free(spec[i].dtsp_buffer, bufsize);
14178
14179 kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
14180
14181 dtrace_format_destroy(state);
14182
14183 vmem_destroy(state->dts_aggid_arena);
14184 dtrace_state_free(minor);
14185}
14186
14187/*
14188 * DTrace Anonymous Enabling Functions
14189 */
14190
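/*
 * Determine whether kernel symbols should be retained: never in a restricted
 * environment (unless restrictions have been relaxed), and otherwise only
 * when the kernel symbol mode demands that symbols always come from the
 * kernel.
 */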
14191int
14192dtrace_keep_kernel_symbols(void)
14193{
14194 if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed()) {
14195 return 0;
14196 }
14197
14198 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL)
14199 return 1;
14200
14201 return 0;
14202}
14203
14204static dtrace_state_t *
14205dtrace_anon_grab(void)
14206{
14207 dtrace_state_t *state;
14208
14209 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14210
14211 if ((state = dtrace_anon.dta_state) == NULL) {
14212 ASSERT(dtrace_anon.dta_enabling == NULL);
14213 return (NULL);
14214 }
14215
14216 ASSERT(dtrace_anon.dta_enabling != NULL);
14217 ASSERT(dtrace_retained != NULL);
14218
14219 dtrace_enabling_destroy(dtrace_anon.dta_enabling);
14220 dtrace_anon.dta_enabling = NULL;
14221 dtrace_anon.dta_state = NULL;
14222
14223 return (state);
14224}
14225
14226static void
14227dtrace_anon_property(void)
14228{
14229 int i, rv;
14230 dtrace_state_t *state;
14231 dof_hdr_t *dof;
14232 char c[32]; /* enough for "dof-data-" + digits */
14233
14234 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14235 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
14236
14237 for (i = 0; ; i++) {
14238 (void) snprintf(c, sizeof (c), "dof-data-%d", i);
14239
14240 dtrace_err_verbose = 1;
14241
14242 if ((dof = dtrace_dof_property(c)) == NULL) {
14243 dtrace_err_verbose = 0;
14244 break;
14245 }
14246
14247#ifdef illumos
14248 /*
14249 * We want to create anonymous state, so we need to transition
14250 * the kernel debugger to indicate that DTrace is active. If
14251 * this fails (e.g. because the debugger has modified text in
14252 * some way), we won't continue with the processing.
14253 */
14254 if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
14255 cmn_err(CE_NOTE, "kernel debugger active; anonymous "
14256 "enabling ignored.");
14257 dtrace_dof_destroy(dof);
14258 break;
14259 }
14260#endif
14261
14262 /*
14263 * If we haven't allocated an anonymous state, we'll do so now.
14264 */
14265 if ((state = dtrace_anon.dta_state) == NULL) {
14266 rv = dtrace_state_create(NULL, NULL, &state);
14267 dtrace_anon.dta_state = state;
14268 if (rv != 0 || state == NULL) {
14269 /*
14270 * This basically shouldn't happen: the only
14271 * failure mode from dtrace_state_create() is a
14272 * failure of ddi_soft_state_zalloc() that
14273 * itself should never happen. Still, the
14274 * interface allows for a failure mode, and
14275 * we want to fail as gracefully as possible:
14276 * we'll emit an error message and cease
14277 * processing anonymous state in this case.
14278 */
14279 cmn_err(CE_WARN, "failed to create "
14280 "anonymous state");
14281 dtrace_dof_destroy(dof);
14282 break;
14283 }
14284 }
14285
14286 rv = dtrace_dof_slurp(dof, &state->dts_vstate, CRED(),
14287 &dtrace_anon.dta_enabling, 0, B_TRUE);
14288
14289 if (rv == 0)
14290 rv = dtrace_dof_options(dof, state);
14291
14292 dtrace_err_verbose = 0;
14293 dtrace_dof_destroy(dof);
14294
14295 if (rv != 0) {
14296 /*
14297 * This is malformed DOF; chuck any anonymous state
14298 * that we created.
14299 */
14300 ASSERT(dtrace_anon.dta_enabling == NULL);
14301 dtrace_state_destroy(state);
14302 dtrace_anon.dta_state = NULL;
14303 break;
14304 }
14305
14306 ASSERT(dtrace_anon.dta_enabling != NULL);
14307 }
14308
14309 if (dtrace_anon.dta_enabling != NULL) {
14310 int rval;
14311
14312 /*
14313 * dtrace_enabling_retain() can only fail because we are
14314 * trying to retain more enablings than are allowed -- but
14315 * we only have one anonymous enabling, and we are guaranteed
14316 * to be allowed at least one retained enabling; we assert
14317 * that dtrace_enabling_retain() returns success.
14318 */
14319 rval = dtrace_enabling_retain(dtrace_anon.dta_enabling);
14320 ASSERT(rval == 0);
14321
14322 dtrace_enabling_dump(dtrace_anon.dta_enabling);
14323 }
14324}
14325
14326/*
14327 * DTrace Helper Functions
14328 */
14329static void
14330dtrace_helper_trace(dtrace_helper_action_t *helper,
14331 dtrace_mstate_t *mstate, dtrace_vstate_t *vstate, int where)
14332{
14333 uint32_t size, next, nnext;
14334 int i;
14335 dtrace_helptrace_t *ent;
14336 uint16_t flags = cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
14337
14338 if (!dtrace_helptrace_enabled)
14339 return;
14340
14341 ASSERT((uint32_t)vstate->dtvs_nlocals <= dtrace_helptrace_nlocals);
14342
14343 /*
14344 * What would a tracing framework be without its own tracing
14345 * framework? (Well, a hell of a lot simpler, for starters...)
14346 */
14347 size = sizeof (dtrace_helptrace_t) + dtrace_helptrace_nlocals *
14348 sizeof (uint64_t) - sizeof (uint64_t);
14349
14350 /*
14351 * Iterate until we can allocate a slot in the trace buffer.
14352 */
14353 do {
14354 next = dtrace_helptrace_next;
14355
14356 if (next + size < dtrace_helptrace_bufsize) {
14357 nnext = next + size;
14358 } else {
14359 nnext = size;
14360 }
14361 } while (dtrace_cas32(&dtrace_helptrace_next, next, nnext) != next);
14362
14363 /*
14364 * We have our slot; fill it in.
14365 */
14366 if (nnext == size)
14367 next = 0;
14368
14369 ent = (dtrace_helptrace_t *)&dtrace_helptrace_buffer[next];
14370 ent->dtht_helper = helper;
14371 ent->dtht_where = where;
14372 ent->dtht_nlocals = vstate->dtvs_nlocals;
14373
14374 ent->dtht_fltoffs = (mstate->dtms_present & DTRACE_MSTATE_FLTOFFS) ?
14375 mstate->dtms_fltoffs : -1;
14376 ent->dtht_fault = DTRACE_FLAGS2FLT(flags);
14377 ent->dtht_illval = cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
14378
14379 for (i = 0; i < vstate->dtvs_nlocals; i++) {
14380 dtrace_statvar_t *svar;
14381
14382 if ((svar = vstate->dtvs_locals[i]) == NULL)
14383 continue;
14384
14385 ASSERT(svar->dtsv_size >= (int)NCPU * sizeof (uint64_t));
14386 ent->dtht_locals[i] =
14387 ((uint64_t *)(uintptr_t)svar->dtsv_data)[CPU->cpu_id];
14388 }
14389}
14390
14391static uint64_t
14392dtrace_helper(int which, dtrace_mstate_t *mstate,
14393 dtrace_state_t *state, uint64_t arg0, uint64_t arg1)
14394{
14395 uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
14396 uint64_t sarg0 = mstate->dtms_arg[0];
14397 uint64_t sarg1 = mstate->dtms_arg[1];
14398 uint64_t rval = 0;
14399 dtrace_helpers_t *helpers = curproc->p_dtrace_helpers;
14400 dtrace_helper_action_t *helper;
14401 dtrace_vstate_t *vstate;
14402 dtrace_difo_t *pred;
14403 int i, trace = dtrace_helptrace_enabled;
14404
14405 ASSERT(which >= 0 && which < DTRACE_NHELPER_ACTIONS);
14406
14407 if (helpers == NULL)
14408 return (0);
14409
14410 if ((helper = helpers->dthps_actions[which]) == NULL)
14411 return (0);
14412
14413 vstate = &helpers->dthps_vstate;
14414 mstate->dtms_arg[0] = arg0;
14415 mstate->dtms_arg[1] = arg1;
14416
14417 /*
14418 * Now iterate over each helper. If its predicate evaluates to 'true',
14419 * we'll call the corresponding actions. Note that the below calls
14420 * to dtrace_dif_emulate() may set faults in machine state. This is
	 * okay: our caller (the outer dtrace_dif_emulate()) will simply plow
	 * over the stored DIF offset with its own (which is the desired
	 * behavior).
14423 * Also, note the calls to dtrace_dif_emulate() may allocate scratch
14424 * from machine state; this is okay, too.
14425 */
14426 for (; helper != NULL; helper = helper->dtha_next) {
14427 if ((pred = helper->dtha_predicate) != NULL) {
14428 if (trace)
14429 dtrace_helper_trace(helper, mstate, vstate, 0);
14430
14431 if (!dtrace_dif_emulate(pred, mstate, vstate, state))
14432 goto next;
14433
14434 if (*flags & CPU_DTRACE_FAULT)
14435 goto err;
14436 }
14437
14438 for (i = 0; i < helper->dtha_nactions; i++) {
14439 if (trace)
14440 dtrace_helper_trace(helper,
14441 mstate, vstate, i + 1);
14442
14443 rval = dtrace_dif_emulate(helper->dtha_actions[i],
14444 mstate, vstate, state);
14445
14446 if (*flags & CPU_DTRACE_FAULT)
14447 goto err;
14448 }
14449
14450next:
14451 if (trace)
14452 dtrace_helper_trace(helper, mstate, vstate,
14453 DTRACE_HELPTRACE_NEXT);
14454 }
14455
14456 if (trace)
14457 dtrace_helper_trace(helper, mstate, vstate,
14458 DTRACE_HELPTRACE_DONE);
14459
14460 /*
14461 * Restore the arg0 that we saved upon entry.
14462 */
14463 mstate->dtms_arg[0] = sarg0;
14464 mstate->dtms_arg[1] = sarg1;
14465
14466 return (rval);
14467
14468err:
14469 if (trace)
14470 dtrace_helper_trace(helper, mstate, vstate,
14471 DTRACE_HELPTRACE_ERR);
14472
14473 /*
14474 * Restore the arg0 that we saved upon entry.
14475 */
14476 mstate->dtms_arg[0] = sarg0;
14477 mstate->dtms_arg[1] = sarg1;
14478
14479 return (0);
14480}
14481
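/*
 * Release the helper's predicate and action DIFOs, then free the action
 * array and the helper action itself.
 */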
14482static void
14483dtrace_helper_action_destroy(dtrace_helper_action_t *helper,
14484 dtrace_vstate_t *vstate)
14485{
14486 int i;
14487
14488 if (helper->dtha_predicate != NULL)
14489 dtrace_difo_release(helper->dtha_predicate, vstate);
14490
14491 for (i = 0; i < helper->dtha_nactions; i++) {
14492 ASSERT(helper->dtha_actions[i] != NULL);
14493 dtrace_difo_release(helper->dtha_actions[i], vstate);
14494 }
14495
14496 kmem_free(helper->dtha_actions,
14497 helper->dtha_nactions * sizeof (dtrace_difo_t *));
14498 kmem_free(helper, sizeof (dtrace_helper_action_t));
14499}
14500
14501static int
14502dtrace_helper_destroygen(proc_t* p, int gen)
14503{
14504 dtrace_helpers_t *help = p->p_dtrace_helpers;
14505 dtrace_vstate_t *vstate;
14506 uint_t i;
14507
14508 LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
14509 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14510
14511 if (help == NULL || gen > help->dthps_generation)
14512 return (EINVAL);
14513
14514 vstate = &help->dthps_vstate;
14515
14516 for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
14517 dtrace_helper_action_t *last = NULL, *h, *next;
14518
14519 for (h = help->dthps_actions[i]; h != NULL; h = next) {
14520 next = h->dtha_next;
14521
14522 if (h->dtha_generation == gen) {
14523 if (last != NULL) {
14524 last->dtha_next = next;
14525 } else {
14526 help->dthps_actions[i] = next;
14527 }
14528
14529 dtrace_helper_action_destroy(h, vstate);
14530 } else {
14531 last = h;
14532 }
14533 }
14534 }
14535
14536 /*
	 * Iterate until we've cleared out all helper providers with the
14538 * given generation number.
14539 */
14540 for (;;) {
14541 dtrace_helper_provider_t *prov = NULL;
14542
14543 /*
14544 * Look for a helper provider with the right generation. We
14545 * have to start back at the beginning of the list each time
14546 * because we drop dtrace_lock. It's unlikely that we'll make
14547 * more than two passes.
14548 */
14549 for (i = 0; i < help->dthps_nprovs; i++) {
14550 prov = help->dthps_provs[i];
14551
14552 if (prov->dthp_generation == gen)
14553 break;
14554 }
14555
14556 /*
14557 * If there were no matches, we're done.
14558 */
14559 if (i == help->dthps_nprovs)
14560 break;
14561
14562 /*
14563 * Move the last helper provider into this slot.
14564 */
14565 help->dthps_nprovs--;
14566 help->dthps_provs[i] = help->dthps_provs[help->dthps_nprovs];
14567 help->dthps_provs[help->dthps_nprovs] = NULL;
14568
14569 lck_mtx_unlock(&dtrace_lock);
14570
14571 /*
14572 * If we have a meta provider, remove this helper provider.
14573 */
14574 if (dtrace_meta_pid != NULL) {
14575 ASSERT(dtrace_deferred_pid == NULL);
14576 dtrace_helper_provider_remove(&prov->dthp_prov,
14577 p);
14578 }
14579
14580 dtrace_helper_provider_destroy(prov);
14581
14582 lck_mtx_lock(&dtrace_lock);
14583 }
14584
14585 return (0);
14586}
14587
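/*
 * Validate the helper's predicate and each of its action DIFOs; returns
 * non-zero iff everything validates.
 */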
14588static int
14589dtrace_helper_validate(dtrace_helper_action_t *helper)
14590{
14591 int err = 0, i;
14592 dtrace_difo_t *dp;
14593
14594 if ((dp = helper->dtha_predicate) != NULL)
14595 err += dtrace_difo_validate_helper(dp);
14596
14597 for (i = 0; i < helper->dtha_nactions; i++)
14598 err += dtrace_difo_validate_helper(helper->dtha_actions[i]);
14599
14600 return (err == 0);
14601}
14602
14603static int
14604dtrace_helper_action_add(proc_t* p, int which, dtrace_ecbdesc_t *ep)
14605{
14606 dtrace_helpers_t *help;
14607 dtrace_helper_action_t *helper, *last;
14608 dtrace_actdesc_t *act;
14609 dtrace_vstate_t *vstate;
14610 dtrace_predicate_t *pred;
14611 int count = 0, nactions = 0, i;
14612
14613 if (which < 0 || which >= DTRACE_NHELPER_ACTIONS)
14614 return (EINVAL);
14615
14616 help = p->p_dtrace_helpers;
14617 last = help->dthps_actions[which];
14618 vstate = &help->dthps_vstate;
14619
14620 for (count = 0; last != NULL; last = last->dtha_next) {
14621 count++;
14622 if (last->dtha_next == NULL)
14623 break;
14624 }
14625
14626 /*
14627 * If we already have dtrace_helper_actions_max helper actions for this
14628 * helper action type, we'll refuse to add a new one.
14629 */
14630 if (count >= dtrace_helper_actions_max)
14631 return (ENOSPC);
14632
14633 helper = kmem_zalloc(sizeof (dtrace_helper_action_t), KM_SLEEP);
14634 helper->dtha_generation = help->dthps_generation;
14635
14636 if ((pred = ep->dted_pred.dtpdd_predicate) != NULL) {
14637 ASSERT(pred->dtp_difo != NULL);
14638 dtrace_difo_hold(pred->dtp_difo);
14639 helper->dtha_predicate = pred->dtp_difo;
14640 }
14641
14642 for (act = ep->dted_action; act != NULL; act = act->dtad_next) {
14643 if (act->dtad_kind != DTRACEACT_DIFEXPR)
14644 goto err;
14645
14646 if (act->dtad_difo == NULL)
14647 goto err;
14648
14649 nactions++;
14650 }
14651
14652 helper->dtha_actions = kmem_zalloc(sizeof (dtrace_difo_t *) *
14653 (helper->dtha_nactions = nactions), KM_SLEEP);
14654
14655 for (act = ep->dted_action, i = 0; act != NULL; act = act->dtad_next) {
14656 dtrace_difo_hold(act->dtad_difo);
14657 helper->dtha_actions[i++] = act->dtad_difo;
14658 }
14659
14660 if (!dtrace_helper_validate(helper))
14661 goto err;
14662
14663 if (last == NULL) {
14664 help->dthps_actions[which] = helper;
14665 } else {
14666 last->dtha_next = helper;
14667 }
14668
14669 if ((uint32_t)vstate->dtvs_nlocals > dtrace_helptrace_nlocals) {
14670 dtrace_helptrace_nlocals = vstate->dtvs_nlocals;
14671 dtrace_helptrace_next = 0;
14672 }
14673
14674 return (0);
14675err:
14676 dtrace_helper_action_destroy(helper, vstate);
14677 return (EINVAL);
14678}
14679
14680static void
14681dtrace_helper_provider_register(proc_t *p, dtrace_helpers_t *help,
14682 dof_helper_t *dofhp)
14683{
14684 LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
14685 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
14686
14687 lck_mtx_lock(&dtrace_lock);
14688
14689 if (!dtrace_attached() || dtrace_meta_pid == NULL) {
14690 /*
14691 * If the dtrace module is loaded but not attached, or if
		 * there isn't a meta provider registered to deal with
14693 * these provider descriptions, we need to postpone creating
14694 * the actual providers until later.
14695 */
14696
14697 if (help->dthps_next == NULL && help->dthps_prev == NULL &&
14698 dtrace_deferred_pid != help) {
14699 help->dthps_deferred = 1;
14700 help->dthps_pid = p->p_pid;
14701 help->dthps_next = dtrace_deferred_pid;
14702 help->dthps_prev = NULL;
14703 if (dtrace_deferred_pid != NULL)
14704 dtrace_deferred_pid->dthps_prev = help;
14705 dtrace_deferred_pid = help;
14706 }
14707
14708 lck_mtx_unlock(&dtrace_lock);
14709
14710 } else if (dofhp != NULL) {
14711 /*
14712 * If the dtrace module is loaded and we have a particular
14713 * helper provider description, pass that off to the
14714 * meta provider.
14715 */
14716
14717 lck_mtx_unlock(&dtrace_lock);
14718
14719 dtrace_helper_provide(dofhp, p);
14720
14721 } else {
14722 /*
14723 * Otherwise, just pass all the helper provider descriptions
14724 * off to the meta provider.
14725 */
14726
14727 uint_t i;
14728 lck_mtx_unlock(&dtrace_lock);
14729
14730 for (i = 0; i < help->dthps_nprovs; i++) {
14731 dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
14732 p);
14733 }
14734 }
14735}
14736
14737static int
14738dtrace_helper_provider_add(proc_t* p, dof_helper_t *dofhp, int gen)
14739{
14740 dtrace_helpers_t *help;
14741 dtrace_helper_provider_t *hprov, **tmp_provs;
14742 uint_t tmp_maxprovs, i;
14743
14744 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14745 help = p->p_dtrace_helpers;
14746 ASSERT(help != NULL);
14747
14748 /*
14749 * If we already have dtrace_helper_providers_max helper providers,
	 * we refuse to add a new one.
14751 */
14752 if (help->dthps_nprovs >= dtrace_helper_providers_max)
14753 return (ENOSPC);
14754
14755 /*
14756 * Check to make sure this isn't a duplicate.
14757 */
14758 for (i = 0; i < help->dthps_nprovs; i++) {
14759 if (dofhp->dofhp_addr ==
14760 help->dthps_provs[i]->dthp_prov.dofhp_addr)
14761 return (EALREADY);
14762 }
14763
14764 hprov = kmem_zalloc(sizeof (dtrace_helper_provider_t), KM_SLEEP);
14765 hprov->dthp_prov = *dofhp;
14766 hprov->dthp_ref = 1;
14767 hprov->dthp_generation = gen;
14768
14769 /*
14770 * Allocate a bigger table for helper providers if it's already full.
14771 */
14772 if (help->dthps_maxprovs == help->dthps_nprovs) {
14773 tmp_maxprovs = help->dthps_maxprovs;
14774 tmp_provs = help->dthps_provs;
14775
14776 if (help->dthps_maxprovs == 0)
14777 help->dthps_maxprovs = 2;
14778 else
14779 help->dthps_maxprovs *= 2;
14780 if (help->dthps_maxprovs > dtrace_helper_providers_max)
14781 help->dthps_maxprovs = dtrace_helper_providers_max;
14782
14783 ASSERT(tmp_maxprovs < help->dthps_maxprovs);
14784
14785 help->dthps_provs = kmem_zalloc(help->dthps_maxprovs *
14786 sizeof (dtrace_helper_provider_t *), KM_SLEEP);
14787
14788 if (tmp_provs != NULL) {
14789 bcopy(tmp_provs, help->dthps_provs, tmp_maxprovs *
14790 sizeof (dtrace_helper_provider_t *));
14791 kmem_free(tmp_provs, tmp_maxprovs *
14792 sizeof (dtrace_helper_provider_t *));
14793 }
14794 }
14795
14796 help->dthps_provs[help->dthps_nprovs] = hprov;
14797 help->dthps_nprovs++;
14798
14799 return (0);
14800}
14801
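/*
 * Drop a reference on the helper provider; on the last reference, destroy
 * the provider's DOF and free the provider itself.
 */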
14802static void
14803dtrace_helper_provider_destroy(dtrace_helper_provider_t *hprov)
14804{
14805 lck_mtx_lock(&dtrace_lock);
14806
14807 if (--hprov->dthp_ref == 0) {
14808 dof_hdr_t *dof;
14809 lck_mtx_unlock(&dtrace_lock);
14810 dof = (dof_hdr_t *)(uintptr_t)hprov->dthp_prov.dofhp_dof;
14811 dtrace_dof_destroy(dof);
14812 kmem_free(hprov, sizeof (dtrace_helper_provider_t));
14813 } else {
14814 lck_mtx_unlock(&dtrace_lock);
14815 }
14816}
14817
14818static int
14819dtrace_helper_provider_validate(dof_hdr_t *dof, dof_sec_t *sec)
14820{
14821 uintptr_t daddr = (uintptr_t)dof;
14822 dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
14823 dof_provider_t *provider;
14824 dof_probe_t *probe;
14825 uint8_t *arg;
14826 char *strtab, *typestr;
14827 dof_stridx_t typeidx;
14828 size_t typesz;
14829 uint_t nprobes, j, k;
14830
14831 ASSERT(sec->dofs_type == DOF_SECT_PROVIDER);
14832
14833 if (sec->dofs_offset & (sizeof (uint_t) - 1)) {
14834 dtrace_dof_error(dof, "misaligned section offset");
14835 return (-1);
14836 }
14837
14838 /*
14839 * The section needs to be large enough to contain the DOF provider
14840 * structure appropriate for the given version.
14841 */
14842 if (sec->dofs_size <
14843 ((dof->dofh_ident[DOF_ID_VERSION] == DOF_VERSION_1) ?
14844 offsetof(dof_provider_t, dofpv_prenoffs) :
14845 sizeof (dof_provider_t))) {
14846 dtrace_dof_error(dof, "provider section too small");
14847 return (-1);
14848 }
14849
14850 provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
14851 str_sec = dtrace_dof_sect(dof, DOF_SECT_STRTAB, provider->dofpv_strtab);
14852 prb_sec = dtrace_dof_sect(dof, DOF_SECT_PROBES, provider->dofpv_probes);
14853 arg_sec = dtrace_dof_sect(dof, DOF_SECT_PRARGS, provider->dofpv_prargs);
14854 off_sec = dtrace_dof_sect(dof, DOF_SECT_PROFFS, provider->dofpv_proffs);
14855
14856 if (str_sec == NULL || prb_sec == NULL ||
14857 arg_sec == NULL || off_sec == NULL)
14858 return (-1);
14859
14860 enoff_sec = NULL;
14861
14862 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
14863 provider->dofpv_prenoffs != DOF_SECT_NONE &&
14864 (enoff_sec = dtrace_dof_sect(dof, DOF_SECT_PRENOFFS,
14865 provider->dofpv_prenoffs)) == NULL)
14866 return (-1);
14867
14868 strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
14869
14870 if (provider->dofpv_name >= str_sec->dofs_size ||
14871 strlen(strtab + provider->dofpv_name) >= DTRACE_PROVNAMELEN) {
14872 dtrace_dof_error(dof, "invalid provider name");
14873 return (-1);
14874 }
14875
14876 if (prb_sec->dofs_entsize == 0 ||
14877 prb_sec->dofs_entsize > prb_sec->dofs_size) {
14878 dtrace_dof_error(dof, "invalid entry size");
14879 return (-1);
14880 }
14881
14882 if (prb_sec->dofs_entsize & (sizeof (uintptr_t) - 1)) {
14883 dtrace_dof_error(dof, "misaligned entry size");
14884 return (-1);
14885 }
14886
14887 if (off_sec->dofs_entsize != sizeof (uint32_t)) {
14888 dtrace_dof_error(dof, "invalid entry size");
14889 return (-1);
14890 }
14891
14892 if (off_sec->dofs_offset & (sizeof (uint32_t) - 1)) {
14893 dtrace_dof_error(dof, "misaligned section offset");
14894 return (-1);
14895 }
14896
14897 if (arg_sec->dofs_entsize != sizeof (uint8_t)) {
14898 dtrace_dof_error(dof, "invalid entry size");
14899 return (-1);
14900 }
14901
14902 arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
14903
14904 nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
14905
14906 /*
14907 * Take a pass through the probes to check for errors.
14908 */
14909 for (j = 0; j < nprobes; j++) {
14910 probe = (dof_probe_t *)(uintptr_t)(daddr +
14911 prb_sec->dofs_offset + j * prb_sec->dofs_entsize);
14912
14913 if (probe->dofpr_func >= str_sec->dofs_size) {
14914 dtrace_dof_error(dof, "invalid function name");
14915 return (-1);
14916 }
14917
14918 if (strlen(strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN) {
14919 dtrace_dof_error(dof, "function name too long");
14920 return (-1);
14921 }
14922
14923 if (probe->dofpr_name >= str_sec->dofs_size ||
14924 strlen(strtab + probe->dofpr_name) >= DTRACE_NAMELEN) {
14925 dtrace_dof_error(dof, "invalid probe name");
14926 return (-1);
14927 }
14928
14929 /*
14930 * The offset count must not wrap the index, and the offsets
14931 * must also not overflow the section's data.
14932 */
14933 if (probe->dofpr_offidx + probe->dofpr_noffs <
14934 probe->dofpr_offidx ||
14935 (probe->dofpr_offidx + probe->dofpr_noffs) *
14936 off_sec->dofs_entsize > off_sec->dofs_size) {
14937 dtrace_dof_error(dof, "invalid probe offset");
14938 return (-1);
14939 }
14940
14941 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1) {
14942 /*
14943 * If there's no is-enabled offset section, make sure
14944 * there aren't any is-enabled offsets. Otherwise
14945 * perform the same checks as for probe offsets
14946 * (immediately above).
14947 */
14948 if (enoff_sec == NULL) {
14949 if (probe->dofpr_enoffidx != 0 ||
14950 probe->dofpr_nenoffs != 0) {
14951 dtrace_dof_error(dof, "is-enabled "
14952 "offsets with null section");
14953 return (-1);
14954 }
14955 } else if (probe->dofpr_enoffidx +
14956 probe->dofpr_nenoffs < probe->dofpr_enoffidx ||
14957 (probe->dofpr_enoffidx + probe->dofpr_nenoffs) *
14958 enoff_sec->dofs_entsize > enoff_sec->dofs_size) {
14959 dtrace_dof_error(dof, "invalid is-enabled "
14960 "offset");
14961 return (-1);
14962 }
14963
14964 if (probe->dofpr_noffs + probe->dofpr_nenoffs == 0) {
14965 dtrace_dof_error(dof, "zero probe and "
14966 "is-enabled offsets");
14967 return (-1);
14968 }
14969 } else if (probe->dofpr_noffs == 0) {
14970 dtrace_dof_error(dof, "zero probe offsets");
14971 return (-1);
14972 }
14973
14974 if (probe->dofpr_argidx + probe->dofpr_xargc <
14975 probe->dofpr_argidx ||
14976 (probe->dofpr_argidx + probe->dofpr_xargc) *
14977 arg_sec->dofs_entsize > arg_sec->dofs_size) {
14978 dtrace_dof_error(dof, "invalid args");
14979 return (-1);
14980 }
14981
14982 typeidx = probe->dofpr_nargv;
14983 typestr = strtab + probe->dofpr_nargv;
14984 for (k = 0; k < probe->dofpr_nargc; k++) {
14985 if (typeidx >= str_sec->dofs_size) {
14986 dtrace_dof_error(dof, "bad "
14987 "native argument type");
14988 return (-1);
14989 }
14990
14991 typesz = strlen(typestr) + 1;
14992 if (typesz > DTRACE_ARGTYPELEN) {
14993 dtrace_dof_error(dof, "native "
14994 "argument type too long");
14995 return (-1);
14996 }
14997 typeidx += typesz;
14998 typestr += typesz;
14999 }
15000
15001 typeidx = probe->dofpr_xargv;
15002 typestr = strtab + probe->dofpr_xargv;
15003 for (k = 0; k < probe->dofpr_xargc; k++) {
15004 if (arg[probe->dofpr_argidx + k] > probe->dofpr_nargc) {
15005 dtrace_dof_error(dof, "bad "
15006 "native argument index");
15007 return (-1);
15008 }
15009
15010 if (typeidx >= str_sec->dofs_size) {
15011 dtrace_dof_error(dof, "bad "
15012 "translated argument type");
15013 return (-1);
15014 }
15015
15016 typesz = strlen(typestr) + 1;
15017 if (typesz > DTRACE_ARGTYPELEN) {
15018 dtrace_dof_error(dof, "translated argument "
15019 "type too long");
15020 return (-1);
15021 }
15022
15023 typeidx += typesz;
15024 typestr += typesz;
15025 }
15026 }
15027
15028 return (0);
15029}
15030
15031static int
15032dtrace_helper_slurp(proc_t* p, dof_hdr_t *dof, dof_helper_t *dhp)
15033{
15034 dtrace_helpers_t *help;
15035 dtrace_vstate_t *vstate;
15036 dtrace_enabling_t *enab = NULL;
15037 int i, gen, rv, nhelpers = 0, nprovs = 0, destroy = 1;
15038 uintptr_t daddr = (uintptr_t)dof;
15039
15040 LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
15041 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
15042
15043 if ((help = p->p_dtrace_helpers) == NULL)
15044 help = dtrace_helpers_create(p);
15045
15046 vstate = &help->dthps_vstate;
15047
15048 if ((rv = dtrace_dof_slurp(dof, vstate, NULL, &enab,
15049 dhp != NULL ? dhp->dofhp_addr : 0, B_FALSE)) != 0) {
15050 dtrace_dof_destroy(dof);
15051 return (rv);
15052 }
15053
15054 /*
15055 * Look for helper providers and validate their descriptions.
15056 */
15057 if (dhp != NULL) {
15058 for (i = 0; (uint32_t)i < dof->dofh_secnum; i++) {
15059 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
15060 dof->dofh_secoff + i * dof->dofh_secsize);
15061
15062 if (sec->dofs_type != DOF_SECT_PROVIDER)
15063 continue;
15064
15065 if (dtrace_helper_provider_validate(dof, sec) != 0) {
15066 dtrace_enabling_destroy(enab);
15067 dtrace_dof_destroy(dof);
15068 return (-1);
15069 }
15070
15071 nprovs++;
15072 }
15073 }
15074
15075 /*
15076 * Now we need to walk through the ECB descriptions in the enabling.
15077 */
15078 for (i = 0; i < enab->dten_ndesc; i++) {
15079 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
15080 dtrace_probedesc_t *desc = &ep->dted_probe;
15081
		/* APPLE NOTE: Darwin employs size-bounded string operations. */
15083 if (!LIT_STRNEQL(desc->dtpd_provider, "dtrace"))
15084 continue;
15085
15086 if (!LIT_STRNEQL(desc->dtpd_mod, "helper"))
15087 continue;
15088
15089 if (!LIT_STRNEQL(desc->dtpd_func, "ustack"))
15090 continue;
15091
15092 if ((rv = dtrace_helper_action_add(p, DTRACE_HELPER_ACTION_USTACK,
15093 ep)) != 0) {
15094 /*
15095 * Adding this helper action failed -- we are now going
15096 * to rip out the entire generation and return failure.
15097 */
15098 (void) dtrace_helper_destroygen(p, help->dthps_generation);
15099 dtrace_enabling_destroy(enab);
15100 dtrace_dof_destroy(dof);
15101 return (-1);
15102 }
15103
15104 nhelpers++;
15105 }
15106
15107 if (nhelpers < enab->dten_ndesc)
15108 dtrace_dof_error(dof, "unmatched helpers");
15109
15110 gen = help->dthps_generation++;
15111 dtrace_enabling_destroy(enab);
15112
15113 if (dhp != NULL && nprovs > 0) {
15114 dhp->dofhp_dof = (uint64_t)(uintptr_t)dof;
15115 if (dtrace_helper_provider_add(p, dhp, gen) == 0) {
15116 lck_mtx_unlock(&dtrace_lock);
15117 dtrace_helper_provider_register(p, help, dhp);
15118 lck_mtx_lock(&dtrace_lock);
15119
15120 destroy = 0;
15121 }
15122 }
15123
15124 if (destroy)
15125 dtrace_dof_destroy(dof);
15126
15127 return (gen);
15128}
15129
15130/*
15131 * APPLE NOTE: DTrace lazy dof implementation
15132 *
15133 * DTrace user static probes (USDT probes) and helper actions are loaded
 * in a process by processing dof sections. The dof sections are passed
15135 * into the kernel by dyld, in a dof_ioctl_data_t block. It is rather
15136 * expensive to process dof for a process that will never use it. There
15137 * is a memory cost (allocating the providers/probes), and a cpu cost
15138 * (creating the providers/probes).
15139 *
 * To reduce this cost, we use "lazy dof". The normal procedure for
15141 * dof processing is to copyin the dof(s) pointed to by the dof_ioctl_data_t
15142 * block, and invoke dof_slurp_helper() on them. When "lazy dof" is
15143 * used, each process retains the dof_ioctl_data_t block, instead of
15144 * copying in the data it points to.
15145 *
15146 * The dof_ioctl_data_t blocks are managed as if they were the actual
15147 * processed dof; on fork the block is copied to the child, on exec and
15148 * exit the block is freed.
15149 *
 * If the process loads libraries containing additional dof, the
15151 * new dof_ioctl_data_t is merged with the existing block.
15152 *
15153 * There are a few catches that make this slightly more difficult.
15154 * When dyld registers dof_ioctl_data_t blocks, it expects a unique
15155 * identifier value for each dof in the block. In non-lazy dof terms,
15156 * this is the generation that dof was loaded in. If we hand back
15157 * a UID for a lazy dof, that same UID must be able to unload the
15158 * dof once it has become non-lazy. To meet this requirement, the
 * code that loads lazy dof requires that the UIDs for the dofs in
 * the lazy dof block be sorted in ascending order. It is okay to skip
 * UIDs, e.g., 1 -> 5 -> 6 is legal.
15162 *
15163 * Once a process has become non-lazy, it will stay non-lazy. All
15164 * future dof operations for that process will be non-lazy, even
15165 * if the dof mode transitions back to lazy.
15166 *
 * Always do lazy dof checks before non-lazy (i.e., in fork, exit, exec).
15168 * That way if the lazy check fails due to transitioning to non-lazy, the
15169 * right thing is done with the newly faulted in dof.
15170 */
15171
15172/*
15173 * This method is a bit squicky. It must handle:
15174 *
15175 * dof should not be lazy.
15176 * dof should have been handled lazily, but there was an error
15177 * dof was handled lazily, and needs to be freed.
15178 * dof was handled lazily, and must not be freed.
15179 *
15180 *
15181 * Returns EACCESS if dof should be handled non-lazily.
15182 *
15183 * KERN_SUCCESS and all other return codes indicate lazy handling of dof.
15184 *
15185 * If the dofs data is claimed by this method, dofs_claimed will be set.
15186 * Callers should not free claimed dofs.
15187 */
15188static int
15189dtrace_lazy_dofs_add(proc_t *p, dof_ioctl_data_t* incoming_dofs, int *dofs_claimed)
15190{
15191 ASSERT(p);
15192 ASSERT(incoming_dofs && incoming_dofs->dofiod_count > 0);
15193
15194 int rval = 0;
15195 *dofs_claimed = 0;
15196
15197 lck_rw_lock_shared(&dtrace_dof_mode_lock);
15198
15199 ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
15200 ASSERT(dtrace_dof_mode != DTRACE_DOF_MODE_NEVER);
15201
15202 /*
15203 * Any existing helpers force non-lazy behavior.
15204 */
15205 if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON && (p->p_dtrace_helpers == NULL)) {
15206 dtrace_sprlock(p);
15207
15208 dof_ioctl_data_t* existing_dofs = p->p_dtrace_lazy_dofs;
15209 unsigned int existing_dofs_count = (existing_dofs) ? existing_dofs->dofiod_count : 0;
15210 unsigned int i, merged_dofs_count = incoming_dofs->dofiod_count + existing_dofs_count;
15211
15212 /*
15213 * Range check...
15214 */
15215 if (merged_dofs_count == 0 || merged_dofs_count > 1024) {
15216 dtrace_dof_error(NULL, "lazy_dofs_add merged_dofs_count out of range");
15217 rval = EINVAL;
15218 goto unlock;
15219 }
15220
15221 /*
15222 * Each dof being added must be assigned a unique generation.
15223 */
15224 uint64_t generation = (existing_dofs) ? existing_dofs->dofiod_helpers[existing_dofs_count - 1].dofhp_dof + 1 : 1;
15225 for (i=0; i<incoming_dofs->dofiod_count; i++) {
15226 /*
15227 * We rely on these being the same so we can overwrite dofhp_dof and not lose info.
15228 */
15229 ASSERT(incoming_dofs->dofiod_helpers[i].dofhp_dof == incoming_dofs->dofiod_helpers[i].dofhp_addr);
15230 incoming_dofs->dofiod_helpers[i].dofhp_dof = generation++;
15231 }
15232
15233
15234 if (existing_dofs) {
15235 /*
15236 * Merge the existing and incoming dofs
15237 */
15238 size_t merged_dofs_size = DOF_IOCTL_DATA_T_SIZE(merged_dofs_count);
15239 dof_ioctl_data_t* merged_dofs = kmem_alloc(merged_dofs_size, KM_SLEEP);
15240
15241 bcopy(&existing_dofs->dofiod_helpers[0],
15242 &merged_dofs->dofiod_helpers[0],
15243 sizeof(dof_helper_t) * existing_dofs_count);
15244 bcopy(&incoming_dofs->dofiod_helpers[0],
15245 &merged_dofs->dofiod_helpers[existing_dofs_count],
15246 sizeof(dof_helper_t) * incoming_dofs->dofiod_count);
15247
15248 merged_dofs->dofiod_count = merged_dofs_count;
15249
15250 kmem_free(existing_dofs, DOF_IOCTL_DATA_T_SIZE(existing_dofs_count));
15251
15252 p->p_dtrace_lazy_dofs = merged_dofs;
15253 } else {
15254 /*
15255 * Claim the incoming dofs
15256 */
15257 *dofs_claimed = 1;
15258 p->p_dtrace_lazy_dofs = incoming_dofs;
15259 }
15260
15261#if DEBUG
15262 dof_ioctl_data_t* all_dofs = p->p_dtrace_lazy_dofs;
15263 for (i=0; i<all_dofs->dofiod_count-1; i++) {
15264 ASSERT(all_dofs->dofiod_helpers[i].dofhp_dof < all_dofs->dofiod_helpers[i+1].dofhp_dof);
15265 }
15266#endif /* DEBUG */
15267
15268unlock:
15269 dtrace_sprunlock(p);
15270 } else {
15271 rval = EACCES;
15272 }
15273
15274 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
15275
15276 return rval;
15277}
15278
15279/*
15280 * Returns:
15281 *
15282 * EINVAL: lazy dof is enabled, but the requested generation was not found.
15283 * EACCES: This removal needs to be handled non-lazily.
15284 */
15285static int
15286dtrace_lazy_dofs_remove(proc_t *p, int generation)
15287{
15288 int rval = EINVAL;
15289
15290 lck_rw_lock_shared(&dtrace_dof_mode_lock);
15291
15292 ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
15293 ASSERT(dtrace_dof_mode != DTRACE_DOF_MODE_NEVER);
15294
15295 /*
15296 * Any existing helpers force non-lazy behavior.
15297 */
15298 if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON && (p->p_dtrace_helpers == NULL)) {
15299 dtrace_sprlock(p);
15300
15301 dof_ioctl_data_t* existing_dofs = p->p_dtrace_lazy_dofs;
15302
15303 if (existing_dofs) {
15304 int index, existing_dofs_count = existing_dofs->dofiod_count;
15305 for (index=0; index<existing_dofs_count; index++) {
15306 if ((int)existing_dofs->dofiod_helpers[index].dofhp_dof == generation) {
15307 dof_ioctl_data_t* removed_dofs = NULL;
15308
15309 /*
					 * If this is the only dof, we'll free it and swap in
					 * NULL; otherwise we copy the surviving entries into
					 * a smaller block.
15311 */
15312 if (existing_dofs_count > 1) {
15313 int removed_dofs_count = existing_dofs_count - 1;
15314 size_t removed_dofs_size = DOF_IOCTL_DATA_T_SIZE(removed_dofs_count);
15315
15316 removed_dofs = kmem_alloc(removed_dofs_size, KM_SLEEP);
15317 removed_dofs->dofiod_count = removed_dofs_count;
15318
15319 /*
15320 * copy the remaining data.
15321 */
15322 if (index > 0) {
15323 bcopy(&existing_dofs->dofiod_helpers[0],
15324 &removed_dofs->dofiod_helpers[0],
15325 index * sizeof(dof_helper_t));
15326 }
15327
15328 if (index < existing_dofs_count-1) {
15329 bcopy(&existing_dofs->dofiod_helpers[index+1],
15330 &removed_dofs->dofiod_helpers[index],
15331 (existing_dofs_count - index - 1) * sizeof(dof_helper_t));
15332 }
15333 }
15334
15335 kmem_free(existing_dofs, DOF_IOCTL_DATA_T_SIZE(existing_dofs_count));
15336
15337 p->p_dtrace_lazy_dofs = removed_dofs;
15338
15339 rval = KERN_SUCCESS;
15340
15341 break;
15342 }
15343 }
15344
15345#if DEBUG
15346 dof_ioctl_data_t* all_dofs = p->p_dtrace_lazy_dofs;
15347 if (all_dofs) {
15348 unsigned int i;
15349 for (i=0; i<all_dofs->dofiod_count-1; i++) {
15350 ASSERT(all_dofs->dofiod_helpers[i].dofhp_dof < all_dofs->dofiod_helpers[i+1].dofhp_dof);
15351 }
15352 }
15353#endif
15354
15355 }
15356 dtrace_sprunlock(p);
15357 } else {
15358 rval = EACCES;
15359 }
15360
15361 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
15362
15363 return rval;
15364}
15365
15366void
15367dtrace_lazy_dofs_destroy(proc_t *p)
15368{
15369 lck_rw_lock_shared(&dtrace_dof_mode_lock);
15370 dtrace_sprlock(p);
15371
15372 ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
15373
15374 dof_ioctl_data_t* lazy_dofs = p->p_dtrace_lazy_dofs;
15375 p->p_dtrace_lazy_dofs = NULL;
15376
15377 dtrace_sprunlock(p);
15378 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
15379
15380 if (lazy_dofs) {
15381 kmem_free(lazy_dofs, DOF_IOCTL_DATA_T_SIZE(lazy_dofs->dofiod_count));
15382 }
15383}
15384
15385static int
15386dtrace_lazy_dofs_proc_iterate_filter(proc_t *p, void* ignored)
15387{
15388#pragma unused(ignored)
15389 /*
15390 * Okay to NULL test without taking the sprlock.
15391 */
15392 return p->p_dtrace_lazy_dofs != NULL;
15393}
15394
15395static void
dtrace_lazy_dofs_process(proc_t *p)
{
15397 /*
15398 * It is possible this process may exit during our attempt to
15399 * fault in the dof. We could fix this by holding locks longer,
15400 * but the errors are benign.
15401 */
	dtrace_sprlock(p);

15405 ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
15406 ASSERT(dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_OFF);
15407
15408 dof_ioctl_data_t* lazy_dofs = p->p_dtrace_lazy_dofs;
15409 p->p_dtrace_lazy_dofs = NULL;
15410
15411 dtrace_sprunlock(p);
15412 lck_mtx_lock(&dtrace_meta_lock);
15413 /*
15414 * Process each dof_helper_t
15415 */
15416 if (lazy_dofs != NULL) {
15417 unsigned int i;
15418 int rval;
15419
15420 for (i=0; i<lazy_dofs->dofiod_count; i++) {
15421 /*
15422 * When loading lazy dof, we depend on the generations being sorted in ascending order.
15423 */
15424 ASSERT(i >= (lazy_dofs->dofiod_count - 1) || lazy_dofs->dofiod_helpers[i].dofhp_dof < lazy_dofs->dofiod_helpers[i+1].dofhp_dof);
15425
15426 dof_helper_t *dhp = &lazy_dofs->dofiod_helpers[i];
15427
15428 /*
15429 * We stored the generation in dofhp_dof. Save it, and restore the original value.
15430 */
15431 int generation = dhp->dofhp_dof;
15432 dhp->dofhp_dof = dhp->dofhp_addr;
15433
15434 dof_hdr_t *dof = dtrace_dof_copyin_from_proc(p, dhp->dofhp_dof, &rval);
15435
15436 if (dof != NULL) {
15437 dtrace_helpers_t *help;
15438
15439 lck_mtx_lock(&dtrace_lock);
15440
15441 /*
15442 * This must be done with the dtrace_lock held
15443 */
15444 if ((help = p->p_dtrace_helpers) == NULL)
15445 help = dtrace_helpers_create(p);
15446
15447				/*
15448				 * If the generation value has been bumped past ours,
15449				 * someone snuck in while we had dropped the dtrace lock.
15450				 * We must dump this generation; there is no safe way to
15451				 * load it.
15452				 */
15452 if (help->dthps_generation <= generation) {
15453 help->dthps_generation = generation;
15454
15455 /*
15456 * dtrace_helper_slurp() takes responsibility for the dof --
15457 * it may free it now or it may save it and free it later.
15458 */
15459 if ((rval = dtrace_helper_slurp(p, dof, dhp)) != generation) {
15460 dtrace_dof_error(NULL, "returned value did not match expected generation");
15461 }
15462 }
15463
15464 lck_mtx_unlock(&dtrace_lock);
15465 }
15466 }
15467 lck_mtx_unlock(&dtrace_meta_lock);
15468 kmem_free(lazy_dofs, DOF_IOCTL_DATA_T_SIZE(lazy_dofs->dofiod_count));
15469 } else {
15470 lck_mtx_unlock(&dtrace_meta_lock);
15471 }
15472}
15473
15474static int
15475dtrace_lazy_dofs_proc_iterate_doit(proc_t *p, void* ignored)
15476{
15477#pragma unused(ignored)
15478
15479 dtrace_lazy_dofs_process(p);
15480
15481 return PROC_RETURNED;
15482}
15483
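/*
 * Distinguished return value of dtrace_lazy_dofs_duplicate(), indicating
 * that the child received a copy of its parent's lazy dofs (and therefore
 * must not also receive duplicated helpers).
 */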
15484#define DTRACE_LAZY_DOFS_DUPLICATED 1
15485
15486static int
15487dtrace_lazy_dofs_duplicate(proc_t *parent, proc_t *child)
15488{
15489 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
15490 LCK_MTX_ASSERT(&parent->p_dtrace_sprlock, LCK_MTX_ASSERT_NOTOWNED);
15491 LCK_MTX_ASSERT(&child->p_dtrace_sprlock, LCK_MTX_ASSERT_NOTOWNED);
15492
15493 lck_rw_lock_shared(&dtrace_dof_mode_lock);
15494 dtrace_sprlock(parent);
15495
15496	/*
15497	 * We need to make sure that the transition from lazy dofs to
15498	 * helpers was atomic for our parent.
15499	 */
15500 ASSERT(parent->p_dtrace_lazy_dofs == NULL || parent->p_dtrace_helpers == NULL);
15501 /*
15502 * In theory we should hold the child sprlock, but this is safe...
15503 */
15504 ASSERT(child->p_dtrace_lazy_dofs == NULL && child->p_dtrace_helpers == NULL);
15505
15506 dof_ioctl_data_t* parent_dofs = parent->p_dtrace_lazy_dofs;
15507 dof_ioctl_data_t* child_dofs = NULL;
15508 if (parent_dofs) {
15509 size_t parent_dofs_size = DOF_IOCTL_DATA_T_SIZE(parent_dofs->dofiod_count);
15510 child_dofs = kmem_alloc(parent_dofs_size, KM_SLEEP);
15511 bcopy(parent_dofs, child_dofs, parent_dofs_size);
15512 }
15513
15514 dtrace_sprunlock(parent);
15515
15516 if (child_dofs) {
15517 dtrace_sprlock(child);
15518 child->p_dtrace_lazy_dofs = child_dofs;
15519 dtrace_sprunlock(child);
15520		/*
15521		 * We process the DOF at this point if the mode is set to
15522		 * LAZY_OFF. This can happen if DTrace is still processing the
15523		 * DOF of another process (which can happen because the
15524		 * protected pager can have a huge latency) but has not
15525		 * processed our parent yet.
15526		 */
15527 if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_OFF) {
15528 dtrace_lazy_dofs_process(child);
15529 }
15530 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
15531
15532 return DTRACE_LAZY_DOFS_DUPLICATED;
15533 }
15534 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
15535
15536 return 0;
15537}
15538
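/*
 * Allocate and install the per-process helper state. The caller must hold
 * dtrace_lock, and the process must not already have helpers.
 */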
15539static dtrace_helpers_t *
15540dtrace_helpers_create(proc_t *p)
15541{
15542 dtrace_helpers_t *help;
15543
15544 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
15545 ASSERT(p->p_dtrace_helpers == NULL);
15546
15547 help = kmem_zalloc(sizeof (dtrace_helpers_t), KM_SLEEP);
15548 help->dthps_actions = kmem_zalloc(sizeof (dtrace_helper_action_t *) *
15549 DTRACE_NHELPER_ACTIONS, KM_SLEEP);
15550
15551 p->p_dtrace_helpers = help;
15552 dtrace_helpers++;
15553
15554 return (help);
15555}
15556
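/*
 * Tear down all helper state for a process: helper actions, helper
 * providers (removing them from the meta-provider or from the deferred
 * list), and the helper vstate. Installed as the dtrace_helpers_cleanup
 * hook and invoked at exec and exit.
 */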
15557static void
15558dtrace_helpers_destroy(proc_t* p)
15559{
15560 dtrace_helpers_t *help;
15561 dtrace_vstate_t *vstate;
15562 uint_t i;
15563
15564 lck_mtx_lock(&dtrace_meta_lock);
15565 lck_mtx_lock(&dtrace_lock);
15566
15567 ASSERT(p->p_dtrace_helpers != NULL);
15568 ASSERT(dtrace_helpers > 0);
15569
15570 help = p->p_dtrace_helpers;
15571 vstate = &help->dthps_vstate;
15572
15573 /*
15574 * We're now going to lose the help from this process.
15575 */
15576 p->p_dtrace_helpers = NULL;
15577 dtrace_sync();
15578
15579 /*
15580	 * Destroy the helper actions.
15581 */
15582 for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
15583 dtrace_helper_action_t *h, *next;
15584
15585		for (h = help->dthps_actions[i]; h != NULL; h = next) {
15586			next = h->dtha_next;
15587			dtrace_helper_action_destroy(h, vstate);
15588		}
15590 }
15591
15592 lck_mtx_unlock(&dtrace_lock);
15593
15594 /*
15595 * Destroy the helper providers.
15596 */
15597 if (help->dthps_maxprovs > 0) {
15598 if (dtrace_meta_pid != NULL) {
15599 ASSERT(dtrace_deferred_pid == NULL);
15600
15601 for (i = 0; i < help->dthps_nprovs; i++) {
15602 dtrace_helper_provider_remove(
15603 &help->dthps_provs[i]->dthp_prov, p);
15604 }
15605 } else {
15606 lck_mtx_lock(&dtrace_lock);
15607 ASSERT(help->dthps_deferred == 0 ||
15608 help->dthps_next != NULL ||
15609 help->dthps_prev != NULL ||
15610 help == dtrace_deferred_pid);
15611
15612 /*
15613 * Remove the helper from the deferred list.
15614 */
15615 if (help->dthps_next != NULL)
15616 help->dthps_next->dthps_prev = help->dthps_prev;
15617 if (help->dthps_prev != NULL)
15618 help->dthps_prev->dthps_next = help->dthps_next;
15619 if (dtrace_deferred_pid == help) {
15620 dtrace_deferred_pid = help->dthps_next;
15621 ASSERT(help->dthps_prev == NULL);
15622 }
15623
15624 lck_mtx_unlock(&dtrace_lock);
15625 }
15626
15627
15628 for (i = 0; i < help->dthps_nprovs; i++) {
15629 dtrace_helper_provider_destroy(help->dthps_provs[i]);
15630 }
15631
15632 kmem_free(help->dthps_provs, help->dthps_maxprovs *
15633 sizeof (dtrace_helper_provider_t *));
15634 }
15635
15636 lck_mtx_lock(&dtrace_lock);
15637
15638 dtrace_vstate_fini(&help->dthps_vstate);
15639 kmem_free(help->dthps_actions,
15640 sizeof (dtrace_helper_action_t *) * DTRACE_NHELPER_ACTIONS);
15641 kmem_free(help, sizeof (dtrace_helpers_t));
15642
15643 --dtrace_helpers;
15644 lck_mtx_unlock(&dtrace_lock);
15645 lck_mtx_unlock(&dtrace_meta_lock);
15646}
15647
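/*
 * Deep-copy helper actions and share (by reference count) helper
 * providers from 'from' to 'to'. Installed as the dtrace_helpers_fork
 * hook and invoked from dtrace_proc_fork() when the parent has helpers.
 */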
15648static void
15649dtrace_helpers_duplicate(proc_t *from, proc_t *to)
15650{
15651 dtrace_helpers_t *help, *newhelp;
15652 dtrace_helper_action_t *helper, *new, *last;
15653 dtrace_difo_t *dp;
15654 dtrace_vstate_t *vstate;
15655 uint_t i;
15656 int j, sz, hasprovs = 0;
15657
15658 lck_mtx_lock(&dtrace_meta_lock);
15659 lck_mtx_lock(&dtrace_lock);
15660 ASSERT(from->p_dtrace_helpers != NULL);
15661 ASSERT(dtrace_helpers > 0);
15662
15663 help = from->p_dtrace_helpers;
15664 newhelp = dtrace_helpers_create(to);
15665 ASSERT(to->p_dtrace_helpers != NULL);
15666
15667 newhelp->dthps_generation = help->dthps_generation;
15668 vstate = &newhelp->dthps_vstate;
15669
15670 /*
15671 * Duplicate the helper actions.
15672 */
15673 for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
15674 if ((helper = help->dthps_actions[i]) == NULL)
15675 continue;
15676
15677 for (last = NULL; helper != NULL; helper = helper->dtha_next) {
15678 new = kmem_zalloc(sizeof (dtrace_helper_action_t),
15679 KM_SLEEP);
15680 new->dtha_generation = helper->dtha_generation;
15681
15682 if ((dp = helper->dtha_predicate) != NULL) {
15683 dp = dtrace_difo_duplicate(dp, vstate);
15684 new->dtha_predicate = dp;
15685 }
15686
15687 new->dtha_nactions = helper->dtha_nactions;
15688 sz = sizeof (dtrace_difo_t *) * new->dtha_nactions;
15689 new->dtha_actions = kmem_alloc(sz, KM_SLEEP);
15690
15691 for (j = 0; j < new->dtha_nactions; j++) {
15692 dtrace_difo_t *dpj = helper->dtha_actions[j];
15693
15694 ASSERT(dpj != NULL);
15695 dpj = dtrace_difo_duplicate(dpj, vstate);
15696 new->dtha_actions[j] = dpj;
15697 }
15698
15699 if (last != NULL) {
15700 last->dtha_next = new;
15701 } else {
15702 newhelp->dthps_actions[i] = new;
15703 }
15704
15705 last = new;
15706 }
15707 }
15708
15709 /*
15710 * Duplicate the helper providers and register them with the
15711 * DTrace framework.
15712 */
15713 if (help->dthps_nprovs > 0) {
15714 newhelp->dthps_nprovs = help->dthps_nprovs;
15715 newhelp->dthps_maxprovs = help->dthps_nprovs;
15716 newhelp->dthps_provs = kmem_alloc(newhelp->dthps_nprovs *
15717 sizeof (dtrace_helper_provider_t *), KM_SLEEP);
15718 for (i = 0; i < newhelp->dthps_nprovs; i++) {
15719 newhelp->dthps_provs[i] = help->dthps_provs[i];
15720 newhelp->dthps_provs[i]->dthp_ref++;
15721 }
15722
15723 hasprovs = 1;
15724 }
15725
15726 lck_mtx_unlock(&dtrace_lock);
15727
15728 if (hasprovs)
15729 dtrace_helper_provider_register(to, newhelp, NULL);
15730
15731 lck_mtx_unlock(&dtrace_meta_lock);
15732}
15733
15734/*
15735 * DTrace Process Functions
15736 */
15737
15738void
15739dtrace_proc_fork(proc_t *parent_proc, proc_t *child_proc, int spawn)
15740{
15741 /*
15742 * This code applies to new processes who are copying the task
15743 * and thread state and address spaces of their parent process.
15744 */
15745 if (!spawn) {
15746 /*
15747 * APPLE NOTE: Solaris does a sprlock() and drops the
15748 * proc_lock here. We're cheating a bit and only taking
15749 * the p_dtrace_sprlock lock. A full sprlock would
15750 * task_suspend the parent.
15751 */
15752 dtrace_sprlock(parent_proc);
15753
15754 /*
15755 * Remove all DTrace tracepoints from the child process. We
15756 * need to do this _before_ duplicating USDT providers since
15757 * any associated probes may be immediately enabled.
15758 */
15759 if (parent_proc->p_dtrace_count > 0) {
15760 dtrace_fasttrap_fork(parent_proc, child_proc);
15761 }
15762
15763 dtrace_sprunlock(parent_proc);
15764
15765 /*
15766 * Duplicate any lazy dof(s). This must be done while NOT
15767 * holding the parent sprlock! Lock ordering is
15768 * dtrace_dof_mode_lock, then sprlock. It is imperative we
15769 * always call dtrace_lazy_dofs_duplicate, rather than null
15770 * check and call if !NULL. If we NULL test, during lazy dof
15771 * faulting we can race with the faulting code and proceed
15772 * from here to beyond the helpers copy. The lazy dof
15773 * faulting will then fail to copy the helpers to the child
15774		 * process. We return early if we duplicated lazy dofs: a process
15775		 * may only have one of the two at a time, which avoids a race
15776		 * between a dtrace client and dtrace_proc_fork in which a process
15777		 * would end up with both lazy dofs and helpers.
15778 */
15779 if (dtrace_lazy_dofs_duplicate(parent_proc, child_proc) == DTRACE_LAZY_DOFS_DUPLICATED) {
15780 return;
15781 }
15782
15783 /*
15784 * Duplicate any helper actions and providers if they haven't
15785 * already.
15786 */
15787#if !defined(__APPLE__)
15788	/*
15789	 * The SFORKING flag we set above informs the code that enables
15790	 * USDT probes that sprlock() may fail because the child is
15791	 * being forked.
15792	 */
15793#endif
15794 /*
15795 * APPLE NOTE: As best I can tell, Apple's sprlock() equivalent
15796 * never fails to find the child. We do not set SFORKING.
15797 */
15798 if (parent_proc->p_dtrace_helpers != NULL && dtrace_helpers_fork) {
15799 (*dtrace_helpers_fork)(parent_proc, child_proc);
15800 }
15801 }
15802}
15803
15804void
15805dtrace_proc_exec(proc_t *p)
15806{
15807 /*
15808 * Invalidate any predicate evaluation already cached for this thread by DTrace.
15809 * That's because we've just stored to p_comm and DTrace refers to that when it
15810 * evaluates the "execname" special variable. uid and gid may have changed as well.
15811 */
15812 dtrace_set_thread_predcache(current_thread(), 0);
15813
15814 /*
15815 * Free any outstanding lazy dof entries. It is imperative we
15816 * always call dtrace_lazy_dofs_destroy, rather than null check
15817 * and call if !NULL. If we NULL test, during lazy dof faulting
15818 * we can race with the faulting code and proceed from here to
15819 * beyond the helpers cleanup. The lazy dof faulting will then
15820 * install new helpers which no longer belong to this process!
15821 */
15822 dtrace_lazy_dofs_destroy(p);
15823
15824
15825 /*
15826 * Clean up any DTrace helpers for the process.
15827 */
15828 if (p->p_dtrace_helpers != NULL && dtrace_helpers_cleanup) {
15829 (*dtrace_helpers_cleanup)(p);
15830 }
15831
15832 /*
15833 * Cleanup the DTrace provider associated with this process.
15834 */
15835 proc_lock(p);
15836 if (p->p_dtrace_probes && dtrace_fasttrap_exec_ptr) {
15837 (*dtrace_fasttrap_exec_ptr)(p);
15838 }
15839 proc_unlock(p);
15840}
15841
15842void
15843dtrace_proc_exit(proc_t *p)
15844{
15845 /*
15846 * Free any outstanding lazy dof entries. It is imperative we
15847 * always call dtrace_lazy_dofs_destroy, rather than null check
15848 * and call if !NULL. If we NULL test, during lazy dof faulting
15849 * we can race with the faulting code and proceed from here to
15850 * beyond the helpers cleanup. The lazy dof faulting will then
15851 * install new helpers which will never be cleaned up, and leak.
15852 */
15853 dtrace_lazy_dofs_destroy(p);
15854
15855 /*
15856 * Clean up any DTrace helper actions or probes for the process.
15857 */
15858 if (p->p_dtrace_helpers != NULL) {
15859 (*dtrace_helpers_cleanup)(p);
15860 }
15861
15862 /*
15863 * Clean up any DTrace probes associated with this process.
15864 */
15865 /*
15866 * APPLE NOTE: We release ptss pages/entries in dtrace_fasttrap_exit_ptr(),
15867 * call this after dtrace_helpers_cleanup()
15868 */
15869 proc_lock(p);
15870 if (p->p_dtrace_probes && dtrace_fasttrap_exit_ptr) {
15871 (*dtrace_fasttrap_exit_ptr)(p);
15872 }
15873 proc_unlock(p);
15874}
15875
15876/*
15877 * DTrace Hook Functions
15878 */
15879
15880/*
15881 * APPLE NOTE: dtrace_modctl_* routines for kext support.
15882 * Used to manipulate the modctl list within dtrace xnu.
15883 */
15884
15885modctl_t *dtrace_modctl_list;
15886
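/*
 * Shadow-list layout (a sketch): every loaded kext has one modctl entry
 * on dtrace_modctl_list, linked by mod_next. A kext that was unloaded
 * while its probes were still referenced is parked on the mod_stale
 * chain of the entry created when a kext of the same name reloads:
 *
 *	dtrace_modctl_list -> [A] -> [B] -> [C] -> NULL
 *	                              |
 *	                          mod_stale
 *	                              v
 *	                         [B (stale)]
 */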
15887static void
15888dtrace_modctl_add(struct modctl * newctl)
15889{
15890 struct modctl *nextp, *prevp;
15891
15892 ASSERT(newctl != NULL);
15893 LCK_MTX_ASSERT(&mod_lock, LCK_MTX_ASSERT_OWNED);
15894
15895	// Insert the new module at the front of the list.
15896
15897 newctl->mod_next = dtrace_modctl_list;
15898 dtrace_modctl_list = newctl;
15899
15900 /*
15901 * If a module exists with the same name, then that module
15902 * must have been unloaded with enabled probes. We will move
15903 * the unloaded module to the new module's stale chain and
15904 * then stop traversing the list.
15905 */
15906
15907 prevp = newctl;
15908 nextp = newctl->mod_next;
15909
15910 while (nextp != NULL) {
15911 if (nextp->mod_loaded) {
15912 /* This is a loaded module. Keep traversing. */
15913 prevp = nextp;
15914 nextp = nextp->mod_next;
15915 continue;
15916 }
15917 else {
15918 /* Found an unloaded module */
15919 if (strncmp (newctl->mod_modname, nextp->mod_modname, KMOD_MAX_NAME)) {
15920 /* Names don't match. Keep traversing. */
15921 prevp = nextp;
15922 nextp = nextp->mod_next;
15923 continue;
15924 }
15925 else {
15926 /* We found a stale entry, move it. We're done. */
15927 prevp->mod_next = nextp->mod_next;
15928 newctl->mod_stale = nextp;
15929 nextp->mod_next = NULL;
15930 break;
15931 }
15932 }
15933 }
15934}
15935
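/*
 * Look up the modctl entry for a kmod by id. The caller must hold
 * mod_lock; returns NULL if the kext is not in the shadow list.
 */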
15936static modctl_t *
15937dtrace_modctl_lookup(struct kmod_info * kmod)
15938{
15939 LCK_MTX_ASSERT(&mod_lock, LCK_MTX_ASSERT_OWNED);
15940
15941 struct modctl * ctl;
15942
15943 for (ctl = dtrace_modctl_list; ctl; ctl=ctl->mod_next) {
15944 if (ctl->mod_id == kmod->id)
15945 return(ctl);
15946 }
15947 return (NULL);
15948}
15949
15950/*
15951 * This routine is called from dtrace_module_unloaded().
15952 * It removes a modctl structure and its stale chain
15953 * from the kext shadow list.
15954 */
15955static void
15956dtrace_modctl_remove(struct modctl * ctl)
15957{
15958 ASSERT(ctl != NULL);
15959 LCK_MTX_ASSERT(&mod_lock, LCK_MTX_ASSERT_OWNED);
15960 modctl_t *prevp, *nextp, *curp;
15961
15962 // Remove stale chain first
15963 for (curp=ctl->mod_stale; curp != NULL; curp=nextp) {
15964 nextp = curp->mod_stale;
15965 /* There should NEVER be user symbols allocated at this point */
15966 ASSERT(curp->mod_user_symbols == NULL);
15967 kmem_free(curp, sizeof(modctl_t));
15968 }
15969
15970 prevp = NULL;
15971 curp = dtrace_modctl_list;
15972
15973 while (curp != ctl) {
15974 prevp = curp;
15975 curp = curp->mod_next;
15976 }
15977
15978 if (prevp != NULL) {
15979 prevp->mod_next = ctl->mod_next;
15980 }
15981 else {
15982 dtrace_modctl_list = ctl->mod_next;
15983 }
15984
15985 /* There should NEVER be user symbols allocated at this point */
15986 ASSERT(ctl->mod_user_symbols == NULL);
15987
15988 kmem_free (ctl, sizeof(modctl_t));
15989}
15990
15991/*
15992 * APPLE NOTE: The kext loader will call dtrace_module_loaded
15993 * when the kext is loaded in memory, but before calling the
15994 * kext's start routine.
15995 *
15996 * Return 0 on success
15997 * Return -1 on failure
15998 */
15999
16000static int
16001dtrace_module_loaded(struct kmod_info *kmod, uint32_t flag)
16002{
16003 dtrace_provider_t *prv;
16004
16005 /*
16006	 * If kernel symbols have been disabled, return immediately.
16007	 * DTRACE_KERNEL_SYMBOLS_NEVER is a permanent mode; it is safe to test without holding locks.
16008 */
16009 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER)
16010 return 0;
16011
16012 struct modctl *ctl = NULL;
16013 if (!kmod || kmod->address == 0 || kmod->size == 0)
16014 return(-1);
16015
16016 lck_mtx_lock(&dtrace_provider_lock);
16017 lck_mtx_lock(&mod_lock);
16018
16019 /*
16020 * Have we seen this kext before?
16021 */
16022
16023 ctl = dtrace_modctl_lookup(kmod);
16024
16025 if (ctl != NULL) {
16026 /* bail... we already have this kext in the modctl list */
16027 lck_mtx_unlock(&mod_lock);
16028 lck_mtx_unlock(&dtrace_provider_lock);
16029 if (dtrace_err_verbose)
16030 cmn_err(CE_WARN, "dtrace load module already exists '%s %u' is failing against '%s %u'", kmod->name, (uint_t)kmod->id, ctl->mod_modname, ctl->mod_id);
16031 return(-1);
16032 }
16033 else {
16034 ctl = kmem_alloc(sizeof(struct modctl), KM_SLEEP);
16035 if (ctl == NULL) {
16036 if (dtrace_err_verbose)
16037 cmn_err(CE_WARN, "dtrace module load '%s %u' is failing ", kmod->name, (uint_t)kmod->id);
16038 lck_mtx_unlock(&mod_lock);
16039 lck_mtx_unlock(&dtrace_provider_lock);
16040 return (-1);
16041 }
16042 ctl->mod_next = NULL;
16043 ctl->mod_stale = NULL;
16044 strlcpy (ctl->mod_modname, kmod->name, sizeof(ctl->mod_modname));
16045 ctl->mod_loadcnt = kmod->id;
16046 ctl->mod_nenabled = 0;
16047 ctl->mod_address = kmod->address;
16048 ctl->mod_size = kmod->size;
16049 ctl->mod_id = kmod->id;
16050 ctl->mod_loaded = 1;
16051 ctl->mod_flags = 0;
16052 ctl->mod_user_symbols = NULL;
16053
16054 /*
16055		 * Find the UUID for this module, if it has one, by walking its Mach-O load commands
16056 */
16057 kernel_mach_header_t* header = (kernel_mach_header_t *)ctl->mod_address;
16058 struct load_command* load_cmd = (struct load_command *)&header[1];
16059 uint32_t i;
16060 for (i = 0; i < header->ncmds; i++) {
16061 if (load_cmd->cmd == LC_UUID) {
16062 struct uuid_command* uuid_cmd = (struct uuid_command *)load_cmd;
16063 memcpy(ctl->mod_uuid, uuid_cmd->uuid, sizeof(uuid_cmd->uuid));
16064 ctl->mod_flags |= MODCTL_HAS_UUID;
16065 break;
16066 }
16067 load_cmd = (struct load_command *)((caddr_t)load_cmd + load_cmd->cmdsize);
16068 }
16069
16070 if (ctl->mod_address == g_kernel_kmod_info.address) {
16071 ctl->mod_flags |= MODCTL_IS_MACH_KERNEL;
16072 memcpy(dtrace_kerneluuid, ctl->mod_uuid, sizeof(dtrace_kerneluuid));
16073 }
16074 /*
16075		 * Static kexts have a UUID that is not used for symbolication,
16076		 * as all their symbols are in the kernel.
16077 */
16078 else if ((flag & KMOD_DTRACE_STATIC_KEXT) == KMOD_DTRACE_STATIC_KEXT) {
16079 memcpy(ctl->mod_uuid, dtrace_kerneluuid, sizeof(dtrace_kerneluuid));
16080 ctl->mod_flags |= MODCTL_IS_STATIC_KEXT;
16081 }
16082 }
16083 dtrace_modctl_add(ctl);
16084
16085 /*
16086	 * We must hold the dtrace_lock to safely test the non-permanent dtrace_kernel_symbol_mode values
16087 */
16088 lck_mtx_lock(&dtrace_lock);
16089
16090 /*
16091 * DTrace must decide if it will instrument modules lazily via
16092 * userspace symbols (default mode), or instrument immediately via
16093	 * kernel symbols (non-default mode).
16094 *
16095 * When in default/lazy mode, DTrace will only support modules
16096 * built with a valid UUID.
16097 *
16098 * Overriding the default can be done explicitly in one of
16099 * the following two ways.
16100 *
16101 * A module can force symbols from kernel space using the plist key,
16102 * OSBundleForceDTraceInit (see kmod.h). If this per kext state is set,
16103 * we fall through and instrument this module now.
16104 *
16105 * Or, the boot-arg, dtrace_kernel_symbol_mode, can be set to force symbols
16106 * from kernel space (see dtrace_impl.h). If this system state is set
16107 * to a non-userspace mode, we fall through and instrument the module now.
16108 */
16109
16110 if ((dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE) &&
16111 (!(flag & KMOD_DTRACE_FORCE_INIT)))
16112 {
16113 /* We will instrument the module lazily -- this is the default */
16114 lck_mtx_unlock(&dtrace_lock);
16115 lck_mtx_unlock(&mod_lock);
16116 lck_mtx_unlock(&dtrace_provider_lock);
16117 return 0;
16118 }
16119
16120 /* We will instrument the module immediately using kernel symbols */
16121 ctl->mod_flags |= MODCTL_HAS_KERNEL_SYMBOLS;
16122
16123 lck_mtx_unlock(&dtrace_lock);
16124
16125 /*
16126	 * We're going to call each provider's per-module provide operation
16127 * specifying only this module.
16128 */
16129 for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
16130 prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
16131
16132 /*
16133 * APPLE NOTE: The contract with the kext loader is that once this function
16134 * has completed, it may delete kernel symbols at will.
16135 * We must set this while still holding the mod_lock.
16136 */
16137 ctl->mod_flags &= ~MODCTL_HAS_KERNEL_SYMBOLS;
16138
16139 lck_mtx_unlock(&mod_lock);
16140 lck_mtx_unlock(&dtrace_provider_lock);
16141
16142 /*
16143 * If we have any retained enablings, we need to match against them.
16144 * Enabling probes requires that cpu_lock be held, and we cannot hold
16145 * cpu_lock here -- it is legal for cpu_lock to be held when loading a
16146 * module. (In particular, this happens when loading scheduling
16147 * classes.) So if we have any retained enablings, we need to dispatch
16148 * our task queue to do the match for us.
16149 */
16150 lck_mtx_lock(&dtrace_lock);
16151
16152 if (dtrace_retained == NULL) {
16153 lck_mtx_unlock(&dtrace_lock);
16154 return 0;
16155 }
16156
16157	/* APPLE NOTE!
16158	 *
16159	 * The cpu_lock mentioned above is only held by dtrace code; Apple's xnu
16160	 * never actually holds it otherwise. The comment above is therefore
16161	 * invalid here: we can invoke dtrace_enabling_matchall directly, without
16162	 * jumping through the hoops, and we can avoid the delay call as well.
16163	 */
16164 lck_mtx_unlock(&dtrace_lock);
16165
16166 dtrace_enabling_matchall();
16167
16168 return 0;
16169}
16170
16171/*
16172 * Called with the unloading kmod, or with a NULL kmod (from dtrace_close)
16173 * to sweep every stale modctl entry. Returns 0 on success, -1 on failure.
16174 */
16175static int
16176dtrace_module_unloaded(struct kmod_info *kmod)
16177{
16178 dtrace_probe_t template, *probe, *first, *next;
16179 dtrace_provider_t *prov;
16180 struct modctl *ctl = NULL;
16181 struct modctl *syncctl = NULL;
16182 struct modctl *nextsyncctl = NULL;
16183 int syncmode = 0;
16184
16185 lck_mtx_lock(&dtrace_provider_lock);
16186 lck_mtx_lock(&mod_lock);
16187 lck_mtx_lock(&dtrace_lock);
16188
16189 if (kmod == NULL) {
16190 syncmode = 1;
16191 }
16192 else {
16193 ctl = dtrace_modctl_lookup(kmod);
16194 if (ctl == NULL)
16195 {
16196 lck_mtx_unlock(&dtrace_lock);
16197 lck_mtx_unlock(&mod_lock);
16198 lck_mtx_unlock(&dtrace_provider_lock);
16199 return (-1);
16200 }
16201 ctl->mod_loaded = 0;
16202 ctl->mod_address = 0;
16203 ctl->mod_size = 0;
16204 }
16205
16206 if (dtrace_bymod == NULL) {
16207 /*
16208 * The DTrace module is loaded (obviously) but not attached;
16209 * we don't have any work to do.
16210 */
16211 if (ctl != NULL)
16212			dtrace_modctl_remove(ctl);
16213 lck_mtx_unlock(&dtrace_lock);
16214 lck_mtx_unlock(&mod_lock);
16215 lck_mtx_unlock(&dtrace_provider_lock);
16216 return(0);
16217 }
16218
16219	/* Syncmode set means we target and traverse the entire modctl list. */
16220 if (syncmode)
16221 nextsyncctl = dtrace_modctl_list;
16222
16223syncloop:
16224 if (syncmode)
16225 {
16226 /* find a stale modctl struct */
16227 for (syncctl = nextsyncctl; syncctl != NULL; syncctl=syncctl->mod_next) {
16228 if (syncctl->mod_address == 0)
16229 break;
16230 }
16231 if (syncctl==NULL)
16232 {
16233 /* We have no more work to do */
16234 lck_mtx_unlock(&dtrace_lock);
16235 lck_mtx_unlock(&mod_lock);
16236 lck_mtx_unlock(&dtrace_provider_lock);
16237 return(0);
16238 }
16239 else {
16240 /* keep track of next syncctl in case this one is removed */
16241 nextsyncctl = syncctl->mod_next;
16242 ctl = syncctl;
16243 }
16244 }
16245
16246 template.dtpr_mod = ctl->mod_modname;
16247
16248 for (probe = first = dtrace_hash_lookup(dtrace_bymod, &template);
16249 probe != NULL; probe = probe->dtpr_nextmod) {
16250 if (probe->dtpr_ecb != NULL) {
16251 /*
16252 * This shouldn't _actually_ be possible -- we're
16253 * unloading a module that has an enabled probe in it.
16254 * (It's normally up to the provider to make sure that
16255 * this can't happen.) However, because dtps_enable()
16256 * doesn't have a failure mode, there can be an
16257 * enable/unload race. Upshot: we don't want to
16258 * assert, but we're not going to disable the
16259 * probe, either.
16260 */
16261
16262
16263 if (syncmode) {
16264 /* We're syncing, let's look at next in list */
16265 goto syncloop;
16266 }
16267
16268 lck_mtx_unlock(&dtrace_lock);
16269 lck_mtx_unlock(&mod_lock);
16270 lck_mtx_unlock(&dtrace_provider_lock);
16271
16272 if (dtrace_err_verbose) {
16273 cmn_err(CE_WARN, "unloaded module '%s' had "
16274 "enabled probes", ctl->mod_modname);
16275 }
16276 return(-1);
16277 }
16278 }
16279
16280 probe = first;
16281
16282 for (first = NULL; probe != NULL; probe = next) {
16283 ASSERT(dtrace_probes[probe->dtpr_id - 1] == probe);
16284
16285 dtrace_probes[probe->dtpr_id - 1] = NULL;
16286 probe->dtpr_provider->dtpv_probe_count--;
16287
16288 next = probe->dtpr_nextmod;
16289 dtrace_hash_remove(dtrace_byprov, probe);
16290 dtrace_hash_remove(dtrace_bymod, probe);
16291 dtrace_hash_remove(dtrace_byfunc, probe);
16292 dtrace_hash_remove(dtrace_byname, probe);
16293
16294 if (first == NULL) {
16295 first = probe;
16296 probe->dtpr_nextmod = NULL;
16297 } else {
16298 probe->dtpr_nextmod = first;
16299 first = probe;
16300 }
16301 }
16302
16303 /*
16304 * We've removed all of the module's probes from the hash chains and
16305 * from the probe array. Now issue a dtrace_sync() to be sure that
16306 * everyone has cleared out from any probe array processing.
16307 */
16308 dtrace_sync();
16309
16310 for (probe = first; probe != NULL; probe = first) {
16311 first = probe->dtpr_nextmod;
16312 prov = probe->dtpr_provider;
16313 prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id,
16314 probe->dtpr_arg);
16315 dtrace_strunref(probe->dtpr_mod);
16316 dtrace_strunref(probe->dtpr_func);
16317 dtrace_strunref(probe->dtpr_name);
16318 vmem_free(dtrace_arena, (void *)(uintptr_t)probe->dtpr_id, 1);
16319
16320 zfree(dtrace_probe_t_zone, probe);
16321 }
16322
16323 dtrace_modctl_remove(ctl);
16324
16325 if (syncmode)
16326 goto syncloop;
16327
16328 lck_mtx_unlock(&dtrace_lock);
16329 lck_mtx_unlock(&mod_lock);
16330 lck_mtx_unlock(&dtrace_provider_lock);
16331
16332 return(0);
16333}
16334
16335void
16336dtrace_suspend(void)
16337{
16338 dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_suspend));
16339}
16340
16341void
16342dtrace_resume(void)
16343{
16344 dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_resume));
16345}
16346
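/*
 * CPU hotplug callback, registered with register_cpu_setup_func() in
 * dtrace_attach(). On CPU_CONFIG, buffers for active anonymous state are
 * grown onto the new CPU by briefly forcing a manual, single-CPU buffer
 * resize.
 */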
16347static int
16348dtrace_cpu_setup(cpu_setup_t what, processorid_t cpu)
16349{
16350 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
16351 lck_mtx_lock(&dtrace_lock);
16352
16353 switch (what) {
16354 case CPU_CONFIG: {
16355 dtrace_state_t *state;
16356 dtrace_optval_t *opt, rs, c;
16357
16358 /*
16359 * For now, we only allocate a new buffer for anonymous state.
16360 */
16361 if ((state = dtrace_anon.dta_state) == NULL)
16362 break;
16363
16364 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
16365 break;
16366
16367 opt = state->dts_options;
16368 c = opt[DTRACEOPT_CPU];
16369
16370 if (c != DTRACE_CPUALL && c != DTRACEOPT_UNSET && c != cpu)
16371 break;
16372
16373 /*
16374 * Regardless of what the actual policy is, we're going to
16375 * temporarily set our resize policy to be manual. We're
16376 * also going to temporarily set our CPU option to denote
16377 * the newly configured CPU.
16378 */
16379 rs = opt[DTRACEOPT_BUFRESIZE];
16380 opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_MANUAL;
16381 opt[DTRACEOPT_CPU] = (dtrace_optval_t)cpu;
16382
16383 (void) dtrace_state_buffers(state);
16384
16385 opt[DTRACEOPT_BUFRESIZE] = rs;
16386 opt[DTRACEOPT_CPU] = c;
16387
16388 break;
16389 }
16390
16391 case CPU_UNCONFIG:
16392 /*
16393 * We don't free the buffer in the CPU_UNCONFIG case. (The
16394 * buffer will be freed when the consumer exits.)
16395 */
16396 break;
16397
16398 default:
16399 break;
16400 }
16401
16402 lck_mtx_unlock(&dtrace_lock);
16403 return (0);
16404}
16405
16406static void
16407dtrace_cpu_setup_initial(processorid_t cpu)
16408{
16409 (void) dtrace_cpu_setup(CPU_CONFIG, cpu);
16410}
16411
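/*
 * Record a range of virtual addresses, [base, limit), that DTrace must
 * never dereference. The backing array doubles whenever it fills, and
 * existing entries are copied into the new allocation.
 */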
16412static void
16413dtrace_toxrange_add(uintptr_t base, uintptr_t limit)
16414{
16415 if (dtrace_toxranges >= dtrace_toxranges_max) {
16416 int osize, nsize;
16417 dtrace_toxrange_t *range;
16418
16419 osize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
16420
16421 if (osize == 0) {
16422 ASSERT(dtrace_toxrange == NULL);
16423 ASSERT(dtrace_toxranges_max == 0);
16424 dtrace_toxranges_max = 1;
16425 } else {
16426 dtrace_toxranges_max <<= 1;
16427 }
16428
16429 nsize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
16430 range = kmem_zalloc(nsize, KM_SLEEP);
16431
16432 if (dtrace_toxrange != NULL) {
16433 ASSERT(osize != 0);
16434 bcopy(dtrace_toxrange, range, osize);
16435 kmem_free(dtrace_toxrange, osize);
16436 }
16437
16438 dtrace_toxrange = range;
16439 }
16440
16441 ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_base == 0);
16442 ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_limit == 0);
16443
16444 dtrace_toxrange[dtrace_toxranges].dtt_base = base;
16445 dtrace_toxrange[dtrace_toxranges].dtt_limit = limit;
16446 dtrace_toxranges++;
16447}
16448
16449/*
16450 * DTrace Driver Cookbook Functions
16451 */
16452/*ARGSUSED*/
16453static int
16454dtrace_attach(dev_info_t *devi)
16455{
16456 dtrace_provider_id_t id;
16457 dtrace_state_t *state = NULL;
16458 dtrace_enabling_t *enab;
16459
16460 lck_mtx_lock(&cpu_lock);
16461 lck_mtx_lock(&dtrace_provider_lock);
16462 lck_mtx_lock(&dtrace_lock);
16463
16464	/* Darwin uses the BSD cloning device driver to automagically obtain a minor device number. */
16465 dtrace_devi = devi;
16466
16467 dtrace_modload = dtrace_module_loaded;
16468 dtrace_modunload = dtrace_module_unloaded;
16469 dtrace_cpu_init = dtrace_cpu_setup_initial;
16470 dtrace_helpers_cleanup = dtrace_helpers_destroy;
16471 dtrace_helpers_fork = dtrace_helpers_duplicate;
16472 dtrace_cpustart_init = dtrace_suspend;
16473 dtrace_cpustart_fini = dtrace_resume;
16474 dtrace_debugger_init = dtrace_suspend;
16475 dtrace_debugger_fini = dtrace_resume;
16476
16477 register_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
16478
16479 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
16480
16481 dtrace_arena = vmem_create("dtrace", (void *)1, UINT32_MAX, 1,
16482 NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
16483
16484 dtrace_state_cache = kmem_cache_create("dtrace_state_cache",
16485 sizeof (dtrace_dstate_percpu_t) * (int)NCPU, DTRACE_STATE_ALIGN,
16486 NULL, NULL, NULL, NULL, NULL, 0);
16487
16488 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
16489
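	/*
	 * Create the four probe hash tables; each indexes every probe by one
	 * component of its name (provider, module, function, name) so that
	 * matching can search on any single component.
	 */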
16490 dtrace_byprov = dtrace_hash_create(dtrace_strkey_probe_provider,
16491 0, /* unused */
16492 offsetof(dtrace_probe_t, dtpr_nextprov),
16493 offsetof(dtrace_probe_t, dtpr_prevprov));
16494
16495 dtrace_bymod = dtrace_hash_create(dtrace_strkey_deref_offset,
16496 offsetof(dtrace_probe_t, dtpr_mod),
16497 offsetof(dtrace_probe_t, dtpr_nextmod),
16498 offsetof(dtrace_probe_t, dtpr_prevmod));
16499
16500 dtrace_byfunc = dtrace_hash_create(dtrace_strkey_deref_offset,
16501 offsetof(dtrace_probe_t, dtpr_func),
16502 offsetof(dtrace_probe_t, dtpr_nextfunc),
16503 offsetof(dtrace_probe_t, dtpr_prevfunc));
16504
16505 dtrace_byname = dtrace_hash_create(dtrace_strkey_deref_offset,
16506 offsetof(dtrace_probe_t, dtpr_name),
16507 offsetof(dtrace_probe_t, dtpr_nextname),
16508 offsetof(dtrace_probe_t, dtpr_prevname));
16509
16510 if (dtrace_retain_max < 1) {
16511 cmn_err(CE_WARN, "illegal value (%lu) for dtrace_retain_max; "
16512 "setting to 1", dtrace_retain_max);
16513 dtrace_retain_max = 1;
16514 }
16515
16516 /*
16517 * Now discover our toxic ranges.
16518 */
16519 dtrace_toxic_ranges(dtrace_toxrange_add);
16520
16521 /*
16522 * Before we register ourselves as a provider to our own framework,
16523 * we would like to assert that dtrace_provider is NULL -- but that's
16524 * not true if we were loaded as a dependency of a DTrace provider.
16525 * Once we've registered, we can assert that dtrace_provider is our
16526 * pseudo provider.
16527 */
16528 (void) dtrace_register("dtrace", &dtrace_provider_attr,
16529 DTRACE_PRIV_NONE, 0, &dtrace_provider_ops, NULL, &id);
16530
16531 ASSERT(dtrace_provider != NULL);
16532 ASSERT((dtrace_provider_id_t)dtrace_provider == id);
16533
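	/*
	 * The integer after the probe name below is the aframes count: the
	 * number of artificial stack frames to skip when walking the stack
	 * from these probes. arm/arm64 consume one more frame than x86_64
	 * for each of BEGIN, END and ERROR.
	 */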
16534#if defined (__x86_64__)
16535 dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t)
16536 dtrace_provider, NULL, NULL, "BEGIN", 1, NULL);
16537 dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t)
16538 dtrace_provider, NULL, NULL, "END", 0, NULL);
16539 dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t)
16540 dtrace_provider, NULL, NULL, "ERROR", 3, NULL);
16541#elif (defined(__arm__) || defined(__arm64__))
16542 dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t)
16543 dtrace_provider, NULL, NULL, "BEGIN", 2, NULL);
16544 dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t)
16545 dtrace_provider, NULL, NULL, "END", 1, NULL);
16546 dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t)
16547 dtrace_provider, NULL, NULL, "ERROR", 4, NULL);
16548#else
16549#error Unknown Architecture
16550#endif
16551
16552 dtrace_anon_property();
16553 lck_mtx_unlock(&cpu_lock);
16554
16555 /*
16556 * If DTrace helper tracing is enabled, we need to allocate the
16557 * trace buffer and initialize the values.
16558 */
16559 if (dtrace_helptrace_enabled) {
16560 ASSERT(dtrace_helptrace_buffer == NULL);
16561 dtrace_helptrace_buffer =
16562 kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP);
16563 dtrace_helptrace_next = 0;
16564 }
16565
16566 /*
16567 * If there are already providers, we must ask them to provide their
16568 * probes, and then match any anonymous enabling against them. Note
16569 * that there should be no other retained enablings at this time:
16570 * the only retained enablings at this time should be the anonymous
16571 * enabling.
16572 */
16573 if (dtrace_anon.dta_enabling != NULL) {
16574 ASSERT(dtrace_retained == dtrace_anon.dta_enabling);
16575
16576 /*
16577 * APPLE NOTE: if handling anonymous dof, switch symbol modes.
16578 */
16579 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE) {
16580 dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_KERNEL;
16581 }
16582
16583 dtrace_enabling_provide(NULL);
16584 state = dtrace_anon.dta_state;
16585
16586 /*
16587 * We couldn't hold cpu_lock across the above call to
16588 * dtrace_enabling_provide(), but we must hold it to actually
16589 * enable the probes. We have to drop all of our locks, pick
16590 * up cpu_lock, and regain our locks before matching the
16591 * retained anonymous enabling.
16592 */
16593 lck_mtx_unlock(&dtrace_lock);
16594 lck_mtx_unlock(&dtrace_provider_lock);
16595
16596 lck_mtx_lock(&cpu_lock);
16597 lck_mtx_lock(&dtrace_provider_lock);
16598 lck_mtx_lock(&dtrace_lock);
16599
16600 if ((enab = dtrace_anon.dta_enabling) != NULL)
16601 (void) dtrace_enabling_match(enab, NULL, NULL);
16602
16603 lck_mtx_unlock(&cpu_lock);
16604 }
16605
16606 lck_mtx_unlock(&dtrace_lock);
16607 lck_mtx_unlock(&dtrace_provider_lock);
16608
16609 if (state != NULL) {
16610 /*
16611 * If we created any anonymous state, set it going now.
16612 */
16613 (void) dtrace_state_go(state, &dtrace_anon.dta_beganon);
16614 }
16615
16616 return (DDI_SUCCESS);
16617}
16618
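/*
 * Driver open entry point. Each open creates a fresh consumer state via
 * dtrace_state_create(); an open while in lazy mode also transitions to
 * DTRACE_DOF_MODE_LAZY_OFF and faults in lazy dofs for all existing
 * processes.
 */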
16619/*ARGSUSED*/
16620static int
16621dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
16622{
16623#pragma unused(flag, otyp)
16624 dtrace_state_t *state;
16625 uint32_t priv;
16626 uid_t uid;
16627 zoneid_t zoneid;
16628 int rv;
16629
16630 /* APPLE: Darwin puts Helper on its own major device. */
16631
16632 /*
16633 * If no DTRACE_PRIV_* bits are set in the credential, then the
16634 * caller lacks sufficient permission to do anything with DTrace.
16635 */
16636 dtrace_cred2priv(cred_p, &priv, &uid, &zoneid);
16637 if (priv == DTRACE_PRIV_NONE)
16638 return (EACCES);
16639
16640 /*
16641 * APPLE NOTE: We delay the initialization of fasttrap as late as possible.
16642 * It certainly can't be later than now!
16643 */
16644 fasttrap_init();
16645
16646 /*
16647 * Ask all providers to provide all their probes.
16648 */
16649 lck_mtx_lock(&dtrace_provider_lock);
16650 dtrace_probe_provide(NULL, NULL);
16651 lck_mtx_unlock(&dtrace_provider_lock);
16652
16653 lck_mtx_lock(&cpu_lock);
16654 lck_mtx_lock(&dtrace_lock);
16655 dtrace_opens++;
16656 dtrace_membar_producer();
16657
16658#ifdef illumos
16659 /*
16660 * If the kernel debugger is active (that is, if the kernel debugger
16661 * modified text in some way), we won't allow the open.
16662 */
16663 if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
16664 dtrace_opens--;
16665 lck_mtx_unlock(&dtrace_lock);
16666 lck_mtx_unlock(&cpu_lock);
16667 return (EBUSY);
16668 }
16669#endif
16670
16671 rv = dtrace_state_create(devp, cred_p, &state);
16672 lck_mtx_unlock(&cpu_lock);
16673
16674 if (rv != 0 || state == NULL) {
16675 if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL) {
16676#ifdef illumos
16677 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
16678#endif
16679 }
16680 lck_mtx_unlock(&dtrace_lock);
16681 /* propagate EAGAIN or ERESTART */
16682 return (rv);
16683 }
16684
16685 lck_mtx_unlock(&dtrace_lock);
16686
16687 lck_rw_lock_exclusive(&dtrace_dof_mode_lock);
16688
16689 /*
16690 * If we are currently lazy, transition states.
16691 *
16692 * Unlike dtrace_close, we do not need to check the
16693 * value of dtrace_opens, as any positive value (and
16694 * we count as 1) means we transition states.
16695 */
16696 if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON) {
16697 dtrace_dof_mode = DTRACE_DOF_MODE_LAZY_OFF;
16698 /*
16699		 * We do not need to hold the exclusive lock while processing
16700		 * DOF on processes. We do need to make sure the mode does not
16701		 * get changed back to DTRACE_DOF_MODE_LAZY_ON during that stage
16702		 * (which should not happen anyway, since that only happens in
16703		 * dtrace_close). There is no way incomplete USDT probes can be
16704		 * activated by any DTrace client here, since they all have to
16705		 * call dtrace_open and block on dtrace_dof_mode_lock.
16706 */
16707 lck_rw_lock_exclusive_to_shared(&dtrace_dof_mode_lock);
16708 /*
16709 * Iterate all existing processes and load lazy dofs.
16710 */
16711 proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS,
16712 dtrace_lazy_dofs_proc_iterate_doit,
16713 NULL,
16714 dtrace_lazy_dofs_proc_iterate_filter,
16715 NULL);
16716
16717 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
16718 }
16719 else {
16720 lck_rw_unlock_exclusive(&dtrace_dof_mode_lock);
16721 }
16722
16723
16724 /*
16725 * Update kernel symbol state.
16726 *
16727 * We must own the provider and dtrace locks.
16728 *
16729 * NOTE! It may appear there is a race by setting this value so late
16730 * after dtrace_probe_provide. However, any kext loaded after the
16731 * call to probe provide and before we set LAZY_OFF will be marked as
16732 * eligible for symbols from userspace. The same dtrace that is currently
16733 * calling dtrace_open() (this call!) will get a list of kexts needing
16734 * symbols and fill them in, thus closing the race window.
16735 *
16736	 * We want to set this value only after we are certain it will succeed,
16737	 * as this significantly reduces the complexity of error exits.
16738 */
16739 lck_mtx_lock(&dtrace_lock);
16740 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE) {
16741 dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_KERNEL;
16742 }
16743 lck_mtx_unlock(&dtrace_lock);
16744
16745 return (0);
16746}
16747
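/*
 * Driver close entry point. Destroys the consumer state (and any grabbed
 * anonymous state); on the last close, returns to lazy DOF mode and
 * userspace kernel-symbol mode, then reaps orphaned kext probes via
 * dtrace_module_unloaded(NULL).
 */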
16748/*ARGSUSED*/
16749static int
16750dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
16751{
16752#pragma unused(flag, otyp, cred_p) /* __APPLE__ */
16753 minor_t minor = getminor(dev);
16754 dtrace_state_t *state;
16755
16756 /* APPLE NOTE: Darwin puts Helper on its own major device. */
16757 state = dtrace_state_get(minor);
16758
16759 lck_mtx_lock(&cpu_lock);
16760 lck_mtx_lock(&dtrace_lock);
16761
16762 if (state->dts_anon) {
16763 /*
16764 * There is anonymous state. Destroy that first.
16765 */
16766 ASSERT(dtrace_anon.dta_state == NULL);
16767 dtrace_state_destroy(state->dts_anon);
16768 }
16769
16770 dtrace_state_destroy(state);
16771 ASSERT(dtrace_opens > 0);
16772
16773 /*
16774 * Only relinquish control of the kernel debugger interface when there
16775 * are no consumers and no anonymous enablings.
16776 */
16777 if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL) {
16778#ifdef illumos
16779 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
16780#endif
16781 }
16782
16783 lck_mtx_unlock(&dtrace_lock);
16784 lck_mtx_unlock(&cpu_lock);
16785
16786 /*
16787 * Lock ordering requires the dof mode lock be taken before
16788 * the dtrace_lock.
16789 */
16790 lck_rw_lock_exclusive(&dtrace_dof_mode_lock);
16791 lck_mtx_lock(&dtrace_lock);
16792
16793 if (dtrace_opens == 0) {
16794 /*
16795 * If we are currently lazy-off, and this is the last close, transition to
16796 * lazy state.
16797 */
16798 if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_OFF) {
16799 dtrace_dof_mode = DTRACE_DOF_MODE_LAZY_ON;
16800 }
16801
16802 /*
16803 * If we are the last dtrace client, switch back to lazy (from userspace) symbols
16804 */
16805 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_KERNEL) {
16806 dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE;
16807 }
16808 }
16809
16810 lck_mtx_unlock(&dtrace_lock);
16811 lck_rw_unlock_exclusive(&dtrace_dof_mode_lock);
16812
16813 /*
16814 * Kext probes may be retained past the end of the kext's lifespan. The
16815 * probes are kept until the last reference to them has been removed.
16816 * Since closing an active dtrace context is likely to drop that last reference,
16817 * let's take a shot at cleaning out the orphaned probes now.
16818 */
16819 dtrace_module_unloaded(NULL);
16820
16821 return (0);
16822}
16823
16824/*ARGSUSED*/
16825static int
16826dtrace_ioctl_helper(u_long cmd, caddr_t arg, int *rv)
16827{
16828#pragma unused(rv)
16829 /*
16830 * Safe to check this outside the dof mode lock
16831 */
16832 if (dtrace_dof_mode == DTRACE_DOF_MODE_NEVER)
16833 return KERN_SUCCESS;
16834
16835 switch (cmd) {
16836#if defined (__arm64__)
16837 case DTRACEHIOC_ADDDOF_U32:
16838 case DTRACEHIOC_ADDDOF_U64:
16839#else
16840 case DTRACEHIOC_ADDDOF:
16841#endif /* __arm64__*/
16842 {
16843 dof_helper_t *dhp = NULL;
16844 size_t dof_ioctl_data_size;
16845 dof_ioctl_data_t* multi_dof;
16846 unsigned int i;
16847 int rval = 0;
16848 user_addr_t user_address = *(user_addr_t*)arg;
16849 uint64_t dof_count;
16850 int multi_dof_claimed = 0;
16851 proc_t* p = current_proc();
16852
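		/*
		 * dof_ioctl_data_t is variable-length: a dofiod_count followed by
		 * that many dof_helper_t entries; DOF_IOCTL_DATA_T_SIZE() accounts
		 * for both. The count is copied in first to size the allocation,
		 * then the whole structure is copied and the count re-validated.
		 *
		 * A minimal userland sketch of this call (error handling elided;
		 * note the ioctl argument is the address of the pointer):
		 *
		 *	dof_ioctl_data_t *d = malloc(DOF_IOCTL_DATA_T_SIZE(1));
		 *	d->dofiod_count = 1;
		 *	d->dofiod_helpers[0].dofhp_addr = (uint64_t)(uintptr_t)dof;
		 *	d->dofiod_helpers[0].dofhp_dof  = (uint64_t)(uintptr_t)dof;
		 *	ioctl(fd, DTRACEHIOC_ADDDOF, &d);
		 *	generation = (int)d->dofiod_helpers[0].dofhp_dof;
		 */
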
16853 /*
16854 * If this is a restricted process and dtrace is restricted,
16855 * do not allow DOFs to be registered
16856 */
16857 if (dtrace_is_restricted() &&
16858 !dtrace_are_restrictions_relaxed() &&
16859 !dtrace_can_attach_to_proc(current_proc())) {
16860 return (EACCES);
16861 }
16862
16863 /*
16864		 * Read the number of DOF entries being passed in.
16865 */
16866 if (copyin(user_address + offsetof(dof_ioctl_data_t, dofiod_count),
16867 &dof_count,
16868 sizeof(dof_count))) {
16869 dtrace_dof_error(NULL, "failed to copyin dofiod_count");
16870 return (EFAULT);
16871 }
16872
16873 /*
16874 * Range check the count.
16875 */
16876 if (dof_count == 0 || dof_count > 1024) {
16877 dtrace_dof_error(NULL, "dofiod_count is not valid");
16878 return (EINVAL);
16879 }
16880
16881 /*
16882 * Allocate a correctly sized structure and copyin the data.
16883 */
16884 dof_ioctl_data_size = DOF_IOCTL_DATA_T_SIZE(dof_count);
16885 if ((multi_dof = kmem_alloc(dof_ioctl_data_size, KM_SLEEP)) == NULL)
16886 return (ENOMEM);
16887
16888		/* NOTE! We can no longer exit this function via a direct return */
16889 if (copyin(user_address, multi_dof, dof_ioctl_data_size) != 0) {
16890 dtrace_dof_error(NULL, "failed copyin of dof_ioctl_data_t");
16891 rval = EFAULT;
16892 goto cleanup;
16893 }
16894
16895 /*
16896 * Check that the count didn't change between the first copyin and the second.
16897 */
16898 if (multi_dof->dofiod_count != dof_count) {
16899 rval = EINVAL;
16900 goto cleanup;
16901 }
16902
16903 /*
16904 * Try to process lazily first.
16905 */
16906 rval = dtrace_lazy_dofs_add(p, multi_dof, &multi_dof_claimed);
16907
16908 /*
16909 * If rval is EACCES, we must be non-lazy.
16910 */
16911 if (rval == EACCES) {
16912 rval = 0;
16913 /*
16914 * Process each dof_helper_t
16915 */
16916 i = 0;
16917 do {
16918 dhp = &multi_dof->dofiod_helpers[i];
16919
16920 dof_hdr_t *dof = dtrace_dof_copyin(dhp->dofhp_dof, &rval);
16921
16922 if (dof != NULL) {
16923 lck_mtx_lock(&dtrace_meta_lock);
16924 lck_mtx_lock(&dtrace_lock);
16925
16926 /*
16927 * dtrace_helper_slurp() takes responsibility for the dof --
16928 * it may free it now or it may save it and free it later.
16929 */
16930 if ((dhp->dofhp_dof = (uint64_t)dtrace_helper_slurp(p, dof, dhp)) == -1ULL) {
16931 rval = EINVAL;
16932 }
16933
16934 lck_mtx_unlock(&dtrace_lock);
16935 lck_mtx_unlock(&dtrace_meta_lock);
16936 }
16937 } while (++i < multi_dof->dofiod_count && rval == 0);
16938 }
16939
16940 /*
16941 * We need to copyout the multi_dof struct, because it contains
16942 * the generation (unique id) values needed to call DTRACEHIOC_REMOVE
16943 *
16944 * This could certainly be better optimized.
16945 */
16946 if (copyout(multi_dof, user_address, dof_ioctl_data_size) != 0) {
16947 dtrace_dof_error(NULL, "failed copyout of dof_ioctl_data_t");
16948 /* Don't overwrite pre-existing error code */
16949 if (rval == 0) rval = EFAULT;
16950 }
16951
16952 cleanup:
16953 /*
16954 * If we had to allocate struct memory, free it.
16955 */
16956 if (multi_dof != NULL && !multi_dof_claimed) {
16957 kmem_free(multi_dof, dof_ioctl_data_size);
16958 }
16959
16960 return rval;
16961 }
16962
16963 case DTRACEHIOC_REMOVE: {
16964 int generation = *(int*)arg;
16965 proc_t* p = current_proc();
16966
16967 /*
16968 * Try lazy first.
16969 */
16970 int rval = dtrace_lazy_dofs_remove(p, generation);
16971
16972 /*
16973 * EACCES means non-lazy
16974 */
16975 if (rval == EACCES) {
16976 lck_mtx_lock(&dtrace_meta_lock);
16977 lck_mtx_lock(&dtrace_lock);
16978 rval = dtrace_helper_destroygen(p, generation);
16979 lck_mtx_unlock(&dtrace_lock);
16980 lck_mtx_unlock(&dtrace_meta_lock);
16981 }
16982
16983 return (rval);
16984 }
16985
16986 default:
16987 break;
16988 }
16989
16990 return ENOTTY;
16991}
16992
16993/*ARGSUSED*/
16994static int
16995dtrace_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t *cr, int *rv)
16996{
16997#pragma unused(md)
16998 minor_t minor = getminor(dev);
16999 dtrace_state_t *state;
17000 int rval;
17001
17002 /* Darwin puts Helper on its own major device. */
17003
17004 state = dtrace_state_get(minor);
17005
17006 if (state->dts_anon) {
17007 ASSERT(dtrace_anon.dta_state == NULL);
17008 state = state->dts_anon;
17009 }
17010
17011 switch (cmd) {
17012 case DTRACEIOC_PROVIDER: {
17013 dtrace_providerdesc_t pvd;
17014 dtrace_provider_t *pvp;
17015
17016 if (copyin(arg, &pvd, sizeof (pvd)) != 0)
17017 return (EFAULT);
17018
17019 pvd.dtvd_name[DTRACE_PROVNAMELEN - 1] = '\0';
17020 lck_mtx_lock(&dtrace_provider_lock);
17021
17022 for (pvp = dtrace_provider; pvp != NULL; pvp = pvp->dtpv_next) {
17023 if (strncmp(pvp->dtpv_name, pvd.dtvd_name, DTRACE_PROVNAMELEN) == 0)
17024 break;
17025 }
17026
17027 lck_mtx_unlock(&dtrace_provider_lock);
17028
17029 if (pvp == NULL)
17030 return (ESRCH);
17031
17032 bcopy(&pvp->dtpv_priv, &pvd.dtvd_priv, sizeof (dtrace_ppriv_t));
17033 bcopy(&pvp->dtpv_attr, &pvd.dtvd_attr, sizeof (dtrace_pattr_t));
17034 if (copyout(&pvd, arg, sizeof (pvd)) != 0)
17035 return (EFAULT);
17036
17037 return (0);
17038 }
17039
17040 case DTRACEIOC_EPROBE: {
17041 dtrace_eprobedesc_t epdesc;
17042 dtrace_ecb_t *ecb;
17043 dtrace_action_t *act;
17044 void *buf;
17045 size_t size;
17046 uintptr_t dest;
17047 int nrecs;
17048
17049 if (copyin(arg, &epdesc, sizeof (epdesc)) != 0)
17050 return (EFAULT);
17051
17052 lck_mtx_lock(&dtrace_lock);
17053
17054 if ((ecb = dtrace_epid2ecb(state, epdesc.dtepd_epid)) == NULL) {
17055 lck_mtx_unlock(&dtrace_lock);
17056 return (EINVAL);
17057 }
17058
17059 if (ecb->dte_probe == NULL) {
17060 lck_mtx_unlock(&dtrace_lock);
17061 return (EINVAL);
17062 }
17063
17064 epdesc.dtepd_probeid = ecb->dte_probe->dtpr_id;
17065 epdesc.dtepd_uarg = ecb->dte_uarg;
17066 epdesc.dtepd_size = ecb->dte_size;
17067
17068 nrecs = epdesc.dtepd_nrecs;
17069 epdesc.dtepd_nrecs = 0;
17070 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
17071 if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
17072 continue;
17073
17074 epdesc.dtepd_nrecs++;
17075 }
17076
17077 /*
17078 * Now that we have the size, we need to allocate a temporary
17079 * buffer in which to store the complete description. We need
17080 * the temporary buffer to be able to drop dtrace_lock()
17081 * across the copyout(), below.
17082 */
17083 size = sizeof (dtrace_eprobedesc_t) +
17084 (epdesc.dtepd_nrecs * sizeof (dtrace_recdesc_t));
17085
17086 buf = kmem_alloc(size, KM_SLEEP);
17087 dest = (uintptr_t)buf;
17088
17089 bcopy(&epdesc, (void *)dest, sizeof (epdesc));
17090 dest += offsetof(dtrace_eprobedesc_t, dtepd_rec[0]);
17091
17092 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
17093 if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
17094 continue;
17095
17096 if (nrecs-- == 0)
17097 break;
17098
17099 bcopy(&act->dta_rec, (void *)dest,
17100 sizeof (dtrace_recdesc_t));
17101 dest += sizeof (dtrace_recdesc_t);
17102 }
17103
17104 lck_mtx_unlock(&dtrace_lock);
17105
17106 if (copyout(buf, arg, dest - (uintptr_t)buf) != 0) {
17107 kmem_free(buf, size);
17108 return (EFAULT);
17109 }
17110
17111 kmem_free(buf, size);
17112 return (0);
17113 }
17114
17115 case DTRACEIOC_AGGDESC: {
17116 dtrace_aggdesc_t aggdesc;
17117 dtrace_action_t *act;
17118 dtrace_aggregation_t *agg;
17119 int nrecs;
17120 uint32_t offs;
17121 dtrace_recdesc_t *lrec;
17122 void *buf;
17123 size_t size;
17124 uintptr_t dest;
17125
17126 if (copyin(arg, &aggdesc, sizeof (aggdesc)) != 0)
17127 return (EFAULT);
17128
17129 lck_mtx_lock(&dtrace_lock);
17130
17131 if ((agg = dtrace_aggid2agg(state, aggdesc.dtagd_id)) == NULL) {
17132 lck_mtx_unlock(&dtrace_lock);
17133 return (EINVAL);
17134 }
17135
17136 aggdesc.dtagd_epid = agg->dtag_ecb->dte_epid;
17137
17138 nrecs = aggdesc.dtagd_nrecs;
17139 aggdesc.dtagd_nrecs = 0;
17140
17141 offs = agg->dtag_base;
17142 lrec = &agg->dtag_action.dta_rec;
17143 aggdesc.dtagd_size = lrec->dtrd_offset + lrec->dtrd_size - offs;
17144
17145 for (act = agg->dtag_first; ; act = act->dta_next) {
17146 ASSERT(act->dta_intuple ||
17147 DTRACEACT_ISAGG(act->dta_kind));
17148
17149 /*
17150 * If this action has a record size of zero, it
17151 * denotes an argument to the aggregating action.
17152 * Because the presence of this record doesn't (or
17153 * shouldn't) affect the way the data is interpreted,
17154 * we don't copy it out to save user-level the
17155 * confusion of dealing with a zero-length record.
17156 */
17157 if (act->dta_rec.dtrd_size == 0) {
17158 ASSERT(agg->dtag_hasarg);
17159 continue;
17160 }
17161
17162 aggdesc.dtagd_nrecs++;
17163
17164 if (act == &agg->dtag_action)
17165 break;
17166 }
17167
17168 /*
17169 * Now that we have the size, we need to allocate a temporary
17170 * buffer in which to store the complete description. We need
17171 * the temporary buffer to be able to drop dtrace_lock()
17172 * across the copyout(), below.
17173 */
17174 size = sizeof (dtrace_aggdesc_t) +
17175 (aggdesc.dtagd_nrecs * sizeof (dtrace_recdesc_t));
17176
17177 buf = kmem_alloc(size, KM_SLEEP);
17178 dest = (uintptr_t)buf;
17179
17180 bcopy(&aggdesc, (void *)dest, sizeof (aggdesc));
17181 dest += offsetof(dtrace_aggdesc_t, dtagd_rec[0]);
17182
17183 for (act = agg->dtag_first; ; act = act->dta_next) {
17184 dtrace_recdesc_t rec = act->dta_rec;
17185
17186 /*
17187 * See the comment in the above loop for why we pass
17188 * over zero-length records.
17189 */
17190 if (rec.dtrd_size == 0) {
17191 ASSERT(agg->dtag_hasarg);
17192 continue;
17193 }
17194
17195 if (nrecs-- == 0)
17196 break;
17197
17198 rec.dtrd_offset -= offs;
17199 bcopy(&rec, (void *)dest, sizeof (rec));
17200 dest += sizeof (dtrace_recdesc_t);
17201
17202 if (act == &agg->dtag_action)
17203 break;
17204 }
17205
17206 lck_mtx_unlock(&dtrace_lock);
17207
17208 if (copyout(buf, arg, dest - (uintptr_t)buf) != 0) {
17209 kmem_free(buf, size);
17210 return (EFAULT);
17211 }
17212
17213 kmem_free(buf, size);
17214 return (0);
17215 }
17216
17217 case DTRACEIOC_ENABLE: {
17218 dof_hdr_t *dof;
17219 dtrace_enabling_t *enab = NULL;
17220 dtrace_vstate_t *vstate;
17221 int err = 0;
17222
17223 *rv = 0;
17224
17225 /*
17226 * If a NULL argument has been passed, we take this as our
17227 * cue to reevaluate our enablings.
17228 */
17229 if (arg == 0) {
17230 dtrace_enabling_matchall();
17231
17232 return (0);
17233 }
17234
17235 if ((dof = dtrace_dof_copyin(arg, &rval)) == NULL)
17236 return (rval);
17237
17238 lck_mtx_lock(&cpu_lock);
17239 lck_mtx_lock(&dtrace_lock);
17240 vstate = &state->dts_vstate;
17241
17242 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
17243 lck_mtx_unlock(&dtrace_lock);
17244 lck_mtx_unlock(&cpu_lock);
17245 dtrace_dof_destroy(dof);
17246 return (EBUSY);
17247 }
17248
17249 if (dtrace_dof_slurp(dof, vstate, cr, &enab, 0, B_TRUE) != 0) {
17250 lck_mtx_unlock(&dtrace_lock);
17251 lck_mtx_unlock(&cpu_lock);
17252 dtrace_dof_destroy(dof);
17253 return (EINVAL);
17254 }
17255
17256 if ((rval = dtrace_dof_options(dof, state)) != 0) {
17257 dtrace_enabling_destroy(enab);
17258 lck_mtx_unlock(&dtrace_lock);
17259 lck_mtx_unlock(&cpu_lock);
17260 dtrace_dof_destroy(dof);
17261 return (rval);
17262 }
17263
17264 if ((err = dtrace_enabling_match(enab, rv, NULL)) == 0) {
17265 err = dtrace_enabling_retain(enab);
17266 } else {
17267 dtrace_enabling_destroy(enab);
17268 }
17269
17270 lck_mtx_unlock(&dtrace_lock);
17271 lck_mtx_unlock(&cpu_lock);
17272 dtrace_dof_destroy(dof);
17273
17274 return (err);
17275 }
17276
17277 case DTRACEIOC_REPLICATE: {
17278 dtrace_repldesc_t desc;
17279 dtrace_probedesc_t *match = &desc.dtrpd_match;
17280 dtrace_probedesc_t *create = &desc.dtrpd_create;
17281 int err;
17282
17283 if (copyin(arg, &desc, sizeof (desc)) != 0)
17284 return (EFAULT);
17285
17286 match->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
17287 match->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
17288 match->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
17289 match->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
17290
17291 create->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
17292 create->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
17293 create->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
17294 create->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
17295
17296 lck_mtx_lock(&dtrace_lock);
17297 err = dtrace_enabling_replicate(state, match, create);
17298 lck_mtx_unlock(&dtrace_lock);
17299
17300 return (err);
17301 }
17302
17303 case DTRACEIOC_PROBEMATCH:
17304 case DTRACEIOC_PROBES: {
17305 dtrace_probe_t *probe = NULL;
17306 dtrace_probedesc_t desc;
17307 dtrace_probekey_t pkey;
17308 dtrace_id_t i;
17309 int m = 0;
17310 uint32_t priv;
17311 uid_t uid;
17312 zoneid_t zoneid;
17313
17314 if (copyin(arg, &desc, sizeof (desc)) != 0)
17315 return (EFAULT);
17316
17317 desc.dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
17318 desc.dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
17319 desc.dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
17320 desc.dtpd_name[DTRACE_NAMELEN - 1] = '\0';
17321
17322 /*
17323 * Before we attempt to match this probe, we want to give
17324 * all providers the opportunity to provide it.
17325 */
17326 if (desc.dtpd_id == DTRACE_IDNONE) {
17327 lck_mtx_lock(&dtrace_provider_lock);
17328 dtrace_probe_provide(&desc, NULL);
17329 lck_mtx_unlock(&dtrace_provider_lock);
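			/*
			 * DTRACE_IDNONE is zero, so bumping dtpd_id here makes
			 * the searches below begin at the first probe ID.
			 */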
17330 desc.dtpd_id++;
17331 }
17332
17333 dtrace_cred2priv(cr, &priv, &uid, &zoneid);
17334
17335 lck_mtx_lock(&dtrace_lock);
17336
17337 if (cmd == DTRACEIOC_PROBEMATCH) {
17338 dtrace_probekey(&desc, &pkey);
17339 pkey.dtpk_id = DTRACE_IDNONE;
17340
17341 /* Quiet compiler warning */
17342 for (i = desc.dtpd_id; i <= (dtrace_id_t)dtrace_nprobes; i++) {
17343 if ((probe = dtrace_probes[i - 1]) != NULL &&
17344 (m = dtrace_match_probe(probe, &pkey,
17345 priv, uid, zoneid)) != 0)
17346 break;
17347 }
17348
17349 if (m < 0) {
17350 lck_mtx_unlock(&dtrace_lock);
17351 return (EINVAL);
17352 }
17353 dtrace_probekey_release(&pkey);
17354
17355 } else {
17356 /* Quiet compiler warning */
17357 for (i = desc.dtpd_id; i <= (dtrace_id_t)dtrace_nprobes; i++) {
17358 if ((probe = dtrace_probes[i - 1]) != NULL &&
17359 dtrace_match_priv(probe, priv, uid, zoneid))
17360 break;
17361 }
17362 }
17363
17364 if (probe == NULL) {
17365 lck_mtx_unlock(&dtrace_lock);
17366 return (ESRCH);
17367 }
17368
17369 dtrace_probe_description(probe, &desc);
17370 lck_mtx_unlock(&dtrace_lock);
17371
17372 if (copyout(&desc, arg, sizeof (desc)) != 0)
17373 return (EFAULT);
17374
17375 return (0);
17376 }
17377
17378 case DTRACEIOC_PROBEARG: {
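		/*
		 * Report the type of a single probe argument by delegating
		 * to the owning provider's dtps_getargdesc() entry point.
		 */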
17379 dtrace_argdesc_t desc;
17380 dtrace_probe_t *probe;
17381 dtrace_provider_t *prov;
17382
17383 if (copyin(arg, &desc, sizeof (desc)) != 0)
17384 return (EFAULT);
17385
17386 if (desc.dtargd_id == DTRACE_IDNONE)
17387 return (EINVAL);
17388
17389 if (desc.dtargd_ndx == DTRACE_ARGNONE)
17390 return (EINVAL);
17391
17392 lck_mtx_lock(&dtrace_provider_lock);
17393 lck_mtx_lock(&mod_lock);
17394 lck_mtx_lock(&dtrace_lock);
17395
17396 /* Quiet compiler warning */
17397 if (desc.dtargd_id > (dtrace_id_t)dtrace_nprobes) {
17398 lck_mtx_unlock(&dtrace_lock);
17399 lck_mtx_unlock(&mod_lock);
17400 lck_mtx_unlock(&dtrace_provider_lock);
17401 return (EINVAL);
17402 }
17403
17404 if ((probe = dtrace_probes[desc.dtargd_id - 1]) == NULL) {
17405 lck_mtx_unlock(&dtrace_lock);
17406 lck_mtx_unlock(&mod_lock);
17407 lck_mtx_unlock(&dtrace_provider_lock);
17408 return (EINVAL);
17409 }
17410
17411 lck_mtx_unlock(&dtrace_lock);
17412
17413 prov = probe->dtpr_provider;
17414
17415 if (prov->dtpv_pops.dtps_getargdesc == NULL) {
17416 /*
17417 * There isn't any typed information for this probe.
17418 * Set the argument number to DTRACE_ARGNONE.
17419 */
17420 desc.dtargd_ndx = DTRACE_ARGNONE;
17421 } else {
17422 desc.dtargd_native[0] = '\0';
17423 desc.dtargd_xlate[0] = '\0';
17424 desc.dtargd_mapping = desc.dtargd_ndx;
17425
17426 prov->dtpv_pops.dtps_getargdesc(prov->dtpv_arg,
17427 probe->dtpr_id, probe->dtpr_arg, &desc);
17428 }
17429
17430 lck_mtx_unlock(&mod_lock);
17431 lck_mtx_unlock(&dtrace_provider_lock);
17432
17433 if (copyout(&desc, arg, sizeof (desc)) != 0)
17434 return (EFAULT);
17435
17436 return (0);
17437 }
17438
17439 case DTRACEIOC_GO: {
17440 processorid_t cpuid;
17441 rval = dtrace_state_go(state, &cpuid);
17442
17443 if (rval != 0)
17444 return (rval);
17445
17446 if (copyout(&cpuid, arg, sizeof (cpuid)) != 0)
17447 return (EFAULT);
17448
17449 return (0);
17450 }
17451
17452 case DTRACEIOC_STOP: {
17453 processorid_t cpuid;
17454
17455 lck_mtx_lock(&dtrace_lock);
17456 rval = dtrace_state_stop(state, &cpuid);
17457 lck_mtx_unlock(&dtrace_lock);
17458
17459 if (rval != 0)
17460 return (rval);
17461
17462 if (copyout(&cpuid, arg, sizeof (cpuid)) != 0)
17463 return (EFAULT);
17464
17465 return (0);
17466 }
17467
17468 case DTRACEIOC_DOFGET: {
17469 dof_hdr_t hdr, *dof;
17470 uint64_t len;
17471
17472 if (copyin(arg, &hdr, sizeof (hdr)) != 0)
17473 return (EFAULT);
17474
17475 lck_mtx_lock(&dtrace_lock);
17476 dof = dtrace_dof_create(state);
17477 lck_mtx_unlock(&dtrace_lock);
17478
17479 len = MIN(hdr.dofh_loadsz, dof->dofh_loadsz);
17480 rval = copyout(dof, arg, len);
17481 dtrace_dof_destroy(dof);
17482
17483 return (rval == 0 ? 0 : EFAULT);
17484 }
17485
17486 case DTRACEIOC_SLEEP: {
17487 int64_t time;
17488 uint64_t abstime;
17489 uint64_t rvalue = DTRACE_WAKE_TIMEOUT;
17490
17491 if (copyin(arg, &time, sizeof(time)) != 0)
17492 return (EFAULT);
17493
17494 nanoseconds_to_absolutetime((uint64_t)time, &abstime);
17495 clock_absolutetime_interval_to_deadline(abstime, &abstime);
17496
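		/*
		 * Block until either the deadline passes or another thread
		 * (dtrace_ast() or DTRACEIOC_SIGNAL) wakes us. If a buffer
		 * is already over its limit, don't block at all: report
		 * DTRACE_WAKE_BUF_LIMIT so the consumer drains it right away.
		 */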
17497 if (assert_wait_deadline(state, THREAD_ABORTSAFE, abstime) == THREAD_WAITING) {
17498 if (state->dts_buf_over_limit > 0) {
17499 clear_wait(current_thread(), THREAD_INTERRUPTED);
17500 rvalue = DTRACE_WAKE_BUF_LIMIT;
17501 } else {
17502 thread_block(THREAD_CONTINUE_NULL);
17503 if (state->dts_buf_over_limit > 0) {
17504 rvalue = DTRACE_WAKE_BUF_LIMIT;
17505 }
17506 }
17507 }
17508
17509 if (copyout(&rvalue, arg, sizeof(rvalue)) != 0)
17510 return (EFAULT);
17511
17512 return (0);
17513 }
17514
17515 case DTRACEIOC_SIGNAL: {
17516 wakeup(state);
17517 return (0);
17518 }
17519
17520 case DTRACEIOC_AGGSNAP:
17521 case DTRACEIOC_BUFSNAP: {
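		/*
		 * Snapshot one per-CPU buffer: DTRACEIOC_BUFSNAP targets the
		 * principal buffers, DTRACEIOC_AGGSNAP the aggregation
		 * buffers. Ring and fill buffers are copied out in place
		 * (once tracing has stopped); switching buffers are
		 * snapshotted by exchanging the active and inactive halves
		 * via a cross call.
		 */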
17522 dtrace_bufdesc_t desc;
17523 caddr_t cached;
17524 boolean_t over_limit;
17525 dtrace_buffer_t *buf;
17526
17527 if (copyin(arg, &desc, sizeof (desc)) != 0)
17528 return (EFAULT);
17529
17530 if ((int)desc.dtbd_cpu < 0 || desc.dtbd_cpu >= NCPU)
17531 return (EINVAL);
17532
17533 lck_mtx_lock(&dtrace_lock);
17534
17535 if (cmd == DTRACEIOC_BUFSNAP) {
17536 buf = &state->dts_buffer[desc.dtbd_cpu];
17537 } else {
17538 buf = &state->dts_aggbuffer[desc.dtbd_cpu];
17539 }
17540
17541 if (buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL)) {
17542 size_t sz = buf->dtb_offset;
17543
17544 if (state->dts_activity != DTRACE_ACTIVITY_STOPPED) {
17545 lck_mtx_unlock(&dtrace_lock);
17546 return (EBUSY);
17547 }
17548
17549 /*
17550 * If this buffer has already been consumed, we're
17551 * going to indicate that there's nothing left here
17552 * to consume.
17553 */
17554 if (buf->dtb_flags & DTRACEBUF_CONSUMED) {
17555 lck_mtx_unlock(&dtrace_lock);
17556
17557 desc.dtbd_size = 0;
17558 desc.dtbd_drops = 0;
17559 desc.dtbd_errors = 0;
17560 desc.dtbd_oldest = 0;
17561 sz = sizeof (desc);
17562
17563 if (copyout(&desc, arg, sz) != 0)
17564 return (EFAULT);
17565
17566 return (0);
17567 }
17568
17569 /*
17570 * If this is a ring buffer that has wrapped, we want
17571 * to copy the whole thing out.
17572 */
17573 if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
17574 dtrace_buffer_polish(buf);
17575 sz = buf->dtb_size;
17576 }
17577
17578 if (copyout(buf->dtb_tomax, (user_addr_t)desc.dtbd_data, sz) != 0) {
17579 lck_mtx_unlock(&dtrace_lock);
17580 return (EFAULT);
17581 }
17582
17583 desc.dtbd_size = sz;
17584 desc.dtbd_drops = buf->dtb_drops;
17585 desc.dtbd_errors = buf->dtb_errors;
17586 desc.dtbd_oldest = buf->dtb_xamot_offset;
17587 desc.dtbd_timestamp = dtrace_gethrtime();
17588
17589 lck_mtx_unlock(&dtrace_lock);
17590
17591 if (copyout(&desc, arg, sizeof (desc)) != 0)
17592 return (EFAULT);
17593
17594 buf->dtb_flags |= DTRACEBUF_CONSUMED;
17595
17596 return (0);
17597 }
17598
17599 if (buf->dtb_tomax == NULL) {
17600 ASSERT(buf->dtb_xamot == NULL);
17601 lck_mtx_unlock(&dtrace_lock);
17602 return (ENOENT);
17603 }
17604
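		/*
		 * Remember the active buffer so we can tell below whether the
		 * switch actually took place, and whether the buffer was at
		 * its full size (the limit is raised to dtb_size once it is
		 * crossed), so that we can adjust the over-limit count after
		 * the switch.
		 */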
17605 cached = buf->dtb_tomax;
17606 over_limit = buf->dtb_cur_limit == buf->dtb_size;
17607
17608 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
17609
17610 dtrace_xcall(desc.dtbd_cpu,
17611 (dtrace_xcall_t)dtrace_buffer_switch, buf);
17612
17613 state->dts_errors += buf->dtb_xamot_errors;
17614
17615 /*
17616 * If the buffers did not actually switch, then the cross call
17617 * did not take place -- presumably because the given CPU is
17618 * not in the ready set. If this is the case, we'll return
17619 * ENOENT.
17620 */
17621 if (buf->dtb_tomax == cached) {
17622 ASSERT(buf->dtb_xamot != cached);
17623 lck_mtx_unlock(&dtrace_lock);
17624 return (ENOENT);
17625 }
17626
17627 ASSERT(cached == buf->dtb_xamot);
		/*
		 * At this point we know the buffers have switched, so we
		 * can decrement the over-limit count if the old buffer was
		 * over its limit. The new buffer might already be over its
		 * limit, but we don't care, since we're guaranteed not to
		 * be checking the over-limit count at this point.
		 */
17635 if (over_limit) {
17636 uint32_t old = atomic_add_32(&state->dts_buf_over_limit, -1);
17637 #pragma unused(old)
17638
17639 /*
17640 * Verify that we didn't underflow the value
17641 */
17642 ASSERT(old != 0);
17643 }
17644
17645 /*
17646 * We have our snapshot; now copy it out.
17647 */
17648 if (dtrace_buffer_copyout(buf->dtb_xamot,
17649 (user_addr_t)desc.dtbd_data,
17650 buf->dtb_xamot_offset) != 0) {
17651 lck_mtx_unlock(&dtrace_lock);
17652 return (EFAULT);
17653 }
17654
17655 desc.dtbd_size = buf->dtb_xamot_offset;
17656 desc.dtbd_drops = buf->dtb_xamot_drops;
17657 desc.dtbd_errors = buf->dtb_xamot_errors;
17658 desc.dtbd_oldest = 0;
17659 desc.dtbd_timestamp = buf->dtb_switched;
17660
17661 lck_mtx_unlock(&dtrace_lock);
17662
17663 /*
17664 * Finally, copy out the buffer description.
17665 */
17666 if (copyout(&desc, arg, sizeof (desc)) != 0)
17667 return (EFAULT);
17668
17669 return (0);
17670 }
17671
17672 case DTRACEIOC_CONF: {
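		/*
		 * Report the framework's static configuration: the DIF
		 * version, the number of integer and tuple registers
		 * available to DIF objects, and the native CTF data model.
		 */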
17673 dtrace_conf_t conf;
17674
17675 bzero(&conf, sizeof (conf));
17676 conf.dtc_difversion = DIF_VERSION;
17677 conf.dtc_difintregs = DIF_DIR_NREGS;
17678 conf.dtc_diftupregs = DIF_DTR_NREGS;
17679 conf.dtc_ctfmodel = CTF_MODEL_NATIVE;
17680
17681 if (copyout(&conf, arg, sizeof (conf)) != 0)
17682 return (EFAULT);
17683
17684 return (0);
17685 }
17686
17687 case DTRACEIOC_STATUS: {
17688 dtrace_status_t stat;
17689 dtrace_dstate_t *dstate;
17690 int i, j;
17691 uint64_t nerrs;
17692
17693 /*
17694 * See the comment in dtrace_state_deadman() for the reason
17695 * for setting dts_laststatus to INT64_MAX before setting
17696 * it to the correct value.
17697 */
17698 state->dts_laststatus = INT64_MAX;
17699 dtrace_membar_producer();
17700 state->dts_laststatus = dtrace_gethrtime();
17701
17702 bzero(&stat, sizeof (stat));
17703
17704 lck_mtx_lock(&dtrace_lock);
17705
17706 if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE) {
17707 lck_mtx_unlock(&dtrace_lock);
17708 return (ENOENT);
17709 }
17710
17711 if (state->dts_activity == DTRACE_ACTIVITY_DRAINING)
17712 stat.dtst_exiting = 1;
17713
17714 nerrs = state->dts_errors;
17715 dstate = &state->dts_vstate.dtvs_dynvars;
17716
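		/*
		 * Sum the dynamic-variable drop counts and buffer error
		 * counts across CPUs, note any principal buffers that have
		 * filled, and gather the per-speculation drop counts.
		 */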
17717 for (i = 0; i < (int)NCPU; i++) {
17718 dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[i];
17719
17720 stat.dtst_dyndrops += dcpu->dtdsc_drops;
17721 stat.dtst_dyndrops_dirty += dcpu->dtdsc_dirty_drops;
17722 stat.dtst_dyndrops_rinsing += dcpu->dtdsc_rinsing_drops;
17723
17724 if (state->dts_buffer[i].dtb_flags & DTRACEBUF_FULL)
17725 stat.dtst_filled++;
17726
17727 nerrs += state->dts_buffer[i].dtb_errors;
17728
17729 for (j = 0; j < state->dts_nspeculations; j++) {
17730 dtrace_speculation_t *spec;
17731 dtrace_buffer_t *buf;
17732
17733 spec = &state->dts_speculations[j];
17734 buf = &spec->dtsp_buffer[i];
17735 stat.dtst_specdrops += buf->dtb_xamot_drops;
17736 }
17737 }
17738
17739 stat.dtst_specdrops_busy = state->dts_speculations_busy;
17740 stat.dtst_specdrops_unavail = state->dts_speculations_unavail;
17741 stat.dtst_stkstroverflows = state->dts_stkstroverflows;
17742 stat.dtst_dblerrors = state->dts_dblerrors;
17743 stat.dtst_killed =
17744 (state->dts_activity == DTRACE_ACTIVITY_KILLED);
17745 stat.dtst_errors = nerrs;
17746
17747 lck_mtx_unlock(&dtrace_lock);
17748
17749 if (copyout(&stat, arg, sizeof (stat)) != 0)
17750 return (EFAULT);
17751
17752 return (0);
17753 }
17754
17755 case DTRACEIOC_FORMAT: {
17756 dtrace_fmtdesc_t fmt;
17757 char *str;
17758 int len;
17759
17760 if (copyin(arg, &fmt, sizeof (fmt)) != 0)
17761 return (EFAULT);
17762
17763 lck_mtx_lock(&dtrace_lock);
17764
17765 if (fmt.dtfd_format == 0 ||
17766 fmt.dtfd_format > state->dts_nformats) {
17767 lck_mtx_unlock(&dtrace_lock);
17768 return (EINVAL);
17769 }
17770
17771 /*
17772 * Format strings are allocated contiguously and they are
17773 * never freed; if a format index is less than the number
17774 * of formats, we can assert that the format map is non-NULL
17775 * and that the format for the specified index is non-NULL.
17776 */
17777 ASSERT(state->dts_formats != NULL);
17778 str = state->dts_formats[fmt.dtfd_format - 1];
17779 ASSERT(str != NULL);
17780
17781 len = strlen(str) + 1;
17782
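		/*
		 * If the user's buffer is too small to hold the format
		 * string, hand back the required length so the consumer can
		 * reallocate and retry.
		 */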
17783 if (len > fmt.dtfd_length) {
17784 fmt.dtfd_length = len;
17785
17786 if (copyout(&fmt, arg, sizeof (fmt)) != 0) {
17787 lck_mtx_unlock(&dtrace_lock);
17788 return (EINVAL);
17789 }
17790 } else {
17791 if (copyout(str, (user_addr_t)fmt.dtfd_string, len) != 0) {
17792 lck_mtx_unlock(&dtrace_lock);
17793 return (EINVAL);
17794 }
17795 }
17796
17797 lck_mtx_unlock(&dtrace_lock);
17798 return (0);
17799 }
17800
17801 case DTRACEIOC_MODUUIDSLIST: {
17802 size_t module_uuids_list_size;
17803 dtrace_module_uuids_list_t* uuids_list;
17804 uint64_t dtmul_count;
17805
		/*
		 * Security restrictions can make this operation illegal;
		 * when they are in effect, DTrace must refuse to provide
		 * any fbt probes.
		 */
17810 if (dtrace_fbt_probes_restricted()) {
17811 cmn_err(CE_WARN, "security restrictions disallow DTRACEIOC_MODUUIDSLIST");
17812 return (EPERM);
17813 }
17814
		/*
		 * Fail if the kernel symbol mode makes this operation illegal.
		 * Both NEVER and ALWAYS_FROM_KERNEL are permanent states; it is
		 * therefore legal to check for them without holding dtrace_lock.
		 */
17820 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER ||
17821 dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL) {
17822 cmn_err(CE_WARN, "dtrace_kernel_symbol_mode of %u disallows DTRACEIOC_MODUUIDSLIST", dtrace_kernel_symbol_mode);
17823 return (EPERM);
17824 }
17825
		/*
		 * Read the number of module UUIDs being passed in.
		 */
17829 if (copyin(arg + offsetof(dtrace_module_uuids_list_t, dtmul_count),
17830 &dtmul_count,
17831 sizeof(dtmul_count))) {
17832 cmn_err(CE_WARN, "failed to copyin dtmul_count");
17833 return (EFAULT);
17834 }
17835
17836 /*
17837 * Range check the count. More than 2k kexts is probably an error.
17838 */
17839 if (dtmul_count > 2048) {
17840 cmn_err(CE_WARN, "dtmul_count is not valid");
17841 return (EINVAL);
17842 }
17843
		/*
		 * For all queries, we return EINVAL when the user-specified
		 * count does not match the actual number of modules we find
		 * available.
		 *
		 * If the user-specified count is zero, then this serves as a
		 * simple query to count the available modules in need of symbols.
		 */
17852
17853 rval = 0;
17854
17855 if (dtmul_count == 0)
17856 {
17857 lck_mtx_lock(&mod_lock);
17858 struct modctl* ctl = dtrace_modctl_list;
17859 while (ctl) {
17860 /* Update the private probes bit */
17861 if (dtrace_provide_private_probes)
17862 ctl->mod_flags |= MODCTL_FBT_PROVIDE_PRIVATE_PROBES;
17863
17864 ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl));
17865 if (!MOD_SYMBOLS_DONE(ctl) && !MOD_IS_STATIC_KEXT(ctl)) {
17866 dtmul_count++;
17867 rval = EINVAL;
17868 }
17869 ctl = ctl->mod_next;
17870 }
17871 lck_mtx_unlock(&mod_lock);
17872
17873 if (copyout(&dtmul_count, arg, sizeof (dtmul_count)) != 0)
17874 return (EFAULT);
17875 else
17876 return (rval);
17877 }
17878
17879 /*
17880 * If we reach this point, then we have a request for full list data.
17881 * Allocate a correctly sized structure and copyin the data.
17882 */
17883 module_uuids_list_size = DTRACE_MODULE_UUIDS_LIST_SIZE(dtmul_count);
17884 if ((uuids_list = kmem_alloc(module_uuids_list_size, KM_SLEEP)) == NULL)
17885 return (ENOMEM);
17886
		/* NOTE! From here on, exit via moduuidslist_cleanup so the allocation is freed; do not return directly. */
17888 if (copyin(arg, uuids_list, module_uuids_list_size) != 0) {
17889 cmn_err(CE_WARN, "failed copyin of dtrace_module_uuids_list_t");
17890 rval = EFAULT;
17891 goto moduuidslist_cleanup;
17892 }
17893
17894 /*
17895 * Check that the count didn't change between the first copyin and the second.
17896 */
17897 if (uuids_list->dtmul_count != dtmul_count) {
17898 rval = EINVAL;
17899 goto moduuidslist_cleanup;
17900 }
17901
		/*
		 * Build the list of UUIDs that need symbols.
		 */
17905 lck_mtx_lock(&mod_lock);
17906
17907 dtmul_count = 0;
17908
17909 struct modctl* ctl = dtrace_modctl_list;
17910 while (ctl) {
17911 /* Update the private probes bit */
17912 if (dtrace_provide_private_probes)
17913 ctl->mod_flags |= MODCTL_FBT_PROVIDE_PRIVATE_PROBES;
17914
			/*
			 * We assume that userspace symbols will be "better" than
			 * kernel-level symbols, as userspace can search for dSYMs and
			 * symbolicated binaries. Even if kernel symbols are available,
			 * add user symbols if the module might use them.
			 */
17920 ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl));
17921 if (!MOD_SYMBOLS_DONE(ctl) && !MOD_IS_STATIC_KEXT(ctl)) {
17922 UUID* uuid = &uuids_list->dtmul_uuid[dtmul_count];
17923 if (dtmul_count++ < uuids_list->dtmul_count) {
17924 memcpy(uuid, ctl->mod_uuid, sizeof(UUID));
17925 }
17926 }
17927 ctl = ctl->mod_next;
17928 }
17929
17930 lck_mtx_unlock(&mod_lock);
17931
17932 if (uuids_list->dtmul_count < dtmul_count)
17933 rval = EINVAL;
17934
17935 uuids_list->dtmul_count = dtmul_count;
17936
17937 /*
17938 * Copyout the symbols list (or at least the count!)
17939 */
17940 if (copyout(uuids_list, arg, module_uuids_list_size) != 0) {
			cmn_err(CE_WARN, "failed copyout of dtrace_module_uuids_list_t");
17942 rval = EFAULT;
17943 }
17944
17945 moduuidslist_cleanup:
17946 /*
17947 * If we had to allocate struct memory, free it.
17948 */
17949 if (uuids_list != NULL) {
17950 kmem_free(uuids_list, module_uuids_list_size);
17951 }
17952
17953 return rval;
17954 }
17955
17956 case DTRACEIOC_PROVMODSYMS: {
17957 size_t module_symbols_size;
17958 dtrace_module_symbols_t* module_symbols;
17959 uint64_t dtmodsyms_count;
17960
		/*
		 * Security restrictions can make this operation illegal;
		 * when they are in effect, DTrace must refuse to provide
		 * any fbt probes.
		 */
17965 if (dtrace_fbt_probes_restricted()) {
			cmn_err(CE_WARN, "security restrictions disallow DTRACEIOC_PROVMODSYMS");
17967 return (EPERM);
17968 }
17969
		/*
		 * Fail if the kernel symbol mode makes this operation illegal.
		 * Both NEVER and ALWAYS_FROM_KERNEL are permanent states; it is
		 * therefore legal to check for them without holding dtrace_lock.
		 */
17975 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER ||
17976 dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL) {
17977 cmn_err(CE_WARN, "dtrace_kernel_symbol_mode of %u disallows DTRACEIOC_PROVMODSYMS", dtrace_kernel_symbol_mode);
17978 return (EPERM);
17979 }
17980
17981 /*
17982 * Read the number of module symbols structs being passed in.
17983 */
17984 if (copyin(arg + offsetof(dtrace_module_symbols_t, dtmodsyms_count),
17985 &dtmodsyms_count,
17986 sizeof(dtmodsyms_count))) {
17987 cmn_err(CE_WARN, "failed to copyin dtmodsyms_count");
17988 return (EFAULT);
17989 }
17990
17991 /*
17992 * Range check the count. How much data can we pass around?
17993 * FIX ME!
17994 */
17995 if (dtmodsyms_count == 0 || (dtmodsyms_count > 100 * 1024)) {
17996 cmn_err(CE_WARN, "dtmodsyms_count is not valid");
17997 return (EINVAL);
17998 }
17999
18000 /*
18001 * Allocate a correctly sized structure and copyin the data.
18002 */
18003 module_symbols_size = DTRACE_MODULE_SYMBOLS_SIZE(dtmodsyms_count);
18004 if ((module_symbols = kmem_alloc(module_symbols_size, KM_SLEEP)) == NULL)
18005 return (ENOMEM);
18006
18007 rval = 0;
18008
		/* NOTE! From here on, exit via module_symbols_cleanup so the allocation is freed; do not return directly. */
18010 if (copyin(arg, module_symbols, module_symbols_size) != 0) {
18011 cmn_err(CE_WARN, "failed copyin of dtrace_module_symbols_t");
18012 rval = EFAULT;
18013 goto module_symbols_cleanup;
18014 }
18015
18016 /*
18017 * Check that the count didn't change between the first copyin and the second.
18018 */
18019 if (module_symbols->dtmodsyms_count != dtmodsyms_count) {
18020 rval = EINVAL;
18021 goto module_symbols_cleanup;
18022 }
18023
18024 /*
18025 * Find the modctl to add symbols to.
18026 */
18027 lck_mtx_lock(&dtrace_provider_lock);
18028 lck_mtx_lock(&mod_lock);
18029
18030 struct modctl* ctl = dtrace_modctl_list;
18031 while (ctl) {
18032 /* Update the private probes bit */
18033 if (dtrace_provide_private_probes)
18034 ctl->mod_flags |= MODCTL_FBT_PROVIDE_PRIVATE_PROBES;
18035
18036 ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl));
18037 if (MOD_HAS_UUID(ctl) && !MOD_SYMBOLS_DONE(ctl) && memcmp(module_symbols->dtmodsyms_uuid, ctl->mod_uuid, sizeof(UUID)) == 0) {
18038 dtrace_provider_t *prv;
18039 ctl->mod_user_symbols = module_symbols;
18040
				/*
				 * We're going to call each provider's per-module provide
				 * operation, specifying only this module.
				 */
18045 for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
18046 prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
				/*
				 * We gave every provider a chance to provide with the user
				 * symbols; go ahead and clear them.
				 */
18050 ctl->mod_user_symbols = NULL; /* MUST reset this to clear HAS_USERSPACE_SYMBOLS */
18051 }
18052 ctl = ctl->mod_next;
18053 }
18054
18055 lck_mtx_unlock(&mod_lock);
18056 lck_mtx_unlock(&dtrace_provider_lock);
18057
18058 module_symbols_cleanup:
18059 /*
18060 * If we had to allocate struct memory, free it.
18061 */
18062 if (module_symbols != NULL) {
18063 kmem_free(module_symbols, module_symbols_size);
18064 }
18065
18066 return rval;
18067 }
18068
18069 case DTRACEIOC_PROCWAITFOR: {
18070 dtrace_procdesc_t pdesc = {
18071 .p_name = {0},
18072 .p_pid = -1
18073 };
18074
18075 if ((rval = copyin(arg, &pdesc, sizeof(pdesc))) != 0)
18076 goto proc_waitfor_error;
18077
18078 if ((rval = dtrace_proc_waitfor(&pdesc)) != 0)
18079 goto proc_waitfor_error;
18080
18081 if ((rval = copyout(&pdesc, arg, sizeof(pdesc))) != 0)
18082 goto proc_waitfor_error;
18083
18084 return 0;
18085
18086 proc_waitfor_error:
		/* The process was suspended; resume it, since the client will not. */
18088 if (pdesc.p_pid != -1) {
18089 proc_t *proc = proc_find(pdesc.p_pid);
18090 if (proc != PROC_NULL) {
18091 task_pidresume(proc->task);
18092 proc_rele(proc);
18093 }
18094 }
18095
18096 return rval;
18097 }
18098
18099 default:
18100 break;
18101 }
18102
18103 return (ENOTTY);
18104}
18105
18106/*
18107 * APPLE NOTE: dtrace_detach not implemented
18108 */
18109#if !defined(__APPLE__)
18110/*ARGSUSED*/
18111static int
18112dtrace_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
18113{
18114 dtrace_state_t *state;
18115
18116 switch (cmd) {
18117 case DDI_DETACH:
18118 break;
18119
18120 case DDI_SUSPEND:
18121 return (DDI_SUCCESS);
18122
18123 default:
18124 return (DDI_FAILURE);
18125 }
18126
18127 lck_mtx_lock(&cpu_lock);
18128 lck_mtx_lock(&dtrace_provider_lock);
18129 lck_mtx_lock(&dtrace_lock);
18130
18131 ASSERT(dtrace_opens == 0);
18132
18133 if (dtrace_helpers > 0) {
18134 lck_mtx_unlock(&dtrace_lock);
18135 lck_mtx_unlock(&dtrace_provider_lock);
18136 lck_mtx_unlock(&cpu_lock);
18137 return (DDI_FAILURE);
18138 }
18139
18140 if (dtrace_unregister((dtrace_provider_id_t)dtrace_provider) != 0) {
18141 lck_mtx_unlock(&dtrace_lock);
18142 lck_mtx_unlock(&dtrace_provider_lock);
18143 lck_mtx_unlock(&cpu_lock);
18144 return (DDI_FAILURE);
18145 }
18146
18147 dtrace_provider = NULL;
18148
18149 if ((state = dtrace_anon_grab()) != NULL) {
		/*
		 * If there were ECBs on this state, the provider should not
		 * have been allowed to detach; assert that there are none.
		 */
18155 ASSERT(state->dts_necbs == 0);
18156 dtrace_state_destroy(state);
18157
18158 /*
18159 * If we're being detached with anonymous state, we need to
18160 * indicate to the kernel debugger that DTrace is now inactive.
18161 */
18162 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
18163 }
18164
18165 bzero(&dtrace_anon, sizeof (dtrace_anon_t));
18166 unregister_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
18167 dtrace_cpu_init = NULL;
18168 dtrace_helpers_cleanup = NULL;
18169 dtrace_helpers_fork = NULL;
18170 dtrace_cpustart_init = NULL;
18171 dtrace_cpustart_fini = NULL;
18172 dtrace_debugger_init = NULL;
18173 dtrace_debugger_fini = NULL;
18174 dtrace_kreloc_init = NULL;
18175 dtrace_kreloc_fini = NULL;
18176 dtrace_modload = NULL;
18177 dtrace_modunload = NULL;
18178
18179 lck_mtx_unlock(&cpu_lock);
18180
18181 if (dtrace_helptrace_enabled) {
18182 kmem_free(dtrace_helptrace_buffer, dtrace_helptrace_bufsize);
18183 dtrace_helptrace_buffer = NULL;
18184 }
18185
18186 kmem_free(dtrace_probes, dtrace_nprobes * sizeof (dtrace_probe_t *));
18187 dtrace_probes = NULL;
18188 dtrace_nprobes = 0;
18189
18190 dtrace_hash_destroy(dtrace_strings);
18191 dtrace_hash_destroy(dtrace_byprov);
18192 dtrace_hash_destroy(dtrace_bymod);
18193 dtrace_hash_destroy(dtrace_byfunc);
18194 dtrace_hash_destroy(dtrace_byname);
18195 dtrace_strings = NULL;
18196 dtrace_byprov = NULL;
18197 dtrace_bymod = NULL;
18198 dtrace_byfunc = NULL;
18199 dtrace_byname = NULL;
18200
18201 kmem_cache_destroy(dtrace_state_cache);
18202 vmem_destroy(dtrace_arena);
18203
18204 if (dtrace_toxrange != NULL) {
18205 kmem_free(dtrace_toxrange,
18206 dtrace_toxranges_max * sizeof (dtrace_toxrange_t));
18207 dtrace_toxrange = NULL;
18208 dtrace_toxranges = 0;
18209 dtrace_toxranges_max = 0;
18210 }
18211
18212 ddi_remove_minor_node(dtrace_devi, NULL);
18213 dtrace_devi = NULL;
18214
18215 ddi_soft_state_fini(&dtrace_softstate);
18216
18217 ASSERT(dtrace_vtime_references == 0);
18218 ASSERT(dtrace_opens == 0);
18219 ASSERT(dtrace_retained == NULL);
18220
18221 lck_mtx_unlock(&dtrace_lock);
18222 lck_mtx_unlock(&dtrace_provider_lock);
18223
18224#ifdef illumos
18225 /*
18226 * We don't destroy the task queue until after we have dropped our
18227 * locks (taskq_destroy() may block on running tasks). To prevent
18228 * attempting to do work after we have effectively detached but before
18229 * the task queue has been destroyed, all tasks dispatched via the
18230 * task queue must check that DTrace is still attached before
18231 * performing any operation.
18232 */
18233 taskq_destroy(dtrace_taskq);
18234 dtrace_taskq = NULL;
18235#endif
18236
18237 return (DDI_SUCCESS);
18238}
18239#endif /* __APPLE__ */
18240
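/*
 * Darwin devfs entry points for the dtrace and helper devices. The
 * _dtrace_* shims adapt the BSD cdevsw calling convention to the
 * Solaris-style dtrace_open()/dtrace_close()/dtrace_ioctl() routines
 * above, supplying the caller's credential via CRED().
 */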
18241d_open_t _dtrace_open, helper_open;
18242d_close_t _dtrace_close, helper_close;
18243d_ioctl_t _dtrace_ioctl, helper_ioctl;
18244
18245int
18246_dtrace_open(dev_t dev, int flags, int devtype, struct proc *p)
18247{
18248#pragma unused(p)
18249 dev_t locdev = dev;
18250
18251 return dtrace_open( &locdev, flags, devtype, CRED());
18252}
18253
18254int
18255helper_open(dev_t dev, int flags, int devtype, struct proc *p)
18256{
18257#pragma unused(dev,flags,devtype,p)
18258 return 0;
18259}
18260
18261int
18262_dtrace_close(dev_t dev, int flags, int devtype, struct proc *p)
18263{
18264#pragma unused(p)
18265 return dtrace_close( dev, flags, devtype, CRED());
18266}
18267
18268int
18269helper_close(dev_t dev, int flags, int devtype, struct proc *p)
18270{
18271#pragma unused(dev,flags,devtype,p)
18272 return 0;
18273}
18274
18275int
18276_dtrace_ioctl(dev_t dev, u_long cmd, caddr_t data, int fflag, struct proc *p)
18277{
18278#pragma unused(p)
18279 int err, rv = 0;
18280 user_addr_t uaddrp;
18281
18282 if (proc_is64bit(p))
18283 uaddrp = *(user_addr_t *)data;
18284 else
18285 uaddrp = (user_addr_t) *(uint32_t *)data;
18286
18287 err = dtrace_ioctl(dev, cmd, uaddrp, fflag, CRED(), &rv);
18288
18289 /* Darwin's BSD ioctls only return -1 or zero. Overload errno to mimic Solaris. 20 bits suffice. */
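	/*
	 * For example: a Solaris-style return value rv == 5 reaches
	 * userspace as errno (5 << 12) == 20480 and is recovered as
	 * errno >> 12, while a plain error such as EBUSY stays below
	 * 4096 and is passed through unchanged.
	 */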
18290 if (err != 0) {
18291 ASSERT( (err & 0xfffff000) == 0 );
18292 return (err & 0xfff); /* ioctl will return -1 and will set errno to an error code < 4096 */
18293 } else if (rv != 0) {
18294 ASSERT( (rv & 0xfff00000) == 0 );
18295 return (((rv & 0xfffff) << 12)); /* ioctl will return -1 and will set errno to a value >= 4096 */
18296 } else
18297 return 0;
18298}
18299
18300int
18301helper_ioctl(dev_t dev, u_long cmd, caddr_t data, int fflag, struct proc *p)
18302{
18303#pragma unused(dev,fflag,p)
18304 int err, rv = 0;
18305
18306 err = dtrace_ioctl_helper(cmd, data, &rv);
18307 /* Darwin's BSD ioctls only return -1 or zero. Overload errno to mimic Solaris. 20 bits suffice. */
18308 if (err != 0) {
18309 ASSERT( (err & 0xfffff000) == 0 );
18310 return (err & 0xfff); /* ioctl will return -1 and will set errno to an error code < 4096 */
18311 } else if (rv != 0) {
18312 ASSERT( (rv & 0xfff00000) == 0 );
18313 return (((rv & 0xfffff) << 12)); /* ioctl will return -1 and will set errno to a value >= 4096 */
18314 } else
18315 return 0;
18316}
18317
18318#define HELPER_MAJOR -24 /* let the kernel pick the device number */
18319
18320/*
18321 * A struct describing which functions will get invoked for certain
18322 * actions.
18323 */
18324static struct cdevsw helper_cdevsw =
18325{
18326 helper_open, /* open */
18327 helper_close, /* close */
18328 eno_rdwrt, /* read */
18329 eno_rdwrt, /* write */
18330 helper_ioctl, /* ioctl */
18331 (stop_fcn_t *)nulldev, /* stop */
18332 (reset_fcn_t *)nulldev, /* reset */
18333 NULL, /* tty's */
18334 eno_select, /* select */
18335 eno_mmap, /* mmap */
18336 eno_strat, /* strategy */
18337 eno_getc, /* getc */
18338 eno_putc, /* putc */
18339 0 /* type */
18340};
18341
18342static int helper_majdevno = 0;
18343
18344static int gDTraceInited = 0;
18345
18346void
18347helper_init( void )
18348{
18349 /*
18350 * Once the "helper" is initialized, it can take ioctl calls that use locks
18351 * and zones initialized in dtrace_init. Make certain dtrace_init was called
18352 * before us.
18353 */
18354
18355 if (!gDTraceInited) {
18356 panic("helper_init before dtrace_init\n");
18357 }
18358
18359 if (0 >= helper_majdevno)
18360 {
18361 helper_majdevno = cdevsw_add(HELPER_MAJOR, &helper_cdevsw);
18362
18363 if (helper_majdevno < 0) {
18364 printf("helper_init: failed to allocate a major number!\n");
18365 return;
18366 }
18367
18368 if (NULL == devfs_make_node( makedev(helper_majdevno, 0), DEVFS_CHAR, UID_ROOT, GID_WHEEL, 0666,
18369 DTRACEMNR_HELPER, 0 )) {
18370 printf("dtrace_init: failed to devfs_make_node for helper!\n");
18371 return;
18372 }
18373 } else
18374 panic("helper_init: called twice!\n");
18375}
18376
18377#undef HELPER_MAJOR
18378
18379static int
18380dtrace_clone_func(dev_t dev, int action)
18381{
18382#pragma unused(dev)
18383
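	/*
	 * Each open of /dev/dtrace is cloned onto its own minor number;
	 * DEVFS_CLONE_ALLOC reserves the next available minor (and with
	 * it a consumer state slot) via dtrace_state_reserve().
	 */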
18384 if (action == DEVFS_CLONE_ALLOC) {
18385 return dtrace_state_reserve();
18386 }
18387 else if (action == DEVFS_CLONE_FREE) {
18388 return 0;
18389 }
18390 else return -1;
18391}
18392
18393void dtrace_ast(void);
18394
18395void
18396dtrace_ast(void)
18397{
18398 int i;
18399 uint32_t clients = atomic_and_32(&dtrace_wake_clients, 0);
18400 if (clients == 0)
18401 return;
	/*
	 * We disable preemption here to be sure that we won't be
	 * preempted by a woken thread of higher priority before we
	 * have issued all of the wakeups.
	 */
18407 disable_preemption();
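	/*
	 * Each set bit in the mask names a client (minor number) whose
	 * consumer should be woken; the atomic_and_32() above fetched
	 * the previous mask and cleared it in a single step.
	 */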
18408 for (i = 0; i < DTRACE_NCLIENTS; i++) {
18409 if (clients & (1 << i)) {
18410 dtrace_state_t *state = dtrace_state_get(i);
18411 if (state) {
18412 wakeup(state);
18413 }
18414
18415 }
18416 }
18417 enable_preemption();
18418}
18419
18420
18421#define DTRACE_MAJOR -24 /* let the kernel pick the device number */
18422
18423static struct cdevsw dtrace_cdevsw =
18424{
18425 _dtrace_open, /* open */
18426 _dtrace_close, /* close */
18427 eno_rdwrt, /* read */
18428 eno_rdwrt, /* write */
18429 _dtrace_ioctl, /* ioctl */
18430 (stop_fcn_t *)nulldev, /* stop */
18431 (reset_fcn_t *)nulldev, /* reset */
18432 NULL, /* tty's */
18433 eno_select, /* select */
18434 eno_mmap, /* mmap */
18435 eno_strat, /* strategy */
18436 eno_getc, /* getc */
18437 eno_putc, /* putc */
18438 0 /* type */
18439};
18440
18441lck_attr_t* dtrace_lck_attr;
18442lck_grp_attr_t* dtrace_lck_grp_attr;
18443lck_grp_t* dtrace_lck_grp;
18444
18445static int gMajDevNo;
18446
18447void dtrace_early_init (void)
18448{
18449 dtrace_restriction_policy_load();
18450
18451 /*
18452 * See dtrace_impl.h for a description of kernel symbol modes.
18453 * The default is to wait for symbols from userspace (lazy symbols).
18454 */
18455 if (!PE_parse_boot_argn("dtrace_kernel_symbol_mode", &dtrace_kernel_symbol_mode, sizeof (dtrace_kernel_symbol_mode))) {
18456 dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE;
18457 }
18458}
18459
18460void
18461dtrace_init( void )
18462{
18463 if (0 == gDTraceInited) {
18464 int i, ncpu;
18465 size_t size = sizeof(dtrace_buffer_memory_maxsize);
18466
		/*
		 * DTrace allocates buffers based on the maximum number
		 * of enabled CPUs. This call avoids any race when finding
		 * that count.
		 */
18472 ASSERT(dtrace_max_cpus == 0);
18473 ncpu = dtrace_max_cpus = ml_get_max_cpus();
18474
		/*
		 * Retrieve the size of the physical memory in order to define
		 * the state buffer memory maximal size. If we cannot retrieve
		 * this value, we'll assume 1 GB of memory per CPU; that's
		 * still better than raising a kernel panic.
		 */
18481 if (0 != kernel_sysctlbyname("hw.memsize", &dtrace_buffer_memory_maxsize,
18482 &size, NULL, 0))
18483 {
18484 dtrace_buffer_memory_maxsize = ncpu * 1024 * 1024 * 1024;
18485 printf("dtrace_init: failed to retrieve the hw.memsize, defaulted to %lld bytes\n",
18486 dtrace_buffer_memory_maxsize);
18487 }
18488
18489 /*
18490 * Finally, divide by three to prevent DTrace from eating too
18491 * much memory.
18492 */
18493 dtrace_buffer_memory_maxsize /= 3;
18494 ASSERT(dtrace_buffer_memory_maxsize > 0);
18495
18496 gMajDevNo = cdevsw_add(DTRACE_MAJOR, &dtrace_cdevsw);
18497
18498 if (gMajDevNo < 0) {
18499 printf("dtrace_init: failed to allocate a major number!\n");
18500 gDTraceInited = 0;
18501 return;
18502 }
18503
18504 if (NULL == devfs_make_node_clone( makedev(gMajDevNo, 0), DEVFS_CHAR, UID_ROOT, GID_WHEEL, 0666,
18505 dtrace_clone_func, DTRACEMNR_DTRACE, 0 )) {
18506 printf("dtrace_init: failed to devfs_make_node_clone for dtrace!\n");
18507 gDTraceInited = 0;
18508 return;
18509 }
18510
18511 /*
18512 * Allocate the dtrace_probe_t zone
18513 */
18514 dtrace_probe_t_zone = zinit(sizeof(dtrace_probe_t),
18515 1024 * sizeof(dtrace_probe_t),
18516 sizeof(dtrace_probe_t),
18517 "dtrace.dtrace_probe_t");
18518
18519 /*
18520 * Create the dtrace lock group and attrs.
18521 */
18522 dtrace_lck_attr = lck_attr_alloc_init();
18523 dtrace_lck_grp_attr= lck_grp_attr_alloc_init();
18524 dtrace_lck_grp = lck_grp_alloc_init("dtrace", dtrace_lck_grp_attr);
18525
18526 /*
18527 * We have to initialize all locks explicitly
18528 */
18529 lck_mtx_init(&dtrace_lock, dtrace_lck_grp, dtrace_lck_attr);
18530 lck_mtx_init(&dtrace_provider_lock, dtrace_lck_grp, dtrace_lck_attr);
18531 lck_mtx_init(&dtrace_meta_lock, dtrace_lck_grp, dtrace_lck_attr);
18532 lck_mtx_init(&dtrace_procwaitfor_lock, dtrace_lck_grp, dtrace_lck_attr);
18533#if DEBUG
18534 lck_mtx_init(&dtrace_errlock, dtrace_lck_grp, dtrace_lck_attr);
18535#endif
18536 lck_rw_init(&dtrace_dof_mode_lock, dtrace_lck_grp, dtrace_lck_attr);
18537
18538 /*
18539 * The cpu_core structure consists of per-CPU state available in any context.
18540 * On some architectures, this may mean that the page(s) containing the
18541 * NCPU-sized array of cpu_core structures must be locked in the TLB -- it
18542 * is up to the platform to assure that this is performed properly. Note that
18543 * the structure is sized to avoid false sharing.
18544 */
18545 lck_mtx_init(&cpu_lock, dtrace_lck_grp, dtrace_lck_attr);
18546 lck_mtx_init(&cyc_lock, dtrace_lck_grp, dtrace_lck_attr);
18547 lck_mtx_init(&mod_lock, dtrace_lck_grp, dtrace_lck_attr);
18548
18549 /*
18550 * Initialize the CPU offline/online hooks.
18551 */
18552 dtrace_install_cpu_hooks();
18553
18554 dtrace_modctl_list = NULL;
18555
18556 cpu_core = (cpu_core_t *)kmem_zalloc( ncpu * sizeof(cpu_core_t), KM_SLEEP );
18557 for (i = 0; i < ncpu; ++i) {
18558 lck_mtx_init(&cpu_core[i].cpuc_pid_lock, dtrace_lck_grp, dtrace_lck_attr);
18559 }
18560
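		/*
		 * The per-CPU entries are linked into a circular list via
		 * cpu_next; each also gets its own cyclic list and fasttrap
		 * lock.
		 */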
18561 cpu_list = (dtrace_cpu_t *)kmem_zalloc( ncpu * sizeof(dtrace_cpu_t), KM_SLEEP );
18562 for (i = 0; i < ncpu; ++i) {
18563 cpu_list[i].cpu_id = (processorid_t)i;
18564 cpu_list[i].cpu_next = &(cpu_list[(i+1) % ncpu]);
18565 LIST_INIT(&cpu_list[i].cpu_cyc_list);
18566 lck_rw_init(&cpu_list[i].cpu_ft_lock, dtrace_lck_grp, dtrace_lck_attr);
18567 }
18568
18569 lck_mtx_lock(&cpu_lock);
18570 for (i = 0; i < ncpu; ++i)
18571 /* FIXME: track CPU configuration */
18572 dtrace_cpu_setup_initial( (processorid_t)i ); /* In lieu of register_cpu_setup_func() callback */
18573 lck_mtx_unlock(&cpu_lock);
18574
18575 (void)dtrace_abs_to_nano(0LL); /* Force once only call to clock_timebase_info (which can take a lock) */
18576
18577 dtrace_strings = dtrace_hash_create(dtrace_strkey_offset,
18578 offsetof(dtrace_string_t, dtst_str),
18579 offsetof(dtrace_string_t, dtst_next),
18580 offsetof(dtrace_string_t, dtst_prev));
18581
18582 dtrace_isa_init();
18583 /*
18584 * See dtrace_impl.h for a description of dof modes.
18585 * The default is lazy dof.
18586 *
18587 * FIXME: Warn if state is LAZY_OFF? It won't break anything, but
18588 * makes no sense...
18589 */
18590 if (!PE_parse_boot_argn("dtrace_dof_mode", &dtrace_dof_mode, sizeof (dtrace_dof_mode))) {
18591#if CONFIG_EMBEDDED
18592 /* Disable DOF mode by default for performance reasons */
18593 dtrace_dof_mode = DTRACE_DOF_MODE_NEVER;
18594#else
18595 dtrace_dof_mode = DTRACE_DOF_MODE_LAZY_ON;
18596#endif
18597 }
18598
18599 /*
18600 * Sanity check of dof mode value.
18601 */
18602 switch (dtrace_dof_mode) {
18603 case DTRACE_DOF_MODE_NEVER:
18604 case DTRACE_DOF_MODE_LAZY_ON:
18605 /* valid modes, but nothing else we need to do */
18606 break;
18607
18608 case DTRACE_DOF_MODE_LAZY_OFF:
18609 case DTRACE_DOF_MODE_NON_LAZY:
18610 /* Cannot wait for a dtrace_open to init fasttrap */
18611 fasttrap_init();
18612 break;
18613
18614 default:
			/* Invalid, clamp to non-lazy */
18616 dtrace_dof_mode = DTRACE_DOF_MODE_NON_LAZY;
18617 fasttrap_init();
18618 break;
18619 }
18620
18621 gDTraceInited = 1;
18622
18623 } else
18624 panic("dtrace_init: called twice!\n");
18625}
18626
18627void
18628dtrace_postinit(void)
18629{
	/*
	 * Called from bsd_init after all providers' *_init() routines have been
	 * run. That way, anonymous DOF enabled under dtrace_attach() is safe
	 * to go.
	 */
18635 dtrace_attach( (dev_info_t *)(uintptr_t)makedev(gMajDevNo, 0)); /* Punning a dev_t to a dev_info_t* */
18636
18637 /*
18638 * Add the mach_kernel to the module list for lazy processing
18639 */
18640 struct kmod_info fake_kernel_kmod;
18641 memset(&fake_kernel_kmod, 0, sizeof(fake_kernel_kmod));
18642
18643 strlcpy(fake_kernel_kmod.name, "mach_kernel", sizeof(fake_kernel_kmod.name));
18644 fake_kernel_kmod.id = 1;
18645 fake_kernel_kmod.address = g_kernel_kmod_info.address;
18646 fake_kernel_kmod.size = g_kernel_kmod_info.size;
18647
18648 if (dtrace_module_loaded(&fake_kernel_kmod, 0) != 0) {
18649 printf("dtrace_postinit: Could not register mach_kernel modctl\n");
18650 }
18651
18652 if (!PE_parse_boot_argn("dtrace_provide_private_probes", &dtrace_provide_private_probes, sizeof (dtrace_provide_private_probes))) {
18653 dtrace_provide_private_probes = 0;
18654 }
18655
18656 (void)OSKextRegisterKextsWithDTrace();
18657}
18658#undef DTRACE_MAJOR
18659
/*
 * Routines used to register interest in CPUs being added to or removed
 * from the system.
 */
18664void
18665register_cpu_setup_func(cpu_setup_func_t *ignore1, void *ignore2)
18666{
18667#pragma unused(ignore1,ignore2)
18668}
18669
18670void
18671unregister_cpu_setup_func(cpu_setup_func_t *ignore1, void *ignore2)
18672{
18673#pragma unused(ignore1,ignore2)
18674}
18675