spec_vnops.c source code [codebrowser/bsd/miscfs/specfs/spec_vnops.c]

1	/*
2	* Copyright (c) 2000-2016 Apple Computer, Inc. All rights reserved.
3	*
4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5	*
6	* This file contains Original Code and/or Modifications of Original Code
7	* as defined in and that are subject to the Apple Public Source License
8	* Version 2.0 (the 'License'). You may not use this file except in
9	* compliance with the License. The rights granted to you under the License
10	* may not be used to create, or enable the creation or redistribution of,
11	* unlawful or unlicensed copies of an Apple operating system, or to
12	* circumvent, violate, or enable the circumvention or violation of, any
13	* terms of an Apple operating system software license agreement.
14	*
15	* Please obtain a copy of the License at
16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
17	*
18	* The Original Code and all software distributed under the License are
19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23	* Please see the License for the specific language governing rights and
24	* limitations under the License.
25	*
26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27	*/
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29	/*
30	* Copyright (c) 1989, 1993, 1995
31	* The Regents of the University of California. All rights reserved.
32	*
33	* Redistribution and use in source and binary forms, with or without
34	* modification, are permitted provided that the following conditions
35	* are met:
36	* 1. Redistributions of source code must retain the above copyright
37	* notice, this list of conditions and the following disclaimer.
38	* 2. Redistributions in binary form must reproduce the above copyright
39	* notice, this list of conditions and the following disclaimer in the
40	* documentation and/or other materials provided with the distribution.
41	* 3. All advertising materials mentioning features or use of this software
42	* must display the following acknowledgement:
43	* This product includes software developed by the University of
44	* California, Berkeley and its contributors.
45	* 4. Neither the name of the University nor the names of its contributors
46	* may be used to endorse or promote products derived from this software
47	* without specific prior written permission.
48	*
49	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59	* SUCH DAMAGE.
60	*
61	* @(#)spec_vnops.c 8.14 (Berkeley) 5/21/95
62	*/
63
64	#include <sys/param.h>
65	#include <sys/proc_internal.h>
66	#include <sys/kauth.h>
67	#include <sys/systm.h>
68	#include <sys/kernel.h>
69	#include <sys/conf.h>
70	#include <sys/buf_internal.h>
71	#include <sys/mount_internal.h>
72	#include <sys/vnode_internal.h>
73	#include <sys/file_internal.h>
74	#include <sys/namei.h>
75	#include <sys/stat.h>
76	#include <sys/errno.h>
77	#include <sys/ioctl.h>
78	#include <sys/file.h>
79	#include <sys/user.h>
80	#include <sys/malloc.h>
81	#include <sys/disk.h>
82	#include <sys/uio_internal.h>
83	#include <sys/resource.h>
84	#include <machine/machine_routines.h>
85	#include <miscfs/specfs/specdev.h>
86	#include <vfs/vfs_support.h>
87	#include <vfs/vfs_disk_conditioner.h>
88
89	#include <kern/assert.h>
90	#include <kern/task.h>
91	#include <kern/sched_prim.h>
92	#include <kern/thread.h>
93	#include <kern/policy_internal.h>
94	#include <kern/timer_call.h>
95	#include <kern/waitq.h>
96
97	#include <pexpert/pexpert.h>
98
99	#include <sys/kdebug.h>
100	#include <libkern/section_keywords.h>
101
102	/ XXX following three prototypes should be in a header file somewhere /
103	extern dev_t chrtoblk(dev_t dev);
104	extern boolean_t iskmemdev(dev_t dev);
105	extern int bpfkqfilter(dev_t dev, struct knote *kn);
106	extern int ptsd_kqfilter(dev_t, struct knote *);
107	extern int ptmx_kqfilter(dev_t, struct knote *);
108
109	struct vnode *speclisth[SPECHSZ];
110
111	/ symbolic sleep message strings for devices /
112	char devopn[] = "devopn";
113	char devio[] = "devio";
114	char devwait[] = "devwait";
115	char devin[] = "devin";
116	char devout[] = "devout";
117	char devioc[] = "devioc";
118	char devcls[] = "devcls";
119
120	#define VOPFUNC int ()(void )
121
122	int (*spec_vnodeop_p)(void* *);
123	struct vnodeopv_entry_desc spec_vnodeop_entries[] = {
124	{ &vnop_default_desc, (VOPFUNC)vn_default_error },
125	{ &vnop_lookup_desc, (VOPFUNC)spec_lookup }, / lookup /
126	{ &vnop_create_desc, (VOPFUNC)err_create }, / create /
127	{ &vnop_mknod_desc, (VOPFUNC)err_mknod }, / mknod /
128	{ &vnop_open_desc, (VOPFUNC)spec_open }, / open /
129	{ &vnop_close_desc, (VOPFUNC)spec_close }, / close /
130	{ &vnop_access_desc, (VOPFUNC)spec_access }, / access /
131	{ &vnop_getattr_desc, (VOPFUNC)spec_getattr }, / getattr /
132	{ &vnop_setattr_desc, (VOPFUNC)spec_setattr }, / setattr /
133	{ &vnop_read_desc, (VOPFUNC)spec_read }, / read /
134	{ &vnop_write_desc, (VOPFUNC)spec_write }, / write /
135	{ &vnop_ioctl_desc, (VOPFUNC)spec_ioctl }, / ioctl /
136	{ &vnop_select_desc, (VOPFUNC)spec_select }, / select /
137	{ &vnop_revoke_desc, (VOPFUNC)nop_revoke }, / revoke /
138	{ &vnop_mmap_desc, (VOPFUNC)err_mmap }, / mmap /
139	{ &vnop_fsync_desc, (VOPFUNC)spec_fsync }, / fsync /
140	{ &vnop_remove_desc, (VOPFUNC)err_remove }, / remove /
141	{ &vnop_link_desc, (VOPFUNC)err_link }, / link /
142	{ &vnop_rename_desc, (VOPFUNC)err_rename }, / rename /
143	{ &vnop_mkdir_desc, (VOPFUNC)err_mkdir }, / mkdir /
144	{ &vnop_rmdir_desc, (VOPFUNC)err_rmdir }, / rmdir /
145	{ &vnop_symlink_desc, (VOPFUNC)err_symlink }, / symlink /
146	{ &vnop_readdir_desc, (VOPFUNC)err_readdir }, / readdir /
147	{ &vnop_readlink_desc, (VOPFUNC)err_readlink }, / readlink /
148	{ &vnop_inactive_desc, (VOPFUNC)nop_inactive }, / inactive /
149	{ &vnop_reclaim_desc, (VOPFUNC)nop_reclaim }, / reclaim /
150	{ &vnop_strategy_desc, (VOPFUNC)spec_strategy }, / strategy /
151	{ &vnop_pathconf_desc, (VOPFUNC)spec_pathconf }, / pathconf /
152	{ &vnop_advlock_desc, (VOPFUNC)err_advlock }, / advlock /
153	{ &vnop_bwrite_desc, (VOPFUNC)spec_bwrite }, / bwrite /
154	{ &vnop_pagein_desc, (VOPFUNC)err_pagein }, / Pagein /
155	{ &vnop_pageout_desc, (VOPFUNC)err_pageout }, / Pageout /
156	{ &vnop_copyfile_desc, (VOPFUNC)err_copyfile }, / Copyfile /
157	{ &vnop_blktooff_desc, (VOPFUNC)spec_blktooff }, / blktooff /
158	{ &vnop_offtoblk_desc, (VOPFUNC)spec_offtoblk }, / offtoblk /
159	{ &vnop_blockmap_desc, (VOPFUNC)spec_blockmap }, / blockmap /
160	{ (struct vnodeop_desc)NULL, (int()(void *))NULL }
161	};
162	struct vnodeopv_desc spec_vnodeop_opv_desc =
163	{ &spec_vnodeop_p, spec_vnodeop_entries };
164
165
166	static void set_blocksize(vnode_t, dev_t);
167
168	#define LOWPRI_TIER1_WINDOW_MSECS 25
169	#define LOWPRI_TIER2_WINDOW_MSECS 100
170	#define LOWPRI_TIER3_WINDOW_MSECS 500
171
172	#define LOWPRI_TIER1_IO_PERIOD_MSECS 40
173	#define LOWPRI_TIER2_IO_PERIOD_MSECS 85
174	#define LOWPRI_TIER3_IO_PERIOD_MSECS 200
175
176	#define LOWPRI_TIER1_IO_PERIOD_SSD_MSECS 5
177	#define LOWPRI_TIER2_IO_PERIOD_SSD_MSECS 15
178	#define LOWPRI_TIER3_IO_PERIOD_SSD_MSECS 25
179
180
181	int throttle_windows_msecs[THROTTLE_LEVEL_END + `1`] = {
182	`0`,
183	LOWPRI_TIER1_WINDOW_MSECS,
184	LOWPRI_TIER2_WINDOW_MSECS,
185	LOWPRI_TIER3_WINDOW_MSECS,
186	};
187
188	int throttle_io_period_msecs[THROTTLE_LEVEL_END + `1`] = {
189	`0`,
190	LOWPRI_TIER1_IO_PERIOD_MSECS,
191	LOWPRI_TIER2_IO_PERIOD_MSECS,
192	LOWPRI_TIER3_IO_PERIOD_MSECS,
193	};
194
195	int throttle_io_period_ssd_msecs[THROTTLE_LEVEL_END + `1`] = {
196	`0`,
197	LOWPRI_TIER1_IO_PERIOD_SSD_MSECS,
198	LOWPRI_TIER2_IO_PERIOD_SSD_MSECS,
199	LOWPRI_TIER3_IO_PERIOD_SSD_MSECS,
200	};
201
202
203	int throttled_count[THROTTLE_LEVEL_END + `1`];
204
205	struct _throttle_io_info_t {
206	lck_mtx_t throttle_lock;
207
208	struct timeval throttle_last_write_timestamp;
209	struct timeval throttle_min_timer_deadline;
210	struct timeval throttle_window_start_timestamp[THROTTLE_LEVEL_END + `1`]; / window starts at both the beginning and completion of an I/O /
211	struct timeval throttle_last_IO_timestamp[THROTTLE_LEVEL_END + `1`];
212	pid_t throttle_last_IO_pid[THROTTLE_LEVEL_END + `1`];
213	struct timeval throttle_start_IO_period_timestamp[THROTTLE_LEVEL_END + `1`];
214	int32_t throttle_inflight_count[THROTTLE_LEVEL_END + `1`];
215
216	TAILQ_HEAD( , uthread) throttle_uthlist[THROTTLE_LEVEL_END + `1`]; / Lists of throttled uthreads /
217	int throttle_next_wake_level;
218
219	thread_call_t throttle_timer_call;
220	int32_t throttle_timer_ref;
221	int32_t throttle_timer_active;
222
223	int32_t throttle_io_count;
224	int32_t throttle_io_count_begin;
225	int *throttle_io_periods;
226	uint32_t throttle_io_period_num;
227
228	int32_t throttle_refcnt;
229	int32_t throttle_alloc;
230	int32_t throttle_disabled;
231	int32_t throttle_is_fusion_with_priority;
232	};
233
234	struct _throttle_io_info_t _throttle_io_info[LOWPRI_MAX_NUM_DEV];
235
236
237	int lowpri_throttle_enabled = `1`;
238
239
240	static void throttle_info_end_io_internal(struct _throttle_io_info_t info, int* throttle_level);
241	static int throttle_info_update_internal(struct _throttle_io_info_t info, uthread_t ut, int* flags, boolean_t isssd, boolean_t inflight, struct bufattr *bap);
242	static int throttle_get_thread_throttle_level(uthread_t ut);
243	static int throttle_get_thread_throttle_level_internal(uthread_t ut, int io_tier);
244	void throttle_info_mount_reset_period(mount_t mp, int isssd);
245
246	/*
247	* Trivial lookup routine that always fails.
248	*/
249	int
250	spec_lookup(struct vnop_lookup_args *ap)
251	{
252
253	*ap->a_vpp = NULL;
254	return (ENOTDIR);
255	}
256
257	static void
258	set_blocksize(struct vnode *vp, dev_t dev)
259	{
260	int (*size)(dev_t);
261	int rsize;
262
263	if ((major(dev) < nblkdev) && (size = bdevsw[major(dev)].d_psize)) {
264	rsize = (*size)(dev);
265	if (rsize <= `0`) / did size fail? /
266	vp->v_specsize = DEV_BSIZE;
267	else
268	vp->v_specsize = rsize;
269	}
270	else
271	vp->v_specsize = DEV_BSIZE;
272	}
273
274	void
275	set_fsblocksize(struct vnode *vp)
276	{
277
278	if (vp->v_type == VBLK) {
279	dev_t dev = (dev_t)vp->v_rdev;
280	int maj = major(dev);
281
282	if ((u_int)maj >= (u_int)nblkdev)
283	return;
284
285	vnode_lock(vp);
286	set_blocksize(vp, dev);
287	vnode_unlock(vp);
288	}
289
290	}
291
292
293	/*
294	* Open a special file.
295	*/
296	int
297	spec_open(struct vnop_open_args *ap)
298	{
299	struct proc *p = vfs_context_proc(ap->a_context);
300	kauth_cred_t cred = vfs_context_ucred(ap->a_context);
301	struct vnode *vp = ap->a_vp;
302	dev_t bdev, dev = (dev_t)vp->v_rdev;
303	int maj = major(dev);
304	int error;
305
306	/*
307	* Don't allow open if fs is mounted -nodev.
308	*/
309	if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_NODEV))
310	return (ENXIO);
311
312	switch (vp->v_type) {
313
314	case VCHR:
315	if ((u_int)maj >= (u_int)nchrdev)
316	return (ENXIO);
317	if (cred != FSCRED && (ap->a_mode & FWRITE)) {
318	/*
319	* When running in very secure mode, do not allow
320	* opens for writing of any disk character devices.
321	*/
322	if (securelevel >= `2` && isdisk(dev, VCHR))
323	return (EPERM);
324
325	/ Never allow writing to /dev/mem or /dev/kmem /
326	if (iskmemdev(dev))
327	return (EPERM);
328	/*
329	* When running in secure mode, do not allow opens for
330	* writing of character devices whose corresponding block
331	* devices are currently mounted.
332	*/
333	if (securelevel >= `1`) {
334	if ((bdev = chrtoblk(dev)) != NODEV && check_mountedon(bdev, VBLK, &error))
335	return (error);
336	}
337	}
338
339	devsw_lock(dev, S_IFCHR);
340	error = (*cdevsw[maj].d_open)(dev, ap->a_mode, S_IFCHR, p);
341
342	if (error == `0`) {
343	vp->v_specinfo->si_opencount++;
344	}
345
346	devsw_unlock(dev, S_IFCHR);
347
348	if (error == `0` && cdevsw[maj].d_type == D_DISK && !vp->v_un.vu_specinfo->si_initted) {
349	int isssd = `0`;
350	uint64_t throttle_mask = `0`;
351	uint32_t devbsdunit = `0`;
352
353	if (VNOP_IOCTL(vp, DKIOCGETTHROTTLEMASK, (caddr_t)&throttle_mask, `0`, NULL) == `0`) {
354
355	if (throttle_mask != `0` &&
356	VNOP_IOCTL(vp, DKIOCISSOLIDSTATE, (caddr_t)&isssd, `0`, ap->a_context) == `0`) {
357	/*
358	* as a reasonable approximation, only use the lowest bit of the mask
359	* to generate a disk unit number
360	*/
361	devbsdunit = num_trailing_0(throttle_mask);
362
363	vnode_lock(vp);
364
365	vp->v_un.vu_specinfo->si_isssd = isssd;
366	vp->v_un.vu_specinfo->si_devbsdunit = devbsdunit;
367	vp->v_un.vu_specinfo->si_throttle_mask = throttle_mask;
368	vp->v_un.vu_specinfo->si_throttleable = `1`;
369	vp->v_un.vu_specinfo->si_initted = `1`;
370
371	vnode_unlock(vp);
372	}
373	}
374	if (vp->v_un.vu_specinfo->si_initted == `0`) {
375	vnode_lock(vp);
376	vp->v_un.vu_specinfo->si_initted = `1`;
377	vnode_unlock(vp);
378	}
379	}
380	return (error);
381
382	case VBLK:
383	if ((u_int)maj >= (u_int)nblkdev)
384	return (ENXIO);
385	/*
386	* When running in very secure mode, do not allow
387	* opens for writing of any disk block devices.
388	*/
389	if (securelevel >= `2` && cred != FSCRED &&
390	(ap->a_mode & FWRITE) && bdevsw[maj].d_type == D_DISK)
391	return (EPERM);
392	/*
393	* Do not allow opens of block devices that are
394	* currently mounted.
395	*/
396	if ( (error = vfs_mountedon(vp)) )
397	return (error);
398
399	devsw_lock(dev, S_IFBLK);
400	error = (*bdevsw[maj].d_open)(dev, ap->a_mode, S_IFBLK, p);
401	if (!error) {
402	vp->v_specinfo->si_opencount++;
403	}
404	devsw_unlock(dev, S_IFBLK);
405
406	if (!error) {
407	u_int64_t blkcnt;
408	u_int32_t blksize;
409	int setsize = `0`;
410	u_int32_t size512 = `512`;
411
412
413	if (!VNOP_IOCTL(vp, DKIOCGETBLOCKSIZE, (caddr_t)&blksize, `0`, ap->a_context)) {
414	/ Switch to 512 byte sectors (temporarily) /
415
416	if (!VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&size512, FWRITE, ap->a_context)) {
417	/ Get the number of 512 byte physical blocks. /
418	if (!VNOP_IOCTL(vp, DKIOCGETBLOCKCOUNT, (caddr_t)&blkcnt, `0`, ap->a_context)) {
419	setsize = `1`;
420	}
421	}
422	/ If it doesn't set back, we can't recover /
423	if (VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&blksize, FWRITE, ap->a_context))
424	error = ENXIO;
425	}
426
427
428	vnode_lock(vp);
429	set_blocksize(vp, dev);
430
431	/*
432	* Cache the size in bytes of the block device for later
433	* use by spec_write().
434	*/
435	if (setsize)
436	vp->v_specdevsize = blkcnt * (u_int64_t)size512;
437	else
438	vp->v_specdevsize = (u_int64_t)`0`; / Default: Can't get /
439
440	vnode_unlock(vp);
441
442	}
443	return(error);
444	default:
445	panic("spec_open type");
446	}
447	return (`0`);
448	}
449
450	/*
451	* Vnode op for read
452	*/
453	int
454	spec_read(struct vnop_read_args *ap)
455	{
456	struct vnode *vp = ap->a_vp;
457	struct uio *uio = ap->a_uio;
458	struct buf *bp;
459	daddr64_t bn, nextbn;
460	long bsize, bscale;
461	int devBlockSize=`0`;
462	int n, on;
463	int error = `0`;
464	dev_t dev;
465
466	#if DIAGNOSTIC
467	if (uio->uio_rw != UIO_READ)
468	panic("spec_read mode");
469	if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
470	panic("spec_read proc");
471	#endif
472	if (uio_resid(uio) == `0`)
473	return (`0`);
474
475	switch (vp->v_type) {
476
477	case VCHR:
478	{
479	struct _throttle_io_info_t *throttle_info = NULL;
480	int thread_throttle_level;
481	if (cdevsw[major(vp->v_rdev)].d_type == D_DISK && vp->v_un.vu_specinfo->si_throttleable) {
482	throttle_info = &_throttle_io_info[vp->v_un.vu_specinfo->si_devbsdunit];
483	thread_throttle_level = throttle_info_update_internal(throttle_info, NULL, `0`, vp->v_un.vu_specinfo->si_isssd, TRUE, NULL);
484	}
485	error = (*cdevsw[major(vp->v_rdev)].d_read)
486	(vp->v_rdev, uio, ap->a_ioflag);
487
488	if (throttle_info) {
489	throttle_info_end_io_internal(throttle_info, thread_throttle_level);
490	}
491
492	return (error);
493	}
494
495	case VBLK:
496	if (uio->uio_offset < `0`)
497	return (EINVAL);
498
499	dev = vp->v_rdev;
500
501	devBlockSize = vp->v_specsize;
502
503	if (devBlockSize > PAGE_SIZE)
504	return (EINVAL);
505
506	bscale = PAGE_SIZE / devBlockSize;
507	bsize = bscale * devBlockSize;
508
509	do {
510	on = uio->uio_offset % bsize;
511
512	bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ (bscale - `1`));
513
514	if (vp->v_speclastr + bscale == bn) {
515	nextbn = bn + bscale;
516	error = buf_breadn(vp, bn, (int)bsize, &nextbn,
517	(int *)&bsize, `1`, NOCRED, &bp);
518	} else
519	error = buf_bread(vp, bn, (int)bsize, NOCRED, &bp);
520
521	vnode_lock(vp);
522	vp->v_speclastr = bn;
523	vnode_unlock(vp);
524
525	n = bsize - buf_resid(bp);
526	if ((on > n) \|\| error) {
527	if (!error)
528	error = EINVAL;
529	buf_brelse(bp);
530	return (error);
531	}
532	n = min((unsigned)(n - on), uio_resid(uio));
533
534	error = uiomove((char *)buf_dataptr(bp) + on, n, uio);
535	if (n + on == bsize)
536	buf_markaged(bp);
537	buf_brelse(bp);
538	} while (error == `0` && uio_resid(uio) > `0` && n != `0`);
539	return (error);
540
541	default:
542	panic("spec_read type");
543	}
544	/ NOTREACHED /
545
546	return (`0`);
547	}
548
549	/*
550	* Vnode op for write
551	*/
552	int
553	spec_write(struct vnop_write_args *ap)
554	{
555	struct vnode *vp = ap->a_vp;
556	struct uio *uio = ap->a_uio;
557	struct buf *bp;
558	daddr64_t bn;
559	int bsize, blkmask, bscale;
560	int io_sync;
561	int devBlockSize=`0`;
562	int n, on;
563	int error = `0`;
564	dev_t dev;
565
566	#if DIAGNOSTIC
567	if (uio->uio_rw != UIO_WRITE)
568	panic("spec_write mode");
569	if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
570	panic("spec_write proc");
571	#endif
572
573	switch (vp->v_type) {
574
575	case VCHR:
576	{
577	struct _throttle_io_info_t *throttle_info = NULL;
578	int thread_throttle_level;
579	if (cdevsw[major(vp->v_rdev)].d_type == D_DISK && vp->v_un.vu_specinfo->si_throttleable) {
580	throttle_info = &_throttle_io_info[vp->v_un.vu_specinfo->si_devbsdunit];
581
582	thread_throttle_level = throttle_info_update_internal(throttle_info, NULL, `0`, vp->v_un.vu_specinfo->si_isssd, TRUE, NULL);
583
584	microuptime(&throttle_info->throttle_last_write_timestamp);
585	}
586	error = (*cdevsw[major(vp->v_rdev)].d_write)
587	(vp->v_rdev, uio, ap->a_ioflag);
588
589	if (throttle_info) {
590	throttle_info_end_io_internal(throttle_info, thread_throttle_level);
591	}
592
593	return (error);
594	}
595
596	case VBLK:
597	if (uio_resid(uio) == `0`)
598	return (`0`);
599	if (uio->uio_offset < `0`)
600	return (EINVAL);
601
602	io_sync = (ap->a_ioflag & IO_SYNC);
603
604	dev = (vp->v_rdev);
605
606	devBlockSize = vp->v_specsize;
607	if (devBlockSize > PAGE_SIZE)
608	return(EINVAL);
609
610	bscale = PAGE_SIZE / devBlockSize;
611	blkmask = bscale - `1`;
612	bsize = bscale * devBlockSize;
613
614
615	do {
616	bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ blkmask);
617	on = uio->uio_offset % bsize;
618
619	n = min((unsigned)(bsize - on), uio_resid(uio));
620
621	/*
622	* Use buf_getblk() as an optimization IFF:
623	*
624	* 1) We are reading exactly a block on a block
625	* aligned boundary
626	* 2) We know the size of the device from spec_open
627	* 3) The read doesn't span the end of the device
628	*
629	* Otherwise, we fall back on buf_bread().
630	*/
631	if (n == bsize &&
632	vp->v_specdevsize != (u_int64_t)`0` &&
633	(uio->uio_offset + (u_int64_t)n) > vp->v_specdevsize) {
634	/ reduce the size of the read to what is there /
635	n = (uio->uio_offset + (u_int64_t)n) - vp->v_specdevsize;
636	}
637
638	if (n == bsize)
639	bp = buf_getblk(vp, bn, bsize, `0`, `0`, BLK_WRITE);
640	else
641	error = (int)buf_bread(vp, bn, bsize, NOCRED, &bp);
642
643	/ Translate downstream error for upstream, if needed /
644	if (!error)
645	error = (int)buf_error(bp);
646	if (error) {
647	buf_brelse(bp);
648	return (error);
649	}
650	n = min(n, bsize - buf_resid(bp));
651
652	error = uiomove((char *)buf_dataptr(bp) + on, n, uio);
653	if (error) {
654	buf_brelse(bp);
655	return (error);
656	}
657	buf_markaged(bp);
658
659	if (io_sync)
660	error = buf_bwrite(bp);
661	else {
662	if ((n + on) == bsize)
663	error = buf_bawrite(bp);
664	else
665	error = buf_bdwrite(bp);
666	}
667	} while (error == `0` && uio_resid(uio) > `0` && n != `0`);
668	return (error);
669
670	default:
671	panic("spec_write type");
672	}
673	/ NOTREACHED /
674
675	return (`0`);
676	}
677
678	/*
679	* Device ioctl operation.
680	*/
681	int
682	spec_ioctl(struct vnop_ioctl_args *ap)
683	{
684	proc_t p = vfs_context_proc(ap->a_context);
685	dev_t dev = ap->a_vp->v_rdev;
686	int retval = `0`;
687
688	KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, `0`) \| DBG_FUNC_START,
689	dev, ap->a_command, ap->a_fflag, ap->a_vp->v_type, `0`);
690
691	switch (ap->a_vp->v_type) {
692
693	case VCHR:
694	retval = (*cdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data,
695	ap->a_fflag, p);
696	break;
697
698	case VBLK:
699	retval = (*bdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data, ap->a_fflag, p);
700	if (!retval && ap->a_command == DKIOCSETBLOCKSIZE)
701	ap->a_vp->v_specsize = (uint32_t )ap->a_data;
702	break;
703
704	default:
705	panic("spec_ioctl");
706	/ NOTREACHED /
707	}
708	KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, `0`) \| DBG_FUNC_END,
709	dev, ap->a_command, ap->a_fflag, retval, `0`);
710
711	return (retval);
712	}
713
714	int
715	spec_select(struct vnop_select_args *ap)
716	{
717	proc_t p = vfs_context_proc(ap->a_context);
718	dev_t dev;
719
720	switch (ap->a_vp->v_type) {
721
722	default:
723	return (`1`); / XXX /
724
725	case VCHR:
726	dev = ap->a_vp->v_rdev;
727	return (*cdevsw[major(dev)].d_select)(dev, ap->a_which, ap->a_wql, p);
728	}
729	}
730
731	static int filt_specattach(struct knote kn, struct* kevent_internal_s *kev);
732
733	int
734	spec_kqfilter(vnode_t vp, struct knote kn, struct* kevent_internal_s *kev)
735	{
736	dev_t dev;
737
738	assert(vnode_ischr(vp));
739
740	dev = vnode_specrdev(vp);
741
742	#if NETWORKING
743	/*
744	* Try a bpf device, as defined in bsd/net/bpf.c
745	* If it doesn't error out the attach, then it
746	* claimed it. Otherwise, fall through and try
747	* other attaches.
748	*/
749	int32_t tmp_flags = kn->kn_flags;
750	int64_t tmp_data = kn->kn_data;
751	int res;
752
753	res = bpfkqfilter(dev, kn);
754	if ((kn->kn_flags & EV_ERROR) == `0`) {
755	return res;
756	}
757	kn->kn_flags = tmp_flags;
758	kn->kn_data = tmp_data;
759	#endif
760
761	if (major(dev) > nchrdev) {
762	knote_set_error(kn, ENXIO);
763	return `0`;
764	}
765
766	kn->kn_vnode_kqok = !!(cdevsw_flags[major(dev)] & CDEVSW_SELECT_KQUEUE);
767	kn->kn_vnode_use_ofst = !!(cdevsw_flags[major(dev)] & CDEVSW_USE_OFFSET);
768
769	if (cdevsw_flags[major(dev)] & CDEVSW_IS_PTS) {
770	kn->kn_filtid = EVFILTID_PTSD;
771	return ptsd_kqfilter(dev, kn);
772	} else if (cdevsw_flags[major(dev)] & CDEVSW_IS_PTC) {
773	kn->kn_filtid = EVFILTID_PTMX;
774	return ptmx_kqfilter(dev, kn);
775	} else if (cdevsw[major(dev)].d_type == D_TTY && kn->kn_vnode_kqok) {
776	/*
777	* TTYs from drivers that use struct ttys use their own filter
778	* routines. The PTC driver doesn't use the tty for character
779	* counts, so it must go through the select fallback.
780	*/
781	kn->kn_filtid = EVFILTID_TTY;
782	return knote_fops(kn)->f_attach(kn, kev);
783	}
784
785	/ Try to attach to other char special devices /
786	return filt_specattach(kn, kev);
787	}
788
789	/*
790	* Synch buffers associated with a block device
791	*/
792	int
793	spec_fsync_internal(vnode_t vp, int waitfor, __unused vfs_context_t context)
794	{
795	if (vp->v_type == VCHR)
796	return (`0`);
797	/*
798	* Flush all dirty buffers associated with a block device.
799	*/
800	buf_flushdirtyblks(vp, (waitfor == MNT_WAIT \|\| waitfor == MNT_DWAIT), `0`, "spec_fsync");
801
802	return (`0`);
803	}
804
805	int
806	spec_fsync(struct vnop_fsync_args *ap)
807	{
808	return spec_fsync_internal(ap->a_vp, ap->a_waitfor, ap->a_context);
809	}
810
811
/*
 * Just call the device strategy routine
 */
void throttle_init(void);


/*
 * Debug tracing for dynamically allocated throttle info structures.
 * Compiled out by default; switch the "#if 0" to "#if 1" to have every
 * reference-count change on an allocated structure logged via printf.
 */
#if 0
#define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...)	\
        do {                                                    \
               if ((debug_info)->throttle_alloc)                \
               printf("%s: "format, __FUNCTION__, ## args);     \
       } while(0)

#else
#define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...)
#endif
828
829
830	SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier1_window_msecs, CTLFLAG_RW \| CTLFLAG_LOCKED, &throttle_windows_msecs[THROTTLE_LEVEL_TIER1], `0`, "");
831	SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier2_window_msecs, CTLFLAG_RW \| CTLFLAG_LOCKED, &throttle_windows_msecs[THROTTLE_LEVEL_TIER2], `0`, "");
832	SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier3_window_msecs, CTLFLAG_RW \| CTLFLAG_LOCKED, &throttle_windows_msecs[THROTTLE_LEVEL_TIER3], `0`, "");
833
834	SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier1_io_period_msecs, CTLFLAG_RW \| CTLFLAG_LOCKED, &throttle_io_period_msecs[THROTTLE_LEVEL_TIER1], `0`, "");
835	SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier2_io_period_msecs, CTLFLAG_RW \| CTLFLAG_LOCKED, &throttle_io_period_msecs[THROTTLE_LEVEL_TIER2], `0`, "");
836	SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier3_io_period_msecs, CTLFLAG_RW \| CTLFLAG_LOCKED, &throttle_io_period_msecs[THROTTLE_LEVEL_TIER3], `0`, "");
837
838	SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier1_io_period_ssd_msecs, CTLFLAG_RW \| CTLFLAG_LOCKED, &throttle_io_period_ssd_msecs[THROTTLE_LEVEL_TIER1], `0`, "");
839	SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier2_io_period_ssd_msecs, CTLFLAG_RW \| CTLFLAG_LOCKED, &throttle_io_period_ssd_msecs[THROTTLE_LEVEL_TIER2], `0`, "");
840	SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_tier3_io_period_ssd_msecs, CTLFLAG_RW \| CTLFLAG_LOCKED, &throttle_io_period_ssd_msecs[THROTTLE_LEVEL_TIER3], `0`, "");
841
842	SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_enabled, CTLFLAG_RW \| CTLFLAG_LOCKED, &lowpri_throttle_enabled, `0`, "");
843
844
845	static lck_grp_t *throttle_lock_grp;
846	static lck_attr_t *throttle_lock_attr;
847	static lck_grp_attr_t *throttle_lock_grp_attr;
848
849
/*
 * throttled I/O helper function
 * convert the index of the lowest set bit to a device index
 *
 * Returns the number of trailing zero bits in n, i.e. the bit position
 * of its lowest set bit.  For n == 0 the width of n in bits (64) is
 * returned.
 */
int
num_trailing_0(uint64_t n)
{
	/*
	 * since in most cases the number of trailing 0s is very small,
	 * we simply counting sequentially from the lowest bit
	 */
	if (n == 0)
		return (int)(sizeof(n) * 8);
	int count = 0;
	while ((n & 1) == 0) {
		n >>= 1;
		++count;
	}
	return count;
}
870
871
872	/*
873	* Release the reference and if the item was allocated and this is the last
874	* reference then free it.
875	*
876	* This routine always returns the old value.
877	*/
878	static int
879	throttle_info_rel(struct _throttle_io_info_t *info)
880	{
881	SInt32 oldValue = OSDecrementAtomic(&info->throttle_refcnt);
882
883	DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n",
884	info, (int)(oldValue -`1`), info );
885
886	/ The reference count just went negative, very bad /
887	if (oldValue == `0`)
888	panic("throttle info ref cnt went negative!");
889
890	/*
891	* Once reference count is zero, no one else should be able to take a
892	* reference
893	*/
894	if ((info->throttle_refcnt == `0`) && (info->throttle_alloc)) {
895	DEBUG_ALLOC_THROTTLE_INFO("Freeing info = %p\n", info);
896
897	lck_mtx_destroy(&info->throttle_lock, throttle_lock_grp);
898	FREE(info, M_TEMP);
899	}
900	return oldValue;
901	}
902
903
904	/*
905	* Just take a reference on the throttle info structure.
906	*
907	* This routine always returns the old value.
908	*/
909	static SInt32
910	throttle_info_ref(struct _throttle_io_info_t *info)
911	{
912	SInt32 oldValue = OSIncrementAtomic(&info->throttle_refcnt);
913
914	DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n",
915	info, (int)(oldValue -`1`), info );
916	/ Allocated items should never have a reference of zero /
917	if (info->throttle_alloc && (oldValue == `0`))
918	panic("Taking a reference without calling create throttle info!\n");
919
920	return oldValue;
921	}
922
923	/*
924	* on entry the throttle_lock is held...
925	* this function is responsible for taking
926	* and dropping the reference on the info
927	* structure which will keep it from going
928	* away while the timer is running if it
929	* happens to have been dynamically allocated by
930	* a network fileystem kext which is now trying
931	* to free it
932	*/
933	static uint32_t
934	throttle_timer_start(struct _throttle_io_info_t info, boolean_t update_io_count, int* wakelevel)
935	{
936	struct timeval elapsed;
937	struct timeval now;
938	struct timeval period;
939	uint64_t elapsed_msecs;
940	int throttle_level;
941	int level;
942	int msecs;
943	boolean_t throttled = FALSE;
944	boolean_t need_timer = FALSE;
945
946	microuptime(&now);
947
948	if (update_io_count == TRUE) {
949	info->throttle_io_count_begin = info->throttle_io_count;
950	info->throttle_io_period_num++;
951
952	while (wakelevel >= THROTTLE_LEVEL_THROTTLED)
953	info->throttle_start_IO_period_timestamp[wakelevel--] = now;
954
955	info->throttle_min_timer_deadline = now;
956
957	msecs = info->throttle_io_periods[THROTTLE_LEVEL_THROTTLED];
958	period.tv_sec = msecs / `1000`;
959	period.tv_usec = (msecs % `1000`) * `1000`;
960
961	timevaladd(&info->throttle_min_timer_deadline, &period);
962	}
963	for (throttle_level = THROTTLE_LEVEL_START; throttle_level < THROTTLE_LEVEL_END; throttle_level++) {
964
965	elapsed = now;
966	timevalsub(&elapsed, &info->throttle_window_start_timestamp[throttle_level]);
967	elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)`1000` + (elapsed.tv_usec / `1000`);
968
969	for (level = throttle_level + `1`; level <= THROTTLE_LEVEL_END; level++) {
970
971	if (!TAILQ_EMPTY(&info->throttle_uthlist[level])) {
972
973	if (elapsed_msecs < (uint64_t)throttle_windows_msecs[level] \|\| info->throttle_inflight_count[throttle_level]) {
974	/*
975	* we had an I/O occur at a higher priority tier within
976	* this tier's throttle window
977	*/
978	throttled = TRUE;
979	}
980	/*
981	* we assume that the windows are the same or longer
982	* as we drop through the throttling tiers... thus
983	* we can stop looking once we run into a tier with
984	* threads to schedule regardless of whether it's
985	* still in its throttling window or not
986	*/
987	break;
988	}
989	}
990	if (throttled == TRUE)
991	break;
992	}
993	if (throttled == TRUE) {
994	uint64_t deadline = `0`;
995	struct timeval target;
996	struct timeval min_target;
997
998	/*
999	* we've got at least one tier still in a throttled window
1000	* so we need a timer running... compute the next deadline
1001	* and schedule it
1002	*/
1003	for (level = throttle_level+`1`; level <= THROTTLE_LEVEL_END; level++) {
1004
1005	if (TAILQ_EMPTY(&info->throttle_uthlist[level]))
1006	continue;
1007
1008	target = info->throttle_start_IO_period_timestamp[level];
1009
1010	msecs = info->throttle_io_periods[level];
1011	period.tv_sec = msecs / `1000`;
1012	period.tv_usec = (msecs % `1000`) * `1000`;
1013
1014	timevaladd(&target, &period);
1015
1016	if (need_timer == FALSE \|\| timevalcmp(&target, &min_target, <)) {
1017	min_target = target;
1018	need_timer = TRUE;
1019	}
1020	}
1021	if (timevalcmp(&info->throttle_min_timer_deadline, &now, >)) {
1022	if (timevalcmp(&info->throttle_min_timer_deadline, &min_target, >))
1023	min_target = info->throttle_min_timer_deadline;
1024	}
1025
1026	if (info->throttle_timer_active) {
1027	if (thread_call_cancel(info->throttle_timer_call) == FALSE) {
1028	/*
1029	* couldn't kill the timer because it's already
1030	* been dispatched, so don't try to start a new
1031	* one... once we drop the lock, the timer will
1032	* proceed and eventually re-run this function
1033	*/
1034	need_timer = FALSE;
1035	} else
1036	info->throttle_timer_active = `0`;
1037	}
1038	if (need_timer == TRUE) {
1039	/*
1040	* This is defined as an int (32-bit) rather than a 64-bit
1041	* value because it would need a really big period in the
1042	* order of ~500 days to overflow this. So, we let this be
1043	* 32-bit which allows us to use the clock_interval_to_deadline()
1044	* routine.
1045	*/
1046	int target_msecs;
1047
1048	if (info->throttle_timer_ref == `0`) {
1049	/*
1050	* take a reference for the timer
1051	*/
1052	throttle_info_ref(info);
1053
1054	info->throttle_timer_ref = `1`;
1055	}
1056	elapsed = min_target;
1057	timevalsub(&elapsed, &now);
1058	target_msecs = elapsed.tv_sec * `1000` + elapsed.tv_usec / `1000`;
1059
1060	if (target_msecs <= `0`) {
1061	/*
1062	* we may have computed a deadline slightly in the past
1063	* due to various factors... if so, just set the timer
1064	* to go off in the near future (we don't need to be precise)
1065	*/
1066	target_msecs = `1`;
1067	}
1068	clock_interval_to_deadline(target_msecs, `1000000`, &deadline);
1069
1070	thread_call_enter_delayed(info->throttle_timer_call, deadline);
1071	info->throttle_timer_active = `1`;
1072	}
1073	}
1074	return (throttle_level);
1075	}
1076
1077
1078	static void
1079	throttle_timer(struct _throttle_io_info_t *info)
1080	{
1081	uthread_t ut, utlist;
1082	struct timeval elapsed;
1083	struct timeval now;
1084	uint64_t elapsed_msecs;
1085	int throttle_level;
1086	int level;
1087	int wake_level;
1088	caddr_t wake_address = NULL;
1089	boolean_t update_io_count = FALSE;
1090	boolean_t need_wakeup = FALSE;
1091	boolean_t need_release = FALSE;
1092
1093	ut = NULL;
1094	lck_mtx_lock(&info->throttle_lock);
1095
1096	info->throttle_timer_active = `0`;
1097	microuptime(&now);
1098
1099	elapsed = now;
1100	timevalsub(&elapsed, &info->throttle_start_IO_period_timestamp[THROTTLE_LEVEL_THROTTLED]);
1101	elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)`1000` + (elapsed.tv_usec / `1000`);
1102
1103	if (elapsed_msecs >= (uint64_t)info->throttle_io_periods[THROTTLE_LEVEL_THROTTLED]) {
1104
1105	wake_level = info->throttle_next_wake_level;
1106
1107	for (level = THROTTLE_LEVEL_START; level < THROTTLE_LEVEL_END; level++) {
1108
1109	elapsed = now;
1110	timevalsub(&elapsed, &info->throttle_start_IO_period_timestamp[wake_level]);
1111	elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)`1000` + (elapsed.tv_usec / `1000`);
1112
1113	if (elapsed_msecs >= (uint64_t)info->throttle_io_periods[wake_level] && !TAILQ_EMPTY(&info->throttle_uthlist[wake_level])) {
1114	/*
1115	* we're closing out the current IO period...
1116	* if we have a waiting thread, wake it up
1117	* after we have reset the I/O window info
1118	*/
1119	need_wakeup = TRUE;
1120	update_io_count = TRUE;
1121
1122	info->throttle_next_wake_level = wake_level - `1`;
1123
1124	if (info->throttle_next_wake_level == THROTTLE_LEVEL_START)
1125	info->throttle_next_wake_level = THROTTLE_LEVEL_END;
1126
1127	break;
1128	}
1129	wake_level--;
1130
1131	if (wake_level == THROTTLE_LEVEL_START)
1132	wake_level = THROTTLE_LEVEL_END;
1133	}
1134	}
1135	if (need_wakeup == TRUE) {
1136	if (!TAILQ_EMPTY(&info->throttle_uthlist[wake_level])) {
1137
1138	ut = (uthread_t)TAILQ_FIRST(&info->throttle_uthlist[wake_level]);
1139	TAILQ_REMOVE(&info->throttle_uthlist[wake_level], ut, uu_throttlelist);
1140	ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
1141	ut->uu_is_throttled = false;
1142
1143	wake_address = (caddr_t)&ut->uu_on_throttlelist;
1144	}
1145	} else
1146	wake_level = THROTTLE_LEVEL_START;
1147
1148	throttle_level = throttle_timer_start(info, update_io_count, wake_level);
1149
1150	if (wake_address != NULL)
1151	wakeup(wake_address);
1152
1153	for (level = THROTTLE_LEVEL_THROTTLED; level <= throttle_level; level++) {
1154
1155	TAILQ_FOREACH_SAFE(ut, &info->throttle_uthlist[level], uu_throttlelist, utlist) {
1156
1157	TAILQ_REMOVE(&info->throttle_uthlist[level], ut, uu_throttlelist);
1158	ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
1159	ut->uu_is_throttled = false;
1160
1161	wakeup(&ut->uu_on_throttlelist);
1162	}
1163	}
1164	if (info->throttle_timer_active == `0` && info->throttle_timer_ref) {
1165	info->throttle_timer_ref = `0`;
1166	need_release = TRUE;
1167	}
1168	lck_mtx_unlock(&info->throttle_lock);
1169
1170	if (need_release == TRUE)
1171	throttle_info_rel(info);
1172	}
1173
1174
1175	static int
1176	throttle_add_to_list(struct _throttle_io_info_t info, uthread_t ut, int* mylevel, boolean_t insert_tail)
1177	{
1178	boolean_t start_timer = FALSE;
1179	int level = THROTTLE_LEVEL_START;
1180
1181	if (TAILQ_EMPTY(&info->throttle_uthlist[mylevel])) {
1182	info->throttle_start_IO_period_timestamp[mylevel] = info->throttle_last_IO_timestamp[mylevel];
1183	start_timer = TRUE;
1184	}
1185
1186	if (insert_tail == TRUE)
1187	TAILQ_INSERT_TAIL(&info->throttle_uthlist[mylevel], ut, uu_throttlelist);
1188	else
1189	TAILQ_INSERT_HEAD(&info->throttle_uthlist[mylevel], ut, uu_throttlelist);
1190
1191	ut->uu_on_throttlelist = mylevel;
1192
1193	if (start_timer == TRUE) {
1194	/ we may need to start or rearm the timer /
1195	level = throttle_timer_start(info, FALSE, THROTTLE_LEVEL_START);
1196
1197	if (level == THROTTLE_LEVEL_END) {
1198	if (ut->uu_on_throttlelist >= THROTTLE_LEVEL_THROTTLED) {
1199	TAILQ_REMOVE(&info->throttle_uthlist[ut->uu_on_throttlelist], ut, uu_throttlelist);
1200
1201	ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
1202	}
1203	}
1204	}
1205	return (level);
1206	}
1207
1208	static void
1209	throttle_init_throttle_window(void)
1210	{
1211	int throttle_window_size;
1212
1213	/*
1214	* The hierarchy of throttle window values is as follows:
1215	* - Global defaults
1216	* - Device tree properties
1217	* - Boot-args
1218	* All values are specified in msecs.
1219	*/
1220
1221	/ Override global values with device-tree properties /
1222	if (PE_get_default("kern.io_throttle_window_tier1", &throttle_window_size, sizeof(throttle_window_size)))
1223	throttle_windows_msecs[THROTTLE_LEVEL_TIER1] = throttle_window_size;
1224
1225	if (PE_get_default("kern.io_throttle_window_tier2", &throttle_window_size, sizeof(throttle_window_size)))
1226	throttle_windows_msecs[THROTTLE_LEVEL_TIER2] = throttle_window_size;
1227
1228	if (PE_get_default("kern.io_throttle_window_tier3", &throttle_window_size, sizeof(throttle_window_size)))
1229	throttle_windows_msecs[THROTTLE_LEVEL_TIER3] = throttle_window_size;
1230
1231	/ Override with boot-args /
1232	if (PE_parse_boot_argn("io_throttle_window_tier1", &throttle_window_size, sizeof(throttle_window_size)))
1233	throttle_windows_msecs[THROTTLE_LEVEL_TIER1] = throttle_window_size;
1234
1235	if (PE_parse_boot_argn("io_throttle_window_tier2", &throttle_window_size, sizeof(throttle_window_size)))
1236	throttle_windows_msecs[THROTTLE_LEVEL_TIER2] = throttle_window_size;
1237
1238	if (PE_parse_boot_argn("io_throttle_window_tier3", &throttle_window_size, sizeof(throttle_window_size)))
1239	throttle_windows_msecs[THROTTLE_LEVEL_TIER3] = throttle_window_size;
1240	}
1241
1242	static void
1243	throttle_init_throttle_period(struct _throttle_io_info_t *info, boolean_t isssd)
1244	{
1245	int throttle_period_size;
1246
1247	/*
1248	* The hierarchy of throttle period values is as follows:
1249	* - Global defaults
1250	* - Device tree properties
1251	* - Boot-args
1252	* All values are specified in msecs.
1253	*/
1254
1255	/ Assign global defaults /
1256	if ((isssd == TRUE) && (info->throttle_is_fusion_with_priority == `0`))
1257	info->throttle_io_periods = &throttle_io_period_ssd_msecs[`0`];
1258	else
1259	info->throttle_io_periods = &throttle_io_period_msecs[`0`];
1260
1261	/ Override global values with device-tree properties /
1262	if (PE_get_default("kern.io_throttle_period_tier1", &throttle_period_size, sizeof(throttle_period_size)))
1263	info->throttle_io_periods[THROTTLE_LEVEL_TIER1] = throttle_period_size;
1264
1265	if (PE_get_default("kern.io_throttle_period_tier2", &throttle_period_size, sizeof(throttle_period_size)))
1266	info->throttle_io_periods[THROTTLE_LEVEL_TIER2] = throttle_period_size;
1267
1268	if (PE_get_default("kern.io_throttle_period_tier3", &throttle_period_size, sizeof(throttle_period_size)))
1269	info->throttle_io_periods[THROTTLE_LEVEL_TIER3] = throttle_period_size;
1270
1271	/ Override with boot-args /
1272	if (PE_parse_boot_argn("io_throttle_period_tier1", &throttle_period_size, sizeof(throttle_period_size)))
1273	info->throttle_io_periods[THROTTLE_LEVEL_TIER1] = throttle_period_size;
1274
1275	if (PE_parse_boot_argn("io_throttle_period_tier2", &throttle_period_size, sizeof(throttle_period_size)))
1276	info->throttle_io_periods[THROTTLE_LEVEL_TIER2] = throttle_period_size;
1277
1278	if (PE_parse_boot_argn("io_throttle_period_tier3", &throttle_period_size, sizeof(throttle_period_size)))
1279	info->throttle_io_periods[THROTTLE_LEVEL_TIER3] = throttle_period_size;
1280
1281	}
1282
1283	#if CONFIG_IOSCHED
1284	extern void vm_io_reprioritize_init(void);
1285	int iosched_enabled = `1`;
1286	#endif
1287
1288	void
1289	throttle_init(void)
1290	{
1291	struct _throttle_io_info_t *info;
1292	int i;
1293	int level;
1294	#if CONFIG_IOSCHED
1295	int iosched;
1296	#endif
1297	/*
1298	* allocate lock group attribute and group
1299	*/
1300	throttle_lock_grp_attr = lck_grp_attr_alloc_init();
1301	throttle_lock_grp = lck_grp_alloc_init("throttle I/O", throttle_lock_grp_attr);
1302
1303	/ Update throttle parameters based on device tree configuration /
1304	throttle_init_throttle_window();
1305
1306	/*
1307	* allocate the lock attribute
1308	*/
1309	throttle_lock_attr = lck_attr_alloc_init();
1310
1311	for (i = `0`; i < LOWPRI_MAX_NUM_DEV; i++) {
1312	info = &_throttle_io_info[i];
1313
1314	lck_mtx_init(&info->throttle_lock, throttle_lock_grp, throttle_lock_attr);
1315	info->throttle_timer_call = thread_call_allocate((thread_call_func_t)throttle_timer, (thread_call_param_t)info);
1316
1317	for (level = `0`; level <= THROTTLE_LEVEL_END; level++) {
1318	TAILQ_INIT(&info->throttle_uthlist[level]);
1319	info->throttle_last_IO_pid[level] = `0`;
1320	info->throttle_inflight_count[level] = `0`;
1321	}
1322	info->throttle_next_wake_level = THROTTLE_LEVEL_END;
1323	info->throttle_disabled = `0`;
1324	info->throttle_is_fusion_with_priority = `0`;
1325	}
1326	#if CONFIG_IOSCHED
1327	if (PE_parse_boot_argn("iosched", &iosched, sizeof(iosched))) {
1328	iosched_enabled = iosched;
1329	}
1330	if (iosched_enabled) {
1331	/ Initialize I/O Reprioritization mechanism /
1332	vm_io_reprioritize_init();
1333	}
1334	#endif
1335	}
1336
1337	void
1338	sys_override_io_throttle(boolean_t enable_override)
1339	{
1340	if (enable_override)
1341	lowpri_throttle_enabled = `0`;
1342	else
1343	lowpri_throttle_enabled = `1`;
1344	}
1345
1346	int rethrottle_wakeups = `0`;
1347
1348	/*
1349	* the uu_rethrottle_lock is used to synchronize this function
1350	* with "throttle_lowpri_io" which is where a throttled thread
1351	* will block... that function will grab this lock before beginning
1352	* it's decision making process concerning the need to block, and
1353	* hold it through the assert_wait. When that thread is awakened
1354	* for any reason (timer or rethrottle), it will reacquire the
1355	* uu_rethrottle_lock before determining if it really is ok for
1356	* it to now run. This is the point at which the thread could
1357	* enter a different throttling queue and reblock or return from
1358	* the throttle w/o having waited out it's entire throttle if
1359	* the rethrottle has now moved it out of any currently
1360	* active throttle window.
1361	*
1362	*
1363	* NOTES:
1364	* 1 - This may be called with the task lock held.
1365	* 2 - This may be called with preemption and interrupts disabled
1366	* in the kqueue wakeup path so we can't take the throttle_lock which is a mutex
1367	* 3 - This cannot safely dereference uu_throttle_info, as it may
1368	* get deallocated out from under us
1369	*/
1370
1371	void
1372	rethrottle_thread(uthread_t ut)
1373	{
1374	/*
1375	* If uthread doesn't have throttle state, then there's no chance
1376	* of it needing a rethrottle.
1377	*/
1378	if (ut->uu_throttle_info == NULL)
1379	return;
1380
1381	boolean_t s = ml_set_interrupts_enabled(FALSE);
1382	lck_spin_lock(&ut->uu_rethrottle_lock);
1383
1384	if (!ut->uu_is_throttled)
1385	ut->uu_was_rethrottled = true;
1386	else {
1387	int my_new_level = throttle_get_thread_throttle_level(ut);
1388
1389	if (my_new_level != ut->uu_on_throttlelist) {
1390	/*
1391	* ut is currently blocked (as indicated by
1392	* ut->uu_is_throttled == true)
1393	* and we're changing it's throttle level, so
1394	* we need to wake it up.
1395	*/
1396	ut->uu_is_throttled = false;
1397	wakeup(&ut->uu_on_throttlelist);
1398
1399	rethrottle_wakeups++;
1400	KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, `102`)), thread_tid(ut->uu_thread), ut->uu_on_throttlelist, my_new_level, `0`, `0`);
1401	}
1402	}
1403	lck_spin_unlock(&ut->uu_rethrottle_lock);
1404	ml_set_interrupts_enabled(s);
1405	}
1406
1407
1408	/*
1409	* KPI routine
1410	*
1411	* Create and take a reference on a throttle info structure and return a
1412	* pointer for the file system to use when calling throttle_info_update.
1413	* Calling file system must have a matching release for every create.
1414	*/
1415	void *
1416	throttle_info_create(void)
1417	{
1418	struct _throttle_io_info_t *info;
1419	int level;
1420
1421	MALLOC(info, struct _throttle_io_info_t , sizeof(info), M_TEMP, M_ZERO \| M_WAITOK);
1422	/ Should never happen but just in case /
1423	if (info == NULL)
1424	return NULL;
1425	/ Mark that this one was allocated and needs to be freed /
1426	DEBUG_ALLOC_THROTTLE_INFO("Creating info = %p\n", info, info );
1427	info->throttle_alloc = TRUE;
1428
1429	lck_mtx_init(&info->throttle_lock, throttle_lock_grp, throttle_lock_attr);
1430	info->throttle_timer_call = thread_call_allocate((thread_call_func_t)throttle_timer, (thread_call_param_t)info);
1431
1432	for (level = `0`; level <= THROTTLE_LEVEL_END; level++) {
1433	TAILQ_INIT(&info->throttle_uthlist[level]);
1434	}
1435	info->throttle_next_wake_level = THROTTLE_LEVEL_END;
1436
1437	/ Take a reference /
1438	OSIncrementAtomic(&info->throttle_refcnt);
1439	return info;
1440	}
1441
1442	/*
1443	* KPI routine
1444	*
1445	* Release the throttle info pointer if all the reference are gone. Should be
1446	* called to release reference taken by throttle_info_create
1447	*/
1448	void
1449	throttle_info_release(void *throttle_info)
1450	{
1451	DEBUG_ALLOC_THROTTLE_INFO("Releaseing info = %p\n",
1452	(struct _throttle_io_info_t *)throttle_info,
1453	(struct _throttle_io_info_t *)throttle_info);
1454	if (throttle_info) / Just to be careful /
1455	throttle_info_rel(throttle_info);
1456	}
1457
1458	/*
1459	* KPI routine
1460	*
1461	* File Systems that create an info structure, need to call this routine in
1462	* their mount routine (used by cluster code). File Systems that call this in
1463	* their mount routines must call throttle_info_mount_rel in their unmount
1464	* routines.
1465	*/
1466	void
1467	throttle_info_mount_ref(mount_t mp, void *throttle_info)
1468	{
1469	if ((throttle_info == NULL) \|\| (mp == NULL))
1470	return;
1471	throttle_info_ref(throttle_info);
1472
1473	/*
1474	* We already have a reference release it before adding the new one
1475	*/
1476	if (mp->mnt_throttle_info)
1477	throttle_info_rel(mp->mnt_throttle_info);
1478	mp->mnt_throttle_info = throttle_info;
1479	}
1480
1481	/*
1482	* Private KPI routine
1483	*
1484	* return a handle for accessing throttle_info given a throttle_mask. The
1485	* handle must be released by throttle_info_rel_by_mask
1486	*/
1487	int
1488	throttle_info_ref_by_mask(uint64_t throttle_mask, throttle_info_handle_t *throttle_info_handle)
1489	{
1490	int dev_index;
1491	struct _throttle_io_info_t *info;
1492
1493	if (throttle_info_handle == NULL)
1494	return EINVAL;
1495
1496	dev_index = num_trailing_0(throttle_mask);
1497	info = &_throttle_io_info[dev_index];
1498	throttle_info_ref(info);
1499	(struct* _throttle_io_info_t**)throttle_info_handle = info;
1500
1501	return `0`;
1502	}
1503
1504	/*
1505	* Private KPI routine
1506	*
1507	* release the handle obtained by throttle_info_ref_by_mask
1508	*/
1509	void
1510	throttle_info_rel_by_mask(throttle_info_handle_t throttle_info_handle)
1511	{
1512	/*
1513	* for now the handle is just a pointer to _throttle_io_info_t
1514	*/
1515	throttle_info_rel((struct _throttle_io_info_t*)throttle_info_handle);
1516	}
1517
1518	/*
1519	* KPI routine
1520	*
1521	* File Systems that throttle_info_mount_ref, must call this routine in their
1522	* umount routine.
1523	*/
1524	void
1525	throttle_info_mount_rel(mount_t mp)
1526	{
1527	if (mp->mnt_throttle_info)
1528	throttle_info_rel(mp->mnt_throttle_info);
1529	mp->mnt_throttle_info = NULL;
1530	}
1531
1532	/*
1533	* Reset throttling periods for the given mount point
1534	*
1535	* private interface used by disk conditioner to reset
1536	* throttling periods when 'is_ssd' status changes
1537	*/
1538	void
1539	throttle_info_mount_reset_period(mount_t mp, int isssd)
1540	{
1541	struct _throttle_io_info_t *info;
1542
1543	if (mp == NULL)
1544	info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - `1`];
1545	else if (mp->mnt_throttle_info == NULL)
1546	info = &_throttle_io_info[mp->mnt_devbsdunit];
1547	else
1548	info = mp->mnt_throttle_info;
1549
1550	throttle_init_throttle_period(info, isssd);
1551	}
1552
1553	void
1554	throttle_info_get_last_io_time(mount_t mp, struct timeval *tv)
1555	{
1556	struct _throttle_io_info_t *info;
1557
1558	if (mp == NULL)
1559	info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - `1`];
1560	else if (mp->mnt_throttle_info == NULL)
1561	info = &_throttle_io_info[mp->mnt_devbsdunit];
1562	else
1563	info = mp->mnt_throttle_info;
1564
1565	*tv = info->throttle_last_write_timestamp;
1566	}
1567
1568	void
1569	update_last_io_time(mount_t mp)
1570	{
1571	struct _throttle_io_info_t *info;
1572
1573	if (mp == NULL)
1574	info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - `1`];
1575	else if (mp->mnt_throttle_info == NULL)
1576	info = &_throttle_io_info[mp->mnt_devbsdunit];
1577	else
1578	info = mp->mnt_throttle_info;
1579
1580	microuptime(&info->throttle_last_write_timestamp);
1581	if (mp != NULL)
1582	mp->mnt_last_write_completed_timestamp = info->throttle_last_write_timestamp;
1583	}
1584
1585	int
1586	throttle_get_io_policy(uthread_t *ut)
1587	{
1588	if (ut != NULL)
1589	*ut = get_bsdthread_info(current_thread());
1590
1591	return (proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO));
1592	}
1593
1594	int
1595	throttle_get_passive_io_policy(uthread_t *ut)
1596	{
1597	if (ut != NULL)
1598	*ut = get_bsdthread_info(current_thread());
1599
1600	return (proc_get_effective_thread_policy(current_thread(), TASK_POLICY_PASSIVE_IO));
1601	}
1602
1603
1604	static int
1605	throttle_get_thread_throttle_level(uthread_t ut)
1606	{
1607	uthread_t *ut_p = (ut == NULL) ? &ut : NULL;
1608	int io_tier = throttle_get_io_policy(ut_p);
1609
1610	return throttle_get_thread_throttle_level_internal(ut, io_tier);
1611	}
1612
1613	/*
1614	* Return a throttle level given an existing I/O tier (such as returned by throttle_get_io_policy)
1615	*/
1616	static int
1617	throttle_get_thread_throttle_level_internal(uthread_t ut, int io_tier) {
1618	int thread_throttle_level = io_tier;
1619	int user_idle_level;
1620
1621	assert(ut != NULL);
1622
1623	/ Bootcache misses should always be throttled /
1624	if (ut->uu_throttle_bc)
1625	thread_throttle_level = THROTTLE_LEVEL_TIER3;
1626
1627	/*
1628	* Issue tier3 I/O as tier2 when the user is idle
1629	* to allow maintenance tasks to make more progress.
1630	*
1631	* Assume any positive idle level is enough... for now it's
1632	* only ever 0 or 128 but this is not defined anywhere.
1633	*/
1634	if (thread_throttle_level >= THROTTLE_LEVEL_TIER3) {
1635	user_idle_level = timer_get_user_idle_level();
1636	if (user_idle_level > `0`) {
1637	thread_throttle_level--;
1638	}
1639	}
1640
1641	return (thread_throttle_level);
1642	}
1643
1644	/*
1645	* I/O will be throttled if either of the following are true:
1646	* - Higher tiers have in-flight I/O
1647	* - The time delta since the last start/completion of a higher tier is within the throttle window interval
1648	*
1649	* In-flight I/O is bookended by throttle_info_update_internal/throttle_info_end_io_internal
1650	*/
1651	static int
1652	throttle_io_will_be_throttled_internal(void * throttle_info, int * mylevel, int * throttling_level)
1653	{
1654	struct _throttle_io_info_t *info = throttle_info;
1655	struct timeval elapsed;
1656	struct timeval now;
1657	uint64_t elapsed_msecs;
1658	int thread_throttle_level;
1659	int throttle_level;
1660
1661	if ((thread_throttle_level = throttle_get_thread_throttle_level(NULL)) < THROTTLE_LEVEL_THROTTLED)
1662	return (THROTTLE_DISENGAGED);
1663
1664	microuptime(&now);
1665
1666	for (throttle_level = THROTTLE_LEVEL_START; throttle_level < thread_throttle_level; throttle_level++) {
1667	if (info->throttle_inflight_count[throttle_level]) {
1668	break;
1669	}
1670	elapsed = now;
1671	timevalsub(&elapsed, &info->throttle_window_start_timestamp[throttle_level]);
1672	elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)`1000` + (elapsed.tv_usec / `1000`);
1673
1674	if (elapsed_msecs < (uint64_t)throttle_windows_msecs[thread_throttle_level])
1675	break;
1676	}
1677	if (throttle_level >= thread_throttle_level) {
1678	/*
1679	* we're beyond all of the throttle windows
1680	* that affect the throttle level of this thread,
1681	* so go ahead and treat as normal I/O
1682	*/
1683	return (THROTTLE_DISENGAGED);
1684	}
1685	if (mylevel)
1686	*mylevel = thread_throttle_level;
1687	if (throttling_level)
1688	*throttling_level = throttle_level;
1689
1690	if (info->throttle_io_count != info->throttle_io_count_begin) {
1691	/*
1692	* we've already issued at least one throttleable I/O
1693	* in the current I/O window, so avoid issuing another one
1694	*/
1695	return (THROTTLE_NOW);
1696	}
1697	/*
1698	* we're in the throttle window, so
1699	* cut the I/O size back
1700	*/
1701	return (THROTTLE_ENGAGED);
1702	}
1703
1704	/*
1705	* If we have a mount point and it has a throttle info pointer then
1706	* use it to do the check, otherwise use the device unit number to find
1707	* the correct throttle info array element.
1708	*/
1709	int
1710	throttle_io_will_be_throttled(__unused int lowpri_window_msecs, mount_t mp)
1711	{
1712	struct _throttle_io_info_t *info;
1713
1714	/*
1715	* Should we just return zero if no mount point
1716	*/
1717	if (mp == NULL)
1718	info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - `1`];
1719	else if (mp->mnt_throttle_info == NULL)
1720	info = &_throttle_io_info[mp->mnt_devbsdunit];
1721	else
1722	info = mp->mnt_throttle_info;
1723
1724	if (info->throttle_is_fusion_with_priority) {
1725	uthread_t ut = get_bsdthread_info(current_thread());
1726	if (ut->uu_lowpri_window == `0`)
1727	return (THROTTLE_DISENGAGED);
1728	}
1729
1730	if (info->throttle_disabled)
1731	return (THROTTLE_DISENGAGED);
1732	else
1733	return throttle_io_will_be_throttled_internal(info, NULL, NULL);
1734	}
1735
1736	/*
1737	* Routine to increment I/O throttling counters maintained in the proc
1738	*/
1739
1740	static void
1741	throttle_update_proc_stats(pid_t throttling_pid, int count)
1742	{
1743	proc_t throttling_proc;
1744	proc_t throttled_proc = current_proc();
1745
1746	/ The throttled_proc is always the current proc; so we are not concerned with refs /
1747	OSAddAtomic64(count, &(throttled_proc->was_throttled));
1748
1749	/ The throttling pid might have exited by now /
1750	throttling_proc = proc_find(throttling_pid);
1751	if (throttling_proc != PROC_NULL) {
1752	OSAddAtomic64(count, &(throttling_proc->did_throttle));
1753	proc_rele(throttling_proc);
1754	}
1755	}
1756
1757	/*
1758	* Block until woken up by the throttle timer or by a rethrottle call.
1759	* As long as we hold the throttle_lock while querying the throttle tier, we're
1760	* safe against seeing an old throttle tier after a rethrottle.
1761	*/
1762	uint32_t
1763	throttle_lowpri_io(int sleep_amount)
1764	{
1765	uthread_t ut;
1766	struct _throttle_io_info_t *info;
1767	int throttle_type = `0`;
1768	int mylevel = `0`;
1769	int throttling_level = THROTTLE_LEVEL_NONE;
1770	int sleep_cnt = `0`;
1771	uint32_t throttle_io_period_num = `0`;
1772	boolean_t insert_tail = TRUE;
1773	boolean_t s;
1774
1775	ut = get_bsdthread_info(current_thread());
1776
1777	if (ut->uu_lowpri_window == `0`)
1778	return (`0`);
1779
1780	info = ut->uu_throttle_info;
1781
1782	if (info == NULL) {
1783	ut->uu_throttle_bc = false;
1784	ut->uu_lowpri_window = `0`;
1785	return (`0`);
1786	}
1787	lck_mtx_lock(&info->throttle_lock);
1788	assert(ut->uu_on_throttlelist < THROTTLE_LEVEL_THROTTLED);
1789
1790	if (sleep_amount == `0`)
1791	goto done;
1792
1793	if (sleep_amount == `1` && !ut->uu_throttle_bc)
1794	sleep_amount = `0`;
1795
1796	throttle_io_period_num = info->throttle_io_period_num;
1797
1798	ut->uu_was_rethrottled = false;
1799
1800	while ( (throttle_type = throttle_io_will_be_throttled_internal(info, &mylevel, &throttling_level)) ) {
1801
1802	if (throttle_type == THROTTLE_ENGAGED) {
1803	if (sleep_amount == `0`)
1804	break;
1805	if (info->throttle_io_period_num < throttle_io_period_num)
1806	break;
1807	if ((info->throttle_io_period_num - throttle_io_period_num) >= (uint32_t)sleep_amount)
1808	break;
1809	}
1810	/*
1811	* keep the same position in the list if "rethrottle_thread" changes our throttle level and
1812	* then puts us back to the original level before we get a chance to run
1813	*/
1814	if (ut->uu_on_throttlelist >= THROTTLE_LEVEL_THROTTLED && ut->uu_on_throttlelist != mylevel) {
1815	/*
1816	* must have been awakened via "rethrottle_thread" (the timer pulls us off the list)
1817	* and we've changed our throttling level, so pull ourselves off of the appropriate list
1818	* and make sure we get put on the tail of the new list since we're starting anew w/r to
1819	* the throttling engine
1820	*/
1821	TAILQ_REMOVE(&info->throttle_uthlist[ut->uu_on_throttlelist], ut, uu_throttlelist);
1822	ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
1823	insert_tail = TRUE;
1824	}
1825	if (ut->uu_on_throttlelist < THROTTLE_LEVEL_THROTTLED) {
1826	if (throttle_add_to_list(info, ut, mylevel, insert_tail) == THROTTLE_LEVEL_END)
1827	goto done;
1828	}
1829	assert(throttling_level >= THROTTLE_LEVEL_START && throttling_level <= THROTTLE_LEVEL_END);
1830
1831	s = ml_set_interrupts_enabled(FALSE);
1832	lck_spin_lock(&ut->uu_rethrottle_lock);
1833
1834	/*
1835	* this is the critical section w/r to our interaction
1836	* with "rethrottle_thread"
1837	*/
1838	if (ut->uu_was_rethrottled) {
1839
1840	lck_spin_unlock(&ut->uu_rethrottle_lock);
1841	ml_set_interrupts_enabled(s);
1842	lck_mtx_yield(&info->throttle_lock);
1843
1844	KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, `103`)), thread_tid(ut->uu_thread), ut->uu_on_throttlelist, `0`, `0`, `0`);
1845
1846	ut->uu_was_rethrottled = false;
1847	continue;
1848	}
1849	KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_THROTTLE, PROCESS_THROTTLED)) \| DBG_FUNC_NONE,
1850	info->throttle_last_IO_pid[throttling_level], throttling_level, proc_selfpid(), mylevel, `0`);
1851
1852	if (sleep_cnt == `0`) {
1853	KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, `97`)) \| DBG_FUNC_START,
1854	throttle_windows_msecs[mylevel], info->throttle_io_periods[mylevel], info->throttle_io_count, `0`, `0`);
1855	throttled_count[mylevel]++;
1856	}
1857	ut->uu_wmesg = "throttle_lowpri_io";
1858
1859	assert_wait((caddr_t)&ut->uu_on_throttlelist, THREAD_UNINT);
1860
1861	ut->uu_is_throttled = true;
1862	lck_spin_unlock(&ut->uu_rethrottle_lock);
1863	ml_set_interrupts_enabled(s);
1864
1865	lck_mtx_unlock(&info->throttle_lock);
1866
1867	thread_block(THREAD_CONTINUE_NULL);
1868
1869	ut->uu_wmesg = NULL;
1870
1871	ut->uu_is_throttled = false;
1872	ut->uu_was_rethrottled = false;
1873
1874	lck_mtx_lock(&info->throttle_lock);
1875
1876	sleep_cnt++;
1877
1878	if (sleep_amount == `0`)
1879	insert_tail = FALSE;
1880	else if (info->throttle_io_period_num < throttle_io_period_num \|\|
1881	(info->throttle_io_period_num - throttle_io_period_num) >= (uint32_t)sleep_amount) {
1882	insert_tail = FALSE;
1883	sleep_amount = `0`;
1884	}
1885	}
1886	done:
1887	if (ut->uu_on_throttlelist >= THROTTLE_LEVEL_THROTTLED) {
1888	TAILQ_REMOVE(&info->throttle_uthlist[ut->uu_on_throttlelist], ut, uu_throttlelist);
1889	ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
1890	}
1891	lck_mtx_unlock(&info->throttle_lock);
1892
1893	if (sleep_cnt) {
1894	KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, `97`)) \| DBG_FUNC_END,
1895	throttle_windows_msecs[mylevel], info->throttle_io_periods[mylevel], info->throttle_io_count, `0`, `0`);
1896	/*
1897	* We update the stats for the last pid which opened a throttle window for the throttled thread.
1898	* This might not be completely accurate since the multiple throttles seen by the lower tier pid
1899	* might have been caused by various higher prio pids. However, updating these stats accurately
1900	* means doing a proc_find while holding the throttle lock which leads to deadlock.
1901	*/
1902	throttle_update_proc_stats(info->throttle_last_IO_pid[throttling_level], sleep_cnt);
1903	}
1904
1905	ut->uu_throttle_info = NULL;
1906	ut->uu_throttle_bc = false;
1907	ut->uu_lowpri_window = `0`;
1908
1909	throttle_info_rel(info);
1910
1911	return (sleep_cnt);
1912	}
1913
1914	/*
1915	* KPI routine
1916	*
1917	* set a kernel thread's IO policy. policy can be:
1918	* IOPOL_NORMAL, IOPOL_THROTTLE, IOPOL_PASSIVE, IOPOL_UTILITY, IOPOL_STANDARD
1919	*
1920	* explanations about these policies are in the man page of setiopolicy_np
1921	*/
1922	void throttle_set_thread_io_policy(int policy)
1923	{
1924	proc_set_thread_policy(current_thread(), TASK_POLICY_INTERNAL, TASK_POLICY_IOPOL, policy);
1925	}
1926
1927	int throttle_get_thread_effective_io_policy()
1928	{
1929	return proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO);
1930	}
1931
1932	void throttle_info_reset_window(uthread_t ut)
1933	{
1934	struct _throttle_io_info_t *info;
1935
1936	if (ut == NULL)
1937	ut = get_bsdthread_info(current_thread());
1938
1939	if ( (info = ut->uu_throttle_info) ) {
1940	throttle_info_rel(info);
1941
1942	ut->uu_throttle_info = NULL;
1943	ut->uu_lowpri_window = `0`;
1944	ut->uu_throttle_bc = false;
1945	}
1946	}
1947
1948	static
1949	void throttle_info_set_initial_window(uthread_t ut, struct _throttle_io_info_t *info, boolean_t BC_throttle, boolean_t isssd)
1950	{
1951	if (lowpri_throttle_enabled == `0` \|\| info->throttle_disabled)
1952	return;
1953
1954	if (info->throttle_io_periods == `0`) {
1955	throttle_init_throttle_period(info, isssd);
1956	}
1957	if (ut->uu_throttle_info == NULL) {
1958
1959	ut->uu_throttle_info = info;
1960	throttle_info_ref(info);
1961	DEBUG_ALLOC_THROTTLE_INFO("updating info = %p\n", info, info );
1962
1963	ut->uu_lowpri_window = `1`;
1964	ut->uu_throttle_bc = BC_throttle;
1965	}
1966	}
1967
1968	/*
1969	* Update inflight IO count and throttling window
1970	* Should be called when an IO is done
1971	*
1972	* Only affects IO that was sent through spec_strategy
1973	*/
1974	void throttle_info_end_io(buf_t bp) {
1975	mount_t mp;
1976	struct bufattr *bap;
1977	struct _throttle_io_info_t *info;
1978	int io_tier;
1979
1980	bap = &bp->b_attr;
1981	if (!ISSET(bap->ba_flags, BA_STRATEGY_TRACKED_IO)) {
1982	return;
1983	}
1984	CLR(bap->ba_flags, BA_STRATEGY_TRACKED_IO);
1985
1986	mp = buf_vnode(bp)->v_mount;
1987	if (mp != NULL) {
1988	info = &_throttle_io_info[mp->mnt_devbsdunit];
1989	} else {
1990	info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - `1`];
1991	}
1992
1993	io_tier = GET_BUFATTR_IO_TIER(bap);
1994	if (ISSET(bap->ba_flags, BA_IO_TIER_UPGRADE)) {
1995	io_tier--;
1996	}
1997
1998	throttle_info_end_io_internal(info, io_tier);
1999	}
2000
2001	/*
2002	* Decrement inflight count initially incremented by throttle_info_update_internal
2003	*/
2004	static
2005	void throttle_info_end_io_internal(struct _throttle_io_info_t info, int* throttle_level) {
2006	if (throttle_level == THROTTLE_LEVEL_NONE) {
2007	return;
2008	}
2009
2010	microuptime(&info->throttle_window_start_timestamp[throttle_level]);
2011	OSDecrementAtomic(&info->throttle_inflight_count[throttle_level]);
2012	assert(info->throttle_inflight_count[throttle_level] >= `0`);
2013	}
2014
2015	/*
2016	* If inflight is TRUE and bap is NULL then the caller is responsible for calling
2017	* throttle_info_end_io_internal to avoid leaking in-flight I/O.
2018	*/
2019	static
2020	int throttle_info_update_internal(struct _throttle_io_info_t info, uthread_t ut, int* flags, boolean_t isssd, boolean_t inflight, struct bufattr *bap)
2021	{
2022	int thread_throttle_level;
2023
2024	if (lowpri_throttle_enabled == `0` \|\| info->throttle_disabled)
2025	return THROTTLE_LEVEL_NONE;
2026
2027	if (ut == NULL)
2028	ut = get_bsdthread_info(current_thread());
2029
2030	if (bap && inflight && !ut->uu_throttle_bc) {
2031	thread_throttle_level = GET_BUFATTR_IO_TIER(bap);
2032	if (ISSET(bap->ba_flags, BA_IO_TIER_UPGRADE)) {
2033	thread_throttle_level--;
2034	}
2035	} else {
2036	thread_throttle_level = throttle_get_thread_throttle_level(ut);
2037	}
2038
2039	if (thread_throttle_level != THROTTLE_LEVEL_NONE) {
2040	if(!ISSET(flags, B_PASSIVE)) {
2041	info->throttle_last_IO_pid[thread_throttle_level] = proc_selfpid();
2042	if (inflight && !ut->uu_throttle_bc) {
2043	if (NULL != bap) {
2044	SET(bap->ba_flags, BA_STRATEGY_TRACKED_IO);
2045	}
2046	OSIncrementAtomic(&info->throttle_inflight_count[thread_throttle_level]);
2047	} else {
2048	microuptime(&info->throttle_window_start_timestamp[thread_throttle_level]);
2049	}
2050	KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_THROTTLE, OPEN_THROTTLE_WINDOW)) \| DBG_FUNC_NONE,
2051	current_proc()->p_pid, thread_throttle_level, `0`, `0`, `0`);
2052	}
2053	microuptime(&info->throttle_last_IO_timestamp[thread_throttle_level]);
2054	}
2055
2056
2057	if (thread_throttle_level >= THROTTLE_LEVEL_THROTTLED) {
2058	/*
2059	* I'd really like to do the IOSleep here, but
2060	* we may be holding all kinds of filesystem related locks
2061	* and the pages for this I/O marked 'busy'...
2062	* we don't want to cause a normal task to block on
2063	* one of these locks while we're throttling a task marked
2064	* for low priority I/O... we'll mark the uthread and
2065	* do the delay just before we return from the system
2066	* call that triggered this I/O or from vnode_pagein
2067	*/
2068	OSAddAtomic(`1`, &info->throttle_io_count);
2069
2070	throttle_info_set_initial_window(ut, info, FALSE, isssd);
2071	}
2072
2073	return thread_throttle_level;
2074	}
2075
2076	void *throttle_info_update_by_mount(mount_t mp)
2077	{
2078	struct _throttle_io_info_t *info;
2079	uthread_t ut;
2080	boolean_t isssd = FALSE;
2081
2082	ut = get_bsdthread_info(current_thread());
2083
2084	if (mp != NULL) {
2085	if (disk_conditioner_mount_is_ssd(mp))
2086	isssd = TRUE;
2087	info = &_throttle_io_info[mp->mnt_devbsdunit];
2088	} else
2089	info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - `1`];
2090
2091	if (!ut->uu_lowpri_window)
2092	throttle_info_set_initial_window(ut, info, FALSE, isssd);
2093
2094	return info;
2095	}
2096
2097
2098	/*
2099	* KPI routine
2100	*
2101	* this is usually called before every I/O, used for throttled I/O
2102	* book keeping. This routine has low overhead and does not sleep
2103	*/
2104	void throttle_info_update(void throttle_info, int* flags)
2105	{
2106	if (throttle_info)
2107	throttle_info_update_internal(throttle_info, NULL, flags, FALSE, FALSE, NULL);
2108	}
2109
/*
 * KPI routine
 *
 * this is usually called before every I/O, used for throttled I/O
 * book keeping.  This routine has low overhead and does not sleep
 *
 * throttle_info_handle: handle to update; currently used directly as the
 *                       throttle info pointer
 * flags:                buffer flags of the I/O
 */
void throttle_info_update_by_mask(void *throttle_info_handle, int flags)
{
	void *throttle_info = throttle_info_handle;

	/*
	 * for now we only use the lowest bit of the throttle mask, so the
	 * handle is the same as the throttle_info.  Later if we store a
	 * set of throttle infos in the handle, we will want to loop through
	 * them and call throttle_info_update in a loop
	 */
	throttle_info_update(throttle_info, flags);
}
2128	/*
2129	* KPI routine
2130	*
2131	* This routine marks the throttle info as disabled. Used for mount points which
2132	* support I/O scheduling.
2133	*/
2134
2135	void throttle_info_disable_throttle(int devno, boolean_t isfusion)
2136	{
2137	struct _throttle_io_info_t *info;
2138
2139	if (devno < `0` \|\| devno >= LOWPRI_MAX_NUM_DEV)
2140	panic("Illegal devno (%d) passed into throttle_info_disable_throttle()", devno);
2141
2142	info = &_throttle_io_info[devno];
2143	// don't disable software throttling on devices that are part of a fusion device
2144	// and override the software throttle periods to use HDD periods
2145	if (isfusion) {
2146	info->throttle_is_fusion_with_priority = isfusion;
2147	throttle_init_throttle_period(info, FALSE);
2148	}
2149	info->throttle_disabled = !info->throttle_is_fusion_with_priority;
2150	return;
2151	}
2152
2153
2154	/*
2155	* KPI routine (private)
2156	* Called to determine if this IO is being throttled to this level so that it can be treated specially
2157	*/
2158	int throttle_info_io_will_be_throttled(void * throttle_info, int policy)
2159	{
2160	struct _throttle_io_info_t *info = throttle_info;
2161	struct timeval elapsed;
2162	uint64_t elapsed_msecs;
2163	int throttle_level;
2164	int thread_throttle_level;
2165
2166	switch (policy) {
2167
2168	case IOPOL_THROTTLE:
2169	thread_throttle_level = THROTTLE_LEVEL_TIER3;
2170	break;
2171	case IOPOL_UTILITY:
2172	thread_throttle_level = THROTTLE_LEVEL_TIER2;
2173	break;
2174	case IOPOL_STANDARD:
2175	thread_throttle_level = THROTTLE_LEVEL_TIER1;
2176	break;
2177	default:
2178	thread_throttle_level = THROTTLE_LEVEL_TIER0;
2179	break;
2180	}
2181	for (throttle_level = THROTTLE_LEVEL_START; throttle_level < thread_throttle_level; throttle_level++) {
2182	if (info->throttle_inflight_count[throttle_level]) {
2183	break;
2184	}
2185
2186	microuptime(&elapsed);
2187	timevalsub(&elapsed, &info->throttle_window_start_timestamp[throttle_level]);
2188	elapsed_msecs = (uint64_t)elapsed.tv_sec * (uint64_t)`1000` + (elapsed.tv_usec / `1000`);
2189
2190	if (elapsed_msecs < (uint64_t)throttle_windows_msecs[thread_throttle_level])
2191	break;
2192	}
2193	if (throttle_level >= thread_throttle_level) {
2194	/*
2195	* we're beyond all of the throttle windows
2196	* so go ahead and treat as normal I/O
2197	*/
2198	return (THROTTLE_DISENGAGED);
2199	}
2200	/*
2201	* we're in the throttle window
2202	*/
2203	return (THROTTLE_ENGAGED);
2204	}
2205
2206	int throttle_lowpri_window(void)
2207	{
2208	struct uthread *ut = get_bsdthread_info(current_thread());
2209	return ut->uu_lowpri_window;
2210	}
2211
2212
#if CONFIG_IOSCHED
/* Defined elsewhere; spec_strategy treats a return of -1 as "no cached tier". */
int upl_get_cached_tier(void *);
#endif
2216
2217	int
2218	spec_strategy(struct vnop_strategy_args *ap)
2219	{
2220	buf_t bp;
2221	int bflags;
2222	int io_tier;
2223	int passive;
2224	dev_t bdev;
2225	uthread_t ut;
2226	mount_t mp;
2227	struct bufattr *bap;
2228	int strategy_ret;
2229	struct _throttle_io_info_t *throttle_info;
2230	boolean_t isssd = FALSE;
2231	boolean_t inflight = FALSE;
2232	boolean_t upgrade = FALSE;
2233	int code = `0`;
2234
2235	#if !CONFIG_EMBEDDED
2236	proc_t curproc = current_proc();
2237	#endif /* !CONFIG_EMBEDDED */
2238
2239	bp = ap->a_bp;
2240	bdev = buf_device(bp);
2241	mp = buf_vnode(bp)->v_mount;
2242	bap = &bp->b_attr;
2243
2244	#if CONFIG_IOSCHED
2245	if (bp->b_flags & B_CLUSTER) {
2246
2247	io_tier = upl_get_cached_tier(bp->b_upl);
2248
2249	if (io_tier == -`1`)
2250	io_tier = throttle_get_io_policy(&ut);
2251	#if DEVELOPMENT \|\| DEBUG
2252	else {
2253	int my_io_tier = throttle_get_io_policy(&ut);
2254
2255	if (io_tier != my_io_tier)
2256	KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_THROTTLE, IO_TIER_UPL_MISMATCH)) \| DBG_FUNC_NONE, buf_kernel_addrperm_addr(bp), my_io_tier, io_tier, `0`, `0`);
2257	}
2258	#endif
2259	} else
2260	io_tier = throttle_get_io_policy(&ut);
2261	#else
2262	io_tier = throttle_get_io_policy(&ut);
2263	#endif
2264	passive = throttle_get_passive_io_policy(&ut);
2265
2266	/*
2267	* Mark if the I/O was upgraded by throttle_get_thread_throttle_level
2268	* while preserving the original issued tier (throttle_get_io_policy
2269	* does not return upgraded tiers)
2270	*/
2271	if (mp && io_tier > throttle_get_thread_throttle_level_internal(ut, io_tier)) {
2272	#if CONFIG_IOSCHED
2273	if (!(mp->mnt_ioflags & MNT_IOFLAGS_IOSCHED_SUPPORTED)) {
2274	upgrade = TRUE;
2275	}
2276	#else /* CONFIG_IOSCHED */
2277	upgrade = TRUE;
2278	#endif /* CONFIG_IOSCHED */
2279	}
2280
2281	if (bp->b_flags & B_META)
2282	bap->ba_flags \|= BA_META;
2283
2284	#if CONFIG_IOSCHED
2285	/*
2286	* For I/O Scheduling, we currently do not have a way to track and expedite metadata I/Os.
2287	* To ensure we dont get into priority inversions due to metadata I/Os, we use the following rules:
2288	* For metadata reads, ceil all I/Os to IOSCHED_METADATA_TIER & mark them passive if the I/O tier was upgraded
2289	* For metadata writes, unconditionally mark them as IOSCHED_METADATA_TIER and passive
2290	*/
2291	if (bap->ba_flags & BA_META) {
2292	if (mp && (mp->mnt_ioflags & MNT_IOFLAGS_IOSCHED_SUPPORTED)) {
2293	if (bp->b_flags & B_READ) {
2294	if (io_tier > IOSCHED_METADATA_TIER) {
2295	io_tier = IOSCHED_METADATA_TIER;
2296	passive = `1`;
2297	}
2298	} else {
2299	io_tier = IOSCHED_METADATA_TIER;
2300	passive = `1`;
2301	}
2302	}
2303	}
2304	#endif /* CONFIG_IOSCHED */
2305
2306	SET_BUFATTR_IO_TIER(bap, io_tier);
2307
2308	if (passive) {
2309	bp->b_flags \|= B_PASSIVE;
2310	bap->ba_flags \|= BA_PASSIVE;
2311	}
2312
2313	#if !CONFIG_EMBEDDED
2314	if ((curproc != NULL) && ((curproc->p_flag & P_DELAYIDLESLEEP) == P_DELAYIDLESLEEP))
2315	bap->ba_flags \|= BA_DELAYIDLESLEEP;
2316	#endif /* !CONFIG_EMBEDDED */
2317
2318	bflags = bp->b_flags;
2319
2320	if (((bflags & B_READ) == `0`) && ((bflags & B_ASYNC) == `0`))
2321	bufattr_markquickcomplete(bap);
2322
2323	if (bflags & B_READ)
2324	code \|= DKIO_READ;
2325	if (bflags & B_ASYNC)
2326	code \|= DKIO_ASYNC;
2327
2328	if (bap->ba_flags & BA_META)
2329	code \|= DKIO_META;
2330	else if (bflags & B_PAGEIO)
2331	code \|= DKIO_PAGING;
2332
2333	if (io_tier != `0`)
2334	code \|= DKIO_THROTTLE;
2335
2336	code \|= ((io_tier << DKIO_TIER_SHIFT) & DKIO_TIER_MASK);
2337
2338	if (bflags & B_PASSIVE)
2339	code \|= DKIO_PASSIVE;
2340
2341	if (bap->ba_flags & BA_NOCACHE)
2342	code \|= DKIO_NOCACHE;
2343
2344	if (upgrade) {
2345	code \|= DKIO_TIER_UPGRADE;
2346	SET(bap->ba_flags, BA_IO_TIER_UPGRADE);
2347	}
2348
2349	if (kdebug_enable) {
2350	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON, FSDBG_CODE(DBG_DKRW, code) \| DBG_FUNC_NONE,
2351	buf_kernel_addrperm_addr(bp), bdev, buf_blkno(bp), buf_count(bp), `0`);
2352	}
2353
2354	thread_update_io_stats(current_thread(), buf_count(bp), code);
2355
2356	if (mp != NULL) {
2357	if (disk_conditioner_mount_is_ssd(mp))
2358	isssd = TRUE;
2359	/*
2360	* Partially initialized mounts don't have a final devbsdunit and should not be tracked.
2361	* Verify that devbsdunit is initialized (non-zero) or that 0 is the correct initialized value
2362	* (mnt_throttle_mask is initialized and num_trailing_0 would be 0)
2363	*/
2364	if (mp->mnt_devbsdunit \|\| (mp->mnt_throttle_mask != LOWPRI_MAX_NUM_DEV - `1` && mp->mnt_throttle_mask & `0x1`)) {
2365	inflight = TRUE;
2366	}
2367	throttle_info = &_throttle_io_info[mp->mnt_devbsdunit];
2368
2369	} else
2370	throttle_info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - `1`];
2371
2372	throttle_info_update_internal(throttle_info, ut, bflags, isssd, inflight, bap);
2373
2374	if ((bflags & B_READ) == `0`) {
2375	microuptime(&throttle_info->throttle_last_write_timestamp);
2376
2377	if (mp) {
2378	mp->mnt_last_write_issued_timestamp = throttle_info->throttle_last_write_timestamp;
2379	INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_write_size);
2380	}
2381	} else if (mp) {
2382	INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_read_size);
2383	}
2384	/*
2385	* The BootCache may give us special information about
2386	* the IO, so it returns special values that we check
2387	* for here.
2388	*
2389	* IO_SATISFIED_BY_CACHE
2390	* The read has been satisfied by the boot cache. Don't
2391	* throttle the thread unnecessarily.
2392	*
2393	* IO_SHOULD_BE_THROTTLED
2394	* The boot cache is playing back a playlist and this IO
2395	* cut through. Throttle it so we're not cutting through
2396	* the boot cache too often.
2397	*
2398	* Note that typical strategy routines are defined with
2399	* a void return so we'll get garbage here. In the
2400	* unlikely case the garbage matches our special return
2401	* value, it's not a big deal since we're only adjusting
2402	* the throttling delay.
2403	*/
2404	#define IO_SATISFIED_BY_CACHE ((int)0xcafefeed)
2405	#define IO_SHOULD_BE_THROTTLED ((int)0xcafebeef)
2406	typedef int strategy_fcn_ret_t(struct buf *bp);
2407
2408	strategy_ret = ((strategy_fcn_ret_t)bdevsw[major(bdev)].d_strategy)(bp);
2409
2410	// disk conditioner needs to track when this I/O actually starts
2411	// which means track it after `strategy` which may include delays
2412	// from inflight I/Os
2413	microuptime(&bp->b_timestamp_tv);
2414
2415	if (IO_SATISFIED_BY_CACHE == strategy_ret) {
2416	/*
2417	* If this was a throttled IO satisfied by the boot cache,
2418	* don't delay the thread.
2419	*/
2420	throttle_info_reset_window(ut);
2421
2422	} else if (IO_SHOULD_BE_THROTTLED == strategy_ret) {
2423	/*
2424	* If the boot cache indicates this IO should be throttled,
2425	* delay the thread.
2426	*/
2427	throttle_info_set_initial_window(ut, throttle_info, TRUE, isssd);
2428	}
2429	return (`0`);
2430	}
2431
2432
2433	/*
2434	* This is a noop, simply returning what one has been given.
2435	*/
2436	int
2437	spec_blockmap(__unused struct vnop_blockmap_args *ap)
2438	{
2439	return (ENOTSUP);
2440	}
2441
2442
2443	/*
2444	* Device close routine
2445	*/
2446	int
2447	spec_close(struct vnop_close_args *ap)
2448	{
2449	struct vnode *vp = ap->a_vp;
2450	dev_t dev = vp->v_rdev;
2451	int error = `0`;
2452	int flags = ap->a_fflag;
2453	struct proc *p = vfs_context_proc(ap->a_context);
2454	struct session *sessp;
2455
2456	switch (vp->v_type) {
2457
2458	case VCHR:
2459	/*
2460	* Hack: a tty device that is a controlling terminal
2461	* has a reference from the session structure.
2462	* We cannot easily tell that a character device is
2463	* a controlling terminal, unless it is the closing
2464	* process' controlling terminal. In that case,
2465	* if the reference count is 1 (this is the very
2466	* last close)
2467	*/
2468	sessp = proc_session(p);
2469	devsw_lock(dev, S_IFCHR);
2470	if (sessp != SESSION_NULL) {
2471	if (vp == sessp->s_ttyvp && vcount(vp) == `1`) {
2472	struct tty *tp = TTY_NULL;
2473
2474	devsw_unlock(dev, S_IFCHR);
2475	session_lock(sessp);
2476	if (vp == sessp->s_ttyvp) {
2477	tp = SESSION_TP(sessp);
2478	sessp->s_ttyvp = NULL;
2479	sessp->s_ttyvid = `0`;
2480	sessp->s_ttyp = TTY_NULL;
2481	sessp->s_ttypgrpid = NO_PID;
2482	}
2483	session_unlock(sessp);
2484
2485	if (tp != TTY_NULL) {
2486	/*
2487	* We may have won a race with a proc_exit
2488	* of the session leader, the winner
2489	* clears the flag (even if not set)
2490	*/
2491	tty_lock(tp);
2492	ttyclrpgrphup(tp);
2493	tty_unlock(tp);
2494
2495	ttyfree(tp);
2496	}
2497	devsw_lock(dev, S_IFCHR);
2498	}
2499	session_rele(sessp);
2500	}
2501
2502	if (--vp->v_specinfo->si_opencount < `0`)
2503	panic("negative open count (c, %u, %u)", major(dev), minor(dev));
2504
2505	/*
2506	* close on last reference or on vnode revoke call
2507	*/
2508	if (vcount(vp) == `0` \|\| (flags & IO_REVOKE) != `0`)
2509	error = cdevsw[major(dev)].d_close(dev, flags, S_IFCHR, p);
2510
2511	devsw_unlock(dev, S_IFCHR);
2512	break;
2513
2514	case VBLK:
2515	/*
2516	* If there is more than one outstanding open, don't
2517	* send the close to the device.
2518	*/
2519	devsw_lock(dev, S_IFBLK);
2520	if (vcount(vp) > `1`) {
2521	vp->v_specinfo->si_opencount--;
2522	devsw_unlock(dev, S_IFBLK);
2523	return (`0`);
2524	}
2525	devsw_unlock(dev, S_IFBLK);
2526
2527	/*
2528	* On last close of a block device (that isn't mounted)
2529	* we must invalidate any in core blocks, so that
2530	* we can, for instance, change floppy disks.
2531	*/
2532	if ((error = spec_fsync_internal(vp, MNT_WAIT, ap->a_context)))
2533	return (error);
2534
2535	error = buf_invalidateblks(vp, BUF_WRITE_DATA, `0`, `0`);
2536	if (error)
2537	return (error);
2538
2539	devsw_lock(dev, S_IFBLK);
2540
2541	if (--vp->v_specinfo->si_opencount < `0`)
2542	panic("negative open count (b, %u, %u)", major(dev), minor(dev));
2543
2544	if (vcount(vp) == `0`)
2545	error = bdevsw[major(dev)].d_close(dev, flags, S_IFBLK, p);
2546
2547	devsw_unlock(dev, S_IFBLK);
2548	break;
2549
2550	default:
2551	panic("spec_close: not special");
2552	return(EBADF);
2553	}
2554
2555	return error;
2556	}
2557
2558	/*
2559	* Return POSIX pathconf information applicable to special devices.
2560	*/
2561	int
2562	spec_pathconf(struct vnop_pathconf_args *ap)
2563	{
2564
2565	switch (ap->a_name) {
2566	case _PC_LINK_MAX:
2567	*ap->a_retval = LINK_MAX;
2568	return (`0`);
2569	case _PC_MAX_CANON:
2570	*ap->a_retval = MAX_CANON;
2571	return (`0`);
2572	case _PC_MAX_INPUT:
2573	*ap->a_retval = MAX_INPUT;
2574	return (`0`);
2575	case _PC_PIPE_BUF:
2576	*ap->a_retval = PIPE_BUF;
2577	return (`0`);
2578	case _PC_CHOWN_RESTRICTED:
2579	ap->a_retval = `200112`; /* _POSIX_CHOWN_RESTRICTED /
2580	return (`0`);
2581	case _PC_VDISABLE:
2582	*ap->a_retval = _POSIX_VDISABLE;
2583	return (`0`);
2584	default:
2585	return (EINVAL);
2586	}
2587	/ NOTREACHED /
2588	}
2589
2590	/*
2591	* Special device failed operation
2592	*/
2593	int
2594	spec_ebadf(__unused void *dummy)
2595	{
2596
2597	return (EBADF);
2598	}
2599
2600	/ Blktooff derives file offset from logical block number /
2601	int
2602	spec_blktooff(struct vnop_blktooff_args *ap)
2603	{
2604	struct vnode *vp = ap->a_vp;
2605
2606	switch (vp->v_type) {
2607	case VCHR:
2608	ap->a_offset = (off_t)-`1`; /* failure /
2609	return (ENOTSUP);
2610
2611	case VBLK:
2612	printf("spec_blktooff: not implemented for VBLK\n");
2613	ap->a_offset = (off_t)-`1`; /* failure /
2614	return (ENOTSUP);
2615
2616	default:
2617	panic("spec_blktooff type");
2618	}
2619	/ NOTREACHED /
2620
2621	return (`0`);
2622	}
2623
2624	/ Offtoblk derives logical block number from file offset /
2625	int
2626	spec_offtoblk(struct vnop_offtoblk_args *ap)
2627	{
2628	struct vnode *vp = ap->a_vp;
2629
2630	switch (vp->v_type) {
2631	case VCHR:
2632	ap->a_lblkno = (daddr64_t)-`1`; /* failure /
2633	return (ENOTSUP);
2634
2635	case VBLK:
2636	printf("spec_offtoblk: not implemented for VBLK\n");
2637	ap->a_lblkno = (daddr64_t)-`1`; /* failure /
2638	return (ENOTSUP);
2639
2640	default:
2641	panic("spec_offtoblk type");
2642	}
2643	/ NOTREACHED /
2644
2645	return (`0`);
2646	}
2647
2648	static void filt_specdetach(struct knote *kn);
2649	static int filt_specevent(struct knote kn, long* hint);
2650	static int filt_spectouch(struct knote kn, struct* kevent_internal_s *kev);
2651	static int filt_specprocess(struct knote kn, struct* filt_process_s data, struct* kevent_internal_s *kev);
2652	static int filt_specpeek(struct knote *kn);
2653
2654	SECURITY_READ_ONLY_EARLY(struct filterops) spec_filtops = {
2655	.f_isfd = `1`,
2656	.f_attach = filt_specattach,
2657	.f_detach = filt_specdetach,
2658	.f_event = filt_specevent,
2659	.f_touch = filt_spectouch,
2660	.f_process = filt_specprocess,
2661	.f_peek = filt_specpeek
2662	};
2663
2664
/*
 * Map a waitq back to the selinfo structure it is assumed to be embedded
 * in (as the si_waitq member). 'wq' is not really a queue element; the
 * qe_element() macro is used only for its containing-struct offset
 * calculation from the struct type and member name.
 */
#define selinfo_from_waitq(wq) \
	qe_element((wq), struct selinfo, si_waitq)
2673
2674	static int
2675	spec_knote_select_and_link(struct knote *kn)
2676	{
2677	uthread_t uth;
2678	vfs_context_t ctx;
2679	vnode_t vp;
2680	struct waitq_set *old_wqs;
2681	uint64_t rsvd, rsvd_arg;
2682	uint64_t *rlptr = NULL;
2683	struct selinfo *si = NULL;
2684	int selres = `0`;
2685
2686	uth = get_bsdthread_info(current_thread());
2687
2688	ctx = vfs_context_current();
2689	vp = (vnode_t)kn->kn_fp->f_fglob->fg_data;
2690
2691	int error = vnode_getwithvid(vp, kn->kn_hookid);
2692	if (error != `0`) {
2693	knote_set_error(kn, ENOENT);
2694	return `0`;
2695	}
2696
2697	/*
2698	* This function may be called many times to link or re-link the
2699	* underlying vnode to the kqueue. If we've already linked the two,
2700	* we will have a valid kn_hook_data which ties us to the underlying
2701	* device's waitq via a the waitq's prepost table object. However,
2702	* devices can abort any select action by calling selthreadclear().
2703	* This is OK because the table object will be invalidated by the
2704	* driver (through a call to selthreadclear), so any attempt to access
2705	* the associated waitq will fail because the table object is invalid.
2706	*
2707	* Even if we've already registered, we need to pass a pointer
2708	* to a reserved link structure. Otherwise, selrecord() will
2709	* infer that we're in the second pass of select() and won't
2710	* actually do anything!
2711	*/
2712	rsvd = rsvd_arg = waitq_link_reserve(NULL);
2713	rlptr = (void *)&rsvd_arg;
2714
2715	/*
2716	* Trick selrecord() into hooking kqueue's wait queue set into the device's
2717	* selinfo wait queue.
2718	*/
2719	old_wqs = uth->uu_wqset;
2720	uth->uu_wqset = &(knote_get_kq(kn)->kq_wqs);
2721
2722	/*
2723	* Be sure that the waitq set is linked
2724	* before calling select to avoid possible
2725	* allocation under spinlocks.
2726	*/
2727	waitq_set_lazy_init_link(uth->uu_wqset);
2728
2729	/*
2730	* Now these are the laws of VNOP_SELECT, as old and as true as the sky,
2731	* And the device that shall keep it may prosper, but the device that shall
2732	* break it must receive ENODEV:
2733	*
2734	* 1. Take a lock to protect against other selects on the same vnode.
2735	* 2. Return 1 if data is ready to be read.
2736	* 3. Return 0 and call `selrecord` on a handy `selinfo` structure if there
2737	* is no data.
2738	* 4. Call `selwakeup` when the vnode has an active `selrecord` and data
2739	* can be read or written (depending on the seltype).
2740	* 5. If there's a `selrecord` and no corresponding `selwakeup`, but the
2741	* vnode is going away, call `selthreadclear`.
2742	*/
2743	selres = VNOP_SELECT(vp, knote_get_seltype(kn), `0`, rlptr, ctx);
2744	uth->uu_wqset = old_wqs;
2745
2746	/*
2747	* Make sure to cleanup the reserved link - this guards against
2748	* drivers that may not actually call selrecord().
2749	*/
2750	waitq_link_release(rsvd);
2751	if (rsvd != rsvd_arg) {
2752	/ The driver / handler called selrecord() /
2753	struct waitq *wq;
2754	memcpy(&wq, rlptr, sizeof(void *));
2755
2756	/*
2757	* The waitq is part of the selinfo structure managed by the
2758	* driver. For certain drivers, we want to hook the knote into
2759	* the selinfo structure's si_note field so selwakeup can call
2760	* KNOTE.
2761	*/
2762	si = selinfo_from_waitq(wq);
2763
2764	/*
2765	* The waitq_get_prepost_id() function will (potentially)
2766	* allocate a prepost table object for the waitq and return
2767	* the table object's ID to us. It will also set the
2768	* waitq_prepost_id field within the waitq structure.
2769	*
2770	* We can just overwrite kn_hook_data because it's simply a
2771	* table ID used to grab a reference when needed.
2772	*
2773	* We have a reference on the vnode, so we know that the
2774	* device won't go away while we get this ID.
2775	*/
2776	kn->kn_hook_data = waitq_get_prepost_id(wq);
2777	} else if (selres == `0`) {
2778	/*
2779	* The device indicated that there's no data to read, but didn't call
2780	* `selrecord`. Nothing will be notified of changes to this vnode, so
2781	* return an error back to user space, to make it clear that the knote
2782	* is not attached.
2783	*/
2784	knote_set_error(kn, ENODEV);
2785	}
2786
2787	vnode_put(vp);
2788
2789	return selres;
2790	}
2791
2792	static void filt_spec_common(struct knote kn, int* selres)
2793	{
2794	if (kn->kn_vnode_use_ofst) {
2795	if (kn->kn_fp->f_fglob->fg_offset >= (uint32_t)selres) {
2796	kn->kn_data = `0`;
2797	} else {
2798	kn->kn_data = ((uint32_t)selres) - kn->kn_fp->f_fglob->fg_offset;
2799	}
2800	} else {
2801	kn->kn_data = selres;
2802	}
2803	}
2804
2805	static int
2806	filt_specattach(struct knote kn, __unused struct* kevent_internal_s *kev)
2807	{
2808	vnode_t vp;
2809	dev_t dev;
2810
2811	vp = (vnode_t)kn->kn_fp->f_fglob->fg_data; / Already have iocount, and vnode is alive /
2812
2813	assert(vnode_ischr(vp));
2814
2815	dev = vnode_specrdev(vp);
2816
2817	/*
2818	* For a few special kinds of devices, we can attach knotes with
2819	* no restrictions because their "select" vectors return the amount
2820	* of data available. Others require an explicit NOTE_LOWAT with
2821	* data of 1, indicating that the caller doesn't care about actual
2822	* data counts, just an indication that the device has data.
2823	*/
2824	if (!kn->kn_vnode_kqok &&
2825	((kn->kn_sfflags & NOTE_LOWAT) == `0` \|\| kn->kn_sdata != `1`)) {
2826	knote_set_error(kn, EINVAL);
2827	return `0`;
2828	}
2829
2830	/*
2831	* This forces the select fallback to call through VNOP_SELECT and hook
2832	* up selinfo on every filter routine.
2833	*
2834	* Pseudo-terminal controllers are opted out of native kevent support --
2835	* remove this when they get their own EVFILTID.
2836	*/
2837	if (cdevsw_flags[major(dev)] & CDEVSW_IS_PTC) {
2838	kn->kn_vnode_kqok = `0`;
2839	}
2840
2841	kn->kn_filtid = EVFILTID_SPEC;
2842	kn->kn_hook_data = `0`;
2843	kn->kn_hookid = vnode_vid(vp);
2844
2845	knote_markstayactive(kn);
2846	return spec_knote_select_and_link(kn);
2847	}
2848
2849	static void
2850	filt_specdetach(struct knote *kn)
2851	{
2852	knote_clearstayactive(kn);
2853
2854	/*
2855	* This is potentially tricky: the device's selinfo waitq that was
2856	* tricked into being part of this knote's waitq set may not be a part
2857	* of any other set, and the device itself may have revoked the memory
2858	* in which the waitq was held. We use the knote's kn_hook_data field
2859	* to keep the ID of the waitq's prepost table object. This
2860	* object keeps a pointer back to the waitq, and gives us a safe way
2861	* to decouple the dereferencing of driver allocated memory: if the
2862	* driver goes away (taking the waitq with it) then the prepost table
2863	* object will be invalidated. The waitq details are handled in the
2864	* waitq API invoked here.
2865	*/
2866	if (kn->kn_hook_data) {
2867	waitq_unlink_by_prepost_id(kn->kn_hook_data, &(knote_get_kq(kn)->kq_wqs));
2868	kn->kn_hook_data = `0`;
2869	}
2870	}
2871
2872	static int
2873	filt_specevent(struct knote kn, __unused long* hint)
2874	{
2875	/*
2876	* Nothing should call knote or knote_vanish on this knote.
2877	*/
2878	panic("filt_specevent(%p)", kn);
2879	return `0`;
2880	}
2881
2882	static int
2883	filt_spectouch(struct knote kn, struct* kevent_internal_s *kev)
2884	{
2885	kn->kn_sdata = kev->data;
2886	kn->kn_sfflags = kev->fflags;
2887
2888	if (kev->flags & EV_ENABLE) {
2889	return spec_knote_select_and_link(kn);
2890	}
2891
2892	return `0`;
2893	}
2894
2895	static int
2896	filt_specprocess(struct knote kn, struct* filt_process_s data, struct* kevent_internal_s *kev)
2897	{
2898	#pragma unused(data)
2899	vnode_t vp;
2900	uthread_t uth;
2901	vfs_context_t ctx;
2902	int res;
2903	int selres;
2904	int error;
2905
2906	uth = get_bsdthread_info(current_thread());
2907	ctx = vfs_context_current();
2908	vp = (vnode_t)kn->kn_fp->f_fglob->fg_data;
2909
2910	error = vnode_getwithvid(vp, kn->kn_hookid);
2911	if (error != `0`) {
2912	kn->kn_flags \|= (EV_EOF \| EV_ONESHOT);
2913	*kev = kn->kn_kevent;
2914	return `1`;
2915	}
2916
2917	selres = spec_knote_select_and_link(kn);
2918	filt_spec_common(kn, selres);
2919
2920	vnode_put(vp);
2921
2922	res = ((kn->kn_sfflags & NOTE_LOWAT) != `0`) ?
2923	(kn->kn_data >= kn->kn_sdata) : kn->kn_data;
2924
2925	if (res) {
2926	*kev = kn->kn_kevent;
2927	if (kn->kn_flags & EV_CLEAR) {
2928	kn->kn_fflags = `0`;
2929	kn->kn_data = `0`;
2930	}
2931	}
2932
2933	return res;
2934	}
2935
2936	static int
2937	filt_specpeek(struct knote *kn)
2938	{
2939	int selres = `0`;
2940
2941	selres = spec_knote_select_and_link(kn);
2942	filt_spec_common(kn, selres);
2943
2944	return kn->kn_data != `0`;
2945	}
2946
2947

Browse the source code of codebrowser/bsd/miscfs/specfs/spec_vnops.c