vfs_subr.c source code [codebrowser/bsd/vfs/vfs_subr.c]

1	/*
2	* Copyright (c) 2000-2018 Apple Inc. All rights reserved.
3	*
4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5	*
6	* This file contains Original Code and/or Modifications of Original Code
7	* as defined in and that are subject to the Apple Public Source License
8	* Version 2.0 (the 'License'). You may not use this file except in
9	* compliance with the License. The rights granted to you under the License
10	* may not be used to create, or enable the creation or redistribution of,
11	* unlawful or unlicensed copies of an Apple operating system, or to
12	* circumvent, violate, or enable the circumvention or violation of, any
13	* terms of an Apple operating system software license agreement.
14	*
15	* Please obtain a copy of the License at
16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
17	*
18	* The Original Code and all software distributed under the License are
19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23	* Please see the License for the specific language governing rights and
24	* limitations under the License.
25	*
26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27	*/
28	/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29	/*
30	* Copyright (c) 1989, 1993
31	* The Regents of the University of California. All rights reserved.
32	* (c) UNIX System Laboratories, Inc.
33	* All or some portions of this file are derived from material licensed
34	* to the University of California by American Telephone and Telegraph
35	* Co. or Unix System Laboratories, Inc. and are reproduced herein with
36	* the permission of UNIX System Laboratories, Inc.
37	*
38	* Redistribution and use in source and binary forms, with or without
39	* modification, are permitted provided that the following conditions
40	* are met:
41	* 1. Redistributions of source code must retain the above copyright
42	* notice, this list of conditions and the following disclaimer.
43	* 2. Redistributions in binary form must reproduce the above copyright
44	* notice, this list of conditions and the following disclaimer in the
45	* documentation and/or other materials provided with the distribution.
46	* 3. All advertising materials mentioning features or use of this software
47	* must display the following acknowledgement:
48	* This product includes software developed by the University of
49	* California, Berkeley and its contributors.
50	* 4. Neither the name of the University nor the names of its contributors
51	* may be used to endorse or promote products derived from this software
52	* without specific prior written permission.
53	*
54	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64	* SUCH DAMAGE.
65	*
66	* @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95
67	*/
68	/*
69	* NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
70	* support for mandatory and extensible security protections. This notice
71	* is included in support of clause 2.2 (b) of the Apple Public License,
72	* Version 2.0.
73	*/
74
75	/*
76	* External virtual filesystem routines
77	*/
78
79	#include <sys/param.h>
80	#include <sys/systm.h>
81	#include <sys/proc_internal.h>
82	#include <sys/kauth.h>
83	#include <sys/mount_internal.h>
84	#include <sys/time.h>
85	#include <sys/lock.h>
86	#include <sys/vnode.h>
87	#include <sys/vnode_internal.h>
88	#include <sys/stat.h>
89	#include <sys/namei.h>
90	#include <sys/ucred.h>
91	#include <sys/buf_internal.h>
92	#include <sys/errno.h>
93	#include <sys/malloc.h>
94	#include <sys/uio_internal.h>
95	#include <sys/uio.h>
96	#include <sys/domain.h>
97	#include <sys/mbuf.h>
98	#include <sys/syslog.h>
99	#include <sys/ubc_internal.h>
100	#include <sys/vm.h>
101	#include <sys/sysctl.h>
102	#include <sys/filedesc.h>
103	#include <sys/event.h>
104	#include <sys/kdebug.h>
105	#include <sys/kauth.h>
106	#include <sys/user.h>
107	#include <sys/systm.h>
108	#include <sys/kern_memorystatus.h>
109	#include <sys/lockf.h>
110	#include <miscfs/fifofs/fifo.h>
111
112	#include <string.h>
113	#include <machine/machine_routines.h>
114
115	#include <kern/assert.h>
116	#include <mach/kern_return.h>
117	#include <kern/thread.h>
118	#include <kern/sched_prim.h>
119
120	#include <miscfs/specfs/specdev.h>
121
122	#include <mach/mach_types.h>
123	#include <mach/memory_object_types.h>
124	#include <mach/memory_object_control.h>
125
126	#include <kern/kalloc.h> /* kalloc()/kfree() */
127	#include <kern/clock.h> /* delay_for_interval() */
128	#include <libkern/OSAtomic.h> /* OSAddAtomic() */
129	#if !CONFIG_EMBEDDED
130	#include <console/video_console.h>
131	#endif
132
133	#ifdef JOE_DEBUG
134	#include <libkern/OSDebug.h>
135	#endif
136
137	#include <vm/vm_protos.h> /* vnode_pager_vrele() */
138
139	#if CONFIG_MACF
140	#include <security/mac_framework.h>
141	#endif
142
143	#include <vfs/vfs_disk_conditioner.h>
144	#include <libkern/section_keywords.h>
145
146	extern lck_grp_t *vnode_lck_grp;
147	extern lck_attr_t *vnode_lck_attr;
148
149	#if CONFIG_TRIGGERS
150	extern lck_grp_t *trigger_vnode_lck_grp;
151	extern lck_attr_t *trigger_vnode_lck_attr;
152	#endif
153
154	extern lck_mtx_t * mnt_list_mtx_lock;
155
156	enum vtype iftovt_tab[`16`] = {
157	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
158	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
159	};
160	int vttoif_tab[`9`] = {
161	`0`, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
162	S_IFSOCK, S_IFIFO, S_IFMT,
163	};
164
165
166	/ XXX These should be in a BSD accessible Mach header, but aren't. /
167	extern void memory_object_mark_used(
168	memory_object_control_t control);
169
170	extern void memory_object_mark_unused(
171	memory_object_control_t control,
172	boolean_t rage);
173
174	extern void memory_object_mark_io_tracking(
175	memory_object_control_t control);
176
177	/ XXX next protptype should be from <nfs/nfs.h> /
178	extern int nfs_vinvalbuf(vnode_t, int, vfs_context_t, int);
179
180	extern int paniclog_append_noflush(const char *format, ...);
181
182	/ XXX next prototytype should be from libsa/stdlib.h> but conflicts libkern /
183	__private_extern__ void qsort(
184	void * array,
185	size_t nmembers,
186	size_t member_size,
187	int ()(const* void , const* void *));
188
189	__private_extern__ void vntblinit(void);
190	__private_extern__ int unlink1(vfs_context_t, vnode_t, user_addr_t,
191	enum uio_seg, int);
192
193	extern int system_inshutdown;
194
195	static void vnode_list_add(vnode_t);
196	static void vnode_async_list_add(vnode_t);
197	static void vnode_list_remove(vnode_t);
198	static void vnode_list_remove_locked(vnode_t);
199
200	static void vnode_abort_advlocks(vnode_t);
201	static errno_t vnode_drain(vnode_t);
202	static void vgone(vnode_t, int flags);
203	static void vclean(vnode_t vp, int flag);
204	static void vnode_reclaim_internal(vnode_t, int, int, int);
205
206	static void vnode_dropiocount (vnode_t);
207
208	static vnode_t checkalias(vnode_t vp, dev_t nvp_rdev);
209	static int vnode_reload(vnode_t);
210	static int vnode_isinuse_locked(vnode_t, int, int);
211
212	static int unmount_callback(mount_t, __unused void *);
213
214	static void insmntque(vnode_t vp, mount_t mp);
215	static int mount_getvfscnt(void);
216	static int mount_fillfsids(fsid_t , int* );
217	static void vnode_iterate_setup(mount_t);
218	int vnode_umount_preflight(mount_t, vnode_t, int);
219	static int vnode_iterate_prepare(mount_t);
220	static int vnode_iterate_reloadq(mount_t);
221	static void vnode_iterate_clear(mount_t);
222	static mount_t vfs_getvfs_locked(fsid_t *);
223	static int vn_create_reg(vnode_t dvp, vnode_t vpp, struct* nameidata *ndp,
224	struct vnode_attr vap, uint32_t flags, int* fmode, uint32_t *statusp, vfs_context_t ctx);
225	static int vnode_authattr_new_internal(vnode_t dvp, struct vnode_attr vap, int* noauth, uint32_t *defaulted_fieldsp, vfs_context_t ctx);
226
227	errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
228
229	#ifdef JOE_DEBUG
230	static void record_vp(vnode_t vp, int count);
231	#endif
232
233	#if CONFIG_JETSAM && (DEVELOPMENT \|\| DEBUG)
234	extern int bootarg_no_vnode_jetsam; / from bsd_init.c default value is 0 /
235	#endif /* CONFIG_JETSAM && (DEVELOPMENT \|\| DEBUG) */
236
237	boolean_t root_is_CF_drive = FALSE;
238
239	#if CONFIG_TRIGGERS
240	static int vnode_resolver_create(mount_t, vnode_t, struct vnode_trigger_param *, boolean_t external);
241	static void vnode_resolver_detach(vnode_t);
242	#endif
243
244	TAILQ_HEAD(freelst, vnode) vnode_free_list; / vnode free list /
245	TAILQ_HEAD(deadlst, vnode) vnode_dead_list; / vnode dead list /
246	TAILQ_HEAD(async_work_lst, vnode) vnode_async_work_list;
247
248
249	TAILQ_HEAD(ragelst, vnode) vnode_rage_list; / vnode rapid age list /
250	struct timeval rage_tv;
251	int rage_limit = `0`;
252	int ragevnodes = `0`;
253
254	#define RAGE_LIMIT_MIN 100
255	#define RAGE_TIME_LIMIT 5
256
257	struct mntlist mountlist; / mounted filesystem list /
258	static int nummounts = `0`;
259
/*
 * A vnode that is on none of the global lists has its v_freelist.tqe_prev
 * set to the sentinel 0xdeadb (see VLISTNONE).  With DIAGNOSTIC enabled,
 * VLISTCHECK panics when asked to remove such a vnode; otherwise it
 * expands to nothing.
 */
#if DIAGNOSTIC
#define VLISTCHECK(fun, vp, list)	\
	if ((vp)->v_freelist.tqe_prev == (struct vnode **)0xdeadb)	\
		panic("%s: %s vnode not on %slist", (fun), (list), (list));
#else
#define VLISTCHECK(fun, vp, list)
#endif /* DIAGNOSTIC */

/* Mark vp as being on no list. */
#define VLISTNONE(vp)	\
	do {	\
		(vp)->v_freelist.tqe_next = (struct vnode *)0;	\
		(vp)->v_freelist.tqe_prev = (struct vnode **)0xdeadb;	\
	} while(0)

/* Non-zero when vp is linked on some list (tqe_prev is not the sentinel). */
#define VONLIST(vp)	\
	((vp)->v_freelist.tqe_prev != (struct vnode **)0xdeadb)
276
/*
 * List-removal helpers.  Each one unlinks vp from the named global list,
 * marks it as on no list (VLISTNONE), clears the matching v_listflag bit
 * where there is one, and decrements that list's counter.
 * The macro argument is parenthesised at every use so that an arbitrary
 * expression can be passed as vp.
 */

/* remove a vnode from free vnode list */
#define VREMFREE(fun, vp)	\
	do {	\
		VLISTCHECK((fun), (vp), "free");	\
		TAILQ_REMOVE(&vnode_free_list, (vp), v_freelist);	\
		VLISTNONE((vp));	\
		freevnodes--;	\
	} while(0)


/* remove a vnode from dead vnode list */
#define VREMDEAD(fun, vp)	\
	do {	\
		VLISTCHECK((fun), (vp), "dead");	\
		TAILQ_REMOVE(&vnode_dead_list, (vp), v_freelist);	\
		VLISTNONE((vp));	\
		(vp)->v_listflag &= ~VLIST_DEAD;	\
		deadvnodes--;	\
	} while(0)


/* remove a vnode from async work vnode list */
#define VREMASYNC_WORK(fun, vp)	\
	do {	\
		VLISTCHECK((fun), (vp), "async_work");	\
		TAILQ_REMOVE(&vnode_async_work_list, (vp), v_freelist);	\
		VLISTNONE((vp));	\
		(vp)->v_listflag &= ~VLIST_ASYNC_WORK;	\
		async_work_vnodes--;	\
	} while(0)


/* remove a vnode from rage vnode list */
#define VREMRAGE(fun, vp)	\
	do {	\
		if ( !((vp)->v_listflag & VLIST_RAGE))	\
			panic("VREMRAGE: vp not on rage list");	\
		VLISTCHECK((fun), (vp), "rage");	\
		TAILQ_REMOVE(&vnode_rage_list, (vp), v_freelist);	\
		VLISTNONE((vp));	\
		(vp)->v_listflag &= ~VLIST_RAGE;	\
		ragevnodes--;	\
	} while(0)
320
321	static void async_work_continue(void);
322
323	/*
324	* Initialize the vnode management data structures.
325	*/
326	__private_extern__ void
327	vntblinit(void)
328	{
329	thread_t thread = THREAD_NULL;
330
331	TAILQ_INIT(&vnode_free_list);
332	TAILQ_INIT(&vnode_rage_list);
333	TAILQ_INIT(&vnode_dead_list);
334	TAILQ_INIT(&vnode_async_work_list);
335	TAILQ_INIT(&mountlist);
336
337	microuptime(&rage_tv);
338	rage_limit = desiredvnodes / `100`;
339
340	if (rage_limit < RAGE_LIMIT_MIN)
341	rage_limit = RAGE_LIMIT_MIN;
342
343	/*
344	* create worker threads
345	*/
346	kernel_thread_start((thread_continue_t)async_work_continue, NULL, &thread);
347	thread_deallocate(thread);
348	}
349
350	/ the timeout is in 10 msecs /
351	int
352	vnode_waitforwrites(vnode_t vp, int output_target, int slpflag, int slptimeout, const char *msg) {
353	int error = `0`;
354	struct timespec ts;
355
356	KERNEL_DEBUG(`0x3010280` \| DBG_FUNC_START, (int)vp, output_target, vp->v_numoutput, `0`, `0`);
357
358	if (vp->v_numoutput > output_target) {
359
360	slpflag \|= PDROP;
361
362	vnode_lock_spin(vp);
363
364	while ((vp->v_numoutput > output_target) && error == `0`) {
365	if (output_target)
366	vp->v_flag \|= VTHROTTLED;
367	else
368	vp->v_flag \|= VBWAIT;
369
370	ts.tv_sec = (slptimeout/`100`);
371	ts.tv_nsec = (slptimeout % `1000`) * `10` * NSEC_PER_USEC * `1000` ;
372	error = msleep((caddr_t)&vp->v_numoutput, &vp->v_lock, (slpflag \| (PRIBIO + `1`)), msg, &ts);
373
374	vnode_lock_spin(vp);
375	}
376	vnode_unlock(vp);
377	}
378	KERNEL_DEBUG(`0x3010280` \| DBG_FUNC_END, (int)vp, output_target, vp->v_numoutput, error, `0`);
379
380	return error;
381	}
382
383
384	void
385	vnode_startwrite(vnode_t vp) {
386
387	OSAddAtomic(`1`, &vp->v_numoutput);
388	}
389
390
391	void
392	vnode_writedone(vnode_t vp)
393	{
394	if (vp) {
395	int need_wakeup = `0`;
396
397	OSAddAtomic(-`1`, &vp->v_numoutput);
398
399	vnode_lock_spin(vp);
400
401	if (vp->v_numoutput < `0`)
402	panic("vnode_writedone: numoutput < 0");
403
404	if ((vp->v_flag & VTHROTTLED)) {
405	vp->v_flag &= ~VTHROTTLED;
406	need_wakeup = `1`;
407	}
408	if ((vp->v_flag & VBWAIT) && (vp->v_numoutput == `0`)) {
409	vp->v_flag &= ~VBWAIT;
410	need_wakeup = `1`;
411	}
412	vnode_unlock(vp);
413
414	if (need_wakeup)
415	wakeup((caddr_t)&vp->v_numoutput);
416	}
417	}
418
419
420
421	int
422	vnode_hasdirtyblks(vnode_t vp)
423	{
424	struct cl_writebehind *wbp;
425
426	/*
427	* Not taking the buf_mtxp as there is little
428	* point doing it. Even if the lock is taken the
429	* state can change right after that. If their
430	* needs to be a synchronization, it must be driven
431	* by the caller
432	*/
433	if (vp->v_dirtyblkhd.lh_first)
434	return (`1`);
435
436	if (!UBCINFOEXISTS(vp))
437	return (`0`);
438
439	wbp = vp->v_ubcinfo->cl_wbehind;
440
441	if (wbp && (wbp->cl_number \|\| wbp->cl_scmap))
442	return (`1`);
443
444	return (`0`);
445	}
446
447	int
448	vnode_hascleanblks(vnode_t vp)
449	{
450	/*
451	* Not taking the buf_mtxp as there is little
452	* point doing it. Even if the lock is taken the
453	* state can change right after that. If their
454	* needs to be a synchronization, it must be driven
455	* by the caller
456	*/
457	if (vp->v_cleanblkhd.lh_first)
458	return (`1`);
459	return (`0`);
460	}
461
462	void
463	vnode_iterate_setup(mount_t mp)
464	{
465	mp->mnt_lflag \|= MNT_LITER;
466	}
467
468	int
469	vnode_umount_preflight(mount_t mp, vnode_t skipvp, int flags)
470	{
471	vnode_t vp;
472
473	TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
474	if (vp->v_type == VDIR)
475	continue;
476	if (vp == skipvp)
477	continue;
478	if ((flags & SKIPSYSTEM) && ((vp->v_flag & VSYSTEM) \|\| (vp->v_flag & VNOFLUSH)))
479	continue;
480	if ((flags & SKIPSWAP) && (vp->v_flag & VSWAP))
481	continue;
482	if ((flags & WRITECLOSE) && (vp->v_writecount == `0` \|\| vp->v_type != VREG))
483	continue;
484
485	/ Look for busy vnode /
486	if ((vp->v_usecount != `0`) && ((vp->v_usecount - vp->v_kusecount) != `0`)) {
487	return `1`;
488
489	} else if (vp->v_iocount > `0`) {
490	/ Busy if iocount is > 0 for more than 3 seconds /
491	tsleep(&vp->v_iocount, PVFS, "vnode_drain_network", `3` * hz);
492	if (vp->v_iocount > `0`)
493	return `1`;
494	continue;
495	}
496	}
497
498	return `0`;
499	}
500
501	/*
502	* This routine prepares iteration by moving all the vnodes to worker queue
503	* called with mount lock held
504	*/
505	int
506	vnode_iterate_prepare(mount_t mp)
507	{
508	vnode_t vp;
509
510	if (TAILQ_EMPTY(&mp->mnt_vnodelist)) {
511	/ nothing to do /
512	return (`0`);
513	}
514
515	vp = TAILQ_FIRST(&mp->mnt_vnodelist);
516	vp->v_mntvnodes.tqe_prev = &(mp->mnt_workerqueue.tqh_first);
517	mp->mnt_workerqueue.tqh_first = mp->mnt_vnodelist.tqh_first;
518	mp->mnt_workerqueue.tqh_last = mp->mnt_vnodelist.tqh_last;
519
520	TAILQ_INIT(&mp->mnt_vnodelist);
521	if (mp->mnt_newvnodes.tqh_first != NULL)
522	panic("vnode_iterate_prepare: newvnode when entering vnode");
523	TAILQ_INIT(&mp->mnt_newvnodes);
524
525	return (`1`);
526	}
527
528
529	/ called with mount lock held /
530	int
531	vnode_iterate_reloadq(mount_t mp)
532	{
533	int moved = `0`;
534
535	/ add the remaining entries in workerq to the end of mount vnode list /
536	if (!TAILQ_EMPTY(&mp->mnt_workerqueue)) {
537	struct vnode * mvp;
538	mvp = TAILQ_LAST(&mp->mnt_vnodelist, vnodelst);
539
540	/ Joining the workerque entities to mount vnode list /
541	if (mvp)
542	mvp->v_mntvnodes.tqe_next = mp->mnt_workerqueue.tqh_first;
543	else
544	mp->mnt_vnodelist.tqh_first = mp->mnt_workerqueue.tqh_first;
545	mp->mnt_workerqueue.tqh_first->v_mntvnodes.tqe_prev = mp->mnt_vnodelist.tqh_last;
546	mp->mnt_vnodelist.tqh_last = mp->mnt_workerqueue.tqh_last;
547	TAILQ_INIT(&mp->mnt_workerqueue);
548	}
549
550	/ add the newvnodes to the head of mount vnode list /
551	if (!TAILQ_EMPTY(&mp->mnt_newvnodes)) {
552	struct vnode * nlvp;
553	nlvp = TAILQ_LAST(&mp->mnt_newvnodes, vnodelst);
554
555	mp->mnt_newvnodes.tqh_first->v_mntvnodes.tqe_prev = &mp->mnt_vnodelist.tqh_first;
556	nlvp->v_mntvnodes.tqe_next = mp->mnt_vnodelist.tqh_first;
557	if(mp->mnt_vnodelist.tqh_first)
558	mp->mnt_vnodelist.tqh_first->v_mntvnodes.tqe_prev = &nlvp->v_mntvnodes.tqe_next;
559	else
560	mp->mnt_vnodelist.tqh_last = mp->mnt_newvnodes.tqh_last;
561	mp->mnt_vnodelist.tqh_first = mp->mnt_newvnodes.tqh_first;
562	TAILQ_INIT(&mp->mnt_newvnodes);
563	moved = `1`;
564	}
565
566	return(moved);
567	}
568
569
570	void
571	vnode_iterate_clear(mount_t mp)
572	{
573	mp->mnt_lflag &= ~MNT_LITER;
574	}
575
576	#if !CONFIG_EMBEDDED
577
578	#include <i386/panic_hooks.h>
579
580	struct vnode_iterate_panic_hook {
581	panic_hook_t hook;
582	mount_t mp;
583	struct vnode *vp;
584	};
585
586	static void vnode_iterate_panic_hook(panic_hook_t *hook_)
587	{
588	struct vnode_iterate_panic_hook hook = (struct* vnode_iterate_panic_hook *)hook_;
589	panic_phys_range_t range;
590	uint64_t phys;
591
592	if (panic_phys_range_before(hook->mp, &phys, &range)) {
593	paniclog_append_noflush("mp = %p, phys = %p, prev (%p: %p-%p)\n",
594	hook->mp, phys, range.type, range.phys_start,
595	range.phys_start + range.len);
596	} else {
597	paniclog_append_noflush("mp = %p, phys = %p, prev (!)\n", hook->mp, phys);
598	}
599
600	if (panic_phys_range_before(hook->vp, &phys, &range)) {
601	paniclog_append_noflush("vp = %p, phys = %p, prev (%p: %p-%p)\n",
602	hook->vp, phys, range.type, range.phys_start,
603	range.phys_start + range.len);
604	} else {
605	paniclog_append_noflush("vp = %p, phys = %p, prev (!)\n", hook->vp, phys);
606	}
607	panic_dump_mem((void *)(((vm_offset_t)hook->mp -`4096`) & ~`4095`), `12288`);
608	}
609	#endif //CONFIG_EMBEDDED
610
611	int
612	vnode_iterate(mount_t mp, int flags, int (callout)(struct* vnode , void* *),
613	void *arg)
614	{
615	struct vnode *vp;
616	int vid, retval;
617	int ret = `0`;
618
619	/*
620	* The mount iterate mutex is held for the duration of the iteration.
621	* This can be done by a state flag on the mount structure but we can
622	* run into priority inversion issues sometimes.
623	* Using a mutex allows us to benefit from the priority donation
624	* mechanisms in the kernel for locks. This mutex should never be
625	* acquired in spin mode and it should be acquired before attempting to
626	* acquire the mount lock.
627	*/
628	mount_iterate_lock(mp);
629
630	mount_lock(mp);
631
632	vnode_iterate_setup(mp);
633
634	/ If it returns 0 then there is nothing to do /
635	retval = vnode_iterate_prepare(mp);
636
637	if (retval == `0`) {
638	vnode_iterate_clear(mp);
639	mount_unlock(mp);
640	mount_iterate_unlock(mp);
641	return(ret);
642	}
643
644	#if !CONFIG_EMBEDDED
645	struct vnode_iterate_panic_hook hook;
646	hook.mp = mp;
647	hook.vp = NULL;
648	panic_hook(&hook.hook, vnode_iterate_panic_hook);
649	#endif
650	/ iterate over all the vnodes /
651	while (!TAILQ_EMPTY(&mp->mnt_workerqueue)) {
652	vp = TAILQ_FIRST(&mp->mnt_workerqueue);
653	#if !CONFIG_EMBEDDED
654	hook.vp = vp;
655	#endif
656	TAILQ_REMOVE(&mp->mnt_workerqueue, vp, v_mntvnodes);
657	TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);
658	vid = vp->v_id;
659	if ((vp->v_data == NULL) \|\| (vp->v_type == VNON) \|\| (vp->v_mount != mp)) {
660	continue;
661	}
662	mount_unlock(mp);
663
664	if ( vget_internal(vp, vid, (flags \| VNODE_NODEAD\| VNODE_WITHID \| VNODE_NOSUSPEND))) {
665	mount_lock(mp);
666	continue;
667	}
668	if (flags & VNODE_RELOAD) {
669	/*
670	* we're reloading the filesystem
671	* cast out any inactive vnodes...
672	*/
673	if (vnode_reload(vp)) {
674	/ vnode will be recycled on the refcount drop /
675	vnode_put(vp);
676	mount_lock(mp);
677	continue;
678	}
679	}
680
681	retval = callout(vp, arg);
682
683	switch (retval) {
684	case VNODE_RETURNED:
685	case VNODE_RETURNED_DONE:
686	vnode_put(vp);
687	if (retval == VNODE_RETURNED_DONE) {
688	mount_lock(mp);
689	ret = `0`;
690	goto out;
691	}
692	break;
693
694	case VNODE_CLAIMED_DONE:
695	mount_lock(mp);
696	ret = `0`;
697	goto out;
698	case VNODE_CLAIMED:
699	default:
700	break;
701	}
702	mount_lock(mp);
703	}
704
705	out:
706	#if !CONFIG_EMBEDDED
707	panic_unhook(&hook.hook);
708	#endif
709	(void)vnode_iterate_reloadq(mp);
710	vnode_iterate_clear(mp);
711	mount_unlock(mp);
712	mount_iterate_unlock(mp);
713	return (ret);
714	}
715
716	void
717	mount_lock_renames(mount_t mp)
718	{
719	lck_mtx_lock(&mp->mnt_renamelock);
720	}
721
722	void
723	mount_unlock_renames(mount_t mp)
724	{
725	lck_mtx_unlock(&mp->mnt_renamelock);
726	}
727
728	void
729	mount_iterate_lock(mount_t mp)
730	{
731	lck_mtx_lock(&mp->mnt_iter_lock);
732	}
733
734	void
735	mount_iterate_unlock(mount_t mp)
736	{
737	lck_mtx_unlock(&mp->mnt_iter_lock);
738	}
739
740	void
741	mount_lock(mount_t mp)
742	{
743	lck_mtx_lock(&mp->mnt_mlock);
744	}
745
746	void
747	mount_lock_spin(mount_t mp)
748	{
749	lck_mtx_lock_spin(&mp->mnt_mlock);
750	}
751
752	void
753	mount_unlock(mount_t mp)
754	{
755	lck_mtx_unlock(&mp->mnt_mlock);
756	}
757
758
759	void
760	mount_ref(mount_t mp, int locked)
761	{
762	if ( !locked)
763	mount_lock_spin(mp);
764
765	mp->mnt_count++;
766
767	if ( !locked)
768	mount_unlock(mp);
769	}
770
771
772	void
773	mount_drop(mount_t mp, int locked)
774	{
775	if ( !locked)
776	mount_lock_spin(mp);
777
778	mp->mnt_count--;
779
780	if (mp->mnt_count == `0` && (mp->mnt_lflag & MNT_LDRAIN))
781	wakeup(&mp->mnt_lflag);
782
783	if ( !locked)
784	mount_unlock(mp);
785	}
786
787
788	int
789	mount_iterref(mount_t mp, int locked)
790	{
791	int retval = `0`;
792
793	if (!locked)
794	mount_list_lock();
795	if (mp->mnt_iterref < `0`) {
796	retval = `1`;
797	} else {
798	mp->mnt_iterref++;
799	}
800	if (!locked)
801	mount_list_unlock();
802	return(retval);
803	}
804
805	int
806	mount_isdrained(mount_t mp, int locked)
807	{
808	int retval;
809
810	if (!locked)
811	mount_list_lock();
812	if (mp->mnt_iterref < `0`)
813	retval = `1`;
814	else
815	retval = `0`;
816	if (!locked)
817	mount_list_unlock();
818	return(retval);
819	}
820
821	void
822	mount_iterdrop(mount_t mp)
823	{
824	mount_list_lock();
825	mp->mnt_iterref--;
826	wakeup(&mp->mnt_iterref);
827	mount_list_unlock();
828	}
829
830	void
831	mount_iterdrain(mount_t mp)
832	{
833	mount_list_lock();
834	while (mp->mnt_iterref)
835	msleep((caddr_t)&mp->mnt_iterref, mnt_list_mtx_lock, PVFS, "mount_iterdrain", NULL);
836	/ mount iterations drained /
837	mp->mnt_iterref = -`1`;
838	mount_list_unlock();
839	}
840	void
841	mount_iterreset(mount_t mp)
842	{
843	mount_list_lock();
844	if (mp->mnt_iterref == -`1`)
845	mp->mnt_iterref = `0`;
846	mount_list_unlock();
847	}
848
849	/ always called with mount lock held /
850	int
851	mount_refdrain(mount_t mp)
852	{
853	if (mp->mnt_lflag & MNT_LDRAIN)
854	panic("already in drain");
855	mp->mnt_lflag \|= MNT_LDRAIN;
856
857	while (mp->mnt_count)
858	msleep((caddr_t)&mp->mnt_lflag, &mp->mnt_mlock, PVFS, "mount_drain", NULL);
859
860	if (mp->mnt_vnodelist.tqh_first != NULL)
861	panic("mount_refdrain: dangling vnode");
862
863	mp->mnt_lflag &= ~MNT_LDRAIN;
864
865	return(`0`);
866	}
867
868	/ Tags the mount point as not supportine extended readdir for NFS exports /
869	void
870	mount_set_noreaddirext(mount_t mp) {
871	mount_lock (mp);
872	mp->mnt_kern_flag \|= MNTK_DENY_READDIREXT;
873	mount_unlock (mp);
874	}
875
876	/*
877	* Mark a mount point as busy. Used to synchronize access and to delay
878	* unmounting.
879	*/
880	int
881	vfs_busy(mount_t mp, int flags)
882	{
883
884	restart:
885	if (mp->mnt_lflag & MNT_LDEAD)
886	return (ENOENT);
887
888	mount_lock(mp);
889
890	if (mp->mnt_lflag & MNT_LUNMOUNT) {
891	if (flags & LK_NOWAIT \|\| mp->mnt_lflag & MNT_LDEAD) {
892	mount_unlock(mp);
893	return (ENOENT);
894	}
895
896	/*
897	* Since all busy locks are shared except the exclusive
898	* lock granted when unmounting, the only place that a
899	* wakeup needs to be done is at the release of the
900	* exclusive lock at the end of dounmount.
901	*/
902	mp->mnt_lflag \|= MNT_LWAIT;
903	msleep((caddr_t)mp, &mp->mnt_mlock, (PVFS \| PDROP), "vfsbusy", NULL);
904	return (ENOENT);
905	}
906
907	mount_unlock(mp);
908
909	lck_rw_lock_shared(&mp->mnt_rwlock);
910
911	/*
912	* Until we are granted the rwlock, it's possible for the mount point to
913	* change state, so re-evaluate before granting the vfs_busy.
914	*/
915	if (mp->mnt_lflag & (MNT_LDEAD \| MNT_LUNMOUNT)) {
916	lck_rw_done(&mp->mnt_rwlock);
917	goto restart;
918	}
919	return (`0`);
920	}
921
922	/*
923	* Free a busy filesystem.
924	*/
925	void
926	vfs_unbusy(mount_t mp)
927	{
928	lck_rw_done(&mp->mnt_rwlock);
929	}
930
931
932
933	static void
934	vfs_rootmountfailed(mount_t mp) {
935
936	mount_list_lock();
937	mp->mnt_vtable->vfc_refcount--;
938	mount_list_unlock();
939
940	vfs_unbusy(mp);
941
942	mount_lock_destroy(mp);
943
944	#if CONFIG_MACF
945	mac_mount_label_destroy(mp);
946	#endif
947
948	FREE_ZONE(mp, sizeof(struct mount), M_MOUNT);
949	}
950
951	/*
952	* Lookup a filesystem type, and if found allocate and initialize
953	* a mount structure for it.
954	*
955	* Devname is usually updated by mount(8) after booting.
956	*/
957	static mount_t
958	vfs_rootmountalloc_internal(struct vfstable vfsp, const* char *devname)
959	{
960	mount_t mp;
961
962	mp = _MALLOC_ZONE(sizeof(struct mount), M_MOUNT, M_WAITOK);
963	bzero((char )mp, sizeof(struct* mount));
964
965	/ Initialize the default IO constraints /
966	mp->mnt_maxreadcnt = mp->mnt_maxwritecnt = MAXPHYS;
967	mp->mnt_segreadcnt = mp->mnt_segwritecnt = `32`;
968	mp->mnt_maxsegreadsize = mp->mnt_maxreadcnt;
969	mp->mnt_maxsegwritesize = mp->mnt_maxwritecnt;
970	mp->mnt_devblocksize = DEV_BSIZE;
971	mp->mnt_alignmentmask = PAGE_MASK;
972	mp->mnt_ioqueue_depth = MNT_DEFAULT_IOQUEUE_DEPTH;
973	mp->mnt_ioscale = `1`;
974	mp->mnt_ioflags = `0`;
975	mp->mnt_realrootvp = NULLVP;
976	mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL;
977	mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - `1`;
978	mp->mnt_devbsdunit = `0`;
979
980	mount_lock_init(mp);
981	(void)vfs_busy(mp, LK_NOWAIT);
982
983	TAILQ_INIT(&mp->mnt_vnodelist);
984	TAILQ_INIT(&mp->mnt_workerqueue);
985	TAILQ_INIT(&mp->mnt_newvnodes);
986
987	mp->mnt_vtable = vfsp;
988	mp->mnt_op = vfsp->vfc_vfsops;
989	mp->mnt_flag = MNT_RDONLY \| MNT_ROOTFS;
990	mp->mnt_vnodecovered = NULLVP;
991	//mp->mnt_stat.f_type = vfsp->vfc_typenum;
992	mp->mnt_flag \|= vfsp->vfc_flags & MNT_VISFLAGMASK;
993
994	mount_list_lock();
995	vfsp->vfc_refcount++;
996	mount_list_unlock();
997
998	strlcpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN);
999	mp->mnt_vfsstat.f_mntonname[`0`] = `'/'`;
1000	/ XXX const poisoning layering violation /
1001	(void) copystr((const void *)devname, mp->mnt_vfsstat.f_mntfromname, MAXPATHLEN - `1`, NULL);
1002
1003	#if CONFIG_MACF
1004	mac_mount_label_init(mp);
1005	mac_mount_label_associate(vfs_context_kernel(), mp);
1006	#endif
1007	return (mp);
1008	}
1009
1010	errno_t
1011	vfs_rootmountalloc(const char fstypename, const* char devname, mount_t mpp)
1012	{
1013	struct vfstable *vfsp;
1014
1015	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
1016	if (!strncmp(vfsp->vfc_name, fstypename,
1017	sizeof(vfsp->vfc_name)))
1018	break;
1019	if (vfsp == NULL)
1020	return (ENODEV);
1021
1022	*mpp = vfs_rootmountalloc_internal(vfsp, devname);
1023
1024	if (*mpp)
1025	return (`0`);
1026
1027	return (ENOMEM);
1028	}
1029
/* kdebug code traced at the start and end of vfs_mountroot() */
#define DBG_MOUNTROOT (FSDBG_CODE(DBG_MOUNT, 0))

/*
 * Find an appropriate filesystem to use for the root. If a filesystem
 * has not been preselected, walk through the list of known filesystems
 * trying those that have mountroot routines, and try them until one
 * works or we have tried them all.
 */
extern int (*mountroot)(void);
1039
1040	int
1041	vfs_mountroot(void)
1042	{
1043	#if CONFIG_MACF
1044	struct vnode *vp;
1045	#endif
1046	struct vfstable *vfsp;
1047	vfs_context_t ctx = vfs_context_kernel();
1048	struct vfs_attr vfsattr;
1049	int error;
1050	mount_t mp;
1051	vnode_t bdevvp_rootvp;
1052
1053	KDBG_RELEASE(DBG_MOUNTROOT \| DBG_FUNC_START);
1054	if (mountroot != NULL) {
1055	/*
1056	* used for netboot which follows a different set of rules
1057	*/
1058	error = (*mountroot)();
1059
1060	KDBG_RELEASE(DBG_MOUNTROOT \| DBG_FUNC_END, error, `0`);
1061	return (error);
1062	}
1063	if ((error = bdevvp(rootdev, &rootvp))) {
1064	printf("vfs_mountroot: can't setup bdevvp\n");
1065
1066	KDBG_RELEASE(DBG_MOUNTROOT \| DBG_FUNC_END, error, `1`);
1067	return (error);
1068	}
1069	/*
1070	* 4951998 - code we call in vfc_mountroot may replace rootvp
1071	* so keep a local copy for some house keeping.
1072	*/
1073	bdevvp_rootvp = rootvp;
1074
1075	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
1076	if (vfsp->vfc_mountroot == NULL
1077	&& !ISSET(vfsp->vfc_vfsflags, VFC_VFSCANMOUNTROOT)) {
1078	continue;
1079	}
1080
1081	mp = vfs_rootmountalloc_internal(vfsp, "root_device");
1082	mp->mnt_devvp = rootvp;
1083
1084	if (vfsp->vfc_mountroot)
1085	error = (*vfsp->vfc_mountroot)(mp, rootvp, ctx);
1086	else
1087	error = VFS_MOUNT(mp, rootvp, `0`, ctx);
1088
1089	if (!error) {
1090	if ( bdevvp_rootvp != rootvp ) {
1091	/*
1092	* rootvp changed...
1093	* bump the iocount and fix up mnt_devvp for the
1094	* new rootvp (it will already have a usecount taken)...
1095	* drop the iocount and the usecount on the orignal
1096	* since we are no longer going to use it...
1097	*/
1098	vnode_getwithref(rootvp);
1099	mp->mnt_devvp = rootvp;
1100
1101	vnode_rele(bdevvp_rootvp);
1102	vnode_put(bdevvp_rootvp);
1103	}
1104	mp->mnt_devvp->v_specflags \|= SI_MOUNTEDON;
1105
1106	vfs_unbusy(mp);
1107
1108	mount_list_add(mp);
1109
1110	/*
1111	* cache the IO attributes for the underlying physical media...
1112	* an error return indicates the underlying driver doesn't
1113	* support all the queries necessary... however, reasonable
1114	* defaults will have been set, so no reason to bail or care
1115	*/
1116	vfs_init_io_attributes(rootvp, mp);
1117
1118	if (mp->mnt_ioflags & MNT_IOFLAGS_FUSION_DRIVE) {
1119	root_is_CF_drive = TRUE;
1120	}
1121
1122	/*
1123	* Shadow the VFC_VFSNATIVEXATTR flag to MNTK_EXTENDED_ATTRS.
1124	*/
1125	if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSNATIVEXATTR) {
1126	mp->mnt_kern_flag \|= MNTK_EXTENDED_ATTRS;
1127	}
1128	if (mp->mnt_vtable->vfc_vfsflags & VFC_VFSPREFLIGHT) {
1129	mp->mnt_kern_flag \|= MNTK_UNMOUNT_PREFLIGHT;
1130	}
1131
1132	#if !CONFIG_EMBEDDED
1133	uint32_t speed;
1134
1135	if (MNTK_VIRTUALDEV & mp->mnt_kern_flag) speed = `128`;
1136	else if (disk_conditioner_mount_is_ssd(mp)) speed = `7`*`256`;
1137	else speed = `256`;
1138	vc_progress_setdiskspeed(speed);
1139	#endif
1140	/*
1141	* Probe root file system for additional features.
1142	*/
1143	(void)VFS_START(mp, `0`, ctx);
1144
1145	VFSATTR_INIT(&vfsattr);
1146	VFSATTR_WANTED(&vfsattr, f_capabilities);
1147	if (vfs_getattr(mp, &vfsattr, ctx) == `0` &&
1148	VFSATTR_IS_SUPPORTED(&vfsattr, f_capabilities)) {
1149	if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) &&
1150	(vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) {
1151	mp->mnt_kern_flag \|= MNTK_EXTENDED_ATTRS;
1152	}
1153	#if NAMEDSTREAMS
1154	if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS) &&
1155	(vfsattr.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_NAMEDSTREAMS)) {
1156	mp->mnt_kern_flag \|= MNTK_NAMED_STREAMS;
1157	}
1158	#endif
1159	if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID) &&
1160	(vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_PATH_FROM_ID)) {
1161	mp->mnt_kern_flag \|= MNTK_PATH_FROM_ID;
1162	}
1163
1164	if ((vfsattr.f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS) &&
1165	(vfsattr.f_capabilities.valid[VOL_CAPABILITIES_FORMAT] & VOL_CAP_FMT_DIR_HARDLINKS)) {
1166	mp->mnt_kern_flag \|= MNTK_DIR_HARDLINKS;
1167	}
1168	}
1169
1170	/*
1171	* get rid of iocount reference returned
1172	* by bdevvp (or picked up by us on the substitued
1173	* rootvp)... it (or we) will have also taken
1174	* a usecount reference which we want to keep
1175	*/
1176	vnode_put(rootvp);
1177
1178	#if CONFIG_MACF
1179	if ((vfs_flags(mp) & MNT_MULTILABEL) == `0`) {
1180	KDBG_RELEASE(DBG_MOUNTROOT \| DBG_FUNC_END, `0`, `2`);
1181	return (`0`);
1182	}
1183
1184	error = VFS_ROOT(mp, &vp, ctx);
1185	if (error) {
1186	printf("%s() VFS_ROOT() returned %d\n",
1187	__func__, error);
1188	dounmount(mp, MNT_FORCE, `0`, ctx);
1189	goto fail;
1190	}
1191	error = vnode_label(mp, NULL, vp, NULL, `0`, ctx);
1192	/*
1193	* get rid of reference provided by VFS_ROOT
1194	*/
1195	vnode_put(vp);
1196
1197	if (error) {
1198	printf("%s() vnode_label() returned %d\n",
1199	__func__, error);
1200	dounmount(mp, MNT_FORCE, `0`, ctx);
1201	goto fail;
1202	}
1203	#endif
1204	KDBG_RELEASE(DBG_MOUNTROOT \| DBG_FUNC_END, `0`, `3`);
1205	return (`0`);
1206	}
1207	#if CONFIG_MACF
1208	fail:
1209	#endif
1210	vfs_rootmountfailed(mp);
1211
1212	if (error != EINVAL)
1213	printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
1214	}
1215	KDBG_RELEASE(DBG_MOUNTROOT \| DBG_FUNC_END, error ? error : ENODEV, `4`);
1216	return (ENODEV);
1217	}
1218
1219	/*
1220	* Lookup a mount point by filesystem identifier.
1221	*/
1222
1223	struct mount *
1224	vfs_getvfs(fsid_t *fsid)
1225	{
1226	return (mount_list_lookupby_fsid(fsid, `0`, `0`));
1227	}
1228
1229	static struct mount *
1230	vfs_getvfs_locked(fsid_t *fsid)
1231	{
1232	return(mount_list_lookupby_fsid(fsid, `1`, `0`));
1233	}
1234
1235	struct mount *
1236	vfs_getvfs_by_mntonname(char *path)
1237	{
1238	mount_t retmp = (mount_t)`0`;
1239	mount_t mp;
1240
1241	mount_list_lock();
1242	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
1243	if (!strncmp(mp->mnt_vfsstat.f_mntonname, path,
1244	sizeof(mp->mnt_vfsstat.f_mntonname))) {
1245	retmp = mp;
1246	if (mount_iterref(retmp, `1`))
1247	retmp = NULL;
1248	goto out;
1249	}
1250	}
1251	out:
1252	mount_list_unlock();
1253	return (retmp);
1254	}
1255
1256	/ generation number for creation of new fsids /
1257	u_short mntid_gen = `0`;
1258	/*
1259	* Get a new unique fsid
1260	*/
1261	void
1262	vfs_getnewfsid(struct mount *mp)
1263	{
1264
1265	fsid_t tfsid;
1266	int mtype;
1267
1268	mount_list_lock();
1269
1270	/ generate a new fsid /
1271	mtype = mp->mnt_vtable->vfc_typenum;
1272	if (++mntid_gen == `0`)
1273	mntid_gen++;
1274	tfsid.val[`0`] = makedev(nblkdev + mtype, mntid_gen);
1275	tfsid.val[`1`] = mtype;
1276
1277	while (vfs_getvfs_locked(&tfsid)) {
1278	if (++mntid_gen == `0`)
1279	mntid_gen++;
1280	tfsid.val[`0`] = makedev(nblkdev + mtype, mntid_gen);
1281	}
1282
1283	mp->mnt_vfsstat.f_fsid.val[`0`] = tfsid.val[`0`];
1284	mp->mnt_vfsstat.f_fsid.val[`1`] = tfsid.val[`1`];
1285	mount_list_unlock();
1286	}
1287
/*
 * Routines having to do with the management of the vnode table.
 */

/* operations vector installed in v_op by vclean() once a vnode is dead */
extern int (**dead_vnodeop_p)(void *);

/*
 * Vnode population counters.  Within this file freevnodes, deadvnodes and
 * async_work_vnodes are incremented when a vnode is inserted on the free,
 * dead and async-work lists respectively.
 */
long numvnodes;
long freevnodes;
long deadvnodes;
long async_work_vnodes;

int async_work_timed_out = 0;
int async_work_handled = 0;
/* vnode_list_add() decrements this and issues a wakeup when a dead vnode arrives */
int dead_vnode_wanted = 0;
int dead_vnode_waited = 0;
1299
1300	/*
1301	* Move a vnode from one mount queue to another.
1302	*/
1303	static void
1304	insmntque(vnode_t vp, mount_t mp)
1305	{
1306	mount_t lmp;
1307	/*
1308	* Delete from old mount point vnode list, if on one.
1309	*/
1310	if ( (lmp = vp->v_mount) != NULL && lmp != dead_mountp) {
1311	if ((vp->v_lflag & VNAMED_MOUNT) == `0`)
1312	panic("insmntque: vp not in mount vnode list");
1313	vp->v_lflag &= ~VNAMED_MOUNT;
1314
1315	mount_lock_spin(lmp);
1316
1317	mount_drop(lmp, `1`);
1318
1319	if (vp->v_mntvnodes.tqe_next == NULL) {
1320	if (TAILQ_LAST(&lmp->mnt_vnodelist, vnodelst) == vp)
1321	TAILQ_REMOVE(&lmp->mnt_vnodelist, vp, v_mntvnodes);
1322	else if (TAILQ_LAST(&lmp->mnt_newvnodes, vnodelst) == vp)
1323	TAILQ_REMOVE(&lmp->mnt_newvnodes, vp, v_mntvnodes);
1324	else if (TAILQ_LAST(&lmp->mnt_workerqueue, vnodelst) == vp)
1325	TAILQ_REMOVE(&lmp->mnt_workerqueue, vp, v_mntvnodes);
1326	} else {
1327	vp->v_mntvnodes.tqe_next->v_mntvnodes.tqe_prev = vp->v_mntvnodes.tqe_prev;
1328	*vp->v_mntvnodes.tqe_prev = vp->v_mntvnodes.tqe_next;
1329	}
1330	vp->v_mntvnodes.tqe_next = NULL;
1331	vp->v_mntvnodes.tqe_prev = NULL;
1332	mount_unlock(lmp);
1333	return;
1334	}
1335
1336	/*
1337	* Insert into list of vnodes for the new mount point, if available.
1338	*/
1339	if ((vp->v_mount = mp) != NULL) {
1340	mount_lock_spin(mp);
1341	if ((vp->v_mntvnodes.tqe_next != `0`) && (vp->v_mntvnodes.tqe_prev != `0`))
1342	panic("vp already in mount list");
1343	if (mp->mnt_lflag & MNT_LITER)
1344	TAILQ_INSERT_HEAD(&mp->mnt_newvnodes, vp, v_mntvnodes);
1345	else
1346	TAILQ_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
1347	if (vp->v_lflag & VNAMED_MOUNT)
1348	panic("insmntque: vp already in mount vnode list");
1349	vp->v_lflag \|= VNAMED_MOUNT;
1350	mount_ref(mp, `1`);
1351	mount_unlock(mp);
1352	}
1353	}
1354
1355
1356	/*
1357	* Create a vnode for a block device.
1358	* Used for root filesystem, argdev, and swap areas.
1359	* Also used for memory file system special devices.
1360	*/
1361	int
1362	bdevvp(dev_t dev, vnode_t *vpp)
1363	{
1364	vnode_t nvp;
1365	int error;
1366	struct vnode_fsparam vfsp;
1367	struct vfs_context context;
1368
1369	if (dev == NODEV) {
1370	*vpp = NULLVP;
1371	return (ENODEV);
1372	}
1373
1374	context.vc_thread = current_thread();
1375	context.vc_ucred = FSCRED;
1376
1377	vfsp.vnfs_mp = (struct mount *)`0`;
1378	vfsp.vnfs_vtype = VBLK;
1379	vfsp.vnfs_str = "bdevvp";
1380	vfsp.vnfs_dvp = NULL;
1381	vfsp.vnfs_fsnode = NULL;
1382	vfsp.vnfs_cnp = NULL;
1383	vfsp.vnfs_vops = spec_vnodeop_p;
1384	vfsp.vnfs_rdev = dev;
1385	vfsp.vnfs_filesize = `0`;
1386
1387	vfsp.vnfs_flags = VNFS_NOCACHE \| VNFS_CANTCACHE;
1388
1389	vfsp.vnfs_marksystem = `0`;
1390	vfsp.vnfs_markroot = `0`;
1391
1392	if ( (error = vnode_create(VNCREATE_FLAVOR, VCREATESIZE, &vfsp, &nvp)) ) {
1393	*vpp = NULLVP;
1394	return (error);
1395	}
1396	vnode_lock_spin(nvp);
1397	nvp->v_flag \|= VBDEVVP;
1398	nvp->v_tag = VT_NON; / set this to VT_NON so during aliasing it can be replaced /
1399	vnode_unlock(nvp);
1400	if ( (error = vnode_ref(nvp)) ) {
1401	panic("bdevvp failed: vnode_ref");
1402	return (error);
1403	}
1404	if ( (error = VNOP_FSYNC(nvp, MNT_WAIT, &context)) ) {
1405	panic("bdevvp failed: fsync");
1406	return (error);
1407	}
1408	if ( (error = buf_invalidateblks(nvp, BUF_WRITE_DATA, `0`, `0`)) ) {
1409	panic("bdevvp failed: invalidateblks");
1410	return (error);
1411	}
1412
1413	#if CONFIG_MACF
1414	/*
1415	* XXXMAC: We can't put a MAC check here, the system will
1416	* panic without this vnode.
1417	*/
1418	#endif /* MAC */
1419
1420	if ( (error = VNOP_OPEN(nvp, FREAD, &context)) ) {
1421	panic("bdevvp failed: open");
1422	return (error);
1423	}
1424	*vpp = nvp;
1425
1426	return (`0`);
1427	}
1428
1429	/*
1430	* Check to see if the new vnode represents a special device
1431	* for which we already have a vnode (either because of
1432	* bdevvp() or because of a different vnode representing
1433	* the same block device). If such an alias exists, deallocate
1434	* the existing contents and return the aliased vnode. The
1435	* caller is responsible for filling it with its new contents.
1436	*/
1437	static vnode_t
1438	checkalias(struct vnode *nvp, dev_t nvp_rdev)
1439	{
1440	struct vnode *vp;
1441	struct vnode **vpp;
1442	struct specinfo *sin = NULL;
1443	int vid = `0`;
1444
1445	vpp = &speclisth[SPECHASH(nvp_rdev)];
1446	loop:
1447	SPECHASH_LOCK();
1448
1449	for (vp = *vpp; vp; vp = vp->v_specnext) {
1450	if (nvp_rdev == vp->v_rdev && nvp->v_type == vp->v_type) {
1451	vid = vp->v_id;
1452	break;
1453	}
1454	}
1455	SPECHASH_UNLOCK();
1456
1457	if (vp) {
1458	found_alias:
1459	if (vnode_getwithvid(vp,vid)) {
1460	goto loop;
1461	}
1462	/*
1463	* Termination state is checked in vnode_getwithvid
1464	*/
1465	vnode_lock(vp);
1466
1467	/*
1468	* Alias, but not in use, so flush it out.
1469	*/
1470	if ((vp->v_iocount == `1`) && (vp->v_usecount == `0`)) {
1471	vnode_reclaim_internal(vp, `1`, `1`, `0`);
1472	vnode_put_locked(vp);
1473	vnode_unlock(vp);
1474	goto loop;
1475	}
1476
1477	}
1478	if (vp == NULL \|\| vp->v_tag != VT_NON) {
1479	if (sin == NULL) {
1480	MALLOC_ZONE(sin, struct specinfo , sizeof(struct* specinfo),
1481	M_SPECINFO, M_WAITOK);
1482	}
1483
1484	nvp->v_specinfo = sin;
1485	bzero(nvp->v_specinfo, sizeof(struct specinfo));
1486	nvp->v_rdev = nvp_rdev;
1487	nvp->v_specflags = `0`;
1488	nvp->v_speclastr = -`1`;
1489	nvp->v_specinfo->si_opencount = `0`;
1490	nvp->v_specinfo->si_initted = `0`;
1491	nvp->v_specinfo->si_throttleable = `0`;
1492
1493	SPECHASH_LOCK();
1494
1495	/ We dropped the lock, someone could have added /
1496	if (vp == NULLVP) {
1497	for (vp = *vpp; vp; vp = vp->v_specnext) {
1498	if (nvp_rdev == vp->v_rdev && nvp->v_type == vp->v_type) {
1499	vid = vp->v_id;
1500	SPECHASH_UNLOCK();
1501	goto found_alias;
1502	}
1503	}
1504	}
1505
1506	nvp->v_hashchain = vpp;
1507	nvp->v_specnext = *vpp;
1508	*vpp = nvp;
1509
1510	if (vp != NULLVP) {
1511	nvp->v_specflags \|= SI_ALIASED;
1512	vp->v_specflags \|= SI_ALIASED;
1513	SPECHASH_UNLOCK();
1514	vnode_put_locked(vp);
1515	vnode_unlock(vp);
1516	} else {
1517	SPECHASH_UNLOCK();
1518	}
1519
1520	return (NULLVP);
1521	}
1522
1523	if (sin) {
1524	FREE_ZONE(sin, sizeof(struct specinfo), M_SPECINFO);
1525	}
1526
1527	if ((vp->v_flag & (VBDEVVP \| VDEVFLUSH)) != `0`)
1528	return(vp);
1529
1530	panic("checkalias with VT_NON vp that shouldn't: %p", vp);
1531
1532	return (vp);
1533	}
1534
1535
1536	/*
1537	* Get a reference on a particular vnode and lock it if requested.
1538	* If the vnode was on the inactive list, remove it from the list.
1539	* If the vnode was on the free list, remove it from the list and
1540	* move it to inactive list as needed.
1541	* The vnode lock bit is set if the vnode is being eliminated in
1542	* vgone. The process is awakened when the transition is completed,
1543	* and an error returned to indicate that the vnode is no longer
1544	* usable (possibly having been changed to a new file system type).
1545	*/
1546	int
1547	vget_internal(vnode_t vp, int vid, int vflags)
1548	{
1549	int error = `0`;
1550
1551	vnode_lock_spin(vp);
1552
1553	if ((vflags & VNODE_WRITEABLE) && (vp->v_writecount == `0`))
1554	/*
1555	* vnode to be returned only if it has writers opened
1556	*/
1557	error = EINVAL;
1558	else
1559	error = vnode_getiocount(vp, vid, vflags);
1560
1561	vnode_unlock(vp);
1562
1563	return (error);
1564	}
1565
1566	/*
1567	* Returns: 0 Success
1568	* ENOENT No such file or directory [terminating]
1569	*/
1570	int
1571	vnode_ref(vnode_t vp)
1572	{
1573
1574	return (vnode_ref_ext(vp, `0`, `0`));
1575	}
1576
1577	/*
1578	* Returns: 0 Success
1579	* ENOENT No such file or directory [terminating]
1580	*/
1581	int
1582	vnode_ref_ext(vnode_t vp, int fmode, int flags)
1583	{
1584	int error = `0`;
1585
1586	vnode_lock_spin(vp);
1587
1588	/*
1589	* once all the current call sites have been fixed to insure they have
1590	* taken an iocount, we can toughen this assert up and insist that the
1591	* iocount is non-zero... a non-zero usecount doesn't insure correctness
1592	*/
1593	if (vp->v_iocount <= `0` && vp->v_usecount <= `0`)
1594	panic("vnode_ref_ext: vp %p has no valid reference %d, %d", vp, vp->v_iocount, vp->v_usecount);
1595
1596	/*
1597	* if you are the owner of drain/termination, can acquire usecount
1598	*/
1599	if ((flags & VNODE_REF_FORCE) == `0`) {
1600	if ((vp->v_lflag & (VL_DRAIN \| VL_TERMINATE \| VL_DEAD))) {
1601	if (vp->v_owner != current_thread()) {
1602	error = ENOENT;
1603	goto out;
1604	}
1605	}
1606	}
1607	vp->v_usecount++;
1608
1609	if (fmode & FWRITE) {
1610	if (++vp->v_writecount <= `0`)
1611	panic("vnode_ref_ext: v_writecount");
1612	}
1613	if (fmode & O_EVTONLY) {
1614	if (++vp->v_kusecount <= `0`)
1615	panic("vnode_ref_ext: v_kusecount");
1616	}
1617	if (vp->v_flag & VRAGE) {
1618	struct uthread *ut;
1619
1620	ut = get_bsdthread_info(current_thread());
1621
1622	if ( !(current_proc()->p_lflag & P_LRAGE_VNODES) &&
1623	!(ut->uu_flag & UT_RAGE_VNODES)) {
1624	/*
1625	* a 'normal' process accessed this vnode
1626	* so make sure its no longer marked
1627	* for rapid aging... also, make sure
1628	* it gets removed from the rage list...
1629	* when v_usecount drops back to 0, it
1630	* will be put back on the real free list
1631	*/
1632	vp->v_flag &= ~VRAGE;
1633	vp->v_references = `0`;
1634	vnode_list_remove(vp);
1635	}
1636	}
1637	if (vp->v_usecount == `1` && vp->v_type == VREG && !(vp->v_flag & VSYSTEM)) {
1638
1639	if (vp->v_ubcinfo) {
1640	vnode_lock_convert(vp);
1641	memory_object_mark_used(vp->v_ubcinfo->ui_control);
1642	}
1643	}
1644	out:
1645	vnode_unlock(vp);
1646
1647	return (error);
1648	}
1649
1650
1651	boolean_t
1652	vnode_on_reliable_media(vnode_t vp)
1653	{
1654	if ( !(vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) && (vp->v_mount->mnt_flag & MNT_LOCAL) )
1655	return (TRUE);
1656	return (FALSE);
1657	}
1658
1659	static void
1660	vnode_async_list_add(vnode_t vp)
1661	{
1662	vnode_list_lock();
1663
1664	if (VONLIST(vp) \|\| (vp->v_lflag & (VL_TERMINATE\|VL_DEAD)))
1665	panic("vnode_async_list_add: %p is in wrong state", vp);
1666
1667	TAILQ_INSERT_HEAD(&vnode_async_work_list, vp, v_freelist);
1668	vp->v_listflag \|= VLIST_ASYNC_WORK;
1669
1670	async_work_vnodes++;
1671
1672	vnode_list_unlock();
1673
1674	wakeup(&vnode_async_work_list);
1675
1676	}
1677
1678
1679	/*
1680	* put the vnode on appropriate free list.
1681	* called with vnode LOCKED
1682	*/
1683	static void
1684	vnode_list_add(vnode_t vp)
1685	{
1686	boolean_t need_dead_wakeup = FALSE;
1687
1688	#if DIAGNOSTIC
1689	lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);
1690	#endif
1691
1692	again:
1693
1694	/*
1695	* if it is already on a list or non zero references return
1696	*/
1697	if (VONLIST(vp) \|\| (vp->v_usecount != `0`) \|\| (vp->v_iocount != `0`) \|\| (vp->v_lflag & VL_TERMINATE))
1698	return;
1699
1700	/*
1701	* In vclean, we might have deferred ditching locked buffers
1702	* because something was still referencing them (indicated by
1703	* usecount). We can ditch them now.
1704	*/
1705	if (ISSET(vp->v_lflag, VL_DEAD)
1706	&& (!LIST_EMPTY(&vp->v_cleanblkhd) \|\| !LIST_EMPTY(&vp->v_dirtyblkhd))) {
1707	++vp->v_iocount; // Probably not necessary, but harmless
1708	#ifdef JOE_DEBUG
1709	record_vp(vp, `1`);
1710	#endif
1711	vnode_unlock(vp);
1712	buf_invalidateblks(vp, BUF_INVALIDATE_LOCKED, `0`, `0`);
1713	vnode_lock(vp);
1714	vnode_dropiocount(vp);
1715	goto again;
1716	}
1717
1718	vnode_list_lock();
1719
1720	if ((vp->v_flag & VRAGE) && !(vp->v_lflag & VL_DEAD)) {
1721	/*
1722	* add the new guy to the appropriate end of the RAGE list
1723	*/
1724	if ((vp->v_flag & VAGE))
1725	TAILQ_INSERT_HEAD(&vnode_rage_list, vp, v_freelist);
1726	else
1727	TAILQ_INSERT_TAIL(&vnode_rage_list, vp, v_freelist);
1728
1729	vp->v_listflag \|= VLIST_RAGE;
1730	ragevnodes++;
1731
1732	/*
1733	* reset the timestamp for the last inserted vp on the RAGE
1734	* queue to let new_vnode know that its not ok to start stealing
1735	* from this list... as long as we're actively adding to this list
1736	* we'll push out the vnodes we want to donate to the real free list
1737	* once we stop pushing, we'll let some time elapse before we start
1738	* stealing them in the new_vnode routine
1739	*/
1740	microuptime(&rage_tv);
1741	} else {
1742	/*
1743	* if VL_DEAD, insert it at head of the dead list
1744	* else insert at tail of LRU list or at head if VAGE is set
1745	*/
1746	if ( (vp->v_lflag & VL_DEAD)) {
1747	TAILQ_INSERT_HEAD(&vnode_dead_list, vp, v_freelist);
1748	vp->v_listflag \|= VLIST_DEAD;
1749	deadvnodes++;
1750
1751	if (dead_vnode_wanted) {
1752	dead_vnode_wanted--;
1753	need_dead_wakeup = TRUE;
1754	}
1755
1756	} else if ( (vp->v_flag & VAGE) ) {
1757	TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
1758	vp->v_flag &= ~VAGE;
1759	freevnodes++;
1760	} else {
1761	TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
1762	freevnodes++;
1763	}
1764	}
1765	vnode_list_unlock();
1766
1767	if (need_dead_wakeup == TRUE)
1768	wakeup_one((caddr_t)&dead_vnode_wanted);
1769	}
1770
1771
1772	/*
1773	* remove the vnode from appropriate free list.
1774	* called with vnode LOCKED and
1775	* the list lock held
1776	*/
1777	static void
1778	vnode_list_remove_locked(vnode_t vp)
1779	{
1780	if (VONLIST(vp)) {
1781	/*
1782	* the v_listflag field is
1783	* protected by the vnode_list_lock
1784	*/
1785	if (vp->v_listflag & VLIST_RAGE)
1786	VREMRAGE("vnode_list_remove", vp);
1787	else if (vp->v_listflag & VLIST_DEAD)
1788	VREMDEAD("vnode_list_remove", vp);
1789	else if (vp->v_listflag & VLIST_ASYNC_WORK)
1790	VREMASYNC_WORK("vnode_list_remove", vp);
1791	else
1792	VREMFREE("vnode_list_remove", vp);
1793	}
1794	}
1795
1796
1797	/*
1798	* remove the vnode from appropriate free list.
1799	* called with vnode LOCKED
1800	*/
1801	static void
1802	vnode_list_remove(vnode_t vp)
1803	{
1804	#if DIAGNOSTIC
1805	lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);
1806	#endif
1807	/*
1808	* we want to avoid taking the list lock
1809	* in the case where we're not on the free
1810	* list... this will be true for most
1811	* directories and any currently in use files
1812	*
1813	* we're guaranteed that we can't go from
1814	* the not-on-list state to the on-list
1815	* state since we hold the vnode lock...
1816	* all calls to vnode_list_add are done
1817	* under the vnode lock... so we can
1818	* check for that condition (the prevelant one)
1819	* without taking the list lock
1820	*/
1821	if (VONLIST(vp)) {
1822	vnode_list_lock();
1823	/*
1824	* however, we're not guaranteed that
1825	* we won't go from the on-list state
1826	* to the not-on-list state until we
1827	* hold the vnode_list_lock... this
1828	* is due to "new_vnode" removing vnodes
1829	* from the free list uder the list_lock
1830	* w/o the vnode lock... so we need to
1831	* check again whether we're currently
1832	* on the free list
1833	*/
1834	vnode_list_remove_locked(vp);
1835
1836	vnode_list_unlock();
1837	}
1838	}
1839
1840
1841	void
1842	vnode_rele(vnode_t vp)
1843	{
1844	vnode_rele_internal(vp, `0`, `0`, `0`);
1845	}
1846
1847
1848	void
1849	vnode_rele_ext(vnode_t vp, int fmode, int dont_reenter)
1850	{
1851	vnode_rele_internal(vp, fmode, dont_reenter, `0`);
1852	}
1853
1854
1855	void
1856	vnode_rele_internal(vnode_t vp, int fmode, int dont_reenter, int locked)
1857	{
1858
1859	if ( !locked)
1860	vnode_lock_spin(vp);
1861	#if DIAGNOSTIC
1862	else
1863	lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);
1864	#endif
1865	if (--vp->v_usecount < `0`)
1866	panic("vnode_rele_ext: vp %p usecount -ve : %d. v_tag = %d, v_type = %d, v_flag = %x.", vp, vp->v_usecount, vp->v_tag, vp->v_type, vp->v_flag);
1867
1868	if (fmode & FWRITE) {
1869	if (--vp->v_writecount < `0`)
1870	panic("vnode_rele_ext: vp %p writecount -ve : %d. v_tag = %d, v_type = %d, v_flag = %x.", vp, vp->v_writecount, vp->v_tag, vp->v_type, vp->v_flag);
1871	}
1872	if (fmode & O_EVTONLY) {
1873	if (--vp->v_kusecount < `0`)
1874	panic("vnode_rele_ext: vp %p kusecount -ve : %d. v_tag = %d, v_type = %d, v_flag = %x.", vp, vp->v_kusecount, vp->v_tag, vp->v_type, vp->v_flag);
1875	}
1876	if (vp->v_kusecount > vp->v_usecount)
1877	panic("vnode_rele_ext: vp %p kusecount(%d) out of balance with usecount(%d). v_tag = %d, v_type = %d, v_flag = %x.",vp, vp->v_kusecount, vp->v_usecount, vp->v_tag, vp->v_type, vp->v_flag);
1878
1879	if ((vp->v_iocount > `0`) \|\| (vp->v_usecount > `0`)) {
1880	/*
1881	* vnode is still busy... if we're the last
1882	* usecount, mark for a future call to VNOP_INACTIVE
1883	* when the iocount finally drops to 0
1884	*/
1885	if (vp->v_usecount == `0`) {
1886	vp->v_lflag \|= VL_NEEDINACTIVE;
1887	vp->v_flag &= ~(VNOCACHE_DATA \| VRAOFF \| VOPENEVT);
1888	}
1889	goto done;
1890	}
1891	vp->v_flag &= ~(VNOCACHE_DATA \| VRAOFF \| VOPENEVT);
1892
1893	if (ISSET(vp->v_lflag, VL_TERMINATE \| VL_DEAD) \|\| dont_reenter) {
1894	/*
1895	* vnode is being cleaned, or
1896	* we've requested that we don't reenter
1897	* the filesystem on this release...in
1898	* the latter case, we'll mark the vnode aged
1899	*/
1900	if (dont_reenter) {
1901	if ( !(vp->v_lflag & (VL_TERMINATE \| VL_DEAD \| VL_MARKTERM)) ) {
1902	vp->v_lflag \|= VL_NEEDINACTIVE;
1903
1904	if (vnode_on_reliable_media(vp) == FALSE \|\| vp->v_flag & VISDIRTY) {
1905	vnode_async_list_add(vp);
1906	goto done;
1907	}
1908	}
1909	vp->v_flag \|= VAGE;
1910	}
1911	vnode_list_add(vp);
1912
1913	goto done;
1914	}
1915	/*
1916	* at this point both the iocount and usecount
1917	* are zero
1918	* pick up an iocount so that we can call
1919	* VNOP_INACTIVE with the vnode lock unheld
1920	*/
1921	vp->v_iocount++;
1922	#ifdef JOE_DEBUG
1923	record_vp(vp, `1`);
1924	#endif
1925	vp->v_lflag &= ~VL_NEEDINACTIVE;
1926	vnode_unlock(vp);
1927
1928	VNOP_INACTIVE(vp, vfs_context_current());
1929
1930	vnode_lock_spin(vp);
1931	/*
1932	* because we dropped the vnode lock to call VNOP_INACTIVE
1933	* the state of the vnode may have changed... we may have
1934	* picked up an iocount, usecount or the MARKTERM may have
1935	* been set... we need to reevaluate the reference counts
1936	* to determine if we can call vnode_reclaim_internal at
1937	* this point... if the reference counts are up, we'll pick
1938	* up the MARKTERM state when they get subsequently dropped
1939	*/
1940	if ( (vp->v_iocount == `1`) && (vp->v_usecount == `0`) &&
1941	((vp->v_lflag & (VL_MARKTERM \| VL_TERMINATE \| VL_DEAD)) == VL_MARKTERM)) {
1942	struct uthread *ut;
1943
1944	ut = get_bsdthread_info(current_thread());
1945
1946	if (ut->uu_defer_reclaims) {
1947	vp->v_defer_reclaimlist = ut->uu_vreclaims;
1948	ut->uu_vreclaims = vp;
1949	goto done;
1950	}
1951	vnode_lock_convert(vp);
1952	vnode_reclaim_internal(vp, `1`, `1`, `0`);
1953	}
1954	vnode_dropiocount(vp);
1955	vnode_list_add(vp);
1956	done:
1957	if (vp->v_usecount == `0` && vp->v_type == VREG && !(vp->v_flag & VSYSTEM)) {
1958
1959	if (vp->v_ubcinfo) {
1960	vnode_lock_convert(vp);
1961	memory_object_mark_unused(vp->v_ubcinfo->ui_control, (vp->v_flag & VRAGE) == VRAGE);
1962	}
1963	}
1964	if ( !locked)
1965	vnode_unlock(vp);
1966	return;
1967	}
1968
1969	/*
1970	* Remove any vnodes in the vnode table belonging to mount point mp.
1971	*
1972	* If MNT_NOFORCE is specified, there should not be any active ones,
1973	* return error if any are found (nb: this is a user error, not a
1974	* system error). If MNT_FORCE is specified, detach any active vnodes
1975	* that are found.
1976	*/
1977	#if DIAGNOSTIC
1978	int busyprt = `0`; / print out busy vnodes /
1979	#endif
1980
1981	int
1982	vflush(struct mount mp, struct* vnode skipvp, int* flags)
1983	{
1984	struct vnode *vp;
1985	int busy = `0`;
1986	int reclaimed = `0`;
1987	int retval;
1988	unsigned int vid;
1989
1990	/*
1991	* See comments in vnode_iterate() for the rationale for this lock
1992	*/
1993	mount_iterate_lock(mp);
1994
1995	mount_lock(mp);
1996	vnode_iterate_setup(mp);
1997	/*
1998	* On regular unmounts(not forced) do a
1999	* quick check for vnodes to be in use. This
2000	* preserves the caching of vnodes. automounter
2001	* tries unmounting every so often to see whether
2002	* it is still busy or not.
2003	*/
2004	if (((flags & FORCECLOSE)==`0`) && ((mp->mnt_kern_flag & MNTK_UNMOUNT_PREFLIGHT) != `0`)) {
2005	if (vnode_umount_preflight(mp, skipvp, flags)) {
2006	vnode_iterate_clear(mp);
2007	mount_unlock(mp);
2008	mount_iterate_unlock(mp);
2009	return(EBUSY);
2010	}
2011	}
2012	loop:
2013	/ If it returns 0 then there is nothing to do /
2014	retval = vnode_iterate_prepare(mp);
2015
2016	if (retval == `0`) {
2017	vnode_iterate_clear(mp);
2018	mount_unlock(mp);
2019	mount_iterate_unlock(mp);
2020	return(retval);
2021	}
2022
2023	/ iterate over all the vnodes /
2024	while (!TAILQ_EMPTY(&mp->mnt_workerqueue)) {
2025
2026	vp = TAILQ_FIRST(&mp->mnt_workerqueue);
2027	TAILQ_REMOVE(&mp->mnt_workerqueue, vp, v_mntvnodes);
2028	TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);
2029
2030	if ( (vp->v_mount != mp) \|\| (vp == skipvp)) {
2031	continue;
2032	}
2033	vid = vp->v_id;
2034	mount_unlock(mp);
2035
2036	vnode_lock_spin(vp);
2037
2038	// If vnode is already terminating, wait for it...
2039	while (vp->v_id == vid && ISSET(vp->v_lflag, VL_TERMINATE)) {
2040	vp->v_lflag \|= VL_TERMWANT;
2041	msleep(&vp->v_lflag, &vp->v_lock, PVFS, "vflush", NULL);
2042	}
2043
2044	if ((vp->v_id != vid) \|\| ISSET(vp->v_lflag, VL_DEAD)) {
2045	vnode_unlock(vp);
2046	mount_lock(mp);
2047	continue;
2048	}
2049
2050	/*
2051	* If requested, skip over vnodes marked VSYSTEM.
2052	* Skip over all vnodes marked VNOFLUSH.
2053	*/
2054	if ((flags & SKIPSYSTEM) && ((vp->v_flag & VSYSTEM) \|\|
2055	(vp->v_flag & VNOFLUSH))) {
2056	vnode_unlock(vp);
2057	mount_lock(mp);
2058	continue;
2059	}
2060	/*
2061	* If requested, skip over vnodes marked VSWAP.
2062	*/
2063	if ((flags & SKIPSWAP) && (vp->v_flag & VSWAP)) {
2064	vnode_unlock(vp);
2065	mount_lock(mp);
2066	continue;
2067	}
2068	/*
2069	* If requested, skip over vnodes marked VROOT.
2070	*/
2071	if ((flags & SKIPROOT) && (vp->v_flag & VROOT)) {
2072	vnode_unlock(vp);
2073	mount_lock(mp);
2074	continue;
2075	}
2076	/*
2077	* If WRITECLOSE is set, only flush out regular file
2078	* vnodes open for writing.
2079	*/
2080	if ((flags & WRITECLOSE) &&
2081	(vp->v_writecount == `0` \|\| vp->v_type != VREG)) {
2082	vnode_unlock(vp);
2083	mount_lock(mp);
2084	continue;
2085	}
2086	/*
2087	* If the real usecount is 0, all we need to do is clear
2088	* out the vnode data structures and we are done.
2089	*/
2090	if (((vp->v_usecount == `0`) \|\|
2091	((vp->v_usecount - vp->v_kusecount) == `0`))) {
2092
2093	vnode_lock_convert(vp);
2094	vp->v_iocount++; / so that drain waits for * other iocounts /
2095	#ifdef JOE_DEBUG
2096	record_vp(vp, `1`);
2097	#endif
2098	vnode_reclaim_internal(vp, `1`, `1`, `0`);
2099	vnode_dropiocount(vp);
2100	vnode_list_add(vp);
2101	vnode_unlock(vp);
2102
2103	reclaimed++;
2104	mount_lock(mp);
2105	continue;
2106	}
2107	/*
2108	* If FORCECLOSE is set, forcibly close the vnode.
2109	* For block or character devices, revert to an
2110	* anonymous device. For all other files, just kill them.
2111	*/
2112	if (flags & FORCECLOSE) {
2113	vnode_lock_convert(vp);
2114
2115	if (vp->v_type != VBLK && vp->v_type != VCHR) {
2116	vp->v_iocount++; / so that drain waits * for other iocounts /
2117	#ifdef JOE_DEBUG
2118	record_vp(vp, `1`);
2119	#endif
2120	vnode_abort_advlocks(vp);
2121	vnode_reclaim_internal(vp, `1`, `1`, `0`);
2122	vnode_dropiocount(vp);
2123	vnode_list_add(vp);
2124	vnode_unlock(vp);
2125	} else {
2126	vclean(vp, `0`);
2127	vp->v_lflag &= ~VL_DEAD;
2128	vp->v_op = spec_vnodeop_p;
2129	vp->v_flag \|= VDEVFLUSH;
2130	vnode_unlock(vp);
2131	}
2132	mount_lock(mp);
2133	continue;
2134	}
2135	#if DIAGNOSTIC
2136	if (busyprt)
2137	vprint("vflush: busy vnode", vp);
2138	#endif
2139	vnode_unlock(vp);
2140	mount_lock(mp);
2141	busy++;
2142	}
2143
2144	/ At this point the worker queue is completed /
2145	if (busy && ((flags & FORCECLOSE)==`0`) && reclaimed) {
2146	busy = `0`;
2147	reclaimed = `0`;
2148	(void)vnode_iterate_reloadq(mp);
2149	/ returned with mount lock held /
2150	goto loop;
2151	}
2152
2153	/ if new vnodes were created in between retry the reclaim /
2154	if ( vnode_iterate_reloadq(mp) != `0`) {
2155	if (!(busy && ((flags & FORCECLOSE)==`0`)))
2156	goto loop;
2157	}
2158	vnode_iterate_clear(mp);
2159	mount_unlock(mp);
2160	mount_iterate_unlock(mp);
2161
2162	if (busy && ((flags & FORCECLOSE)==`0`))
2163	return (EBUSY);
2164	return (`0`);
2165	}
2166
2167	long num_recycledvnodes = `0`;
2168	/*
2169	* Disassociate the underlying file system from a vnode.
2170	* The vnode lock is held on entry.
2171	*/
2172	static void
2173	vclean(vnode_t vp, int flags)
2174	{
2175	vfs_context_t ctx = vfs_context_current();
2176	int active;
2177	int need_inactive;
2178	int already_terminating;
2179	int clflags = `0`;
2180	#if NAMEDSTREAMS
2181	int is_namedstream;
2182	#endif
2183
2184	/*
2185	* Check to see if the vnode is in use.
2186	* If so we have to reference it before we clean it out
2187	* so that its count cannot fall to zero and generate a
2188	* race against ourselves to recycle it.
2189	*/
2190	active = vp->v_usecount;
2191
2192	/*
2193	* just in case we missed sending a needed
2194	* VNOP_INACTIVE, we'll do it now
2195	*/
2196	need_inactive = (vp->v_lflag & VL_NEEDINACTIVE);
2197
2198	vp->v_lflag &= ~VL_NEEDINACTIVE;
2199
2200	/*
2201	* Prevent the vnode from being recycled or
2202	* brought into use while we clean it out.
2203	*/
2204	already_terminating = (vp->v_lflag & VL_TERMINATE);
2205
2206	vp->v_lflag \|= VL_TERMINATE;
2207
2208	#if NAMEDSTREAMS
2209	is_namedstream = vnode_isnamedstream(vp);
2210	#endif
2211
2212	vnode_unlock(vp);
2213
2214	OSAddAtomicLong(`1`, &num_recycledvnodes);
2215
2216	if (flags & DOCLOSE)
2217	clflags \|= IO_NDELAY;
2218	if (flags & REVOKEALL)
2219	clflags \|= IO_REVOKE;
2220
2221	if (active && (flags & DOCLOSE))
2222	VNOP_CLOSE(vp, clflags, ctx);
2223
2224	/*
2225	* Clean out any buffers associated with the vnode.
2226	*/
2227	if (flags & DOCLOSE) {
2228	#if NFSCLIENT
2229	if (vp->v_tag == VT_NFS)
2230	nfs_vinvalbuf(vp, V_SAVE, ctx, `0`);
2231	else
2232	#endif
2233	{
2234	VNOP_FSYNC(vp, MNT_WAIT, ctx);
2235
2236	/*
2237	* If the vnode is still in use (by the journal for
2238	* example) we don't want to invalidate locked buffers
2239	* here. In that case, either the journal will tidy them
2240	* up, or we will deal with it when the usecount is
2241	* finally released in vnode_rele_internal.
2242	*/
2243	buf_invalidateblks(vp, BUF_WRITE_DATA \| (active ? `0` : BUF_INVALIDATE_LOCKED), `0`, `0`);
2244	}
2245	if (UBCINFOEXISTS(vp))
2246	/*
2247	* Clean the pages in VM.
2248	*/
2249	(void)ubc_msync(vp, (off_t)`0`, ubc_getsize(vp), NULL, UBC_PUSHALL \| UBC_INVALIDATE \| UBC_SYNC);
2250	}
2251	if (active \|\| need_inactive)
2252	VNOP_INACTIVE(vp, ctx);
2253
2254	#if NAMEDSTREAMS
2255	if ((is_namedstream != `0`) && (vp->v_parent != NULLVP)) {
2256	vnode_t pvp = vp->v_parent;
2257
2258	/ Delete the shadow stream file before we reclaim its vnode /
2259	if (vnode_isshadow(vp)) {
2260	vnode_relenamedstream(pvp, vp);
2261	}
2262
2263	/*
2264	* No more streams associated with the parent. We
2265	* have a ref on it, so its identity is stable.
2266	* If the parent is on an opaque volume, then we need to know
2267	* whether it has associated named streams.
2268	*/
2269	if (vfs_authopaque(pvp->v_mount)) {
2270	vnode_lock_spin(pvp);
2271	pvp->v_lflag &= ~VL_HASSTREAMS;
2272	vnode_unlock(pvp);
2273	}
2274	}
2275	#endif
2276
2277	/*
2278	* Destroy ubc named reference
2279	* cluster_release is done on this path
2280	* along with dropping the reference on the ucred
2281	* (and in the case of forced unmount of an mmap-ed file,
2282	* the ubc reference on the vnode is dropped here too).
2283	*/
2284	ubc_destroy_named(vp);
2285
2286	#if CONFIG_TRIGGERS
2287	/*
2288	* cleanup trigger info from vnode (if any)
2289	*/
2290	if (vp->v_resolve)
2291	vnode_resolver_detach(vp);
2292	#endif
2293
2294	/*
2295	* Reclaim the vnode.
2296	*/
2297	if (VNOP_RECLAIM(vp, ctx))
2298	panic("vclean: cannot reclaim");
2299
2300	// make sure the name & parent ptrs get cleaned out!
2301	vnode_update_identity(vp, NULLVP, NULL, `0`, `0`, VNODE_UPDATE_PARENT \| VNODE_UPDATE_NAME \| VNODE_UPDATE_PURGE);
2302
2303	vnode_lock(vp);
2304
2305	/*
2306	* Remove the vnode from any mount list it might be on. It is not
2307	* safe to do this any earlier because unmount needs to wait for
2308	* any vnodes to terminate and it cannot do that if it cannot find
2309	* them.
2310	*/
2311	insmntque(vp, (struct mount *)`0`);
2312
2313	vp->v_mount = dead_mountp;
2314	vp->v_op = dead_vnodeop_p;
2315	vp->v_tag = VT_NON;
2316	vp->v_data = NULL;
2317
2318	vp->v_lflag \|= VL_DEAD;
2319	vp->v_flag &= ~VISDIRTY;
2320
2321	if (already_terminating == `0`) {
2322	vp->v_lflag &= ~VL_TERMINATE;
2323	/*
2324	* Done with purge, notify sleepers of the grim news.
2325	*/
2326	if (vp->v_lflag & VL_TERMWANT) {
2327	vp->v_lflag &= ~VL_TERMWANT;
2328	wakeup(&vp->v_lflag);
2329	}
2330	}
2331	}
2332
2333	/*
2334	* Eliminate all activity associated with the requested vnode
2335	* and with all vnodes aliased to the requested vnode.
2336	*/
2337	int
2338	#if DIAGNOSTIC
2339	vn_revoke(vnode_t vp, int flags, __unused vfs_context_t a_context)
2340	#else
2341	vn_revoke(vnode_t vp, __unused int flags, __unused vfs_context_t a_context)
2342	#endif
2343	{
2344	struct vnode *vq;
2345	int vid;
2346
2347	#if DIAGNOSTIC
2348	if ((flags & REVOKEALL) == `0`)
2349	panic("vnop_revoke");
2350	#endif
2351
2352	if (vnode_isaliased(vp)) {
2353	/*
2354	* If a vgone (or vclean) is already in progress,
2355	* return an immediate error
2356	*/
2357	if (vp->v_lflag & VL_TERMINATE)
2358	return(ENOENT);
2359
2360	/*
2361	* Ensure that vp will not be vgone'd while we
2362	* are eliminating its aliases.
2363	*/
2364	SPECHASH_LOCK();
2365	while ((vp->v_specflags & SI_ALIASED)) {
2366	for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
2367	if (vq->v_rdev != vp->v_rdev \|\|
2368	vq->v_type != vp->v_type \|\| vp == vq)
2369	continue;
2370	vid = vq->v_id;
2371	SPECHASH_UNLOCK();
2372	if (vnode_getwithvid(vq,vid)){
2373	SPECHASH_LOCK();
2374	break;
2375	}
2376	vnode_lock(vq);
2377	if (!(vq->v_lflag & VL_TERMINATE)) {
2378	vnode_reclaim_internal(vq, `1`, `1`, `0`);
2379	}
2380	vnode_put_locked(vq);
2381	vnode_unlock(vq);
2382	SPECHASH_LOCK();
2383	break;
2384	}
2385	}
2386	SPECHASH_UNLOCK();
2387	}
2388	vnode_lock(vp);
2389	if (vp->v_lflag & VL_TERMINATE) {
2390	vnode_unlock(vp);
2391	return (ENOENT);
2392	}
2393	vnode_reclaim_internal(vp, `1`, `0`, REVOKEALL);
2394	vnode_unlock(vp);
2395
2396	return (`0`);
2397	}
2398
2399	/*
2400	* Recycle an unused vnode to the front of the free list.
2401	* Release the passed interlock if the vnode will be recycled.
2402	*/
2403	int
2404	vnode_recycle(struct vnode *vp)
2405	{
2406	vnode_lock_spin(vp);
2407
2408	if (vp->v_iocount \|\| vp->v_usecount) {
2409	vp->v_lflag \|= VL_MARKTERM;
2410	vnode_unlock(vp);
2411	return(`0`);
2412	}
2413	vnode_lock_convert(vp);
2414	vnode_reclaim_internal(vp, `1`, `0`, `0`);
2415
2416	vnode_unlock(vp);
2417
2418	return (`1`);
2419	}
2420
2421	static int
2422	vnode_reload(vnode_t vp)
2423	{
2424	vnode_lock_spin(vp);
2425
2426	if ((vp->v_iocount > `1`) \|\| vp->v_usecount) {
2427	vnode_unlock(vp);
2428	return(`0`);
2429	}
2430	if (vp->v_iocount <= `0`)
2431	panic("vnode_reload with no iocount %d", vp->v_iocount);
2432
2433	/ mark for release when iocount is dopped /
2434	vp->v_lflag \|= VL_MARKTERM;
2435	vnode_unlock(vp);
2436
2437	return (`1`);
2438	}
2439
2440
2441	static void
2442	vgone(vnode_t vp, int flags)
2443	{
2444	struct vnode *vq;
2445	struct vnode *vx;
2446
2447	/*
2448	* Clean out the filesystem specific data.
2449	* vclean also takes care of removing the
2450	* vnode from any mount list it might be on
2451	*/
2452	vclean(vp, flags \| DOCLOSE);
2453
2454	/*
2455	* If special device, remove it from special device alias list
2456	* if it is on one.
2457	*/
2458	if ((vp->v_type == VBLK \|\| vp->v_type == VCHR) && vp->v_specinfo != `0`) {
2459	SPECHASH_LOCK();
2460	if (*vp->v_hashchain == vp) {
2461	*vp->v_hashchain = vp->v_specnext;
2462	} else {
2463	for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
2464	if (vq->v_specnext != vp)
2465	continue;
2466	vq->v_specnext = vp->v_specnext;
2467	break;
2468	}
2469	if (vq == NULL)
2470	panic("missing bdev");
2471	}
2472	if (vp->v_specflags & SI_ALIASED) {
2473	vx = NULL;
2474	for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
2475	if (vq->v_rdev != vp->v_rdev \|\|
2476	vq->v_type != vp->v_type)
2477	continue;
2478	if (vx)
2479	break;
2480	vx = vq;
2481	}
2482	if (vx == NULL)
2483	panic("missing alias");
2484	if (vq == NULL)
2485	vx->v_specflags &= ~SI_ALIASED;
2486	vp->v_specflags &= ~SI_ALIASED;
2487	}
2488	SPECHASH_UNLOCK();
2489	{
2490	struct specinfo *tmp = vp->v_specinfo;
2491	vp->v_specinfo = NULL;
2492	FREE_ZONE((void )tmp, sizeof(struct* specinfo), M_SPECINFO);
2493	}
2494	}
2495	}
2496
2497	/*
2498	* Lookup a vnode by device number.
2499	*/
2500	int
2501	check_mountedon(dev_t dev, enum vtype type, int *errorp)
2502	{
2503	vnode_t vp;
2504	int rc = `0`;
2505	int vid;
2506
2507	loop:
2508	SPECHASH_LOCK();
2509	for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
2510	if (dev != vp->v_rdev \|\| type != vp->v_type)
2511	continue;
2512	vid = vp->v_id;
2513	SPECHASH_UNLOCK();
2514	if (vnode_getwithvid(vp,vid))
2515	goto loop;
2516	vnode_lock_spin(vp);
2517	if ((vp->v_usecount > `0`) \|\| (vp->v_iocount > `1`)) {
2518	vnode_unlock(vp);
2519	if ((*errorp = vfs_mountedon(vp)) != `0`)
2520	rc = `1`;
2521	} else
2522	vnode_unlock(vp);
2523	vnode_put(vp);
2524	return(rc);
2525	}
2526	SPECHASH_UNLOCK();
2527	return (`0`);
2528	}
2529
2530	/*
2531	* Calculate the total number of references to a special device.
2532	*/
2533	int
2534	vcount(vnode_t vp)
2535	{
2536	vnode_t vq, vnext;
2537	int count;
2538	int vid;
2539
2540	if (!vnode_isspec(vp)) {
2541	return (vp->v_usecount - vp->v_kusecount);
2542	}
2543
2544	loop:
2545	if (!vnode_isaliased(vp))
2546	return (vp->v_specinfo->si_opencount);
2547	count = `0`;
2548
2549	SPECHASH_LOCK();
2550	/*
2551	* Grab first vnode and its vid.
2552	*/
2553	vq = *vp->v_hashchain;
2554	vid = vq ? vq->v_id : `0`;
2555
2556	SPECHASH_UNLOCK();
2557
2558	while (vq) {
2559	/*
2560	* Attempt to get the vnode outside the SPECHASH lock.
2561	*/
2562	if (vnode_getwithvid(vq, vid)) {
2563	goto loop;
2564	}
2565	vnode_lock(vq);
2566
2567	if (vq->v_rdev == vp->v_rdev && vq->v_type == vp->v_type) {
2568	if ((vq->v_usecount == `0`) && (vq->v_iocount == `1`) && vq != vp) {
2569	/*
2570	* Alias, but not in use, so flush it out.
2571	*/
2572	vnode_reclaim_internal(vq, `1`, `1`, `0`);
2573	vnode_put_locked(vq);
2574	vnode_unlock(vq);
2575	goto loop;
2576	}
2577	count += vq->v_specinfo->si_opencount;
2578	}
2579	vnode_unlock(vq);
2580
2581	SPECHASH_LOCK();
2582	/*
2583	* must do this with the reference still held on 'vq'
2584	* so that it can't be destroyed while we're poking
2585	* through v_specnext
2586	*/
2587	vnext = vq->v_specnext;
2588	vid = vnext ? vnext->v_id : `0`;
2589
2590	SPECHASH_UNLOCK();
2591
2592	vnode_put(vq);
2593
2594	vq = vnext;
2595	}
2596
2597	return (count);
2598	}
2599
/* 1 => print out reclaim of active vnodes */
int	prtactive = 0;
2601
2602	/*
2603	* Print out a description of a vnode.
2604	*/
2605	static const char *typename[] =
2606	{ "VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD" };
2607
2608	void
2609	vprint(const char label, struct* vnode *vp)
2610	{
2611	char sbuf[`64`];
2612
2613	if (label != NULL)
2614	printf("%s: ", label);
2615	printf("type %s, usecount %d, writecount %d",
2616	typename[vp->v_type], vp->v_usecount, vp->v_writecount);
2617	sbuf[`0`] = `'\0'`;
2618	if (vp->v_flag & VROOT)
2619	strlcat(sbuf, "\|VROOT", sizeof(sbuf));
2620	if (vp->v_flag & VTEXT)
2621	strlcat(sbuf, "\|VTEXT", sizeof(sbuf));
2622	if (vp->v_flag & VSYSTEM)
2623	strlcat(sbuf, "\|VSYSTEM", sizeof(sbuf));
2624	if (vp->v_flag & VNOFLUSH)
2625	strlcat(sbuf, "\|VNOFLUSH", sizeof(sbuf));
2626	if (vp->v_flag & VBWAIT)
2627	strlcat(sbuf, "\|VBWAIT", sizeof(sbuf));
2628	if (vnode_isaliased(vp))
2629	strlcat(sbuf, "\|VALIASED", sizeof(sbuf));
2630	if (sbuf[`0`] != `'\0'`)
2631	printf(" flags (%s)", &sbuf[`1`]);
2632	}
2633
2634
2635	int
2636	vn_getpath(struct vnode vp, char* pathbuf, int* *len)
2637	{
2638	return build_path(vp, pathbuf, *len, len, BUILDPATH_NO_FS_ENTER, vfs_context_current());
2639	}
2640
2641	int
2642	vn_getpath_fsenter(struct vnode vp, char* pathbuf, int* *len)
2643	{
2644	return build_path(vp, pathbuf, *len, len, `0`, vfs_context_current());
2645	}
2646
2647	/*
 * vn_getpath_fsenter_with_parent will reenter the file system to find the path of the
 * vnode. It requires that there are IO counts on both the vnode and the directory vnode.
2650	*
 * vn_getpath_fsenter is called by MAC hooks to authorize operations for everything but
 * unlink, rmdir and rename. For these operations the MAC hook calls vn_getpath. This presents
 * problems where if the path can not be found from the name cache, those operations can
 * erroneously fail with EPERM even though the call should succeed. When removing or moving
 * file system objects with operations such as unlink or rename, those operations need to
 * take IO counts on the target and containing directory. Calling vn_getpath_fsenter from a
 * MAC hook from these operations during forced unmount operations can lead to
 * deadlock. This happens when the operation starts, IO counts are taken on the containing
 * directories and targets. Before the MAC hook is called a forced unmount from another
 * thread takes place and blocks on the ongoing operation's directory vnode in vdrain.
 * After which, the MAC hook gets called and calls vn_getpath_fsenter. vn_getpath_fsenter
 * is called with the understanding that there is an IO count on the target. If in
 * build_path the directory vnode is no longer in the cache, then the parent object id
 * is obtained via vnode_getattr from the target and used to call VFS_VGET to get the parent
 * vnode. The file system's VFS_VGET then looks up by inode in its hash and tries to get
 * an IO count. But VFS_VGET "sees" the directory vnode is in vdrain and can block
 * depending on which version and how it calls the vnode_get family of interfaces.
2668	*
2669	* N.B. A reasonable interface to use is vnode_getwithvid. This interface was modified to
2670	* call vnode_getiocount with VNODE_DRAINO, so it will happily get an IO count and not
2671	* cause issues, but there is no guarantee that all or any file systems are doing that.
2672	*
2673	* vn_getpath_fsenter_with_parent can enter the file system safely since there is a known
2674	* IO count on the directory vnode by calling build_path_with_parent.
2675	*/
2676
2677	int
2678	vn_getpath_fsenter_with_parent(struct vnode dvp, struct* vnode vp, char* pathbuf, int* *len)
2679	{
2680	return build_path_with_parent(vp, dvp, pathbuf, *len, len, `0`, vfs_context_current());
2681	}
2682
2683	int
2684	vn_getcdhash(struct vnode vp, off_t offset, unsigned* char *cdhash)
2685	{
2686	return ubc_cs_getcdhash(vp, offset, cdhash);
2687	}
2688
2689
/*
 * Package extension table: 'nexts' fixed-width slots of 'max_ext_width'
 * bytes each.  Installed by set_package_extensions_table() and read by
 * is_package_name().
 */
static char	*extension_table = NULL;
static int	nexts;
static int	max_ext_width;
2693
/*
 * qsort() comparator: orders C strings by length, shorter strings
 * first.  Returns the difference of the two lengths.
 */
static int
extension_cmp(const void *a, const void *b)
{
	const char *lhs = (const char *)a;
	const char *rhs = (const char *)b;

	return (strlen(lhs) - strlen(rhs));
}
2699
2700
2701	//
2702	// This is the api LaunchServices uses to inform the kernel
2703	// the list of package extensions to ignore.
2704	//
2705	// Internally we keep the list sorted by the length of the
2706	// the extension (from longest to shortest). We sort the
2707	// list of extensions so that we can speed up our searches
2708	// when comparing file names -- we only compare extensions
2709	// that could possibly fit into the file name, not all of
2710	// them (i.e. a short 8 character name can't have an 8
2711	// character extension).
2712	//
2713	extern lck_mtx_t *pkg_extensions_lck;
2714
2715	__private_extern__ int
2716	set_package_extensions_table(user_addr_t data, int nentries, int maxwidth)
2717	{
2718	char new_exts, old_exts;
2719	int error;
2720
2721	if (nentries <= `0` \|\| nentries > `1024` \|\| maxwidth <= `0` \|\| maxwidth > `255`) {
2722	return EINVAL;
2723	}
2724
2725
2726	// allocate one byte extra so we can guarantee null termination
2727	MALLOC(new_exts, char , (nentries maxwidth) + `1`, M_TEMP, M_WAITOK);
2728	if (new_exts == NULL) {
2729	return ENOMEM;
2730	}
2731
2732	error = copyin(data, new_exts, nentries * maxwidth);
2733	if (error) {
2734	FREE(new_exts, M_TEMP);
2735	return error;
2736	}
2737
2738	new_exts[(nentries * maxwidth)] = `'\0'`; // guarantee null termination of the block
2739
2740	qsort(new_exts, nentries, maxwidth, extension_cmp);
2741
2742	lck_mtx_lock(pkg_extensions_lck);
2743
2744	old_exts = extension_table;
2745	extension_table = new_exts;
2746	nexts = nentries;
2747	max_ext_width = maxwidth;
2748
2749	lck_mtx_unlock(pkg_extensions_lck);
2750
2751	if (old_exts) {
2752	FREE(old_exts, M_TEMP);
2753	}
2754
2755	return `0`;
2756	}
2757
2758
2759	int is_package_name(const char name, int* len)
2760	{
2761	int i, extlen;
2762	const char ptr, name_ext;
2763
2764	if (len <= `3`) {
2765	return `0`;
2766	}
2767
2768	name_ext = NULL;
2769	for(ptr=name; *ptr != `'\0'`; ptr++) {
2770	if (*ptr == `'.'`) {
2771	name_ext = ptr;
2772	}
2773	}
2774
2775	// if there is no "." extension, it can't match
2776	if (name_ext == NULL) {
2777	return `0`;
2778	}
2779
2780	// advance over the "."
2781	name_ext++;
2782
2783	lck_mtx_lock(pkg_extensions_lck);
2784
2785	// now iterate over all the extensions to see if any match
2786	ptr = &extension_table[`0`];
2787	for(i=`0`; i < nexts; i++, ptr+=max_ext_width) {
2788	extlen = strlen(ptr);
2789	if (strncasecmp(name_ext, ptr, extlen) == `0` && name_ext[extlen] == `'\0'`) {
2790	// aha, a match!
2791	lck_mtx_unlock(pkg_extensions_lck);
2792	return `1`;
2793	}
2794	}
2795
2796	lck_mtx_unlock(pkg_extensions_lck);
2797
2798	// if we get here, no extension matched
2799	return `0`;
2800	}
2801
2802	int
2803	vn_path_package_check(__unused vnode_t vp, char path, int* pathlen, int *component)
2804	{
2805	char ptr, end;
2806	int comp=`0`;
2807
2808	*component = -`1`;
2809	if (*path != `'/'`) {
2810	return EINVAL;
2811	}
2812
2813	end = path + `1`;
2814	while(end < path + pathlen && *end != `'\0'`) {
2815	while(end < path + pathlen && end == `'/'` && end != `'\0'`) {
2816	end++;
2817	}
2818
2819	ptr = end;
2820
2821	while(end < path + pathlen && end != `'/'` && end != `'\0'`) {
2822	end++;
2823	}
2824
2825	if (end > path + pathlen) {
2826	// hmm, string wasn't null terminated
2827	return EINVAL;
2828	}
2829
2830	*end = `'\0'`;
2831	if (is_package_name(ptr, end - ptr)) {
2832	*component = comp;
2833	break;
2834	}
2835
2836	end++;
2837	comp++;
2838	}
2839
2840	return `0`;
2841	}
2842
/*
 * Determine if a name is inappropriate for a searchfs query.
 * The list currently holds a single entry, "System".
 *
 * name - NUL-terminated name to test
 * len  - length of the name as supplied by the caller
 *
 * Returns 1 when 'len' equals a listed name's length and the strings
 * are identical including the terminator, 0 otherwise.
 */
int vn_searchfs_inappropriate_name(const char *name, int len) {
	const char *bad_names[] = { "System" };
	int   bad_len[]   = { 6 };
	const int nbad = (int) (sizeof(bad_names) / sizeof(bad_names[0]));
	int   idx = 0;

	while (idx < nbad) {
		// the cheap length test goes first; the compare covers the NUL too
		if (len == bad_len[idx] &&
		    strncmp(name, bad_names[idx], strlen(bad_names[idx]) + 1) == 0) {
			return 1;
		}
		idx++;
	}

	// if we get here, no name matched
	return 0;
}
2862
2863	/*
2864	* Top level filesystem related information gathering.
2865	*/
2866	extern unsigned int vfs_nummntops;
2867
2868	/*
2869	* The VFS_NUMMNTOPS shouldn't be at name[1] since
2870	* is a VFS generic variable. Since we no longer support
2871	* VT_UFS, we reserve its value to support this sysctl node.
2872	*
2873	* It should have been:
2874	* name[0]: VFS_GENERIC
2875	* name[1]: VFS_NUMMNTOPS
2876	*/
2877	SYSCTL_INT(_vfs, VFS_NUMMNTOPS, nummntops,
2878	CTLFLAG_RD \| CTLFLAG_KERN \| CTLFLAG_LOCKED,
2879	&vfs_nummntops, `0`, "");
2880
2881	int
2882	vfs_sysctl(int *name __unused, u_int namelen __unused,
2883	user_addr_t oldp __unused, size_t *oldlenp __unused,
2884	user_addr_t newp __unused, size_t newlen __unused, proc_t p __unused);
2885
2886	int
2887	vfs_sysctl(int *name __unused, u_int namelen __unused,
2888	user_addr_t oldp __unused, size_t *oldlenp __unused,
2889	user_addr_t newp __unused, size_t newlen __unused, proc_t p __unused)
2890	{
2891	return (EINVAL);
2892	}
2893
2894
2895	//
2896	// The following code disallows specific sysctl's that came through
2897	// the direct sysctl interface (vfs_sysctl_node) instead of the newer
2898	// sysctl_vfs_ctlbyfsid() interface. We can not allow these selectors
2899	// through vfs_sysctl_node() because it passes the user's oldp pointer
2900	// directly to the file system which (for these selectors) casts it
2901	// back to a struct sysctl_req and then proceed to use SYSCTL_IN()
2902	// which jumps through an arbitrary function pointer. When called
2903	// through the sysctl_vfs_ctlbyfsid() interface this does not happen
2904	// and so it's safe.
2905	//
2906	// Unfortunately we have to pull in definitions from AFP and SMB and
2907	// perform explicit name checks on the file system to determine if
2908	// these selectors are being used.
2909	//
2910
/* AFP selectors rejected by is_bad_sysctl_name() */
#define AFPFS_VFS_CTL_GETID		0x00020001
#define AFPFS_VFS_CTL_NETCHANGE		0x00020002
#define AFPFS_VFS_CTL_VOLCHANGE		0x00020003

/* SMB selectors rejected by is_bad_sysctl_name() */
#define SMBFS_SYSCTL_REMOUNT		1
#define SMBFS_SYSCTL_REMOUNT_INFO	2
#define SMBFS_SYSCTL_GET_SERVER_SHARE	3
2918
2919
2920	static int
2921	is_bad_sysctl_name(struct vfstable vfsp, int* selector_name)
2922	{
2923	switch(selector_name) {
2924	case VFS_CTL_QUERY:
2925	case VFS_CTL_TIMEO:
2926	case VFS_CTL_NOLOCKS:
2927	case VFS_CTL_NSTATUS:
2928	case VFS_CTL_SADDR:
2929	case VFS_CTL_DISC:
2930	case VFS_CTL_SERVERINFO:
2931	return `1`;
2932
2933	default:
2934	break;
2935	}
2936
2937	// the more complicated check for some of SMB's special values
2938	if (strcmp(vfsp->vfc_name, "smbfs") == `0`) {
2939	switch(selector_name) {
2940	case SMBFS_SYSCTL_REMOUNT:
2941	case SMBFS_SYSCTL_REMOUNT_INFO:
2942	case SMBFS_SYSCTL_GET_SERVER_SHARE:
2943	return `1`;
2944	}
2945	} else if (strcmp(vfsp->vfc_name, "afpfs") == `0`) {
2946	switch(selector_name) {
2947	case AFPFS_VFS_CTL_GETID:
2948	case AFPFS_VFS_CTL_NETCHANGE:
2949	case AFPFS_VFS_CTL_VOLCHANGE:
2950	return `1`;
2951	}
2952	}
2953
2954	//
2955	// If we get here we passed all the checks so the selector is ok
2956	//
2957	return `0`;
2958	}
2959
2960
2961	int vfs_sysctl_node SYSCTL_HANDLER_ARGS
2962	{
2963	int *name, namelen;
2964	struct vfstable *vfsp;
2965	int error;
2966	int fstypenum;
2967
2968	fstypenum = oidp->oid_number;
2969	name = arg1;
2970	namelen = arg2;
2971
2972	/ all sysctl names at this level should have at least one name slot for the FS /
2973	if (namelen < `1`)
2974	return (EISDIR); / overloaded /
2975
2976	mount_list_lock();
2977	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
2978	if (vfsp->vfc_typenum == fstypenum) {
2979	vfsp->vfc_refcount++;
2980	break;
2981	}
2982	mount_list_unlock();
2983
2984	if (vfsp == NULL) {
2985	return (ENOTSUP);
2986	}
2987
2988	if (is_bad_sysctl_name(vfsp, name[`0`])) {
2989	printf("vfs: bad selector 0x%.8x for old-style sysctl(). use the sysctl-by-fsid interface instead\n", name[`0`]);
2990	return EPERM;
2991	}
2992
2993	error = (vfsp->vfc_vfsops->vfs_sysctl)(name, namelen, req->oldptr, &req->oldlen, req->newptr, req->newlen, vfs_context_current());
2994
2995	mount_list_lock();
2996	vfsp->vfc_refcount--;
2997	mount_list_unlock();
2998
2999	return error;
3000	}
3001
3002	/*
3003	* Check to see if a filesystem is mounted on a block device.
3004	*/
3005	int
3006	vfs_mountedon(struct vnode *vp)
3007	{
3008	struct vnode *vq;
3009	int error = `0`;
3010
3011	SPECHASH_LOCK();
3012	if (vp->v_specflags & SI_MOUNTEDON) {
3013	error = EBUSY;
3014	goto out;
3015	}
3016	if (vp->v_specflags & SI_ALIASED) {
3017	for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
3018	if (vq->v_rdev != vp->v_rdev \|\|
3019	vq->v_type != vp->v_type)
3020	continue;
3021	if (vq->v_specflags & SI_MOUNTEDON) {
3022	error = EBUSY;
3023	break;
3024	}
3025	}
3026	}
3027	out:
3028	SPECHASH_UNLOCK();
3029	return (error);
3030	}
3031
/* Failure counters filled in by unmount_callback() */
struct unmount_info {
	int	u_errs;		/* Total failed unmounts */
	int	u_busy;		/* EBUSY failed unmounts */
};
3036
3037	static int
3038	unmount_callback(mount_t mp, void *arg)
3039	{
3040	int error;
3041	char *mntname;
3042	struct unmount_info *uip = arg;
3043
3044	mount_ref(mp, `0`);
3045	mount_iterdrop(mp); // avoid vfs_iterate deadlock in dounmount()
3046
3047	MALLOC_ZONE(mntname, void *, MAXPATHLEN, M_NAMEI, M_WAITOK);
3048	if (mntname)
3049	strlcpy(mntname, mp->mnt_vfsstat.f_mntonname, MAXPATHLEN);
3050
3051	error = dounmount(mp, MNT_FORCE, `1`, vfs_context_current());
3052	if (error) {
3053	uip->u_errs++;
3054	printf("Unmount of %s failed (%d)\n", mntname ? mntname:"?", error);
3055	if (error == EBUSY)
3056	uip->u_busy++;
3057	}
3058	if (mntname)
3059	FREE_ZONE(mntname, MAXPATHLEN, M_NAMEI);
3060
3061	return (VFS_RETURNED);
3062	}
3063
3064	/*
3065	* Unmount all filesystems. The list is traversed in reverse order
3066	* of mounting to avoid dependencies.
3067	* Busy mounts are retried.
3068	*/
3069	__private_extern__ void
3070	vfs_unmountall(void)
3071	{
3072	int mounts, sec = `1`;
3073	struct unmount_info ui;
3074
3075	retry:
3076	ui.u_errs = ui.u_busy = `0`;
3077	vfs_iterate(VFS_ITERATE_CB_DROPREF \| VFS_ITERATE_TAIL_FIRST, unmount_callback, &ui);
3078	mounts = mount_getvfscnt();
3079	if (mounts == `0`)
3080	return;
3081
3082	if (ui.u_busy > `0`) { // Busy mounts - wait & retry
3083	tsleep(&nummounts, PVFS, "busy mount", sec * hz);
3084	sec *= `2`;
3085	if (sec <= `32`)
3086	goto retry;
3087	printf("Unmounting timed out\n");
3088	} else if (ui.u_errs < mounts) {
3089	// If the vfs_iterate missed mounts in progress - wait a bit
3090	tsleep(&nummounts, PVFS, "missed mount", `2` * hz);
3091	}
3092	}
3093
3094	/*
3095	* This routine is called from vnode_pager_deallocate out of the VM
3096	* The path to vnode_pager_deallocate can only be initiated by ubc_destroy_named
3097	* on a vnode that has a UBCINFO
3098	*/
3099	__private_extern__ void
3100	vnode_pager_vrele(vnode_t vp)
3101	{
3102	struct ubc_info *uip;
3103
3104	vnode_lock_spin(vp);
3105
3106	vp->v_lflag &= ~VNAMED_UBC;
3107	if (vp->v_usecount != `0`) {
3108	/*
3109	* At the eleventh hour, just before the ubcinfo is
3110	* destroyed, ensure the ubc-specific v_usecount
3111	* reference has gone. We use v_usecount != 0 as a hint;
3112	* ubc_unmap() does nothing if there's no mapping.
3113	*
3114	* This case is caused by coming here via forced unmount,
3115	* versus the usual vm_object_deallocate() path.
3116	* In the forced unmount case, ubc_destroy_named()
3117	* releases the pager before memory_object_last_unmap()
3118	* can be called.
3119	*/
3120	vnode_unlock(vp);
3121	ubc_unmap(vp);
3122	vnode_lock_spin(vp);
3123	}
3124
3125	uip = vp->v_ubcinfo;
3126	vp->v_ubcinfo = UBC_INFO_NULL;
3127
3128	vnode_unlock(vp);
3129
3130	ubc_info_deallocate(uip);
3131	}
3132
3133
3134	#include <sys/disk.h>
3135
/*
 * Disk unit number of the root device.  Starts out as (u_int32_t)-1
 * and is set by vfs_init_io_attributes() when it is called for rootvp.
 */
u_int32_t rootunit = (u_int32_t)-1;

#if CONFIG_IOSCHED
/* I/O scheduling switches defined outside this file */
extern int lowpri_throttle_enabled;
extern int iosched_enabled;
#endif
3142
3143	errno_t
3144	vfs_init_io_attributes(vnode_t devvp, mount_t mp)
3145	{
3146	int error;
3147	off_t readblockcnt = `0`;
3148	off_t writeblockcnt = `0`;
3149	off_t readmaxcnt = `0`;
3150	off_t writemaxcnt = `0`;
3151	off_t readsegcnt = `0`;
3152	off_t writesegcnt = `0`;
3153	off_t readsegsize = `0`;
3154	off_t writesegsize = `0`;
3155	off_t alignment = `0`;
3156	u_int32_t minsaturationbytecount = `0`;
3157	u_int32_t ioqueue_depth = `0`;
3158	u_int32_t blksize;
3159	u_int64_t temp;
3160	u_int32_t features;
3161	vfs_context_t ctx = vfs_context_current();
3162	dk_corestorage_info_t cs_info;
3163	boolean_t cs_present = FALSE;;
3164	int isssd = `0`;
3165	int isvirtual = `0`;
3166
3167
3168	VNOP_IOCTL(devvp, DKIOCGETTHROTTLEMASK, (caddr_t)&mp->mnt_throttle_mask, `0`, NULL);
3169	/*
3170	* as a reasonable approximation, only use the lowest bit of the mask
3171	* to generate a disk unit number
3172	*/
3173	mp->mnt_devbsdunit = num_trailing_0(mp->mnt_throttle_mask);
3174
3175	if (devvp == rootvp)
3176	rootunit = mp->mnt_devbsdunit;
3177
3178	if (mp->mnt_devbsdunit == rootunit) {
3179	/*
3180	* this mount point exists on the same device as the root
3181	* partition, so it comes under the hard throttle control...
3182	* this is true even for the root mount point itself
3183	*/
3184	mp->mnt_kern_flag \|= MNTK_ROOTDEV;
3185	}
3186	/*
3187	* force the spec device to re-cache
3188	* the underlying block size in case
3189	* the filesystem overrode the initial value
3190	*/
3191	set_fsblocksize(devvp);
3192
3193
3194	if ((error = VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE,
3195	(caddr_t)&blksize, `0`, ctx)))
3196	return (error);
3197
3198	mp->mnt_devblocksize = blksize;
3199
3200	/*
3201	* set the maximum possible I/O size
3202	* this may get clipped to a smaller value
3203	* based on which constraints are being advertised
3204	* and if those advertised constraints result in a smaller
3205	* limit for a given I/O
3206	*/
3207	mp->mnt_maxreadcnt = MAX_UPL_SIZE_BYTES;
3208	mp->mnt_maxwritecnt = MAX_UPL_SIZE_BYTES;
3209
3210	if (VNOP_IOCTL(devvp, DKIOCISVIRTUAL, (caddr_t)&isvirtual, `0`, ctx) == `0`) {
3211	if (isvirtual)
3212	mp->mnt_kern_flag \|= MNTK_VIRTUALDEV;
3213	}
3214	if (VNOP_IOCTL(devvp, DKIOCISSOLIDSTATE, (caddr_t)&isssd, `0`, ctx) == `0`) {
3215	if (isssd)
3216	mp->mnt_kern_flag \|= MNTK_SSD;
3217	}
3218	if ((error = VNOP_IOCTL(devvp, DKIOCGETFEATURES,
3219	(caddr_t)&features, `0`, ctx)))
3220	return (error);
3221
3222	if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTREAD,
3223	(caddr_t)&readblockcnt, `0`, ctx)))
3224	return (error);
3225
3226	if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTWRITE,
3227	(caddr_t)&writeblockcnt, `0`, ctx)))
3228	return (error);
3229
3230	if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTREAD,
3231	(caddr_t)&readmaxcnt, `0`, ctx)))
3232	return (error);
3233
3234	if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTWRITE,
3235	(caddr_t)&writemaxcnt, `0`, ctx)))
3236	return (error);
3237
3238	if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTREAD,
3239	(caddr_t)&readsegcnt, `0`, ctx)))
3240	return (error);
3241
3242	if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTWRITE,
3243	(caddr_t)&writesegcnt, `0`, ctx)))
3244	return (error);
3245
3246	if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTBYTECOUNTREAD,
3247	(caddr_t)&readsegsize, `0`, ctx)))
3248	return (error);
3249
3250	if ((error = VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTBYTECOUNTWRITE,
3251	(caddr_t)&writesegsize, `0`, ctx)))
3252	return (error);
3253
3254	if ((error = VNOP_IOCTL(devvp, DKIOCGETMINSEGMENTALIGNMENTBYTECOUNT,
3255	(caddr_t)&alignment, `0`, ctx)))
3256	return (error);
3257
3258	if ((error = VNOP_IOCTL(devvp, DKIOCGETCOMMANDPOOLSIZE,
3259	(caddr_t)&ioqueue_depth, `0`, ctx)))
3260	return (error);
3261
3262	if (readmaxcnt)
3263	mp->mnt_maxreadcnt = (readmaxcnt > UINT32_MAX) ? UINT32_MAX : readmaxcnt;
3264
3265	if (readblockcnt) {
3266	temp = readblockcnt * blksize;
3267	temp = (temp > UINT32_MAX) ? UINT32_MAX : temp;
3268
3269	if (temp < mp->mnt_maxreadcnt)
3270	mp->mnt_maxreadcnt = (u_int32_t)temp;
3271	}
3272
3273	if (writemaxcnt)
3274	mp->mnt_maxwritecnt = (writemaxcnt > UINT32_MAX) ? UINT32_MAX : writemaxcnt;
3275
3276	if (writeblockcnt) {
3277	temp = writeblockcnt * blksize;
3278	temp = (temp > UINT32_MAX) ? UINT32_MAX : temp;
3279
3280	if (temp < mp->mnt_maxwritecnt)
3281	mp->mnt_maxwritecnt = (u_int32_t)temp;
3282	}
3283
3284	if (readsegcnt) {
3285	temp = (readsegcnt > UINT16_MAX) ? UINT16_MAX : readsegcnt;
3286	} else {
3287	temp = mp->mnt_maxreadcnt / PAGE_SIZE;
3288
3289	if (temp > UINT16_MAX)
3290	temp = UINT16_MAX;
3291	}
3292	mp->mnt_segreadcnt = (u_int16_t)temp;
3293
3294	if (writesegcnt) {
3295	temp = (writesegcnt > UINT16_MAX) ? UINT16_MAX : writesegcnt;
3296	} else {
3297	temp = mp->mnt_maxwritecnt / PAGE_SIZE;
3298
3299	if (temp > UINT16_MAX)
3300	temp = UINT16_MAX;
3301	}
3302	mp->mnt_segwritecnt = (u_int16_t)temp;
3303
3304	if (readsegsize)
3305	temp = (readsegsize > UINT32_MAX) ? UINT32_MAX : readsegsize;
3306	else
3307	temp = mp->mnt_maxreadcnt;
3308	mp->mnt_maxsegreadsize = (u_int32_t)temp;
3309
3310	if (writesegsize)
3311	temp = (writesegsize > UINT32_MAX) ? UINT32_MAX : writesegsize;
3312	else
3313	temp = mp->mnt_maxwritecnt;
3314	mp->mnt_maxsegwritesize = (u_int32_t)temp;
3315
3316	if (alignment)
3317	temp = (alignment > PAGE_SIZE) ? PAGE_MASK : alignment - `1`;
3318	else
3319	temp = `0`;
3320	mp->mnt_alignmentmask = temp;
3321
3322
3323	if (ioqueue_depth > MNT_DEFAULT_IOQUEUE_DEPTH)
3324	temp = ioqueue_depth;
3325	else
3326	temp = MNT_DEFAULT_IOQUEUE_DEPTH;
3327
3328	mp->mnt_ioqueue_depth = temp;
3329	mp->mnt_ioscale = MNT_IOSCALE(mp->mnt_ioqueue_depth);
3330
3331	if (mp->mnt_ioscale > `1`)
3332	printf("ioqueue_depth = %d, ioscale = %d\n", (int)mp->mnt_ioqueue_depth, (int)mp->mnt_ioscale);
3333
3334	if (features & DK_FEATURE_FORCE_UNIT_ACCESS)
3335	mp->mnt_ioflags \|= MNT_IOFLAGS_FUA_SUPPORTED;
3336
3337	if (VNOP_IOCTL(devvp, DKIOCGETIOMINSATURATIONBYTECOUNT, (caddr_t)&minsaturationbytecount, `0`, ctx) == `0`) {
3338	mp->mnt_minsaturationbytecount = minsaturationbytecount;
3339	} else {
3340	mp->mnt_minsaturationbytecount = `0`;
3341	}
3342
3343	if (VNOP_IOCTL(devvp, DKIOCCORESTORAGE, (caddr_t)&cs_info, `0`, ctx) == `0`)
3344	cs_present = TRUE;
3345
3346	if (features & DK_FEATURE_UNMAP) {
3347	mp->mnt_ioflags \|= MNT_IOFLAGS_UNMAP_SUPPORTED;
3348
3349	if (cs_present == TRUE)
3350	mp->mnt_ioflags \|= MNT_IOFLAGS_CSUNMAP_SUPPORTED;
3351	}
3352	if (cs_present == TRUE) {
3353	/*
3354	* for now we'll use the following test as a proxy for
3355	* the underlying drive being FUSION in nature
3356	*/
3357	if ((cs_info.flags & DK_CORESTORAGE_PIN_YOUR_METADATA))
3358	mp->mnt_ioflags \|= MNT_IOFLAGS_FUSION_DRIVE;
3359	} else {
3360	/ Check for APFS Fusion /
3361	dk_apfs_flavour_t flavour;
3362	if ((VNOP_IOCTL(devvp, DKIOCGETAPFSFLAVOUR, (caddr_t)&flavour, `0`, ctx) == `0`) &&
3363	(flavour == DK_APFS_FUSION)) {
3364	mp->mnt_ioflags \|= MNT_IOFLAGS_FUSION_DRIVE;
3365	}
3366	}
3367
3368	#if CONFIG_IOSCHED
3369	if (iosched_enabled && (features & DK_FEATURE_PRIORITY)) {
3370	mp->mnt_ioflags \|= MNT_IOFLAGS_IOSCHED_SUPPORTED;
3371	throttle_info_disable_throttle(mp->mnt_devbsdunit, (mp->mnt_ioflags & MNT_IOFLAGS_FUSION_DRIVE) != `0`);
3372	}
3373	#endif /* CONFIG_IOSCHED */
3374	return (error);
3375	}
3376
3377	static struct klist fs_klist;
3378	lck_grp_t *fs_klist_lck_grp;
3379	lck_mtx_t *fs_klist_lock;
3380
3381	void
3382	vfs_event_init(void)
3383	{
3384
3385	klist_init(&fs_klist);
3386	fs_klist_lck_grp = lck_grp_alloc_init("fs_klist", NULL);
3387	fs_klist_lock = lck_mtx_alloc_init(fs_klist_lck_grp, NULL);
3388	}
3389
3390	void
3391	vfs_event_signal(fsid_t *fsid, u_int32_t event, intptr_t data)
3392	{
3393	if (event == VQ_DEAD \|\| event == VQ_NOTRESP) {
3394	struct mount *mp = vfs_getvfs(fsid);
3395	if (mp) {
3396	mount_lock_spin(mp);
3397	if (data)
3398	mp->mnt_kern_flag &= ~MNT_LNOTRESP; // Now responding
3399	else
3400	mp->mnt_kern_flag \|= MNT_LNOTRESP; // Not responding
3401	mount_unlock(mp);
3402	}
3403	}
3404
3405	lck_mtx_lock(fs_klist_lock);
3406	KNOTE(&fs_klist, event);
3407	lck_mtx_unlock(fs_klist_lock);
3408	}
3409
/*
 * Return the number of mounted filesystems, as reported by
 * mount_getvfscnt().
 */
static int
sysctl_vfs_getvfscnt(void)
{
	int nmounts = mount_getvfscnt();

	return nmounts;
}
3418
3419
3420	static int
3421	mount_getvfscnt(void)
3422	{
3423	int ret;
3424
3425	mount_list_lock();
3426	ret = nummounts;
3427	mount_list_unlock();
3428	return (ret);
3429
3430	}
3431
3432
3433
3434	static int
3435	mount_fillfsids(fsid_t fsidlst, int* count)
3436	{
3437	struct mount *mp;
3438	int actual=`0`;
3439
3440	actual = `0`;
3441	mount_list_lock();
3442	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3443	if (actual <= count) {
3444	fsidlst[actual] = mp->mnt_vfsstat.f_fsid;
3445	actual++;
3446	}
3447	}
3448	mount_list_unlock();
3449	return (actual);
3450
3451	}
3452
3453	/*
3454	* fill in the array of fsid_t's up to a max of 'count', the actual
3455	* number filled in will be set in '*actual'. If there are more fsid_t's
3456	* than room in fsidlst then ENOMEM will be returned and '*actual' will
3457	* have the actual count.
3458	* having *actual filled out even in the error case is depended upon.
3459	*/
3460	static int
3461	sysctl_vfs_getvfslist(fsid_t fsidlst, int* count, int *actual)
3462	{
3463	struct mount *mp;
3464
3465	*actual = `0`;
3466	mount_list_lock();
3467	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3468	(*actual)++;
3469	if (*actual <= count)
3470	fsidlst[(*actual) - `1`] = mp->mnt_vfsstat.f_fsid;
3471	}
3472	mount_list_unlock();
3473	return (*actual <= count ? `0` : ENOMEM);
3474	}
3475
3476	static int
3477	sysctl_vfs_vfslist(__unused struct sysctl_oid oidp, __unused void* *arg1,
3478	__unused int arg2, struct sysctl_req *req)
3479	{
3480	int actual, error;
3481	size_t space;
3482	fsid_t *fsidlst;
3483
3484	/ This is a readonly node. /
3485	if (req->newptr != USER_ADDR_NULL)
3486	return (EPERM);
3487
3488	/ they are querying us so just return the space required. /
3489	if (req->oldptr == USER_ADDR_NULL) {
3490	req->oldidx = sysctl_vfs_getvfscnt() * sizeof(fsid_t);
3491	return `0`;
3492	}
3493	again:
3494	/*
3495	* Retrieve an accurate count of the amount of space required to copy
3496	* out all the fsids in the system.
3497	*/
3498	space = req->oldlen;
3499	req->oldlen = sysctl_vfs_getvfscnt() * sizeof(fsid_t);
3500
3501	/ they didn't give us enough space. /
3502	if (space < req->oldlen)
3503	return (ENOMEM);
3504
3505	MALLOC(fsidlst, fsid_t *, req->oldlen, M_TEMP, M_WAITOK \| M_ZERO);
3506	if (fsidlst == NULL) {
3507	return (ENOMEM);
3508	}
3509
3510	error = sysctl_vfs_getvfslist(fsidlst, req->oldlen / sizeof(fsid_t),
3511	&actual);
3512	/*
3513	* If we get back ENOMEM, then another mount has been added while we
3514	* slept in malloc above. If this is the case then try again.
3515	*/
3516	if (error == ENOMEM) {
3517	FREE(fsidlst, M_TEMP);
3518	req->oldlen = space;
3519	goto again;
3520	}
3521	if (error == `0`) {
3522	error = SYSCTL_OUT(req, fsidlst, actual * sizeof(fsid_t));
3523	}
3524	FREE(fsidlst, M_TEMP);
3525	return (error);
3526	}
3527
3528	/*
3529	* Do a sysctl by fsid.
3530	*/
3531	static int
3532	sysctl_vfs_ctlbyfsid(__unused struct sysctl_oid oidp, void* arg1, int* arg2,
3533	struct sysctl_req *req)
3534	{
3535	union union_vfsidctl vc;
3536	struct mount *mp;
3537	struct vfsstatfs *sp;
3538	int *name, flags, namelen;
3539	int error=`0`, gotref=`0`;
3540	vfs_context_t ctx = vfs_context_current();
3541	proc_t p = req->p; / XXX req->p != current_proc()? /
3542	boolean_t is_64_bit;
3543
3544	name = arg1;
3545	namelen = arg2;
3546	is_64_bit = proc_is64bit(p);
3547
3548	error = SYSCTL_IN(req, &vc, is_64_bit? sizeof(vc.vc64):sizeof(vc.vc32));
3549	if (error)
3550	goto out;
3551	if (vc.vc32.vc_vers != VFS_CTL_VERS1) { / works for 32 and 64 /
3552	error = EINVAL;
3553	goto out;
3554	}
3555	mp = mount_list_lookupby_fsid(&vc.vc32.vc_fsid, `0`, `1`); / works for 32 and 64 /
3556	if (mp == NULL) {
3557	error = ENOENT;
3558	goto out;
3559	}
3560	gotref = `1`;
3561	/ reset so that the fs specific code can fetch it. /
3562	req->newidx = `0`;
3563	/*
3564	* Note if this is a VFS_CTL then we pass the actual sysctl req
3565	* in for "oldp" so that the lower layer can DTRT and use the
3566	* SYSCTL_IN/OUT routines.
3567	*/
3568	if (mp->mnt_op->vfs_sysctl != NULL) {
3569	if (is_64_bit) {
3570	if (vfs_64bitready(mp)) {
3571	error = mp->mnt_op->vfs_sysctl(name, namelen,
3572	CAST_USER_ADDR_T(req),
3573	NULL, USER_ADDR_NULL, `0`,
3574	ctx);
3575	}
3576	else {
3577	error = ENOTSUP;
3578	}
3579	}
3580	else {
3581	error = mp->mnt_op->vfs_sysctl(name, namelen,
3582	CAST_USER_ADDR_T(req),
3583	NULL, USER_ADDR_NULL, `0`,
3584	ctx);
3585	}
3586	if (error != ENOTSUP) {
3587	goto out;
3588	}
3589	}
3590	switch (name[`0`]) {
3591	case VFS_CTL_UMOUNT:
3592	req->newidx = `0`;
3593	if (is_64_bit) {
3594	req->newptr = vc.vc64.vc_ptr;
3595	req->newlen = (size_t)vc.vc64.vc_len;
3596	}
3597	else {
3598	req->newptr = CAST_USER_ADDR_T(vc.vc32.vc_ptr);
3599	req->newlen = vc.vc32.vc_len;
3600	}
3601	error = SYSCTL_IN(req, &flags, sizeof(flags));
3602	if (error)
3603	break;
3604
3605	mount_ref(mp, `0`);
3606	mount_iterdrop(mp);
3607	gotref = `0`;
3608	/ safedounmount consumes a ref /
3609	error = safedounmount(mp, flags, ctx);
3610	break;
3611	case VFS_CTL_STATFS:
3612	req->newidx = `0`;
3613	if (is_64_bit) {
3614	req->newptr = vc.vc64.vc_ptr;
3615	req->newlen = (size_t)vc.vc64.vc_len;
3616	}
3617	else {
3618	req->newptr = CAST_USER_ADDR_T(vc.vc32.vc_ptr);
3619	req->newlen = vc.vc32.vc_len;
3620	}
3621	error = SYSCTL_IN(req, &flags, sizeof(flags));
3622	if (error)
3623	break;
3624	sp = &mp->mnt_vfsstat;
3625	if (((flags & MNT_NOWAIT) == `0` \|\| (flags & (MNT_WAIT \| MNT_DWAIT))) &&
3626	(error = vfs_update_vfsstat(mp, ctx, VFS_USER_EVENT)))
3627	goto out;
3628	if (is_64_bit) {
3629	struct user64_statfs sfs;
3630	bzero(&sfs, sizeof(sfs));
3631	sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3632	sfs.f_type = mp->mnt_vtable->vfc_typenum;
3633	sfs.f_bsize = (user64_long_t)sp->f_bsize;
3634	sfs.f_iosize = (user64_long_t)sp->f_iosize;
3635	sfs.f_blocks = (user64_long_t)sp->f_blocks;
3636	sfs.f_bfree = (user64_long_t)sp->f_bfree;
3637	sfs.f_bavail = (user64_long_t)sp->f_bavail;
3638	sfs.f_files = (user64_long_t)sp->f_files;
3639	sfs.f_ffree = (user64_long_t)sp->f_ffree;
3640	sfs.f_fsid = sp->f_fsid;
3641	sfs.f_owner = sp->f_owner;
3642	#ifdef NFSCLIENT
3643	if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
3644	strlcpy(&sfs.f_fstypename[`0`], &mp->fstypename_override[`0`], MFSNAMELEN);
3645	} else
3646	#endif
3647	{
3648	strlcpy(sfs.f_fstypename, sp->f_fstypename, MFSNAMELEN);
3649	}
3650	strlcpy(sfs.f_mntonname, sp->f_mntonname, MNAMELEN);
3651	strlcpy(sfs.f_mntfromname, sp->f_mntfromname, MNAMELEN);
3652
3653	error = SYSCTL_OUT(req, &sfs, sizeof(sfs));
3654	}
3655	else {
3656	struct user32_statfs sfs;
3657	bzero(&sfs, sizeof(sfs));
3658	sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
3659	sfs.f_type = mp->mnt_vtable->vfc_typenum;
3660
3661	/*
3662	* It's possible for there to be more than 2^^31 blocks in the filesystem, so we
3663	* have to fudge the numbers here in that case. We inflate the blocksize in order
3664	* to reflect the filesystem size as best we can.
3665	*/
3666	if (sp->f_blocks > INT_MAX) {
3667	int shift;
3668
3669	/*
3670	* Work out how far we have to shift the block count down to make it fit.
3671	* Note that it's possible to have to shift so far that the resulting
3672	* blocksize would be unreportably large. At that point, we will clip
3673	* any values that don't fit.
3674	*
3675	* For safety's sake, we also ensure that f_iosize is never reported as
3676	* being smaller than f_bsize.
3677	*/
3678	for (shift = `0`; shift < `32`; shift++) {
3679	if ((sp->f_blocks >> shift) <= INT_MAX)
3680	break;
3681	if ((((long long)sp->f_bsize) << (shift + `1`)) > INT_MAX)
3682	break;
3683	}
3684	#define __SHIFT_OR_CLIP(x, s) ((((x) >> (s)) > INT_MAX) ? INT_MAX : ((x) >> (s)))
3685	sfs.f_blocks = (user32_long_t)__SHIFT_OR_CLIP(sp->f_blocks, shift);
3686	sfs.f_bfree = (user32_long_t)__SHIFT_OR_CLIP(sp->f_bfree, shift);
3687	sfs.f_bavail = (user32_long_t)__SHIFT_OR_CLIP(sp->f_bavail, shift);
3688	#undef __SHIFT_OR_CLIP
3689	sfs.f_bsize = (user32_long_t)(sp->f_bsize << shift);
3690	sfs.f_iosize = lmax(sp->f_iosize, sp->f_bsize);
3691	} else {
3692	sfs.f_bsize = (user32_long_t)sp->f_bsize;
3693	sfs.f_iosize = (user32_long_t)sp->f_iosize;
3694	sfs.f_blocks = (user32_long_t)sp->f_blocks;
3695	sfs.f_bfree = (user32_long_t)sp->f_bfree;
3696	sfs.f_bavail = (user32_long_t)sp->f_bavail;
3697	}
3698	sfs.f_files = (user32_long_t)sp->f_files;
3699	sfs.f_ffree = (user32_long_t)sp->f_ffree;
3700	sfs.f_fsid = sp->f_fsid;
3701	sfs.f_owner = sp->f_owner;
3702
3703	#ifdef NFSCLIENT
3704	if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) {
3705	strlcpy(&sfs.f_fstypename[`0`], &mp->fstypename_override[`0`], MFSNAMELEN);
3706	} else
3707	#endif
3708	{
3709	strlcpy(sfs.f_fstypename, sp->f_fstypename, MFSNAMELEN);
3710	}
3711	strlcpy(sfs.f_mntonname, sp->f_mntonname, MNAMELEN);
3712	strlcpy(sfs.f_mntfromname, sp->f_mntfromname, MNAMELEN);
3713
3714	error = SYSCTL_OUT(req, &sfs, sizeof(sfs));
3715	}
3716	break;
3717	default:
3718	error = ENOTSUP;
3719	goto out;
3720	}
3721	out:
3722	if(gotref != `0`)
3723	mount_iterdrop(mp);
3724	return (error);
3725	}
3726
3727	static int filt_fsattach(struct knote kn, struct* kevent_internal_s *kev);
3728	static void filt_fsdetach(struct knote *kn);
3729	static int filt_fsevent(struct knote kn, long* hint);
3730	static int filt_fstouch(struct knote kn, struct* kevent_internal_s *kev);
3731	static int filt_fsprocess(struct knote kn, struct* filt_process_s data, struct* kevent_internal_s *kev);
3732	SECURITY_READ_ONLY_EARLY(struct filterops) fs_filtops = {
3733	.f_attach = filt_fsattach,
3734	.f_detach = filt_fsdetach,
3735	.f_event = filt_fsevent,
3736	.f_touch = filt_fstouch,
3737	.f_process = filt_fsprocess,
3738	};
3739
3740	static int
3741	filt_fsattach(struct knote kn, __unused struct* kevent_internal_s *kev)
3742	{
3743	lck_mtx_lock(fs_klist_lock);
3744	KNOTE_ATTACH(&fs_klist, kn);
3745	lck_mtx_unlock(fs_klist_lock);
3746
3747	/*
3748	* filter only sees future events,
3749	* so it can't be fired already.
3750	*/
3751	return (`0`);
3752	}
3753
3754	static void
3755	filt_fsdetach(struct knote *kn)
3756	{
3757	lck_mtx_lock(fs_klist_lock);
3758	KNOTE_DETACH(&fs_klist, kn);
3759	lck_mtx_unlock(fs_klist_lock);
3760	}
3761
3762	static int
3763	filt_fsevent(struct knote kn, long* hint)
3764	{
3765	/*
3766	* Backwards compatibility:
3767	* Other filters would do nothing if kn->kn_sfflags == 0
3768	*/
3769
3770	if ((kn->kn_sfflags == `0`) \|\| (kn->kn_sfflags & hint)) {
3771	kn->kn_fflags \|= hint;
3772	}
3773
3774	return (kn->kn_fflags != `0`);
3775	}
3776
3777	static int
3778	filt_fstouch(struct knote kn, struct* kevent_internal_s *kev)
3779	{
3780	int res;
3781
3782	lck_mtx_lock(fs_klist_lock);
3783
3784	kn->kn_sfflags = kev->fflags;
3785
3786	/*
3787	* the above filter function sets bits even if nobody is looking for them.
3788	* Just preserve those bits even in the new mask is more selective
3789	* than before.
3790	*
3791	* For compatibility with previous implementations, we leave kn_fflags
3792	* as they were before.
3793	*/
3794	//if (kn->kn_sfflags)
3795	// kn->kn_fflags &= kn->kn_sfflags;
3796	res = (kn->kn_fflags != `0`);
3797
3798	lck_mtx_unlock(fs_klist_lock);
3799
3800	return res;
3801	}
3802
3803	static int
3804	filt_fsprocess(struct knote kn, struct* filt_process_s data, struct* kevent_internal_s *kev)
3805	{
3806	#pragma unused(data)
3807	int res;
3808
3809	lck_mtx_lock(fs_klist_lock);
3810	res = (kn->kn_fflags != `0`);
3811	if (res) {
3812	*kev = kn->kn_kevent;
3813	kn->kn_flags \|= EV_CLEAR; / automatic /
3814	kn->kn_fflags = `0`;
3815	kn->kn_data = `0`;
3816	}
3817	lck_mtx_unlock(fs_klist_lock);
3818	return res;
3819	}
3820
3821	static int
3822	sysctl_vfs_noremotehang(__unused struct sysctl_oid *oidp,
3823	__unused void arg1, __unused int* arg2, struct sysctl_req *req)
3824	{
3825	int out, error;
3826	pid_t pid;
3827	proc_t p;
3828
3829	/ We need a pid. /
3830	if (req->newptr == USER_ADDR_NULL)
3831	return (EINVAL);
3832
3833	error = SYSCTL_IN(req, &pid, sizeof(pid));
3834	if (error)
3835	return (error);
3836
3837	p = proc_find(pid < `0` ? -pid : pid);
3838	if (p == NULL)
3839	return (ESRCH);
3840
3841	/*
3842	* Fetching the value is ok, but we only fetch if the old
3843	* pointer is given.
3844	*/
3845	if (req->oldptr != USER_ADDR_NULL) {
3846	out = !((p->p_flag & P_NOREMOTEHANG) == `0`);
3847	proc_rele(p);
3848	error = SYSCTL_OUT(req, &out, sizeof(out));
3849	return (error);
3850	}
3851
3852	/ cansignal offers us enough security. /
3853	if (p != req->p && proc_suser(req->p) != `0`) {
3854	proc_rele(p);
3855	return (EPERM);
3856	}
3857
3858	if (pid < `0`)
3859	OSBitAndAtomic(~((uint32_t)P_NOREMOTEHANG), &p->p_flag);
3860	else
3861	OSBitOrAtomic(P_NOREMOTEHANG, &p->p_flag);
3862	proc_rele(p);
3863
3864	return (`0`);
3865	}
3866
3867	static int
3868	sysctl_vfs_generic_conf SYSCTL_HANDLER_ARGS
3869	{
3870	int *name, namelen;
3871	struct vfstable *vfsp;
3872	struct vfsconf vfsc = {};
3873
3874	(void)oidp;
3875	name = arg1;
3876	namelen = arg2;
3877
3878	if (namelen < `1`) {
3879	return (EISDIR);
3880	} else if (namelen > `1`) {
3881	return (ENOTDIR);
3882	}
3883
3884	mount_list_lock();
3885	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
3886	if (vfsp->vfc_typenum == name[`0`])
3887	break;
3888
3889	if (vfsp == NULL) {
3890	mount_list_unlock();
3891	return (ENOTSUP);
3892	}
3893
3894	vfsc.vfc_reserved1 = `0`;
3895	bcopy(vfsp->vfc_name, vfsc.vfc_name, sizeof(vfsc.vfc_name));
3896	vfsc.vfc_typenum = vfsp->vfc_typenum;
3897	vfsc.vfc_refcount = vfsp->vfc_refcount;
3898	vfsc.vfc_flags = vfsp->vfc_flags;
3899	vfsc.vfc_reserved2 = `0`;
3900	vfsc.vfc_reserved3 = `0`;
3901
3902	mount_list_unlock();
3903	return (SYSCTL_OUT(req, &vfsc, sizeof(struct vfsconf)));
3904	}
3905
3906	/ the vfs.generic. branch. /
3907	SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RW \| CTLFLAG_LOCKED, NULL, "vfs generic hinge");
3908	/ retreive a list of mounted filesystem fsid_t /
3909	SYSCTL_PROC(_vfs_generic, OID_AUTO, vfsidlist,
3910	CTLTYPE_STRUCT \| CTLFLAG_RD \| CTLFLAG_LOCKED,
3911	NULL, `0`, sysctl_vfs_vfslist, "S,fsid", "List of mounted filesystem ids");
3912	/ perform operations on filesystem via fsid_t /
3913	SYSCTL_NODE(_vfs_generic, OID_AUTO, ctlbyfsid, CTLFLAG_RW \| CTLFLAG_LOCKED,
3914	sysctl_vfs_ctlbyfsid, "ctlbyfsid");
3915	SYSCTL_PROC(_vfs_generic, OID_AUTO, noremotehang, CTLFLAG_RW \| CTLFLAG_ANYBODY,
3916	NULL, `0`, sysctl_vfs_noremotehang, "I", "noremotehang");
3917	SYSCTL_INT(_vfs_generic, VFS_MAXTYPENUM, maxtypenum,
3918	CTLFLAG_RD \| CTLFLAG_KERN \| CTLFLAG_LOCKED,
3919	&maxvfstypenum, `0`, "");
3920	SYSCTL_INT(_vfs_generic, OID_AUTO, sync_timeout, CTLFLAG_RW \| CTLFLAG_LOCKED, &sync_timeout_seconds, `0`, "");
3921	SYSCTL_NODE(_vfs_generic, VFS_CONF, conf,
3922	CTLFLAG_RD \| CTLFLAG_LOCKED,
3923	sysctl_vfs_generic_conf, "");
3924
3925	/ Indicate that the root file system unmounted cleanly /
3926	static int vfs_root_unmounted_cleanly = `0`;
3927	SYSCTL_INT(_vfs_generic, OID_AUTO, root_unmounted_cleanly, CTLFLAG_RD, &vfs_root_unmounted_cleanly, `0`, "Root filesystem was unmounted cleanly");
3928
3929	void
3930	vfs_set_root_unmounted_cleanly(void)
3931	{
3932	vfs_root_unmounted_cleanly = `1`;
3933	}
3934
3935	/*
3936	* Print vnode state.
3937	*/
3938	void
3939	vn_print_state(struct vnode vp, const* char *fmt, ...)
3940	{
3941	va_list ap;
3942	char perm_str[] = "(VM_KERNEL_ADDRPERM pointer)";
3943	char fs_name[MFSNAMELEN];
3944
3945	va_start(ap, fmt);
3946	vprintf(fmt, ap);
3947	va_end(ap);
3948	printf("vp 0x%0llx %s: ", (uint64_t)VM_KERNEL_ADDRPERM(vp), perm_str);
3949	printf("tag %d, type %d\n", vp->v_tag, vp->v_type);
3950	/ Counts .. /
3951	printf(" iocount %d, usecount %d, kusecount %d references %d\n",
3952	vp->v_iocount, vp->v_usecount, vp->v_kusecount, vp->v_references);
3953	printf(" writecount %d, numoutput %d\n", vp->v_writecount,
3954	vp->v_numoutput);
3955	/ Flags /
3956	printf(" flag 0x%x, lflag 0x%x, listflag 0x%x\n", vp->v_flag,
3957	vp->v_lflag, vp->v_listflag);
3958
3959	if (vp->v_mount == NULL \|\| vp->v_mount == dead_mountp) {
3960	strlcpy(fs_name, "deadfs", MFSNAMELEN);
3961	} else {
3962	vfs_name(vp->v_mount, fs_name);
3963	}
3964
3965	printf(" v_data 0x%0llx %s\n",
3966	(vp->v_data ? (uint64_t)VM_KERNEL_ADDRPERM(vp->v_data) : `0`),
3967	perm_str);
3968	printf(" v_mount 0x%0llx %s vfs_name %s\n",
3969	(vp->v_mount ? (uint64_t)VM_KERNEL_ADDRPERM(vp->v_mount) : `0`),
3970	perm_str, fs_name);
3971	}
3972
3973	long num_reusedvnodes = `0`;
3974
3975
3976	static vnode_t
3977	process_vp(vnode_t vp, int want_vp, int *deferred)
3978	{
3979	unsigned int vpid;
3980
3981	*deferred = `0`;
3982
3983	vpid = vp->v_id;
3984
3985	vnode_list_remove_locked(vp);
3986
3987	vnode_list_unlock();
3988
3989	vnode_lock_spin(vp);
3990
3991	/*
3992	* We could wait for the vnode_lock after removing the vp from the freelist
3993	* and the vid is bumped only at the very end of reclaim. So it is possible
3994	* that we are looking at a vnode that is being terminated. If so skip it.
3995	*/
3996	if ((vpid != vp->v_id) \|\| (vp->v_usecount != `0`) \|\| (vp->v_iocount != `0`) \|\|
3997	VONLIST(vp) \|\| (vp->v_lflag & VL_TERMINATE)) {
3998	/*
3999	* we lost the race between dropping the list lock
4000	* and picking up the vnode_lock... someone else
4001	* used this vnode and it is now in a new state
4002	*/
4003	vnode_unlock(vp);
4004
4005	return (NULLVP);
4006	}
4007	if ( (vp->v_lflag & (VL_NEEDINACTIVE \| VL_MARKTERM)) == VL_NEEDINACTIVE ) {
4008	/*
4009	* we did a vnode_rele_ext that asked for
4010	* us not to reenter the filesystem during
4011	* the release even though VL_NEEDINACTIVE was
4012	* set... we'll do it here by doing a
4013	* vnode_get/vnode_put
4014	*
4015	* pick up an iocount so that we can call
4016	* vnode_put and drive the VNOP_INACTIVE...
4017	* vnode_put will either leave us off
4018	* the freelist if a new ref comes in,
4019	* or put us back on the end of the freelist
4020	* or recycle us if we were marked for termination...
4021	* so we'll just go grab a new candidate
4022	*/
4023	vp->v_iocount++;
4024	#ifdef JOE_DEBUG
4025	record_vp(vp, `1`);
4026	#endif
4027	vnode_put_locked(vp);
4028	vnode_unlock(vp);
4029
4030	return (NULLVP);
4031	}
4032	/*
4033	* Checks for anyone racing us for recycle
4034	*/
4035	if (vp->v_type != VBAD) {
4036	if (want_vp && (vnode_on_reliable_media(vp) == FALSE \|\| (vp->v_flag & VISDIRTY))) {
4037	vnode_async_list_add(vp);
4038	vnode_unlock(vp);
4039
4040	*deferred = `1`;
4041
4042	return (NULLVP);
4043	}
4044	if (vp->v_lflag & VL_DEAD)
4045	panic("new_vnode(%p): the vnode is VL_DEAD but not VBAD", vp);
4046
4047	vnode_lock_convert(vp);
4048	(void)vnode_reclaim_internal(vp, `1`, want_vp, `0`);
4049
4050	if (want_vp) {
4051	if ((VONLIST(vp)))
4052	panic("new_vnode(%p): vp on list", vp);
4053	if (vp->v_usecount \|\| vp->v_iocount \|\| vp->v_kusecount \|\|
4054	(vp->v_lflag & (VNAMED_UBC \| VNAMED_MOUNT \| VNAMED_FSHASH)))
4055	panic("new_vnode(%p): free vnode still referenced", vp);
4056	if ((vp->v_mntvnodes.tqe_prev != `0`) && (vp->v_mntvnodes.tqe_next != `0`))
4057	panic("new_vnode(%p): vnode seems to be on mount list", vp);
4058	if ( !LIST_EMPTY(&vp->v_nclinks) \|\| !TAILQ_EMPTY(&vp->v_ncchildren))
4059	panic("new_vnode(%p): vnode still hooked into the name cache", vp);
4060	} else {
4061	vnode_unlock(vp);
4062	vp = NULLVP;
4063	}
4064	}
4065	return (vp);
4066	}
4067
4068	__attribute__((noreturn))
4069	static void
4070	async_work_continue(void)
4071	{
4072	struct async_work_lst *q;
4073	int deferred;
4074	vnode_t vp;
4075
4076	q = &vnode_async_work_list;
4077
4078	for (;;) {
4079
4080	vnode_list_lock();
4081
4082	if ( TAILQ_EMPTY(q) ) {
4083	assert_wait(q, (THREAD_UNINT));
4084
4085	vnode_list_unlock();
4086
4087	thread_block((thread_continue_t)async_work_continue);
4088
4089	continue;
4090	}
4091	async_work_handled++;
4092
4093	vp = TAILQ_FIRST(q);
4094
4095	vp = process_vp(vp, `0`, &deferred);
4096
4097	if (vp != NULLVP)
4098	panic("found VBAD vp (%p) on async queue", vp);
4099	}
4100	}
4101
4102
4103	static int
4104	new_vnode(vnode_t *vpp)
4105	{
4106	vnode_t vp;
4107	uint32_t retries = `0`, max_retries = `100`; / retry incase of tablefull /
4108	int force_alloc = `0`, walk_count = `0`;
4109	boolean_t need_reliable_vp = FALSE;
4110	int deferred;
4111	struct timeval initial_tv;
4112	struct timeval current_tv;
4113	proc_t curproc = current_proc();
4114
4115	initial_tv.tv_sec = `0`;
4116	retry:
4117	vp = NULLVP;
4118
4119	vnode_list_lock();
4120
4121	if (need_reliable_vp == TRUE)
4122	async_work_timed_out++;
4123
4124	if ((numvnodes - deadvnodes) < desiredvnodes \|\| force_alloc) {
4125	struct timespec ts;
4126
4127	if ( !TAILQ_EMPTY(&vnode_dead_list)) {
4128	/*
4129	* Can always reuse a dead one
4130	*/
4131	vp = TAILQ_FIRST(&vnode_dead_list);
4132	goto steal_this_vp;
4133	}
4134	/*
4135	* no dead vnodes available... if we're under
4136	* the limit, we'll create a new vnode
4137	*/
4138	numvnodes++;
4139	vnode_list_unlock();
4140
4141	MALLOC_ZONE(vp, struct vnode , sizeof(vp), M_VNODE, M_WAITOK);
4142	bzero((char )vp, sizeof(vp));
4143	VLISTNONE(vp); / avoid double queue removal /
4144	lck_mtx_init(&vp->v_lock, vnode_lck_grp, vnode_lck_attr);
4145
4146	TAILQ_INIT(&vp->v_ncchildren);
4147
4148	klist_init(&vp->v_knotes);
4149	nanouptime(&ts);
4150	vp->v_id = ts.tv_nsec;
4151	vp->v_flag = VSTANDARD;
4152
4153	#if CONFIG_MACF
4154	if (mac_vnode_label_init_needed(vp))
4155	mac_vnode_label_init(vp);
4156	#endif /* MAC */
4157
4158	vp->v_iocount = `1`;
4159	goto done;
4160	}
4161	microuptime(&current_tv);
4162
4163	#define MAX_WALK_COUNT 1000
4164
4165	if ( !TAILQ_EMPTY(&vnode_rage_list) &&
4166	(ragevnodes >= rage_limit \|\|
4167	(current_tv.tv_sec - rage_tv.tv_sec) >= RAGE_TIME_LIMIT)) {
4168
4169	TAILQ_FOREACH(vp, &vnode_rage_list, v_freelist) {
4170	if ( !(vp->v_listflag & VLIST_RAGE))
4171	panic("new_vnode: vp (%p) on RAGE list not marked VLIST_RAGE", vp);
4172
4173	// if we're a dependency-capable process, skip vnodes that can
4174	// cause recycling deadlocks. (i.e. this process is diskimages
4175	// helper and the vnode is in a disk image). Querying the
4176	// mnt_kern_flag for the mount's virtual device status
4177	// is safer than checking the mnt_dependent_process, which
4178	// may not be updated if there are multiple devnode layers
4179	// in between the disk image and the final consumer.
4180
4181	if ((curproc->p_flag & P_DEPENDENCY_CAPABLE) == `0` \|\| vp->v_mount == NULL \|\|
4182	(vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) == `0`) {
4183	/*
4184	* if need_reliable_vp == TRUE, then we've already sent one or more
4185	* non-reliable vnodes to the async thread for processing and timed
4186	* out waiting for a dead vnode to show up. Use the MAX_WALK_COUNT
4187	* mechanism to first scan for a reliable vnode before forcing
4188	* a new vnode to be created
4189	*/
4190	if (need_reliable_vp == FALSE \|\| vnode_on_reliable_media(vp) == TRUE)
4191	break;
4192	}
4193
4194	// don't iterate more than MAX_WALK_COUNT vnodes to
4195	// avoid keeping the vnode list lock held for too long.
4196
4197	if (walk_count++ > MAX_WALK_COUNT) {
4198	vp = NULL;
4199	break;
4200	}
4201	}
4202	}
4203
4204	if (vp == NULL && !TAILQ_EMPTY(&vnode_free_list)) {
4205	/*
4206	* Pick the first vp for possible reuse
4207	*/
4208	walk_count = `0`;
4209	TAILQ_FOREACH(vp, &vnode_free_list, v_freelist) {
4210
4211	// if we're a dependency-capable process, skip vnodes that can
4212	// cause recycling deadlocks. (i.e. this process is diskimages
4213	// helper and the vnode is in a disk image). Querying the
4214	// mnt_kern_flag for the mount's virtual device status
4215	// is safer than checking the mnt_dependent_process, which
4216	// may not be updated if there are multiple devnode layers
4217	// in between the disk image and the final consumer.
4218
4219	if ((curproc->p_flag & P_DEPENDENCY_CAPABLE) == `0` \|\| vp->v_mount == NULL \|\|
4220	(vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) == `0`) {
4221	/*
4222	* if need_reliable_vp == TRUE, then we've already sent one or more
4223	* non-reliable vnodes to the async thread for processing and timed
4224	* out waiting for a dead vnode to show up. Use the MAX_WALK_COUNT
4225	* mechanism to first scan for a reliable vnode before forcing
4226	* a new vnode to be created
4227	*/
4228	if (need_reliable_vp == FALSE \|\| vnode_on_reliable_media(vp) == TRUE)
4229	break;
4230	}
4231
4232	// don't iterate more than MAX_WALK_COUNT vnodes to
4233	// avoid keeping the vnode list lock held for too long.
4234
4235	if (walk_count++ > MAX_WALK_COUNT) {
4236	vp = NULL;
4237	break;
4238	}
4239	}
4240	}
4241
4242	//
4243	// if we don't have a vnode and the walk_count is >= MAX_WALK_COUNT
4244	// then we're trying to create a vnode on behalf of a
4245	// process like diskimages-helper that has file systems
4246	// mounted on top of itself (and thus we can't reclaim
4247	// vnodes in the file systems on top of us). if we can't
4248	// find a vnode to reclaim then we'll just have to force
4249	// the allocation.
4250	//
4251	if (vp == NULL && walk_count >= MAX_WALK_COUNT) {
4252	force_alloc = `1`;
4253	vnode_list_unlock();
4254	goto retry;
4255	}
4256
4257	if (vp == NULL) {
4258	/*
4259	* we've reached the system imposed maximum number of vnodes
4260	* but there isn't a single one available
4261	* wait a bit and then retry... if we can't get a vnode
4262	* after our target number of retries, than log a complaint
4263	*/
4264	if (++retries <= max_retries) {
4265	vnode_list_unlock();
4266	delay_for_interval(`1`, `1000` * `1000`);
4267	goto retry;
4268	}
4269
4270	vnode_list_unlock();
4271	tablefull("vnode");
4272	log(LOG_EMERG, "%d desired, %d numvnodes, "
4273	"%d free, %d dead, %d async, %d rage\n",
4274	desiredvnodes, numvnodes, freevnodes, deadvnodes, async_work_vnodes, ragevnodes);
4275	#if CONFIG_JETSAM
4276
4277	#if DEVELOPMENT \|\| DEBUG
4278	if (bootarg_no_vnode_jetsam)
4279	panic("vnode table is full\n");
4280	#endif /* DEVELOPMENT \|\| DEBUG */
4281
4282	/*
4283	* Running out of vnodes tends to make a system unusable. Start killing
4284	* processes that jetsam knows are killable.
4285	*/
4286	if (memorystatus_kill_on_vnode_limit() == FALSE) {
4287	/*
4288	* If jetsam can't find any more processes to kill and there
4289	* still aren't any free vnodes, panic. Hopefully we'll get a
4290	* panic log to tell us why we ran out.
4291	*/
4292	panic("vnode table is full\n");
4293	}
4294
4295	/*
4296	* Now that we've killed someone, wait a bit and continue looking
4297	* (with fewer retries before trying another kill).
4298	*/
4299	delay_for_interval(`3`, `1000` * `1000`);
4300	retries = `0`;
4301	max_retries = `10`;
4302	goto retry;
4303	#endif
4304
4305	*vpp = NULL;
4306	return (ENFILE);
4307	}
4308	steal_this_vp:
4309	if ((vp = process_vp(vp, `1`, &deferred)) == NULLVP) {
4310	if (deferred) {
4311	int elapsed_msecs;
4312	struct timeval elapsed_tv;
4313
4314	if (initial_tv.tv_sec == `0`)
4315	microuptime(&initial_tv);
4316
4317	vnode_list_lock();
4318
4319	dead_vnode_waited++;
4320	dead_vnode_wanted++;
4321
4322	/*
4323	* note that we're only going to explicitly wait 10ms
4324	* for a dead vnode to become available, since even if one
4325	* isn't available, a reliable vnode might now be available
4326	* at the head of the VRAGE or free lists... if so, we
4327	* can satisfy the new_vnode request with less latency then waiting
4328	* for the full 100ms duration we're ultimately willing to tolerate
4329	*/
4330	assert_wait_timeout((caddr_t)&dead_vnode_wanted, (THREAD_INTERRUPTIBLE), `10000`, NSEC_PER_USEC);
4331
4332	vnode_list_unlock();
4333
4334	thread_block(THREAD_CONTINUE_NULL);
4335
4336	microuptime(&elapsed_tv);
4337
4338	timevalsub(&elapsed_tv, &initial_tv);
4339	elapsed_msecs = elapsed_tv.tv_sec * `1000` + elapsed_tv.tv_usec / `1000`;
4340
4341	if (elapsed_msecs >= `100`) {
4342	/*
4343	* we've waited long enough... 100ms is
4344	* somewhat arbitrary for this case, but the
4345	* normal worst case latency used for UI
4346	* interaction is 100ms, so I've chosen to
4347	* go with that.
4348	*
4349	* setting need_reliable_vp to TRUE
4350	* forces us to find a reliable vnode
4351	* that we can process synchronously, or
4352	* to create a new one if the scan for
4353	* a reliable one hits the scan limit
4354	*/
4355	need_reliable_vp = TRUE;
4356	}
4357	}
4358	goto retry;
4359	}
4360	OSAddAtomicLong(`1`, &num_reusedvnodes);
4361
4362
4363	#if CONFIG_MACF
4364	/*
4365	* We should never see VL_LABELWAIT or VL_LABEL here.
4366	* as those operations hold a reference.
4367	*/
4368	assert ((vp->v_lflag & VL_LABELWAIT) != VL_LABELWAIT);
4369	assert ((vp->v_lflag & VL_LABEL) != VL_LABEL);
4370	if (vp->v_lflag & VL_LABELED) {
4371	vnode_lock_convert(vp);
4372	mac_vnode_label_recycle(vp);
4373	} else if (mac_vnode_label_init_needed(vp)) {
4374	vnode_lock_convert(vp);
4375	mac_vnode_label_init(vp);
4376	}
4377
4378	#endif /* MAC */
4379
4380	vp->v_iocount = `1`;
4381	vp->v_lflag = `0`;
4382	vp->v_writecount = `0`;
4383	vp->v_references = `0`;
4384	vp->v_iterblkflags = `0`;
4385	vp->v_flag = VSTANDARD;
4386	/ vbad vnodes can point to dead_mountp /
4387	vp->v_mount = NULL;
4388	vp->v_defer_reclaimlist = (vnode_t)`0`;
4389
4390	vnode_unlock(vp);
4391
4392	done:
4393	*vpp = vp;
4394
4395	return (`0`);
4396	}
4397
4398	void
4399	vnode_lock(vnode_t vp)
4400	{
4401	lck_mtx_lock(&vp->v_lock);
4402	}
4403
4404	void
4405	vnode_lock_spin(vnode_t vp)
4406	{
4407	lck_mtx_lock_spin(&vp->v_lock);
4408	}
4409
4410	void
4411	vnode_unlock(vnode_t vp)
4412	{
4413	lck_mtx_unlock(&vp->v_lock);
4414	}
4415
4416
4417
/*
 * Take an iocount on 'vp'.
 *
 * Locking wrapper around vnode_get_locked(): takes the vnode lock in spin
 * mode, performs the get, and drops the lock again.
 *
 * Returns whatever vnode_get_locked() returns (0 or ENOENT).
 */
int
vnode_get(struct vnode *vp)
{
	int error;

	vnode_lock_spin(vp);
	error = vnode_get_locked(vp);
	vnode_unlock(vp);

	return error;
}
4429
4430	int
4431	vnode_get_locked(struct vnode *vp)
4432	{
4433	#if DIAGNOSTIC
4434	lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);
4435	#endif
4436	if ((vp->v_iocount == `0`) && (vp->v_lflag & (VL_TERMINATE \| VL_DEAD))) {
4437	return(ENOENT);
4438	}
4439	vp->v_iocount++;
4440	#ifdef JOE_DEBUG
4441	record_vp(vp, `1`);
4442	#endif
4443	return (`0`);
4444	}
4445
4446	/*
4447	* vnode_getwithvid() cuts in line in front of a vnode drain (that is,
4448	* while the vnode is draining, but at no point after that) to prevent
4449	* deadlocks when getting vnodes from filesystem hashes while holding
4450	* resources that may prevent other iocounts from being released.
4451	*/
4452	int
4453	vnode_getwithvid(vnode_t vp, uint32_t vid)
4454	{
4455	return(vget_internal(vp, vid, ( VNODE_NODEAD \| VNODE_WITHID \| VNODE_DRAINO )));
4456	}
4457
4458	/*
4459	* vnode_getwithvid_drainok() is like vnode_getwithvid(), but does block behind a vnode
4460	* drain; it exists for use in the VFS name cache, where we really do want to block behind
4461	* vnode drain to prevent holding off an unmount.
4462	*/
4463	int
4464	vnode_getwithvid_drainok(vnode_t vp, uint32_t vid)
4465	{
4466	return(vget_internal(vp, vid, ( VNODE_NODEAD \| VNODE_WITHID )));
4467	}
4468
4469	int
4470	vnode_getwithref(vnode_t vp)
4471	{
4472	return(vget_internal(vp, `0`, `0`));
4473	}
4474
4475
4476	__private_extern__ int
4477	vnode_getalways(vnode_t vp)
4478	{
4479	return(vget_internal(vp, `0`, VNODE_ALWAYS));
4480	}
4481
4482	int
4483	vnode_put(vnode_t vp)
4484	{
4485	int retval;
4486
4487	vnode_lock_spin(vp);
4488	retval = vnode_put_locked(vp);
4489	vnode_unlock(vp);
4490
4491	return(retval);
4492	}
4493
4494	static inline void
4495	vn_set_dead(vnode_t vp)
4496	{
4497	vp->v_mount = NULL;
4498	vp->v_op = dead_vnodeop_p;
4499	vp->v_tag = VT_NON;
4500	vp->v_data = NULL;
4501	vp->v_type = VBAD;
4502	vp->v_lflag \|= VL_DEAD;
4503	}
4504
4505	int
4506	vnode_put_locked(vnode_t vp)
4507	{
4508	vfs_context_t ctx = vfs_context_current(); / hoist outside loop /
4509
4510	#if DIAGNOSTIC
4511	lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED);
4512	#endif
4513	retry:
4514	if (vp->v_iocount < `1`)
4515	panic("vnode_put(%p): iocount < 1", vp);
4516
4517	if ((vp->v_usecount > `0`) \|\| (vp->v_iocount > `1`)) {
4518	vnode_dropiocount(vp);
4519	return(`0`);
4520	}
4521	if ((vp->v_lflag & (VL_DEAD \| VL_NEEDINACTIVE)) == VL_NEEDINACTIVE) {
4522
4523	vp->v_lflag &= ~VL_NEEDINACTIVE;
4524	vnode_unlock(vp);
4525
4526	VNOP_INACTIVE(vp, ctx);
4527
4528	vnode_lock_spin(vp);
4529	/*
4530	* because we had to drop the vnode lock before calling
4531	* VNOP_INACTIVE, the state of this vnode may have changed...
4532	* we may pick up both VL_MARTERM and either
4533	* an iocount or a usecount while in the VNOP_INACTIVE call
4534	* we don't want to call vnode_reclaim_internal on a vnode
4535	* that has active references on it... so loop back around
4536	* and reevaluate the state
4537	*/
4538	goto retry;
4539	}
4540	vp->v_lflag &= ~VL_NEEDINACTIVE;
4541
4542	if ((vp->v_lflag & (VL_MARKTERM \| VL_TERMINATE \| VL_DEAD)) == VL_MARKTERM) {
4543	vnode_lock_convert(vp);
4544	vnode_reclaim_internal(vp, `1`, `1`, `0`);
4545	}
4546	vnode_dropiocount(vp);
4547	vnode_list_add(vp);
4548
4549	return(`0`);
4550	}
4551
4552	/ is vnode_t in use by others? /
4553	int
4554	vnode_isinuse(vnode_t vp, int refcnt)
4555	{
4556	return(vnode_isinuse_locked(vp, refcnt, `0`));
4557	}
4558
4559	int vnode_usecount(vnode_t vp)
4560	{
4561	return vp->v_usecount;
4562	}
4563
4564	int vnode_iocount(vnode_t vp)
4565	{
4566	return vp->v_iocount;
4567	}
4568
4569	static int
4570	vnode_isinuse_locked(vnode_t vp, int refcnt, int locked)
4571	{
4572	int retval = `0`;
4573
4574	if (!locked)
4575	vnode_lock_spin(vp);
4576	if ((vp->v_type != VREG) && ((vp->v_usecount - vp->v_kusecount) > refcnt)) {
4577	retval = `1`;
4578	goto out;
4579	}
4580	if (vp->v_type == VREG) {
4581	retval = ubc_isinuse_locked(vp, refcnt, `1`);
4582	}
4583
4584	out:
4585	if (!locked)
4586	vnode_unlock(vp);
4587	return(retval);
4588	}
4589
4590
4591	/ resume vnode_t /
4592	errno_t
4593	vnode_resume(vnode_t vp)
4594	{
4595	if ((vp->v_lflag & VL_SUSPENDED) && vp->v_owner == current_thread()) {
4596
4597	vnode_lock_spin(vp);
4598	vp->v_lflag &= ~VL_SUSPENDED;
4599	vp->v_owner = NULL;
4600	vnode_unlock(vp);
4601
4602	wakeup(&vp->v_iocount);
4603	}
4604	return(`0`);
4605	}
4606
4607	/ suspend vnode_t*
4608	* Please do not use on more than one vnode at a time as it may
4609	* cause deadlocks.
4610	* xxx should we explicity prevent this from happening?
4611	*/
4612
4613	errno_t
4614	vnode_suspend(vnode_t vp)
4615	{
4616	if (vp->v_lflag & VL_SUSPENDED) {
4617	return(EBUSY);
4618	}
4619
4620	vnode_lock_spin(vp);
4621
4622	/*
4623	* xxx is this sufficient to check if a vnode_drain is
4624	* progress?
4625	*/
4626
4627	if (vp->v_owner == NULL) {
4628	vp->v_lflag \|= VL_SUSPENDED;
4629	vp->v_owner = current_thread();
4630	}
4631	vnode_unlock(vp);
4632
4633	return(`0`);
4634	}
4635
4636	/*
4637	* Release any blocked locking requests on the vnode.
4638	* Used for forced-unmounts.
4639	*
4640	* XXX What about network filesystems?
4641	*/
4642	static void
4643	vnode_abort_advlocks(vnode_t vp)
4644	{
4645	if (vp->v_flag & VLOCKLOCAL)
4646	lf_abort_advlocks(vp);
4647	}
4648
4649
4650	static errno_t
4651	vnode_drain(vnode_t vp)
4652	{
4653
4654	if (vp->v_lflag & VL_DRAIN) {
4655	panic("vnode_drain: recursive drain");
4656	return(ENOENT);
4657	}
4658	vp->v_lflag \|= VL_DRAIN;
4659	vp->v_owner = current_thread();
4660
4661	while (vp->v_iocount > `1`)
4662	msleep(&vp->v_iocount, &vp->v_lock, PVFS, "vnode_drain", NULL);
4663
4664	vp->v_lflag &= ~VL_DRAIN;
4665
4666	return(`0`);
4667	}
4668
4669
/*
 * If the number of recent references via vnode_getwithvid or
 * vnode_getwithref reaches this threshold, then 'UN-AGE' the vnode by
 * removing it from the LRU list if it is currently on it.  Once the
 * iocount and usecount both drop to 0, it will be put back on the end of
 * the list, effectively making it younger.  This lets us keep actively
 * referenced vnodes in the list without having to remove and re-add a
 * vnode every time one without a usecount is referenced, which would cost
 * us taking and dropping a global lock twice.
 * However, if the vnode is marked DIRTY, we want to pull it out much
 * earlier, hence the lower second threshold.
 */
#define UNAGE_THRESHHOLD 25
#define UNAGE_DIRTYTHRESHHOLD 6
4682
4683	errno_t
4684	vnode_getiocount(vnode_t vp, unsigned int vid, int vflags)
4685	{
4686	int nodead = vflags & VNODE_NODEAD;
4687	int nosusp = vflags & VNODE_NOSUSPEND;
4688	int always = vflags & VNODE_ALWAYS;
4689	int beatdrain = vflags & VNODE_DRAINO;
4690	int withvid = vflags & VNODE_WITHID;
4691
4692	for (;;) {
4693	int sleepflg = `0`;
4694
4695	/*
4696	* if it is a dead vnode with deadfs
4697	*/
4698	if (nodead && (vp->v_lflag & VL_DEAD) && ((vp->v_type == VBAD) \|\| (vp->v_data == `0`))) {
4699	return(ENOENT);
4700	}
4701	/*
4702	* will return VL_DEAD ones
4703	*/
4704	if ((vp->v_lflag & (VL_SUSPENDED \| VL_DRAIN \| VL_TERMINATE)) == `0` ) {
4705	break;
4706	}
4707	/*
4708	* if suspended vnodes are to be failed
4709	*/
4710	if (nosusp && (vp->v_lflag & VL_SUSPENDED)) {
4711	return(ENOENT);
4712	}
4713	/*
4714	* if you are the owner of drain/suspend/termination , can acquire iocount
4715	* check for VL_TERMINATE; it does not set owner
4716	*/
4717	if ((vp->v_lflag & (VL_DRAIN \| VL_SUSPENDED \| VL_TERMINATE)) &&
4718	(vp->v_owner == current_thread())) {
4719	break;
4720	}
4721
4722	if (always != `0`)
4723	break;
4724
4725	/*
4726	* If this vnode is getting drained, there are some cases where
4727	* we can't block or, in case of tty vnodes, want to be
4728	* interruptible.
4729	*/
4730	if (vp->v_lflag & VL_DRAIN) {
4731	/*
4732	* In some situations, we want to get an iocount
4733	* even if the vnode is draining to prevent deadlock,
4734	* e.g. if we're in the filesystem, potentially holding
4735	* resources that could prevent other iocounts from
4736	* being released.
4737	*/
4738	if (beatdrain)
4739	break;
4740	/*
4741	* Don't block if the vnode's mount point is unmounting as
4742	* we may be the thread the unmount is itself waiting on
4743	* Only callers who pass in vids (at this point, we've already
4744	* handled nosusp and nodead) are expecting error returns
4745	* from this function, so only we can only return errors for
4746	* those. ENODEV is intended to inform callers that the call
4747	* failed because an unmount is in progress.
4748	*/
4749	if (withvid && (vp->v_mount) && vfs_isunmount(vp->v_mount))
4750	return (ENODEV);
4751
4752	if (vnode_istty(vp)) {
4753	sleepflg = PCATCH;
4754	}
4755	}
4756
4757	vnode_lock_convert(vp);
4758
4759	if (vp->v_lflag & VL_TERMINATE) {
4760	int error;
4761
4762	vp->v_lflag \|= VL_TERMWANT;
4763
4764	error = msleep(&vp->v_lflag, &vp->v_lock,
4765	(PVFS \| sleepflg), "vnode getiocount", NULL);
4766	if (error)
4767	return (error);
4768	} else
4769	msleep(&vp->v_iocount, &vp->v_lock, PVFS, "vnode_getiocount", NULL);
4770	}
4771	if (withvid && vid != vp->v_id) {
4772	return(ENOENT);
4773	}
4774	if (++vp->v_references >= UNAGE_THRESHHOLD \|\|
4775	(vp->v_flag & VISDIRTY && vp->v_references >= UNAGE_DIRTYTHRESHHOLD)) {
4776	vp->v_references = `0`;
4777	vnode_list_remove(vp);
4778	}
4779	vp->v_iocount++;
4780	#ifdef JOE_DEBUG
4781	record_vp(vp, `1`);
4782	#endif
4783	return(`0`);
4784	}
4785
4786	static void
4787	vnode_dropiocount (vnode_t vp)
4788	{
4789	if (vp->v_iocount < `1`)
4790	panic("vnode_dropiocount(%p): v_iocount < 1", vp);
4791
4792	vp->v_iocount--;
4793	#ifdef JOE_DEBUG
4794	record_vp(vp, -`1`);
4795	#endif
4796	if ((vp->v_lflag & (VL_DRAIN \| VL_SUSPENDED)) && (vp->v_iocount <= `1`))
4797	wakeup(&vp->v_iocount);
4798	}
4799
4800
/*
 * Reclaim a vnode: vnode_reclaim_internal() with the vnode unlocked,
 * no reuse and no flags.
 */
void
vnode_reclaim(struct vnode *vp)
{
	vnode_reclaim_internal(vp, 0, 0, 0);
}
4806
4807	__private_extern__
4808	void
4809	vnode_reclaim_internal(struct vnode * vp, int locked, int reuse, int flags)
4810	{
4811	int isfifo = `0`;
4812
4813	if (!locked)
4814	vnode_lock(vp);
4815
4816	if (vp->v_lflag & VL_TERMINATE) {
4817	panic("vnode reclaim in progress");
4818	}
4819	vp->v_lflag \|= VL_TERMINATE;
4820
4821	vn_clearunionwait(vp, `1`);
4822
4823	vnode_drain(vp);
4824
4825	isfifo = (vp->v_type == VFIFO);
4826
4827	if (vp->v_type != VBAD)
4828	vgone(vp, flags); / clean and reclaim the vnode /
4829
4830	/*
4831	* give the vnode a new identity so that vnode_getwithvid will fail
4832	* on any stale cache accesses...
4833	* grab the list_lock so that if we're in "new_vnode"
4834	* behind the list_lock trying to steal this vnode, the v_id is stable...
4835	* once new_vnode drops the list_lock, it will block trying to take
4836	* the vnode lock until we release it... at that point it will evaluate
4837	* whether the v_vid has changed
4838	* also need to make sure that the vnode isn't on a list where "new_vnode"
4839	* can find it after the v_id has been bumped until we are completely done
4840	* with the vnode (i.e. putting it back on a list has to be the very last
4841	* thing we do to this vnode... many of the callers of vnode_reclaim_internal
4842	* are holding an io_count on the vnode... they need to drop the io_count
4843	* BEFORE doing a vnode_list_add or make sure to hold the vnode lock until
4844	* they are completely done with the vnode
4845	*/
4846	vnode_list_lock();
4847
4848	vnode_list_remove_locked(vp);
4849	vp->v_id++;
4850
4851	vnode_list_unlock();
4852
4853	if (isfifo) {
4854	struct fifoinfo * fip;
4855
4856	fip = vp->v_fifoinfo;
4857	vp->v_fifoinfo = NULL;
4858	FREE(fip, M_TEMP);
4859	}
4860	vp->v_type = VBAD;
4861
4862	if (vp->v_data)
4863	panic("vnode_reclaim_internal: cleaned vnode isn't");
4864	if (vp->v_numoutput)
4865	panic("vnode_reclaim_internal: clean vnode has pending I/O's");
4866	if (UBCINFOEXISTS(vp))
4867	panic("vnode_reclaim_internal: ubcinfo not cleaned");
4868	if (vp->v_parent)
4869	panic("vnode_reclaim_internal: vparent not removed");
4870	if (vp->v_name)
4871	panic("vnode_reclaim_internal: vname not removed");
4872
4873	vp->v_socket = NULL;
4874
4875	vp->v_lflag &= ~VL_TERMINATE;
4876	vp->v_owner = NULL;
4877
4878	KNOTE(&vp->v_knotes, NOTE_REVOKE);
4879
4880	/ Make sure that when we reuse the vnode, no knotes left over /
4881	klist_init(&vp->v_knotes);
4882
4883	if (vp->v_lflag & VL_TERMWANT) {
4884	vp->v_lflag &= ~VL_TERMWANT;
4885	wakeup(&vp->v_lflag);
4886	}
4887	if (!reuse) {
4888	/*
4889	* make sure we get on the
4890	* dead list if appropriate
4891	*/
4892	vnode_list_add(vp);
4893	}
4894	if (!locked)
4895	vnode_unlock(vp);
4896	}
4897
4898	static int
4899	vnode_create_internal(uint32_t flavor, uint32_t size, void data, vnode_t vpp,
4900	int init_vnode)
4901	{
4902	int error;
4903	int insert = `1`;
4904	int existing_vnode;
4905	vnode_t vp;
4906	vnode_t nvp;
4907	vnode_t dvp;
4908	struct uthread *ut;
4909	struct componentname *cnp;
4910	struct vnode_fsparam param = (struct* vnode_fsparam *)data;
4911	#if CONFIG_TRIGGERS
4912	struct vnode_trigger_param *tinfo = NULL;
4913	#endif
4914	if (*vpp) {
4915	vp = *vpp;
4916	*vpp = NULLVP;
4917	existing_vnode = `1`;
4918	} else {
4919	existing_vnode = `0`;
4920	}
4921
4922	if (init_vnode) {
4923	/ Do quick sanity check on the parameters. /
4924	if ((param == NULL) \|\| (param->vnfs_vtype == VBAD)) {
4925	error = EINVAL;
4926	goto error_out;
4927	}
4928
4929	#if CONFIG_TRIGGERS
4930	if ((flavor == VNCREATE_TRIGGER) && (size == VNCREATE_TRIGGER_SIZE)) {
4931	tinfo = (struct vnode_trigger_param *)data;
4932
4933	/ Validate trigger vnode input /
4934	if ((param->vnfs_vtype != VDIR) \|\|
4935	(tinfo->vnt_resolve_func == NULL) \|\|
4936	(tinfo->vnt_flags & ~VNT_VALID_MASK)) {
4937	error = EINVAL;
4938	goto error_out;
4939	}
4940	/ Fall through a normal create (params will be the same) /
4941	flavor = VNCREATE_FLAVOR;
4942	size = VCREATESIZE;
4943	}
4944	#endif
4945	if ((flavor != VNCREATE_FLAVOR) \|\| (size != VCREATESIZE)) {
4946	error = EINVAL;
4947	goto error_out;
4948	}
4949	}
4950
4951	if (!existing_vnode) {
4952	if ((error = new_vnode(&vp)) ) {
4953	return (error);
4954	}
4955	if (!init_vnode) {
4956	/ Make it so that it can be released by a vnode_put) /
4957	vn_set_dead(vp);
4958	*vpp = vp;
4959	return (`0`);
4960	}
4961	} else {
4962	/*
4963	* A vnode obtained by vnode_create_empty has been passed to
4964	* vnode_initialize - Unset VL_DEAD set by vn_set_dead. After
4965	* this point, it is set back on any error.
4966	*
4967	* N.B. vnode locking - We make the same assumptions as the
4968	* "unsplit" vnode_create did - i.e. it is safe to update the
4969	* vnode's fields without the vnode lock. This vnode has been
4970	* out and about with the filesystem and hopefully nothing
4971	* was done to the vnode between the vnode_create_empty and
4972	* now when it has come in through vnode_initialize.
4973	*/
4974	vp->v_lflag &= ~VL_DEAD;
4975	}
4976
4977	dvp = param->vnfs_dvp;
4978	cnp = param->vnfs_cnp;
4979
4980	vp->v_op = param->vnfs_vops;
4981	vp->v_type = param->vnfs_vtype;
4982	vp->v_data = param->vnfs_fsnode;
4983
4984	if (param->vnfs_markroot)
4985	vp->v_flag \|= VROOT;
4986	if (param->vnfs_marksystem)
4987	vp->v_flag \|= VSYSTEM;
4988	if (vp->v_type == VREG) {
4989	error = ubc_info_init_withsize(vp, param->vnfs_filesize);
4990	if (error) {
4991	#ifdef JOE_DEBUG
4992	record_vp(vp, `1`);
4993	#endif
4994	vn_set_dead(vp);
4995
4996	vnode_put(vp);
4997	return(error);
4998	}
4999	if (param->vnfs_mp->mnt_ioflags & MNT_IOFLAGS_IOSCHED_SUPPORTED)
5000	memory_object_mark_io_tracking(vp->v_ubcinfo->ui_control);
5001	}
5002	#ifdef JOE_DEBUG
5003	record_vp(vp, `1`);
5004	#endif
5005
5006	#if CONFIG_TRIGGERS
5007	/*
5008	* For trigger vnodes, attach trigger info to vnode
5009	*/
5010	if ((vp->v_type == VDIR) && (tinfo != NULL)) {
5011	/*
5012	* Note: has a side effect of incrementing trigger count on the
5013	* mount if successful, which we would need to undo on a
5014	* subsequent failure.
5015	*/
5016	#ifdef JOE_DEBUG
5017	record_vp(vp, -`1`);
5018	#endif
5019	error = vnode_resolver_create(param->vnfs_mp, vp, tinfo, FALSE);
5020	if (error) {
5021	printf("vnode_create: vnode_resolver_create() err %d\n", error);
5022	vn_set_dead(vp);
5023	#ifdef JOE_DEBUG
5024	record_vp(vp, `1`);
5025	#endif
5026	vnode_put(vp);
5027	return (error);
5028	}
5029	}
5030	#endif
5031	if (vp->v_type == VCHR \|\| vp->v_type == VBLK) {
5032
5033	vp->v_tag = VT_DEVFS; / callers will reset if needed (bdevvp) /
5034
5035	if ( (nvp = checkalias(vp, param->vnfs_rdev)) ) {
5036	/*
5037	* if checkalias returns a vnode, it will be locked
5038	*
5039	* first get rid of the unneeded vnode we acquired
5040	*/
5041	vp->v_data = NULL;
5042	vp->v_op = spec_vnodeop_p;
5043	vp->v_type = VBAD;
5044	vp->v_lflag = VL_DEAD;
5045	vp->v_data = NULL;
5046	vp->v_tag = VT_NON;
5047	vnode_put(vp);
5048
5049	/*
5050	* switch to aliased vnode and finish
5051	* preparing it
5052	*/
5053	vp = nvp;
5054
5055	vclean(vp, `0`);
5056	vp->v_op = param->vnfs_vops;
5057	vp->v_type = param->vnfs_vtype;
5058	vp->v_data = param->vnfs_fsnode;
5059	vp->v_lflag = `0`;
5060	vp->v_mount = NULL;
5061	insmntque(vp, param->vnfs_mp);
5062	insert = `0`;
5063	vnode_unlock(vp);
5064	}
5065
5066	if (VCHR == vp->v_type) {
5067	u_int maj = major(vp->v_rdev);
5068
5069	if (maj < (u_int)nchrdev && cdevsw[maj].d_type == D_TTY)
5070	vp->v_flag \|= VISTTY;
5071	}
5072	}
5073
5074	if (vp->v_type == VFIFO) {
5075	struct fifoinfo *fip;
5076
5077	MALLOC(fip, struct fifoinfo *,
5078	sizeof(*fip), M_TEMP, M_WAITOK);
5079	bzero(fip, sizeof(struct fifoinfo ));
5080	vp->v_fifoinfo = fip;
5081	}
5082	/ The file systems must pass the address of the location where*
5083	* they store the vnode pointer. When we add the vnode into the mount
5084	* list and name cache they become discoverable. So the file system node
5085	* must have the connection to vnode setup by then
5086	*/
5087	*vpp = vp;
5088
5089	/ Add fs named reference. /
5090	if (param->vnfs_flags & VNFS_ADDFSREF) {
5091	vp->v_lflag \|= VNAMED_FSHASH;
5092	}
5093	if (param->vnfs_mp) {
5094	if (param->vnfs_mp->mnt_kern_flag & MNTK_LOCK_LOCAL)
5095	vp->v_flag \|= VLOCKLOCAL;
5096	if (insert) {
5097	if ((vp->v_freelist.tqe_prev != (struct vnode **)`0xdeadb`))
5098	panic("insmntque: vp on the free list\n");
5099
5100	/*
5101	* enter in mount vnode list
5102	*/
5103	insmntque(vp, param->vnfs_mp);
5104	}
5105	}
5106	if (dvp && vnode_ref(dvp) == `0`) {
5107	vp->v_parent = dvp;
5108	}
5109	if (cnp) {
5110	if (dvp && ((param->vnfs_flags & (VNFS_NOCACHE \| VNFS_CANTCACHE)) == `0`)) {
5111	/*
5112	* enter into name cache
5113	* we've got the info to enter it into the name cache now
5114	* cache_enter_create will pick up an extra reference on
5115	* the name entered into the string cache
5116	*/
5117	vp->v_name = cache_enter_create(dvp, vp, cnp);
5118	} else
5119	vp->v_name = vfs_addname(cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_hash, `0`);
5120
5121	if ((cnp->cn_flags & UNIONCREATED) == UNIONCREATED)
5122	vp->v_flag \|= VISUNION;
5123	}
5124	if ((param->vnfs_flags & VNFS_CANTCACHE) == `0`) {
5125	/*
5126	* this vnode is being created as cacheable in the name cache
5127	* this allows us to re-enter it in the cache
5128	*/
5129	vp->v_flag \|= VNCACHEABLE;
5130	}
5131	ut = get_bsdthread_info(current_thread());
5132
5133	if ((current_proc()->p_lflag & P_LRAGE_VNODES) \|\|
5134	(ut->uu_flag & (UT_RAGE_VNODES \| UT_KERN_RAGE_VNODES))) {
5135	/*
5136	* process has indicated that it wants any
5137	* vnodes created on its behalf to be rapidly
5138	* aged to reduce the impact on the cached set
5139	* of vnodes
5140	*
5141	* if UT_KERN_RAGE_VNODES is set, then the
5142	* kernel internally wants vnodes to be rapidly
5143	* aged, even if the process hasn't requested
5144	* this
5145	*/
5146	vp->v_flag \|= VRAGE;
5147	}
5148
5149	#if CONFIG_SECLUDED_MEMORY
5150	switch (secluded_for_filecache) {
5151	case `0`:
5152	/*
5153	* secluded_for_filecache == 0:
5154	* + no file contents in secluded pool
5155	*/
5156	break;
5157	case `1`:
5158	/*
5159	* secluded_for_filecache == 1:
5160	* + no files from /
5161	* + files from /Applications/ are OK
5162	* + files from /Applications/Camera are not OK
5163	* + no files that are open for write
5164	*/
5165	if (vnode_vtype(vp) == VREG &&
5166	vnode_mount(vp) != NULL &&
5167	(! (vfs_flags(vnode_mount(vp)) & MNT_ROOTFS))) {
5168	/ not from root filesystem: eligible for secluded pages /
5169	memory_object_mark_eligible_for_secluded(
5170	ubc_getobject(vp, UBC_FLAGS_NONE),
5171	TRUE);
5172	}
5173	break;
5174	case `2`:
5175	/*
5176	* secluded_for_filecache == 2:
5177	* + all read-only files OK, except:
5178	* + dyld_shared_cache_arm64*
5179	* + Camera
5180	* + mediaserverd
5181	*/
5182	if (vnode_vtype(vp) == VREG) {
5183	memory_object_mark_eligible_for_secluded(
5184	ubc_getobject(vp, UBC_FLAGS_NONE),
5185	TRUE);
5186	}
5187	break;
5188	default:
5189	break;
5190	}
5191	#endif /* CONFIG_SECLUDED_MEMORY */
5192
5193	return (`0`);
5194
5195	error_out:
5196	if (existing_vnode) {
5197	vnode_put(vp);
5198	}
5199	return (error);
5200	}
5201
5202	/ USAGE:*
5203	* The following api creates a vnode and associates all the parameter specified in vnode_fsparam
5204	* structure and returns a vnode handle with a reference. device aliasing is handled here so checkalias
5205	* is obsoleted by this.
5206	*/
5207	int
5208	vnode_create(uint32_t flavor, uint32_t size, void data, vnode_t vpp)
5209	{
5210	*vpp = NULLVP;
5211	return (vnode_create_internal(flavor, size, data, vpp, `1`));
5212	}
5213
5214	int
5215	vnode_create_empty(vnode_t *vpp)
5216	{
5217	*vpp = NULLVP;
5218	return (vnode_create_internal(VNCREATE_FLAVOR, VCREATESIZE, NULL,
5219	vpp, `0`));
5220	}
5221
5222	int
5223	vnode_initialize(uint32_t flavor, uint32_t size, void data, vnode_t vpp)
5224	{
5225	if (*vpp == NULLVP) {
5226	panic("NULL vnode passed to vnode_initialize");
5227	}
5228	#if DEVELOPMENT \|\| DEBUG
5229	/*
5230	* We lock to check that vnode is fit for unlocked use in
5231	* vnode_create_internal.
5232	*/
5233	vnode_lock_spin(*vpp);
5234	VNASSERT(((vpp)->v_iocount == `1`), vpp,
5235	("vnode_initialize : iocount not 1, is %d", (*vpp)->v_iocount));
5236	VNASSERT(((vpp)->v_usecount == `0`), vpp,
5237	("vnode_initialize : usecount not 0, is %d", (*vpp)->v_usecount));
5238	VNASSERT(((vpp)->v_lflag & VL_DEAD), vpp,
5239	("vnode_initialize : v_lflag does not have VL_DEAD, is 0x%x",
5240	(*vpp)->v_lflag));
5241	VNASSERT(((vpp)->v_data == NULL), vpp,
5242	("vnode_initialize : v_data not NULL"));
5243	vnode_unlock(*vpp);
5244	#endif
5245	return (vnode_create_internal(flavor, size, data, vpp, `1`));
5246	}
5247
5248	int
5249	vnode_addfsref(vnode_t vp)
5250	{
5251	vnode_lock_spin(vp);
5252	if (vp->v_lflag & VNAMED_FSHASH)
5253	panic("add_fsref: vp already has named reference");
5254	if ((vp->v_freelist.tqe_prev != (struct vnode **)`0xdeadb`))
5255	panic("addfsref: vp on the free list\n");
5256	vp->v_lflag \|= VNAMED_FSHASH;
5257	vnode_unlock(vp);
5258	return(`0`);
5259
5260	}
5261	int
5262	vnode_removefsref(vnode_t vp)
5263	{
5264	vnode_lock_spin(vp);
5265	if ((vp->v_lflag & VNAMED_FSHASH) == `0`)
5266	panic("remove_fsref: no named reference");
5267	vp->v_lflag &= ~VNAMED_FSHASH;
5268	vnode_unlock(vp);
5269	return(`0`);
5270
5271	}
5272
5273
5274	int
5275	vfs_iterate(int flags, int (callout)(mount_t, void* ), void* *arg)
5276	{
5277	mount_t mp;
5278	int ret = `0`;
5279	fsid_t * fsid_list;
5280	int count, actualcount, i;
5281	void * allocmem;
5282	int indx_start, indx_stop, indx_incr;
5283	int cb_dropref = (flags & VFS_ITERATE_CB_DROPREF);
5284
5285	count = mount_getvfscnt();
5286	count += `10`;
5287
5288	fsid_list = (fsid_t )kalloc(count sizeof(fsid_t));
5289	allocmem = (void *)fsid_list;
5290
5291	actualcount = mount_fillfsids(fsid_list, count);
5292
5293	/*
5294	* Establish the iteration direction
5295	* VFS_ITERATE_TAIL_FIRST overrides default head first order (oldest first)
5296	*/
5297	if (flags & VFS_ITERATE_TAIL_FIRST) {
5298	indx_start = actualcount - `1`;
5299	indx_stop = -`1`;
5300	indx_incr = -`1`;
5301	} else / Head first by default / {
5302	indx_start = `0`;
5303	indx_stop = actualcount;
5304	indx_incr = `1`;
5305	}
5306
5307	for (i=indx_start; i != indx_stop; i += indx_incr) {
5308
5309	/ obtain the mount point with iteration reference /
5310	mp = mount_list_lookupby_fsid(&fsid_list[i], `0`, `1`);
5311
5312	if(mp == (struct mount *)`0`)
5313	continue;
5314	mount_lock(mp);
5315	if (mp->mnt_lflag & (MNT_LDEAD \| MNT_LUNMOUNT)) {
5316	mount_unlock(mp);
5317	mount_iterdrop(mp);
5318	continue;
5319
5320	}
5321	mount_unlock(mp);
5322
5323	/ iterate over all the vnodes /
5324	ret = callout(mp, arg);
5325
5326	/*
5327	* Drop the iterref here if the callback didn't do it.
5328	* Note: If cb_dropref is set the mp may no longer exist.
5329	*/
5330	if (!cb_dropref)
5331	mount_iterdrop(mp);
5332
5333	switch (ret) {
5334	case VFS_RETURNED:
5335	case VFS_RETURNED_DONE:
5336	if (ret == VFS_RETURNED_DONE) {
5337	ret = `0`;
5338	goto out;
5339	}
5340	break;
5341
5342	case VFS_CLAIMED_DONE:
5343	ret = `0`;
5344	goto out;
5345	case VFS_CLAIMED:
5346	default:
5347	break;
5348	}
5349	ret = `0`;
5350	}
5351
5352	out:
5353	kfree(allocmem, (count * sizeof(fsid_t)));
5354	return (ret);
5355	}
5356
5357	/*
5358	* Update the vfsstatfs structure in the mountpoint.
5359	* MAC: Parameter eventtype added, indicating whether the event that
5360	* triggered this update came from user space, via a system call
5361	* (VFS_USER_EVENT) or an internal kernel call (VFS_KERNEL_EVENT).
5362	*/
5363	int
5364	vfs_update_vfsstat(mount_t mp, vfs_context_t ctx, __unused int eventtype)
5365	{
5366	struct vfs_attr va;
5367	int error;
5368
5369	/*
5370	* Request the attributes we want to propagate into
5371	* the per-mount vfsstat structure.
5372	*/
5373	VFSATTR_INIT(&va);
5374	VFSATTR_WANTED(&va, f_iosize);
5375	VFSATTR_WANTED(&va, f_blocks);
5376	VFSATTR_WANTED(&va, f_bfree);
5377	VFSATTR_WANTED(&va, f_bavail);
5378	VFSATTR_WANTED(&va, f_bused);
5379	VFSATTR_WANTED(&va, f_files);
5380	VFSATTR_WANTED(&va, f_ffree);
5381	VFSATTR_WANTED(&va, f_bsize);
5382	VFSATTR_WANTED(&va, f_fssubtype);
5383
5384	if ((error = vfs_getattr(mp, &va, ctx)) != `0`) {
5385	KAUTH_DEBUG("STAT - filesystem returned error %d", error);
5386	return(error);
5387	}
5388	#if CONFIG_MACF
5389	if (eventtype == VFS_USER_EVENT) {
5390	error = mac_mount_check_getattr(ctx, mp, &va);
5391	if (error != `0`)
5392	return (error);
5393	}
5394	#endif
5395	/*
5396	* Unpack into the per-mount structure.
5397	*
5398	* We only overwrite these fields, which are likely to change:
5399	* f_blocks
5400	* f_bfree
5401	* f_bavail
5402	* f_bused
5403	* f_files
5404	* f_ffree
5405	*
5406	* And these which are not, but which the FS has no other way
5407	* of providing to us:
5408	* f_bsize
5409	* f_iosize
5410	* f_fssubtype
5411	*
5412	*/
5413	if (VFSATTR_IS_SUPPORTED(&va, f_bsize)) {
5414	/ 4822056 - protect against malformed server mount /
5415	mp->mnt_vfsstat.f_bsize = (va.f_bsize > `0` ? va.f_bsize : `512`);
5416	} else {
5417	mp->mnt_vfsstat.f_bsize = mp->mnt_devblocksize; / default from the device block size /
5418	}
5419	if (VFSATTR_IS_SUPPORTED(&va, f_iosize)) {
5420	mp->mnt_vfsstat.f_iosize = va.f_iosize;
5421	} else {
5422	mp->mnt_vfsstat.f_iosize = `1024` * `1024`; / 1MB sensible I/O size /
5423	}
5424	if (VFSATTR_IS_SUPPORTED(&va, f_blocks))
5425	mp->mnt_vfsstat.f_blocks = va.f_blocks;
5426	if (VFSATTR_IS_SUPPORTED(&va, f_bfree))
5427	mp->mnt_vfsstat.f_bfree = va.f_bfree;
5428	if (VFSATTR_IS_SUPPORTED(&va, f_bavail))
5429	mp->mnt_vfsstat.f_bavail = va.f_bavail;
5430	if (VFSATTR_IS_SUPPORTED(&va, f_bused))
5431	mp->mnt_vfsstat.f_bused = va.f_bused;
5432	if (VFSATTR_IS_SUPPORTED(&va, f_files))
5433	mp->mnt_vfsstat.f_files = va.f_files;
5434	if (VFSATTR_IS_SUPPORTED(&va, f_ffree))
5435	mp->mnt_vfsstat.f_ffree = va.f_ffree;
5436
5437	/ this is unlikely to change, but has to be queried for /
5438	if (VFSATTR_IS_SUPPORTED(&va, f_fssubtype))
5439	mp->mnt_vfsstat.f_fssubtype = va.f_fssubtype;
5440
5441	return(`0`);
5442	}
5443
5444	int
5445	mount_list_add(mount_t mp)
5446	{
5447	int res;
5448
5449	mount_list_lock();
5450	if (system_inshutdown != `0`) {
5451	res = -`1`;
5452	} else {
5453	TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
5454	nummounts++;
5455	res = `0`;
5456	}
5457	mount_list_unlock();
5458
5459	return res;
5460	}
5461
5462	void
5463	mount_list_remove(mount_t mp)
5464	{
5465	mount_list_lock();
5466	TAILQ_REMOVE(&mountlist, mp, mnt_list);
5467	nummounts--;
5468	mp->mnt_list.tqe_next = NULL;
5469	mp->mnt_list.tqe_prev = NULL;
5470	mount_list_unlock();
5471	}
5472
5473	mount_t
5474	mount_lookupby_volfsid(int volfs_id, int withref)
5475	{
5476	mount_t cur_mount = (mount_t)`0`;
5477	mount_t mp;
5478
5479	mount_list_lock();
5480	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
5481	if (!(mp->mnt_kern_flag & MNTK_UNMOUNT) &&
5482	(mp->mnt_kern_flag & MNTK_PATH_FROM_ID) &&
5483	(mp->mnt_vfsstat.f_fsid.val[`0`] == volfs_id)) {
5484	cur_mount = mp;
5485	if (withref) {
5486	if (mount_iterref(cur_mount, `1`)) {
5487	cur_mount = (mount_t)`0`;
5488	mount_list_unlock();
5489	goto out;
5490	}
5491	}
5492	break;
5493	}
5494	}
5495	mount_list_unlock();
5496	if (withref && (cur_mount != (mount_t)`0`)) {
5497	mp = cur_mount;
5498	if (vfs_busy(mp, LK_NOWAIT) != `0`) {
5499	cur_mount = (mount_t)`0`;
5500	}
5501	mount_iterdrop(mp);
5502	}
5503	out:
5504	return(cur_mount);
5505	}
5506
5507	mount_t
5508	mount_list_lookupby_fsid(fsid_t fsid, int* locked, int withref)
5509	{
5510	mount_t retmp = (mount_t)`0`;
5511	mount_t mp;
5512
5513	if (!locked)
5514	mount_list_lock();
5515	TAILQ_FOREACH(mp, &mountlist, mnt_list)
5516	if (mp->mnt_vfsstat.f_fsid.val[`0`] == fsid->val[`0`] &&
5517	mp->mnt_vfsstat.f_fsid.val[`1`] == fsid->val[`1`]) {
5518	retmp = mp;
5519	if (withref) {
5520	if (mount_iterref(retmp, `1`))
5521	retmp = (mount_t)`0`;
5522	}
5523	goto out;
5524	}
5525	out:
5526	if (!locked)
5527	mount_list_unlock();
5528	return (retmp);
5529	}
5530
5531	errno_t
5532	vnode_lookup(const char path, int* flags, vnode_t *vpp, vfs_context_t ctx)
5533	{
5534	struct nameidata nd;
5535	int error;
5536	u_int32_t ndflags = `0`;
5537
5538	if (ctx == NULL) {
5539	return EINVAL;
5540	}
5541
5542	if (flags & VNODE_LOOKUP_NOFOLLOW)
5543	ndflags = NOFOLLOW;
5544	else
5545	ndflags = FOLLOW;
5546
5547	if (flags & VNODE_LOOKUP_NOCROSSMOUNT)
5548	ndflags \|= NOCROSSMOUNT;
5549
5550	if (flags & VNODE_LOOKUP_CROSSMOUNTNOWAIT)
5551	ndflags \|= CN_NBMOUNTLOOK;
5552
5553	/ XXX AUDITVNPATH1 needed ? /
5554	NDINIT(&nd, LOOKUP, OP_LOOKUP, ndflags, UIO_SYSSPACE,
5555	CAST_USER_ADDR_T(path), ctx);
5556
5557	if ((error = namei(&nd)))
5558	return (error);
5559	*vpp = nd.ni_vp;
5560	nameidone(&nd);
5561
5562	return (`0`);
5563	}
5564
5565	errno_t
5566	vnode_open(const char path, int* fmode, int cmode, int flags, vnode_t *vpp, vfs_context_t ctx)
5567	{
5568	struct nameidata nd;
5569	int error;
5570	u_int32_t ndflags = `0`;
5571	int lflags = flags;
5572
5573	if (ctx == NULL) { / XXX technically an error /
5574	ctx = vfs_context_current();
5575	}
5576
5577	if (fmode & O_NOFOLLOW)
5578	lflags \|= VNODE_LOOKUP_NOFOLLOW;
5579
5580	if (lflags & VNODE_LOOKUP_NOFOLLOW)
5581	ndflags = NOFOLLOW;
5582	else
5583	ndflags = FOLLOW;
5584
5585	if (lflags & VNODE_LOOKUP_NOCROSSMOUNT)
5586	ndflags \|= NOCROSSMOUNT;
5587
5588	if (lflags & VNODE_LOOKUP_CROSSMOUNTNOWAIT)
5589	ndflags \|= CN_NBMOUNTLOOK;
5590
5591	/ XXX AUDITVNPATH1 needed ? /
5592	NDINIT(&nd, LOOKUP, OP_OPEN, ndflags, UIO_SYSSPACE,
5593	CAST_USER_ADDR_T(path), ctx);
5594
5595	if ((error = vn_open(&nd, fmode, cmode)))
5596	*vpp = NULL;
5597	else
5598	*vpp = nd.ni_vp;
5599
5600	return (error);
5601	}
5602
5603	errno_t
5604	vnode_close(vnode_t vp, int flags, vfs_context_t ctx)
5605	{
5606	int error;
5607
5608	if (ctx == NULL) {
5609	ctx = vfs_context_current();
5610	}
5611
5612	error = vn_close(vp, flags, ctx);
5613	vnode_put(vp);
5614	return (error);
5615	}
5616
5617	errno_t
5618	vnode_mtime(vnode_t vp, struct timespec *mtime, vfs_context_t ctx)
5619	{
5620	struct vnode_attr va;
5621	int error;
5622
5623	VATTR_INIT(&va);
5624	VATTR_WANTED(&va, va_modify_time);
5625	error = vnode_getattr(vp, &va, ctx);
5626	if (!error)
5627	*mtime = va.va_modify_time;
5628	return error;
5629	}
5630
5631	errno_t
5632	vnode_flags(vnode_t vp, uint32_t *flags, vfs_context_t ctx)
5633	{
5634	struct vnode_attr va;
5635	int error;
5636
5637	VATTR_INIT(&va);
5638	VATTR_WANTED(&va, va_flags);
5639	error = vnode_getattr(vp, &va, ctx);
5640	if (!error)
5641	*flags = va.va_flags;
5642	return error;
5643	}
5644
5645	/*
5646	* Returns: 0 Success
5647	* vnode_getattr:???
5648	*/
5649	errno_t
5650	vnode_size(vnode_t vp, off_t *sizep, vfs_context_t ctx)
5651	{
5652	struct vnode_attr va;
5653	int error;
5654
5655	VATTR_INIT(&va);
5656	VATTR_WANTED(&va, va_data_size);
5657	error = vnode_getattr(vp, &va, ctx);
5658	if (!error)
5659	*sizep = va.va_data_size;
5660	return(error);
5661	}
5662
5663	errno_t
5664	vnode_setsize(vnode_t vp, off_t size, int ioflag, vfs_context_t ctx)
5665	{
5666	struct vnode_attr va;
5667
5668	VATTR_INIT(&va);
5669	VATTR_SET(&va, va_data_size, size);
5670	va.va_vaflags = ioflag & `0xffff`;
5671	return(vnode_setattr(vp, &va, ctx));
5672	}
5673
5674	int
5675	vnode_setdirty(vnode_t vp)
5676	{
5677	vnode_lock_spin(vp);
5678	vp->v_flag \|= VISDIRTY;
5679	vnode_unlock(vp);
5680	return `0`;
5681	}
5682
5683	int
5684	vnode_cleardirty(vnode_t vp)
5685	{
5686	vnode_lock_spin(vp);
5687	vp->v_flag &= ~VISDIRTY;
5688	vnode_unlock(vp);
5689	return `0`;
5690	}
5691
5692	int
5693	vnode_isdirty(vnode_t vp)
5694	{
5695	int dirty;
5696
5697	vnode_lock_spin(vp);
5698	dirty = (vp->v_flag & VISDIRTY) ? `1` : `0`;
5699	vnode_unlock(vp);
5700
5701	return dirty;
5702	}
5703
5704	static int
5705	vn_create_reg(vnode_t dvp, vnode_t vpp, struct* nameidata ndp, struct* vnode_attr vap, uint32_t flags, int* fmode, uint32_t *statusp, vfs_context_t ctx)
5706	{
5707	/ Only use compound VNOP for compound operation /
5708	if (vnode_compound_open_available(dvp) && ((flags & VN_CREATE_DOOPEN) != `0`)) {
5709	*vpp = NULLVP;
5710	return VNOP_COMPOUND_OPEN(dvp, vpp, ndp, O_CREAT, fmode, statusp, vap, ctx);
5711	} else {
5712	return VNOP_CREATE(dvp, vpp, &ndp->ni_cnd, vap, ctx);
5713	}
5714	}
5715
5716	/*
5717	* Create a filesystem object of arbitrary type with arbitrary attributes in
5718	* the spevied directory with the specified name.
5719	*
5720	* Parameters: dvp Pointer to the vnode of the directory
5721	* in which to create the object.
5722	* vpp Pointer to the area into which to
5723	* return the vnode of the created object.
5724	* cnp Component name pointer from the namei
5725	* data structure, containing the name to
5726	* use for the create object.
5727	* vap Pointer to the vnode_attr structure
5728	* describing the object to be created,
5729	* including the type of object.
5730	* flags VN_* flags controlling ACL inheritance
5731	* and whether or not authorization is to
5732	* be required for the operation.
5733	*
5734	* Returns: 0 Success
5735	* !0 errno value
5736	*
5737	* Implicit: *vpp Contains the vnode of the object that
5738	* was created, if successful.
5739	* *cnp May be modified by the underlying VFS.
5740	* *vap May be modified by the underlying VFS.
5741	* modified by either ACL inheritance or
5742	*
5743	*
5744	* be modified, even if the operation is
5745	*
5746	*
5747	* Notes: The kauth_filesec_t in 'vap', if any, is in host byte order.
5748	*
5749	* Modification of 'cnp' and 'vap' by the underlying VFS is
5750	* strongly discouraged.
5751	*
5752	* XXX: This function is a 'vn_*' function; it belongs in vfs_vnops.c
5753	*
5754	* XXX: We should enummerate the possible errno values here, and where
5755	* in the code they originated.
5756	*/
5757	errno_t
5758	vn_create(vnode_t dvp, vnode_t vpp, struct* nameidata ndp, struct* vnode_attr vap, uint32_t flags, int* fmode, uint32_t *statusp, vfs_context_t ctx)
5759	{
5760	errno_t error, old_error;
5761	vnode_t vp = (vnode_t)`0`;
5762	boolean_t batched;
5763	struct componentname *cnp;
5764	uint32_t defaulted;
5765
5766	cnp = &ndp->ni_cnd;
5767	error = `0`;
5768	batched = namei_compound_available(dvp, ndp) ? TRUE : FALSE;
5769
5770	KAUTH_DEBUG("%p CREATE - '%s'", dvp, cnp->cn_nameptr);
5771
5772	if (flags & VN_CREATE_NOINHERIT)
5773	vap->va_vaflags \|= VA_NOINHERIT;
5774	if (flags & VN_CREATE_NOAUTH)
5775	vap->va_vaflags \|= VA_NOAUTH;
5776	/*
5777	* Handle ACL inheritance, initialize vap.
5778	*/
5779	error = vn_attribute_prepare(dvp, vap, &defaulted, ctx);
5780	if (error) {
5781	return error;
5782	}
5783
5784	if (vap->va_type != VREG && (fmode != `0` \|\| (flags & VN_CREATE_DOOPEN) \|\| statusp)) {
5785	panic("Open parameters, but not a regular file.");
5786	}
5787	if ((fmode != `0`) && ((flags & VN_CREATE_DOOPEN) == `0`)) {
5788	panic("Mode for open, but not trying to open...");
5789	}
5790
5791
5792	/*
5793	* Create the requested node.
5794	*/
5795	switch(vap->va_type) {
5796	case VREG:
5797	error = vn_create_reg(dvp, vpp, ndp, vap, flags, fmode, statusp, ctx);
5798	break;
5799	case VDIR:
5800	error = vn_mkdir(dvp, vpp, ndp, vap, ctx);
5801	break;
5802	case VSOCK:
5803	case VFIFO:
5804	case VBLK:
5805	case VCHR:
5806	error = VNOP_MKNOD(dvp, vpp, cnp, vap, ctx);
5807	break;
5808	default:
5809	panic("vnode_create: unknown vtype %d", vap->va_type);
5810	}
5811	if (error != `0`) {
5812	KAUTH_DEBUG("%p CREATE - error %d returned by filesystem", dvp, error);
5813	goto out;
5814	}
5815
5816	vp = *vpp;
5817	old_error = error;
5818
5819	#if CONFIG_MACF
5820	if (!(flags & VN_CREATE_NOLABEL)) {
5821	error = vnode_label(vnode_mount(vp), dvp, vp, cnp, VNODE_LABEL_CREATE, ctx);
5822	if (error)
5823	goto error;
5824	}
5825	#endif
5826
5827	/*
5828	* If some of the requested attributes weren't handled by the VNOP,
5829	* use our fallback code.
5830	*/
5831	if (!VATTR_ALL_SUPPORTED(vap) && *vpp) {
5832	KAUTH_DEBUG(" CREATE - doing fallback with ACL %p", vap->va_acl);
5833	error = vnode_setattr_fallback(*vpp, vap, ctx);
5834	}
5835	#if CONFIG_MACF
5836	error:
5837	#endif
5838	if ((error != `0`) && (vp != (vnode_t)`0`)) {
5839
5840	/ If we've done a compound open, close /
5841	if (batched && (old_error == `0`) && (vap->va_type == VREG)) {
5842	VNOP_CLOSE(vp, fmode, ctx);
5843	}
5844
5845	/ Need to provide notifications if a create succeeded /
5846	if (!batched) {
5847	*vpp = (vnode_t) `0`;
5848	vnode_put(vp);
5849	vp = NULLVP;
5850	}
5851	}
5852
5853	/*
5854	* For creation VNOPs, this is the equivalent of
5855	* lookup_handle_found_vnode.
5856	*/
5857	if (kdebug_enable && *vpp)
5858	kdebug_lookup(*vpp, cnp);
5859
5860	out:
5861	vn_attribute_cleanup(vap, defaulted);
5862
5863	return(error);
5864	}
5865
5866	static kauth_scope_t vnode_scope;
5867	static int vnode_authorize_callback(kauth_cred_t credential, void *idata, kauth_action_t action,
5868	uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3);
5869	static int vnode_authorize_callback_int(kauth_action_t action, vfs_context_t ctx,
5870	vnode_t vp, vnode_t dvp, int *errorp);
5871
5872	typedef struct _vnode_authorize_context {
5873	vnode_t vp;
5874	struct vnode_attr *vap;
5875	vnode_t dvp;
5876	struct vnode_attr *dvap;
5877	vfs_context_t ctx;
5878	int flags;
5879	int flags_valid;
5880	#define _VAC_IS_OWNER (1<<0)
5881	#define _VAC_IN_GROUP (1<<1)
5882	#define _VAC_IS_DIR_OWNER (1<<2)
5883	#define _VAC_IN_DIR_GROUP (1<<3)
5884	#define _VAC_NO_VNODE_POINTERS (1<<4)
5885	} *vauth_ctx;
5886
5887	void
5888	vnode_authorize_init(void)
5889	{
5890	vnode_scope = kauth_register_scope(KAUTH_SCOPE_VNODE, vnode_authorize_callback, NULL);
5891	}
5892
/*
 * Bits describing which vnode_attr fields were defaulted during attribute
 * preparation; vn_attribute_cleanup() clears the active bit of each
 * field flagged here.
 */
#define VATTR_PREPARE_DEFAULTED_UID	0x1	/* va_uid */
#define VATTR_PREPARE_DEFAULTED_GID	0x2	/* va_gid */
#define VATTR_PREPARE_DEFAULTED_MODE	0x4	/* va_mode */
5896
5897	int
5898	vn_attribute_prepare(vnode_t dvp, struct vnode_attr vap, uint32_t defaulted_fieldsp, vfs_context_t ctx)
5899	{
5900	kauth_acl_t nacl = NULL, oacl = NULL;
5901	int error;
5902
5903	/*
5904	* Handle ACL inheritance.
5905	*/
5906	if (!(vap->va_vaflags & VA_NOINHERIT) && vfs_extendedsecurity(dvp->v_mount)) {
5907	/ save the original filesec /
5908	if (VATTR_IS_ACTIVE(vap, va_acl)) {
5909	oacl = vap->va_acl;
5910	}
5911
5912	vap->va_acl = NULL;
5913	if ((error = kauth_acl_inherit(dvp,
5914	oacl,
5915	&nacl,
5916	vap->va_type == VDIR,
5917	ctx)) != `0`) {
5918	KAUTH_DEBUG("%p CREATE - error %d processing inheritance", dvp, error);
5919	return(error);
5920	}
5921
5922	/*
5923	* If the generated ACL is NULL, then we can save ourselves some effort
5924	* by clearing the active bit.
5925	*/
5926	if (nacl == NULL) {
5927	VATTR_CLEAR_ACTIVE(vap, va_acl);
5928	} else {
5929	vap->va_base_acl = oacl;
5930	VATTR_SET(vap, va_acl, nacl);
5931	}
5932	}
5933
5934	error = vnode_authattr_new_internal(dvp, vap, (vap->va_vaflags & VA_NOAUTH), defaulted_fieldsp, ctx);
5935	if (error) {
5936	vn_attribute_cleanup(vap, *defaulted_fieldsp);
5937	}
5938
5939	return error;
5940	}
5941
5942	void
5943	vn_attribute_cleanup(struct vnode_attr *vap, uint32_t defaulted_fields)
5944	{
5945	/*
5946	* If the caller supplied a filesec in vap, it has been replaced
5947	* now by the post-inheritance copy. We need to put the original back
5948	* and free the inherited product.
5949	*/
5950	kauth_acl_t nacl, oacl;
5951
5952	if (VATTR_IS_ACTIVE(vap, va_acl)) {
5953	nacl = vap->va_acl;
5954	oacl = vap->va_base_acl;
5955
5956	if (oacl) {
5957	VATTR_SET(vap, va_acl, oacl);
5958	vap->va_base_acl = NULL;
5959	} else {
5960	VATTR_CLEAR_ACTIVE(vap, va_acl);
5961	}
5962
5963	if (nacl != NULL) {
5964	kauth_acl_free(nacl);
5965	}
5966	}
5967
5968	if ((defaulted_fields & VATTR_PREPARE_DEFAULTED_MODE) != `0`) {
5969	VATTR_CLEAR_ACTIVE(vap, va_mode);
5970	}
5971	if ((defaulted_fields & VATTR_PREPARE_DEFAULTED_GID) != `0`) {
5972	VATTR_CLEAR_ACTIVE(vap, va_gid);
5973	}
5974	if ((defaulted_fields & VATTR_PREPARE_DEFAULTED_UID) != `0`) {
5975	VATTR_CLEAR_ACTIVE(vap, va_uid);
5976	}
5977
5978	return;
5979	}
5980
5981	int
5982	vn_authorize_unlink(vnode_t dvp, vnode_t vp, struct componentname cnp, vfs_context_t ctx, __unused void* *reserved)
5983	{
5984	#if !CONFIG_MACF
5985	#pragma unused(cnp)
5986	#endif
5987	int error = `0`;
5988
5989	/*
5990	* Normally, unlinking of directories is not supported.
5991	* However, some file systems may have limited support.
5992	*/
5993	if ((vp->v_type == VDIR) &&
5994	!(vp->v_mount->mnt_kern_flag & MNTK_DIR_HARDLINKS)) {
5995	return (EPERM); / POSIX /
5996	}
5997
5998	/ authorize the delete operation /
5999	#if CONFIG_MACF
6000	if (!error)
6001	error = mac_vnode_check_unlink(ctx, dvp, vp, cnp);
6002	#endif /* MAC */
6003	if (!error)
6004	error = vnode_authorize(vp, dvp, KAUTH_VNODE_DELETE, ctx);
6005
6006	return error;
6007	}
6008
6009	int
6010	vn_authorize_open_existing(vnode_t vp, struct componentname cnp, int* fmode, vfs_context_t ctx, void *reserved)
6011	{
6012	/ Open of existing case /
6013	kauth_action_t action;
6014	int error = `0`;
6015	if (cnp->cn_ndp == NULL) {
6016	panic("NULL ndp");
6017	}
6018	if (reserved != NULL) {
6019	panic("reserved not NULL.");
6020	}
6021
6022	#if CONFIG_MACF
6023	/ XXX may do duplicate work here, but ignore that for now (idempotent) /
6024	if (vfs_flags(vnode_mount(vp)) & MNT_MULTILABEL) {
6025	error = vnode_label(vnode_mount(vp), NULL, vp, NULL, `0`, ctx);
6026	if (error)
6027	return (error);
6028	}
6029	#endif
6030
6031	if ( (fmode & O_DIRECTORY) && vp->v_type != VDIR ) {
6032	return (ENOTDIR);
6033	}
6034
6035	if (vp->v_type == VSOCK && vp->v_tag != VT_FDESC) {
6036	return (EOPNOTSUPP); / Operation not supported on socket /
6037	}
6038
6039	if (vp->v_type == VLNK && (fmode & O_NOFOLLOW) != `0`) {
6040	return (ELOOP); / O_NOFOLLOW was specified and the target is a symbolic link /
6041	}
6042
6043	/ disallow write operations on directories /
6044	if (vnode_isdir(vp) && (fmode & (FWRITE \| O_TRUNC))) {
6045	return (EISDIR);
6046	}
6047
6048	if ((cnp->cn_ndp->ni_flag & NAMEI_TRAILINGSLASH)) {
6049	if (vp->v_type != VDIR) {
6050	return (ENOTDIR);
6051	}
6052	}
6053
6054	#if CONFIG_MACF
6055	/ If a file being opened is a shadow file containing*
6056	* namedstream data, ignore the macf checks because it
6057	* is a kernel internal file and access should always
6058	* be allowed.
6059	*/
6060	if (!(vnode_isshadow(vp) && vnode_isnamedstream(vp))) {
6061	error = mac_vnode_check_open(ctx, vp, fmode);
6062	if (error) {
6063	return (error);
6064	}
6065	}
6066	#endif
6067
6068	/ compute action to be authorized /
6069	action = `0`;
6070	if (fmode & FREAD) {
6071	action \|= KAUTH_VNODE_READ_DATA;
6072	}
6073	if (fmode & (FWRITE \| O_TRUNC)) {
6074	/*
6075	* If we are writing, appending, and not truncating,
6076	* indicate that we are appending so that if the
6077	* UF_APPEND or SF_APPEND bits are set, we do not deny
6078	* the open.
6079	*/
6080	if ((fmode & O_APPEND) && !(fmode & O_TRUNC)) {
6081	action \|= KAUTH_VNODE_APPEND_DATA;
6082	} else {
6083	action \|= KAUTH_VNODE_WRITE_DATA;
6084	}
6085	}
6086	error = vnode_authorize(vp, NULL, action, ctx);
6087	#if NAMEDSTREAMS
6088	if (error == EACCES) {
6089	/*
6090	* Shadow files may exist on-disk with a different UID/GID
6091	* than that of the current context. Verify that this file
6092	* is really a shadow file. If it was created successfully
6093	* then it should be authorized.
6094	*/
6095	if (vnode_isshadow(vp) && vnode_isnamedstream (vp)) {
6096	error = vnode_verifynamedstream(vp);
6097	}
6098	}
6099	#endif
6100
6101	return error;
6102	}
6103
6104	int
6105	vn_authorize_create(vnode_t dvp, struct componentname cnp, struct* vnode_attr vap, vfs_context_t ctx, void* *reserved)
6106	{
6107	#if !CONFIG_MACF
6108	#pragma unused(vap)
6109	#endif
6110	/ Creation case /
6111	int error;
6112
6113	if (cnp->cn_ndp == NULL) {
6114	panic("NULL cn_ndp");
6115	}
6116	if (reserved != NULL) {
6117	panic("reserved not NULL.");
6118	}
6119
6120	/ Only validate path for creation if we didn't do a complete lookup /
6121	if (cnp->cn_ndp->ni_flag & NAMEI_UNFINISHED) {
6122	error = lookup_validate_creation_path(cnp->cn_ndp);
6123	if (error)
6124	return (error);
6125	}
6126
6127	#if CONFIG_MACF
6128	error = mac_vnode_check_create(ctx, dvp, cnp, vap);
6129	if (error)
6130	return (error);
6131	#endif /* CONFIG_MACF */
6132
6133	return (vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx));
6134	}
6135
6136	int
6137	vn_authorize_rename(struct vnode fdvp, struct* vnode fvp, struct* componentname *fcnp,
6138	struct vnode tdvp, struct* vnode tvp, struct* componentname *tcnp,
6139	vfs_context_t ctx, void *reserved)
6140	{
6141	return vn_authorize_renamex(fdvp, fvp, fcnp, tdvp, tvp, tcnp, ctx, `0`, reserved);
6142	}
6143
6144	int
6145	vn_authorize_renamex(struct vnode fdvp, struct* vnode fvp, struct* componentname *fcnp,
6146	struct vnode tdvp, struct* vnode tvp, struct* componentname *tcnp,
6147	vfs_context_t ctx, vfs_rename_flags_t flags, void *reserved)
6148	{
6149
6150	return vn_authorize_renamex_with_paths(fdvp, fvp, fcnp, NULL, tdvp, tvp, tcnp, NULL, ctx, flags, reserved);
6151	}
6152
6153	int
6154	vn_authorize_renamex_with_paths(struct vnode fdvp, struct* vnode fvp, struct* componentname fcnp, const* char *from_path,
6155	struct vnode tdvp, struct* vnode tvp, struct* componentname tcnp, const* char *to_path,
6156	vfs_context_t ctx, vfs_rename_flags_t flags, void *reserved)
6157	{
6158	int error = `0`;
6159	int moving = `0`;
6160	bool swap = flags & VFS_RENAME_SWAP;
6161
6162	if (reserved != NULL) {
6163	panic("Passed something other than NULL as reserved field!");
6164	}
6165
6166	/*
6167	* Avoid renaming "." and "..".
6168	*
6169	* XXX No need to check for this in the FS. We should always have the leaves
6170	* in VFS in this case.
6171	*/
6172	if (fvp->v_type == VDIR &&
6173	((fdvp == fvp) \|\|
6174	(fcnp->cn_namelen == `1` && fcnp->cn_nameptr[`0`] == `'.'`) \|\|
6175	((fcnp->cn_flags \| tcnp->cn_flags) & ISDOTDOT)) ) {
6176	error = EINVAL;
6177	goto out;
6178	}
6179
6180	if (tvp == NULLVP && vnode_compound_rename_available(tdvp)) {
6181	error = lookup_validate_creation_path(tcnp->cn_ndp);
6182	if (error)
6183	goto out;
6184	}
6185
6186	/** <MACF> **/
6187	#if CONFIG_MACF
6188	error = mac_vnode_check_rename(ctx, fdvp, fvp, fcnp, tdvp, tvp, tcnp);
6189	if (error)
6190	goto out;
6191	if (swap) {
6192	error = mac_vnode_check_rename(ctx, tdvp, tvp, tcnp, fdvp, fvp, fcnp);
6193	if (error)
6194	goto out;
6195	}
6196	#endif
6197	/** </MACF> **/
6198
6199	/** <MiscChecks> **/
6200	if (tvp != NULL) {
6201	if (!swap) {
6202	if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
6203	error = ENOTDIR;
6204	goto out;
6205	} else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
6206	error = EISDIR;
6207	goto out;
6208	}
6209	}
6210	} else if (swap) {
6211	/*
6212	* Caller should have already checked this and returned
6213	* ENOENT. If we send back ENOENT here, caller will retry
6214	* which isn't what we want so we send back EINVAL here
6215	* instead.
6216	*/
6217	error = EINVAL;
6218	goto out;
6219	}
6220
6221	if (fvp == tdvp) {
6222	error = EINVAL;
6223	goto out;
6224	}
6225
6226	/*
6227	* The following edge case is caught here:
6228	* (to cannot be a descendent of from)
6229	*
6230	* o fdvp
6231	* /
6232	* /
6233	* o fvp
6234	* \
6235	* \
6236	* o tdvp
6237	* /
6238	* /
6239	* o tvp
6240	*/
6241	if (tdvp->v_parent == fvp) {
6242	error = EINVAL;
6243	goto out;
6244	}
6245
6246	if (swap && fdvp->v_parent == tvp) {
6247	error = EINVAL;
6248	goto out;
6249	}
6250	/** </MiscChecks> **/
6251
6252	/** <Kauth> **/
6253
6254	/*
6255	* As part of the Kauth step, we call out to allow 3rd-party
6256	* fileop notification of "about to rename". This is needed
6257	* in the event that 3rd-parties need to know that the DELETE
6258	* authorization is actually part of a rename. It's important
6259	* that we guarantee that the DELETE call-out will always be
6260	* made if the WILL_RENAME call-out is made. Another fileop
6261	* call-out will be performed once the operation is completed.
6262	* We can ignore the result of kauth_authorize_fileop().
6263	*
6264	* N.B. We are passing the vnode and both paths to each
6265	* call; kauth_authorize_fileop() extracts the "from" path
6266	* when posting a KAUTH_FILEOP_WILL_RENAME notification.
6267	* As such, we only post these notifications if all of the
6268	* information we need is provided.
6269	*/
6270
6271	if (swap) {
6272	kauth_action_t f = `0`, t = `0`;
6273
6274	/*
6275	* Directories changing parents need ...ADD_SUBDIR... to
6276	* permit changing ".."
6277	*/
6278	if (fdvp != tdvp) {
6279	if (vnode_isdir(fvp))
6280	f = KAUTH_VNODE_ADD_SUBDIRECTORY;
6281	if (vnode_isdir(tvp))
6282	t = KAUTH_VNODE_ADD_SUBDIRECTORY;
6283	}
6284	if (to_path != NULL)
6285	kauth_authorize_fileop(vfs_context_ucred(ctx),
6286	KAUTH_FILEOP_WILL_RENAME,
6287	(uintptr_t)fvp,
6288	(uintptr_t)to_path);
6289	error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE \| f, ctx);
6290	if (error)
6291	goto out;
6292	if (from_path != NULL)
6293	kauth_authorize_fileop(vfs_context_ucred(ctx),
6294	KAUTH_FILEOP_WILL_RENAME,
6295	(uintptr_t)tvp,
6296	(uintptr_t)from_path);
6297	error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE \| t, ctx);
6298	if (error)
6299	goto out;
6300	f = vnode_isdir(fvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE;
6301	t = vnode_isdir(tvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE;
6302	if (fdvp == tdvp)
6303	error = vnode_authorize(fdvp, NULL, f \| t, ctx);
6304	else {
6305	error = vnode_authorize(fdvp, NULL, t, ctx);
6306	if (error)
6307	goto out;
6308	error = vnode_authorize(tdvp, NULL, f, ctx);
6309	}
6310	if (error)
6311	goto out;
6312	} else {
6313	error = `0`;
6314	if ((tvp != NULL) && vnode_isdir(tvp)) {
6315	if (tvp != fdvp)
6316	moving = `1`;
6317	} else if (tdvp != fdvp) {
6318	moving = `1`;
6319	}
6320
6321	/*
6322	* must have delete rights to remove the old name even in
6323	* the simple case of fdvp == tdvp.
6324	*
6325	* If fvp is a directory, and we are changing it's parent,
6326	* then we also need rights to rewrite its ".." entry as well.
6327	*/
6328	if (to_path != NULL)
6329	kauth_authorize_fileop(vfs_context_ucred(ctx),
6330	KAUTH_FILEOP_WILL_RENAME,
6331	(uintptr_t)fvp,
6332	(uintptr_t)to_path);
6333	if (vnode_isdir(fvp)) {
6334	if ((error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE \| KAUTH_VNODE_ADD_SUBDIRECTORY, ctx)) != `0`)
6335	goto out;
6336	} else {
6337	if ((error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE, ctx)) != `0`)
6338	goto out;
6339	}
6340	if (moving) {
6341	/ moving into tdvp or tvp, must have rights to add /
6342	if ((error = vnode_authorize(((tvp != NULL) && vnode_isdir(tvp)) ? tvp : tdvp,
6343	NULL,
6344	vnode_isdir(fvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE,
6345	ctx)) != `0`) {
6346	goto out;
6347	}
6348	} else {
6349	/ node staying in same directory, must be allowed to add new name /
6350	if ((error = vnode_authorize(fdvp, NULL,
6351	vnode_isdir(fvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE, ctx)) != `0`)
6352	goto out;
6353	}
6354	/ overwriting tvp /
6355	if ((tvp != NULL) && !vnode_isdir(tvp) &&
6356	((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != `0`)) {
6357	goto out;
6358	}
6359	}
6360
6361	/** </Kauth> **/
6362
6363	/ XXX more checks? /
6364	out:
6365	return error;
6366	}
6367
6368	int
6369	vn_authorize_mkdir(vnode_t dvp, struct componentname cnp, struct* vnode_attr vap, vfs_context_t ctx, void* *reserved)
6370	{
6371	#if !CONFIG_MACF
6372	#pragma unused(vap)
6373	#endif
6374	int error;
6375
6376	if (reserved != NULL) {
6377	panic("reserved not NULL in vn_authorize_mkdir()");
6378	}
6379
6380	/ XXX A hack for now, to make shadow files work /
6381	if (cnp->cn_ndp == NULL) {
6382	return `0`;
6383	}
6384
6385	if (vnode_compound_mkdir_available(dvp)) {
6386	error = lookup_validate_creation_path(cnp->cn_ndp);
6387	if (error)
6388	goto out;
6389	}
6390
6391	#if CONFIG_MACF
6392	error = mac_vnode_check_create(ctx,
6393	dvp, cnp, vap);
6394	if (error)
6395	goto out;
6396	#endif
6397
6398	/ authorize addition of a directory to the parent /
6399	if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_SUBDIRECTORY, ctx)) != `0`)
6400	goto out;
6401
6402	out:
6403	return error;
6404	}
6405
6406	int
6407	vn_authorize_rmdir(vnode_t dvp, vnode_t vp, struct componentname cnp, vfs_context_t ctx, void* *reserved)
6408	{
6409	#if CONFIG_MACF
6410	int error;
6411	#else
6412	#pragma unused(cnp)
6413	#endif
6414	if (reserved != NULL) {
6415	panic("Non-NULL reserved argument to vn_authorize_rmdir()");
6416	}
6417
6418	if (vp->v_type != VDIR) {
6419	/*
6420	* rmdir only deals with directories
6421	*/
6422	return ENOTDIR;
6423	}
6424
6425	if (dvp == vp) {
6426	/*
6427	* No rmdir "." please.
6428	*/
6429	return EINVAL;
6430	}
6431
6432	#if CONFIG_MACF
6433	error = mac_vnode_check_unlink(ctx, dvp,
6434	vp, cnp);
6435	if (error)
6436	return error;
6437	#endif
6438
6439	return vnode_authorize(vp, dvp, KAUTH_VNODE_DELETE, ctx);
6440	}
6441
6442	/*
6443	* Authorizer for directory cloning. This does not use vnodes but instead
6444	* uses prefilled vnode attributes from the filesystem.
6445	*
6446	* The same function is called to set up the attributes required, perform the
6447	* authorization and cleanup (if required)
6448	*/
6449	int
6450	vnode_attr_authorize_dir_clone(struct vnode_attr *vap, kauth_action_t action,
6451	struct vnode_attr *dvap, __unused vnode_t sdvp, mount_t mp,
6452	dir_clone_authorizer_op_t vattr_op, uint32_t flags, vfs_context_t ctx,
6453	__unused void *reserved)
6454	{
6455	int error;
6456	int is_suser = vfs_context_issuser(ctx);
6457
6458	if (vattr_op == OP_VATTR_SETUP) {
6459	VATTR_INIT(vap);
6460
6461	/*
6462	* When ACL inheritence is implemented, both vap->va_acl and
6463	* dvap->va_acl will be required (even as superuser).
6464	*/
6465	VATTR_WANTED(vap, va_type);
6466	VATTR_WANTED(vap, va_mode);
6467	VATTR_WANTED(vap, va_flags);
6468	VATTR_WANTED(vap, va_uid);
6469	VATTR_WANTED(vap, va_gid);
6470	if (dvap) {
6471	VATTR_INIT(dvap);
6472	VATTR_WANTED(dvap, va_flags);
6473	}
6474
6475	if (!is_suser) {
6476	/*
6477	* If not superuser, we have to evaluate ACLs and
6478	* need the target directory gid to set the initial
6479	* gid of the new object.
6480	*/
6481	VATTR_WANTED(vap, va_acl);
6482	if (dvap)
6483	VATTR_WANTED(dvap, va_gid);
6484	} else if (dvap && (flags & VNODE_CLONEFILE_NOOWNERCOPY)) {
6485	VATTR_WANTED(dvap, va_gid);
6486	}
6487	return (`0`);
6488	} else if (vattr_op == OP_VATTR_CLEANUP) {
6489	return (`0`); / Nothing to do for now /
6490	}
6491
6492	/ dvap isn't used for authorization /
6493	error = vnode_attr_authorize(vap, NULL, mp, action, ctx);
6494
6495	if (error)
6496	return (error);
6497
6498	/*
6499	* vn_attribute_prepare should be able to accept attributes as well as
6500	* vnodes but for now we do this inline.
6501	*/
6502	if (!is_suser \|\| (flags & VNODE_CLONEFILE_NOOWNERCOPY)) {
6503	/*
6504	* If the filesystem is mounted IGNORE_OWNERSHIP and an explicit
6505	* owner is set, that owner takes ownership of all new files.
6506	*/
6507	if ((mp->mnt_flag & MNT_IGNORE_OWNERSHIP) &&
6508	(mp->mnt_fsowner != KAUTH_UID_NONE)) {
6509	VATTR_SET(vap, va_uid, mp->mnt_fsowner);
6510	} else {
6511	/ default owner is current user /
6512	VATTR_SET(vap, va_uid,
6513	kauth_cred_getuid(vfs_context_ucred(ctx)));
6514	}
6515
6516	if ((mp->mnt_flag & MNT_IGNORE_OWNERSHIP) &&
6517	(mp->mnt_fsgroup != KAUTH_GID_NONE)) {
6518	VATTR_SET(vap, va_gid, mp->mnt_fsgroup);
6519	} else {
6520	/*
6521	* default group comes from parent object,
6522	* fallback to current user
6523	*/
6524	if (VATTR_IS_SUPPORTED(dvap, va_gid)) {
6525	VATTR_SET(vap, va_gid, dvap->va_gid);
6526	} else {
6527	VATTR_SET(vap, va_gid,
6528	kauth_cred_getgid(vfs_context_ucred(ctx)));
6529	}
6530	}
6531	}
6532
6533	/ Inherit SF_RESTRICTED bit from destination directory only /
6534	if (VATTR_IS_ACTIVE(vap, va_flags)) {
6535	VATTR_SET(vap, va_flags,
6536	((vap->va_flags & ~(UF_DATAVAULT \| SF_RESTRICTED)))); / Turn off from source /
6537	if (VATTR_IS_ACTIVE(dvap, va_flags))
6538	VATTR_SET(vap, va_flags,
6539	vap->va_flags \| (dvap->va_flags & (UF_DATAVAULT \| SF_RESTRICTED)));
6540	} else if (VATTR_IS_ACTIVE(dvap, va_flags)) {
6541	VATTR_SET(vap, va_flags, (dvap->va_flags & (UF_DATAVAULT \| SF_RESTRICTED)));
6542	}
6543
6544	return (`0`);
6545	}
6546
6547
6548	/*
6549	* Authorize an operation on a vnode.
6550	*
6551	* This is KPI, but here because it needs vnode_scope.
6552	*
6553	* Returns: 0 Success
6554	* kauth_authorize_action:EPERM ...
6555	* xlate => EACCES Permission denied
6556	* kauth_authorize_action:0 Success
6557	* kauth_authorize_action: Depends on callback return; this is
6558	* usually only vnode_authorize_callback(),
6559	* but may include other listerners, if any
6560	* exist.
6561	* EROFS
6562	* EACCES
6563	* EPERM
6564	* ???
6565	*/
6566	int
6567	vnode_authorize(vnode_t vp, vnode_t dvp, kauth_action_t action, vfs_context_t ctx)
6568	{
6569	int error, result;
6570
6571	/*
6572	* We can't authorize against a dead vnode; allow all operations through so that
6573	* the correct error can be returned.
6574	*/
6575	if (vp->v_type == VBAD)
6576	return(`0`);
6577
6578	error = `0`;
6579	result = kauth_authorize_action(vnode_scope, vfs_context_ucred(ctx), action,
6580	(uintptr_t)ctx, (uintptr_t)vp, (uintptr_t)dvp, (uintptr_t)&error);
6581	if (result == EPERM) / traditional behaviour /
6582	result = EACCES;
6583	/ did the lower layers give a better error return? /
6584	if ((result != `0`) && (error != `0`))
6585	return(error);
6586	return(result);
6587	}
6588
6589	/*
6590	* Test for vnode immutability.
6591	*
6592	* The 'append' flag is set when the authorization request is constrained
6593	* to operations which only request the right to append to a file.
6594	*
6595	* The 'ignore' flag is set when an operation modifying the immutability flags
6596	* is being authorized. We check the system securelevel to determine which
6597	* immutability flags we can ignore.
6598	*/
6599	static int
6600	vnode_immutable(struct vnode_attr vap, int* append, int ignore)
6601	{
6602	int mask;
6603
6604	/ start with all bits precluding the operation /
6605	mask = IMMUTABLE \| APPEND;
6606
6607	/ if appending only, remove the append-only bits /
6608	if (append)
6609	mask &= ~APPEND;
6610
6611	/ ignore only set when authorizing flags changes /
6612	if (ignore) {
6613	if (securelevel <= `0`) {
6614	/ in insecure state, flags do not inhibit changes /
6615	mask = `0`;
6616	} else {
6617	/ in secure state, user flags don't inhibit /
6618	mask &= ~(UF_IMMUTABLE \| UF_APPEND);
6619	}
6620	}
6621	KAUTH_DEBUG("IMMUTABLE - file flags 0x%x mask 0x%x append = %d ignore = %d", vap->va_flags, mask, append, ignore);
6622	if ((vap->va_flags & mask) != `0`)
6623	return(EPERM);
6624	return(`0`);
6625	}
6626
6627	static int
6628	vauth_node_owner(struct vnode_attr *vap, kauth_cred_t cred)
6629	{
6630	int result;
6631
6632	/ default assumption is not-owner /
6633	result = `0`;
6634
6635	/*
6636	* If the filesystem has given us a UID, we treat this as authoritative.
6637	*/
6638	if (vap && VATTR_IS_SUPPORTED(vap, va_uid)) {
6639	result = (vap->va_uid == kauth_cred_getuid(cred)) ? `1` : `0`;
6640	}
6641	/ we could test the owner UUID here if we had a policy for it /
6642
6643	return(result);
6644	}
6645
6646	/*
6647	* vauth_node_group
6648	*
6649	* Description: Ask if a cred is a member of the group owning the vnode object
6650	*
6651	* Parameters: vap vnode attribute
6652	* vap->va_gid group owner of vnode object
6653	* cred credential to check
6654	* ismember pointer to where to put the answer
6655	* idontknow Return this if we can't get an answer
6656	*
6657	* Returns: 0 Success
6658	* idontknow Can't get information
6659	* kauth_cred_ismember_gid:? Error from kauth subsystem
6660	* kauth_cred_ismember_gid:? Error from kauth subsystem
6661	*/
6662	static int
6663	vauth_node_group(struct vnode_attr vap, kauth_cred_t cred, int* ismember, int* idontknow)
6664	{
6665	int error;
6666	int result;
6667
6668	error = `0`;
6669	result = `0`;
6670
6671	/*
6672	* The caller is expected to have asked the filesystem for a group
6673	* at some point prior to calling this function. The answer may
6674	* have been that there is no group ownership supported for the
6675	* vnode object, in which case we return
6676	*/
6677	if (vap && VATTR_IS_SUPPORTED(vap, va_gid)) {
6678	error = kauth_cred_ismember_gid(cred, vap->va_gid, &result);
6679	/*
6680	* Credentials which are opted into external group membership
6681	* resolution which are not known to the external resolver
6682	* will result in an ENOENT error. We translate this into
6683	* the appropriate 'idontknow' response for our caller.
6684	*
6685	* XXX We do not make a distinction here between an ENOENT
6686	* XXX arising from a response from the external resolver,
6687	* XXX and an ENOENT which is internally generated. This is
6688	* XXX a deficiency of the published kauth_cred_ismember_gid()
6689	* XXX KPI which can not be overcome without new KPI. For
6690	* XXX all currently known cases, however, this wil result
6691	* XXX in correct behaviour.
6692	*/
6693	if (error == ENOENT)
6694	error = idontknow;
6695	}
6696	/*
6697	* XXX We could test the group UUID here if we had a policy for it,
6698	* XXX but this is problematic from the perspective of synchronizing
6699	* XXX group UUID and POSIX GID ownership of a file and keeping the
6700	* XXX values coherent over time. The problem is that the local
6701	* XXX system will vend transient group UUIDs for unknown POSIX GID
6702	* XXX values, and these are not persistent, whereas storage of values
6703	* XXX is persistent. One potential solution to this is a local
6704	* XXX (persistent) replica of remote directory entries and vended
6705	* XXX local ids in a local directory server (think in terms of a
6706	* XXX caching DNS server).
6707	*/
6708
6709	if (!error)
6710	*ismember = result;
6711	return(error);
6712	}
6713
6714	static int
6715	vauth_file_owner(vauth_ctx vcp)
6716	{
6717	int result;
6718
6719	if (vcp->flags_valid & _VAC_IS_OWNER) {
6720	result = (vcp->flags & _VAC_IS_OWNER) ? `1` : `0`;
6721	} else {
6722	result = vauth_node_owner(vcp->vap, vcp->ctx->vc_ucred);
6723
6724	/ cache our result /
6725	vcp->flags_valid \|= _VAC_IS_OWNER;
6726	if (result) {
6727	vcp->flags \|= _VAC_IS_OWNER;
6728	} else {
6729	vcp->flags &= ~_VAC_IS_OWNER;
6730	}
6731	}
6732	return(result);
6733	}
6734
6735
6736	/*
6737	* vauth_file_ingroup
6738	*
6739	* Description: Ask if a user is a member of the group owning the directory
6740	*
6741	* Parameters: vcp The vnode authorization context that
6742	* contains the user and directory info
6743	* vcp->flags_valid Valid flags
6744	* vcp->flags Flags values
6745	* vcp->vap File vnode attributes
6746	* vcp->ctx VFS Context (for user)
6747	* ismember pointer to where to put the answer
6748	* idontknow Return this if we can't get an answer
6749	*
6750	* Returns: 0 Success
6751	* vauth_node_group:? Error from vauth_node_group()
6752	*
6753	* Implicit returns: *ismember 0 The user is not a group member
6754	* 1 The user is a group member
6755	*/
6756	static int
6757	vauth_file_ingroup(vauth_ctx vcp, int ismember, int* idontknow)
6758	{
6759	int error;
6760
6761	/ Check for a cached answer first, to avoid the check if possible /
6762	if (vcp->flags_valid & _VAC_IN_GROUP) {
6763	*ismember = (vcp->flags & _VAC_IN_GROUP) ? `1` : `0`;
6764	error = `0`;
6765	} else {
6766	/ Otherwise, go look for it /
6767	error = vauth_node_group(vcp->vap, vcp->ctx->vc_ucred, ismember, idontknow);
6768
6769	if (!error) {
6770	/ cache our result /
6771	vcp->flags_valid \|= _VAC_IN_GROUP;
6772	if (*ismember) {
6773	vcp->flags \|= _VAC_IN_GROUP;
6774	} else {
6775	vcp->flags &= ~_VAC_IN_GROUP;
6776	}
6777	}
6778
6779	}
6780	return(error);
6781	}
6782
6783	static int
6784	vauth_dir_owner(vauth_ctx vcp)
6785	{
6786	int result;
6787
6788	if (vcp->flags_valid & _VAC_IS_DIR_OWNER) {
6789	result = (vcp->flags & _VAC_IS_DIR_OWNER) ? `1` : `0`;
6790	} else {
6791	result = vauth_node_owner(vcp->dvap, vcp->ctx->vc_ucred);
6792
6793	/ cache our result /
6794	vcp->flags_valid \|= _VAC_IS_DIR_OWNER;
6795	if (result) {
6796	vcp->flags \|= _VAC_IS_DIR_OWNER;
6797	} else {
6798	vcp->flags &= ~_VAC_IS_DIR_OWNER;
6799	}
6800	}
6801	return(result);
6802	}
6803
6804	/*
6805	* vauth_dir_ingroup
6806	*
6807	* Description: Ask if a user is a member of the group owning the directory
6808	*
6809	* Parameters: vcp The vnode authorization context that
6810	* contains the user and directory info
6811	* vcp->flags_valid Valid flags
6812	* vcp->flags Flags values
6813	* vcp->dvap Dir vnode attributes
6814	* vcp->ctx VFS Context (for user)
6815	* ismember pointer to where to put the answer
6816	* idontknow Return this if we can't get an answer
6817	*
6818	* Returns: 0 Success
6819	* vauth_node_group:? Error from vauth_node_group()
6820	*
6821	* Implicit returns: *ismember 0 The user is not a group member
6822	* 1 The user is a group member
6823	*/
6824	static int
6825	vauth_dir_ingroup(vauth_ctx vcp, int ismember, int* idontknow)
6826	{
6827	int error;
6828
6829	/ Check for a cached answer first, to avoid the check if possible /
6830	if (vcp->flags_valid & _VAC_IN_DIR_GROUP) {
6831	*ismember = (vcp->flags & _VAC_IN_DIR_GROUP) ? `1` : `0`;
6832	error = `0`;
6833	} else {
6834	/ Otherwise, go look for it /
6835	error = vauth_node_group(vcp->dvap, vcp->ctx->vc_ucred, ismember, idontknow);
6836
6837	if (!error) {
6838	/ cache our result /
6839	vcp->flags_valid \|= _VAC_IN_DIR_GROUP;
6840	if (*ismember) {
6841	vcp->flags \|= _VAC_IN_DIR_GROUP;
6842	} else {
6843	vcp->flags &= ~_VAC_IN_DIR_GROUP;
6844	}
6845	}
6846	}
6847	return(error);
6848	}
6849
6850	/*
6851	* Test the posix permissions in (vap) to determine whether (credential)
6852	* may perform (action)
6853	*/
6854	static int
6855	vnode_authorize_posix(vauth_ctx vcp, int action, int on_dir)
6856	{
6857	struct vnode_attr *vap;
6858	int needed, error, owner_ok, group_ok, world_ok, ismember;
6859	#ifdef KAUTH_DEBUG_ENABLE
6860	const char *where = "uninitialized";
6861	# define _SETWHERE(c) where = c;
6862	#else
6863	# define _SETWHERE(c)
6864	#endif
6865
6866	/ checking file or directory? /
6867	if (on_dir) {
6868	vap = vcp->dvap;
6869	} else {
6870	vap = vcp->vap;
6871	}
6872
6873	error = `0`;
6874
6875	/*
6876	* We want to do as little work here as possible. So first we check
6877	* which sets of permissions grant us the access we need, and avoid checking
6878	* whether specific permissions grant access when more generic ones would.
6879	*/
6880
6881	/ owner permissions /
6882	needed = `0`;
6883	if (action & VREAD)
6884	needed \|= S_IRUSR;
6885	if (action & VWRITE)
6886	needed \|= S_IWUSR;
6887	if (action & VEXEC)
6888	needed \|= S_IXUSR;
6889	owner_ok = (needed & vap->va_mode) == needed;
6890
6891	/ group permissions /
6892	needed = `0`;
6893	if (action & VREAD)
6894	needed \|= S_IRGRP;
6895	if (action & VWRITE)
6896	needed \|= S_IWGRP;
6897	if (action & VEXEC)
6898	needed \|= S_IXGRP;
6899	group_ok = (needed & vap->va_mode) == needed;
6900
6901	/ world permissions /
6902	needed = `0`;
6903	if (action & VREAD)
6904	needed \|= S_IROTH;
6905	if (action & VWRITE)
6906	needed \|= S_IWOTH;
6907	if (action & VEXEC)
6908	needed \|= S_IXOTH;
6909	world_ok = (needed & vap->va_mode) == needed;
6910
6911	/ If granted/denied by all three, we're done /
6912	if (owner_ok && group_ok && world_ok) {
6913	_SETWHERE("all");
6914	goto out;
6915	}
6916	if (!owner_ok && !group_ok && !world_ok) {
6917	_SETWHERE("all");
6918	error = EACCES;
6919	goto out;
6920	}
6921
6922	/ Check ownership (relatively cheap) /
6923	if ((on_dir && vauth_dir_owner(vcp)) \|\|
6924	(!on_dir && vauth_file_owner(vcp))) {
6925	_SETWHERE("user");
6926	if (!owner_ok)
6927	error = EACCES;
6928	goto out;
6929	}
6930
6931	/ Not owner; if group and world both grant it we're done /
6932	if (group_ok && world_ok) {
6933	_SETWHERE("group/world");
6934	goto out;
6935	}
6936	if (!group_ok && !world_ok) {
6937	_SETWHERE("group/world");
6938	error = EACCES;
6939	goto out;
6940	}
6941
6942	/ Check group membership (most expensive) /
6943	ismember = `0`; / Default to allow, if the target has no group owner /
6944
6945	/*
6946	* In the case we can't get an answer about the user from the call to
6947	* vauth_dir_ingroup() or vauth_file_ingroup(), we want to fail on
6948	* the side of caution, rather than simply granting access, or we will
6949	* fail to correctly implement exclusion groups, so we set the third
6950	* parameter on the basis of the state of 'group_ok'.
6951	*/
6952	if (on_dir) {
6953	error = vauth_dir_ingroup(vcp, &ismember, (!group_ok ? EACCES : `0`));
6954	} else {
6955	error = vauth_file_ingroup(vcp, &ismember, (!group_ok ? EACCES : `0`));
6956	}
6957	if (error) {
6958	if (!group_ok)
6959	ismember = `1`;
6960	error = `0`;
6961	}
6962	if (ismember) {
6963	_SETWHERE("group");
6964	if (!group_ok)
6965	error = EACCES;
6966	goto out;
6967	}
6968
6969	/ Not owner, not in group, use world result /
6970	_SETWHERE("world");
6971	if (!world_ok)
6972	error = EACCES;
6973
6974	/ FALLTHROUGH /
6975
6976	out:
6977	KAUTH_DEBUG("%p %s - posix %s permissions : need %s%s%s %x have %s%s%s%s%s%s%s%s%s UID = %d file = %d,%d",
6978	vcp->vp, (error == `0`) ? "ALLOWED" : "DENIED", where,
6979	(action & VREAD) ? "r" : "-",
6980	(action & VWRITE) ? "w" : "-",
6981	(action & VEXEC) ? "x" : "-",
6982	needed,
6983	(vap->va_mode & S_IRUSR) ? "r" : "-",
6984	(vap->va_mode & S_IWUSR) ? "w" : "-",
6985	(vap->va_mode & S_IXUSR) ? "x" : "-",
6986	(vap->va_mode & S_IRGRP) ? "r" : "-",
6987	(vap->va_mode & S_IWGRP) ? "w" : "-",
6988	(vap->va_mode & S_IXGRP) ? "x" : "-",
6989	(vap->va_mode & S_IROTH) ? "r" : "-",
6990	(vap->va_mode & S_IWOTH) ? "w" : "-",
6991	(vap->va_mode & S_IXOTH) ? "x" : "-",
6992	kauth_cred_getuid(vcp->ctx->vc_ucred),
6993	on_dir ? vcp->dvap->va_uid : vcp->vap->va_uid,
6994	on_dir ? vcp->dvap->va_gid : vcp->vap->va_gid);
6995	return(error);
6996	}
6997
6998	/*
6999	* Authorize the deletion of the node vp from the directory dvp.
7000	*
7001	* We assume that:
7002	* - Neither the node nor the directory are immutable.
7003	* - The user is not the superuser.
7004	*
7005	* The precedence of factors for authorizing or denying delete for a credential
7006	*
7007	* 1) Explicit ACE on the node. (allow or deny DELETE)
7008	* 2) Explicit ACE on the directory (allow or deny DELETE_CHILD).
7009	*
7010	* If there are conflicting ACEs on the node and the directory, the node
7011	* ACE wins.
7012	*
7013	* 3) Sticky bit on the directory.
7014	* Deletion is not permitted if the directory is sticky and the caller is
7015	* not owner of the node or directory. The sticky bit rules are like a deny
7016	* delete ACE except lower in priority than ACL's either allowing or denying
7017	* delete.
7018	*
7019	* 4) POSIX permisions on the directory.
7020	*
7021	* As an optimization, we cache whether or not delete child is permitted
7022	* on directories. This enables us to skip directory ACL and POSIX checks
7023	* as we already have the result from those checks. However, we always check the
7024	* node ACL and, if the directory has the sticky bit set, we always check its
7025	* ACL (even for a directory with an authorized delete child). Furthermore,
7026	* caching the delete child authorization is independent of the sticky bit
7027	* being set as it is only applicable in determining whether the node can be
7028	* deleted or not.
7029	*/
7030	static int
7031	vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child)
7032	{
7033	struct vnode_attr *vap = vcp->vap;
7034	struct vnode_attr *dvap = vcp->dvap;
7035	kauth_cred_t cred = vcp->ctx->vc_ucred;
7036	struct kauth_acl_eval eval;
7037	int error, ismember;
7038
7039	/ Check the ACL on the node first /
7040	if (VATTR_IS_NOT(vap, va_acl, NULL)) {
7041	eval.ae_requested = KAUTH_VNODE_DELETE;
7042	eval.ae_acl = &vap->va_acl->acl_ace[`0`];
7043	eval.ae_count = vap->va_acl->acl_entrycount;
7044	eval.ae_options = `0`;
7045	if (vauth_file_owner(vcp))
7046	eval.ae_options \|= KAUTH_AEVAL_IS_OWNER;
7047	/*
7048	* We use ENOENT as a marker to indicate we could not get
7049	* information in order to delay evaluation until after we
7050	* have the ACL evaluation answer. Previously, we would
7051	* always deny the operation at this point.
7052	*/
7053	if ((error = vauth_file_ingroup(vcp, &ismember, ENOENT)) != `0` && error != ENOENT)
7054	return (error);
7055	if (error == ENOENT)
7056	eval.ae_options \|= KAUTH_AEVAL_IN_GROUP_UNKNOWN;
7057	else if (ismember)
7058	eval.ae_options \|= KAUTH_AEVAL_IN_GROUP;
7059	eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS;
7060	eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS;
7061	eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS;
7062	eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS;
7063
7064	if ((error = kauth_acl_evaluate(cred, &eval)) != `0`) {
7065	KAUTH_DEBUG("%p ERROR during ACL processing - %d", vcp->vp, error);
7066	return (error);
7067	}
7068
7069	switch(eval.ae_result) {
7070	case KAUTH_RESULT_DENY:
7071	KAUTH_DEBUG("%p DENIED - denied by ACL", vcp->vp);
7072	return (EACCES);
7073	case KAUTH_RESULT_ALLOW:
7074	KAUTH_DEBUG("%p ALLOWED - granted by ACL", vcp->vp);
7075	return (`0`);
7076	case KAUTH_RESULT_DEFER:
7077	default:
7078	/ Defer to directory /
7079	KAUTH_DEBUG("%p DEFERRED - by file ACL", vcp->vp);
7080	break;
7081	}
7082	}
7083
7084	/*
7085	* Without a sticky bit, a previously authorized delete child is
7086	* sufficient to authorize this delete.
7087	*
7088	* If the sticky bit is set, a directory ACL which allows delete child
7089	* overrides a (potential) sticky bit deny. The authorized delete child
7090	* cannot tell us if it was authorized because of an explicit delete
7091	* child allow ACE or because of POSIX permisions so we have to check
7092	* the directory ACL everytime if the directory has a sticky bit.
7093	*/
7094	if (!(dvap->va_mode & S_ISTXT) && cached_delete_child) {
7095	KAUTH_DEBUG("%p ALLOWED - granted by directory ACL or POSIX permissions and no sticky bit on directory", vcp->vp);
7096	return (`0`);
7097	}
7098
7099	/ check the ACL on the directory /
7100	if (VATTR_IS_NOT(dvap, va_acl, NULL)) {
7101	eval.ae_requested = KAUTH_VNODE_DELETE_CHILD;
7102	eval.ae_acl = &dvap->va_acl->acl_ace[`0`];
7103	eval.ae_count = dvap->va_acl->acl_entrycount;
7104	eval.ae_options = `0`;
7105	if (vauth_dir_owner(vcp))
7106	eval.ae_options \|= KAUTH_AEVAL_IS_OWNER;
7107	/*
7108	* We use ENOENT as a marker to indicate we could not get
7109	* information in order to delay evaluation until after we
7110	* have the ACL evaluation answer. Previously, we would
7111	* always deny the operation at this point.
7112	*/
7113	if ((error = vauth_dir_ingroup(vcp, &ismember, ENOENT)) != `0` && error != ENOENT)
7114	return(error);
7115	if (error == ENOENT)
7116	eval.ae_options \|= KAUTH_AEVAL_IN_GROUP_UNKNOWN;
7117	else if (ismember)
7118	eval.ae_options \|= KAUTH_AEVAL_IN_GROUP;
7119	eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS;
7120	eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS;
7121	eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS;
7122	eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS;
7123
7124	/*
7125	* If there is no entry, we are going to defer to other
7126	* authorization mechanisms.
7127	*/
7128	error = kauth_acl_evaluate(cred, &eval);
7129
7130	if (error != `0`) {
7131	KAUTH_DEBUG("%p ERROR during ACL processing - %d", vcp->vp, error);
7132	return (error);
7133	}
7134	switch(eval.ae_result) {
7135	case KAUTH_RESULT_DENY:
7136	KAUTH_DEBUG("%p DENIED - denied by directory ACL", vcp->vp);
7137	return (EACCES);
7138	case KAUTH_RESULT_ALLOW:
7139	KAUTH_DEBUG("%p ALLOWED - granted by directory ACL", vcp->vp);
7140	if (!cached_delete_child && vcp->dvp) {
7141	vnode_cache_authorized_action(vcp->dvp,
7142	vcp->ctx, KAUTH_VNODE_DELETE_CHILD);
7143	}
7144	return (`0`);
7145	case KAUTH_RESULT_DEFER:
7146	default:
7147	/ Deferred by directory ACL /
7148	KAUTH_DEBUG("%p DEFERRED - directory ACL", vcp->vp);
7149	break;
7150	}
7151	}
7152
7153	/*
7154	* From this point, we can't explicitly allow and if we reach the end
7155	* of the function without a denial, then the delete is authorized.
7156	*/
7157	if (!cached_delete_child) {
7158	if (vnode_authorize_posix(vcp, VWRITE, `1` / on_dir /) != `0`) {
7159	KAUTH_DEBUG("%p DENIED - denied by posix permisssions", vcp->vp);
7160	return (EACCES);
7161	}
7162	/*
7163	* Cache the authorized action on the vnode if allowed by the
7164	* directory ACL or POSIX permissions. It is correct to cache
7165	* this action even if sticky bit would deny deleting the node.
7166	*/
7167	if (vcp->dvp) {
7168	vnode_cache_authorized_action(vcp->dvp, vcp->ctx,
7169	KAUTH_VNODE_DELETE_CHILD);
7170	}
7171	}
7172
7173	/ enforce sticky bit behaviour /
7174	if ((dvap->va_mode & S_ISTXT) && !vauth_file_owner(vcp) && !vauth_dir_owner(vcp)) {
7175	KAUTH_DEBUG("%p DENIED - sticky bit rules (user %d file %d dir %d)",
7176	vcp->vp, cred->cr_posix.cr_uid, vap->va_uid, dvap->va_uid);
7177	return (EACCES);
7178	}
7179
7180	/ not denied, must be OK /
7181	return (`0`);
7182	}
7183
7184
7185	/*
7186	* Authorize an operation based on the node's attributes.
7187	*/
7188	static int
7189	vnode_authorize_simple(vauth_ctx vcp, kauth_ace_rights_t acl_rights, kauth_ace_rights_t preauth_rights, boolean_t *found_deny)
7190	{
7191	struct vnode_attr *vap = vcp->vap;
7192	kauth_cred_t cred = vcp->ctx->vc_ucred;
7193	struct kauth_acl_eval eval;
7194	int error, ismember;
7195	mode_t posix_action;
7196
7197	/*
7198	* If we are the file owner, we automatically have some rights.
7199	*
7200	* Do we need to expand this to support group ownership?
7201	*/
7202	if (vauth_file_owner(vcp))
7203	acl_rights &= ~(KAUTH_VNODE_WRITE_SECURITY);
7204
7205	/*
7206	* If we are checking both TAKE_OWNERSHIP and WRITE_SECURITY, we can
7207	* mask the latter. If TAKE_OWNERSHIP is requested the caller is about to
7208	* change ownership to themselves, and WRITE_SECURITY is implicitly
7209	* granted to the owner. We need to do this because at this point
7210	* WRITE_SECURITY may not be granted as the caller is not currently
7211	* the owner.
7212	*/
7213	if ((acl_rights & KAUTH_VNODE_TAKE_OWNERSHIP) &&
7214	(acl_rights & KAUTH_VNODE_WRITE_SECURITY))
7215	acl_rights &= ~KAUTH_VNODE_WRITE_SECURITY;
7216
7217	if (acl_rights == `0`) {
7218	KAUTH_DEBUG("%p ALLOWED - implicit or no rights required", vcp->vp);
7219	return(`0`);
7220	}
7221
7222	/ if we have an ACL, evaluate it /
7223	if (VATTR_IS_NOT(vap, va_acl, NULL)) {
7224	eval.ae_requested = acl_rights;
7225	eval.ae_acl = &vap->va_acl->acl_ace[`0`];
7226	eval.ae_count = vap->va_acl->acl_entrycount;
7227	eval.ae_options = `0`;
7228	if (vauth_file_owner(vcp))
7229	eval.ae_options \|= KAUTH_AEVAL_IS_OWNER;
7230	/*
7231	* We use ENOENT as a marker to indicate we could not get
7232	* information in order to delay evaluation until after we
7233	* have the ACL evaluation answer. Previously, we would
7234	* always deny the operation at this point.
7235	*/
7236	if ((error = vauth_file_ingroup(vcp, &ismember, ENOENT)) != `0` && error != ENOENT)
7237	return(error);
7238	if (error == ENOENT)
7239	eval.ae_options \|= KAUTH_AEVAL_IN_GROUP_UNKNOWN;
7240	else if (ismember)
7241	eval.ae_options \|= KAUTH_AEVAL_IN_GROUP;
7242	eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS;
7243	eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS;
7244	eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS;
7245	eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS;
7246
7247	if ((error = kauth_acl_evaluate(cred, &eval)) != `0`) {
7248	KAUTH_DEBUG("%p ERROR during ACL processing - %d", vcp->vp, error);
7249	return(error);
7250	}
7251
7252	switch(eval.ae_result) {
7253	case KAUTH_RESULT_DENY:
7254	KAUTH_DEBUG("%p DENIED - by ACL", vcp->vp);
7255	return(EACCES); / deny, deny, counter-allege /
7256	case KAUTH_RESULT_ALLOW:
7257	KAUTH_DEBUG("%p ALLOWED - all rights granted by ACL", vcp->vp);
7258	return(`0`);
7259	case KAUTH_RESULT_DEFER:
7260	default:
7261	/ Effectively the same as !delete_child_denied /
7262	KAUTH_DEBUG("%p DEFERRED - directory ACL", vcp->vp);
7263	break;
7264	}
7265
7266	*found_deny = eval.ae_found_deny;
7267
7268	/ fall through and evaluate residual rights /
7269	} else {
7270	/ no ACL, everything is residual /
7271	eval.ae_residual = acl_rights;
7272	}
7273
7274	/*
7275	* Grant residual rights that have been pre-authorized.
7276	*/
7277	eval.ae_residual &= ~preauth_rights;
7278
7279	/*
7280	* We grant WRITE_ATTRIBUTES to the owner if it hasn't been denied.
7281	*/
7282	if (vauth_file_owner(vcp))
7283	eval.ae_residual &= ~KAUTH_VNODE_WRITE_ATTRIBUTES;
7284
7285	if (eval.ae_residual == `0`) {
7286	KAUTH_DEBUG("%p ALLOWED - rights already authorized", vcp->vp);
7287	return(`0`);
7288	}
7289
7290	/*
7291	* Bail if we have residual rights that can't be granted by posix permissions,
7292	* or aren't presumed granted at this point.
7293	*
7294	* XXX these can be collapsed for performance
7295	*/
7296	if (eval.ae_residual & KAUTH_VNODE_CHANGE_OWNER) {
7297	KAUTH_DEBUG("%p DENIED - CHANGE_OWNER not permitted", vcp->vp);
7298	return(EACCES);
7299	}
7300	if (eval.ae_residual & KAUTH_VNODE_WRITE_SECURITY) {
7301	KAUTH_DEBUG("%p DENIED - WRITE_SECURITY not permitted", vcp->vp);
7302	return(EACCES);
7303	}
7304
7305	#if DIAGNOSTIC
7306	if (eval.ae_residual & KAUTH_VNODE_DELETE)
7307	panic("vnode_authorize: can't be checking delete permission here");
7308	#endif
7309
7310	/*
7311	* Compute the fallback posix permissions that will satisfy the remaining
7312	* rights.
7313	*/
7314	posix_action = `0`;
7315	if (eval.ae_residual & (KAUTH_VNODE_READ_DATA \|
7316	KAUTH_VNODE_LIST_DIRECTORY \|
7317	KAUTH_VNODE_READ_EXTATTRIBUTES))
7318	posix_action \|= VREAD;
7319	if (eval.ae_residual & (KAUTH_VNODE_WRITE_DATA \|
7320	KAUTH_VNODE_ADD_FILE \|
7321	KAUTH_VNODE_ADD_SUBDIRECTORY \|
7322	KAUTH_VNODE_DELETE_CHILD \|
7323	KAUTH_VNODE_WRITE_ATTRIBUTES \|
7324	KAUTH_VNODE_WRITE_EXTATTRIBUTES))
7325	posix_action \|= VWRITE;
7326	if (eval.ae_residual & (KAUTH_VNODE_EXECUTE \|
7327	KAUTH_VNODE_SEARCH))
7328	posix_action \|= VEXEC;
7329
7330	if (posix_action != `0`) {
7331	return(vnode_authorize_posix(vcp, posix_action, `0` / !on_dir /));
7332	} else {
7333	KAUTH_DEBUG("%p ALLOWED - residual rights %s%s%s%s%s%s%s%s%s%s%s%s%s%s granted due to no posix mapping",
7334	vcp->vp,
7335	(eval.ae_residual & KAUTH_VNODE_READ_DATA)
7336	? vnode_isdir(vcp->vp) ? " LIST_DIRECTORY" : " READ_DATA" : "",
7337	(eval.ae_residual & KAUTH_VNODE_WRITE_DATA)
7338	? vnode_isdir(vcp->vp) ? " ADD_FILE" : " WRITE_DATA" : "",
7339	(eval.ae_residual & KAUTH_VNODE_EXECUTE)
7340	? vnode_isdir(vcp->vp) ? " SEARCH" : " EXECUTE" : "",
7341	(eval.ae_residual & KAUTH_VNODE_DELETE)
7342	? " DELETE" : "",
7343	(eval.ae_residual & KAUTH_VNODE_APPEND_DATA)
7344	? vnode_isdir(vcp->vp) ? " ADD_SUBDIRECTORY" : " APPEND_DATA" : "",
7345	(eval.ae_residual & KAUTH_VNODE_DELETE_CHILD)
7346	? " DELETE_CHILD" : "",
7347	(eval.ae_residual & KAUTH_VNODE_READ_ATTRIBUTES)
7348	? " READ_ATTRIBUTES" : "",
7349	(eval.ae_residual & KAUTH_VNODE_WRITE_ATTRIBUTES)
7350	? " WRITE_ATTRIBUTES" : "",
7351	(eval.ae_residual & KAUTH_VNODE_READ_EXTATTRIBUTES)
7352	? " READ_EXTATTRIBUTES" : "",
7353	(eval.ae_residual & KAUTH_VNODE_WRITE_EXTATTRIBUTES)
7354	? " WRITE_EXTATTRIBUTES" : "",
7355	(eval.ae_residual & KAUTH_VNODE_READ_SECURITY)
7356	? " READ_SECURITY" : "",
7357	(eval.ae_residual & KAUTH_VNODE_WRITE_SECURITY)
7358	? " WRITE_SECURITY" : "",
7359	(eval.ae_residual & KAUTH_VNODE_CHECKIMMUTABLE)
7360	? " CHECKIMMUTABLE" : "",
7361	(eval.ae_residual & KAUTH_VNODE_CHANGE_OWNER)
7362	? " CHANGE_OWNER" : "");
7363	}
7364
7365	/*
7366	* Lack of required Posix permissions implies no reason to deny access.
7367	*/
7368	return(`0`);
7369	}
7370
7371	/*
7372	* Check for file immutability.
7373	*/
7374	static int
7375	vnode_authorize_checkimmutable(mount_t mp, struct vnode_attr vap, int* rights, int ignore)
7376	{
7377	int error;
7378	int append;
7379
7380	/*
7381	* Perform immutability checks for operations that change data.
7382	*
7383	* Sockets, fifos and devices require special handling.
7384	*/
7385	switch(vap->va_type) {
7386	case VSOCK:
7387	case VFIFO:
7388	case VBLK:
7389	case VCHR:
7390	/*
7391	* Writing to these nodes does not change the filesystem data,
7392	* so forget that it's being tried.
7393	*/
7394	rights &= ~KAUTH_VNODE_WRITE_DATA;
7395	break;
7396	default:
7397	break;
7398	}
7399
7400	error = `0`;
7401	if (rights & KAUTH_VNODE_WRITE_RIGHTS) {
7402
7403	/ check per-filesystem options if possible /
7404	if (mp != NULL) {
7405
7406	/ check for no-EA filesystems /
7407	if ((rights & KAUTH_VNODE_WRITE_EXTATTRIBUTES) &&
7408	(vfs_flags(mp) & MNT_NOUSERXATTR)) {
7409	KAUTH_DEBUG("%p DENIED - filesystem disallowed extended attributes", vp);
7410	error = EACCES; / User attributes disabled /
7411	goto out;
7412	}
7413	}
7414
7415	/*
7416	* check for file immutability. first, check if the requested rights are
7417	* allowable for a UF_APPEND file.
7418	*/
7419	append = `0`;
7420	if (vap->va_type == VDIR) {
7421	if ((rights & (KAUTH_VNODE_ADD_FILE \| KAUTH_VNODE_ADD_SUBDIRECTORY \| KAUTH_VNODE_WRITE_EXTATTRIBUTES)) == rights)
7422	append = `1`;
7423	} else {
7424	if ((rights & (KAUTH_VNODE_APPEND_DATA \| KAUTH_VNODE_WRITE_EXTATTRIBUTES)) == rights)
7425	append = `1`;
7426	}
7427	if ((error = vnode_immutable(vap, append, ignore)) != `0`) {
7428	KAUTH_DEBUG("%p DENIED - file is immutable", vp);
7429	goto out;
7430	}
7431	}
7432	out:
7433	return(error);
7434	}
7435
7436	/*
7437	* Handle authorization actions for filesystems that advertise that the
7438	* server will be enforcing.
7439	*
7440	* Returns: 0 Authorization should be handled locally
7441	* 1 Authorization was handled by the FS
7442	*
7443	* Note: Imputed returns will only occur if the authorization request
7444	* was handled by the FS.
7445	*
7446	* Imputed: *resultp, modified Return code from FS when the request is
7447	* handled by the FS.
7448	* VNOP_ACCESS:???
7449	* VNOP_OPEN:???
7450	*/
7451	static int
7452	vnode_authorize_opaque(vnode_t vp, int *resultp, kauth_action_t action, vfs_context_t ctx)
7453	{
7454	int error;
7455
7456	/*
7457	* If the vp is a device node, socket or FIFO it actually represents a local
7458	* endpoint, so we need to handle it locally.
7459	*/
7460	switch(vp->v_type) {
7461	case VBLK:
7462	case VCHR:
7463	case VSOCK:
7464	case VFIFO:
7465	return(`0`);
7466	default:
7467	break;
7468	}
7469
7470	/*
7471	* In the advisory request case, if the filesystem doesn't think it's reliable
7472	* we will attempt to formulate a result ourselves based on VNOP_GETATTR data.
7473	*/
7474	if ((action & KAUTH_VNODE_ACCESS) && !vfs_authopaqueaccess(vp->v_mount))
7475	return(`0`);
7476
7477	/*
7478	* Let the filesystem have a say in the matter. It's OK for it to not implemnent
7479	* VNOP_ACCESS, as most will authorise inline with the actual request.
7480	*/
7481	if ((error = VNOP_ACCESS(vp, action, ctx)) != ENOTSUP) {
7482	*resultp = error;
7483	KAUTH_DEBUG("%p DENIED - opaque filesystem VNOP_ACCESS denied access", vp);
7484	return(`1`);
7485	}
7486
7487	/*
7488	* Typically opaque filesystems do authorisation in-line, but exec is a special case. In
7489	* order to be reasonably sure that exec will be permitted, we try a bit harder here.
7490	*/
7491	if ((action & KAUTH_VNODE_EXECUTE) && (vp->v_type == VREG)) {
7492	/ try a VNOP_OPEN for readonly access /
7493	if ((error = VNOP_OPEN(vp, FREAD, ctx)) != `0`) {
7494	*resultp = error;
7495	KAUTH_DEBUG("%p DENIED - EXECUTE denied because file could not be opened readonly", vp);
7496	return(`1`);
7497	}
7498	VNOP_CLOSE(vp, FREAD, ctx);
7499	}
7500
7501	/*
7502	* We don't have any reason to believe that the request has to be denied at this point,
7503	* so go ahead and allow it.
7504	*/
7505	*resultp = `0`;
7506	KAUTH_DEBUG("%p ALLOWED - bypassing access check for non-local filesystem", vp);
7507	return(`1`);
7508	}
7509
7510
7511
7512
7513	/*
7514	* Returns: KAUTH_RESULT_ALLOW
7515	* KAUTH_RESULT_DENY
7516	*
7517	* Imputed: *arg3, modified Error code in the deny case
7518	* EROFS Read-only file system
7519	* EACCES Permission denied
7520	* EPERM Operation not permitted [no execute]
7521	* vnode_getattr:ENOMEM Not enough space [only if has filesec]
7522	* vnode_getattr:???
7523	* vnode_authorize_opaque:*arg2 ???
7524	* vnode_authorize_checkimmutable:???
7525	* vnode_authorize_delete:???
7526	* vnode_authorize_simple:???
7527	*/
7528
7529
7530	static int
7531	vnode_authorize_callback(__unused kauth_cred_t cred, __unused void *idata,
7532	kauth_action_t action, uintptr_t arg0, uintptr_t arg1, uintptr_t arg2,
7533	uintptr_t arg3)
7534	{
7535	vfs_context_t ctx;
7536	vnode_t cvp = NULLVP;
7537	vnode_t vp, dvp;
7538	int result = KAUTH_RESULT_DENY;
7539	int parent_iocount = `0`;
7540	int parent_action; / In case we need to use namedstream's data fork for cached rights/
7541
7542	ctx = (vfs_context_t)arg0;
7543	vp = (vnode_t)arg1;
7544	dvp = (vnode_t)arg2;
7545
7546	/*
7547	* if there are 2 vnodes passed in, we don't know at
7548	* this point which rights to look at based on the
7549	* combined action being passed in... defer until later...
7550	* otherwise check the kauth 'rights' cache hung
7551	* off of the vnode we're interested in... if we've already
7552	* been granted the right we're currently interested in,
7553	* we can just return success... otherwise we'll go through
7554	* the process of authorizing the requested right(s)... if that
7555	* succeeds, we'll add the right(s) to the cache.
7556	* VNOP_SETATTR and VNOP_SETXATTR will invalidate this cache
7557	*/
7558	if (dvp && vp)
7559	goto defer;
7560	if (dvp) {
7561	cvp = dvp;
7562	} else {
7563	/*
7564	* For named streams on local-authorization volumes, rights are cached on the parent;
7565	* authorization is determined by looking at the parent's properties anyway, so storing
7566	* on the parent means that we don't recompute for the named stream and that if
7567	* we need to flush rights (e.g. on VNOP_SETATTR()) we don't need to track down the
7568	* stream to flush its cache separately. If we miss in the cache, then we authorize
7569	* as if there were no cached rights (passing the named stream vnode and desired rights to
7570	* vnode_authorize_callback_int()).
7571	*
7572	* On an opaquely authorized volume, we don't know the relationship between the
7573	* data fork's properties and the rights granted on a stream. Thus, named stream vnodes
7574	* on such a volume are authorized directly (rather than using the parent) and have their
7575	* own caches. When a named stream vnode is created, we mark the parent as having a named
7576	* stream. On a VNOP_SETATTR() for the parent that may invalidate cached authorization, we
7577	* find the stream and flush its cache.
7578	*/
7579	if (vnode_isnamedstream(vp) && (!vfs_authopaque(vp->v_mount))) {
7580	cvp = vnode_getparent(vp);
7581	if (cvp != NULLVP) {
7582	parent_iocount = `1`;
7583	} else {
7584	cvp = NULL;
7585	goto defer; / If we can't use the parent, take the slow path /
7586	}
7587
7588	/ Have to translate some actions /
7589	parent_action = action;
7590	if (parent_action & KAUTH_VNODE_READ_DATA) {
7591	parent_action &= ~KAUTH_VNODE_READ_DATA;
7592	parent_action \|= KAUTH_VNODE_READ_EXTATTRIBUTES;
7593	}
7594	if (parent_action & KAUTH_VNODE_WRITE_DATA) {
7595	parent_action &= ~KAUTH_VNODE_WRITE_DATA;
7596	parent_action \|= KAUTH_VNODE_WRITE_EXTATTRIBUTES;
7597	}
7598
7599	} else {
7600	cvp = vp;
7601	}
7602	}
7603
7604	if (vnode_cache_is_authorized(cvp, ctx, parent_iocount ? parent_action : action) == TRUE) {
7605	result = KAUTH_RESULT_ALLOW;
7606	goto out;
7607	}
7608	defer:
7609	result = vnode_authorize_callback_int(action, ctx, vp, dvp, (int *)arg3);
7610
7611	if (result == KAUTH_RESULT_ALLOW && cvp != NULLVP) {
7612	KAUTH_DEBUG("%p - caching action = %x", cvp, action);
7613	vnode_cache_authorized_action(cvp, ctx, action);
7614	}
7615
7616	out:
7617	if (parent_iocount) {
7618	vnode_put(cvp);
7619	}
7620
7621	return result;
7622	}
7623
7624	static int
7625	vnode_attr_authorize_internal(vauth_ctx vcp, mount_t mp,
7626	kauth_ace_rights_t rights, int is_suser, boolean_t *found_deny,
7627	int noimmutable, int parent_authorized_for_delete_child)
7628	{
7629	int result;
7630
7631	/*
7632	* Check for immutability.
7633	*
7634	* In the deletion case, parent directory immutability vetoes specific
7635	* file rights.
7636	*/
7637	if ((result = vnode_authorize_checkimmutable(mp, vcp->vap, rights,
7638	noimmutable)) != `0`)
7639	goto out;
7640
7641	if ((rights & KAUTH_VNODE_DELETE) &&
7642	!parent_authorized_for_delete_child) {
7643	result = vnode_authorize_checkimmutable(mp, vcp->dvap,
7644	KAUTH_VNODE_DELETE_CHILD, `0`);
7645	if (result)
7646	goto out;
7647	}
7648
7649	/*
7650	* Clear rights that have been authorized by reaching this point, bail if nothing left to
7651	* check.
7652	*/
7653	rights &= ~(KAUTH_VNODE_LINKTARGET \| KAUTH_VNODE_CHECKIMMUTABLE);
7654	if (rights == `0`)
7655	goto out;
7656
7657	/*
7658	* If we're not the superuser, authorize based on file properties;
7659	* note that even if parent_authorized_for_delete_child is TRUE, we
7660	* need to check on the node itself.
7661	*/
7662	if (!is_suser) {
7663	/ process delete rights /
7664	if ((rights & KAUTH_VNODE_DELETE) &&
7665	((result = vnode_authorize_delete(vcp, parent_authorized_for_delete_child)) != `0`))
7666	goto out;
7667
7668	/ process remaining rights /
7669	if ((rights & ~KAUTH_VNODE_DELETE) &&
7670	(result = vnode_authorize_simple(vcp, rights, rights & KAUTH_VNODE_DELETE, found_deny)) != `0`)
7671	goto out;
7672	} else {
7673	/*
7674	* Execute is only granted to root if one of the x bits is set. This check only
7675	* makes sense if the posix mode bits are actually supported.
7676	*/
7677	if ((rights & KAUTH_VNODE_EXECUTE) &&
7678	(vcp->vap->va_type == VREG) &&
7679	VATTR_IS_SUPPORTED(vcp->vap, va_mode) &&
7680	!(vcp->vap->va_mode & (S_IXUSR \| S_IXGRP \| S_IXOTH))) {
7681	result = EPERM;
7682	KAUTH_DEBUG("%p DENIED - root execute requires at least one x bit in 0x%x", vp, va.va_mode);
7683	goto out;
7684	}
7685
7686	/ Assume that there were DENYs so we don't wrongly cache KAUTH_VNODE_SEARCHBYANYONE /
7687	*found_deny = TRUE;
7688
7689	KAUTH_DEBUG("%p ALLOWED - caller is superuser", vp);
7690	}
7691	out:
7692	return (result);
7693	}
7694
7695	static int
7696	vnode_authorize_callback_int(kauth_action_t action, vfs_context_t ctx,
7697	vnode_t vp, vnode_t dvp, int *errorp)
7698	{
7699	struct _vnode_authorize_context auth_context;
7700	vauth_ctx vcp;
7701	kauth_cred_t cred;
7702	kauth_ace_rights_t rights;
7703	struct vnode_attr va, dva;
7704	int result;
7705	int noimmutable;
7706	boolean_t parent_authorized_for_delete_child = FALSE;
7707	boolean_t found_deny = FALSE;
7708	boolean_t parent_ref= FALSE;
7709	boolean_t is_suser = FALSE;
7710
7711	vcp = &auth_context;
7712	vcp->ctx = ctx;
7713	vcp->vp = vp;
7714	vcp->dvp = dvp;
7715	/*
7716	* Note that we authorize against the context, not the passed cred
7717	* (the same thing anyway)
7718	*/
7719	cred = ctx->vc_ucred;
7720
7721	VATTR_INIT(&va);
7722	vcp->vap = &va;
7723	VATTR_INIT(&dva);
7724	vcp->dvap = &dva;
7725
7726	vcp->flags = vcp->flags_valid = `0`;
7727
7728	#if DIAGNOSTIC
7729	if ((ctx == NULL) \|\| (vp == NULL) \|\| (cred == NULL))
7730	panic("vnode_authorize: bad arguments (context %p vp %p cred %p)", ctx, vp, cred);
7731	#endif
7732
7733	KAUTH_DEBUG("%p AUTH - %s %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s on %s '%s' (0x%x:%p/%p)",
7734	vp, vfs_context_proc(ctx)->p_comm,
7735	(action & KAUTH_VNODE_ACCESS) ? "access" : "auth",
7736	(action & KAUTH_VNODE_READ_DATA) ? vnode_isdir(vp) ? " LIST_DIRECTORY" : " READ_DATA" : "",
7737	(action & KAUTH_VNODE_WRITE_DATA) ? vnode_isdir(vp) ? " ADD_FILE" : " WRITE_DATA" : "",
7738	(action & KAUTH_VNODE_EXECUTE) ? vnode_isdir(vp) ? " SEARCH" : " EXECUTE" : "",
7739	(action & KAUTH_VNODE_DELETE) ? " DELETE" : "",
7740	(action & KAUTH_VNODE_APPEND_DATA) ? vnode_isdir(vp) ? " ADD_SUBDIRECTORY" : " APPEND_DATA" : "",
7741	(action & KAUTH_VNODE_DELETE_CHILD) ? " DELETE_CHILD" : "",
7742	(action & KAUTH_VNODE_READ_ATTRIBUTES) ? " READ_ATTRIBUTES" : "",
7743	(action & KAUTH_VNODE_WRITE_ATTRIBUTES) ? " WRITE_ATTRIBUTES" : "",
7744	(action & KAUTH_VNODE_READ_EXTATTRIBUTES) ? " READ_EXTATTRIBUTES" : "",
7745	(action & KAUTH_VNODE_WRITE_EXTATTRIBUTES) ? " WRITE_EXTATTRIBUTES" : "",
7746	(action & KAUTH_VNODE_READ_SECURITY) ? " READ_SECURITY" : "",
7747	(action & KAUTH_VNODE_WRITE_SECURITY) ? " WRITE_SECURITY" : "",
7748	(action & KAUTH_VNODE_CHANGE_OWNER) ? " CHANGE_OWNER" : "",
7749	(action & KAUTH_VNODE_NOIMMUTABLE) ? " (noimmutable)" : "",
7750	vnode_isdir(vp) ? "directory" : "file",
7751	vp->v_name ? vp->v_name : "<NULL>", action, vp, dvp);
7752
7753	/*
7754	* Extract the control bits from the action, everything else is
7755	* requested rights.
7756	*/
7757	noimmutable = (action & KAUTH_VNODE_NOIMMUTABLE) ? `1` : `0`;
7758	rights = action & ~(KAUTH_VNODE_ACCESS \| KAUTH_VNODE_NOIMMUTABLE);
7759
7760	if (rights & KAUTH_VNODE_DELETE) {
7761	#if DIAGNOSTIC
7762	if (dvp == NULL)
7763	panic("vnode_authorize: KAUTH_VNODE_DELETE test requires a directory");
7764	#endif
7765	/*
7766	* check to see if we've already authorized the parent
7767	* directory for deletion of its children... if so, we
7768	* can skip a whole bunch of work... we will still have to
7769	* authorize that this specific child can be removed
7770	*/
7771	if (vnode_cache_is_authorized(dvp, ctx, KAUTH_VNODE_DELETE_CHILD) == TRUE)
7772	parent_authorized_for_delete_child = TRUE;
7773	} else {
7774	vcp->dvp = NULLVP;
7775	vcp->dvap = NULL;
7776	}
7777
7778	/*
7779	* Check for read-only filesystems.
7780	*/
7781	if ((rights & KAUTH_VNODE_WRITE_RIGHTS) &&
7782	(vp->v_mount->mnt_flag & MNT_RDONLY) &&
7783	((vp->v_type == VREG) \|\| (vp->v_type == VDIR) \|\|
7784	(vp->v_type == VLNK) \|\| (vp->v_type == VCPLX) \|\|
7785	(rights & KAUTH_VNODE_DELETE) \|\| (rights & KAUTH_VNODE_DELETE_CHILD))) {
7786	result = EROFS;
7787	goto out;
7788	}
7789
7790	/*
7791	* Check for noexec filesystems.
7792	*/
7793	if ((rights & KAUTH_VNODE_EXECUTE) && (vp->v_type == VREG) && (vp->v_mount->mnt_flag & MNT_NOEXEC)) {
7794	result = EACCES;
7795	goto out;
7796	}
7797
7798	/*
7799	* Handle cases related to filesystems with non-local enforcement.
7800	* This call can return 0, in which case we will fall through to perform a
7801	* check based on VNOP_GETATTR data. Otherwise it returns 1 and sets
7802	* an appropriate result, at which point we can return immediately.
7803	*/
7804	if ((vp->v_mount->mnt_kern_flag & MNTK_AUTH_OPAQUE) && vnode_authorize_opaque(vp, &result, action, ctx))
7805	goto out;
7806
7807	/*
7808	* If the vnode is a namedstream (extended attribute) data vnode (eg.
7809	* a resource fork), _DATA becomes _EXTATTRIBUTES.
7810	*/
7811	if (vnode_isnamedstream(vp)) {
7812	if (rights & KAUTH_VNODE_READ_DATA) {
7813	rights &= ~KAUTH_VNODE_READ_DATA;
7814	rights \|= KAUTH_VNODE_READ_EXTATTRIBUTES;
7815	}
7816	if (rights & KAUTH_VNODE_WRITE_DATA) {
7817	rights &= ~KAUTH_VNODE_WRITE_DATA;
7818	rights \|= KAUTH_VNODE_WRITE_EXTATTRIBUTES;
7819	}
7820
7821	/*
7822	* Point 'vp' to the namedstream's parent for ACL checking
7823	*/
7824	if ((vp->v_parent != NULL) &&
7825	(vget_internal(vp->v_parent, `0`, VNODE_NODEAD \| VNODE_DRAINO) == `0`)) {
7826	parent_ref = TRUE;
7827	vcp->vp = vp = vp->v_parent;
7828	}
7829	}
7830
7831	if (vfs_context_issuser(ctx)) {
7832	/*
7833	* if we're not asking for execute permissions or modifications,
7834	* then we're done, this action is authorized.
7835	*/
7836	if (!(rights & (KAUTH_VNODE_EXECUTE \| KAUTH_VNODE_WRITE_RIGHTS)))
7837	goto success;
7838
7839	is_suser = TRUE;
7840	}
7841
7842	/*
7843	* Get vnode attributes and extended security information for the vnode
7844	* and directory if required.
7845	*
7846	* If we're root we only want mode bits and flags for checking
7847	* execute and immutability.
7848	*/
7849	VATTR_WANTED(&va, va_mode);
7850	VATTR_WANTED(&va, va_flags);
7851	if (!is_suser) {
7852	VATTR_WANTED(&va, va_uid);
7853	VATTR_WANTED(&va, va_gid);
7854	VATTR_WANTED(&va, va_acl);
7855	}
7856	if ((result = vnode_getattr(vp, &va, ctx)) != `0`) {
7857	KAUTH_DEBUG("%p ERROR - failed to get vnode attributes - %d", vp, result);
7858	goto out;
7859	}
7860	VATTR_WANTED(&va, va_type);
7861	VATTR_RETURN(&va, va_type, vnode_vtype(vp));
7862
7863	if (vcp->dvp) {
7864	VATTR_WANTED(&dva, va_mode);
7865	VATTR_WANTED(&dva, va_flags);
7866	if (!is_suser) {
7867	VATTR_WANTED(&dva, va_uid);
7868	VATTR_WANTED(&dva, va_gid);
7869	VATTR_WANTED(&dva, va_acl);
7870	}
7871	if ((result = vnode_getattr(vcp->dvp, &dva, ctx)) != `0`) {
7872	KAUTH_DEBUG("%p ERROR - failed to get directory vnode attributes - %d", vp, result);
7873	goto out;
7874	}
7875	VATTR_WANTED(&dva, va_type);
7876	VATTR_RETURN(&dva, va_type, vnode_vtype(vcp->dvp));
7877	}
7878
7879	result = vnode_attr_authorize_internal(vcp, vp->v_mount, rights, is_suser,
7880	&found_deny, noimmutable, parent_authorized_for_delete_child);
7881	out:
7882	if (VATTR_IS_SUPPORTED(&va, va_acl) && (va.va_acl != NULL))
7883	kauth_acl_free(va.va_acl);
7884	if (VATTR_IS_SUPPORTED(&dva, va_acl) && (dva.va_acl != NULL))
7885	kauth_acl_free(dva.va_acl);
7886
7887	if (result) {
7888	if (parent_ref)
7889	vnode_put(vp);
7890	*errorp = result;
7891	KAUTH_DEBUG("%p DENIED - auth denied", vp);
7892	return(KAUTH_RESULT_DENY);
7893	}
7894	if ((rights & KAUTH_VNODE_SEARCH) && found_deny == FALSE && vp->v_type == VDIR) {
7895	/*
7896	* if we were successfully granted the right to search this directory
7897	* and there were NO ACL DENYs for search and the posix permissions also don't
7898	* deny execute, we can synthesize a global right that allows anyone to
7899	* traverse this directory during a pathname lookup without having to
7900	* match the credential associated with this cache of rights.
7901	*
7902	* Note that we can correctly cache KAUTH_VNODE_SEARCHBYANYONE
7903	* only if we actually check ACLs which we don't for root. As
7904	* a workaround, the lookup fast path checks for root.
7905	*/
7906	if (!VATTR_IS_SUPPORTED(&va, va_mode) \|\|
7907	((va.va_mode & (S_IXUSR \| S_IXGRP \| S_IXOTH)) ==
7908	(S_IXUSR \| S_IXGRP \| S_IXOTH))) {
7909	vnode_cache_authorized_action(vp, ctx, KAUTH_VNODE_SEARCHBYANYONE);
7910	}
7911	}
7912	success:
7913	if (parent_ref)
7914	vnode_put(vp);
7915
7916	/*
7917	* Note that this implies that we will allow requests for no rights, as well as
7918	* for rights that we do not recognise. There should be none of these.
7919	*/
7920	KAUTH_DEBUG("%p ALLOWED - auth granted", vp);
7921	return(KAUTH_RESULT_ALLOW);
7922	}
7923
7924	int
7925	vnode_attr_authorize_init(struct vnode_attr vap, struct* vnode_attr *dvap,
7926	kauth_action_t action, vfs_context_t ctx)
7927	{
7928	VATTR_INIT(vap);
7929	VATTR_WANTED(vap, va_type);
7930	VATTR_WANTED(vap, va_mode);
7931	VATTR_WANTED(vap, va_flags);
7932	if (dvap) {
7933	VATTR_INIT(dvap);
7934	if (action & KAUTH_VNODE_DELETE) {
7935	VATTR_WANTED(dvap, va_type);
7936	VATTR_WANTED(dvap, va_mode);
7937	VATTR_WANTED(dvap, va_flags);
7938	}
7939	} else if (action & KAUTH_VNODE_DELETE) {
7940	return (EINVAL);
7941	}
7942
7943	if (!vfs_context_issuser(ctx)) {
7944	VATTR_WANTED(vap, va_uid);
7945	VATTR_WANTED(vap, va_gid);
7946	VATTR_WANTED(vap, va_acl);
7947	if (dvap && (action & KAUTH_VNODE_DELETE)) {
7948	VATTR_WANTED(dvap, va_uid);
7949	VATTR_WANTED(dvap, va_gid);
7950	VATTR_WANTED(dvap, va_acl);
7951	}
7952	}
7953
7954	return (`0`);
7955	}
7956
7957	int
7958	vnode_attr_authorize(struct vnode_attr vap, struct* vnode_attr *dvap, mount_t mp,
7959	kauth_action_t action, vfs_context_t ctx)
7960	{
7961	struct _vnode_authorize_context auth_context;
7962	vauth_ctx vcp;
7963	kauth_ace_rights_t rights;
7964	int noimmutable;
7965	boolean_t found_deny;
7966	boolean_t is_suser = FALSE;
7967	int result = `0`;
7968
7969	vcp = &auth_context;
7970	vcp->ctx = ctx;
7971	vcp->vp = NULLVP;
7972	vcp->vap = vap;
7973	vcp->dvp = NULLVP;
7974	vcp->dvap = dvap;
7975	vcp->flags = vcp->flags_valid = `0`;
7976
7977	noimmutable = (action & KAUTH_VNODE_NOIMMUTABLE) ? `1` : `0`;
7978	rights = action & ~(KAUTH_VNODE_ACCESS \| KAUTH_VNODE_NOIMMUTABLE);
7979
7980	/*
7981	* Check for read-only filesystems.
7982	*/
7983	if ((rights & KAUTH_VNODE_WRITE_RIGHTS) &&
7984	mp && (mp->mnt_flag & MNT_RDONLY) &&
7985	((vap->va_type == VREG) \|\| (vap->va_type == VDIR) \|\|
7986	(vap->va_type == VLNK) \|\| (rights & KAUTH_VNODE_DELETE) \|\|
7987	(rights & KAUTH_VNODE_DELETE_CHILD))) {
7988	result = EROFS;
7989	goto out;
7990	}
7991
7992	/*
7993	* Check for noexec filesystems.
7994	*/
7995	if ((rights & KAUTH_VNODE_EXECUTE) &&
7996	(vap->va_type == VREG) && mp && (mp->mnt_flag & MNT_NOEXEC)) {
7997	result = EACCES;
7998	goto out;
7999	}
8000
8001	if (vfs_context_issuser(ctx)) {
8002	/*
8003	* if we're not asking for execute permissions or modifications,
8004	* then we're done, this action is authorized.
8005	*/
8006	if (!(rights & (KAUTH_VNODE_EXECUTE \| KAUTH_VNODE_WRITE_RIGHTS)))
8007	goto out;
8008	is_suser = TRUE;
8009	} else {
8010	if (!VATTR_IS_SUPPORTED(vap, va_uid) \|\|
8011	!VATTR_IS_SUPPORTED(vap, va_gid) \|\|
8012	(mp && vfs_extendedsecurity(mp) && !VATTR_IS_SUPPORTED(vap, va_acl))) {
8013	panic("vnode attrs not complete for vnode_attr_authorize\n");
8014	}
8015	}
8016
8017	result = vnode_attr_authorize_internal(vcp, mp, rights, is_suser,
8018	&found_deny, noimmutable, FALSE);
8019
8020	if (result == EPERM)
8021	result = EACCES;
8022	out:
8023	return (result);
8024	}
8025
8026
8027	int
8028	vnode_authattr_new(vnode_t dvp, struct vnode_attr vap, int* noauth, vfs_context_t ctx)
8029	{
8030	return vnode_authattr_new_internal(dvp, vap, noauth, NULL, ctx);
8031	}
8032
8033	/*
8034	* Check that the attribute information in vattr can be legally applied to
8035	* a new file by the context.
8036	*/
8037	static int
8038	vnode_authattr_new_internal(vnode_t dvp, struct vnode_attr vap, int* noauth, uint32_t *defaulted_fieldsp, vfs_context_t ctx)
8039	{
8040	int error;
8041	int has_priv_suser, ismember, defaulted_owner, defaulted_group, defaulted_mode;
8042	uint32_t inherit_flags;
8043	kauth_cred_t cred;
8044	guid_t changer;
8045	mount_t dmp;
8046	struct vnode_attr dva;
8047
8048	error = `0`;
8049
8050	if (defaulted_fieldsp) {
8051	*defaulted_fieldsp = `0`;
8052	}
8053
8054	defaulted_owner = defaulted_group = defaulted_mode = `0`;
8055
8056	inherit_flags = `0`;
8057
8058	/*
8059	* Require that the filesystem support extended security to apply any.
8060	*/
8061	if (!vfs_extendedsecurity(dvp->v_mount) &&
8062	(VATTR_IS_ACTIVE(vap, va_acl) \|\| VATTR_IS_ACTIVE(vap, va_uuuid) \|\| VATTR_IS_ACTIVE(vap, va_guuid))) {
8063	error = EINVAL;
8064	goto out;
8065	}
8066
8067	/*
8068	* Default some fields.
8069	*/
8070	dmp = dvp->v_mount;
8071
8072	/*
8073	* If the filesystem is mounted IGNORE_OWNERSHIP and an explicit owner is set, that
8074	* owner takes ownership of all new files.
8075	*/
8076	if ((dmp->mnt_flag & MNT_IGNORE_OWNERSHIP) && (dmp->mnt_fsowner != KAUTH_UID_NONE)) {
8077	VATTR_SET(vap, va_uid, dmp->mnt_fsowner);
8078	defaulted_owner = `1`;
8079	} else {
8080	if (!VATTR_IS_ACTIVE(vap, va_uid)) {
8081	/ default owner is current user /
8082	VATTR_SET(vap, va_uid, kauth_cred_getuid(vfs_context_ucred(ctx)));
8083	defaulted_owner = `1`;
8084	}
8085	}
8086
8087	/*
8088	* We need the dvp's va_flags and may need the gid of the directory,
8089	* we ask for both here.
8090	*/
8091	VATTR_INIT(&dva);
8092	VATTR_WANTED(&dva, va_gid);
8093	VATTR_WANTED(&dva, va_flags);
8094	if ((error = vnode_getattr(dvp, &dva, ctx)) != `0`)
8095	goto out;
8096
8097	/*
8098	* If the filesystem is mounted IGNORE_OWNERSHIP and an explicit grouo is set, that
8099	* group takes ownership of all new files.
8100	*/
8101	if ((dmp->mnt_flag & MNT_IGNORE_OWNERSHIP) && (dmp->mnt_fsgroup != KAUTH_GID_NONE)) {
8102	VATTR_SET(vap, va_gid, dmp->mnt_fsgroup);
8103	defaulted_group = `1`;
8104	} else {
8105	if (!VATTR_IS_ACTIVE(vap, va_gid)) {
8106	/ default group comes from parent object, fallback to current user /
8107	if (VATTR_IS_SUPPORTED(&dva, va_gid)) {
8108	VATTR_SET(vap, va_gid, dva.va_gid);
8109	} else {
8110	VATTR_SET(vap, va_gid, kauth_cred_getgid(vfs_context_ucred(ctx)));
8111	}
8112	defaulted_group = `1`;
8113	}
8114	}
8115
8116	if (!VATTR_IS_ACTIVE(vap, va_flags))
8117	VATTR_SET(vap, va_flags, `0`);
8118
8119	/ Determine if SF_RESTRICTED should be inherited from the parent*
8120	* directory. */
8121	if (VATTR_IS_SUPPORTED(&dva, va_flags)) {
8122	inherit_flags = dva.va_flags & (UF_DATAVAULT \| SF_RESTRICTED);
8123	}
8124
8125	/ default mode is everything, masked with current umask /
8126	if (!VATTR_IS_ACTIVE(vap, va_mode)) {
8127	VATTR_SET(vap, va_mode, ACCESSPERMS & ~vfs_context_proc(ctx)->p_fd->fd_cmask);
8128	KAUTH_DEBUG("ATTR - defaulting new file mode to %o from umask %o", vap->va_mode, vfs_context_proc(ctx)->p_fd->fd_cmask);
8129	defaulted_mode = `1`;
8130	}
8131	/ set timestamps to now /
8132	if (!VATTR_IS_ACTIVE(vap, va_create_time)) {
8133	nanotime(&vap->va_create_time);
8134	VATTR_SET_ACTIVE(vap, va_create_time);
8135	}
8136
8137	/*
8138	* Check for attempts to set nonsensical fields.
8139	*/
8140	if (vap->va_active & ~VNODE_ATTR_NEWOBJ) {
8141	error = EINVAL;
8142	KAUTH_DEBUG("ATTR - ERROR - attempt to set unsupported new-file attributes %llx",
8143	vap->va_active & ~VNODE_ATTR_NEWOBJ);
8144	goto out;
8145	}
8146
8147	/*
8148	* Quickly check for the applicability of any enforcement here.
8149	* Tests below maintain the integrity of the local security model.
8150	*/
8151	if (vfs_authopaque(dvp->v_mount))
8152	goto out;
8153
8154	/*
8155	* We need to know if the caller is the superuser, or if the work is
8156	* otherwise already authorised.
8157	*/
8158	cred = vfs_context_ucred(ctx);
8159	if (noauth) {
8160	/ doing work for the kernel /
8161	has_priv_suser = `1`;
8162	} else {
8163	has_priv_suser = vfs_context_issuser(ctx);
8164	}
8165
8166
8167	if (VATTR_IS_ACTIVE(vap, va_flags)) {
8168	if (has_priv_suser) {
8169	if ((vap->va_flags & (UF_SETTABLE \| SF_SETTABLE)) != vap->va_flags) {
8170	error = EPERM;
8171	KAUTH_DEBUG(" DENIED - superuser attempt to set illegal flag(s)");
8172	goto out;
8173	}
8174	} else {
8175	if ((vap->va_flags & UF_SETTABLE) != vap->va_flags) {
8176	error = EPERM;
8177	KAUTH_DEBUG(" DENIED - user attempt to set illegal flag(s)");
8178	goto out;
8179	}
8180	}
8181	}
8182
8183	/ if not superuser, validate legality of new-item attributes /
8184	if (!has_priv_suser) {
8185	if (!defaulted_mode && VATTR_IS_ACTIVE(vap, va_mode)) {
8186	/ setgid? /
8187	if (vap->va_mode & S_ISGID) {
8188	if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != `0`) {
8189	KAUTH_DEBUG("ATTR - ERROR: got %d checking for membership in %d", error, vap->va_gid);
8190	goto out;
8191	}
8192	if (!ismember) {
8193	KAUTH_DEBUG(" DENIED - can't set SGID bit, not a member of %d", vap->va_gid);
8194	error = EPERM;
8195	goto out;
8196	}
8197	}
8198
8199	/ setuid? /
8200	if ((vap->va_mode & S_ISUID) && (vap->va_uid != kauth_cred_getuid(cred))) {
8201	KAUTH_DEBUG("ATTR - ERROR: illegal attempt to set the setuid bit");
8202	error = EPERM;
8203	goto out;
8204	}
8205	}
8206	if (!defaulted_owner && (vap->va_uid != kauth_cred_getuid(cred))) {
8207	KAUTH_DEBUG(" DENIED - cannot create new item owned by %d", vap->va_uid);
8208	error = EPERM;
8209	goto out;
8210	}
8211	if (!defaulted_group) {
8212	if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != `0`) {
8213	KAUTH_DEBUG(" ERROR - got %d checking for membership in %d", error, vap->va_gid);
8214	goto out;
8215	}
8216	if (!ismember) {
8217	KAUTH_DEBUG(" DENIED - cannot create new item with group %d - not a member", vap->va_gid);
8218	error = EPERM;
8219	goto out;
8220	}
8221	}
8222
8223	/ initialising owner/group UUID /
8224	if (VATTR_IS_ACTIVE(vap, va_uuuid)) {
8225	if ((error = kauth_cred_getguid(cred, &changer)) != `0`) {
8226	KAUTH_DEBUG(" ERROR - got %d trying to get caller UUID", error);
8227	/ XXX ENOENT here - no GUID - should perhaps become EPERM /
8228	goto out;
8229	}
8230	if (!kauth_guid_equal(&vap->va_uuuid, &changer)) {
8231	KAUTH_DEBUG(" ERROR - cannot create item with supplied owner UUID - not us");
8232	error = EPERM;
8233	goto out;
8234	}
8235	}
8236	if (VATTR_IS_ACTIVE(vap, va_guuid)) {
8237	if ((error = kauth_cred_ismember_guid(cred, &vap->va_guuid, &ismember)) != `0`) {
8238	KAUTH_DEBUG(" ERROR - got %d trying to check group membership", error);
8239	goto out;
8240	}
8241	if (!ismember) {
8242	KAUTH_DEBUG(" ERROR - cannot create item with supplied group UUID - not a member");
8243	error = EPERM;
8244	goto out;
8245	}
8246	}
8247	}
8248	out:
8249	if (inherit_flags) {
8250	/ Apply SF_RESTRICTED to the file if its parent directory was*
8251	* restricted. This is done at the end so that root is not
8252	* required if this flag is only set due to inheritance. */
8253	VATTR_SET(vap, va_flags, (vap->va_flags \| inherit_flags));
8254	}
8255	if (defaulted_fieldsp) {
8256	if (defaulted_mode) {
8257	*defaulted_fieldsp \|= VATTR_PREPARE_DEFAULTED_MODE;
8258	}
8259	if (defaulted_group) {
8260	*defaulted_fieldsp \|= VATTR_PREPARE_DEFAULTED_GID;
8261	}
8262	if (defaulted_owner) {
8263	*defaulted_fieldsp \|= VATTR_PREPARE_DEFAULTED_UID;
8264	}
8265	}
8266	return(error);
8267	}
8268
8269	/*
8270	* Check that the attribute information in vap can be legally written by the
8271	* context.
8272	*
8273	* Call this when you're not sure about the vnode_attr; either its contents
8274	* have come from an unknown source, or when they are variable.
8275	*
8276	* Returns errno, or zero and sets actionp to the KAUTH_VNODE_ actions that
8277	* must be authorized to be permitted to write the vattr.
8278	*/
8279	int
8280	vnode_authattr(vnode_t vp, struct vnode_attr vap, kauth_action_t actionp, vfs_context_t ctx)
8281	{
8282	struct vnode_attr ova;
8283	kauth_action_t required_action;
8284	int error, has_priv_suser, ismember, chowner, chgroup, clear_suid, clear_sgid;
8285	guid_t changer;
8286	gid_t group;
8287	uid_t owner;
8288	mode_t newmode;
8289	kauth_cred_t cred;
8290	uint32_t fdelta;
8291
8292	VATTR_INIT(&ova);
8293	required_action = `0`;
8294	error = `0`;
8295
8296	/*
8297	* Quickly check for enforcement applicability.
8298	*/
8299	if (vfs_authopaque(vp->v_mount))
8300	goto out;
8301
8302	/*
8303	* Check for attempts to set nonsensical fields.
8304	*/
8305	if (vap->va_active & VNODE_ATTR_RDONLY) {
8306	KAUTH_DEBUG("ATTR - ERROR: attempt to set readonly attribute(s)");
8307	error = EINVAL;
8308	goto out;
8309	}
8310
8311	/*
8312	* We need to know if the caller is the superuser.
8313	*/
8314	cred = vfs_context_ucred(ctx);
8315	has_priv_suser = kauth_cred_issuser(cred);
8316
8317	/*
8318	* If any of the following are changing, we need information from the old file:
8319	* va_uid
8320	* va_gid
8321	* va_mode
8322	* va_uuuid
8323	* va_guuid
8324	*/
8325	if (VATTR_IS_ACTIVE(vap, va_uid) \|\|
8326	VATTR_IS_ACTIVE(vap, va_gid) \|\|
8327	VATTR_IS_ACTIVE(vap, va_mode) \|\|
8328	VATTR_IS_ACTIVE(vap, va_uuuid) \|\|
8329	VATTR_IS_ACTIVE(vap, va_guuid)) {
8330	VATTR_WANTED(&ova, va_mode);
8331	VATTR_WANTED(&ova, va_uid);
8332	VATTR_WANTED(&ova, va_gid);
8333	VATTR_WANTED(&ova, va_uuuid);
8334	VATTR_WANTED(&ova, va_guuid);
8335	KAUTH_DEBUG("ATTR - security information changing, fetching existing attributes");
8336	}
8337
8338	/*
8339	* If timestamps are being changed, we need to know who the file is owned
8340	* by.
8341	*/
8342	if (VATTR_IS_ACTIVE(vap, va_create_time) \|\|
8343	VATTR_IS_ACTIVE(vap, va_change_time) \|\|
8344	VATTR_IS_ACTIVE(vap, va_modify_time) \|\|
8345	VATTR_IS_ACTIVE(vap, va_access_time) \|\|
8346	VATTR_IS_ACTIVE(vap, va_backup_time) \|\|
8347	VATTR_IS_ACTIVE(vap, va_addedtime)) {
8348
8349	VATTR_WANTED(&ova, va_uid);
8350	#if 0 /* enable this when we support UUIDs as official owners */
8351	VATTR_WANTED(&ova, va_uuuid);
8352	#endif
8353	KAUTH_DEBUG("ATTR - timestamps changing, fetching uid and GUID");
8354	}
8355
8356	/*
8357	* If flags are being changed, we need the old flags.
8358	*/
8359	if (VATTR_IS_ACTIVE(vap, va_flags)) {
8360	KAUTH_DEBUG("ATTR - flags changing, fetching old flags");
8361	VATTR_WANTED(&ova, va_flags);
8362	}
8363
8364	/*
8365	* If ACLs are being changed, we need the old ACLs.
8366	*/
8367	if (VATTR_IS_ACTIVE(vap, va_acl)) {
8368	KAUTH_DEBUG("ATTR - acl changing, fetching old flags");
8369	VATTR_WANTED(&ova, va_acl);
8370	}
8371
8372	/*
8373	* If the size is being set, make sure it's not a directory.
8374	*/
8375	if (VATTR_IS_ACTIVE(vap, va_data_size)) {
8376	/ size is only meaningful on regular files, don't permit otherwise /
8377	if (!vnode_isreg(vp)) {
8378	KAUTH_DEBUG("ATTR - ERROR: size change requested on non-file");
8379	error = vnode_isdir(vp) ? EISDIR : EINVAL;
8380	goto out;
8381	}
8382	}
8383
8384	/*
8385	* Get old data.
8386	*/
8387	KAUTH_DEBUG("ATTR - fetching old attributes %016llx", ova.va_active);
8388	if ((error = vnode_getattr(vp, &ova, ctx)) != `0`) {
8389	KAUTH_DEBUG(" ERROR - got %d trying to get attributes", error);
8390	goto out;
8391	}
8392
8393	/*
8394	* Size changes require write access to the file data.
8395	*/
8396	if (VATTR_IS_ACTIVE(vap, va_data_size)) {
8397	/ if we can't get the size, or it's different, we need write access /
8398	KAUTH_DEBUG("ATTR - size change, requiring WRITE_DATA");
8399	required_action \|= KAUTH_VNODE_WRITE_DATA;
8400	}
8401
8402	/*
8403	* Changing timestamps?
8404	*
8405	* Note that we are only called to authorize user-requested time changes;
8406	* side-effect time changes are not authorized. Authorisation is only
8407	* required for existing files.
8408	*
8409	* Non-owners are not permitted to change the time on an existing
8410	* file to anything other than the current time.
8411	*/
8412	if (VATTR_IS_ACTIVE(vap, va_create_time) \|\|
8413	VATTR_IS_ACTIVE(vap, va_change_time) \|\|
8414	VATTR_IS_ACTIVE(vap, va_modify_time) \|\|
8415	VATTR_IS_ACTIVE(vap, va_access_time) \|\|
8416	VATTR_IS_ACTIVE(vap, va_backup_time) \|\|
8417	VATTR_IS_ACTIVE(vap, va_addedtime)) {
8418	/*
8419	* The owner and root may set any timestamps they like,
8420	* provided that the file is not immutable. The owner still needs
8421	* WRITE_ATTRIBUTES (implied by ownership but still deniable).
8422	*/
8423	if (has_priv_suser \|\| vauth_node_owner(&ova, cred)) {
8424	KAUTH_DEBUG("ATTR - root or owner changing timestamps");
8425	required_action \|= KAUTH_VNODE_CHECKIMMUTABLE \| KAUTH_VNODE_WRITE_ATTRIBUTES;
8426	} else {
8427	/ just setting the current time? /
8428	if (vap->va_vaflags & VA_UTIMES_NULL) {
8429	KAUTH_DEBUG("ATTR - non-root/owner changing timestamps, requiring WRITE_ATTRIBUTES");
8430	required_action \|= KAUTH_VNODE_WRITE_ATTRIBUTES;
8431	} else {
8432	KAUTH_DEBUG("ATTR - ERROR: illegal timestamp modification attempted");
8433	error = EACCES;
8434	goto out;
8435	}
8436	}
8437	}
8438
8439	/*
8440	* Changing file mode?
8441	*/
8442	if (VATTR_IS_ACTIVE(vap, va_mode) && VATTR_IS_SUPPORTED(&ova, va_mode) && (ova.va_mode != vap->va_mode)) {
8443	KAUTH_DEBUG("ATTR - mode change from %06o to %06o", ova.va_mode, vap->va_mode);
8444
8445	/*
8446	* Mode changes always have the same basic auth requirements.
8447	*/
8448	if (has_priv_suser) {
8449	KAUTH_DEBUG("ATTR - superuser mode change, requiring immutability check");
8450	required_action \|= KAUTH_VNODE_CHECKIMMUTABLE;
8451	} else {
8452	/ need WRITE_SECURITY /
8453	KAUTH_DEBUG("ATTR - non-superuser mode change, requiring WRITE_SECURITY");
8454	required_action \|= KAUTH_VNODE_WRITE_SECURITY;
8455	}
8456
8457	/*
8458	* Can't set the setgid bit if you're not in the group and not root. Have to have
8459	* existing group information in the case we're not setting it right now.
8460	*/
8461	if (vap->va_mode & S_ISGID) {
8462	required_action \|= KAUTH_VNODE_CHECKIMMUTABLE; / always required /
8463	if (!has_priv_suser) {
8464	if (VATTR_IS_ACTIVE(vap, va_gid)) {
8465	group = vap->va_gid;
8466	} else if (VATTR_IS_SUPPORTED(&ova, va_gid)) {
8467	group = ova.va_gid;
8468	} else {
8469	KAUTH_DEBUG("ATTR - ERROR: setgid but no gid available");
8470	error = EINVAL;
8471	goto out;
8472	}
8473	/*
8474	* This might be too restrictive; WRITE_SECURITY might be implied by
8475	* membership in this case, rather than being an additional requirement.
8476	*/
8477	if ((error = kauth_cred_ismember_gid(cred, group, &ismember)) != `0`) {
8478	KAUTH_DEBUG("ATTR - ERROR: got %d checking for membership in %d", error, vap->va_gid);
8479	goto out;
8480	}
8481	if (!ismember) {
8482	KAUTH_DEBUG(" DENIED - can't set SGID bit, not a member of %d", group);
8483	error = EPERM;
8484	goto out;
8485	}
8486	}
8487	}
8488
8489	/*
8490	* Can't set the setuid bit unless you're root or the file's owner.
8491	*/
8492	if (vap->va_mode & S_ISUID) {
8493	required_action \|= KAUTH_VNODE_CHECKIMMUTABLE; / always required /
8494	if (!has_priv_suser) {
8495	if (VATTR_IS_ACTIVE(vap, va_uid)) {
8496	owner = vap->va_uid;
8497	} else if (VATTR_IS_SUPPORTED(&ova, va_uid)) {
8498	owner = ova.va_uid;
8499	} else {
8500	KAUTH_DEBUG("ATTR - ERROR: setuid but no uid available");
8501	error = EINVAL;
8502	goto out;
8503	}
8504	if (owner != kauth_cred_getuid(cred)) {
8505	/*
8506	* We could allow this if WRITE_SECURITY is permitted, perhaps.
8507	*/
8508	KAUTH_DEBUG("ATTR - ERROR: illegal attempt to set the setuid bit");
8509	error = EPERM;
8510	goto out;
8511	}
8512	}
8513	}
8514	}
8515
8516	/*
8517	* Validate/mask flags changes. This checks that only the flags in
8518	* the UF_SETTABLE mask are being set, and preserves the flags in
8519	* the SF_SETTABLE case.
8520	*
8521	* Since flags changes may be made in conjunction with other changes,
8522	* we will ask the auth code to ignore immutability in the case that
8523	* the SF_* flags are not set and we are only manipulating the file flags.
8524	*
8525	*/
8526	if (VATTR_IS_ACTIVE(vap, va_flags)) {
8527	/ compute changing flags bits /
8528	if (VATTR_IS_SUPPORTED(&ova, va_flags)) {
8529	fdelta = vap->va_flags ^ ova.va_flags;
8530	} else {
8531	fdelta = vap->va_flags;
8532	}
8533
8534	if (fdelta != `0`) {
8535	KAUTH_DEBUG("ATTR - flags changing, requiring WRITE_SECURITY");
8536	required_action \|= KAUTH_VNODE_WRITE_SECURITY;
8537
8538	/ check that changing bits are legal /
8539	if (has_priv_suser) {
8540	/*
8541	* The immutability check will prevent us from clearing the SF_*
8542	* flags unless the system securelevel permits it, so just check
8543	* for legal flags here.
8544	*/
8545	if (fdelta & ~(UF_SETTABLE \| SF_SETTABLE)) {
8546	error = EPERM;
8547	KAUTH_DEBUG(" DENIED - superuser attempt to set illegal flag(s)");
8548	goto out;
8549	}
8550	} else {
8551	if (fdelta & ~UF_SETTABLE) {
8552	error = EPERM;
8553	KAUTH_DEBUG(" DENIED - user attempt to set illegal flag(s)");
8554	goto out;
8555	}
8556	}
8557	/*
8558	* If the caller has the ability to manipulate file flags,
8559	* security is not reduced by ignoring them for this operation.
8560	*
8561	* A more complete test here would consider the 'after' states of the flags
8562	* to determine whether it would permit the operation, but this becomes
8563	* very complex.
8564	*
8565	* Ignoring immutability is conditional on securelevel; this does not bypass
8566	* the SF_* flags if securelevel > 0.
8567	*/
8568	required_action \|= KAUTH_VNODE_NOIMMUTABLE;
8569	}
8570	}
8571
8572	/*
8573	* Validate ownership information.
8574	*/
8575	chowner = `0`;
8576	chgroup = `0`;
8577	clear_suid = `0`;
8578	clear_sgid = `0`;
8579
8580	/*
8581	* uid changing
8582	* Note that if the filesystem didn't give us a UID, we expect that it doesn't
8583	* support them in general, and will ignore it if/when we try to set it.
8584	* We might want to clear the uid out of vap completely here.
8585	*/
8586	if (VATTR_IS_ACTIVE(vap, va_uid)) {
8587	if (VATTR_IS_SUPPORTED(&ova, va_uid) && (vap->va_uid != ova.va_uid)) {
8588	if (!has_priv_suser && (kauth_cred_getuid(cred) != vap->va_uid)) {
8589	KAUTH_DEBUG(" DENIED - non-superuser cannot change ownershipt to a third party");
8590	error = EPERM;
8591	goto out;
8592	}
8593	chowner = `1`;
8594	}
8595	clear_suid = `1`;
8596	}
8597
8598	/*
8599	* gid changing
8600	* Note that if the filesystem didn't give us a GID, we expect that it doesn't
8601	* support them in general, and will ignore it if/when we try to set it.
8602	* We might want to clear the gid out of vap completely here.
8603	*/
8604	if (VATTR_IS_ACTIVE(vap, va_gid)) {
8605	if (VATTR_IS_SUPPORTED(&ova, va_gid) && (vap->va_gid != ova.va_gid)) {
8606	if (!has_priv_suser) {
8607	if ((error = kauth_cred_ismember_gid(cred, vap->va_gid, &ismember)) != `0`) {
8608	KAUTH_DEBUG(" ERROR - got %d checking for membership in %d", error, vap->va_gid);
8609	goto out;
8610	}
8611	if (!ismember) {
8612	KAUTH_DEBUG(" DENIED - group change from %d to %d but not a member of target group",
8613	ova.va_gid, vap->va_gid);
8614	error = EPERM;
8615	goto out;
8616	}
8617	}
8618	chgroup = `1`;
8619	}
8620	clear_sgid = `1`;
8621	}
8622
8623	/*
8624	* Owner UUID being set or changed.
8625	*/
8626	if (VATTR_IS_ACTIVE(vap, va_uuuid)) {
8627	/ if the owner UUID is not actually changing ... /
8628	if (VATTR_IS_SUPPORTED(&ova, va_uuuid)) {
8629	if (kauth_guid_equal(&vap->va_uuuid, &ova.va_uuuid))
8630	goto no_uuuid_change;
8631
8632	/*
8633	* If the current owner UUID is a null GUID, check
8634	* it against the UUID corresponding to the owner UID.
8635	*/
8636	if (kauth_guid_equal(&ova.va_uuuid, &kauth_null_guid) &&
8637	VATTR_IS_SUPPORTED(&ova, va_uid)) {
8638	guid_t uid_guid;
8639
8640	if (kauth_cred_uid2guid(ova.va_uid, &uid_guid) == `0` &&
8641	kauth_guid_equal(&vap->va_uuuid, &uid_guid))
8642	goto no_uuuid_change;
8643	}
8644	}
8645
8646	/*
8647	* The owner UUID cannot be set by a non-superuser to anything other than
8648	* their own or a null GUID (to "unset" the owner UUID).
8649	* Note that file systems must be prepared to handle the
8650	* null UUID case in a manner appropriate for that file
8651	* system.
8652	*/
8653	if (!has_priv_suser) {
8654	if ((error = kauth_cred_getguid(cred, &changer)) != `0`) {
8655	KAUTH_DEBUG(" ERROR - got %d trying to get caller UUID", error);
8656	/ XXX ENOENT here - no UUID - should perhaps become EPERM /
8657	goto out;
8658	}
8659	if (!kauth_guid_equal(&vap->va_uuuid, &changer) &&
8660	!kauth_guid_equal(&vap->va_uuuid, &kauth_null_guid)) {
8661	KAUTH_DEBUG(" ERROR - cannot set supplied owner UUID - not us / null");
8662	error = EPERM;
8663	goto out;
8664	}
8665	}
8666	chowner = `1`;
8667	clear_suid = `1`;
8668	}
8669	no_uuuid_change:
8670	/*
8671	* Group UUID being set or changed.
8672	*/
8673	if (VATTR_IS_ACTIVE(vap, va_guuid)) {
8674	/ if the group UUID is not actually changing ... /
8675	if (VATTR_IS_SUPPORTED(&ova, va_guuid)) {
8676	if (kauth_guid_equal(&vap->va_guuid, &ova.va_guuid))
8677	goto no_guuid_change;
8678
8679	/*
8680	* If the current group UUID is a null UUID, check
8681	* it against the UUID corresponding to the group GID.
8682	*/
8683	if (kauth_guid_equal(&ova.va_guuid, &kauth_null_guid) &&
8684	VATTR_IS_SUPPORTED(&ova, va_gid)) {
8685	guid_t gid_guid;
8686
8687	if (kauth_cred_gid2guid(ova.va_gid, &gid_guid) == `0` &&
8688	kauth_guid_equal(&vap->va_guuid, &gid_guid))
8689	goto no_guuid_change;
8690	}
8691	}
8692
8693	/*
8694	* The group UUID cannot be set by a non-superuser to anything other than
8695	* one of which they are a member or a null GUID (to "unset"
8696	* the group UUID).
8697	* Note that file systems must be prepared to handle the
8698	* null UUID case in a manner appropriate for that file
8699	* system.
8700	*/
8701	if (!has_priv_suser) {
8702	if (kauth_guid_equal(&vap->va_guuid, &kauth_null_guid))
8703	ismember = `1`;
8704	else if ((error = kauth_cred_ismember_guid(cred, &vap->va_guuid, &ismember)) != `0`) {
8705	KAUTH_DEBUG(" ERROR - got %d trying to check group membership", error);
8706	goto out;
8707	}
8708	if (!ismember) {
8709	KAUTH_DEBUG(" ERROR - cannot set supplied group UUID - not a member / null");
8710	error = EPERM;
8711	goto out;
8712	}
8713	}
8714	chgroup = `1`;
8715	}
8716	no_guuid_change:
8717
8718	/*
8719	* Compute authorisation for group/ownership changes.
8720	*/
8721	if (chowner \|\| chgroup \|\| clear_suid \|\| clear_sgid) {
8722	if (has_priv_suser) {
8723	KAUTH_DEBUG("ATTR - superuser changing file owner/group, requiring immutability check");
8724	required_action \|= KAUTH_VNODE_CHECKIMMUTABLE;
8725	} else {
8726	if (chowner) {
8727	KAUTH_DEBUG("ATTR - ownership change, requiring TAKE_OWNERSHIP");
8728	required_action \|= KAUTH_VNODE_TAKE_OWNERSHIP;
8729	}
8730	if (chgroup && !chowner) {
8731	KAUTH_DEBUG("ATTR - group change, requiring WRITE_SECURITY");
8732	required_action \|= KAUTH_VNODE_WRITE_SECURITY;
8733	}
8734
8735	}
8736
8737	/*
8738	* clear set-uid and set-gid bits. POSIX only requires this for
8739	* non-privileged processes but we do it even for root.
8740	*/
8741	if (VATTR_IS_ACTIVE(vap, va_mode)) {
8742	newmode = vap->va_mode;
8743	} else if (VATTR_IS_SUPPORTED(&ova, va_mode)) {
8744	newmode = ova.va_mode;
8745	} else {
8746	KAUTH_DEBUG("CHOWN - trying to change owner but cannot get mode from filesystem to mask setugid bits");
8747	newmode = `0`;
8748	}
8749
8750	/ chown always clears setuid/gid bits. An exception is made for*
8751	* setattrlist executed by a root process to set <uid, gid, mode> on a file:
8752	* setattrlist is allowed to set the new mode on the file and change (chown)
8753	* uid/gid.
8754	*/
8755	if (newmode & (S_ISUID \| S_ISGID)) {
8756	if (!VATTR_IS_ACTIVE(vap, va_mode) \|\| !has_priv_suser) {
8757	KAUTH_DEBUG("CHOWN - masking setugid bits from mode %o to %o",
8758	newmode, newmode & ~(S_ISUID \| S_ISGID));
8759	newmode &= ~(S_ISUID \| S_ISGID);
8760	}
8761	VATTR_SET(vap, va_mode, newmode);
8762	}
8763	}
8764
8765	/*
8766	* Authorise changes in the ACL.
8767	*/
8768	if (VATTR_IS_ACTIVE(vap, va_acl)) {
8769
8770	/ no existing ACL /
8771	if (!VATTR_IS_ACTIVE(&ova, va_acl) \|\| (ova.va_acl == NULL)) {
8772
8773	/ adding an ACL /
8774	if (vap->va_acl != NULL) {
8775	required_action \|= KAUTH_VNODE_WRITE_SECURITY;
8776	KAUTH_DEBUG("CHMOD - adding ACL");
8777	}
8778
8779	/ removing an existing ACL /
8780	} else if (vap->va_acl == NULL) {
8781	required_action \|= KAUTH_VNODE_WRITE_SECURITY;
8782	KAUTH_DEBUG("CHMOD - removing ACL");
8783
8784	/ updating an existing ACL /
8785	} else {
8786	if (vap->va_acl->acl_entrycount != ova.va_acl->acl_entrycount) {
8787	/ entry count changed, must be different /
8788	required_action \|= KAUTH_VNODE_WRITE_SECURITY;
8789	KAUTH_DEBUG("CHMOD - adding/removing ACL entries");
8790	} else if (vap->va_acl->acl_entrycount > `0`) {
8791	/ both ACLs have the same ACE count, said count is 1 or more, bitwise compare ACLs /
8792	if (memcmp(&vap->va_acl->acl_ace[`0`], &ova.va_acl->acl_ace[`0`],
8793	sizeof(struct kauth_ace) * vap->va_acl->acl_entrycount)) {
8794	required_action \|= KAUTH_VNODE_WRITE_SECURITY;
8795	KAUTH_DEBUG("CHMOD - changing ACL entries");
8796	}
8797	}
8798	}
8799	}
8800
8801	/*
8802	* Other attributes that require authorisation.
8803	*/
8804	if (VATTR_IS_ACTIVE(vap, va_encoding))
8805	required_action \|= KAUTH_VNODE_WRITE_ATTRIBUTES;
8806
8807	out:
8808	if (VATTR_IS_SUPPORTED(&ova, va_acl) && (ova.va_acl != NULL))
8809	kauth_acl_free(ova.va_acl);
8810	if (error == `0`)
8811	*actionp = required_action;
8812	return(error);
8813	}
8814
8815	static int
8816	setlocklocal_callback(struct vnode vp, __unused void* *cargs)
8817	{
8818	vnode_lock_spin(vp);
8819	vp->v_flag \|= VLOCKLOCAL;
8820	vnode_unlock(vp);
8821
8822	return (VNODE_RETURNED);
8823	}
8824
8825	void
8826	vfs_setlocklocal(mount_t mp)
8827	{
8828	mount_lock_spin(mp);
8829	mp->mnt_kern_flag \|= MNTK_LOCK_LOCAL;
8830	mount_unlock(mp);
8831
8832	/*
8833	* The number of active vnodes is expected to be
8834	* very small when vfs_setlocklocal is invoked.
8835	*/
8836	vnode_iterate(mp, `0`, setlocklocal_callback, NULL);
8837	}
8838
8839	void
8840	vfs_setcompoundopen(mount_t mp)
8841	{
8842	mount_lock_spin(mp);
8843	mp->mnt_compound_ops \|= COMPOUND_VNOP_OPEN;
8844	mount_unlock(mp);
8845	}
8846
8847	void
8848	vnode_setswapmount(vnode_t vp)
8849	{
8850	mount_lock(vp->v_mount);
8851	vp->v_mount->mnt_kern_flag \|= MNTK_SWAP_MOUNT;
8852	mount_unlock(vp->v_mount);
8853	}
8854
8855
8856	int64_t
8857	vnode_getswappin_avail(vnode_t vp)
8858	{
8859	int64_t max_swappin_avail = `0`;
8860
8861	mount_lock(vp->v_mount);
8862	if (vp->v_mount->mnt_ioflags & MNT_IOFLAGS_SWAPPIN_SUPPORTED)
8863	max_swappin_avail = vp->v_mount->mnt_max_swappin_available;
8864	mount_unlock(vp->v_mount);
8865
8866	return (max_swappin_avail);
8867	}
8868
8869
8870	void
8871	vn_setunionwait(vnode_t vp)
8872	{
8873	vnode_lock_spin(vp);
8874	vp->v_flag \|= VISUNION;
8875	vnode_unlock(vp);
8876	}
8877
8878
8879	void
8880	vn_checkunionwait(vnode_t vp)
8881	{
8882	vnode_lock_spin(vp);
8883	while ((vp->v_flag & VISUNION) == VISUNION)
8884	msleep((caddr_t)&vp->v_flag, &vp->v_lock, `0`, `0`, `0`);
8885	vnode_unlock(vp);
8886	}
8887
8888	void
8889	vn_clearunionwait(vnode_t vp, int locked)
8890	{
8891	if (!locked)
8892	vnode_lock_spin(vp);
8893	if((vp->v_flag & VISUNION) == VISUNION) {
8894	vp->v_flag &= ~VISUNION;
8895	wakeup((caddr_t)&vp->v_flag);
8896	}
8897	if (!locked)
8898	vnode_unlock(vp);
8899	}
8900
8901	/*
8902	* Removes orphaned apple double files during a rmdir
8903	* Works by:
8904	* 1. vnode_suspend().
8905	* 2. Call VNOP_READDIR() till the end of directory is reached.
8906	* 3. Check if the directory entries returned are regular files with name starting with "._". If not, return ENOTEMPTY.
8907	* 4. Continue (2) and (3) till end of directory is reached.
8908	* 5. If all the entries in the directory were files with "._" name, delete all the files.
8909	* 6. vnode_resume()
8910	* 7. If deletion of all files succeeded, call VNOP_RMDIR() again.
8911	*/
8912
8913	errno_t rmdir_remove_orphaned_appleDouble(vnode_t vp , vfs_context_t ctx, int * restart_flag)
8914	{
8915
8916	#define UIO_BUFF_SIZE 2048
8917	uio_t auio = NULL;
8918	int eofflag, siz = UIO_BUFF_SIZE, nentries = `0`;
8919	int open_flag = `0`, full_erase_flag = `0`;
8920	char uio_buf[ UIO_SIZEOF(`1`) ];
8921	char *rbuf = NULL;
8922	void *dir_pos;
8923	void *dir_end;
8924	struct dirent *dp;
8925	errno_t error;
8926
8927	error = vnode_suspend(vp);
8928
8929	/*
8930	* restart_flag is set so that the calling rmdir sleeps and resets
8931	*/
8932	if (error == EBUSY)
8933	*restart_flag = `1`;
8934	if (error != `0`)
8935	return (error);
8936
8937	/*
8938	* set up UIO
8939	*/
8940	MALLOC(rbuf, caddr_t, siz, M_TEMP, M_WAITOK);
8941	if (rbuf)
8942	auio = uio_createwithbuffer(`1`, `0`, UIO_SYSSPACE, UIO_READ,
8943	&uio_buf[`0`], sizeof(uio_buf));
8944	if (!rbuf \|\| !auio) {
8945	error = ENOMEM;
8946	goto outsc;
8947	}
8948
8949	uio_setoffset(auio,`0`);
8950
8951	eofflag = `0`;
8952
8953	if ((error = VNOP_OPEN(vp, FREAD, ctx)))
8954	goto outsc;
8955	else
8956	open_flag = `1`;
8957
8958	/*
8959	* First pass checks if all files are appleDouble files.
8960	*/
8961
8962	do {
8963	siz = UIO_BUFF_SIZE;
8964	uio_reset(auio, uio_offset(auio), UIO_SYSSPACE, UIO_READ);
8965	uio_addiov(auio, CAST_USER_ADDR_T(rbuf), UIO_BUFF_SIZE);
8966
8967	if((error = VNOP_READDIR(vp, auio, `0`, &eofflag, &nentries, ctx)))
8968	goto outsc;
8969
8970	if (uio_resid(auio) != `0`)
8971	siz -= uio_resid(auio);
8972
8973	/*
8974	* Iterate through directory
8975	*/
8976	dir_pos = (void*) rbuf;
8977	dir_end = (void*) (rbuf + siz);
8978	dp = (struct dirent*) (dir_pos);
8979
8980	if (dir_pos == dir_end)
8981	eofflag = `1`;
8982
8983	while (dir_pos < dir_end) {
8984	/*
8985	* Check for . and .. as well as directories
8986	*/
8987	if (dp->d_ino != `0` &&
8988	!((dp->d_namlen == `1` && dp->d_name[`0`] == `'.'`) \|\|
8989	(dp->d_namlen == `2` && dp->d_name[`0`] == `'.'` && dp->d_name[`1`] == `'.'`))) {
8990	/*
8991	* Check for irregular files and ._ files
8992	* If there is a ._._ file abort the op
8993	*/
8994	if ( dp->d_namlen < `2` \|\|
8995	strncmp(dp->d_name,"._",`2`) \|\|
8996	(dp->d_namlen >= `4` && !strncmp(&(dp->d_name[`2`]), "._",`2`))) {
8997	error = ENOTEMPTY;
8998	goto outsc;
8999	}
9000	}
9001	dir_pos = (void) ((uint8_t)dir_pos + dp->d_reclen);
9002	dp = (struct dirent*)dir_pos;
9003	}
9004
9005	/*
9006	* workaround for HFS/NFS setting eofflag before end of file
9007	*/
9008	if (vp->v_tag == VT_HFS && nentries > `2`)
9009	eofflag=`0`;
9010
9011	if (vp->v_tag == VT_NFS) {
9012	if (eofflag && !full_erase_flag) {
9013	full_erase_flag = `1`;
9014	eofflag = `0`;
9015	uio_reset(auio, `0`, UIO_SYSSPACE, UIO_READ);
9016	}
9017	else if (!eofflag && full_erase_flag)
9018	full_erase_flag = `0`;
9019	}
9020
9021	} while (!eofflag);
9022	/*
9023	* If we've made it here all the files in the dir are ._ files.
9024	* We can delete the files even though the node is suspended
9025	* because we are the owner of the file.
9026	*/
9027
9028	uio_reset(auio, `0`, UIO_SYSSPACE, UIO_READ);
9029	eofflag = `0`;
9030	full_erase_flag = `0`;
9031
9032	do {
9033	siz = UIO_BUFF_SIZE;
9034	uio_reset(auio, uio_offset(auio), UIO_SYSSPACE, UIO_READ);
9035	uio_addiov(auio, CAST_USER_ADDR_T(rbuf), UIO_BUFF_SIZE);
9036
9037	error = VNOP_READDIR(vp, auio, `0`, &eofflag, &nentries, ctx);
9038
9039	if (error != `0`)
9040	goto outsc;
9041
9042	if (uio_resid(auio) != `0`)
9043	siz -= uio_resid(auio);
9044
9045	/*
9046	* Iterate through directory
9047	*/
9048	dir_pos = (void*) rbuf;
9049	dir_end = (void*) (rbuf + siz);
9050	dp = (struct dirent*) dir_pos;
9051
9052	if (dir_pos == dir_end)
9053	eofflag = `1`;
9054
9055	while (dir_pos < dir_end) {
9056	/*
9057	* Check for . and .. as well as directories
9058	*/
9059	if (dp->d_ino != `0` &&
9060	!((dp->d_namlen == `1` && dp->d_name[`0`] == `'.'`) \|\|
9061	(dp->d_namlen == `2` && dp->d_name[`0`] == `'.'` && dp->d_name[`1`] == `'.'`))
9062	) {
9063
9064	error = unlink1(ctx, vp,
9065	CAST_USER_ADDR_T(dp->d_name), UIO_SYSSPACE,
9066	VNODE_REMOVE_SKIP_NAMESPACE_EVENT \|
9067	VNODE_REMOVE_NO_AUDIT_PATH);
9068
9069	if (error && error != ENOENT) {
9070	goto outsc;
9071	}
9072
9073	}
9074	dir_pos = (void) ((uint8_t)dir_pos + dp->d_reclen);
9075	dp = (struct dirent*)dir_pos;
9076	}
9077
9078	/*
9079	* workaround for HFS/NFS setting eofflag before end of file
9080	*/
9081	if (vp->v_tag == VT_HFS && nentries > `2`)
9082	eofflag=`0`;
9083
9084	if (vp->v_tag == VT_NFS) {
9085	if (eofflag && !full_erase_flag) {
9086	full_erase_flag = `1`;
9087	eofflag = `0`;
9088	uio_reset(auio, `0`, UIO_SYSSPACE, UIO_READ);
9089	}
9090	else if (!eofflag && full_erase_flag)
9091	full_erase_flag = `0`;
9092	}
9093
9094	} while (!eofflag);
9095
9096
9097	error = `0`;
9098
9099	outsc:
9100	if (open_flag)
9101	VNOP_CLOSE(vp, FREAD, ctx);
9102
9103	if (auio)
9104	uio_free(auio);
9105	FREE(rbuf, M_TEMP);
9106
9107	vnode_resume(vp);
9108
9109
9110	return(error);
9111
9112	}
9113
9114
9115	void
9116	lock_vnode_and_post(vnode_t vp, int kevent_num)
9117	{
9118	/ Only take the lock if there's something there! /
9119	if (vp->v_knotes.slh_first != NULL) {
9120	vnode_lock(vp);
9121	KNOTE(&vp->v_knotes, kevent_num);
9122	vnode_unlock(vp);
9123	}
9124	}
9125
9126	void panic_print_vnodes(void);
9127
/* Define PANIC_PRINTS_VNODES only if investigation is required. */
9129	#ifdef PANIC_PRINTS_VNODES
9130
9131	static const char *__vtype(uint16_t vtype)
9132	{
9133	switch (vtype) {
9134	case VREG:
9135	return "R";
9136	case VDIR:
9137	return "D";
9138	case VBLK:
9139	return "B";
9140	case VCHR:
9141	return "C";
9142	case VLNK:
9143	return "L";
9144	case VSOCK:
9145	return "S";
9146	case VFIFO:
9147	return "F";
9148	case VBAD:
9149	return "x";
9150	case VSTR:
9151	return "T";
9152	case VCPLX:
9153	return "X";
9154	default:
9155	return "?";
9156	}
9157	}
9158
9159	/*
9160	* build a path from the bottom up
9161	* NOTE: called from the panic path - no alloc'ing of memory and no locks!
9162	*/
9163	static char __vpath(vnode_t vp, char* str, int* len, int depth)
9164	{
9165	int vnm_len;
9166	const char *src;
9167	char *dst;
9168
9169	if (len <= `0`)
9170	return str;
9171	/ str + len is the start of the string we created /
9172	if (!vp->v_name)
9173	return str + len;
9174
9175	/ follow mount vnodes to get the full path /
9176	if ((vp->v_flag & VROOT)) {
9177	if (vp->v_mount != NULL && vp->v_mount->mnt_vnodecovered) {
9178	return __vpath(vp->v_mount->mnt_vnodecovered,
9179	str, len, depth+`1`);
9180	}
9181	return str + len;
9182	}
9183
9184	src = vp->v_name;
9185	vnm_len = strlen(src);
9186	if (vnm_len > len) {
9187	/ truncate the name to fit in the string /
9188	src += (vnm_len - len);
9189	vnm_len = len;
9190	}
9191
9192	/ start from the back and copy just characters (no NULLs) /
9193
9194	/ this will chop off leaf path (file) names /
9195	if (depth > `0`) {
9196	dst = str + len - vnm_len;
9197	memcpy(dst, src, vnm_len);
9198	len -= vnm_len;
9199	} else {
9200	dst = str + len;
9201	}
9202
9203	if (vp->v_parent && len > `1`) {
9204	/ follow parents up the chain /
9205	len--;
9206	*(dst-`1`) = `'/'`;
9207	return __vpath(vp->v_parent, str, len, depth + `1`);
9208	}
9209
9210	return dst;
9211	}
9212
9213	#define SANE_VNODE_PRINT_LIMIT 5000
9214	void panic_print_vnodes(void)
9215	{
9216	mount_t mnt;
9217	vnode_t vp;
9218	int nvnodes = `0`;
9219	const char *type;
9220	char *nm;
9221	char vname[`257`];
9222
9223	paniclog_append_noflush("\n*** VNODES ***\n"
9224	"TYPE UREF ICNT PATH\n");
9225
9226	/ NULL-terminate the path name /
9227	vname[sizeof(vname)-`1`] = `'\0'`;
9228
9229	/*
9230	* iterate all vnodelist items in all mounts (mntlist) -> mnt_vnodelist
9231	*/
9232	TAILQ_FOREACH(mnt, &mountlist, mnt_list) {
9233
9234	if (!ml_validate_nofault((vm_offset_t)mnt, sizeof(mount_t))) {
9235	paniclog_append_noflush("Unable to iterate the mount list %p - encountered an invalid mount pointer %p \n",
9236	&mountlist, mnt);
9237	break;
9238	}
9239
9240	TAILQ_FOREACH(vp, &mnt->mnt_vnodelist, v_mntvnodes) {
9241
9242	if (!ml_validate_nofault((vm_offset_t)vp, sizeof(vnode_t))) {
9243	paniclog_append_noflush("Unable to iterate the vnode list %p - encountered an invalid vnode pointer %p \n",
9244	&mnt->mnt_vnodelist, vp);
9245	break;
9246	}
9247
9248	if (++nvnodes > SANE_VNODE_PRINT_LIMIT)
9249	return;
9250	type = __vtype(vp->v_type);
9251	nm = __vpath(vp, vname, sizeof(vname)-`1`, `0`);
9252	paniclog_append_noflush("%s %0d %0d %s\n",
9253	type, vp->v_usecount, vp->v_iocount, nm);
9254	}
9255	}
9256	}
9257
9258	#else /* !PANIC_PRINTS_VNODES */
/*
 * Stub used when PANIC_PRINTS_VNODES is not defined: nothing is added to
 * the panic log.
 */
void panic_print_vnodes(void)
{
}
9263	#endif
9264
9265
9266	#ifdef JOE_DEBUG
9267	static void record_vp(vnode_t vp, int count) {
9268	struct uthread *ut;
9269
9270	#if CONFIG_TRIGGERS
9271	if (vp->v_resolve)
9272	return;
9273	#endif
9274	if ((vp->v_flag & VSYSTEM))
9275	return;
9276
9277	ut = get_bsdthread_info(current_thread());
9278	ut->uu_iocount += count;
9279
9280	if (count == `1`) {
9281	if (ut->uu_vpindex < `32`) {
9282	OSBacktrace((void **)&ut->uu_pcs[ut->uu_vpindex][`0`], `10`);
9283
9284	ut->uu_vps[ut->uu_vpindex] = vp;
9285	ut->uu_vpindex++;
9286	}
9287	}
9288	}
9289	#endif
9290
9291
9292	#if CONFIG_TRIGGERS
9293
/* Set to a non-zero value to make TRIG_LOG() print through printf(). */
#define TRIG_DEBUG 0

#if TRIG_DEBUG
/* Each message is prefixed with the name of the calling function. */
#define TRIG_LOG(...) do { printf("%s: ", __FUNCTION__); printf(__VA_ARGS__); } while (0)
#else
/* Logging disabled: TRIG_LOG() expands to nothing. */
#define TRIG_LOG(...)
#endif
9301
9302	/*
9303	* Resolver result functions
9304	*/
9305
9306	resolver_result_t
9307	vfs_resolver_result(uint32_t seq, enum resolver_status stat, int aux)
9308	{
9309	/*
9310	* \|<--- 32 --->\|<--- 28 --->\|<- 4 ->\|
9311	* sequence auxiliary status
9312	*/
9313	return (((uint64_t)seq) << `32`) \|
9314	(((uint64_t)(aux & `0x0fffffff`)) << `4`) \|
9315	(uint64_t)(stat & `0x0000000F`);
9316	}
9317
9318	enum resolver_status
9319	vfs_resolver_status(resolver_result_t result)
9320	{
9321	/ lower 4 bits is status /
9322	return (result & `0x0000000F`);
9323	}
9324
9325	uint32_t
9326	vfs_resolver_sequence(resolver_result_t result)
9327	{
9328	/ upper 32 bits is sequence /
9329	return (uint32_t)(result >> `32`);
9330	}
9331
9332	int
9333	vfs_resolver_auxiliary(resolver_result_t result)
9334	{
9335	/ 28 bits of auxiliary /
9336	return (int)(((uint32_t)(result & `0xFFFFFFF0`)) >> `4`);
9337	}
9338
9339	/*
9340	* SPI
9341	* Call in for resolvers to update vnode trigger state
9342	*/
9343	int
9344	vnode_trigger_update(vnode_t vp, resolver_result_t result)
9345	{
9346	vnode_resolve_t rp;
9347	uint32_t seq;
9348	enum resolver_status stat;
9349
9350	if (vp->v_resolve == NULL) {
9351	return (EINVAL);
9352	}
9353
9354	stat = vfs_resolver_status(result);
9355	seq = vfs_resolver_sequence(result);
9356
9357	if ((stat != RESOLVER_RESOLVED) && (stat != RESOLVER_UNRESOLVED)) {
9358	return (EINVAL);
9359	}
9360
9361	rp = vp->v_resolve;
9362	lck_mtx_lock(&rp->vr_lock);
9363
9364	if (seq > rp->vr_lastseq) {
9365	if (stat == RESOLVER_RESOLVED)
9366	rp->vr_flags \|= VNT_RESOLVED;
9367	else
9368	rp->vr_flags &= ~VNT_RESOLVED;
9369
9370	rp->vr_lastseq = seq;
9371	}
9372
9373	lck_mtx_unlock(&rp->vr_lock);
9374
9375	return (`0`);
9376	}
9377
9378	static int
9379	vnode_resolver_attach(vnode_t vp, vnode_resolve_t rp, boolean_t ref)
9380	{
9381	int error;
9382
9383	vnode_lock_spin(vp);
9384	if (vp->v_resolve != NULL) {
9385	vnode_unlock(vp);
9386	return EINVAL;
9387	} else {
9388	vp->v_resolve = rp;
9389	}
9390	vnode_unlock(vp);
9391
9392	if (ref) {
9393	error = vnode_ref_ext(vp, O_EVTONLY, VNODE_REF_FORCE);
9394	if (error != `0`) {
9395	panic("VNODE_REF_FORCE didn't help...");
9396	}
9397	}
9398
9399	return `0`;
9400	}
9401
9402	/*
9403	* VFS internal interfaces for vnode triggers
9404	*
9405	* vnode must already have an io count on entry
9406	* v_resolve is stable when io count is non-zero
9407	*/
9408	static int
9409	vnode_resolver_create(mount_t mp, vnode_t vp, struct vnode_trigger_param *tinfo, boolean_t external)
9410	{
9411	vnode_resolve_t rp;
9412	int result;
9413	char byte;
9414
9415	#if 1
9416	/ minimum pointer test (debugging) /
9417	if (tinfo->vnt_data)
9418	byte = ((char* *)tinfo->vnt_data);
9419	#endif
9420	MALLOC(rp, vnode_resolve_t, sizeof(*rp), M_TEMP, M_WAITOK);
9421	if (rp == NULL)
9422	return (ENOMEM);
9423
9424	lck_mtx_init(&rp->vr_lock, trigger_vnode_lck_grp, trigger_vnode_lck_attr);
9425
9426	rp->vr_resolve_func = tinfo->vnt_resolve_func;
9427	rp->vr_unresolve_func = tinfo->vnt_unresolve_func;
9428	rp->vr_rearm_func = tinfo->vnt_rearm_func;
9429	rp->vr_reclaim_func = tinfo->vnt_reclaim_func;
9430	rp->vr_data = tinfo->vnt_data;
9431	rp->vr_lastseq = `0`;
9432	rp->vr_flags = tinfo->vnt_flags & VNT_VALID_MASK;
9433	if (external) {
9434	rp->vr_flags \|= VNT_EXTERNAL;
9435	}
9436
9437	result = vnode_resolver_attach(vp, rp, external);
9438	if (result != `0`) {
9439	goto out;
9440	}
9441
9442	if (mp) {
9443	OSAddAtomic(`1`, &mp->mnt_numtriggers);
9444	}
9445
9446	return (result);
9447
9448	out:
9449	FREE(rp, M_TEMP);
9450	return result;
9451	}
9452
9453	static void
9454	vnode_resolver_release(vnode_resolve_t rp)
9455	{
9456	/*
9457	* Give them a chance to free any private data
9458	*/
9459	if (rp->vr_data && rp->vr_reclaim_func) {
9460	rp->vr_reclaim_func(NULLVP, rp->vr_data);
9461	}
9462
9463	lck_mtx_destroy(&rp->vr_lock, trigger_vnode_lck_grp);
9464	FREE(rp, M_TEMP);
9465
9466	}
9467
9468	/ Called after the vnode has been drained /
9469	static void
9470	vnode_resolver_detach(vnode_t vp)
9471	{
9472	vnode_resolve_t rp;
9473	mount_t mp;
9474
9475	mp = vnode_mount(vp);
9476
9477	vnode_lock(vp);
9478	rp = vp->v_resolve;
9479	vp->v_resolve = NULL;
9480	vnode_unlock(vp);
9481
9482	if ((rp->vr_flags & VNT_EXTERNAL) != `0`) {
9483	vnode_rele_ext(vp, O_EVTONLY, `1`);
9484	}
9485
9486	vnode_resolver_release(rp);
9487
9488	/ Keep count of active trigger vnodes per mount /
9489	OSAddAtomic(-`1`, &mp->mnt_numtriggers);
9490	}
9491
9492	__private_extern__
9493	void
9494	vnode_trigger_rearm(vnode_t vp, vfs_context_t ctx)
9495	{
9496	vnode_resolve_t rp;
9497	resolver_result_t result;
9498	enum resolver_status status;
9499	uint32_t seq;
9500
9501	if ((vp->v_resolve == NULL) \|\|
9502	(vp->v_resolve->vr_rearm_func == NULL) \|\|
9503	(vp->v_resolve->vr_flags & VNT_AUTO_REARM) == `0`) {
9504	return;
9505	}
9506
9507	rp = vp->v_resolve;
9508	lck_mtx_lock(&rp->vr_lock);
9509
9510	/*
9511	* Check if VFS initiated this unmount. If so, we'll catch it after the unresolve completes.
9512	*/
9513	if (rp->vr_flags & VNT_VFS_UNMOUNTED) {
9514	lck_mtx_unlock(&rp->vr_lock);
9515	return;
9516	}
9517
9518	/ Check if this vnode is already armed /
9519	if ((rp->vr_flags & VNT_RESOLVED) == `0`) {
9520	lck_mtx_unlock(&rp->vr_lock);
9521	return;
9522	}
9523
9524	lck_mtx_unlock(&rp->vr_lock);
9525
9526	result = rp->vr_rearm_func(vp, `0`, rp->vr_data, ctx);
9527	status = vfs_resolver_status(result);
9528	seq = vfs_resolver_sequence(result);
9529
9530	lck_mtx_lock(&rp->vr_lock);
9531	if (seq > rp->vr_lastseq) {
9532	if (status == RESOLVER_UNRESOLVED)
9533	rp->vr_flags &= ~VNT_RESOLVED;
9534	rp->vr_lastseq = seq;
9535	}
9536	lck_mtx_unlock(&rp->vr_lock);
9537	}
9538
9539	__private_extern__
9540	int
9541	vnode_trigger_resolve(vnode_t vp, struct nameidata *ndp, vfs_context_t ctx)
9542	{
9543	vnode_resolve_t rp;
9544	enum path_operation op;
9545	resolver_result_t result;
9546	enum resolver_status status;
9547	uint32_t seq;
9548
9549	/ Only trigger on topmost vnodes /
9550	if ((vp->v_resolve == NULL) \|\|
9551	(vp->v_resolve->vr_resolve_func == NULL) \|\|
9552	(vp->v_mountedhere != NULL)) {
9553	return (`0`);
9554	}
9555
9556	rp = vp->v_resolve;
9557	lck_mtx_lock(&rp->vr_lock);
9558
9559	/ Check if this vnode is already resolved /
9560	if (rp->vr_flags & VNT_RESOLVED) {
9561	lck_mtx_unlock(&rp->vr_lock);
9562	return (`0`);
9563	}
9564
9565	lck_mtx_unlock(&rp->vr_lock);
9566
9567	#if CONFIG_MACF
9568	int rv = mac_vnode_check_trigger_resolve(ctx, vp, &ndp->ni_cnd);
9569	if (rv != `0`)
9570	return rv;
9571	#endif
9572
9573	/*
9574	* XXX
9575	* assumes that resolver will not access this trigger vnode (otherwise the kernel will deadlock)
9576	* is there anyway to know this???
9577	* there can also be other legitimate lookups in parallel
9578	*
9579	* XXX - should we call this on a separate thread with a timeout?
9580	*
9581	* XXX - should we use ISLASTCN to pick the op value??? Perhaps only leafs should
9582	* get the richer set and non-leafs should get generic OP_LOOKUP? TBD
9583	*/
9584	op = (ndp->ni_op < OP_MAXOP) ? ndp->ni_op: OP_LOOKUP;
9585
9586	result = rp->vr_resolve_func(vp, &ndp->ni_cnd, op, `0`, rp->vr_data, ctx);
9587	status = vfs_resolver_status(result);
9588	seq = vfs_resolver_sequence(result);
9589
9590	lck_mtx_lock(&rp->vr_lock);
9591	if (seq > rp->vr_lastseq) {
9592	if (status == RESOLVER_RESOLVED)
9593	rp->vr_flags \|= VNT_RESOLVED;
9594	rp->vr_lastseq = seq;
9595	}
9596	lck_mtx_unlock(&rp->vr_lock);
9597
9598	/ On resolver errors, propagate the error back up /
9599	return (status == RESOLVER_ERROR ? vfs_resolver_auxiliary(result) : `0`);
9600	}
9601
9602	static int
9603	vnode_trigger_unresolve(vnode_t vp, int flags, vfs_context_t ctx)
9604	{
9605	vnode_resolve_t rp;
9606	resolver_result_t result;
9607	enum resolver_status status;
9608	uint32_t seq;
9609
9610	if ((vp->v_resolve == NULL) \|\| (vp->v_resolve->vr_unresolve_func == NULL)) {
9611	return (`0`);
9612	}
9613
9614	rp = vp->v_resolve;
9615	lck_mtx_lock(&rp->vr_lock);
9616
9617	/ Check if this vnode is already resolved /
9618	if ((rp->vr_flags & VNT_RESOLVED) == `0`) {
9619	printf("vnode_trigger_unresolve: not currently resolved\n");
9620	lck_mtx_unlock(&rp->vr_lock);
9621	return (`0`);
9622	}
9623
9624	rp->vr_flags \|= VNT_VFS_UNMOUNTED;
9625
9626	lck_mtx_unlock(&rp->vr_lock);
9627
9628	/*
9629	* XXX
9630	* assumes that resolver will not access this trigger vnode (otherwise the kernel will deadlock)
9631	* there can also be other legitimate lookups in parallel
9632	*
9633	* XXX - should we call this on a separate thread with a timeout?
9634	*/
9635
9636	result = rp->vr_unresolve_func(vp, flags, rp->vr_data, ctx);
9637	status = vfs_resolver_status(result);
9638	seq = vfs_resolver_sequence(result);
9639
9640	lck_mtx_lock(&rp->vr_lock);
9641	if (seq > rp->vr_lastseq) {
9642	if (status == RESOLVER_UNRESOLVED)
9643	rp->vr_flags &= ~VNT_RESOLVED;
9644	rp->vr_lastseq = seq;
9645	}
9646	rp->vr_flags &= ~VNT_VFS_UNMOUNTED;
9647	lck_mtx_unlock(&rp->vr_lock);
9648
9649	/ On resolver errors, propagate the error back up /
9650	return (status == RESOLVER_ERROR ? vfs_resolver_auxiliary(result) : `0`);
9651	}
9652
9653	static int
9654	triggerisdescendant(mount_t mp, mount_t rmp)
9655	{
9656	int match = FALSE;
9657
9658	/*
9659	* walk up vnode covered chain looking for a match
9660	*/
9661	name_cache_lock_shared();
9662
9663	while (`1`) {
9664	vnode_t vp;
9665
9666	/ did we encounter "/" ? /
9667	if (mp->mnt_flag & MNT_ROOTFS)
9668	break;
9669
9670	vp = mp->mnt_vnodecovered;
9671	if (vp == NULLVP)
9672	break;
9673
9674	mp = vp->v_mount;
9675	if (mp == rmp) {
9676	match = TRUE;
9677	break;
9678	}
9679	}
9680
9681	name_cache_unlock();
9682
9683	return (match);
9684	}
9685
/*
 * State shared between vfs_nested_trigger_unmounts() and
 * trigger_unmount_callback() while iterating over the mount list.
 */
struct trigger_unmount_info {
	vfs_context_t	ctx;		/* context passed to the unresolve call */
	mount_t		top_mp;		/* mount whose nested triggers are being unmounted */
	vnode_t		trigger_vp;	/* covered vnode saved for deferred unresolve, or NULLVP */
	mount_t		trigger_mp;	/* mount that was on trigger_vp when it was saved */
	uint32_t	trigger_vid;	/* v_id of trigger_vp when it was saved */
	int		flags;		/* flags passed to the unresolve call */
};
9694
9695	static int
9696	trigger_unmount_callback(mount_t mp, void * arg)
9697	{
9698	struct trigger_unmount_info * infop = (struct trigger_unmount_info *)arg;
9699	boolean_t mountedtrigger = FALSE;
9700
9701	/*
9702	* When we encounter the top level mount we're done
9703	*/
9704	if (mp == infop->top_mp)
9705	return (VFS_RETURNED_DONE);
9706
9707	if ((mp->mnt_vnodecovered == NULL) \|\|
9708	(vnode_getwithref(mp->mnt_vnodecovered) != `0`)) {
9709	return (VFS_RETURNED);
9710	}
9711
9712	if ((mp->mnt_vnodecovered->v_mountedhere == mp) &&
9713	(mp->mnt_vnodecovered->v_resolve != NULL) &&
9714	(mp->mnt_vnodecovered->v_resolve->vr_flags & VNT_RESOLVED)) {
9715	mountedtrigger = TRUE;
9716	}
9717	vnode_put(mp->mnt_vnodecovered);
9718
9719	/*
9720	* When we encounter a mounted trigger, check if its under the top level mount
9721	*/
9722	if ( !mountedtrigger \|\| !triggerisdescendant(mp, infop->top_mp) )
9723	return (VFS_RETURNED);
9724
9725	/*
9726	* Process any pending nested mount (now that its not referenced)
9727	*/
9728	if ((infop->trigger_vp != NULLVP) &&
9729	(vnode_getwithvid(infop->trigger_vp, infop->trigger_vid) == `0`)) {
9730	vnode_t vp = infop->trigger_vp;
9731	int error;
9732
9733	infop->trigger_vp = NULLVP;
9734
9735	if (mp == vp->v_mountedhere) {
9736	vnode_put(vp);
9737	printf("trigger_unmount_callback: unexpected match '%s'\n",
9738	mp->mnt_vfsstat.f_mntonname);
9739	return (VFS_RETURNED);
9740	}
9741	if (infop->trigger_mp != vp->v_mountedhere) {
9742	vnode_put(vp);
9743	printf("trigger_unmount_callback: trigger mnt changed! (%p != %p)\n",
9744	infop->trigger_mp, vp->v_mountedhere);
9745	goto savenext;
9746	}
9747
9748	error = vnode_trigger_unresolve(vp, infop->flags, infop->ctx);
9749	vnode_put(vp);
9750	if (error) {
9751	printf("unresolving: '%s', err %d\n",
9752	vp->v_mountedhere ? vp->v_mountedhere->mnt_vfsstat.f_mntonname :
9753	"???", error);
9754	return (VFS_RETURNED_DONE); / stop iteration on errors /
9755	}
9756	}
9757	savenext:
9758	/*
9759	* We can't call resolver here since we hold a mount iter
9760	* ref on mp so save its covered vp for later processing
9761	*/
9762	infop->trigger_vp = mp->mnt_vnodecovered;
9763	if ((infop->trigger_vp != NULLVP) &&
9764	(vnode_getwithref(infop->trigger_vp) == `0`)) {
9765	if (infop->trigger_vp->v_mountedhere == mp) {
9766	infop->trigger_vid = infop->trigger_vp->v_id;
9767	infop->trigger_mp = mp;
9768	}
9769	vnode_put(infop->trigger_vp);
9770	}
9771
9772	return (VFS_RETURNED);
9773	}
9774
9775	/*
9776	* Attempt to unmount any trigger mounts nested underneath a mount.
9777	* This is a best effort attempt and no retries are performed here.
9778	*
9779	* Note: mp->mnt_rwlock is held exclusively on entry (so be carefull)
9780	*/
9781	__private_extern__
9782	void
9783	vfs_nested_trigger_unmounts(mount_t mp, int flags, vfs_context_t ctx)
9784	{
9785	struct trigger_unmount_info info;
9786
9787	/ Must have trigger vnodes /
9788	if (mp->mnt_numtriggers == `0`) {
9789	return;
9790	}
9791	/ Avoid recursive requests (by checking covered vnode) /
9792	if ((mp->mnt_vnodecovered != NULL) &&
9793	(vnode_getwithref(mp->mnt_vnodecovered) == `0`)) {
9794	boolean_t recursive = FALSE;
9795
9796	if ((mp->mnt_vnodecovered->v_mountedhere == mp) &&
9797	(mp->mnt_vnodecovered->v_resolve != NULL) &&
9798	(mp->mnt_vnodecovered->v_resolve->vr_flags & VNT_VFS_UNMOUNTED)) {
9799	recursive = TRUE;
9800	}
9801	vnode_put(mp->mnt_vnodecovered);
9802	if (recursive)
9803	return;
9804	}
9805
9806	/*
9807	* Attempt to unmount any nested trigger mounts (best effort)
9808	*/
9809	info.ctx = ctx;
9810	info.top_mp = mp;
9811	info.trigger_vp = NULLVP;
9812	info.trigger_vid = `0`;
9813	info.trigger_mp = NULL;
9814	info.flags = flags;
9815
9816	(void) vfs_iterate(VFS_ITERATE_TAIL_FIRST, trigger_unmount_callback, &info);
9817
9818	/*
9819	* Process remaining nested mount (now that its not referenced)
9820	*/
9821	if ((info.trigger_vp != NULLVP) &&
9822	(vnode_getwithvid(info.trigger_vp, info.trigger_vid) == `0`)) {
9823	vnode_t vp = info.trigger_vp;
9824
9825	if (info.trigger_mp == vp->v_mountedhere) {
9826	(void) vnode_trigger_unresolve(vp, flags, ctx);
9827	}
9828	vnode_put(vp);
9829	}
9830	}
9831
9832	int
9833	vfs_addtrigger(mount_t mp, const char relpath, struct* vnode_trigger_info *vtip, vfs_context_t ctx)
9834	{
9835	struct nameidata nd;
9836	int res;
9837	vnode_t rvp, vp;
9838	struct vnode_trigger_param vtp;
9839
9840	/*
9841	* Must be called for trigger callback, wherein rwlock is held
9842	*/
9843	lck_rw_assert(&mp->mnt_rwlock, LCK_RW_ASSERT_HELD);
9844
9845	TRIG_LOG("Adding trigger at %s\n", relpath);
9846	TRIG_LOG("Trying VFS_ROOT\n");
9847
9848	/*
9849	* We do a lookup starting at the root of the mountpoint, unwilling
9850	* to cross into other mountpoints.
9851	*/
9852	res = VFS_ROOT(mp, &rvp, ctx);
9853	if (res != `0`) {
9854	goto out;
9855	}
9856
9857	TRIG_LOG("Trying namei\n");
9858
9859	NDINIT(&nd, LOOKUP, OP_LOOKUP, USEDVP \| NOCROSSMOUNT \| FOLLOW, UIO_SYSSPACE,
9860	CAST_USER_ADDR_T(relpath), ctx);
9861	nd.ni_dvp = rvp;
9862	res = namei(&nd);
9863	if (res != `0`) {
9864	vnode_put(rvp);
9865	goto out;
9866	}
9867
9868	vp = nd.ni_vp;
9869	nameidone(&nd);
9870	vnode_put(rvp);
9871
9872	TRIG_LOG("Trying vnode_resolver_create()\n");
9873
9874	/*
9875	* Set up blob. vnode_create() takes a larger structure
9876	* with creation info, and we needed something different
9877	* for this case. One needs to win, or we need to munge both;
9878	* vnode_create() wins.
9879	*/
9880	bzero(&vtp, sizeof(vtp));
9881	vtp.vnt_resolve_func = vtip->vti_resolve_func;
9882	vtp.vnt_unresolve_func = vtip->vti_unresolve_func;
9883	vtp.vnt_rearm_func = vtip->vti_rearm_func;
9884	vtp.vnt_reclaim_func = vtip->vti_reclaim_func;
9885	vtp.vnt_reclaim_func = vtip->vti_reclaim_func;
9886	vtp.vnt_data = vtip->vti_data;
9887	vtp.vnt_flags = vtip->vti_flags;
9888
9889	res = vnode_resolver_create(mp, vp, &vtp, TRUE);
9890	vnode_put(vp);
9891	out:
9892	TRIG_LOG("Returning %d\n", res);
9893	return res;
9894	}
9895
9896	#endif /* CONFIG_TRIGGERS */
9897
9898	vm_offset_t kdebug_vnode(vnode_t vp)
9899	{
9900	return VM_KERNEL_ADDRPERM(vp);
9901	}
9902
9903	static int flush_cache_on_write = `0`;
9904	SYSCTL_INT (_kern, OID_AUTO, flush_cache_on_write,
9905	CTLFLAG_RW \| CTLFLAG_LOCKED, &flush_cache_on_write, `0`,
9906	"always flush the drive cache on writes to uncached files");
9907
9908	int vnode_should_flush_after_write(vnode_t vp, int ioflag)
9909	{
9910	return (flush_cache_on_write
9911	&& (ISSET(ioflag, IO_NOCACHE) \|\| vnode_isnocache(vp)));
9912	}
9913
9914	/*
9915	* sysctl for use by disk I/O tracing tools to get the list of existing
9916	* vnodes' paths
9917	*/
9918
/*
 * Scratch state handed to the vnode iteration callbacks while tracing
 * vnode paths.
 */
struct vnode_trace_paths_context {
	/* vnodes traced since the last yield; the callback resets it at 1000 */
	uint64_t count;
	/*
	 * Path buffer, declared as an array of long but filled through a
	 * (char *) cast.
	 * + 1 in case sizeof (long) does not divide MAXPATHLEN
	 */
	long path[MAXPATHLEN / sizeof(long) + 1];
};
9923
9924	static int vnode_trace_path_callback(struct vnode vp, void* *arg) {
9925	int len, rv;
9926	struct vnode_trace_paths_context *ctx;
9927
9928	ctx = arg;
9929
9930	len = sizeof (ctx->path);
9931	rv = vn_getpath(vp, (char *)ctx->path, &len);
9932	/ vn_getpath() NUL-terminates, and len includes the NUL /
9933
9934	if (!rv) {
9935	kdebug_vfs_lookup(ctx->path, len, vp,
9936	KDBG_VFS_LOOKUP_FLAG_LOOKUP \| KDBG_VFS_LOOKUP_FLAG_NOPROCFILT);
9937
9938	if (++(ctx->count) == `1000`) {
9939	thread_yield_to_preemption();
9940	ctx->count = `0`;
9941	}
9942	}
9943
9944	return VNODE_RETURNED;
9945	}
9946
9947	static int vfs_trace_paths_callback(mount_t mp, void *arg) {
9948	if (mp->mnt_flag & MNT_LOCAL)
9949	vnode_iterate(mp, VNODE_ITERATE_ALL, vnode_trace_path_callback, arg);
9950
9951	return VFS_RETURNED;
9952	}
9953
9954	static int sysctl_vfs_trace_paths SYSCTL_HANDLER_ARGS {
9955	struct vnode_trace_paths_context ctx;
9956
9957	(void)oidp;
9958	(void)arg1;
9959	(void)arg2;
9960	(void)req;
9961
9962	if (!kauth_cred_issuser(kauth_cred_get()))
9963	return EPERM;
9964
9965	if (!kdebug_enable \|\| !kdebug_debugid_enabled(VFS_LOOKUP))
9966	return EINVAL;
9967
9968	bzero(&ctx, sizeof (struct vnode_trace_paths_context));
9969
9970	vfs_iterate(`0`, vfs_trace_paths_callback, &ctx);
9971
9972	return `0`;
9973	}
9974
9975	SYSCTL_PROC(_vfs_generic, OID_AUTO, trace_paths, CTLFLAG_RD \| CTLFLAG_LOCKED \| CTLFLAG_MASKED, NULL, `0`, &sysctl_vfs_trace_paths, "-", "trace_paths");
9976

Browse the source code of codebrowser/bsd/vfs/vfs_subr.c