| 1 | /* |
| 2 | * Copyright (c) 2000-2009 Apple Inc. All rights reserved. |
| 3 | * |
| 4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
| 5 | * |
| 6 | * This file contains Original Code and/or Modifications of Original Code |
| 7 | * as defined in and that are subject to the Apple Public Source License |
| 8 | * Version 2.0 (the 'License'). You may not use this file except in |
| 9 | * compliance with the License. The rights granted to you under the License |
| 10 | * may not be used to create, or enable the creation or redistribution of, |
| 11 | * unlawful or unlicensed copies of an Apple operating system, or to |
| 12 | * circumvent, violate, or enable the circumvention or violation of, any |
| 13 | * terms of an Apple operating system software license agreement. |
| 14 | * |
| 15 | * Please obtain a copy of the License at |
| 16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. |
| 17 | * |
| 18 | * The Original Code and all software distributed under the License are |
| 19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER |
| 20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
| 21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, |
| 22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
| 23 | * Please see the License for the specific language governing rights and |
| 24 | * limitations under the License. |
| 25 | * |
| 26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
| 27 | */ |
| 28 | /* |
| 29 | * @OSF_COPYRIGHT@ |
| 30 | */ |
| 31 | /* |
| 32 | * Mach Operating System |
| 33 | * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University |
| 34 | * All Rights Reserved. |
| 35 | * |
| 36 | * Permission to use, copy, modify and distribute this software and its |
| 37 | * documentation is hereby granted, provided that both the copyright |
| 38 | * notice and this permission notice appear in all copies of the |
| 39 | * software, derivative works or modified versions, and any portions |
| 40 | * thereof, and that both notices appear in supporting documentation. |
| 41 | * |
| 42 | * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" |
| 43 | * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR |
| 44 | * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. |
| 45 | * |
| 46 | * Carnegie Mellon requests users of this software to return to |
| 47 | * |
| 48 | * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU |
| 49 | * School of Computer Science |
| 50 | * Carnegie Mellon University |
| 51 | * Pittsburgh PA 15213-3890 |
| 52 | * |
| 53 | * any improvements or extensions that they make and grant Carnegie Mellon |
| 54 | * the rights to redistribute these changes. |
| 55 | */ |
| 56 | /* |
| 57 | */ |
| 58 | /* |
| 59 | * File: vm_fault.c |
| 60 | * Author: Avadis Tevanian, Jr., Michael Wayne Young |
| 61 | * |
| 62 | * Page fault handling module. |
| 63 | */ |
| 64 | |
| 65 | #include <mach_cluster_stats.h> |
| 66 | #include <mach_pagemap.h> |
| 67 | #include <libkern/OSAtomic.h> |
| 68 | |
| 69 | #include <mach/mach_types.h> |
| 70 | #include <mach/kern_return.h> |
| 71 | #include <mach/message.h> /* for error codes */ |
| 72 | #include <mach/vm_param.h> |
| 73 | #include <mach/vm_behavior.h> |
| 74 | #include <mach/memory_object.h> |
| 75 | /* For memory_object_data_{request,unlock} */ |
| 76 | #include <mach/sdt.h> |
| 77 | |
| 78 | #include <kern/kern_types.h> |
| 79 | #include <kern/host_statistics.h> |
| 80 | #include <kern/counters.h> |
| 81 | #include <kern/task.h> |
| 82 | #include <kern/thread.h> |
| 83 | #include <kern/sched_prim.h> |
| 84 | #include <kern/host.h> |
| 85 | #include <kern/xpr.h> |
| 86 | #include <kern/mach_param.h> |
| 87 | #include <kern/macro_help.h> |
| 88 | #include <kern/zalloc.h> |
| 89 | #include <kern/misc_protos.h> |
| 90 | #include <kern/policy_internal.h> |
| 91 | |
| 92 | #include <vm/vm_compressor.h> |
| 93 | #include <vm/vm_compressor_pager.h> |
| 94 | #include <vm/vm_fault.h> |
| 95 | #include <vm/vm_map.h> |
| 96 | #include <vm/vm_object.h> |
| 97 | #include <vm/vm_page.h> |
| 98 | #include <vm/vm_kern.h> |
| 99 | #include <vm/pmap.h> |
| 100 | #include <vm/vm_pageout.h> |
| 101 | #include <vm/vm_protos.h> |
| 102 | #include <vm/vm_external.h> |
| 103 | #include <vm/memory_object.h> |
| 104 | #include <vm/vm_purgeable_internal.h> /* Needed by some vm_page.h macros */ |
| 105 | #include <vm/vm_shared_region.h> |
| 106 | |
| 107 | #include <sys/codesign.h> |
| 108 | #include <sys/reason.h> |
| 109 | #include <sys/signalvar.h> |
| 110 | |
| 111 | #include <san/kasan.h> |
| 112 | |
| 113 | #define VM_FAULT_CLASSIFY 0 |
| 114 | |
| 115 | #define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */ |
| 116 | |
| 117 | unsigned int vm_object_pagein_throttle = 16; |
| 118 | |
| 119 | /* |
| 120 | * We apply a hard throttle to the demand-zero rate of tasks that we believe are running out of control; it |
| 121 | * kicks in when swap space runs out. 64-bit programs have massive address spaces and, if they're buggy, can |
| 122 | * leak enormous amounts of memory and run the system completely out of swap space. If this happens, we |
| 123 | * impose a hard throttle on them to prevent them from taking the last bit of memory left. This helps |
| 124 | * keep the UI active so that the user has a chance to kill the offending task before the system |
| 125 | * completely hangs. |
| 126 | * |
| 127 | * The hard throttle is only applied when the system is nearly completely out of swap space and only to |
| 128 | * tasks that appear to be bloated. When swap runs out, any task using more than vm_hard_throttle_threshold |
| 129 | * will be throttled. The throttling is done by giving the thread that's trying to demand-zero a page a |
| 130 | * delay of HARD_THROTTLE_DELAY microseconds before being allowed to retry the page fault. |
| 131 | */ |
| 132 | |
| 133 | extern void throttle_lowpri_io(int); |
| 134 | |
| 135 | extern struct vnode *vnode_pager_lookup(memory_object_t); |
| 136 | |
| 137 | uint64_t vm_hard_throttle_threshold; |
| 138 | |
| 139 | |
| 140 | |
| 141 | #define NEED_TO_HARD_THROTTLE_THIS_TASK() (vm_wants_task_throttled(current_task()) || \ |
| 142 | ((vm_page_free_count < vm_page_throttle_limit || \ |
| 143 | HARD_THROTTLE_LIMIT_REACHED()) && \ |
| 144 | proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO) >= THROTTLE_LEVEL_THROTTLED)) |
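|  | /* |
|  | * NEED_TO_HARD_THROTTLE_THIS_TASK() is TRUE when the VM system has explicitly |
|  | * flagged the current task for throttling (vm_wants_task_throttled()), or when |
|  | * free pages have fallen below vm_page_throttle_limit (or the hard throttle |
|  | * limit has been reached) and the current thread's effective I/O policy is |
|  | * already throttled. |
|  | */ |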
| 145 | |
| 146 | |
| 147 | #define HARD_THROTTLE_DELAY 10000 /* 10000 us == 10 ms */ |
| 148 | #define SOFT_THROTTLE_DELAY 200 /* 200 us == .2 ms */ |
| 149 | |
| 150 | #define VM_PAGE_CREATION_THROTTLE_PERIOD_SECS 6 |
| 151 | #define VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC 20000 |
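|  | /* |
|  | * With the values above, a thread is only considered for throttling once it |
|  | * has created more than 6 * 20000 = 120,000 pages; vm_page_throttled() below |
|  | * then checks memory pressure and the thread's recent creation rate against |
|  | * the 20,000 pages-per-second limit. |
|  | */ |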
| 152 | |
| 153 | |
| 154 | boolean_t current_thread_aborted(void); |
| 155 | |
| 156 | /* Forward declarations of internal routines. */ |
| 157 | static kern_return_t vm_fault_wire_fast( |
| 158 | vm_map_t map, |
| 159 | vm_map_offset_t va, |
| 160 | vm_prot_t prot, |
| 161 | vm_tag_t wire_tag, |
| 162 | vm_map_entry_t entry, |
| 163 | pmap_t pmap, |
| 164 | vm_map_offset_t pmap_addr, |
| 165 | ppnum_t *physpage_p); |
| 166 | |
| 167 | static kern_return_t vm_fault_internal( |
| 168 | vm_map_t map, |
| 169 | vm_map_offset_t vaddr, |
| 170 | vm_prot_t caller_prot, |
| 171 | boolean_t change_wiring, |
| 172 | vm_tag_t wire_tag, |
| 173 | int interruptible, |
| 174 | pmap_t pmap, |
| 175 | vm_map_offset_t pmap_addr, |
| 176 | ppnum_t *physpage_p); |
| 177 | |
| 178 | static void vm_fault_copy_cleanup( |
| 179 | vm_page_t page, |
| 180 | vm_page_t top_page); |
| 181 | |
| 182 | static void vm_fault_copy_dst_cleanup( |
| 183 | vm_page_t page); |
| 184 | |
| 185 | #if VM_FAULT_CLASSIFY |
| 186 | extern void vm_fault_classify(vm_object_t object, |
| 187 | vm_object_offset_t offset, |
| 188 | vm_prot_t fault_type); |
| 189 | |
| 190 | extern void vm_fault_classify_init(void); |
| 191 | #endif |
| 192 | |
| 193 | unsigned long vm_pmap_enter_blocked = 0; |
| 194 | unsigned long vm_pmap_enter_retried = 0; |
| 195 | |
| 196 | unsigned long vm_cs_validates = 0; |
| 197 | unsigned long vm_cs_revalidates = 0; |
| 198 | unsigned long vm_cs_query_modified = 0; |
| 199 | unsigned long vm_cs_validated_dirtied = 0; |
| 200 | unsigned long vm_cs_bitmap_validated = 0; |
| 201 | #if PMAP_CS |
| 202 | uint64_t vm_cs_defer_to_pmap_cs = 0; |
| 203 | uint64_t vm_cs_defer_to_pmap_cs_not = 0; |
| 204 | #endif /* PMAP_CS */ |
| 205 | |
| 206 | void vm_pre_fault(vm_map_offset_t); |
| 207 | |
| 208 | extern char *kdp_compressor_decompressed_page; |
| 209 | extern addr64_t kdp_compressor_decompressed_page_paddr; |
| 210 | extern ppnum_t kdp_compressor_decompressed_page_ppnum; |
| 211 | |
| 212 | struct vmrtfr { |
| 213 | int vmrtfr_maxi; |
| 214 | int vmrtfr_curi; |
| 215 | int64_t vmrtf_total; |
| 216 | vm_rtfault_record_t *vm_rtf_records; |
| 217 | } vmrtfrs; |
| 218 | #define VMRTF_DEFAULT_BUFSIZE (4096) |
| 219 | #define VMRTF_NUM_RECORDS_DEFAULT (VMRTF_DEFAULT_BUFSIZE / sizeof(vm_rtfault_record_t)) |
| 220 | int vmrtf_num_records = VMRTF_NUM_RECORDS_DEFAULT; |
| 221 | |
| 222 | static void vm_rtfrecord_lock(void); |
| 223 | static void vm_rtfrecord_unlock(void); |
| 224 | static void vm_record_rtfault(thread_t, uint64_t, vm_map_offset_t, int); |
| 225 | |
| 226 | lck_spin_t vm_rtfr_slock; |
| 227 | extern lck_grp_t vm_page_lck_grp_bucket; |
| 228 | extern lck_attr_t vm_page_lck_attr; |
| 229 | |
| 230 | /* |
| 231 | * Routine: vm_fault_init |
| 232 | * Purpose: |
| 233 | * Initialize our private data structures. |
| 234 | */ |
| 235 | void |
| 236 | vm_fault_init(void) |
| 237 | { |
| 238 | int i, vm_compressor_temp; |
| 239 | boolean_t need_default_val = TRUE; |
| 240 | /* |
| 241 | * Choose a value for the hard throttle threshold based on the amount of ram. The threshold is |
| 242 | * computed as a percentage of available memory, and the percentage used is scaled inversely with |
| 243 | * the amount of memory. The percentage runs between 10% and 35%. We use 35% for small memory systems |
| 244 | * and reduce the value down to 10% for very large memory configurations. This helps give us a |
| 245 | * definition of a memory hog that makes more sense relative to the amount of ram in the machine. |
| 246 | * The formula here simply uses the number of gigabytes of ram to adjust the percentage. |
| 247 | */ |
| 248 | |
| 249 | vm_hard_throttle_threshold = sane_size * (35 - MIN((int)(sane_size / (1024*1024*1024)), 25)) / 100; |
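|  | /* |
|  | * Worked example of the formula above: on a 4 GB machine the percentage is |
|  | * 35 - 4 = 31, i.e. a threshold of roughly 1.24 GB; on a 32 GB machine the |
|  | * adjustment is capped at 25, giving 10% or roughly 3.2 GB. |
|  | */ |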
| 250 | |
| 251 | /* |
| 252 | * Configure compressed pager behavior. A boot arg takes precedence over a device tree entry. |
| 253 | */ |
| 254 | |
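|  | /* |
|  | * For example, booting with vm_compressor=4 sets vm_compressor_mode to 4; the |
|  | * loop below only accepts positive values with exactly one mode bit set, so |
|  | * anything else falls through to the device tree entry or the built-in default. |
|  | */ |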
| 255 | if (PE_parse_boot_argn("vm_compressor" , &vm_compressor_temp, sizeof (vm_compressor_temp))) { |
| 256 | for ( i = 0; i < VM_PAGER_MAX_MODES; i++) { |
| 257 | if (vm_compressor_temp > 0 && |
| 258 | ((vm_compressor_temp & ( 1 << i)) == vm_compressor_temp)) { |
| 259 | need_default_val = FALSE; |
| 260 | vm_compressor_mode = vm_compressor_temp; |
| 261 | break; |
| 262 | } |
| 263 | } |
| 264 | if (need_default_val) |
| 265 | printf("Ignoring \"vm_compressor\" boot arg %d\n" , vm_compressor_temp); |
| 266 | } |
| 267 | if (need_default_val) { |
| 268 | /* If no boot arg or incorrect boot arg, try device tree. */ |
| 269 | PE_get_default("kern.vm_compressor" , &vm_compressor_mode, sizeof(vm_compressor_mode)); |
| 270 | } |
| 271 | printf("\"vm_compressor_mode\" is %d\n" , vm_compressor_mode); |
| 272 | } |
| 273 | |
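|  | /* |
|  | * The number of records can be tuned with the "vm_rtfault_records" boot-arg; |
|  | * for example, vm_rtfault_records=1024 allocates 1024 records instead of the |
|  | * default of as many records as fit in VMRTF_DEFAULT_BUFSIZE (4096) bytes. |
|  | */ |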
| 274 | void vm_rtfault_record_init(void) { |
| 275 | PE_parse_boot_argn("vm_rtfault_records" , &vmrtf_num_records, sizeof(vmrtf_num_records)); |
| 276 | |
| 277 | assert(vmrtf_num_records >= 1); |
| 278 | vmrtf_num_records = MAX(vmrtf_num_records, 1); |
| 279 | size_t kallocsz = vmrtf_num_records * sizeof(vm_rtfault_record_t); |
| 280 | vmrtfrs.vm_rtf_records = kalloc(kallocsz); |
| 281 | bzero(vmrtfrs.vm_rtf_records, kallocsz); |
| 282 | vmrtfrs.vmrtfr_maxi = vmrtf_num_records - 1; |
| 283 | lck_spin_init(&vm_rtfr_slock, &vm_page_lck_grp_bucket, &vm_page_lck_attr); |
| 284 | } |
| 285 | /* |
| 286 | * Routine: vm_fault_cleanup |
| 287 | * Purpose: |
| 288 | * Clean up the result of vm_fault_page. |
| 289 | * Results: |
| 290 | * The paging reference for "object" is released. |
| 291 | * "object" is unlocked. |
| 292 | * If "top_page" is not null, "top_page" is |
| 293 | * freed and the paging reference for the object |
| 294 | * containing it is released. |
| 295 | * |
| 296 | * In/out conditions: |
| 297 | * "object" must be locked. |
| 298 | */ |
| 299 | void |
| 300 | vm_fault_cleanup( |
| 301 | vm_object_t object, |
| 302 | vm_page_t top_page) |
| 303 | { |
| 304 | vm_object_paging_end(object); |
| 305 | vm_object_unlock(object); |
| 306 | |
| 307 | if (top_page != VM_PAGE_NULL) { |
| 308 | object = VM_PAGE_OBJECT(top_page); |
| 309 | |
| 310 | vm_object_lock(object); |
| 311 | VM_PAGE_FREE(top_page); |
| 312 | vm_object_paging_end(object); |
| 313 | vm_object_unlock(object); |
| 314 | } |
| 315 | } |
| 316 | |
| 317 | #define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0) |
| 318 | |
| 319 | |
| 320 | boolean_t vm_page_deactivate_behind = TRUE; |
| 321 | /* |
| 322 | * default sizes given VM_BEHAVIOR_DEFAULT reference behavior |
| 323 | */ |
| 324 | #define VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW 128 |
| 325 | #define VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER 16 /* don't make this too big... */ |
| 326 | /* we use it to size an array on the stack */ |
| 327 | |
| 328 | int vm_default_behind = VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW; |
| 329 | |
| 330 | #define MAX_SEQUENTIAL_RUN (1024 * 1024 * 1024) |
| 331 | |
| 332 | /* |
| 333 | * vm_fault_is_sequential |
| 334 | * |
| 335 | * Determine if sequential access is in progress |
| 336 | * in accordance with the behavior specified. |
| 337 | * Update state to indicate current access pattern. |
| 338 | * |
| 339 | * object must have at least the shared lock held |
| 340 | */ |
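|  | /* |
|  | * The object's "sequential" field is maintained in bytes: it advances by |
|  | * PAGE_SIZE for each fault that continues a forward run, moves by -PAGE_SIZE |
|  | * for each fault that continues a reverse run (so reverse-sequential runs show |
|  | * up as negative values), resets to 0 when the pattern is broken, and saturates |
|  | * at +/- MAX_SEQUENTIAL_RUN. |
|  | */ |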
| 341 | static |
| 342 | void |
| 343 | vm_fault_is_sequential( |
| 344 | vm_object_t object, |
| 345 | vm_object_offset_t offset, |
| 346 | vm_behavior_t behavior) |
| 347 | { |
| 348 | vm_object_offset_t last_alloc; |
| 349 | int sequential; |
| 350 | int orig_sequential; |
| 351 | |
| 352 | last_alloc = object->last_alloc; |
| 353 | sequential = object->sequential; |
| 354 | orig_sequential = sequential; |
| 355 | |
| 356 | switch (behavior) { |
| 357 | case VM_BEHAVIOR_RANDOM: |
| 358 | /* |
| 359 | * reset indicator of sequential behavior |
| 360 | */ |
| 361 | sequential = 0; |
| 362 | break; |
| 363 | |
| 364 | case VM_BEHAVIOR_SEQUENTIAL: |
| 365 | if (offset && last_alloc == offset - PAGE_SIZE_64) { |
| 366 | /* |
| 367 | * advance indicator of sequential behavior |
| 368 | */ |
| 369 | if (sequential < MAX_SEQUENTIAL_RUN) |
| 370 | sequential += PAGE_SIZE; |
| 371 | } else { |
| 372 | /* |
| 373 | * reset indicator of sequential behavior |
| 374 | */ |
| 375 | sequential = 0; |
| 376 | } |
| 377 | break; |
| 378 | |
| 379 | case VM_BEHAVIOR_RSEQNTL: |
| 380 | if (last_alloc && last_alloc == offset + PAGE_SIZE_64) { |
| 381 | /* |
| 382 | * advance indicator of sequential behavior |
| 383 | */ |
| 384 | if (sequential > -MAX_SEQUENTIAL_RUN) |
| 385 | sequential -= PAGE_SIZE; |
| 386 | } else { |
| 387 | /* |
| 388 | * reset indicator of sequential behavior |
| 389 | */ |
| 390 | sequential = 0; |
| 391 | } |
| 392 | break; |
| 393 | |
| 394 | case VM_BEHAVIOR_DEFAULT: |
| 395 | default: |
| 396 | if (offset && last_alloc == (offset - PAGE_SIZE_64)) { |
| 397 | /* |
| 398 | * advance indicator of sequential behavior |
| 399 | */ |
| 400 | if (sequential < 0) |
| 401 | sequential = 0; |
| 402 | if (sequential < MAX_SEQUENTIAL_RUN) |
| 403 | sequential += PAGE_SIZE; |
| 404 | |
| 405 | } else if (last_alloc && last_alloc == (offset + PAGE_SIZE_64)) { |
| 406 | /* |
| 407 | * advance indicator of sequential behavior |
| 408 | */ |
| 409 | if (sequential > 0) |
| 410 | sequential = 0; |
| 411 | if (sequential > -MAX_SEQUENTIAL_RUN) |
| 412 | sequential -= PAGE_SIZE; |
| 413 | } else { |
| 414 | /* |
| 415 | * reset indicator of sequential behavior |
| 416 | */ |
| 417 | sequential = 0; |
| 418 | } |
| 419 | break; |
| 420 | } |
| 421 | if (sequential != orig_sequential) { |
| 422 | if (!OSCompareAndSwap(orig_sequential, sequential, (UInt32 *)&object->sequential)) { |
| 423 | /* |
| 424 | * if someone else has already updated object->sequential |
| 425 | * don't bother trying to update it or object->last_alloc |
| 426 | */ |
| 427 | return; |
| 428 | } |
| 429 | } |
| 430 | /* |
| 431 | * I'd like to do this with an OSCompareAndSwap64, but that |
| 432 | * doesn't exist for PPC... however, it shouldn't matter |
| 433 | * that much... last_alloc is maintained so that we can determine |
| 434 | * if a sequential access pattern is taking place... if only |
| 435 | * one thread is banging on this object, no problem with the unprotected |
| 436 | * update... if 2 or more threads are banging away, we run the risk of |
| 437 | * someone seeing a mangled update... however, in the face of multiple |
| 438 | * accesses, no sequential access pattern can develop anyway, so we |
| 439 | * haven't lost any real info. |
| 440 | */ |
| 441 | object->last_alloc = offset; |
| 442 | } |
| 443 | |
| 444 | |
| 445 | int vm_page_deactivate_behind_count = 0; |
| 446 | |
| 447 | /* |
| 448 | * vm_page_deactivate_behind |
| 449 | * |
| 450 | * Determine if sequential access is in progress |
| 451 | * in accordance with the behavior specified. If |
| 452 | * so, compute a potential page to deactivate and |
| 453 | * deactivate it. |
| 454 | * |
| 455 | * object must be locked. |
| 456 | * |
| 457 | * return TRUE if we actually deactivate a page |
| 458 | */ |
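|  | /* |
|  | * Worked example for VM_BEHAVIOR_DEFAULT: once a forward sequential run reaches |
|  | * vm_default_behind (128) pages, every 16th page of the run deactivates the |
|  | * cluster of VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER (16) pages that starts 128 |
|  | * pages behind the faulting offset. |
|  | */ |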
| 459 | static |
| 460 | boolean_t |
| 461 | vm_fault_deactivate_behind( |
| 462 | vm_object_t object, |
| 463 | vm_object_offset_t offset, |
| 464 | vm_behavior_t behavior) |
| 465 | { |
| 466 | int n; |
| 467 | int pages_in_run = 0; |
| 468 | int max_pages_in_run = 0; |
| 469 | int sequential_run; |
| 470 | int sequential_behavior = VM_BEHAVIOR_SEQUENTIAL; |
| 471 | vm_object_offset_t run_offset = 0; |
| 472 | vm_object_offset_t pg_offset = 0; |
| 473 | vm_page_t m; |
| 474 | vm_page_t page_run[VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER]; |
| 475 | |
| 476 | pages_in_run = 0; |
| 477 | #if TRACEFAULTPAGE |
| 478 | dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_fault_deactivate_behind); /* (TEST/DEBUG) */ |
| 479 | #endif |
| 480 | |
| 481 | if (object == kernel_object || vm_page_deactivate_behind == FALSE) { |
| 482 | /* |
| 483 | * Do not deactivate pages from the kernel object: they |
| 484 | * are not intended to become pageable. Also bail out if |
| 485 | * the deactivate-behind mechanism has been disabled. |
| 486 | */ |
| 487 | return FALSE; |
| 488 | } |
| 489 | if ((sequential_run = object->sequential)) { |
| 490 | if (sequential_run < 0) { |
| 491 | sequential_behavior = VM_BEHAVIOR_RSEQNTL; |
| 492 | sequential_run = 0 - sequential_run; |
| 493 | } else { |
| 494 | sequential_behavior = VM_BEHAVIOR_SEQUENTIAL; |
| 495 | } |
| 496 | } |
| 497 | switch (behavior) { |
| 498 | case VM_BEHAVIOR_RANDOM: |
| 499 | break; |
| 500 | case VM_BEHAVIOR_SEQUENTIAL: |
| 501 | if (sequential_run >= (int)PAGE_SIZE) { |
| 502 | run_offset = 0 - PAGE_SIZE_64; |
| 503 | max_pages_in_run = 1; |
| 504 | } |
| 505 | break; |
| 506 | case VM_BEHAVIOR_RSEQNTL: |
| 507 | if (sequential_run >= (int)PAGE_SIZE) { |
| 508 | run_offset = PAGE_SIZE_64; |
| 509 | max_pages_in_run = 1; |
| 510 | } |
| 511 | break; |
| 512 | case VM_BEHAVIOR_DEFAULT: |
| 513 | default: |
| 514 | { vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64; |
| 515 | |
| 516 | /* |
| 517 | * determine if the run of sequential access has been |
| 518 | * long enough on an object with default access behavior |
| 519 | * to consider it for deactivation |
| 520 | */ |
| 521 | if ((uint64_t)sequential_run >= behind && (sequential_run % (VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER * PAGE_SIZE)) == 0) { |
| 522 | /* |
| 523 | * the comparisons between offset and behind are done |
| 524 | * in this kind of odd fashion in order to prevent wrap around |
| 525 | * at the end points |
| 526 | */ |
| 527 | if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL) { |
| 528 | if (offset >= behind) { |
| 529 | run_offset = 0 - behind; |
| 530 | pg_offset = PAGE_SIZE_64; |
| 531 | max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER; |
| 532 | } |
| 533 | } else { |
| 534 | if (offset < -behind) { |
| 535 | run_offset = behind; |
| 536 | pg_offset = 0 - PAGE_SIZE_64; |
| 537 | max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER; |
| 538 | } |
| 539 | } |
| 540 | } |
| 541 | break; |
| 542 | } |
| 543 | } |
| 544 | for (n = 0; n < max_pages_in_run; n++) { |
| 545 | m = vm_page_lookup(object, offset + run_offset + (n * pg_offset)); |
| 546 | |
| 547 | if (m && !m->vmp_laundry && !m->vmp_busy && !m->vmp_no_cache && (m->vmp_q_state != VM_PAGE_ON_THROTTLED_Q) && !m->vmp_fictitious && !m->vmp_absent) { |
| 548 | page_run[pages_in_run++] = m; |
| 549 | |
| 550 | /* |
| 551 | * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise... |
| 552 | * |
| 553 | * a TLB flush isn't really needed here since at worst we'll miss the reference bit being |
| 554 | * updated in the PTE if a remote processor still has this mapping cached in its TLB when the |
| 555 | * new reference happens. If no further references happen on the page after that remote TLB flushes |
| 556 | * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue |
| 557 | * by pageout_scan, which is just fine since the last reference would have happened quite far |
| 558 | * in the past (TLB caches don't hang around for very long), and of course could just as easily |
| 559 | * have happened before we did the deactivate_behind. |
| 560 | */ |
| 561 | pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL); |
| 562 | } |
| 563 | } |
| 564 | if (pages_in_run) { |
| 565 | vm_page_lockspin_queues(); |
| 566 | |
| 567 | for (n = 0; n < pages_in_run; n++) { |
| 568 | |
| 569 | m = page_run[n]; |
| 570 | |
| 571 | vm_page_deactivate_internal(m, FALSE); |
| 572 | |
| 573 | vm_page_deactivate_behind_count++; |
| 574 | #if TRACEFAULTPAGE |
| 575 | dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */ |
| 576 | #endif |
| 577 | } |
| 578 | vm_page_unlock_queues(); |
| 579 | |
| 580 | return TRUE; |
| 581 | } |
| 582 | return FALSE; |
| 583 | } |
| 584 | |
| 585 | |
| 586 | #if (DEVELOPMENT || DEBUG) |
| 587 | uint32_t vm_page_creation_throttled_hard = 0; |
| 588 | uint32_t vm_page_creation_throttled_soft = 0; |
| 589 | uint64_t vm_page_creation_throttle_avoided = 0; |
| 590 | #endif /* DEVELOPMENT || DEBUG */ |
| 591 | |
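|  | /* |
|  | * vm_page_throttled: |
|  | * |
|  | * Decide whether the current thread should be delayed because of its page |
|  | * creation rate. Returns 0 for "no throttle", or a delay in microseconds |
|  | * (SOFT_THROTTLE_DELAY or HARD_THROTTLE_DELAY) that the caller is expected to |
|  | * apply (see vm_fault_check()). Roughly: a thread that has created more than |
|  | * 120,000 pages at a sustained rate of at least 20,000 pages per second while |
|  | * free pages are scarce gets the soft delay, or the hard delay if the |
|  | * compressor's hard limit has been reached; VM-privileged threads are exempt. |
|  | */ |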
| 592 | static int |
| 593 | vm_page_throttled(boolean_t page_kept) |
| 594 | { |
| 595 | clock_sec_t elapsed_sec; |
| 596 | clock_sec_t tv_sec; |
| 597 | clock_usec_t tv_usec; |
| 598 | |
| 599 | thread_t thread = current_thread(); |
| 600 | |
| 601 | if (thread->options & TH_OPT_VMPRIV) |
| 602 | return (0); |
| 603 | |
| 604 | if (thread->t_page_creation_throttled) { |
| 605 | thread->t_page_creation_throttled = 0; |
| 606 | |
| 607 | if (page_kept == FALSE) |
| 608 | goto no_throttle; |
| 609 | } |
| 610 | if (NEED_TO_HARD_THROTTLE_THIS_TASK()) { |
| 611 | #if (DEVELOPMENT || DEBUG) |
| 612 | thread->t_page_creation_throttled_hard++; |
| 613 | OSAddAtomic(1, &vm_page_creation_throttled_hard); |
| 614 | #endif /* DEVELOPMENT || DEBUG */ |
| 615 | return (HARD_THROTTLE_DELAY); |
| 616 | } |
| 617 | |
| 618 | if ((vm_page_free_count < vm_page_throttle_limit || (VM_CONFIG_COMPRESSOR_IS_PRESENT && SWAPPER_NEEDS_TO_UNTHROTTLE())) && |
| 619 | thread->t_page_creation_count > (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS * VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC)) { |
| 620 | |
| 621 | if (vm_page_free_wanted == 0 && vm_page_free_wanted_privileged == 0) { |
| 622 | #if (DEVELOPMENT || DEBUG) |
| 623 | OSAddAtomic64(1, &vm_page_creation_throttle_avoided); |
| 624 | #endif |
| 625 | goto no_throttle; |
| 626 | } |
| 627 | clock_get_system_microtime(&tv_sec, &tv_usec); |
| 628 | |
| 629 | elapsed_sec = tv_sec - thread->t_page_creation_time; |
| 630 | |
| 631 | if (elapsed_sec <= VM_PAGE_CREATION_THROTTLE_PERIOD_SECS || |
| 632 | (thread->t_page_creation_count / elapsed_sec) >= VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC) { |
| 633 | |
| 634 | if (elapsed_sec >= (3 * VM_PAGE_CREATION_THROTTLE_PERIOD_SECS)) { |
| 635 | /* |
| 636 | * we'll reset our stats to give a well behaved app |
| 637 | * that was unlucky enough to accumulate a bunch of pages |
| 638 | * over a long period of time a chance to get out of |
| 639 | * the throttled state... we reset the counter and timestamp |
| 640 | * so that if it stays under the rate limit for the next second |
| 641 | * it will be back in our good graces... if it exceeds it, it |
| 642 | * will remain in the throttled state |
| 643 | */ |
| 644 | thread->t_page_creation_time = tv_sec; |
| 645 | thread->t_page_creation_count = VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC * (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS - 1); |
| 646 | } |
| 647 | VM_PAGEOUT_DEBUG(vm_page_throttle_count, 1); |
| 648 | |
| 649 | thread->t_page_creation_throttled = 1; |
| 650 | |
| 651 | if (VM_CONFIG_COMPRESSOR_IS_PRESENT && HARD_THROTTLE_LIMIT_REACHED()) { |
| 652 | #if (DEVELOPMENT || DEBUG) |
| 653 | thread->t_page_creation_throttled_hard++; |
| 654 | OSAddAtomic(1, &vm_page_creation_throttled_hard); |
| 655 | #endif /* DEVELOPMENT || DEBUG */ |
| 656 | return (HARD_THROTTLE_DELAY); |
| 657 | } else { |
| 658 | #if (DEVELOPMENT || DEBUG) |
| 659 | thread->t_page_creation_throttled_soft++; |
| 660 | OSAddAtomic(1, &vm_page_creation_throttled_soft); |
| 661 | #endif /* DEVELOPMENT || DEBUG */ |
| 662 | return (SOFT_THROTTLE_DELAY); |
| 663 | } |
| 664 | } |
| 665 | thread->t_page_creation_time = tv_sec; |
| 666 | thread->t_page_creation_count = 0; |
| 667 | } |
| 668 | no_throttle: |
| 669 | thread->t_page_creation_count++; |
| 670 | |
| 671 | return (0); |
| 672 | } |
| 673 | |
| 674 | |
| 675 | /* |
| 676 | * check for various conditions that would |
| 677 | * prevent us from creating a ZF page... |
| 678 | * cleanup is based on being called from vm_fault_page |
| 679 | * |
| 680 | * object must be locked |
| 681 | * object == m->vmp_object |
| 682 | */ |
| 683 | static vm_fault_return_t |
| 684 | vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, wait_interrupt_t interruptible_state, boolean_t page_throttle) |
| 685 | { |
| 686 | int throttle_delay; |
| 687 | |
| 688 | if (object->shadow_severed || |
| 689 | VM_OBJECT_PURGEABLE_FAULT_ERROR(object)) { |
| 690 | /* |
| 691 | * Either: |
| 692 | * 1. the shadow chain was severed, |
| 693 | * 2. the purgeable object is volatile or empty and is marked |
| 694 | * to fault on access while volatile. |
| 695 | * Just have to return an error at this point |
| 696 | */ |
| 697 | if (m != VM_PAGE_NULL) |
| 698 | VM_PAGE_FREE(m); |
| 699 | vm_fault_cleanup(object, first_m); |
| 700 | |
| 701 | thread_interrupt_level(interruptible_state); |
| 702 | |
| 703 | return (VM_FAULT_MEMORY_ERROR); |
| 704 | } |
| 705 | if (page_throttle == TRUE) { |
| 706 | if ((throttle_delay = vm_page_throttled(FALSE))) { |
| 707 | /* |
| 708 | * we're throttling zero-fills... |
| 709 | * treat this as if we couldn't grab a page |
| 710 | */ |
| 711 | if (m != VM_PAGE_NULL) |
| 712 | VM_PAGE_FREE(m); |
| 713 | vm_fault_cleanup(object, first_m); |
| 714 | |
| 715 | VM_DEBUG_EVENT(vmf_check_zfdelay, VMF_CHECK_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0); |
| 716 | |
| 717 | delay(throttle_delay); |
| 718 | |
| 719 | if (current_thread_aborted()) { |
| 720 | thread_interrupt_level(interruptible_state); |
| 721 | return VM_FAULT_INTERRUPTED; |
| 722 | } |
| 723 | thread_interrupt_level(interruptible_state); |
| 724 | |
| 725 | return (VM_FAULT_MEMORY_SHORTAGE); |
| 726 | } |
| 727 | } |
| 728 | return (VM_FAULT_SUCCESS); |
| 729 | } |
| 730 | |
| 731 | |
| 732 | /* |
| 733 | * do the work to zero fill a page and |
| 734 | * inject it into the correct paging queue |
| 735 | * |
| 736 | * m->vmp_object must be locked |
| 737 | * page queue lock must NOT be held |
| 738 | */ |
| 739 | static int |
| 740 | vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill) |
| 741 | { |
| 742 | int my_fault = DBG_ZERO_FILL_FAULT; |
| 743 | vm_object_t object; |
| 744 | |
| 745 | object = VM_PAGE_OBJECT(m); |
| 746 | |
| 747 | /* |
| 748 | * This is a zero-fill page fault... |
| 749 | * |
| 750 | * Checking the page lock is a waste of |
| 751 | * time; this page was absent, so |
| 752 | * it can't be page locked by a pager. |
| 753 | * |
| 754 | * we also consider it undefined |
| 755 | * with respect to instruction |
| 756 | * execution. i.e. it is the responsibility |
| 757 | * of higher layers to call for an instruction |
| 758 | * sync after changing the contents and before |
| 759 | * sending a program into this area. We |
| 760 | * choose this approach for performance |
| 761 | */ |
| 762 | m->vmp_pmapped = TRUE; |
| 763 | |
| 764 | m->vmp_cs_validated = FALSE; |
| 765 | m->vmp_cs_tainted = FALSE; |
| 766 | m->vmp_cs_nx = FALSE; |
| 767 | |
| 768 | if (no_zero_fill == TRUE) { |
| 769 | my_fault = DBG_NZF_PAGE_FAULT; |
| 770 | |
| 771 | if (m->vmp_absent && m->vmp_busy) |
| 772 | return (my_fault); |
| 773 | } else { |
| 774 | vm_page_zero_fill(m); |
| 775 | |
| 776 | VM_STAT_INCR(zero_fill_count); |
| 777 | DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL); |
| 778 | } |
| 779 | assert(!m->vmp_laundry); |
| 780 | assert(object != kernel_object); |
| 781 | //assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0); |
| 782 | |
| 783 | if (!VM_DYNAMIC_PAGING_ENABLED() && |
| 784 | (object->purgable == VM_PURGABLE_DENY || |
| 785 | object->purgable == VM_PURGABLE_NONVOLATILE || |
| 786 | object->purgable == VM_PURGABLE_VOLATILE )) { |
| 787 | |
| 788 | vm_page_lockspin_queues(); |
| 789 | |
| 790 | if (!VM_DYNAMIC_PAGING_ENABLED()) { |
| 791 | assert(!VM_PAGE_WIRED(m)); |
| 792 | |
| 793 | /* |
| 794 | * can't be on the pageout queue since we don't |
| 795 | * have a pager to try and clean to |
| 796 | */ |
| 797 | vm_page_queues_remove(m, TRUE); |
| 798 | vm_page_check_pageable_safe(m); |
| 799 | vm_page_queue_enter(&vm_page_queue_throttled, m, vm_page_t, vmp_pageq); |
| 800 | m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q; |
| 801 | vm_page_throttled_count++; |
| 802 | } |
| 803 | vm_page_unlock_queues(); |
| 804 | } |
| 805 | return (my_fault); |
| 806 | } |
| 807 | |
| 808 | |
| 809 | /* |
| 810 | * Routine: vm_fault_page |
| 811 | * Purpose: |
| 812 | * Find the resident page for the virtual memory |
| 813 | * specified by the given virtual memory object |
| 814 | * and offset. |
| 815 | * Additional arguments: |
| 816 | * The required permissions for the page are given |
| 817 | * in "fault_type". Desired permissions are included |
| 818 | * in "protection". |
| 819 | * fault_info is passed along to determine pagein cluster |
| 820 | * limits... it contains the expected reference pattern, |
| 821 | * cluster size if available, etc... |
| 822 | * |
| 823 | * If the desired page is known to be resident (for |
| 824 | * example, because it was previously wired down), asserting |
| 825 | * the "unwiring" parameter will speed the search. |
| 826 | * |
| 827 | * If the operation can be interrupted (by thread_abort |
| 828 | * or thread_terminate), then the "interruptible" |
| 829 | * parameter should be asserted. |
| 830 | * |
| 831 | * Results: |
| 832 | * The page containing the proper data is returned |
| 833 | * in "result_page". |
| 834 | * |
| 835 | * In/out conditions: |
| 836 | * The source object must be locked and referenced, |
| 837 | * and must donate one paging reference. The reference |
| 838 | * is not affected. The paging reference and lock are |
| 839 | * consumed. |
| 840 | * |
| 841 | * If the call succeeds, the object in which "result_page" |
| 842 | * resides is left locked and holding a paging reference. |
| 843 | * If this is not the original object, a busy page in the |
| 844 | * original object is returned in "top_page", to prevent other |
| 845 | * callers from pursuing this same data, along with a paging |
| 846 | * reference for the original object. The "top_page" should |
| 847 | * be destroyed when this guarantee is no longer required. |
| 848 | * The "result_page" is also left busy. It is not removed |
| 849 | * from the pageout queues. |
| 850 | * Special Case: |
| 851 | * A return value of VM_FAULT_SUCCESS_NO_PAGE means that the |
| 852 | * fault succeeded but there's no VM page (i.e. the VM object |
| 853 | * does not actually hold VM pages, but device memory or |
| 854 | * large pages). The object is still locked and we still hold a |
| 855 | * paging_in_progress reference. |
| 856 | */ |
| 857 | unsigned int vm_fault_page_blocked_access = 0; |
| 858 | unsigned int vm_fault_page_forced_retry = 0; |
| 859 | |
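|  | /* |
|  | * A minimal calling sketch (hypothetical caller, derived from the conditions |
|  | * above -- real callers such as vm_fault_internal() do considerably more, and |
|  | * "fault_info" must be suitably initialized, at least "interruptible"): |
|  | * |
|  | * vm_object_lock(object); |
|  | * vm_object_paging_begin(object); |
|  | * result = vm_fault_page(object, offset, VM_PROT_READ, FALSE, FALSE, |
|  | * &prot, &result_page, &top_page, NULL, &error_code, |
|  | * FALSE, FALSE, &fault_info); |
|  | * if (result == VM_FAULT_SUCCESS && result_page != VM_PAGE_NULL) { |
|  | * ... use the busy page; its object is locked with a paging ref ... |
|  | * PAGE_WAKEUP_DONE(result_page); |
|  | * vm_fault_cleanup(VM_PAGE_OBJECT(result_page), top_page); |
|  | * } |
|  | */ |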
| 860 | vm_fault_return_t |
| 861 | vm_fault_page( |
| 862 | /* Arguments: */ |
| 863 | vm_object_t first_object, /* Object to begin search */ |
| 864 | vm_object_offset_t first_offset, /* Offset into object */ |
| 865 | vm_prot_t fault_type, /* What access is requested */ |
| 866 | boolean_t must_be_resident,/* Must page be resident? */ |
| 867 | boolean_t caller_lookup, /* caller looked up page */ |
| 868 | /* Modifies in place: */ |
| 869 | vm_prot_t *protection, /* Protection for mapping */ |
| 870 | vm_page_t *result_page, /* Page found, if successful */ |
| 871 | /* Returns: */ |
| 872 | vm_page_t *top_page, /* Page in top object, if |
| 873 | * not result_page. */ |
| 874 | int *type_of_fault, /* if non-null, fill in with type of fault |
| 875 | * COW, zero-fill, etc... returned in trace point */ |
| 876 | /* More arguments: */ |
| 877 | kern_return_t *error_code, /* code if page is in error */ |
| 878 | boolean_t no_zero_fill, /* don't zero fill absent pages */ |
| 879 | boolean_t data_supply, /* treat as data_supply if |
| 880 | * it is a write fault and a full |
| 881 | * page is provided */ |
| 882 | vm_object_fault_info_t fault_info) |
| 883 | { |
| 884 | vm_page_t m; |
| 885 | vm_object_t object; |
| 886 | vm_object_offset_t offset; |
| 887 | vm_page_t first_m; |
| 888 | vm_object_t next_object; |
| 889 | vm_object_t copy_object; |
| 890 | boolean_t look_for_page; |
| 891 | boolean_t force_fault_retry = FALSE; |
| 892 | vm_prot_t access_required = fault_type; |
| 893 | vm_prot_t wants_copy_flag; |
| 894 | kern_return_t wait_result; |
| 895 | wait_interrupt_t interruptible_state; |
| 896 | boolean_t data_already_requested = FALSE; |
| 897 | vm_behavior_t orig_behavior; |
| 898 | vm_size_t orig_cluster_size; |
| 899 | vm_fault_return_t error; |
| 900 | int my_fault; |
| 901 | uint32_t try_failed_count; |
| 902 | int interruptible; /* how may the fault be interrupted? */ |
| 903 | int external_state = VM_EXTERNAL_STATE_UNKNOWN; |
| 904 | memory_object_t pager; |
| 905 | vm_fault_return_t retval; |
| 906 | int grab_options; |
| 907 | |
| 908 | /* |
| 909 | * MUST_ASK_PAGER() evaluates to TRUE if the page specified by object/offset is |
| 910 | * marked as paged out in the compressor pager or the pager doesn't exist. |
| 911 | * Note also that if the pager for an internal object |
| 912 | * has not been created, the pager is not invoked regardless of the value |
| 913 | * of MUST_ASK_PAGER(). |
| 914 | * |
| 915 | * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset |
| 916 | * is marked as paged out in the compressor pager. |
| 917 | * PAGED_OUT() is used to determine if a page has already been pushed |
| 918 | * into a copy object in order to avoid a redundant page out operation. |
| 919 | */ |
| 920 | #define MUST_ASK_PAGER(o, f, s) \ |
| 921 | ((s = VM_COMPRESSOR_PAGER_STATE_GET((o), (f))) != VM_EXTERNAL_STATE_ABSENT) |
| 922 | |
| 923 | #define PAGED_OUT(o, f) \ |
| 924 | (VM_COMPRESSOR_PAGER_STATE_GET((o), (f)) == VM_EXTERNAL_STATE_EXISTS) |
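|  | /* |
|  | * For example: if the compressor pager reports VM_EXTERNAL_STATE_EXISTS for |
|  | * this offset, both MUST_ASK_PAGER() and PAGED_OUT() are TRUE; if it reports |
|  | * VM_EXTERNAL_STATE_ABSENT, both are FALSE; for any other state (e.g. an |
|  | * external pager whose state is unknown), MUST_ASK_PAGER() is TRUE but |
|  | * PAGED_OUT() is FALSE. |
|  | */ |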
| 925 | |
| 926 | /* |
| 927 | * Recovery actions |
| 928 | */ |
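|  | /* |
|  | * RELEASE_PAGE() wakes up any threads waiting for the busy page and, if the |
|  | * page is not already on a pageable queue, puts it on one: deactivated when |
|  | * the compressor is active, activated otherwise. |
|  | */ |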
| 929 | #define RELEASE_PAGE(m) \ |
| 930 | MACRO_BEGIN \ |
| 931 | PAGE_WAKEUP_DONE(m); \ |
| 932 | if ( !VM_PAGE_PAGEABLE(m)) { \ |
| 933 | vm_page_lockspin_queues(); \ |
| 934 | if ( !VM_PAGE_PAGEABLE(m)) { \ |
| 935 | if (VM_CONFIG_COMPRESSOR_IS_ACTIVE) \ |
| 936 | vm_page_deactivate(m); \ |
| 937 | else \ |
| 938 | vm_page_activate(m); \ |
| 939 | } \ |
| 940 | vm_page_unlock_queues(); \ |
| 941 | } \ |
| 942 | MACRO_END |
| 943 | |
| 944 | #if TRACEFAULTPAGE |
| 945 | dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset); /* (TEST/DEBUG) */ |
| 946 | #endif |
| 947 | |
| 948 | interruptible = fault_info->interruptible; |
| 949 | interruptible_state = thread_interrupt_level(interruptible); |
| 950 | |
| 951 | /* |
| 952 | * INVARIANTS (through entire routine): |
| 953 | * |
| 954 | * 1) At all times, we must either have the object |
| 955 | * lock or a busy page in some object to prevent |
| 956 | * some other thread from trying to bring in |
| 957 | * the same page. |
| 958 | * |
| 959 | * Note that we cannot hold any locks during the |
| 960 | * pager access or when waiting for memory, so |
| 961 | * we use a busy page then. |
| 962 | * |
| 963 | * 2) To prevent another thread from racing us down the |
| 964 | * shadow chain and entering a new page in the top |
| 965 | * object before we do, we must keep a busy page in |
| 966 | * the top object while following the shadow chain. |
| 967 | * |
| 968 | * 3) We must increment paging_in_progress on any object |
| 969 | * for which we have a busy page before dropping |
| 970 | * the object lock |
| 971 | * |
| 972 | * 4) We leave busy pages on the pageout queues. |
| 973 | * If the pageout daemon comes across a busy page, |
| 974 | * it will remove the page from the pageout queues. |
| 975 | */ |
| 976 | |
| 977 | object = first_object; |
| 978 | offset = first_offset; |
| 979 | first_m = VM_PAGE_NULL; |
| 980 | access_required = fault_type; |
| 981 | |
| 982 | |
| 983 | XPR(XPR_VM_FAULT, |
| 984 | "vm_f_page: obj 0x%X, offset 0x%X, type %d, prot %d\n" , |
| 985 | object, offset, fault_type, *protection, 0); |
| 986 | |
| 987 | /* |
| 988 | * default type of fault |
| 989 | */ |
| 990 | my_fault = DBG_CACHE_HIT_FAULT; |
| 991 | |
| 992 | while (TRUE) { |
| 993 | #if TRACEFAULTPAGE |
| 994 | dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */ |
| 995 | #endif |
| 996 | |
| 997 | grab_options = 0; |
| 998 | #if CONFIG_SECLUDED_MEMORY |
| 999 | if (object->can_grab_secluded) { |
| 1000 | grab_options |= VM_PAGE_GRAB_SECLUDED; |
| 1001 | } |
| 1002 | #endif /* CONFIG_SECLUDED_MEMORY */ |
| 1003 | |
| 1004 | if (!object->alive) { |
| 1005 | /* |
| 1006 | * object is no longer valid |
| 1007 | * clean up and return error |
| 1008 | */ |
| 1009 | vm_fault_cleanup(object, first_m); |
| 1010 | thread_interrupt_level(interruptible_state); |
| 1011 | |
| 1012 | return (VM_FAULT_MEMORY_ERROR); |
| 1013 | } |
| 1014 | |
| 1015 | if (!object->pager_created && object->phys_contiguous) { |
| 1016 | /* |
| 1017 | * A physically-contiguous object without a pager: |
| 1018 | * must be a "large page" object. We do not deal |
| 1019 | * with VM pages for this object. |
| 1020 | */ |
| 1021 | caller_lookup = FALSE; |
| 1022 | m = VM_PAGE_NULL; |
| 1023 | goto phys_contig_object; |
| 1024 | } |
| 1025 | |
| 1026 | if (object->blocked_access) { |
| 1027 | /* |
| 1028 | * Access to this VM object has been blocked. |
| 1029 | * Replace our "paging_in_progress" reference with |
| 1030 | * a "activity_in_progress" reference and wait for |
| 1031 | * access to be unblocked. |
| 1032 | */ |
| 1033 | caller_lookup = FALSE; /* no longer valid after sleep */ |
| 1034 | vm_object_activity_begin(object); |
| 1035 | vm_object_paging_end(object); |
| 1036 | while (object->blocked_access) { |
| 1037 | vm_object_sleep(object, |
| 1038 | VM_OBJECT_EVENT_UNBLOCKED, |
| 1039 | THREAD_UNINT); |
| 1040 | } |
| 1041 | vm_fault_page_blocked_access++; |
| 1042 | vm_object_paging_begin(object); |
| 1043 | vm_object_activity_end(object); |
| 1044 | } |
| 1045 | |
| 1046 | /* |
| 1047 | * See whether the page at 'offset' is resident |
| 1048 | */ |
| 1049 | if (caller_lookup == TRUE) { |
| 1050 | /* |
| 1051 | * The caller has already looked up the page |
| 1052 | * and gave us the result in "result_page". |
| 1053 | * We can use this for the first lookup but |
| 1054 | * it loses its validity as soon as we unlock |
| 1055 | * the object. |
| 1056 | */ |
| 1057 | m = *result_page; |
| 1058 | caller_lookup = FALSE; /* no longer valid after that */ |
| 1059 | } else { |
| 1060 | m = vm_page_lookup(object, offset); |
| 1061 | } |
| 1062 | #if TRACEFAULTPAGE |
| 1063 | dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */ |
| 1064 | #endif |
| 1065 | if (m != VM_PAGE_NULL) { |
| 1066 | |
| 1067 | if (m->vmp_busy) { |
| 1068 | /* |
| 1069 | * The page is being brought in, |
| 1070 | * wait for it and then retry. |
| 1071 | */ |
| 1072 | #if TRACEFAULTPAGE |
| 1073 | dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */ |
| 1074 | #endif |
| 1075 | wait_result = PAGE_SLEEP(object, m, interruptible); |
| 1076 | |
| 1077 | XPR(XPR_VM_FAULT, |
| 1078 | "vm_f_page: block busy obj 0x%X, offset 0x%X, page 0x%X\n" , |
| 1079 | object, offset, |
| 1080 | m, 0, 0); |
| 1081 | counter(c_vm_fault_page_block_busy_kernel++); |
| 1082 | |
| 1083 | if (wait_result != THREAD_AWAKENED) { |
| 1084 | vm_fault_cleanup(object, first_m); |
| 1085 | thread_interrupt_level(interruptible_state); |
| 1086 | |
| 1087 | if (wait_result == THREAD_RESTART) |
| 1088 | return (VM_FAULT_RETRY); |
| 1089 | else |
| 1090 | return (VM_FAULT_INTERRUPTED); |
| 1091 | } |
| 1092 | continue; |
| 1093 | } |
| 1094 | if (m->vmp_laundry) { |
| 1095 | m->vmp_free_when_done = FALSE; |
| 1096 | |
| 1097 | if (!m->vmp_cleaning) |
| 1098 | vm_pageout_steal_laundry(m, FALSE); |
| 1099 | } |
| 1100 | if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) { |
| 1101 | /* |
| 1102 | * Guard page: off limits ! |
| 1103 | */ |
| 1104 | if (fault_type == VM_PROT_NONE) { |
| 1105 | /* |
| 1106 | * The fault is not requesting any |
| 1107 | * access to the guard page, so it must |
| 1108 | * be just to wire or unwire it. |
| 1109 | * Let's pretend it succeeded... |
| 1110 | */ |
| 1111 | m->vmp_busy = TRUE; |
| 1112 | *result_page = m; |
| 1113 | assert(first_m == VM_PAGE_NULL); |
| 1114 | *top_page = first_m; |
| 1115 | if (type_of_fault) |
| 1116 | *type_of_fault = DBG_GUARD_FAULT; |
| 1117 | thread_interrupt_level(interruptible_state); |
| 1118 | return VM_FAULT_SUCCESS; |
| 1119 | } else { |
| 1120 | /* |
| 1121 | * The fault requests access to the |
| 1122 | * guard page: let's deny that ! |
| 1123 | */ |
| 1124 | vm_fault_cleanup(object, first_m); |
| 1125 | thread_interrupt_level(interruptible_state); |
| 1126 | return VM_FAULT_MEMORY_ERROR; |
| 1127 | } |
| 1128 | } |
| 1129 | |
| 1130 | if (m->vmp_error) { |
| 1131 | /* |
| 1132 | * The page is in error, give up now. |
| 1133 | */ |
| 1134 | #if TRACEFAULTPAGE |
| 1135 | dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code); /* (TEST/DEBUG) */ |
| 1136 | #endif |
| 1137 | if (error_code) |
| 1138 | *error_code = KERN_MEMORY_ERROR; |
| 1139 | VM_PAGE_FREE(m); |
| 1140 | |
| 1141 | vm_fault_cleanup(object, first_m); |
| 1142 | thread_interrupt_level(interruptible_state); |
| 1143 | |
| 1144 | return (VM_FAULT_MEMORY_ERROR); |
| 1145 | } |
| 1146 | if (m->vmp_restart) { |
| 1147 | /* |
| 1148 | * The pager wants us to restart |
| 1149 | * at the top of the chain, |
| 1150 | * typically because it has moved the |
| 1151 | * page to another pager; do so. |
| 1152 | */ |
| 1153 | #if TRACEFAULTPAGE |
| 1154 | dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */ |
| 1155 | #endif |
| 1156 | VM_PAGE_FREE(m); |
| 1157 | |
| 1158 | vm_fault_cleanup(object, first_m); |
| 1159 | thread_interrupt_level(interruptible_state); |
| 1160 | |
| 1161 | return (VM_FAULT_RETRY); |
| 1162 | } |
| 1163 | if (m->vmp_absent) { |
| 1164 | /* |
| 1165 | * The page isn't busy, but is absent, |
| 1166 | * therefore it's deemed "unavailable". |
| 1167 | * |
| 1168 | * Remove the non-existent page (unless it's |
| 1169 | * in the top object) and move on down to the |
| 1170 | * next object (if there is one). |
| 1171 | */ |
| 1172 | #if TRACEFAULTPAGE |
| 1173 | dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow); /* (TEST/DEBUG) */ |
| 1174 | #endif |
| 1175 | next_object = object->shadow; |
| 1176 | |
| 1177 | if (next_object == VM_OBJECT_NULL) { |
| 1178 | /* |
| 1179 | * Absent page at bottom of shadow |
| 1180 | * chain; zero fill the page we left |
| 1181 | * busy in the first object, and free |
| 1182 | * the absent page. |
| 1183 | */ |
| 1184 | assert(!must_be_resident); |
| 1185 | |
| 1186 | /* |
| 1187 | * check for any conditions that prevent |
| 1188 | * us from creating a new zero-fill page |
| 1189 | * vm_fault_check will do all of the |
| 1190 | * fault cleanup in the case of an error condition |
| 1191 | * including resetting the thread_interrupt_level |
| 1192 | */ |
| 1193 | error = vm_fault_check(object, m, first_m, interruptible_state, (type_of_fault == NULL) ? TRUE : FALSE); |
| 1194 | |
| 1195 | if (error != VM_FAULT_SUCCESS) |
| 1196 | return (error); |
| 1197 | |
| 1198 | XPR(XPR_VM_FAULT, |
| 1199 | "vm_f_page: zero obj 0x%X, off 0x%X, page 0x%X, first_obj 0x%X\n" , |
| 1200 | object, offset, |
| 1201 | m, |
| 1202 | first_object, 0); |
| 1203 | |
| 1204 | if (object != first_object) { |
| 1205 | /* |
| 1206 | * free the absent page we just found |
| 1207 | */ |
| 1208 | VM_PAGE_FREE(m); |
| 1209 | |
| 1210 | /* |
| 1211 | * drop reference and lock on current object |
| 1212 | */ |
| 1213 | vm_object_paging_end(object); |
| 1214 | vm_object_unlock(object); |
| 1215 | |
| 1216 | /* |
| 1217 | * grab the original page we |
| 1218 | * 'soldered' in place and |
| 1219 | * retake lock on 'first_object' |
| 1220 | */ |
| 1221 | m = first_m; |
| 1222 | first_m = VM_PAGE_NULL; |
| 1223 | |
| 1224 | object = first_object; |
| 1225 | offset = first_offset; |
| 1226 | |
| 1227 | vm_object_lock(object); |
| 1228 | } else { |
| 1229 | /* |
| 1230 | * we're going to use the absent page we just found |
| 1231 | * so convert it to a 'busy' page |
| 1232 | */ |
| 1233 | m->vmp_absent = FALSE; |
| 1234 | m->vmp_busy = TRUE; |
| 1235 | } |
| 1236 | if (fault_info->mark_zf_absent && no_zero_fill == TRUE) |
| 1237 | m->vmp_absent = TRUE; |
| 1238 | /* |
| 1239 | * zero-fill the page and put it on |
| 1240 | * the correct paging queue |
| 1241 | */ |
| 1242 | my_fault = vm_fault_zero_page(m, no_zero_fill); |
| 1243 | |
| 1244 | break; |
| 1245 | } else { |
| 1246 | if (must_be_resident) |
| 1247 | vm_object_paging_end(object); |
| 1248 | else if (object != first_object) { |
| 1249 | vm_object_paging_end(object); |
| 1250 | VM_PAGE_FREE(m); |
| 1251 | } else { |
| 1252 | first_m = m; |
| 1253 | m->vmp_absent = FALSE; |
| 1254 | m->vmp_busy = TRUE; |
| 1255 | |
| 1256 | vm_page_lockspin_queues(); |
| 1257 | vm_page_queues_remove(m, FALSE); |
| 1258 | vm_page_unlock_queues(); |
| 1259 | } |
| 1260 | XPR(XPR_VM_FAULT, |
| 1261 | "vm_f_page: unavail obj 0x%X, off 0x%X, next_obj 0x%X, newoff 0x%X\n" , |
| 1262 | object, offset, |
| 1263 | next_object, |
| 1264 | offset+object->vo_shadow_offset,0); |
| 1265 | |
| 1266 | offset += object->vo_shadow_offset; |
| 1267 | fault_info->lo_offset += object->vo_shadow_offset; |
| 1268 | fault_info->hi_offset += object->vo_shadow_offset; |
| 1269 | access_required = VM_PROT_READ; |
| 1270 | |
| 1271 | vm_object_lock(next_object); |
| 1272 | vm_object_unlock(object); |
| 1273 | object = next_object; |
| 1274 | vm_object_paging_begin(object); |
| 1275 | |
| 1276 | /* |
| 1277 | * reset to default type of fault |
| 1278 | */ |
| 1279 | my_fault = DBG_CACHE_HIT_FAULT; |
| 1280 | |
| 1281 | continue; |
| 1282 | } |
| 1283 | } |
| 1284 | if ((m->vmp_cleaning) |
| 1285 | && ((object != first_object) || (object->copy != VM_OBJECT_NULL)) |
| 1286 | && (fault_type & VM_PROT_WRITE)) { |
| 1287 | /* |
| 1288 | * This is a copy-on-write fault that will |
| 1289 | * cause us to revoke access to this page, but |
| 1290 | * this page is in the process of being cleaned |
| 1291 | * in a clustered pageout. We must wait until |
| 1292 | * the cleaning operation completes before |
| 1293 | * revoking access to the original page, |
| 1294 | * otherwise we might attempt to remove a |
| 1295 | * wired mapping. |
| 1296 | */ |
| 1297 | #if TRACEFAULTPAGE |
| 1298 | dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset); /* (TEST/DEBUG) */ |
| 1299 | #endif |
| 1300 | XPR(XPR_VM_FAULT, |
| 1301 | "vm_f_page: cleaning obj 0x%X, offset 0x%X, page 0x%X\n" , |
| 1302 | object, offset, |
| 1303 | m, 0, 0); |
| 1304 | /* |
| 1305 | * take an extra ref so that object won't die |
| 1306 | */ |
| 1307 | vm_object_reference_locked(object); |
| 1308 | |
| 1309 | vm_fault_cleanup(object, first_m); |
| 1310 | |
| 1311 | counter(c_vm_fault_page_block_backoff_kernel++); |
| 1312 | vm_object_lock(object); |
| 1313 | assert(object->ref_count > 0); |
| 1314 | |
| 1315 | m = vm_page_lookup(object, offset); |
| 1316 | |
| 1317 | if (m != VM_PAGE_NULL && m->vmp_cleaning) { |
| 1318 | PAGE_ASSERT_WAIT(m, interruptible); |
| 1319 | |
| 1320 | vm_object_unlock(object); |
| 1321 | wait_result = thread_block(THREAD_CONTINUE_NULL); |
| 1322 | vm_object_deallocate(object); |
| 1323 | |
| 1324 | goto backoff; |
| 1325 | } else { |
| 1326 | vm_object_unlock(object); |
| 1327 | |
| 1328 | vm_object_deallocate(object); |
| 1329 | thread_interrupt_level(interruptible_state); |
| 1330 | |
| 1331 | return (VM_FAULT_RETRY); |
| 1332 | } |
| 1333 | } |
| 1334 | if (type_of_fault == NULL && (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) && |
| 1335 | !(fault_info != NULL && fault_info->stealth)) { |
| 1336 | /* |
| 1337 | * If we were passed a non-NULL pointer for |
| 1338 | * "type_of_fault", than we came from |
| 1339 | * vm_fault... we'll let it deal with |
| 1340 | * this condition, since it |
| 1341 | * needs to see m->vmp_speculative to correctly |
| 1342 | * account the pageins, otherwise... |
| 1343 | * take it off the speculative queue, we'll |
| 1344 | * let the caller of vm_fault_page deal |
| 1345 | * with getting it onto the correct queue |
| 1346 | * |
| 1347 | * If the caller specified in fault_info that |
| 1348 | * it wants a "stealth" fault, we also leave |
| 1349 | * the page in the speculative queue. |
| 1350 | */ |
| 1351 | vm_page_lockspin_queues(); |
| 1352 | if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) |
| 1353 | vm_page_queues_remove(m, FALSE); |
| 1354 | vm_page_unlock_queues(); |
| 1355 | } |
| 1356 | assert(object == VM_PAGE_OBJECT(m)); |
| 1357 | |
| 1358 | if (object->code_signed) { |
| 1359 | /* |
| 1360 | * CODE SIGNING: |
| 1361 | * We just paged in a page from a signed |
| 1362 | * memory object but we don't need to |
| 1363 | * validate it now. We'll validate it if |
| 1364 | * and when it gets mapped into a user address |
| 1365 | * space for the first time or when the page |
| 1366 | * gets copied to another object as a result |
| 1367 | * of a copy-on-write. |
| 1368 | */ |
| 1369 | } |
| 1370 | |
| 1371 | /* |
| 1372 | * We mark the page busy and leave it on |
| 1373 | * the pageout queues. If the pageout |
| 1374 | * daemon comes across it, then it will |
| 1375 | * remove the page from the queue, but not the object |
| 1376 | */ |
| 1377 | #if TRACEFAULTPAGE |
| 1378 | dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */ |
| 1379 | #endif |
| 1380 | XPR(XPR_VM_FAULT, |
| 1381 | "vm_f_page: found page obj 0x%X, offset 0x%X, page 0x%X\n" , |
| 1382 | object, offset, m, 0, 0); |
| 1383 | assert(!m->vmp_busy); |
| 1384 | assert(!m->vmp_absent); |
| 1385 | |
| 1386 | m->vmp_busy = TRUE; |
| 1387 | break; |
| 1388 | } |
| 1389 | |
| 1390 | |
| 1391 | /* |
| 1392 | * we get here when there is no page present in the object at |
| 1393 | * the offset we're interested in... we'll allocate a page |
| 1394 | * at this point if the pager associated with |
| 1395 | * this object can provide the data or we're the top object... |
| 1396 | * object is locked; m == NULL |
| 1397 | */ |
| 1398 | |
| 1399 | if (must_be_resident) { |
| 1400 | if (fault_type == VM_PROT_NONE && |
| 1401 | object == kernel_object) { |
| 1402 | /* |
| 1403 | * We've been called from vm_fault_unwire() |
| 1404 | * while removing a map entry that was allocated |
| 1405 | * with KMA_KOBJECT and KMA_VAONLY. This page |
| 1406 | * is not present and there's nothing more to |
| 1407 | * do here (nothing to unwire). |
| 1408 | */ |
| 1409 | vm_fault_cleanup(object, first_m); |
| 1410 | thread_interrupt_level(interruptible_state); |
| 1411 | |
| 1412 | return VM_FAULT_MEMORY_ERROR; |
| 1413 | } |
| 1414 | |
| 1415 | goto dont_look_for_page; |
| 1416 | } |
| 1417 | |
| 1418 | /* Don't expect to fault pages into the kernel object. */ |
| 1419 | assert(object != kernel_object); |
| 1420 | |
| 1421 | data_supply = FALSE; |
| 1422 | |
| 1423 | look_for_page = (object->pager_created && (MUST_ASK_PAGER(object, offset, external_state) == TRUE) && !data_supply); |
| 1424 | |
| 1425 | #if TRACEFAULTPAGE |
| 1426 | dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object); /* (TEST/DEBUG) */ |
| 1427 | #endif |
| 1428 | if (!look_for_page && object == first_object && !object->phys_contiguous) { |
| 1429 | /* |
| 1430 | * Allocate a new page for this object/offset pair as a placeholder |
| 1431 | */ |
| 1432 | m = vm_page_grab_options(grab_options); |
| 1433 | #if TRACEFAULTPAGE |
| 1434 | dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */ |
| 1435 | #endif |
| 1436 | if (m == VM_PAGE_NULL) { |
| 1437 | |
| 1438 | vm_fault_cleanup(object, first_m); |
| 1439 | thread_interrupt_level(interruptible_state); |
| 1440 | |
| 1441 | return (VM_FAULT_MEMORY_SHORTAGE); |
| 1442 | } |
| 1443 | |
| 1444 | if (fault_info && fault_info->batch_pmap_op == TRUE) { |
| 1445 | vm_page_insert_internal(m, object, offset, VM_KERN_MEMORY_NONE, FALSE, TRUE, TRUE, FALSE, NULL); |
| 1446 | } else { |
| 1447 | vm_page_insert(m, object, offset); |
| 1448 | } |
| 1449 | } |
| 1450 | if (look_for_page) { |
| 1451 | kern_return_t rc; |
| 1452 | int my_fault_type; |
| 1453 | |
| 1454 | /* |
| 1455 | * If the memory manager is not ready, we |
| 1456 | * cannot make requests. |
| 1457 | */ |
| 1458 | if (!object->pager_ready) { |
| 1459 | #if TRACEFAULTPAGE |
| 1460 | dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */ |
| 1461 | #endif |
| 1462 | if (m != VM_PAGE_NULL) |
| 1463 | VM_PAGE_FREE(m); |
| 1464 | |
| 1465 | XPR(XPR_VM_FAULT, |
| 1466 | "vm_f_page: ready wait obj 0x%X, offset 0x%X\n" , |
| 1467 | object, offset, 0, 0, 0); |
| 1468 | |
| 1469 | /* |
| 1470 | * take an extra ref so object won't die |
| 1471 | */ |
| 1472 | vm_object_reference_locked(object); |
| 1473 | vm_fault_cleanup(object, first_m); |
| 1474 | counter(c_vm_fault_page_block_backoff_kernel++); |
| 1475 | |
| 1476 | vm_object_lock(object); |
| 1477 | assert(object->ref_count > 0); |
| 1478 | |
| 1479 | if (!object->pager_ready) { |
| 1480 | wait_result = vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGER_READY, interruptible); |
| 1481 | |
| 1482 | vm_object_unlock(object); |
| 1483 | if (wait_result == THREAD_WAITING) |
| 1484 | wait_result = thread_block(THREAD_CONTINUE_NULL); |
| 1485 | vm_object_deallocate(object); |
| 1486 | |
| 1487 | goto backoff; |
| 1488 | } else { |
| 1489 | vm_object_unlock(object); |
| 1490 | vm_object_deallocate(object); |
| 1491 | thread_interrupt_level(interruptible_state); |
| 1492 | |
| 1493 | return (VM_FAULT_RETRY); |
| 1494 | } |
| 1495 | } |
| 1496 | if (!object->internal && !object->phys_contiguous && object->paging_in_progress > vm_object_pagein_throttle) { |
| 1497 | /* |
| 1498 | * If there are too many outstanding page |
| 1499 | * requests pending on this external object, we |
| 1500 | * wait for them to be resolved now. |
| 1501 | */ |
| 1502 | #if TRACEFAULTPAGE |
| 1503 | dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */ |
| 1504 | #endif |
| 1505 | if (m != VM_PAGE_NULL) |
| 1506 | VM_PAGE_FREE(m); |
| 1507 | /* |
| 1508 | * take an extra ref so object won't die |
| 1509 | */ |
| 1510 | vm_object_reference_locked(object); |
| 1511 | |
| 1512 | vm_fault_cleanup(object, first_m); |
| 1513 | |
| 1514 | counter(c_vm_fault_page_block_backoff_kernel++); |
| 1515 | |
| 1516 | vm_object_lock(object); |
| 1517 | assert(object->ref_count > 0); |
| 1518 | |
| 1519 | if (object->paging_in_progress >= vm_object_pagein_throttle) { |
| 1520 | vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGING_ONLY_IN_PROGRESS, interruptible); |
| 1521 | |
| 1522 | vm_object_unlock(object); |
| 1523 | wait_result = thread_block(THREAD_CONTINUE_NULL); |
| 1524 | vm_object_deallocate(object); |
| 1525 | |
| 1526 | goto backoff; |
| 1527 | } else { |
| 1528 | vm_object_unlock(object); |
| 1529 | vm_object_deallocate(object); |
| 1530 | thread_interrupt_level(interruptible_state); |
| 1531 | |
| 1532 | return (VM_FAULT_RETRY); |
| 1533 | } |
| 1534 | } |
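| | /*
| | * Internal (anonymous) objects are backed by the compressor rather
| | * than by an external memory manager, so their data is recovered
| | * with a synchronous call into the compressor pager instead of a
| | * memory_object_data_request().
| | */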
| 1535 | if (object->internal) { |
| 1536 | int compressed_count_delta; |
| 1537 | |
| 1538 | assert(VM_CONFIG_COMPRESSOR_IS_PRESENT); |
| 1539 | |
| 1540 | if (m == VM_PAGE_NULL) { |
| 1541 | /* |
| 1542 | * Allocate a new page for this object/offset pair as a placeholder |
| 1543 | */ |
| 1544 | m = vm_page_grab_options(grab_options); |
| 1545 | #if TRACEFAULTPAGE |
| 1546 | dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */ |
| 1547 | #endif |
| 1548 | if (m == VM_PAGE_NULL) { |
| 1549 | |
| 1550 | vm_fault_cleanup(object, first_m); |
| 1551 | thread_interrupt_level(interruptible_state); |
| 1552 | |
| 1553 | return (VM_FAULT_MEMORY_SHORTAGE); |
| 1554 | } |
| 1555 | |
| 1556 | m->vmp_absent = TRUE; |
| 1557 | if (fault_info && fault_info->batch_pmap_op == TRUE) { |
| 1558 | vm_page_insert_internal(m, object, offset, VM_KERN_MEMORY_NONE, FALSE, TRUE, TRUE, FALSE, NULL); |
| 1559 | } else { |
| 1560 | vm_page_insert(m, object, offset); |
| 1561 | } |
| 1562 | } |
| 1563 | assert(m->vmp_busy); |
| 1564 | |
| 1565 | m->vmp_absent = TRUE; |
| 1566 | pager = object->pager; |
| 1567 | |
| 1568 | assert(object->paging_in_progress > 0); |
| 1569 | vm_object_unlock(object); |
| 1570 | |
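| | /*
| | * Decompress directly into the placeholder's physical page. The
| | * object lock was dropped above (the busy/absent placeholder keeps
| | * other threads away from this offset); my_fault_type reports
| | * whether the data came straight from the compressor or had to be
| | * swapped back in first, and compressed_count_delta reflects the
| | * change in the pager's count of compressed pages.
| | */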
| 1571 | rc = vm_compressor_pager_get( |
| 1572 | pager, |
| 1573 | offset + object->paging_offset, |
| 1574 | VM_PAGE_GET_PHYS_PAGE(m), |
| 1575 | &my_fault_type, |
| 1576 | 0, |
| 1577 | &compressed_count_delta); |
| 1578 | |
| 1579 | if (type_of_fault == NULL) { |
| 1580 | int throttle_delay; |
| 1581 | |
| 1582 | /* |
| 1583 | * we weren't called from vm_fault, so we
| 1584 | * need to apply the page creation throttle;
| 1585 | * do it before we re-acquire any locks
| 1586 | */ |
| 1587 | if (my_fault_type == DBG_COMPRESSOR_FAULT) { |
| 1588 | if ((throttle_delay = vm_page_throttled(TRUE))) { |
| 1589 | VM_DEBUG_EVENT(vmf_compressordelay, VMF_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 1, 0); |
| 1590 | delay(throttle_delay); |
| 1591 | } |
| 1592 | } |
| 1593 | } |
| 1594 | vm_object_lock(object); |
| 1595 | assert(object->paging_in_progress > 0); |
| 1596 | |
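| | /*
| | * Now that the object lock is held again, fold the change in the
| | * number of compressed pages back into the pager's accounting.
| | */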
| 1597 | vm_compressor_pager_count( |
| 1598 | pager, |
| 1599 | compressed_count_delta, |
| 1600 | FALSE, /* shared_lock */ |
| 1601 | object); |
| 1602 | |
| 1603 | switch (rc) { |
| 1604 | case KERN_SUCCESS: |
| 1605 | m->vmp_absent = FALSE; |
| 1606 | m->vmp_dirty = TRUE; |
| 1607 | if ((object->wimg_bits & |
| 1608 | VM_WIMG_MASK) != |
| 1609 | VM_WIMG_USE_DEFAULT) { |
| 1610 | /* |
| 1611 | * If the page is not cacheable, |
| 1612 | * we can't let its contents |
| 1613 | * linger in the data cache |
| 1614 | * after the decompression. |
| 1615 | */ |
| 1616 | pmap_sync_page_attributes_phys( |
| 1617 | VM_PAGE_GET_PHYS_PAGE(m)); |
| 1618 | } else { |
| 1619 | m->vmp_written_by_kernel = TRUE; |
| 1620 | } |
| 1621 | |
| 1622 | /* |
| 1623 | * If the object is purgeable, its |
| 1624 | * owner's purgeable ledgers have been |
| 1625 | * updated in vm_page_insert() but the |
| 1626 | * page was also accounted for in a |
| 1627 | * "compressed purgeable" ledger, so |
| 1628 | * update that now. |
| 1629 | */ |
| 1630 | if (((object->purgable != |
| 1631 | VM_PURGABLE_DENY) || |
| 1632 | object->vo_ledger_tag) && |
| 1633 | (object->vo_owner != |
| 1634 | NULL)) { |
| 1635 | /* |
| 1636 | * One less compressed |
| 1637 | * purgeable/tagged page. |
| 1638 | */ |
| 1639 | vm_object_owner_compressed_update( |
| 1640 | object, |
| 1641 | -1); |
| 1642 | } |
| 1643 | |
| 1644 | break; |
| 1645 | case KERN_MEMORY_FAILURE: |
| 1646 | m->vmp_unusual = TRUE; |
| 1647 | m->vmp_error = TRUE; |
| 1648 | m->vmp_absent = FALSE; |
| 1649 | break; |
| 1650 | case KERN_MEMORY_ERROR: |
| 1651 | assert(m->vmp_absent); |
| 1652 | break; |
| 1653 | default: |
| 1654 | panic("vm_fault_page(): unexpected " |
| 1655 | "error %d from " |
| 1656 | "vm_compressor_pager_get()\n" , |
| 1657 | rc); |
| 1658 | } |
| 1659 | PAGE_WAKEUP_DONE(m); |
| 1660 | |
| 1661 | rc = KERN_SUCCESS; |
| 1662 | goto data_requested; |
| 1663 | } |
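| | /*
| | * External object: the data has to come from the memory manager
| | * (e.g. the vnode pager), so treat this as a page-in and issue a
| | * memory_object_data_request() below.
| | */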
| 1664 | my_fault_type = DBG_PAGEIN_FAULT; |
| 1665 | |
| 1666 | if (m != VM_PAGE_NULL) { |
| 1667 | VM_PAGE_FREE(m); |
| 1668 | m = VM_PAGE_NULL; |
| 1669 | } |
| 1670 | |
| 1671 | #if TRACEFAULTPAGE |
| 1672 | dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0); /* (TEST/DEBUG) */ |
| 1673 | #endif |
| 1674 | |
| 1675 | /* |
| 1676 | * It's possible someone called vm_object_destroy while we weren't |
| 1677 | * holding the object lock. If that has happened, then bail out |
| 1678 | * here. |
| 1679 | */ |
| 1680 | |
| 1681 | pager = object->pager; |
| 1682 | |
| 1683 | if (pager == MEMORY_OBJECT_NULL) { |
| 1684 | vm_fault_cleanup(object, first_m); |
| 1685 | thread_interrupt_level(interruptible_state); |
| 1686 | return VM_FAULT_MEMORY_ERROR; |
| 1687 | } |
| 1688 | |
| 1689 | /* |
| 1690 | * We have an absent page in place for the faulting offset, |
| 1691 | * so we can release the object lock. |
| 1692 | */ |
| 1693 | |
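| | /*
| | * For shared-cache objects, boost this thread as if it held an
| | * rwlock for the duration of the page-in, presumably so that a
| | * low-priority thread faulting a shared-cache page doesn't hold
| | * up higher-priority threads waiting on the same page.
| | */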
| 1694 | if (object->object_is_shared_cache) { |
| 1695 | set_thread_rwlock_boost(); |
| 1696 | } |
| 1697 | |
| 1698 | vm_object_unlock(object); |
| 1699 | |
| 1700 | /* |
| 1701 | * If this object uses a copy_call strategy, |
| 1702 | * and we are interested in a copy of this object |
| 1703 | * (having gotten here only by following a |
| 1704 | * shadow chain), then tell the memory manager |
| 1705 | * via a flag added to the desired_access |
| 1706 | * parameter, so that it can detect a race |
| 1707 | * between our walking down the shadow chain |
| 1708 | * and its pushing pages up into a copy of |
| 1709 | * the object that it manages. |
| 1710 | */ |
| 1711 | if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL && object != first_object) |
| 1712 | wants_copy_flag = VM_PROT_WANTS_COPY; |
| 1713 | else |
| 1714 | wants_copy_flag = VM_PROT_NONE; |
| 1715 | |
| 1716 | XPR(XPR_VM_FAULT, |
| 1717 | "vm_f_page: data_req obj 0x%X, offset 0x%X, page 0x%X, acc %d\n" , |
| 1718 | object, offset, m, |
| 1719 | access_required | wants_copy_flag, 0); |
| 1720 | |
| 1721 | if (object->copy == first_object) { |
| 1722 | /* |
| 1723 | * if we issue the memory_object_data_request in |
| 1724 | * this state, we are subject to a deadlock with |
| 1725 | * the underlying filesystem if it is trying to |
| 1726 | * shrink the file resulting in a push of pages |
| 1727 | * into the copy object... that push will stall |
| 1728 | * on the placeholder page, and if the pushing thread |
| 1729 | * is holding a lock that is required on the pagein |
| 1730 | * path (such as a truncate lock), we'll deadlock... |
| 1731 | * to avoid this potential deadlock, we throw away |
| 1732 | * our placeholder page before calling memory_object_data_request |
| 1733 | * and force this thread to retry the vm_fault_page after |
| 1734 | * we have issued the I/O. the second time through this path |
| 1735 | * we will find the page already in the cache (presumably still |
| 1736 | * busy waiting for the I/O to complete) and then complete |
| 1737 | * the fault w/o having to go through memory_object_data_request again |
| 1738 | */ |
| 1739 | assert(first_m != VM_PAGE_NULL); |
| 1740 | assert(VM_PAGE_OBJECT(first_m) == first_object); |
| 1741 | |
| 1742 | vm_object_lock(first_object); |
| 1743 | VM_PAGE_FREE(first_m); |
| 1744 | vm_object_paging_end(first_object); |
| 1745 | vm_object_unlock(first_object); |
| 1746 | |
| 1747 | first_m = VM_PAGE_NULL; |
| 1748 | force_fault_retry = TRUE; |
| 1749 | |
| 1750 | vm_fault_page_forced_retry++; |
| 1751 | } |
| 1752 | |
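| | /*
| | * If an earlier pass through this loop already issued a data
| | * request, drop to a single-page, random-access request this time
| | * so we don't keep re-issuing large speculative clusters.
| | */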
| 1753 | if (data_already_requested == TRUE) { |
| 1754 | orig_behavior = fault_info->behavior; |
| 1755 | orig_cluster_size = fault_info->cluster_size; |
| 1756 | |
| 1757 | fault_info->behavior = VM_BEHAVIOR_RANDOM; |
| 1758 | fault_info->cluster_size = PAGE_SIZE; |
| 1759 | } |
| 1760 | /* |
| 1761 | * Call the memory manager to retrieve the data. |
| 1762 | */ |
| 1763 | rc = memory_object_data_request( |
| 1764 | pager, |
| 1765 | offset + object->paging_offset, |
| 1766 | PAGE_SIZE, |
| 1767 | access_required | wants_copy_flag, |
| 1768 | (memory_object_fault_info_t)fault_info); |
| 1769 | |
| 1770 | if (data_already_requested == TRUE) { |
| 1771 | fault_info->behavior = orig_behavior; |
| 1772 | fault_info->cluster_size = orig_cluster_size; |
| 1773 | } else |
| 1774 | data_already_requested = TRUE; |
| 1775 | |
| 1776 | DTRACE_VM2(maj_fault, int, 1, (uint64_t *), NULL); |
| 1777 | #if TRACEFAULTPAGE |
| 1778 | dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc); /* (TEST/DEBUG) */ |
| 1779 | #endif |
| 1780 | vm_object_lock(object); |
| 1781 | |
| 1782 | if (object->object_is_shared_cache) { |
| 1783 | clear_thread_rwlock_boost(); |
| 1784 | } |
| 1785 | |
| 1786 | data_requested: |
| 1787 | if (rc != KERN_SUCCESS) { |
| 1788 | |
| 1789 | vm_fault_cleanup(object, first_m); |
| 1790 | thread_interrupt_level(interruptible_state); |
| 1791 | |
| 1792 | return ((rc == MACH_SEND_INTERRUPTED) ? |
| 1793 | VM_FAULT_INTERRUPTED : |
| 1794 | VM_FAULT_MEMORY_ERROR); |
| 1795 | } else { |
| 1796 | clock_sec_t tv_sec; |
| 1797 | clock_usec_t tv_usec; |
| 1798 | |
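| | /*
| | * A successful request for a real page-in resets this thread's
| | * page-creation throttle window (see vm_page_throttled()).
| | */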
| 1799 | if (my_fault_type == DBG_PAGEIN_FAULT) { |
| 1800 | clock_get_system_microtime(&tv_sec, &tv_usec); |
| 1801 | current_thread()->t_page_creation_time = tv_sec; |
| 1802 | current_thread()->t_page_creation_count = 0; |
| 1803 | } |
| 1804 | } |
| 1805 | if ((interruptible != THREAD_UNINT) && (current_thread()->sched_flags & TH_SFLAG_ABORT)) { |
| 1806 | |
| 1807 | vm_fault_cleanup(object, first_m); |
| 1808 | thread_interrupt_level(interruptible_state); |
| 1809 | |
| 1810 | return (VM_FAULT_INTERRUPTED); |
| 1811 | } |
| 1812 | if (force_fault_retry == TRUE) { |
| 1813 | |
| 1814 | vm_fault_cleanup(object, first_m); |
| 1815 | thread_interrupt_level(interruptible_state); |
| 1816 | |
| 1817 | return (VM_FAULT_RETRY); |
| 1818 | } |
| 1819 | if (m == VM_PAGE_NULL && object->phys_contiguous) { |
| 1820 | /* |
| 1821 | * No page here means that the object we |
| 1822 | * initially looked up was "physically |
| 1823 | * contiguous" (i.e. device memory). However, |
| 1824 | * with Virtual VRAM, the object might not |
| 1825 | * be backed by that device memory anymore, |
| 1826 | * so we're done here only if the object is |
| 1827 | * still "phys_contiguous". |
| 1828 | * Otherwise, if the object is no longer |
| 1829 | * "phys_contiguous", we need to retry the |
| 1830 | * page fault against the object's new backing |
| 1831 | * store (different memory object). |
| 1832 | */ |
| 1833 | phys_contig_object: |
| 1834 | goto done; |
| 1835 | } |
| 1836 | /* |
| 1837 | * potentially a pagein fault |
| 1838 | * if we make it through the state checks |
| 1839 | * above, then we'll count it as such
| 1840 | */ |
| 1841 | my_fault = my_fault_type; |
| 1842 | |
| 1843 | /* |
| 1844 | * Retry with same object/offset, since new data may |
| 1845 | * be in a different page (i.e., m is meaningless at |
| 1846 | * this point). |
| 1847 | */ |
| 1848 | continue; |
| 1849 | } |
| 1850 | dont_look_for_page: |
| 1851 | /* |
| 1852 | * We get here if the object has no pager, or if an existence map
| 1853 | * exists and indicates the page isn't present on the pager,
| 1854 | * or if we're unwiring a page. If a pager exists but there
| 1855 | * is no existence map, then the m->vmp_absent case above handles
| 1856 | * the ZF case when the pager can't provide the page.
| 1857 | */ |
| 1858 | #if TRACEFAULTPAGE |
| 1859 | dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */ |
| 1860 | #endif |
| 1861 | if (object == first_object) |
| 1862 | first_m = m; |
| 1863 | else |
| 1864 | assert(m == VM_PAGE_NULL); |
| 1865 | |
| 1866 | XPR(XPR_VM_FAULT, |
| 1867 | "vm_f_page: no pager obj 0x%X, offset 0x%X, page 0x%X, next_obj 0x%X\n" , |
| 1868 | object, offset, m, |
| 1869 | object->shadow, 0); |
| 1870 | |
| 1871 | next_object = object->shadow; |
| 1872 | |
| 1873 | if (next_object == VM_OBJECT_NULL) { |
| 1874 | /* |
| 1875 | * we've hit the bottom of the shadow chain,
| 1876 | * fill the page in the top object with zeros. |
| 1877 | */ |
| 1878 | assert(!must_be_resident); |
| 1879 | |
| 1880 | if (object != first_object) { |
| 1881 | vm_object_paging_end(object); |
| 1882 | vm_object_unlock(object); |
| 1883 | |
| 1884 | object = first_object; |
| 1885 | offset = first_offset; |
| 1886 | vm_object_lock(object); |
| 1887 | } |
| 1888 | m = first_m; |
| 1889 | assert(VM_PAGE_OBJECT(m) == object); |
| 1890 | first_m = VM_PAGE_NULL; |
| 1891 | |
| 1892 | /* |
| 1893 | * check for any conditions that prevent
| 1894 | * us from creating a new zero-fill page;
| 1895 | * vm_fault_check will do all of the
| 1896 | * fault cleanup in the case of an error condition,
| 1897 | * including resetting the thread_interrupt_level
| 1898 | */ |
| 1899 | error = vm_fault_check(object, m, first_m, interruptible_state, (type_of_fault == NULL) ? TRUE : FALSE); |
| 1900 | |
| 1901 | if (error != VM_FAULT_SUCCESS) |
| 1902 | return (error); |
| 1903 | |
| 1904 | if (m == VM_PAGE_NULL) { |
| 1905 | m = vm_page_grab_options(grab_options); |
| 1906 | |
| 1907 | if (m == VM_PAGE_NULL) { |
| 1908 | vm_fault_cleanup(object, VM_PAGE_NULL); |
| 1909 | thread_interrupt_level(interruptible_state); |
| 1910 | |
| 1911 | return (VM_FAULT_MEMORY_SHORTAGE); |
| 1912 | } |
| 1913 | vm_page_insert(m, object, offset); |
| 1914 | } |
| 1915 | if (fault_info->mark_zf_absent && no_zero_fill == TRUE) |
| 1916 | m->vmp_absent = TRUE; |
| 1917 | |
| 1918 | my_fault = vm_fault_zero_page(m, no_zero_fill); |
| 1919 | |
| 1920 | break; |
| 1921 | |
| 1922 | } else { |
| 1923 | /* |
| 1924 | * Move on to the next object. Lock the next |
| 1925 | * object before unlocking the current one. |
| 1926 | */ |
| 1927 | if ((object != first_object) || must_be_resident) |
| 1928 | vm_object_paging_end(object); |
| 1929 | |
| 1930 | offset += object->vo_shadow_offset; |
| 1931 | fault_info->lo_offset += object->vo_shadow_offset; |
| 1932 | fault_info->hi_offset += object->vo_shadow_offset; |
| 1933 | access_required = VM_PROT_READ; |
| 1934 | |
| 1935 | vm_object_lock(next_object); |
| 1936 | vm_object_unlock(object); |
| 1937 | |
| 1938 | object = next_object; |
| 1939 | vm_object_paging_begin(object); |
| 1940 | } |
| 1941 | } |
| 1942 | |
| 1943 | /* |
| 1944 | * PAGE HAS BEEN FOUND. |
| 1945 | * |
| 1946 | * This page (m) is: |
| 1947 | * busy, so that we can play with it; |
| 1948 | * not absent, so that nobody else will fill it; |
| 1949 | * possibly eligible for pageout; |
| 1950 | * |
| 1951 | * The top-level page (first_m) is: |
| 1952 | * VM_PAGE_NULL if the page was found in the |
| 1953 | * top-level object; |
| 1954 | * busy, not absent, and ineligible for pageout. |
| 1955 | * |
| 1956 | * The current object (object) is locked. A paging |
| 1957 | * reference is held for the current and top-level |
| 1958 | * objects. |
| 1959 | */ |
| 1960 | |
| 1961 | #if TRACEFAULTPAGE |
| 1962 | dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */ |
| 1963 | #endif |
| 1964 | #if EXTRA_ASSERTIONS |
| 1965 | assert(m->vmp_busy && !m->vmp_absent); |
| 1966 | assert((first_m == VM_PAGE_NULL) || |
| 1967 | (first_m->vmp_busy && !first_m->vmp_absent && |
| 1968 | !first_m->vmp_active && !first_m->vmp_inactive && !first_m->vmp_secluded)); |
| 1969 | #endif /* EXTRA_ASSERTIONS */ |
| 1970 | |
| 1971 | XPR(XPR_VM_FAULT, |
| 1972 | "vm_f_page: FOUND obj 0x%X, off 0x%X, page 0x%X, 1_obj 0x%X, 1_m 0x%X\n" , |
| 1973 | object, offset, m, |
| 1974 | first_object, first_m); |
| 1975 | |
| 1976 | /* |
| 1977 | * If the page is being written, but isn't |
| 1978 | * already owned by the top-level object, |
| 1979 | * we have to copy it into a new page owned |
| 1980 | * by the top-level object. |
| 1981 | */ |
| 1982 | if (object != first_object) { |
| 1983 | |
| 1984 | #if TRACEFAULTPAGE |
| 1985 | dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */ |
| 1986 | #endif |
| 1987 | if (fault_type & VM_PROT_WRITE) { |
| 1988 | vm_page_t copy_m; |
| 1989 | |
| 1990 | /* |
| 1991 | * We only really need to copy if we |
| 1992 | * want to write it. |
| 1993 | */ |
| 1994 | assert(!must_be_resident); |
| 1995 | |
| 1996 | /* |
| 1997 | * If we try to collapse first_object at this |
| 1998 | * point, we may deadlock when we try to get |
| 1999 | * the lock on an intermediate object (since we |
| 2000 | * have the bottom object locked). We can't |
| 2001 | * unlock the bottom object, because the page |
| 2002 | * we found may move (by collapse) if we do. |
| 2003 | * |
| 2004 | * Instead, we first copy the page. Then, when |
| 2005 | * we have no more use for the bottom object, |
| 2006 | * we unlock it and try to collapse. |
| 2007 | * |
| 2008 | * Note that we copy the page even if we didn't |
| 2009 | * need to... that's the breaks. |
| 2010 | */ |
| 2011 | |
| 2012 | /* |
| 2013 | * Allocate a page for the copy |
| 2014 | */ |
| 2015 | copy_m = vm_page_grab_options(grab_options); |
| 2016 | |
| 2017 | if (copy_m == VM_PAGE_NULL) { |
| 2018 | RELEASE_PAGE(m); |
| 2019 | |
| 2020 | vm_fault_cleanup(object, first_m); |
| 2021 | thread_interrupt_level(interruptible_state); |
| 2022 | |
| 2023 | return (VM_FAULT_MEMORY_SHORTAGE); |
| 2024 | } |
| 2025 | XPR(XPR_VM_FAULT, |
| 2026 | "vm_f_page: page_copy obj 0x%X, offset 0x%X, m 0x%X, copy_m 0x%X\n" , |
| 2027 | object, offset, |
| 2028 | m, copy_m, 0); |
| 2029 | |
| 2030 | vm_page_copy(m, copy_m); |
| 2031 | |
| 2032 | /* |
| 2033 | * If another map is truly sharing this |
| 2034 | * page with us, we have to flush all |
| 2035 | * uses of the original page, since we |
| 2036 | * can't distinguish those which want the |
| 2037 | * original from those which need the |
| 2038 | * new copy. |
| 2039 | * |
| 2040 | * XXXO If we know that only one map has |
| 2041 | * access to this page, then we could |
| 2042 | * avoid the pmap_disconnect() call. |
| 2043 | */ |
| 2044 | if (m->vmp_pmapped) |
| 2045 | pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)); |
| 2046 | |
| 2047 | if (m->vmp_clustered) { |
| 2048 | VM_PAGE_COUNT_AS_PAGEIN(m); |
| 2049 | VM_PAGE_CONSUME_CLUSTERED(m); |
| 2050 | } |
| 2051 | assert(!m->vmp_cleaning); |
| 2052 | |
| 2053 | /* |
| 2054 | * We no longer need the old page or object. |
| 2055 | */ |
| 2056 | RELEASE_PAGE(m); |
| 2057 | |
| 2058 | /* |
| 2059 | * This check helps with marking the object as having a sequential pattern.
| 2060 | * Normally we'd miss doing this below because this fault is about COW to
| 2061 | * the first_object, i.e. bring the page in from disk and push it to the
| 2062 | * object above, but don't update the file object's sequential pattern.
| 2063 | */ |
| 2064 | if (object->internal == FALSE) { |
| 2065 | vm_fault_is_sequential(object, offset, fault_info->behavior); |
| 2066 | } |
| 2067 | |
| 2068 | vm_object_paging_end(object); |
| 2069 | vm_object_unlock(object); |
| 2070 | |
| 2071 | my_fault = DBG_COW_FAULT; |
| 2072 | VM_STAT_INCR(cow_faults); |
| 2073 | DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL); |
| 2074 | current_task()->cow_faults++; |
| 2075 | |
| 2076 | object = first_object; |
| 2077 | offset = first_offset; |
| 2078 | |
| 2079 | vm_object_lock(object); |
| 2080 | /* |
| 2081 | * get rid of the placeholder
| 2082 | * page that we soldered in earlier |
| 2083 | */ |
| 2084 | VM_PAGE_FREE(first_m); |
| 2085 | first_m = VM_PAGE_NULL; |
| 2086 | |
| 2087 | /* |
| 2088 | * and replace it with the |
| 2089 | * page we just copied into |
| 2090 | */ |
| 2091 | assert(copy_m->vmp_busy); |
| 2092 | vm_page_insert(copy_m, object, offset); |
| 2093 | SET_PAGE_DIRTY(copy_m, TRUE); |
| 2094 | |
| 2095 | m = copy_m; |
| 2096 | /* |
| 2097 | * Now that we've gotten the copy out of the |
| 2098 | * way, let's try to collapse the top object. |
| 2099 | * But we have to play ugly games with |
| 2100 | * paging_in_progress to do that... |
| 2101 | */ |
| 2102 | vm_object_paging_end(object); |
| 2103 | vm_object_collapse(object, offset, TRUE); |
| 2104 | vm_object_paging_begin(object); |
| 2105 | |
| 2106 | } else |
| 2107 | *protection &= (~VM_PROT_WRITE); |
| 2108 | } |
| 2109 | /* |
| 2110 | * Now check whether the page needs to be pushed into the |
| 2111 | * copy object. The use of asymmetric copy on write for |
| 2112 | * shared temporary objects means that we may do two copies to |
| 2113 | * satisfy the fault; one above to get the page from a |
| 2114 | * shadowed object, and one here to push it into the copy. |
| 2115 | */ |
| 2116 | try_failed_count = 0; |
| 2117 | |
| 2118 | while ((copy_object = first_object->copy) != VM_OBJECT_NULL) { |
| 2119 | vm_object_offset_t copy_offset; |
| 2120 | vm_page_t copy_m; |
| 2121 | |
| 2122 | #if TRACEFAULTPAGE |
| 2123 | dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type); /* (TEST/DEBUG) */ |
| 2124 | #endif |
| 2125 | /* |
| 2126 | * If the page is being written, but hasn't been |
| 2127 | * copied to the copy-object, we have to copy it there. |
| 2128 | */ |
| 2129 | if ((fault_type & VM_PROT_WRITE) == 0) { |
| 2130 | *protection &= ~VM_PROT_WRITE; |
| 2131 | break; |
| 2132 | } |
| 2133 | |
| 2134 | /* |
| 2135 | * If the page was guaranteed to be resident, |
| 2136 | * we must have already performed the copy. |
| 2137 | */ |
| 2138 | if (must_be_resident) |
| 2139 | break; |
| 2140 | |
| 2141 | /* |
| 2142 | * Try to get the lock on the copy_object. |
| 2143 | */ |
| 2144 | if (!vm_object_lock_try(copy_object)) { |
| 2145 | |
| 2146 | vm_object_unlock(object); |
| 2147 | try_failed_count++; |
| 2148 | |
| 2149 | mutex_pause(try_failed_count); /* wait a bit */ |
| 2150 | vm_object_lock(object); |
| 2151 | |
| 2152 | continue; |
| 2153 | } |
| 2154 | try_failed_count = 0; |
| 2155 | |
| 2156 | /* |
| 2157 | * Make another reference to the copy-object, |
| 2158 | * to keep it from disappearing during the |
| 2159 | * copy. |
| 2160 | */ |
| 2161 | vm_object_reference_locked(copy_object); |
| 2162 | |
| 2163 | /* |
| 2164 | * Does the page exist in the copy? |
| 2165 | */ |
| 2166 | copy_offset = first_offset - copy_object->vo_shadow_offset; |
| 2167 | |
| 2168 | if (copy_object->vo_size <= copy_offset) |
| 2169 | /* |
| 2170 | * Copy object doesn't cover this page -- do nothing. |
| 2171 | */ |
| 2172 | ; |
| 2173 | else if ((copy_m = vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) { |
| 2174 | /* |
| 2175 | * Page currently exists in the copy object |
| 2176 | */ |
| 2177 | if (copy_m->vmp_busy) { |
| 2178 | /* |
| 2179 | * If the page is being brought |
| 2180 | * in, wait for it and then retry. |
| 2181 | */ |
| 2182 | RELEASE_PAGE(m); |
| 2183 | |
| 2184 | /* |
| 2185 | * take an extra ref so object won't die |
| 2186 | */ |
| 2187 | vm_object_reference_locked(copy_object); |
| 2188 | vm_object_unlock(copy_object); |
| 2189 | vm_fault_cleanup(object, first_m); |
| 2190 | counter(c_vm_fault_page_block_backoff_kernel++); |
| 2191 | |
| 2192 | vm_object_lock(copy_object); |
| 2193 | assert(copy_object->ref_count > 0); |
| 2194 | VM_OBJ_RES_DECR(copy_object); |
| 2195 | vm_object_lock_assert_exclusive(copy_object); |
| 2196 | copy_object->ref_count--; |
| 2197 | assert(copy_object->ref_count > 0); |
| 2198 | copy_m = vm_page_lookup(copy_object, copy_offset); |
| 2199 | |
| 2200 | if (copy_m != VM_PAGE_NULL && copy_m->vmp_busy) { |
| 2201 | PAGE_ASSERT_WAIT(copy_m, interruptible); |
| 2202 | |
| 2203 | vm_object_unlock(copy_object); |
| 2204 | wait_result = thread_block(THREAD_CONTINUE_NULL); |
| 2205 | vm_object_deallocate(copy_object); |
| 2206 | |
| 2207 | goto backoff; |
| 2208 | } else { |
| 2209 | vm_object_unlock(copy_object); |
| 2210 | vm_object_deallocate(copy_object); |
| 2211 | thread_interrupt_level(interruptible_state); |
| 2212 | |
| 2213 | return (VM_FAULT_RETRY); |
| 2214 | } |
| 2215 | } |
| 2216 | } |
| 2217 | else if (!PAGED_OUT(copy_object, copy_offset)) { |
| 2218 | /* |
| 2219 | * If PAGED_OUT is TRUE, then the page used to exist |
| 2220 | * in the copy-object, and has already been paged out. |
| 2221 | * We don't need to repeat this. If PAGED_OUT is |
| 2222 | * FALSE, then either we don't know (!pager_created, |
| 2223 | * for example) or it hasn't been paged out. |
| 2224 | * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT) |
| 2225 | * We must copy the page to the copy object. |
| 2226 | * |
| 2227 | * Allocate a page for the copy |
| 2228 | */ |
| 2229 | copy_m = vm_page_alloc(copy_object, copy_offset); |
| 2230 | |
| 2231 | if (copy_m == VM_PAGE_NULL) { |
| 2232 | RELEASE_PAGE(m); |
| 2233 | |
| 2234 | VM_OBJ_RES_DECR(copy_object); |
| 2235 | vm_object_lock_assert_exclusive(copy_object); |
| 2236 | copy_object->ref_count--; |
| 2237 | assert(copy_object->ref_count > 0); |
| 2238 | |
| 2239 | vm_object_unlock(copy_object); |
| 2240 | vm_fault_cleanup(object, first_m); |
| 2241 | thread_interrupt_level(interruptible_state); |
| 2242 | |
| 2243 | return (VM_FAULT_MEMORY_SHORTAGE); |
| 2244 | } |
| 2245 | /* |
| 2246 | * Must copy page into copy-object. |
| 2247 | */ |
| 2248 | vm_page_copy(m, copy_m); |
| 2249 | |
| 2250 | /* |
| 2251 | * If the old page was in use by any users |
| 2252 | * of the copy-object, it must be removed |
| 2253 | * from all pmaps. (We can't know which |
| 2254 | * pmaps use it.) |
| 2255 | */ |
| 2256 | if (m->vmp_pmapped) |
| 2257 | pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)); |
| 2258 | |
| 2259 | if (m->vmp_clustered) { |
| 2260 | VM_PAGE_COUNT_AS_PAGEIN(m); |
| 2261 | VM_PAGE_CONSUME_CLUSTERED(m); |
| 2262 | } |
| 2263 | /* |
| 2264 | * If there's a pager, then immediately |
| 2265 | * page out this page, using the "initialize" |
| 2266 | * option. Else, we use the copy. |
| 2267 | */ |
| 2268 | if ((!copy_object->pager_ready) |
| 2269 | || VM_COMPRESSOR_PAGER_STATE_GET(copy_object, copy_offset) == VM_EXTERNAL_STATE_ABSENT |
| 2270 | ) { |
| 2271 | |
| 2272 | vm_page_lockspin_queues(); |
| 2273 | assert(!m->vmp_cleaning); |
| 2274 | vm_page_activate(copy_m); |
| 2275 | vm_page_unlock_queues(); |
| 2276 | |
| 2277 | SET_PAGE_DIRTY(copy_m, TRUE); |
| 2278 | PAGE_WAKEUP_DONE(copy_m); |
| 2279 | |
| 2280 | } else { |
| 2281 | |
| 2282 | assert(copy_m->vmp_busy == TRUE); |
| 2283 | assert(!m->vmp_cleaning); |
| 2284 | |
| 2285 | /* |
| 2286 | * dirty is protected by the object lock |
| 2287 | */ |
| 2288 | SET_PAGE_DIRTY(copy_m, TRUE); |
| 2289 | |
| 2290 | /* |
| 2291 | * The page is already ready for pageout: |
| 2292 | * not on pageout queues and busy. |
| 2293 | * Unlock everything except the |
| 2294 | * copy_object itself. |
| 2295 | */ |
| 2296 | vm_object_unlock(object); |
| 2297 | |
| 2298 | /* |
| 2299 | * Write the page to the copy-object, |
| 2300 | * flushing it from the kernel. |
| 2301 | */ |
| 2302 | vm_pageout_initialize_page(copy_m); |
| 2303 | |
| 2304 | /* |
| 2305 | * Since the pageout may have |
| 2306 | * temporarily dropped the |
| 2307 | * copy_object's lock, we |
| 2308 | * check whether we'll have |
| 2309 | * to deallocate the hard way. |
| 2310 | */ |
| 2311 | if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) { |
| 2312 | vm_object_unlock(copy_object); |
| 2313 | vm_object_deallocate(copy_object); |
| 2314 | vm_object_lock(object); |
| 2315 | |
| 2316 | continue; |
| 2317 | } |
| 2318 | /* |
| 2319 | * Pick back up the old object's |
| 2320 | * lock. [It is safe to do so, |
| 2321 | * since it must be deeper in the |
| 2322 | * object tree.] |
| 2323 | */ |
| 2324 | vm_object_lock(object); |
| 2325 | } |
| 2326 | |
| 2327 | /* |
| 2328 | * Because we're pushing a page upward |
| 2329 | * in the object tree, we must restart |
| 2330 | * any faults that are waiting here. |
| 2331 | * [Note that this is an expansion of |
| 2332 | * PAGE_WAKEUP that uses the THREAD_RESTART |
| 2333 | * wait result]. Can't turn off the page's |
| 2334 | * busy bit because we're not done with it. |
| 2335 | */ |
| 2336 | if (m->vmp_wanted) { |
| 2337 | m->vmp_wanted = FALSE; |
| 2338 | thread_wakeup_with_result((event_t) m, THREAD_RESTART); |
| 2339 | } |
| 2340 | } |
| 2341 | /* |
| 2342 | * The reference count on copy_object must be |
| 2343 | * at least 2: one for our extra reference, |
| 2344 | * and at least one from the outside world |
| 2345 | * (we checked that when we last locked |
| 2346 | * copy_object). |
| 2347 | */ |
| 2348 | vm_object_lock_assert_exclusive(copy_object); |
| 2349 | copy_object->ref_count--; |
| 2350 | assert(copy_object->ref_count > 0); |
| 2351 | |
| 2352 | VM_OBJ_RES_DECR(copy_object); |
| 2353 | vm_object_unlock(copy_object); |
| 2354 | |
| 2355 | break; |
| 2356 | } |
| 2357 | |
| 2358 | done: |
| 2359 | *result_page = m; |
| 2360 | *top_page = first_m; |
| 2361 | |
| 2362 | XPR(XPR_VM_FAULT, |
| 2363 | "vm_f_page: DONE obj 0x%X, offset 0x%X, m 0x%X, first_m 0x%X\n" , |
| 2364 | object, offset, m, first_m, 0); |
| 2365 | |
| 2366 | if (m != VM_PAGE_NULL) { |
| 2367 | assert(VM_PAGE_OBJECT(m) == object); |
| 2368 | |
| 2369 | retval = VM_FAULT_SUCCESS; |
| 2370 | |
| 2371 | if (my_fault == DBG_PAGEIN_FAULT) { |
| 2372 | |
| 2373 | VM_PAGE_COUNT_AS_PAGEIN(m); |
| 2374 | |
| 2375 | if (object->internal) |
| 2376 | my_fault = DBG_PAGEIND_FAULT; |
| 2377 | else |
| 2378 | my_fault = DBG_PAGEINV_FAULT; |
| 2379 | |
| 2380 | /* |
| 2381 | * evaluate access pattern and update state;
| 2382 | * vm_fault_deactivate_behind depends on the |
| 2383 | * state being up to date |
| 2384 | */ |
| 2385 | vm_fault_is_sequential(object, offset, fault_info->behavior); |
| 2386 | vm_fault_deactivate_behind(object, offset, fault_info->behavior); |
| 2387 | |
| 2388 | } else if (type_of_fault == NULL && my_fault == DBG_CACHE_HIT_FAULT) { |
| 2389 | /* |
| 2390 | * we weren't called from vm_fault, so handle the |
| 2391 | * accounting here for hits in the cache |
| 2392 | */ |
| 2393 | if (m->vmp_clustered) { |
| 2394 | VM_PAGE_COUNT_AS_PAGEIN(m); |
| 2395 | VM_PAGE_CONSUME_CLUSTERED(m); |
| 2396 | } |
| 2397 | vm_fault_is_sequential(object, offset, fault_info->behavior); |
| 2398 | vm_fault_deactivate_behind(object, offset, fault_info->behavior); |
| 2399 | |
| 2400 | } else if (my_fault == DBG_COMPRESSOR_FAULT || my_fault == DBG_COMPRESSOR_SWAPIN_FAULT) { |
| 2401 | |
| 2402 | VM_STAT_INCR(decompressions); |
| 2403 | } |
| 2404 | if (type_of_fault) |
| 2405 | *type_of_fault = my_fault; |
| 2406 | } else { |
| 2407 | retval = VM_FAULT_SUCCESS_NO_VM_PAGE; |
| 2408 | assert(first_m == VM_PAGE_NULL); |
| 2409 | assert(object == first_object); |
| 2410 | } |
| 2411 | |
| 2412 | thread_interrupt_level(interruptible_state); |
| 2413 | |
| 2414 | #if TRACEFAULTPAGE |
| 2415 | dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0); /* (TEST/DEBUG) */ |
| 2416 | #endif |
| 2417 | return retval; |
| 2418 | |
| 2419 | backoff: |
| 2420 | thread_interrupt_level(interruptible_state); |
| 2421 | |
| 2422 | if (wait_result == THREAD_INTERRUPTED) |
| 2423 | return (VM_FAULT_INTERRUPTED); |
| 2424 | return (VM_FAULT_RETRY); |
| 2425 | |
| 2426 | #undef RELEASE_PAGE |
| 2427 | } |
| 2428 | |
| 2429 | |
| 2430 | |
| 2431 | /* |
| 2432 | * CODE SIGNING: |
| 2433 | * When soft faulting a page, we have to validate the page if: |
| 2434 | * 1. the page is being mapped in user space |
| 2435 | * 2. the page hasn't already been found to be "tainted" |
| 2436 | * 3. the page belongs to a code-signed object |
| 2437 | * 4. the page has not been validated yet or has been mapped for write. |
| 2438 | */ |
| 2439 | #define VM_FAULT_NEED_CS_VALIDATION(pmap, page, page_obj) \ |
| 2440 | ((pmap) != kernel_pmap /*1*/ && \ |
| 2441 | !(page)->vmp_cs_tainted /*2*/ && \ |
| 2442 | (page_obj)->code_signed /*3*/ && \ |
| 2443 | (!(page)->vmp_cs_validated || (page)->vmp_wpmapped /*4*/)) |
| 2444 | |
| 2445 | |
| 2446 | /* |
| 2447 | * page queue lock must NOT be held |
| 2448 | * m->vmp_object must be locked |
| 2449 | * |
| 2450 | * NOTE: m->vmp_object could be locked "shared" only if we are called |
| 2451 | * from vm_fault() as part of a soft fault. If so, we must be |
| 2452 | * careful not to modify the VM object in any way that is not |
| 2453 | * legal under a shared lock... |
| 2454 | */ |
| 2455 | extern int panic_on_cs_killed; |
| 2456 | extern int proc_selfpid(void); |
| 2457 | extern char *proc_name_address(void *p); |
| 2458 | unsigned long cs_enter_tainted_rejected = 0; |
| 2459 | unsigned long cs_enter_tainted_accepted = 0; |
| 2460 | kern_return_t |
| 2461 | vm_fault_enter(vm_page_t m, |
| 2462 | pmap_t pmap, |
| 2463 | vm_map_offset_t vaddr, |
| 2464 | vm_prot_t prot, |
| 2465 | vm_prot_t caller_prot, |
| 2466 | boolean_t wired, |
| 2467 | boolean_t change_wiring, |
| 2468 | vm_tag_t wire_tag, |
| 2469 | vm_object_fault_info_t fault_info, |
| 2470 | boolean_t *need_retry, |
| 2471 | int *type_of_fault) |
| 2472 | { |
| 2473 | kern_return_t kr, pe_result; |
| 2474 | boolean_t previously_pmapped = m->vmp_pmapped; |
| 2475 | boolean_t must_disconnect = 0; |
| 2476 | boolean_t map_is_switched, map_is_switch_protected; |
| 2477 | boolean_t cs_violation; |
| 2478 | int cs_enforcement_enabled; |
| 2479 | vm_prot_t fault_type; |
| 2480 | vm_object_t object; |
| 2481 | boolean_t no_cache = fault_info->no_cache; |
| 2482 | boolean_t cs_bypass = fault_info->cs_bypass; |
| 2483 | int pmap_options = fault_info->pmap_options; |
| 2484 | |
| 2485 | fault_type = change_wiring ? VM_PROT_NONE : caller_prot; |
| 2486 | object = VM_PAGE_OBJECT(m); |
| 2487 | |
| 2488 | vm_object_lock_assert_held(object); |
| 2489 | |
| 2490 | #if KASAN |
| 2491 | if (pmap == kernel_pmap) { |
| 2492 | kasan_notify_address(vaddr, PAGE_SIZE); |
| 2493 | } |
| 2494 | #endif |
| 2495 | |
| 2496 | LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED); |
| 2497 | |
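| | /*
| | * Fictitious guard pages are never entered into the pmap; there
| | * is nothing to map here, so just report success.
| | */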
| 2498 | if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) { |
| 2499 | assert(m->vmp_fictitious); |
| 2500 | return KERN_SUCCESS; |
| 2501 | } |
| 2502 | |
| 2503 | if (*type_of_fault == DBG_ZERO_FILL_FAULT) { |
| 2504 | |
| 2505 | vm_object_lock_assert_exclusive(object); |
| 2506 | |
| 2507 | } else if ((fault_type & VM_PROT_WRITE) == 0 && |
| 2508 | (!m->vmp_wpmapped |
| 2509 | #if VM_OBJECT_ACCESS_TRACKING |
| 2510 | || object->access_tracking |
| 2511 | #endif /* VM_OBJECT_ACCESS_TRACKING */ |
| 2512 | )) { |
| 2513 | /* |
| 2514 | * This is not a "write" fault, so we |
| 2515 | * might not have taken the object lock |
| 2516 | * exclusively and we might not be able |
| 2517 | * to update the "wpmapped" bit in |
| 2518 | * vm_fault_enter(). |
| 2519 | * Let's just grant read access to |
| 2520 | * the page for now and we'll |
| 2521 | * soft-fault again if we need write |
| 2522 | * access later... |
| 2523 | */ |
| 2524 | |
| 2525 | /* This had better not be a JIT page. */ |
| 2526 | if (!pmap_has_prot_policy(prot)) { |
| 2527 | prot &= ~VM_PROT_WRITE; |
| 2528 | } else { |
| 2529 | assert(cs_bypass); |
| 2530 | } |
| 2531 | } |
| 2532 | if (m->vmp_pmapped == FALSE) { |
| 2533 | |
| 2534 | if (m->vmp_clustered) { |
| 2535 | if (*type_of_fault == DBG_CACHE_HIT_FAULT) { |
| 2536 | /* |
| 2537 | * found it in the cache, but this |
| 2538 | * is the first fault-in of the page (m->vmp_pmapped == FALSE) |
| 2539 | * so it must have come in as part of |
| 2540 | * a cluster... account 1 pagein against it |
| 2541 | */ |
| 2542 | if (object->internal) |
| 2543 | *type_of_fault = DBG_PAGEIND_FAULT; |
| 2544 | else |
| 2545 | *type_of_fault = DBG_PAGEINV_FAULT; |
| 2546 | |
| 2547 | VM_PAGE_COUNT_AS_PAGEIN(m); |
| 2548 | } |
| 2549 | VM_PAGE_CONSUME_CLUSTERED(m); |
| 2550 | } |
| 2551 | } |
| 2552 | |
| 2553 | if (*type_of_fault != DBG_COW_FAULT) { |
| 2554 | DTRACE_VM2(as_fault, int, 1, (uint64_t *), NULL); |
| 2555 | |
| 2556 | if (pmap == kernel_pmap) { |
| 2557 | DTRACE_VM2(kernel_asflt, int, 1, (uint64_t *), NULL); |
| 2558 | } |
| 2559 | } |
| 2560 | |
| 2561 | /* Validate code signature if necessary. */ |
| 2562 | if (!cs_bypass && |
| 2563 | VM_FAULT_NEED_CS_VALIDATION(pmap, m, object)) { |
| 2564 | vm_object_lock_assert_exclusive(object); |
| 2565 | |
| 2566 | if (m->vmp_cs_validated) { |
| 2567 | vm_cs_revalidates++; |
| 2568 | } |
| 2569 | |
| 2570 | /* VM map is locked, so 1 ref will remain on VM object - |
| 2571 | * so no harm if vm_page_validate_cs drops the object lock */ |
| 2572 | |
| 2573 | #if PMAP_CS |
| 2574 | if (fault_info->pmap_cs_associated && |
| 2575 | pmap_cs_enforced(pmap) && |
| 2576 | !m->vmp_cs_validated && |
| 2577 | !m->vmp_cs_tainted && |
| 2578 | !m->vmp_cs_nx && |
| 2579 | (prot & VM_PROT_EXECUTE) && |
| 2580 | (caller_prot & VM_PROT_EXECUTE)) { |
| 2581 | /* |
| 2582 | * With pmap_cs, the pmap layer will validate the |
| 2583 | * code signature for any executable pmap mapping. |
| 2584 | * No need for us to validate this page too: |
| 2585 | * in pmap_cs we trust... |
| 2586 | */ |
| 2587 | vm_cs_defer_to_pmap_cs++; |
| 2588 | } else { |
| 2589 | vm_cs_defer_to_pmap_cs_not++; |
| 2590 | vm_page_validate_cs(m); |
| 2591 | } |
| 2592 | #else /* PMAP_CS */ |
| 2593 | vm_page_validate_cs(m); |
| 2594 | #endif /* PMAP_CS */ |
| 2595 | } |
| 2596 | |
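| | /*
| | * page_immutable: the page has been code-sign validated, so its
| | * contents are not supposed to change.
| | * page_nx: the code signature marks this page as non-executable.
| | */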
| 2597 | #define page_immutable(m,prot) ((m)->vmp_cs_validated /*&& ((prot) & VM_PROT_EXECUTE)*/) |
| 2598 | #define page_nx(m) ((m)->vmp_cs_nx) |
| 2599 | |
| 2600 | map_is_switched = ((pmap != vm_map_pmap(current_task()->map)) && |
| 2601 | (pmap == vm_map_pmap(current_thread()->map))); |
| 2602 | map_is_switch_protected = current_thread()->map->switch_protect; |
| 2603 | |
| 2604 | /* If the map is switched, and is switch-protected, we must protect |
| 2605 | * some pages from being write-faulted: immutable pages because by |
| 2606 | * definition they may not be written, and executable pages because that |
| 2607 | * would provide a way to inject unsigned code. |
| 2608 | * If the page is immutable, we can simply return. However, we can't |
| 2609 | * immediately determine whether a page is executable anywhere. But, |
| 2610 | * we can disconnect it everywhere and remove the executable protection |
| 2611 | * from the current map. We do that below right before we do the |
| 2612 | * PMAP_ENTER. |
| 2613 | */ |
| 2614 | cs_enforcement_enabled = cs_process_enforcement(NULL); |
| 2615 | |
| 2616 | if(cs_enforcement_enabled && map_is_switched && |
| 2617 | map_is_switch_protected && page_immutable(m, prot) && |
| 2618 | (prot & VM_PROT_WRITE)) |
| 2619 | { |
| 2620 | return KERN_CODESIGN_ERROR; |
| 2621 | } |
| 2622 | |
| 2623 | if (cs_enforcement_enabled && page_nx(m) && (prot & VM_PROT_EXECUTE)) { |
| 2624 | if (cs_debug) |
| 2625 | printf("page marked to be NX, not letting it be mapped EXEC\n" ); |
| 2626 | return KERN_CODESIGN_ERROR; |
| 2627 | } |
| 2628 | |
| 2629 | /* A page could be tainted, or pose a risk of being tainted later. |
| 2630 | * Check whether the receiving process wants it, and make it feel |
| 2631 | * the consequences (that happens in cs_invalid_page()).
| 2632 | * For CS Enforcement, two other conditions will |
| 2633 | * cause that page to be tainted as well: |
| 2634 | * - pmapping an unsigned page executable - this means unsigned code; |
| 2635 | * - writeable mapping of a validated page - the content of that page |
| 2636 | * can be changed without the kernel noticing, therefore unsigned |
| 2637 | * code can be created |
| 2638 | */ |
| 2639 | if (cs_bypass) { |
| 2640 | /* code-signing is bypassed */ |
| 2641 | cs_violation = FALSE; |
| 2642 | } else if (m->vmp_cs_tainted) { |
| 2643 | /* tainted page */ |
| 2644 | cs_violation = TRUE; |
| 2645 | } else if (!cs_enforcement_enabled) { |
| 2646 | /* no further code-signing enforcement */ |
| 2647 | cs_violation = FALSE; |
| 2648 | } else if (page_immutable(m, prot) && |
| 2649 | ((prot & VM_PROT_WRITE) || |
| 2650 | m->vmp_wpmapped)) { |
| 2651 | /* |
| 2652 | * The page should be immutable, but is in danger of being |
| 2653 | * modified. |
| 2654 | * This is the case where we want policy from the code |
| 2655 | * directory - is the page immutable or not? For now we have |
| 2656 | * to assume that code pages will be immutable, data pages not. |
| 2657 | * We'll assume a page is a code page if it has a code directory |
| 2658 | * and we fault for execution. |
| 2659 | * That is good enough since if we faulted the code page for |
| 2660 | * writing in another map before, it is wpmapped; if we fault |
| 2661 | * it for writing in this map later it will also be faulted for |
| 2662 | * executing at the same time; and if we fault for writing in |
| 2663 | * another map later, we will disconnect it from this pmap so |
| 2664 | * we'll notice the change. |
| 2665 | */ |
| 2666 | cs_violation = TRUE; |
| 2667 | } else if (!m->vmp_cs_validated && |
| 2668 | (prot & VM_PROT_EXECUTE) |
| 2669 | #if PMAP_CS |
| 2670 | /* |
| 2671 | * Executable pages will be validated by pmap_cs; |
| 2672 | * in pmap_cs we trust... |
| 2673 | * If pmap_cs is turned off, this is a code-signing |
| 2674 | * violation. |
| 2675 | */ |
| 2676 | && ! (pmap_cs_enforced(pmap)) |
| 2677 | #endif /* PMAP_CS */ |
| 2678 | ) { |
| 2679 | cs_violation = TRUE; |
| 2680 | } else { |
| 2681 | cs_violation = FALSE; |
| 2682 | } |
| 2683 | |
| 2684 | if (cs_violation) { |
| 2685 | /* We will have a tainted page. Have to handle the special case |
| 2686 | * of a switched map now. If the map is not switched, standard |
| 2687 | * procedure applies - call cs_invalid_page(). |
| 2688 | * If the map is switched, the real owner is invalid already. |
| 2689 | * There is no point in invalidating the switching process since |
| 2690 | * it will not be executing from the map. So we don't call |
| 2691 | * cs_invalid_page() in that case. */ |
| 2692 | boolean_t reject_page, cs_killed; |
| 2693 | if(map_is_switched) { |
| 2694 | assert(pmap==vm_map_pmap(current_thread()->map)); |
| 2695 | assert(!(prot & VM_PROT_WRITE) || (map_is_switch_protected == FALSE)); |
| 2696 | reject_page = FALSE; |
| 2697 | } else { |
| 2698 | if (cs_debug > 5) |
| 2699 | printf("vm_fault: signed: %s validate: %s tainted: %s wpmapped: %s prot: 0x%x\n" , |
| 2700 | object->code_signed ? "yes" : "no" , |
| 2701 | m->vmp_cs_validated ? "yes" : "no" , |
| 2702 | m->vmp_cs_tainted ? "yes" : "no" , |
| 2703 | m->vmp_wpmapped ? "yes" : "no" , |
| 2704 | (int)prot); |
| 2705 | reject_page = cs_invalid_page((addr64_t) vaddr, &cs_killed); |
| 2706 | } |
| 2707 | |
| 2708 | if (reject_page) { |
| 2709 | /* reject the invalid page: abort the page fault */ |
| 2710 | int pid; |
| 2711 | const char *procname; |
| 2712 | task_t task; |
| 2713 | vm_object_t file_object, shadow; |
| 2714 | vm_object_offset_t file_offset; |
| 2715 | char *pathname, *filename; |
| 2716 | vm_size_t pathname_len, filename_len; |
| 2717 | boolean_t truncated_path; |
| 2718 | #define __PATH_MAX 1024 |
| 2719 | struct timespec mtime, cs_mtime; |
| 2720 | int shadow_depth; |
| 2721 | os_reason_t codesigning_exit_reason = OS_REASON_NULL; |
| 2722 | |
| 2723 | kr = KERN_CODESIGN_ERROR; |
| 2724 | cs_enter_tainted_rejected++; |
| 2725 | |
| 2726 | /* get process name and pid */ |
| 2727 | procname = "?" ; |
| 2728 | task = current_task(); |
| 2729 | pid = proc_selfpid(); |
| 2730 | if (task->bsd_info != NULL) |
| 2731 | procname = proc_name_address(task->bsd_info); |
| 2732 | |
| 2733 | /* get file's VM object */ |
| 2734 | file_object = object; |
| 2735 | file_offset = m->vmp_offset; |
| 2736 | for (shadow = file_object->shadow, |
| 2737 | shadow_depth = 0; |
| 2738 | shadow != VM_OBJECT_NULL; |
| 2739 | shadow = file_object->shadow, |
| 2740 | shadow_depth++) { |
| 2741 | vm_object_lock_shared(shadow); |
| 2742 | if (file_object != object) { |
| 2743 | vm_object_unlock(file_object); |
| 2744 | } |
| 2745 | file_offset += file_object->vo_shadow_offset; |
| 2746 | file_object = shadow; |
| 2747 | } |
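| | /*
| | * file_object is now the object at the bottom of the shadow chain
| | * (the one backed by the file's pager, if any), and file_offset is
| | * the faulting page's offset within it; the extra lock taken on it
| | * is dropped after the report below has been generated.
| | */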
| 2748 | |
| 2749 | mtime.tv_sec = 0; |
| 2750 | mtime.tv_nsec = 0; |
| 2751 | cs_mtime.tv_sec = 0; |
| 2752 | cs_mtime.tv_nsec = 0; |
| 2753 | |
| 2754 | /* get file's pathname and/or filename */ |
| 2755 | pathname = NULL; |
| 2756 | filename = NULL; |
| 2757 | pathname_len = 0; |
| 2758 | filename_len = 0; |
| 2759 | truncated_path = FALSE; |
| 2760 | /* no pager -> no file -> no pathname, use "<nil>" in that case */ |
| 2761 | if (file_object->pager != NULL) { |
| 2762 | pathname = (char *)kalloc(__PATH_MAX * 2); |
| 2763 | if (pathname) { |
| 2764 | pathname[0] = '\0'; |
| 2765 | pathname_len = __PATH_MAX; |
| 2766 | filename = pathname + pathname_len; |
| 2767 | filename_len = __PATH_MAX; |
| 2768 | } |
| 2769 | vnode_pager_get_object_name(file_object->pager, |
| 2770 | pathname, |
| 2771 | pathname_len, |
| 2772 | filename, |
| 2773 | filename_len, |
| 2774 | &truncated_path); |
| 2775 | if (pathname) { |
| 2776 | /* safety first... */ |
| 2777 | pathname[__PATH_MAX-1] = '\0'; |
| 2778 | filename[__PATH_MAX-1] = '\0'; |
| 2779 | } |
| 2780 | vnode_pager_get_object_mtime(file_object->pager, |
| 2781 | &mtime, |
| 2782 | &cs_mtime); |
| 2783 | } |
| 2784 | printf("CODE SIGNING: process %d[%s]: " |
| 2785 | "rejecting invalid page at address 0x%llx " |
| 2786 | "from offset 0x%llx in file \"%s%s%s\" " |
| 2787 | "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) " |
| 2788 | "(signed:%d validated:%d tainted:%d nx:%d " |
| 2789 | "wpmapped:%d dirty:%d depth:%d)\n" , |
| 2790 | pid, procname, (addr64_t) vaddr, |
| 2791 | file_offset, |
| 2792 | (pathname ? pathname : "<nil>" ), |
| 2793 | (truncated_path ? "/.../" : "" ), |
| 2794 | (truncated_path ? filename : "" ), |
| 2795 | cs_mtime.tv_sec, cs_mtime.tv_nsec, |
| 2796 | ((cs_mtime.tv_sec == mtime.tv_sec && |
| 2797 | cs_mtime.tv_nsec == mtime.tv_nsec) |
| 2798 | ? "==" |
| 2799 | : "!=" ), |
| 2800 | mtime.tv_sec, mtime.tv_nsec, |
| 2801 | object->code_signed, |
| 2802 | m->vmp_cs_validated, |
| 2803 | m->vmp_cs_tainted, |
| 2804 | m->vmp_cs_nx, |
| 2805 | m->vmp_wpmapped, |
| 2806 | m->vmp_dirty, |
| 2807 | shadow_depth); |
| 2808 | |
| 2809 | /* |
| 2810 | * We currently only generate an exit reason if cs_invalid_page directly killed a process. If cs_invalid_page |
| 2811 | * did not kill the process (more the case on desktop), vm_fault_enter will not satisfy the fault and whether the |
| 2812 | * process dies is dependent on whether there is a signal handler registered for SIGSEGV and how that handler |
| 2813 | * will deal with the segmentation fault. |
| 2814 | */ |
| 2815 | if (cs_killed) { |
| 2816 | KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE, |
| 2817 | pid, OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_INVALID_PAGE, 0, 0); |
| 2818 | |
| 2819 | codesigning_exit_reason = os_reason_create(OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_INVALID_PAGE); |
| 2820 | if (codesigning_exit_reason == NULL) { |
| 2821 | printf("vm_fault_enter: failed to allocate codesigning exit reason\n" ); |
| 2822 | } else { |
| 2823 | mach_vm_address_t data_addr = 0; |
| 2824 | struct codesigning_exit_reason_info *ceri = NULL; |
| 2825 | uint32_t reason_buffer_size_estimate = kcdata_estimate_required_buffer_size(1, sizeof(*ceri)); |
| 2826 | |
| 2827 | if (os_reason_alloc_buffer_noblock(codesigning_exit_reason, reason_buffer_size_estimate)) { |
| 2828 | printf("vm_fault_enter: failed to allocate buffer for codesigning exit reason\n" ); |
| 2829 | } else { |
| 2830 | if (KERN_SUCCESS == kcdata_get_memory_addr(&codesigning_exit_reason->osr_kcd_descriptor, |
| 2831 | EXIT_REASON_CODESIGNING_INFO, sizeof(*ceri), &data_addr)) { |
| 2832 | ceri = (struct codesigning_exit_reason_info *)data_addr; |
| 2833 | static_assert(__PATH_MAX == sizeof(ceri->ceri_pathname)); |
| 2834 | |
| 2835 | ceri->ceri_virt_addr = vaddr; |
| 2836 | ceri->ceri_file_offset = file_offset; |
| 2837 | if (pathname) |
| 2838 | strncpy((char *)&ceri->ceri_pathname, pathname, sizeof(ceri->ceri_pathname)); |
| 2839 | else |
| 2840 | ceri->ceri_pathname[0] = '\0'; |
| 2841 | if (filename) |
| 2842 | strncpy((char *)&ceri->ceri_filename, filename, sizeof(ceri->ceri_filename)); |
| 2843 | else |
| 2844 | ceri->ceri_filename[0] = '\0'; |
| 2845 | ceri->ceri_path_truncated = (truncated_path); |
| 2846 | ceri->ceri_codesig_modtime_secs = cs_mtime.tv_sec; |
| 2847 | ceri->ceri_codesig_modtime_nsecs = cs_mtime.tv_nsec; |
| 2848 | ceri->ceri_page_modtime_secs = mtime.tv_sec; |
| 2849 | ceri->ceri_page_modtime_nsecs = mtime.tv_nsec; |
| 2850 | ceri->ceri_object_codesigned = (object->code_signed); |
| 2851 | ceri->ceri_page_codesig_validated = (m->vmp_cs_validated); |
| 2852 | ceri->ceri_page_codesig_tainted = (m->vmp_cs_tainted); |
| 2853 | ceri->ceri_page_codesig_nx = (m->vmp_cs_nx); |
| 2854 | ceri->ceri_page_wpmapped = (m->vmp_wpmapped); |
| 2855 | ceri->ceri_page_slid = 0; |
| 2856 | ceri->ceri_page_dirty = (m->vmp_dirty); |
| 2857 | ceri->ceri_page_shadow_depth = shadow_depth; |
| 2858 | } else { |
| 2859 | #if DEBUG || DEVELOPMENT |
| 2860 | panic("vm_fault_enter: failed to allocate kcdata for codesigning exit reason" ); |
| 2861 | #else |
| 2862 | printf("vm_fault_enter: failed to allocate kcdata for codesigning exit reason\n" ); |
| 2863 | #endif /* DEBUG || DEVELOPMENT */ |
| 2864 | /* Free the buffer */ |
| 2865 | os_reason_alloc_buffer_noblock(codesigning_exit_reason, 0); |
| 2866 | } |
| 2867 | } |
| 2868 | } |
| 2869 | |
| 2870 | set_thread_exit_reason(current_thread(), codesigning_exit_reason, FALSE); |
| 2871 | } |
| 2872 | if (panic_on_cs_killed && |
| 2873 | object->object_is_shared_cache) { |
| 2874 | panic("CODE SIGNING: process %d[%s]: " |
| 2875 | "rejecting invalid page at address 0x%llx " |
| 2876 | "from offset 0x%llx in file \"%s%s%s\" " |
| 2877 | "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) " |
| 2878 | "(signed:%d validated:%d tainted:%d nx:%d" |
| 2879 | "wpmapped:%d dirty:%d depth:%d)\n" , |
| 2880 | pid, procname, (addr64_t) vaddr, |
| 2881 | file_offset, |
| 2882 | (pathname ? pathname : "<nil>" ), |
| 2883 | (truncated_path ? "/.../" : "" ), |
| 2884 | (truncated_path ? filename : "" ), |
| 2885 | cs_mtime.tv_sec, cs_mtime.tv_nsec, |
| 2886 | ((cs_mtime.tv_sec == mtime.tv_sec && |
| 2887 | cs_mtime.tv_nsec == mtime.tv_nsec) |
| 2888 | ? "==" |
| 2889 | : "!=" ), |
| 2890 | mtime.tv_sec, mtime.tv_nsec, |
| 2891 | object->code_signed, |
| 2892 | m->vmp_cs_validated, |
| 2893 | m->vmp_cs_tainted, |
| 2894 | m->vmp_cs_nx, |
| 2895 | m->vmp_wpmapped, |
| 2896 | m->vmp_dirty, |
| 2897 | shadow_depth); |
| 2898 | } |
| 2899 | |
| 2900 | if (file_object != object) { |
| 2901 | vm_object_unlock(file_object); |
| 2902 | } |
| 2903 | if (pathname_len != 0) { |
| 2904 | kfree(pathname, __PATH_MAX * 2); |
| 2905 | pathname = NULL; |
| 2906 | filename = NULL; |
| 2907 | } |
| 2908 | } else { |
| 2909 | /* proceed with the invalid page */ |
| 2910 | kr = KERN_SUCCESS; |
| 2911 | if (!m->vmp_cs_validated && |
| 2912 | !object->code_signed) { |
| 2913 | /* |
| 2914 | * This page has not been (fully) validated but |
| 2915 | * does not belong to a code-signed object |
| 2916 | * so it should not be forcefully considered |
| 2917 | * as tainted. |
| 2918 | * We're just concerned about it here because |
| 2919 | * we've been asked to "execute" it but that |
| 2920 | * does not mean that it should cause other |
| 2921 | * accesses to fail. |
| 2922 | * This happens when a debugger sets a |
| 2923 | * breakpoint and we then execute code in |
| 2924 | * that page. Marking the page as "tainted" |
| 2925 | * would cause any inspection tool ("leaks", |
| 2926 | * "vmmap", "CrashReporter", ...) to get killed |
| 2927 | * due to code-signing violation on that page, |
| 2928 | * even though they're just reading it and not |
| 2929 | * executing from it. |
| 2930 | */ |
| 2931 | } else { |
| 2932 | /* |
| 2933 | * Page might have been tainted before or not; |
| 2934 | * now it definitively is. If the page wasn't |
| 2935 | * tainted, we must disconnect it from all |
| 2936 | * pmaps later, to force existing mappings |
| 2937 | * through that code path for re-consideration |
| 2938 | * of the validity of that page. |
| 2939 | */ |
| 2940 | must_disconnect = !m->vmp_cs_tainted; |
| 2941 | m->vmp_cs_tainted = TRUE; |
| 2942 | } |
| 2943 | cs_enter_tainted_accepted++; |
| 2944 | } |
| 2945 | if (kr != KERN_SUCCESS) { |
| 2946 | if (cs_debug) { |
| 2947 | printf("CODESIGNING: vm_fault_enter(0x%llx): " |
| 2948 | "*** INVALID PAGE ***\n" , |
| 2949 | (long long)vaddr); |
| 2950 | } |
| 2951 | #if !SECURE_KERNEL |
| 2952 | if (cs_enforcement_panic) { |
| 2953 | panic("CODESIGNING: panicking on invalid page\n" ); |
| 2954 | } |
| 2955 | #endif |
| 2956 | } |
| 2957 | |
| 2958 | } else { |
| 2959 | /* proceed with the valid page */ |
| 2960 | kr = KERN_SUCCESS; |
| 2961 | } |
| 2962 | |
| 2963 | boolean_t page_queues_locked = FALSE; |
| 2964 | #define __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED() \ |
| 2965 | MACRO_BEGIN \ |
| 2966 | if (! page_queues_locked) { \ |
| 2967 | page_queues_locked = TRUE; \ |
| 2968 | vm_page_lockspin_queues(); \ |
| 2969 | } \ |
| 2970 | MACRO_END |
| 2971 | #define __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED() \ |
| 2972 | MACRO_BEGIN \ |
| 2973 | if (page_queues_locked) { \ |
| 2974 | page_queues_locked = FALSE; \ |
| 2975 | vm_page_unlock_queues(); \ |
| 2976 | } \ |
| 2977 | MACRO_END |
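| | /*
| | * The page-queues lock is taken lazily: only the branches below
| | * that actually touch the page queues spin for it, and it is
| | * dropped once at the end via the matching UNLOCK macro.
| | */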
| 2978 | |
| 2979 | /* |
| 2980 | * Hold queues lock to manipulate |
| 2981 | * the page queues. Change wiring |
| 2982 | * case is obvious. |
| 2983 | */ |
| 2984 | assert((m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) || object != compressor_object); |
| 2985 | |
| 2986 | #if CONFIG_BACKGROUND_QUEUE |
| 2987 | vm_page_update_background_state(m); |
| 2988 | #endif |
| 2989 | if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) { |
| 2990 | /* |
| 2991 | * Compressor pages are neither wired |
| 2992 | * nor pageable and should never change. |
| 2993 | */ |
| 2994 | assert(object == compressor_object); |
| 2995 | } else if (change_wiring) { |
| 2996 | __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED(); |
| 2997 | |
| 2998 | if (wired) { |
| 2999 | if (kr == KERN_SUCCESS) { |
| 3000 | vm_page_wire(m, wire_tag, TRUE); |
| 3001 | } |
| 3002 | } else { |
| 3003 | vm_page_unwire(m, TRUE); |
| 3004 | } |
| 3005 | /* we keep the page queues lock, if we need it later */ |
| 3006 | |
| 3007 | } else { |
| 3008 | if (object->internal == TRUE) { |
| 3009 | /* |
| 3010 | * don't allow anonymous pages on |
| 3011 | * the speculative queues |
| 3012 | */ |
| 3013 | no_cache = FALSE; |
| 3014 | } |
| 3015 | if (kr != KERN_SUCCESS) { |
| 3016 | __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED(); |
| 3017 | vm_page_deactivate(m); |
| 3018 | /* we keep the page queues lock, if we need it later */ |
| 3019 | } else if (((m->vmp_q_state == VM_PAGE_NOT_ON_Q) || |
| 3020 | (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) || |
| 3021 | (m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) || |
| 3022 | ((m->vmp_q_state != VM_PAGE_ON_THROTTLED_Q) && no_cache)) && |
| 3023 | !VM_PAGE_WIRED(m)) { |
| 3024 | |
| 3025 | if (vm_page_local_q && |
| 3026 | (*type_of_fault == DBG_COW_FAULT || |
| 3027 | *type_of_fault == DBG_ZERO_FILL_FAULT) ) { |
| 3028 | struct vpl *lq; |
| 3029 | uint32_t lid; |
| 3030 | |
| 3031 | assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q); |
| 3032 | |
| 3033 | __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED(); |
| 3034 | vm_object_lock_assert_exclusive(object); |
| 3035 | |
| 3036 | /* |
| 3037 | * we got a local queue to stuff this |
| 3038 | * new page on... |
| 3039 | * it's safe to manipulate local and
| 3040 | * local_id at this point since we're |
| 3041 | * behind an exclusive object lock and |
| 3042 | * the page is not on any global queue. |
| 3043 | * |
| 3044 | * we'll use the current cpu number to
| 3045 | * select the queue... note that we don't
| 3046 | * need to disable preemption... we're |
| 3047 | * going to be behind the local queue's |
| 3048 | * lock to do the real work |
| 3049 | */ |
| 3050 | lid = cpu_number(); |
| 3051 | |
| 3052 | lq = &vm_page_local_q[lid].vpl_un.vpl; |
| 3053 | |
| 3054 | VPL_LOCK(&lq->vpl_lock); |
| 3055 | |
| 3056 | vm_page_check_pageable_safe(m); |
| 3057 | vm_page_queue_enter(&lq->vpl_queue, m, |
| 3058 | vm_page_t, vmp_pageq); |
| 3059 | m->vmp_q_state = VM_PAGE_ON_ACTIVE_LOCAL_Q; |
| 3060 | m->vmp_local_id = lid; |
| 3061 | lq->vpl_count++; |
| 3062 | |
| 3063 | if (object->internal) |
| 3064 | lq->vpl_internal_count++; |
| 3065 | else |
| 3066 | lq->vpl_external_count++; |
| 3067 | |
| 3068 | VPL_UNLOCK(&lq->vpl_lock); |
| 3069 | |
| 3070 | if (lq->vpl_count > vm_page_local_q_soft_limit) |
| 3071 | { |
| 3072 | /* |
| 3073 | * we're beyond the soft limit |
| 3074 | * for the local queue |
| 3075 | * vm_page_reactivate_local will |
| 3076 | * 'try' to take the global page |
| 3077 | * queue lock... if it can't |
| 3078 | * that's ok... we'll let the |
| 3079 | * queue continue to grow up |
| 3080 | * to the hard limit... at that |
| 3081 | * point we'll wait for the |
| 3082 | * lock... once we've got the |
| 3083 | * lock, we'll transfer all of |
| 3084 | * the pages from the local |
| 3085 | * queue to the global active |
| 3086 | * queue |
| 3087 | */ |
| 3088 | vm_page_reactivate_local(lid, FALSE, FALSE); |
| 3089 | } |
| 3090 | } else { |
| 3091 | |
| 3092 | __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED(); |
| 3093 | |
| 3094 | /* |
| 3095 | * test again now that we hold the |
| 3096 | * page queue lock |
| 3097 | */ |
| 3098 | if (!VM_PAGE_WIRED(m)) { |
| 3099 | if (m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) { |
| 3100 | vm_page_queues_remove(m, FALSE); |
| 3101 | |
| 3102 | VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1); |
| 3103 | VM_PAGEOUT_DEBUG(vm_pageout_cleaned_fault_reactivated, 1); |
| 3104 | } |
| 3105 | |
| 3106 | if ( !VM_PAGE_ACTIVE_OR_INACTIVE(m) || |
| 3107 | no_cache) { |
| 3108 | /* |
| 3109 | * If this is a no_cache mapping |
| 3110 | * and the page has never been |
| 3111 | * mapped before or was |
| 3112 | * previously a no_cache page, |
| 3113 | * then we want to leave pages |
| 3114 | * in the speculative state so |
| 3115 | * that they can be readily |
| 3116 | * recycled if free memory runs |
| 3117 | * low. Otherwise the page is |
| 3118 | * activated as normal. |
| 3119 | */ |
| 3120 | |
| 3121 | if (no_cache && |
| 3122 | (!previously_pmapped || |
| 3123 | m->vmp_no_cache)) { |
| 3124 | m->vmp_no_cache = TRUE; |
| 3125 | |
| 3126 | if (m->vmp_q_state != VM_PAGE_ON_SPECULATIVE_Q) |
| 3127 | vm_page_speculate(m, FALSE); |
| 3128 | |
| 3129 | } else if ( !VM_PAGE_ACTIVE_OR_INACTIVE(m)) { |
| 3130 | vm_page_activate(m); |
| 3131 | } |
| 3132 | } |
| 3133 | } |
| 3134 | /* we keep the page queues lock, if we need it later */ |
| 3135 | } |
| 3136 | } |
| 3137 | } |
| 3138 | /* we're done with the page queues lock, if we ever took it */ |
| 3139 | __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED(); |
| 3140 | |
| 3141 | |
| 3142 | /* If we have a KERN_SUCCESS from the previous checks, we either have |
| 3143 | * a good page, or a tainted page that has been accepted by the process. |
| 3144 | * In both cases the page will be entered into the pmap. |
| 3145 | * If the page is writeable, we need to disconnect it from other pmaps |
| 3146 | * now so those processes can take note. |
| 3147 | */ |
| 3148 | if (kr == KERN_SUCCESS) { |
| 3149 | /* |
| 3150 | * NOTE: we may only hold the vm_object lock SHARED |
| 3151 | * at this point, so we need the phys_page lock to |
| 3152 | * properly serialize updating the pmapped and |
| 3153 | * xpmapped bits |
| 3154 | */ |
| 3155 | if ((prot & VM_PROT_EXECUTE) && !m->vmp_xpmapped) { |
| 3156 | ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m); |
| 3157 | |
| 3158 | pmap_lock_phys_page(phys_page); |
| 3159 | /* |
| 3160 | * go ahead and take the opportunity |
| 3161 | * to set 'pmapped' here so that we don't |
| 3162 | * need to grab this lock a 2nd time |
| 3163 | * just below |
| 3164 | */ |
| 3165 | m->vmp_pmapped = TRUE; |
| 3166 | |
| 3167 | if (!m->vmp_xpmapped) { |
| 3168 | |
| 3169 | m->vmp_xpmapped = TRUE; |
| 3170 | |
| 3171 | pmap_unlock_phys_page(phys_page); |
| 3172 | |
| 3173 | if (!object->internal) |
| 3174 | OSAddAtomic(1, &vm_page_xpmapped_external_count); |
| 3175 | |
| 3176 | #if defined(__arm__) || defined(__arm64__) |
| 3177 | pmap_sync_page_data_phys(phys_page); |
| 3178 | #else |
| 3179 | if (object->internal && |
| 3180 | object->pager != NULL) { |
| 3181 | /* |
| 3182 | * This page could have been |
| 3183 | * uncompressed by the |
| 3184 | * compressor pager and its |
| 3185 | * contents might be only in |
| 3186 | * the data cache. |
| 3187 | * Since it's being mapped for |
| 3188 | * "execute" for the first time, |
| 3189 | * make sure the icache is in |
| 3190 | * sync. |
| 3191 | */ |
| 3192 | assert(VM_CONFIG_COMPRESSOR_IS_PRESENT); |
| 3193 | pmap_sync_page_data_phys(phys_page); |
| 3194 | } |
| 3195 | #endif |
| 3196 | } else |
| 3197 | pmap_unlock_phys_page(phys_page); |
| 3198 | } else { |
| 3199 | if (m->vmp_pmapped == FALSE) { |
| 3200 | ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m); |
| 3201 | |
| 3202 | pmap_lock_phys_page(phys_page); |
| 3203 | m->vmp_pmapped = TRUE; |
| 3204 | pmap_unlock_phys_page(phys_page); |
| 3205 | } |
| 3206 | } |
| 3207 | |
| 3208 | if (fault_type & VM_PROT_WRITE) { |
| 3209 | |
| 3210 | if (m->vmp_wpmapped == FALSE) { |
| 3211 | vm_object_lock_assert_exclusive(object); |
| 3212 | if (!object->internal && object->pager) { |
| 3213 | task_update_logical_writes(current_task(), PAGE_SIZE, TASK_WRITE_DEFERRED, vnode_pager_lookup_vnode(object->pager)); |
| 3214 | } |
| 3215 | m->vmp_wpmapped = TRUE; |
| 3216 | } |
| 3217 | if (must_disconnect) { |
| 3218 | /* |
| 3219 | * We can only get here |
| 3220 | * because of the CSE logic |
| 3221 | */ |
| 3222 | assert(cs_enforcement_enabled); |
| 3223 | pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)); |
| 3224 | /* |
| 3225 | * If we are faulting for a write, we can clear |
| 3226 | * the execute bit - that will ensure the page is |
| 3227 | * checked again before being executable, which |
| 3228 | * protects against a map switch. |
| 3229 | * This only happens the first time the page |
| 3230 | * gets tainted, so we won't get stuck here |
| 3231 | * to make an already writeable page executable. |
| 3232 | */ |
| 3233 | if (!cs_bypass) { |
| 3234 | assert(!pmap_has_prot_policy(prot)); |
| 3235 | prot &= ~VM_PROT_EXECUTE; |
| 3236 | } |
| 3237 | } |
| 3238 | } |
| 3239 | assert(VM_PAGE_OBJECT(m) == object); |
| 3240 | |
| 3241 | #if VM_OBJECT_ACCESS_TRACKING |
| 3242 | if (object->access_tracking) { |
| 3243 | DTRACE_VM2(access_tracking, vm_map_offset_t, vaddr, int, fault_type); |
| 3244 | if (fault_type & VM_PROT_WRITE) { |
| 3245 | object->access_tracking_writes++; |
| 3246 | vm_object_access_tracking_writes++; |
| 3247 | } else { |
| 3248 | object->access_tracking_reads++; |
| 3249 | vm_object_access_tracking_reads++; |
| 3250 | } |
| 3251 | } |
| 3252 | #endif /* VM_OBJECT_ACCESS_TRACKING */ |
| 3253 | |
| 3254 | #if PMAP_CS |
| 3255 | /* |
| 3256 | * If CS enforcement is on, we don't ask for an executable page if the |
| 3257 | * fault does not call for execution, because that can fail in |
| 3258 | * situations where the caller only actually wanted read access. |
| 3259 | * However, it may be better to instead retry without execute on |
| 3260 | * failure, or pass a flag into pmap_enter to do the right thing. |
| 3261 | */ |
| 3262 | // TODO: <rdar://problem/30997388> maybe do something better than masking out VM_PROT_EXECUTE on non-execute faults |
| 3263 | if (pmap_cs_enforced(pmap) && !(caller_prot & VM_PROT_EXECUTE)) { |
| 3264 | prot &= ~VM_PROT_EXECUTE; |
| 3265 | } |
| 3266 | #endif |
| 3267 | |
| 3268 | /* Prevent a deadlock by not |
| 3269 | * holding the object lock if we need to wait for a page in |
| 3270 | * pmap_enter() - <rdar://problem/7138958> */ |
| 3271 | PMAP_ENTER_OPTIONS(pmap, vaddr, m, prot, fault_type, 0, |
| 3272 | wired, |
| 3273 | pmap_options | PMAP_OPTIONS_NOWAIT, |
| 3274 | pe_result); |
| 3275 | #if __x86_64__ |
| 3276 | if (pe_result == KERN_INVALID_ARGUMENT && |
| 3277 | pmap == PMAP_NULL && |
| 3278 | wired) { |
| 3279 | /* |
| 3280 | * Wiring a page in a pmap-less VM map: |
| 3281 | * VMware's "vmmon" kernel extension does this |
| 3282 | * to grab pages. |
| 3283 | * Let it proceed even though the PMAP_ENTER() failed. |
| 3284 | */ |
| 3285 | pe_result = KERN_SUCCESS; |
| 3286 | } |
| 3287 | #endif /* __x86_64__ */ |
| 3288 | |
| 3289 | if (pe_result == KERN_RESOURCE_SHORTAGE) { |
| 3290 | |
| 3291 | if (need_retry) { |
| 3292 | /* |
| 3293 | * this will be non-null in the case where we hold the lock |
| 3294 | * on the top-object in this chain... we can't just drop |
| 3295 | * the lock on the object we're inserting the page into |
| 3296 | * and recall the PMAP_ENTER since we can still cause |
| 3297 | * a deadlock if one of the critical paths tries to |
| 3298 | * acquire the lock on the top-object and we're blocked |
| 3299 | * in PMAP_ENTER waiting for memory... our only recourse |
| 3300 | * is to deal with it at a higher level where we can |
| 3301 | * drop both locks. |
| 3302 | */ |
| 3303 | *need_retry = TRUE; |
| 3304 | vm_pmap_enter_retried++; |
| 3305 | goto after_the_pmap_enter; |
| 3306 | } |
| 3307 | /* The nonblocking version of pmap_enter did not succeed, |
| 3308 | * and we don't need to drop other locks and retry |
| 3309 | * at the level above us, so |
| 3310 | * use the blocking version instead. Requires marking |
| 3311 | * the page busy and unlocking the object */ |
| 3312 | boolean_t was_busy = m->vmp_busy; |
| 3313 | |
| 3314 | vm_object_lock_assert_exclusive(object); |
| 3315 | |
| 3316 | m->vmp_busy = TRUE; |
| 3317 | vm_object_unlock(object); |
| 3318 | |
| 3319 | PMAP_ENTER_OPTIONS(pmap, vaddr, m, prot, fault_type, |
| 3320 | 0, wired, |
| 3321 | pmap_options, pe_result); |
| 3322 | |
| 3323 | assert(VM_PAGE_OBJECT(m) == object); |
| 3324 | |
| 3325 | /* Take the object lock again. */ |
| 3326 | vm_object_lock(object); |
| 3327 | |
| 3328 | /* If the page was busy, someone else will wake it up. |
| 3329 | * Otherwise, we have to do it now. */ |
| 3330 | assert(m->vmp_busy); |
| 3331 | if (!was_busy) { |
| 3332 | PAGE_WAKEUP_DONE(m); |
| 3333 | } |
| 3334 | vm_pmap_enter_blocked++; |
| 3335 | } |
| 3336 | |
| 3337 | kr = pe_result; |
| 3338 | } |
| 3339 | |
| 3340 | after_the_pmap_enter: |
| 3341 | return kr; |
| 3342 | } |
| 3343 | |
| 3344 | void |
| 3345 | vm_pre_fault(vm_map_offset_t vaddr) |
| 3346 | { |
| 3347 | if (pmap_find_phys(current_map()->pmap, vaddr) == 0) { |
| 3348 | |
| 3349 | vm_fault(current_map(), /* map */ |
| 3350 | vaddr, /* vaddr */ |
| 3351 | VM_PROT_READ, /* fault_type */ |
| 3352 | FALSE, /* change_wiring */ |
| 3353 | VM_KERN_MEMORY_NONE, /* tag - not wiring */ |
| 3354 | THREAD_UNINT, /* interruptible */ |
| 3355 | NULL, /* caller_pmap */ |
| 3356 | 0 /* caller_pmap_addr */); |
| 3357 | } |
| 3358 | } |
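|  | /* |
|  | * Usage sketch (illustrative only; not taken from any caller in this file): |
|  | * a thread about to touch a freshly mapped address on a latency-sensitive |
|  | * path could pre-resolve the translation first: |
|  | * |
|  | *     vm_map_offset_t addr = ...;  // an address valid in current_map() |
|  | *     vm_pre_fault(addr);          // soft-faults the page in if there is |
|  | *                                  // no physical translation for it yet |
|  | */ |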
| 3359 | |
| 3360 | |
| 3361 | /* |
| 3362 | * Routine: vm_fault |
| 3363 | * Purpose: |
| 3364 | * Handle page faults, including pseudo-faults |
| 3365 | * used to change the wiring status of pages. |
| 3366 | * Returns: |
| 3367 | * Explicit continuations have been removed. |
| 3368 | * Implementation: |
| 3369 | * vm_fault and vm_fault_page save mucho state |
| 3370 | * in the moral equivalent of a closure. The state |
| 3371 | * structure is allocated when first entering vm_fault |
| 3372 | * and deallocated when leaving vm_fault. |
| 3373 | */ |
| 3374 | |
| 3375 | extern int _map_enter_debug; |
| 3376 | extern uint64_t get_current_unique_pid(void); |
| 3377 | |
| 3378 | unsigned long vm_fault_collapse_total = 0; |
| 3379 | unsigned long vm_fault_collapse_skipped = 0; |
| 3380 | |
| 3381 | |
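|  | /* |
|  | * vm_fault_external() and vm_fault() are thin wrappers around |
|  | * vm_fault_internal(): the former derives the wiring tag from the |
|  | * caller's backtrace via vm_tag_bt(), while the latter takes an |
|  | * explicit wire_tag from its caller. |
|  | */ |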
| 3382 | kern_return_t |
| 3383 | vm_fault_external( |
| 3384 | vm_map_t map, |
| 3385 | vm_map_offset_t vaddr, |
| 3386 | vm_prot_t fault_type, |
| 3387 | boolean_t change_wiring, |
| 3388 | int interruptible, |
| 3389 | pmap_t caller_pmap, |
| 3390 | vm_map_offset_t caller_pmap_addr) |
| 3391 | { |
| 3392 | return vm_fault_internal(map, vaddr, fault_type, change_wiring, vm_tag_bt(), |
| 3393 | interruptible, caller_pmap, caller_pmap_addr, |
| 3394 | NULL); |
| 3395 | } |
| 3396 | |
| 3397 | kern_return_t |
| 3398 | vm_fault( |
| 3399 | vm_map_t map, |
| 3400 | vm_map_offset_t vaddr, |
| 3401 | vm_prot_t fault_type, |
| 3402 | boolean_t change_wiring, |
| 3403 | vm_tag_t wire_tag, /* if wiring must pass tag != VM_KERN_MEMORY_NONE */ |
| 3404 | int interruptible, |
| 3405 | pmap_t caller_pmap, |
| 3406 | vm_map_offset_t caller_pmap_addr) |
| 3407 | { |
| 3408 | return vm_fault_internal(map, vaddr, fault_type, change_wiring, wire_tag, |
| 3409 | interruptible, caller_pmap, caller_pmap_addr, |
| 3410 | NULL); |
| 3411 | } |
| 3412 | |
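|  | /* |
|  | * The workhorse behind the wrappers above. When 'physpage_p' is non-NULL, |
|  | * it receives the physical page number of the page that gets entered |
|  | * (used by vm_map_wire_and_extract(), see the fast path below). |
|  | */ |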
| 3413 | kern_return_t |
| 3414 | vm_fault_internal( |
| 3415 | vm_map_t map, |
| 3416 | vm_map_offset_t vaddr, |
| 3417 | vm_prot_t caller_prot, |
| 3418 | boolean_t change_wiring, |
| 3419 | vm_tag_t wire_tag, /* if wiring must pass tag != VM_KERN_MEMORY_NONE */ |
| 3420 | int interruptible, |
| 3421 | pmap_t caller_pmap, |
| 3422 | vm_map_offset_t caller_pmap_addr, |
| 3423 | ppnum_t *physpage_p) |
| 3424 | { |
| 3425 | vm_map_version_t version; /* Map version for verification */ |
| 3426 | boolean_t wired; /* Should mapping be wired down? */ |
| 3427 | vm_object_t object; /* Top-level object */ |
| 3428 | vm_object_offset_t offset; /* Top-level offset */ |
| 3429 | vm_prot_t prot; /* Protection for mapping */ |
| 3430 | vm_object_t old_copy_object; /* Saved copy object */ |
| 3431 | vm_page_t result_page; /* Result of vm_fault_page */ |
| 3432 | vm_page_t top_page; /* Placeholder page */ |
| 3433 | kern_return_t kr; |
| 3434 | |
| 3435 | vm_page_t m; /* Fast access to result_page */ |
| 3436 | kern_return_t error_code; |
| 3437 | vm_object_t cur_object; |
| 3438 | vm_object_t m_object = NULL; |
| 3439 | vm_object_offset_t cur_offset; |
| 3440 | vm_page_t cur_m; |
| 3441 | vm_object_t new_object; |
| 3442 | int type_of_fault; |
| 3443 | pmap_t pmap; |
| 3444 | wait_interrupt_t interruptible_state; |
| 3445 | vm_map_t real_map = map; |
| 3446 | vm_map_t original_map = map; |
| 3447 | boolean_t object_locks_dropped = FALSE; |
| 3448 | vm_prot_t fault_type; |
| 3449 | vm_prot_t original_fault_type; |
| 3450 | struct vm_object_fault_info fault_info = {}; |
| 3451 | boolean_t need_collapse = FALSE; |
| 3452 | boolean_t need_retry = FALSE; |
| 3453 | boolean_t *need_retry_ptr = NULL; |
| 3454 | int object_lock_type = 0; |
| 3455 | int cur_object_lock_type; |
| 3456 | vm_object_t top_object = VM_OBJECT_NULL; |
| 3457 | vm_object_t written_on_object = VM_OBJECT_NULL; |
| 3458 | memory_object_t written_on_pager = NULL; |
| 3459 | vm_object_offset_t written_on_offset = 0; |
| 3460 | int throttle_delay; |
| 3461 | int compressed_count_delta; |
| 3462 | int grab_options; |
| 3463 | vm_map_offset_t trace_vaddr; |
| 3464 | vm_map_offset_t trace_real_vaddr; |
| 3465 | #if DEVELOPMENT || DEBUG |
| 3466 | vm_map_offset_t real_vaddr; |
| 3467 | |
| 3468 | real_vaddr = vaddr; |
| 3469 | #endif /* DEVELOPMENT || DEBUG */ |
| 3470 | trace_real_vaddr = vaddr; |
| 3471 | vaddr = vm_map_trunc_page(vaddr, PAGE_MASK); |
| 3472 | |
| 3473 | if (map == kernel_map) { |
| 3474 | trace_vaddr = VM_KERNEL_ADDRHIDE(vaddr); |
| 3475 | trace_real_vaddr = VM_KERNEL_ADDRHIDE(trace_real_vaddr); |
| 3476 | } else { |
| 3477 | trace_vaddr = vaddr; |
| 3478 | } |
| 3479 | |
| 3480 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, |
| 3481 | (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_START, |
| 3482 | ((uint64_t)trace_vaddr >> 32), |
| 3483 | trace_vaddr, |
| 3484 | (map == kernel_map), |
| 3485 | 0, |
| 3486 | 0); |
| 3487 | |
| 3488 | if (get_preemption_level() != 0) { |
| 3489 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, |
| 3490 | (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END, |
| 3491 | ((uint64_t)trace_vaddr >> 32), |
| 3492 | trace_vaddr, |
| 3493 | KERN_FAILURE, |
| 3494 | 0, |
| 3495 | 0); |
| 3496 | |
| 3497 | return (KERN_FAILURE); |
| 3498 | } |
| 3499 | |
| 3500 | thread_t cthread = current_thread(); |
| 3501 | boolean_t rtfault = (cthread->sched_mode == TH_MODE_REALTIME); |
| 3502 | uint64_t fstart = 0; |
| 3503 | |
| 3504 | if (rtfault) { |
| 3505 | fstart = mach_continuous_time(); |
| 3506 | } |
| 3507 | |
| 3508 | interruptible_state = thread_interrupt_level(interruptible); |
| 3509 | |
| 3510 | fault_type = (change_wiring ? VM_PROT_NONE : caller_prot); |
| 3511 | |
| 3512 | VM_STAT_INCR(faults); |
| 3513 | current_task()->faults++; |
| 3514 | original_fault_type = fault_type; |
| 3515 | |
| 3516 | if (fault_type & VM_PROT_WRITE) |
| 3517 | object_lock_type = OBJECT_LOCK_EXCLUSIVE; |
| 3518 | else |
| 3519 | object_lock_type = OBJECT_LOCK_SHARED; |
| 3520 | |
| 3521 | cur_object_lock_type = OBJECT_LOCK_SHARED; |
| 3522 | |
| 3523 | if ((map == kernel_map) && (caller_prot & VM_PROT_WRITE)) { |
| 3524 | if (compressor_map) { |
| 3525 | if ((vaddr >= vm_map_min(compressor_map)) && (vaddr < vm_map_max(compressor_map))) { |
| 3526 | panic("Write fault on compressor map, va: %p type: %u bounds: %p->%p" , (void *) vaddr, caller_prot, (void *) vm_map_min(compressor_map), (void *) vm_map_max(compressor_map)); |
| 3527 | |
| 3528 | } |
| 3529 | } |
| 3530 | } |
| 3531 | RetryFault: |
| 3532 | assert(written_on_object == VM_OBJECT_NULL); |
| 3533 | |
| 3534 | /* |
| 3535 | * assume we will hit a page in the cache |
| 3536 | * otherwise, explicitly override with |
| 3537 | * the real fault type once we determine it |
| 3538 | */ |
| 3539 | type_of_fault = DBG_CACHE_HIT_FAULT; |
| 3540 | |
| 3541 | /* |
| 3542 | * Find the backing store object and offset into |
| 3543 | * it to begin the search. |
| 3544 | */ |
| 3545 | fault_type = original_fault_type; |
| 3546 | map = original_map; |
| 3547 | vm_map_lock_read(map); |
| 3548 | |
| 3549 | kr = vm_map_lookup_locked(&map, vaddr, fault_type, |
| 3550 | object_lock_type, &version, |
| 3551 | &object, &offset, &prot, &wired, |
| 3552 | &fault_info, |
| 3553 | &real_map); |
| 3554 | |
| 3555 | if (kr != KERN_SUCCESS) { |
| 3556 | vm_map_unlock_read(map); |
| 3557 | goto done; |
| 3558 | } |
| 3559 | pmap = real_map->pmap; |
| 3560 | fault_info.interruptible = interruptible; |
| 3561 | fault_info.stealth = FALSE; |
| 3562 | fault_info.io_sync = FALSE; |
| 3563 | fault_info.mark_zf_absent = FALSE; |
| 3564 | fault_info.batch_pmap_op = FALSE; |
| 3565 | |
| 3566 | /* |
| 3567 | * If the page is wired, we must fault for the current protection |
| 3568 | * value, to avoid further faults. |
| 3569 | */ |
| 3570 | if (wired) { |
| 3571 | fault_type = prot | VM_PROT_WRITE; |
| 3572 | /* |
| 3573 | * since we're treating this fault as a 'write' |
| 3574 | * we must hold the top object lock exclusively |
| 3575 | */ |
| 3576 | if (object_lock_type == OBJECT_LOCK_SHARED) { |
| 3577 | |
| 3578 | object_lock_type = OBJECT_LOCK_EXCLUSIVE; |
| 3579 | |
| 3580 | if (vm_object_lock_upgrade(object) == FALSE) { |
| 3581 | /* |
| 3582 | * couldn't upgrade, so explicitly |
| 3583 | * take the lock exclusively |
| 3584 | */ |
| 3585 | vm_object_lock(object); |
| 3586 | } |
| 3587 | } |
| 3588 | } |
| 3589 | |
| 3590 | #if VM_FAULT_CLASSIFY |
| 3591 | /* |
| 3592 | * Temporary data gathering code |
| 3593 | */ |
| 3594 | vm_fault_classify(object, offset, fault_type); |
| 3595 | #endif |
| 3596 | /* |
| 3597 | * Fast fault code. The basic idea is to do as much as |
| 3598 | * possible while holding the map lock and object locks. |
| 3599 | * Busy pages are not used until the object lock has to |
| 3600 | * be dropped to do something (copy, zero fill, pmap enter). |
| 3601 | * Similarly, paging references aren't acquired until that |
| 3602 | * point, and object references aren't used. |
| 3603 | * |
| 3604 | * If we can figure out what to do |
| 3605 | * (zero fill, copy on write, pmap enter) while holding |
| 3606 | * the locks, then it gets done. Otherwise, we give up, |
| 3607 | * and use the original fault path (which doesn't hold |
| 3608 | * the map lock, and relies on busy pages). |
| 3609 | * The give up cases include: |
| 3610 | * - Have to talk to pager. |
| 3611 | * - Page is busy, absent or in error. |
| 3612 | * - Pager has locked out desired access. |
| 3613 | * - Fault needs to be restarted. |
| 3614 | * - Have to push page into copy object. |
| 3615 | * |
| 3616 | * The code is an infinite loop that moves one level down |
| 3617 | * the shadow chain each time. cur_object and cur_offset |
| 3618 | * refer to the current object being examined. object and offset |
| 3619 | * are the original object from the map. The loop is at the |
| 3620 | * top level if and only if object and cur_object are the same. |
| 3621 | * |
| 3622 | * Invariants: Map lock is held throughout. Lock is held on |
| 3623 | * original object and cur_object (if different) when |
| 3624 | * continuing or exiting loop. |
| 3625 | * |
| 3626 | */ |
| 3627 | |
| 3628 | #if defined(__arm64__) |
| 3629 | /* |
| 3630 | * Fail if reading an execute-only page in a |
| 3631 | * pmap that enforces execute-only protection. |
| 3632 | */ |
| 3633 | if (fault_type == VM_PROT_READ && |
| 3634 | (prot & VM_PROT_EXECUTE) && |
| 3635 | !(prot & VM_PROT_READ) && |
| 3636 | pmap_enforces_execute_only(pmap)) { |
| 3637 | vm_object_unlock(object); |
| 3638 | vm_map_unlock_read(map); |
| 3639 | if (real_map != map) { |
| 3640 | vm_map_unlock(real_map); |
| 3641 | } |
| 3642 | kr = KERN_PROTECTION_FAILURE; |
| 3643 | goto done; |
| 3644 | } |
| 3645 | #endif |
| 3646 | |
| 3647 | /* |
| 3648 | * If this page is to be inserted in a copy delay object |
| 3649 | * for writing, and if the object has a copy, then the |
| 3650 | * copy delay strategy is handled by the slow fault path. |
| 3651 | */ |
| 3652 | if (object->copy_strategy == MEMORY_OBJECT_COPY_DELAY && |
| 3653 | object->copy != VM_OBJECT_NULL && (fault_type & VM_PROT_WRITE)) |
| 3654 | goto handle_copy_delay; |
| 3655 | |
| 3656 | cur_object = object; |
| 3657 | cur_offset = offset; |
| 3658 | |
| 3659 | grab_options = 0; |
| 3660 | #if CONFIG_SECLUDED_MEMORY |
| 3661 | if (object->can_grab_secluded) { |
| 3662 | grab_options |= VM_PAGE_GRAB_SECLUDED; |
| 3663 | } |
| 3664 | #endif /* CONFIG_SECLUDED_MEMORY */ |
| 3665 | |
| 3666 | while (TRUE) { |
| 3667 | if (!cur_object->pager_created && |
| 3668 | cur_object->phys_contiguous) /* superpage */ |
| 3669 | break; |
| 3670 | |
| 3671 | if (cur_object->blocked_access) { |
| 3672 | /* |
| 3673 | * Access to this VM object has been blocked. |
| 3674 | * Let the slow path handle it. |
| 3675 | */ |
| 3676 | break; |
| 3677 | } |
| 3678 | |
| 3679 | m = vm_page_lookup(cur_object, cur_offset); |
| 3680 | m_object = NULL; |
| 3681 | |
| 3682 | if (m != VM_PAGE_NULL) { |
| 3683 | m_object = cur_object; |
| 3684 | |
| 3685 | if (m->vmp_busy) { |
| 3686 | wait_result_t result; |
| 3687 | |
| 3688 | /* |
| 3689 | * in order to do the PAGE_ASSERT_WAIT, we must |
| 3690 | * have object that 'm' belongs to locked exclusively |
| 3691 | */ |
| 3692 | if (object != cur_object) { |
| 3693 | |
| 3694 | if (cur_object_lock_type == OBJECT_LOCK_SHARED) { |
| 3695 | |
| 3696 | cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE; |
| 3697 | |
| 3698 | if (vm_object_lock_upgrade(cur_object) == FALSE) { |
| 3699 | /* |
| 3700 | * couldn't upgrade so go do a full retry |
| 3701 | * immediately since we can no longer be |
| 3702 | * certain about cur_object (since we |
| 3703 | * don't hold a reference on it)... |
| 3704 | * first drop the top object lock |
| 3705 | */ |
| 3706 | vm_object_unlock(object); |
| 3707 | |
| 3708 | vm_map_unlock_read(map); |
| 3709 | if (real_map != map) |
| 3710 | vm_map_unlock(real_map); |
| 3711 | |
| 3712 | goto RetryFault; |
| 3713 | } |
| 3714 | } |
| 3715 | } else if (object_lock_type == OBJECT_LOCK_SHARED) { |
| 3716 | |
| 3717 | object_lock_type = OBJECT_LOCK_EXCLUSIVE; |
| 3718 | |
| 3719 | if (vm_object_lock_upgrade(object) == FALSE) { |
| 3720 | /* |
| 3721 | * couldn't upgrade, so explicitly take the lock |
| 3722 | * exclusively and go relookup the page since we |
| 3723 | * will have dropped the object lock and |
| 3724 | * a different thread could have inserted |
| 3725 | * a page at this offset |
| 3726 | * no need for a full retry since we're |
| 3727 | * at the top level of the object chain |
| 3728 | */ |
| 3729 | vm_object_lock(object); |
| 3730 | |
| 3731 | continue; |
| 3732 | } |
| 3733 | } |
| 3734 | if ((m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) && m_object->internal) { |
| 3735 | /* |
| 3736 | * m->vmp_busy == TRUE and the object is locked exclusively. |
| 3737 | * if m->vmp_q_state is still VM_PAGE_ON_PAGEOUT_Q after we acquire the |
| 3738 | * queues lock, we are guaranteed that it is stable on |
| 3739 | * the pageout queue and therefore reclaimable |
| 3740 | * |
| 3741 | * NOTE: this is only true for the internal pageout queue |
| 3742 | * in the compressor world |
| 3743 | */ |
| 3744 | assert(VM_CONFIG_COMPRESSOR_IS_PRESENT); |
| 3745 | |
| 3746 | vm_page_lock_queues(); |
| 3747 | |
| 3748 | if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) { |
| 3749 | vm_pageout_throttle_up(m); |
| 3750 | vm_page_unlock_queues(); |
| 3751 | |
| 3752 | PAGE_WAKEUP_DONE(m); |
| 3753 | goto reclaimed_from_pageout; |
| 3754 | } |
| 3755 | vm_page_unlock_queues(); |
| 3756 | } |
| 3757 | if (object != cur_object) |
| 3758 | vm_object_unlock(object); |
| 3759 | |
| 3760 | vm_map_unlock_read(map); |
| 3761 | if (real_map != map) |
| 3762 | vm_map_unlock(real_map); |
| 3763 | |
| 3764 | result = PAGE_ASSERT_WAIT(m, interruptible); |
| 3765 | |
| 3766 | vm_object_unlock(cur_object); |
| 3767 | |
| 3768 | if (result == THREAD_WAITING) { |
| 3769 | result = thread_block(THREAD_CONTINUE_NULL); |
| 3770 | |
| 3771 | counter(c_vm_fault_page_block_busy_kernel++); |
| 3772 | } |
| 3773 | if (result == THREAD_AWAKENED || result == THREAD_RESTART) |
| 3774 | goto RetryFault; |
| 3775 | |
| 3776 | kr = KERN_ABORTED; |
| 3777 | goto done; |
| 3778 | } |
| 3779 | reclaimed_from_pageout: |
| 3780 | if (m->vmp_laundry) { |
| 3781 | if (object != cur_object) { |
| 3782 | if (cur_object_lock_type == OBJECT_LOCK_SHARED) { |
| 3783 | cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE; |
| 3784 | |
| 3785 | vm_object_unlock(object); |
| 3786 | vm_object_unlock(cur_object); |
| 3787 | |
| 3788 | vm_map_unlock_read(map); |
| 3789 | if (real_map != map) |
| 3790 | vm_map_unlock(real_map); |
| 3791 | |
| 3792 | goto RetryFault; |
| 3793 | } |
| 3794 | |
| 3795 | } else if (object_lock_type == OBJECT_LOCK_SHARED) { |
| 3796 | |
| 3797 | object_lock_type = OBJECT_LOCK_EXCLUSIVE; |
| 3798 | |
| 3799 | if (vm_object_lock_upgrade(object) == FALSE) { |
| 3800 | /* |
| 3801 | * couldn't upgrade, so explicitly take the lock |
| 3802 | * exclusively and go relookup the page since we |
| 3803 | * will have dropped the object lock and |
| 3804 | * a different thread could have inserted |
| 3805 | * a page at this offset |
| 3806 | * no need for a full retry since we're |
| 3807 | * at the top level of the object chain |
| 3808 | */ |
| 3809 | vm_object_lock(object); |
| 3810 | |
| 3811 | continue; |
| 3812 | } |
| 3813 | } |
| 3814 | vm_pageout_steal_laundry(m, FALSE); |
| 3815 | } |
| 3816 | |
| 3817 | if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) { |
| 3818 | /* |
| 3819 | * Guard page: let the slow path deal with it |
| 3820 | */ |
| 3821 | break; |
| 3822 | } |
| 3823 | if (m->vmp_unusual && (m->vmp_error || m->vmp_restart || m->vmp_private || m->vmp_absent)) { |
| 3824 | /* |
| 3825 | * Unusual case... let the slow path deal with it |
| 3826 | */ |
| 3827 | break; |
| 3828 | } |
| 3829 | if (VM_OBJECT_PURGEABLE_FAULT_ERROR(m_object)) { |
| 3830 | if (object != cur_object) |
| 3831 | vm_object_unlock(object); |
| 3832 | vm_map_unlock_read(map); |
| 3833 | if (real_map != map) |
| 3834 | vm_map_unlock(real_map); |
| 3835 | vm_object_unlock(cur_object); |
| 3836 | kr = KERN_MEMORY_ERROR; |
| 3837 | goto done; |
| 3838 | } |
| 3839 | assert(m_object == VM_PAGE_OBJECT(m)); |
| 3840 | |
| 3841 | if (VM_FAULT_NEED_CS_VALIDATION(map->pmap, m, m_object) || |
| 3842 | (physpage_p != NULL && (prot & VM_PROT_WRITE))) { |
| 3843 | upgrade_for_validation: |
| 3844 | /* |
| 3845 | * We might need to validate this page |
| 3846 | * against its code signature, so we |
| 3847 | * want to hold the VM object exclusively. |
| 3848 | */ |
| 3849 | if (object != cur_object) { |
| 3850 | if (cur_object_lock_type == OBJECT_LOCK_SHARED) { |
| 3851 | vm_object_unlock(object); |
| 3852 | vm_object_unlock(cur_object); |
| 3853 | |
| 3854 | cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE; |
| 3855 | |
| 3856 | vm_map_unlock_read(map); |
| 3857 | if (real_map != map) |
| 3858 | vm_map_unlock(real_map); |
| 3859 | |
| 3860 | goto RetryFault; |
| 3861 | } |
| 3862 | |
| 3863 | } else if (object_lock_type == OBJECT_LOCK_SHARED) { |
| 3864 | |
| 3865 | object_lock_type = OBJECT_LOCK_EXCLUSIVE; |
| 3866 | |
| 3867 | if (vm_object_lock_upgrade(object) == FALSE) { |
| 3868 | /* |
| 3869 | * couldn't upgrade, so explicitly take the lock |
| 3870 | * exclusively and go relookup the page since we |
| 3871 | * will have dropped the object lock and |
| 3872 | * a different thread could have inserted |
| 3873 | * a page at this offset |
| 3874 | * no need for a full retry since we're |
| 3875 | * at the top level of the object chain |
| 3876 | */ |
| 3877 | vm_object_lock(object); |
| 3878 | |
| 3879 | continue; |
| 3880 | } |
| 3881 | } |
| 3882 | } |
| 3883 | /* |
| 3884 | * Two cases of map in faults: |
| 3885 | * - At top level w/o copy object. |
| 3886 | * - Read fault anywhere. |
| 3887 | * --> must disallow write. |
| 3888 | */ |
| 3889 | |
| 3890 | if (object == cur_object && object->copy == VM_OBJECT_NULL) { |
| 3891 | |
| 3892 | goto FastPmapEnter; |
| 3893 | } |
| 3894 | |
| 3895 | if ((fault_type & VM_PROT_WRITE) == 0) { |
| 3896 | if (!pmap_has_prot_policy(prot)) { |
| 3897 | prot &= ~VM_PROT_WRITE; |
| 3898 | } else { |
| 3899 | /* |
| 3900 | * For a protection that the pmap cares |
| 3901 | * about, we must hand over the full |
| 3902 | * set of protections (so that the pmap |
| 3903 | * layer can apply any desired policy). |
| 3904 | * This means that cs_bypass must be |
| 3905 | * set, as this can force us to pass |
| 3906 | * RWX. |
| 3907 | */ |
| 3908 | assert(fault_info.cs_bypass); |
| 3909 | } |
| 3910 | |
| 3911 | if (object != cur_object) { |
| 3912 | /* |
| 3913 | * We still need to hold the top object |
| 3914 | * lock here to prevent a race between |
| 3915 | * a read fault (taking only "shared" |
| 3916 | * locks) and a write fault (taking |
| 3917 | * an "exclusive" lock on the top |
| 3918 | * object). |
| 3919 | * Otherwise, as soon as we release the |
| 3920 | * top lock, the write fault could |
| 3921 | * proceed and actually complete before |
| 3922 | * the read fault, and the copied page's |
| 3923 | * translation could then be overwritten |
| 3924 | * by the read fault's translation for |
| 3925 | * the original page. |
| 3926 | * |
| 3927 | * Let's just record what the top object |
| 3928 | * is and we'll release it later. |
| 3929 | */ |
| 3930 | top_object = object; |
| 3931 | |
| 3932 | /* |
| 3933 | * switch to the object that has the new page |
| 3934 | */ |
| 3935 | object = cur_object; |
| 3936 | object_lock_type = cur_object_lock_type; |
| 3937 | } |
| 3938 | FastPmapEnter: |
| 3939 | assert(m_object == VM_PAGE_OBJECT(m)); |
| 3940 | |
| 3941 | /* |
| 3942 | * prepare for the pmap_enter... |
| 3943 | * object and map are both locked |
| 3944 | * m contains valid data |
| 3945 | * object == m->vmp_object |
| 3946 | * cur_object == NULL or it's been unlocked |
| 3947 | * no paging references on either object or cur_object |
| 3948 | */ |
| 3949 | if (top_object != VM_OBJECT_NULL || object_lock_type != OBJECT_LOCK_EXCLUSIVE) |
| 3950 | need_retry_ptr = &need_retry; |
| 3951 | else |
| 3952 | need_retry_ptr = NULL; |
| 3953 | |
| 3954 | if (caller_pmap) { |
| 3955 | kr = vm_fault_enter(m, |
| 3956 | caller_pmap, |
| 3957 | caller_pmap_addr, |
| 3958 | prot, |
| 3959 | caller_prot, |
| 3960 | wired, |
| 3961 | change_wiring, |
| 3962 | wire_tag, |
| 3963 | &fault_info, |
| 3964 | need_retry_ptr, |
| 3965 | &type_of_fault); |
| 3966 | } else { |
| 3967 | kr = vm_fault_enter(m, |
| 3968 | pmap, |
| 3969 | vaddr, |
| 3970 | prot, |
| 3971 | caller_prot, |
| 3972 | wired, |
| 3973 | change_wiring, |
| 3974 | wire_tag, |
| 3975 | &fault_info, |
| 3976 | need_retry_ptr, |
| 3977 | &type_of_fault); |
| 3978 | } |
| 3979 | #if DEVELOPMENT || DEBUG |
| 3980 | { |
| 3981 | int event_code = 0; |
| 3982 | |
| 3983 | if (m_object->internal) |
| 3984 | event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_INTERNAL)); |
| 3985 | else if (m_object->object_is_shared_cache) |
| 3986 | event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_SHAREDCACHE)); |
| 3987 | else |
| 3988 | event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_EXTERNAL)); |
| 3989 | |
| 3990 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, event_code, trace_real_vaddr, (fault_info.user_tag << 16) | (caller_prot << 8) | type_of_fault, m->vmp_offset, get_current_unique_pid(), 0); |
| 3991 | |
| 3992 | DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->vmp_offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info.user_tag); |
| 3993 | } |
| 3994 | #endif |
| 3995 | if (kr == KERN_SUCCESS && |
| 3996 | physpage_p != NULL) { |
| 3997 | /* for vm_map_wire_and_extract() */ |
| 3998 | *physpage_p = VM_PAGE_GET_PHYS_PAGE(m); |
| 3999 | if (prot & VM_PROT_WRITE) { |
| 4000 | vm_object_lock_assert_exclusive(m_object); |
| 4001 | m->vmp_dirty = TRUE; |
| 4002 | } |
| 4003 | } |
| 4004 | |
| 4005 | if (top_object != VM_OBJECT_NULL) { |
| 4006 | /* |
| 4007 | * It's safe to drop the top object |
| 4008 | * now that we've done our |
| 4009 | * vm_fault_enter(). Any other fault |
| 4010 | * in progress for that virtual |
| 4011 | * address will either find our page |
| 4012 | * and translation or put in a new page |
| 4013 | * and translation. |
| 4014 | */ |
| 4015 | vm_object_unlock(top_object); |
| 4016 | top_object = VM_OBJECT_NULL; |
| 4017 | } |
| 4018 | |
| 4019 | if (need_collapse == TRUE) |
| 4020 | vm_object_collapse(object, offset, TRUE); |
| 4021 | |
| 4022 | if (need_retry == FALSE && |
| 4023 | (type_of_fault == DBG_PAGEIND_FAULT || type_of_fault == DBG_PAGEINV_FAULT || type_of_fault == DBG_CACHE_HIT_FAULT)) { |
| 4024 | /* |
| 4025 | * evaluate access pattern and update state |
| 4026 | * vm_fault_deactivate_behind depends on the |
| 4027 | * state being up to date |
| 4028 | */ |
| 4029 | vm_fault_is_sequential(m_object, cur_offset, fault_info.behavior); |
| 4030 | |
| 4031 | vm_fault_deactivate_behind(m_object, cur_offset, fault_info.behavior); |
| 4032 | } |
| 4033 | /* |
| 4034 | * That's it, clean up and return. |
| 4035 | */ |
| 4036 | if (m->vmp_busy) |
| 4037 | PAGE_WAKEUP_DONE(m); |
| 4038 | |
| 4039 | if (need_retry == FALSE && !m_object->internal && (fault_type & VM_PROT_WRITE)) { |
| 4040 | |
| 4041 | vm_object_paging_begin(m_object); |
| 4042 | |
| 4043 | assert(written_on_object == VM_OBJECT_NULL); |
| 4044 | written_on_object = m_object; |
| 4045 | written_on_pager = m_object->pager; |
| 4046 | written_on_offset = m_object->paging_offset + m->vmp_offset; |
| 4047 | } |
| 4048 | vm_object_unlock(object); |
| 4049 | |
| 4050 | vm_map_unlock_read(map); |
| 4051 | if (real_map != map) |
| 4052 | vm_map_unlock(real_map); |
| 4053 | |
| 4054 | if (need_retry == TRUE) { |
| 4055 | /* |
| 4056 | * vm_fault_enter couldn't complete the PMAP_ENTER... |
| 4057 | * at this point we don't hold any locks so it's safe |
| 4058 | * to ask the pmap layer to expand the page table to |
| 4059 | * accommodate this mapping... once expanded, we'll |
| 4060 | * re-drive the fault which should result in vm_fault_enter |
| 4061 | * being able to successfully enter the mapping this time around |
| 4062 | */ |
| 4063 | (void)pmap_enter_options( |
| 4064 | pmap, vaddr, 0, 0, 0, 0, 0, |
| 4065 | PMAP_OPTIONS_NOENTER, NULL); |
| 4066 | |
| 4067 | need_retry = FALSE; |
| 4068 | goto RetryFault; |
| 4069 | } |
| 4070 | goto done; |
| 4071 | } |
| 4072 | /* |
| 4073 | * COPY ON WRITE FAULT |
| 4074 | */ |
| 4075 | assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE); |
| 4076 | |
| 4077 | /* |
| 4078 | * If objects match, then |
| 4079 | * object->copy must not be NULL (else control |
| 4080 | * would be in previous code block), and we |
| 4081 | * have a potential push into the copy object |
| 4082 | * with which we can't cope with here. |
| 4083 | */ |
| 4084 | if (cur_object == object) { |
| 4085 | /* |
| 4086 | * must take the slow path to |
| 4087 | * deal with the copy push |
| 4088 | */ |
| 4089 | break; |
| 4090 | } |
| 4091 | |
| 4092 | /* |
| 4093 | * This is now a shadow based copy on write |
| 4094 | * fault -- it requires a copy up the shadow |
| 4095 | * chain. |
| 4096 | */ |
| 4097 | assert(m_object == VM_PAGE_OBJECT(m)); |
| 4098 | |
| 4099 | if ((cur_object_lock_type == OBJECT_LOCK_SHARED) && |
| 4100 | VM_FAULT_NEED_CS_VALIDATION(NULL, m, m_object)) { |
| 4101 | goto upgrade_for_validation; |
| 4102 | } |
| 4103 | |
| 4104 | /* |
| 4105 | * Allocate a page in the original top level |
| 4106 | * object. Give up if allocate fails. Also |
| 4107 | * need to remember current page, as it's the |
| 4108 | * source of the copy. |
| 4109 | * |
| 4110 | * at this point we hold locks on both |
| 4111 | * object and cur_object... no need to take |
| 4112 | * paging refs or mark pages BUSY since |
| 4113 | * we don't drop either object lock until |
| 4114 | * the page has been copied and inserted |
| 4115 | */ |
| 4116 | cur_m = m; |
| 4117 | m = vm_page_grab_options(grab_options); |
| 4118 | m_object = NULL; |
| 4119 | |
| 4120 | if (m == VM_PAGE_NULL) { |
| 4121 | /* |
| 4122 | * no free page currently available... |
| 4123 | * must take the slow path |
| 4124 | */ |
| 4125 | break; |
| 4126 | } |
| 4127 | /* |
| 4128 | * Now do the copy. Mark the source page busy... |
| 4129 | * |
| 4130 | * NOTE: This code holds the map lock across |
| 4131 | * the page copy. |
| 4132 | */ |
| 4133 | vm_page_copy(cur_m, m); |
| 4134 | vm_page_insert(m, object, offset); |
| 4135 | m_object = object; |
| 4136 | SET_PAGE_DIRTY(m, FALSE); |
| 4137 | |
| 4138 | /* |
| 4139 | * Now cope with the source page and object |
| 4140 | */ |
| 4141 | if (object->ref_count > 1 && cur_m->vmp_pmapped) |
| 4142 | pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(cur_m)); |
| 4143 | |
| 4144 | if (cur_m->vmp_clustered) { |
| 4145 | VM_PAGE_COUNT_AS_PAGEIN(cur_m); |
| 4146 | VM_PAGE_CONSUME_CLUSTERED(cur_m); |
| 4147 | vm_fault_is_sequential(cur_object, cur_offset, fault_info.behavior); |
| 4148 | } |
| 4149 | need_collapse = TRUE; |
| 4150 | |
| 4151 | if (!cur_object->internal && |
| 4152 | cur_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY) { |
| 4153 | /* |
| 4154 | * The object from which we've just |
| 4155 | * copied a page is most probably backed |
| 4156 | * by a vnode. We don't want to waste too |
| 4157 | * much time trying to collapse the VM objects |
| 4158 | * and create a bottleneck when several tasks |
| 4159 | * map the same file. |
| 4160 | */ |
| 4161 | if (cur_object->copy == object) { |
| 4162 | /* |
| 4163 | * Shared mapping or no COW yet. |
| 4164 | * We can never collapse a copy |
| 4165 | * object into its backing object. |
| 4166 | */ |
| 4167 | need_collapse = FALSE; |
| 4168 | } else if (cur_object->copy == object->shadow && |
| 4169 | object->shadow->resident_page_count == 0) { |
| 4170 | /* |
| 4171 | * Shared mapping after a COW occurred. |
| 4172 | */ |
| 4173 | need_collapse = FALSE; |
| 4174 | } |
| 4175 | } |
| 4176 | vm_object_unlock(cur_object); |
| 4177 | |
| 4178 | if (need_collapse == FALSE) |
| 4179 | vm_fault_collapse_skipped++; |
| 4180 | vm_fault_collapse_total++; |
| 4181 | |
| 4182 | type_of_fault = DBG_COW_FAULT; |
| 4183 | VM_STAT_INCR(cow_faults); |
| 4184 | DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL); |
| 4185 | current_task()->cow_faults++; |
| 4186 | |
| 4187 | goto FastPmapEnter; |
| 4188 | |
| 4189 | } else { |
| 4190 | /* |
| 4191 | * No page at cur_object, cur_offset... m == NULL |
| 4192 | */ |
| 4193 | if (cur_object->pager_created) { |
| 4194 | int compressor_external_state = VM_EXTERNAL_STATE_UNKNOWN; |
| 4195 | |
| 4196 | if (MUST_ASK_PAGER(cur_object, cur_offset, compressor_external_state) == TRUE) { |
| 4197 | int my_fault_type; |
| 4198 | int c_flags = C_DONT_BLOCK; |
| 4199 | boolean_t insert_cur_object = FALSE; |
| 4200 | |
| 4201 | /* |
| 4202 | * May have to talk to a pager... |
| 4203 | * if so, take the slow path by |
| 4204 | * doing a 'break' from the while (TRUE) loop |
| 4205 | * |
| 4206 | * external_state will only be set to VM_EXTERNAL_STATE_EXISTS |
| 4207 | * if the compressor is active and the page exists there |
| 4208 | */ |
| 4209 | if (compressor_external_state != VM_EXTERNAL_STATE_EXISTS) |
| 4210 | break; |
| 4211 | |
| 4212 | if (map == kernel_map || real_map == kernel_map) { |
| 4213 | /* |
| 4214 | * can't call into the compressor with the kernel_map |
| 4215 | * lock held, since the compressor may try to operate |
| 4216 | * on the kernel map in order to return an empty c_segment |
| 4217 | */ |
| 4218 | break; |
| 4219 | } |
| 4220 | if (object != cur_object) { |
| 4221 | if (fault_type & VM_PROT_WRITE) |
| 4222 | c_flags |= C_KEEP; |
| 4223 | else |
| 4224 | insert_cur_object = TRUE; |
| 4225 | } |
| 4226 | if (insert_cur_object == TRUE) { |
| 4227 | |
| 4228 | if (cur_object_lock_type == OBJECT_LOCK_SHARED) { |
| 4229 | |
| 4230 | cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE; |
| 4231 | |
| 4232 | if (vm_object_lock_upgrade(cur_object) == FALSE) { |
| 4233 | /* |
| 4234 | * couldn't upgrade so go do a full retry |
| 4235 | * immediately since we can no longer be |
| 4236 | * certain about cur_object (since we |
| 4237 | * don't hold a reference on it)... |
| 4238 | * first drop the top object lock |
| 4239 | */ |
| 4240 | vm_object_unlock(object); |
| 4241 | |
| 4242 | vm_map_unlock_read(map); |
| 4243 | if (real_map != map) |
| 4244 | vm_map_unlock(real_map); |
| 4245 | |
| 4246 | goto RetryFault; |
| 4247 | } |
| 4248 | } |
| 4249 | } else if (object_lock_type == OBJECT_LOCK_SHARED) { |
| 4250 | |
| 4251 | object_lock_type = OBJECT_LOCK_EXCLUSIVE; |
| 4252 | |
| 4253 | if (object != cur_object) { |
| 4254 | /* |
| 4255 | * we can't go for the upgrade on the top |
| 4256 | * lock since the upgrade may block waiting |
| 4257 | * for readers to drain... since we hold |
| 4258 | * cur_object locked at this point, waiting |
| 4259 | * for the readers to drain would represent |
| 4260 | * a lock order inversion since the lock order |
| 4261 | * for objects is the reference order in the |
| 4262 | * shadow chain |
| 4263 | */ |
| 4264 | vm_object_unlock(object); |
| 4265 | vm_object_unlock(cur_object); |
| 4266 | |
| 4267 | vm_map_unlock_read(map); |
| 4268 | if (real_map != map) |
| 4269 | vm_map_unlock(real_map); |
| 4270 | |
| 4271 | goto RetryFault; |
| 4272 | } |
| 4273 | if (vm_object_lock_upgrade(object) == FALSE) { |
| 4274 | /* |
| 4275 | * couldn't upgrade, so explicitly take the lock |
| 4276 | * exclusively and go relookup the page since we |
| 4277 | * will have dropped the object lock and |
| 4278 | * a different thread could have inserted |
| 4279 | * a page at this offset |
| 4280 | * no need for a full retry since we're |
| 4281 | * at the top level of the object chain |
| 4282 | */ |
| 4283 | vm_object_lock(object); |
| 4284 | |
| 4285 | continue; |
| 4286 | } |
| 4287 | } |
| 4288 | m = vm_page_grab_options(grab_options); |
| 4289 | m_object = NULL; |
| 4290 | |
| 4291 | if (m == VM_PAGE_NULL) { |
| 4292 | /* |
| 4293 | * no free page currently available... |
| 4294 | * must take the slow path |
| 4295 | */ |
| 4296 | break; |
| 4297 | } |
| 4298 | |
| 4299 | /* |
| 4300 | * The object is and remains locked |
| 4301 | * so no need to take a |
| 4302 | * "paging_in_progress" reference. |
| 4303 | */ |
| 4304 | boolean_t shared_lock; |
| 4305 | if ((object == cur_object && |
| 4306 | object_lock_type == OBJECT_LOCK_EXCLUSIVE) || |
| 4307 | (object != cur_object && |
| 4308 | cur_object_lock_type == OBJECT_LOCK_EXCLUSIVE)) { |
| 4309 | shared_lock = FALSE; |
| 4310 | } else { |
| 4311 | shared_lock = TRUE; |
| 4312 | } |
| 4313 | |
| 4314 | kr = vm_compressor_pager_get( |
| 4315 | cur_object->pager, |
| 4316 | (cur_offset + |
| 4317 | cur_object->paging_offset), |
| 4318 | VM_PAGE_GET_PHYS_PAGE(m), |
| 4319 | &my_fault_type, |
| 4320 | c_flags, |
| 4321 | &compressed_count_delta); |
| 4322 | |
| 4323 | vm_compressor_pager_count( |
| 4324 | cur_object->pager, |
| 4325 | compressed_count_delta, |
| 4326 | shared_lock, |
| 4327 | cur_object); |
| 4328 | |
| 4329 | if (kr != KERN_SUCCESS) { |
| 4330 | vm_page_release(m, FALSE); |
| 4331 | m = VM_PAGE_NULL; |
| 4332 | break; |
| 4333 | } |
| 4334 | m->vmp_dirty = TRUE; |
| 4335 | |
| 4336 | /* |
| 4337 | * If the object is purgeable, its |
| 4338 | * owner's purgeable ledgers will be |
| 4339 | * updated in vm_page_insert() but the |
| 4340 | * page was also accounted for in a |
| 4341 | * "compressed purgeable" ledger, so |
| 4342 | * update that now. |
| 4343 | */ |
| 4344 | if (object != cur_object && |
| 4345 | !insert_cur_object) { |
| 4346 | /* |
| 4347 | * We're not going to insert |
| 4348 | * the decompressed page into |
| 4349 | * the object it came from. |
| 4350 | * |
| 4351 | * We're dealing with a |
| 4352 | * copy-on-write fault on |
| 4353 | * "object". |
| 4354 | * We're going to decompress |
| 4355 | * the page directly into the |
| 4356 | * target "object" while |
| 4357 | * keeping the compressed |
| 4358 | * page for "cur_object", so |
| 4359 | * no ledger update in that |
| 4360 | * case. |
| 4361 | */ |
| 4362 | } else if (((cur_object->purgable == |
| 4363 | VM_PURGABLE_DENY) && |
| 4364 | (!cur_object->vo_ledger_tag)) || |
| 4365 | (cur_object->vo_owner == |
| 4366 | NULL)) { |
| 4367 | /* |
| 4368 | * "cur_object" is not purgeable |
| 4369 | * and is not ledger-tagged, or |
| 4370 | * there's no owner for it, |
| 4371 | * so no owner's ledgers to |
| 4372 | * update. |
| 4373 | */ |
| 4374 | } else { |
| 4375 | /* |
| 4376 | * One less compressed |
| 4377 | * purgeable/tagged page for |
| 4378 | * cur_object's owner. |
| 4379 | */ |
| 4380 | vm_object_owner_compressed_update( |
| 4381 | cur_object, |
| 4382 | -1); |
| 4383 | } |
| 4384 | |
| 4385 | if (insert_cur_object) { |
| 4386 | vm_page_insert(m, cur_object, cur_offset); |
| 4387 | m_object = cur_object; |
| 4388 | } else { |
| 4389 | vm_page_insert(m, object, offset); |
| 4390 | m_object = object; |
| 4391 | } |
| 4392 | |
| 4393 | if ((m_object->wimg_bits & VM_WIMG_MASK) != VM_WIMG_USE_DEFAULT) { |
| 4394 | /* |
| 4395 | * If the page is not cacheable, |
| 4396 | * we can't let its contents |
| 4397 | * linger in the data cache |
| 4398 | * after the decompression. |
| 4399 | */ |
| 4400 | pmap_sync_page_attributes_phys(VM_PAGE_GET_PHYS_PAGE(m)); |
| 4401 | } |
| 4402 | |
| 4403 | type_of_fault = my_fault_type; |
| 4404 | |
| 4405 | VM_STAT_INCR(decompressions); |
| 4406 | |
| 4407 | if (cur_object != object) { |
| 4408 | if (insert_cur_object) { |
| 4409 | top_object = object; |
| 4410 | /* |
| 4411 | * switch to the object that has the new page |
| 4412 | */ |
| 4413 | object = cur_object; |
| 4414 | object_lock_type = cur_object_lock_type; |
| 4415 | } else { |
| 4416 | vm_object_unlock(cur_object); |
| 4417 | cur_object = object; |
| 4418 | } |
| 4419 | } |
| 4420 | goto FastPmapEnter; |
| 4421 | } |
| 4422 | /* |
| 4423 | * existence map present and indicates |
| 4424 | * that the pager doesn't have this page |
| 4425 | */ |
| 4426 | } |
| 4427 | if (cur_object->shadow == VM_OBJECT_NULL) { |
| 4428 | /* |
| 4429 | * Zero fill fault. Page gets |
| 4430 | * inserted into the original object. |
| 4431 | */ |
| 4432 | if (cur_object->shadow_severed || |
| 4433 | VM_OBJECT_PURGEABLE_FAULT_ERROR(cur_object) || |
| 4434 | cur_object == compressor_object || |
| 4435 | cur_object == kernel_object || |
| 4436 | cur_object == vm_submap_object) { |
| 4437 | if (object != cur_object) |
| 4438 | vm_object_unlock(cur_object); |
| 4439 | vm_object_unlock(object); |
| 4440 | |
| 4441 | vm_map_unlock_read(map); |
| 4442 | if (real_map != map) |
| 4443 | vm_map_unlock(real_map); |
| 4444 | |
| 4445 | kr = KERN_MEMORY_ERROR; |
| 4446 | goto done; |
| 4447 | } |
| 4448 | if (cur_object != object) { |
| 4449 | vm_object_unlock(cur_object); |
| 4450 | |
| 4451 | cur_object = object; |
| 4452 | } |
| 4453 | if (object_lock_type == OBJECT_LOCK_SHARED) { |
| 4454 | |
| 4455 | object_lock_type = OBJECT_LOCK_EXCLUSIVE; |
| 4456 | |
| 4457 | if (vm_object_lock_upgrade(object) == FALSE) { |
| 4458 | /* |
| 4459 | * couldn't upgrade so do a full retry on the fault |
| 4460 | * since we dropped the object lock which |
| 4461 | * could allow another thread to insert |
| 4462 | * a page at this offset |
| 4463 | */ |
| 4464 | vm_map_unlock_read(map); |
| 4465 | if (real_map != map) |
| 4466 | vm_map_unlock(real_map); |
| 4467 | |
| 4468 | goto RetryFault; |
| 4469 | } |
| 4470 | } |
| 4471 | m = vm_page_alloc(object, offset); |
| 4472 | m_object = NULL; |
| 4473 | |
| 4474 | if (m == VM_PAGE_NULL) { |
| 4475 | /* |
| 4476 | * no free page currently available... |
| 4477 | * must take the slow path |
| 4478 | */ |
| 4479 | break; |
| 4480 | } |
| 4481 | m_object = object; |
| 4482 | |
| 4483 | /* |
| 4484 | * Now zero fill page... |
| 4485 | * the page is probably going to |
| 4486 | * be written soon, so don't bother |
| 4487 | * to clear the modified bit |
| 4488 | * |
| 4489 | * NOTE: This code holds the map |
| 4490 | * lock across the zero fill. |
| 4491 | */ |
| 4492 | type_of_fault = vm_fault_zero_page(m, map->no_zero_fill); |
| 4493 | |
| 4494 | goto FastPmapEnter; |
| 4495 | } |
| 4496 | /* |
| 4497 | * On to the next level in the shadow chain |
| 4498 | */ |
| 4499 | cur_offset += cur_object->vo_shadow_offset; |
| 4500 | new_object = cur_object->shadow; |
| 4501 | |
| 4502 | /* |
| 4503 | * take the new_object's lock with the indicated state |
| 4504 | */ |
| 4505 | if (cur_object_lock_type == OBJECT_LOCK_SHARED) |
| 4506 | vm_object_lock_shared(new_object); |
| 4507 | else |
| 4508 | vm_object_lock(new_object); |
| 4509 | |
| 4510 | if (cur_object != object) |
| 4511 | vm_object_unlock(cur_object); |
| 4512 | |
| 4513 | cur_object = new_object; |
| 4514 | |
| 4515 | continue; |
| 4516 | } |
| 4517 | } |
| 4518 | /* |
| 4519 | * Cleanup from fast fault failure. Drop any object |
| 4520 | * lock other than original and drop map lock. |
| 4521 | */ |
| 4522 | if (object != cur_object) |
| 4523 | vm_object_unlock(cur_object); |
| 4524 | |
| 4525 | /* |
| 4526 | * must own the object lock exclusively at this point |
| 4527 | */ |
| 4528 | if (object_lock_type == OBJECT_LOCK_SHARED) { |
| 4529 | object_lock_type = OBJECT_LOCK_EXCLUSIVE; |
| 4530 | |
| 4531 | if (vm_object_lock_upgrade(object) == FALSE) { |
| 4532 | /* |
| 4533 | * couldn't upgrade, so explicitly |
| 4534 | * take the lock exclusively |
| 4535 | * no need to retry the fault at this |
| 4536 | * point since "vm_fault_page" will |
| 4537 | * completely re-evaluate the state |
| 4538 | */ |
| 4539 | vm_object_lock(object); |
| 4540 | } |
| 4541 | } |
| 4542 | |
| 4543 | handle_copy_delay: |
| 4544 | vm_map_unlock_read(map); |
| 4545 | if (real_map != map) |
| 4546 | vm_map_unlock(real_map); |
| 4547 | |
| 4548 | if (__improbable(object == compressor_object || |
| 4549 | object == kernel_object || |
| 4550 | object == vm_submap_object)) { |
| 4551 | /* |
| 4552 | * These objects are explicitly managed and populated by the |
| 4553 | * kernel. The virtual ranges backed by these objects should |
| 4554 | * either have wired pages or "holes" that are not supposed to |
| 4555 | * be accessed at all until they get explicitly populated. |
| 4556 | * We should never have to resolve a fault on a mapping backed |
| 4557 | * by one of these VM objects and providing a zero-filled page |
| 4558 | * would be wrong here, so let's fail the fault and let the |
| 4559 | * caller crash or recover. |
| 4560 | */ |
| 4561 | vm_object_unlock(object); |
| 4562 | kr = KERN_MEMORY_ERROR; |
| 4563 | goto done; |
| 4564 | } |
| 4565 | |
| 4566 | assert(object != compressor_object); |
| 4567 | assert(object != kernel_object); |
| 4568 | assert(object != vm_submap_object); |
| 4569 | |
| 4570 | /* |
| 4571 | * Make a reference to this object to |
| 4572 | * prevent its disposal while we are messing with |
| 4573 | * it. Once we have the reference, the map is free |
| 4574 | * to be diddled. Since objects reference their |
| 4575 | * shadows (and copies), they will stay around as well. |
| 4576 | */ |
| 4577 | vm_object_reference_locked(object); |
| 4578 | vm_object_paging_begin(object); |
| 4579 | |
| 4580 | XPR(XPR_VM_FAULT,"vm_fault -> vm_fault_page\n" ,0,0,0,0,0); |
| 4581 | |
| 4582 | error_code = 0; |
| 4583 | |
| 4584 | result_page = VM_PAGE_NULL; |
| 4585 | kr = vm_fault_page(object, offset, fault_type, |
| 4586 | (change_wiring && !wired), |
| 4587 | FALSE, /* page not looked up */ |
| 4588 | &prot, &result_page, &top_page, |
| 4589 | &type_of_fault, |
| 4590 | &error_code, map->no_zero_fill, |
| 4591 | FALSE, &fault_info); |
| 4592 | |
| 4593 | /* |
| 4594 | * if kr != VM_FAULT_SUCCESS, then the paging reference |
| 4595 | * has been dropped and the object unlocked... the ref_count |
| 4596 | * is still held |
| 4597 | * |
| 4598 | * if kr == VM_FAULT_SUCCESS, then the paging reference |
| 4599 | * is still held along with the ref_count on the original object |
| 4600 | * |
| 4601 | * the object is returned locked with a paging reference |
| 4602 | * |
| 4603 | * if top_page != NULL, then it's BUSY and the |
| 4604 | * object it belongs to has a paging reference |
| 4605 | * but is returned unlocked |
| 4606 | */ |
| 4607 | if (kr != VM_FAULT_SUCCESS && |
| 4608 | kr != VM_FAULT_SUCCESS_NO_VM_PAGE) { |
| 4609 | /* |
| 4610 | * we didn't succeed, lose the object reference immediately. |
| 4611 | */ |
| 4612 | vm_object_deallocate(object); |
| 4613 | |
| 4614 | /* |
| 4615 | * See why we failed, and take corrective action. |
| 4616 | */ |
| 4617 | switch (kr) { |
| 4618 | case VM_FAULT_MEMORY_SHORTAGE: |
| 4619 | if (vm_page_wait((change_wiring) ? |
| 4620 | THREAD_UNINT : |
| 4621 | THREAD_ABORTSAFE)) |
| 4622 | goto RetryFault; |
| 4623 | /* |
| 4624 | * fall thru |
| 4625 | */ |
| 4626 | case VM_FAULT_INTERRUPTED: |
| 4627 | kr = KERN_ABORTED; |
| 4628 | goto done; |
| 4629 | case VM_FAULT_RETRY: |
| 4630 | goto RetryFault; |
| 4631 | case VM_FAULT_MEMORY_ERROR: |
| 4632 | if (error_code) |
| 4633 | kr = error_code; |
| 4634 | else |
| 4635 | kr = KERN_MEMORY_ERROR; |
| 4636 | goto done; |
| 4637 | default: |
| 4638 | panic("vm_fault: unexpected error 0x%x from " |
| 4639 | "vm_fault_page()\n" , kr); |
| 4640 | } |
| 4641 | } |
| 4642 | m = result_page; |
| 4643 | m_object = NULL; |
| 4644 | |
| 4645 | if (m != VM_PAGE_NULL) { |
| 4646 | m_object = VM_PAGE_OBJECT(m); |
| 4647 | assert((change_wiring && !wired) ? |
| 4648 | (top_page == VM_PAGE_NULL) : |
| 4649 | ((top_page == VM_PAGE_NULL) == (m_object == object))); |
| 4650 | } |
| 4651 | |
| 4652 | /* |
| 4653 | * What to do with the resulting page from vm_fault_page |
| 4654 | * if it doesn't get entered into the physical map: |
| 4655 | */ |
| 4656 | #define RELEASE_PAGE(m) \ |
| 4657 | MACRO_BEGIN \ |
| 4658 | PAGE_WAKEUP_DONE(m); \ |
| 4659 | if ( !VM_PAGE_PAGEABLE(m)) { \ |
| 4660 | vm_page_lockspin_queues(); \ |
| 4661 | if ( !VM_PAGE_PAGEABLE(m)) \ |
| 4662 | vm_page_activate(m); \ |
| 4663 | vm_page_unlock_queues(); \ |
| 4664 | } \ |
| 4665 | MACRO_END |
| 4666 | |
| 4667 | |
| 4668 | object_locks_dropped = FALSE; |
| 4669 | /* |
| 4670 | * We must verify that the maps have not changed |
| 4671 | * since our last lookup. vm_map_verify() needs the |
| 4672 | * map lock (shared) but we are holding object locks. |
| 4673 | * So we do a try_lock() first and, if that fails, we |
| 4674 | * drop the object locks and go in for the map lock again. |
| 4675 | */ |
| 4676 | if (!vm_map_try_lock_read(original_map)) { |
| 4677 | |
| 4678 | if (m != VM_PAGE_NULL) { |
| 4679 | old_copy_object = m_object->copy; |
| 4680 | vm_object_unlock(m_object); |
| 4681 | } else { |
| 4682 | old_copy_object = VM_OBJECT_NULL; |
| 4683 | vm_object_unlock(object); |
| 4684 | } |
| 4685 | |
| 4686 | object_locks_dropped = TRUE; |
| 4687 | |
| 4688 | vm_map_lock_read(original_map); |
| 4689 | } |
| 4690 | |
| 4691 | if ((map != original_map) || !vm_map_verify(map, &version)) { |
| 4692 | |
| 4693 | if (object_locks_dropped == FALSE) { |
| 4694 | if (m != VM_PAGE_NULL) { |
| 4695 | old_copy_object = m_object->copy; |
| 4696 | vm_object_unlock(m_object); |
| 4697 | } else { |
| 4698 | old_copy_object = VM_OBJECT_NULL; |
| 4699 | vm_object_unlock(object); |
| 4700 | } |
| 4701 | |
| 4702 | object_locks_dropped = TRUE; |
| 4703 | } |
| 4704 | |
| 4705 | /* |
| 4706 | * no object locks are held at this point |
| 4707 | */ |
| 4708 | vm_object_t retry_object; |
| 4709 | vm_object_offset_t retry_offset; |
| 4710 | vm_prot_t retry_prot; |
| 4711 | |
| 4712 | /* |
| 4713 | * To avoid trying to write_lock the map while another |
| 4714 | * thread has it read_locked (in vm_map_pageable), we |
| 4715 | * do not try for write permission. If the page is |
| 4716 | * still writable, we will get write permission. If it |
| 4717 | * is not, or has been marked needs_copy, we enter the |
| 4718 | * mapping without write permission, and will merely |
| 4719 | * take another fault. |
| 4720 | */ |
| 4721 | map = original_map; |
| 4722 | |
| 4723 | kr = vm_map_lookup_locked(&map, vaddr, |
| 4724 | fault_type & ~VM_PROT_WRITE, |
| 4725 | OBJECT_LOCK_EXCLUSIVE, &version, |
| 4726 | &retry_object, &retry_offset, &retry_prot, |
| 4727 | &wired, |
| 4728 | &fault_info, |
| 4729 | &real_map); |
| 4730 | pmap = real_map->pmap; |
| 4731 | |
| 4732 | if (kr != KERN_SUCCESS) { |
| 4733 | vm_map_unlock_read(map); |
| 4734 | |
| 4735 | if (m != VM_PAGE_NULL) { |
| 4736 | assert(VM_PAGE_OBJECT(m) == m_object); |
| 4737 | |
| 4738 | /* |
| 4739 | * retake the lock so that |
| 4740 | * we can drop the paging reference |
| 4741 | * in vm_fault_cleanup and do the |
| 4742 | * PAGE_WAKEUP_DONE in RELEASE_PAGE |
| 4743 | */ |
| 4744 | vm_object_lock(m_object); |
| 4745 | |
| 4746 | RELEASE_PAGE(m); |
| 4747 | |
| 4748 | vm_fault_cleanup(m_object, top_page); |
| 4749 | } else { |
| 4750 | /* |
| 4751 | * retake the lock so that |
| 4752 | * we can drop the paging reference |
| 4753 | * in vm_fault_cleanup |
| 4754 | */ |
| 4755 | vm_object_lock(object); |
| 4756 | |
| 4757 | vm_fault_cleanup(object, top_page); |
| 4758 | } |
| 4759 | vm_object_deallocate(object); |
| 4760 | |
| 4761 | goto done; |
| 4762 | } |
| 4763 | vm_object_unlock(retry_object); |
| 4764 | |
| 4765 | if ((retry_object != object) || (retry_offset != offset)) { |
| 4766 | |
| 4767 | vm_map_unlock_read(map); |
| 4768 | if (real_map != map) |
| 4769 | vm_map_unlock(real_map); |
| 4770 | |
| 4771 | if (m != VM_PAGE_NULL) { |
| 4772 | assert(VM_PAGE_OBJECT(m) == m_object); |
| 4773 | |
| 4774 | /* |
| 4775 | * retake the lock so that |
| 4776 | * we can drop the paging reference |
| 4777 | * in vm_fault_cleanup and do the |
| 4778 | * PAGE_WAKEUP_DONE in RELEASE_PAGE |
| 4779 | */ |
| 4780 | vm_object_lock(m_object); |
| 4781 | |
| 4782 | RELEASE_PAGE(m); |
| 4783 | |
| 4784 | vm_fault_cleanup(m_object, top_page); |
| 4785 | } else { |
| 4786 | /* |
| 4787 | * retake the lock so that |
| 4788 | * we can drop the paging reference |
| 4789 | * in vm_fault_cleanup |
| 4790 | */ |
| 4791 | vm_object_lock(object); |
| 4792 | |
| 4793 | vm_fault_cleanup(object, top_page); |
| 4794 | } |
| 4795 | vm_object_deallocate(object); |
| 4796 | |
| 4797 | goto RetryFault; |
| 4798 | } |
| 4799 | /* |
| 4800 | * Check whether the protection has changed or the object |
| 4801 | * has been copied while we left the map unlocked. |
| 4802 | */ |
| 4803 | if (pmap_has_prot_policy(retry_prot)) { |
| 4804 | /* If the pmap layer cares, pass the full set. */ |
| 4805 | prot = retry_prot; |
| 4806 | } else { |
| 4807 | prot &= retry_prot; |
| 4808 | } |
| 4809 | } |
| 4810 | |
| 4811 | if (object_locks_dropped == TRUE) { |
| 4812 | if (m != VM_PAGE_NULL) { |
| 4813 | vm_object_lock(m_object); |
| 4814 | |
| 4815 | if (m_object->copy != old_copy_object) { |
| 4816 | /* |
| 4817 | * The copy object changed while the top-level object |
| 4818 | * was unlocked, so take away write permission. |
| 4819 | */ |
| 4820 | assert(!pmap_has_prot_policy(prot)); |
| 4821 | prot &= ~VM_PROT_WRITE; |
| 4822 | } |
| 4823 | } else |
| 4824 | vm_object_lock(object); |
| 4825 | |
| 4826 | object_locks_dropped = FALSE; |
| 4827 | } |
| 4828 | |
| 4829 | /* |
| 4830 | * If we want to wire down this page, but no longer have |
| 4831 | * adequate permissions, we must start all over. |
| 4832 | */ |
| 4833 | if (wired && (fault_type != (prot | VM_PROT_WRITE))) { |
| 4834 | |
| 4835 | vm_map_unlock_read(map); |
| 4836 | if (real_map != map) |
| 4837 | vm_map_unlock(real_map); |
| 4838 | |
| 4839 | if (m != VM_PAGE_NULL) { |
| 4840 | assert(VM_PAGE_OBJECT(m) == m_object); |
| 4841 | |
| 4842 | RELEASE_PAGE(m); |
| 4843 | |
| 4844 | vm_fault_cleanup(m_object, top_page); |
| 4845 | } else |
| 4846 | vm_fault_cleanup(object, top_page); |
| 4847 | |
| 4848 | vm_object_deallocate(object); |
| 4849 | |
| 4850 | goto RetryFault; |
| 4851 | } |
| 4852 | if (m != VM_PAGE_NULL) { |
| 4853 | /* |
| 4854 | * Put this page into the physical map. |
| 4855 | * We had to do the unlock above because pmap_enter |
| 4856 | * may cause other faults. The page may be on |
| 4857 | * the pageout queues. If the pageout daemon comes |
| 4858 | * across the page, it will remove it from the queues. |
| 4859 | */ |
| 4860 | if (caller_pmap) { |
| 4861 | kr = vm_fault_enter(m, |
| 4862 | caller_pmap, |
| 4863 | caller_pmap_addr, |
| 4864 | prot, |
| 4865 | caller_prot, |
| 4866 | wired, |
| 4867 | change_wiring, |
| 4868 | wire_tag, |
| 4869 | &fault_info, |
| 4870 | NULL, |
| 4871 | &type_of_fault); |
| 4872 | } else { |
| 4873 | kr = vm_fault_enter(m, |
| 4874 | pmap, |
| 4875 | vaddr, |
| 4876 | prot, |
| 4877 | caller_prot, |
| 4878 | wired, |
| 4879 | change_wiring, |
| 4880 | wire_tag, |
| 4881 | &fault_info, |
| 4882 | NULL, |
| 4883 | &type_of_fault); |
| 4884 | } |
| 4885 | assert(VM_PAGE_OBJECT(m) == m_object); |
| 4886 | |
| 4887 | #if DEVELOPMENT || DEBUG |
| 4888 | { |
| 4889 | int event_code = 0; |
| 4890 | |
| 4891 | if (m_object->internal) |
| 4892 | event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_INTERNAL)); |
| 4893 | else if (m_object->object_is_shared_cache) |
| 4894 | event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_SHAREDCACHE)); |
| 4895 | else |
| 4896 | event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_EXTERNAL)); |
| 4897 | |
| 4898 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, event_code, trace_real_vaddr, (fault_info.user_tag << 16) | (caller_prot << 8) | type_of_fault, m->vmp_offset, get_current_unique_pid(), 0); |
| 4899 | |
| 4900 | DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->vmp_offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info.user_tag); |
| 4901 | } |
| 4902 | #endif |
| 4903 | if (kr != KERN_SUCCESS) { |
| 4904 | /* abort this page fault */ |
| 4905 | vm_map_unlock_read(map); |
| 4906 | if (real_map != map) |
| 4907 | vm_map_unlock(real_map); |
| 4908 | PAGE_WAKEUP_DONE(m); |
| 4909 | vm_fault_cleanup(m_object, top_page); |
| 4910 | vm_object_deallocate(object); |
| 4911 | goto done; |
| 4912 | } |
| 4913 | if (physpage_p != NULL) { |
| 4914 | /* for vm_map_wire_and_extract() */ |
| 4915 | *physpage_p = VM_PAGE_GET_PHYS_PAGE(m); |
| 4916 | if (prot & VM_PROT_WRITE) { |
| 4917 | vm_object_lock_assert_exclusive(m_object); |
| 4918 | m->vmp_dirty = TRUE; |
| 4919 | } |
| 4920 | } |
| 4921 | } else { |
| 4922 | |
| 4923 | vm_map_entry_t entry; |
| 4924 | vm_map_offset_t laddr; |
| 4925 | vm_map_offset_t ldelta, hdelta; |
| 4926 | |
| 4927 | /* |
| 4928 | * do a pmap block mapping from the physical address |
| 4929 | * in the object |
| 4930 | */ |
| 4931 | |
| 4932 | if (real_map != map) |
| 4933 | vm_map_unlock(real_map); |
| 4934 | |
| 4935 | if (original_map != map) { |
| 4936 | vm_map_unlock_read(map); |
| 4937 | vm_map_lock_read(original_map); |
| 4938 | map = original_map; |
| 4939 | } |
| 4940 | real_map = map; |
| 4941 | |
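| | /* |
| | * Figure out how large a block mapping we can set up: start with |
| | * a maximal page-aligned span and clip it against the map entries |
| | * (and any submaps) that cover the faulting address. |
| | */ |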
| 4942 | laddr = vaddr; |
| 4943 | hdelta = 0xFFFFF000; |
| 4944 | ldelta = 0xFFFFF000; |
| 4945 | |
| 4946 | while (vm_map_lookup_entry(map, laddr, &entry)) { |
| 4947 | if (ldelta > (laddr - entry->vme_start)) |
| 4948 | ldelta = laddr - entry->vme_start; |
| 4949 | if (hdelta > (entry->vme_end - laddr)) |
| 4950 | hdelta = entry->vme_end - laddr; |
| 4951 | if (entry->is_sub_map) { |
| 4952 | |
| 4953 | laddr = ((laddr - entry->vme_start) |
| 4954 | + VME_OFFSET(entry)); |
| 4955 | vm_map_lock_read(VME_SUBMAP(entry)); |
| 4956 | |
| 4957 | if (map != real_map) |
| 4958 | vm_map_unlock_read(map); |
| 4959 | if (entry->use_pmap) { |
| 4960 | vm_map_unlock_read(real_map); |
| 4961 | real_map = VME_SUBMAP(entry); |
| 4962 | } |
| 4963 | map = VME_SUBMAP(entry); |
| 4964 | |
| 4965 | } else { |
| 4966 | break; |
| 4967 | } |
| 4968 | } |
| 4969 | |
| 4970 | if (vm_map_lookup_entry(map, laddr, &entry) && |
| 4971 | (VME_OBJECT(entry) != NULL) && |
| 4972 | (VME_OBJECT(entry) == object)) { |
| 4973 | int superpage; |
| 4974 | |
| 4975 | if (!object->pager_created && |
| 4976 | object->phys_contiguous && |
| 4977 | VME_OFFSET(entry) == 0 && |
| 4978 | (entry->vme_end - entry->vme_start == object->vo_size) && |
| 4979 | VM_MAP_PAGE_ALIGNED(entry->vme_start, (object->vo_size-1))) { |
| 4980 | superpage = VM_MEM_SUPERPAGE; |
| 4981 | } else { |
| 4982 | superpage = 0; |
| 4983 | } |
| 4984 | |
| 4985 | if (superpage && physpage_p) { |
| 4986 | /* for vm_map_wire_and_extract() */ |
| 4987 | *physpage_p = (ppnum_t) |
| 4988 | ((((vm_map_offset_t) |
| 4989 | object->vo_shadow_offset) |
| 4990 | + VME_OFFSET(entry) |
| 4991 | + (laddr - entry->vme_start)) |
| 4992 | >> PAGE_SHIFT); |
| 4993 | } |
| 4994 | |
| 4995 | if (caller_pmap) { |
| 4996 | /* |
| 4997 | * Set up a block mapped area |
| 4998 | */ |
| 4999 | assert((uint32_t)((ldelta + hdelta) >> PAGE_SHIFT) == ((ldelta + hdelta) >> PAGE_SHIFT)); |
| 5000 | kr = pmap_map_block(caller_pmap, |
| 5001 | (addr64_t)(caller_pmap_addr - ldelta), |
| 5002 | (ppnum_t)((((vm_map_offset_t) (VME_OBJECT(entry)->vo_shadow_offset)) + |
| 5003 | VME_OFFSET(entry) + (laddr - entry->vme_start) - ldelta) >> PAGE_SHIFT), |
| 5004 | (uint32_t)((ldelta + hdelta) >> PAGE_SHIFT), prot, |
| 5005 | (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0); |
| 5006 | |
| 5007 | if (kr != KERN_SUCCESS) { |
| 5008 | goto cleanup; |
| 5009 | } |
| 5010 | } else { |
| 5011 | /* |
| 5012 | * Set up a block mapped area |
| 5013 | */ |
| 5014 | assert((uint32_t)((ldelta + hdelta) >> PAGE_SHIFT) == ((ldelta + hdelta) >> PAGE_SHIFT)); |
| 5015 | kr = pmap_map_block(real_map->pmap, |
| 5016 | (addr64_t)(vaddr - ldelta), |
| 5017 | (ppnum_t)((((vm_map_offset_t)(VME_OBJECT(entry)->vo_shadow_offset)) + |
| 5018 | VME_OFFSET(entry) + (laddr - entry->vme_start) - ldelta) >> PAGE_SHIFT), |
| 5019 | (uint32_t)((ldelta + hdelta) >> PAGE_SHIFT), prot, |
| 5020 | (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0); |
| 5021 | |
| 5022 | if (kr != KERN_SUCCESS) { |
| 5023 | goto cleanup; |
| 5024 | } |
| 5025 | } |
| 5026 | } |
| 5027 | } |
| 5028 | |
| 5029 | /* |
| 5030 | * Success |
| 5031 | */ |
| 5032 | kr = KERN_SUCCESS; |
| 5033 | |
| 5034 | /* |
| 5035 | * TODO: could most of the done cases just use cleanup? |
| 5036 | */ |
| 5037 | cleanup: |
| 5038 | /* |
| 5039 | * Unlock everything, and return |
| 5040 | */ |
| 5041 | vm_map_unlock_read(map); |
| 5042 | if (real_map != map) |
| 5043 | vm_map_unlock(real_map); |
| 5044 | |
| 5045 | if (m != VM_PAGE_NULL) { |
| 5046 | assert(VM_PAGE_OBJECT(m) == m_object); |
| 5047 | |
| 5048 | if (!m_object->internal && (fault_type & VM_PROT_WRITE)) { |
| 5049 | |
| 5050 | vm_object_paging_begin(m_object); |
| 5051 | |
| 5052 | assert(written_on_object == VM_OBJECT_NULL); |
| 5053 | written_on_object = m_object; |
| 5054 | written_on_pager = m_object->pager; |
| 5055 | written_on_offset = m_object->paging_offset + m->vmp_offset; |
| 5056 | } |
| 5057 | PAGE_WAKEUP_DONE(m); |
| 5058 | |
| 5059 | vm_fault_cleanup(m_object, top_page); |
| 5060 | } else |
| 5061 | vm_fault_cleanup(object, top_page); |
| 5062 | |
| 5063 | vm_object_deallocate(object); |
| 5064 | |
| 5065 | #undef RELEASE_PAGE |
| 5066 | |
| 5067 | done: |
| 5068 | thread_interrupt_level(interruptible_state); |
| 5069 | |
| 5070 | /* |
| 5071 | * Only I/O throttle on faults which cause a pagein/swapin. |
| 5072 | */ |
| 5073 | if ((type_of_fault == DBG_PAGEIND_FAULT) || (type_of_fault == DBG_PAGEINV_FAULT) || (type_of_fault == DBG_COMPRESSOR_SWAPIN_FAULT)) { |
| 5074 | throttle_lowpri_io(1); |
| 5075 | } else { |
| 5076 | if (kr == KERN_SUCCESS && type_of_fault != DBG_CACHE_HIT_FAULT && type_of_fault != DBG_GUARD_FAULT) { |
| 5077 | |
| 5078 | if ((throttle_delay = vm_page_throttled(TRUE))) { |
| 5079 | |
| 5080 | if (vm_debug_events) { |
| 5081 | if (type_of_fault == DBG_COMPRESSOR_FAULT) |
| 5082 | VM_DEBUG_EVENT(vmf_compressordelay, VMF_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0); |
| 5083 | else if (type_of_fault == DBG_COW_FAULT) |
| 5084 | VM_DEBUG_EVENT(vmf_cowdelay, VMF_COWDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0); |
| 5085 | else |
| 5086 | VM_DEBUG_EVENT(vmf_zfdelay, VMF_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0); |
| 5087 | } |
| 5088 | delay(throttle_delay); |
| 5089 | } |
| 5090 | } |
| 5091 | } |
| 5092 | |
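| | /* |
| | * If this fault wrote to a file-backed page, tell the vnode pager |
| | * which range of the file was dirtied, then drop the paging |
| | * reference that was taken when the write was recorded above. |
| | */ |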
| 5093 | if (written_on_object) { |
| 5094 | |
| 5095 | vnode_pager_dirtied(written_on_pager, written_on_offset, written_on_offset + PAGE_SIZE_64); |
| 5096 | |
| 5097 | vm_object_lock(written_on_object); |
| 5098 | vm_object_paging_end(written_on_object); |
| 5099 | vm_object_unlock(written_on_object); |
| 5100 | |
| 5101 | written_on_object = VM_OBJECT_NULL; |
| 5102 | } |
| 5103 | |
| 5104 | if (rtfault) { |
| 5105 | vm_record_rtfault(cthread, fstart, trace_vaddr, type_of_fault); |
| 5106 | } |
| 5107 | |
| 5108 | KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, |
| 5109 | (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END, |
| 5110 | ((uint64_t)trace_vaddr >> 32), |
| 5111 | trace_vaddr, |
| 5112 | kr, |
| 5113 | type_of_fault, |
| 5114 | 0); |
| 5115 | |
| 5116 | return (kr); |
| 5117 | } |
| 5118 | |
| 5119 | /* |
| 5120 | * vm_fault_wire: |
| 5121 | * |
| 5122 | * Wire down a range of virtual addresses in a map. |
| 5123 | */ |
| 5124 | kern_return_t |
| 5125 | vm_fault_wire( |
| 5126 | vm_map_t map, |
| 5127 | vm_map_entry_t entry, |
| 5128 | vm_prot_t prot, |
| 5129 | vm_tag_t wire_tag, |
| 5130 | pmap_t pmap, |
| 5131 | vm_map_offset_t pmap_addr, |
| 5132 | ppnum_t *physpage_p) |
| 5133 | { |
| 5134 | vm_map_offset_t va; |
| 5135 | vm_map_offset_t end_addr = entry->vme_end; |
| 5136 | kern_return_t rc; |
| 5137 | |
| 5138 | assert(entry->in_transition); |
| 5139 | |
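| | /* |
| | * Physically contiguous memory is wired by default (see the |
| | * matching check in vm_fault_unwire), so there is nothing to |
| | * fault in here. |
| | */ |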
| 5140 | if ((VME_OBJECT(entry) != NULL) && |
| 5141 | !entry->is_sub_map && |
| 5142 | VME_OBJECT(entry)->phys_contiguous) { |
| 5143 | return KERN_SUCCESS; |
| 5144 | } |
| 5145 | |
| 5146 | /* |
| 5147 | * Inform the physical mapping system that the |
| 5148 | * range of addresses may not fault, so that |
| 5149 | * page tables and such can be locked down as well. |
| 5150 | */ |
| 5151 | |
| 5152 | pmap_pageable(pmap, pmap_addr, |
| 5153 | pmap_addr + (end_addr - entry->vme_start), FALSE); |
| 5154 | |
| 5155 | /* |
| 5156 | * We simulate a fault to get the page and enter it |
| 5157 | * in the physical map. |
| 5158 | */ |
| 5159 | |
| 5160 | for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) { |
| 5161 | rc = vm_fault_wire_fast(map, va, prot, wire_tag, entry, pmap, |
| 5162 | pmap_addr + (va - entry->vme_start), |
| 5163 | physpage_p); |
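| | /* |
| | * The fast path only handles the common case; if it could not |
| | * wire this page, fall back to the general fault path. |
| | */ |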
| 5164 | if (rc != KERN_SUCCESS) { |
| 5165 | rc = vm_fault_internal(map, va, prot, TRUE, wire_tag, |
| 5166 | ((pmap == kernel_pmap) |
| 5167 | ? THREAD_UNINT |
| 5168 | : THREAD_ABORTSAFE), |
| 5169 | pmap, |
| 5170 | (pmap_addr + |
| 5171 | (va - entry->vme_start)), |
| 5172 | physpage_p); |
| 5173 | DTRACE_VM2(softlock, int, 1, (uint64_t *), NULL); |
| 5174 | } |
| 5175 | |
| 5176 | if (rc != KERN_SUCCESS) { |
| 5177 | struct vm_map_entry tmp_entry = *entry; |
| 5178 | |
| 5179 | /* unwire wired pages */ |
| 5180 | tmp_entry.vme_end = va; |
| 5181 | vm_fault_unwire(map, |
| 5182 | &tmp_entry, FALSE, pmap, pmap_addr); |
| 5183 | |
| 5184 | return rc; |
| 5185 | } |
| 5186 | } |
| 5187 | return KERN_SUCCESS; |
| 5188 | } |
| 5189 | |
| 5190 | /* |
| 5191 | * vm_fault_unwire: |
| 5192 | * |
| 5193 | * Unwire a range of virtual addresses in a map. |
| 5194 | */ |
| 5195 | void |
| 5196 | vm_fault_unwire( |
| 5197 | vm_map_t map, |
| 5198 | vm_map_entry_t entry, |
| 5199 | boolean_t deallocate, |
| 5200 | pmap_t pmap, |
| 5201 | vm_map_offset_t pmap_addr) |
| 5202 | { |
| 5203 | vm_map_offset_t va; |
| 5204 | vm_map_offset_t end_addr = entry->vme_end; |
| 5205 | vm_object_t object; |
| 5206 | struct vm_object_fault_info fault_info = {}; |
| 5207 | unsigned int unwired_pages; |
| 5208 | |
| 5209 | object = (entry->is_sub_map) ? VM_OBJECT_NULL : VME_OBJECT(entry); |
| 5210 | |
| 5211 | /* |
| 5212 | * If it's marked phys_contiguous, then vm_fault_wire() didn't actually |
| 5213 | * do anything since such memory is wired by default. So we don't have |
| 5214 | * anything to undo here. |
| 5215 | */ |
| 5216 | |
| 5217 | if (object != VM_OBJECT_NULL && object->phys_contiguous) |
| 5218 | return; |
| 5219 | |
| 5220 | fault_info.interruptible = THREAD_UNINT; |
| 5221 | fault_info.behavior = entry->behavior; |
| 5222 | fault_info.user_tag = VME_ALIAS(entry); |
| 5223 | if (entry->iokit_acct || |
| 5224 | (!entry->is_sub_map && !entry->use_pmap)) { |
| 5225 | fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT; |
| 5226 | } |
| 5227 | fault_info.lo_offset = VME_OFFSET(entry); |
| 5228 | fault_info.hi_offset = (entry->vme_end - entry->vme_start) + VME_OFFSET(entry); |
| 5229 | fault_info.no_cache = entry->no_cache; |
| 5230 | fault_info.stealth = TRUE; |
| 5231 | |
| 5232 | unwired_pages = 0; |
| 5233 | |
| 5234 | /* |
| 5235 | * Since the pages are wired down, we must be able to |
| 5236 | * get their mappings from the physical map system. |
| 5237 | */ |
| 5238 | |
| 5239 | for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) { |
| 5240 | |
| 5241 | if (object == VM_OBJECT_NULL) { |
| 5242 | if (pmap) { |
| 5243 | pmap_change_wiring(pmap, |
| 5244 | pmap_addr + (va - entry->vme_start), FALSE); |
| 5245 | } |
| 5246 | (void) vm_fault(map, va, VM_PROT_NONE, |
| 5247 | TRUE, VM_KERN_MEMORY_NONE, THREAD_UNINT, pmap, pmap_addr); |
| 5248 | } else { |
| 5249 | vm_prot_t prot; |
| 5250 | vm_page_t result_page; |
| 5251 | vm_page_t top_page; |
| 5252 | vm_object_t result_object; |
| 5253 | vm_fault_return_t result; |
| 5254 | |
| 5255 | /* cap cluster size at maximum UPL size */ |
| 5256 | upl_size_t cluster_size; |
| 5257 | if (os_sub_overflow(end_addr, va, &cluster_size)) { |
| 5258 | cluster_size = 0 - (upl_size_t)PAGE_SIZE; |
| 5259 | } |
| 5260 | fault_info.cluster_size = cluster_size; |
| 5261 | |
| 5262 | do { |
| 5263 | prot = VM_PROT_NONE; |
| 5264 | |
| 5265 | vm_object_lock(object); |
| 5266 | vm_object_paging_begin(object); |
| 5267 | XPR(XPR_VM_FAULT, |
| 5268 | "vm_fault_unwire -> vm_fault_page\n" , |
| 5269 | 0,0,0,0,0); |
| 5270 | result_page = VM_PAGE_NULL; |
| 5271 | result = vm_fault_page( |
| 5272 | object, |
| 5273 | (VME_OFFSET(entry) + |
| 5274 | (va - entry->vme_start)), |
| 5275 | VM_PROT_NONE, TRUE, |
| 5276 | FALSE, /* page not looked up */ |
| 5277 | &prot, &result_page, &top_page, |
| 5278 | (int *)0, |
| 5279 | NULL, map->no_zero_fill, |
| 5280 | FALSE, &fault_info); |
| 5281 | } while (result == VM_FAULT_RETRY); |
| 5282 | |
| 5283 | /* |
| 5284 | * If this was a mapping to a file on a device that has been forcibly |
| 5285 | * unmounted, then we won't get a page back from vm_fault_page(). Just |
| 5286 | * move on to the next one in case the remaining pages are mapped from |
| 5287 | * different objects. During a forced unmount, the object is terminated |
| 5288 | * so the alive flag will be false if this happens. A forced unmount |
| 5289 | * will occur when an external disk is unplugged before the user does an |
| 5290 | * eject, so we don't want to panic in that situation. |
| 5291 | */ |
| 5292 | |
| 5293 | if (result == VM_FAULT_MEMORY_ERROR && !object->alive) |
| 5294 | continue; |
| 5295 | |
| 5296 | if (result == VM_FAULT_MEMORY_ERROR && |
| 5297 | object == kernel_object) { |
| 5298 | /* |
| 5299 | * This must have been allocated with |
| 5300 | * KMA_KOBJECT and KMA_VAONLY and there's |
| 5301 | * no physical page at this offset. |
| 5302 | * We're done (no page to free). |
| 5303 | */ |
| 5304 | assert(deallocate); |
| 5305 | continue; |
| 5306 | } |
| 5307 | |
| 5308 | if (result != VM_FAULT_SUCCESS) |
| 5309 | panic("vm_fault_unwire: failure" ); |
| 5310 | |
| 5311 | result_object = VM_PAGE_OBJECT(result_page); |
| 5312 | |
| 5313 | if (deallocate) { |
| 5314 | assert(VM_PAGE_GET_PHYS_PAGE(result_page) != |
| 5315 | vm_page_fictitious_addr); |
| 5316 | pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(result_page)); |
| 5317 | if (VM_PAGE_WIRED(result_page)) { |
| 5318 | unwired_pages++; |
| 5319 | } |
| 5320 | VM_PAGE_FREE(result_page); |
| 5321 | } else { |
| 5322 | if ((pmap) && (VM_PAGE_GET_PHYS_PAGE(result_page) != vm_page_guard_addr)) |
| 5323 | pmap_change_wiring(pmap, |
| 5324 | pmap_addr + (va - entry->vme_start), FALSE); |
| 5325 | |
| 5326 | |
| 5327 | if (VM_PAGE_WIRED(result_page)) { |
| 5328 | vm_page_lockspin_queues(); |
| 5329 | vm_page_unwire(result_page, TRUE); |
| 5330 | vm_page_unlock_queues(); |
| 5331 | unwired_pages++; |
| 5332 | } |
| 5333 | if(entry->zero_wired_pages) { |
| 5334 | pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(result_page)); |
| 5335 | entry->zero_wired_pages = FALSE; |
| 5336 | } |
| 5337 | |
| 5338 | PAGE_WAKEUP_DONE(result_page); |
| 5339 | } |
| 5340 | vm_fault_cleanup(result_object, top_page); |
| 5341 | } |
| 5342 | } |
| 5343 | |
| 5344 | /* |
| 5345 | * Inform the physical mapping system that the range |
| 5346 | * of addresses may fault, so that page tables and |
| 5347 | * such may be unwired themselves. |
| 5348 | */ |
| 5349 | |
| 5350 | pmap_pageable(pmap, pmap_addr, |
| 5351 | pmap_addr + (end_addr - entry->vme_start), TRUE); |
| 5352 | |
| 5353 | if (kernel_object == object) { |
| 5354 | vm_tag_update_size(fault_info.user_tag, -ptoa_64(unwired_pages)); |
| 5355 | } |
| 5356 | } |
| 5357 | |
| 5358 | /* |
| 5359 | * vm_fault_wire_fast: |
| 5360 | * |
| 5361 | * Handle common case of a wire down page fault at the given address. |
| 5362 | * If successful, the page is inserted into the associated physical map. |
| 5363 | * The map entry is passed in to avoid the overhead of a map lookup. |
| 5364 | * |
| 5365 | * NOTE: the given address should be truncated to the |
| 5366 | * proper page address. |
| 5367 | * |
| 5368 | * KERN_SUCCESS is returned if the page fault is handled; otherwise, |
| 5369 | * a standard error specifying why the fault is fatal is returned. |
| 5370 | * |
| 5371 | * The map in question must be referenced, and remains so. |
| 5372 | * Caller has a read lock on the map. |
| 5373 | * |
| 5374 | * This is a stripped version of vm_fault() for wiring pages. Anything |
| 5375 | * other than the common case will return KERN_FAILURE, and the caller |
| 5376 | * is expected to call vm_fault(). |
| 5377 | */ |
| 5378 | static kern_return_t |
| 5379 | vm_fault_wire_fast( |
| 5380 | __unused vm_map_t map, |
| 5381 | vm_map_offset_t va, |
| 5382 | __unused vm_prot_t caller_prot, |
| 5383 | vm_tag_t wire_tag, |
| 5384 | vm_map_entry_t entry, |
| 5385 | pmap_t pmap, |
| 5386 | vm_map_offset_t pmap_addr, |
| 5387 | ppnum_t *physpage_p) |
| 5388 | { |
| 5389 | vm_object_t object; |
| 5390 | vm_object_offset_t offset; |
| 5391 | vm_page_t m; |
| 5392 | vm_prot_t prot; |
| 5393 | thread_t thread = current_thread(); |
| 5394 | int type_of_fault; |
| 5395 | kern_return_t kr; |
| 5396 | struct vm_object_fault_info fault_info = {}; |
| 5397 | |
| 5398 | VM_STAT_INCR(faults); |
| 5399 | |
| 5400 | if (thread != THREAD_NULL && thread->task != TASK_NULL) |
| 5401 | thread->task->faults++; |
| 5402 | |
| 5403 | /* |
| 5404 | * Recovery actions |
| 5405 | */ |
| 5406 | |
| 5407 | #undef RELEASE_PAGE |
| 5408 | #define RELEASE_PAGE(m) { \ |
| 5409 | PAGE_WAKEUP_DONE(m); \ |
| 5410 | vm_page_lockspin_queues(); \ |
| 5411 | vm_page_unwire(m, TRUE); \ |
| 5412 | vm_page_unlock_queues(); \ |
| 5413 | } |
| 5414 | |
| 5415 | |
| 5416 | #undef UNLOCK_THINGS |
| 5417 | #define UNLOCK_THINGS { \ |
| 5418 | vm_object_paging_end(object); \ |
| 5419 | vm_object_unlock(object); \ |
| 5420 | } |
| 5421 | |
| 5422 | #undef UNLOCK_AND_DEALLOCATE |
| 5423 | #define UNLOCK_AND_DEALLOCATE { \ |
| 5424 | UNLOCK_THINGS; \ |
| 5425 | vm_object_deallocate(object); \ |
| 5426 | } |
| 5427 | /* |
| 5428 | * Give up and have caller do things the hard way. |
| 5429 | */ |
| 5430 | |
| 5431 | #define GIVE_UP { \ |
| 5432 | UNLOCK_AND_DEALLOCATE; \ |
| 5433 | return(KERN_FAILURE); \ |
| 5434 | } |
| 5435 | |
| 5436 | |
| 5437 | /* |
| 5438 | * If this entry is not directly to a vm_object, bail out. |
| 5439 | */ |
| 5440 | if (entry->is_sub_map) { |
| 5441 | assert(physpage_p == NULL); |
| 5442 | return(KERN_FAILURE); |
| 5443 | } |
| 5444 | |
| 5445 | /* |
| 5446 | * Find the backing store object and offset into it. |
| 5447 | */ |
| 5448 | |
| 5449 | object = VME_OBJECT(entry); |
| 5450 | offset = (va - entry->vme_start) + VME_OFFSET(entry); |
| 5451 | prot = entry->protection; |
| 5452 | |
| 5453 | /* |
| 5454 | * Make a reference to this object to prevent its |
| 5455 | * disposal while we are messing with it. |
| 5456 | */ |
| 5457 | |
| 5458 | vm_object_lock(object); |
| 5459 | vm_object_reference_locked(object); |
| 5460 | vm_object_paging_begin(object); |
| 5461 | |
| 5462 | /* |
| 5463 | * INVARIANTS (through entire routine): |
| 5464 | * |
| 5465 | * 1) At all times, we must either have the object |
| 5466 | * lock or a busy page in some object to prevent |
| 5467 | * some other thread from trying to bring in |
| 5468 | * the same page. |
| 5469 | * |
| 5470 | * 2) Once we have a busy page, we must remove it from |
| 5471 | * the pageout queues, so that the pageout daemon |
| 5472 | * will not grab it away. |
| 5473 | * |
| 5474 | */ |
| 5475 | |
| 5476 | /* |
| 5477 | * Look for page in top-level object. If it's not there or |
| 5478 | * there's something going on, give up. |
| 5479 | */ |
| 5480 | m = vm_page_lookup(object, offset); |
| 5481 | if ((m == VM_PAGE_NULL) || (m->vmp_busy) || |
| 5482 | (m->vmp_unusual && ( m->vmp_error || m->vmp_restart || m->vmp_absent))) { |
| 5483 | |
| 5484 | GIVE_UP; |
| 5485 | } |
| 5486 | if (m->vmp_fictitious && |
| 5487 | VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) { |
| 5488 | /* |
| 5489 | * Guard pages are fictitious pages and are never |
| 5490 | * entered into a pmap, so let's say it's been wired... |
| 5491 | */ |
| 5492 | kr = KERN_SUCCESS; |
| 5493 | goto done; |
| 5494 | } |
| 5495 | |
| 5496 | /* |
| 5497 | * Wire the page down now. All bail outs beyond this |
| 5498 | * point must unwire the page. |
| 5499 | */ |
| 5500 | |
| 5501 | vm_page_lockspin_queues(); |
| 5502 | vm_page_wire(m, wire_tag, TRUE); |
| 5503 | vm_page_unlock_queues(); |
| 5504 | |
| 5505 | /* |
| 5506 | * Mark page busy for other threads. |
| 5507 | */ |
| 5508 | assert(!m->vmp_busy); |
| 5509 | m->vmp_busy = TRUE; |
| 5510 | assert(!m->vmp_absent); |
| 5511 | |
| 5512 | /* |
| 5513 | * Give up if the page is being written and there's a copy object |
| 5514 | */ |
| 5515 | if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) { |
| 5516 | RELEASE_PAGE(m); |
| 5517 | GIVE_UP; |
| 5518 | } |
| 5519 | |
| 5520 | fault_info.user_tag = VME_ALIAS(entry); |
| 5521 | fault_info.pmap_options = 0; |
| 5522 | if (entry->iokit_acct || |
| 5523 | (!entry->is_sub_map && !entry->use_pmap)) { |
| 5524 | fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT; |
| 5525 | } |
| 5526 | |
| 5527 | /* |
| 5528 | * Put this page into the physical map. |
| 5529 | */ |
| 5530 | type_of_fault = DBG_CACHE_HIT_FAULT; |
| 5531 | kr = vm_fault_enter(m, |
| 5532 | pmap, |
| 5533 | pmap_addr, |
| 5534 | prot, |
| 5535 | prot, |
| 5536 | TRUE, /* wired */ |
| 5537 | FALSE, /* change_wiring */ |
| 5538 | wire_tag, |
| 5539 | &fault_info, |
| 5540 | NULL, |
| 5541 | &type_of_fault); |
| 5542 | if (kr != KERN_SUCCESS) { |
| 5543 | RELEASE_PAGE(m); |
| 5544 | GIVE_UP; |
| 5545 | } |
| 5546 | |
| 5547 | done: |
| 5548 | /* |
| 5549 | * Unlock everything, and return |
| 5550 | */ |
| 5551 | |
| 5552 | if (physpage_p) { |
| 5553 | /* for vm_map_wire_and_extract() */ |
| 5554 | if (kr == KERN_SUCCESS) { |
| 5555 | assert(object == VM_PAGE_OBJECT(m)); |
| 5556 | *physpage_p = VM_PAGE_GET_PHYS_PAGE(m); |
| 5557 | if (prot & VM_PROT_WRITE) { |
| 5558 | vm_object_lock_assert_exclusive(object); |
| 5559 | m->vmp_dirty = TRUE; |
| 5560 | } |
| 5561 | } else { |
| 5562 | *physpage_p = 0; |
| 5563 | } |
| 5564 | } |
| 5565 | |
| 5566 | PAGE_WAKEUP_DONE(m); |
| 5567 | UNLOCK_AND_DEALLOCATE; |
| 5568 | |
| 5569 | return kr; |
| 5570 | |
| 5571 | } |
| 5572 | |
| 5573 | /* |
| 5574 | * Routine: vm_fault_copy_cleanup |
| 5575 | * Purpose: |
| 5576 | * Release a page used by vm_fault_copy. |
| 5577 | */ |
| 5578 | |
| 5579 | static void |
| 5580 | vm_fault_copy_cleanup( |
| 5581 | vm_page_t page, |
| 5582 | vm_page_t top_page) |
| 5583 | { |
| 5584 | vm_object_t object = VM_PAGE_OBJECT(page); |
| 5585 | |
| 5586 | vm_object_lock(object); |
| 5587 | PAGE_WAKEUP_DONE(page); |
| 5588 | if ( !VM_PAGE_PAGEABLE(page)) { |
| 5589 | vm_page_lockspin_queues(); |
| 5590 | if ( !VM_PAGE_PAGEABLE(page)) { |
| 5591 | vm_page_activate(page); |
| 5592 | } |
| 5593 | vm_page_unlock_queues(); |
| 5594 | } |
| 5595 | vm_fault_cleanup(object, top_page); |
| 5596 | } |
| 5597 | |
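| | /* |
| | * Routine: vm_fault_copy_dst_cleanup |
| | * Purpose: |
| | * Release the destination page wired by vm_fault_copy. |
| | */ |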
| 5598 | static void |
| 5599 | vm_fault_copy_dst_cleanup( |
| 5600 | vm_page_t page) |
| 5601 | { |
| 5602 | vm_object_t object; |
| 5603 | |
| 5604 | if (page != VM_PAGE_NULL) { |
| 5605 | object = VM_PAGE_OBJECT(page); |
| 5606 | vm_object_lock(object); |
| 5607 | vm_page_lockspin_queues(); |
| 5608 | vm_page_unwire(page, TRUE); |
| 5609 | vm_page_unlock_queues(); |
| 5610 | vm_object_paging_end(object); |
| 5611 | vm_object_unlock(object); |
| 5612 | } |
| 5613 | } |
| 5614 | |
| 5615 | /* |
| 5616 | * Routine: vm_fault_copy |
| 5617 | * |
| 5618 | * Purpose: |
| 5619 | * Copy pages from one virtual memory object to another -- |
| 5620 | * neither the source nor destination pages need be resident. |
| 5621 | * |
| 5622 | * Before actually copying a page, the version associated with |
| 5623 | * the destination address map will be verified. |
| 5624 | * |
| 5625 | * In/out conditions: |
| 5626 | * The caller must hold a reference, but not a lock, to |
| 5627 | * each of the source and destination objects and to the |
| 5628 | * destination map. |
| 5629 | * |
| 5630 | * Results: |
| 5631 | * Returns KERN_SUCCESS if no errors were encountered in |
| 5632 | * reading or writing the data. Returns KERN_INTERRUPTED if |
| 5633 | * the operation was interrupted (only possible if the |
| 5634 | * "interruptible" argument is asserted). Other return values |
| 5635 | * indicate a permanent error in copying the data. |
| 5636 | * |
| 5637 | * The actual amount of data copied will be returned in the |
| 5638 | * "copy_size" argument. In the event that the destination map |
| 5639 | * verification failed, this amount may be less than the amount |
| 5640 | * requested. |
| 5641 | */ |
| 5642 | kern_return_t |
| 5643 | vm_fault_copy( |
| 5644 | vm_object_t src_object, |
| 5645 | vm_object_offset_t src_offset, |
| 5646 | vm_map_size_t *copy_size, /* INOUT */ |
| 5647 | vm_object_t dst_object, |
| 5648 | vm_object_offset_t dst_offset, |
| 5649 | vm_map_t dst_map, |
| 5650 | vm_map_version_t *dst_version, |
| 5651 | int interruptible) |
| 5652 | { |
| 5653 | vm_page_t result_page; |
| 5654 | |
| 5655 | vm_page_t src_page; |
| 5656 | vm_page_t src_top_page; |
| 5657 | vm_prot_t src_prot; |
| 5658 | |
| 5659 | vm_page_t dst_page; |
| 5660 | vm_page_t dst_top_page; |
| 5661 | vm_prot_t dst_prot; |
| 5662 | |
| 5663 | vm_map_size_t amount_left; |
| 5664 | vm_object_t old_copy_object; |
| 5665 | vm_object_t result_page_object = NULL; |
| 5666 | kern_return_t error = 0; |
| 5667 | vm_fault_return_t result; |
| 5668 | |
| 5669 | vm_map_size_t part_size; |
| 5670 | struct vm_object_fault_info fault_info_src = {}; |
| 5671 | struct vm_object_fault_info fault_info_dst = {}; |
| 5672 | |
| 5673 | /* |
| 5674 | * In order not to confuse the clustered pageins, align |
| 5675 | * the different offsets on a page boundary. |
| 5676 | */ |
| 5677 | |
| 5678 | #define RETURN(x) \ |
| 5679 | MACRO_BEGIN \ |
| 5680 | *copy_size -= amount_left; \ |
| 5681 | MACRO_RETURN(x); \ |
| 5682 | MACRO_END |
| 5683 | |
| 5684 | amount_left = *copy_size; |
| 5685 | |
| 5686 | fault_info_src.interruptible = interruptible; |
| 5687 | fault_info_src.behavior = VM_BEHAVIOR_SEQUENTIAL; |
| 5688 | fault_info_src.lo_offset = vm_object_trunc_page(src_offset); |
| 5689 | fault_info_src.hi_offset = fault_info_src.lo_offset + amount_left; |
| 5690 | fault_info_src.stealth = TRUE; |
| 5691 | |
| 5692 | fault_info_dst.interruptible = interruptible; |
| 5693 | fault_info_dst.behavior = VM_BEHAVIOR_SEQUENTIAL; |
| 5694 | fault_info_dst.lo_offset = vm_object_trunc_page(dst_offset); |
| 5695 | fault_info_dst.hi_offset = fault_info_dst.lo_offset + amount_left; |
| 5696 | fault_info_dst.stealth = TRUE; |
| 5697 | |
| 5698 | do { /* while (amount_left > 0) */ |
| 5699 | /* |
| 5700 | * There may be a deadlock if both source and destination |
| 5701 | * pages are the same. To avoid this deadlock, the copy must |
| 5702 | * start by getting the destination page in order to apply |
| 5703 | * COW semantics if any. |
| 5704 | */ |
| 5705 | |
| 5706 | RetryDestinationFault: ; |
| 5707 | |
| 5708 | dst_prot = VM_PROT_WRITE|VM_PROT_READ; |
| 5709 | |
| 5710 | vm_object_lock(dst_object); |
| 5711 | vm_object_paging_begin(dst_object); |
| 5712 | |
| 5713 | /* cap cluster size at maximum UPL size */ |
| 5714 | upl_size_t cluster_size; |
| 5715 | if (os_convert_overflow(amount_left, &cluster_size)) { |
| 5716 | cluster_size = 0 - (upl_size_t)PAGE_SIZE; |
| 5717 | } |
| 5718 | fault_info_dst.cluster_size = cluster_size; |
| 5719 | |
| 5720 | XPR(XPR_VM_FAULT,"vm_fault_copy -> vm_fault_page\n",0,0,0,0,0); |
| 5721 | dst_page = VM_PAGE_NULL; |
| 5722 | result = vm_fault_page(dst_object, |
| 5723 | vm_object_trunc_page(dst_offset), |
| 5724 | VM_PROT_WRITE|VM_PROT_READ, |
| 5725 | FALSE, |
| 5726 | FALSE, /* page not looked up */ |
| 5727 | &dst_prot, &dst_page, &dst_top_page, |
| 5728 | (int *)0, |
| 5729 | &error, |
| 5730 | dst_map->no_zero_fill, |
| 5731 | FALSE, &fault_info_dst); |
| 5732 | switch (result) { |
| 5733 | case VM_FAULT_SUCCESS: |
| 5734 | break; |
| 5735 | case VM_FAULT_RETRY: |
| 5736 | goto RetryDestinationFault; |
| 5737 | case VM_FAULT_MEMORY_SHORTAGE: |
| 5738 | if (vm_page_wait(interruptible)) |
| 5739 | goto RetryDestinationFault; |
| 5740 | /* fall thru */ |
| 5741 | case VM_FAULT_INTERRUPTED: |
| 5742 | RETURN(MACH_SEND_INTERRUPTED); |
| 5743 | case VM_FAULT_SUCCESS_NO_VM_PAGE: |
| 5744 | /* success but no VM page: fail the copy */ |
| 5745 | vm_object_paging_end(dst_object); |
| 5746 | vm_object_unlock(dst_object); |
| 5747 | /*FALLTHROUGH*/ |
| 5748 | case VM_FAULT_MEMORY_ERROR: |
| 5749 | if (error) |
| 5750 | return (error); |
| 5751 | else |
| 5752 | return(KERN_MEMORY_ERROR); |
| 5753 | default: |
| 5754 | panic("vm_fault_copy: unexpected error 0x%x from " |
| 5755 | "vm_fault_page()\n" , result); |
| 5756 | } |
| 5757 | assert ((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE); |
| 5758 | |
| 5759 | assert(dst_object == VM_PAGE_OBJECT(dst_page)); |
| 5760 | old_copy_object = dst_object->copy; |
| 5761 | |
| 5762 | /* |
| 5763 | * There exists the possibility that the source and |
| 5764 | * destination page are the same. But we can't |
| 5765 | * easily determine that now. If they are the |
| 5766 | * same, the call to vm_fault_page() for the |
| 5767 | * destination page will deadlock. To prevent this we |
| 5768 | * wire the page so we can drop busy without having |
| 5769 | * the page daemon steal the page. We clean up the |
| 5770 | * top page but keep the paging reference on the object |
| 5771 | * holding the dest page so it doesn't go away. |
| 5772 | */ |
| 5773 | |
| 5774 | vm_page_lockspin_queues(); |
| 5775 | vm_page_wire(dst_page, VM_KERN_MEMORY_OSFMK, TRUE); |
| 5776 | vm_page_unlock_queues(); |
| 5777 | PAGE_WAKEUP_DONE(dst_page); |
| 5778 | vm_object_unlock(dst_object); |
| 5779 | |
| 5780 | if (dst_top_page != VM_PAGE_NULL) { |
| 5781 | vm_object_lock(dst_object); |
| 5782 | VM_PAGE_FREE(dst_top_page); |
| 5783 | vm_object_paging_end(dst_object); |
| 5784 | vm_object_unlock(dst_object); |
| 5785 | } |
| 5786 | |
| 5787 | RetrySourceFault: ; |
| 5788 | |
| 5789 | if (src_object == VM_OBJECT_NULL) { |
| 5790 | /* |
| 5791 | * No source object. We will just |
| 5792 | * zero-fill the page in dst_object. |
| 5793 | */ |
| 5794 | src_page = VM_PAGE_NULL; |
| 5795 | result_page = VM_PAGE_NULL; |
| 5796 | } else { |
| 5797 | vm_object_lock(src_object); |
| 5798 | src_page = vm_page_lookup(src_object, |
| 5799 | vm_object_trunc_page(src_offset)); |
| 5800 | if (src_page == dst_page) { |
| 5801 | src_prot = dst_prot; |
| 5802 | result_page = VM_PAGE_NULL; |
| 5803 | } else { |
| 5804 | src_prot = VM_PROT_READ; |
| 5805 | vm_object_paging_begin(src_object); |
| 5806 | |
| 5807 | /* cap cluster size at maximum UPL size */ |
| 5808 | if (os_convert_overflow(amount_left, &cluster_size)) { |
| 5809 | cluster_size = 0 - (upl_size_t)PAGE_SIZE; |
| 5810 | } |
| 5811 | fault_info_src.cluster_size = cluster_size; |
| 5812 | |
| 5813 | XPR(XPR_VM_FAULT, |
| 5814 | "vm_fault_copy(2) -> vm_fault_page\n" , |
| 5815 | 0,0,0,0,0); |
| 5816 | result_page = VM_PAGE_NULL; |
| 5817 | result = vm_fault_page( |
| 5818 | src_object, |
| 5819 | vm_object_trunc_page(src_offset), |
| 5820 | VM_PROT_READ, FALSE, |
| 5821 | FALSE, /* page not looked up */ |
| 5822 | &src_prot, |
| 5823 | &result_page, &src_top_page, |
| 5824 | (int *)0, &error, FALSE, |
| 5825 | FALSE, &fault_info_src); |
| 5826 | |
| 5827 | switch (result) { |
| 5828 | case VM_FAULT_SUCCESS: |
| 5829 | break; |
| 5830 | case VM_FAULT_RETRY: |
| 5831 | goto RetrySourceFault; |
| 5832 | case VM_FAULT_MEMORY_SHORTAGE: |
| 5833 | if (vm_page_wait(interruptible)) |
| 5834 | goto RetrySourceFault; |
| 5835 | /* fall thru */ |
| 5836 | case VM_FAULT_INTERRUPTED: |
| 5837 | vm_fault_copy_dst_cleanup(dst_page); |
| 5838 | RETURN(MACH_SEND_INTERRUPTED); |
| 5839 | case VM_FAULT_SUCCESS_NO_VM_PAGE: |
| 5840 | /* success but no VM page: fail */ |
| 5841 | vm_object_paging_end(src_object); |
| 5842 | vm_object_unlock(src_object); |
| 5843 | /*FALLTHROUGH*/ |
| 5844 | case VM_FAULT_MEMORY_ERROR: |
| 5845 | vm_fault_copy_dst_cleanup(dst_page); |
| 5846 | if (error) |
| 5847 | return (error); |
| 5848 | else |
| 5849 | return(KERN_MEMORY_ERROR); |
| 5850 | default: |
| 5851 | panic("vm_fault_copy(2): unexpected " |
| 5852 | "error 0x%x from " |
| 5853 | "vm_fault_page()\n" , result); |
| 5854 | } |
| 5855 | |
| 5856 | result_page_object = VM_PAGE_OBJECT(result_page); |
| 5857 | assert((src_top_page == VM_PAGE_NULL) == |
| 5858 | (result_page_object == src_object)); |
| 5859 | } |
| 5860 | assert ((src_prot & VM_PROT_READ) != VM_PROT_NONE); |
| 5861 | vm_object_unlock(result_page_object); |
| 5862 | } |
| 5863 | |
| 5864 | vm_map_lock_read(dst_map); |
| 5865 | |
| 5866 | if (!vm_map_verify(dst_map, dst_version)) { |
| 5867 | vm_map_unlock_read(dst_map); |
| 5868 | if (result_page != VM_PAGE_NULL && src_page != dst_page) |
| 5869 | vm_fault_copy_cleanup(result_page, src_top_page); |
| 5870 | vm_fault_copy_dst_cleanup(dst_page); |
| 5871 | break; |
| 5872 | } |
| 5873 | assert(dst_object == VM_PAGE_OBJECT(dst_page)); |
| 5874 | |
| 5875 | vm_object_lock(dst_object); |
| 5876 | |
| 5877 | if (dst_object->copy != old_copy_object) { |
| 5878 | vm_object_unlock(dst_object); |
| 5879 | vm_map_unlock_read(dst_map); |
| 5880 | if (result_page != VM_PAGE_NULL && src_page != dst_page) |
| 5881 | vm_fault_copy_cleanup(result_page, src_top_page); |
| 5882 | vm_fault_copy_dst_cleanup(dst_page); |
| 5883 | break; |
| 5884 | } |
| 5885 | vm_object_unlock(dst_object); |
| 5886 | |
| 5887 | /* |
| 5888 | * Copy the page, and note that it is dirty |
| 5889 | * immediately. |
| 5890 | */ |
| 5891 | |
| 5892 | if (!page_aligned(src_offset) || |
| 5893 | !page_aligned(dst_offset) || |
| 5894 | !page_aligned(amount_left)) { |
| 5895 | |
| 5896 | vm_object_offset_t src_po, |
| 5897 | dst_po; |
| 5898 | |
| 5899 | src_po = src_offset - vm_object_trunc_page(src_offset); |
| 5900 | dst_po = dst_offset - vm_object_trunc_page(dst_offset); |
| 5901 | |
| 5902 | if (dst_po > src_po) { |
| 5903 | part_size = PAGE_SIZE - dst_po; |
| 5904 | } else { |
| 5905 | part_size = PAGE_SIZE - src_po; |
| 5906 | } |
| 5907 | if (part_size > (amount_left)){ |
| 5908 | part_size = amount_left; |
| 5909 | } |
| 5910 | |
| 5911 | if (result_page == VM_PAGE_NULL) { |
| 5912 | assert((vm_offset_t) dst_po == dst_po); |
| 5913 | assert((vm_size_t) part_size == part_size); |
| 5914 | vm_page_part_zero_fill(dst_page, |
| 5915 | (vm_offset_t) dst_po, |
| 5916 | (vm_size_t) part_size); |
| 5917 | } else { |
| 5918 | assert((vm_offset_t) src_po == src_po); |
| 5919 | assert((vm_offset_t) dst_po == dst_po); |
| 5920 | assert((vm_size_t) part_size == part_size); |
| 5921 | vm_page_part_copy(result_page, |
| 5922 | (vm_offset_t) src_po, |
| 5923 | dst_page, |
| 5924 | (vm_offset_t) dst_po, |
| 5925 | (vm_size_t)part_size); |
| 5926 | if(!dst_page->vmp_dirty){ |
| 5927 | vm_object_lock(dst_object); |
| 5928 | SET_PAGE_DIRTY(dst_page, TRUE); |
| 5929 | vm_object_unlock(dst_object); |
| 5930 | } |
| 5931 | |
| 5932 | } |
| 5933 | } else { |
| 5934 | part_size = PAGE_SIZE; |
| 5935 | |
| 5936 | if (result_page == VM_PAGE_NULL) |
| 5937 | vm_page_zero_fill(dst_page); |
| 5938 | else{ |
| 5939 | vm_object_lock(result_page_object); |
| 5940 | vm_page_copy(result_page, dst_page); |
| 5941 | vm_object_unlock(result_page_object); |
| 5942 | |
| 5943 | if(!dst_page->vmp_dirty){ |
| 5944 | vm_object_lock(dst_object); |
| 5945 | SET_PAGE_DIRTY(dst_page, TRUE); |
| 5946 | vm_object_unlock(dst_object); |
| 5947 | } |
| 5948 | } |
| 5949 | |
| 5950 | } |
| 5951 | |
| 5952 | /* |
| 5953 | * Unlock everything, and return |
| 5954 | */ |
| 5955 | |
| 5956 | vm_map_unlock_read(dst_map); |
| 5957 | |
| 5958 | if (result_page != VM_PAGE_NULL && src_page != dst_page) |
| 5959 | vm_fault_copy_cleanup(result_page, src_top_page); |
| 5960 | vm_fault_copy_dst_cleanup(dst_page); |
| 5961 | |
| 5962 | amount_left -= part_size; |
| 5963 | src_offset += part_size; |
| 5964 | dst_offset += part_size; |
| 5965 | } while (amount_left > 0); |
| 5966 | |
| 5967 | RETURN(KERN_SUCCESS); |
| 5968 | #undef RETURN |
| 5969 | |
| 5970 | /*NOTREACHED*/ |
| 5971 | } |
| 5972 | |
| 5973 | #if VM_FAULT_CLASSIFY |
| 5974 | /* |
| 5975 | * Temporary statistics gathering support. |
| 5976 | */ |
| 5977 | |
| 5978 | /* |
| 5979 | * Statistics arrays: |
| 5980 | */ |
| 5981 | #define VM_FAULT_TYPES_MAX 5 |
| 5982 | #define VM_FAULT_LEVEL_MAX 8 |
| 5983 | |
| 5984 | int vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX]; |
| 5985 | |
| 5986 | #define VM_FAULT_TYPE_ZERO_FILL 0 |
| 5987 | #define VM_FAULT_TYPE_MAP_IN 1 |
| 5988 | #define VM_FAULT_TYPE_PAGER 2 |
| 5989 | #define VM_FAULT_TYPE_COPY 3 |
| 5990 | #define VM_FAULT_TYPE_OTHER 4 |
| 5991 | |
| 5992 | |
| 5993 | void |
| 5994 | vm_fault_classify(vm_object_t object, |
| 5995 | vm_object_offset_t offset, |
| 5996 | vm_prot_t fault_type) |
| 5997 | { |
| 5998 | int type, level = 0; |
| 5999 | vm_page_t m; |
| 6000 | |
| 6001 | while (TRUE) { |
| 6002 | m = vm_page_lookup(object, offset); |
| 6003 | if (m != VM_PAGE_NULL) { |
| 6004 | if (m->vmp_busy || m->vmp_error || m->vmp_restart || m->vmp_absent) { |
| 6005 | type = VM_FAULT_TYPE_OTHER; |
| 6006 | break; |
| 6007 | } |
| 6008 | if (((fault_type & VM_PROT_WRITE) == 0) || |
| 6009 | ((level == 0) && object->copy == VM_OBJECT_NULL)) { |
| 6010 | type = VM_FAULT_TYPE_MAP_IN; |
| 6011 | break; |
| 6012 | } |
| 6013 | type = VM_FAULT_TYPE_COPY; |
| 6014 | break; |
| 6015 | } |
| 6016 | else { |
| 6017 | if (object->pager_created) { |
| 6018 | type = VM_FAULT_TYPE_PAGER; |
| 6019 | break; |
| 6020 | } |
| 6021 | if (object->shadow == VM_OBJECT_NULL) { |
| 6022 | type = VM_FAULT_TYPE_ZERO_FILL; |
| 6023 | break; |
| 6024 | } |
| 6025 | |
| 6026 | offset += object->vo_shadow_offset; |
| 6027 | object = object->shadow; |
| 6028 | level++; |
| 6029 | continue; |
| 6030 | } |
| 6031 | } |
| 6032 | |
| 6033 | if (level > VM_FAULT_LEVEL_MAX) |
| 6034 | level = VM_FAULT_LEVEL_MAX; |
| 6035 | |
| 6036 | vm_fault_stats[type][level] += 1; |
| 6037 | |
| 6038 | return; |
| 6039 | } |
| 6040 | |
| 6041 | /* cleanup routine to call from debugger */ |
| 6042 | |
| 6043 | void |
| 6044 | vm_fault_classify_init(void) |
| 6045 | { |
| 6046 | int type, level; |
| 6047 | |
| 6048 | for (type = 0; type < VM_FAULT_TYPES_MAX; type++) { |
| 6049 | for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) { |
| 6050 | vm_fault_stats[type][level] = 0; |
| 6051 | } |
| 6052 | } |
| 6053 | |
| 6054 | return; |
| 6055 | } |
| 6056 | #endif /* VM_FAULT_CLASSIFY */ |
| 6057 | |
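| | /* |
| | * kdp_lightweight_fault: |
| | * |
| | * Called from the kernel debugger to translate a page-aligned virtual |
| | * address in "map" to a physical address without blocking. Returns 0 |
| | * whenever the translation cannot be done safely (locks already held |
| | * exclusively, page busy/absent/unusual, etc.). If the page only |
| | * exists in the compressor, it is decompressed into a dedicated KDP |
| | * buffer and that buffer's physical address is returned. |
| | */ |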
| 6058 | vm_offset_t |
| 6059 | kdp_lightweight_fault(vm_map_t map, vm_offset_t cur_target_addr) |
| 6060 | { |
| 6061 | vm_map_entry_t entry; |
| 6062 | vm_object_t object; |
| 6063 | vm_offset_t object_offset; |
| 6064 | vm_page_t m; |
| 6065 | int compressor_external_state, compressed_count_delta; |
| 6066 | int compressor_flags = (C_DONT_BLOCK | C_KEEP | C_KDP); |
| 6067 | int my_fault_type = VM_PROT_READ; |
| 6068 | kern_return_t kr; |
| 6069 | |
| 6070 | if (not_in_kdp) { |
| 6071 | panic("kdp_lightweight_fault called from outside of debugger context" ); |
| 6072 | } |
| 6073 | |
| 6074 | assert(map != VM_MAP_NULL); |
| 6075 | |
| 6076 | assert((cur_target_addr & PAGE_MASK) == 0); |
| 6077 | if ((cur_target_addr & PAGE_MASK) != 0) { |
| 6078 | return 0; |
| 6079 | } |
| 6080 | |
| 6081 | if (kdp_lck_rw_lock_is_acquired_exclusive(&map->lock)) { |
| 6082 | return 0; |
| 6083 | } |
| 6084 | |
| 6085 | if (!vm_map_lookup_entry(map, cur_target_addr, &entry)) { |
| 6086 | return 0; |
| 6087 | } |
| 6088 | |
| 6089 | if (entry->is_sub_map) { |
| 6090 | return 0; |
| 6091 | } |
| 6092 | |
| 6093 | object = VME_OBJECT(entry); |
| 6094 | if (object == VM_OBJECT_NULL) { |
| 6095 | return 0; |
| 6096 | } |
| 6097 | |
| 6098 | object_offset = cur_target_addr - entry->vme_start + VME_OFFSET(entry); |
| 6099 | |
| 6100 | while (TRUE) { |
| 6101 | if (kdp_lck_rw_lock_is_acquired_exclusive(&object->Lock)) { |
| 6102 | return 0; |
| 6103 | } |
| 6104 | |
| 6105 | if (object->pager_created && (object->paging_in_progress || |
| 6106 | object->activity_in_progress)) { |
| 6107 | return 0; |
| 6108 | } |
| 6109 | |
| 6110 | m = kdp_vm_page_lookup(object, object_offset); |
| 6111 | |
| 6112 | if (m != VM_PAGE_NULL) { |
| 6113 | |
| 6114 | if ((object->wimg_bits & VM_WIMG_MASK) != VM_WIMG_DEFAULT) { |
| 6115 | return 0; |
| 6116 | } |
| 6117 | |
| 6118 | if (m->vmp_laundry || m->vmp_busy || m->vmp_free_when_done || m->vmp_absent || m->vmp_error || m->vmp_cleaning || |
| 6119 | m->vmp_overwriting || m->vmp_restart || m->vmp_unusual) { |
| 6120 | return 0; |
| 6121 | } |
| 6122 | |
| 6123 | assert(!m->vmp_private); |
| 6124 | if (m->vmp_private) { |
| 6125 | return 0; |
| 6126 | } |
| 6127 | |
| 6128 | assert(!m->vmp_fictitious); |
| 6129 | if (m->vmp_fictitious) { |
| 6130 | return 0; |
| 6131 | } |
| 6132 | |
| 6133 | assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR); |
| 6134 | if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) { |
| 6135 | return 0; |
| 6136 | } |
| 6137 | |
| 6138 | return ptoa(VM_PAGE_GET_PHYS_PAGE(m)); |
| 6139 | } |
| 6140 | |
| 6141 | compressor_external_state = VM_EXTERNAL_STATE_UNKNOWN; |
| 6142 | |
| 6143 | if (object->pager_created && MUST_ASK_PAGER(object, object_offset, compressor_external_state)) { |
| 6144 | if (compressor_external_state == VM_EXTERNAL_STATE_EXISTS) { |
| 6145 | kr = vm_compressor_pager_get(object->pager, (object_offset + object->paging_offset), |
| 6146 | kdp_compressor_decompressed_page_ppnum, &my_fault_type, |
| 6147 | compressor_flags, &compressed_count_delta); |
| 6148 | if (kr == KERN_SUCCESS) { |
| 6149 | return kdp_compressor_decompressed_page_paddr; |
| 6150 | } else { |
| 6151 | return 0; |
| 6152 | } |
| 6153 | } |
| 6154 | } |
| 6155 | |
| 6156 | if (object->shadow == VM_OBJECT_NULL) { |
| 6157 | return 0; |
| 6158 | } |
| 6159 | |
| 6160 | object_offset += object->vo_shadow_offset; |
| 6161 | object = object->shadow; |
| 6162 | } |
| 6163 | |
| 6164 | } |
| 6165 | |
| 6166 | /* |
| 6167 | * vm_page_validate_cs_fast(): |
| 6168 | * Performs a few quick checks to determine if the page's code signature |
| 6169 | * really needs to be fully validated. It could: |
| 6170 | * 1. have been modified (i.e. automatically tainted), |
| 6171 | * 2. have already been validated, |
| 6172 | * 3. have already been found to be tainted, |
| 6173 | * 4. no longer have a backing store. |
| 6174 | * Returns FALSE if the page needs to be fully validated. |
| 6175 | */ |
| 6176 | static boolean_t |
| 6177 | vm_page_validate_cs_fast( |
| 6178 | vm_page_t page) |
| 6179 | { |
| 6180 | vm_object_t object; |
| 6181 | |
| 6182 | object = VM_PAGE_OBJECT(page); |
| 6183 | vm_object_lock_assert_held(object); |
| 6184 | |
| 6185 | if (page->vmp_wpmapped && !page->vmp_cs_tainted) { |
| 6186 | /* |
| 6187 | * This page was mapped for "write" access sometime in the |
| 6188 | * past and could still be modifiable in the future. |
| 6189 | * Consider it tainted. |
| 6190 | * [ If the page was already found to be "tainted", no |
| 6191 | * need to re-validate. ] |
| 6192 | */ |
| 6193 | vm_object_lock_assert_exclusive(object); |
| 6194 | page->vmp_cs_validated = TRUE; |
| 6195 | page->vmp_cs_tainted = TRUE; |
| 6196 | if (cs_debug) { |
| 6197 | printf("CODESIGNING: %s: " |
| 6198 | "page %p obj %p off 0x%llx " |
| 6199 | "was modified\n" , |
| 6200 | __FUNCTION__, |
| 6201 | page, object, page->vmp_offset); |
| 6202 | } |
| 6203 | vm_cs_validated_dirtied++; |
| 6204 | } |
| 6205 | |
| 6206 | if (page->vmp_cs_validated || page->vmp_cs_tainted) { |
| 6207 | return TRUE; |
| 6208 | } |
| 6209 | vm_object_lock_assert_exclusive(object); |
| 6210 | |
| 6211 | #if CHECK_CS_VALIDATION_BITMAP |
| 6212 | kern_return_t kr; |
| 6213 | |
| 6214 | kr = vnode_pager_cs_check_validation_bitmap( |
| 6215 | object->pager, |
| 6216 | page->vmp_offset + object->paging_offset, |
| 6217 | CS_BITMAP_CHECK); |
| 6218 | if (kr == KERN_SUCCESS) { |
| 6219 | page->vmp_cs_validated = TRUE; |
| 6220 | page->vmp_cs_tainted = FALSE; |
| 6221 | vm_cs_bitmap_validated++; |
| 6222 | return TRUE; |
| 6223 | } |
| 6224 | #endif /* CHECK_CS_VALIDATION_BITMAP */ |
| 6225 | |
| 6226 | if (!object->alive || object->terminating || object->pager == NULL) { |
| 6227 | /* |
| 6228 | * The object is terminating and we don't have its pager |
| 6229 | * so we can't validate the data... |
| 6230 | */ |
| 6231 | return TRUE; |
| 6232 | } |
| 6233 | |
| 6234 | /* we need to really validate this page */ |
| 6235 | vm_object_lock_assert_exclusive(object); |
| 6236 | return FALSE; |
| 6237 | } |
| 6238 | |
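| | /* |
| | * vm_page_validate_cs_mapped_slow(): |
| | * Fully validate the code signature of a page that is already mapped |
| | * at "kaddr", by checking it against the backing vnode's signature |
| | * via cs_validate_range(), and record the result in the page's |
| | * cs_validated / cs_tainted / cs_nx bits. |
| | */ |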
| 6239 | void |
| 6240 | vm_page_validate_cs_mapped_slow( |
| 6241 | vm_page_t page, |
| 6242 | const void *kaddr) |
| 6243 | { |
| 6244 | vm_object_t object; |
| 6245 | memory_object_offset_t mo_offset; |
| 6246 | memory_object_t pager; |
| 6247 | struct vnode *vnode; |
| 6248 | boolean_t validated; |
| 6249 | unsigned tainted; |
| 6250 | |
| 6251 | assert(page->vmp_busy); |
| 6252 | object = VM_PAGE_OBJECT(page); |
| 6253 | vm_object_lock_assert_exclusive(object); |
| 6254 | |
| 6255 | vm_cs_validates++; |
| 6256 | |
| 6257 | /* |
| 6258 | * Since we get here to validate a page that was brought in by |
| 6259 | * the pager, we know that this pager is all set up and ready |
| 6260 | * by now. |
| 6261 | */ |
| 6262 | assert(object->code_signed); |
| 6263 | assert(!object->internal); |
| 6264 | assert(object->pager != NULL); |
| 6265 | assert(object->pager_ready); |
| 6266 | |
| 6267 | pager = object->pager; |
| 6268 | assert(object->paging_in_progress); |
| 6269 | vnode = vnode_pager_lookup_vnode(pager); |
| 6270 | mo_offset = page->vmp_offset + object->paging_offset; |
| 6271 | |
| 6272 | /* verify the SHA1 hash for this page */ |
| 6273 | tainted = 0; |
| 6274 | validated = cs_validate_range(vnode, |
| 6275 | pager, |
| 6276 | mo_offset, |
| 6277 | (const void *)((const char *)kaddr), |
| 6278 | PAGE_SIZE_64, |
| 6279 | &tainted); |
| 6280 | |
| 6281 | if (tainted & CS_VALIDATE_TAINTED) { |
| 6282 | page->vmp_cs_tainted = TRUE; |
| 6283 | } |
| 6284 | if (tainted & CS_VALIDATE_NX) { |
| 6285 | page->vmp_cs_nx = TRUE; |
| 6286 | } |
| 6287 | if (validated) { |
| 6288 | page->vmp_cs_validated = TRUE; |
| 6289 | } |
| 6290 | |
| 6291 | #if CHECK_CS_VALIDATION_BITMAP |
| 6292 | if (page->vmp_cs_validated && !page->vmp_cs_tainted) { |
| 6293 | vnode_pager_cs_check_validation_bitmap(object->pager, |
| 6294 | mo_offset, |
| 6295 | CS_BITMAP_SET); |
| 6296 | } |
| 6297 | #endif /* CHECK_CS_VALIDATION_BITMAP */ |
| 6298 | } |
| 6299 | |
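| | /* |
| | * vm_page_validate_cs_mapped(): |
| | * Validate a page that is already mapped in the kernel: skip the work |
| | * if the quick checks say it is unnecessary, otherwise do the full |
| | * validation. |
| | */ |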
| 6300 | void |
| 6301 | vm_page_validate_cs_mapped( |
| 6302 | vm_page_t page, |
| 6303 | const void *kaddr) |
| 6304 | { |
| 6305 | if (!vm_page_validate_cs_fast(page)) { |
| 6306 | vm_page_validate_cs_mapped_slow(page, kaddr); |
| 6307 | } |
| 6308 | } |
| 6309 | |
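| | /* |
| | * vm_page_validate_cs(): |
| | * Fully validate a page's code signature: temporarily map the page |
| | * into the kernel address space (keeping it busy and holding a paging |
| | * reference on its object) and run the slow validation on that |
| | * mapping. |
| | */ |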
| 6310 | void |
| 6311 | vm_page_validate_cs( |
| 6312 | vm_page_t page) |
| 6313 | { |
| 6314 | vm_object_t object; |
| 6315 | vm_object_offset_t offset; |
| 6316 | vm_map_offset_t koffset; |
| 6317 | vm_map_size_t ksize; |
| 6318 | vm_offset_t kaddr; |
| 6319 | kern_return_t kr; |
| 6320 | boolean_t busy_page; |
| 6321 | boolean_t need_unmap; |
| 6322 | |
| 6323 | object = VM_PAGE_OBJECT(page); |
| 6324 | vm_object_lock_assert_held(object); |
| 6325 | |
| 6326 | if (vm_page_validate_cs_fast(page)) { |
| 6327 | return; |
| 6328 | } |
| 6329 | vm_object_lock_assert_exclusive(object); |
| 6330 | |
| 6331 | assert(object->code_signed); |
| 6332 | offset = page->vmp_offset; |
| 6333 | |
| 6334 | busy_page = page->vmp_busy; |
| 6335 | if (!busy_page) { |
| 6336 | /* keep page busy while we map (and unlock) the VM object */ |
| 6337 | page->vmp_busy = TRUE; |
| 6338 | } |
| 6339 | |
| 6340 | /* |
| 6341 | * Take a paging reference on the VM object |
| 6342 | * to protect it from collapse or bypass, |
| 6343 | * and keep it from disappearing too. |
| 6344 | */ |
| 6345 | vm_object_paging_begin(object); |
| 6346 | |
| 6347 | /* map the page in the kernel address space */ |
| 6348 | ksize = PAGE_SIZE_64; |
| 6349 | koffset = 0; |
| 6350 | need_unmap = FALSE; |
| 6351 | kr = vm_paging_map_object(page, |
| 6352 | object, |
| 6353 | offset, |
| 6354 | VM_PROT_READ, |
| 6355 | FALSE, /* can't unlock object ! */ |
| 6356 | &ksize, |
| 6357 | &koffset, |
| 6358 | &need_unmap); |
| 6359 | if (kr != KERN_SUCCESS) { |
| 6360 | panic("%s: could not map page: 0x%x\n" , __FUNCTION__, kr); |
| 6361 | } |
| 6362 | kaddr = CAST_DOWN(vm_offset_t, koffset); |
| 6363 | |
| 6364 | /* validate the mapped page */ |
| 6365 | vm_page_validate_cs_mapped_slow(page, (const void *) kaddr); |
| 6366 | |
| 6367 | assert(page->vmp_busy); |
| 6368 | assert(object == VM_PAGE_OBJECT(page)); |
| 6369 | vm_object_lock_assert_exclusive(object); |
| 6370 | |
| 6371 | if (!busy_page) { |
| 6372 | PAGE_WAKEUP_DONE(page); |
| 6373 | } |
| 6374 | if (need_unmap) { |
| 6375 | /* unmap the map from the kernel address space */ |
| 6376 | vm_paging_unmap_object(object, koffset, koffset + ksize); |
| 6377 | koffset = 0; |
| 6378 | ksize = 0; |
| 6379 | kaddr = 0; |
| 6380 | } |
| 6381 | vm_object_paging_end(object); |
| 6382 | } |
| 6383 | |
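| | /* |
| | * vm_page_validate_cs_mapped_chunk(): |
| | * Validate only a sub-range of an already-mapped page, starting at |
| | * "chunk_offset" for "chunk_size" bytes, reporting the validation and |
| | * taint results through "validated_p" and "tainted_p". |
| | */ |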
| 6384 | void |
| 6385 | vm_page_validate_cs_mapped_chunk( |
| 6386 | vm_page_t page, |
| 6387 | const void *kaddr, |
| 6388 | vm_offset_t chunk_offset, |
| 6389 | vm_size_t chunk_size, |
| 6390 | boolean_t *validated_p, |
| 6391 | unsigned *tainted_p) |
| 6392 | { |
| 6393 | vm_object_t object; |
| 6394 | vm_object_offset_t offset, offset_in_page; |
| 6395 | memory_object_t pager; |
| 6396 | struct vnode *vnode; |
| 6397 | boolean_t validated; |
| 6398 | unsigned tainted; |
| 6399 | |
| 6400 | *validated_p = FALSE; |
| 6401 | *tainted_p = 0; |
| 6402 | |
| 6403 | assert(page->vmp_busy); |
| 6404 | object = VM_PAGE_OBJECT(page); |
| 6405 | vm_object_lock_assert_exclusive(object); |
| 6406 | |
| 6407 | assert(object->code_signed); |
| 6408 | offset = page->vmp_offset; |
| 6409 | |
| 6410 | if (!object->alive || object->terminating || object->pager == NULL) { |
| 6411 | /* |
| 6412 | * The object is terminating and we don't have its pager |
| 6413 | * so we can't validate the data... |
| 6414 | */ |
| 6415 | return; |
| 6416 | } |
| 6417 | /* |
| 6418 | * Since we get here to validate a page that was brought in by |
| 6419 | * the pager, we know that this pager is all set up and ready |
| 6420 | * by now. |
| 6421 | */ |
| 6422 | assert(!object->internal); |
| 6423 | assert(object->pager != NULL); |
| 6424 | assert(object->pager_ready); |
| 6425 | |
| 6426 | pager = object->pager; |
| 6427 | assert(object->paging_in_progress); |
| 6428 | vnode = vnode_pager_lookup_vnode(pager); |
| 6429 | |
| 6430 | /* verify the signature for this chunk */ |
| 6431 | offset_in_page = chunk_offset; |
| 6432 | assert(offset_in_page < PAGE_SIZE); |
| 6433 | |
| 6434 | tainted = 0; |
| 6435 | validated = cs_validate_range(vnode, |
| 6436 | pager, |
| 6437 | (object->paging_offset + |
| 6438 | offset + |
| 6439 | offset_in_page), |
| 6440 | (const void *)((const char *)kaddr |
| 6441 | + offset_in_page), |
| 6442 | chunk_size, |
| 6443 | &tainted); |
| 6444 | if (validated) { |
| 6445 | *validated_p = TRUE; |
| 6446 | } |
| 6447 | if (tainted) { |
| 6448 | *tainted_p = tainted; |
| 6449 | } |
| 6450 | } |
| 6451 | |
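|      | /*
|      |  * Helpers serializing access to the real-time fault record buffer
|      |  * (vmrtfrs) with the vm_rtfr_slock spin lock.
|      |  */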
| 6452 | static void vm_rtfrecord_lock(void) { |
| 6453 | lck_spin_lock(&vm_rtfr_slock); |
| 6454 | } |
| 6455 | |
| 6456 | static void vm_rtfrecord_unlock(void) { |
| 6457 | lck_spin_unlock(&vm_rtfr_slock); |
| 6458 | } |
| 6459 | |
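|      | /* Size, in bytes, of a buffer large enough to hold every fault record. */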
| 6460 | unsigned int vmrtfaultinfo_bufsz(void) { |
| 6461 | return (vmrtf_num_records * sizeof(vm_rtfault_record_t)); |
| 6462 | } |
| 6463 | |
| 6464 | #include <kern/backtrace.h> |
| 6465 | |
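|      | /*
|      |  * Record one fault in the circular buffer of real-time fault records:
|      |  * timestamps, faulting address, fault type, the faulting thread and
|      |  * process, and (when it can be captured without triggering further
|      |  * faults) the user program counter.
|      |  */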
| 6466 | static void vm_record_rtfault(thread_t cthread, uint64_t fstart, vm_map_offset_t fault_vaddr, int type_of_fault) { |
| 6467 | uint64_t fend = mach_continuous_time(); |
| 6468 | |
| 6469 | uint64_t cfpc = 0; |
| 6470 | uint64_t ctid = cthread->thread_id; |
| 6471 | uint64_t cupid = get_current_unique_pid(); |
| 6472 | |
| 6473 | uintptr_t bpc = 0; |
| 6474 | uint32_t bfrs = 0; |
| 6475 | bool u64 = false; |
| 6476 | |
| 6477 | /* Capture a single-frame backtrace; this extracts just the program |
| 6478 | * counter at the point of the fault into "bpc", and should perform no |
| 6479 | * further user stack traversals, thus avoiding copyin()s and further |
| 6480 | * faults. |
| 6481 | */ |
| 6482 | int btr = backtrace_thread_user(cthread, &bpc, 1U, &bfrs, &u64); |
| 6483 | |
| 6484 | if ((btr == 0) && (bfrs > 0)) { |
| 6485 | cfpc = bpc; |
| 6486 | } |
| 6487 | |
| 6488 | assert((fstart != 0) && (fend >= fstart));
| 6489 | vm_rtfrecord_lock(); |
| 6490 | assert(vmrtfrs.vmrtfr_curi <= vmrtfrs.vmrtfr_maxi); |
| 6491 | |
| 6492 | vmrtfrs.vmrtf_total++; |
| 6493 | vm_rtfault_record_t *cvmr = &vmrtfrs.vm_rtf_records[vmrtfrs.vmrtfr_curi++]; |
| 6494 | |
| 6495 | cvmr->rtfabstime = fstart; |
| 6496 | cvmr->rtfduration = fend - fstart; |
| 6497 | cvmr->rtfaddr = fault_vaddr; |
| 6498 | cvmr->rtfpc = cfpc; |
| 6499 | cvmr->rtftype = type_of_fault; |
| 6500 | cvmr->rtfupid = cupid; |
| 6501 | cvmr->rtftid = ctid; |
| 6502 | |
| 6503 | if (vmrtfrs.vmrtfr_curi > vmrtfrs.vmrtfr_maxi) { |
| 6504 | vmrtfrs.vmrtfr_curi = 0; |
| 6505 | } |
| 6506 | |
| 6507 | vm_rtfrecord_unlock(); |
| 6508 | } |
| 6509 | |
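|      | /*
|      |  * Copy the recorded fault records for process "cupid" (or, on
|      |  * DEVELOPMENT/DEBUG kernels, all records when the caller is root) into
|      |  * the caller-supplied buffer.  Returns TRUE if the buffer filled up
|      |  * before the whole record array was scanned; *vmrtfrv is set to the
|      |  * number of records copied out.
|      |  */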
| 6510 | int vmrtf_extract(uint64_t cupid, __unused boolean_t isroot, int vrecordsz, void *vrecords, int *vmrtfrv) {
| 6511 | vm_rtfault_record_t *cvmrd = vrecords; |
| 6512 | size_t residue = vrecordsz; |
| 6513 | int numextracted = 0;
| 6514 | boolean_t early_exit = FALSE; |
| 6515 | |
| 6516 | vm_rtfrecord_lock(); |
| 6517 | |
| 6518 | for (int vmfi = 0; vmfi <= vmrtfrs.vmrtfr_maxi; vmfi++) { |
| 6519 | |
| 6520 | if (residue < sizeof(vm_rtfault_record_t)) { |
| 6521 | early_exit = TRUE; |
| 6522 | break; |
| 6523 | } |
| 6524 | |
| 6525 | if (vmrtfrs.vm_rtf_records[vmfi].rtfupid != cupid) { |
| 6526 | #if DEVELOPMENT || DEBUG |
| 6527 | if (isroot == FALSE) { |
| 6528 | continue; |
| 6529 | } |
| 6530 | #else |
| 6531 | continue; |
| 6532 | #endif /* DEVELOPMENT || DEBUG */
| 6533 | } |
| 6534 | |
| 6535 | *cvmrd = vmrtfrs.vm_rtf_records[vmfi]; |
| 6536 | cvmrd++; |
| 6537 | residue -= sizeof(vm_rtfault_record_t); |
| 6538 | numextracted++; |
| 6539 | } |
| 6540 | |
| 6541 | vm_rtfrecord_unlock(); |
| 6542 | |
| 6543 | *vmrtfrv = numextracted; |
| 6544 | return (early_exit); |
| 6545 | } |
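|      | 
|      | /*
|      |  * Illustrative sketch only (not part of this file): a consumer would
|      |  * typically size a buffer with vmrtfaultinfo_bufsz() and then pull the
|      |  * per-process records with vmrtf_extract(), roughly:
|      |  *
|      |  *	int nextracted = 0;
|      |  *	unsigned int bufsz = vmrtfaultinfo_bufsz();
|      |  *	void *buf = kalloc(bufsz);
|      |  *	boolean_t truncated = vmrtf_extract(cupid, isroot, bufsz, buf, &nextracted);
|      |  *	... consume the first "nextracted" vm_rtfault_record_t entries ...
|      |  *	kfree(buf, bufsz);
|      |  */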
| 6546 | |