1 | /* |
2 | * Copyright (c) 2000-2012 Apple Inc. All rights reserved. |
3 | * |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
5 | * |
6 | * This file contains Original Code and/or Modifications of Original Code |
7 | * as defined in and that are subject to the Apple Public Source License |
8 | * Version 2.0 (the 'License'). You may not use this file except in |
9 | * compliance with the License. The rights granted to you under the License |
10 | * may not be used to create, or enable the creation or redistribution of, |
11 | * unlawful or unlicensed copies of an Apple operating system, or to |
12 | * circumvent, violate, or enable the circumvention or violation of, any |
13 | * terms of an Apple operating system software license agreement. |
14 | * |
15 | * Please obtain a copy of the License at |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. |
17 | * |
18 | * The Original Code and all software distributed under the License are |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
23 | * Please see the License for the specific language governing rights and |
24 | * limitations under the License. |
25 | * |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
27 | */ |
28 | |
29 | |
30 | #ifndef _I386_PMAP_INTERNAL_ |
31 | #define _I386_PMAP_INTERNAL_ |
32 | #ifdef MACH_KERNEL_PRIVATE |
33 | |
34 | #include <vm/pmap.h> |
35 | #include <sys/kdebug.h> |
36 | #include <kern/ledger.h> |
37 | #include <kern/simple_lock.h> |
38 | #include <i386/bit_routines.h> |
39 | |
40 | /* |
41 | * pmap locking |
42 | */ |
43 | |
44 | #define PMAP_LOCK(pmap) { \ |
45 | simple_lock(&(pmap)->lock); \ |
46 | } |
47 | |
48 | #define PMAP_UNLOCK(pmap) { \ |
49 | simple_unlock(&(pmap)->lock); \ |
50 | } |
51 | |
52 | #define PMAP_UPDATE_TLBS(pmap, s, e) \ |
53 | pmap_flush_tlbs(pmap, s, e, 0, NULL) |
54 | |
55 | |
56 | #define PMAP_DELAY_TLB_FLUSH 0x01 |
57 | |
58 | #define PMAP_UPDATE_TLBS_DELAYED(pmap, s, e, c) \ |
59 | pmap_flush_tlbs(pmap, s, e, PMAP_DELAY_TLB_FLUSH, c) |
60 | |
61 | |
62 | #define iswired(pte) ((pte) & INTEL_PTE_WIRED) |
63 | |
64 | #ifdef PMAP_TRACES |
65 | extern boolean_t pmap_trace; |
#define PMAP_TRACE(...) \
	do { \
		if (pmap_trace) { \
			KDBG_RELEASE(__VA_ARGS__); \
		} \
	} while (0)
70 | #else |
71 | #define PMAP_TRACE(...) KDBG_DEBUG(__VA_ARGS__) |
72 | #endif /* PMAP_TRACES */ |
73 | |
74 | #define PMAP_TRACE_CONSTANT(...) KDBG_RELEASE(__VA_ARGS__) |
75 | |
76 | kern_return_t pmap_expand_pml4( |
77 | pmap_t map, |
78 | vm_map_offset_t v, |
79 | unsigned int options); |
80 | |
81 | kern_return_t pmap_expand_pdpt( |
82 | pmap_t map, |
83 | vm_map_offset_t v, |
84 | unsigned int options); |
85 | |
86 | void phys_attribute_set( |
87 | ppnum_t phys, |
88 | int bits); |
89 | |
90 | void pmap_set_reference( |
91 | ppnum_t pn); |
92 | |
93 | boolean_t phys_page_exists( |
94 | ppnum_t pn); |
95 | |
96 | void |
97 | pmap_flush_tlbs(pmap_t, vm_map_offset_t, vm_map_offset_t, int, pmap_flush_context *); |
98 | |
99 | void |
100 | pmap_update_cache_attributes_locked(ppnum_t, unsigned); |
101 | |
102 | extern const boolean_t cpu_64bit; |
103 | |
104 | /* |
105 | * Private data structures. |
106 | */ |
107 | |
/*
 * For each vm_page_t, there is a list of all currently
 * valid virtual mappings of that page.  An entry is
 * a pv_rooted_entry_t; the list is the pv_table.
 *
 * N.B. with the new combo rooted/hashed scheme it is
 * only possible to remove individual non-rooted entries
 * if they are found via the hashed chains, as there is no
 * way to unlink the singly linked hashed entries if navigated to
 * via the queue list off the rooted entries.  Think of it as
 * hash/walk/pull, keeping track of the prev pointer while walking
 * the singly linked hash list.  All of this is to save memory and
 * keep both types of pv_entries as small as possible.
 */
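
/*
 * Illustrative sketch (not a routine of its own; the real logic is
 * pmap_pvh_unlink() further down in this file) of the hash/walk/pull
 * removal described above: walk the singly linked hash chain while keeping
 * a pointer to the previous link so the entry can be unspliced.
 *
 *	pv_hashed_entry_t *pprevh = pvhash(pvhashidx(pvh->pmap, PVE_VA(pvh)));
 *	pv_hashed_entry_t curh = *pprevh;
 *	while (curh != PV_HASHED_ENTRY_NULL && curh != pvh) {
 *		pprevh = &curh->nexth;
 *		curh = curh->nexth;
 *	}
 *	if (curh != PV_HASHED_ENTRY_NULL)
 *		*pprevh = pvh->nexth;		-- unsplice pvh from the chain
 */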
122 | |
123 | /* |
124 | |
125 | PV HASHING Changes - JK 1/2007 |
126 | |
127 | Pve's establish physical to virtual mappings. These are used for aliasing of a |
128 | physical page to (potentially many) virtual addresses within pmaps. In the |
129 | previous implementation the structure of the pv_entries (each 16 bytes in size) was |
130 | |
typedef struct pv_entry {
    struct pv_entry	*next;
    pmap_t		pmap;
    vm_map_offset_t	va;
} *pv_entry_t;
136 | |
137 | An initial array of these is created at boot time, one per physical page of |
138 | memory, indexed by the physical page number. Additionally, a pool of entries |
139 | is created from a pv_zone to be used as needed by pmap_enter() when it is |
140 | creating new mappings. Originally, we kept this pool around because the code |
141 | in pmap_enter() was unable to block if it needed an entry and none were |
142 | available - we'd panic. Some time ago I restructured the pmap_enter() code |
143 | so that for user pmaps it can block while zalloc'ing a pv structure and restart, |
removing a panic from the code (in the case of the kernel pmap we cannot block,
and we still panic if no entry is available, so we keep a separate hot pool for
use only on kernel pmaps).
146 | The pool has not been removed since there is a large performance gain keeping |
147 | freed pv's around for reuse and not suffering the overhead of zalloc for every |
148 | new pv we need. |
149 | |
150 | As pmap_enter() created new mappings it linked the new pve's for them off the |
151 | fixed pv array for that ppn (off the next pointer). These pve's are accessed |
152 | for several operations, one of them being address space teardown. In that case, |
153 | we basically do this |
154 | |
155 | for (every page/pte in the space) { |
156 | calc pve_ptr from the ppn in the pte |
157 | for (every pv in the list for the ppn) { |
158 | if (this pv is for this pmap/vaddr) { |
159 | do housekeeping |
160 | unlink/free the pv |
161 | } |
162 | } |
163 | } |
164 | |
The problem arose when we were running, say, 8000 (or even 2000) apache or
other processes and one or all terminated.  The list hanging off each pv array
entry could have thousands of entries.  We were continuously and linearly
searching each of these lists as we stepped through the address space we were
tearing down.  Because of the locks we held, the likely cache miss on each
node, and the interrupt disabling needed for MP correctness, the system became
completely unresponsive for many seconds while we did this.
172 | |
Realizing that pve's are accessed in two distinct ways (linearly running the
list by ppn for operations like pmap_page_protect, and finding and
modifying/removing a single pve as part of pmap_enter processing) has led to
modifying the pve structures and databases.
177 | |
There are now two types of pve structures.  A "rooted" structure which is
basically the original structure accessed in an array by ppn, and a "hashed"
structure accessed on a hash list via a hash of [pmap, vaddr].  These have been
designed with the two goals of minimizing wired memory and making the lookup of
a ppn faster.  Since the vast majority of pages in the system are not aliased,
and hence are represented by a single pv entry, I've kept the rooted entry size
as small as possible because there is one of these dedicated for every physical
185 | page of memory. The hashed pve's are larger due to the addition of the hash |
186 | link and the ppn entry needed for matching while running the hash list to find |
187 | the entry we are looking for. This way, only systems that have lots of |
188 | aliasing (like 2000+ httpd procs) will pay the extra memory price. Both |
189 | structures have the same first three fields allowing some simplification in |
190 | the code. |
191 | |
192 | They have these shapes |
193 | |
194 | typedef struct pv_rooted_entry { |
195 | queue_head_t qlink; |
196 | vm_map_offset_t va; |
197 | pmap_t pmap; |
198 | } *pv_rooted_entry_t; |
199 | |
200 | |
201 | typedef struct pv_hashed_entry { |
202 | queue_head_t qlink; |
203 | vm_map_offset_t va; |
204 | pmap_t pmap; |
205 | ppnum_t ppn; |
206 | struct pv_hashed_entry *nexth; |
207 | } *pv_hashed_entry_t; |
208 | |
209 | The main flow difference is that the code is now aware of the rooted entry and |
210 | the hashed entries. Code that runs the pv list still starts with the rooted |
211 | entry and then continues down the qlink onto the hashed entries. Code that is |
212 | looking up a specific pv entry first checks the rooted entry and then hashes |
213 | and runs the hash list for the match. The hash list lengths are much smaller |
214 | than the original pv lists that contained all aliases for the specific ppn. |
215 | |
216 | */ |
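
/*
 * Illustrative lookup sketch (hedged; the real flow lives in routines such as
 * pmap_pv_remove() below): find the pv entry for (pmap, vaddr) on physical
 * page ppn by checking the rooted entry first, then falling back to the much
 * shorter hash chain.
 *
 *	pv_rooted_entry_t pv_h = pai_to_pvh(ppn_to_pai(ppn));
 *	if (PVE_VA(pv_h) == vaddr && pv_h->pmap == pmap)
 *		... the rooted entry is the match ...
 *	else {
 *		int idx = pvhashidx(pmap, vaddr);
 *		LOCK_PV_HASH(idx);
 *		pv_hashed_entry_t e = *pvhash(idx);
 *		while (e != PV_HASHED_ENTRY_NULL &&
 *		    !(e->pmap == pmap && PVE_VA(e) == vaddr && e->ppn == ppn))
 *			e = e->nexth;
 *		... e is the match, or PV_HASHED_ENTRY_NULL ...
 *		UNLOCK_PV_HASH(idx);
 *	}
 */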
217 | |
218 | typedef struct pv_rooted_entry { |
219 | /* first three entries must match pv_hashed_entry_t */ |
220 | queue_head_t qlink; |
	vm_map_offset_t		va_and_flags;	/* VA of mapping; flags in the low (page offset) bits */
222 | pmap_t pmap; /* pmap where mapping lies */ |
223 | } *pv_rooted_entry_t; |
224 | |
225 | #define PV_ROOTED_ENTRY_NULL ((pv_rooted_entry_t) 0) |
226 | |
227 | typedef struct pv_hashed_entry { |
228 | /* first three entries must match pv_rooted_entry_t */ |
229 | queue_head_t qlink; |
230 | vm_map_offset_t va_and_flags; |
231 | pmap_t pmap; |
232 | ppnum_t ppn; |
233 | struct pv_hashed_entry *nexth; |
234 | } *pv_hashed_entry_t; |
235 | |
236 | #define PV_HASHED_ENTRY_NULL ((pv_hashed_entry_t)0) |
237 | |
238 | #define PVE_VA(pve) ((pve)->va_and_flags & ~PAGE_MASK) |
239 | #define PVE_FLAGS(pve) ((pve)->va_and_flags & PAGE_MASK) |
240 | #define PVE_IS_ALTACCT 0x001 |
241 | #define PVE_IS_ALTACCT_PAGE(pve) \ |
242 | (((pve)->va_and_flags & PVE_IS_ALTACCT) ? TRUE : FALSE) |
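
/*
 * Example of the encoding above (illustrative values): a 4K mapping at
 * VA 0x00007f0000400000 tracked with alternate accounting is stored as
 * va_and_flags = 0x00007f0000400000 | PVE_IS_ALTACCT.  PVE_VA() masks off
 * the low (page offset) bits to recover the page-aligned VA, and
 * PVE_IS_ALTACCT_PAGE() tests the flag.
 */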
243 | |
244 | //#define PV_DEBUG 1 /* uncomment to enable some PV debugging code */ |
245 | #ifdef PV_DEBUG |
#define CHK_NPVHASH()	if (0 == npvhashmask) panic("npvhash uninitialized");
#else
#define CHK_NPVHASH()
249 | #endif |
250 | |
251 | #define NPVHASHBUCKETS (4096) |
252 | #define NPVHASHMASK ((NPVHASHBUCKETS) - 1) /* MUST BE 2^N - 1 */ |
253 | #define PV_HASHED_LOW_WATER_MARK_DEFAULT 5000 |
254 | #define PV_HASHED_KERN_LOW_WATER_MARK_DEFAULT 2000 |
255 | #define PV_HASHED_ALLOC_CHUNK_INITIAL 2000 |
256 | #define PV_HASHED_KERN_ALLOC_CHUNK_INITIAL 200 |
257 | |
258 | extern volatile uint32_t mappingrecurse; |
259 | extern uint32_t pv_hashed_low_water_mark, pv_hashed_kern_low_water_mark; |
260 | |
261 | /* |
262 | * PV hash locking |
263 | */ |
264 | |
265 | #define LOCK_PV_HASH(hash) lock_hash_hash(hash) |
266 | #define UNLOCK_PV_HASH(hash) unlock_hash_hash(hash) |
267 | extern uint32_t npvhashmask; |
268 | extern pv_hashed_entry_t *pv_hash_table; /* hash lists */ |
269 | extern pv_hashed_entry_t pv_hashed_free_list; |
270 | extern pv_hashed_entry_t pv_hashed_kern_free_list; |
271 | decl_simple_lock_data(extern, pv_hashed_free_list_lock) |
272 | decl_simple_lock_data(extern, pv_hashed_kern_free_list_lock) |
273 | decl_simple_lock_data(extern, pv_hash_table_lock) |
274 | decl_simple_lock_data(extern, phys_backup_lock) |
275 | |
276 | extern zone_t pv_hashed_list_zone; /* zone of pv_hashed_entry |
277 | * structures */ |
278 | |
279 | extern uint32_t pv_hashed_free_count; |
280 | extern uint32_t pv_hashed_kern_free_count; |
281 | /* |
282 | * Each entry in the pv_head_table is locked by a bit in the |
 * pv_lock_table.  The lock bits are indexed by the physical page
 * index (pai) of the frame they lock.
285 | */ |
286 | #define pv_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE) |
287 | #define pv_hash_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE) |
288 | extern char *pv_lock_table; /* pointer to array of bits */ |
289 | extern char *pv_hash_lock_table; |
290 | extern pv_rooted_entry_t pv_head_table; /* array of entries, one per page */ |
291 | |
292 | extern event_t mapping_replenish_event; |
293 | |
294 | static inline void PV_HASHED_ALLOC(pv_hashed_entry_t *pvh_ep) { |
295 | pmap_assert(*pvh_ep == PV_HASHED_ENTRY_NULL); |
296 | simple_lock(&pv_hashed_free_list_lock); |
	/* Allocate from the general free list only while the kernel reserved
	 * pool is above its low-water mark; otherwise leave *pvh_ep NULL so
	 * non-kernel mappings allocate synchronously, possibly subject to a
	 * throttle.
	 */
300 | if ((pv_hashed_kern_free_count > pv_hashed_kern_low_water_mark) && ((*pvh_ep = pv_hashed_free_list) != 0)) { |
301 | pv_hashed_free_list = (pv_hashed_entry_t)(*pvh_ep)->qlink.next; |
302 | pv_hashed_free_count--; |
303 | } |
304 | |
305 | simple_unlock(&pv_hashed_free_list_lock); |
306 | |
307 | if (pv_hashed_free_count <= pv_hashed_low_water_mark) { |
308 | if (!mappingrecurse && hw_compare_and_store(0,1, &mappingrecurse)) |
309 | thread_wakeup(&mapping_replenish_event); |
310 | } |
311 | } |
312 | |
313 | static inline void PV_HASHED_FREE_LIST(pv_hashed_entry_t pvh_eh, pv_hashed_entry_t pvh_et, int pv_cnt) { |
314 | simple_lock(&pv_hashed_free_list_lock); |
315 | pvh_et->qlink.next = (queue_entry_t)pv_hashed_free_list; |
316 | pv_hashed_free_list = pvh_eh; |
317 | pv_hashed_free_count += pv_cnt; |
318 | simple_unlock(&pv_hashed_free_list_lock); |
319 | } |
320 | |
321 | extern unsigned pmap_kern_reserve_alloc_stat; |
322 | |
323 | static inline void PV_HASHED_KERN_ALLOC(pv_hashed_entry_t *pvh_e) { |
324 | pmap_assert(*pvh_e == PV_HASHED_ENTRY_NULL); |
325 | simple_lock(&pv_hashed_kern_free_list_lock); |
326 | |
327 | if ((*pvh_e = pv_hashed_kern_free_list) != 0) { |
328 | pv_hashed_kern_free_list = (pv_hashed_entry_t)(*pvh_e)->qlink.next; |
329 | pv_hashed_kern_free_count--; |
330 | pmap_kern_reserve_alloc_stat++; |
331 | } |
332 | |
333 | simple_unlock(&pv_hashed_kern_free_list_lock); |
334 | |
335 | if (pv_hashed_kern_free_count < pv_hashed_kern_low_water_mark) { |
336 | if (!mappingrecurse && hw_compare_and_store(0,1, &mappingrecurse)) |
337 | thread_wakeup(&mapping_replenish_event); |
338 | } |
339 | } |
340 | |
341 | static inline void PV_HASHED_KERN_FREE_LIST(pv_hashed_entry_t pvh_eh, pv_hashed_entry_t pvh_et, int pv_cnt) { |
342 | simple_lock(&pv_hashed_kern_free_list_lock); |
343 | pvh_et->qlink.next = (queue_entry_t)pv_hashed_kern_free_list; |
344 | pv_hashed_kern_free_list = pvh_eh; |
345 | pv_hashed_kern_free_count += pv_cnt; |
346 | simple_unlock(&pv_hashed_kern_free_list_lock); |
347 | } |
348 | |
349 | extern uint64_t pmap_pv_throttle_stat, pmap_pv_throttled_waiters; |
350 | extern event_t pmap_user_pv_throttle_event; |
351 | |
352 | static inline void pmap_pv_throttle(__unused pmap_t p) { |
353 | pmap_assert(p != kernel_pmap); |
354 | /* Apply throttle on non-kernel mappings */ |
355 | if (pv_hashed_kern_free_count < (pv_hashed_kern_low_water_mark / 2)) { |
356 | pmap_pv_throttle_stat++; |
357 | /* This doesn't need to be strictly accurate, merely a hint |
358 | * to eliminate the timeout when the reserve is replenished. |
359 | */ |
360 | pmap_pv_throttled_waiters++; |
361 | assert_wait_timeout(&pmap_user_pv_throttle_event, THREAD_UNINT, 1, 1000 * NSEC_PER_USEC); |
362 | thread_block(THREAD_CONTINUE_NULL); |
363 | } |
364 | } |
365 | |
366 | /* |
367 | * Index into pv_head table, its lock bits, and the modify/reference and managed bits |
368 | */ |
369 | |
370 | #define pa_index(pa) (i386_btop(pa)) |
371 | #define ppn_to_pai(ppn) ((int)ppn) |
372 | |
373 | #define pai_to_pvh(pai) (&pv_head_table[pai]) |
374 | #define lock_pvh_pai(pai) bit_lock(pai, (void *)pv_lock_table) |
375 | #define unlock_pvh_pai(pai) bit_unlock(pai, (void *)pv_lock_table) |
376 | #define pvhash(idx) (&pv_hash_table[idx]) |
377 | #define lock_hash_hash(hash) bit_lock(hash, (void *)pv_hash_lock_table) |
378 | #define unlock_hash_hash(hash) bit_unlock(hash, (void *)pv_hash_lock_table) |
379 | |
380 | #define IS_MANAGED_PAGE(x) \ |
381 | ((unsigned int)(x) <= last_managed_page && \ |
382 | (pmap_phys_attributes[x] & PHYS_MANAGED)) |
383 | #define IS_INTERNAL_PAGE(x) \ |
384 | (IS_MANAGED_PAGE(x) && (pmap_phys_attributes[x] & PHYS_INTERNAL)) |
385 | #define IS_REUSABLE_PAGE(x) \ |
386 | (IS_MANAGED_PAGE(x) && (pmap_phys_attributes[x] & PHYS_REUSABLE)) |
387 | #define IS_ALTACCT_PAGE(x,pve) \ |
388 | (IS_MANAGED_PAGE((x)) && \ |
389 | (PVE_IS_ALTACCT_PAGE((pve)))) |
390 | |
391 | /* |
392 | * Physical page attributes. Copy bits from PTE definition. |
393 | */ |
394 | #define PHYS_MODIFIED INTEL_PTE_MOD /* page modified */ |
395 | #define PHYS_REFERENCED INTEL_PTE_REF /* page referenced */ |
396 | #define PHYS_MANAGED INTEL_PTE_VALID /* page is managed */ |
397 | #define PHYS_NOENCRYPT INTEL_PTE_USER /* no need to encrypt this page in the hibernation image */ |
398 | #define PHYS_NCACHE INTEL_PTE_NCACHE |
399 | #define PHYS_PTA INTEL_PTE_PTA |
400 | #define PHYS_CACHEABILITY_MASK (INTEL_PTE_PTA | INTEL_PTE_NCACHE) |
401 | #define PHYS_INTERNAL INTEL_PTE_WTHRU /* page from internal object */ |
402 | #define PHYS_REUSABLE INTEL_PTE_WRITE /* page is "reusable" */ |
403 | |
404 | extern boolean_t pmap_disable_kheap_nx; |
405 | extern boolean_t pmap_disable_kstack_nx; |
406 | |
407 | #define PMAP_EXPAND_OPTIONS_NONE (0x0) |
408 | #define PMAP_EXPAND_OPTIONS_NOWAIT (PMAP_OPTIONS_NOWAIT) |
409 | #define PMAP_EXPAND_OPTIONS_NOENTER (PMAP_OPTIONS_NOENTER) |
410 | #define PMAP_EXPAND_OPTIONS_ALIASMAP (0x40000000U) |
411 | /* |
412 | * Amount of virtual memory mapped by one |
413 | * page-directory entry. |
414 | */ |
415 | #define PDE_MAPPED_SIZE (pdetova(1)) |
416 | |
417 | |
418 | /* |
419 | * Locking and TLB invalidation |
420 | */ |
421 | |
422 | /* |
423 | * Locking Protocols: (changed 2/2007 JK) |
424 | * |
425 | * There are two structures in the pmap module that need locking: |
426 | * the pmaps themselves, and the per-page pv_lists (which are locked |
427 | * by locking the pv_lock_table entry that corresponds to the pv_head |
428 | * for the list in question.) Most routines want to lock a pmap and |
429 | * then do operations in it that require pv_list locking -- however |
430 | * pmap_remove_all and pmap_copy_on_write operate on a physical page |
431 | * basis and want to do the locking in the reverse order, i.e. lock |
432 | * a pv_list and then go through all the pmaps referenced by that list. |
433 | * |
434 | * The system wide pmap lock has been removed. Now, paths take a lock |
435 | * on the pmap before changing its 'shape' and the reverse order lockers |
436 | * (coming in by phys ppn) take a lock on the corresponding pv and then |
437 | * retest to be sure nothing changed during the window before they locked |
438 | * and can then run up/down the pv lists holding the list lock. This also |
439 | * lets the pmap layer run (nearly completely) interrupt enabled, unlike |
440 | * previously. |
441 | */ |
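
/*
 * A hedged sketch (not a routine in this file) of the reverse-order pattern
 * described above, as used by physical-page-based paths: take the pv lock
 * first, then revalidate that the mapping found via the pv entry is still
 * current before acting on it.
 *
 *	LOCK_PVH(pai);
 *	pte = pmap_pte(pv_e->pmap, PVE_VA(pv_e));
 *	if (pte == NULL || pa_index(pte_to_pa(*pte)) != pai)
 *		... the mapping changed before we took the pv lock;
 *		    drop the lock and retry or skip this entry ...
 *	else
 *		... safe to operate on the pte and walk the pv list ...
 *	UNLOCK_PVH(pai);
 */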
442 | |
443 | /* |
444 | * PV locking |
445 | */ |
446 | |
447 | #define LOCK_PVH(index) { \ |
448 | mp_disable_preemption(); \ |
449 | lock_pvh_pai(index); \ |
450 | } |
451 | |
452 | #define UNLOCK_PVH(index) { \ |
453 | unlock_pvh_pai(index); \ |
454 | mp_enable_preemption(); \ |
455 | } |
456 | |
457 | extern uint64_t pde_mapped_size; |
458 | |
459 | extern char *pmap_phys_attributes; |
460 | extern ppnum_t last_managed_page; |
461 | |
462 | extern ppnum_t lowest_lo; |
463 | extern ppnum_t lowest_hi; |
464 | extern ppnum_t highest_hi; |
465 | |
/*
 * When spinning through pmap_remove, ensure that we don't spend too much
 * time with preemption disabled.
 * I'm setting the current threshold to 20us.
 */
473 | #define MAX_PREEMPTION_LATENCY_NS 20000 |
474 | extern uint64_t max_preemption_latency_tsc; |
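
/*
 * Note (hedged): the nanosecond threshold above is converted once, during pmap
 * initialization outside this header, into the TSC units held in
 * max_preemption_latency_tsc (conceptually along the lines of
 * tmrCvt(MAX_PREEMPTION_LATENCY_NS, tscFCvtn2t)), presumably so the check
 * inside the pmap_remove() loop can be a cheap timestamp-counter comparison.
 */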
475 | |
476 | #if DEBUG |
477 | #define PMAP_INTR_DEBUG (1) |
478 | #endif |
479 | |
480 | #if PMAP_INTR_DEBUG |
481 | #define pmap_intr_assert() { \ |
482 | if (processor_avail_count > 1 && !ml_get_interrupts_enabled()) \ |
483 | panic("pmap interrupt assert %d %s, %d", processor_avail_count, __FILE__, __LINE__); \ |
484 | } |
485 | #else |
486 | #define pmap_intr_assert() |
487 | #endif |
488 | |
489 | extern int nx_enabled; |
490 | extern unsigned int inuse_ptepages_count; |
491 | |
/* Hash a (pmap, virtual page) pair into an index into pv_hash_table. */
static inline uint32_t
pvhashidx(pmap_t pmap, vm_map_offset_t va)
{
	uint32_t hashidx = ((uint32_t)(uintptr_t)pmap ^
	    ((uint32_t)(va >> PAGE_SHIFT) & 0xFFFFFFFF)) &
	    npvhashmask;
	return hashidx;
}
500 | |
/*
 * Unlinks the pv_hashed_entry_t pvh from the singly linked hash chain;
 * properly deals with the anchor.
 * Must be called with the hash locked; does not unlock it.
 */
506 | static inline void |
507 | pmap_pvh_unlink(pv_hashed_entry_t pvh) |
508 | { |
509 | pv_hashed_entry_t curh; |
510 | pv_hashed_entry_t *pprevh; |
511 | int pvhash_idx; |
512 | |
513 | CHK_NPVHASH(); |
514 | pvhash_idx = pvhashidx(pvh->pmap, PVE_VA(pvh)); |
515 | |
516 | pprevh = pvhash(pvhash_idx); |
517 | |
518 | #if PV_DEBUG |
519 | if (NULL == *pprevh) |
520 | panic("pvh_unlink null anchor" ); /* JK DEBUG */ |
521 | #endif |
522 | curh = *pprevh; |
523 | |
524 | while (PV_HASHED_ENTRY_NULL != curh) { |
525 | if (pvh == curh) |
526 | break; |
527 | pprevh = &curh->nexth; |
528 | curh = curh->nexth; |
529 | } |
	if (PV_HASHED_ENTRY_NULL == curh)
		panic("pmap_pvh_unlink no pvh");
531 | *pprevh = pvh->nexth; |
532 | return; |
533 | } |
534 | |
535 | static inline void |
536 | pv_hash_add(pv_hashed_entry_t pvh_e, |
537 | pv_rooted_entry_t pv_h) |
538 | { |
539 | pv_hashed_entry_t *hashp; |
540 | int pvhash_idx; |
541 | |
542 | CHK_NPVHASH(); |
543 | pvhash_idx = pvhashidx(pvh_e->pmap, PVE_VA(pvh_e)); |
544 | LOCK_PV_HASH(pvhash_idx); |
545 | insque(&pvh_e->qlink, &pv_h->qlink); |
546 | hashp = pvhash(pvhash_idx); |
547 | #if PV_DEBUG |
548 | if (NULL==hashp) |
549 | panic("pv_hash_add(%p) null hash bucket" , pvh_e); |
550 | #endif |
551 | pvh_e->nexth = *hashp; |
552 | *hashp = pvh_e; |
553 | UNLOCK_PV_HASH(pvhash_idx); |
554 | } |
555 | |
556 | static inline void |
557 | pv_hash_remove(pv_hashed_entry_t pvh_e) |
558 | { |
559 | int pvhash_idx; |
560 | |
561 | CHK_NPVHASH(); |
562 | pvhash_idx = pvhashidx(pvh_e->pmap,PVE_VA(pvh_e)); |
563 | LOCK_PV_HASH(pvhash_idx); |
564 | remque(&pvh_e->qlink); |
565 | pmap_pvh_unlink(pvh_e); |
566 | UNLOCK_PV_HASH(pvhash_idx); |
567 | } |
568 | |
/* TRUE if the XOR distance has at most one bit set, i.e. it is zero or a
 * power of two; used below to detect single-bit corruption.
 */
static inline boolean_t popcnt1(uint64_t distance) {
	return ((distance & (distance - 1)) == 0);
}
572 | |
/*
 * Routines to handle suppression of/recovery from some forms of pagetable corruption
 * incidents observed in the field.  These can be either software induced (wild
 * stores to the mapwindows where applicable, use-after-free errors
 * (typically of pages addressed physically), mis-directed DMAs, etc.) or due
 * to DRAM/memory hierarchy/interconnect errors.  Given the theoretical rarity of these errors,
 * the recording mechanism is deliberately not MP-safe.  The overarching goal is to
 * still assert on potential software races, but attempt recovery from incidents
 * identifiable as occurring due to issues beyond the control of the pmap module.
 * The latter includes single-bit errors and malformed pagetable entries.
 * We currently limit ourselves to recovery/suppression of one incident per
 * PMAP_PAGETABLE_CORRUPTION_INTERVAL seconds, and details of the incident
 * are logged.
 * Assertions are not suppressed if kernel debugging is enabled. (DRK 09)
 */
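
/*
 * Hedged sketch of how a caller consumes the classification below (the real
 * dispatch appears in pmap_pv_remove() later in this file):
 *
 *	pac = pmap_classify_pagetable_corruption(pmap, vaddr, &ppn, pte, incident);
 *	if (pac == PMAP_ACTION_IGNORE)
 *		... suppress the incident and abandon the operation ...
 *	else if (pac == PMAP_ACTION_RETRY)
 *		... the PV was corrected in place, so retry the lookup ...
 *	else if (pac == PMAP_ACTION_RETRY_RELOCK)
 *		... re-take LOCK_PVH() on the (possibly corrected) ppn, then retry ...
 *	else
 *		... PMAP_ACTION_ASSERT: panic on suspected software corruption ...
 */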
588 | |
589 | typedef enum { |
590 | PTE_VALID = 0x0, |
591 | PTE_INVALID = 0x1, |
592 | PTE_RSVD = 0x2, |
593 | PTE_SUPERVISOR = 0x4, |
594 | PTE_BITFLIP = 0x8, |
595 | PV_BITFLIP = 0x10, |
596 | PTE_INVALID_CACHEABILITY = 0x20 |
597 | } pmap_pagetable_corruption_t; |
598 | |
599 | typedef enum { |
600 | ROOT_PRESENT = 0, |
601 | ROOT_ABSENT = 1 |
602 | } pmap_pv_assertion_t; |
603 | |
604 | typedef enum { |
605 | PMAP_ACTION_IGNORE = 0x0, |
606 | PMAP_ACTION_ASSERT = 0x1, |
607 | PMAP_ACTION_RETRY = 0x2, |
608 | PMAP_ACTION_RETRY_RELOCK = 0x4 |
609 | } pmap_pagetable_corruption_action_t; |
610 | |
611 | #define PMAP_PAGETABLE_CORRUPTION_INTERVAL (6ULL * 3600ULL) |
612 | extern uint64_t pmap_pagetable_corruption_interval_abstime; |
613 | |
614 | extern uint32_t pmap_pagetable_corruption_incidents; |
615 | #define PMAP_PAGETABLE_CORRUPTION_MAX_LOG (8) |
616 | typedef struct { |
617 | pmap_pv_assertion_t incident; |
618 | pmap_pagetable_corruption_t reason; |
619 | pmap_pagetable_corruption_action_t action; |
620 | pmap_t pmap; |
621 | vm_map_offset_t vaddr; |
622 | pt_entry_t pte; |
623 | ppnum_t ppn; |
624 | pmap_t pvpmap; |
625 | vm_map_offset_t pvva; |
626 | uint64_t abstime; |
627 | } pmap_pagetable_corruption_record_t; |
628 | |
629 | extern pmap_pagetable_corruption_record_t pmap_pagetable_corruption_records[]; |
630 | extern uint64_t pmap_pagetable_corruption_last_abstime; |
631 | extern thread_call_t pmap_pagetable_corruption_log_call; |
632 | extern boolean_t pmap_pagetable_corruption_timeout; |
633 | |
634 | static inline void |
635 | pmap_pagetable_corruption_log(pmap_pv_assertion_t incident, pmap_pagetable_corruption_t suppress_reason, pmap_pagetable_corruption_action_t action, pmap_t pmap, vm_map_offset_t vaddr, pt_entry_t *ptep, ppnum_t ppn, pmap_t pvpmap, vm_map_offset_t pvva) { |
636 | uint32_t pmap_pagetable_corruption_log_index; |
637 | pmap_pagetable_corruption_log_index = pmap_pagetable_corruption_incidents++ % PMAP_PAGETABLE_CORRUPTION_MAX_LOG; |
638 | pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].incident = incident; |
639 | pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].reason = suppress_reason; |
640 | pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].action = action; |
641 | pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pmap = pmap; |
642 | pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].vaddr = vaddr; |
643 | pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pte = *ptep; |
644 | pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].ppn = ppn; |
645 | pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvpmap = pvpmap; |
646 | pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvva = pvva; |
647 | pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].abstime = mach_absolute_time(); |
648 | /* Asynchronously log */ |
649 | thread_call_enter(pmap_pagetable_corruption_log_call); |
650 | } |
651 | |
652 | static inline pmap_pagetable_corruption_action_t |
653 | pmap_classify_pagetable_corruption(pmap_t pmap, vm_map_offset_t vaddr, ppnum_t *ppnp, pt_entry_t *ptep, pmap_pv_assertion_t incident) { |
654 | pmap_pagetable_corruption_action_t action = PMAP_ACTION_ASSERT; |
655 | pmap_pagetable_corruption_t suppress_reason = PTE_VALID; |
656 | ppnum_t suppress_ppn = 0; |
657 | pt_entry_t cpte = *ptep; |
658 | ppnum_t cpn = pa_index(pte_to_pa(cpte)); |
659 | ppnum_t ppn = *ppnp; |
660 | pv_rooted_entry_t pv_h = pai_to_pvh(ppn_to_pai(ppn)); |
661 | pv_rooted_entry_t pv_e = pv_h; |
662 | uint32_t bitdex; |
663 | pmap_t pvpmap = pv_h->pmap; |
664 | vm_map_offset_t pvva = PVE_VA(pv_h); |
665 | vm_map_offset_t pve_flags; |
666 | boolean_t ppcd = FALSE; |
667 | boolean_t is_ept; |
668 | |
669 | /* Ideally, we'd consult the Mach VM here to definitively determine |
670 | * the nature of the mapping for this address space and address. |
671 | * As that would be a layering violation in this context, we |
672 | * use various heuristics to recover from single bit errors, |
673 | * malformed pagetable entries etc. These are not intended |
674 | * to be comprehensive. |
675 | */ |
676 | |
677 | /* As a precautionary measure, mark A+D */ |
678 | pmap_phys_attributes[ppn_to_pai(ppn)] |= (PHYS_MODIFIED | PHYS_REFERENCED); |
679 | is_ept = is_ept_pmap(pmap); |
680 | |
681 | /* |
682 | * Correct potential single bit errors in either (but not both) element |
683 | * of the PV |
684 | */ |
685 | do { |
686 | if ((popcnt1((uintptr_t)pv_e->pmap ^ (uintptr_t)pmap) && PVE_VA(pv_e) == vaddr) || |
687 | (pv_e->pmap == pmap && popcnt1(PVE_VA(pv_e) ^ vaddr))) { |
688 | pve_flags = PVE_FLAGS(pv_e); |
689 | pv_e->pmap = pmap; |
690 | pv_h->va_and_flags = vaddr | pve_flags; |
691 | suppress_reason = PV_BITFLIP; |
692 | action = PMAP_ACTION_RETRY; |
693 | goto pmap_cpc_exit; |
694 | } |
695 | } while (((pv_e = (pv_rooted_entry_t) queue_next(&pv_e->qlink))) && (pv_e != pv_h)); |
696 | |
697 | /* Discover root entries with a Hamming |
698 | * distance of 1 from the supplied |
699 | * physical page frame. |
700 | */ |
701 | for (bitdex = 0; bitdex < (sizeof(ppnum_t) << 3); bitdex++) { |
702 | ppnum_t npn = cpn ^ (ppnum_t) (1ULL << bitdex); |
703 | if (IS_MANAGED_PAGE(npn)) { |
704 | pv_rooted_entry_t npv_h = pai_to_pvh(ppn_to_pai(npn)); |
705 | if (PVE_VA(npv_h) == vaddr && npv_h->pmap == pmap) { |
706 | suppress_reason = PTE_BITFLIP; |
707 | suppress_ppn = npn; |
708 | action = PMAP_ACTION_RETRY_RELOCK; |
709 | UNLOCK_PVH(ppn_to_pai(ppn)); |
710 | *ppnp = npn; |
711 | goto pmap_cpc_exit; |
712 | } |
713 | } |
714 | } |
715 | |
716 | if (pmap == kernel_pmap) { |
717 | action = PMAP_ACTION_ASSERT; |
718 | goto pmap_cpc_exit; |
719 | } |
720 | |
721 | /* |
722 | * Check for malformed/inconsistent entries. |
723 | * The first check here isn't useful for EPT PTEs because INTEL_EPT_NCACHE == 0 |
724 | */ |
725 | if (!is_ept && ((cpte & (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU | INTEL_PTE_PTA)) == (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU))) { |
726 | action = PMAP_ACTION_IGNORE; |
727 | suppress_reason = PTE_INVALID_CACHEABILITY; |
728 | } |
729 | else if (cpte & INTEL_PTE_RSVD) { |
730 | action = PMAP_ACTION_IGNORE; |
731 | suppress_reason = PTE_RSVD; |
732 | } |
733 | else if ((pmap != kernel_pmap) && (!is_ept) && ((cpte & INTEL_PTE_USER) == 0)) { |
734 | action = PMAP_ACTION_IGNORE; |
735 | suppress_reason = PTE_SUPERVISOR; |
736 | } |
737 | pmap_cpc_exit: |
738 | PE_parse_boot_argn("-pmap_pagetable_corruption_deassert" , &ppcd, sizeof(ppcd)); |
739 | |
740 | if (debug_boot_arg && !ppcd) { |
741 | action = PMAP_ACTION_ASSERT; |
742 | } |
743 | |
744 | if ((mach_absolute_time() - pmap_pagetable_corruption_last_abstime) < pmap_pagetable_corruption_interval_abstime) { |
745 | action = PMAP_ACTION_ASSERT; |
746 | pmap_pagetable_corruption_timeout = TRUE; |
747 | } |
748 | else |
749 | { |
750 | pmap_pagetable_corruption_last_abstime = mach_absolute_time(); |
751 | } |
752 | pmap_pagetable_corruption_log(incident, suppress_reason, action, pmap, vaddr, &cpte, *ppnp, pvpmap, pvva); |
753 | return action; |
754 | } |
755 | |
756 | /* |
757 | * Remove pv list entry. |
758 | * Called with pv_head_table entry locked. |
759 | * Returns pv entry to be freed (or NULL). |
760 | */ |
761 | static inline __attribute__((always_inline)) pv_hashed_entry_t |
762 | pmap_pv_remove(pmap_t pmap, |
763 | vm_map_offset_t vaddr, |
764 | ppnum_t *ppnp, |
765 | pt_entry_t *pte, |
766 | boolean_t *was_altacct) |
767 | { |
768 | pv_hashed_entry_t pvh_e; |
769 | pv_rooted_entry_t pv_h; |
770 | pv_hashed_entry_t *pprevh; |
771 | int pvhash_idx; |
772 | uint32_t pv_cnt; |
773 | ppnum_t ppn; |
774 | |
775 | *was_altacct = FALSE; |
776 | pmap_pv_remove_retry: |
777 | ppn = *ppnp; |
778 | pvh_e = PV_HASHED_ENTRY_NULL; |
779 | pv_h = pai_to_pvh(ppn_to_pai(ppn)); |
780 | |
781 | if (__improbable(pv_h->pmap == PMAP_NULL)) { |
782 | pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_ABSENT); |
783 | if (pac == PMAP_ACTION_IGNORE) |
784 | goto pmap_pv_remove_exit; |
785 | else if (pac == PMAP_ACTION_ASSERT) |
786 | panic("Possible memory corruption: pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx, %p, %p): null pv_list, priors: %d" , pmap, vaddr, ppn, *pte, ppnp, pte, pmap_pagetable_corruption_incidents); |
787 | else if (pac == PMAP_ACTION_RETRY_RELOCK) { |
788 | LOCK_PVH(ppn_to_pai(*ppnp)); |
789 | pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED); |
790 | goto pmap_pv_remove_retry; |
791 | } |
792 | else if (pac == PMAP_ACTION_RETRY) |
793 | goto pmap_pv_remove_retry; |
794 | } |
795 | |
796 | if (PVE_VA(pv_h) == vaddr && pv_h->pmap == pmap) { |
797 | *was_altacct = IS_ALTACCT_PAGE(ppn_to_pai(*ppnp), pv_h); |
798 | /* |
799 | * Header is the pv_rooted_entry. |
800 | * We can't free that. If there is a queued |
801 | * entry after this one we remove that |
802 | * from the ppn queue, we remove it from the hash chain |
803 | * and copy it to the rooted entry. Then free it instead. |
804 | */ |
805 | pvh_e = (pv_hashed_entry_t) queue_next(&pv_h->qlink); |
806 | if (pv_h != (pv_rooted_entry_t) pvh_e) { |
807 | /* |
808 | * Entry queued to root, remove this from hash |
809 | * and install as new root. |
810 | */ |
811 | CHK_NPVHASH(); |
812 | pvhash_idx = pvhashidx(pvh_e->pmap, PVE_VA(pvh_e)); |
813 | LOCK_PV_HASH(pvhash_idx); |
814 | remque(&pvh_e->qlink); |
815 | pprevh = pvhash(pvhash_idx); |
816 | if (PV_HASHED_ENTRY_NULL == *pprevh) { |
817 | panic("Possible memory corruption: pmap_pv_remove(%p,0x%llx,0x%x): " |
818 | "empty hash, removing rooted, priors: %d" , |
819 | pmap, vaddr, ppn, pmap_pagetable_corruption_incidents); |
820 | } |
821 | pmap_pvh_unlink(pvh_e); |
822 | UNLOCK_PV_HASH(pvhash_idx); |
823 | pv_h->pmap = pvh_e->pmap; |
824 | pv_h->va_and_flags = pvh_e->va_and_flags; |
825 | /* dispose of pvh_e */ |
826 | } else { |
827 | /* none queued after rooted */ |
828 | pv_h->pmap = PMAP_NULL; |
829 | pvh_e = PV_HASHED_ENTRY_NULL; |
830 | } |
831 | } else { |
832 | /* |
833 | * not removing rooted pv. find it on hash chain, remove from |
834 | * ppn queue and hash chain and free it |
835 | */ |
836 | CHK_NPVHASH(); |
837 | pvhash_idx = pvhashidx(pmap, vaddr); |
838 | LOCK_PV_HASH(pvhash_idx); |
839 | pprevh = pvhash(pvhash_idx); |
840 | if (PV_HASHED_ENTRY_NULL == *pprevh) { |
841 | panic("Possible memory corruption: pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx, %p): empty hash, priors: %d" , |
842 | pmap, vaddr, ppn, *pte, pte, pmap_pagetable_corruption_incidents); |
843 | } |
844 | pvh_e = *pprevh; |
845 | pmap_pv_hashlist_walks++; |
846 | pv_cnt = 0; |
847 | while (PV_HASHED_ENTRY_NULL != pvh_e) { |
848 | pv_cnt++; |
849 | if (pvh_e->pmap == pmap && |
850 | PVE_VA(pvh_e) == vaddr && |
851 | pvh_e->ppn == ppn) |
852 | break; |
853 | pprevh = &pvh_e->nexth; |
854 | pvh_e = pvh_e->nexth; |
855 | } |
856 | |
857 | if (PV_HASHED_ENTRY_NULL == pvh_e) { |
858 | pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_PRESENT); |
859 | |
860 | if (pac == PMAP_ACTION_ASSERT) |
861 | panic("Possible memory corruption: pmap_pv_remove(%p, 0x%llx, 0x%x, 0x%llx, %p, %p): pv not on hash, head: %p, 0x%llx, priors: %d" , pmap, vaddr, ppn, *pte, ppnp, pte, pv_h->pmap, PVE_VA(pv_h), pmap_pagetable_corruption_incidents); |
862 | else { |
863 | UNLOCK_PV_HASH(pvhash_idx); |
864 | if (pac == PMAP_ACTION_RETRY_RELOCK) { |
865 | LOCK_PVH(ppn_to_pai(*ppnp)); |
866 | pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED); |
867 | goto pmap_pv_remove_retry; |
868 | } |
869 | else if (pac == PMAP_ACTION_RETRY) { |
870 | goto pmap_pv_remove_retry; |
871 | } |
872 | else if (pac == PMAP_ACTION_IGNORE) { |
873 | goto pmap_pv_remove_exit; |
874 | } |
875 | } |
876 | } |
877 | |
878 | *was_altacct = IS_ALTACCT_PAGE(ppn_to_pai(*ppnp), pvh_e); |
879 | |
880 | pmap_pv_hashlist_cnts += pv_cnt; |
881 | if (pmap_pv_hashlist_max < pv_cnt) |
882 | pmap_pv_hashlist_max = pv_cnt; |
883 | *pprevh = pvh_e->nexth; |
884 | remque(&pvh_e->qlink); |
885 | UNLOCK_PV_HASH(pvhash_idx); |
886 | } |
887 | pmap_pv_remove_exit: |
888 | return pvh_e; |
889 | } |
890 | |
891 | static inline __attribute__((always_inline)) boolean_t |
892 | pmap_pv_is_altacct( |
893 | pmap_t pmap, |
894 | vm_map_offset_t vaddr, |
895 | ppnum_t ppn) |
896 | { |
897 | pv_hashed_entry_t pvh_e; |
898 | pv_rooted_entry_t pv_h; |
899 | int pvhash_idx; |
900 | boolean_t is_altacct; |
901 | |
902 | pvh_e = PV_HASHED_ENTRY_NULL; |
903 | pv_h = pai_to_pvh(ppn_to_pai(ppn)); |
904 | |
905 | if (__improbable(pv_h->pmap == PMAP_NULL)) { |
906 | return FALSE; |
907 | } |
908 | |
909 | if (PVE_VA(pv_h) == vaddr && pv_h->pmap == pmap) { |
910 | /* |
911 | * Header is the pv_rooted_entry. |
912 | */ |
913 | return IS_ALTACCT_PAGE(ppn, pv_h); |
914 | } |
915 | |
916 | CHK_NPVHASH(); |
917 | pvhash_idx = pvhashidx(pmap, vaddr); |
918 | LOCK_PV_HASH(pvhash_idx); |
919 | pvh_e = *(pvhash(pvhash_idx)); |
920 | while (PV_HASHED_ENTRY_NULL != pvh_e) { |
921 | if (pvh_e->pmap == pmap && |
922 | PVE_VA(pvh_e) == vaddr && |
923 | pvh_e->ppn == ppn) |
924 | break; |
925 | pvh_e = pvh_e->nexth; |
926 | } |
927 | if (PV_HASHED_ENTRY_NULL == pvh_e) { |
928 | is_altacct = FALSE; |
929 | } else { |
930 | is_altacct = IS_ALTACCT_PAGE(ppn, pvh_e); |
931 | } |
932 | UNLOCK_PV_HASH(pvhash_idx); |
933 | |
934 | return is_altacct; |
935 | } |
936 | |
937 | extern int pt_fake_zone_index; |
938 | static inline void |
939 | PMAP_ZINFO_PALLOC(pmap_t pmap, vm_size_t bytes) |
940 | { |
941 | pmap_ledger_credit(pmap, task_ledgers.tkm_private, bytes); |
942 | } |
943 | |
944 | static inline void |
945 | PMAP_ZINFO_PFREE(pmap_t pmap, vm_size_t bytes) |
946 | { |
947 | pmap_ledger_debit(pmap, task_ledgers.tkm_private, bytes); |
948 | } |
949 | |
950 | static inline void |
951 | PMAP_ZINFO_SALLOC(pmap_t pmap, vm_size_t bytes) |
952 | { |
953 | pmap_ledger_credit(pmap, task_ledgers.tkm_shared, bytes); |
954 | } |
955 | |
956 | static inline void |
957 | PMAP_ZINFO_SFREE(pmap_t pmap, vm_size_t bytes) |
958 | { |
959 | pmap_ledger_debit(pmap, task_ledgers.tkm_shared, bytes); |
960 | } |
961 | |
extern boolean_t	pmap_initialized;	/* Has pmap_init completed? */
963 | #define valid_page(x) (pmap_initialized && pmap_valid_page(x)) |
964 | |
965 | int phys_attribute_test( |
966 | ppnum_t phys, |
967 | int bits); |
968 | void phys_attribute_clear( |
969 | ppnum_t phys, |
970 | int bits, |
971 | unsigned int options, |
972 | void *arg); |
973 | |
974 | //#define PCID_DEBUG 1 |
975 | #if PCID_DEBUG |
976 | #define pmap_pcid_log(fmt, args...) \ |
977 | do { \ |
978 | kprintf(fmt, ##args); \ |
979 | printf(fmt, ##args); \ |
980 | } while(0) |
981 | #else |
982 | #define pmap_pcid_log(fmt, args...) |
983 | #endif |
984 | void pmap_pcid_configure(void); |
985 | |
986 | |
987 | /* |
988 | * Atomic 64-bit compare and exchange of a page table entry. |
989 | */ |
990 | static inline boolean_t |
991 | pmap_cmpx_pte(pt_entry_t *entryp, pt_entry_t old, pt_entry_t new) |
992 | { |
993 | boolean_t ret; |
994 | |
995 | /* |
996 | * Load the old value into %rax |
997 | * Load the new value into another register |
998 | * Compare-exchange-quad at address entryp |
999 | * If the compare succeeds, the new value is stored, return TRUE. |
1000 | * Otherwise, no swap is made, return FALSE. |
1001 | */ |
1002 | asm volatile( |
1003 | " lock; cmpxchgq %2,(%3) \n\t" |
1004 | " setz %%al \n\t" |
1005 | " movzbl %%al,%0" |
1006 | : "=a" (ret) |
1007 | : "a" (old), |
1008 | "r" (new), |
1009 | "r" (entryp) |
1010 | : "memory" ); |
1011 | return ret; |
1012 | } |
1013 | |
1014 | extern uint32_t pmap_update_clear_pte_count; |
1015 | |
1016 | static inline void pmap_update_pte(pt_entry_t *mptep, uint64_t pclear_bits, uint64_t pset_bits) { |
1017 | pt_entry_t npte, opte; |
1018 | do { |
1019 | opte = *mptep; |
1020 | if (__improbable(opte == 0)) { |
1021 | pmap_update_clear_pte_count++; |
1022 | break; |
1023 | } |
1024 | npte = opte & ~(pclear_bits); |
1025 | npte |= pset_bits; |
1026 | } while (!pmap_cmpx_pte(mptep, opte, npte)); |
1027 | } |
1028 | |
/*
 * The single pml4 page per pmap is allocated at pmap create time and exists
 * for the duration of the pmap.  We allocate this page in kernel vm.
 * This returns the address of the requested pml4 entry in the top level page.
 */
1034 | static inline |
1035 | pml4_entry_t * |
1036 | pmap64_pml4(pmap_t pmap, vm_map_offset_t vaddr) |
1037 | { |
1038 | if (__improbable((vaddr > 0x00007FFFFFFFFFFFULL) && |
1039 | (vaddr < 0xFFFF800000000000ULL))) { |
1040 | return (NULL); |
1041 | } |
1042 | |
1043 | #if DEBUG |
1044 | return PHYSMAP_PTOV(&((pml4_entry_t *)pmap->pm_cr3)[(vaddr >> PML4SHIFT) & (NPML4PG-1)]); |
1045 | #else |
1046 | return &pmap->pm_pml4[(vaddr >> PML4SHIFT) & (NPML4PG-1)]; |
1047 | #endif |
1048 | } |
1049 | |
1050 | static inline pml4_entry_t * |
1051 | pmap64_user_pml4(pmap_t pmap, vm_map_offset_t vaddr) |
1052 | { |
1053 | if (__improbable((vaddr > 0x00007FFFFFFFFFFFULL) && |
1054 | (vaddr < 0xFFFF800000000000ULL))) { |
1055 | return (NULL); |
1056 | } |
1057 | |
1058 | #if DEBUG |
1059 | return PHYSMAP_PTOV(&((pml4_entry_t *)pmap->pm_ucr3)[(vaddr >> PML4SHIFT) & (NPML4PG-1)]); |
1060 | #else |
1061 | return &pmap->pm_upml4[(vaddr >> PML4SHIFT) & (NPML4PG-1)]; |
1062 | #endif |
1063 | } |
1064 | |
1065 | /* |
1066 | * Returns address of requested PDPT entry in the physmap. |
1067 | */ |
1068 | static inline pdpt_entry_t * |
1069 | pmap64_pdpt(pmap_t pmap, vm_map_offset_t vaddr) |
1070 | { |
1071 | pml4_entry_t newpf; |
1072 | pml4_entry_t *pml4; |
1073 | boolean_t is_ept; |
1074 | |
1075 | pml4 = pmap64_pml4(pmap, vaddr); |
1076 | is_ept = is_ept_pmap(pmap); |
1077 | |
1078 | if (pml4 && (*pml4 & PTE_VALID_MASK(is_ept))) { |
1079 | newpf = *pml4 & PG_FRAME; |
1080 | return &((pdpt_entry_t *) PHYSMAP_PTOV(newpf)) |
1081 | [(vaddr >> PDPTSHIFT) & (NPDPTPG-1)]; |
1082 | } |
1083 | return (NULL); |
1084 | } |
1085 | /* |
1086 | * Returns the address of the requested PDE entry in the physmap. |
1087 | */ |
1088 | static inline pd_entry_t * |
1089 | pmap64_pde(pmap_t pmap, vm_map_offset_t vaddr) |
1090 | { |
1091 | pdpt_entry_t newpf; |
1092 | pdpt_entry_t *pdpt; |
1093 | boolean_t is_ept; |
1094 | |
1095 | pdpt = pmap64_pdpt(pmap, vaddr); |
1096 | is_ept = is_ept_pmap(pmap); |
1097 | |
1098 | if (pdpt && (*pdpt & PTE_VALID_MASK(is_ept))) { |
1099 | newpf = *pdpt & PG_FRAME; |
1100 | return &((pd_entry_t *) PHYSMAP_PTOV(newpf)) |
1101 | [(vaddr >> PDSHIFT) & (NPDPG-1)]; |
1102 | } |
1103 | return (NULL); |
1104 | } |
1105 | |
1106 | static inline pd_entry_t * |
1107 | pmap_pde(pmap_t m, vm_map_offset_t v) |
1108 | { |
1109 | pd_entry_t *pde; |
1110 | |
1111 | pde = pmap64_pde(m, v); |
1112 | |
1113 | return pde; |
1114 | } |
1115 | |
1116 | |
/*
 * Return the address of the mapped pte for vaddr va in pmap pmap.
 *
 * In case the pde maps a superpage, return the pde, which, in this case,
 * is the actual page table entry.
 */
1123 | static inline pt_entry_t * |
1124 | pmap_pte(pmap_t pmap, vm_map_offset_t vaddr) |
1125 | { |
1126 | pd_entry_t *pde; |
1127 | pd_entry_t newpf; |
1128 | boolean_t is_ept; |
1129 | |
1130 | assert(pmap); |
1131 | pde = pmap64_pde(pmap, vaddr); |
1132 | |
1133 | is_ept = is_ept_pmap(pmap); |
1134 | |
1135 | if (pde && (*pde & PTE_VALID_MASK(is_ept))) { |
1136 | if (*pde & PTE_PS) |
1137 | return pde; |
1138 | newpf = *pde & PG_FRAME; |
1139 | return &((pt_entry_t *)PHYSMAP_PTOV(newpf)) |
1140 | [i386_btop(vaddr) & (ppnum_t)(NPTEPG-1)]; |
1141 | } |
1142 | return (NULL); |
1143 | } |
1144 | extern void pmap_alias( |
1145 | vm_offset_t ava, |
1146 | vm_map_offset_t start, |
1147 | vm_map_offset_t end, |
1148 | vm_prot_t prot, |
1149 | unsigned int options); |
1150 | |
1151 | #if DEBUG |
1152 | #define DPRINTF(x...) kprintf(x) |
1153 | #else |
1154 | #define DPRINTF(x...) |
1155 | #endif |
1156 | |
1157 | #endif /* MACH_KERNEL_PRIVATE */ |
1158 | #endif /* _I386_PMAP_INTERNAL_ */ |
1159 | |