/*
 * Copyright (c) 2000-2010 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <i386/proc_reg.h>
#include <i386/cpuid.h>
#include <i386/tsc.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <i386/pmap_internal.h>
#include <i386/pmap_pcid.h>

/*
 * PCID (Process context identifier), a.k.a. tagged TLB, support.
 * On processors with this feature, unless disabled via the -pmap_pcid_disable
 * boot-arg, the following algorithm is in effect:
 * Each processor maintains an array of tag refcounts indexed by tag.
 * Each address space maintains an array of tags indexed by CPU number.
 * Each address space maintains a coherency vector, indexed by CPU number,
 * indicating whether the TLB state for that address space on that CPU has a
 * pending invalidation.
 * On a context switch, a refcounted tag is lazily assigned to the newly
 * dispatched (CPU, address space) tuple.
 * When an inactive address space is invalidated on a remote CPU, it is marked
 * for invalidation upon the next dispatch. Some invalidations are
 * also processed at the user/kernel boundary.
 * Provisions are made for the case where a CPU is overcommitted, i.e.
 * more active address spaces exist than the number of logical tags
 * provided for by the processor architecture (currently 4096).
 * The algorithm assumes the processor remaps the logical tags
 * to physical TLB context IDs in an LRU fashion for efficiency. (DRK '10)
 */
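
/*
 * Illustrative sketch (not compiled) of how the structures described above
 * relate, using the field names referenced later in this file:
 *
 *   pcid_t     tag   = pmap->pmap_pcid_cpus[cpu];                    // per-pmap, per-CPU tag
 *   pcid_ref_t refs  = cpu_datap(cpu)->cpu_pcid_data->cpu_pcid_refcounts[tag];
 *   boolean_t  stale = (pmap->pmap_pcid_coherency_vector[cpu] != 0); // pending invalidation?
 *
 * A tag is allocated on first dispatch of a pmap to a CPU, its per-CPU
 * refcount tracks how many address spaces currently share it, and the
 * coherency vector defers TLB invalidations for address spaces that are not
 * currently active on a given CPU.
 */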

uint32_t pmap_pcid_ncpus;
boolean_t pmap_pcid_disabled = FALSE;
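/* Per-CPU PCID bookkeeping (tag refcounts, free-slot hint, last pmap
 * dispatched per tag). The array is cache-line (64-byte) aligned, presumably
 * to keep each CPU's bookkeeping from false-sharing lines with its neighbors.
 */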
pcid_cdata_t pcid_data[MAX_CPUS] __attribute__((aligned(64)));

void pmap_pcid_configure(void) {
	int ccpu = cpu_number();
	uintptr_t cr4 = get_cr4();
	boolean_t pcid_present = FALSE;

	pmap_pcid_log("PCID configure invoked on CPU %d\n", ccpu);
	pmap_assert(ml_get_interrupts_enabled() == FALSE || get_preemption_level() != 0);
	pmap_assert(cpu_mode_is64bit());

	if (PE_parse_boot_argn("-pmap_pcid_disable", &pmap_pcid_disabled, sizeof(pmap_pcid_disabled))) {
		pmap_pcid_log("PMAP: PCID feature disabled\n");
		printf("PMAP: PCID feature disabled, %u\n", pmap_pcid_disabled);
		kprintf("PMAP: PCID feature disabled %u\n", pmap_pcid_disabled);
	}
	/* no_shared_cr3+PCID is currently unsupported */
	/* TODO: remove the no_shared_cr3 interaction */
#if DEBUG
	if (pmap_pcid_disabled == FALSE)
		no_shared_cr3 = FALSE;
	else
		no_shared_cr3 = TRUE;
#else
	if (no_shared_cr3)
		pmap_pcid_disabled = TRUE;
#endif
	if (pmap_pcid_disabled || no_shared_cr3) {
		unsigned i;
		/* Reset PCID status, as we may have picked up
		 * strays if discovered prior to platform
		 * expert initialization.
		 */
		pmap_pcid_ncpus = 0;
		for (i = 0; i < real_ncpus; i++) {
			if (cpu_datap(i)) {
				cpu_datap(i)->cpu_pmap_pcid_enabled = FALSE;
			}
		}
		cpu_datap(ccpu)->cpu_pmap_pcid_enabled = FALSE;
		return;
	}
	/* DRKTODO: assert if features haven't been discovered yet. Redundant
	 * invocation of cpu_mode_init and descendants masks this for now.
	 */
	if ((cpuid_features() & CPUID_FEATURE_PCID))
		pcid_present = TRUE;
	else {
		cpu_datap(ccpu)->cpu_pmap_pcid_enabled = FALSE;
		pmap_pcid_log("PMAP: PCID not detected CPU %d\n", ccpu);
		return;
	}
	if ((cr4 & (CR4_PCIDE | CR4_PGE)) == (CR4_PCIDE | CR4_PGE)) {
		cpu_datap(ccpu)->cpu_pmap_pcid_enabled = TRUE;
		pmap_pcid_log("PMAP: PCID already enabled %d\n", ccpu);
		return;
	}
	if (pcid_present == TRUE) {
		pmap_pcid_log("Pre-PCID: CR0: 0x%lx, CR3: 0x%lx, CR4(CPU %d): 0x%lx\n", get_cr0(), get_cr3_raw(), ccpu, cr4);

		if (cpu_number() >= PMAP_PCID_MAX_CPUS) {
			panic("PMAP_PCID_MAX_CPUS %d\n", cpu_number());
		}
		if ((get_cr4() & CR4_PGE) == 0) {
			set_cr4(get_cr4() | CR4_PGE);
			pmap_pcid_log("Toggled PGE ON (CPU: %d)\n", ccpu);
		}
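		/* Enable PCIDs. CR4.PCIDE can be set only while in long mode
		 * (hence the cpu_mode_is64bit() assert above) and with the low
		 * 12 bits of CR3 clear; a global TLB flush follows the
		 * transition.
		 */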
		set_cr4(get_cr4() | CR4_PCIDE);
		pmap_pcid_log("Post PCID: CR0: 0x%lx, CR3: 0x%lx, CR4(CPU %d): 0x%lx\n", get_cr0(), get_cr3_raw(), ccpu, get_cr4());
		tlb_flush_global();
		cpu_datap(ccpu)->cpu_pmap_pcid_enabled = TRUE;

		if (OSIncrementAtomic(&pmap_pcid_ncpus) == machine_info.max_cpus) {
			pmap_pcid_log("All PCIDs enabled: real_ncpus: %d, pmap_pcid_ncpus: %d\n", real_ncpus, pmap_pcid_ncpus);
		}
		cpu_datap(ccpu)->cpu_pmap_pcid_coherentp =
		    cpu_datap(ccpu)->cpu_pmap_pcid_coherentp_kernel =
		    &(kernel_pmap->pmap_pcid_coherency_vector[ccpu]);
		cpu_datap(ccpu)->cpu_pcid_data = &pcid_data[ccpu];
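		/* PCID 0 is reserved for the kernel pmap on every CPU (see
		 * pmap_pcid_initialize_kernel); mark it in use so it is never
		 * treated as a free tag.
		 */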
		cpu_datap(ccpu)->cpu_pcid_data->cpu_pcid_refcounts[0] = 1;
	}
}

void pmap_pcid_initialize(pmap_t p) {
	unsigned i;
	unsigned nc = sizeof(p->pmap_pcid_cpus) / sizeof(pcid_t);

	pmap_assert(nc >= real_ncpus);
	for (i = 0; i < nc; i++) {
		p->pmap_pcid_cpus[i] = PMAP_PCID_INVALID_PCID;
		/* We assume here that the coherency vector is zeroed by
		 * pmap_create
		 */
	}
}

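/*
 * The kernel pmap is special-cased: it runs with tag 0 on every CPU
 * (pmap_pcid_activate asserts kernel_pmap <=> pcid 0), so it never goes
 * through pmap_pcid_allocate_pcid.
 */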
void pmap_pcid_initialize_kernel(pmap_t p) {
	unsigned i;
	unsigned nc = sizeof(p->pmap_pcid_cpus) / sizeof(pcid_t);

	for (i = 0; i < nc; i++) {
		p->pmap_pcid_cpus[i] = 0;
		/* We assume here that the coherency vector is zeroed by
		 * pmap_create
		 */
	}
}

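/*
 * Allocate a tag for (this pmap, ccpu): prefer the free-slot hint, then do a
 * linear scan for an unreferenced tag; if every tag is in use (the
 * overcommitted case described at the top of this file), share the tag with
 * the smallest refcount. Sharing is safe because pmap_pcid_activate detects
 * the resulting conflict and does not preserve the shared tag's translations.
 */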
pcid_t pmap_pcid_allocate_pcid(int ccpu) {
	int i;
	pcid_ref_t cur_min = 0xFF;
	uint32_t cur_min_index = ~1;
	pcid_ref_t *cpu_pcid_refcounts = &cpu_datap(ccpu)->cpu_pcid_data->cpu_pcid_refcounts[0];
	pcid_ref_t old_count;

	if ((i = cpu_datap(ccpu)->cpu_pcid_data->cpu_pcid_free_hint) != 0) {
		if (cpu_pcid_refcounts[i] == 0) {
			(void)__sync_fetch_and_add(&cpu_pcid_refcounts[i], 1);
			cpu_datap(ccpu)->cpu_pcid_data->cpu_pcid_free_hint = 0;
			return i;
		}
	}
	/* Linear scan to discover free slot, with hint. Room for optimization
	 * but with intelligent prefetchers this should be
	 * adequately performant, as it is invoked
	 * only on first dispatch of a new address space onto
	 * a given processor. DRKTODO: use larger loads and
	 * zero byte discovery -- any pattern != ~1 should
	 * signify a free slot.
	 */
	for (i = PMAP_PCID_MIN_PCID; i < PMAP_PCID_MAX_PCID; i++) {
		pcid_ref_t cur_refcount = cpu_pcid_refcounts[i];

		pmap_assert(cur_refcount < PMAP_PCID_MAX_REFCOUNT);

		if (cur_refcount == 0) {
			(void)__sync_fetch_and_add(&cpu_pcid_refcounts[i], 1);
			return i;
		} else {
			if (cur_refcount < cur_min) {
				cur_min_index = i;
				cur_min = cur_refcount;
			}
		}
	}
	pmap_assert(cur_min_index > 0 && cur_min_index < PMAP_PCID_MAX_PCID);
	/* Consider "rebalancing" tags actively in highly oversubscribed cases
	 * perhaps selecting tags with lower activity.
	 */

	old_count = __sync_fetch_and_add(&cpu_pcid_refcounts[cur_min_index], 1);
	pmap_assert(old_count < PMAP_PCID_MAX_REFCOUNT);
	return (cur_min_index);
}

void pmap_pcid_deallocate_pcid(int ccpu, pmap_t tpmap) {
	pcid_t pcid;
	pmap_t lp;
	pcid_ref_t prior_count;

	pcid = tpmap->pmap_pcid_cpus[ccpu];
	pmap_assert(pcid != PMAP_PCID_INVALID_PCID);
	if (pcid == PMAP_PCID_INVALID_PCID)
		return;

	lp = cpu_datap(ccpu)->cpu_pcid_data->cpu_pcid_last_pmap_dispatched[pcid];
	pmap_assert(pcid > 0 && pcid < PMAP_PCID_MAX_PCID);
	pmap_assert(cpu_datap(ccpu)->cpu_pcid_data->cpu_pcid_refcounts[pcid] >= 1);

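	/* If this tag's last-dispatched record still points at the departing
	 * pmap, clear it. Otherwise a later pmap allocated at the same address
	 * could be mistaken for the prior owner by pmap_pcid_activate's
	 * conflict check, suppressing a needed flush.
	 */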
	if (lp == tpmap)
		(void)__sync_bool_compare_and_swap(&cpu_datap(ccpu)->cpu_pcid_data->cpu_pcid_last_pmap_dispatched[pcid], tpmap, PMAP_INVALID);

	if ((prior_count = __sync_fetch_and_sub(&cpu_datap(ccpu)->cpu_pcid_data->cpu_pcid_refcounts[pcid], 1)) == 1) {
		cpu_datap(ccpu)->cpu_pcid_data->cpu_pcid_free_hint = pcid;
	}
	pmap_assert(prior_count <= PMAP_PCID_MAX_REFCOUNT);
}

void pmap_destroy_pcid_sync(pmap_t p) {
	int i;
	pmap_assert(ml_get_interrupts_enabled() == FALSE || get_preemption_level() != 0);
	for (i = 0; i < PMAP_PCID_MAX_CPUS; i++)
		if (p->pmap_pcid_cpus[i] != PMAP_PCID_INVALID_PCID)
			pmap_pcid_deallocate_pcid(i, p);
}

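/*
 * Return the PCID under which this address space will actually execute on
 * ccpu. If the pmap leaves page zero accessible, the kernel runs on
 * kernel_pmap's CR3 instead (except while a copyio window is active), so
 * report the kernel pmap's tag in that case.
 */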
pcid_t pcid_for_pmap_cpu_tuple(pmap_t cpmap, thread_t cthread, int ccpu) {
	pmap_t active_pmap = cpmap;

	if (__improbable(cpmap->pagezero_accessible)) {
		if ((cthread->machine.specFlags & CopyIOActive) == 0) {
			active_pmap = kernel_pmap;
		}
	}

	return active_pmap->pmap_pcid_cpus[ccpu];
}

int npz = 0;

#if PMAP_ASSERT
#define PCID_RECORD_SIZE 128
uint64_t pcid_record_array[PCID_RECORD_SIZE];
#endif

void pmap_pcid_activate(pmap_t tpmap, int ccpu, boolean_t nopagezero, boolean_t copyio) {
	pcid_t new_pcid = tpmap->pmap_pcid_cpus[ccpu];
	pmap_t last_pmap;
	boolean_t pcid_conflict = FALSE, pending_flush = FALSE;
	pcid_cdata_t *pcdata = cpu_datap(ccpu)->cpu_pcid_data;

	pmap_assert(cpu_datap(ccpu)->cpu_pmap_pcid_enabled);
	if (__improbable(new_pcid == PMAP_PCID_INVALID_PCID)) {
		new_pcid = tpmap->pmap_pcid_cpus[ccpu] = pmap_pcid_allocate_pcid(ccpu);
	}

	pmap_assert(new_pcid != PMAP_PCID_INVALID_PCID);
#ifdef PCID_ASSERT
	cpu_datap(ccpu)->cpu_last_pcid = cpu_datap(ccpu)->cpu_active_pcid;
#endif
	cpu_datap(ccpu)->cpu_active_pcid = new_pcid;

	pending_flush = (tpmap->pmap_pcid_coherency_vector[ccpu] != 0);
	if (__probable(pending_flush == FALSE)) {
		last_pmap = pcdata->cpu_pcid_last_pmap_dispatched[new_pcid];
		pcid_conflict = ((last_pmap != NULL) && (tpmap != last_pmap));
	}
	if (__improbable(pending_flush || pcid_conflict)) {
		pmap_pcid_validate_cpu(tpmap, ccpu);
	}
	/* Consider making this a unique id */
	pcdata->cpu_pcid_last_pmap_dispatched[new_pcid] = tpmap;

	pmap_assert(new_pcid < PMAP_PCID_MAX_PCID);
	pmap_assert(((tpmap == kernel_pmap) && new_pcid == 0) ||
	    ((new_pcid != PMAP_PCID_INVALID_PCID) && (new_pcid != 0)));
#if PMAP_ASSERT
	pcid_record_array[ccpu % PCID_RECORD_SIZE] = tpmap->pm_cr3 | new_pcid | (((uint64_t)(!(pending_flush || pcid_conflict))) << 63);
	pml4_entry_t *pml4 = pmap64_pml4(tpmap, 0ULL);
	/* Diagnostic to detect pagetable anchor corruption */
	if (pml4[KERNEL_PML4_INDEX] != kernel_pmap->pm_pml4[KERNEL_PML4_INDEX])
		__asm__ volatile("int3");
#endif /* PMAP_ASSERT */

	pmap_paddr_t ncr3 = tpmap->pm_cr3;

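	/* No-pagezero handling: if the dispatched pmap leaves page zero
	 * accessible, stay on kernel_pmap's CR3 and tag unless a copyio
	 * window requires the user mappings (mirroring
	 * pcid_for_pmap_cpu_tuple above); force a flush in either case.
	 */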
	if (__improbable(nopagezero)) {
		pending_flush = TRUE;
		if (copyio == FALSE) {
			new_pcid = kernel_pmap->pmap_pcid_cpus[ccpu];
			ncr3 = kernel_pmap->pm_cr3;
		}
		cpu_datap(ccpu)->cpu_kernel_pcid = kernel_pmap->pmap_pcid_cpus[ccpu];
		npz++;
	}

	uint64_t preserve = !(pending_flush || pcid_conflict);
	set_cr3_composed(ncr3, new_pcid, preserve);
#if DEBUG
	cpu_datap(ccpu)->cpu_pcid_last_cr3 = ncr3 | new_pcid | preserve << 63;
#endif
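	/* The user-visible CR3 (pm_ucr3) is tagged with a shadow PCID offset
	 * by PMAP_PCID_MAX_PCID, presumably so that the user- and kernel-half
	 * translations of the same address space carry distinct tags; the
	 * kernel pmap (tag 0) keeps tag 0.
	 */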
	uint64_t spcid = (new_pcid + PMAP_PCID_MAX_PCID);
	if (new_pcid == 0) {
		spcid = 0;
	}
	uint64_t scr3 = tpmap->pm_ucr3 | spcid;

	cpu_datap(ccpu)->cpu_ucr3 = scr3;
	cpu_shadowp(ccpu)->cpu_ucr3 = scr3;

	cpu_shadowp(ccpu)->cpu_task_cr3 = ncr3 | new_pcid;

	if (!pending_flush) {
		/* We did not previously observe a pending invalidation for this
		 * ASID. However, the load from the coherency vector
		 * could've been reordered ahead of the store to the
		 * active_cr3 field (in the context switch path, our
		 * caller). Re-consult the pending invalidation vector
		 * after the CR3 write. We rely on MOV CR3's documented
		 * serializing property to avoid insertion of an expensive
		 * barrier. (DRK)
		 */
		pending_flush = (tpmap->pmap_pcid_coherency_vector[ccpu] != 0);
		if (__improbable(pending_flush != 0)) {
			pmap_pcid_validate_cpu(tpmap, ccpu);
			set_cr3_composed(ncr3, new_pcid, FALSE);
		}
	}
	cpu_datap(ccpu)->cpu_pmap_pcid_coherentp = &(tpmap->pmap_pcid_coherency_vector[ccpu]);
#if DEBUG
	KERNEL_DEBUG_CONSTANT(0x9c1d0000, tpmap, new_pcid, pending_flush, pcid_conflict, 0);
#endif
}