/*
 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */

/*
 */

#include <kern/cpu_number.h>
#include <kern/kalloc.h>
#include <kern/cpu_data.h>
#include <mach/mach_types.h>
#include <mach/machine.h>
#include <mach/vm_map.h>
#include <mach/machine/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_map.h>

#include <i386/bit_routines.h>
#include <i386/mp_desc.h>
#include <i386/misc_protos.h>
#include <i386/mp.h>
#include <i386/pmap.h>
#include <i386/postcode.h>
#include <i386/pmap_internal.h>
#if CONFIG_MCA
#include <i386/machine_check.h>
#endif

#include <kern/misc_protos.h>

#if MONOTONIC
#include <kern/monotonic.h>
#endif /* MONOTONIC */
#include <san/kasan.h>

#define K_INTR_GATE (ACC_P|ACC_PL_K|ACC_INTR_GATE)
#define U_INTR_GATE (ACC_P|ACC_PL_U|ACC_INTR_GATE)
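
/*
 * ACC_P marks the gate present, ACC_PL_K/ACC_PL_U set the descriptor
 * privilege level (DPL 0 for kernel-only gates; DPL 3 so user-mode
 * software interrupts such as the syscall vectors may invoke the gate),
 * and ACC_INTR_GATE selects the 64-bit interrupt-gate type.
 */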

// Declare macros that will declare the externs
#define TRAP(n, name)           extern void *name ;
#define TRAP_ERR(n, name)       extern void *name ;
#define TRAP_SPC(n, name)       extern void *name ;
#define TRAP_IST1(n, name)      extern void *name ;
#define TRAP_IST2(n, name)      extern void *name ;
#define INTERRUPT(n)            extern void *_intr_ ## n ;
#define USER_TRAP(n, name)      extern void *name ;
#define USER_TRAP_SPC(n, name)  extern void *name ;

// Include the table to declare the externs
#include "../x86_64/idt_table.h"

// Undef the macros, then redefine them so we can declare the table
#undef TRAP
#undef TRAP_ERR
#undef TRAP_SPC
#undef TRAP_IST1
#undef TRAP_IST2
#undef INTERRUPT
#undef USER_TRAP
#undef USER_TRAP_SPC

#define TRAP(n, name) \
    [n] = { \
        (uintptr_t)&name, \
        KERNEL64_CS, \
        0, \
        K_INTR_GATE, \
        0 \
    },

#define TRAP_ERR TRAP
#define TRAP_SPC TRAP

#define TRAP_IST1(n, name) \
    [n] = { \
        (uintptr_t)&name, \
        KERNEL64_CS, \
        1, \
        K_INTR_GATE, \
        0 \
    },

#define TRAP_IST2(n, name) \
    [n] = { \
        (uintptr_t)&name, \
        KERNEL64_CS, \
        2, \
        K_INTR_GATE, \
        0 \
    },

#define INTERRUPT(n) \
    [n] = { \
        (uintptr_t)&_intr_ ## n, \
        KERNEL64_CS, \
        0, \
        K_INTR_GATE, \
        0 \
    },

#define USER_TRAP(n, name) \
    [n] = { \
        (uintptr_t)&name, \
        KERNEL64_CS, \
        0, \
        U_INTR_GATE, \
        0 \
    },

#define USER_TRAP_SPC USER_TRAP

// Declare the table using the macros we just set up
struct fake_descriptor64 master_idt64[IDTSZ]
    __attribute__ ((section("__HIB,__desc")))
    __attribute__ ((aligned(PAGE_SIZE))) = {
#include "../x86_64/idt_table.h"
};
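
/*
 * For illustration (assuming idt_table.h's usual spelling of the
 * divide-error entry), an entry such as
 *      TRAP(0x00, idt64_zero_div)
 * expands to the designated initializer
 *      [0x00] = { (uintptr_t)&idt64_zero_div, KERNEL64_CS, 0, K_INTR_GATE, 0 },
 * i.e. a kernel interrupt gate on IST 0 in fake_descriptor64 format,
 * converted to the hardware layout later by fix_desc64().
 */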

/*
 * First CPU's interrupt stack.
 */
extern uint32_t low_intstack[];     /* bottom */
extern uint32_t low_eintstack[];    /* top */

/*
 * Per-cpu data area pointers.
 */
cpu_data_t cpshadows[MAX_CPUS] __attribute__((aligned(64))) __attribute__((section("__HIB, __desc")));
cpu_data_t scdatas[MAX_CPUS] __attribute__((aligned(64))) = {
    [0].cpu_this = &scdatas[0],
    [0].cpu_nanotime = &pal_rtc_nanotime_info,
    [0].cpu_int_stack_top = (vm_offset_t) low_eintstack,
    [0].cd_shadow = &cpshadows[0]
};
cpu_data_t *cpu_data_master = &scdatas[0];

cpu_data_t *cpu_data_ptr[MAX_CPUS] = { [0] = &scdatas[0] };

decl_simple_lock_data(,ncpus_lock);     /* protects real_ncpus */
unsigned int real_ncpus = 1;
unsigned int max_ncpus = MAX_CPUS;

extern void hi64_sysenter(void);
extern void hi64_syscall(void);

typedef struct {
    struct real_descriptor pcldts[LDTSZ];
} cldt_t;

cpu_desc_table64_t scdtables[MAX_CPUS] __attribute__((aligned(64))) __attribute__((section("__HIB, __desc")));
cpu_fault_stack_t scfstks[MAX_CPUS] __attribute__((aligned(64))) __attribute__((section("__HIB, __desc")));

cldt_t *dyn_ldts;

/*
 * Multiprocessor i386/i486 systems use a separate copy of the
 * GDT, IDT, LDT, and kernel TSS per processor. The first three
 * are separate to avoid lock contention: the i386 uses locked
 * memory cycles to access the descriptor tables. The TSS is
 * separate since each processor needs its own kernel stack,
 * and since using a TSS marks it busy.
 */

/*
 * Allocate and initialize the per-processor descriptor tables.
 */

/*
 * This is the expanded, 64-bit variant of the kernel LDT descriptor.
 * When switching to 64-bit mode this replaces the KERNEL_LDT entry
 * and the following empty slot. This enables the LDT to be referenced
 * in the uber-space remapping window on the kernel.
 */
struct fake_descriptor64 kernel_ldt_desc64 = {
    0,
    LDTSZ_MIN*sizeof(struct fake_descriptor)-1,
    0,
    ACC_P|ACC_PL_K|ACC_LDT,
    0
};
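
/*
 * The fake_descriptor64 initializers above and below list the fields in
 * declaration order: offset64 (base address, patched at init time),
 * lim_or_seg (limit), size_or_IST, access, and reserved.
 */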

/*
 * This is the expanded, 64-bit variant of the kernel TSS descriptor.
 * It follows the pattern of the KERNEL_LDT.
 */
struct fake_descriptor64 kernel_tss_desc64 = {
    0,
    sizeof(struct x86_64_tss)-1,
    0,
    ACC_P|ACC_PL_K|ACC_TSS,
    0
};

/*
 * Convert a descriptor from fake to real format.
 *
 * Fake descriptor format:
 *  bytes 0..3      base 31..0
 *  bytes 4..5      limit 15..0
 *  byte  6         access byte 2 | limit 19..16
 *  byte  7         access byte 1
 *
 * Real descriptor format:
 *  bytes 0..1      limit 15..0
 *  bytes 2..3      base 15..0
 *  byte  4         base 23..16
 *  byte  5         access byte 1
 *  byte  6         access byte 2 | limit 19..16
 *  byte  7         base 31..24
 *
 * Fake gate format:
 *  bytes 0..3      offset
 *  bytes 4..5      selector
 *  byte  6         word count << 4 (to match fake descriptor)
 *  byte  7         access byte 1
 *
 * Real gate format:
 *  bytes 0..1      offset 15..0
 *  bytes 2..3      selector
 *  byte  4         word count
 *  byte  5         access byte 1
 *  bytes 6..7      offset 31..16
 */
void
fix_desc(void *d, int num_desc) {
    //early_kprintf("fix_desc(%x, %x)\n", d, num_desc);
    uint8_t *desc = (uint8_t*) d;

    do {
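        /*
         * Access byte 1: bit 4 (S) clear means a system descriptor,
         * and type bit 2 set distinguishes the gate-style layouts
         * (call/interrupt/trap/task gates) from other system
         * descriptors such as TSS and LDT.
         */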
        if ((desc[7] & 0x14) == 0x04) { /* gate */
            uint32_t offset;
            uint16_t selector;
            uint8_t wordcount;
            uint8_t acc;

            offset = *((uint32_t*)(desc));
            selector = *((uint16_t*)(desc+4));
            wordcount = desc[6] >> 4;
            acc = desc[7];

            *((uint16_t*)desc) = offset & 0xFFFF;
            *((uint16_t*)(desc+2)) = selector;
            desc[4] = wordcount;
            desc[5] = acc;
            *((uint16_t*)(desc+6)) = offset >> 16;

        } else { /* descriptor */
            uint32_t base;
            uint16_t limit;
            uint8_t acc1, acc2;

            base = *((uint32_t*)(desc));
            limit = *((uint16_t*)(desc+4));
            acc2 = desc[6];
            acc1 = desc[7];

            *((uint16_t*)(desc)) = limit;
            *((uint16_t*)(desc+2)) = base & 0xFFFF;
            desc[4] = (base >> 16) & 0xFF;
            desc[5] = acc1;
            desc[6] = acc2;
            desc[7] = base >> 24;
        }
        desc += 8;
    } while (--num_desc);
}

void
fix_desc64(void *descp, int count)
{
    struct fake_descriptor64 *fakep;
    union {
        struct real_gate64 gate;
        struct real_descriptor64 desc;
    } real;
    int i;

    fakep = (struct fake_descriptor64 *) descp;

    for (i = 0; i < count; i++, fakep++) {
        /*
         * Construct the real descriptor locally.
         */

        bzero((void *) &real, sizeof(real));

        switch (fakep->access & ACC_TYPE) {
        case 0:
            break;
        case ACC_CALL_GATE:
        case ACC_INTR_GATE:
        case ACC_TRAP_GATE:
            real.gate.offset_low16 = (uint16_t)(fakep->offset64 & 0xFFFF);
            real.gate.selector16 = fakep->lim_or_seg & 0xFFFF;
            real.gate.IST = fakep->size_or_IST & 0x7;
            real.gate.access8 = fakep->access;
            real.gate.offset_high16 = (uint16_t)((fakep->offset64>>16) & 0xFFFF);
            real.gate.offset_top32 = (uint32_t)(fakep->offset64>>32);
            break;
        default:    /* Otherwise */
            real.desc.limit_low16 = fakep->lim_or_seg & 0xFFFF;
            real.desc.base_low16 = (uint16_t)(fakep->offset64 & 0xFFFF);
            real.desc.base_med8 = (uint8_t)((fakep->offset64 >> 16) & 0xFF);
            real.desc.access8 = fakep->access;
            real.desc.limit_high4 = (fakep->lim_or_seg >> 16) & 0xF;
            real.desc.granularity4 = fakep->size_or_IST;
            real.desc.base_high8 = (uint8_t)((fakep->offset64 >> 24) & 0xFF);
            real.desc.base_top32 = (uint32_t)(fakep->offset64>>32);
        }

        /*
         * Now copy back over the fake structure.
         */
        bcopy((void *) &real, (void *) fakep, sizeof(real));
    }
}

extern unsigned mldtsz;
void
cpu_desc_init(cpu_data_t *cdp)
{
    cpu_desc_index_t *cdi = &cdp->cpu_desc_index;

    if (cdp == cpu_data_master) {
        /*
         * Populate the double-mapped 'u' and base 'b' fields in the
         * KTSS with I/G/LDT and sysenter stack data.
         */
        cdi->cdi_ktssu = (void *)DBLMAP(&master_ktss64);
        cdi->cdi_ktssb = (void *)&master_ktss64;
        cdi->cdi_sstku = (vm_offset_t) DBLMAP(&master_sstk.top);
        cdi->cdi_sstkb = (vm_offset_t) &master_sstk.top;
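
        /*
         * The 'u' variants are aliases of these structures in the
         * double-mapped trampoline region (via DBLMAP()), kept
         * addressable across the user/kernel page-table split; the
         * 'b' variants are the kernel-base addresses.
         */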

        cdi->cdi_gdtu.ptr = (void *)DBLMAP((uintptr_t) &master_gdt);
        cdi->cdi_gdtb.ptr = (void *)&master_gdt;
        cdi->cdi_idtu.ptr = (void *)DBLMAP((uintptr_t) &master_idt64);
        cdi->cdi_idtb.ptr = (void *)((uintptr_t) &master_idt64);
        cdi->cdi_ldtu = (struct fake_descriptor *) (void *) DBLMAP((uintptr_t)&master_ldt[0]);
        cdi->cdi_ldtb = (struct fake_descriptor *) (void *) &master_ldt[0];

        /* Replace the expanded LDTs and TSS slots in the GDT */
        kernel_ldt_desc64.offset64 = (uintptr_t) cdi->cdi_ldtu;
        *(struct fake_descriptor64 *) &master_gdt[sel_idx(KERNEL_LDT)] =
            kernel_ldt_desc64;
        *(struct fake_descriptor64 *) &master_gdt[sel_idx(USER_LDT)] =
            kernel_ldt_desc64;
        kernel_tss_desc64.offset64 = (uintptr_t) DBLMAP(&master_ktss64);
        *(struct fake_descriptor64 *) &master_gdt[sel_idx(KERNEL_TSS)] =
            kernel_tss_desc64;

        /* Fix up the expanded descriptors for 64-bit. */
        fix_desc64((void *) &master_idt64, IDTSZ);
        fix_desc64((void *) &master_gdt[sel_idx(KERNEL_LDT)], 1);
        fix_desc64((void *) &master_gdt[sel_idx(USER_LDT)], 1);
        fix_desc64((void *) &master_gdt[sel_idx(KERNEL_TSS)], 1);

        /*
         * Set the NMI/fault stacks as IST2/IST1 in the 64-bit TSS
         */
        master_ktss64.ist2 = (uintptr_t) low_eintstack;
        master_ktss64.ist1 = (uintptr_t) low_eintstack - sizeof(x86_64_intr_stack_frame_t);
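        /*
         * Note: the IST1 (fault) stack top sits one interrupt frame
         * below the IST2 (NMI) stack top, so a fault taken while on
         * the NMI stack leaves the hardware-pushed NMI frame at the
         * top of the shared region intact.
         */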
    } else if (cdi->cdi_ktssu == NULL) { /* Skipping re-init on wake */
        cpu_desc_table64_t *cdt = (cpu_desc_table64_t *) cdp->cpu_desc_tablep;

        cdi->cdi_idtu.ptr = (void *)DBLMAP((uintptr_t) &master_idt64);

        cdi->cdi_ktssu = (void *)DBLMAP(&cdt->ktss);
        cdi->cdi_ktssb = (void *)(&cdt->ktss);
        cdi->cdi_sstku = (vm_offset_t)DBLMAP(&cdt->sstk.top);
        cdi->cdi_sstkb = (vm_offset_t)(&cdt->sstk.top);
        cdi->cdi_ldtu = (void *)LDTALIAS(cdp->cpu_ldtp);
        cdi->cdi_ldtb = (void *)(cdp->cpu_ldtp);

        /*
         * Copy the tables
         */
        bcopy((char *)master_gdt, (char *)cdt->gdt, sizeof(master_gdt));
        bcopy((char *)master_ldt, (char *)cdp->cpu_ldtp, mldtsz);
        bcopy((char *)&master_ktss64, (char *)&cdt->ktss, sizeof(struct x86_64_tss));
        cdi->cdi_gdtu.ptr = (void *)DBLMAP(cdt->gdt);
        cdi->cdi_gdtb.ptr = (void *)(cdt->gdt);
        /*
         * Fix up the entries in the GDT to point to
         * this LDT and this TSS.
         * Note reuse of the global 'kernel_ldt_desc64', which is not
         * concurrency-safe. Higher-level synchronization is expected.
         */
        kernel_ldt_desc64.offset64 = (uintptr_t) cdi->cdi_ldtu;
        *(struct fake_descriptor64 *) &cdt->gdt[sel_idx(KERNEL_LDT)] =
            kernel_ldt_desc64;
        fix_desc64(&cdt->gdt[sel_idx(KERNEL_LDT)], 1);

        kernel_ldt_desc64.offset64 = (uintptr_t) cdi->cdi_ldtu;
        *(struct fake_descriptor64 *) &cdt->gdt[sel_idx(USER_LDT)] =
            kernel_ldt_desc64;
        fix_desc64(&cdt->gdt[sel_idx(USER_LDT)], 1);

        kernel_tss_desc64.offset64 = (uintptr_t) cdi->cdi_ktssu;
        *(struct fake_descriptor64 *) &cdt->gdt[sel_idx(KERNEL_TSS)] =
            kernel_tss_desc64;
        fix_desc64(&cdt->gdt[sel_idx(KERNEL_TSS)], 1);

        /* Set (zeroed) fault stack as IST1, NMI intr stack IST2 */
        uint8_t *cfstk = &scfstks[cdp->cpu_number].fstk[0];
        cdt->fstkp = cfstk;
        bzero((void *) cfstk, FSTK_SZ);
        cdt->ktss.ist2 = DBLMAP((uint64_t)cdt->fstkp + FSTK_SZ);
        cdt->ktss.ist1 = cdt->ktss.ist2 - sizeof(x86_64_intr_stack_frame_t);
    }

    /* Require that the top of the sysenter stack is 16-byte aligned */
    if ((cdi->cdi_sstku % 16) != 0)
        panic("cpu_desc_init() sysenter stack not 16-byte aligned");
}

void
cpu_desc_load(cpu_data_t *cdp)
{
    cpu_desc_index_t *cdi = &cdp->cpu_desc_index;

    postcode(CPU_DESC_LOAD_ENTRY);

    /* Stuff the kernel per-cpu data area address into the MSRs */
    postcode(CPU_DESC_LOAD_GS_BASE);
    wrmsr64(MSR_IA32_GS_BASE, (uintptr_t) cdp);
    postcode(CPU_DESC_LOAD_KERNEL_GS_BASE);
    wrmsr64(MSR_IA32_KERNEL_GS_BASE, (uintptr_t) cdp);

    /*
     * Ensure the TSS segment's busy bit is clear. This is required
     * for the case of reloading descriptors at wake to avoid
     * their complete re-initialization.
     */
    gdt_desc_p(KERNEL_TSS)->access &= ~ACC_TSS_BUSY;

    /* Load the GDT, LDT, IDT and TSS */
    cdi->cdi_gdtb.size = sizeof(struct real_descriptor)*GDTSZ - 1;
    cdi->cdi_gdtu.size = cdi->cdi_gdtb.size;
    cdi->cdi_idtb.size = 0x1000 + cdp->cpu_number;
    cdi->cdi_idtu.size = cdi->cdi_idtb.size;

    postcode(CPU_DESC_LOAD_GDT);
    lgdt((uintptr_t *) &cdi->cdi_gdtu);
    postcode(CPU_DESC_LOAD_IDT);
    lidt((uintptr_t *) &cdi->cdi_idtu);
    postcode(CPU_DESC_LOAD_LDT);
    lldt(KERNEL_LDT);
    postcode(CPU_DESC_LOAD_TSS);
    set_tr(KERNEL_TSS);

#if GPROF // Hack to enable mcount to work on K64
    __asm__ volatile("mov %0, %%gs" : : "rm" ((unsigned short)(KERNEL_DS)));
#endif
    postcode(CPU_DESC_LOAD_EXIT);
}

/*
 * Set MSRs for sysenter/sysexit and syscall/sysret for 64-bit.
 */
void
cpu_syscall_init(cpu_data_t *cdp)
{
#if MONOTONIC
    mt_cpu_up(cdp);
#else /* MONOTONIC */
#pragma unused(cdp)
#endif /* !MONOTONIC */
    wrmsr64(MSR_IA32_SYSENTER_CS, SYSENTER_CS);
    wrmsr64(MSR_IA32_SYSENTER_EIP, DBLMAP((uintptr_t) hi64_sysenter));
    wrmsr64(MSR_IA32_SYSENTER_ESP, current_cpu_datap()->cpu_desc_index.cdi_sstku);
    /* Enable syscall/sysret */
    wrmsr64(MSR_IA32_EFER, rdmsr64(MSR_IA32_EFER) | MSR_IA32_EFER_SCE);

    /*
     * MSRs for 64-bit syscall/sysret
     * Note USER_CS because sysret uses this + 16 when returning to
     * 64-bit code.
     */
    wrmsr64(MSR_IA32_LSTAR, DBLMAP((uintptr_t) hi64_syscall));
    wrmsr64(MSR_IA32_STAR, (((uint64_t)USER_CS) << 48) | (((uint64_t)KERNEL64_CS) << 32));
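    /*
     * MSR_IA32_STAR layout: bits 47:32 supply the syscall CS/SS pair
     * (CS = KERNEL64_CS, SS = CS + 8); bits 63:48 supply the sysret
     * base, from which the CPU derives SS = base + 8 and, for 64-bit
     * returns, CS = base + 16 (hence USER_CS rather than USER64_CS).
     */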
    /*
     * Emulate eflags cleared by sysenter but note that
     * we also clear the trace trap to avoid the complications
     * of single-stepping into a syscall. The nested task bit
     * is also cleared to avoid a spurious "task switch"
     * should we choose to return via an IRET.
     */
    wrmsr64(MSR_IA32_FMASK, EFL_DF|EFL_IF|EFL_TF|EFL_NT);
}

extern vm_offset_t dyn_dblmap(vm_offset_t, vm_offset_t);
uint64_t ldt_alias_offset;

cpu_data_t *
cpu_data_alloc(boolean_t is_boot_cpu)
{
    int ret;
    cpu_data_t *cdp;

    if (is_boot_cpu) {
        assert(real_ncpus == 1);
        cdp = cpu_datap(0);
        if (cdp->cpu_processor == NULL) {
            simple_lock_init(&ncpus_lock, 0);
            cdp->cpu_processor = cpu_processor_alloc(TRUE);
#if NCOPY_WINDOWS > 0
            cdp->cpu_pmap = pmap_cpu_alloc(TRUE);
#endif
        }
        return cdp;
    }

    boolean_t do_ldt_alloc = FALSE;
    simple_lock(&ncpus_lock);
    int cnum = real_ncpus;
    real_ncpus++;
    if (dyn_ldts == NULL) {
        do_ldt_alloc = TRUE;
    }
    simple_unlock(&ncpus_lock);

    /*
     * Allocate per-cpu data:
     */

    cdp = &scdatas[cnum];
    bzero((void*) cdp, sizeof(cpu_data_t));
    cdp->cpu_this = cdp;
    cdp->cpu_number = cnum;
    cdp->cd_shadow = &cpshadows[cnum];
    /*
     * Allocate interrupt stack:
     */
    ret = kmem_alloc(kernel_map,
        (vm_offset_t *) &cdp->cpu_int_stack_top,
        INTSTACK_SIZE, VM_KERN_MEMORY_CPU);
    if (ret != KERN_SUCCESS) {
        panic("cpu_data_alloc() int stack failed, ret=%d\n", ret);
    }
    bzero((void*) cdp->cpu_int_stack_top, INTSTACK_SIZE);
    cdp->cpu_int_stack_top += INTSTACK_SIZE;

    /*
     * Allocate descriptor table:
     */

    cdp->cpu_desc_tablep = (struct cpu_desc_table *) &scdtables[cnum];
    /*
     * Allocate LDT
     */
    if (do_ldt_alloc) {
        boolean_t do_ldt_free = FALSE;
        vm_offset_t sldtoffset = 0;
        /*
         * Allocate LDT
         */
        vm_offset_t ldtalloc = 0, ldtallocsz = round_page_64(MAX_CPUS * sizeof(struct real_descriptor) * LDTSZ);
        ret = kmem_alloc(kernel_map, (vm_offset_t *) &ldtalloc, ldtallocsz, VM_KERN_MEMORY_CPU);
        if (ret != KERN_SUCCESS) {
            panic("cpu_data_alloc() ldt failed, kmem_alloc=%d\n", ret);
        }

        simple_lock(&ncpus_lock);
        if (dyn_ldts == NULL) {
            dyn_ldts = (cldt_t *)ldtalloc;
        } else {
            do_ldt_free = TRUE;
        }
        simple_unlock(&ncpus_lock);

        if (do_ldt_free) {
            kmem_free(kernel_map, ldtalloc, ldtallocsz);
        } else {
            /* CPU registration and startup are expected to execute
             * serially, as invoked by the platform driver.
             * Create trampoline alias of LDT region.
             */
            sldtoffset = dyn_dblmap(ldtalloc, ldtallocsz);
            ldt_alias_offset = sldtoffset;
        }
    }
    cdp->cpu_ldtp = &dyn_ldts[cnum].pcldts[0];

#if CONFIG_MCA
    /* Machine-check shadow register allocation. */
    mca_cpu_alloc(cdp);
#endif

    /*
     * Before this cpu has been assigned a real thread context,
     * we give it a fake, unique, non-zero thread id which the locking
     * primitives use as their lock value.
     * Note that this does not apply to the boot processor, cpu 0, which
     * transitions to a thread context well before other processors are
     * started.
     */
    cdp->cpu_active_thread = (thread_t) (uintptr_t) cdp->cpu_number;
    cdp->cpu_NMI_acknowledged = TRUE;
    cdp->cpu_nanotime = &pal_rtc_nanotime_info;

    kprintf("cpu_data_alloc(%d) %p desc_table: %p "
        "ldt: %p "
        "int_stack: 0x%lx-0x%lx\n",
        cdp->cpu_number, cdp, cdp->cpu_desc_tablep, cdp->cpu_ldtp,
        (long)(cdp->cpu_int_stack_top - INTSTACK_SIZE), (long)(cdp->cpu_int_stack_top));
    cpu_data_ptr[cnum] = cdp;

    return cdp;
}
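
/*
 * A segment selector is 16 bits: a 13-bit table index, a table-indicator
 * bit (0 = GDT, 1 = LDT), and a 2-bit requested privilege level (RPL).
 * selector_to_sel() decomposes a raw selector into the sel_t fields
 * tested by the validation routines below.
 */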
boolean_t
valid_user_data_selector(uint16_t selector)
{
    sel_t sel = selector_to_sel(selector);

    if (selector == 0)
        return (TRUE);

    if (sel.ti == SEL_LDT)
        return (TRUE);
    else if (sel.index < GDTSZ) {
        if ((gdt_desc_p(selector)->access & ACC_PL_U) == ACC_PL_U)
            return (TRUE);
    }
    return (FALSE);
}

boolean_t
valid_user_code_selector(uint16_t selector)
{
    sel_t sel = selector_to_sel(selector);

    if (selector == 0)
        return (FALSE);

    if (sel.ti == SEL_LDT) {
        if (sel.rpl == USER_PRIV)
            return (TRUE);
    }
    else if (sel.index < GDTSZ && sel.rpl == USER_PRIV) {
        if ((gdt_desc_p(selector)->access & ACC_PL_U) == ACC_PL_U)
            return (TRUE);
        /* Explicitly validate the system code selectors
         * even if not instantaneously privileged,
         * since they are dynamically re-privileged
         * at context switch
         */
        if ((selector == USER_CS) || (selector == USER64_CS))
            return (TRUE);
    }

    return (FALSE);
}

boolean_t
valid_user_stack_selector(uint16_t selector)
{
    sel_t sel = selector_to_sel(selector);

    if (selector == 0)
        return (FALSE);

    if (sel.ti == SEL_LDT) {
        if (sel.rpl == USER_PRIV)
            return (TRUE);
    }
    else if (sel.index < GDTSZ && sel.rpl == USER_PRIV) {
        if ((gdt_desc_p(selector)->access & ACC_PL_U) == ACC_PL_U)
            return (TRUE);
    }

    return (FALSE);
}

boolean_t
valid_user_segment_selectors(uint16_t cs,
                             uint16_t ss,
                             uint16_t ds,
                             uint16_t es,
                             uint16_t fs,
                             uint16_t gs)
{
    return valid_user_code_selector(cs) &&
           valid_user_stack_selector(ss) &&
           valid_user_data_selector(ds) &&
           valid_user_data_selector(es) &&
           valid_user_data_selector(fs) &&
           valid_user_data_selector(gs);
}

#if NCOPY_WINDOWS > 0

static vm_offset_t user_window_base = 0;

void
cpu_userwindow_init(int cpu)
{
    cpu_data_t *cdp = cpu_data_ptr[cpu];
    vm_offset_t user_window;
    vm_offset_t vaddr;
    int num_cpus;

    num_cpus = ml_get_max_cpus();

    if (cpu >= num_cpus)
        panic("cpu_userwindow_init: cpu > num_cpus");

    if (user_window_base == 0) {

        if (vm_allocate(kernel_map, &vaddr,
                        (NBPDE * NCOPY_WINDOWS * num_cpus) + NBPDE,
                        VM_FLAGS_ANYWHERE | VM_MAKE_TAG(VM_KERN_MEMORY_CPU)) != KERN_SUCCESS)
            panic("cpu_userwindow_init: "
                  "couldn't allocate user map window");

        /*
         * window must start on a page table boundary
         * in the virtual address space
         */
        user_window_base = (vaddr + (NBPDE - 1)) & ~(NBPDE - 1);

        /*
         * get rid of any allocation leading up to our
         * starting boundary
         */
        vm_deallocate(kernel_map, vaddr, user_window_base - vaddr);

        /*
         * get rid of tail that we don't need
         */
        user_window = user_window_base +
            (NBPDE * NCOPY_WINDOWS * num_cpus);

        vm_deallocate(kernel_map, user_window,
                      (vaddr +
                       ((NBPDE * NCOPY_WINDOWS * num_cpus) + NBPDE)) -
                      user_window);
    }

    user_window = user_window_base + (cpu * NCOPY_WINDOWS * NBPDE);

    cdp->cpu_copywindow_base = user_window;
    /*
     * Abuse this pdp entry, the pdp now actually points to
     * an array of copy windows addresses.
     */
    cdp->cpu_copywindow_pdp = pmap_pde(kernel_pmap, user_window);
}

void
cpu_physwindow_init(int cpu)
{
    cpu_data_t *cdp = cpu_data_ptr[cpu];
    vm_offset_t phys_window = cdp->cpu_physwindow_base;

    if (phys_window == 0) {
        if (vm_allocate(kernel_map, &phys_window,
                        PAGE_SIZE, VM_FLAGS_ANYWHERE | VM_MAKE_TAG(VM_KERN_MEMORY_CPU))
            != KERN_SUCCESS)
            panic("cpu_physwindow_init: "
                  "couldn't allocate phys map window");

        /*
         * make sure the page that encompasses the
         * pte pointer we're interested in actually
         * exists in the page table
         */
        pmap_expand(kernel_pmap, phys_window, PMAP_EXPAND_OPTIONS_NONE);

        cdp->cpu_physwindow_base = phys_window;
        cdp->cpu_physwindow_ptep = vtopte(phys_window);
    }
}
#endif /* NCOPY_WINDOWS > 0 */

/*
 * Allocate a new interrupt stack for the boot processor from the
 * heap rather than continuing to use the statically allocated space.
 * Also switch to a dynamically allocated cpu data area.
 */
void
cpu_data_realloc(void)
{
    int ret;
    vm_offset_t istk;
    cpu_data_t *cdp;
    boolean_t istate;

    ret = kmem_alloc(kernel_map, &istk, INTSTACK_SIZE, VM_KERN_MEMORY_CPU);
    if (ret != KERN_SUCCESS) {
        panic("cpu_data_realloc() stack alloc, ret=%d\n", ret);
    }
    bzero((void*) istk, INTSTACK_SIZE);
    istk += INTSTACK_SIZE;

    cdp = &scdatas[0];

    /* Copy old contents into new area and make fix-ups */
    assert(cpu_number() == 0);
    bcopy((void *) cpu_data_ptr[0], (void*) cdp, sizeof(cpu_data_t));
    cdp->cpu_this = cdp;
    cdp->cpu_int_stack_top = istk;
    timer_call_queue_init(&cdp->rtclock_timer.queue);
    cdp->cpu_desc_tablep = (struct cpu_desc_table *) &scdtables[0];
    cpu_desc_table64_t *cdt = (cpu_desc_table64_t *) cdp->cpu_desc_tablep;

    uint8_t *cfstk = &scfstks[cdp->cpu_number].fstk[0];
    cdt->fstkp = cfstk;
    cfstk += FSTK_SZ;

    /*
     * With interrupts disabled, commit the new areas.
     */
    istate = ml_set_interrupts_enabled(FALSE);
    cpu_data_ptr[0] = cdp;
    master_ktss64.ist2 = DBLMAP((uintptr_t) cfstk);
    master_ktss64.ist1 = DBLMAP((uintptr_t) cfstk - sizeof(x86_64_intr_stack_frame_t));
    wrmsr64(MSR_IA32_GS_BASE, (uintptr_t) cdp);
    wrmsr64(MSR_IA32_KERNEL_GS_BASE, (uintptr_t) cdp);
    (void) ml_set_interrupts_enabled(istate);

    kprintf("Reallocated master cpu data: %p,"
            " interrupt stack: %p, fault stack: %p\n",
            (void *) cdp, (void *) istk, (void *) cfstk);
}