1 | /* |
2 | * Copyright (c) 2000-2012 Apple Inc. All rights reserved. |
3 | * |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
5 | * |
6 | * This file contains Original Code and/or Modifications of Original Code |
7 | * as defined in and that are subject to the Apple Public Source License |
8 | * Version 2.0 (the 'License'). You may not use this file except in |
9 | * compliance with the License. The rights granted to you under the License |
10 | * may not be used to create, or enable the creation or redistribution of, |
11 | * unlawful or unlicensed copies of an Apple operating system, or to |
12 | * circumvent, violate, or enable the circumvention or violation of, any |
13 | * terms of an Apple operating system software license agreement. |
14 | * |
15 | * Please obtain a copy of the License at |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. |
17 | * |
18 | * The Original Code and all software distributed under the License are |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
23 | * Please see the License for the specific language governing rights and |
24 | * limitations under the License. |
25 | * |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
27 | */ |
28 | /* |
29 | * @OSF_COPYRIGHT@ |
30 | */ |
31 | /* |
32 | * Mach Operating System |
33 | * Copyright (c) 1991,1990 Carnegie Mellon University |
34 | * All Rights Reserved. |
35 | * |
36 | * Permission to use, copy, modify and distribute this software and its |
37 | * documentation is hereby granted, provided that both the copyright |
38 | * notice and this permission notice appear in all copies of the |
39 | * software, derivative works or modified versions, and any portions |
40 | * thereof, and that both notices appear in supporting documentation. |
41 | * |
42 | * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" |
43 | * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR |
44 | * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. |
45 | * |
46 | * Carnegie Mellon requests users of this software to return to |
47 | * |
48 | * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU |
49 | * School of Computer Science |
50 | * Carnegie Mellon University |
51 | * Pittsburgh PA 15213-3890 |
52 | * |
53 | * any improvements or extensions that they make and grant Carnegie Mellon |
54 | * the rights to redistribute these changes. |
55 | */ |
56 | |
57 | /* |
58 | */ |
59 | |
60 | #include <kern/cpu_number.h> |
61 | #include <kern/kalloc.h> |
62 | #include <kern/cpu_data.h> |
63 | #include <mach/mach_types.h> |
64 | #include <mach/machine.h> |
65 | #include <mach/vm_map.h> |
66 | #include <mach/machine/vm_param.h> |
67 | #include <vm/vm_kern.h> |
68 | #include <vm/vm_map.h> |
69 | |
70 | #include <i386/bit_routines.h> |
71 | #include <i386/mp_desc.h> |
72 | #include <i386/misc_protos.h> |
73 | #include <i386/mp.h> |
74 | #include <i386/pmap.h> |
75 | #include <i386/postcode.h> |
76 | #include <i386/pmap_internal.h> |
77 | #if CONFIG_MCA |
78 | #include <i386/machine_check.h> |
79 | #endif |
80 | |
81 | #include <kern/misc_protos.h> |
82 | |
83 | #if MONOTONIC |
84 | #include <kern/monotonic.h> |
85 | #endif /* MONOTONIC */ |
86 | #include <san/kasan.h> |
87 | |
88 | #define K_INTR_GATE (ACC_P|ACC_PL_K|ACC_INTR_GATE) |
89 | #define U_INTR_GATE (ACC_P|ACC_PL_U|ACC_INTR_GATE) |
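
/*
 * K_INTR_GATE is a present, DPL-0 interrupt gate, reachable only via
 * hardware interrupts/exceptions or from kernel mode; U_INTR_GATE is a
 * present, DPL-3 interrupt gate that may also be raised from user mode
 * with an explicit "int n".
 */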
90 | |
91 | // Declare macros that will declare the externs |
92 | #define TRAP(n, name) extern void *name ; |
93 | #define TRAP_ERR(n, name) extern void *name ; |
94 | #define TRAP_SPC(n, name) extern void *name ; |
95 | #define TRAP_IST1(n, name) extern void *name ; |
96 | #define TRAP_IST2(n, name) extern void *name ; |
97 | #define INTERRUPT(n) extern void *_intr_ ## n ; |
98 | #define USER_TRAP(n, name) extern void *name ; |
99 | #define USER_TRAP_SPC(n, name) extern void *name ; |
100 | |
101 | // Include the table to declare the externs |
102 | #include "../x86_64/idt_table.h" |
103 | |
104 | // Undef the macros, then redefine them so we can declare the table |
105 | #undef TRAP |
106 | #undef TRAP_ERR |
107 | #undef TRAP_SPC |
108 | #undef TRAP_IST1 |
109 | #undef TRAP_IST2 |
110 | #undef INTERRUPT |
111 | #undef USER_TRAP |
112 | #undef USER_TRAP_SPC |
113 | |
114 | #define TRAP(n, name) \ |
115 | [n] = { \ |
116 | (uintptr_t)&name, \ |
117 | KERNEL64_CS, \ |
118 | 0, \ |
119 | K_INTR_GATE, \ |
120 | 0 \ |
121 | }, |
122 | |
123 | #define TRAP_ERR TRAP |
124 | #define TRAP_SPC TRAP |
125 | |
126 | #define TRAP_IST1(n, name) \ |
127 | [n] = { \ |
128 | (uintptr_t)&name, \ |
129 | KERNEL64_CS, \ |
130 | 1, \ |
131 | K_INTR_GATE, \ |
132 | 0 \ |
133 | }, |
134 | |
135 | #define TRAP_IST2(n, name) \ |
136 | [n] = { \ |
137 | (uintptr_t)&name, \ |
138 | KERNEL64_CS, \ |
139 | 2, \ |
140 | K_INTR_GATE, \ |
141 | 0 \ |
142 | }, |
143 | |
144 | #define INTERRUPT(n) \ |
145 | [n] = { \ |
146 | (uintptr_t)&_intr_ ## n,\ |
147 | KERNEL64_CS, \ |
148 | 0, \ |
149 | K_INTR_GATE, \ |
150 | 0 \ |
151 | }, |
152 | |
153 | #define USER_TRAP(n, name) \ |
154 | [n] = { \ |
155 | (uintptr_t)&name, \ |
156 | KERNEL64_CS, \ |
157 | 0, \ |
158 | U_INTR_GATE, \ |
159 | 0 \ |
160 | }, |
161 | |
162 | #define USER_TRAP_SPC USER_TRAP |
163 | |
164 | // Declare the table using the macros we just set up |
165 | struct fake_descriptor64 master_idt64[IDTSZ] |
	__attribute__ ((section("__HIB,__desc")))
167 | __attribute__ ((aligned(PAGE_SIZE))) = { |
168 | #include "../x86_64/idt_table.h" |
169 | }; |
170 | |
171 | /* |
 * First cpu's interrupt stack.
173 | */ |
174 | extern uint32_t low_intstack[]; /* bottom */ |
175 | extern uint32_t low_eintstack[]; /* top */ |
176 | |
177 | /* |
178 | * Per-cpu data area pointers. |
179 | */ |
cpu_data_t cpshadows[MAX_CPUS] __attribute__((aligned(64))) __attribute__((section("__HIB, __desc")));
181 | cpu_data_t scdatas[MAX_CPUS] __attribute__((aligned(64))) = { |
182 | [0].cpu_this = &scdatas[0], |
183 | [0].cpu_nanotime = &pal_rtc_nanotime_info, |
184 | [0].cpu_int_stack_top = (vm_offset_t) low_eintstack, |
185 | [0].cd_shadow = &cpshadows[0] |
186 | }; |
187 | cpu_data_t *cpu_data_master = &scdatas[0]; |
188 | |
189 | cpu_data_t *cpu_data_ptr[MAX_CPUS] = { [0] = &scdatas[0] }; |
190 | |
191 | decl_simple_lock_data(,ncpus_lock); /* protects real_ncpus */ |
192 | unsigned int real_ncpus = 1; |
193 | unsigned int max_ncpus = MAX_CPUS; |
194 | |
195 | extern void hi64_sysenter(void); |
196 | extern void hi64_syscall(void); |
197 | |
198 | typedef struct { |
199 | struct real_descriptor pcldts[LDTSZ]; |
200 | } cldt_t; |
201 | |
cpu_desc_table64_t scdtables[MAX_CPUS] __attribute__((aligned(64))) __attribute__((section("__HIB, __desc")));
cpu_fault_stack_t scfstks[MAX_CPUS] __attribute__((aligned(64))) __attribute__((section("__HIB, __desc")));
204 | |
205 | cldt_t *dyn_ldts; |
206 | |
207 | /* |
208 | * Multiprocessor i386/i486 systems use a separate copy of the |
209 | * GDT, IDT, LDT, and kernel TSS per processor. The first three |
210 | * are separate to avoid lock contention: the i386 uses locked |
211 | * memory cycles to access the descriptor tables. The TSS is |
212 | * separate since each processor needs its own kernel stack, |
213 | * and since using a TSS marks it busy. |
214 | */ |
215 | |
216 | /* |
217 | * Allocate and initialize the per-processor descriptor tables. |
218 | */ |
219 | |
220 | /* |
221 | * This is the expanded, 64-bit variant of the kernel LDT descriptor. |
222 | * When switching to 64-bit mode this replaces KERNEL_LDT entry |
223 | * and the following empty slot. This enables the LDT to be referenced |
224 | * in the uber-space remapping window on the kernel. |
225 | */ |
226 | struct fake_descriptor64 kernel_ldt_desc64 = { |
227 | 0, |
228 | LDTSZ_MIN*sizeof(struct fake_descriptor)-1, |
229 | 0, |
230 | ACC_P|ACC_PL_K|ACC_LDT, |
231 | 0 |
232 | }; |
233 | |
234 | /* |
235 | * This is the expanded, 64-bit variant of the kernel TSS descriptor. |
 * It follows the pattern of the KERNEL_LDT.
237 | */ |
238 | struct fake_descriptor64 kernel_tss_desc64 = { |
239 | 0, |
240 | sizeof(struct x86_64_tss)-1, |
241 | 0, |
242 | ACC_P|ACC_PL_K|ACC_TSS, |
243 | 0 |
244 | }; |
245 | |
246 | /* |
247 | * Convert a descriptor from fake to real format. |
248 | * |
249 | * Fake descriptor format: |
250 | * bytes 0..3 base 31..0 |
251 | * bytes 4..5 limit 15..0 |
252 | * byte 6 access byte 2 | limit 19..16 |
253 | * byte 7 access byte 1 |
254 | * |
255 | * Real descriptor format: |
256 | * bytes 0..1 limit 15..0 |
257 | * bytes 2..3 base 15..0 |
258 | * byte 4 base 23..16 |
259 | * byte 5 access byte 1 |
260 | * byte 6 access byte 2 | limit 19..16 |
261 | * byte 7 base 31..24 |
262 | * |
263 | * Fake gate format: |
264 | * bytes 0..3 offset |
265 | * bytes 4..5 selector |
266 | * byte 6 word count << 4 (to match fake descriptor) |
267 | * byte 7 access byte 1 |
268 | * |
269 | * Real gate format: |
270 | * bytes 0..1 offset 15..0 |
271 | * bytes 2..3 selector |
272 | * byte 4 word count |
273 | * byte 5 access byte 1 |
274 | * bytes 6..7 offset 31..16 |
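 *
 * Illustrative example (values chosen arbitrarily): a fake code/data
 * descriptor with base 0x00ABCDEF, limit 0xFFFF and access bytes
 * 0x9A/0xCF is stored as the bytes EF CD AB 00 FF FF CF 9A and is
 * rewritten in place by fix_desc() to FF FF EF CD AB 9A CF 00.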
275 | */ |
276 | void |
277 | fix_desc(void *d, int num_desc) { |
278 | //early_kprintf("fix_desc(%x, %x)\n", d, num_desc); |
279 | uint8_t *desc = (uint8_t*) d; |
280 | |
281 | do { |
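		/*
		 * S bit (0x10) clear together with type bit 2 (0x04) set
		 * identifies a gate; anything else is treated as a
		 * code/data/LDT/TSS descriptor.
		 */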
282 | if ((desc[7] & 0x14) == 0x04) { /* gate */ |
283 | uint32_t offset; |
284 | uint16_t selector; |
285 | uint8_t wordcount; |
286 | uint8_t acc; |
287 | |
288 | offset = *((uint32_t*)(desc)); |
			selector = *((uint16_t*)(desc+4));
290 | wordcount = desc[6] >> 4; |
291 | acc = desc[7]; |
292 | |
293 | *((uint16_t*)desc) = offset & 0xFFFF; |
294 | *((uint16_t*)(desc+2)) = selector; |
295 | desc[4] = wordcount; |
296 | desc[5] = acc; |
297 | *((uint16_t*)(desc+6)) = offset >> 16; |
298 | |
299 | } else { /* descriptor */ |
300 | uint32_t base; |
301 | uint16_t limit; |
302 | uint8_t acc1, acc2; |
303 | |
304 | base = *((uint32_t*)(desc)); |
305 | limit = *((uint16_t*)(desc+4)); |
306 | acc2 = desc[6]; |
307 | acc1 = desc[7]; |
308 | |
309 | *((uint16_t*)(desc)) = limit; |
310 | *((uint16_t*)(desc+2)) = base & 0xFFFF; |
311 | desc[4] = (base >> 16) & 0xFF; |
312 | desc[5] = acc1; |
313 | desc[6] = acc2; |
314 | desc[7] = base >> 24; |
315 | } |
316 | desc += 8; |
317 | } while (--num_desc); |
318 | } |
319 | |
320 | void |
321 | fix_desc64(void *descp, int count) |
322 | { |
323 | struct fake_descriptor64 *fakep; |
324 | union { |
325 | struct real_gate64 gate; |
326 | struct real_descriptor64 desc; |
327 | } real; |
328 | int i; |
329 | |
330 | fakep = (struct fake_descriptor64 *) descp; |
331 | |
332 | for (i = 0; i < count; i++, fakep++) { |
333 | /* |
		 * Construct the real descriptor locally.
335 | */ |
336 | |
337 | bzero((void *) &real, sizeof(real)); |
338 | |
339 | switch (fakep->access & ACC_TYPE) { |
340 | case 0: |
341 | break; |
342 | case ACC_CALL_GATE: |
343 | case ACC_INTR_GATE: |
344 | case ACC_TRAP_GATE: |
345 | real.gate.offset_low16 = (uint16_t)(fakep->offset64 & 0xFFFF); |
346 | real.gate.selector16 = fakep->lim_or_seg & 0xFFFF; |
347 | real.gate.IST = fakep->size_or_IST & 0x7; |
348 | real.gate.access8 = fakep->access; |
349 | real.gate.offset_high16 = (uint16_t)((fakep->offset64>>16) & 0xFFFF); |
350 | real.gate.offset_top32 = (uint32_t)(fakep->offset64>>32); |
351 | break; |
352 | default: /* Otherwise */ |
353 | real.desc.limit_low16 = fakep->lim_or_seg & 0xFFFF; |
354 | real.desc.base_low16 = (uint16_t)(fakep->offset64 & 0xFFFF); |
355 | real.desc.base_med8 = (uint8_t)((fakep->offset64 >> 16) & 0xFF); |
356 | real.desc.access8 = fakep->access; |
357 | real.desc.limit_high4 = (fakep->lim_or_seg >> 16) & 0xFF; |
358 | real.desc.granularity4 = fakep->size_or_IST; |
359 | real.desc.base_high8 = (uint8_t)((fakep->offset64 >> 24) & 0xFF); |
360 | real.desc.base_top32 = (uint32_t)(fakep->offset64>>32); |
361 | } |
362 | |
363 | /* |
364 | * Now copy back over the fake structure. |
365 | */ |
366 | bcopy((void *) &real, (void *) fakep, sizeof(real)); |
367 | } |
368 | } |
369 | |
370 | extern unsigned mldtsz; |
371 | void |
372 | cpu_desc_init(cpu_data_t *cdp) |
373 | { |
374 | cpu_desc_index_t *cdi = &cdp->cpu_desc_index; |
375 | |
376 | if (cdp == cpu_data_master) { |
377 | /* |
		 * Populate both the double-mapped ('u') and kernel-base ('b')
		 * pointers for the KTSS, sysenter stack, GDT, IDT and LDT in
		 * this cpu's descriptor index.
380 | */ |
381 | cdi->cdi_ktssu = (void *)DBLMAP(&master_ktss64); |
382 | cdi->cdi_ktssb = (void *)&master_ktss64; |
383 | cdi->cdi_sstku = (vm_offset_t) DBLMAP(&master_sstk.top); |
384 | cdi->cdi_sstkb = (vm_offset_t) &master_sstk.top; |
385 | |
386 | cdi->cdi_gdtu.ptr = (void *)DBLMAP((uintptr_t) &master_gdt); |
387 | cdi->cdi_gdtb.ptr = (void *)&master_gdt; |
388 | cdi->cdi_idtu.ptr = (void *)DBLMAP((uintptr_t) &master_idt64); |
389 | cdi->cdi_idtb.ptr = (void *)((uintptr_t) &master_idt64); |
390 | cdi->cdi_ldtu = (struct fake_descriptor *) (void *) DBLMAP((uintptr_t)&master_ldt[0]); |
391 | cdi->cdi_ldtb = (struct fake_descriptor *) (void *) &master_ldt[0]; |
392 | |
		/* Install the expanded 64-bit LDT and TSS descriptors into the GDT slots */
394 | kernel_ldt_desc64.offset64 = (uintptr_t) cdi->cdi_ldtu; |
395 | *(struct fake_descriptor64 *) &master_gdt[sel_idx(KERNEL_LDT)] = |
396 | kernel_ldt_desc64; |
397 | *(struct fake_descriptor64 *) &master_gdt[sel_idx(USER_LDT)] = |
398 | kernel_ldt_desc64; |
399 | kernel_tss_desc64.offset64 = (uintptr_t) DBLMAP(&master_ktss64); |
400 | *(struct fake_descriptor64 *) &master_gdt[sel_idx(KERNEL_TSS)] = |
401 | kernel_tss_desc64; |
402 | |
403 | /* Fix up the expanded descriptors for 64-bit. */ |
404 | fix_desc64((void *) &master_idt64, IDTSZ); |
405 | fix_desc64((void *) &master_gdt[sel_idx(KERNEL_LDT)], 1); |
406 | fix_desc64((void *) &master_gdt[sel_idx(USER_LDT)], 1); |
407 | fix_desc64((void *) &master_gdt[sel_idx(KERNEL_TSS)], 1); |
408 | |
409 | /* |
410 | * Set the NMI/fault stacks as IST2/IST1 in the 64-bit TSS |
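		 * An IST entry gives the CPU a known-good stack that it
		 * switches to unconditionally when a vector declared with
		 * TRAP_IST1/TRAP_IST2 is taken, even if the interrupted
		 * context's stack pointer is unusable.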
411 | */ |
412 | master_ktss64.ist2 = (uintptr_t) low_eintstack; |
413 | master_ktss64.ist1 = (uintptr_t) low_eintstack - sizeof(x86_64_intr_stack_frame_t); |
414 | } else if (cdi->cdi_ktssu == NULL) { /* Skipping re-init on wake */ |
415 | cpu_desc_table64_t *cdt = (cpu_desc_table64_t *) cdp->cpu_desc_tablep; |
416 | |
417 | cdi->cdi_idtu.ptr = (void *)DBLMAP((uintptr_t) &master_idt64); |
418 | |
419 | cdi->cdi_ktssu = (void *)DBLMAP(&cdt->ktss); |
420 | cdi->cdi_ktssb = (void *)(&cdt->ktss); |
421 | cdi->cdi_sstku = (vm_offset_t)DBLMAP(&cdt->sstk.top); |
422 | cdi->cdi_sstkb = (vm_offset_t)(&cdt->sstk.top); |
423 | cdi->cdi_ldtu = (void *)LDTALIAS(cdp->cpu_ldtp); |
424 | cdi->cdi_ldtb = (void *)(cdp->cpu_ldtp); |
425 | |
426 | /* |
427 | * Copy the tables |
428 | */ |
429 | bcopy((char *)master_gdt, (char *)cdt->gdt, sizeof(master_gdt)); |
430 | bcopy((char *)master_ldt, (char *)cdp->cpu_ldtp, mldtsz); |
431 | bcopy((char *)&master_ktss64, (char *)&cdt->ktss, sizeof(struct x86_64_tss)); |
432 | cdi->cdi_gdtu.ptr = (void *)DBLMAP(cdt->gdt); |
433 | cdi->cdi_gdtb.ptr = (void *)(cdt->gdt); |
434 | /* |
435 | * Fix up the entries in the GDT to point to |
436 | * this LDT and this TSS. |
		 * Note reuse of the global 'kernel_ldt_desc64', which is not
		 * concurrency-safe. Higher-level synchronization is expected.
439 | */ |
440 | kernel_ldt_desc64.offset64 = (uintptr_t) cdi->cdi_ldtu; |
441 | *(struct fake_descriptor64 *) &cdt->gdt[sel_idx(KERNEL_LDT)] = |
442 | kernel_ldt_desc64; |
443 | fix_desc64(&cdt->gdt[sel_idx(KERNEL_LDT)], 1); |
444 | |
445 | kernel_ldt_desc64.offset64 = (uintptr_t) cdi->cdi_ldtu; |
446 | *(struct fake_descriptor64 *) &cdt->gdt[sel_idx(USER_LDT)] = |
447 | kernel_ldt_desc64; |
448 | fix_desc64(&cdt->gdt[sel_idx(USER_LDT)], 1); |
449 | |
450 | kernel_tss_desc64.offset64 = (uintptr_t) cdi->cdi_ktssu; |
451 | *(struct fake_descriptor64 *) &cdt->gdt[sel_idx(KERNEL_TSS)] = |
452 | kernel_tss_desc64; |
453 | fix_desc64(&cdt->gdt[sel_idx(KERNEL_TSS)], 1); |
454 | |
455 | /* Set (zeroed) fault stack as IST1, NMI intr stack IST2 */ |
456 | uint8_t *cfstk = &scfstks[cdp->cpu_number].fstk[0]; |
457 | cdt->fstkp = cfstk; |
458 | bzero((void *) cfstk, FSTK_SZ); |
459 | cdt->ktss.ist2 = DBLMAP((uint64_t)cdt->fstkp + FSTK_SZ); |
460 | cdt->ktss.ist1 = cdt->ktss.ist2 - sizeof(x86_64_intr_stack_frame_t); |
461 | } |
462 | |
463 | /* Require that the top of the sysenter stack is 16-byte aligned */ |
464 | if ((cdi->cdi_sstku % 16) != 0) |
465 | panic("cpu_desc_init() sysenter stack not 16-byte aligned" ); |
466 | } |
467 | void |
468 | cpu_desc_load(cpu_data_t *cdp) |
469 | { |
470 | cpu_desc_index_t *cdi = &cdp->cpu_desc_index; |
471 | |
472 | postcode(CPU_DESC_LOAD_ENTRY); |
473 | |
474 | /* Stuff the kernel per-cpu data area address into the MSRs */ |
475 | postcode(CPU_DESC_LOAD_GS_BASE); |
476 | wrmsr64(MSR_IA32_GS_BASE, (uintptr_t) cdp); |
477 | postcode(CPU_DESC_LOAD_KERNEL_GS_BASE); |
478 | wrmsr64(MSR_IA32_KERNEL_GS_BASE, (uintptr_t) cdp); |
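	/*
	 * From this point %gs-relative per-cpu accesses resolve to this
	 * cpu_data_t; both the active and shadow (KERNEL_GS_BASE) values
	 * are written so the mapping holds on either side of a swapgs.
	 */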
479 | |
480 | /* |
481 | * Ensure the TSS segment's busy bit is clear. This is required |
482 | * for the case of reloading descriptors at wake to avoid |
483 | * their complete re-initialization. |
484 | */ |
485 | gdt_desc_p(KERNEL_TSS)->access &= ~ACC_TSS_BUSY; |
486 | |
487 | /* Load the GDT, LDT, IDT and TSS */ |
488 | cdi->cdi_gdtb.size = sizeof(struct real_descriptor)*GDTSZ - 1; |
489 | cdi->cdi_gdtu.size = cdi->cdi_gdtb.size; |
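	/*
	 * The IDT limit is intentionally larger than the architectural
	 * requirement (16*IDTSZ - 1), with the cpu number encoded in the
	 * low bits, apparently so the owning cpu can be identified from
	 * the IDTR limit (sidt).
	 */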
490 | cdi->cdi_idtb.size = 0x1000 + cdp->cpu_number; |
491 | cdi->cdi_idtu.size = cdi->cdi_idtb.size; |
492 | |
493 | postcode(CPU_DESC_LOAD_GDT); |
494 | lgdt((uintptr_t *) &cdi->cdi_gdtu); |
495 | postcode(CPU_DESC_LOAD_IDT); |
496 | lidt((uintptr_t *) &cdi->cdi_idtu); |
497 | postcode(CPU_DESC_LOAD_LDT); |
498 | lldt(KERNEL_LDT); |
499 | postcode(CPU_DESC_LOAD_TSS); |
500 | set_tr(KERNEL_TSS); |
501 | |
502 | #if GPROF // Hack to enable mcount to work on K64 |
503 | __asm__ volatile("mov %0, %%gs" : : "rm" ((unsigned short)(KERNEL_DS))); |
504 | #endif |
505 | postcode(CPU_DESC_LOAD_EXIT); |
506 | } |
507 | |
508 | /* |
509 | * Set MSRs for sysenter/sysexit and syscall/sysret for 64-bit. |
510 | */ |
511 | void |
512 | cpu_syscall_init(cpu_data_t *cdp) |
513 | { |
514 | #if MONOTONIC |
515 | mt_cpu_up(cdp); |
516 | #else /* MONOTONIC */ |
517 | #pragma unused(cdp) |
518 | #endif /* !MONOTONIC */ |
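	/*
	 * sysenter loads CS from MSR_IA32_SYSENTER_CS (with SS = CS + 8),
	 * RIP from SYSENTER_EIP and RSP from SYSENTER_ESP, so entry lands
	 * in the double-mapped handler with the per-cpu sysenter stack
	 * already established.
	 */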
519 | wrmsr64(MSR_IA32_SYSENTER_CS, SYSENTER_CS); |
520 | wrmsr64(MSR_IA32_SYSENTER_EIP, DBLMAP((uintptr_t) hi64_sysenter)); |
521 | wrmsr64(MSR_IA32_SYSENTER_ESP, current_cpu_datap()->cpu_desc_index.cdi_sstku); |
522 | /* Enable syscall/sysret */ |
523 | wrmsr64(MSR_IA32_EFER, rdmsr64(MSR_IA32_EFER) | MSR_IA32_EFER_SCE); |
524 | |
525 | /* |
526 | * MSRs for 64-bit syscall/sysret |
527 | * Note USER_CS because sysret uses this + 16 when returning to |
528 | * 64-bit code. |
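	 * (syscall itself loads CS from STAR[47:32] -- KERNEL64_CS here --
	 * with SS = CS + 8; sysret to 64-bit code takes CS from
	 * STAR[63:48] + 16 and SS from STAR[63:48] + 8.)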
529 | */ |
530 | wrmsr64(MSR_IA32_LSTAR, DBLMAP((uintptr_t) hi64_syscall)); |
531 | wrmsr64(MSR_IA32_STAR, (((uint64_t)USER_CS) << 48) | (((uint64_t)KERNEL64_CS) << 32)); |
532 | /* |
	 * Emulate the eflags clearing done by sysenter, but note that we
	 * also clear the trace flag to avoid the complications of
	 * single-stepping into a syscall. The nested-task bit is also
	 * cleared to avoid a spurious "task switch" should we choose to
	 * return via an IRET.
538 | */ |
539 | wrmsr64(MSR_IA32_FMASK, EFL_DF|EFL_IF|EFL_TF|EFL_NT); |
540 | |
541 | } |
542 | extern vm_offset_t dyn_dblmap(vm_offset_t, vm_offset_t); |
543 | uint64_t ldt_alias_offset; |
544 | |
545 | cpu_data_t * |
546 | cpu_data_alloc(boolean_t is_boot_cpu) |
547 | { |
548 | int ret; |
549 | cpu_data_t *cdp; |
550 | |
551 | if (is_boot_cpu) { |
552 | assert(real_ncpus == 1); |
553 | cdp = cpu_datap(0); |
554 | if (cdp->cpu_processor == NULL) { |
555 | simple_lock_init(&ncpus_lock, 0); |
556 | cdp->cpu_processor = cpu_processor_alloc(TRUE); |
557 | #if NCOPY_WINDOWS > 0 |
558 | cdp->cpu_pmap = pmap_cpu_alloc(TRUE); |
559 | #endif |
560 | } |
561 | return cdp; |
562 | } |
563 | |
564 | boolean_t do_ldt_alloc = FALSE; |
565 | simple_lock(&ncpus_lock); |
566 | int cnum = real_ncpus; |
567 | real_ncpus++; |
568 | if (dyn_ldts == NULL) { |
569 | do_ldt_alloc = TRUE; |
570 | } |
571 | simple_unlock(&ncpus_lock); |
572 | |
573 | /* |
574 | * Allocate per-cpu data: |
575 | */ |
576 | |
577 | cdp = &scdatas[cnum]; |
578 | bzero((void*) cdp, sizeof(cpu_data_t)); |
579 | cdp->cpu_this = cdp; |
580 | cdp->cpu_number = cnum; |
581 | cdp->cd_shadow = &cpshadows[cnum]; |
582 | /* |
583 | * Allocate interrupt stack: |
584 | */ |
585 | ret = kmem_alloc(kernel_map, |
586 | (vm_offset_t *) &cdp->cpu_int_stack_top, |
587 | INTSTACK_SIZE, VM_KERN_MEMORY_CPU); |
588 | if (ret != KERN_SUCCESS) { |
589 | panic("cpu_data_alloc() int stack failed, ret=%d\n" , ret); |
590 | } |
591 | bzero((void*) cdp->cpu_int_stack_top, INTSTACK_SIZE); |
592 | cdp->cpu_int_stack_top += INTSTACK_SIZE; |
593 | |
594 | /* |
595 | * Allocate descriptor table: |
596 | */ |
597 | |
598 | cdp->cpu_desc_tablep = (struct cpu_desc_table *) &scdtables[cnum]; |
599 | /* |
600 | * Allocate LDT |
601 | */ |
602 | if (do_ldt_alloc) { |
603 | boolean_t do_ldt_free = FALSE; |
604 | vm_offset_t sldtoffset = 0; |
605 | /* |
606 | * Allocate LDT |
607 | */ |
608 | vm_offset_t ldtalloc = 0, ldtallocsz = round_page_64(MAX_CPUS * sizeof(struct real_descriptor) * LDTSZ); |
609 | ret = kmem_alloc(kernel_map, (vm_offset_t *) &ldtalloc, ldtallocsz, VM_KERN_MEMORY_CPU); |
610 | if (ret != KERN_SUCCESS) { |
611 | panic("cpu_data_alloc() ldt failed, kmem_alloc=%d\n" , ret); |
612 | } |
613 | |
614 | simple_lock(&ncpus_lock); |
615 | if (dyn_ldts == NULL) { |
616 | dyn_ldts = (cldt_t *)ldtalloc; |
617 | } else { |
618 | do_ldt_free = TRUE; |
619 | } |
620 | simple_unlock(&ncpus_lock); |
621 | |
622 | if (do_ldt_free) { |
623 | kmem_free(kernel_map, ldtalloc, ldtallocsz); |
624 | } else { |
625 | /* CPU registration and startup are expected to execute |
626 | * serially, as invoked by the platform driver. |
627 | * Create trampoline alias of LDT region. |
628 | */ |
629 | sldtoffset = dyn_dblmap(ldtalloc, ldtallocsz); |
630 | ldt_alias_offset = sldtoffset; |
631 | } |
632 | } |
633 | cdp->cpu_ldtp = &dyn_ldts[cnum].pcldts[0]; |
634 | |
635 | #if CONFIG_MCA |
636 | /* Machine-check shadow register allocation. */ |
637 | mca_cpu_alloc(cdp); |
638 | #endif |
639 | |
640 | /* |
641 | * Before this cpu has been assigned a real thread context, |
642 | * we give it a fake, unique, non-zero thread id which the locking |
643 | * primitives use as their lock value. |
644 | * Note that this does not apply to the boot processor, cpu 0, which |
645 | * transitions to a thread context well before other processors are |
646 | * started. |
647 | */ |
648 | cdp->cpu_active_thread = (thread_t) (uintptr_t) cdp->cpu_number; |
649 | cdp->cpu_NMI_acknowledged = TRUE; |
650 | cdp->cpu_nanotime = &pal_rtc_nanotime_info; |
651 | |
652 | kprintf("cpu_data_alloc(%d) %p desc_table: %p " |
653 | "ldt: %p " |
654 | "int_stack: 0x%lx-0x%lx\n" , |
655 | cdp->cpu_number, cdp, cdp->cpu_desc_tablep, cdp->cpu_ldtp, |
656 | (long)(cdp->cpu_int_stack_top - INTSTACK_SIZE), (long)(cdp->cpu_int_stack_top)); |
657 | cpu_data_ptr[cnum] = cdp; |
658 | |
659 | return cdp; |
660 | |
661 | } |
662 | |
663 | boolean_t |
664 | valid_user_data_selector(uint16_t selector) |
665 | { |
666 | sel_t sel = selector_to_sel(selector); |
667 | |
668 | if (selector == 0) |
669 | return (TRUE); |
670 | |
671 | if (sel.ti == SEL_LDT) |
672 | return (TRUE); |
673 | else if (sel.index < GDTSZ) { |
674 | if ((gdt_desc_p(selector)->access & ACC_PL_U) == ACC_PL_U) |
675 | return (TRUE); |
676 | } |
677 | return (FALSE); |
678 | } |
679 | |
680 | boolean_t |
681 | valid_user_code_selector(uint16_t selector) |
682 | { |
683 | sel_t sel = selector_to_sel(selector); |
684 | |
685 | if (selector == 0) |
686 | return (FALSE); |
687 | |
688 | if (sel.ti == SEL_LDT) { |
689 | if (sel.rpl == USER_PRIV) |
690 | return (TRUE); |
691 | } |
692 | else if (sel.index < GDTSZ && sel.rpl == USER_PRIV) { |
693 | if ((gdt_desc_p(selector)->access & ACC_PL_U) == ACC_PL_U) |
694 | return (TRUE); |
695 | /* Explicitly validate the system code selectors |
696 | * even if not instantaneously privileged, |
697 | * since they are dynamically re-privileged |
698 | * at context switch |
699 | */ |
700 | if ((selector == USER_CS) || (selector == USER64_CS)) |
701 | return (TRUE); |
702 | } |
703 | |
704 | return (FALSE); |
705 | } |
706 | |
707 | boolean_t |
708 | valid_user_stack_selector(uint16_t selector) |
709 | { |
710 | sel_t sel = selector_to_sel(selector); |
711 | |
712 | if (selector == 0) |
713 | return (FALSE); |
714 | |
715 | if (sel.ti == SEL_LDT) { |
716 | if (sel.rpl == USER_PRIV) |
717 | return (TRUE); |
718 | } |
719 | else if (sel.index < GDTSZ && sel.rpl == USER_PRIV) { |
720 | if ((gdt_desc_p(selector)->access & ACC_PL_U) == ACC_PL_U) |
721 | return (TRUE); |
722 | } |
723 | |
724 | return (FALSE); |
725 | } |
726 | |
727 | boolean_t |
728 | valid_user_segment_selectors(uint16_t cs, |
729 | uint16_t ss, |
730 | uint16_t ds, |
731 | uint16_t es, |
732 | uint16_t fs, |
733 | uint16_t gs) |
734 | { |
735 | return valid_user_code_selector(cs) && |
736 | valid_user_stack_selector(ss) && |
737 | valid_user_data_selector(ds) && |
738 | valid_user_data_selector(es) && |
739 | valid_user_data_selector(fs) && |
740 | valid_user_data_selector(gs); |
741 | } |
742 | |
743 | #if NCOPY_WINDOWS > 0 |
744 | |
745 | static vm_offset_t user_window_base = 0; |
746 | |
747 | void |
748 | cpu_userwindow_init(int cpu) |
749 | { |
750 | cpu_data_t *cdp = cpu_data_ptr[cpu]; |
751 | vm_offset_t user_window; |
752 | vm_offset_t vaddr; |
753 | int num_cpus; |
754 | |
755 | num_cpus = ml_get_max_cpus(); |
756 | |
757 | if (cpu >= num_cpus) |
758 | panic("cpu_userwindow_init: cpu > num_cpus" ); |
759 | |
760 | if (user_window_base == 0) { |
761 | |
762 | if (vm_allocate(kernel_map, &vaddr, |
763 | (NBPDE * NCOPY_WINDOWS * num_cpus) + NBPDE, |
764 | VM_FLAGS_ANYWHERE | VM_MAKE_TAG(VM_KERN_MEMORY_CPU)) != KERN_SUCCESS) |
765 | panic("cpu_userwindow_init: " |
766 | "couldn't allocate user map window" ); |
767 | |
768 | /* |
769 | * window must start on a page table boundary |
770 | * in the virtual address space |
771 | */ |
772 | user_window_base = (vaddr + (NBPDE - 1)) & ~(NBPDE - 1); |
773 | |
774 | /* |
775 | * get rid of any allocation leading up to our |
776 | * starting boundary |
777 | */ |
778 | vm_deallocate(kernel_map, vaddr, user_window_base - vaddr); |
779 | |
780 | /* |
781 | * get rid of tail that we don't need |
782 | */ |
783 | user_window = user_window_base + |
784 | (NBPDE * NCOPY_WINDOWS * num_cpus); |
785 | |
786 | vm_deallocate(kernel_map, user_window, |
787 | (vaddr + |
788 | ((NBPDE * NCOPY_WINDOWS * num_cpus) + NBPDE)) - |
789 | user_window); |
790 | } |
791 | |
792 | user_window = user_window_base + (cpu * NCOPY_WINDOWS * NBPDE); |
793 | |
794 | cdp->cpu_copywindow_base = user_window; |
795 | /* |
796 | * Abuse this pdp entry, the pdp now actually points to |
797 | * an array of copy windows addresses. |
798 | */ |
799 | cdp->cpu_copywindow_pdp = pmap_pde(kernel_pmap, user_window); |
800 | |
801 | } |
802 | |
803 | void |
804 | cpu_physwindow_init(int cpu) |
805 | { |
806 | cpu_data_t *cdp = cpu_data_ptr[cpu]; |
807 | vm_offset_t phys_window = cdp->cpu_physwindow_base; |
808 | |
809 | if (phys_window == 0) { |
810 | if (vm_allocate(kernel_map, &phys_window, |
811 | PAGE_SIZE, VM_FLAGS_ANYWHERE | VM_MAKE_TAG(VM_KERN_MEMORY_CPU)) |
812 | != KERN_SUCCESS) |
813 | panic("cpu_physwindow_init: " |
814 | "couldn't allocate phys map window" ); |
815 | |
816 | /* |
817 | * make sure the page that encompasses the |
818 | * pte pointer we're interested in actually |
819 | * exists in the page table |
820 | */ |
821 | pmap_expand(kernel_pmap, phys_window, PMAP_EXPAND_OPTIONS_NONE); |
822 | |
823 | cdp->cpu_physwindow_base = phys_window; |
824 | cdp->cpu_physwindow_ptep = vtopte(phys_window); |
825 | } |
826 | } |
827 | #endif /* NCOPY_WINDOWS > 0 */ |
828 | |
829 | /* |
830 | * Allocate a new interrupt stack for the boot processor from the |
831 | * heap rather than continue to use the statically allocated space. |
832 | * Also switch to a dynamically allocated cpu data area. |
833 | */ |
834 | void |
835 | cpu_data_realloc(void) |
836 | { |
837 | int ret; |
838 | vm_offset_t istk; |
839 | cpu_data_t *cdp; |
840 | boolean_t istate; |
841 | |
842 | ret = kmem_alloc(kernel_map, &istk, INTSTACK_SIZE, VM_KERN_MEMORY_CPU); |
843 | if (ret != KERN_SUCCESS) { |
844 | panic("cpu_data_realloc() stack alloc, ret=%d\n" , ret); |
845 | } |
846 | bzero((void*) istk, INTSTACK_SIZE); |
847 | istk += INTSTACK_SIZE; |
848 | |
849 | cdp = &scdatas[0]; |
850 | |
851 | /* Copy old contents into new area and make fix-ups */ |
852 | assert(cpu_number() == 0); |
853 | bcopy((void *) cpu_data_ptr[0], (void*) cdp, sizeof(cpu_data_t)); |
854 | cdp->cpu_this = cdp; |
855 | cdp->cpu_int_stack_top = istk; |
856 | timer_call_queue_init(&cdp->rtclock_timer.queue); |
857 | cdp->cpu_desc_tablep = (struct cpu_desc_table *) &scdtables[0]; |
858 | cpu_desc_table64_t *cdt = (cpu_desc_table64_t *) cdp->cpu_desc_tablep; |
859 | |
860 | uint8_t *cfstk = &scfstks[cdp->cpu_number].fstk[0]; |
861 | cdt->fstkp = cfstk; |
862 | cfstk += FSTK_SZ; |
863 | |
864 | /* |
	 * With interrupts disabled, commit the new areas.
866 | */ |
867 | istate = ml_set_interrupts_enabled(FALSE); |
868 | cpu_data_ptr[0] = cdp; |
869 | master_ktss64.ist2 = DBLMAP((uintptr_t) cfstk); |
870 | master_ktss64.ist1 = DBLMAP((uintptr_t) cfstk - sizeof(x86_64_intr_stack_frame_t)); |
871 | wrmsr64(MSR_IA32_GS_BASE, (uintptr_t) cdp); |
872 | wrmsr64(MSR_IA32_KERNEL_GS_BASE, (uintptr_t) cdp); |
873 | (void) ml_set_interrupts_enabled(istate); |
874 | |
875 | kprintf("Reallocated master cpu data: %p," |
876 | " interrupt stack: %p, fault stack: %p\n" , |
877 | (void *) cdp, (void *) istk, (void *) cfstk); |
878 | } |
879 | |