1/*
2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * @OSF_COPYRIGHT@
30 *
31 */
32
33#ifndef I386_CPU_DATA
34#define I386_CPU_DATA
35
36#include <mach_assert.h>
37
38#include <kern/assert.h>
39#include <kern/kern_types.h>
40#include <kern/queue.h>
41#include <kern/processor.h>
42#include <kern/pms.h>
43#include <pexpert/pexpert.h>
44#include <mach/i386/thread_status.h>
45#include <mach/i386/vm_param.h>
46#include <i386/locks.h>
47#include <i386/rtclock_protos.h>
48#include <i386/pmCPU.h>
49#include <i386/cpu_topology.h>
50#include <i386/seg.h>
51
52#if CONFIG_VMX
53#include <i386/vmx/vmx_cpu.h>
54#endif
55
56#if MONOTONIC
57#include <machine/monotonic.h>
58#endif /* MONOTONIC */
59
60#include <machine/pal_routines.h>
61
/*
 * Data structures referenced (anonymously) from per-cpu data.
 * These are incomplete (forward) declarations only; no layout is given
 * in this header.
 */
struct cpu_cons_buffer;
struct cpu_desc_table;
struct mca_state;
struct prngContext;
69
70/*
71 * Data structures embedded in per-cpu data:
72 */
/*
 * Timer state embedded in cpu_data_t as the `rtclock_timer` member.
 * NOTE(review): field meanings below are inferred from their names only --
 * confirm against the rtclock implementation.
 */
typedef struct rtclock_timer {
	mpqueue_head_t		queue;		/* queue head (type from kern/queue.h family) */
	uint64_t		deadline;	/* presumably the pending deadline -- TODO confirm units */
	uint64_t		when_set;	/* presumably when `deadline` was programmed -- TODO confirm */
	boolean_t		has_expired;
} rtclock_timer_t;
79
/*
 * Addresses of per-cpu descriptor structures, embedded in cpu_data_t as
 * `cpu_desc_index`. Every item is stored as a 'u'/'b' pair.
 */
typedef struct {
	/* The 'u' suffixed fields store the double-mapped descriptor addresses */
	/* NOTE(review): the 'b' suffix presumably denotes the base (non-double-mapped) address -- confirm */
	struct x86_64_tss	*cdi_ktssu;
	struct x86_64_tss	*cdi_ktssb;
	x86_64_desc_register_t	cdi_gdtu;
	x86_64_desc_register_t	cdi_gdtb;
	x86_64_desc_register_t	cdi_idtu;
	x86_64_desc_register_t	cdi_idtb;
	struct fake_descriptor	*cdi_ldtu;
	struct fake_descriptor	*cdi_ldtb;
	vm_offset_t		cdi_sstku;
	vm_offset_t		cdi_sstkb;
} cpu_desc_index_t;
93
/*
 * User address-space flavor; cpu_data_t records the current value in its
 * `cpu_task_map` member.
 */
typedef enum {
	TASK_MAP_32BIT,			/* 32-bit user, compatibility mode */
	TASK_MAP_64BIT,			/* 64-bit user thread, shared space */
} task_map_t;
98
99
/*
 * This structure is used on entry into the (uber-)kernel on syscall from
 * a 64-bit user. It contains the address of the machine state save area
 * for the current thread and a temporary place to save the user's rsp
 * before loading this address into rsp.
 * It is embedded in cpu_data_t as the `cpu_uber` member.
 */
typedef struct {
	addr64_t	cu_isf;			/* thread->pcb->iss.isf */
	uint64_t	cu_tmp;			/* temporary scratch */
	addr64_t	cu_user_gs_base;	/* presumably the user's GS base -- TODO confirm */
} cpu_uber_t;
111
/*
 * pcid_t holds a PCID value (see the cpu_*_pcid members of cpu_data_t);
 * pcid_ref_t is the element type of the per-PCID reference count array
 * in pcid_cdata_t.
 */
typedef	uint16_t	pcid_t;
typedef	uint8_t		pcid_ref_t;
114
/* Sizes of the cpu_rtimes[] and cpu_itimes[] arrays in cpu_data_t. */
#define CPU_RTIME_BINS (12)
#define CPU_ITIME_BINS (CPU_RTIME_BINS)
117
/* Number of backtrace slots kept in each preemption-level trace record. */
#define MAXPLFRAMES (16)
/*
 * One preemption-level trace record, filled in by pltrace_internal().
 */
typedef struct {
	boolean_t	pltype;			/* the `enable` flag passed to pltrace_internal() */
	int		plevel;			/* cpu_preemption_level sampled when the record was written */
	uint64_t	plbt[MAXPLFRAMES];	/* [0] = program counter at the trace point, [1..] = return addresses */
} plrecord_t;
124
/*
 * Per-cpu data (cpu_data_t, defined below).
 *
 * Each processor has a per-cpu data area which is dereferenced through the
 * current_cpu_datap() inline function. For speed, the %gs segment is based
 * here; using this, inline functions provide single-instruction access to
 * frequently used members - such as get_cpu_number()/cpu_number() and
 * get_active_thread()/current_thread().
 *
 * Cpu data owned by another processor can be accessed using
 * cpu_datap(cpu_number), which uses the cpu_data_ptr[] array of per-cpu
 * pointers.
 */
/*
 * PCID bookkeeping, referenced from cpu_data_t through `cpu_pcid_data`.
 * Both arrays have PMAP_PCID_MAX_PCID (0x800) entries.
 * NOTE(review): presumably they are indexed by PCID value -- confirm
 * against the pmap PCID code.
 */
typedef struct {
	pcid_t			cpu_pcid_free_hint;
#define	PMAP_PCID_MAX_PCID	(0x800)
	pcid_ref_t		cpu_pcid_refcounts[PMAP_PCID_MAX_PCID];
	pmap_t			cpu_pcid_last_pmap_dispatched[PMAP_PCID_MAX_PCID];
} pcid_cdata_t;
144
/*
 * Per-cpu data area.
 *
 * The inline accessors in this header read and write members of this
 * structure at their offsets from the per-cpu segment base (see the
 * CPU_DATA_GET/SET/XCHG macros), so member offsets are significant.
 * NOTE(review): other code (e.g. assembly) may also depend on this exact
 * layout -- do not reorder members without checking.
 */
typedef struct cpu_data
{
	struct pal_cpu_data	cpu_pal_data;		/* PAL-specific data */
#define				cpu_pd cpu_pal_data	/* convenience alias */
	struct cpu_data		*cpu_this;		/* pointer to myself */
	thread_t		cpu_active_thread;	/* returned by get_active_thread() */
	thread_t		cpu_nthread;
	volatile int		cpu_preemption_level;	/* adjusted by the *_preemption_* inlines */
	int			cpu_number;		/* Logical CPU */
	void			*cpu_int_state;		/* interrupt state */
	vm_offset_t		cpu_active_stack;	/* kernel stack base */
	vm_offset_t		cpu_kernel_stack;	/* kernel stack top */
	vm_offset_t		cpu_int_stack_top;
	int			cpu_interrupt_level;
	volatile int		cpu_signals;		/* IPI events */
	volatile int		cpu_prior_signals;	/* Last set of events,
							 * debugging
							 */
	ast_t			cpu_pending_ast;
	volatile int		cpu_running;		/* tested by cpu_is_running() */
#if !MONOTONIC
	boolean_t		cpu_fixed_pmcs_enabled;
#endif /* !MONOTONIC */
	rtclock_timer_t		rtclock_timer;
	uint64_t		quantum_timer_deadline;
	/* The 64-byte alignment below starts cpu_active_cr3 on a 64-byte boundary. */
	volatile addr64_t	cpu_active_cr3 __attribute((aligned(64)));
	/* cpu_tlb_invalid overlays the local/global 16-bit halves as one 32-bit word. */
	union {
		volatile uint32_t cpu_tlb_invalid;
		struct {
			volatile uint16_t cpu_tlb_invalid_local;
			volatile uint16_t cpu_tlb_invalid_global;
		};
	};
	volatile task_map_t	cpu_task_map;
	volatile addr64_t	cpu_task_cr3;
	addr64_t		cpu_kernel_cr3;
	volatile addr64_t	cpu_ucr3;
	boolean_t		cpu_pagezero_mapped;
	cpu_uber_t		cpu_uber;
/* Double-mapped per-CPU exception stack address */
	uintptr_t		cd_estack;
	int			cpu_xstate;
/* Address of shadowed, partially mirrored CPU data structures located
 * in the double mapped PML4
 */
	void			*cd_shadow;		/* returned by cpu_shadowp() */
	struct processor	*cpu_processor;
#if NCOPY_WINDOWS > 0
	struct cpu_pmap		*cpu_pmap;
#endif
	struct real_descriptor	*cpu_ldtp;
	struct cpu_desc_table	*cpu_desc_tablep;
	cpu_desc_index_t	cpu_desc_index;
	int			cpu_ldt;
#if NCOPY_WINDOWS > 0
	vm_offset_t		cpu_copywindow_base;
	uint64_t		*cpu_copywindow_pdp;

	vm_offset_t		cpu_physwindow_base;
	uint64_t		*cpu_physwindow_ptep;
#endif

#define HWINTCNT_SIZE 256
	uint32_t		cpu_hwIntCnt[HWINTCNT_SIZE];	/* Interrupt counts */
	uint64_t		cpu_hwIntpexits[HWINTCNT_SIZE];
	uint64_t		cpu_dr7; /* debug control register */
	uint64_t		cpu_int_event_time;	/* intr entry/exit time */
	pal_rtc_nanotime_t	*cpu_nanotime;		/* Nanotime info */
#if KPC
	/* double-buffered performance counter data */
	uint64_t		*cpu_kpc_buf[2];
	/* PMC shadow and reload value buffers */
	uint64_t		*cpu_kpc_shadow;
	uint64_t		*cpu_kpc_reload;
#endif
#if MONOTONIC
	struct mt_cpu		cpu_monotonic;
#endif /* MONOTONIC */
	uint32_t		cpu_pmap_pcid_enabled;
	pcid_t			cpu_active_pcid;
	pcid_t			cpu_last_pcid;
	pcid_t			cpu_kernel_pcid;
	volatile pcid_ref_t	*cpu_pmap_pcid_coherentp;
	volatile pcid_ref_t	*cpu_pmap_pcid_coherentp_kernel;
	pcid_cdata_t		*cpu_pcid_data;
#ifdef	PCID_STATS
	uint64_t		cpu_pmap_pcid_flushes;
	uint64_t		cpu_pmap_pcid_preserves;
#endif
	uint64_t		cpu_aperf;
	uint64_t		cpu_mperf;
	uint64_t		cpu_c3res;
	uint64_t		cpu_c6res;
	uint64_t		cpu_c7res;
	uint64_t		cpu_itime_total;
	uint64_t		cpu_rtime_total;
	uint64_t		cpu_ixtime;
	uint64_t		cpu_idle_exits;
	uint64_t		cpu_rtimes[CPU_RTIME_BINS];
	uint64_t		cpu_itimes[CPU_ITIME_BINS];
#if !MONOTONIC
	uint64_t		cpu_cur_insns;
	uint64_t		cpu_cur_ucc;
	uint64_t		cpu_cur_urc;
#endif /* !MONOTONIC */
	uint64_t		cpu_gpmcs[4];
	uint64_t		cpu_max_observed_int_latency;
	int			cpu_max_observed_int_latency_vector;
	volatile boolean_t	cpu_NMI_acknowledged;
	uint64_t		debugger_entry_time;
	uint64_t		debugger_ipi_time;
	/* A separate nested interrupt stack flag, to account
	 * for non-nested interrupts arriving while on the interrupt stack
	 * Currently only occurs when AICPM enables interrupts on the
	 * interrupt stack during processor offlining.
	 */
	uint32_t		cpu_nested_istack;
	uint32_t		cpu_nested_istack_events;
	x86_saved_state64_t	*cpu_fatal_trap_state;
	x86_saved_state64_t	*cpu_post_fatal_trap_state;
#if CONFIG_VMX
	vmx_cpu_t		cpu_vmx;		/* wonderful world of virtualization */
#endif
#if CONFIG_MCA
	struct mca_state	*cpu_mca_state;		/* State at MC fault */
#endif
	int			cpu_type;
	int			cpu_subtype;
	int			cpu_threadtype;
	boolean_t		cpu_iflag;
	boolean_t		cpu_boot_complete;
	int			cpu_hibernate;
#define MAX_PREEMPTION_RECORDS (8)
#if	DEVELOPMENT || DEBUG
	/* Preemption-level trace ring: cpu_plri is the next slot pltrace_internal() writes. */
	int			cpu_plri;
	plrecord_t		plrecords[MAX_PREEMPTION_RECORDS];
#endif
	void			*cpu_console_buf;
	struct x86_lcpu		lcpu;
	int			cpu_phys_number;	/* Physical CPU */
	cpu_id_t		cpu_id;			/* Platform Expert */
#if DEBUG
	uint64_t		cpu_entry_cr3;
	uint64_t		cpu_exit_cr3;
	uint64_t		cpu_pcid_last_cr3;
#endif
	boolean_t		cpu_rendezvous_in_progress;
} cpu_data_t;
293
/* Per-cpu data pointers, indexed by cpu number; read through cpu_datap(). */
extern cpu_data_t	*cpu_data_ptr[];
295
/*
 * Macros to generate inline bodies to retrieve per-cpu data fields.
 *
 * Each macro expands to the statements of a function body (the GET and XCHG
 * forms end in `return ret;`), so it is meant to be the sole content of an
 * inline function. Two implementations are provided: a clang one using a
 * pointer in a dedicated address space, and a fallback using inline asm with
 * %gs-relative addressing.
 */
#if defined(__clang__)
/* Pointer qualifier: volatile access through address space 256. */
#define GS_RELATIVE volatile __attribute__((address_space(256)))
#ifndef offsetof
#define offsetof(TYPE,MEMBER) __builtin_offsetof(TYPE,MEMBER)
#endif

/* Function body: return cpu_data_t `member`, typed as `type`. */
#define CPU_DATA_GET(member,type)					\
	cpu_data_t GS_RELATIVE *cpu_data =				\
		(cpu_data_t GS_RELATIVE *)0UL;				\
	type ret;							\
	ret = cpu_data->member;						\
	return ret;

/* Function body: return element `index` of array `member`. */
#define CPU_DATA_GET_INDEX(member,index,type)				\
	cpu_data_t GS_RELATIVE *cpu_data =				\
		(cpu_data_t GS_RELATIVE *)0UL;				\
	type ret;							\
	ret = cpu_data->member[index];					\
	return ret;

/* Statements: store `value` into `member` (no return statement). */
#define CPU_DATA_SET(member,value)					\
	cpu_data_t GS_RELATIVE *cpu_data =				\
		(cpu_data_t GS_RELATIVE *)0UL;				\
	cpu_data->member = value;

/*
 * Function body: return the old value of `member` and store `value`.
 * In this form the read and the write are two separate accesses.
 */
#define CPU_DATA_XCHG(member,value,type)				\
	cpu_data_t GS_RELATIVE *cpu_data =				\
		(cpu_data_t GS_RELATIVE *)0UL;				\
	type ret;							\
	ret = cpu_data->member;						\
	cpu_data->member = value;					\
	return ret;

#else /* !defined(__clang__) */

#ifndef offsetof
#define offsetof(TYPE,MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
#endif /* offsetof */
/* Function body: load `member` from %gs:offsetof(cpu_data_t, member). */
#define CPU_DATA_GET(member,type)					\
	type ret;							\
	__asm__ volatile ("mov %%gs:%P1,%0"				\
		: "=r" (ret)						\
		: "i" (offsetof(cpu_data_t,member)));			\
	return ret;

/* Function body: load member[index]; the offset is passed in a register. */
#define CPU_DATA_GET_INDEX(member,index,type)				\
	type ret;							\
	__asm__ volatile ("mov %%gs:(%1),%0"				\
		: "=r" (ret)						\
		: "r" (offsetof(cpu_data_t,member[index])));		\
	return ret;

/* Statements: store `value` to %gs:offsetof(cpu_data_t, member). */
#define CPU_DATA_SET(member,value)					\
	__asm__ volatile ("mov %0,%%gs:%P1"				\
		:							\
		: "r" (value), "i" (offsetof(cpu_data_t,member)));

/* Function body: swap `value` with `member` using a single xchg instruction. */
#define CPU_DATA_XCHG(member,value,type)				\
	type ret;							\
	__asm__ volatile ("xchg %0,%%gs:%P1"				\
		: "=r" (ret)						\
		: "i" (offsetof(cpu_data_t,member)), "0" (value));	\
	return ret;

#endif /* !defined(__clang__) */
362
/*
 * Everyone within the osfmk part of the kernel can use the fast
 * inline versions of these routines. Everyone outside must call
 * the real (out-of-line) routines.
 */
368
369
/*
 * The "volatile" flavor of current_thread() is intended for use by
 * scheduler code which may need to update the thread pointer in the
 * course of a context switch.  Any call to current_thread() made
 * prior to the thread pointer update should be safe to optimize away
 * as it should be consistent with that thread's state to the extent
 * the compiler can reason about it.  Likewise, the context switch
 * path will eventually result in an arbitrary branch to the new
 * thread's pc, about which the compiler won't be able to reason.
 * Thus any compile-time optimization of current_thread() calls made
 * within the new thread should be safely encapsulated in its
 * register/stack state.  The volatile form therefore exists to cover
 * the window between the thread pointer update and the branch to
 * the new pc.
 *
 * Returns the current cpu's cpu_active_thread. The body is identical to
 * get_active_thread(); the only difference is that this function is not
 * declared __pure2.
 */
static inline thread_t
get_active_thread_volatile(void)
{
	CPU_DATA_GET(cpu_active_thread,thread_t)
}
390
/*
 * Return the current cpu's cpu_active_thread.
 * Declared __pure2 (presumably permitting the compiler to merge repeated
 * calls -- confirm the macro's definition); use
 * get_active_thread_volatile() where that is not acceptable.
 */
static inline __pure2 thread_t
get_active_thread(void)
{
	CPU_DATA_GET(cpu_active_thread,thread_t)
}
396
/*
 * current_thread() maps to the __pure2 accessor get_active_thread();
 * current_thread_volatile() maps to the non-__pure2 variant.
 */
#define current_thread_fast()		get_active_thread()
#define current_thread_volatile()	get_active_thread_volatile()
#define current_thread()		current_thread_fast()

/* Unconditionally TRUE in this header. */
#define cpu_mode_is64bit()		TRUE
402
/* Return the current cpu's cpu_preemption_level. */
static inline int
get_preemption_level(void)
{
	CPU_DATA_GET(cpu_preemption_level,int)
}
/* Return the current cpu's cpu_interrupt_level. */
static inline int
get_interrupt_level(void)
{
	CPU_DATA_GET(cpu_interrupt_level,int)
}
/* Return the current cpu's logical cpu number (cpu_number). */
static inline int
get_cpu_number(void)
{
	CPU_DATA_GET(cpu_number,int)
}
/* Return the current cpu's physical cpu number (cpu_phys_number). */
static inline int
get_cpu_phys_number(void)
{
	CPU_DATA_GET(cpu_phys_number,int)
}
423
/*
 * Return the current cpu's cpu_data_t pointer by reading its cpu_this
 * self-pointer member.
 */
static inline cpu_data_t  *
current_cpu_datap(void) {
	CPU_DATA_GET(cpu_this, cpu_data_t *);
}
428
/*
 * Facility to diagnose preemption-level imbalances, which are otherwise
 * challenging to debug. On each operation that enables or disables preemption,
 * we record a backtrace into a per-CPU ring buffer, along with the current
 * preemption level and operation type. Thus, if an imbalance is observed,
 * one can examine these per-CPU records to determine which codepath failed
 * to re-enable preemption, enabled preemption without a corresponding
 * disablement, etc. The backtracer determines which stack is currently active,
 * and uses that to perform bounds checks on unterminated stacks.
 * To enable, sysctl -w machdep.pltrace=1 on DEVELOPMENT or DEBUG kernels (DRK '15).
 * The bounds check currently doesn't account for non-default thread stack sizes.
 */
441#if DEVELOPMENT || DEBUG
/*
 * Record up to `maxframes` return addresses by walking the frame-pointer
 * chain, starting from this function's own frame.
 *
 *   rets      output array with room for `maxframes` entries
 *   maxframes number of entries to fill
 *   stacklo   lowest address of the stack being walked (non-zero)
 *   stackhi   highest address of the stack being walked (non-zero)
 *
 * A frame is followed only if its two-word record (saved frame pointer at
 * cfp[0], return address at cfp[1]) lies entirely within [stacklo, stackhi].
 * Once a frame pointer fails that test, the remaining entries of `rets`
 * are set to 0.
 */
static inline void pltrace_bt(uint64_t *rets, int maxframes, uint64_t stacklo, uint64_t stackhi) {
	uint64_t *cfp = (uint64_t *) __builtin_frame_address(0);
	int plbtf;

	assert(stacklo !=0 && stackhi !=0);

	for (plbtf = 0; plbtf < maxframes; plbtf++) {
		uint64_t fpaddr = (uint64_t)cfp;

		/*
		 * Besides the range test on the frame pointer itself, require
		 * room for both words read below; a frame pointer at (or within
		 * one word of) stackhi would otherwise be read past the top of
		 * the stack.
		 */
		if ((fpaddr == 0) || (fpaddr < stacklo) || (fpaddr > stackhi) ||
		    ((stackhi - fpaddr) < (2 * sizeof(uint64_t)))) {
			rets[plbtf] = 0;
			/* cfp is left unchanged, so all later slots are zeroed too */
			continue;
		}
		rets[plbtf] = cfp[1];
		cfp = (uint64_t *) cfp[0];
	}
}
457
458
/*
 * Stacks defined elsewhere. low_eintstack and mp_slave_stack are consulted
 * by pltrace_internal() when working out which stack %rsp currently lies on.
 */
extern uint32_t		low_intstack[];		/* bottom */
extern uint32_t		low_eintstack[];	/* top */
extern char		mp_slave_stack[PAGE_SIZE];
462
463static inline void pltrace_internal(boolean_t enable) {
464 cpu_data_t *cdata = current_cpu_datap();
465 int cpli = cdata->cpu_preemption_level;
466 int cplrecord = cdata->cpu_plri;
467 uint64_t kstackb, kstackt, *plbts;
468
469 assert(cpli >= 0);
470
471 cdata->plrecords[cplrecord].pltype = enable;
472 cdata->plrecords[cplrecord].plevel = cpli;
473
474 plbts = &cdata->plrecords[cplrecord].plbt[0];
475
476 cplrecord++;
477
478 if (cplrecord >= MAX_PREEMPTION_RECORDS) {
479 cplrecord = 0;
480 }
481
482 cdata->cpu_plri = cplrecord;
483 /* Obtain the 'current' program counter, initial backtrace
484 * element. This will also indicate if we were unable to
485 * trace further up the stack for some reason
486 */
487 __asm__ volatile("leaq 1f(%%rip), %%rax; mov %%rax, %0\n1:"
488 : "=m" (plbts[0])
489 :
490 : "rax");
491
492
493 thread_t cplthread = cdata->cpu_active_thread;
494 if (cplthread) {
495 uintptr_t csp;
496 __asm__ __volatile__ ("movq %%rsp, %0": "=r" (csp):);
497 /* Determine which stack we're on to populate stack bounds.
498 * We don't need to trace across stack boundaries for this
499 * routine.
500 */
501 kstackb = cdata->cpu_active_stack;
502 kstackt = kstackb + KERNEL_STACK_SIZE;
503 if (csp < kstackb || csp > kstackt) {
504 kstackt = cdata->cpu_kernel_stack;
505 kstackb = kstackb - KERNEL_STACK_SIZE;
506 if (csp < kstackb || csp > kstackt) {
507 kstackt = cdata->cpu_int_stack_top;
508 kstackb = kstackt - INTSTACK_SIZE;
509 if (csp < kstackb || csp > kstackt) {
510 kstackt = (uintptr_t)low_eintstack;
511 kstackb = (uintptr_t)low_eintstack - INTSTACK_SIZE;
512 if (csp < kstackb || csp > kstackt) {
513 kstackb = (uintptr_t) mp_slave_stack;
514 kstackt = (uintptr_t) mp_slave_stack + PAGE_SIZE;
515 }
516 }
517 }
518 }
519
520 if (kstackb) {
521 pltrace_bt(&plbts[1], MAXPLFRAMES - 1, kstackb, kstackt);
522 }
523 }
524}
525
/* Non-zero turns on preemption-level tracing; tested by pltrace(). */
extern int plctrace_enabled;
527#endif /* DEVELOPMENT || DEBUG */
528
529static inline void pltrace(boolean_t plenable) {
530#if DEVELOPMENT || DEBUG
531 if (__improbable(plctrace_enabled != 0)) {
532 pltrace_internal(plenable);
533 }
534#else
535 (void)plenable;
536#endif
537}
538
539static inline void
540disable_preemption_internal(void) {
541 assert(get_preemption_level() >= 0);
542
543#if defined(__clang__)
544 cpu_data_t GS_RELATIVE *cpu_data = (cpu_data_t GS_RELATIVE *)0UL;
545 cpu_data->cpu_preemption_level++;
546#else
547 __asm__ volatile ("incl %%gs:%P0"
548 :
549 : "i" (offsetof(cpu_data_t, cpu_preemption_level)));
550#endif
551 pltrace(FALSE);
552}
553
554static inline void
555enable_preemption_internal(void) {
556 assert(get_preemption_level() > 0);
557 pltrace(TRUE);
558#if defined(__clang__)
559 cpu_data_t GS_RELATIVE *cpu_data = (cpu_data_t GS_RELATIVE *)0UL;
560 if (0 == --cpu_data->cpu_preemption_level)
561 kernel_preempt_check();
562#else
563 __asm__ volatile ("decl %%gs:%P0 \n\t"
564 "jne 1f \n\t"
565 "call _kernel_preempt_check \n\t"
566 "1:"
567 : /* no outputs */
568 : "i" (offsetof(cpu_data_t, cpu_preemption_level))
569 : "eax", "ecx", "edx", "cc", "memory");
570#endif
571}
572
573static inline void
574enable_preemption_no_check(void)
575{
576 assert(get_preemption_level() > 0);
577
578 pltrace(TRUE);
579#if defined(__clang__)
580 cpu_data_t GS_RELATIVE *cpu_data = (cpu_data_t GS_RELATIVE *)0UL;
581 cpu_data->cpu_preemption_level--;
582#else
583 __asm__ volatile ("decl %%gs:%P0"
584 : /* no outputs */
585 : "i" (offsetof(cpu_data_t, cpu_preemption_level))
586 : "cc", "memory");
587#endif
588}
589
/* Alias: forwards to enable_preemption_no_check(). */
static inline void
_enable_preemption_no_check(void) {
	enable_preemption_no_check();
}
594
/* Alias: forwards to disable_preemption_internal(). */
static inline void
mp_disable_preemption(void)
{
	disable_preemption_internal();
}
600
/* Alias: forwards to disable_preemption_internal(). */
static inline void
_mp_disable_preemption(void)
{
	disable_preemption_internal();
}
606
/* Alias: forwards to enable_preemption_internal(). */
static inline void
mp_enable_preemption(void)
{
	enable_preemption_internal();
}
612
/* Alias: forwards to enable_preemption_internal(). */
static inline void
_mp_enable_preemption(void) {
	enable_preemption_internal();
}
617
/* Alias: forwards to enable_preemption_no_check(). */
static inline void
mp_enable_preemption_no_check(void) {
	enable_preemption_no_check();
}
622
/* Alias: forwards to enable_preemption_no_check(). */
static inline void
_mp_enable_preemption_no_check(void) {
	enable_preemption_no_check();
}
627
#ifdef XNU_KERNEL_PRIVATE
/*
 * For XNU_KERNEL_PRIVATE code, disable_preemption()/enable_preemption()
 * expand directly to the inline implementations in this header.
 * NOTE(review): MACHINE_PREEMPTION_MACROS presumably signals to other headers
 * that these macros are already provided -- confirm where it is tested.
 */
#define disable_preemption() disable_preemption_internal()
#define enable_preemption() enable_preemption_internal()
#define MACHINE_PREEMPTION_MACROS (1)
#endif
633
634static inline cpu_data_t *
635cpu_datap(int cpu) {
636 return cpu_data_ptr[cpu];
637}
638
639static inline int
640cpu_is_running(int cpu) {
641 return ((cpu_datap(cpu) != NULL) && (cpu_datap(cpu)->cpu_running));
642}
643
644#ifdef MACH_KERNEL_PRIVATE
645static inline cpu_data_t *
646cpu_shadowp(int cpu) {
647 return cpu_data_ptr[cpu]->cd_shadow;
648}
649
650#endif
/*
 * Defined elsewhere.
 * NOTE(review): cpu_data_alloc() presumably returns a per-cpu data area, with
 * `is_boot_cpu` distinguishing the boot processor -- confirm against the
 * definition. The behavior of cpu_data_realloc() is not visible from here.
 */
extern cpu_data_t *cpu_data_alloc(boolean_t is_boot_cpu);
extern void cpu_data_realloc(void);
653
654#endif /* I386_CPU_DATA */
655