/*
 * Copyright (c) 2000-2007 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 */
/*
 * Author: Avadis Tevanian, Jr.
 * Date:   1986
 *
 * Compute various averages.
 */

#include <mach/mach_types.h>

#include <kern/sched.h>
#include <kern/assert.h>
#include <kern/processor.h>
#include <kern/thread.h>
#if CONFIG_TELEMETRY
#include <kern/telemetry.h>
#endif

#include <sys/kdebug.h>

uint32_t avenrun[3] = {0, 0, 0};
uint32_t mach_factor[3] = {0, 0, 0};

uint32_t sched_load_average, sched_mach_factor;

#if defined(CONFIG_SCHED_TIMESHARE_CORE)
/*
 * Values are scaled by LOAD_SCALE, defined in processor_info.h
 */
#define base(n)		((n) << SCHED_TICK_SHIFT)
#define frac(n)		(((base(n) - 1) * LOAD_SCALE) / base(n))

static uint32_t fract[3] = {
	frac(5),	/* 5 second average */
	frac(30),	/* 30 second average */
	frac(60),	/* 1 minute average */
};

#undef base
#undef frac

#endif /* CONFIG_SCHED_TIMESHARE_CORE */
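
/*
 * Worked example for frac() above (illustrative; assumes SCHED_TICK_SHIFT == 3
 * and LOAD_SCALE == 1000, the values used elsewhere in the tree):
 *	base(5) = 5 << 3 = 40 scheduler ticks
 *	frac(5) = (39 * 1000) / 40 = 975
 * i.e. the 5-second average decays by a factor of 975/1000 per scheduler
 * tick, so roughly the last 40 ticks (~5 seconds at 8 ticks per second)
 * dominate the reported value.
 */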

static unsigned int sched_nrun;

typedef void (*sched_avg_comp_t)(
	void	*param);

static struct sched_average {
	sched_avg_comp_t	comp;
	void			*param;
	int			period;		/* in seconds */
	uint64_t		deadline;
} sched_average[] = {
	{ compute_averunnable, &sched_nrun, 5, 0 },
	{ compute_stack_target, NULL, 5, 1 },
	{ compute_pageout_gc_throttle, NULL, 1, 0 },
	{ compute_pmap_gc_throttle, NULL, 60, 0 },
#if CONFIG_TELEMETRY
	{ compute_telemetry, NULL, 1, 0 },
#endif
	{ NULL, NULL, 0, 0 }
};
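
/*
 * Each entry describes a periodic computation: compute_averages() below
 * invokes comp(param) once per `period` seconds, catching up (bounded by
 * SCHED_TICK_MAX_DELTA) if invocations were missed while idle. A new
 * periodic average can be added by appending a { function, param, period, 0 }
 * entry ahead of the NULL terminator.
 */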

typedef struct sched_average	*sched_average_t;

/*
 * Scheduler load calculation algorithm
 *
 * The scheduler load values provide an estimate of the number of runnable
 * timeshare threads in the system at various priority bands. The load
 * ultimately affects the priority shifts applied to all threads in a band,
 * causing them to timeshare with other threads in the system. The load is
 * maintained in buckets, with each bucket corresponding to a priority band.
 *
 * Each runnable thread on the system contributes its load to its priority
 * band and to the bands above it. The contribution of a thread to the bands
 * above it is not strictly 1:1 and is weighted based on the priority band
 * of the thread. The rules of thread load contribution to each of its higher
 * bands are as follows (a worked example follows this comment):
 *
 * - DF threads: Up to (2 * NCPUs) threads
 * - UT threads: Up to NCPUs threads
 * - BG threads: Up to 1 thread
 *
 * To calculate the load values, the various run buckets are sampled (every
 * sched_load_compute_interval_abs) and the weighted contributions of the
 * lower bucket threads are added. The resultant value is plugged into an
 * exponentially weighted moving average formula:
 *	new-load = alpha * old-load + (1 - alpha) * run-bucket-sample-count
 * (where alpha < 1)
 * The calculations for the scheduler load are done using fixpoint math with
 * a scale factor of 16 to avoid expensive divides and floating point
 * operations. The final load values are a smooth curve representative of
 * the actual number of runnable threads in a priority band.
 */
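
/*
 * Worked example of the contribution rules (illustrative numbers; assume a
 * 4-CPU system with 3 FG, 10 DF, 2 UT and 5 BG runnable timeshare threads):
 *	FG load = 3 + MIN(10, 2 * 4) + MIN(2, 4) + MIN(5, 1) = 14
 *	DF load = 10 + MIN(2, 4) + MIN(5, 1) = 13
 *	UT load = 2 + MIN(5, 1) = 3
 *	BG load = 5
 * These raw samples are what compute_sched_load() feeds into the EWMA.
 */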

/* Maintains the current (scaled for fixpoint) load in various buckets */
uint32_t sched_load[TH_BUCKET_MAX];

/*
 * Alpha factor for the EWMA algorithm. The current values are chosen as
 * 6:10 ("old load":"new samples") to make sure the scheduler reacts fast
 * enough to changing system load but does not see too many spikes from bursty
 * activity. The current values ensure that the scheduler converges
 * to the latest load in 2-3 sched_load_compute_interval_abs intervals
 * (which amounts to ~30-45ms with current values).
 */
#define SCHED_LOAD_EWMA_ALPHA_OLD	6
#define SCHED_LOAD_EWMA_ALPHA_NEW	10
#define SCHED_LOAD_EWMA_ALPHA_SHIFT	4
static_assert((SCHED_LOAD_EWMA_ALPHA_OLD + SCHED_LOAD_EWMA_ALPHA_NEW) == (1ul << SCHED_LOAD_EWMA_ALPHA_SHIFT));

/* For fixpoint EWMA, round up the load to make it converge */
#define SCHED_LOAD_EWMA_ROUNDUP(load)	(((load) & (1ul << (SCHED_LOAD_EWMA_ALPHA_SHIFT - 1))) != 0)

/* Macro to convert scaled sched load to a real load value */
#define SCHED_LOAD_EWMA_UNSCALE(load)	(((load) >> SCHED_LOAD_EWMA_ALPHA_SHIFT) + SCHED_LOAD_EWMA_ROUNDUP(load))
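
/*
 * Worked example of one EWMA step (illustrative numbers): with a scaled
 * sched_load of 32 (i.e. 2.0) and a fresh bucket sample of 4,
 *	(32 * 6 + (4 << 4) * 10) >> 4 = (192 + 640) >> 4 = 52	(~3.25)
 * which SCHED_LOAD_EWMA_UNSCALE() reports as 3. A second step with the
 * same sample gives (52 * 6 + 640) >> 4 = 59, which unscales to 4, so
 * the load converges to the new sample in 2-3 intervals as noted above.
 */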

/*
 * Routine to capture the latest runnable counts and update sched_load
 */
void
compute_sched_load(void)
{
	/*
	 * Retrieve a snapshot of the current run counts.
	 *
	 * Why not a bcopy()? Because we need atomic word-sized reads of
	 * sched_run_buckets, not a byte-by-byte copy.
	 */
	uint32_t ncpus = processor_avail_count;
	uint32_t load_now[TH_BUCKET_MAX];

	load_now[TH_BUCKET_RUN]      = sched_run_buckets[TH_BUCKET_RUN];
	load_now[TH_BUCKET_FIXPRI]   = sched_run_buckets[TH_BUCKET_FIXPRI];
	load_now[TH_BUCKET_SHARE_FG] = sched_run_buckets[TH_BUCKET_SHARE_FG];
	load_now[TH_BUCKET_SHARE_DF] = sched_run_buckets[TH_BUCKET_SHARE_DF];
	load_now[TH_BUCKET_SHARE_UT] = sched_run_buckets[TH_BUCKET_SHARE_UT];
	load_now[TH_BUCKET_SHARE_BG] = sched_run_buckets[TH_BUCKET_SHARE_BG];

	/*
	 * The run counts are unsigned, so asserting ">= 0" would be vacuously
	 * true; assert the invariant the subtraction below actually relies on.
	 */
	assert(load_now[TH_BUCKET_RUN] >= load_now[TH_BUCKET_FIXPRI]);

	uint32_t nthreads = load_now[TH_BUCKET_RUN];
	uint32_t nfixpri  = load_now[TH_BUCKET_FIXPRI];

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_LOAD) | DBG_FUNC_NONE,
	    load_now[TH_BUCKET_FIXPRI], (load_now[TH_BUCKET_SHARE_FG] + load_now[TH_BUCKET_SHARE_DF]),
	    load_now[TH_BUCKET_SHARE_BG], load_now[TH_BUCKET_SHARE_UT], 0);

	/*
	 * Compute the timeshare priority conversion factor based on loading.
	 * Because our counters may be incremented and accessed
	 * concurrently with respect to each other, we may have
	 * windows where the invariant (nthreads - nfixpri) == (fg + df + bg + ut)
	 * is broken, so truncate values in these cases.
	 */
	uint32_t timeshare_threads = (nthreads - nfixpri);
	for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) {
		if (load_now[i] > timeshare_threads) {
			load_now[i] = timeshare_threads;
		}
	}

	/*
	 * Default threads contribute up to (NCPUS * 2) of load to FG threads
	 */
	if (load_now[TH_BUCKET_SHARE_DF] <= (ncpus * 2)) {
		load_now[TH_BUCKET_SHARE_FG] += load_now[TH_BUCKET_SHARE_DF];
	} else {
		load_now[TH_BUCKET_SHARE_FG] += (ncpus * 2);
	}

	/*
	 * Utility threads contribute up to NCPUS of load to FG & DF threads
	 */
	if (load_now[TH_BUCKET_SHARE_UT] <= ncpus) {
		load_now[TH_BUCKET_SHARE_FG] += load_now[TH_BUCKET_SHARE_UT];
		load_now[TH_BUCKET_SHARE_DF] += load_now[TH_BUCKET_SHARE_UT];
	} else {
		load_now[TH_BUCKET_SHARE_FG] += ncpus;
		load_now[TH_BUCKET_SHARE_DF] += ncpus;
	}

	/*
	 * BG threads contribute up to 1 thread worth of load to FG, DF and UT threads
	 */
	if (load_now[TH_BUCKET_SHARE_BG] > 0) {
		load_now[TH_BUCKET_SHARE_FG] += 1;
		load_now[TH_BUCKET_SHARE_DF] += 1;
		load_now[TH_BUCKET_SHARE_UT] += 1;
	}

	/*
	 * The conversion factor consists of two components:
	 * a fixed value based on the absolute time unit (sched_fixed_shift),
	 * and a dynamic portion based on load (sched_load_shifts).
	 *
	 * Zero load results in an out-of-range shift count.
	 */

	for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) {
		uint32_t bucket_load = 0;

		if (load_now[i] > ncpus) {
			/* Normalize the load to number of CPUs */
			if (ncpus > 1) {
				bucket_load = load_now[i] / ncpus;
			} else {
				bucket_load = load_now[i];
			}

			if (bucket_load > MAX_LOAD) {
				bucket_load = MAX_LOAD;
			}
		}
		/* Plug the load values into the EWMA algorithm to calculate (scaled for fixpoint) sched_load */
		sched_load[i] = (sched_load[i] * SCHED_LOAD_EWMA_ALPHA_OLD) + ((bucket_load << SCHED_LOAD_EWMA_ALPHA_SHIFT) * SCHED_LOAD_EWMA_ALPHA_NEW);
		sched_load[i] = sched_load[i] >> SCHED_LOAD_EWMA_ALPHA_SHIFT;
	}

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_LOAD_EFFECTIVE) | DBG_FUNC_NONE,
	    SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_FG]), SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_DF]),
	    SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_UT]), SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_BG]), 0);
}

void
compute_averages(uint64_t stdelta)
{
	/* The "- 1" leaves one running thread out of the count, i.e. the one doing this computation */
	uint32_t nthreads = sched_run_buckets[TH_BUCKET_RUN] - 1;
	uint32_t ncpus = processor_avail_count;

	/* Update the global pri_shifts based on the latest values */
	for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) {
		uint32_t bucket_load = SCHED_LOAD_EWMA_UNSCALE(sched_load[i]);
		sched_pri_shifts[i] = sched_fixed_shift - sched_load_shifts[bucket_load];
	}
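
	/*
	 * Illustrative note (hedged; the consuming code lives in the priority
	 * computation, not in this file): a timeshare thread's priority is
	 * depressed by roughly (sched_usage >> sched_pri_shifts[bucket]).
	 * sched_load_shifts[] grows roughly as log2 of the load, so a bucket
	 * load of 8 maps to a shift of 3, and the smaller resulting pri_shift
	 * makes the same CPU usage depress priority 8x more than a load of 1.
	 */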

	/*
	 * Sample total running threads for the load average calculation.
	 */
	sched_nrun = nthreads;

	/*
	 * Load average and mach factor calculations for
	 * clients that query these statistics.
	 */
	uint32_t average_now = nthreads * LOAD_SCALE;
	uint32_t factor_now;

	if (nthreads > ncpus) {
		factor_now = (ncpus * LOAD_SCALE) / (nthreads + 1);
	} else {
		factor_now = (ncpus - nthreads) * LOAD_SCALE;
	}
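
	/*
	 * Worked example (illustrative numbers): on a 4-CPU system with 6
	 * runnable threads, factor_now = (4 * LOAD_SCALE) / 7, about 0.571
	 * of a CPU per thread; with 2 runnable threads it is
	 * (4 - 2) * LOAD_SCALE, i.e. two idle CPUs' worth of headroom.
	 */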

	/*
	 * For those statistics that formerly relied on being recomputed
	 * on timer ticks, advance by the approximate number of corresponding
	 * elapsed intervals, thus compensating for potential idle intervals.
	 */
	for (uint32_t index = 0; index < stdelta; index++) {
		sched_mach_factor = ((sched_mach_factor << 2) + factor_now) / 5;
		sched_load_average = ((sched_load_average << 2) + average_now) / 5;
	}

	/*
	 * Compute old-style Mach load averages.
	 */
	for (uint32_t index = 0; index < stdelta; index++) {
		for (uint32_t i = 0; i < 3; i++) {
			mach_factor[i] = ((mach_factor[i] * fract[i]) +
			    (factor_now * (LOAD_SCALE - fract[i]))) / LOAD_SCALE;

			avenrun[i] = ((avenrun[i] * fract[i]) +
			    (average_now * (LOAD_SCALE - fract[i]))) / LOAD_SCALE;
		}
	}

	/*
	 * Compute averages in other components.
	 */
	uint64_t abstime = mach_absolute_time();

	for (sched_average_t avg = sched_average; avg->comp != NULL; ++avg) {
		if (abstime >= avg->deadline) {
			uint64_t period_abs = (avg->period * sched_one_second_interval);
			uint64_t ninvokes = 1;

			ninvokes += (abstime - avg->deadline) / period_abs;
			ninvokes = MIN(ninvokes, SCHED_TICK_MAX_DELTA);
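
			/*
			 * Worked example (illustrative numbers): if a 5-second
			 * average is being serviced 12 seconds past its deadline,
			 * ninvokes = 1 + 12/5 = 3 (bounded by SCHED_TICK_MAX_DELTA),
			 * so the average advances as if it had been computed on
			 * schedule through the idle period.
			 */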

			for (uint32_t index = 0; index < ninvokes; index++) {
				(*avg->comp)(avg->param);
			}
			avg->deadline = abstime + period_abs;
		}
	}
}