sched_average.c
1 /* 2 * Copyright (c) 2000-2007 Apple Computer, Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28 /* 29 * @OSF_COPYRIGHT@ 30 */ 31 /* 32 * Mach Operating System 33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University 34 * All Rights Reserved. 35 * 36 * Permission to use, copy, modify and distribute this software and its 37 * documentation is hereby granted, provided that both the copyright 38 * notice and this permission notice appear in all copies of the 39 * software, derivative works or modified versions, and any portions 40 * thereof, and that both notices appear in supporting documentation. 41 * 42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR 44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 45 * 46 * Carnegie Mellon requests users of this software to return to 47 * 48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 49 * School of Computer Science 50 * Carnegie Mellon University 51 * Pittsburgh PA 15213-3890 52 * 53 * any improvements or extensions that they make and grant Carnegie Mellon 54 * the rights to redistribute these changes. 55 */ 56 /* 57 */ 58 /* 59 * Author: Avadis Tevanian, Jr. 60 * Date: 1986 61 * 62 * Compute various averages. 63 */ 64 65 #include <mach/mach_types.h> 66 67 #include <kern/sched.h> 68 #include <kern/assert.h> 69 #include <kern/processor.h> 70 #include <kern/thread.h> 71 #if CONFIG_TELEMETRY 72 #include <kern/telemetry.h> 73 #endif 74 #include <kern/zalloc_internal.h> 75 76 #include <sys/kdebug.h> 77 78 uint32_t avenrun[3] = {0, 0, 0}; 79 uint32_t mach_factor[3] = {0, 0, 0}; 80 81 uint32_t sched_load_average, sched_mach_factor; 82 83 #if defined(CONFIG_SCHED_TIMESHARE_CORE) 84 /* 85 * Values are scaled by LOAD_SCALE, defined in processor_info.h 86 */ 87 #define base(n) ((n) << SCHED_TICK_SHIFT) 88 #define frac(n) (((base(n) - 1) * LOAD_SCALE) / base(n)) 89 90 static uint32_t fract[3] = { 91 frac(5), /* 5 second average */ 92 frac(30), /* 30 second average */ 93 frac(60), /* 1 minute average */ 94 }; 95 96 #undef base 97 #undef frac 98 99 #endif /* CONFIG_SCHED_TIMESHARE_CORE */ 100 101 static unsigned int sched_nrun; 102 103 typedef void (*sched_avg_comp_t)( 104 void *param); 105 106 static struct sched_average { 107 sched_avg_comp_t comp; 108 void *param; 109 int period; /* in seconds */ 110 uint64_t deadline; 111 } sched_average[] = { 112 { compute_averunnable, &sched_nrun, 5, 0 }, 113 { compute_stack_target, NULL, 5, 1 }, 114 { compute_pageout_gc_throttle, NULL, 1, 0 }, 115 { compute_pmap_gc_throttle, NULL, 60, 0 }, 116 { compute_zone_working_set_size, NULL, ZONE_WSS_UPDATE_PERIOD, 0 }, 117 #if CONFIG_TELEMETRY 118 { compute_telemetry, NULL, 1, 0 }, 119 #endif 120 { NULL, NULL, 0, 0 } 121 }; 122 123 typedef struct sched_average *sched_average_t; 124 125 /* 126 * Scheduler load calculation algorithm 127 * 128 * The scheduler load values provide an estimate of the number of runnable 129 * timeshare threads in the system at various priority bands. The load 130 * ultimately affects the priority shifts applied to all threads in a band 131 * causing them to timeshare with other threads in the system. The load is 132 * maintained in buckets, with each bucket corresponding to a priority band. 133 * 134 * Each runnable thread on the system contributes its load to its priority 135 * band and to the bands above it. The contribution of a thread to the bands 136 * above it is not strictly 1:1 and is weighted based on the priority band 137 * of the thread. The rules of thread load contribution to each of its higher 138 * bands are as follows: 139 * 140 * - DF threads: Upto (2 * NCPUs) threads 141 * - UT threads: Upto NCPUs threads 142 * - BG threads: Upto 1 thread 143 * 144 * To calculate the load values, the various run buckets are sampled (every 145 * sched_load_compute_interval_abs) and the weighted contributions of the the 146 * lower bucket threads are added. The resultant value is plugged into an 147 * exponentially weighted moving average formula: 148 * new-load = alpha * old-load + (1 - alpha) * run-bucket-sample-count 149 * (where, alpha < 1) 150 * The calculations for the scheduler load are done using fixpoint math with 151 * a scale factor of 16 to avoid expensive divides and floating point 152 * operations. The final load values are a smooth curve representative of 153 * the actual number of runnable threads in a priority band. 154 */ 155 156 /* Maintains the current (scaled for fixpoint) load in various buckets */ 157 uint32_t sched_load[TH_BUCKET_MAX]; 158 159 /* 160 * Alpha factor for the EWMA alogrithm. The current values are chosen as 161 * 6:10 ("old load":"new samples") to make sure the scheduler reacts fast 162 * enough to changing system load but does not see too many spikes from bursty 163 * activity. The current values ensure that the scheduler would converge 164 * to the latest load in 2-3 sched_load_compute_interval_abs intervals 165 * (which amounts to ~30-45ms with current values). 166 */ 167 #define SCHED_LOAD_EWMA_ALPHA_OLD 6 168 #define SCHED_LOAD_EWMA_ALPHA_NEW 10 169 #define SCHED_LOAD_EWMA_ALPHA_SHIFT 4 170 static_assert((SCHED_LOAD_EWMA_ALPHA_OLD + SCHED_LOAD_EWMA_ALPHA_NEW) == (1ul << SCHED_LOAD_EWMA_ALPHA_SHIFT)); 171 172 /* For fixpoint EWMA, roundup the load to make it converge */ 173 #define SCHED_LOAD_EWMA_ROUNDUP(load) (((load) & (1ul << (SCHED_LOAD_EWMA_ALPHA_SHIFT - 1))) != 0) 174 175 /* Macro to convert scaled sched load to a real load value */ 176 #define SCHED_LOAD_EWMA_UNSCALE(load) (((load) >> SCHED_LOAD_EWMA_ALPHA_SHIFT) + SCHED_LOAD_EWMA_ROUNDUP(load)) 177 178 /* 179 * Routine to capture the latest runnable counts and update sched_load (only used for non-clutch schedulers) 180 */ 181 void 182 compute_sched_load(void) 183 { 184 /* 185 * Retrieve a snapshot of the current run counts. 186 * 187 * Why not a bcopy()? Because we need atomic word-sized reads of sched_run_buckets, 188 * not byte-by-byte copy. 189 */ 190 uint32_t ncpus = processor_avail_count; 191 uint32_t load_now[TH_BUCKET_MAX]; 192 193 load_now[TH_BUCKET_RUN] = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed); 194 load_now[TH_BUCKET_FIXPRI] = os_atomic_load(&sched_run_buckets[TH_BUCKET_FIXPRI], relaxed); 195 load_now[TH_BUCKET_SHARE_FG] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_FG], relaxed); 196 load_now[TH_BUCKET_SHARE_DF] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_DF], relaxed); 197 load_now[TH_BUCKET_SHARE_UT] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_UT], relaxed); 198 load_now[TH_BUCKET_SHARE_BG] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_BG], relaxed); 199 200 assert(load_now[TH_BUCKET_RUN] >= 0); 201 assert(load_now[TH_BUCKET_FIXPRI] >= 0); 202 203 uint32_t nthreads = load_now[TH_BUCKET_RUN]; 204 uint32_t nfixpri = load_now[TH_BUCKET_FIXPRI]; 205 206 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, 207 MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_LOAD) | DBG_FUNC_NONE, 208 load_now[TH_BUCKET_FIXPRI], (load_now[TH_BUCKET_SHARE_FG] + load_now[TH_BUCKET_SHARE_DF]), 209 load_now[TH_BUCKET_SHARE_BG], load_now[TH_BUCKET_SHARE_UT], 0); 210 211 /* 212 * Compute the timeshare priority conversion factor based on loading. 213 * Because our counters may be incremented and accessed 214 * concurrently with respect to each other, we may have 215 * windows where the invariant (nthreads - nfixpri) == (fg + df + bg + ut) 216 * is broken, so truncate values in these cases. 217 */ 218 uint32_t timeshare_threads = (nthreads - nfixpri); 219 for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) { 220 if (load_now[i] > timeshare_threads) { 221 load_now[i] = timeshare_threads; 222 } 223 } 224 225 /* 226 * Default threads contribute up to (NCPUS * 2) of load to FG threads 227 */ 228 if (load_now[TH_BUCKET_SHARE_DF] <= (ncpus * 2)) { 229 load_now[TH_BUCKET_SHARE_FG] += load_now[TH_BUCKET_SHARE_DF]; 230 } else { 231 load_now[TH_BUCKET_SHARE_FG] += (ncpus * 2); 232 } 233 234 /* 235 * Utility threads contribute up to NCPUS of load to FG & DF threads 236 */ 237 if (load_now[TH_BUCKET_SHARE_UT] <= ncpus) { 238 load_now[TH_BUCKET_SHARE_FG] += load_now[TH_BUCKET_SHARE_UT]; 239 load_now[TH_BUCKET_SHARE_DF] += load_now[TH_BUCKET_SHARE_UT]; 240 } else { 241 load_now[TH_BUCKET_SHARE_FG] += ncpus; 242 load_now[TH_BUCKET_SHARE_DF] += ncpus; 243 } 244 245 /* 246 * BG threads contribute up to 1 thread worth of load to FG, DF and UT threads 247 */ 248 if (load_now[TH_BUCKET_SHARE_BG] > 0) { 249 load_now[TH_BUCKET_SHARE_FG] += 1; 250 load_now[TH_BUCKET_SHARE_DF] += 1; 251 load_now[TH_BUCKET_SHARE_UT] += 1; 252 } 253 254 /* 255 * The conversion factor consists of two components: 256 * a fixed value based on the absolute time unit (sched_fixed_shift), 257 * and a dynamic portion based on load (sched_load_shifts). 258 * 259 * Zero load results in a out of range shift count. 260 */ 261 262 for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) { 263 uint32_t bucket_load = 0; 264 265 if (load_now[i] > ncpus) { 266 /* Normalize the load to number of CPUs */ 267 if (ncpus > 1) { 268 bucket_load = load_now[i] / ncpus; 269 } else { 270 bucket_load = load_now[i]; 271 } 272 273 if (bucket_load > MAX_LOAD) { 274 bucket_load = MAX_LOAD; 275 } 276 } 277 /* Plug the load values into the EWMA algorithm to calculate (scaled for fixpoint) sched_load */ 278 sched_load[i] = (sched_load[i] * SCHED_LOAD_EWMA_ALPHA_OLD) + ((bucket_load << SCHED_LOAD_EWMA_ALPHA_SHIFT) * SCHED_LOAD_EWMA_ALPHA_NEW); 279 sched_load[i] = sched_load[i] >> SCHED_LOAD_EWMA_ALPHA_SHIFT; 280 } 281 282 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, 283 MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_LOAD_EFFECTIVE) | DBG_FUNC_NONE, 284 SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_FG]), SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_DF]), 285 SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_UT]), SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_BG]), 0); 286 } 287 288 void 289 compute_averages(uint64_t stdelta) 290 { 291 uint32_t nthreads = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed) - 1; 292 uint32_t ncpus = processor_avail_count; 293 294 /* Update the global pri_shifts based on the latest values */ 295 for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) { 296 uint32_t bucket_load = SCHED_LOAD_EWMA_UNSCALE(sched_load[i]); 297 uint32_t shift = sched_fixed_shift - sched_load_shifts[bucket_load]; 298 299 if (shift > SCHED_PRI_SHIFT_MAX) { 300 sched_pri_shifts[i] = INT8_MAX; 301 } else { 302 sched_pri_shifts[i] = shift; 303 } 304 } 305 306 /* 307 * Sample total running threads for the load average calculation. 308 */ 309 sched_nrun = nthreads; 310 311 /* 312 * Load average and mach factor calculations for 313 * those which ask about these things. 314 */ 315 uint32_t average_now = nthreads * LOAD_SCALE; 316 uint32_t factor_now; 317 318 if (nthreads > ncpus) { 319 factor_now = (ncpus * LOAD_SCALE) / (nthreads + 1); 320 } else { 321 factor_now = (ncpus - nthreads) * LOAD_SCALE; 322 } 323 324 /* 325 * For those statistics that formerly relied on being recomputed 326 * on timer ticks, advance by the approximate number of corresponding 327 * elapsed intervals, thus compensating for potential idle intervals. 328 */ 329 for (uint32_t index = 0; index < stdelta; index++) { 330 sched_mach_factor = ((sched_mach_factor << 2) + factor_now) / 5; 331 sched_load_average = ((sched_load_average << 2) + average_now) / 5; 332 } 333 334 /* 335 * Compute old-style Mach load averages. 336 */ 337 for (uint32_t index = 0; index < stdelta; index++) { 338 for (uint32_t i = 0; i < 3; i++) { 339 mach_factor[i] = ((mach_factor[i] * fract[i]) + 340 (factor_now * (LOAD_SCALE - fract[i]))) / LOAD_SCALE; 341 342 avenrun[i] = ((avenrun[i] * fract[i]) + 343 (average_now * (LOAD_SCALE - fract[i]))) / LOAD_SCALE; 344 } 345 } 346 347 /* 348 * Compute averages in other components. 349 */ 350 uint64_t abstime = mach_absolute_time(); 351 352 for (sched_average_t avg = sched_average; avg->comp != NULL; ++avg) { 353 if (abstime >= avg->deadline) { 354 uint64_t period_abs = (avg->period * sched_one_second_interval); 355 uint64_t ninvokes = 1; 356 357 ninvokes += (abstime - avg->deadline) / period_abs; 358 ninvokes = MIN(ninvokes, SCHED_TICK_MAX_DELTA); 359 360 for (uint32_t index = 0; index < ninvokes; index++) { 361 (*avg->comp)(avg->param); 362 } 363 avg->deadline = abstime + period_abs; 364 } 365 } 366 }