/*
 * Copyright (c) 2000-2007 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 */
/*
 *	Author:	Avadis Tevanian, Jr.
 *	Date:	1986
 *
 *	Compute various averages.
 */

#include <mach/mach_types.h>

#include <kern/sched.h>
#include <kern/assert.h>
#include <kern/processor.h>
#include <kern/thread.h>
#if CONFIG_TELEMETRY
#include <kern/telemetry.h>
#endif
#include <kern/zalloc_internal.h>

#include <sys/kdebug.h>

uint32_t        avenrun[3] = {0, 0, 0};
uint32_t        mach_factor[3] = {0, 0, 0};

uint32_t        sched_load_average, sched_mach_factor;

#if defined(CONFIG_SCHED_TIMESHARE_CORE)
/*
 * Values are scaled by LOAD_SCALE, defined in processor_info.h
 */
#define base(n)         ((n) << SCHED_TICK_SHIFT)
#define frac(n)         (((base(n) - 1) * LOAD_SCALE) / base(n))

static uint32_t         fract[3] = {
	frac(5),                /* 5 second average */
	frac(30),               /* 30 second average */
	frac(60),               /* 1 minute average */
};
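
/*
 * For illustration (assuming SCHED_TICK_SHIFT == 3, i.e. 8 sched ticks
 * per second, and LOAD_SCALE == 1000, both defined elsewhere in the tree):
 *
 *	base(5) = 5 << 3 = 40 ticks
 *	frac(5) = ((40 - 1) * 1000) / 40 = 975
 *
 * so the 5-second average retains 975/1000 of its previous value on each
 * tick; likewise frac(30) = 995 and frac(60) = 997.
 */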

#undef base
#undef frac

#endif /* CONFIG_SCHED_TIMESHARE_CORE */

static unsigned int             sched_nrun;

typedef void    (*sched_avg_comp_t)(
	void                    *param);

static struct sched_average {
	sched_avg_comp_t        comp;
	void                    *param;
	int                     period; /* in seconds */
	uint64_t                deadline;
} sched_average[] = {
	{ compute_averunnable, &sched_nrun, 5, 0 },
	{ compute_stack_target, NULL, 5, 1 },
	{ compute_pageout_gc_throttle, NULL, 1, 0 },
	{ compute_pmap_gc_throttle, NULL, 60, 0 },
	{ compute_zone_working_set_size, NULL, ZONE_WSS_UPDATE_PERIOD, 0 },
#if CONFIG_TELEMETRY
	{ compute_telemetry, NULL, 1, 0 },
#endif
	{ NULL, NULL, 0, 0 }
};

typedef struct sched_average    *sched_average_t;

/*
 * Scheduler load calculation algorithm
 *
 * The scheduler load values provide an estimate of the number of runnable
 * timeshare threads in the system at various priority bands. The load
 * ultimately affects the priority shifts applied to all threads in a band,
 * causing them to timeshare with other threads in the system. The load is
 * maintained in buckets, with each bucket corresponding to a priority band.
 *
 * Each runnable thread on the system contributes its load to its priority
 * band and to the bands above it. The contribution of a thread to the bands
 * above it is not strictly 1:1 and is weighted based on the priority band
 * of the thread. The rules of thread load contribution to each of its higher
 * bands are as follows:
 *
 * - DF threads: Up to (2 * NCPUs) threads
 * - UT threads: Up to NCPUs threads
 * - BG threads: Up to 1 thread
 *
 * To calculate the load values, the various run buckets are sampled (every
 * sched_load_compute_interval_abs) and the weighted contributions of the
 * lower bucket threads are added. The resultant value is plugged into an
 * exponentially weighted moving average formula:
 *      new-load = alpha * old-load + (1 - alpha) * run-bucket-sample-count
 *      (where alpha < 1)
 * The calculations for the scheduler load are done using fixpoint math with
 * a scale factor of 16 to avoid expensive divides and floating point
 * operations. The final load values are a smooth curve representative of
 * the actual number of runnable threads in a priority band.
 */
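
/*
 * Illustrative example (numbers chosen arbitrarily): on a 4-CPU system
 * with 10 runnable UT threads and 3 runnable BG threads, the UT bucket
 * contributes MIN(10, NCPUs) = 4 threads of load to each of the FG and
 * DF buckets, while the BG bucket contributes MIN(3, 1) = 1 thread of
 * load to each of the FG, DF and UT buckets.
 */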

/* Maintains the current (fixpoint-scaled) load in various buckets */
uint32_t sched_load[TH_BUCKET_MAX];

/*
 * Alpha factor for the EWMA algorithm. The current values are chosen as
 * 6:10 ("old load":"new samples") to make sure the scheduler reacts fast
 * enough to changing system load but does not see too many spikes from
 * bursty activity. The current values ensure that the scheduler converges
 * to the latest load in 2-3 sched_load_compute_interval_abs intervals
 * (which amounts to ~30-45ms with current values).
 */
#define SCHED_LOAD_EWMA_ALPHA_OLD      6
#define SCHED_LOAD_EWMA_ALPHA_NEW      10
#define SCHED_LOAD_EWMA_ALPHA_SHIFT    4
static_assert((SCHED_LOAD_EWMA_ALPHA_OLD + SCHED_LOAD_EWMA_ALPHA_NEW) == (1ul << SCHED_LOAD_EWMA_ALPHA_SHIFT));

/* For the fixpoint EWMA, round up the load to make it converge */
#define SCHED_LOAD_EWMA_ROUNDUP(load)   (((load) & (1ul << (SCHED_LOAD_EWMA_ALPHA_SHIFT - 1))) != 0)

/* Macro to convert the scaled sched load to a real load value */
#define SCHED_LOAD_EWMA_UNSCALE(load)   (((load) >> SCHED_LOAD_EWMA_ALPHA_SHIFT) + SCHED_LOAD_EWMA_ROUNDUP(load))
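
/*
 * Worked example of the fixpoint update in compute_sched_load() below:
 * a bucket whose old scaled load is 32 (real load 2) samples 8 runnable
 * threads:
 *
 *	new = ((32 * 6) + ((8 << 4) * 10)) >> 4
 *	    = (192 + 1280) >> 4
 *	    = 92
 *
 * SCHED_LOAD_EWMA_UNSCALE(92) = (92 >> 4) + 1 = 6; repeating the same
 * sample yields 7 and then 8, matching the 2-3 interval convergence
 * noted above.
 */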

/*
 * Routine to capture the latest runnable counts and update sched_load
 * (used only by non-clutch schedulers)
 */
void
compute_sched_load(void)
{
	/*
	 * Retrieve a snapshot of the current run counts.
	 *
	 * Why not a bcopy()? Because we need atomic word-sized reads of
	 * sched_run_buckets, not a byte-by-byte copy.
	 */
	uint32_t ncpus = processor_avail_count;
	uint32_t load_now[TH_BUCKET_MAX];

	load_now[TH_BUCKET_RUN]      = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
	load_now[TH_BUCKET_FIXPRI]   = os_atomic_load(&sched_run_buckets[TH_BUCKET_FIXPRI], relaxed);
	load_now[TH_BUCKET_SHARE_FG] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_FG], relaxed);
	load_now[TH_BUCKET_SHARE_DF] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_DF], relaxed);
	load_now[TH_BUCKET_SHARE_UT] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_UT], relaxed);
	load_now[TH_BUCKET_SHARE_BG] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_BG], relaxed);

	assert(load_now[TH_BUCKET_RUN] >= 0);
	assert(load_now[TH_BUCKET_FIXPRI] >= 0);

	uint32_t nthreads = load_now[TH_BUCKET_RUN];
	uint32_t nfixpri  = load_now[TH_BUCKET_FIXPRI];

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_LOAD) | DBG_FUNC_NONE,
	    load_now[TH_BUCKET_FIXPRI], (load_now[TH_BUCKET_SHARE_FG] + load_now[TH_BUCKET_SHARE_DF]),
	    load_now[TH_BUCKET_SHARE_BG], load_now[TH_BUCKET_SHARE_UT], 0);

	/*
	 * Compute the timeshare priority conversion factor based on loading.
	 * Because our counters may be incremented and accessed
	 * concurrently with respect to each other, we may have
	 * windows where the invariant (nthreads - nfixpri) == (fg + df + bg + ut)
	 * is broken, so truncate values in these cases.
	 */
	uint32_t timeshare_threads = (nthreads - nfixpri);
	for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) {
		if (load_now[i] > timeshare_threads) {
			load_now[i] = timeshare_threads;
		}
	}

	/*
	 * Default threads contribute up to (NCPUS * 2) of load to FG threads
	 */
	if (load_now[TH_BUCKET_SHARE_DF] <= (ncpus * 2)) {
		load_now[TH_BUCKET_SHARE_FG] += load_now[TH_BUCKET_SHARE_DF];
	} else {
		load_now[TH_BUCKET_SHARE_FG] += (ncpus * 2);
	}

	/*
	 * Utility threads contribute up to NCPUS of load to FG & DF threads
	 */
	if (load_now[TH_BUCKET_SHARE_UT] <= ncpus) {
		load_now[TH_BUCKET_SHARE_FG] += load_now[TH_BUCKET_SHARE_UT];
		load_now[TH_BUCKET_SHARE_DF] += load_now[TH_BUCKET_SHARE_UT];
	} else {
		load_now[TH_BUCKET_SHARE_FG] += ncpus;
		load_now[TH_BUCKET_SHARE_DF] += ncpus;
	}

	/*
	 * BG threads contribute up to 1 thread's worth of load to FG, DF and UT threads
	 */
	if (load_now[TH_BUCKET_SHARE_BG] > 0) {
		load_now[TH_BUCKET_SHARE_FG] += 1;
		load_now[TH_BUCKET_SHARE_DF] += 1;
		load_now[TH_BUCKET_SHARE_UT] += 1;
	}

	/*
	 * The conversion factor consists of two components:
	 * a fixed value based on the absolute time unit (sched_fixed_shift),
	 * and a dynamic portion based on load (sched_load_shifts).
	 *
	 * Zero load results in an out-of-range shift count.
	 */

	for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) {
		uint32_t bucket_load = 0;

		if (load_now[i] > ncpus) {
			/* Normalize the load to the number of CPUs */
			if (ncpus > 1) {
				bucket_load = load_now[i] / ncpus;
			} else {
				bucket_load = load_now[i];
			}

			if (bucket_load > MAX_LOAD) {
				bucket_load = MAX_LOAD;
			}
		}
		/* Plug the load values into the EWMA algorithm to calculate the (fixpoint-scaled) sched_load */
		sched_load[i] = (sched_load[i] * SCHED_LOAD_EWMA_ALPHA_OLD) + ((bucket_load << SCHED_LOAD_EWMA_ALPHA_SHIFT) * SCHED_LOAD_EWMA_ALPHA_NEW);
		sched_load[i] = sched_load[i] >> SCHED_LOAD_EWMA_ALPHA_SHIFT;
	}

	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
	    MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_LOAD_EFFECTIVE) | DBG_FUNC_NONE,
	    SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_FG]), SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_DF]),
	    SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_UT]), SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_BG]), 0);
}

void
compute_averages(uint64_t stdelta)
{
	uint32_t nthreads = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed) - 1;
	uint32_t ncpus = processor_avail_count;

	/* Update the global pri_shifts based on the latest values */
	for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) {
		uint32_t bucket_load = SCHED_LOAD_EWMA_UNSCALE(sched_load[i]);
		uint32_t shift = sched_fixed_shift - sched_load_shifts[bucket_load];

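		/*
		 * sched_load_shifts[] is built at boot (see load_shift_init()
		 * in sched_prim.c); entry 0 is expected to be INT8_MIN, so an
		 * idle bucket produces an out-of-range shift and is clamped
		 * below, effectively disabling timeshare decay for that band.
		 */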
		if (shift > SCHED_PRI_SHIFT_MAX) {
			sched_pri_shifts[i] = INT8_MAX;
		} else {
			sched_pri_shifts[i] = shift;
		}
	}

	/*
	 * Sample the total number of running threads for the load average calculation.
	 */
	sched_nrun = nthreads;

	/*
	 * Load average and mach factor calculations for
	 * consumers that query these statistics.
	 */
	uint32_t average_now = nthreads * LOAD_SCALE;
	uint32_t factor_now;

	if (nthreads > ncpus) {
		factor_now = (ncpus * LOAD_SCALE) / (nthreads + 1);
	} else {
		factor_now = (ncpus - nthreads) * LOAD_SCALE;
	}
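
	/*
	 * For example (assuming LOAD_SCALE == 1000): with 4 CPUs and 2
	 * runnable threads, factor_now = (4 - 2) * 1000 = 2000, i.e. two
	 * CPUs' worth of spare capacity; with 8 runnable threads,
	 * factor_now = (4 * 1000) / (8 + 1) = 444, under half a CPU per
	 * runnable thread.
	 */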

	/*
	 * For those statistics that formerly relied on being recomputed
	 * on timer ticks, advance by the approximate number of corresponding
	 * elapsed intervals, thus compensating for potential idle intervals.
	 */
	for (uint32_t index = 0; index < stdelta; index++) {
		sched_mach_factor = ((sched_mach_factor << 2) + factor_now) / 5;
		sched_load_average = ((sched_load_average << 2) + average_now) / 5;
	}

	/*
	 * Compute old-style Mach load averages.
	 */
	for (uint32_t index = 0; index < stdelta; index++) {
		for (uint32_t i = 0; i < 3; i++) {
			mach_factor[i] = ((mach_factor[i] * fract[i]) +
			    (factor_now * (LOAD_SCALE - fract[i]))) / LOAD_SCALE;

			avenrun[i] = ((avenrun[i] * fract[i]) +
			    (average_now * (LOAD_SCALE - fract[i]))) / LOAD_SCALE;
		}
	}
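
	/*
	 * For example, with fract[0] == 975 (see the fract table above,
	 * under the LOAD_SCALE == 1000 assumption), each interval blends
	 * 2.5% of the instantaneous load into the 5-second average:
	 *
	 *	avenrun[0] = ((avenrun[0] * 975) + (average_now * 25)) / 1000
	 */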

	/*
	 * Compute averages in other components.
	 */
	uint64_t abstime = mach_absolute_time();

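	/*
	 * Each module is invoked once per elapsed period: for example, a
	 * module with a 5-second period whose deadline passed 12 seconds
	 * ago is invoked 1 + (12 / 5) = 3 times, capped at
	 * SCHED_TICK_MAX_DELTA to bound the catch-up work after a long
	 * idle or sleep.
	 */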
	for (sched_average_t avg = sched_average; avg->comp != NULL; ++avg) {
		if (abstime >= avg->deadline) {
			uint64_t period_abs = (avg->period * sched_one_second_interval);
			uint64_t ninvokes = 1;

			ninvokes += (abstime - avg->deadline) / period_abs;
			ninvokes = MIN(ninvokes, SCHED_TICK_MAX_DELTA);

			for (uint32_t index = 0; index < ninvokes; index++) {
				(*avg->comp)(avg->param);
			}
			avg->deadline = abstime + period_abs;
		}
	}
}