/*
 * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 */
/*
 *	File:	priority.c
 *	Author:	Avadis Tevanian, Jr.
 *	Date:	1986
 *
 *	Priority related scheduler bits.
 */

#include <mach/boolean.h>
#include <mach/kern_return.h>
#include <mach/machine.h>
#include <kern/host.h>
#include <kern/mach_param.h>
#include <kern/sched.h>
#include <sys/kdebug.h>
#include <kern/spl.h>
#include <kern/thread.h>
#include <kern/processor.h>
#include <kern/ledger.h>
#include <machine/machparam.h>
#include <kern/machine.h>
#include <kern/policy_internal.h>
#include <kern/sched_clutch.h>

#ifdef CONFIG_MACH_APPROXIMATE_TIME
#include <machine/commpage.h>  /* for commpage_update_mach_approximate_time */
#endif

#if MONOTONIC
#include <kern/monotonic.h>
#endif /* MONOTONIC */

/*
 *	thread_quantum_expire:
 *
 *	Recalculate the quantum and priority for a thread.
 *
 *	Called at splsched.
 */

void
thread_quantum_expire(
	timer_call_param_t      p0,
	timer_call_param_t      p1)
{
	processor_t                     processor = p0;
	thread_t                        thread = p1;
	ast_t                           preempt;
	uint64_t                        ctime;

	assert(processor == current_processor());
	assert(thread == current_thread());

	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_QUANTUM_EXPIRED) | DBG_FUNC_START, 0, 0, 0, 0, 0);

	SCHED_STATS_INC(quantum_timer_expirations);

	/*
	 * We bill CPU time to both the individual thread and its task.
	 *
	 * Because this balance adjustment could potentially attempt to wake this
	 * very thread, we must credit the ledger before taking the thread lock.
	 * The ledger pointers are only manipulated by the thread itself at the ast
	 * boundary.
	 *
	 * TODO: This fails to account for the time between when the timer was
	 * armed and when it fired.  It should be based on the system_timer and
	 * running a timer_update operation here.
	 */
	ledger_credit(thread->t_ledger, task_ledgers.cpu_time, thread->quantum_remaining);
	ledger_credit(thread->t_threadledger, thread_ledgers.cpu_time, thread->quantum_remaining);
	if (thread->t_bankledger) {
		ledger_credit(thread->t_bankledger, bank_ledgers.cpu_time,
		    (thread->quantum_remaining - thread->t_deduct_bank_ledger_time));
	}
	thread->t_deduct_bank_ledger_time = 0;
	ctime = mach_absolute_time();

#ifdef CONFIG_MACH_APPROXIMATE_TIME
	commpage_update_mach_approximate_time(ctime);
#endif
	sched_update_pset_avg_execution_time(processor->processor_set, thread->quantum_remaining, ctime, thread->th_sched_bucket);

#if MONOTONIC
	mt_sched_update(thread);
#endif /* MONOTONIC */

	thread_lock(thread);

	/*
	 * We've run up until our quantum expiration, and will (potentially)
	 * continue without re-entering the scheduler, so update this now.
	 */
	processor->last_dispatch = ctime;
	thread->last_run_time = ctime;

	/*
	 *	Check for fail-safe trip.
	 */
	if ((thread->sched_mode == TH_MODE_REALTIME || thread->sched_mode == TH_MODE_FIXED) &&
	    !(thread->sched_flags & TH_SFLAG_PROMOTED) &&
	    (thread->kern_promotion_schedpri == 0) &&
	    !(thread->sched_flags & TH_SFLAG_PROMOTE_REASON_MASK) &&
	    !(thread->options & TH_OPT_SYSTEM_CRITICAL)) {
		uint64_t new_computation;

		new_computation = ctime - thread->computation_epoch;
		new_computation += thread->computation_metered;
		if (new_computation > max_unsafe_computation) {
			KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_FAILSAFE) | DBG_FUNC_NONE,
			    (uintptr_t)thread->sched_pri, (uintptr_t)thread->sched_mode, 0, 0, 0);

			thread->safe_release = ctime + sched_safe_duration;

			sched_thread_mode_demote(thread, TH_SFLAG_FAILSAFE);
		}
	}
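
	/*
	 * Worked example (illustrative, not in the original source): if a
	 * fixed-priority thread's computation_metered, plus the time elapsed
	 * since computation_epoch, exceeds max_unsafe_computation of
	 * continuous computation, the fail-safe demotes it to timeshare and
	 * arms safe_release so that update_priority() can un-demote it once
	 * sched_safe_duration has passed.  Promoted and system-critical
	 * threads are exempt, per the condition above.
	 */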

	/*
	 *	Recompute scheduled priority if appropriate.
	 */
	if (SCHED(can_update_priority)(thread)) {
		SCHED(update_priority)(thread);
	} else {
		SCHED(lightweight_update_priority)(thread);
	}

	if (thread->sched_mode != TH_MODE_REALTIME) {
		SCHED(quantum_expire)(thread);
	}

	/*
	 *	This quantum is up; give this thread another.
	 */
	processor->first_timeslice = FALSE;

	thread_quantum_init(thread);

	/* Reload precise timing global policy to thread-local policy */
	thread->precise_user_kernel_time = use_precise_user_kernel_time(thread);

	/*
	 * Since non-precise user/kernel time doesn't update the state/thread timer
	 * during privilege transitions, synthesize an event now.
	 */
	if (!thread->precise_user_kernel_time) {
		timer_update(processor->current_state, ctime);
		timer_update(processor->thread_timer, ctime);
		timer_update(&thread->runnable_timer, ctime);
	}

	processor->quantum_end = ctime + thread->quantum_remaining;

	/*
	 * Context switch check
	 *
	 * Non-urgent flags don't affect kernel threads, so upgrade to urgent
	 * to ensure that rebalancing and non-recommendation kick in quickly.
	 */

	ast_t check_reason = AST_QUANTUM;
	if (thread->task == kernel_task) {
		check_reason |= AST_URGENT;
	}

	if ((preempt = csw_check(thread, processor, check_reason)) != AST_NONE) {
		ast_on(preempt);
	}

	/*
	 * AST_KEVENT does not send an IPI when setting the AST;
	 * to avoid waiting for the next context switch to propagate it,
	 * the AST is propagated here at quantum expiration.
	 */
	ast_propagate(thread);

	thread_unlock(thread);
	running_timer_enter(processor, RUNNING_TIMER_QUANTUM, thread,
	    processor->quantum_end, ctime);

	/* Tell platform layer that we are still running this thread */
	thread_urgency_t urgency = thread_get_urgency(thread, NULL, NULL);
	machine_thread_going_on_core(thread, urgency, 0, 0, ctime);
	machine_switch_perfcontrol_state_update(QUANTUM_EXPIRY, ctime,
	    0, thread);

#if defined(CONFIG_SCHED_TIMESHARE_CORE)
	sched_timeshare_consider_maintenance(ctime);
#endif /* CONFIG_SCHED_TIMESHARE_CORE */

#if __arm__ || __arm64__
	if (thread->sched_mode == TH_MODE_REALTIME) {
		sched_consider_recommended_cores(ctime, thread);
	}
#endif /* __arm__ || __arm64__ */

	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_QUANTUM_EXPIRED) | DBG_FUNC_END, preempt, 0, 0, 0, 0);
}

/*
 *	sched_set_thread_base_priority:
 *
 *	Set the base priority of the thread
 *	and reset its scheduled priority.
 *
 *	This is the only path to change base_pri.
 *
 *	Called with the thread locked.
 */
void
sched_set_thread_base_priority(thread_t thread, int priority)
{
	assert(priority >= MINPRI);
	uint64_t ctime = 0;

	if (thread->sched_mode == TH_MODE_REALTIME) {
		assert(priority <= BASEPRI_RTQUEUES);
	} else {
		assert(priority < BASEPRI_RTQUEUES);
	}

	int old_base_pri = thread->base_pri;
	thread->req_base_pri = (int16_t)priority;
	if (thread->sched_flags & TH_SFLAG_BASE_PRI_FROZEN) {
		priority = MAX(priority, old_base_pri);
	}
	thread->base_pri = (int16_t)priority;

	if ((thread->state & TH_RUN) == TH_RUN) {
		assert(thread->last_made_runnable_time != THREAD_NOT_RUNNABLE);
		ctime = mach_approximate_time();
		thread->last_basepri_change_time = ctime;
	} else {
		assert(thread->last_basepri_change_time == THREAD_NOT_RUNNABLE);
		assert(thread->last_made_runnable_time == THREAD_NOT_RUNNABLE);
	}

	/*
	 * Currently the perfcontrol_attr depends on the base pri of the
	 * thread. Therefore, we use this function as the hook for the
	 * perfcontrol callout.
	 */
	if (thread == current_thread() && old_base_pri != priority) {
		if (!ctime) {
			ctime = mach_approximate_time();
		}
		machine_switch_perfcontrol_state_update(PERFCONTROL_ATTR_UPDATE,
		    ctime, PERFCONTROL_CALLOUT_WAKE_UNSAFE, thread);
	}
#if !CONFIG_SCHED_CLUTCH
	/* For the clutch scheduler, this operation is done in set_sched_pri() */
	SCHED(update_thread_bucket)(thread);
#endif /* !CONFIG_SCHED_CLUTCH */

	thread_recompute_sched_pri(thread, SETPRI_DEFAULT);
}

/*
 *	sched_set_kernel_thread_priority:
 *
 *	Set the absolute base priority of the thread
 *	and reset its scheduled priority.
 *
 *	Called with the thread unlocked.
 */
void
sched_set_kernel_thread_priority(thread_t thread, int new_priority)
{
	spl_t s = splsched();

	thread_lock(thread);

	assert(thread->sched_mode != TH_MODE_REALTIME);
	assert(thread->effective_policy.thep_qos == THREAD_QOS_UNSPECIFIED);

	if (new_priority > thread->max_priority) {
		new_priority = thread->max_priority;
	}
#if !defined(XNU_TARGET_OS_OSX)
	if (new_priority < MAXPRI_THROTTLE) {
		new_priority = MAXPRI_THROTTLE;
	}
#endif /* !defined(XNU_TARGET_OS_OSX) */

	thread->importance = new_priority - thread->task_priority;

	sched_set_thread_base_priority(thread, new_priority);

	thread_unlock(thread);
	splx(s);
}
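
/*
 * Usage sketch (illustrative, not part of the original file): a caller that
 * owns a kernel thread and holds no scheduling locks might raise it into
 * the kernel band like so:
 *
 *	sched_set_kernel_thread_priority(thread, BASEPRI_KERNEL);
 *
 * The function clamps the request to thread->max_priority, takes the thread
 * lock at splsched itself, and funnels the change through
 * sched_set_thread_base_priority().
 */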

/*
 *	thread_recompute_sched_pri:
 *
 *	Reset the scheduled priority of the thread
 *	according to its base priority if the
 *	thread has not been promoted or depressed.
 *
 *	This is the only way to push base_pri changes into sched_pri,
 *	or to recalculate the appropriate sched_pri after changing
 *	a promotion or depression.
 *
 *	Called at splsched with the thread locked.
 *
 *	TODO: Add an 'update urgency' flag to avoid urgency callouts on every rwlock operation
 */
void
thread_recompute_sched_pri(thread_t thread, set_sched_pri_options_t options)
{
	uint32_t     sched_flags = thread->sched_flags;
	sched_mode_t sched_mode  = thread->sched_mode;

	int16_t priority = thread->base_pri;

	if (sched_mode == TH_MODE_TIMESHARE) {
		priority = (int16_t)SCHED(compute_timeshare_priority)(thread);
	}

	if (sched_flags & TH_SFLAG_DEPRESS) {
		/* thread_yield_internal overrides kernel mutex promotion */
		priority = DEPRESSPRI;
	} else {
		/* poll-depress is overridden by mutex promotion and promote-reasons */
		if (sched_flags & TH_SFLAG_POLLDEPRESS) {
			priority = DEPRESSPRI;
		}

		if (thread->kern_promotion_schedpri > 0) {
			priority = MAX(priority, thread->kern_promotion_schedpri);

			if (sched_mode != TH_MODE_REALTIME) {
				priority = MIN(priority, MAXPRI_PROMOTE);
			}
		}

		if (sched_flags & TH_SFLAG_PROMOTED) {
			priority = MAX(priority, thread->promotion_priority);

			if (sched_mode != TH_MODE_REALTIME) {
				priority = MIN(priority, MAXPRI_PROMOTE);
			}
		}

		if (sched_flags & TH_SFLAG_PROMOTE_REASON_MASK) {
			if (sched_flags & TH_SFLAG_RW_PROMOTED) {
				priority = MAX(priority, MINPRI_RWLOCK);
			}

			if (sched_flags & TH_SFLAG_WAITQ_PROMOTED) {
				priority = MAX(priority, MINPRI_WAITQ);
			}

			if (sched_flags & TH_SFLAG_EXEC_PROMOTED) {
				priority = MAX(priority, MINPRI_EXEC);
			}
		}
	}

	set_sched_pri(thread, priority, options);
}
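
/*
 * Worked example (illustrative, with hypothetical values): a timeshare
 * thread whose decayed timeshare priority computes to 31 while it holds a
 * kernel mutex with kern_promotion_schedpri = 80 is lifted to 80 by the
 * MAX(), then bounded by MIN(priority, MAXPRI_PROMOTE) so a non-realtime
 * thread cannot be promoted into the realtime band.  Promote-reason floors
 * such as MINPRI_RWLOCK stack on top via MAX(), so the highest applicable
 * floor wins.
 */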

void
sched_default_quantum_expire(thread_t thread __unused)
{
	/*
	 * No special behavior when a timeshare, fixed, or realtime thread
	 * uses up its entire quantum.
	 */
}

int smt_timeshare_enabled = 1;
int smt_sched_bonus_16ths = 8;

#if defined(CONFIG_SCHED_TIMESHARE_CORE)

/*
 *	lightweight_update_priority:
 *
 *	Update the scheduled priority for
 *	a timesharing thread.
 *
 *	Only for use on the current thread.
 *
 *	Called with the thread locked.
 */
void
lightweight_update_priority(thread_t thread)
{
	assert(thread->runq == PROCESSOR_NULL);
	assert(thread == current_thread());

	if (thread->sched_mode == TH_MODE_TIMESHARE) {
		int priority;
		uint32_t delta;

		thread_timer_delta(thread, delta);

		/*
		 *	Accumulate timesharing usage only
		 *	during contention for processor
		 *	resources.
		 */
		if (thread->pri_shift < INT8_MAX) {
			if (thread_no_smt(thread) && smt_timeshare_enabled) {
				thread->sched_usage += (delta + ((delta * smt_sched_bonus_16ths) >> 4));
			} else {
				thread->sched_usage += delta;
			}
		}

		thread->cpu_delta += delta;

#if CONFIG_SCHED_CLUTCH
		/*
		 * Update the CPU usage for the thread group to which the thread belongs.
		 * The implementation assumes that the thread ran for the entire delta
		 * as part of the same thread group.
		 */
		sched_clutch_cpu_usage_update(thread, delta);
#endif /* CONFIG_SCHED_CLUTCH */

		priority = sched_compute_timeshare_priority(thread);

		if (priority != thread->sched_pri) {
			thread_recompute_sched_pri(thread, SETPRI_LAZY);
		}
	}
}
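
/*
 * Worked example (illustrative): with the default smt_sched_bonus_16ths of
 * 8, a no-SMT thread that accumulates delta = 1000 units of CPU time is
 * charged 1000 + ((1000 * 8) >> 4) = 1500 units of sched_usage, a 1.5x
 * surcharge for denying the sibling hardware thread to other work.
 */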

/*
 *	Define shifts for simulating (5/8) ** n
 *
 *	Shift structures for holding update shifts.  The actual computation
 *	is  usage = (usage >> shift1) +/- (usage >> abs(shift2)),  where the
 *	+/- is determined by the sign of shift2.
 */

const struct shift_data        sched_decay_shifts[SCHED_DECAY_TICKS] = {
	{ .shift1 = 1, .shift2 = 1 },
	{ .shift1 = 1, .shift2 = 3 },
	{ .shift1 = 1, .shift2 = -3 },
	{ .shift1 = 2, .shift2 = -7 },
	{ .shift1 = 3, .shift2 = 5 },
	{ .shift1 = 3, .shift2 = -5 },
	{ .shift1 = 4, .shift2 = -8 },
	{ .shift1 = 5, .shift2 = 7 },
	{ .shift1 = 5, .shift2 = -7 },
	{ .shift1 = 6, .shift2 = -10 },
	{ .shift1 = 7, .shift2 = 10 },
	{ .shift1 = 7, .shift2 = -9 },
	{ .shift1 = 8, .shift2 = -11 },
	{ .shift1 = 9, .shift2 = 12 },
	{ .shift1 = 9, .shift2 = -11 },
	{ .shift1 = 10, .shift2 = -13 },
	{ .shift1 = 11, .shift2 = 14 },
	{ .shift1 = 11, .shift2 = -13 },
	{ .shift1 = 12, .shift2 = -15 },
	{ .shift1 = 13, .shift2 = 17 },
	{ .shift1 = 13, .shift2 = -15 },
	{ .shift1 = 14, .shift2 = -17 },
	{ .shift1 = 15, .shift2 = 19 },
	{ .shift1 = 16, .shift2 = 18 },
	{ .shift1 = 16, .shift2 = -19 },
	{ .shift1 = 17, .shift2 = 22 },
	{ .shift1 = 18, .shift2 = 20 },
	{ .shift1 = 18, .shift2 = -20 },
	{ .shift1 = 19, .shift2 = 26 },
	{ .shift1 = 20, .shift2 = 22 },
	{ .shift1 = 20, .shift2 = -22 },
	{ .shift1 = 21, .shift2 = -27 }
};
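
/*
 * Worked example (illustrative): entry 0 is { 1, 1 }, so usage becomes
 * (usage >> 1) + (usage >> 1) = usage, the identity for (5/8)^0.  Entry 1
 * is { 1, 3 }: (usage >> 1) + (usage >> 3) = 0.625 * usage, exactly 5/8.
 * Entry 2 is { 1, -3 }, where the negative shift2 selects subtraction:
 * (usage >> 1) - (usage >> 3) = 0.375 * usage, approximating
 * (5/8)^2 = 0.390625.
 */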

/*
 *	sched_compute_timeshare_priority:
 *
 *	Calculate the timesharing priority based upon usage and load.
 */
extern int sched_pri_decay_band_limit;

/* Only use the decay floor logic on non-macOS and non-clutch schedulers */
#if !defined(XNU_TARGET_OS_OSX) && !CONFIG_SCHED_CLUTCH

int
sched_compute_timeshare_priority(thread_t thread)
{
	int decay_amount;
	int decay_limit = sched_pri_decay_band_limit;

	if (thread->base_pri > BASEPRI_FOREGROUND) {
		decay_limit += (thread->base_pri - BASEPRI_FOREGROUND);
	}

	if (thread->pri_shift == INT8_MAX) {
		decay_amount = 0;
	} else {
		decay_amount = (thread->sched_usage >> thread->pri_shift);
	}

	if (decay_amount > decay_limit) {
		decay_amount = decay_limit;
	}

	/* start with base priority */
	int priority = thread->base_pri - decay_amount;

	if (priority < MAXPRI_THROTTLE) {
		if (thread->task->max_priority > MAXPRI_THROTTLE) {
			priority = MAXPRI_THROTTLE;
		} else if (priority < MINPRI_USER) {
			priority = MINPRI_USER;
		}
	} else if (priority > MAXPRI_KERNEL) {
		priority = MAXPRI_KERNEL;
	}

	return priority;
}

#else /* !defined(XNU_TARGET_OS_OSX) && !CONFIG_SCHED_CLUTCH */

int
sched_compute_timeshare_priority(thread_t thread)
{
	/* start with base priority */
	int priority = thread->base_pri;

	if (thread->pri_shift != INT8_MAX) {
		priority -= (thread->sched_usage >> thread->pri_shift);
	}

	if (priority < MINPRI_USER) {
		priority = MINPRI_USER;
	} else if (priority > MAXPRI_KERNEL) {
		priority = MAXPRI_KERNEL;
	}

	return priority;
}

#endif /* !defined(XNU_TARGET_OS_OSX) && !CONFIG_SCHED_CLUTCH */
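
/*
 * Worked example (illustrative, hypothetical values): a thread with
 * base_pri = 31, sched_usage = 640, and pri_shift = 4 has a raw decay of
 * 640 >> 4 = 40; on the decay-floor path that amount is first capped at
 * decay_limit, and the resulting priority is then clamped by the
 * MAXPRI_THROTTLE / MINPRI_USER floors.  A pri_shift of INT8_MAX means the
 * system is uncontended, so nothing is subtracted and the thread keeps its
 * base priority.
 */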

/*
 *	can_update_priority
 *
 *	Make sure we don't do re-dispatches more frequently than a scheduler tick.
 *
 *	Called with the thread locked.
 */
boolean_t
can_update_priority(
	thread_t        thread)
{
	return sched_tick != thread->sched_stamp;
}

/*
 *	update_priority
 *
 *	Perform housekeeping operations driven by scheduler tick.
 *
 *	Called with the thread locked.
 */
void
update_priority(
	thread_t        thread)
{
	uint32_t ticks, delta;

	ticks = sched_tick - thread->sched_stamp;
	assert(ticks != 0);

	thread->sched_stamp += ticks;

	/* If requested, accelerate aging of sched_usage */
	if (sched_decay_usage_age_factor > 1) {
		ticks *= sched_decay_usage_age_factor;
	}

	/*
	 *	Gather cpu usage data.
	 */
	thread_timer_delta(thread, delta);
	if (ticks < SCHED_DECAY_TICKS) {
		/*
		 *	Accumulate timesharing usage only during contention for processor
		 *	resources. Use the pri_shift from the previous tick window to
		 *	determine if the system was in a contended state.
		 */
		if (thread->pri_shift < INT8_MAX) {
			if (thread_no_smt(thread) && smt_timeshare_enabled) {
				thread->sched_usage += (delta + ((delta * smt_sched_bonus_16ths) >> 4));
			} else {
				thread->sched_usage += delta;
			}
		}

		thread->cpu_usage += delta + thread->cpu_delta;
		thread->cpu_delta = 0;

#if CONFIG_SCHED_CLUTCH
		/*
		 * Update the CPU usage for the thread group to which the thread belongs.
		 * The implementation assumes that the thread ran for the entire delta
		 * as part of the same thread group.
		 */
		sched_clutch_cpu_usage_update(thread, delta);
#endif /* CONFIG_SCHED_CLUTCH */

		const struct shift_data *shiftp = &sched_decay_shifts[ticks];

		if (shiftp->shift2 > 0) {
			thread->cpu_usage =   (thread->cpu_usage >> shiftp->shift1) +
			    (thread->cpu_usage >> shiftp->shift2);
			thread->sched_usage = (thread->sched_usage >> shiftp->shift1) +
			    (thread->sched_usage >> shiftp->shift2);
		} else {
			thread->cpu_usage =   (thread->cpu_usage >>   shiftp->shift1) -
			    (thread->cpu_usage >> -(shiftp->shift2));
			thread->sched_usage = (thread->sched_usage >>   shiftp->shift1) -
			    (thread->sched_usage >> -(shiftp->shift2));
		}
	} else {
		thread->cpu_usage = thread->cpu_delta = 0;
		thread->sched_usage = 0;
	}

	/*
	 *	Check for fail-safe release.
	 */
	if ((thread->sched_flags & TH_SFLAG_FAILSAFE) &&
	    mach_absolute_time() >= thread->safe_release) {
		sched_thread_mode_undemote(thread, TH_SFLAG_FAILSAFE);
	}

	/*
	 * Now that the thread's CPU usage has been accumulated and aged
	 * based on contention of the previous tick window, update the
	 * pri_shift of the thread to match the current global load/shift
	 * values. The updated pri_shift is then used to calculate the
	 * new priority of the thread.
	 */
#if CONFIG_SCHED_CLUTCH
	thread->pri_shift = sched_clutch_thread_pri_shift(thread, thread->th_sched_bucket);
#else /* CONFIG_SCHED_CLUTCH */
	thread->pri_shift = sched_pri_shifts[thread->th_sched_bucket];
#endif /* CONFIG_SCHED_CLUTCH */

	/* Recompute scheduled priority if appropriate. */
	if (thread->sched_mode == TH_MODE_TIMESHARE) {
		thread_recompute_sched_pri(thread, SETPRI_LAZY);
	}
}

#endif /* CONFIG_SCHED_TIMESHARE_CORE */

/*
 * TH_BUCKET_RUN is a count of *all* runnable non-idle threads.
 * Each other bucket is a count of the runnable non-idle threads
 * with that property. All updates to these counts should be
 * performed with os_atomic_* operations.
 *
 * For the clutch scheduler, this global bucket is used only for
 * keeping the total global run count.
 */
uint32_t       sched_run_buckets[TH_BUCKET_MAX];
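
/*
 * Usage sketch (illustrative, not part of the original file): a reader that
 * wants an instantaneous snapshot of the runnable count uses the matching
 * atomics, e.g.
 *
 *	uint32_t nrun = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
 *
 * The per-class buckets (TH_BUCKET_SHARE_FG, TH_BUCKET_SHARE_BG, ...) break
 * that total down, except under the clutch scheduler, which maintains only
 * the TH_BUCKET_RUN total here.
 */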

static void
sched_incr_bucket(sched_bucket_t bucket)
{
	assert(bucket >= TH_BUCKET_FIXPRI &&
	    bucket <= TH_BUCKET_SHARE_BG);

	os_atomic_inc(&sched_run_buckets[bucket], relaxed);
}

static void
sched_decr_bucket(sched_bucket_t bucket)
{
	assert(bucket >= TH_BUCKET_FIXPRI &&
	    bucket <= TH_BUCKET_SHARE_BG);

	assert(os_atomic_load(&sched_run_buckets[bucket], relaxed) > 0);

	os_atomic_dec(&sched_run_buckets[bucket], relaxed);
}

static void
sched_add_bucket(sched_bucket_t bucket, uint8_t run_weight)
{
	assert(bucket >= TH_BUCKET_FIXPRI &&
	    bucket <= TH_BUCKET_SHARE_BG);

	os_atomic_add(&sched_run_buckets[bucket], run_weight, relaxed);
}

static void
sched_sub_bucket(sched_bucket_t bucket, uint8_t run_weight)
{
	assert(bucket >= TH_BUCKET_FIXPRI &&
	    bucket <= TH_BUCKET_SHARE_BG);

	assert(os_atomic_load(&sched_run_buckets[bucket], relaxed) > 0);

	os_atomic_sub(&sched_run_buckets[bucket], run_weight, relaxed);
}

uint32_t
sched_run_incr(thread_t thread)
{
	assert((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN);

	uint32_t new_count = os_atomic_inc(&sched_run_buckets[TH_BUCKET_RUN], relaxed);

	sched_incr_bucket(thread->th_sched_bucket);

	return new_count;
}

uint32_t
sched_run_decr(thread_t thread)
{
	assert((thread->state & (TH_RUN | TH_IDLE)) != TH_RUN);

	sched_decr_bucket(thread->th_sched_bucket);

	uint32_t new_count = os_atomic_dec(&sched_run_buckets[TH_BUCKET_RUN], relaxed);

	return new_count;
}

uint32_t
sched_smt_run_incr(thread_t thread)
{
	assert((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN);

	uint8_t run_weight = (thread_no_smt(thread) && smt_timeshare_enabled) ? 2 : 1;
	thread->sched_saved_run_weight = run_weight;

	uint32_t new_count = os_atomic_add(&sched_run_buckets[TH_BUCKET_RUN], run_weight, relaxed);

	sched_add_bucket(thread->th_sched_bucket, run_weight);

	return new_count;
}

uint32_t
sched_smt_run_decr(thread_t thread)
{
	assert((thread->state & (TH_RUN | TH_IDLE)) != TH_RUN);

	uint8_t run_weight = thread->sched_saved_run_weight;

	sched_sub_bucket(thread->th_sched_bucket, run_weight);

	uint32_t new_count = os_atomic_sub(&sched_run_buckets[TH_BUCKET_RUN], run_weight, relaxed);

	return new_count;
}
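
/*
 * Note (illustrative): on SMT systems a no-SMT thread enters the run counts
 * with weight 2 rather than 1, so load calculations account for the sibling
 * hardware thread it effectively occupies.  The weight is captured in
 * sched_saved_run_weight at incr time so the matching decr removes exactly
 * what was added, even if thread_no_smt() or smt_timeshare_enabled changes
 * in between.
 */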

void
sched_update_thread_bucket(thread_t thread)
{
	sched_bucket_t old_bucket = thread->th_sched_bucket;
	sched_bucket_t new_bucket = TH_BUCKET_RUN;

	switch (thread->sched_mode) {
	case TH_MODE_FIXED:
	case TH_MODE_REALTIME:
		new_bucket = TH_BUCKET_FIXPRI;
		break;

	case TH_MODE_TIMESHARE:
		if (thread->base_pri > BASEPRI_DEFAULT) {
			new_bucket = TH_BUCKET_SHARE_FG;
		} else if (thread->base_pri > BASEPRI_UTILITY) {
			new_bucket = TH_BUCKET_SHARE_DF;
		} else if (thread->base_pri > MAXPRI_THROTTLE) {
			new_bucket = TH_BUCKET_SHARE_UT;
		} else {
			new_bucket = TH_BUCKET_SHARE_BG;
		}
		break;

	default:
		panic("unexpected mode: %d", thread->sched_mode);
		break;
	}

	if (old_bucket != new_bucket) {
		thread->th_sched_bucket = new_bucket;
		thread->pri_shift = sched_pri_shifts[new_bucket];

		if ((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN) {
			sched_decr_bucket(old_bucket);
			sched_incr_bucket(new_bucket);
		}
	}
}
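
/*
 * Worked example (illustrative, assuming the usual priority constants
 * BASEPRI_DEFAULT = 31, BASEPRI_UTILITY = 20, MAXPRI_THROTTLE = 4): a
 * timeshare thread at base_pri 37 maps to TH_BUCKET_SHARE_FG, 31 to
 * TH_BUCKET_SHARE_DF, 20 to TH_BUCKET_SHARE_UT, and 4 to
 * TH_BUCKET_SHARE_BG.  Fixed and realtime threads always map to
 * TH_BUCKET_FIXPRI regardless of base priority.
 */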

void
sched_smt_update_thread_bucket(thread_t thread)
{
	sched_bucket_t old_bucket = thread->th_sched_bucket;
	sched_bucket_t new_bucket = TH_BUCKET_RUN;

	switch (thread->sched_mode) {
	case TH_MODE_FIXED:
	case TH_MODE_REALTIME:
		new_bucket = TH_BUCKET_FIXPRI;
		break;

	case TH_MODE_TIMESHARE:
		if (thread->base_pri > BASEPRI_DEFAULT) {
			new_bucket = TH_BUCKET_SHARE_FG;
		} else if (thread->base_pri > BASEPRI_UTILITY) {
			new_bucket = TH_BUCKET_SHARE_DF;
		} else if (thread->base_pri > MAXPRI_THROTTLE) {
			new_bucket = TH_BUCKET_SHARE_UT;
		} else {
			new_bucket = TH_BUCKET_SHARE_BG;
		}
		break;

	default:
		panic("unexpected mode: %d", thread->sched_mode);
		break;
	}

	if (old_bucket != new_bucket) {
		thread->th_sched_bucket = new_bucket;
		thread->pri_shift = sched_pri_shifts[new_bucket];

		if ((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN) {
			sched_sub_bucket(old_bucket, thread->sched_saved_run_weight);
			sched_add_bucket(new_bucket, thread->sched_saved_run_weight);
		}
	}
}

/*
 * Set the thread's true scheduling mode.
 * Called with thread mutex and thread locked.
 * The thread has already been removed from the runqueue.
 *
 * (saved_mode is handled before this point)
 */
void
sched_set_thread_mode(thread_t thread, sched_mode_t new_mode)
{
	assert(thread->runq == PROCESSOR_NULL);

	switch (new_mode) {
	case TH_MODE_FIXED:
	case TH_MODE_REALTIME:
	case TH_MODE_TIMESHARE:
		break;

	default:
		panic("unexpected mode: %d", new_mode);
		break;
	}

#if CONFIG_SCHED_AUTO_JOIN
	/*
	 * Realtime threads might have auto-joined a work interval based on
	 * make-runnable relationships. If such an RT thread is now being demoted
	 * to non-RT, unjoin the thread from the work interval.
	 */
	if ((thread->sched_flags & TH_SFLAG_THREAD_GROUP_AUTO_JOIN) && (new_mode != TH_MODE_REALTIME)) {
		assert((thread->sched_mode == TH_MODE_REALTIME) || (thread->th_work_interval_flags & TH_WORK_INTERVAL_FLAGS_AUTO_JOIN_LEAK));
		work_interval_auto_join_demote(thread);
	}
#endif /* CONFIG_SCHED_AUTO_JOIN */

	thread->sched_mode = new_mode;

	SCHED(update_thread_bucket)(thread);
}

/*
 * Demote the true scheduler mode to timeshare (called with the thread locked).
 */
void
sched_thread_mode_demote(thread_t thread, uint32_t reason)
{
	assert(reason & TH_SFLAG_DEMOTED_MASK);
	assert((thread->sched_flags & reason) != reason);

	if (thread->policy_reset) {
		return;
	}

	if (thread->sched_flags & TH_SFLAG_DEMOTED_MASK) {
		/* Another demotion reason is already active */
		thread->sched_flags |= reason;
		return;
	}

	assert(thread->saved_mode == TH_MODE_NONE);

	boolean_t removed = thread_run_queue_remove(thread);

	thread->sched_flags |= reason;

	thread->saved_mode = thread->sched_mode;

	sched_set_thread_mode(thread, TH_MODE_TIMESHARE);

	thread_recompute_priority(thread);

	if (removed) {
		thread_run_queue_reinsert(thread, SCHED_TAILQ);
	}
}

/*
 * Un-demote the true scheduler mode back to the saved mode (called with the thread locked).
 */
void
sched_thread_mode_undemote(thread_t thread, uint32_t reason)
{
	assert(reason & TH_SFLAG_DEMOTED_MASK);
	assert((thread->sched_flags & reason) == reason);
	assert(thread->saved_mode != TH_MODE_NONE);
	assert(thread->sched_mode == TH_MODE_TIMESHARE);
	assert(thread->policy_reset == 0);

	thread->sched_flags &= ~reason;

	if (thread->sched_flags & TH_SFLAG_DEMOTED_MASK) {
		/* Another demotion reason is still active */
		return;
	}

	boolean_t removed = thread_run_queue_remove(thread);

	sched_set_thread_mode(thread, thread->saved_mode);

	thread->saved_mode = TH_MODE_NONE;

	thread_recompute_priority(thread);

	if (removed) {
		thread_run_queue_reinsert(thread, SCHED_TAILQ);
	}
}

/*
 * Promote a thread to have a sched pri floor for a specific reason.
 *
 * Promotion must not last past the syscall boundary.
 * Clients must always pair promote and demote 1:1;
 * handling nesting of the same promote reason is the client's responsibility.
 *
 * Called at splsched with the thread locked.
 */
void
sched_thread_promote_reason(thread_t    thread,
    uint32_t    reason,
    __kdebug_only uintptr_t   trace_obj /* already unslid */)
{
	assert(reason & TH_SFLAG_PROMOTE_REASON_MASK);
	assert((thread->sched_flags & reason) != reason);

	switch (reason) {
	case TH_SFLAG_RW_PROMOTED:
		KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RW_PROMOTE),
		    thread_tid(thread), thread->sched_pri,
		    thread->base_pri, trace_obj);
		break;
	case TH_SFLAG_WAITQ_PROMOTED:
		KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAITQ_PROMOTE),
		    thread_tid(thread), thread->sched_pri,
		    thread->base_pri, trace_obj);
		break;
	case TH_SFLAG_EXEC_PROMOTED:
		KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_EXEC_PROMOTE),
		    thread_tid(thread), thread->sched_pri,
		    thread->base_pri, trace_obj);
		break;
	}

	thread->sched_flags |= reason;

	thread_recompute_sched_pri(thread, SETPRI_DEFAULT);
}

/*
 * End a specific promotion reason.
 * Demotes a thread back to its expected priority without the promotion in place.
 *
 * Called at splsched with the thread locked.
 */
void
sched_thread_unpromote_reason(thread_t  thread,
    uint32_t  reason,
    __kdebug_only uintptr_t trace_obj /* already unslid */)
{
	assert(reason & TH_SFLAG_PROMOTE_REASON_MASK);
	assert((thread->sched_flags & reason) == reason);

	switch (reason) {
	case TH_SFLAG_RW_PROMOTED:
		KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RW_DEMOTE),
		    thread_tid(thread), thread->sched_pri,
		    thread->base_pri, trace_obj);
		break;
	case TH_SFLAG_WAITQ_PROMOTED:
		KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAITQ_DEMOTE),
		    thread_tid(thread), thread->sched_pri,
		    thread->base_pri, trace_obj);
		break;
	case TH_SFLAG_EXEC_PROMOTED:
		KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_EXEC_DEMOTE),
		    thread_tid(thread), thread->sched_pri,
		    thread->base_pri, trace_obj);
		break;
	}

	thread->sched_flags &= ~reason;

	thread_recompute_sched_pri(thread, SETPRI_DEFAULT);
}
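
/*
 * Usage sketch (illustrative, not part of the original file): a client that
 * needs a temporary rwlock priority floor brackets the critical section, at
 * splsched with the thread locked, like so:
 *
 *	sched_thread_promote_reason(thread, TH_SFLAG_RW_PROMOTED, trace_obj);
 *	...  hold the lock, do the work ...
 *	sched_thread_unpromote_reason(thread, TH_SFLAG_RW_PROMOTED, trace_obj);
 *
 * Promote and unpromote must pair 1:1 within a single syscall; nesting the
 * same reason requires the client to do its own counting, per the contract
 * above.
 */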