/ duct-tape / xnu / osfmk / kern / timer_call.c
timer_call.c
   1  /*
   2   * Copyright (c) 1993-2008 Apple Inc. All rights reserved.
   3   *
   4   * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5   *
   6   * This file contains Original Code and/or Modifications of Original Code
   7   * as defined in and that are subject to the Apple Public Source License
   8   * Version 2.0 (the 'License'). You may not use this file except in
   9   * compliance with the License. The rights granted to you under the License
  10   * may not be used to create, or enable the creation or redistribution of,
  11   * unlawful or unlicensed copies of an Apple operating system, or to
  12   * circumvent, violate, or enable the circumvention or violation of, any
  13   * terms of an Apple operating system software license agreement.
  14   *
  15   * Please obtain a copy of the License at
  16   * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17   *
  18   * The Original Code and all software distributed under the License are
  19   * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20   * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21   * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22   * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23   * Please see the License for the specific language governing rights and
  24   * limitations under the License.
  25   *
  26   * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27   */
  28  /*
  29   * Timer interrupt callout module.
  30   */
  31  
  32  #include <mach/mach_types.h>
  33  
  34  #include <kern/clock.h>
  35  #include <kern/smp.h>
  36  #include <kern/processor.h>
  37  #include <kern/timer_call.h>
  38  #include <kern/timer_queue.h>
  39  #include <kern/thread.h>
  40  #include <kern/policy_internal.h>
  41  
  42  #include <sys/kdebug.h>
  43  
  44  #if CONFIG_DTRACE
  45  #include <mach/sdt.h>
  46  #endif
  47  
  48  
  49  #if DEBUG
  50  #define TIMER_ASSERT    1
  51  #endif
  52  
  53  //#define TIMER_ASSERT	1
  54  //#define TIMER_DBG	1
  55  
  56  #if TIMER_DBG
  57  #define DBG(x...) kprintf("DBG: " x);
  58  #else
  59  #define DBG(x...)
  60  #endif
  61  
  62  #if TIMER_TRACE
  63  #define TIMER_KDEBUG_TRACE      KERNEL_DEBUG_CONSTANT_IST
  64  #else
  65  #define TIMER_KDEBUG_TRACE(x...)
  66  #endif
  67  
  68  LCK_GRP_DECLARE(timer_call_lck_grp, "timer_call");
  69  LCK_GRP_DECLARE(timer_longterm_lck_grp, "timer_longterm");
  70  
  71  /* Timer queue lock must be acquired with interrupts disabled (under splclock()) */
  72  #define timer_queue_lock_spin(queue)                                    \
  73  	lck_mtx_lock_spin_always(&queue->lock_data)
  74  
  75  #define timer_queue_unlock(queue)               \
  76  	lck_mtx_unlock_always(&queue->lock_data)
  77  
  78  /*
  79   * The longterm timer object is a global structure holding all timers
  80   * beyond the short-term, local timer queue threshold. The boot processor
  81   * is responsible for moving each timer to its local timer queue
  82   * if and when that timer becomes due within the threshold.
  83   */
  84  
  85  /* Sentinel for "no time set": */
  86  #define TIMER_LONGTERM_NONE             EndOfAllTime
  87  /* The default threadhold is the delta above which a timer is "long-term" */
  88  #if defined(__x86_64__)
  89  #define TIMER_LONGTERM_THRESHOLD        (1ULL * NSEC_PER_SEC)   /* 1 sec */
  90  #else
  91  #define TIMER_LONGTERM_THRESHOLD        TIMER_LONGTERM_NONE     /* disabled */
  92  #endif
  93  
  94  /*
  95   * The scan_limit throttles processing of the longterm queue.
  96   * If the scan time exceeds this limit, we terminate, unlock
  97   * and defer for scan_interval. This prevents unbounded holding of
  98   * timer queue locks with interrupts masked.
  99   */
 100  #define TIMER_LONGTERM_SCAN_LIMIT       (100ULL * NSEC_PER_USEC)        /* 100 us */
 101  #define TIMER_LONGTERM_SCAN_INTERVAL    (100ULL * NSEC_PER_USEC)        /* 100 us */
 102  /* Sentinel for "scan limit exceeded": */
 103  #define TIMER_LONGTERM_SCAN_AGAIN       0
 104  
 105  typedef struct {
 106  	uint64_t        interval;       /* longterm timer interval */
 107  	uint64_t        margin;         /* fudge factor (10% of interval */
 108  	uint64_t        deadline;       /* first/soonest longterm deadline */
 109  	uint64_t        preempted;      /* sooner timer has pre-empted */
 110  	timer_call_t    call;           /* first/soonest longterm timer call */
 111  	uint64_t        deadline_set;   /* next timer set */
 112  	timer_call_data_t timer;        /* timer used by threshold management */
 113  	                                /* Stats: */
 114  	uint64_t        scans;          /*   num threshold timer scans */
 115  	uint64_t        preempts;       /*   num threshold reductions */
 116  	uint64_t        latency;        /*   average threshold latency */
 117  	uint64_t        latency_min;    /*   minimum threshold latency */
 118  	uint64_t        latency_max;    /*   maximum threshold latency */
 119  } threshold_t;
 120  
 121  typedef struct {
 122  	mpqueue_head_t  queue;          /* longterm timer list */
 123  	uint64_t        enqueues;       /* num timers queued */
 124  	uint64_t        dequeues;       /* num timers dequeued */
 125  	uint64_t        escalates;      /* num timers becoming shortterm */
 126  	uint64_t        scan_time;      /* last time the list was scanned */
 127  	threshold_t     threshold;      /* longterm timer threshold */
 128  	uint64_t        scan_limit;     /* maximum scan time */
 129  	uint64_t        scan_interval;  /* interval between LT "escalation" scans */
 130  	uint64_t        scan_pauses;    /* num scans exceeding time limit */
 131  } timer_longterm_t;
 132  
 133  timer_longterm_t                timer_longterm = {
 134  	.scan_limit = TIMER_LONGTERM_SCAN_LIMIT,
 135  	.scan_interval = TIMER_LONGTERM_SCAN_INTERVAL,
 136  };
 137  
 138  static mpqueue_head_t           *timer_longterm_queue = NULL;
 139  
 140  static void                     timer_longterm_init(void);
 141  static void                     timer_longterm_callout(
 142  	timer_call_param_t      p0,
 143  	timer_call_param_t      p1);
 144  extern void                     timer_longterm_scan(
 145  	timer_longterm_t        *tlp,
 146  	uint64_t                now);
 147  static void                     timer_longterm_update(
 148  	timer_longterm_t *tlp);
 149  static void                     timer_longterm_update_locked(
 150  	timer_longterm_t *tlp);
 151  static mpqueue_head_t *         timer_longterm_enqueue_unlocked(
 152  	timer_call_t            call,
 153  	uint64_t                now,
 154  	uint64_t                deadline,
 155  	mpqueue_head_t **       old_queue,
 156  	uint64_t                soft_deadline,
 157  	uint64_t                ttd,
 158  	timer_call_param_t      param1,
 159  	uint32_t                callout_flags);
 160  static void                     timer_longterm_dequeued_locked(
 161  	timer_call_t            call);
 162  
 163  uint64_t past_deadline_timers;
 164  uint64_t past_deadline_deltas;
 165  uint64_t past_deadline_longest;
 166  uint64_t past_deadline_shortest = ~0ULL;
 167  enum {PAST_DEADLINE_TIMER_ADJUSTMENT_NS = 10 * 1000};
 168  
 169  uint64_t past_deadline_timer_adjustment;
 170  
 171  static boolean_t timer_call_enter_internal(timer_call_t call, timer_call_param_t param1, uint64_t deadline, uint64_t leeway, uint32_t flags, boolean_t ratelimited);
 172  boolean_t       mach_timer_coalescing_enabled = TRUE;
 173  
 174  mpqueue_head_t  *timer_call_enqueue_deadline_unlocked(
 175  	timer_call_t            call,
 176  	mpqueue_head_t          *queue,
 177  	uint64_t                deadline,
 178  	uint64_t                soft_deadline,
 179  	uint64_t                ttd,
 180  	timer_call_param_t      param1,
 181  	uint32_t                flags);
 182  
 183  mpqueue_head_t  *timer_call_dequeue_unlocked(
 184  	timer_call_t            call);
 185  
 186  timer_coalescing_priority_params_t tcoal_prio_params;
 187  
 188  #if TCOAL_PRIO_STATS
 189  int32_t nc_tcl, rt_tcl, bg_tcl, kt_tcl, fp_tcl, ts_tcl, qos_tcl;
 190  #define TCOAL_PRIO_STAT(x) (x++)
 191  #else
 192  #define TCOAL_PRIO_STAT(x)
 193  #endif
 194  
 195  static void
 196  timer_call_init_abstime(void)
 197  {
 198  	int i;
 199  	uint64_t result;
 200  	timer_coalescing_priority_params_ns_t * tcoal_prio_params_init = timer_call_get_priority_params();
 201  	nanoseconds_to_absolutetime(PAST_DEADLINE_TIMER_ADJUSTMENT_NS, &past_deadline_timer_adjustment);
 202  	nanoseconds_to_absolutetime(tcoal_prio_params_init->idle_entry_timer_processing_hdeadline_threshold_ns, &result);
 203  	tcoal_prio_params.idle_entry_timer_processing_hdeadline_threshold_abstime = (uint32_t)result;
 204  	nanoseconds_to_absolutetime(tcoal_prio_params_init->interrupt_timer_coalescing_ilat_threshold_ns, &result);
 205  	tcoal_prio_params.interrupt_timer_coalescing_ilat_threshold_abstime = (uint32_t)result;
 206  	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_resort_threshold_ns, &result);
 207  	tcoal_prio_params.timer_resort_threshold_abstime = (uint32_t)result;
 208  	tcoal_prio_params.timer_coalesce_rt_shift = tcoal_prio_params_init->timer_coalesce_rt_shift;
 209  	tcoal_prio_params.timer_coalesce_bg_shift = tcoal_prio_params_init->timer_coalesce_bg_shift;
 210  	tcoal_prio_params.timer_coalesce_kt_shift = tcoal_prio_params_init->timer_coalesce_kt_shift;
 211  	tcoal_prio_params.timer_coalesce_fp_shift = tcoal_prio_params_init->timer_coalesce_fp_shift;
 212  	tcoal_prio_params.timer_coalesce_ts_shift = tcoal_prio_params_init->timer_coalesce_ts_shift;
 213  
 214  	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_rt_ns_max,
 215  	    &tcoal_prio_params.timer_coalesce_rt_abstime_max);
 216  	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_bg_ns_max,
 217  	    &tcoal_prio_params.timer_coalesce_bg_abstime_max);
 218  	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_kt_ns_max,
 219  	    &tcoal_prio_params.timer_coalesce_kt_abstime_max);
 220  	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_fp_ns_max,
 221  	    &tcoal_prio_params.timer_coalesce_fp_abstime_max);
 222  	nanoseconds_to_absolutetime(tcoal_prio_params_init->timer_coalesce_ts_ns_max,
 223  	    &tcoal_prio_params.timer_coalesce_ts_abstime_max);
 224  
 225  	for (i = 0; i < NUM_LATENCY_QOS_TIERS; i++) {
 226  		tcoal_prio_params.latency_qos_scale[i] = tcoal_prio_params_init->latency_qos_scale[i];
 227  		nanoseconds_to_absolutetime(tcoal_prio_params_init->latency_qos_ns_max[i],
 228  		    &tcoal_prio_params.latency_qos_abstime_max[i]);
 229  		tcoal_prio_params.latency_tier_rate_limited[i] = tcoal_prio_params_init->latency_tier_rate_limited[i];
 230  	}
 231  }
 232  
 233  
 234  void
 235  timer_call_init(void)
 236  {
 237  	timer_longterm_init();
 238  	timer_call_init_abstime();
 239  }
 240  
 241  
 242  void
 243  timer_call_queue_init(mpqueue_head_t *queue)
 244  {
 245  	DBG("timer_call_queue_init(%p)\n", queue);
 246  	mpqueue_init(queue, &timer_call_lck_grp, LCK_ATTR_NULL);
 247  }
 248  
 249  
 250  void
 251  timer_call_setup(
 252  	timer_call_t                    call,
 253  	timer_call_func_t               func,
 254  	timer_call_param_t              param0)
 255  {
 256  	DBG("timer_call_setup(%p,%p,%p)\n", call, func, param0);
 257  
 258  	*call = (struct timer_call) {
 259  		.tc_func = func,
 260  		.tc_param0 = param0,
 261  		.tc_async_dequeue = false,
 262  	};
 263  
 264  	simple_lock_init(&(call)->tc_lock, 0);
 265  }
 266  
 267  static mpqueue_head_t*
 268  mpqueue_for_timer_call(timer_call_t entry)
 269  {
 270  	queue_t queue_entry_is_on = entry->tc_queue;
 271  	/* 'cast' the queue back to the orignal mpqueue */
 272  	return __container_of(queue_entry_is_on, struct mpqueue_head, head);
 273  }
 274  
 275  
 276  static __inline__ mpqueue_head_t *
 277  timer_call_entry_dequeue(
 278  	timer_call_t            entry)
 279  {
 280  	mpqueue_head_t  *old_mpqueue = mpqueue_for_timer_call(entry);
 281  
 282  	/* The entry was always on a queue */
 283  	assert(old_mpqueue != NULL);
 284  
 285  #if TIMER_ASSERT
 286  	if (!hw_lock_held((hw_lock_t)&entry->tc_lock)) {
 287  		panic("_call_entry_dequeue() "
 288  		    "entry %p is not locked\n", entry);
 289  	}
 290  
 291  	/*
 292  	 * XXX The queue lock is actually a mutex in spin mode
 293  	 *     but there's no way to test for it being held
 294  	 *     so we pretend it's a spinlock!
 295  	 */
 296  	if (!hw_lock_held((hw_lock_t)&old_mpqueue->lock_data)) {
 297  		panic("_call_entry_dequeue() "
 298  		    "queue %p is not locked\n", old_mpqueue);
 299  	}
 300  #endif /* TIMER_ASSERT */
 301  
 302  	if (old_mpqueue != timer_longterm_queue) {
 303  		priority_queue_remove(&old_mpqueue->mpq_pqhead,
 304  		    &entry->tc_pqlink);
 305  	}
 306  
 307  	remqueue(&entry->tc_qlink);
 308  
 309  	entry->tc_queue = NULL;
 310  
 311  	old_mpqueue->count--;
 312  
 313  	return old_mpqueue;
 314  }
 315  
 316  static __inline__ mpqueue_head_t *
 317  timer_call_entry_enqueue_deadline(
 318  	timer_call_t                    entry,
 319  	mpqueue_head_t                  *new_mpqueue,
 320  	uint64_t                        deadline)
 321  {
 322  	mpqueue_head_t  *old_mpqueue = mpqueue_for_timer_call(entry);
 323  
 324  #if TIMER_ASSERT
 325  	if (!hw_lock_held((hw_lock_t)&entry->tc_lock)) {
 326  		panic("_call_entry_enqueue_deadline() "
 327  		    "entry %p is not locked\n", entry);
 328  	}
 329  
 330  	/* XXX More lock pretense:  */
 331  	if (!hw_lock_held((hw_lock_t)&new_mpqueue->lock_data)) {
 332  		panic("_call_entry_enqueue_deadline() "
 333  		    "queue %p is not locked\n", new_mpqueue);
 334  	}
 335  
 336  	if (old_mpqueue != NULL && old_mpqueue != new_mpqueue) {
 337  		panic("_call_entry_enqueue_deadline() "
 338  		    "old_mpqueue %p != new_mpqueue", old_mpqueue);
 339  	}
 340  #endif /* TIMER_ASSERT */
 341  
 342  	/* no longterm queue involved */
 343  	assert(new_mpqueue != timer_longterm_queue);
 344  	assert(old_mpqueue != timer_longterm_queue);
 345  
 346  	if (old_mpqueue == new_mpqueue) {
 347  		/* optimize the same-queue case to avoid a full re-insert */
 348  		uint64_t old_deadline = entry->tc_pqlink.deadline;
 349  		entry->tc_pqlink.deadline = deadline;
 350  
 351  		if (old_deadline < deadline) {
 352  			priority_queue_entry_increased(&new_mpqueue->mpq_pqhead,
 353  			    &entry->tc_pqlink);
 354  		} else {
 355  			priority_queue_entry_decreased(&new_mpqueue->mpq_pqhead,
 356  			    &entry->tc_pqlink);
 357  		}
 358  	} else {
 359  		if (old_mpqueue != NULL) {
 360  			priority_queue_remove(&old_mpqueue->mpq_pqhead,
 361  			    &entry->tc_pqlink);
 362  
 363  			re_queue_tail(&new_mpqueue->head, &entry->tc_qlink);
 364  		} else {
 365  			enqueue_tail(&new_mpqueue->head, &entry->tc_qlink);
 366  		}
 367  
 368  		entry->tc_queue = &new_mpqueue->head;
 369  		entry->tc_pqlink.deadline = deadline;
 370  
 371  		priority_queue_insert(&new_mpqueue->mpq_pqhead, &entry->tc_pqlink);
 372  	}
 373  
 374  
 375  	/* For efficiency, track the earliest soft deadline on the queue,
 376  	 * so that fuzzy decisions can be made without lock acquisitions.
 377  	 */
 378  
 379  	timer_call_t thead = priority_queue_min(&new_mpqueue->mpq_pqhead, struct timer_call, tc_pqlink);
 380  
 381  	new_mpqueue->earliest_soft_deadline = thead->tc_flags & TIMER_CALL_RATELIMITED ? thead->tc_pqlink.deadline : thead->tc_soft_deadline;
 382  
 383  	if (old_mpqueue) {
 384  		old_mpqueue->count--;
 385  	}
 386  	new_mpqueue->count++;
 387  
 388  	return old_mpqueue;
 389  }
 390  
 391  static __inline__ void
 392  timer_call_entry_enqueue_tail(
 393  	timer_call_t                    entry,
 394  	mpqueue_head_t                  *queue)
 395  {
 396  	/* entry is always dequeued before this call */
 397  	assert(entry->tc_queue == NULL);
 398  
 399  	/*
 400  	 * this is only used for timer_longterm_queue, which is unordered
 401  	 * and thus needs no priority queueing
 402  	 */
 403  	assert(queue == timer_longterm_queue);
 404  
 405  	enqueue_tail(&queue->head, &entry->tc_qlink);
 406  
 407  	entry->tc_queue = &queue->head;
 408  
 409  	queue->count++;
 410  	return;
 411  }
 412  
 413  /*
 414   * Remove timer entry from its queue but don't change the queue pointer
 415   * and set the async_dequeue flag. This is locking case 2b.
 416   */
 417  static __inline__ void
 418  timer_call_entry_dequeue_async(
 419  	timer_call_t            entry)
 420  {
 421  	mpqueue_head_t  *old_mpqueue = mpqueue_for_timer_call(entry);
 422  	if (old_mpqueue) {
 423  		old_mpqueue->count--;
 424  
 425  		if (old_mpqueue != timer_longterm_queue) {
 426  			priority_queue_remove(&old_mpqueue->mpq_pqhead,
 427  			    &entry->tc_pqlink);
 428  		}
 429  
 430  		remqueue(&entry->tc_qlink);
 431  		entry->tc_async_dequeue = true;
 432  	}
 433  	return;
 434  }
 435  
 436  #if TIMER_ASSERT
 437  unsigned timer_call_enqueue_deadline_unlocked_async1;
 438  unsigned timer_call_enqueue_deadline_unlocked_async2;
 439  #endif
 440  /*
 441   * Assumes call_entry and queues unlocked, interrupts disabled.
 442   */
 443  __inline__ mpqueue_head_t *
 444  timer_call_enqueue_deadline_unlocked(
 445  	timer_call_t                    call,
 446  	mpqueue_head_t                  *queue,
 447  	uint64_t                        deadline,
 448  	uint64_t                        soft_deadline,
 449  	uint64_t                        ttd,
 450  	timer_call_param_t              param1,
 451  	uint32_t                        callout_flags)
 452  {
 453  	DBG("timer_call_enqueue_deadline_unlocked(%p,%p,)\n", call, queue);
 454  
 455  	simple_lock(&call->tc_lock, LCK_GRP_NULL);
 456  
 457  	mpqueue_head_t  *old_queue = mpqueue_for_timer_call(call);
 458  
 459  	if (old_queue != NULL) {
 460  		timer_queue_lock_spin(old_queue);
 461  		if (call->tc_async_dequeue) {
 462  			/* collision (1c): timer already dequeued, clear flag */
 463  #if TIMER_ASSERT
 464  			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
 465  			    DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
 466  			    VM_KERNEL_UNSLIDE_OR_PERM(call),
 467  			    call->tc_async_dequeue,
 468  			    VM_KERNEL_UNSLIDE_OR_PERM(call->tc_queue),
 469  			    0x1c, 0);
 470  			timer_call_enqueue_deadline_unlocked_async1++;
 471  #endif
 472  			call->tc_async_dequeue = false;
 473  			call->tc_queue = NULL;
 474  		} else if (old_queue != queue) {
 475  			timer_call_entry_dequeue(call);
 476  #if TIMER_ASSERT
 477  			timer_call_enqueue_deadline_unlocked_async2++;
 478  #endif
 479  		}
 480  		if (old_queue == timer_longterm_queue) {
 481  			timer_longterm_dequeued_locked(call);
 482  		}
 483  		if (old_queue != queue) {
 484  			timer_queue_unlock(old_queue);
 485  			timer_queue_lock_spin(queue);
 486  		}
 487  	} else {
 488  		timer_queue_lock_spin(queue);
 489  	}
 490  
 491  	call->tc_soft_deadline = soft_deadline;
 492  	call->tc_flags = callout_flags;
 493  	call->tc_param1 = param1;
 494  	call->tc_ttd = ttd;
 495  
 496  	timer_call_entry_enqueue_deadline(call, queue, deadline);
 497  	timer_queue_unlock(queue);
 498  	simple_unlock(&call->tc_lock);
 499  
 500  	return old_queue;
 501  }
 502  
 503  #if TIMER_ASSERT
 504  unsigned timer_call_dequeue_unlocked_async1;
 505  unsigned timer_call_dequeue_unlocked_async2;
 506  #endif
 507  mpqueue_head_t *
 508  timer_call_dequeue_unlocked(
 509  	timer_call_t            call)
 510  {
 511  	DBG("timer_call_dequeue_unlocked(%p)\n", call);
 512  
 513  	simple_lock(&call->tc_lock, LCK_GRP_NULL);
 514  
 515  	mpqueue_head_t  *old_queue = mpqueue_for_timer_call(call);
 516  
 517  #if TIMER_ASSERT
 518  	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
 519  	    DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
 520  	    VM_KERNEL_UNSLIDE_OR_PERM(call),
 521  	    call->tc_async_dequeue,
 522  	    VM_KERNEL_UNSLIDE_OR_PERM(call->tc_queue),
 523  	    0, 0);
 524  #endif
 525  	if (old_queue != NULL) {
 526  		timer_queue_lock_spin(old_queue);
 527  		if (call->tc_async_dequeue) {
 528  			/* collision (1c): timer already dequeued, clear flag */
 529  #if TIMER_ASSERT
 530  			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
 531  			    DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
 532  			    VM_KERNEL_UNSLIDE_OR_PERM(call),
 533  			    call->tc_async_dequeue,
 534  			    VM_KERNEL_UNSLIDE_OR_PERM(call->tc_queue),
 535  			    0x1c, 0);
 536  			timer_call_dequeue_unlocked_async1++;
 537  #endif
 538  			call->tc_async_dequeue = false;
 539  			call->tc_queue = NULL;
 540  		} else {
 541  			timer_call_entry_dequeue(call);
 542  		}
 543  		if (old_queue == timer_longterm_queue) {
 544  			timer_longterm_dequeued_locked(call);
 545  		}
 546  		timer_queue_unlock(old_queue);
 547  	}
 548  	simple_unlock(&call->tc_lock);
 549  	return old_queue;
 550  }
 551  
 552  uint64_t
 553  timer_call_past_deadline_timer_handle(uint64_t deadline, uint64_t ctime)
 554  {
 555  	uint64_t delta = (ctime - deadline);
 556  
 557  	past_deadline_timers++;
 558  	past_deadline_deltas += delta;
 559  	if (delta > past_deadline_longest) {
 560  		past_deadline_longest = deadline;
 561  	}
 562  	if (delta < past_deadline_shortest) {
 563  		past_deadline_shortest = delta;
 564  	}
 565  
 566  	return ctime + past_deadline_timer_adjustment;
 567  }
 568  
 569  /*
 570   * Timer call entry locking model
 571   * ==============================
 572   *
 573   * Timer call entries are linked on per-cpu timer queues which are protected
 574   * by the queue lock and the call entry lock. The locking protocol is:
 575   *
 576   *  0) The canonical locking order is timer call entry followed by queue.
 577   *
 578   *  1) With only the entry lock held, entry.queue is valid:
 579   *    1a) NULL: the entry is not queued, or
 580   *    1b) non-NULL: this queue must be locked before the entry is modified.
 581   *        After locking the queue, the call.async_dequeue flag must be checked:
 582   *    1c) TRUE: the entry was removed from the queue by another thread
 583   *	        and we must NULL the entry.queue and reset this flag, or
 584   *    1d) FALSE: (ie. queued), the entry can be manipulated.
 585   *
 586   *  2) If a queue lock is obtained first, the queue is stable:
 587   *    2a) If a try-lock of a queued entry succeeds, the call can be operated on
 588   *	  and dequeued.
 589   *    2b) If a try-lock fails, it indicates that another thread is attempting
 590   *        to change the entry and move it to a different position in this queue
 591   *        or to different queue. The entry can be dequeued but it should not be
 592   *        operated upon since it is being changed. Furthermore, we don't null
 593   *	  the entry.queue pointer (protected by the entry lock we don't own).
 594   *	  Instead, we set the async_dequeue flag -- see (1c).
 595   *    2c) Same as 2b but occurring when a longterm timer is matured.
 596   *  3) A callout's parameters (deadline, flags, parameters, soft deadline &c.)
 597   *     should be manipulated with the appropriate timer queue lock held,
 598   *     to prevent queue traversal observations from observing inconsistent
 599   *     updates to an in-flight callout.
 600   */
 601  
 602  /*
 603   * In the debug case, we assert that the timer call locking protocol
 604   * is being obeyed.
 605   */
 606  
 607  static boolean_t
 608  timer_call_enter_internal(
 609  	timer_call_t            call,
 610  	timer_call_param_t      param1,
 611  	uint64_t                deadline,
 612  	uint64_t                leeway,
 613  	uint32_t                flags,
 614  	boolean_t               ratelimited)
 615  {
 616  	mpqueue_head_t          *queue = NULL;
 617  	mpqueue_head_t          *old_queue;
 618  	spl_t                   s;
 619  	uint64_t                slop;
 620  	uint32_t                urgency;
 621  	uint64_t                sdeadline, ttd;
 622  
 623  	assert(call->tc_func != NULL);
 624  	s = splclock();
 625  
 626  	sdeadline = deadline;
 627  	uint64_t ctime = mach_absolute_time();
 628  
 629  	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
 630  	    DECR_TIMER_ENTER | DBG_FUNC_START,
 631  	    VM_KERNEL_UNSLIDE_OR_PERM(call),
 632  	    VM_KERNEL_ADDRHIDE(param1), deadline, flags, 0);
 633  
 634  	urgency = (flags & TIMER_CALL_URGENCY_MASK);
 635  
 636  	boolean_t slop_ratelimited = FALSE;
 637  	slop = timer_call_slop(deadline, ctime, urgency, current_thread(), &slop_ratelimited);
 638  
 639  	if ((flags & TIMER_CALL_LEEWAY) != 0 && leeway > slop) {
 640  		slop = leeway;
 641  	}
 642  
 643  	if (UINT64_MAX - deadline <= slop) {
 644  		deadline = UINT64_MAX;
 645  	} else {
 646  		deadline += slop;
 647  	}
 648  
 649  	if (__improbable(deadline < ctime)) {
 650  		deadline = timer_call_past_deadline_timer_handle(deadline, ctime);
 651  		sdeadline = deadline;
 652  	}
 653  
 654  	if (ratelimited || slop_ratelimited) {
 655  		flags |= TIMER_CALL_RATELIMITED;
 656  	} else {
 657  		flags &= ~TIMER_CALL_RATELIMITED;
 658  	}
 659  
 660  	ttd =  sdeadline - ctime;
 661  #if CONFIG_DTRACE
 662  	DTRACE_TMR7(callout__create, timer_call_func_t, call->tc_func,
 663  	    timer_call_param_t, call->tc_param0, uint32_t, flags,
 664  	    (deadline - sdeadline),
 665  	    (ttd >> 32), (unsigned) (ttd & 0xFFFFFFFF), call);
 666  #endif
 667  
 668  	/* Program timer callout parameters under the appropriate per-CPU or
 669  	 * longterm queue lock. The callout may have been previously enqueued
 670  	 * and in-flight on this or another timer queue.
 671  	 */
 672  	if (!ratelimited && !slop_ratelimited) {
 673  		queue = timer_longterm_enqueue_unlocked(call, ctime, deadline, &old_queue, sdeadline, ttd, param1, flags);
 674  	}
 675  
 676  	if (queue == NULL) {
 677  		queue = timer_queue_assign(deadline);
 678  		old_queue = timer_call_enqueue_deadline_unlocked(call, queue, deadline, sdeadline, ttd, param1, flags);
 679  	}
 680  
 681  #if TIMER_TRACE
 682  	call->tc_entry_time = ctime;
 683  #endif
 684  
 685  	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
 686  	    DECR_TIMER_ENTER | DBG_FUNC_END,
 687  	    VM_KERNEL_UNSLIDE_OR_PERM(call),
 688  	    (old_queue != NULL), deadline, queue->count, 0);
 689  
 690  	splx(s);
 691  
 692  	return old_queue != NULL;
 693  }
 694  
 695  /*
 696   * timer_call_*()
 697   *	return boolean indicating whether the call was previously queued.
 698   */
 699  boolean_t
 700  timer_call_enter(
 701  	timer_call_t            call,
 702  	uint64_t                deadline,
 703  	uint32_t                flags)
 704  {
 705  	return timer_call_enter_internal(call, NULL, deadline, 0, flags, FALSE);
 706  }
 707  
 708  boolean_t
 709  timer_call_enter1(
 710  	timer_call_t            call,
 711  	timer_call_param_t      param1,
 712  	uint64_t                deadline,
 713  	uint32_t                flags)
 714  {
 715  	return timer_call_enter_internal(call, param1, deadline, 0, flags, FALSE);
 716  }
 717  
 718  boolean_t
 719  timer_call_enter_with_leeway(
 720  	timer_call_t            call,
 721  	timer_call_param_t      param1,
 722  	uint64_t                deadline,
 723  	uint64_t                leeway,
 724  	uint32_t                flags,
 725  	boolean_t               ratelimited)
 726  {
 727  	return timer_call_enter_internal(call, param1, deadline, leeway, flags, ratelimited);
 728  }
 729  
 730  boolean_t
 731  timer_call_cancel(
 732  	timer_call_t            call)
 733  {
 734  	mpqueue_head_t          *old_queue;
 735  	spl_t                   s;
 736  
 737  	s = splclock();
 738  
 739  	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
 740  	    DECR_TIMER_CANCEL | DBG_FUNC_START,
 741  	    VM_KERNEL_UNSLIDE_OR_PERM(call),
 742  	    call->tc_pqlink.deadline, call->tc_soft_deadline, call->tc_flags, 0);
 743  
 744  	old_queue = timer_call_dequeue_unlocked(call);
 745  
 746  	if (old_queue != NULL) {
 747  		timer_queue_lock_spin(old_queue);
 748  
 749  		timer_call_t new_head = priority_queue_min(&old_queue->mpq_pqhead, struct timer_call, tc_pqlink);
 750  
 751  		if (new_head) {
 752  			timer_queue_cancel(old_queue, call->tc_pqlink.deadline, new_head->tc_pqlink.deadline);
 753  			old_queue->earliest_soft_deadline = new_head->tc_flags & TIMER_CALL_RATELIMITED ? new_head->tc_pqlink.deadline : new_head->tc_soft_deadline;
 754  		} else {
 755  			timer_queue_cancel(old_queue, call->tc_pqlink.deadline, UINT64_MAX);
 756  			old_queue->earliest_soft_deadline = UINT64_MAX;
 757  		}
 758  
 759  		timer_queue_unlock(old_queue);
 760  	}
 761  	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
 762  	    DECR_TIMER_CANCEL | DBG_FUNC_END,
 763  	    VM_KERNEL_UNSLIDE_OR_PERM(call),
 764  	    VM_KERNEL_UNSLIDE_OR_PERM(old_queue),
 765  	    call->tc_pqlink.deadline - mach_absolute_time(),
 766  	    call->tc_pqlink.deadline - call->tc_entry_time, 0);
 767  	splx(s);
 768  
 769  #if CONFIG_DTRACE
 770  	DTRACE_TMR6(callout__cancel, timer_call_func_t, call->tc_func,
 771  	    timer_call_param_t, call->tc_param0, uint32_t, call->tc_flags, 0,
 772  	    (call->tc_ttd >> 32), (unsigned) (call->tc_ttd & 0xFFFFFFFF));
 773  #endif /* CONFIG_DTRACE */
 774  
 775  	return old_queue != NULL;
 776  }
 777  
 778  static uint32_t timer_queue_shutdown_lock_skips;
 779  static uint32_t timer_queue_shutdown_discarded;
 780  
 781  void
 782  timer_queue_shutdown(
 783  	mpqueue_head_t          *queue)
 784  {
 785  	timer_call_t            call;
 786  	mpqueue_head_t          *new_queue;
 787  	spl_t                   s;
 788  
 789  
 790  	DBG("timer_queue_shutdown(%p)\n", queue);
 791  
 792  	s = splclock();
 793  
 794  	while (TRUE) {
 795  		timer_queue_lock_spin(queue);
 796  
 797  		call = qe_queue_first(&queue->head, struct timer_call, tc_qlink);
 798  
 799  		if (call == NULL) {
 800  			break;
 801  		}
 802  
 803  		if (!simple_lock_try(&call->tc_lock, LCK_GRP_NULL)) {
 804  			/*
 805  			 * case (2b) lock order inversion, dequeue and skip
 806  			 * Don't change the call_entry queue back-pointer
 807  			 * but set the async_dequeue field.
 808  			 */
 809  			timer_queue_shutdown_lock_skips++;
 810  			timer_call_entry_dequeue_async(call);
 811  #if TIMER_ASSERT
 812  			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
 813  			    DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
 814  			    VM_KERNEL_UNSLIDE_OR_PERM(call),
 815  			    call->tc_async_dequeue,
 816  			    VM_KERNEL_UNSLIDE_OR_PERM(call->tc_queue),
 817  			    0x2b, 0);
 818  #endif
 819  			timer_queue_unlock(queue);
 820  			continue;
 821  		}
 822  
 823  		boolean_t call_local = ((call->tc_flags & TIMER_CALL_LOCAL) != 0);
 824  
 825  		/* remove entry from old queue */
 826  		timer_call_entry_dequeue(call);
 827  		timer_queue_unlock(queue);
 828  
 829  		if (call_local == FALSE) {
 830  			/* and queue it on new, discarding LOCAL timers */
 831  			new_queue = timer_queue_assign(call->tc_pqlink.deadline);
 832  			timer_queue_lock_spin(new_queue);
 833  			timer_call_entry_enqueue_deadline(
 834  				call, new_queue, call->tc_pqlink.deadline);
 835  			timer_queue_unlock(new_queue);
 836  		} else {
 837  			timer_queue_shutdown_discarded++;
 838  		}
 839  
 840  		assert(call_local == FALSE);
 841  		simple_unlock(&call->tc_lock);
 842  	}
 843  
 844  	timer_queue_unlock(queue);
 845  	splx(s);
 846  }
 847  
 848  
 849  static uint32_t timer_queue_expire_lock_skips;
 850  uint64_t
 851  timer_queue_expire_with_options(
 852  	mpqueue_head_t          *queue,
 853  	uint64_t                deadline,
 854  	boolean_t               rescan)
 855  {
 856  	timer_call_t    call = NULL;
 857  	uint32_t tc_iterations = 0;
 858  	DBG("timer_queue_expire(%p,)\n", queue);
 859  
 860  	/* 'rescan' means look at every timer in the list, instead of
 861  	 * early-exiting when the head of the list expires in the future.
 862  	 * when 'rescan' is true, iterate by linked list instead of priority queue.
 863  	 *
 864  	 * TODO: if we keep a deadline ordered and soft-deadline ordered
 865  	 * priority queue, then it's no longer necessary to do that
 866  	 */
 867  
 868  	uint64_t cur_deadline = deadline;
 869  	timer_queue_lock_spin(queue);
 870  
 871  	while (!queue_empty(&queue->head)) {
 872  		/* Upon processing one or more timer calls, refresh the
 873  		 * deadline to account for time elapsed in the callout
 874  		 */
 875  		if (++tc_iterations > 1) {
 876  			cur_deadline = mach_absolute_time();
 877  		}
 878  
 879  		if (call == NULL) {
 880  			if (rescan == FALSE) {
 881  				call = priority_queue_min(&queue->mpq_pqhead, struct timer_call, tc_pqlink);
 882  			} else {
 883  				call = qe_queue_first(&queue->head, struct timer_call, tc_qlink);
 884  			}
 885  		}
 886  
 887  		if (call->tc_soft_deadline <= cur_deadline) {
 888  			timer_call_func_t               func;
 889  			timer_call_param_t              param0, param1;
 890  
 891  			TCOAL_DEBUG(0xDDDD0000, queue->earliest_soft_deadline, call->tc_soft_deadline, 0, 0, 0);
 892  			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
 893  			    DECR_TIMER_EXPIRE | DBG_FUNC_NONE,
 894  			    VM_KERNEL_UNSLIDE_OR_PERM(call),
 895  			    call->tc_soft_deadline,
 896  			    call->tc_pqlink.deadline,
 897  			    call->tc_entry_time, 0);
 898  
 899  			if ((call->tc_flags & TIMER_CALL_RATELIMITED) &&
 900  			    (call->tc_pqlink.deadline > cur_deadline)) {
 901  				if (rescan == FALSE) {
 902  					break;
 903  				}
 904  			}
 905  
 906  			if (!simple_lock_try(&call->tc_lock, LCK_GRP_NULL)) {
 907  				/* case (2b) lock inversion, dequeue and skip */
 908  				timer_queue_expire_lock_skips++;
 909  				timer_call_entry_dequeue_async(call);
 910  				call = NULL;
 911  				continue;
 912  			}
 913  
 914  			timer_call_entry_dequeue(call);
 915  
 916  			func = call->tc_func;
 917  			param0 = call->tc_param0;
 918  			param1 = call->tc_param1;
 919  
 920  			simple_unlock(&call->tc_lock);
 921  			timer_queue_unlock(queue);
 922  
 923  			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
 924  			    DECR_TIMER_CALLOUT | DBG_FUNC_START,
 925  			    VM_KERNEL_UNSLIDE_OR_PERM(call), VM_KERNEL_UNSLIDE(func),
 926  			    VM_KERNEL_ADDRHIDE(param0),
 927  			    VM_KERNEL_ADDRHIDE(param1),
 928  			    0);
 929  
 930  #if CONFIG_DTRACE
 931  			DTRACE_TMR7(callout__start, timer_call_func_t, func,
 932  			    timer_call_param_t, param0, unsigned, call->tc_flags,
 933  			    0, (call->tc_ttd >> 32),
 934  			    (unsigned) (call->tc_ttd & 0xFFFFFFFF), call);
 935  #endif
 936  #ifndef __DARLING__
 937  			/* Maintain time-to-deadline in per-processor data
 938  			 * structure for thread wakeup deadline statistics.
 939  			 */
 940  			uint64_t *ttdp = &current_processor()->timer_call_ttd;
 941  			*ttdp = call->tc_ttd;
 942  #endif // __DARLING__
 943  			(*func)(param0, param1);
 944  #ifndef __DARLING__
 945  			*ttdp = 0;
 946  #endif // __DARLING__
 947  #if CONFIG_DTRACE
 948  			DTRACE_TMR4(callout__end, timer_call_func_t, func,
 949  			    param0, param1, call);
 950  #endif
 951  
 952  			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
 953  			    DECR_TIMER_CALLOUT | DBG_FUNC_END,
 954  			    VM_KERNEL_UNSLIDE_OR_PERM(call), VM_KERNEL_UNSLIDE(func),
 955  			    VM_KERNEL_ADDRHIDE(param0),
 956  			    VM_KERNEL_ADDRHIDE(param1),
 957  			    0);
 958  			call = NULL;
 959  			timer_queue_lock_spin(queue);
 960  		} else {
 961  			if (__probable(rescan == FALSE)) {
 962  				break;
 963  			} else {
 964  				int64_t skew = call->tc_pqlink.deadline - call->tc_soft_deadline;
 965  				assert(call->tc_pqlink.deadline >= call->tc_soft_deadline);
 966  
 967  				/* DRK: On a latency quality-of-service level change,
 968  				 * re-sort potentially rate-limited timers. The platform
 969  				 * layer determines which timers require
 970  				 * this. In the absence of the per-callout
 971  				 * synchronization requirement, a global resort could
 972  				 * be more efficient. The re-sort effectively
 973  				 * annuls all timer adjustments, i.e. the "soft
 974  				 * deadline" is the sort key.
 975  				 */
 976  
 977  				if (timer_resort_threshold(skew)) {
 978  					if (__probable(simple_lock_try(&call->tc_lock, LCK_GRP_NULL))) {
 979  						/* TODO: don't need to dequeue before enqueue */
 980  						timer_call_entry_dequeue(call);
 981  						timer_call_entry_enqueue_deadline(call, queue, call->tc_soft_deadline);
 982  						simple_unlock(&call->tc_lock);
 983  						call = NULL;
 984  					}
 985  				}
 986  				if (call) {
 987  					call = qe_queue_next(&queue->head, call, struct timer_call, tc_qlink);
 988  
 989  					if (call == NULL) {
 990  						break;
 991  					}
 992  				}
 993  			}
 994  		}
 995  	}
 996  
 997  	call = priority_queue_min(&queue->mpq_pqhead, struct timer_call, tc_pqlink);
 998  
 999  	if (call) {
1000  		cur_deadline = call->tc_pqlink.deadline;
1001  		queue->earliest_soft_deadline = (call->tc_flags & TIMER_CALL_RATELIMITED) ? call->tc_pqlink.deadline: call->tc_soft_deadline;
1002  	} else {
1003  		queue->earliest_soft_deadline = cur_deadline = UINT64_MAX;
1004  	}
1005  
1006  	timer_queue_unlock(queue);
1007  
1008  	return cur_deadline;
1009  }
1010  
1011  uint64_t
1012  timer_queue_expire(
1013  	mpqueue_head_t          *queue,
1014  	uint64_t                deadline)
1015  {
1016  	return timer_queue_expire_with_options(queue, deadline, FALSE);
1017  }
1018  
1019  extern int serverperfmode;
1020  static uint32_t timer_queue_migrate_lock_skips;
1021  /*
1022   * timer_queue_migrate() is called by timer_queue_migrate_cpu()
1023   * to move timer requests from the local processor (queue_from)
1024   * to a target processor's (queue_to).
1025   */
1026  int
1027  timer_queue_migrate(mpqueue_head_t *queue_from, mpqueue_head_t *queue_to)
1028  {
1029  	timer_call_t    call;
1030  	timer_call_t    head_to;
1031  	int             timers_migrated = 0;
1032  
1033  	DBG("timer_queue_migrate(%p,%p)\n", queue_from, queue_to);
1034  
1035  	assert(!ml_get_interrupts_enabled());
1036  	assert(queue_from != queue_to);
1037  
1038  	if (serverperfmode) {
1039  		/*
1040  		 * if we're running a high end server
1041  		 * avoid migrations... they add latency
1042  		 * and don't save us power under typical
1043  		 * server workloads
1044  		 */
1045  		return -4;
1046  	}
1047  
1048  	/*
1049  	 * Take both local (from) and target (to) timer queue locks while
1050  	 * moving the timers from the local queue to the target processor.
1051  	 * We assume that the target is always the boot processor.
1052  	 * But only move if all of the following is true:
1053  	 *  - the target queue is non-empty
1054  	 *  - the local queue is non-empty
1055  	 *  - the local queue's first deadline is later than the target's
1056  	 *  - the local queue contains no non-migrateable "local" call
1057  	 * so that we need not have the target resync.
1058  	 */
1059  
1060  	timer_queue_lock_spin(queue_to);
1061  
1062  	head_to = priority_queue_min(&queue_to->mpq_pqhead, struct timer_call, tc_pqlink);
1063  
1064  	if (head_to == NULL) {
1065  		timers_migrated = -1;
1066  		goto abort1;
1067  	}
1068  
1069  	timer_queue_lock_spin(queue_from);
1070  
1071  	call = priority_queue_min(&queue_from->mpq_pqhead, struct timer_call, tc_pqlink);
1072  
1073  	if (call == NULL) {
1074  		timers_migrated = -2;
1075  		goto abort2;
1076  	}
1077  
1078  	if (call->tc_pqlink.deadline < head_to->tc_pqlink.deadline) {
1079  		timers_migrated = 0;
1080  		goto abort2;
1081  	}
1082  
1083  	/* perform scan for non-migratable timers */
1084  	qe_foreach_element(call, &queue_from->head, tc_qlink) {
1085  		if (call->tc_flags & TIMER_CALL_LOCAL) {
1086  			timers_migrated = -3;
1087  			goto abort2;
1088  		}
1089  	}
1090  
1091  	/* migration loop itself -- both queues are locked */
1092  	qe_foreach_element_safe(call, &queue_from->head, tc_qlink) {
1093  		if (!simple_lock_try(&call->tc_lock, LCK_GRP_NULL)) {
1094  			/* case (2b) lock order inversion, dequeue only */
1095  #ifdef TIMER_ASSERT
1096  			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1097  			    DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
1098  			    VM_KERNEL_UNSLIDE_OR_PERM(call),
1099  			    VM_KERNEL_UNSLIDE_OR_PERM(call->tc_queue),
1100  			    0,
1101  			    0x2b, 0);
1102  #endif
1103  			timer_queue_migrate_lock_skips++;
1104  			timer_call_entry_dequeue_async(call);
1105  			continue;
1106  		}
1107  		timer_call_entry_dequeue(call);
1108  		timer_call_entry_enqueue_deadline(
1109  			call, queue_to, call->tc_pqlink.deadline);
1110  		timers_migrated++;
1111  		simple_unlock(&call->tc_lock);
1112  	}
1113  	queue_from->earliest_soft_deadline = UINT64_MAX;
1114  abort2:
1115  	timer_queue_unlock(queue_from);
1116  abort1:
1117  	timer_queue_unlock(queue_to);
1118  
1119  	return timers_migrated;
1120  }
1121  
1122  void
1123  timer_queue_trace_cpu(int ncpu)
1124  {
1125  	timer_call_nosync_cpu(
1126  		ncpu,
1127  		(void (*)(void *))timer_queue_trace,
1128  		(void*) timer_queue_cpu(ncpu));
1129  }
1130  
1131  void
1132  timer_queue_trace(
1133  	mpqueue_head_t                  *queue)
1134  {
1135  	timer_call_t    call;
1136  	spl_t           s;
1137  
1138  	if (!kdebug_enable) {
1139  		return;
1140  	}
1141  
1142  	s = splclock();
1143  	timer_queue_lock_spin(queue);
1144  
1145  	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1146  	    DECR_TIMER_QUEUE | DBG_FUNC_START,
1147  	    queue->count, mach_absolute_time(), 0, 0, 0);
1148  
1149  	qe_foreach_element(call, &queue->head, tc_qlink) {
1150  		TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1151  		    DECR_TIMER_QUEUE | DBG_FUNC_NONE,
1152  		    call->tc_soft_deadline,
1153  		    call->tc_pqlink.deadline,
1154  		    call->tc_entry_time,
1155  		    VM_KERNEL_UNSLIDE(call->tc_func),
1156  		    0);
1157  	}
1158  
1159  	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1160  	    DECR_TIMER_QUEUE | DBG_FUNC_END,
1161  	    queue->count, mach_absolute_time(), 0, 0, 0);
1162  
1163  	timer_queue_unlock(queue);
1164  	splx(s);
1165  }
1166  
1167  void
1168  timer_longterm_dequeued_locked(timer_call_t call)
1169  {
1170  	timer_longterm_t        *tlp = &timer_longterm;
1171  
1172  	tlp->dequeues++;
1173  	if (call == tlp->threshold.call) {
1174  		tlp->threshold.call = NULL;
1175  	}
1176  }
1177  
1178  /*
1179   * Place a timer call in the longterm list
1180   * and adjust the next timer callout deadline if the new timer is first.
1181   */
1182  mpqueue_head_t *
1183  timer_longterm_enqueue_unlocked(timer_call_t    call,
1184      uint64_t        now,
1185      uint64_t        deadline,
1186      mpqueue_head_t  **old_queue,
1187      uint64_t        soft_deadline,
1188      uint64_t        ttd,
1189      timer_call_param_t      param1,
1190      uint32_t        callout_flags)
1191  {
1192  	timer_longterm_t        *tlp = &timer_longterm;
1193  	boolean_t               update_required = FALSE;
1194  	uint64_t                longterm_threshold;
1195  
1196  	longterm_threshold = now + tlp->threshold.interval;
1197  
1198  	/*
1199  	 * Return NULL without doing anything if:
1200  	 *  - this timer is local, or
1201  	 *  - the longterm mechanism is disabled, or
1202  	 *  - this deadline is too short.
1203  	 */
1204  	if ((callout_flags & TIMER_CALL_LOCAL) != 0 ||
1205  	    (tlp->threshold.interval == TIMER_LONGTERM_NONE) ||
1206  	    (deadline <= longterm_threshold)) {
1207  		return NULL;
1208  	}
1209  
1210  	/*
1211  	 * Remove timer from its current queue, if any.
1212  	 */
1213  	*old_queue = timer_call_dequeue_unlocked(call);
1214  
1215  	/*
1216  	 * Lock the longterm queue, queue timer and determine
1217  	 * whether an update is necessary.
1218  	 */
1219  	assert(!ml_get_interrupts_enabled());
1220  	simple_lock(&call->tc_lock, LCK_GRP_NULL);
1221  	timer_queue_lock_spin(timer_longterm_queue);
1222  	call->tc_pqlink.deadline = deadline;
1223  	call->tc_param1 = param1;
1224  	call->tc_ttd = ttd;
1225  	call->tc_soft_deadline = soft_deadline;
1226  	call->tc_flags = callout_flags;
1227  	timer_call_entry_enqueue_tail(call, timer_longterm_queue);
1228  
1229  	tlp->enqueues++;
1230  
1231  	/*
1232  	 * We'll need to update the currently set threshold timer
1233  	 * if the new deadline is sooner and no sooner update is in flight.
1234  	 */
1235  	if (deadline < tlp->threshold.deadline &&
1236  	    deadline < tlp->threshold.preempted) {
1237  		tlp->threshold.preempted = deadline;
1238  		tlp->threshold.call = call;
1239  		update_required = TRUE;
1240  	}
1241  	timer_queue_unlock(timer_longterm_queue);
1242  	simple_unlock(&call->tc_lock);
1243  
1244  	if (update_required) {
1245  		/*
1246  		 * Note: this call expects that calling the master cpu
1247  		 * alone does not involve locking the topo lock.
1248  		 */
1249  		timer_call_nosync_cpu(
1250  			master_cpu,
1251  			(void (*)(void *))timer_longterm_update,
1252  			(void *)tlp);
1253  	}
1254  
1255  	return timer_longterm_queue;
1256  }
1257  
1258  /*
1259   * Scan for timers below the longterm threshold.
1260   * Move these to the local timer queue (of the boot processor on which the
1261   * calling thread is running).
1262   * Both the local (boot) queue and the longterm queue are locked.
1263   * The scan is similar to the timer migrate sequence but is performed by
1264   * successively examining each timer on the longterm queue:
1265   *  - if within the short-term threshold
1266   *    - enter on the local queue (unless being deleted),
1267   *  - otherwise:
1268   *    - if sooner, deadline becomes the next threshold deadline.
1269   * The total scan time is limited to TIMER_LONGTERM_SCAN_LIMIT. Should this be
1270   * exceeded, we abort and reschedule again so that we don't shut others from
1271   * the timer queues. Longterm timers firing late is not critical.
1272   */
1273  void
1274  timer_longterm_scan(timer_longterm_t    *tlp,
1275      uint64_t            time_start)
1276  {
1277  	timer_call_t    call;
1278  	uint64_t        threshold;
1279  	uint64_t        deadline;
1280  	uint64_t        time_limit = time_start + tlp->scan_limit;
1281  	mpqueue_head_t  *timer_master_queue;
1282  
1283  	assert(!ml_get_interrupts_enabled());
1284  	assert(cpu_number() == master_cpu);
1285  
1286  	if (tlp->threshold.interval != TIMER_LONGTERM_NONE) {
1287  		threshold = time_start + tlp->threshold.interval;
1288  	}
1289  
1290  	tlp->threshold.deadline = TIMER_LONGTERM_NONE;
1291  	tlp->threshold.call = NULL;
1292  
1293  	if (queue_empty(&timer_longterm_queue->head)) {
1294  		return;
1295  	}
1296  
1297  	timer_master_queue = timer_queue_cpu(master_cpu);
1298  	timer_queue_lock_spin(timer_master_queue);
1299  
1300  	qe_foreach_element_safe(call, &timer_longterm_queue->head, tc_qlink) {
1301  		deadline = call->tc_soft_deadline;
1302  		if (!simple_lock_try(&call->tc_lock, LCK_GRP_NULL)) {
1303  			/* case (2c) lock order inversion, dequeue only */
1304  #ifdef TIMER_ASSERT
1305  			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1306  			    DECR_TIMER_ASYNC_DEQ | DBG_FUNC_NONE,
1307  			    VM_KERNEL_UNSLIDE_OR_PERM(call),
1308  			    VM_KERNEL_UNSLIDE_OR_PERM(call->tc_queue),
1309  			    0,
1310  			    0x2c, 0);
1311  #endif
1312  			timer_call_entry_dequeue_async(call);
1313  			continue;
1314  		}
1315  		if (deadline < threshold) {
1316  			/*
1317  			 * This timer needs moving (escalating)
1318  			 * to the local (boot) processor's queue.
1319  			 */
1320  #ifdef TIMER_ASSERT
1321  			if (deadline < time_start) {
1322  				TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1323  				    DECR_TIMER_OVERDUE | DBG_FUNC_NONE,
1324  				    VM_KERNEL_UNSLIDE_OR_PERM(call),
1325  				    deadline,
1326  				    time_start,
1327  				    threshold,
1328  				    0);
1329  			}
1330  #endif
1331  			TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1332  			    DECR_TIMER_ESCALATE | DBG_FUNC_NONE,
1333  			    VM_KERNEL_UNSLIDE_OR_PERM(call),
1334  			    call->tc_pqlink.deadline,
1335  			    call->tc_entry_time,
1336  			    VM_KERNEL_UNSLIDE(call->tc_func),
1337  			    0);
1338  			tlp->escalates++;
1339  			timer_call_entry_dequeue(call);
1340  			timer_call_entry_enqueue_deadline(
1341  				call, timer_master_queue, call->tc_pqlink.deadline);
1342  			/*
1343  			 * A side-effect of the following call is to update
1344  			 * the actual hardware deadline if required.
1345  			 */
1346  			(void) timer_queue_assign(deadline);
1347  		} else {
1348  			if (deadline < tlp->threshold.deadline) {
1349  				tlp->threshold.deadline = deadline;
1350  				tlp->threshold.call = call;
1351  			}
1352  		}
1353  		simple_unlock(&call->tc_lock);
1354  
1355  		/* Abort scan if we're taking too long. */
1356  		if (mach_absolute_time() > time_limit) {
1357  			tlp->threshold.deadline = TIMER_LONGTERM_SCAN_AGAIN;
1358  			tlp->scan_pauses++;
1359  			DBG("timer_longterm_scan() paused %llu, qlen: %llu\n",
1360  			    time_limit, tlp->queue.count);
1361  			break;
1362  		}
1363  	}
1364  
1365  	timer_queue_unlock(timer_master_queue);
1366  }
1367  
1368  void
1369  timer_longterm_callout(timer_call_param_t p0, __unused timer_call_param_t p1)
1370  {
1371  	timer_longterm_t        *tlp = (timer_longterm_t *) p0;
1372  
1373  	timer_longterm_update(tlp);
1374  }
1375  
1376  void
1377  timer_longterm_update_locked(timer_longterm_t *tlp)
1378  {
1379  	uint64_t        latency;
1380  
1381  	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1382  	    DECR_TIMER_UPDATE | DBG_FUNC_START,
1383  	    VM_KERNEL_UNSLIDE_OR_PERM(&tlp->queue),
1384  	    tlp->threshold.deadline,
1385  	    tlp->threshold.preempted,
1386  	    tlp->queue.count, 0);
1387  
1388  	tlp->scan_time = mach_absolute_time();
1389  	if (tlp->threshold.preempted != TIMER_LONGTERM_NONE) {
1390  		tlp->threshold.preempts++;
1391  		tlp->threshold.deadline = tlp->threshold.preempted;
1392  		tlp->threshold.preempted = TIMER_LONGTERM_NONE;
1393  		/*
1394  		 * Note: in the unlikely event that a pre-empted timer has
1395  		 * itself been cancelled, we'll simply re-scan later at the
1396  		 * time of the preempted/cancelled timer.
1397  		 */
1398  	} else {
1399  		tlp->threshold.scans++;
1400  
1401  		/*
1402  		 * Maintain a moving average of our wakeup latency.
1403  		 * Clamp latency to 0 and ignore above threshold interval.
1404  		 */
1405  		if (tlp->scan_time > tlp->threshold.deadline_set) {
1406  			latency = tlp->scan_time - tlp->threshold.deadline_set;
1407  		} else {
1408  			latency = 0;
1409  		}
1410  		if (latency < tlp->threshold.interval) {
1411  			tlp->threshold.latency_min =
1412  			    MIN(tlp->threshold.latency_min, latency);
1413  			tlp->threshold.latency_max =
1414  			    MAX(tlp->threshold.latency_max, latency);
1415  			tlp->threshold.latency =
1416  			    (tlp->threshold.latency * 99 + latency) / 100;
1417  		}
1418  
1419  		timer_longterm_scan(tlp, tlp->scan_time);
1420  	}
1421  
1422  	tlp->threshold.deadline_set = tlp->threshold.deadline;
1423  	/* The next deadline timer to be set is adjusted */
1424  	if (tlp->threshold.deadline != TIMER_LONGTERM_NONE &&
1425  	    tlp->threshold.deadline != TIMER_LONGTERM_SCAN_AGAIN) {
1426  		tlp->threshold.deadline_set -= tlp->threshold.margin;
1427  		tlp->threshold.deadline_set -= tlp->threshold.latency;
1428  	}
1429  
1430  	/* Throttle next scan time */
1431  	uint64_t scan_clamp = mach_absolute_time() + tlp->scan_interval;
1432  	if (tlp->threshold.deadline_set < scan_clamp) {
1433  		tlp->threshold.deadline_set = scan_clamp;
1434  	}
1435  
1436  	TIMER_KDEBUG_TRACE(KDEBUG_TRACE,
1437  	    DECR_TIMER_UPDATE | DBG_FUNC_END,
1438  	    VM_KERNEL_UNSLIDE_OR_PERM(&tlp->queue),
1439  	    tlp->threshold.deadline,
1440  	    tlp->threshold.scans,
1441  	    tlp->queue.count, 0);
1442  }
1443  
1444  void
1445  timer_longterm_update(timer_longterm_t *tlp)
1446  {
1447  	spl_t   s = splclock();
1448  
1449  	timer_queue_lock_spin(timer_longterm_queue);
1450  
1451  	if (cpu_number() != master_cpu) {
1452  		panic("timer_longterm_update_master() on non-boot cpu");
1453  	}
1454  
1455  	timer_longterm_update_locked(tlp);
1456  
1457  	if (tlp->threshold.deadline != TIMER_LONGTERM_NONE) {
1458  		timer_call_enter(
1459  			&tlp->threshold.timer,
1460  			tlp->threshold.deadline_set,
1461  			TIMER_CALL_LOCAL | TIMER_CALL_SYS_CRITICAL);
1462  	}
1463  
1464  	timer_queue_unlock(timer_longterm_queue);
1465  	splx(s);
1466  }
1467  
1468  void
1469  timer_longterm_init(void)
1470  {
1471  	uint32_t                longterm;
1472  	timer_longterm_t        *tlp = &timer_longterm;
1473  
1474  	DBG("timer_longterm_init() tlp: %p, queue: %p\n", tlp, &tlp->queue);
1475  
1476  	/*
1477  	 * Set the longterm timer threshold. Defaults to TIMER_LONGTERM_THRESHOLD
1478  	 * or TIMER_LONGTERM_NONE (disabled) for server;
1479  	 * overridden longterm boot-arg
1480  	 */
1481  	tlp->threshold.interval = serverperfmode ? TIMER_LONGTERM_NONE
1482  	    : TIMER_LONGTERM_THRESHOLD;
1483  	if (PE_parse_boot_argn("longterm", &longterm, sizeof(longterm))) {
1484  		tlp->threshold.interval = (longterm == 0) ?
1485  		    TIMER_LONGTERM_NONE :
1486  		    longterm * NSEC_PER_MSEC;
1487  	}
1488  	if (tlp->threshold.interval != TIMER_LONGTERM_NONE) {
1489  		printf("Longterm timer threshold: %llu ms\n",
1490  		    tlp->threshold.interval / NSEC_PER_MSEC);
1491  		kprintf("Longterm timer threshold: %llu ms\n",
1492  		    tlp->threshold.interval / NSEC_PER_MSEC);
1493  		nanoseconds_to_absolutetime(tlp->threshold.interval,
1494  		    &tlp->threshold.interval);
1495  		tlp->threshold.margin = tlp->threshold.interval / 10;
1496  		tlp->threshold.latency_min = EndOfAllTime;
1497  		tlp->threshold.latency_max = 0;
1498  	}
1499  
1500  	tlp->threshold.preempted = TIMER_LONGTERM_NONE;
1501  	tlp->threshold.deadline = TIMER_LONGTERM_NONE;
1502  
1503  	mpqueue_init(&tlp->queue, &timer_longterm_lck_grp, LCK_ATTR_NULL);
1504  
1505  	timer_call_setup(&tlp->threshold.timer,
1506  	    timer_longterm_callout, (timer_call_param_t) tlp);
1507  
1508  	timer_longterm_queue = &tlp->queue;
1509  }
1510  
1511  enum {
1512  	THRESHOLD, QCOUNT,
1513  	ENQUEUES, DEQUEUES, ESCALATES, SCANS, PREEMPTS,
1514  	LATENCY, LATENCY_MIN, LATENCY_MAX, SCAN_LIMIT, SCAN_INTERVAL, PAUSES
1515  };
1516  uint64_t
1517  timer_sysctl_get(int oid)
1518  {
1519  	timer_longterm_t        *tlp = &timer_longterm;
1520  
1521  	switch (oid) {
1522  	case THRESHOLD:
1523  		return (tlp->threshold.interval == TIMER_LONGTERM_NONE) ?
1524  		       0 : tlp->threshold.interval / NSEC_PER_MSEC;
1525  	case QCOUNT:
1526  		return tlp->queue.count;
1527  	case ENQUEUES:
1528  		return tlp->enqueues;
1529  	case DEQUEUES:
1530  		return tlp->dequeues;
1531  	case ESCALATES:
1532  		return tlp->escalates;
1533  	case SCANS:
1534  		return tlp->threshold.scans;
1535  	case PREEMPTS:
1536  		return tlp->threshold.preempts;
1537  	case LATENCY:
1538  		return tlp->threshold.latency;
1539  	case LATENCY_MIN:
1540  		return tlp->threshold.latency_min;
1541  	case LATENCY_MAX:
1542  		return tlp->threshold.latency_max;
1543  	case SCAN_LIMIT:
1544  		return tlp->scan_limit;
1545  	case SCAN_INTERVAL:
1546  		return tlp->scan_interval;
1547  	case PAUSES:
1548  		return tlp->scan_pauses;
1549  	default:
1550  		return 0;
1551  	}
1552  }
1553  
1554  /*
1555   * timer_master_scan() is the inverse of timer_longterm_scan()
1556   * since it un-escalates timers to the longterm queue.
1557   */
1558  static void
1559  timer_master_scan(timer_longterm_t      *tlp,
1560      uint64_t              now)
1561  {
1562  	timer_call_t    call;
1563  	uint64_t        threshold;
1564  	uint64_t        deadline;
1565  	mpqueue_head_t  *timer_master_queue;
1566  
1567  	if (tlp->threshold.interval != TIMER_LONGTERM_NONE) {
1568  		threshold = now + tlp->threshold.interval;
1569  	} else {
1570  		threshold = TIMER_LONGTERM_NONE;
1571  	}
1572  
1573  	timer_master_queue = timer_queue_cpu(master_cpu);
1574  	timer_queue_lock_spin(timer_master_queue);
1575  
1576  	qe_foreach_element_safe(call, &timer_master_queue->head, tc_qlink) {
1577  		deadline = call->tc_pqlink.deadline;
1578  		if ((call->tc_flags & TIMER_CALL_LOCAL) != 0) {
1579  			continue;
1580  		}
1581  		if (!simple_lock_try(&call->tc_lock, LCK_GRP_NULL)) {
1582  			/* case (2c) lock order inversion, dequeue only */
1583  			timer_call_entry_dequeue_async(call);
1584  			continue;
1585  		}
1586  		if (deadline > threshold) {
1587  			/* move from master to longterm */
1588  			timer_call_entry_dequeue(call);
1589  			timer_call_entry_enqueue_tail(call, timer_longterm_queue);
1590  			if (deadline < tlp->threshold.deadline) {
1591  				tlp->threshold.deadline = deadline;
1592  				tlp->threshold.call = call;
1593  			}
1594  		}
1595  		simple_unlock(&call->tc_lock);
1596  	}
1597  	timer_queue_unlock(timer_master_queue);
1598  }
1599  
1600  static void
1601  timer_sysctl_set_threshold(uint64_t value)
1602  {
1603  	timer_longterm_t        *tlp = &timer_longterm;
1604  	spl_t                   s = splclock();
1605  	boolean_t               threshold_increase;
1606  
1607  	timer_queue_lock_spin(timer_longterm_queue);
1608  
1609  	timer_call_cancel(&tlp->threshold.timer);
1610  
1611  	/*
1612  	 * Set the new threshold and note whther it's increasing.
1613  	 */
1614  	if (value == 0) {
1615  		tlp->threshold.interval = TIMER_LONGTERM_NONE;
1616  		threshold_increase = TRUE;
1617  		timer_call_cancel(&tlp->threshold.timer);
1618  	} else {
1619  		uint64_t        old_interval = tlp->threshold.interval;
1620  		tlp->threshold.interval = value * NSEC_PER_MSEC;
1621  		nanoseconds_to_absolutetime(tlp->threshold.interval,
1622  		    &tlp->threshold.interval);
1623  		tlp->threshold.margin = tlp->threshold.interval / 10;
1624  		if (old_interval == TIMER_LONGTERM_NONE) {
1625  			threshold_increase = FALSE;
1626  		} else {
1627  			threshold_increase = (tlp->threshold.interval > old_interval);
1628  		}
1629  	}
1630  
1631  	if (threshold_increase /* or removal */) {
1632  		/* Escalate timers from the longterm queue */
1633  		timer_longterm_scan(tlp, mach_absolute_time());
1634  	} else { /* decrease or addition  */
1635  		/*
1636  		 * We scan the local/master queue for timers now longterm.
1637  		 * To be strictly correct, we should scan all processor queues
1638  		 * but timer migration results in most timers gravitating to the
1639  		 * master processor in any case.
1640  		 */
1641  		timer_master_scan(tlp, mach_absolute_time());
1642  	}
1643  
1644  	/* Set new timer accordingly */
1645  	tlp->threshold.deadline_set = tlp->threshold.deadline;
1646  	if (tlp->threshold.deadline != TIMER_LONGTERM_NONE) {
1647  		tlp->threshold.deadline_set -= tlp->threshold.margin;
1648  		tlp->threshold.deadline_set -= tlp->threshold.latency;
1649  		timer_call_enter(
1650  			&tlp->threshold.timer,
1651  			tlp->threshold.deadline_set,
1652  			TIMER_CALL_LOCAL | TIMER_CALL_SYS_CRITICAL);
1653  	}
1654  
1655  	/* Reset stats */
1656  	tlp->enqueues = 0;
1657  	tlp->dequeues = 0;
1658  	tlp->escalates = 0;
1659  	tlp->scan_pauses = 0;
1660  	tlp->threshold.scans = 0;
1661  	tlp->threshold.preempts = 0;
1662  	tlp->threshold.latency = 0;
1663  	tlp->threshold.latency_min = EndOfAllTime;
1664  	tlp->threshold.latency_max = 0;
1665  
1666  	timer_queue_unlock(timer_longterm_queue);
1667  	splx(s);
1668  }
1669  
1670  int
1671  timer_sysctl_set(int oid, uint64_t value)
1672  {
1673  	switch (oid) {
1674  	case THRESHOLD:
1675  		timer_call_cpu(
1676  			master_cpu,
1677  			(void (*)(void *))timer_sysctl_set_threshold,
1678  			(void *) value);
1679  		return KERN_SUCCESS;
1680  	case SCAN_LIMIT:
1681  		timer_longterm.scan_limit = value;
1682  		return KERN_SUCCESS;
1683  	case SCAN_INTERVAL:
1684  		timer_longterm.scan_interval = value;
1685  		return KERN_SUCCESS;
1686  	default:
1687  		return KERN_INVALID_ARGUMENT;
1688  	}
1689  }
1690  
1691  
1692  /* Select timer coalescing window based on per-task quality-of-service hints */
1693  static boolean_t
1694  tcoal_qos_adjust(thread_t t, int32_t *tshift, uint64_t *tmax_abstime, boolean_t *pratelimited)
1695  {
1696  	uint32_t latency_qos;
1697  	boolean_t adjusted = FALSE;
1698  	task_t ctask = t->task;
1699  
1700  	if (ctask) {
1701  		latency_qos = proc_get_effective_thread_policy(t, TASK_POLICY_LATENCY_QOS);
1702  
1703  		assert(latency_qos <= NUM_LATENCY_QOS_TIERS);
1704  
1705  		if (latency_qos) {
1706  			*tshift = tcoal_prio_params.latency_qos_scale[latency_qos - 1];
1707  			*tmax_abstime = tcoal_prio_params.latency_qos_abstime_max[latency_qos - 1];
1708  			*pratelimited = tcoal_prio_params.latency_tier_rate_limited[latency_qos - 1];
1709  			adjusted = TRUE;
1710  		}
1711  	}
1712  	return adjusted;
1713  }
1714  
1715  
1716  /* Adjust timer deadlines based on priority of the thread and the
1717   * urgency value provided at timeout establishment. With this mechanism,
1718   * timers are no longer necessarily sorted in order of soft deadline
1719   * on a given timer queue, i.e. they may be differentially skewed.
1720   * In the current scheme, this could lead to fewer pending timers
1721   * processed than is technically possible when the HW deadline arrives.
1722   */
1723  static void
1724  timer_compute_leeway(thread_t cthread, int32_t urgency, int32_t *tshift, uint64_t *tmax_abstime, boolean_t *pratelimited)
1725  {
1726  	int16_t tpri = cthread->sched_pri;
1727  	if ((urgency & TIMER_CALL_USER_MASK) != 0) {
1728  		if (tpri >= BASEPRI_RTQUEUES ||
1729  		    urgency == TIMER_CALL_USER_CRITICAL) {
1730  			*tshift = tcoal_prio_params.timer_coalesce_rt_shift;
1731  			*tmax_abstime = tcoal_prio_params.timer_coalesce_rt_abstime_max;
1732  			TCOAL_PRIO_STAT(rt_tcl);
1733  		} else if (proc_get_effective_thread_policy(cthread, TASK_POLICY_DARWIN_BG) ||
1734  		    (urgency == TIMER_CALL_USER_BACKGROUND)) {
1735  			/* Determine if timer should be subjected to a lower QoS */
1736  			if (tcoal_qos_adjust(cthread, tshift, tmax_abstime, pratelimited)) {
1737  				if (*tmax_abstime > tcoal_prio_params.timer_coalesce_bg_abstime_max) {
1738  					return;
1739  				} else {
1740  					*pratelimited = FALSE;
1741  				}
1742  			}
1743  			*tshift = tcoal_prio_params.timer_coalesce_bg_shift;
1744  			*tmax_abstime = tcoal_prio_params.timer_coalesce_bg_abstime_max;
1745  			TCOAL_PRIO_STAT(bg_tcl);
1746  		} else if (tpri >= MINPRI_KERNEL) {
1747  			*tshift = tcoal_prio_params.timer_coalesce_kt_shift;
1748  			*tmax_abstime = tcoal_prio_params.timer_coalesce_kt_abstime_max;
1749  			TCOAL_PRIO_STAT(kt_tcl);
1750  		} else if (cthread->sched_mode == TH_MODE_FIXED) {
1751  			*tshift = tcoal_prio_params.timer_coalesce_fp_shift;
1752  			*tmax_abstime = tcoal_prio_params.timer_coalesce_fp_abstime_max;
1753  			TCOAL_PRIO_STAT(fp_tcl);
1754  		} else if (tcoal_qos_adjust(cthread, tshift, tmax_abstime, pratelimited)) {
1755  			TCOAL_PRIO_STAT(qos_tcl);
1756  		} else if (cthread->sched_mode == TH_MODE_TIMESHARE) {
1757  			*tshift = tcoal_prio_params.timer_coalesce_ts_shift;
1758  			*tmax_abstime = tcoal_prio_params.timer_coalesce_ts_abstime_max;
1759  			TCOAL_PRIO_STAT(ts_tcl);
1760  		} else {
1761  			TCOAL_PRIO_STAT(nc_tcl);
1762  		}
1763  	} else if (urgency == TIMER_CALL_SYS_BACKGROUND) {
1764  		*tshift = tcoal_prio_params.timer_coalesce_bg_shift;
1765  		*tmax_abstime = tcoal_prio_params.timer_coalesce_bg_abstime_max;
1766  		TCOAL_PRIO_STAT(bg_tcl);
1767  	} else {
1768  		*tshift = tcoal_prio_params.timer_coalesce_kt_shift;
1769  		*tmax_abstime = tcoal_prio_params.timer_coalesce_kt_abstime_max;
1770  		TCOAL_PRIO_STAT(kt_tcl);
1771  	}
1772  }
1773  
1774  
1775  int timer_user_idle_level;
1776  
1777  uint64_t
1778  timer_call_slop(uint64_t deadline, uint64_t now, uint32_t flags, thread_t cthread, boolean_t *pratelimited)
1779  {
1780  	int32_t tcs_shift = 0;
1781  	uint64_t tcs_max_abstime = 0;
1782  	uint64_t adjval;
1783  	uint32_t urgency = (flags & TIMER_CALL_URGENCY_MASK);
1784  
1785  	if (mach_timer_coalescing_enabled &&
1786  	    (deadline > now) && (urgency != TIMER_CALL_SYS_CRITICAL)) {
1787  		timer_compute_leeway(cthread, urgency, &tcs_shift, &tcs_max_abstime, pratelimited);
1788  
1789  		if (tcs_shift >= 0) {
1790  			adjval =  MIN((deadline - now) >> tcs_shift, tcs_max_abstime);
1791  		} else {
1792  			adjval =  MIN((deadline - now) << (-tcs_shift), tcs_max_abstime);
1793  		}
1794  		/* Apply adjustments derived from "user idle level" heuristic */
1795  		adjval += (adjval * timer_user_idle_level) >> 7;
1796  		return adjval;
1797  	} else {
1798  		return 0;
1799  	}
1800  }
1801  
1802  int
1803  timer_get_user_idle_level(void)
1804  {
1805  	return timer_user_idle_level;
1806  }
1807  
1808  kern_return_t
1809  timer_set_user_idle_level(int ilevel)
1810  {
1811  	boolean_t do_reeval = FALSE;
1812  
1813  	if ((ilevel < 0) || (ilevel > 128)) {
1814  		return KERN_INVALID_ARGUMENT;
1815  	}
1816  
1817  	if (ilevel < timer_user_idle_level) {
1818  		do_reeval = TRUE;
1819  	}
1820  
1821  	timer_user_idle_level = ilevel;
1822  
1823  	if (do_reeval) {
1824  		ml_timer_evaluate();
1825  	}
1826  
1827  	return KERN_SUCCESS;
1828  }
1829  
1830  #pragma mark - running timers
1831  
1832  #define RUNNING_TIMER_FAKE_FLAGS (TIMER_CALL_SYS_CRITICAL | \
1833      TIMER_CALL_LOCAL)
1834  
1835  /*
1836   * timer_call_trace_* functions mimic the tracing behavior from the normal
1837   * timer_call subsystem, so tools continue to function.
1838   */
1839  
1840  static void
1841  timer_call_trace_enter_before(struct timer_call *call, uint64_t deadline,
1842      uint32_t flags, uint64_t now)
1843  {
1844  #pragma unused(call, deadline, flags, now)
1845  	TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_ENTER | DBG_FUNC_START,
1846  	    VM_KERNEL_UNSLIDE_OR_PERM(call), VM_KERNEL_ADDRHIDE(call->tc_param1),
1847  	    deadline, flags, 0);
1848  #if CONFIG_DTRACE
1849  	uint64_t ttd = deadline - now;
1850  	DTRACE_TMR7(callout__create, timer_call_func_t, call->tc_func,
1851  	    timer_call_param_t, call->tc_param0, uint32_t, flags, 0,
1852  	    (ttd >> 32), (unsigned int)(ttd & 0xFFFFFFFF), NULL);
1853  #endif /* CONFIG_DTRACE */
1854  	TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_ENTER | DBG_FUNC_END,
1855  	    VM_KERNEL_UNSLIDE_OR_PERM(call), 0, deadline, 0, 0);
1856  }
1857  
1858  static void
1859  timer_call_trace_enter_after(struct timer_call *call, uint64_t deadline)
1860  {
1861  #pragma unused(call, deadline)
1862  	TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_ENTER | DBG_FUNC_END,
1863  	    VM_KERNEL_UNSLIDE_OR_PERM(call), 0, deadline, 0, 0);
1864  }
1865  
1866  static void
1867  timer_call_trace_cancel(struct timer_call *call)
1868  {
1869  #pragma unused(call)
1870  	__unused uint64_t deadline = call->tc_pqlink.deadline;
1871  	TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_CANCEL | DBG_FUNC_START,
1872  	    VM_KERNEL_UNSLIDE_OR_PERM(call), deadline, 0,
1873  	    call->tc_flags, 0);
1874  	TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_CANCEL | DBG_FUNC_END,
1875  	    VM_KERNEL_UNSLIDE_OR_PERM(call), 0, deadline - mach_absolute_time(),
1876  	    deadline - call->tc_entry_time, 0);
1877  #if CONFIG_DTRACE
1878  #if TIMER_TRACE
1879  	uint64_t ttd = deadline - call->tc_entry_time;
1880  #else
1881  	uint64_t ttd = UINT64_MAX;
1882  #endif /* TIMER_TRACE */
1883  	DTRACE_TMR6(callout__cancel, timer_call_func_t, call->tc_func,
1884  	    timer_call_param_t, call->tc_param0, uint32_t, call->tc_flags, 0,
1885  	    (ttd >> 32), (unsigned int)(ttd & 0xFFFFFFFF));
1886  #endif /* CONFIG_DTRACE */
1887  }
1888  
1889  static void
1890  timer_call_trace_expire_entry(struct timer_call *call)
1891  {
1892  #pragma unused(call)
1893  	TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_CALLOUT | DBG_FUNC_START,
1894  	    VM_KERNEL_UNSLIDE_OR_PERM(call), VM_KERNEL_UNSLIDE(call->tc_func),
1895  	    VM_KERNEL_ADDRHIDE(call->tc_param0),
1896  	    VM_KERNEL_ADDRHIDE(call->tc_param1),
1897  	    0);
1898  #if CONFIG_DTRACE
1899  #if TIMER_TRACE
1900  	uint64_t ttd = call->tc_pqlink.deadline - call->tc_entry_time;
1901  #else /* TIMER_TRACE */
1902  	uint64_t ttd = UINT64_MAX;
1903  #endif /* TIMER_TRACE */
1904  	DTRACE_TMR7(callout__start, timer_call_func_t, call->tc_func,
1905  	    timer_call_param_t, call->tc_param0, unsigned, call->tc_flags,
1906  	    0, (ttd >> 32), (unsigned int)(ttd & 0xFFFFFFFF), NULL);
1907  #endif /* CONFIG_DTRACE */
1908  }
1909  
1910  static void
1911  timer_call_trace_expire_return(struct timer_call *call)
1912  {
1913  #pragma unused(call)
1914  #if CONFIG_DTRACE
1915  	DTRACE_TMR4(callout__end, timer_call_func_t, call->tc_func,
1916  	    call->tc_param0, call->tc_param1, NULL);
1917  #endif /* CONFIG_DTRACE */
1918  	TIMER_KDEBUG_TRACE(KDEBUG_TRACE, DECR_TIMER_CALLOUT | DBG_FUNC_END,
1919  	    VM_KERNEL_UNSLIDE_OR_PERM(call),
1920  	    VM_KERNEL_UNSLIDE(call->tc_func),
1921  	    VM_KERNEL_ADDRHIDE(call->tc_param0),
1922  	    VM_KERNEL_ADDRHIDE(call->tc_param1),
1923  	    0);
1924  }
1925  
1926  /*
1927   * Set a new deadline for a running timer on this processor.
1928   */
1929  void
1930  running_timer_setup(processor_t processor, enum running_timer timer,
1931      void *param, uint64_t deadline, uint64_t now)
1932  {
1933  	assert(timer < RUNNING_TIMER_MAX);
1934  	assert(ml_get_interrupts_enabled() == FALSE);
1935  
1936  	struct timer_call *call = &processor->running_timers[timer];
1937  
1938  	timer_call_trace_enter_before(call, deadline, RUNNING_TIMER_FAKE_FLAGS,
1939  	    now);
1940  
1941  	if (__improbable(deadline < now)) {
1942  		deadline = timer_call_past_deadline_timer_handle(deadline, now);
1943  	}
1944  
1945  	call->tc_pqlink.deadline = deadline;
1946  #if TIMER_TRACE
1947  	call->tc_entry_time = now;
1948  #endif /* TIMER_TRACE */
1949  	call->tc_param1 = param;
1950  
1951  	timer_call_trace_enter_after(call, deadline);
1952  }
1953  
1954  void
1955  running_timers_sync(void)
1956  {
1957  	timer_resync_deadlines();
1958  }
1959  
1960  void
1961  timer_call_sloptimer_call_sloptimer_call_slop(processor_t processor, unsigned int timer,
1962      void *param, uint64_t deadline, uint64_t now)
1963  {
1964  	running_timer_setup(processor, timer, param, deadline, now);
1965  	running_timers_sync();
1966  }
1967  
1968  /*
1969   * Call the callback for any running timers that fired for this processor.
1970   * Returns true if any timers were past their deadline.
1971   */
1972  bool
1973  running_timers_expire(processor_t processor, uint64_t now)
1974  {
1975  	bool expired = false;
1976  
1977  	if (!processor->running_timers_active) {
1978  		return expired;
1979  	}
1980  
1981  	for (int i = 0; i < RUNNING_TIMER_MAX; i++) {
1982  		struct timer_call *call = &processor->running_timers[i];
1983  
1984  		uint64_t deadline = call->tc_pqlink.deadline;
1985  		if (deadline > now) {
1986  			continue;
1987  		}
1988  
1989  		expired = true;
1990  		timer_call_trace_expire_entry(call);
1991  		call->tc_func(call->tc_param0, call->tc_param1);
1992  		timer_call_trace_expire_return(call);
1993  	}
1994  
1995  	return expired;
1996  }
1997  
1998  void
1999  running_timer_clear(processor_t processor, enum running_timer timer)
2000  {
2001  	struct timer_call *call = &processor->running_timers[timer];
2002  	uint64_t deadline = call->tc_pqlink.deadline;
2003  	if (deadline == EndOfAllTime) {
2004  		return;
2005  	}
2006  
2007  	call->tc_pqlink.deadline = EndOfAllTime;
2008  #if TIMER_TRACE
2009  	call->tc_entry_time = 0;
2010  #endif /* TIMER_TRACE */
2011  	timer_call_trace_cancel(call);
2012  }
2013  
2014  void
2015  running_timer_cancel(processor_t processor, unsigned int timer)
2016  {
2017  	running_timer_clear(processor, timer);
2018  	running_timers_sync();
2019  }
2020  
2021  uint64_t
2022  running_timers_deadline(processor_t processor)
2023  {
2024  	if (!processor->running_timers_active) {
2025  		return EndOfAllTime;
2026  	}
2027  
2028  	uint64_t deadline = EndOfAllTime;
2029  	for (int i = 0; i < RUNNING_TIMER_MAX; i++) {
2030  		uint64_t candidate =
2031  		    processor->running_timers[i].tc_pqlink.deadline;
2032  		if (candidate != 0 && candidate < deadline) {
2033  			deadline = candidate;
2034  		}
2035  	}
2036  
2037  	return deadline;
2038  }
2039  
2040  void
2041  running_timers_activate(processor_t processor)
2042  {
2043  	processor->running_timers_active = true;
2044  	running_timers_sync();
2045  }
2046  
2047  void
2048  running_timers_deactivate(processor_t processor)
2049  {
2050  	assert(processor->running_timers_active == true);
2051  	processor->running_timers_active = false;
2052  	running_timers_sync();
2053  }