/*
 * Copyright (c) 2018 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#ifdef __x86_64__
#error This file is only needed on weakly-ordered systems!
#endif

#include <machine/atomic.h>
#include <machine/commpage.h>
#include <machine/machine_cpu.h>

#include <kern/sched_prim.h>
#include <kern/percpu.h>
#include <kern/ast.h>

#include <kern/cpu_quiesce.h>
/*
 * CPU quiescing generation counter implemented with a checkin mask
 *
 * A tri-state bitfield, with 2 bits for each processor:
 * 1) 'checkin' bit, saying this processor has 'checked in', i.e. executed the acqrel barrier
 * 2) 'expected' bit, saying this processor is expected to check in, i.e. not idle.
 *
 * When a processor causes the 'expected' bits to equal the 'checkin' bits, which
 * indicates that all processors have executed the barrier, it ticks the algorithm
 * and resets the state.
 *
 * Idle CPUs won't check in, because they don't run, so the algorithm won't tick.
 * However, they can't do anything in userspace while idle, so we don't need
 * them to execute barriers, so we have them 'leave' the counter so that
 * they don't delay the tick while idle.
 *
 * This bitfield currently limits MAX_CPUS to 32 on LP64.
 * In the future, we can use double-wide atomics and int128 if we need 64 CPUs.
 *
 * The mask only guarantees ordering to code running in userspace.
 * We defer joining the counter until we actually reach userspace, allowing
 * processors that come out of idle and only run kernel code to avoid the overhead
 * of participation.
 *
 * We additionally defer updating the counter for a minimum interval to
 * reduce the frequency of executing the exclusive atomic operations.
 *
 * The longest delay between two checkins, assuming that at least one processor
 * joins, is <checkin delay> + (<thread quantum> * 2).
 */
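
/*
 * Worked example with two participating CPUs:
 *   both expected, none checked in:  state == 0xA  (expected bits 1 and 3)
 *   CPU 0 checks in:                 state == 0xB
 *   CPU 1 checks in:                 state == 0xF  -> checkin == expected, tick
 * The tick clears the checkin bits back to 0xA and increments the commpage
 * generation counter.
 */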

typedef unsigned long checkin_mask_t;

static _Atomic checkin_mask_t cpu_quiescing_checkin_state;

static uint64_t cpu_checkin_last_commit;

struct cpu_quiesce {
	cpu_quiescent_state_t   state;
	uint64_t                last_checkin;
};

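/*
 * Per-processor participation states (see the functions below):
 *   NONE/LEFT --join()--> PENDING_JOIN --ast()--> JOINED --leave()--> LEFT
 * A processor that goes idle while still PENDING_JOIN never touched the
 * shared mask, so leave() moves it straight back to LEFT without an atomic.
 */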
static struct cpu_quiesce PERCPU_DATA(cpu_quiesce);

#define CPU_CHECKIN_MIN_INTERVAL_US     4000 /* 4ms */
#define CPU_CHECKIN_MIN_INTERVAL_MAX_US USEC_PER_SEC /* 1s */
static uint64_t cpu_checkin_min_interval;
static uint32_t cpu_checkin_min_interval_us;

#if __LP64__
#define CPU_CHECKIN_MASK_MAX_CPUS 32
#define CPU_CHECKIN_MASK          0x5555555555555555UL
#define CPU_EXPECTED_MASK         (~CPU_CHECKIN_MASK)
#else
/* Avoid double-wide CAS on 32-bit platforms by using a 32-bit state and mask */
#define CPU_CHECKIN_MASK_MAX_CPUS 16
#define CPU_CHECKIN_MASK          0x55555555UL
#define CPU_EXPECTED_MASK         (~CPU_CHECKIN_MASK)
#endif

static_assert(MAX_CPUS <= CPU_CHECKIN_MASK_MAX_CPUS);
static_assert(CPU_CHECKIN_MASK == CPU_EXPECTED_MASK >> 1);

static inline checkin_mask_t
cpu_checked_in_bit(int cpuid)
{
	return 1UL << (2 * cpuid);
}

static inline checkin_mask_t
cpu_expected_bit(int cpuid)
{
	return 1UL << (2 * cpuid + 1);
}
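
/*
 * Example: for cpuid 2, cpu_checked_in_bit(2) is 1UL << 4 (0x10) and
 * cpu_expected_bit(2) is 1UL << 5 (0x20).  A state word of 0x30 means
 * CPU 2 is expected and has checked in; 0x20 means it is expected but
 * still owes its barrier for this generation.
 */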

void
cpu_quiescent_counter_init(void)
{
	assert(CPU_CHECKIN_MASK & cpu_checked_in_bit(MAX_CPUS - 1));
	assert(CPU_EXPECTED_MASK & cpu_expected_bit(MAX_CPUS - 1));
	assert((CPU_CHECKIN_MASK & cpu_expected_bit(MAX_CPUS - 1)) == 0);
	assert((CPU_EXPECTED_MASK & cpu_checked_in_bit(MAX_CPUS - 1)) == 0);

	cpu_quiescent_counter_set_min_interval_us(CPU_CHECKIN_MIN_INTERVAL_US);
}

void
cpu_quiescent_counter_set_min_interval_us(uint32_t new_value_us)
{
	/* clamp to something vaguely sane */
	if (new_value_us > CPU_CHECKIN_MIN_INTERVAL_MAX_US) {
		new_value_us = CPU_CHECKIN_MIN_INTERVAL_MAX_US;
	}

	cpu_checkin_min_interval_us = new_value_us;

	uint64_t abstime = 0;
	clock_interval_to_absolutetime_interval(cpu_checkin_min_interval_us,
	    NSEC_PER_USEC, &abstime);
	cpu_checkin_min_interval = abstime;
}

uint32_t
cpu_quiescent_counter_get_min_interval_us(void)
{
	return cpu_checkin_min_interval_us;
}


/*
 * Called when all running CPUs have checked in.
 *
 * The commpage increment is protected by the 'lock' of having caused the tick,
 * and it is published by the state reset release barrier.
 */
static void
cpu_quiescent_counter_commit(uint64_t ctime)
{
	__kdebug_only uint64_t          old_gen;
	__kdebug_only checkin_mask_t    old_state;

	old_gen = commpage_increment_cpu_quiescent_counter();

	cpu_checkin_last_commit = ctime;

	old_state = os_atomic_andnot(&cpu_quiescing_checkin_state, CPU_CHECKIN_MASK, release);

	KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_QUIESCENT_COUNTER), old_gen, old_state, ctime, 0);
}

/*
 * Have all the expected CPUs checked in?
 */
static bool
cpu_quiescent_counter_needs_commit(checkin_mask_t state)
{
	return (state & CPU_CHECKIN_MASK) == ((state & CPU_EXPECTED_MASK) >> 1);
}

/*
 * Called when a processor wants to start participating in the counter, e.g.
 * 1) when context switching away from the idle thread
 * 2) when coming up for the first time
 * 3) when coming up after a shutdown
 *
 * Called with interrupts disabled.
 */
void
cpu_quiescent_counter_join(__unused uint64_t ctime)
{
	struct cpu_quiesce *st = PERCPU_GET(cpu_quiesce);
	__assert_only int cpuid = cpu_number();

	assert(cpuid < MAX_CPUS);
	assert(st->state == CPU_QUIESCE_COUNTER_NONE ||
	    st->state == CPU_QUIESCE_COUNTER_LEFT);

	assert((os_atomic_load(&cpu_quiescing_checkin_state, relaxed) &
	    (cpu_expected_bit(cpuid) | cpu_checked_in_bit(cpuid))) == 0);

	st->state = CPU_QUIESCE_COUNTER_PENDING_JOIN;

	/*
	 * Mark the processor to call cpu_quiescent_counter_ast before it
	 * ever returns to userspace.
	 */
	ast_on(AST_UNQUIESCE);
}

/*
 * Called with interrupts disabled from the userspace boundary at the AST_UNQUIESCE callback.
 * It needs to acquire the counter so that it sees the data and counter value published by other CPUs.
 */
void
cpu_quiescent_counter_ast(void)
{
	struct cpu_quiesce *st = PERCPU_GET(cpu_quiesce);
	int cpuid = cpu_number();

	assert(st->state == CPU_QUIESCE_COUNTER_PENDING_JOIN);

	/* We had better not already be joined. */
	assert((os_atomic_load(&cpu_quiescing_checkin_state, relaxed) &
	    (cpu_expected_bit(cpuid) | cpu_checked_in_bit(cpuid))) == 0);

	/*
	 * No release barrier needed because we have no prior state to publish.
	 * Acquire barrier needed because we need this processor to see
	 * the latest counter value.
	 *
	 * The state may be in 'needs checkin' both before and after
	 * this atomic or.
	 *
	 * Additionally, if this is the first processor to come out of idle,
	 * it may need to kickstart the algorithm, otherwise it would
	 * stay in 'needs commit' perpetually with no processor assigned to
	 * actually do the commit.  To do that, the first processor only adds
	 * its expected bit.
	 */

	st->state = CPU_QUIESCE_COUNTER_JOINED;
	st->last_checkin = mach_absolute_time();

	checkin_mask_t old_mask, new_mask;
	os_atomic_rmw_loop(&cpu_quiescing_checkin_state, old_mask, new_mask, acquire, {
		if (old_mask == 0) {
		        new_mask = old_mask | cpu_expected_bit(cpuid);
		} else {
		        new_mask = old_mask | cpu_expected_bit(cpuid) | cpu_checked_in_bit(cpuid);
		}
	});
}

/*
 * Called when a processor no longer wants to participate in the counter,
 * i.e. when a processor is on its way to idle or shutdown.
 *
 * Called with interrupts disabled.
 *
 * The processor needs to remove itself from the expected mask, to allow the
 * algorithm to continue ticking without its participation.
 * However, it needs to ensure that anything it has done since the last time
 * it checked in has been published before the next tick is allowed to commit.
 */
void
cpu_quiescent_counter_leave(uint64_t ctime)
{
	struct cpu_quiesce *st = PERCPU_GET(cpu_quiesce);
	int cpuid = cpu_number();

	assert(st->state == CPU_QUIESCE_COUNTER_JOINED ||
	    st->state == CPU_QUIESCE_COUNTER_PENDING_JOIN);

	/* We no longer need the cpu_quiescent_counter_ast callback to be armed */
	ast_off(AST_UNQUIESCE);

	if (st->state == CPU_QUIESCE_COUNTER_PENDING_JOIN) {
		/* We never actually joined, so we don't have to do the work to leave. */
		st->state = CPU_QUIESCE_COUNTER_LEFT;
		return;
	}

	/* Leaving can't be deferred, even if we're within the min interval */
	st->last_checkin = ctime;

	checkin_mask_t mask = cpu_checked_in_bit(cpuid) | cpu_expected_bit(cpuid);

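	/*
	 * Clear both bits with at least release semantics so that everything
	 * this processor has done since its last checkin is published before
	 * the next tick is allowed to commit (see the block comment above).
	 */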
	checkin_mask_t orig_state = os_atomic_andnot_orig(&cpu_quiescing_checkin_state,
	    mask, acq_rel);

	assert((orig_state & cpu_expected_bit(cpuid)));

	st->state = CPU_QUIESCE_COUNTER_LEFT;

	if (cpu_quiescent_counter_needs_commit(orig_state)) {
		/*
		 * the old state indicates someone else was already doing a commit
		 * but hadn't finished yet.  We successfully inserted the acq_rel
		 * before they finished the commit by resetting the bitfield,
		 * so we're done here.
		 */
		return;
	}

	checkin_mask_t new_state = orig_state & ~mask;

	if (cpu_quiescent_counter_needs_commit(new_state)) {
		cpu_quiescent_counter_commit(ctime);
	}
}

/*
 * Called when a processor wants to check in to the counter.
 * If it hasn't yet fully joined, it doesn't need to check in.
 *
 * Called with interrupts disabled.
 */
void
cpu_quiescent_counter_checkin(uint64_t ctime)
{
	struct cpu_quiesce *st = PERCPU_GET(cpu_quiesce);
	int cpuid = cpu_number();

	assert(st->state != CPU_QUIESCE_COUNTER_NONE);

	/* If we're not joined yet, we don't need to check in */
	if (__probable(st->state != CPU_QUIESCE_COUNTER_JOINED)) {
		return;
	}

	/* If we've checked in recently, we don't need to check in yet. */
	if (__probable((ctime - st->last_checkin) <= cpu_checkin_min_interval)) {
		return;
	}

	st->last_checkin = ctime;

	checkin_mask_t state = os_atomic_load(&cpu_quiescing_checkin_state, relaxed);

	assert((state & cpu_expected_bit(cpuid)));

	if (__probable((state & cpu_checked_in_bit(cpuid)))) {
		/*
		 * Processor has already checked in for this round, no need to
		 * acquire the cacheline exclusive.
		 */
		return;
	}

	checkin_mask_t orig_state = os_atomic_or_orig(&cpu_quiescing_checkin_state,
	    cpu_checked_in_bit(cpuid), acq_rel);

	checkin_mask_t new_state = orig_state | cpu_checked_in_bit(cpuid);

	if (cpu_quiescent_counter_needs_commit(new_state)) {
		assertf(!cpu_quiescent_counter_needs_commit(orig_state),
		    "old: 0x%lx, new: 0x%lx", orig_state, new_state);
		cpu_quiescent_counter_commit(ctime);
	}
}

#if MACH_ASSERT
/*
 * Called on all AST exits to userspace to assert this processor actually joined
 *
 * Called with interrupts disabled after the AST should have been handled
 */
void
cpu_quiescent_counter_assert_ast(void)
{
	struct cpu_quiesce *st = PERCPU_GET(cpu_quiesce);
	int cpuid = cpu_number();

	assert(st->state == CPU_QUIESCE_COUNTER_JOINED);

	checkin_mask_t state = os_atomic_load(&cpu_quiescing_checkin_state, relaxed);
	assert((state & cpu_expected_bit(cpuid)));
}
#endif /* MACH_ASSERT */