cpu_quiesce.c
/*
 * Copyright (c) 2018 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#ifdef __x86_64__
#error This file is only needed on weakly-ordered systems!
#endif

#include <machine/atomic.h>
#include <machine/commpage.h>
#include <machine/machine_cpu.h>

#include <kern/sched_prim.h>
#include <kern/percpu.h>
#include <kern/ast.h>

#include <kern/cpu_quiesce.h>

/*
 * CPU quiescing generation counter implemented with a checkin mask
 *
 * A tri-state bitfield, with 2 bits for each processor:
 * 1) 'checkin' bit, saying this processor has 'checked in', i.e. executed the acqrel barrier
 * 2) 'expected' bit, saying this processor is expected to check in, i.e. not idle.
 *
 * When a processor causes the 'expected' bits to equal the 'checkin' bits, which
 * indicates that all processors have executed the barrier, it ticks the algorithm
 * and resets the state.
 *
 * Idle CPUs won't check in, because they don't run, so the algorithm won't tick.
 * However, they can't do anything in userspace while idle, so we don't need
 * them to execute barriers, so we have them 'leave' the counter so that
 * they don't delay the tick while idle.
 *
 * This bitfield currently limits MAX_CPUS to 32 on LP64.
 * In the future, we can use double-wide atomics and int128 if we need 64 CPUs.
 *
 * The mask only guarantees ordering to code running in userspace.
 * We defer joining the counter until we actually reach userspace, allowing
 * processors that come out of idle and only run kernel code to avoid the overhead
 * of participation.
 *
 * We additionally defer updating the counter for a minimum interval to
 * reduce the frequency of executing the exclusive atomic operations.
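 *
 * For illustration, on a hypothetical 2-CPU system (bits 0/1 are CPU0's
 * checkin/expected bits, bits 2/3 are CPU1's): with both CPUs joined and
 * not yet checked in, the mask is 0b1010. CPU0 checking in makes it
 * 0b1011. If CPU1 then goes idle and leaves, removing its two bits yields
 * 0b0011, where the checkin bits match the expected bits, so CPU1 itself
 * commits the tick on its way out and the reset leaves 0b0010.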
 *
 * The longest delay between two checkins assuming that at least one processor
 * joins is <checkin delay> + (<thread quantum> * 2).
 */

typedef unsigned long checkin_mask_t;

static _Atomic checkin_mask_t cpu_quiescing_checkin_state;

static uint64_t cpu_checkin_last_commit;

struct cpu_quiesce {
	cpu_quiescent_state_t	state;
	uint64_t		last_checkin;
};

static struct cpu_quiesce PERCPU_DATA(cpu_quiesce);

#define CPU_CHECKIN_MIN_INTERVAL_US	4000	/* 4ms */
#define CPU_CHECKIN_MIN_INTERVAL_MAX_US	USEC_PER_SEC	/* 1s */
static uint64_t cpu_checkin_min_interval;
static uint32_t cpu_checkin_min_interval_us;

#if __LP64__
#define CPU_CHECKIN_MASK_MAX_CPUS	32
#define CPU_CHECKIN_MASK		0x5555555555555555UL
#define CPU_EXPECTED_MASK		(~CPU_CHECKIN_MASK)
#else
/* Avoid double-wide CAS on 32-bit platforms by using a 32-bit state and mask */
#define CPU_CHECKIN_MASK_MAX_CPUS	16
#define CPU_CHECKIN_MASK		0x55555555UL
#define CPU_EXPECTED_MASK		(~CPU_CHECKIN_MASK)
#endif

static_assert(MAX_CPUS <= CPU_CHECKIN_MASK_MAX_CPUS);
static_assert(CPU_CHECKIN_MASK == CPU_EXPECTED_MASK >> 1);

static inline checkin_mask_t
cpu_checked_in_bit(int cpuid)
{
	return 1UL << (2 * cpuid);
}

static inline checkin_mask_t
cpu_expected_bit(int cpuid)
{
	return 1UL << (2 * cpuid + 1);
}

void
cpu_quiescent_counter_init(void)
{
	assert(CPU_CHECKIN_MASK & cpu_checked_in_bit(MAX_CPUS - 1));
	assert(CPU_EXPECTED_MASK & cpu_expected_bit(MAX_CPUS - 1));
	assert((CPU_CHECKIN_MASK & cpu_expected_bit(MAX_CPUS - 1)) == 0);
	assert((CPU_EXPECTED_MASK & cpu_checked_in_bit(MAX_CPUS - 1)) == 0);

	cpu_quiescent_counter_set_min_interval_us(CPU_CHECKIN_MIN_INTERVAL_US);
}

void
cpu_quiescent_counter_set_min_interval_us(uint32_t new_value_us)
{
	/* clamp to something vaguely sane */
	if (new_value_us > CPU_CHECKIN_MIN_INTERVAL_MAX_US) {
		new_value_us = CPU_CHECKIN_MIN_INTERVAL_MAX_US;
	}

	cpu_checkin_min_interval_us = new_value_us;

	uint64_t abstime = 0;
	clock_interval_to_absolutetime_interval(cpu_checkin_min_interval_us,
	    NSEC_PER_USEC, &abstime);
	cpu_checkin_min_interval = abstime;
}

uint32_t
cpu_quiescent_counter_get_min_interval_us(void)
{
	return cpu_checkin_min_interval_us;
}

/*
 * Called when all running CPUs have checked in.
 *
 * The commpage increment is protected by the 'lock' of having caused the tick,
 * and it is published by the state reset release barrier.
 */
static void
cpu_quiescent_counter_commit(uint64_t ctime)
{
	__kdebug_only uint64_t old_gen;
	__kdebug_only checkin_mask_t old_state;

	old_gen = commpage_increment_cpu_quiescent_counter();

	cpu_checkin_last_commit = ctime;

	old_state = os_atomic_andnot(&cpu_quiescing_checkin_state, CPU_CHECKIN_MASK, release);

	KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_QUIESCENT_COUNTER), old_gen, old_state, ctime, 0);
}

/*
 * Have all the expected CPUs checked in?
 */
static bool
cpu_quiescent_counter_needs_commit(checkin_mask_t state)
{
	return (state & CPU_CHECKIN_MASK) == ((state & CPU_EXPECTED_MASK) >> 1);
}
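
/*
 * Worked example of the check above, assuming a hypothetical 2-CPU mask:
 * with both CPUs expected and checked in, state == 0b1111, so
 * (state & CPU_CHECKIN_MASK) == 0b0101 and
 * ((state & CPU_EXPECTED_MASK) >> 1) == 0b0101, and a commit is due.
 * If CPU1 has not checked in yet (state == 0b1011), the comparison is
 * 0b0001 against 0b0101 and no commit happens.
 */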

/*
 * Called when a processor wants to start participating in the counter, e.g.
 * 1) when context switching away from the idle thread
 * 2) when coming up for the first time
 * 3) when coming up after a shutdown
 *
 * Called with interrupts disabled.
 */
void
cpu_quiescent_counter_join(__unused uint64_t ctime)
{
	struct cpu_quiesce *st = PERCPU_GET(cpu_quiesce);
	__assert_only int cpuid = cpu_number();

	assert(cpuid < MAX_CPUS);
	assert(st->state == CPU_QUIESCE_COUNTER_NONE ||
	    st->state == CPU_QUIESCE_COUNTER_LEFT);

	assert((os_atomic_load(&cpu_quiescing_checkin_state, relaxed) &
	    (cpu_expected_bit(cpuid) | cpu_checked_in_bit(cpuid))) == 0);

	st->state = CPU_QUIESCE_COUNTER_PENDING_JOIN;

	/*
	 * Mark the processor to call cpu_quiescent_counter_ast before it
	 * ever returns to userspace.
	 */
	ast_on(AST_UNQUIESCE);
}

/*
 * Called with interrupts disabled from the userspace boundary at the AST_UNQUIESCE callback.
 * It needs to acquire the counter to see data and the counter published by other CPUs.
 */
void
cpu_quiescent_counter_ast(void)
{
	struct cpu_quiesce *st = PERCPU_GET(cpu_quiesce);
	int cpuid = cpu_number();

	assert(st->state == CPU_QUIESCE_COUNTER_PENDING_JOIN);

	/* We had better not already be joined. */
	assert((os_atomic_load(&cpu_quiescing_checkin_state, relaxed) &
	    (cpu_expected_bit(cpuid) | cpu_checked_in_bit(cpuid))) == 0);

	/*
	 * No release barrier needed because we have no prior state to publish.
	 * Acquire barrier needed because we need this processor to see
	 * the latest counter value.
	 *
	 * The state may be in 'needs checkin' both before and after
	 * this atomic or.
	 *
	 * Additionally, if this is the first processor to come out of idle,
	 * it may need to kickstart the algorithm, otherwise it would
	 * stay in 'needs commit' perpetually with no processor assigned to
	 * actually do the commit. To do that, the first processor only adds
	 * its expected bit.
	 */

	st->state = CPU_QUIESCE_COUNTER_JOINED;
	st->last_checkin = mach_absolute_time();

	checkin_mask_t old_mask, new_mask;
	os_atomic_rmw_loop(&cpu_quiescing_checkin_state, old_mask, new_mask, acquire, {
		if (old_mask == 0) {
		        new_mask = old_mask | cpu_expected_bit(cpuid);
		} else {
		        new_mask = old_mask | cpu_expected_bit(cpuid) | cpu_checked_in_bit(cpuid);
		}
	});
}
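
/*
 * Kickstart walk-through for the rmw loop above, assuming CPU0 is the
 * first processor out of a fully idle system (mask == 0): joining with
 * both bits would produce 0b11, which already satisfies
 * cpu_quiescent_counter_needs_commit(), so no later checkin would make
 * the transition and no CPU would own the commit. Joining with only the
 * expected bit produces 0b10, and CPU0's next checkin performs the
 * 0b10 -> 0b11 transition and commits the tick itself.
 */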

/*
 * Called when a processor no longer wants to participate in the counter,
 * i.e. when a processor is on its way to idle or shutdown.
 *
 * Called with interrupts disabled.
 *
 * The processor needs to remove itself from the expected mask, to allow the
 * algorithm to continue ticking without its participation.
 * However, it needs to ensure that anything it has done since the last time
 * it checked in has been published before the next tick is allowed to commit.
 */
void
cpu_quiescent_counter_leave(uint64_t ctime)
{
	struct cpu_quiesce *st = PERCPU_GET(cpu_quiesce);
	int cpuid = cpu_number();

	assert(st->state == CPU_QUIESCE_COUNTER_JOINED ||
	    st->state == CPU_QUIESCE_COUNTER_PENDING_JOIN);

	/* We no longer need the cpu_quiescent_counter_ast callback to be armed. */
	ast_off(AST_UNQUIESCE);

	if (st->state == CPU_QUIESCE_COUNTER_PENDING_JOIN) {
		/* We never actually joined, so we don't have to do the work to leave. */
		st->state = CPU_QUIESCE_COUNTER_LEFT;
		return;
	}

	/* Leaving can't be deferred, even if we're within the min interval. */
	st->last_checkin = ctime;

	checkin_mask_t mask = cpu_checked_in_bit(cpuid) | cpu_expected_bit(cpuid);

	checkin_mask_t orig_state = os_atomic_andnot_orig(&cpu_quiescing_checkin_state,
	    mask, acq_rel);

	assert((orig_state & cpu_expected_bit(cpuid)));

	st->state = CPU_QUIESCE_COUNTER_LEFT;

	if (cpu_quiescent_counter_needs_commit(orig_state)) {
		/*
		 * The old state indicates someone else was already doing a commit
		 * but hadn't finished yet. We successfully inserted the acq_rel
		 * before they finished the commit by resetting the bitfield,
		 * so we're done here.
		 */
		return;
	}

	checkin_mask_t new_state = orig_state & ~mask;

	if (cpu_quiescent_counter_needs_commit(new_state)) {
		cpu_quiescent_counter_commit(ctime);
	}
}

/*
 * Called when a processor wants to check in to the counter.
 * If it hasn't yet fully joined, it doesn't need to check in.
 *
 * Called with interrupts disabled.
 */
void
cpu_quiescent_counter_checkin(uint64_t ctime)
{
	struct cpu_quiesce *st = PERCPU_GET(cpu_quiesce);
	int cpuid = cpu_number();

	assert(st->state != CPU_QUIESCE_COUNTER_NONE);

	/* If we're not joined yet, we don't need to check in. */
	if (__probable(st->state != CPU_QUIESCE_COUNTER_JOINED)) {
		return;
	}

	/* If we've checked in recently, we don't need to check in yet. */
	if (__probable((ctime - st->last_checkin) <= cpu_checkin_min_interval)) {
		return;
	}

	st->last_checkin = ctime;

	checkin_mask_t state = os_atomic_load(&cpu_quiescing_checkin_state, relaxed);

	assert((state & cpu_expected_bit(cpuid)));

	if (__probable((state & cpu_checked_in_bit(cpuid)))) {
		/*
		 * Processor has already checked in for this round, no need to
		 * acquire the cacheline exclusive.
		 */
		return;
	}

	checkin_mask_t orig_state = os_atomic_or_orig(&cpu_quiescing_checkin_state,
	    cpu_checked_in_bit(cpuid), acq_rel);

	checkin_mask_t new_state = orig_state | cpu_checked_in_bit(cpuid);

	if (cpu_quiescent_counter_needs_commit(new_state)) {
		assertf(!cpu_quiescent_counter_needs_commit(orig_state),
		    "old: 0x%lx, new: 0x%lx", orig_state, new_state);
		cpu_quiescent_counter_commit(ctime);
	}
}

#if MACH_ASSERT
/*
 * Called on all AST exits to userspace to assert this processor actually joined.
 *
 * Called with interrupts disabled after the AST should have been handled.
 */
void
cpu_quiescent_counter_assert_ast(void)
{
	struct cpu_quiesce *st = PERCPU_GET(cpu_quiesce);
	int cpuid = cpu_number();

	assert(st->state == CPU_QUIESCE_COUNTER_JOINED);

	checkin_mask_t state = os_atomic_load(&cpu_quiescing_checkin_state, relaxed);
	assert((state & cpu_expected_bit(cpuid)));
}
#endif /* MACH_ASSERT */
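
/*
 * Minimal usage sketch (illustrative only; the reader below is a
 * hypothetical helper, not a real commpage API): a consumer that wants
 * "every non-idle CPU has executed a barrier since I started" can watch
 * the published generation. Because an observer may start mid-round,
 * waiting for two increments is the conservative choice; a single
 * increment only covers checkins that began after the previous reset.
 *
 *	uint64_t start = hypothetical_read_quiescent_generation();
 *	while (hypothetical_read_quiescent_generation() < start + 2) {
 *		;	// two ticks guarantee a full round began after 'start'
 *	}
 */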