sfi.c
/*
 * Copyright (c) 2013 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
#include <mach/mach_types.h>
#include <kern/assert.h>
#include <kern/clock.h>
#include <kern/coalition.h>
#include <kern/debug.h>
#include <kern/startup.h>
#include <kern/host.h>
#include <kern/kern_types.h>
#include <kern/machine.h>
#include <kern/simple_lock.h>
#include <kern/misc_protos.h>
#include <kern/sched.h>
#include <kern/sched_prim.h>
#include <kern/sfi.h>
#include <kern/timer_call.h>
#include <kern/waitq.h>
#include <kern/ledger.h>
#include <kern/policy_internal.h>

#include <machine/atomic.h>

#include <pexpert/pexpert.h>

#include <libkern/kernel_mach_header.h>

#include <sys/kdebug.h>

#if CONFIG_SCHED_SFI

#define SFI_DEBUG 0

#if SFI_DEBUG
#define dprintf(...) kprintf(__VA_ARGS__)
#else
#define dprintf(...) do { } while(0)
#endif

/*
 * SFI (Selective Forced Idle) operates by enabling a global
 * timer on the SFI window interval. When it fires, all processors
 * running a thread that should be SFI-ed are sent an AST.
 * As threads become runnable while in their "off phase", they
 * are placed on a deferred ready queue. When a per-class
 * "on timer" fires, the ready threads for that class are
 * re-enqueued for running. As an optimization to avoid spurious
 * wakeups, the timer may be lazily programmed.
 */
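
/*
 * Illustrative timeline (not normative; the 10ms/2ms values are examples
 * only, chosen to make the phases concrete):
 *
 *   global "off" timer fires       per-class "on" timer fires
 *   v                              v
 *   |<---- off phase (2ms) ------->|<-------- on phase (8ms) -------->|
 *   | class threads that become    | deferred threads re-enqueued     |
 *   | runnable wait on the class   | and scheduled normally           |
 *   | waitq                        |                                  |
 *   |<------------------- SFI window (10ms) ------------------------->|
 */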

/*
 * The "sfi_lock" simple lock guards access to static configuration
 * parameters (as specified by userspace), dynamic state changes
 * (as updated by the timer event routine), and timer data structures.
 * Since it can be taken with interrupts disabled in some cases, all
 * uses should be taken with interrupts disabled at splsched(). The
 * "sfi_lock" also guards the "sfi_wait_class" field of thread_t, and
 * must only be accessed with it held.
 *
 * When an "on timer" fires, we must deterministically be able to drain
 * the wait queue, since if any threads are added to the queue afterwards,
 * they may never get woken out of SFI wait. So sfi_lock must be
 * taken before the wait queue's own spinlock.
 *
 * The wait queue will take the thread's scheduling lock. We may also take
 * the thread_lock directly to update the "sfi_class" field and determine
 * if the thread should block in the wait queue, but the lock will be
 * released before doing so.
 *
 * The pset lock may also be taken, but not while any other locks are held.
 *
 * The task and thread mutex may also be held while reevaluating sfi state.
 *
 * splsched ---> sfi_lock ---> waitq ---> thread_lock
 *        \  \              \__ thread_lock (*)
 *         \  \__ pset_lock
 *          \
 *           \__ thread_lock
 */

decl_simple_lock_data(static, sfi_lock);
static timer_call_data_t sfi_timer_call_entry;
volatile boolean_t sfi_is_enabled;

boolean_t sfi_window_is_set;
uint64_t sfi_window_usecs;
uint64_t sfi_window_interval;
uint64_t sfi_next_off_deadline;

typedef struct {
	sfi_class_id_t    class_id;
	thread_continue_t class_continuation;
	const char        *class_name;
	const char        *class_ledger_name;
} sfi_class_registration_t;

/*
 * To add a new SFI class:
 *
 * 1) Raise MAX_SFI_CLASS_ID in mach/sfi_class.h
 * 2) Add a #define for it to mach/sfi_class.h. It need not be inserted in order of restrictiveness.
 * 3) Add a call to SFI_CLASS_REGISTER below
 * 4) Augment sfi_thread_classify to categorize threads as early and as restrictively as possible.
 * 5) Modify thermald to use the SFI class
 */

static inline void _sfi_wait_cleanup(void);

static void sfi_class_register(sfi_class_registration_t *);

#define SFI_CLASS_REGISTER(clsid, ledger_name)                                  \
	                                                                        \
	static void __attribute__((noinline, noreturn))                         \
	SFI_ ## clsid ## _THREAD_IS_WAITING(void *arg __unused, wait_result_t wret __unused) \
	{                                                                       \
	        _sfi_wait_cleanup();                                            \
	        thread_exception_return();                                      \
	}                                                                       \
	                                                                        \
	static_assert(SFI_CLASS_ ## clsid < MAX_SFI_CLASS_ID, "Invalid ID");    \
	                                                                        \
	static __startup_data sfi_class_registration_t                          \
	SFI_ ## clsid ## _registration = {                                      \
	        .class_id = SFI_CLASS_ ## clsid,                                \
	        .class_continuation = SFI_ ## clsid ## _THREAD_IS_WAITING,      \
	        .class_name = "SFI_CLASS_" # clsid,                             \
	        .class_ledger_name = "SFI_CLASS_" # ledger_name,                \
	};                                                                      \
	STARTUP_ARG(TUNABLES, STARTUP_RANK_MIDDLE,                              \
	    sfi_class_register, &SFI_ ## clsid ## _registration)

/* SFI_CLASS_UNSPECIFIED not included here */
SFI_CLASS_REGISTER(MAINTENANCE, MAINTENANCE);
SFI_CLASS_REGISTER(DARWIN_BG, DARWIN_BG);
SFI_CLASS_REGISTER(APP_NAP, APP_NAP);
SFI_CLASS_REGISTER(MANAGED_FOCAL, MANAGED);
SFI_CLASS_REGISTER(MANAGED_NONFOCAL, MANAGED);
SFI_CLASS_REGISTER(UTILITY, UTILITY);
SFI_CLASS_REGISTER(DEFAULT_FOCAL, DEFAULT);
SFI_CLASS_REGISTER(DEFAULT_NONFOCAL, DEFAULT);
SFI_CLASS_REGISTER(LEGACY_FOCAL, LEGACY);
SFI_CLASS_REGISTER(LEGACY_NONFOCAL, LEGACY);
SFI_CLASS_REGISTER(USER_INITIATED_FOCAL, USER_INITIATED);
SFI_CLASS_REGISTER(USER_INITIATED_NONFOCAL, USER_INITIATED);
SFI_CLASS_REGISTER(USER_INTERACTIVE_FOCAL, USER_INTERACTIVE);
SFI_CLASS_REGISTER(USER_INTERACTIVE_NONFOCAL, USER_INTERACTIVE);
SFI_CLASS_REGISTER(KERNEL, OPTED_OUT);
SFI_CLASS_REGISTER(OPTED_OUT, OPTED_OUT);
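
/*
 * For reference, a mechanical expansion of the macro above (illustrative):
 * SFI_CLASS_REGISTER(UTILITY, UTILITY) defines a continuation named
 * SFI_UTILITY_THREAD_IS_WAITING, which runs _sfi_wait_cleanup() and then
 * thread_exception_return()s to userspace when a thread is woken out of SFI
 * wait, and registers a sfi_class_registration_t with .class_name
 * "SFI_CLASS_UTILITY" and .class_ledger_name "SFI_CLASS_UTILITY" at startup
 * via STARTUP_ARG().
 */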

struct sfi_class_state {
	uint64_t           off_time_usecs;
	uint64_t           off_time_interval;

	timer_call_data_t  on_timer;
	uint64_t           on_timer_deadline;
	boolean_t          on_timer_programmed;

	boolean_t          class_sfi_is_enabled;
	volatile boolean_t class_in_on_phase;

	struct waitq       waitq;           /* threads in ready state */
	thread_continue_t  continuation;

	const char         *class_name;
	const char         *class_ledger_name;
};

/* Static configuration performed in sfi_early_init() */
struct sfi_class_state sfi_classes[MAX_SFI_CLASS_ID];

int sfi_enabled_class_count; // protected by sfi_lock and used atomically

static void sfi_timer_global_off(
	timer_call_param_t param0,
	timer_call_param_t param1);

static void sfi_timer_per_class_on(
	timer_call_param_t param0,
	timer_call_param_t param1);

/* Called early in boot, when kernel is single-threaded */
__startup_func
static void
sfi_class_register(sfi_class_registration_t *reg)
{
	sfi_class_id_t class_id = reg->class_id;

	if (class_id >= MAX_SFI_CLASS_ID) {
		panic("Invalid SFI class 0x%x", class_id);
	}
	if (sfi_classes[class_id].continuation != NULL) {
		panic("Duplicate SFI registration for class 0x%x", class_id);
	}
	sfi_classes[class_id].class_sfi_is_enabled = FALSE;
	sfi_classes[class_id].class_in_on_phase = TRUE;
	sfi_classes[class_id].continuation = reg->class_continuation;
	sfi_classes[class_id].class_name = reg->class_name;
	sfi_classes[class_id].class_ledger_name = reg->class_ledger_name;
}

void
sfi_init(void)
{
	sfi_class_id_t i;
	kern_return_t kret;

	simple_lock_init(&sfi_lock, 0);
	timer_call_setup(&sfi_timer_call_entry, sfi_timer_global_off, NULL);
	sfi_window_is_set = FALSE;
	os_atomic_init(&sfi_enabled_class_count, 0);
	sfi_is_enabled = FALSE;

	for (i = 0; i < MAX_SFI_CLASS_ID; i++) {
		/* If the class was set up in sfi_early_init(), initialize remaining fields */
		if (sfi_classes[i].continuation) {
			timer_call_setup(&sfi_classes[i].on_timer, sfi_timer_per_class_on, (void *)(uintptr_t)i);
			sfi_classes[i].on_timer_programmed = FALSE;

			kret = waitq_init(&sfi_classes[i].waitq, SYNC_POLICY_FIFO | SYNC_POLICY_DISABLE_IRQ);
			assert(kret == KERN_SUCCESS);
		} else {
			/* The only allowed gap is for SFI_CLASS_UNSPECIFIED */
			if (i != SFI_CLASS_UNSPECIFIED) {
				panic("Gap in registered SFI classes");
			}
		}
	}
}

/* Can be called before sfi_init() by task initialization, but after sfi_early_init() */
sfi_class_id_t
sfi_get_ledger_alias_for_class(sfi_class_id_t class_id)
{
	sfi_class_id_t i;
	const char *ledger_name = NULL;

	ledger_name = sfi_classes[class_id].class_ledger_name;

	/* Find the first class in the registration table with this ledger name */
	if (ledger_name) {
		for (i = SFI_CLASS_UNSPECIFIED + 1; i < class_id; i++) {
			if (0 == strcmp(sfi_classes[i].class_ledger_name, ledger_name)) {
				dprintf("sfi_get_ledger_alias_for_class(0x%x) -> 0x%x\n", class_id, i);
				return i;
			}
		}

		/* This class is the primary one for the ledger, so there is no alias */
		dprintf("sfi_get_ledger_alias_for_class(0x%x) -> 0x%x\n", class_id, SFI_CLASS_UNSPECIFIED);
		return SFI_CLASS_UNSPECIFIED;
	}

	/* We are permissive on SFI class lookup failures. In sfi_init(), we assert more */
	return SFI_CLASS_UNSPECIFIED;
}
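
/*
 * Example (assuming the usual mach/sfi_class.h ordering, where the FOCAL
 * variant of a class has the numerically smaller ID): MANAGED_FOCAL and
 * MANAGED_NONFOCAL both register the ledger name "SFI_CLASS_MANAGED", so
 * looking up the later-numbered class returns the earlier one and the two
 * classes share a single ledger entry. A class whose ledger name is unique
 * returns SFI_CLASS_UNSPECIFIED, meaning it has no alias.
 */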

int
sfi_ledger_entry_add(ledger_template_t template, sfi_class_id_t class_id)
{
	const char *ledger_name = NULL;

	ledger_name = sfi_classes[class_id].class_ledger_name;

	dprintf("sfi_ledger_entry_add(%p, 0x%x) -> %s\n", template, class_id, ledger_name);
	return ledger_entry_add(template, ledger_name, "sfi", "MATUs");
}

static void
sfi_timer_global_off(
	timer_call_param_t param0 __unused,
	timer_call_param_t param1 __unused)
{
	uint64_t now = mach_absolute_time();
	sfi_class_id_t i;
	processor_set_t pset, nset;
	processor_t processor;
	uint32_t needs_cause_ast_mask = 0x0;
	spl_t s;

	s = splsched();

	simple_lock(&sfi_lock, LCK_GRP_NULL);
	if (!sfi_is_enabled) {
		/* If SFI has been disabled, let all "on" timers drain naturally */
		KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_OFF_TIMER) | DBG_FUNC_NONE, 1, 0, 0, 0, 0);

		simple_unlock(&sfi_lock);
		splx(s);
		return;
	}

	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_OFF_TIMER) | DBG_FUNC_START, 0, 0, 0, 0, 0);

	/* First set all configured classes into the off state, and program their "on" timer */
	for (i = 0; i < MAX_SFI_CLASS_ID; i++) {
		if (sfi_classes[i].class_sfi_is_enabled) {
			uint64_t on_timer_deadline;

			sfi_classes[i].class_in_on_phase = FALSE;
			sfi_classes[i].on_timer_programmed = TRUE;

			/* Push out on-timer */
			on_timer_deadline = now + sfi_classes[i].off_time_interval;
			sfi_classes[i].on_timer_deadline = on_timer_deadline;

			timer_call_enter1(&sfi_classes[i].on_timer, NULL, on_timer_deadline, TIMER_CALL_SYS_CRITICAL);
		} else {
			/* If this class no longer needs SFI, make sure the timer is cancelled */
			sfi_classes[i].class_in_on_phase = TRUE;
			if (sfi_classes[i].on_timer_programmed) {
				sfi_classes[i].on_timer_programmed = FALSE;
				sfi_classes[i].on_timer_deadline = ~0ULL;
				timer_call_cancel(&sfi_classes[i].on_timer);
			}
		}
	}
	simple_unlock(&sfi_lock);
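
	/*
	 * Note on the traversal below: AST targets are first collected into a
	 * CPU mask while walking the processor list under the pset lock(s);
	 * the cross-CPU notifications are only issued after the pset lock is
	 * dropped, respecting the lock ordering documented at the top of this
	 * file (cause_ast_check() IPIs a remote processor, while AST_SFI is
	 * set directly on the local one).
	 */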

	/* Iterate over processors, call cause_ast_check() on ones running a thread that should be in an off phase */
	processor = processor_list;
	pset = processor->processor_set;

	pset_lock(pset);

	do {
		nset = processor->processor_set;
		if (nset != pset) {
			pset_unlock(pset);
			pset = nset;
			pset_lock(pset);
		}

		/* "processor" and its pset are locked */
		if (processor->state == PROCESSOR_RUNNING) {
			if (AST_NONE != sfi_processor_needs_ast(processor)) {
				needs_cause_ast_mask |= (1U << processor->cpu_id);
			}
		}
	} while ((processor = processor->processor_list) != NULL);

	pset_unlock(pset);

	for (int cpuid = lsb_first(needs_cause_ast_mask); cpuid >= 0; cpuid = lsb_next(needs_cause_ast_mask, cpuid)) {
		processor = processor_array[cpuid];
		if (processor == current_processor()) {
			ast_on(AST_SFI);
		} else {
			cause_ast_check(processor);
		}
	}

	/* Re-arm timer if still enabled */
	simple_lock(&sfi_lock, LCK_GRP_NULL);
	if (sfi_is_enabled) {
		clock_deadline_for_periodic_event(sfi_window_interval,
		    now,
		    &sfi_next_off_deadline);
		timer_call_enter1(&sfi_timer_call_entry,
		    NULL,
		    sfi_next_off_deadline,
		    TIMER_CALL_SYS_CRITICAL);
	}

	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_OFF_TIMER) | DBG_FUNC_END, 0, 0, 0, 0, 0);

	simple_unlock(&sfi_lock);

	splx(s);
}

static void
sfi_timer_per_class_on(
	timer_call_param_t param0,
	timer_call_param_t param1 __unused)
{
	sfi_class_id_t sfi_class_id = (sfi_class_id_t)(uintptr_t)param0;
	struct sfi_class_state *sfi_class = &sfi_classes[sfi_class_id];
	kern_return_t kret;
	spl_t s;

	s = splsched();

	simple_lock(&sfi_lock, LCK_GRP_NULL);

	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_ON_TIMER) | DBG_FUNC_START, sfi_class_id, 0, 0, 0, 0);

	/*
	 * Any threads that may have accumulated in the ready queue for this class should get re-enqueued.
	 * Since we have the sfi_lock held and have changed "class_in_on_phase", we expect
	 * no new threads to be put on this wait queue until the global "off timer" has fired.
	 */

	sfi_class->class_in_on_phase = TRUE;
	sfi_class->on_timer_programmed = FALSE;

	kret = waitq_wakeup64_all(&sfi_class->waitq,
	    CAST_EVENT64_T(sfi_class_id),
	    THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
	assert(kret == KERN_SUCCESS || kret == KERN_NOT_WAITING);

	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_ON_TIMER) | DBG_FUNC_END, 0, 0, 0, 0, 0);

	simple_unlock(&sfi_lock);

	splx(s);
}
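
/*
 * Illustrative configuration (values are examples, not defaults): a window
 * of 10000 usecs with a class off-time of 2500 usecs forces threads of that
 * class to spend roughly 25% of each window in SFI wait. The window must be
 * strictly larger than the largest enabled class off-time, which
 * sfi_set_window() and sfi_set_class_offtime() enforce below.
 */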

kern_return_t
sfi_set_window(uint64_t window_usecs)
{
	uint64_t interval, deadline;
	uint64_t now = mach_absolute_time();
	sfi_class_id_t i;
	spl_t s;
	uint64_t largest_class_off_interval = 0;

	if (window_usecs < MIN_SFI_WINDOW_USEC) {
		window_usecs = MIN_SFI_WINDOW_USEC;
	}

	if (window_usecs > UINT32_MAX) {
		return KERN_INVALID_ARGUMENT;
	}

	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_SET_WINDOW), window_usecs, 0, 0, 0, 0);

	clock_interval_to_absolutetime_interval((uint32_t)window_usecs, NSEC_PER_USEC, &interval);
	deadline = now + interval;

	s = splsched();

	simple_lock(&sfi_lock, LCK_GRP_NULL);

	/* Check that we are not bringing in the SFI window smaller than any class */
	for (i = 0; i < MAX_SFI_CLASS_ID; i++) {
		if (sfi_classes[i].class_sfi_is_enabled) {
			largest_class_off_interval = MAX(largest_class_off_interval, sfi_classes[i].off_time_interval);
		}
	}

	/*
	 * Off window must be strictly greater than all enabled classes,
	 * otherwise threads would build up on ready queue and never be able to run.
	 */
	if (interval <= largest_class_off_interval) {
		simple_unlock(&sfi_lock);
		splx(s);
		return KERN_INVALID_ARGUMENT;
	}

	/*
	 * If the new "off" deadline is further out than the current programmed timer,
	 * just let the current one expire (and the new cadence will be established thereafter).
	 * If the new "off" deadline is nearer than the current one, bring it in, so we
	 * can start the new behavior sooner. Note that this may cause the "off" timer to
	 * fire before some of the class "on" timers have fired.
	 */
	sfi_window_usecs = window_usecs;
	sfi_window_interval = interval;
	sfi_window_is_set = TRUE;

	if (os_atomic_load(&sfi_enabled_class_count, relaxed) == 0) {
		/* Can't program timer yet */
	} else if (!sfi_is_enabled) {
		sfi_is_enabled = TRUE;
		sfi_next_off_deadline = deadline;
		timer_call_enter1(&sfi_timer_call_entry,
		    NULL,
		    sfi_next_off_deadline,
		    TIMER_CALL_SYS_CRITICAL);
	} else if (deadline >= sfi_next_off_deadline) {
		sfi_next_off_deadline = deadline;
	} else {
		sfi_next_off_deadline = deadline;
		timer_call_enter1(&sfi_timer_call_entry,
		    NULL,
		    sfi_next_off_deadline,
		    TIMER_CALL_SYS_CRITICAL);
	}

	simple_unlock(&sfi_lock);
	splx(s);

	return KERN_SUCCESS;
}

kern_return_t
sfi_window_cancel(void)
{
	spl_t s;

	s = splsched();

	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_CANCEL_WINDOW), 0, 0, 0, 0, 0);

	/* Disable globals so that global "off-timer" is not re-armed */
	simple_lock(&sfi_lock, LCK_GRP_NULL);
	sfi_window_is_set = FALSE;
	sfi_window_usecs = 0;
	sfi_window_interval = 0;
	sfi_next_off_deadline = 0;
	sfi_is_enabled = FALSE;
	simple_unlock(&sfi_lock);

	splx(s);

	return KERN_SUCCESS;
}

/* Defers SFI off and per-class on timers (if live) by the specified interval
 * in Mach Absolute Time Units. Currently invoked to align with the global
 * forced idle mechanism. Making some simplifying assumptions, the iterative GFI
 * induced SFI on+off deferrals form a geometric series that converges to yield
 * an effective SFI duty cycle that is scaled by the GFI duty cycle. Initial phase
 * alignment and congruency of the SFI/GFI periods can distort this to some extent.
 */
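
/* Worked example of the convergence claim above (illustrative, under the
 * same simplifying assumptions): let W be the SFI window, O the class
 * off-time, and g the GFI duty cycle. If each GFI idle pulse defers the SFI
 * timers, and the deferrals themselves get deferred, the total deferral per
 * window is the geometric series D = gW + g(gW) + g(g(gW)) + ... =
 * W * g/(1 - g). The effective window therefore stretches to W/(1 - g), and
 * the effective SFI duty cycle becomes (O/W) * (1 - g).
 */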

kern_return_t
sfi_defer(uint64_t sfi_defer_matus)
{
	spl_t s;
	kern_return_t kr = KERN_FAILURE;

	s = splsched();

	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_GLOBAL_DEFER), sfi_defer_matus, 0, 0, 0, 0);

	simple_lock(&sfi_lock, LCK_GRP_NULL);
	if (!sfi_is_enabled) {
		goto sfi_defer_done;
	}

	assert(sfi_next_off_deadline != 0);

	sfi_next_off_deadline += sfi_defer_matus;
	timer_call_enter1(&sfi_timer_call_entry, NULL, sfi_next_off_deadline, TIMER_CALL_SYS_CRITICAL);

	int i;
	for (i = 0; i < MAX_SFI_CLASS_ID; i++) {
		if (sfi_classes[i].class_sfi_is_enabled) {
			if (sfi_classes[i].on_timer_programmed) {
				uint64_t new_on_deadline = sfi_classes[i].on_timer_deadline + sfi_defer_matus;
				sfi_classes[i].on_timer_deadline = new_on_deadline;
				timer_call_enter1(&sfi_classes[i].on_timer, NULL, new_on_deadline, TIMER_CALL_SYS_CRITICAL);
			}
		}
	}

	kr = KERN_SUCCESS;
sfi_defer_done:
	simple_unlock(&sfi_lock);

	splx(s);

	return kr;
}

kern_return_t
sfi_get_window(uint64_t *window_usecs)
{
	spl_t s;
	uint64_t off_window_us;

	s = splsched();
	simple_lock(&sfi_lock, LCK_GRP_NULL);

	off_window_us = sfi_window_usecs;

	simple_unlock(&sfi_lock);
	splx(s);

	*window_usecs = off_window_us;

	return KERN_SUCCESS;
}

kern_return_t
sfi_set_class_offtime(sfi_class_id_t class_id, uint64_t offtime_usecs)
{
	uint64_t interval;
	spl_t s;
	uint64_t off_window_interval;

	if (offtime_usecs < MIN_SFI_WINDOW_USEC) {
		offtime_usecs = MIN_SFI_WINDOW_USEC;
	}

	if (class_id == SFI_CLASS_UNSPECIFIED || class_id >= MAX_SFI_CLASS_ID) {
		return KERN_INVALID_ARGUMENT;
	}

	if (offtime_usecs > UINT32_MAX) {
		return KERN_INVALID_ARGUMENT;
	}

	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_SET_CLASS_OFFTIME), offtime_usecs, class_id, 0, 0, 0);

	clock_interval_to_absolutetime_interval((uint32_t)offtime_usecs, NSEC_PER_USEC, &interval);

	s = splsched();

	simple_lock(&sfi_lock, LCK_GRP_NULL);
	off_window_interval = sfi_window_interval;

	/* Check that we are not bringing in class off-time larger than the SFI window */
	if (off_window_interval && (interval >= off_window_interval)) {
		simple_unlock(&sfi_lock);
		splx(s);
		return KERN_INVALID_ARGUMENT;
	}

	/* We never re-program the per-class on-timer, but rather just let it expire naturally */
	if (!sfi_classes[class_id].class_sfi_is_enabled) {
		os_atomic_inc(&sfi_enabled_class_count, relaxed);
	}
	sfi_classes[class_id].off_time_usecs = offtime_usecs;
	sfi_classes[class_id].off_time_interval = interval;
	sfi_classes[class_id].class_sfi_is_enabled = TRUE;

	if (sfi_window_is_set && !sfi_is_enabled) {
		/* start global off timer */
		sfi_is_enabled = TRUE;
		sfi_next_off_deadline = mach_absolute_time() + sfi_window_interval;
		timer_call_enter1(&sfi_timer_call_entry,
		    NULL,
		    sfi_next_off_deadline,
		    TIMER_CALL_SYS_CRITICAL);
	}

	simple_unlock(&sfi_lock);

	splx(s);

	return KERN_SUCCESS;
}
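
/*
 * Typical configuration sequence (illustrative; in practice this is driven
 * by thermald through the host SFI interface): sfi_set_window(10000)
 * followed by sfi_set_class_offtime(SFI_CLASS_DARWIN_BG, 1000) yields a
 * 10ms window in which DARWIN_BG threads are forced idle for the first 1ms.
 * Either call may come first; the global off-timer starts once both a
 * window and at least one class off-time have been set.
 */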

kern_return_t
sfi_class_offtime_cancel(sfi_class_id_t class_id)
{
	spl_t s;

	if (class_id == SFI_CLASS_UNSPECIFIED || class_id >= MAX_SFI_CLASS_ID) {
		return KERN_INVALID_ARGUMENT;
	}

	s = splsched();

	KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_CANCEL_CLASS_OFFTIME), class_id, 0, 0, 0, 0);

	simple_lock(&sfi_lock, LCK_GRP_NULL);

	/* We never re-program the per-class on-timer, but rather just let it expire naturally */
	if (sfi_classes[class_id].class_sfi_is_enabled) {
		os_atomic_dec(&sfi_enabled_class_count, relaxed);
	}
	sfi_classes[class_id].off_time_usecs = 0;
	sfi_classes[class_id].off_time_interval = 0;
	sfi_classes[class_id].class_sfi_is_enabled = FALSE;

	if (os_atomic_load(&sfi_enabled_class_count, relaxed) == 0) {
		sfi_is_enabled = FALSE;
	}

	simple_unlock(&sfi_lock);

	splx(s);

	return KERN_SUCCESS;
}

kern_return_t
sfi_get_class_offtime(sfi_class_id_t class_id, uint64_t *offtime_usecs)
{
	uint64_t off_time_us;
	spl_t s;

	if (class_id == SFI_CLASS_UNSPECIFIED || class_id >= MAX_SFI_CLASS_ID) {
		return 0;
	}

	s = splsched();

	simple_lock(&sfi_lock, LCK_GRP_NULL);
	off_time_us = sfi_classes[class_id].off_time_usecs;
	simple_unlock(&sfi_lock);

	splx(s);

	*offtime_usecs = off_time_us;

	return KERN_SUCCESS;
}

/*
 * sfi_thread_classify and sfi_processor_active_thread_classify perform the critical
 * role of quickly categorizing a thread into its SFI class so that an AST_SFI can be
 * set. As the thread is unwinding to userspace, sfi_ast() performs full locking
 * and determines whether the thread should enter an SFI wait state. Because of
 * the inherent races between the time the AST is set and when it is evaluated,
 * thread classification can be inaccurate (but should always be safe). This is
 * especially the case for sfi_processor_active_thread_classify, which must
 * classify the active thread on a remote processor without taking the thread lock.
 * When in doubt, classification should err on the side of *not* classifying a
 * thread at all, and wait for the thread itself to either hit a quantum expiration
 * or block inside the kernel.
 */
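
/*
 * Worked example of the classification flow (illustrative): a
 * THREAD_QOS_LEGACY thread in a task with TASK_POLICY_SFI_MANAGED set and a
 * TASK_FOREGROUND_APPLICATION role falls through the QoS and latency checks
 * below and lands in SFI_CLASS_MANAGED_FOCAL; the same thread in a
 * non-managed task would classify as SFI_CLASS_LEGACY_FOCAL instead.
 */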

/*
 * Thread must be locked. Ultimately, the real decision to enter
 * SFI wait happens at the AST boundary.
 */
sfi_class_id_t
sfi_thread_classify(thread_t thread)
{
	task_t task = thread->task;
	boolean_t is_kernel_thread = (task == kernel_task);
	sched_mode_t thmode = thread->sched_mode;
	boolean_t focal = FALSE;

	/* kernel threads never reach the user AST boundary, and are in a separate world for SFI */
	if (is_kernel_thread) {
		return SFI_CLASS_KERNEL;
	}

	/* no need to re-classify threads unless there is at least one enabled SFI class */
	if (os_atomic_load(&sfi_enabled_class_count, relaxed) == 0) {
		return SFI_CLASS_OPTED_OUT;
	}

	int task_role = proc_get_effective_task_policy(task, TASK_POLICY_ROLE);
	int latency_qos = proc_get_effective_task_policy(task, TASK_POLICY_LATENCY_QOS);
	int managed_task = proc_get_effective_task_policy(task, TASK_POLICY_SFI_MANAGED);

	int thread_qos = proc_get_effective_thread_policy(thread, TASK_POLICY_QOS);
	int thread_bg = proc_get_effective_thread_policy(thread, TASK_POLICY_DARWIN_BG);

	if (thread_qos == THREAD_QOS_MAINTENANCE) {
		return SFI_CLASS_MAINTENANCE;
	}

	if (thread_bg || thread_qos == THREAD_QOS_BACKGROUND) {
		return SFI_CLASS_DARWIN_BG;
	}

	if (latency_qos != 0) {
		int latency_qos_wtf = latency_qos - 1;

		if ((latency_qos_wtf >= 4) && (latency_qos_wtf <= 5)) {
			return SFI_CLASS_APP_NAP;
		}
	}

	/*
	 * Realtime and fixed priority threads express their duty cycle constraints
	 * via other mechanisms, and are opted out of (most) forms of SFI
	 */
	if (thmode == TH_MODE_REALTIME || thmode == TH_MODE_FIXED || task_role == TASK_GRAPHICS_SERVER) {
		return SFI_CLASS_OPTED_OUT;
	}

	/*
	 * Threads with unspecified, legacy, or user-initiated QOS class can be individually managed.
	 */
	switch (task_role) {
	case TASK_CONTROL_APPLICATION:
	case TASK_FOREGROUND_APPLICATION:
		focal = TRUE;
		break;
	case TASK_BACKGROUND_APPLICATION:
	case TASK_DEFAULT_APPLICATION:
	case TASK_UNSPECIFIED:
		/* Focal if the task is in a coalition with a FG/focal app */
		if (task_coalition_focal_count(thread->task) > 0) {
			focal = TRUE;
		}
		break;
	case TASK_THROTTLE_APPLICATION:
	case TASK_DARWINBG_APPLICATION:
	case TASK_NONUI_APPLICATION:
		/* Definitely not focal */
	default:
		break;
	}

	if (managed_task) {
		switch (thread_qos) {
		case THREAD_QOS_UNSPECIFIED:
		case THREAD_QOS_LEGACY:
		case THREAD_QOS_USER_INITIATED:
			if (focal) {
				return SFI_CLASS_MANAGED_FOCAL;
			} else {
				return SFI_CLASS_MANAGED_NONFOCAL;
			}
		default:
			break;
		}
	}

	if (thread_qos == THREAD_QOS_UTILITY) {
		return SFI_CLASS_UTILITY;
	}

	/*
	 * Classify threads in non-managed tasks
	 */
	if (focal) {
		switch (thread_qos) {
		case THREAD_QOS_USER_INTERACTIVE:
			return SFI_CLASS_USER_INTERACTIVE_FOCAL;
		case THREAD_QOS_USER_INITIATED:
			return SFI_CLASS_USER_INITIATED_FOCAL;
		case THREAD_QOS_LEGACY:
			return SFI_CLASS_LEGACY_FOCAL;
		default:
			return SFI_CLASS_DEFAULT_FOCAL;
		}
	} else {
		switch (thread_qos) {
		case THREAD_QOS_USER_INTERACTIVE:
			return SFI_CLASS_USER_INTERACTIVE_NONFOCAL;
		case THREAD_QOS_USER_INITIATED:
			return SFI_CLASS_USER_INITIATED_NONFOCAL;
		case THREAD_QOS_LEGACY:
			return SFI_CLASS_LEGACY_NONFOCAL;
		default:
			return SFI_CLASS_DEFAULT_NONFOCAL;
		}
	}
}

/*
 * pset must be locked.
 */
sfi_class_id_t
sfi_processor_active_thread_classify(processor_t processor)
{
	return processor->current_sfi_class;
}

/*
 * thread must be locked. This is inherently racy, with the intent that
 * at the AST boundary, it will be fully evaluated whether we need to
 * perform an AST wait
 */
ast_t
sfi_thread_needs_ast(thread_t thread, sfi_class_id_t *out_class)
{
	sfi_class_id_t class_id;

	class_id = sfi_thread_classify(thread);

	if (out_class) {
		*out_class = class_id;
	}

	/* No lock taken, so a stale value may be used. */
	if (!sfi_classes[class_id].class_in_on_phase) {
		return AST_SFI;
	} else {
		return AST_NONE;
	}
}

/*
 * pset must be locked. We take the SFI class for
 * the currently running thread which is cached on
 * the processor_t, and assume it is accurate. In the
 * worst case, the processor will get an IPI and be asked
 * to evaluate if the current running thread at that
 * later point in time should be in an SFI wait.
 */
ast_t
sfi_processor_needs_ast(processor_t processor)
{
	sfi_class_id_t class_id;

	class_id = sfi_processor_active_thread_classify(processor);

	/* No lock taken, so a stale value may be used. */
	if (!sfi_classes[class_id].class_in_on_phase) {
		return AST_SFI;
	} else {
		return AST_NONE;
	}
}

static inline void
_sfi_wait_cleanup(void)
{
	thread_t self = current_thread();

	spl_t s = splsched();
	simple_lock(&sfi_lock, LCK_GRP_NULL);

	sfi_class_id_t current_sfi_wait_class = self->sfi_wait_class;

	assert((SFI_CLASS_UNSPECIFIED < current_sfi_wait_class) &&
	    (current_sfi_wait_class < MAX_SFI_CLASS_ID));

	self->sfi_wait_class = SFI_CLASS_UNSPECIFIED;

	simple_unlock(&sfi_lock);
	splx(s);

	/*
	 * It's possible for the thread to be woken up due to the SFI period
	 * ending *before* it finishes blocking. In that case,
	 * wait_sfi_begin_time won't be set.
	 *
	 * Derive the time sacrificed to SFI by looking at when this thread was
	 * awoken by the on-timer, to avoid counting the time this thread spent
	 * waiting to get scheduled.
	 *
	 * Note that last_made_runnable_time could be reset if this thread
	 * gets preempted before we read the value. To fix that, we'd need to
	 * track wait time in a thread timer, sample the timer before blocking,
	 * pass the value through thread->parameter, and subtract that.
	 */
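
	/*
	 * Concrete timeline (illustrative): the thread blocks at t0 and
	 * records wait_sfi_begin_time = t0; the class on-timer wakes it at
	 * t1, setting last_made_runnable_time = t1; it actually runs at some
	 * t2 >= t1. The ledger below is credited t1 - t0, charging only the
	 * forced idle time, not the t2 - t1 spent waiting for a processor.
	 */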

	if (self->wait_sfi_begin_time != 0) {
		uint64_t made_runnable = os_atomic_load(&self->last_made_runnable_time, relaxed);
		int64_t sfi_wait_time = made_runnable - self->wait_sfi_begin_time;
		assert(sfi_wait_time >= 0);

		ledger_credit(self->task->ledger, task_ledgers.sfi_wait_times[current_sfi_wait_class],
		    sfi_wait_time);

		self->wait_sfi_begin_time = 0;
	}
}

/*
 * Called at AST context to fully evaluate if the current thread
 * (which is obviously running) should instead block in an SFI wait.
 * We must take the sfi_lock to check whether we are in the "off" period
 * for the class, and if so, block.
 */
void
sfi_ast(thread_t thread)
{
	sfi_class_id_t class_id;
	spl_t s;
	struct sfi_class_state *sfi_class;
	wait_result_t waitret;
	boolean_t did_wait = FALSE;
	thread_continue_t continuation;

	s = splsched();

	simple_lock(&sfi_lock, LCK_GRP_NULL);

	if (!sfi_is_enabled) {
		/*
		 * SFI is not enabled, or has recently been disabled.
		 * There is no point putting this thread on a deferred ready
		 * queue, even if it were classified as needing it, since
		 * SFI will truly be off at the next global off timer
		 */
		simple_unlock(&sfi_lock);
		splx(s);

		return;
	}

	thread_lock(thread);
	thread->sfi_class = class_id = sfi_thread_classify(thread);
	thread_unlock(thread);

	/*
	 * Once the sfi_lock is taken and the thread's ->sfi_class field is updated, we
	 * are committed to transitioning to whatever state is indicated by "->class_in_on_phase".
	 * If another thread tries to call sfi_reevaluate() after this point, it will take the
	 * sfi_lock and see the thread in this wait state. If another thread calls
	 * sfi_reevaluate() before this point, it would see a runnable thread and at most
	 * attempt to send an AST to this processor, but we would have the most accurate
	 * classification.
	 */

	sfi_class = &sfi_classes[class_id];
	if (!sfi_class->class_in_on_phase) {
		/* Need to block thread in wait queue */
		KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_THREAD_DEFER),
		    thread_tid(thread), class_id, 0, 0, 0);

		waitret = waitq_assert_wait64(&sfi_class->waitq,
		    CAST_EVENT64_T(class_id),
		    THREAD_INTERRUPTIBLE | THREAD_WAIT_NOREPORT, 0);
		if (waitret == THREAD_WAITING) {
			thread->sfi_wait_class = class_id;
			did_wait = TRUE;
			continuation = sfi_class->continuation;
		} else {
			/* thread may be exiting already, all other errors are unexpected */
			assert(waitret == THREAD_INTERRUPTED);
		}
	}
	simple_unlock(&sfi_lock);

	splx(s);

	if (did_wait) {
		assert(thread->wait_sfi_begin_time == 0);

		thread_block_reason(continuation, NULL, AST_SFI);
	}
}

/* Thread must be unlocked */
void
sfi_reevaluate(thread_t thread)
{
	kern_return_t kret;
	spl_t s;
	sfi_class_id_t class_id, current_class_id;
	ast_t sfi_ast;

	s = splsched();

	simple_lock(&sfi_lock, LCK_GRP_NULL);

	thread_lock(thread);
	sfi_ast = sfi_thread_needs_ast(thread, &class_id);
	thread->sfi_class = class_id;

	/*
	 * This routine chiefly exists to boost threads out of an SFI wait
	 * if their classification changes before the "on" timer fires.
	 *
	 * If we calculate that a thread is in a different ->sfi_wait_class
	 * than we think it should be (including no-SFI-wait), we need to
	 * correct that:
	 *
	 * If the thread is in SFI wait and should not be (or should be waiting
	 * on a different class' "on" timer), we wake it up. If needed, the
	 * thread may immediately block again in the different SFI wait state.
	 *
	 * If the thread is not in an SFI wait state and it should be, we need
	 * to get that thread's attention, possibly by sending an AST to another
	 * processor.
	 */
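
	/*
	 * Summary of the cases handled below (restating the comment above):
	 *
	 *   in SFI wait, should not be   -> wake the thread
	 *   in SFI wait, wrong class     -> wake; it may re-block in sfi_ast()
	 *   running and needs AST_SFI    -> ast_on() locally, or IPI its processor
	 *   runnable but off-CPU         -> nothing; evaluated at next context switch
	 */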

	if ((current_class_id = thread->sfi_wait_class) != SFI_CLASS_UNSPECIFIED) {
		thread_unlock(thread); /* not needed anymore */

		assert(current_class_id < MAX_SFI_CLASS_ID);

		if ((sfi_ast == AST_NONE) || (class_id != current_class_id)) {
			struct sfi_class_state *sfi_class = &sfi_classes[current_class_id];

			KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_WAIT_CANCELED), thread_tid(thread), current_class_id, class_id, 0, 0);

			kret = waitq_wakeup64_thread(&sfi_class->waitq,
			    CAST_EVENT64_T(current_class_id),
			    thread,
			    THREAD_AWAKENED);
			assert(kret == KERN_SUCCESS || kret == KERN_NOT_WAITING);
		}
	} else {
		/*
		 * Thread's current SFI wait class is not set, and because we
		 * have the sfi_lock, it won't get set.
		 */

		if ((thread->state & (TH_RUN | TH_IDLE)) == TH_RUN) {
			if (sfi_ast != AST_NONE) {
				if (thread == current_thread()) {
					ast_on(sfi_ast);
				} else {
					processor_t processor = thread->last_processor;

					if (processor != PROCESSOR_NULL &&
					    processor->state == PROCESSOR_RUNNING &&
					    processor->active_thread == thread) {
						cause_ast_check(processor);
					} else {
						/*
						 * Runnable thread that's not on a CPU currently. When a processor
						 * does context switch to it, the AST will get set based on whether
						 * the thread is in its "off time".
						 */
					}
				}
			}
		}

		thread_unlock(thread);
	}

	simple_unlock(&sfi_lock);
	splx(s);
}

#else /* !CONFIG_SCHED_SFI */

kern_return_t
sfi_set_window(uint64_t window_usecs __unused)
{
	return KERN_NOT_SUPPORTED;
}

kern_return_t
sfi_window_cancel(void)
{
	return KERN_NOT_SUPPORTED;
}

kern_return_t
sfi_get_window(uint64_t *window_usecs __unused)
{
	return KERN_NOT_SUPPORTED;
}

kern_return_t
sfi_set_class_offtime(sfi_class_id_t class_id __unused, uint64_t offtime_usecs __unused)
{
	return KERN_NOT_SUPPORTED;
}

kern_return_t
sfi_class_offtime_cancel(sfi_class_id_t class_id __unused)
{
	return KERN_NOT_SUPPORTED;
}

kern_return_t
sfi_get_class_offtime(sfi_class_id_t class_id __unused, uint64_t *offtime_usecs __unused)
{
	return KERN_NOT_SUPPORTED;
}

void
sfi_reevaluate(thread_t thread __unused)
{
	return;
}

sfi_class_id_t
sfi_thread_classify(thread_t thread)
{
	task_t task = thread->task;
	boolean_t is_kernel_thread = (task == kernel_task);

	if (is_kernel_thread) {
		return SFI_CLASS_KERNEL;
	}

	return SFI_CLASS_OPTED_OUT;
}

#endif /* !CONFIG_SCHED_SFI */