sched_amp_common.c
/*
 * Copyright (c) 2019 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <mach/mach_types.h>
#include <mach/machine.h>
#include <machine/machine_routines.h>
#include <machine/sched_param.h>
#include <machine/machine_cpu.h>
#include <kern/kern_types.h>
#include <kern/debug.h>
#include <kern/machine.h>
#include <kern/misc_protos.h>
#include <kern/processor.h>
#include <kern/queue.h>
#include <kern/sched.h>
#include <kern/sched_prim.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <machine/atomic.h>
#include <sys/kdebug.h>
#include <kern/sched_amp_common.h>
#include <stdatomic.h>

#if __AMP__

/* Exported globals */
processor_set_t ecore_set = NULL;
processor_set_t pcore_set = NULL;

static struct processor_set pset1;
static struct pset_node pset_node1;

#if DEVELOPMENT || DEBUG
bool system_ecore_only = false;
#endif /* DEVELOPMENT || DEBUG */

/*
 * sched_amp_init()
 *
 * Initialize the pcore_set and ecore_set globals which describe the
 * P/E processor sets.
 */
void
sched_amp_init(void)
{
    pset_init(&pset1, &pset_node1);
    pset_node1.psets = &pset1;
    pset_node0.node_list = &pset_node1;

    if (ml_get_boot_cluster() == CLUSTER_TYPE_P) {
        pcore_set = &pset0;
        ecore_set = &pset1;
    } else {
        ecore_set = &pset0;
        pcore_set = &pset1;
    }

    ecore_set->pset_cluster_type = PSET_AMP_E;
    ecore_set->pset_cluster_id = 0;

    pcore_set->pset_cluster_type = PSET_AMP_P;
    pcore_set->pset_cluster_id = 1;

#if DEVELOPMENT || DEBUG
    if (PE_parse_boot_argn("enable_skstsct", NULL, 0)) {
        system_ecore_only = true;
    }
#endif /* DEVELOPMENT || DEBUG */

    sched_timeshare_init();
}

/* Spill threshold load average is ncpus in pset + (sched_amp_spill_count / (1 << PSET_LOAD_FRACTIONAL_SHIFT)) */
int sched_amp_spill_count = 3;
int sched_amp_idle_steal = 1;
int sched_amp_spill_steal = 1;

/*
 * We see performance gains from doing immediate IPIs to P-cores to run
 * P-eligible threads, and fewer P-to-E migrations from using deferred IPIs
 * for spill.
 */
int sched_amp_spill_deferred_ipi = 1;
int sched_amp_pcores_preempt_immediate_ipi = 1;

/*
 * sched_perfcontrol_inherit_recommendation_from_tg changes AMP
 * scheduling policy away from the default and allows the policy to be
 * modified at run-time.
 *
 * Once modified from the default, the policy toggles between "follow
 * thread group" and "restrict to E".
 */

_Atomic sched_perfctl_class_policy_t sched_perfctl_policy_util = SCHED_PERFCTL_POLICY_DEFAULT;
_Atomic sched_perfctl_class_policy_t sched_perfctl_policy_bg = SCHED_PERFCTL_POLICY_DEFAULT;

/*
 * sched_amp_spill_threshold()
 *
 * Routine to calculate the spill threshold, which decides if a cluster should spill.
 */
int
sched_amp_spill_threshold(processor_set_t pset)
{
    int recommended_processor_count = bit_count(pset->recommended_bitmask & pset->cpu_bitmask);

    return (recommended_processor_count << PSET_LOAD_FRACTIONAL_SHIFT) + sched_amp_spill_count;
}
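/*
 * As the spill threshold comment above implies, sched_get_pset_load_average()
 * reports load in the same fixed-point format (threads << PSET_LOAD_FRACTIONAL_SHIFT),
 * so the comparison against this threshold in should_spill_to_ecores() asks
 * whether the cluster's runnable load exceeds its recommended CPU count by a
 * small margin of sched_amp_spill_count fractional units.
 */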

/*
 * pset_signal_spill()
 *
 * Routine to signal a running/idle CPU to cause a spill onto that CPU.
 * Called with pset locked, returns unlocked
 */
void
pset_signal_spill(processor_set_t pset, int spilled_thread_priority)
{
    processor_t processor;
    sched_ipi_type_t ipi_type = SCHED_IPI_NONE;

    uint64_t idle_map = pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_IDLE];
    for (int cpuid = lsb_first(idle_map); cpuid >= 0; cpuid = lsb_next(idle_map, cpuid)) {
        processor = processor_array[cpuid];
        if (bit_set_if_clear(&pset->pending_spill_cpu_mask, processor->cpu_id)) {
            KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_SIGNAL_SPILL) | DBG_FUNC_NONE, processor->cpu_id, 0, 0, 0);

            processor->deadline = UINT64_MAX;
            pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);

            if (processor == current_processor()) {
                bit_set(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id);
            } else {
                ipi_type = sched_ipi_action(processor, NULL, true, SCHED_IPI_EVENT_SPILL);
            }
            pset_unlock(pset);
            sched_ipi_perform(processor, ipi_type);
            return;
        }
    }

    processor_t ast_processor = NULL;
    uint64_t running_map = pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_RUNNING];
    for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
        processor = processor_array[cpuid];
        if (processor->current_recommended_pset_type == PSET_AMP_P) {
            /* Already running a spilled P-core recommended thread */
            continue;
        }
        if (bit_test(pset->pending_spill_cpu_mask, processor->cpu_id)) {
            /* Already received a spill signal */
            continue;
        }
        if (processor->current_pri >= spilled_thread_priority) {
            /* Already running a higher or equal priority thread */
            continue;
        }

        /* Found a suitable processor */
        bit_set(pset->pending_spill_cpu_mask, processor->cpu_id);
        KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_SIGNAL_SPILL) | DBG_FUNC_NONE, processor->cpu_id, 1, 0, 0);
        if (processor == current_processor()) {
            ast_on(AST_PREEMPT);
        }
        ipi_type = sched_ipi_action(processor, NULL, false, SCHED_IPI_EVENT_SPILL);
        if (ipi_type != SCHED_IPI_NONE) {
            ast_processor = processor;
        }
        break;
    }

    pset_unlock(pset);
    sched_ipi_perform(ast_processor, ipi_type);
}

/*
 * pset_should_accept_spilled_thread()
 *
 * Routine to decide if pset should accept spilled threads.
 * This function must be safe to call (to use as a hint) without holding the pset lock.
 */
bool
pset_should_accept_spilled_thread(processor_set_t pset, int spilled_thread_priority)
{
    if ((pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_IDLE]) != 0) {
        return true;
    }

    uint64_t cpu_map = (pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_RUNNING]);

    for (int cpuid = lsb_first(cpu_map); cpuid >= 0; cpuid = lsb_next(cpu_map, cpuid)) {
        processor_t processor = processor_array[cpuid];

        if (processor->current_recommended_pset_type == PSET_AMP_P) {
            /* This processor is already running a spilled thread */
            continue;
        }

        if (processor->current_pri < spilled_thread_priority) {
            return true;
        }
    }

    return false;
}
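/*
 * The spill policy below sends a P-recommended thread to the E-cluster only
 * when the E-cores are recommended, the thread is neither P-core-bound nor
 * realtime, no recommended P-core is idle, the P-cluster's load is at or
 * above the spill threshold, and an E-core is idle or running a
 * lower-priority thread.
 */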
/*
 * should_spill_to_ecores()
 *
 * Spill policy is implemented here
 */
bool
should_spill_to_ecores(processor_set_t nset, thread_t thread)
{
    if (nset->pset_cluster_type == PSET_AMP_E) {
        /* Not relevant if ecores already preferred */
        return false;
    }

    if (!pset_is_recommended(ecore_set)) {
        /* E cores must be recommended */
        return false;
    }

    if (thread->sched_flags & TH_SFLAG_PCORE_ONLY) {
        return false;
    }

    if (thread->sched_pri >= BASEPRI_RTQUEUES) {
        /* Never spill realtime threads */
        return false;
    }

    if ((nset->recommended_bitmask & nset->cpu_state_map[PROCESSOR_IDLE]) != 0) {
        /* Don't spill if the P-cluster still has idle cores */
        return false;
    }

    if ((sched_get_pset_load_average(nset, 0) >= sched_amp_spill_threshold(nset)) &&   /* There is already a load on P cores */
        pset_should_accept_spilled_thread(ecore_set, thread->sched_pri)) {             /* E-cores are idle or running lower-priority threads */
        return true;
    }

    return false;
}

/*
 * sched_amp_check_spill()
 *
 * Routine to check if the thread should be spilled and signal the pset if needed.
 */
void
sched_amp_check_spill(processor_set_t pset, thread_t thread)
{
    /* pset is unlocked */

    /* Bound threads don't call this function */
    assert(thread->bound_processor == PROCESSOR_NULL);

    if (should_spill_to_ecores(pset, thread)) {
        pset_lock(ecore_set);

        pset_signal_spill(ecore_set, thread->sched_pri);
        /* returns with ecore_set unlocked */
    }
}
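/*
 * Steal thresholds mirror the spill threshold above, but with the smaller
 * sched_amp_idle_steal / sched_amp_spill_steal margins. Stealing is only
 * enabled for the E-cluster, and only while P-cores are online (see
 * sched_amp_steal_thread_enabled() below).
 */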
/*
 * sched_amp_steal_threshold()
 *
 * Routine to calculate the steal threshold
 */
int
sched_amp_steal_threshold(processor_set_t pset, bool spill_pending)
{
    int recommended_processor_count = bit_count(pset->recommended_bitmask & pset->cpu_bitmask);

    return (recommended_processor_count << PSET_LOAD_FRACTIONAL_SHIFT) + (spill_pending ? sched_amp_spill_steal : sched_amp_idle_steal);
}

/*
 * sched_amp_steal_thread_enabled()
 */
bool
sched_amp_steal_thread_enabled(processor_set_t pset)
{
    return (pset->pset_cluster_type == PSET_AMP_E) && (pcore_set->online_processor_count > 0);
}

/*
 * sched_amp_balance()
 *
 * Invoked with pset locked, returns with pset unlocked
 */
void
sched_amp_balance(processor_t cprocessor, processor_set_t cpset)
{
    assert(cprocessor == current_processor());

    pset_unlock(cpset);

    if (cpset->pset_cluster_type == PSET_AMP_E || !cprocessor->is_recommended) {
        return;
    }

    /*
     * cprocessor is an idle, recommended P core processor.
     * Look for P-eligible threads that have spilled to an E core
     * and coax them to come back.
     */
    processor_set_t pset = ecore_set;

    pset_lock(pset);

    processor_t eprocessor;
    uint64_t ast_processor_map = 0;

    sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};
    uint64_t running_map = pset->cpu_state_map[PROCESSOR_RUNNING];
    for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
        eprocessor = processor_array[cpuid];
        if ((eprocessor->current_pri < BASEPRI_RTQUEUES) &&
            (eprocessor->current_recommended_pset_type == PSET_AMP_P)) {
            ipi_type[eprocessor->cpu_id] = sched_ipi_action(eprocessor, NULL, false, SCHED_IPI_EVENT_REBALANCE);
            if (ipi_type[eprocessor->cpu_id] != SCHED_IPI_NONE) {
                bit_set(ast_processor_map, eprocessor->cpu_id);
                assert(eprocessor != cprocessor);
            }
        }
    }

    pset_unlock(pset);

    for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
        processor_t ast_processor = processor_array[cpuid];
        sched_ipi_perform(ast_processor, ipi_type[cpuid]);
    }
}

/*
 * Helper function for sched_amp_thread_group_recommendation_change()
 * Find all the cores in the pset running threads from the thread_group tg
 * and send them a rebalance interrupt.
 */
void
sched_amp_bounce_thread_group_from_ecores(processor_set_t pset, struct thread_group *tg)
{
    assert(pset->pset_cluster_type == PSET_AMP_E);
    uint64_t ast_processor_map = 0;
    sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};

    spl_t s = splsched();
    pset_lock(pset);

    uint64_t running_map = pset->cpu_state_map[PROCESSOR_RUNNING];
    for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
        processor_t eprocessor = processor_array[cpuid];
        if (eprocessor->current_thread_group == tg) {
            ipi_type[eprocessor->cpu_id] = sched_ipi_action(eprocessor, NULL, false, SCHED_IPI_EVENT_REBALANCE);
            if (ipi_type[eprocessor->cpu_id] != SCHED_IPI_NONE) {
                bit_set(ast_processor_map, eprocessor->cpu_id);
            } else if (eprocessor == current_processor()) {
                ast_on(AST_PREEMPT);
                bit_set(pset->pending_AST_PREEMPT_cpu_mask, eprocessor->cpu_id);
            }
        }
    }

    KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_RECOMMENDATION_CHANGE) | DBG_FUNC_NONE, tg, ast_processor_map, 0, 0);

    pset_unlock(pset);

    for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
        processor_t ast_processor = processor_array[cpuid];
        sched_ipi_perform(ast_processor, ipi_type[cpuid]);
    }

    splx(s);
}

/*
 * sched_amp_ipi_policy()
 */
sched_ipi_type_t
sched_amp_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event)
{
    processor_set_t pset = dst->processor_set;
    assert(bit_test(pset->pending_AST_URGENT_cpu_mask, dst->cpu_id) == false);
    assert(dst != current_processor());

    boolean_t deferred_ipi_supported = false;
#if defined(CONFIG_SCHED_DEFERRED_AST)
    deferred_ipi_supported = true;
#endif /* CONFIG_SCHED_DEFERRED_AST */

    switch (event) {
    case SCHED_IPI_EVENT_SPILL:
        /* For Spill event, use deferred IPIs if sched_amp_spill_deferred_ipi is set */
        if (deferred_ipi_supported && sched_amp_spill_deferred_ipi) {
            return sched_ipi_deferred_policy(pset, dst, event);
        }
        break;
    case SCHED_IPI_EVENT_PREEMPT:
        /*
         * For preemption, the default policy is to use deferred IPIs
         * for non-RT P-core preemption. Override that behavior if
         * sched_amp_pcores_preempt_immediate_ipi is set.
         */
        if (thread && thread->sched_pri < BASEPRI_RTQUEUES) {
            if (sched_amp_pcores_preempt_immediate_ipi && (pset == pcore_set)) {
                return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
            }
        }
        break;
    default:
        break;
    }
    /* Default back to the global policy for all other scenarios */
    return sched_ipi_policy(dst, thread, dst_idle, event);
}

/*
 * sched_amp_qos_max_parallelism()
 */
uint32_t
sched_amp_qos_max_parallelism(int qos, uint64_t options)
{
    uint32_t ecount = ecore_set->cpu_set_count;
    uint32_t pcount = pcore_set->cpu_set_count;

    if (options & QOS_PARALLELISM_REALTIME) {
        /*
         * For realtime threads on AMP, we want to limit the width to
         * just the P-cores since we do not spill/rebalance RT threads.
         */
        return pcount;
    }

    /*
     * The default AMP scheduler policy is to run utility and bg threads
     * on E-cores only. Run-time policy adjustment unlocks the ability of
     * utility and bg threads to be scheduled based on run-time conditions.
     */
    switch (qos) {
    case THREAD_QOS_UTILITY:
        return (os_atomic_load(&sched_perfctl_policy_util, relaxed) == SCHED_PERFCTL_POLICY_DEFAULT) ? ecount : (ecount + pcount);
    case THREAD_QOS_BACKGROUND:
    case THREAD_QOS_MAINTENANCE:
        return (os_atomic_load(&sched_perfctl_policy_bg, relaxed) == SCHED_PERFCTL_POLICY_DEFAULT) ? ecount : (ecount + pcount);
    default:
        return ecount + pcount;
    }
}
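/*
 * sched_amp_choose_node()
 *
 * Pick the pset node for a thread based on its recommended pset type:
 * P-recommended threads go to the P-cluster's node, all others to the
 * E-cluster's node.
 */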
pset_node_t
sched_amp_choose_node(thread_t thread)
{
    if (recommended_pset_type(thread) == PSET_AMP_P) {
        return pcore_set->node;
    } else {
        return ecore_set->node;
    }
}

/*
 * sched_amp_rt_runq()
 */
rt_queue_t
sched_amp_rt_runq(processor_set_t pset)
{
    return &pset->rt_runq;
}

/*
 * sched_amp_rt_init()
 */
void
sched_amp_rt_init(processor_set_t pset)
{
    pset_rt_init(pset);
}

/*
 * sched_amp_rt_queue_shutdown()
 */
void
sched_amp_rt_queue_shutdown(processor_t processor)
{
    processor_set_t pset = processor->processor_set;
    thread_t thread;
    queue_head_t tqueue;

    pset_lock(pset);

    /* We only need to migrate threads if this is the last active or last recommended processor in the pset */
    if ((pset->online_processor_count > 0) && pset_is_recommended(pset)) {
        pset_unlock(pset);
        return;
    }

    queue_init(&tqueue);

    while (rt_runq_count(pset) > 0) {
        thread = qe_dequeue_head(&pset->rt_runq.queue, struct thread, runq_links);
        thread->runq = PROCESSOR_NULL;
        SCHED_STATS_RUNQ_CHANGE(&pset->rt_runq.runq_stats,
            os_atomic_load(&pset->rt_runq.count, relaxed));
        rt_runq_count_decr(pset);
        enqueue_tail(&tqueue, &thread->runq_links);
    }
    sched_update_pset_load_average(pset, 0);
    pset_unlock(pset);

    qe_foreach_element_safe(thread, &tqueue, runq_links) {
        remqueue(&thread->runq_links);

        thread_lock(thread);

        thread_setrun(thread, SCHED_TAILQ);

        thread_unlock(thread);
    }
}
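/*
 * The two routines below iterate over every pset node starting at pset_node0
 * and every pset on each node, so they cover both the E and P clusters that
 * sched_amp_init() chains together.
 */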
/*
 * sched_amp_rt_runq_scan()
 *
 * Assumes RT lock is not held, and acquires splsched/rt_lock itself
 */
void
sched_amp_rt_runq_scan(sched_update_scan_context_t scan_context)
{
    thread_t thread;

    pset_node_t node = &pset_node0;
    processor_set_t pset = node->psets;

    spl_t s = splsched();
    do {
        while (pset != NULL) {
            pset_lock(pset);

            qe_foreach_element_safe(thread, &pset->rt_runq.queue, runq_links) {
                if (thread->last_made_runnable_time < scan_context->earliest_rt_make_runnable_time) {
                    scan_context->earliest_rt_make_runnable_time = thread->last_made_runnable_time;
                }
            }

            pset_unlock(pset);

            pset = pset->pset_list;
        }
    } while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
    splx(s);
}

/*
 * sched_amp_rt_runq_count_sum()
 */
int64_t
sched_amp_rt_runq_count_sum(void)
{
    pset_node_t node = &pset_node0;
    processor_set_t pset = node->psets;
    int64_t count = 0;

    do {
        while (pset != NULL) {
            count += pset->rt_runq.runq_stats.count_sum;

            pset = pset->pset_list;
        }
    } while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));

    return count;
}

#endif /* __AMP__ */