/*
 * Copyright (c) 2019 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <mach/mach_types.h>
#include <mach/machine.h>
#include <machine/machine_routines.h>
#include <machine/sched_param.h>
#include <machine/machine_cpu.h>
#include <kern/kern_types.h>
#include <kern/debug.h>
#include <kern/machine.h>
#include <kern/misc_protos.h>
#include <kern/processor.h>
#include <kern/queue.h>
#include <kern/sched.h>
#include <kern/sched_prim.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <machine/atomic.h>
#include <sys/kdebug.h>
#include <kern/sched_amp_common.h>
#include <stdatomic.h>

#if __AMP__

/* Exported globals */
processor_set_t ecore_set = NULL;
processor_set_t pcore_set = NULL;

static struct processor_set pset1;
static struct pset_node pset_node1;

#if DEVELOPMENT || DEBUG
bool system_ecore_only = false;
#endif /* DEVELOPMENT || DEBUG */

/*
 * sched_amp_init()
 *
 * Initialize the pcore_set and ecore_set globals which describe the
 * P/E processor sets.
 */
void
sched_amp_init(void)
{
	pset_init(&pset1, &pset_node1);
	pset_node1.psets = &pset1;
	pset_node0.node_list = &pset_node1;

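	/*
	 * The statically initialized boot pset (pset0) always describes the
	 * boot cluster, so which of pset0/pset1 becomes the P set and which
	 * becomes the E set depends on the boot cluster's type.
	 */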
	if (ml_get_boot_cluster() == CLUSTER_TYPE_P) {
		pcore_set = &pset0;
		ecore_set = &pset1;
	} else {
		ecore_set = &pset0;
		pcore_set = &pset1;
	}

	ecore_set->pset_cluster_type = PSET_AMP_E;
	ecore_set->pset_cluster_id = 0;

	pcore_set->pset_cluster_type = PSET_AMP_P;
	pcore_set->pset_cluster_id = 1;

#if DEVELOPMENT || DEBUG
	if (PE_parse_boot_argn("enable_skstsct", NULL, 0)) {
		system_ecore_only = true;
	}
#endif /* DEVELOPMENT || DEBUG */

	sched_timeshare_init();
}

/* Spill threshold load average is ncpus in pset + (sched_amp_spill_count / (1 << PSET_LOAD_FRACTIONAL_SHIFT)) */
int sched_amp_spill_count = 3;
int sched_amp_idle_steal = 1;
int sched_amp_spill_steal = 1;

/*
 * We see performance gains from sending immediate IPIs to P-cores to run
 * P-eligible threads, and fewer P-to-E migrations from using deferred
 * IPIs for spill.
 */
int sched_amp_spill_deferred_ipi = 1;
int sched_amp_pcores_preempt_immediate_ipi = 1;

/*
 * sched_perfcontrol_inherit_recommendation_from_tg changes the AMP
 * scheduling policy away from the default and allows the policy to be
 * modified at run-time.
 *
 * Once modified from the default, the policy toggles between "follow
 * thread group" and "restrict to E cores".
 */

_Atomic sched_perfctl_class_policy_t sched_perfctl_policy_util = SCHED_PERFCTL_POLICY_DEFAULT;
_Atomic sched_perfctl_class_policy_t sched_perfctl_policy_bg = SCHED_PERFCTL_POLICY_DEFAULT;

/*
 * sched_amp_spill_threshold()
 *
 * Routine to calculate the spill threshold, which decides whether a cluster should spill.
 */
int
sched_amp_spill_threshold(processor_set_t pset)
{
	int recommended_processor_count = bit_count(pset->recommended_bitmask & pset->cpu_bitmask);

	return (recommended_processor_count << PSET_LOAD_FRACTIONAL_SHIFT) + sched_amp_spill_count;
}
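
/*
 * Worked example (illustrative only; assumes PSET_LOAD_FRACTIONAL_SHIFT
 * is 4, which may not match the actual definition): with 4 recommended
 * CPUs and the default sched_amp_spill_count of 3, the threshold is
 * (4 << 4) + 3 = 67. The pset load average is compared against this in
 * the same fixed-point encoding, so spill becomes eligible once the load
 * average exceeds 67/16 (roughly 4.2), i.e. a bit more load than one
 * runnable thread per recommended CPU.
 */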

/*
 * pset_signal_spill()
 *
 * Routine to signal a running/idle CPU to cause a spill onto that CPU.
 * Called with the pset locked; returns with the pset unlocked.
 */
void
pset_signal_spill(processor_set_t pset, int spilled_thread_priority)
{
	processor_t processor;
	sched_ipi_type_t ipi_type = SCHED_IPI_NONE;

	uint64_t idle_map = pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_IDLE];
	for (int cpuid = lsb_first(idle_map); cpuid >= 0; cpuid = lsb_next(idle_map, cpuid)) {
		processor = processor_array[cpuid];
		if (bit_set_if_clear(&pset->pending_spill_cpu_mask, processor->cpu_id)) {
			KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_SIGNAL_SPILL) | DBG_FUNC_NONE, processor->cpu_id, 0, 0, 0);

			processor->deadline = UINT64_MAX;
			pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);

			if (processor == current_processor()) {
				bit_set(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id);
			} else {
				ipi_type = sched_ipi_action(processor, NULL, true, SCHED_IPI_EVENT_SPILL);
			}
			pset_unlock(pset);
			sched_ipi_perform(processor, ipi_type);
			return;
		}
	}

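	/*
	 * No idle CPU accepted the spill. Fall back to a running,
	 * recommended CPU that is not already running a spilled thread,
	 * has not already been signalled, and is running a lower-priority
	 * thread than the one being spilled.
	 */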
	processor_t ast_processor = NULL;
	uint64_t running_map = pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_RUNNING];
	for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
		processor = processor_array[cpuid];
		if (processor->current_recommended_pset_type == PSET_AMP_P) {
			/* Already running a spilled P-core recommended thread */
			continue;
		}
		if (bit_test(pset->pending_spill_cpu_mask, processor->cpu_id)) {
			/* Already received a spill signal */
			continue;
		}
		if (processor->current_pri >= spilled_thread_priority) {
			/* Already running a higher or equal priority thread */
			continue;
		}

		/* Found a suitable processor */
		bit_set(pset->pending_spill_cpu_mask, processor->cpu_id);
		KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_SIGNAL_SPILL) | DBG_FUNC_NONE, processor->cpu_id, 1, 0, 0);
		if (processor == current_processor()) {
			ast_on(AST_PREEMPT);
		}
		ipi_type = sched_ipi_action(processor, NULL, false, SCHED_IPI_EVENT_SPILL);
		if (ipi_type != SCHED_IPI_NONE) {
			ast_processor = processor;
		}
		break;
	}

	pset_unlock(pset);
	sched_ipi_perform(ast_processor, ipi_type);
}

/*
 * pset_should_accept_spilled_thread()
 *
 * Routine to decide if pset should accept spilled threads.
 * This function must be safe to call (to use as a hint) without holding the pset lock.
 */
bool
pset_should_accept_spilled_thread(processor_set_t pset, int spilled_thread_priority)
{
	if ((pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_IDLE]) != 0) {
		return true;
	}

	uint64_t cpu_map = (pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_RUNNING]);

	for (int cpuid = lsb_first(cpu_map); cpuid >= 0; cpuid = lsb_next(cpu_map, cpuid)) {
		processor_t processor = processor_array[cpuid];

		if (processor->current_recommended_pset_type == PSET_AMP_P) {
			/* This processor is already running a spilled thread */
			continue;
		}

		if (processor->current_pri < spilled_thread_priority) {
			return true;
		}
	}

	return false;
}
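
/*
 * Note: because this is called without the pset lock, the state bitmaps
 * may be stale by the time a caller acts on the result; the worst case is
 * a spurious or missed spill, since pset_signal_spill() re-evaluates
 * candidates under the pset lock.
 */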

/*
 * should_spill_to_ecores()
 *
 * Spill policy is implemented here
 */
bool
should_spill_to_ecores(processor_set_t nset, thread_t thread)
{
	if (nset->pset_cluster_type == PSET_AMP_E) {
		/* Not relevant if ecores already preferred */
		return false;
	}

	if (!pset_is_recommended(ecore_set)) {
		/* E cores must be recommended */
		return false;
	}

	if (thread->sched_flags & TH_SFLAG_PCORE_ONLY) {
		return false;
	}

	if (thread->sched_pri >= BASEPRI_RTQUEUES) {
		/* Never spill realtime threads */
		return false;
	}

	if ((nset->recommended_bitmask & nset->cpu_state_map[PROCESSOR_IDLE]) != 0) {
		/* Don't spill if the P cluster still has idle cores */
		return false;
	}

	if ((sched_get_pset_load_average(nset, 0) >= sched_amp_spill_threshold(nset)) &&  /* P cores are already loaded past the spill threshold */
	    pset_should_accept_spilled_thread(ecore_set, thread->sched_pri)) { /* An E core is idle or running lower-priority work */
		return true;
	}

	return false;
}

/*
 * sched_amp_check_spill()
 *
 * Routine to check if the thread should be spilled and signal the pset if needed.
 */
void
sched_amp_check_spill(processor_set_t pset, thread_t thread)
{
	/* pset is unlocked */

	/* Bound threads don't call this function */
	assert(thread->bound_processor == PROCESSOR_NULL);

	if (should_spill_to_ecores(pset, thread)) {
		pset_lock(ecore_set);

		pset_signal_spill(ecore_set, thread->sched_pri);
		/* returns with ecore_set unlocked */
	}
}

/*
 * sched_amp_steal_threshold()
 *
 * Routine to calculate the steal threshold
 */
int
sched_amp_steal_threshold(processor_set_t pset, bool spill_pending)
{
	int recommended_processor_count = bit_count(pset->recommended_bitmask & pset->cpu_bitmask);

	return (recommended_processor_count << PSET_LOAD_FRACTIONAL_SHIFT) + (spill_pending ? sched_amp_spill_steal : sched_amp_idle_steal);
}
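
/*
 * This mirrors sched_amp_spill_threshold() above, but with a smaller
 * additive term (sched_amp_spill_steal and sched_amp_idle_steal both
 * default to 1, versus 3 for sched_amp_spill_count), so the steal
 * condition is met at a slightly lower load than the spill condition.
 */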

/*
 * sched_amp_steal_thread_enabled()
 *
 * Stealing is enabled only for E psets, and only while P cores are online.
 */
bool
sched_amp_steal_thread_enabled(processor_set_t pset)
{
	return (pset->pset_cluster_type == PSET_AMP_E) && (pcore_set->online_processor_count > 0);
}

/*
 * sched_amp_balance()
 *
 * Invoked with pset locked, returns with pset unlocked
 */
void
sched_amp_balance(processor_t cprocessor, processor_set_t cpset)
{
	assert(cprocessor == current_processor());

	pset_unlock(cpset);

	if (cpset->pset_cluster_type == PSET_AMP_E || !cprocessor->is_recommended) {
		return;
	}

	/*
	 * cprocessor is an idle, recommended P core processor.
	 * Look for P-eligible threads that have spilled to an E core
	 * and coax them to come back.
	 */

	processor_set_t pset = ecore_set;

	pset_lock(pset);

	processor_t eprocessor;
	uint64_t ast_processor_map = 0;

	sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};
	uint64_t running_map = pset->cpu_state_map[PROCESSOR_RUNNING];
	for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
		eprocessor = processor_array[cpuid];
		if ((eprocessor->current_pri < BASEPRI_RTQUEUES) &&
		    (eprocessor->current_recommended_pset_type == PSET_AMP_P)) {
			ipi_type[eprocessor->cpu_id] = sched_ipi_action(eprocessor, NULL, false, SCHED_IPI_EVENT_REBALANCE);
			if (ipi_type[eprocessor->cpu_id] != SCHED_IPI_NONE) {
				bit_set(ast_processor_map, eprocessor->cpu_id);
				assert(eprocessor != cprocessor);
			}
		}
	}

	pset_unlock(pset);

	for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
		processor_t ast_processor = processor_array[cpuid];
		sched_ipi_perform(ast_processor, ipi_type[cpuid]);
	}
}

/*
 * Helper function for sched_amp_thread_group_recommendation_change()
 * Find all the cores in the pset running threads from the thread_group tg
 * and send them a rebalance interrupt.
 */
void
sched_amp_bounce_thread_group_from_ecores(processor_set_t pset, struct thread_group *tg)
{
	assert(pset->pset_cluster_type == PSET_AMP_E);
	uint64_t ast_processor_map = 0;
	sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};

	spl_t s = splsched();
	pset_lock(pset);

	uint64_t running_map = pset->cpu_state_map[PROCESSOR_RUNNING];
	for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
		processor_t eprocessor = processor_array[cpuid];
		if (eprocessor->current_thread_group == tg) {
			ipi_type[eprocessor->cpu_id] = sched_ipi_action(eprocessor, NULL, false, SCHED_IPI_EVENT_REBALANCE);
			if (ipi_type[eprocessor->cpu_id] != SCHED_IPI_NONE) {
				bit_set(ast_processor_map, eprocessor->cpu_id);
			} else if (eprocessor == current_processor()) {
				ast_on(AST_PREEMPT);
				bit_set(pset->pending_AST_PREEMPT_cpu_mask, eprocessor->cpu_id);
			}
		}
	}

	KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_RECOMMENDATION_CHANGE) | DBG_FUNC_NONE, tg, ast_processor_map, 0, 0);

	pset_unlock(pset);

	for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
		processor_t ast_processor = processor_array[cpuid];
		sched_ipi_perform(ast_processor, ipi_type[cpuid]);
	}

	splx(s);
}

/*
 * sched_amp_ipi_policy()
 */
sched_ipi_type_t
sched_amp_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event)
{
	processor_set_t pset = dst->processor_set;
	assert(bit_test(pset->pending_AST_URGENT_cpu_mask, dst->cpu_id) == false);
	assert(dst != current_processor());

	boolean_t deferred_ipi_supported = false;
#if defined(CONFIG_SCHED_DEFERRED_AST)
	deferred_ipi_supported = true;
#endif /* CONFIG_SCHED_DEFERRED_AST */

	switch (event) {
	case SCHED_IPI_EVENT_SPILL:
		/* For Spill event, use deferred IPIs if sched_amp_spill_deferred_ipi set */
		if (deferred_ipi_supported && sched_amp_spill_deferred_ipi) {
			return sched_ipi_deferred_policy(pset, dst, event);
		}
		break;
	case SCHED_IPI_EVENT_PREEMPT:
		/* For preemption, the default policy is to use deferred IPIs
		 * for Non-RT P-core preemption. Override that behavior if
		 * sched_amp_pcores_preempt_immediate_ipi is set
		 */
		if (thread && thread->sched_pri < BASEPRI_RTQUEUES) {
			if (sched_amp_pcores_preempt_immediate_ipi && (pset == pcore_set)) {
				return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
			}
		}
		break;
	default:
		break;
	}
	/* Default back to the global policy for all other scenarios */
	return sched_ipi_policy(dst, thread, dst_idle, event);
}
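
/*
 * Summary of the overrides above, with the default tunables; every other
 * case falls through to the global sched_ipi_policy():
 *
 *   SPILL event, deferred IPIs supported      -> deferred IPI
 *   PREEMPT event, non-RT thread, P cluster   -> immediate (or idle) IPI
 */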

/*
 * sched_amp_qos_max_parallelism()
 */
uint32_t
sched_amp_qos_max_parallelism(int qos, uint64_t options)
{
	uint32_t ecount = ecore_set->cpu_set_count;
	uint32_t pcount = pcore_set->cpu_set_count;

	if (options & QOS_PARALLELISM_REALTIME) {
		/* For realtime threads on AMP, limit the parallelism
		 * width to just the P-cores, since we do not
		 * spill/rebalance RT threads.
		 */
		return pcount;
	}

	/*
	 * The default AMP scheduler policy is to run utility and bg
	 * threads on E-Cores only. Run-time policy adjustment unlocks the
	 * ability of utility and bg threads to be scheduled based on
	 * run-time conditions.
	 */
	switch (qos) {
	case THREAD_QOS_UTILITY:
		return (os_atomic_load(&sched_perfctl_policy_util, relaxed) == SCHED_PERFCTL_POLICY_DEFAULT) ? ecount : (ecount + pcount);
	case THREAD_QOS_BACKGROUND:
	case THREAD_QOS_MAINTENANCE:
		return (os_atomic_load(&sched_perfctl_policy_bg, relaxed) == SCHED_PERFCTL_POLICY_DEFAULT) ? ecount : (ecount + pcount);
	default:
		return ecount + pcount;
	}
}
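
/*
 * Example (hypothetical 4E+2P system): QOS_PARALLELISM_REALTIME yields 2;
 * THREAD_QOS_UTILITY yields 4 under the default perfctl policy, and 6
 * once the policy has been changed at run-time; all other QoS classes
 * yield 6.
 */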

pset_node_t
sched_amp_choose_node(thread_t thread)
{
	if (recommended_pset_type(thread) == PSET_AMP_P) {
		return pcore_set->node;
	} else {
		return ecore_set->node;
	}
}

/*
 * sched_amp_rt_runq()
 */
rt_queue_t
sched_amp_rt_runq(processor_set_t pset)
{
	return &pset->rt_runq;
}

/*
 * sched_amp_rt_init()
 */
void
sched_amp_rt_init(processor_set_t pset)
{
	pset_rt_init(pset);
}

/*
 * sched_amp_rt_queue_shutdown()
 */
void
sched_amp_rt_queue_shutdown(processor_t processor)
{
	processor_set_t pset = processor->processor_set;
	thread_t        thread;
	queue_head_t    tqueue;

	pset_lock(pset);

	/* We only need to migrate threads if this is the last active or last recommended processor in the pset */
	if ((pset->online_processor_count > 0) && pset_is_recommended(pset)) {
		pset_unlock(pset);
		return;
	}

	queue_init(&tqueue);

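	/*
	 * Drain the RT runq into a local queue while holding the pset lock,
	 * then drop the lock before redispatching each thread, since
	 * thread_setrun() may need to take other pset locks.
	 */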
	while (rt_runq_count(pset) > 0) {
		thread = qe_dequeue_head(&pset->rt_runq.queue, struct thread, runq_links);
		thread->runq = PROCESSOR_NULL;
		SCHED_STATS_RUNQ_CHANGE(&pset->rt_runq.runq_stats,
		    os_atomic_load(&pset->rt_runq.count, relaxed));
		rt_runq_count_decr(pset);
		enqueue_tail(&tqueue, &thread->runq_links);
	}
	sched_update_pset_load_average(pset, 0);
	pset_unlock(pset);

	qe_foreach_element_safe(thread, &tqueue, runq_links) {
		remqueue(&thread->runq_links);

		thread_lock(thread);

		thread_setrun(thread, SCHED_TAILQ);

		thread_unlock(thread);
	}
}

/*
 * sched_amp_rt_runq_scan()
 *
 * Assumes pset locks are not held, and acquires splsched/pset locks itself
 */
void
sched_amp_rt_runq_scan(sched_update_scan_context_t scan_context)
{
	thread_t        thread;

	pset_node_t node = &pset_node0;
	processor_set_t pset = node->psets;

	spl_t s = splsched();
	do {
		while (pset != NULL) {
			pset_lock(pset);

			qe_foreach_element_safe(thread, &pset->rt_runq.queue, runq_links) {
				if (thread->last_made_runnable_time < scan_context->earliest_rt_make_runnable_time) {
					scan_context->earliest_rt_make_runnable_time = thread->last_made_runnable_time;
				}
			}

			pset_unlock(pset);

			pset = pset->pset_list;
		}
	} while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));
	splx(s);
}

/*
 * sched_amp_rt_runq_count_sum()
 */
int64_t
sched_amp_rt_runq_count_sum(void)
{
	pset_node_t node = &pset_node0;
	processor_set_t pset = node->psets;
	int64_t count = 0;

	do {
		while (pset != NULL) {
			count += pset->rt_runq.runq_stats.count_sum;

			pset = pset->pset_list;
		}
	} while (((node = node->node_list) != NULL) && ((pset = node->psets) != NULL));

	return count;
}

#endif /* __AMP__ */