/ duct-tape / xnu / osfmk / kern / telemetry.c
telemetry.c
   1  /*
   2   * Copyright (c) 2012-2020 Apple Inc. All rights reserved.
   3   *
   4   * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5   *
   6   * This file contains Original Code and/or Modifications of Original Code
   7   * as defined in and that are subject to the Apple Public Source License
   8   * Version 2.0 (the 'License'). You may not use this file except in
   9   * compliance with the License. The rights granted to you under the License
  10   * may not be used to create, or enable the creation or redistribution of,
  11   * unlawful or unlicensed copies of an Apple operating system, or to
  12   * circumvent, violate, or enable the circumvention or violation of, any
  13   * terms of an Apple operating system software license agreement.
  14   *
  15   * Please obtain a copy of the License at
  16   * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17   *
  18   * The Original Code and all software distributed under the License are
  19   * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20   * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21   * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22   * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23   * Please see the License for the specific language governing rights and
  24   * limitations under the License.
  25   *
  26   * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27   */
  28  #include <mach/host_priv.h>
  29  #include <mach/host_special_ports.h>
  30  #include <mach/mach_types.h>
  31  #include <mach/telemetry_notification_server.h>
  32  
  33  #include <kern/assert.h>
  34  #include <kern/clock.h>
  35  #include <kern/debug.h>
  36  #include <kern/host.h>
  37  #include <kern/kalloc.h>
  38  #include <kern/kern_types.h>
  39  #include <kern/locks.h>
  40  #include <kern/misc_protos.h>
  41  #include <kern/sched.h>
  42  #include <kern/sched_prim.h>
  43  #include <kern/telemetry.h>
  44  #include <kern/timer_call.h>
  45  #include <kern/policy_internal.h>
  46  #include <kern/kcdata.h>
  47  
  48  #include <pexpert/pexpert.h>
  49  
  50  #include <vm/vm_kern.h>
  51  #include <vm/vm_shared_region.h>
  52  
  53  #include <kperf/callstack.h>
  54  #include <kern/backtrace.h>
  55  #include <kern/monotonic.h>
  56  
  57  #include <sys/kdebug.h>
  58  #include <uuid/uuid.h>
  59  #include <kdp/kdp_dyld.h>
  60  
/* When non-zero, compiles in the printf() logging guarded by "#if TELEMETRY_DEBUG". */
#define TELEMETRY_DEBUG 0

/*
 * Prototypes for routines that are defined outside this file and are
 * declared here directly instead of coming from a header.
 */
struct proc;
extern int      proc_pid(struct proc *);
extern char     *proc_name_address(void *p);
extern uint64_t proc_uniqueid(void *p);
extern uint64_t proc_was_throttled(void *p);
extern uint64_t proc_did_throttle(void *p);
extern int      proc_selfpid(void);
extern boolean_t task_did_exec(task_t task);
extern boolean_t task_is_exec_copy(task_t task);
  72  
/*
 * Bookkeeping for one ring buffer of microstackshot records.
 * All offsets are in bytes, relative to `buffer`.
 */
struct micro_snapshot_buffer {
	vm_offset_t             buffer;             /* base address of the backing memory; 0 when not allocated */
	uint32_t                size;               /* capacity of the backing memory, in bytes */
	uint32_t                current_position;   /* offset at which the next record will be written */
	uint32_t                end_point;          /* offset of the known end of valid data (see the wrap logic in telemetry_take_sample) */
};
  79  
/* Defined later in this file; declared here so earlier functions can call them. */
void telemetry_take_sample(thread_t thread, uint8_t microsnapshot_flags, struct micro_snapshot_buffer * current_buffer);
int telemetry_buffer_gather(user_addr_t buffer, uint32_t *length, boolean_t mark, struct micro_snapshot_buffer * current_buffer);

/* Defaults used by telemetry_init() when the corresponding boot-arg is absent. */
#define TELEMETRY_DEFAULT_SAMPLE_RATE (1) /* 1 sample every 1 second */
#define TELEMETRY_DEFAULT_BUFFER_SIZE (16*1024)
/* Upper bound applied to the "telemetry_buffer_size" boot-arg. */
#define TELEMETRY_MAX_BUFFER_SIZE (64*1024)

#define TELEMETRY_DEFAULT_NOTIFY_LEEWAY (4*1024) // Userland gets 4k of leeway to collect data after notification
#define TELEMETRY_MAX_UUID_COUNT (128) // Max of 128 non-shared-cache UUIDs to log for symbolication
  89  
/* Sampling period; stays 0 until telemetry_init() assigns it. */
uint32_t                        telemetry_sample_rate = 0;
/* Set by compute_telemetry() when a sample is due; cleared by telemetry_mark_curthread(). */
volatile boolean_t      telemetry_needs_record = FALSE;
/* Set by compute_telemetry() when a sample is due; consumed by telemetry_timer_event(). */
volatile boolean_t      telemetry_needs_timer_arming_record = FALSE;

/*
 * If TRUE, record micro-stackshot samples for all tasks.
 * If FALSE, only sample tasks which are marked for telemetry.
 */
boolean_t telemetry_sample_all_tasks = FALSE;
/* TRUE while PMI-driven sampling is enabled through telemetry_pmi_setup(). */
boolean_t telemetry_sample_pmis = FALSE;
uint32_t telemetry_active_tasks = 0; // Number of tasks opted into telemetry

/* Tick counter advanced by compute_telemetry(); compared against telemetry_sample_rate. */
uint32_t telemetry_timestamp = 0;
 103  
/*
 * The telemetry_buffer is responsible
 * for timer samples and interrupt samples that are driven by
 * compute_averages().  It will notify its client (if one
 * exists) when it has enough data to be worth flushing.
 *
 * It starts out unbacked (all fields zero); telemetry_init() allocates
 * the memory and fills in the size.
 */
struct micro_snapshot_buffer telemetry_buffer = {
	.buffer = 0,
	.size = 0,
	.current_position = 0,
	.end_point = 0
};

int                                     telemetry_bytes_since_last_mark = -1; // How much data since buf was last marked?
/* Byte threshold: once telemetry_bytes_since_last_mark exceeds this, userland is notified. */
int                                     telemetry_buffer_notify_at = 0;
 119  
/* Lock group shared by both telemetry mutexes. */
LCK_GRP_DECLARE(telemetry_lck_grp, "telemetry group");
/* Held while records are written into, or gathered from, a micro_snapshot_buffer. */
LCK_MTX_DECLARE(telemetry_mtx, &telemetry_lck_grp);
/* Serializes telemetry_pmi_setup(). */
LCK_MTX_DECLARE(telemetry_pmi_mtx, &telemetry_lck_grp);

/* Convenience wrappers around telemetry_mtx. */
#define TELEMETRY_LOCK() do { lck_mtx_lock(&telemetry_mtx); } while (0)
#define TELEMETRY_TRY_SPIN_LOCK() lck_mtx_try_lock_spin(&telemetry_mtx)
#define TELEMETRY_UNLOCK() do { lck_mtx_unlock(&telemetry_mtx); } while (0)

/* Convenience wrappers around telemetry_pmi_mtx. */
#define TELEMETRY_PMI_LOCK() do { lck_mtx_lock(&telemetry_pmi_mtx); } while (0)
#define TELEMETRY_PMI_UNLOCK() do { lck_mtx_unlock(&telemetry_pmi_mtx); } while (0)
 130  
 131  void
 132  telemetry_init(void)
 133  {
 134  	kern_return_t ret;
 135  	uint32_t          telemetry_notification_leeway;
 136  
 137  	if (!PE_parse_boot_argn("telemetry_buffer_size",
 138  	    &telemetry_buffer.size, sizeof(telemetry_buffer.size))) {
 139  		telemetry_buffer.size = TELEMETRY_DEFAULT_BUFFER_SIZE;
 140  	}
 141  
 142  	if (telemetry_buffer.size > TELEMETRY_MAX_BUFFER_SIZE) {
 143  		telemetry_buffer.size = TELEMETRY_MAX_BUFFER_SIZE;
 144  	}
 145  
 146  	ret = kmem_alloc(kernel_map, &telemetry_buffer.buffer, telemetry_buffer.size, VM_KERN_MEMORY_DIAG);
 147  	if (ret != KERN_SUCCESS) {
 148  		kprintf("Telemetry: Allocation failed: %d\n", ret);
 149  		return;
 150  	}
 151  	bzero((void *) telemetry_buffer.buffer, telemetry_buffer.size);
 152  
 153  	if (!PE_parse_boot_argn("telemetry_notification_leeway",
 154  	    &telemetry_notification_leeway, sizeof(telemetry_notification_leeway))) {
 155  		/*
 156  		 * By default, notify the user to collect the buffer when there is this much space left in the buffer.
 157  		 */
 158  		telemetry_notification_leeway = TELEMETRY_DEFAULT_NOTIFY_LEEWAY;
 159  	}
 160  	if (telemetry_notification_leeway >= telemetry_buffer.size) {
 161  		printf("telemetry: nonsensical telemetry_notification_leeway boot-arg %d changed to %d\n",
 162  		    telemetry_notification_leeway, TELEMETRY_DEFAULT_NOTIFY_LEEWAY);
 163  		telemetry_notification_leeway = TELEMETRY_DEFAULT_NOTIFY_LEEWAY;
 164  	}
 165  	telemetry_buffer_notify_at = telemetry_buffer.size - telemetry_notification_leeway;
 166  
 167  	if (!PE_parse_boot_argn("telemetry_sample_rate",
 168  	    &telemetry_sample_rate, sizeof(telemetry_sample_rate))) {
 169  		telemetry_sample_rate = TELEMETRY_DEFAULT_SAMPLE_RATE;
 170  	}
 171  
 172  	/*
 173  	 * To enable telemetry for all tasks, include "telemetry_sample_all_tasks=1" in boot-args.
 174  	 */
 175  	if (!PE_parse_boot_argn("telemetry_sample_all_tasks",
 176  	    &telemetry_sample_all_tasks, sizeof(telemetry_sample_all_tasks))) {
 177  #if !defined(XNU_TARGET_OS_OSX) && !(DEVELOPMENT || DEBUG)
 178  		telemetry_sample_all_tasks = FALSE;
 179  #else
 180  		telemetry_sample_all_tasks = TRUE;
 181  #endif /* !defined(XNU_TARGET_OS_OSX) && !(DEVELOPMENT || DEBUG) */
 182  	}
 183  
 184  	kprintf("Telemetry: Sampling %stasks once per %u second%s\n",
 185  	    (telemetry_sample_all_tasks) ? "all " : "",
 186  	    telemetry_sample_rate, telemetry_sample_rate == 1 ? "" : "s");
 187  }
 188  
 189  /*
 190   * Enable or disable global microstackshots (ie telemetry_sample_all_tasks).
 191   *
 192   * enable_disable == 1: turn it on
 193   * enable_disable == 0: turn it off
 194   */
 195  void
 196  telemetry_global_ctl(int enable_disable)
 197  {
 198  	if (enable_disable == 1) {
 199  		telemetry_sample_all_tasks = TRUE;
 200  	} else {
 201  		telemetry_sample_all_tasks = FALSE;
 202  	}
 203  }
 204  
/*
 * Opt the given task into or out of the telemetry stream.
 *
 * Takes the task lock around telemetry_task_ctl_locked(), which does the
 * actual flag and counter updates; use that variant directly when the task
 * lock is already held.
 *
 * Supported reasons (callers may use any or all of):
 *     TF_CPUMON_WARNING
 *     TF_WAKEMON_WARNING
 *
 * enable_disable == 1: turn it on
 * enable_disable == 0: turn it off
 */
void
telemetry_task_ctl(task_t task, uint32_t reasons, int enable_disable)
{
	task_lock(task);
	telemetry_task_ctl_locked(task, reasons, enable_disable);
	task_unlock(task);
}
 222  
 223  void
 224  telemetry_task_ctl_locked(task_t task, uint32_t reasons, int enable_disable)
 225  {
 226  	uint32_t origflags;
 227  
 228  	assert((reasons != 0) && ((reasons | TF_TELEMETRY) == TF_TELEMETRY));
 229  
 230  	task_lock_assert_owned(task);
 231  
 232  	origflags = task->t_flags;
 233  
 234  	if (enable_disable == 1) {
 235  		task->t_flags |= reasons;
 236  		if ((origflags & TF_TELEMETRY) == 0) {
 237  			OSIncrementAtomic(&telemetry_active_tasks);
 238  #if TELEMETRY_DEBUG
 239  			printf("%s: telemetry OFF -> ON (%d active)\n", proc_name_address(task->bsd_info), telemetry_active_tasks);
 240  #endif
 241  		}
 242  	} else {
 243  		task->t_flags &= ~reasons;
 244  		if (((origflags & TF_TELEMETRY) != 0) && ((task->t_flags & TF_TELEMETRY) == 0)) {
 245  			/*
 246  			 * If this task went from having at least one telemetry bit to having none,
 247  			 * the net change was to disable telemetry for the task.
 248  			 */
 249  			OSDecrementAtomic(&telemetry_active_tasks);
 250  #if TELEMETRY_DEBUG
 251  			printf("%s: telemetry ON -> OFF (%d active)\n", proc_name_address(task->bsd_info), telemetry_active_tasks);
 252  #endif
 253  		}
 254  	}
 255  }
 256  
 257  /*
 258   * Determine if the current thread is eligible for telemetry:
 259   *
 260   * telemetry_sample_all_tasks: All threads are eligible. This takes precedence.
 261   * telemetry_active_tasks: Count of tasks opted in.
 262   * task->t_flags & TF_TELEMETRY: This task is opted in.
 263   */
 264  static boolean_t
 265  telemetry_is_active(thread_t thread)
 266  {
 267  	task_t task = thread->task;
 268  
 269  	if (task == kernel_task) {
 270  		/* Kernel threads never return to an AST boundary, and are ineligible */
 271  		return FALSE;
 272  	}
 273  
 274  	if (telemetry_sample_all_tasks || telemetry_sample_pmis) {
 275  		return TRUE;
 276  	}
 277  
 278  	if ((telemetry_active_tasks > 0) && ((thread->task->t_flags & TF_TELEMETRY) != 0)) {
 279  		return TRUE;
 280  	}
 281  
 282  	return FALSE;
 283  }
 284  
 285  /*
 286   * Userland is arming a timer. If we are eligible for such a record,
 287   * sample now. No need to do this one at the AST because we're already at
 288   * a safe place in this system call.
 289   */
 290  int
 291  telemetry_timer_event(__unused uint64_t deadline, __unused uint64_t interval, __unused uint64_t leeway)
 292  {
 293  	if (telemetry_needs_timer_arming_record == TRUE) {
 294  		telemetry_needs_timer_arming_record = FALSE;
 295  		telemetry_take_sample(current_thread(), kTimerArmingRecord | kUserMode, &telemetry_buffer);
 296  	}
 297  
 298  	return 0;
 299  }
 300  
#if defined(MT_CORE_INSTRS) && defined(MT_CORE_CYCLES)
/*
 * Callback registered with mt_microstackshot_start() by
 * telemetry_pmi_setup(). Marks the current thread for a PMI-flavoured
 * telemetry record; user_mode is forwarded as the "interrupted userspace"
 * indication and ctx is ignored.
 */
static void
telemetry_pmi_handler(bool user_mode, __unused void *ctx)
{
	telemetry_mark_curthread(user_mode, TRUE);
}
#endif /* defined(MT_CORE_INSTRS) && defined(MT_CORE_CYCLES) */
 308  
 309  int
 310  telemetry_pmi_setup(enum telemetry_pmi pmi_ctr, uint64_t period)
 311  {
 312  #if defined(MT_CORE_INSTRS) && defined(MT_CORE_CYCLES)
 313  	static boolean_t sample_all_tasks_aside = FALSE;
 314  	static uint32_t active_tasks_aside = FALSE;
 315  	int error = 0;
 316  	const char *name = "?";
 317  
 318  	unsigned int ctr = 0;
 319  
 320  	TELEMETRY_PMI_LOCK();
 321  
 322  	switch (pmi_ctr) {
 323  	case TELEMETRY_PMI_NONE:
 324  		if (!telemetry_sample_pmis) {
 325  			error = 1;
 326  			goto out;
 327  		}
 328  
 329  		telemetry_sample_pmis = FALSE;
 330  		telemetry_sample_all_tasks = sample_all_tasks_aside;
 331  		telemetry_active_tasks = active_tasks_aside;
 332  		error = mt_microstackshot_stop();
 333  		if (!error) {
 334  			printf("telemetry: disabling ustackshot on PMI\n");
 335  		}
 336  		goto out;
 337  
 338  	case TELEMETRY_PMI_INSTRS:
 339  		ctr = MT_CORE_INSTRS;
 340  		name = "instructions";
 341  		break;
 342  
 343  	case TELEMETRY_PMI_CYCLES:
 344  		ctr = MT_CORE_CYCLES;
 345  		name = "cycles";
 346  		break;
 347  
 348  	default:
 349  		error = 1;
 350  		goto out;
 351  	}
 352  
 353  	telemetry_sample_pmis = TRUE;
 354  	sample_all_tasks_aside = telemetry_sample_all_tasks;
 355  	active_tasks_aside = telemetry_active_tasks;
 356  	telemetry_sample_all_tasks = FALSE;
 357  	telemetry_active_tasks = 0;
 358  
 359  	error = mt_microstackshot_start(ctr, period, telemetry_pmi_handler, NULL);
 360  	if (!error) {
 361  		printf("telemetry: ustackshot every %llu %s\n", period, name);
 362  	}
 363  
 364  out:
 365  	TELEMETRY_PMI_UNLOCK();
 366  	return error;
 367  #else /* defined(MT_CORE_INSTRS) && defined(MT_CORE_CYCLES) */
 368  #pragma unused(pmi_ctr, period)
 369  	return 1;
 370  #endif /* !defined(MT_CORE_INSTRS) || !defined(MT_CORE_CYCLES) */
 371  }
 372  
 373  /*
 374   * Mark the current thread for an interrupt-based
 375   * telemetry record, to be sampled at the next AST boundary.
 376   */
 377  void
 378  telemetry_mark_curthread(boolean_t interrupted_userspace, boolean_t pmi)
 379  {
 380  	uint32_t ast_bits = 0;
 381  	thread_t thread = current_thread();
 382  
 383  	/*
 384  	 * If telemetry isn't active for this thread, return and try
 385  	 * again next time.
 386  	 */
 387  	if (telemetry_is_active(thread) == FALSE) {
 388  		return;
 389  	}
 390  
 391  	ast_bits |= (interrupted_userspace ? AST_TELEMETRY_USER : AST_TELEMETRY_KERNEL);
 392  	if (pmi) {
 393  		ast_bits |= AST_TELEMETRY_PMI;
 394  	}
 395  
 396  	telemetry_needs_record = FALSE;
 397  	thread_ast_set(thread, ast_bits);
 398  	ast_propagate(thread);
 399  }
 400  
 401  void
 402  compute_telemetry(void *arg __unused)
 403  {
 404  	if (telemetry_sample_all_tasks || (telemetry_active_tasks > 0)) {
 405  		if ((++telemetry_timestamp) % telemetry_sample_rate == 0) {
 406  			telemetry_needs_record = TRUE;
 407  			telemetry_needs_timer_arming_record = TRUE;
 408  		}
 409  	}
 410  }
 411  
 412  /*
 413   * If userland has registered a port for telemetry notifications, send one now.
 414   */
 415  static void
 416  telemetry_notify_user(void)
 417  {
 418  	mach_port_t user_port = MACH_PORT_NULL;
 419  
 420  	kern_return_t kr = host_get_telemetry_port(host_priv_self(), &user_port);
 421  	if ((kr != KERN_SUCCESS) || !IPC_PORT_VALID(user_port)) {
 422  		return;
 423  	}
 424  
 425  	telemetry_notification(user_port, 0);
 426  	ipc_port_release_send(user_port);
 427  }
 428  
 429  void
 430  telemetry_ast(thread_t thread, ast_t reasons)
 431  {
 432  	assert((reasons & AST_TELEMETRY_ALL) != 0);
 433  
 434  	uint8_t record_type = 0;
 435  	if (reasons & AST_TELEMETRY_IO) {
 436  		record_type |= kIORecord;
 437  	}
 438  	if (reasons & (AST_TELEMETRY_USER | AST_TELEMETRY_KERNEL)) {
 439  		record_type |= (reasons & AST_TELEMETRY_PMI) ? kPMIRecord :
 440  		    kInterruptRecord;
 441  	}
 442  
 443  	uint8_t user_telemetry = (reasons & AST_TELEMETRY_USER) ? kUserMode : 0;
 444  
 445  	uint8_t microsnapshot_flags = record_type | user_telemetry;
 446  
 447  	telemetry_take_sample(thread, microsnapshot_flags, &telemetry_buffer);
 448  }
 449  
 450  void
 451  telemetry_take_sample(thread_t thread, uint8_t microsnapshot_flags, struct micro_snapshot_buffer * current_buffer)
 452  {
 453  	task_t task;
 454  	void *p;
 455  	uint32_t btcount = 0, bti;
 456  	struct micro_snapshot *msnap;
 457  	struct task_snapshot *tsnap;
 458  	struct thread_snapshot *thsnap;
 459  	clock_sec_t secs;
 460  	clock_usec_t usecs;
 461  	vm_size_t framesize;
 462  	uint32_t current_record_start;
 463  	uint32_t tmp = 0;
 464  	boolean_t notify = FALSE;
 465  
 466  	if (thread == THREAD_NULL) {
 467  		return;
 468  	}
 469  
 470  	task = thread->task;
 471  	if ((task == TASK_NULL) || (task == kernel_task) || task_did_exec(task) || task_is_exec_copy(task)) {
 472  		return;
 473  	}
 474  
 475  	/* telemetry_XXX accessed outside of lock for instrumentation only */
 476  	KDBG(MACHDBG_CODE(DBG_MACH_STACKSHOT, MICROSTACKSHOT_RECORD) | DBG_FUNC_START,
 477  	    microsnapshot_flags, telemetry_bytes_since_last_mark, 0,
 478  	    (&telemetry_buffer != current_buffer));
 479  
 480  	p = get_bsdtask_info(task);
 481  
 482  	/*
 483  	 * Gather up the data we'll need for this sample. The sample is written into the kernel
 484  	 * buffer with the global telemetry lock held -- so we must do our (possibly faulting)
 485  	 * copies from userland here, before taking the lock.
 486  	 */
 487  
 488  	uintptr_t frames[128];
 489  	bool user64_regs = false;
 490  	int bterror = 0;
 491  	btcount = backtrace_user(frames,
 492  	    sizeof(frames) / sizeof(frames[0]), &bterror, &user64_regs, NULL);
 493  	if (bterror != 0) {
 494  		return;
 495  	}
 496  	bool user64_va = task_has_64Bit_addr(task);
 497  
 498  	/*
 499  	 * Retrieve the array of UUID's for binaries used by this task.
 500  	 * We reach down into DYLD's data structures to find the array.
 501  	 *
 502  	 * XXX - make this common with kdp?
 503  	 */
 504  	uint32_t uuid_info_count = 0;
 505  	mach_vm_address_t uuid_info_addr = 0;
 506  	uint32_t uuid_info_size = 0;
 507  	if (user64_va) {
 508  		uuid_info_size = sizeof(struct user64_dyld_uuid_info);
 509  		struct user64_dyld_all_image_infos task_image_infos;
 510  		if (copyin(task->all_image_info_addr, (char *)&task_image_infos, sizeof(task_image_infos)) == 0) {
 511  			uuid_info_count = (uint32_t)task_image_infos.uuidArrayCount;
 512  			uuid_info_addr = task_image_infos.uuidArray;
 513  		}
 514  	} else {
 515  		uuid_info_size = sizeof(struct user32_dyld_uuid_info);
 516  		struct user32_dyld_all_image_infos task_image_infos;
 517  		if (copyin(task->all_image_info_addr, (char *)&task_image_infos, sizeof(task_image_infos)) == 0) {
 518  			uuid_info_count = task_image_infos.uuidArrayCount;
 519  			uuid_info_addr = task_image_infos.uuidArray;
 520  		}
 521  	}
 522  
 523  	/*
 524  	 * If we get a NULL uuid_info_addr (which can happen when we catch dyld in the middle of updating
 525  	 * this data structure), we zero the uuid_info_count so that we won't even try to save load info
 526  	 * for this task.
 527  	 */
 528  	if (!uuid_info_addr) {
 529  		uuid_info_count = 0;
 530  	}
 531  
 532  	/*
 533  	 * Don't copy in an unbounded amount of memory. The main binary and interesting
 534  	 * non-shared-cache libraries should be in the first few images.
 535  	 */
 536  	if (uuid_info_count > TELEMETRY_MAX_UUID_COUNT) {
 537  		uuid_info_count = TELEMETRY_MAX_UUID_COUNT;
 538  	}
 539  
 540  	uint32_t uuid_info_array_size = uuid_info_count * uuid_info_size;
 541  	char     *uuid_info_array = NULL;
 542  
 543  	if (uuid_info_count > 0) {
 544  		uuid_info_array = kheap_alloc(KHEAP_TEMP,
 545  		    uuid_info_array_size, Z_WAITOK);
 546  		if (uuid_info_array == NULL) {
 547  			return;
 548  		}
 549  
 550  		/*
 551  		 * Copy in the UUID info array.
 552  		 * It may be nonresident, in which case just fix up nloadinfos to 0 in the task snapshot.
 553  		 */
 554  		if (copyin(uuid_info_addr, uuid_info_array, uuid_info_array_size) != 0) {
 555  			kheap_free(KHEAP_TEMP, uuid_info_array, uuid_info_array_size);
 556  			uuid_info_array = NULL;
 557  			uuid_info_array_size = 0;
 558  		}
 559  	}
 560  
 561  	/*
 562  	 * Look for a dispatch queue serial number, and copy it in from userland if present.
 563  	 */
 564  	uint64_t dqserialnum = 0;
 565  	int              dqserialnum_valid = 0;
 566  
 567  	uint64_t dqkeyaddr = thread_dispatchqaddr(thread);
 568  	if (dqkeyaddr != 0) {
 569  		uint64_t dqaddr = 0;
 570  		uint64_t dq_serialno_offset = get_task_dispatchqueue_serialno_offset(task);
 571  		if ((copyin(dqkeyaddr, (char *)&dqaddr, (user64_va ? 8 : 4)) == 0) &&
 572  		    (dqaddr != 0) && (dq_serialno_offset != 0)) {
 573  			uint64_t dqserialnumaddr = dqaddr + dq_serialno_offset;
 574  			if (copyin(dqserialnumaddr, (char *)&dqserialnum, (user64_va ? 8 : 4)) == 0) {
 575  				dqserialnum_valid = 1;
 576  			}
 577  		}
 578  	}
 579  
 580  	clock_get_calendar_microtime(&secs, &usecs);
 581  
 582  	TELEMETRY_LOCK();
 583  
 584  	/*
 585  	 * If our buffer is not backed by anything,
 586  	 * then we cannot take the sample.  Meant to allow us to deallocate the window
 587  	 * buffer if it is disabled.
 588  	 */
 589  	if (!current_buffer->buffer) {
 590  		goto cancel_sample;
 591  	}
 592  
 593  	/*
 594  	 * We do the bulk of the operation under the telemetry lock, on assumption that
 595  	 * any page faults during execution will not cause another AST_TELEMETRY_ALL
 596  	 * to deadlock; they will just block until we finish. This makes it easier
 597  	 * to copy into the buffer directly. As soon as we unlock, userspace can copy
 598  	 * out of our buffer.
 599  	 */
 600  
 601  copytobuffer:
 602  
 603  	current_record_start = current_buffer->current_position;
 604  
 605  	if ((current_buffer->size - current_buffer->current_position) < sizeof(struct micro_snapshot)) {
 606  		/*
 607  		 * We can't fit a record in the space available, so wrap around to the beginning.
 608  		 * Save the current position as the known end point of valid data.
 609  		 */
 610  		current_buffer->end_point = current_record_start;
 611  		current_buffer->current_position = 0;
 612  		if (current_record_start == 0) {
 613  			/* This sample is too large to fit in the buffer even when we started at 0, so skip it */
 614  			goto cancel_sample;
 615  		}
 616  		goto copytobuffer;
 617  	}
 618  
 619  	msnap = (struct micro_snapshot *)(uintptr_t)(current_buffer->buffer + current_buffer->current_position);
 620  	msnap->snapshot_magic = STACKSHOT_MICRO_SNAPSHOT_MAGIC;
 621  	msnap->ms_flags = microsnapshot_flags;
 622  	msnap->ms_opaque_flags = 0; /* namespace managed by userspace */
 623  	msnap->ms_cpu = cpu_number();
 624  	msnap->ms_time = secs;
 625  	msnap->ms_time_microsecs = usecs;
 626  
 627  	current_buffer->current_position += sizeof(struct micro_snapshot);
 628  
 629  	if ((current_buffer->size - current_buffer->current_position) < sizeof(struct task_snapshot)) {
 630  		current_buffer->end_point = current_record_start;
 631  		current_buffer->current_position = 0;
 632  		if (current_record_start == 0) {
 633  			/* This sample is too large to fit in the buffer even when we started at 0, so skip it */
 634  			goto cancel_sample;
 635  		}
 636  		goto copytobuffer;
 637  	}
 638  
 639  	tsnap = (struct task_snapshot *)(uintptr_t)(current_buffer->buffer + current_buffer->current_position);
 640  	bzero(tsnap, sizeof(*tsnap));
 641  	tsnap->snapshot_magic = STACKSHOT_TASK_SNAPSHOT_MAGIC;
 642  	tsnap->pid = proc_pid(p);
 643  	tsnap->uniqueid = proc_uniqueid(p);
 644  	tsnap->user_time_in_terminated_threads = task->total_user_time;
 645  	tsnap->system_time_in_terminated_threads = task->total_system_time;
 646  	tsnap->suspend_count = task->suspend_count;
 647  	tsnap->task_size = (typeof(tsnap->task_size))(get_task_phys_footprint(task) / PAGE_SIZE);
 648  	tsnap->faults = counter_load(&task->faults);
 649  	tsnap->pageins = task->pageins;
 650  	tsnap->cow_faults = task->cow_faults;
 651  	/*
 652  	 * The throttling counters are maintained as 64-bit counters in the proc
 653  	 * structure. However, we reserve 32-bits (each) for them in the task_snapshot
 654  	 * struct to save space and since we do not expect them to overflow 32-bits. If we
 655  	 * find these values overflowing in the future, the fix would be to simply
 656  	 * upgrade these counters to 64-bit in the task_snapshot struct
 657  	 */
 658  	tsnap->was_throttled = (uint32_t) proc_was_throttled(p);
 659  	tsnap->did_throttle = (uint32_t) proc_did_throttle(p);
 660  
 661  	if (task->t_flags & TF_TELEMETRY) {
 662  		tsnap->ss_flags |= kTaskRsrcFlagged;
 663  	}
 664  
 665  	if (proc_get_effective_task_policy(task, TASK_POLICY_DARWIN_BG)) {
 666  		tsnap->ss_flags |= kTaskDarwinBG;
 667  	}
 668  
 669  	proc_get_darwinbgstate(task, &tmp);
 670  
 671  	if (proc_get_effective_task_policy(task, TASK_POLICY_ROLE) == TASK_FOREGROUND_APPLICATION) {
 672  		tsnap->ss_flags |= kTaskIsForeground;
 673  	}
 674  
 675  	if (tmp & PROC_FLAG_ADAPTIVE_IMPORTANT) {
 676  		tsnap->ss_flags |= kTaskIsBoosted;
 677  	}
 678  
 679  	if (tmp & PROC_FLAG_SUPPRESSED) {
 680  		tsnap->ss_flags |= kTaskIsSuppressed;
 681  	}
 682  
 683  
 684  	tsnap->latency_qos = task_grab_latency_qos(task);
 685  
 686  	strlcpy(tsnap->p_comm, proc_name_address(p), sizeof(tsnap->p_comm));
 687  	if (user64_va) {
 688  		tsnap->ss_flags |= kUser64_p;
 689  	}
 690  
 691  
 692  	if (task->task_shared_region_slide != -1) {
 693  		tsnap->shared_cache_slide = task->task_shared_region_slide;
 694  		bcopy(task->task_shared_region_uuid, tsnap->shared_cache_identifier,
 695  		    sizeof(task->task_shared_region_uuid));
 696  	}
 697  
 698  	current_buffer->current_position += sizeof(struct task_snapshot);
 699  
 700  	/*
 701  	 * Directly after the task snapshot, place the array of UUID's corresponding to the binaries
 702  	 * used by this task.
 703  	 */
 704  	if ((current_buffer->size - current_buffer->current_position) < uuid_info_array_size) {
 705  		current_buffer->end_point = current_record_start;
 706  		current_buffer->current_position = 0;
 707  		if (current_record_start == 0) {
 708  			/* This sample is too large to fit in the buffer even when we started at 0, so skip it */
 709  			goto cancel_sample;
 710  		}
 711  		goto copytobuffer;
 712  	}
 713  
 714  	/*
 715  	 * Copy the UUID info array into our sample.
 716  	 */
 717  	if (uuid_info_array_size > 0) {
 718  		bcopy(uuid_info_array, (char *)(current_buffer->buffer + current_buffer->current_position), uuid_info_array_size);
 719  		tsnap->nloadinfos = uuid_info_count;
 720  	}
 721  
 722  	current_buffer->current_position += uuid_info_array_size;
 723  
 724  	/*
 725  	 * After the task snapshot & list of binary UUIDs, we place a thread snapshot.
 726  	 */
 727  
 728  	if ((current_buffer->size - current_buffer->current_position) < sizeof(struct thread_snapshot)) {
 729  		/* wrap and overwrite */
 730  		current_buffer->end_point = current_record_start;
 731  		current_buffer->current_position = 0;
 732  		if (current_record_start == 0) {
 733  			/* This sample is too large to fit in the buffer even when we started at 0, so skip it */
 734  			goto cancel_sample;
 735  		}
 736  		goto copytobuffer;
 737  	}
 738  
 739  	thsnap = (struct thread_snapshot *)(uintptr_t)(current_buffer->buffer + current_buffer->current_position);
 740  	bzero(thsnap, sizeof(*thsnap));
 741  
 742  	thsnap->snapshot_magic = STACKSHOT_THREAD_SNAPSHOT_MAGIC;
 743  	thsnap->thread_id = thread_tid(thread);
 744  	thsnap->state = thread->state;
 745  	thsnap->priority = thread->base_pri;
 746  	thsnap->sched_pri = thread->sched_pri;
 747  	thsnap->sched_flags = thread->sched_flags;
 748  	thsnap->ss_flags |= kStacksPCOnly;
 749  	thsnap->ts_qos = thread->effective_policy.thep_qos;
 750  	thsnap->ts_rqos = thread->requested_policy.thrp_qos;
 751  	thsnap->ts_rqos_override = MAX(thread->requested_policy.thrp_qos_override,
 752  	    thread->requested_policy.thrp_qos_workq_override);
 753  
 754  	if (proc_get_effective_thread_policy(thread, TASK_POLICY_DARWIN_BG)) {
 755  		thsnap->ss_flags |= kThreadDarwinBG;
 756  	}
 757  
 758  	thsnap->user_time = timer_grab(&thread->user_timer);
 759  
 760  	uint64_t tval = timer_grab(&thread->system_timer);
 761  
 762  	if (thread->precise_user_kernel_time) {
 763  		thsnap->system_time = tval;
 764  	} else {
 765  		thsnap->user_time += tval;
 766  		thsnap->system_time = 0;
 767  	}
 768  
 769  	current_buffer->current_position += sizeof(struct thread_snapshot);
 770  
 771  	/*
 772  	 * If this thread has a dispatch queue serial number, include it here.
 773  	 */
 774  	if (dqserialnum_valid) {
 775  		if ((current_buffer->size - current_buffer->current_position) < sizeof(dqserialnum)) {
 776  			/* wrap and overwrite */
 777  			current_buffer->end_point = current_record_start;
 778  			current_buffer->current_position = 0;
 779  			if (current_record_start == 0) {
 780  				/* This sample is too large to fit in the buffer even when we started at 0, so skip it */
 781  				goto cancel_sample;
 782  			}
 783  			goto copytobuffer;
 784  		}
 785  
 786  		thsnap->ss_flags |= kHasDispatchSerial;
 787  		bcopy(&dqserialnum, (char *)current_buffer->buffer + current_buffer->current_position, sizeof(dqserialnum));
 788  		current_buffer->current_position += sizeof(dqserialnum);
 789  	}
 790  
 791  	if (user64_regs) {
 792  		framesize = 8;
 793  		thsnap->ss_flags |= kUser64_p;
 794  	} else {
 795  		framesize = 4;
 796  	}
 797  
 798  	/*
 799  	 * If we can't fit this entire stacktrace then cancel this record, wrap to the beginning,
 800  	 * and start again there so that we always store a full record.
 801  	 */
 802  	if ((current_buffer->size - current_buffer->current_position) / framesize < btcount) {
 803  		current_buffer->end_point = current_record_start;
 804  		current_buffer->current_position = 0;
 805  		if (current_record_start == 0) {
 806  			/* This sample is too large to fit in the buffer even when we started at 0, so skip it */
 807  			goto cancel_sample;
 808  		}
 809  		goto copytobuffer;
 810  	}
 811  
 812  	for (bti = 0; bti < btcount; bti++, current_buffer->current_position += framesize) {
 813  		if (framesize == 8) {
 814  			*(uint64_t *)(uintptr_t)(current_buffer->buffer + current_buffer->current_position) = frames[bti];
 815  		} else {
 816  			*(uint32_t *)(uintptr_t)(current_buffer->buffer + current_buffer->current_position) = (uint32_t)frames[bti];
 817  		}
 818  	}
 819  
 820  	if (current_buffer->end_point < current_buffer->current_position) {
 821  		/*
 822  		 * Each time the cursor wraps around to the beginning, we leave a
 823  		 * differing amount of unused space at the end of the buffer. Make
 824  		 * sure the cursor pushes the end point in case we're making use of
 825  		 * more of the buffer than we did the last time we wrapped.
 826  		 */
 827  		current_buffer->end_point = current_buffer->current_position;
 828  	}
 829  
 830  	thsnap->nuser_frames = btcount;
 831  
 832  	/*
 833  	 * Now THIS is a hack.
 834  	 */
 835  	if (current_buffer == &telemetry_buffer) {
 836  		telemetry_bytes_since_last_mark += (current_buffer->current_position - current_record_start);
 837  		if (telemetry_bytes_since_last_mark > telemetry_buffer_notify_at) {
 838  			notify = TRUE;
 839  		}
 840  	}
 841  
 842  cancel_sample:
 843  	TELEMETRY_UNLOCK();
 844  
 845  	KDBG(MACHDBG_CODE(DBG_MACH_STACKSHOT, MICROSTACKSHOT_RECORD) | DBG_FUNC_END,
 846  	    notify, telemetry_bytes_since_last_mark,
 847  	    current_buffer->current_position, current_buffer->end_point);
 848  
 849  	if (notify) {
 850  		telemetry_notify_user();
 851  	}
 852  
 853  	if (uuid_info_array != NULL) {
 854  		kheap_free(KHEAP_TEMP, uuid_info_array, uuid_info_array_size);
 855  	}
 856  }
 857  
 858  #if TELEMETRY_DEBUG
 859  static void
 860  log_telemetry_output(vm_offset_t buf, uint32_t pos, uint32_t sz)
 861  {
 862  	struct micro_snapshot *p;
 863  	uint32_t offset;
 864  
 865  	printf("Copying out %d bytes of telemetry at offset %d\n", sz, pos);
 866  
 867  	buf += pos;
 868  
 869  	/*
 870  	 * Find and log each timestamp in this chunk of buffer.
 871  	 */
 872  	for (offset = 0; offset < sz; offset++) {
 873  		p = (struct micro_snapshot *)(buf + offset);
 874  		if (p->snapshot_magic == STACKSHOT_MICRO_SNAPSHOT_MAGIC) {
 875  			printf("telemetry timestamp: %lld\n", p->ms_time);
 876  		}
 877  	}
 878  }
 879  #endif
 880  
 881  int
 882  telemetry_gather(user_addr_t buffer, uint32_t *length, boolean_t mark)
 883  {
 884  	return telemetry_buffer_gather(buffer, length, mark, &telemetry_buffer);
 885  }
 886  
 887  int
 888  telemetry_buffer_gather(user_addr_t buffer, uint32_t *length, boolean_t mark, struct micro_snapshot_buffer * current_buffer)
 889  {
 890  	int result = 0;
 891  	uint32_t oldest_record_offset;
 892  
 893  	KDBG(MACHDBG_CODE(DBG_MACH_STACKSHOT, MICROSTACKSHOT_GATHER) | DBG_FUNC_START,
 894  	    mark, telemetry_bytes_since_last_mark, 0,
 895  	    (&telemetry_buffer != current_buffer));
 896  
 897  	TELEMETRY_LOCK();
 898  
 899  	if (current_buffer->buffer == 0) {
 900  		*length = 0;
 901  		goto out;
 902  	}
 903  
 904  	if (*length < current_buffer->size) {
 905  		result = KERN_NO_SPACE;
 906  		goto out;
 907  	}
 908  
 909  	/*
 910  	 * Copy the ring buffer out to userland in order sorted by time: least recent to most recent.
 911  	 * First, we need to search forward from the cursor to find the oldest record in our buffer.
 912  	 */
 913  	oldest_record_offset = current_buffer->current_position;
 914  	do {
 915  		if (((oldest_record_offset + sizeof(uint32_t)) > current_buffer->size) ||
 916  		    ((oldest_record_offset + sizeof(uint32_t)) > current_buffer->end_point)) {
 917  			if (*(uint32_t *)(uintptr_t)(current_buffer->buffer) == 0) {
 918  				/*
 919  				 * There is no magic number at the start of the buffer, which means
 920  				 * it's empty; nothing to see here yet.
 921  				 */
 922  				*length = 0;
 923  				goto out;
 924  			}
 925  			/*
 926  			 * We've looked through the end of the active buffer without finding a valid
 927  			 * record; that means all valid records are in a single chunk, beginning at
 928  			 * the very start of the buffer.
 929  			 */
 930  
 931  			oldest_record_offset = 0;
 932  			assert(*(uint32_t *)(uintptr_t)(current_buffer->buffer) == STACKSHOT_MICRO_SNAPSHOT_MAGIC);
 933  			break;
 934  		}
 935  
 936  		if (*(uint32_t *)(uintptr_t)(current_buffer->buffer + oldest_record_offset) == STACKSHOT_MICRO_SNAPSHOT_MAGIC) {
 937  			break;
 938  		}
 939  
 940  		/*
 941  		 * There are no alignment guarantees for micro-stackshot records, so we must search at each
 942  		 * byte offset.
 943  		 */
 944  		oldest_record_offset++;
 945  	} while (oldest_record_offset != current_buffer->current_position);
 946  
 947  	/*
 948  	 * If needed, copyout in two chunks: from the oldest record to the end of the buffer, and then
 949  	 * from the beginning of the buffer up to the current position.
 950  	 */
 951  	if (oldest_record_offset != 0) {
 952  #if TELEMETRY_DEBUG
 953  		log_telemetry_output(current_buffer->buffer, oldest_record_offset,
 954  		    current_buffer->end_point - oldest_record_offset);
 955  #endif
 956  		if ((result = copyout((void *)(current_buffer->buffer + oldest_record_offset), buffer,
 957  		    current_buffer->end_point - oldest_record_offset)) != 0) {
 958  			*length = 0;
 959  			goto out;
 960  		}
 961  		*length = current_buffer->end_point - oldest_record_offset;
 962  	} else {
 963  		*length = 0;
 964  	}
 965  
 966  #if TELEMETRY_DEBUG
 967  	log_telemetry_output(current_buffer->buffer, 0, current_buffer->current_position);
 968  #endif
 969  	if ((result = copyout((void *)current_buffer->buffer, buffer + *length,
 970  	    current_buffer->current_position)) != 0) {
 971  		*length = 0;
 972  		goto out;
 973  	}
 974  	*length += (uint32_t)current_buffer->current_position;
 975  
 976  out:
 977  
 978  	if (mark && (*length > 0)) {
 979  		telemetry_bytes_since_last_mark = 0;
 980  	}
 981  
 982  	TELEMETRY_UNLOCK();
 983  
 984  	KDBG(MACHDBG_CODE(DBG_MACH_STACKSHOT, MICROSTACKSHOT_GATHER) | DBG_FUNC_END,
 985  	    current_buffer->current_position, *length,
 986  	    current_buffer->end_point, (&telemetry_buffer != current_buffer));
 987  
 988  	return result;
 989  }
 990  
 991  /************************/
 992  /* BOOT PROFILE SUPPORT */
 993  /************************/
 994  /*
 995   * Boot Profiling
 996   *
 997   * The boot-profiling support is a mechanism to sample activity happening on the
 998   * system during boot. This mechanism sets up a periodic timer and on every timer fire,
 999   * captures a full backtrace into the boot profiling buffer. This buffer can be pulled
1000   * out and analyzed from user-space. It is turned on using the following boot-args:
1001   * "bootprofile_buffer_size" specifies the size of the boot profile buffer
1002   * "bootprofile_interval_ms" specifies the interval for the profiling timer
1003   *
1004   * Process Specific Boot Profiling
1005   *
1006   * The boot-arg "bootprofile_proc_name" can be used to specify a certain
1007   * process that needs to be profiled during boot. Setting this boot-arg changes
1008   * the way stackshots are captured. At every timer fire, the code looks at the
1009   * currently running process and takes a stackshot only if the requested process
1010   * is on-core (which makes it unsuitable for MP systems).
1011   *
1012   * Trigger Events
1013   *
1014   * The boot-arg "bootprofile_type=boot" starts the timer during early boot. Using
1015   * "wake" starts the timer at AP wake from suspend-to-RAM.
1016   */
1017  
/* Upper bound that bootprofile_init() applies to the "bootprofile_buffer_size" boot-arg. */
#define BOOTPROFILE_MAX_BUFFER_SIZE (64*1024*1024) /* see also COPYSIZELIMIT_PANIC */

/* Address of the profile buffer; stays 0 until bootprofile_init() allocates it. */
vm_offset_t         bootprofile_buffer = 0;
/* Buffer size in bytes, from the "bootprofile_buffer_size" boot-arg (clamped to the maximum above). */
uint32_t            bootprofile_buffer_size = 0;
/* Byte offset at which the next stackshot is written; also the amount of data gathered so far. */
uint32_t            bootprofile_buffer_current_position = 0;
/* Sampling interval in milliseconds, from the "bootprofile_interval_ms" boot-arg. */
uint32_t            bootprofile_interval_ms = 0;
/* Extra stackshot flags from the "bootprofile_stackshot_flags" boot-arg, ORed into every sample. */
uint64_t            bootprofile_stackshot_flags = 0;
/*
 * Sampling interval converted to absolute-time units. bootprofile_gather()
 * sets it to 0, which stops the timer callback from re-arming itself.
 */
uint64_t            bootprofile_interval_abs = 0;
/* Deadline most recently passed to timer_call_enter_with_leeway(). */
uint64_t            bootprofile_next_deadline = 0;
/* Set to 1 when no "bootprofile_proc_name" boot-arg is given; when 0 only the named process is sampled. */
uint32_t            bootprofile_all_procs = 0;
/* Process name from the "bootprofile_proc_name" boot-arg, compared against the on-core process. */
char                bootprofile_proc_name[17];
/*
 * Timestamp taken just before the most recent successful sample when delta
 * snapshots were requested via boot-arg; 0 until the first such sample.
 */
uint64_t            bootprofile_delta_since_timestamp = 0;
/* Lock group and mutex taken by the timer callback, bootprofile_get() and bootprofile_gather(). */
LCK_GRP_DECLARE(bootprofile_lck_grp, "bootprofile_group");
LCK_MTX_DECLARE(bootprofile_mtx, &bootprofile_lck_grp);


/* Selects when the sampling timer is started; parsed from the "bootprofile_type" boot-arg. */
enum {
	kBootProfileDisabled = 0,       /* boot-arg absent or not recognised */
	kBootProfileStartTimerAtBoot,   /* "boot": armed from bootprofile_init() */
	kBootProfileStartTimerAtWake    /* "wake": armed from bootprofile_wake_from_sleep() */
} bootprofile_type = kBootProfileDisabled;


/* Timer call entry that drives bootprofile_timer_call(). */
static timer_call_data_t        bootprofile_timer_call_entry;

/*
 * Wrappers around bootprofile_mtx. The try-spin variant is the one used by
 * bootprofile_timer_call(), which skips the sample when the lock is not
 * acquired.
 */
#define BOOTPROFILE_LOCK() do { lck_mtx_lock(&bootprofile_mtx); } while(0)
#define BOOTPROFILE_TRY_SPIN_LOCK() lck_mtx_try_lock_spin(&bootprofile_mtx)
#define BOOTPROFILE_UNLOCK() do { lck_mtx_unlock(&bootprofile_mtx); } while(0)

/* Timer callback: takes one sample and re-arms the timer. Defined below. */
static void bootprofile_timer_call(
	timer_call_param_t      param0,
	timer_call_param_t      param1);
1050  
1051  void
1052  bootprofile_init(void)
1053  {
1054  	kern_return_t ret;
1055  	char type[32];
1056  
1057  	if (!PE_parse_boot_argn("bootprofile_buffer_size",
1058  	    &bootprofile_buffer_size, sizeof(bootprofile_buffer_size))) {
1059  		bootprofile_buffer_size = 0;
1060  	}
1061  
1062  	if (bootprofile_buffer_size > BOOTPROFILE_MAX_BUFFER_SIZE) {
1063  		bootprofile_buffer_size = BOOTPROFILE_MAX_BUFFER_SIZE;
1064  	}
1065  
1066  	if (!PE_parse_boot_argn("bootprofile_interval_ms",
1067  	    &bootprofile_interval_ms, sizeof(bootprofile_interval_ms))) {
1068  		bootprofile_interval_ms = 0;
1069  	}
1070  
1071  	if (!PE_parse_boot_argn("bootprofile_stackshot_flags",
1072  	    &bootprofile_stackshot_flags, sizeof(bootprofile_stackshot_flags))) {
1073  		bootprofile_stackshot_flags = 0;
1074  	}
1075  
1076  	if (!PE_parse_boot_argn("bootprofile_proc_name",
1077  	    &bootprofile_proc_name, sizeof(bootprofile_proc_name))) {
1078  		bootprofile_all_procs = 1;
1079  		bootprofile_proc_name[0] = '\0';
1080  	}
1081  
1082  	if (PE_parse_boot_argn("bootprofile_type", type, sizeof(type))) {
1083  		if (0 == strcmp(type, "boot")) {
1084  			bootprofile_type = kBootProfileStartTimerAtBoot;
1085  		} else if (0 == strcmp(type, "wake")) {
1086  			bootprofile_type = kBootProfileStartTimerAtWake;
1087  		} else {
1088  			bootprofile_type = kBootProfileDisabled;
1089  		}
1090  	} else {
1091  		bootprofile_type = kBootProfileDisabled;
1092  	}
1093  
1094  	clock_interval_to_absolutetime_interval(bootprofile_interval_ms, NSEC_PER_MSEC, &bootprofile_interval_abs);
1095  
1096  	/* Both boot args must be set to enable */
1097  	if ((bootprofile_type == kBootProfileDisabled) || (bootprofile_buffer_size == 0) || (bootprofile_interval_abs == 0)) {
1098  		return;
1099  	}
1100  
1101  	ret = kmem_alloc(kernel_map, &bootprofile_buffer, bootprofile_buffer_size, VM_KERN_MEMORY_DIAG);
1102  	if (ret != KERN_SUCCESS) {
1103  		kprintf("Boot profile: Allocation failed: %d\n", ret);
1104  		return;
1105  	}
1106  	bzero((void *) bootprofile_buffer, bootprofile_buffer_size);
1107  
1108  	kprintf("Boot profile: Sampling %s once per %u ms at %s\n",
1109  	    bootprofile_all_procs ? "all procs" : bootprofile_proc_name, bootprofile_interval_ms,
1110  	    bootprofile_type == kBootProfileStartTimerAtBoot ? "boot" : (bootprofile_type == kBootProfileStartTimerAtWake ? "wake" : "unknown"));
1111  
1112  	timer_call_setup(&bootprofile_timer_call_entry,
1113  	    bootprofile_timer_call,
1114  	    NULL);
1115  
1116  	if (bootprofile_type == kBootProfileStartTimerAtBoot) {
1117  		bootprofile_next_deadline = mach_absolute_time() + bootprofile_interval_abs;
1118  		timer_call_enter_with_leeway(&bootprofile_timer_call_entry,
1119  		    NULL,
1120  		    bootprofile_next_deadline,
1121  		    0,
1122  		    TIMER_CALL_SYS_NORMAL,
1123  		    FALSE);
1124  	}
1125  }
1126  
1127  void
1128  bootprofile_wake_from_sleep(void)
1129  {
1130  	if (bootprofile_type == kBootProfileStartTimerAtWake) {
1131  		bootprofile_next_deadline = mach_absolute_time() + bootprofile_interval_abs;
1132  		timer_call_enter_with_leeway(&bootprofile_timer_call_entry,
1133  		    NULL,
1134  		    bootprofile_next_deadline,
1135  		    0,
1136  		    TIMER_CALL_SYS_NORMAL,
1137  		    FALSE);
1138  	}
1139  }
1140  
1141  
1142  static void
1143  bootprofile_timer_call(
1144  	timer_call_param_t      param0 __unused,
1145  	timer_call_param_t      param1 __unused)
1146  {
1147  	unsigned retbytes = 0;
1148  	int pid_to_profile = -1;
1149  
1150  	if (!BOOTPROFILE_TRY_SPIN_LOCK()) {
1151  		goto reprogram;
1152  	}
1153  
1154  	/* Check if process-specific boot profiling is turned on */
1155  	if (!bootprofile_all_procs) {
1156  		/*
1157  		 * Since boot profiling initializes really early in boot, it is
1158  		 * possible that at this point, the task/proc is not initialized.
1159  		 * Nothing to do in that case.
1160  		 */
1161  
1162  		if ((current_task() != NULL) && (current_task()->bsd_info != NULL) &&
1163  		    (0 == strncmp(bootprofile_proc_name, proc_name_address(current_task()->bsd_info), 17))) {
1164  			pid_to_profile = proc_selfpid();
1165  		} else {
1166  			/*
1167  			 * Process-specific boot profiling requested but the on-core process is
1168  			 * something else. Nothing to do here.
1169  			 */
1170  			BOOTPROFILE_UNLOCK();
1171  			goto reprogram;
1172  		}
1173  	}
1174  
1175  	/* initiate a stackshot with whatever portion of the buffer is left */
1176  	if (bootprofile_buffer_current_position < bootprofile_buffer_size) {
1177  		uint64_t flags = STACKSHOT_KCDATA_FORMAT | STACKSHOT_TRYLOCK | STACKSHOT_SAVE_LOADINFO
1178  		    | STACKSHOT_GET_GLOBAL_MEM_STATS;
1179  #if defined(XNU_TARGET_OS_OSX)
1180  		flags |= STACKSHOT_SAVE_KEXT_LOADINFO;
1181  #endif
1182  
1183  
1184  		/* OR on flags specified in boot-args */
1185  		flags |= bootprofile_stackshot_flags;
1186  		if ((flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) && (bootprofile_delta_since_timestamp == 0)) {
1187  			/* Can't take deltas until the first one */
1188  			flags &= ~STACKSHOT_COLLECT_DELTA_SNAPSHOT;
1189  		}
1190  
1191  		uint64_t timestamp = 0;
1192  		if (bootprofile_stackshot_flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) {
1193  			timestamp = mach_absolute_time();
1194  		}
1195  
1196  		kern_return_t r = stack_snapshot_from_kernel(
1197  			pid_to_profile, (void *)(bootprofile_buffer + bootprofile_buffer_current_position),
1198  			bootprofile_buffer_size - bootprofile_buffer_current_position,
1199  			flags, bootprofile_delta_since_timestamp, 0, &retbytes);
1200  
1201  		/*
1202  		 * We call with STACKSHOT_TRYLOCK because the stackshot lock is coarser
1203  		 * than the bootprofile lock.  If someone else has the lock we'll just
1204  		 * try again later.
1205  		 */
1206  
1207  		if (r == KERN_LOCK_OWNED) {
1208  			BOOTPROFILE_UNLOCK();
1209  			goto reprogram;
1210  		}
1211  
1212  		if (bootprofile_stackshot_flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT &&
1213  		    r == KERN_SUCCESS) {
1214  			bootprofile_delta_since_timestamp = timestamp;
1215  		}
1216  
1217  		bootprofile_buffer_current_position += retbytes;
1218  	}
1219  
1220  	BOOTPROFILE_UNLOCK();
1221  
1222  	/* If we didn't get any data or have run out of buffer space, stop profiling */
1223  	if ((retbytes == 0) || (bootprofile_buffer_current_position == bootprofile_buffer_size)) {
1224  		return;
1225  	}
1226  
1227  
1228  reprogram:
1229  	/* If the user gathered the buffer, no need to keep profiling */
1230  	if (bootprofile_interval_abs == 0) {
1231  		return;
1232  	}
1233  
1234  	clock_deadline_for_periodic_event(bootprofile_interval_abs,
1235  	    mach_absolute_time(),
1236  	    &bootprofile_next_deadline);
1237  	timer_call_enter_with_leeway(&bootprofile_timer_call_entry,
1238  	    NULL,
1239  	    bootprofile_next_deadline,
1240  	    0,
1241  	    TIMER_CALL_SYS_NORMAL,
1242  	    FALSE);
1243  }
1244  
1245  void
1246  bootprofile_get(void **buffer, uint32_t *length)
1247  {
1248  	BOOTPROFILE_LOCK();
1249  	*buffer = (void*) bootprofile_buffer;
1250  	*length = bootprofile_buffer_current_position;
1251  	BOOTPROFILE_UNLOCK();
1252  }
1253  
1254  int
1255  bootprofile_gather(user_addr_t buffer, uint32_t *length)
1256  {
1257  	int result = 0;
1258  
1259  	BOOTPROFILE_LOCK();
1260  
1261  	if (bootprofile_buffer == 0) {
1262  		*length = 0;
1263  		goto out;
1264  	}
1265  
1266  	if (*length < bootprofile_buffer_current_position) {
1267  		result = KERN_NO_SPACE;
1268  		goto out;
1269  	}
1270  
1271  	if ((result = copyout((void *)bootprofile_buffer, buffer,
1272  	    bootprofile_buffer_current_position)) != 0) {
1273  		*length = 0;
1274  		goto out;
1275  	}
1276  	*length = bootprofile_buffer_current_position;
1277  
1278  	/* cancel future timers */
1279  	bootprofile_interval_abs = 0;
1280  
1281  out:
1282  
1283  	BOOTPROFILE_UNLOCK();
1284  
1285  	return result;
1286  }