Cradicle Explorer

kern_support.c
   1  /*
   2   * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
   3   *
   4   * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5   *
   6   * This file contains Original Code and/or Modifications of Original Code
   7   * as defined in and that are subject to the Apple Public Source License
   8   * Version 2.0 (the 'License'). You may not use this file except in
   9   * compliance with the License. The rights granted to you under the License
  10   * may not be used to create, or enable the creation or redistribution of,
  11   * unlawful or unlicensed copies of an Apple operating system, or to
  12   * circumvent, violate, or enable the circumvention or violation of, any
  13   * terms of an Apple operating system software license agreement.
  14   *
  15   * Please obtain a copy of the License at
  16   * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17   *
  18   * The Original Code and all software distributed under the License are
  19   * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20   * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21   * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22   * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23   * Please see the License for the specific language governing rights and
  24   * limitations under the License.
  25   *
  26   * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27   */
  28  /* Copyright (c) 1995-2005 Apple Computer, Inc. All Rights Reserved */
  29  /*
  30   *	pthread_synch.c
  31   */
  32  
  33  #pragma mark - Front Matter
  34  
  35  #define _PTHREAD_CONDATTR_T
  36  #define _PTHREAD_COND_T
  37  #define _PTHREAD_MUTEXATTR_T
  38  #define _PTHREAD_MUTEX_T
  39  #define _PTHREAD_RWLOCKATTR_T
  40  #define _PTHREAD_RWLOCK_T
  41  
  42  #undef pthread_mutexattr_t
  43  #undef pthread_mutex_t
  44  #undef pthread_condattr_t
  45  #undef pthread_cond_t
  46  #undef pthread_rwlockattr_t
  47  #undef pthread_rwlock_t
  48  
  49  #include <sys/cdefs.h>
  50  #include <os/log.h>
  51  
  52  // <rdar://problem/26158937> panic() should be marked noreturn
  53  extern void panic(const char *string, ...) __printflike(1,2) __dead2;
  54  
  55  #include <sys/param.h>
  56  #include <sys/queue.h>
  57  #include <sys/resourcevar.h>
  58  //#include <sys/proc_internal.h>
  59  #include <sys/kauth.h>
  60  #include <sys/systm.h>
  61  #include <sys/timeb.h>
  62  #include <sys/times.h>
  63  #include <sys/acct.h>
  64  #include <sys/kernel.h>
  65  #include <sys/wait.h>
  66  #include <sys/signalvar.h>
  67  #include <sys/sysctl.h>
  68  #include <sys/syslog.h>
  69  #include <sys/stat.h>
  70  #include <sys/lock.h>
  71  #include <sys/kdebug.h>
  72  //#include <sys/sysproto.h>
  73  #include <sys/vm.h>
  74  #include <sys/user.h>		/* for coredump */
  75  #include <sys/proc_info.h>	/* for fill_procworkqueue */
  76  
  77  #include <mach/mach_port.h>
  78  #include <mach/mach_types.h>
  79  #include <mach/semaphore.h>
  80  #include <mach/sync_policy.h>
  81  #include <mach/task.h>
  82  #include <mach/vm_prot.h>
  83  #include <kern/kern_types.h>
  84  #include <kern/task.h>
  85  #include <kern/clock.h>
  86  #include <mach/kern_return.h>
  87  #include <kern/thread.h>
  88  #include <kern/zalloc.h>
  89  #include <kern/sched_prim.h>	/* for thread_exception_return */
  90  #include <kern/processor.h>
  91  #include <kern/assert.h>
  92  #include <mach/mach_vm.h>
  93  #include <mach/mach_param.h>
  94  #include <mach/thread_status.h>
  95  #include <mach/thread_policy.h>
  96  #include <mach/message.h>
  97  #include <mach/port.h>
  98  //#include <vm/vm_protos.h>
  99  #include <vm/vm_fault.h>
 100  #include <vm/vm_map.h>
 101  #include <mach/thread_act.h> /* for thread_resume */
 102  #include <machine/machine_routines.h>
 103  #include <mach/shared_region.h>
 104  
 105  #include <libkern/OSAtomic.h>
 106  #include <libkern/libkern.h>
 107  
 108  #include "kern_internal.h"
 109  
 110  #ifndef WQ_SETUP_EXIT_THREAD
 111  #define WQ_SETUP_EXIT_THREAD    8
 112  #endif
 113  
 114  // XXX: Ditto for thread tags from kern/thread.h
 115  #define	THREAD_TAG_MAINTHREAD 0x1
 116  #define	THREAD_TAG_PTHREAD 0x10
 117  #define	THREAD_TAG_WORKQUEUE 0x20
 118  
 119  lck_grp_attr_t   *pthread_lck_grp_attr;
 120  lck_grp_t    *pthread_lck_grp;
 121  lck_attr_t   *pthread_lck_attr;
 122  
 123  #define C_32_STK_ALIGN          16
 124  #define C_64_STK_ALIGN          16
 125  
 126  // WORKQ use the largest alignment any platform needs
 127  #define C_WORKQ_STK_ALIGN       16
 128  
 129  #if defined(__arm64__)
 130  /* Pull the pthread_t into the same page as the top of the stack so we dirty one less page.
 131   * <rdar://problem/19941744> The _pthread struct at the top of the stack shouldn't be page-aligned
 132   */
 133  #define PTHREAD_T_OFFSET (12*1024)
 134  #else
 135  #define PTHREAD_T_OFFSET 0
 136  #endif
 137  
 138  /*
 139   * Flags filed passed to bsdthread_create and back in pthread_start
 140  31  <---------------------------------> 0
 141  _________________________________________
 142  | flags(8) | policy(8) | importance(16) |
 143  -----------------------------------------
 144  */
 145  
 146  #define PTHREAD_START_CUSTOM		0x01000000 // <rdar://problem/34501401>
 147  #define PTHREAD_START_SETSCHED		0x02000000
 148  // was PTHREAD_START_DETACHED		0x04000000
 149  #define PTHREAD_START_QOSCLASS		0x08000000
 150  #define PTHREAD_START_TSD_BASE_SET	0x10000000
 151  #define PTHREAD_START_SUSPENDED		0x20000000
 152  #define PTHREAD_START_QOSCLASS_MASK	0x00ffffff
 153  #define PTHREAD_START_POLICY_BITSHIFT 16
 154  #define PTHREAD_START_POLICY_MASK 0xff
 155  #define PTHREAD_START_IMPORTANCE_MASK 0xffff
 156  
 157  #define SCHED_OTHER      POLICY_TIMESHARE
 158  #define SCHED_FIFO       POLICY_FIFO
 159  #define SCHED_RR         POLICY_RR
 160  
 161  #define BASEPRI_DEFAULT 31
 162  
 163  uint32_t pthread_debug_tracing = 1;
 164  
 165  static uint32_t pthread_mutex_default_policy;
 166  
 167  SYSCTL_INT(_kern, OID_AUTO, pthread_mutex_default_policy, CTLFLAG_RW | CTLFLAG_LOCKED,
 168  	   &pthread_mutex_default_policy, 0, "");
 169  
 170  #pragma mark - Process/Thread Setup/Teardown syscalls
 171  
 172  static mach_vm_offset_t
 173  stack_addr_hint(proc_t p, vm_map_t vmap)
 174  {
 175  	mach_vm_offset_t stackaddr;
 176  	mach_vm_offset_t aslr_offset;
 177  	bool proc64bit = proc_is64bit(p);
 178  	bool proc64bit_data = proc_is64bit_data(p);
 179  
 180  	// We can't safely take random values % something unless its a power-of-two
 181  	_Static_assert(powerof2(PTH_DEFAULT_STACKSIZE), "PTH_DEFAULT_STACKSIZE is a power-of-two");
 182  
 183  #if defined(__i386__) || defined(__x86_64__)
 184  	(void)proc64bit_data;
 185  	if (proc64bit) {
 186  		// Matches vm_map_get_max_aslr_slide_pages's image shift in xnu
 187  		aslr_offset = random() % (1 << 28); // about 512 stacks
 188  	} else {
 189  		// Actually bigger than the image shift, we've got ~256MB to work with
 190  		aslr_offset = random() % (16 * PTH_DEFAULT_STACKSIZE);
 191  	}
 192  	aslr_offset = vm_map_trunc_page_mask(aslr_offset, vm_map_page_mask(vmap));
 193  	if (proc64bit) {
 194  		// Above nanomalloc range (see NANOZONE_SIGNATURE)
 195  		stackaddr = 0x700000000000 + aslr_offset;
 196  	} else {
 197  		stackaddr = SHARED_REGION_BASE_I386 + SHARED_REGION_SIZE_I386 + aslr_offset;
 198  	}
 199  #elif defined(__arm__) || defined(__arm64__)
 200  	user_addr_t main_thread_stack_top = 0;
 201  	if (pthread_kern->proc_get_user_stack) {
 202  		main_thread_stack_top = pthread_kern->proc_get_user_stack(p);
 203  	}
 204  	if (proc64bit && main_thread_stack_top) {
 205  		// The main thread stack position is randomly slid by xnu (c.f.
 206  		// load_main() in mach_loader.c), so basing pthread stack allocations
 207  		// where the main thread stack ends is already ASLRd and doing so
 208  		// avoids creating a gap in the process address space that may cause
 209  		// extra PTE memory usage. rdar://problem/33328206
 210  		stackaddr = vm_map_trunc_page_mask((vm_map_offset_t)main_thread_stack_top,
 211  				vm_map_page_mask(vmap));
 212  	} else {
 213  		// vm_map_get_max_aslr_slide_pages ensures 1MB of slide, we do better
 214  		aslr_offset = random() % ((proc64bit ? 4 : 2) * PTH_DEFAULT_STACKSIZE);
 215  		aslr_offset = vm_map_trunc_page_mask((vm_map_offset_t)aslr_offset,
 216  				vm_map_page_mask(vmap));
 217  		if (proc64bit) {
 218  			// 64 stacks below shared region
 219  			stackaddr = SHARED_REGION_BASE_ARM64 - 64 * PTH_DEFAULT_STACKSIZE - aslr_offset;
 220  		} else {
 221  			// If you try to slide down from this point, you risk ending up in memory consumed by malloc
 222  			if (proc64bit_data) {
 223  				stackaddr = SHARED_REGION_BASE_ARM64_32;
 224  			} else {
 225  				stackaddr = SHARED_REGION_BASE_ARM;
 226  			}
 227  
 228  			stackaddr -= 32 * PTH_DEFAULT_STACKSIZE + aslr_offset;
 229  		}
 230  	}
 231  #else
 232  #error Need to define a stack address hint for this architecture
 233  #endif
 234  	return stackaddr;
 235  }
 236  
 237  static bool
 238  _pthread_priority_to_policy(pthread_priority_t priority,
 239  		thread_qos_policy_data_t *data)
 240  {
 241  	data->qos_tier = _pthread_priority_thread_qos(priority);
 242  	data->tier_importance = _pthread_priority_relpri(priority);
 243  	if (data->qos_tier == THREAD_QOS_UNSPECIFIED || data->tier_importance > 0 ||
 244  			data->tier_importance < THREAD_QOS_MIN_TIER_IMPORTANCE) {
 245  		return false;
 246  	}
 247  	return true;
 248  }
 249  
 250  /**
 251   * bsdthread_create system call.  Used by pthread_create.
 252   */
 253  int
 254  _bsdthread_create(struct proc *p,
 255  		__unused user_addr_t user_func, __unused user_addr_t user_funcarg,
 256  		user_addr_t user_stack, user_addr_t user_pthread, uint32_t flags,
 257  		user_addr_t *retval)
 258  {
 259  	kern_return_t kret;
 260  	void * sright;
 261  	int error = 0;
 262  	mach_vm_offset_t th_tsd_base;
 263  	mach_port_name_t th_thport;
 264  	thread_t th;
 265  	task_t ctask = current_task();
 266  	unsigned int policy, importance;
 267  	uint32_t tsd_offset;
 268  	bool start_suspended = (flags & PTHREAD_START_SUSPENDED);
 269  
 270  	if (pthread_kern->proc_get_register(p) == 0) {
 271  		return EINVAL;
 272  	}
 273  
 274  	PTHREAD_TRACE(pthread_thread_create | DBG_FUNC_START, flags, 0, 0, 0);
 275  
 276  	kret = pthread_kern->thread_create(ctask, &th);
 277  	if (kret != KERN_SUCCESS)
 278  		return(ENOMEM);
 279  	thread_reference(th);
 280  
 281  	pthread_kern->thread_set_tag(th, THREAD_TAG_PTHREAD);
 282  
 283  	sright = (void *)pthread_kern->convert_thread_to_port(th);
 284  	th_thport = pthread_kern->ipc_port_copyout_send(sright, pthread_kern->task_get_ipcspace(ctask));
 285  	if (!MACH_PORT_VALID(th_thport)) {
 286  		error = EMFILE; // userland will convert this into a crash
 287  		goto out;
 288  	}
 289  
 290  	if ((flags & PTHREAD_START_CUSTOM) == 0) {
 291  		error = EINVAL;
 292  		goto out;
 293  	}
 294  
 295  	PTHREAD_TRACE(pthread_thread_create|DBG_FUNC_NONE, 0, 0, 0, 3);
 296  
 297  	tsd_offset = pthread_kern->proc_get_pthread_tsd_offset(p);
 298  	if (tsd_offset) {
 299  		th_tsd_base = user_pthread + tsd_offset;
 300  		kret = pthread_kern->thread_set_tsd_base(th, th_tsd_base);
 301  		if (kret == KERN_SUCCESS) {
 302  			flags |= PTHREAD_START_TSD_BASE_SET;
 303  		}
 304  	}
 305  	/*
 306  	 * Strip PTHREAD_START_SUSPENDED so that libpthread can observe the kernel
 307  	 * supports this flag (after the fact).
 308  	 */
 309  	flags &= ~PTHREAD_START_SUSPENDED;
 310  
 311  	/*
 312  	 * Set up registers & function call.
 313  	 */
 314  #if defined(__i386__) || defined(__x86_64__)
 315  	if (proc_is64bit_data(p)) {
 316  		x86_thread_state64_t state = {
 317  			.rip = (uint64_t)pthread_kern->proc_get_threadstart(p),
 318  			.rdi = (uint64_t)user_pthread,
 319  			.rsi = (uint64_t)th_thport,
 320  			.rdx = (uint64_t)user_func,    /* golang wants this */
 321  			.rcx = (uint64_t)user_funcarg, /* golang wants this */
 322  			.r8  = (uint64_t)user_stack,   /* golang wants this */
 323  			.r9  = (uint64_t)flags,
 324  
 325  			.rsp = (uint64_t)user_stack,
 326  		};
 327  
 328  		(void)pthread_kern->thread_set_wq_state64(th, (thread_state_t)&state);
 329  	} else {
 330  		x86_thread_state32_t state = {
 331  			.eip = (uint32_t)pthread_kern->proc_get_threadstart(p),
 332  			.eax = (uint32_t)user_pthread,
 333  			.ebx = (uint32_t)th_thport,
 334  			.ecx = (uint32_t)user_func,    /* golang wants this */
 335  			.edx = (uint32_t)user_funcarg, /* golang wants this */
 336  			.edi = (uint32_t)user_stack,   /* golang wants this */
 337  			.esi = (uint32_t)flags,
 338  
 339  			.esp = (uint32_t)user_stack,
 340  		};
 341  
 342  		(void)pthread_kern->thread_set_wq_state32(th, (thread_state_t)&state);
 343  	}
 344  #elif defined(__arm__) || defined(__arm64__)
 345  	if (proc_is64bit_data(p)) {
 346  #ifdef __arm64__
 347  		arm_thread_state64_t state = {
 348  			.pc   = (uint64_t)pthread_kern->proc_get_threadstart(p),
 349  			.x[0] = (uint64_t)user_pthread,
 350  			.x[1] = (uint64_t)th_thport,
 351  			.x[2] = (uint64_t)user_func,    /* golang wants this */
 352  			.x[3] = (uint64_t)user_funcarg, /* golang wants this */
 353  			.x[4] = (uint64_t)user_stack,   /* golang wants this */
 354  			.x[5] = (uint64_t)flags,
 355  
 356  			.sp   = (uint64_t)user_stack,
 357  		};
 358  
 359  		(void)pthread_kern->thread_set_wq_state64(th, (thread_state_t)&state);
 360  #else
 361  		panic("Shouldn't have a 64-bit thread on a 32-bit kernel...");
 362  #endif // defined(__arm64__)
 363  	} else {
 364  		arm_thread_state_t state = {
 365  			.pc   = (uint32_t)pthread_kern->proc_get_threadstart(p),
 366  			.r[0] = (uint32_t)user_pthread,
 367  			.r[1] = (uint32_t)th_thport,
 368  			.r[2] = (uint32_t)user_func,    /* golang wants this */
 369  			.r[3] = (uint32_t)user_funcarg, /* golang wants this */
 370  			.r[4] = (uint32_t)user_stack,   /* golang wants this */
 371  			.r[5] = (uint32_t)flags,
 372  
 373  			.sp   = (uint32_t)user_stack,
 374  		};
 375  
 376  		(void)pthread_kern->thread_set_wq_state32(th, (thread_state_t)&state);
 377  	}
 378  #else
 379  #error bsdthread_create  not defined for this architecture
 380  #endif
 381  
 382  	if (flags & PTHREAD_START_SETSCHED) {
 383  		/* Set scheduling parameters if needed */
 384  		thread_extended_policy_data_t    extinfo;
 385  		thread_precedence_policy_data_t   precedinfo;
 386  
 387  		importance = (flags & PTHREAD_START_IMPORTANCE_MASK);
 388  		policy = (flags >> PTHREAD_START_POLICY_BITSHIFT) & PTHREAD_START_POLICY_MASK;
 389  
 390  		if (policy == SCHED_OTHER) {
 391  			extinfo.timeshare = 1;
 392  		} else {
 393  			extinfo.timeshare = 0;
 394  		}
 395  
 396  		thread_policy_set(th, THREAD_EXTENDED_POLICY, (thread_policy_t)&extinfo, THREAD_EXTENDED_POLICY_COUNT);
 397  
 398  		precedinfo.importance = (importance - BASEPRI_DEFAULT);
 399  		thread_policy_set(th, THREAD_PRECEDENCE_POLICY, (thread_policy_t)&precedinfo, THREAD_PRECEDENCE_POLICY_COUNT);
 400  	} else if (flags & PTHREAD_START_QOSCLASS) {
 401  		/* Set thread QoS class if requested. */
 402  		thread_qos_policy_data_t qos;
 403  
 404  		if (!_pthread_priority_to_policy(flags & PTHREAD_START_QOSCLASS_MASK, &qos)) {
 405  			error = EINVAL;
 406  			goto out;
 407  		}
 408  		pthread_kern->thread_policy_set_internal(th, THREAD_QOS_POLICY,
 409  				(thread_policy_t)&qos, THREAD_QOS_POLICY_COUNT);
 410  	}
 411  
 412  	if (pthread_kern->proc_get_mach_thread_self_tsd_offset) {
 413  		uint64_t mach_thread_self_offset =
 414  				pthread_kern->proc_get_mach_thread_self_tsd_offset(p);
 415  		if (mach_thread_self_offset && tsd_offset) {
 416  			bool proc64bit = proc_is64bit(p);
 417  			if (proc64bit) {
 418  				uint64_t th_thport_tsd = (uint64_t)th_thport;
 419  				error = copyout(&th_thport_tsd, user_pthread + tsd_offset +
 420  						mach_thread_self_offset, sizeof(th_thport_tsd));
 421  			} else {
 422  				uint32_t th_thport_tsd = (uint32_t)th_thport;
 423  				error = copyout(&th_thport_tsd, user_pthread + tsd_offset +
 424  						mach_thread_self_offset, sizeof(th_thport_tsd));
 425  			}
 426  			if (error) {
 427  				goto out;
 428  			}
 429  		}
 430  	}
 431  
 432  	if (!start_suspended) {
 433  		kret = pthread_kern->thread_resume(th);
 434  		if (kret != KERN_SUCCESS) {
 435  			error = EINVAL;
 436  			goto out;
 437  		}
 438  	}
 439  	thread_deallocate(th);	/* drop the creator reference */
 440  
 441  	PTHREAD_TRACE(pthread_thread_create|DBG_FUNC_END, error, user_pthread, 0, 0);
 442  
 443  	*retval = user_pthread;
 444  	return(0);
 445  
 446  out:
 447  	(void)pthread_kern->mach_port_deallocate(pthread_kern->task_get_ipcspace(ctask), th_thport);
 448  	if (pthread_kern->thread_will_park_or_terminate) {
 449  		pthread_kern->thread_will_park_or_terminate(th);
 450  	}
 451  	(void)thread_terminate(th);
 452  	(void)thread_deallocate(th);
 453  	return(error);
 454  }
 455  
 456  /**
 457   * bsdthread_terminate system call.  Used by pthread_terminate
 458   */
 459  int
 460  _bsdthread_terminate(__unused struct proc *p,
 461  		     user_addr_t stackaddr,
 462  		     size_t size,
 463  		     uint32_t kthport,
 464  		     uint32_t sem,
 465  		     __unused int32_t *retval)
 466  {
 467  	mach_vm_offset_t freeaddr;
 468  	mach_vm_size_t freesize;
 469  	kern_return_t kret;
 470  	thread_t th = current_thread();
 471  
 472  	freeaddr = (mach_vm_offset_t)stackaddr;
 473  	freesize = size;
 474  
 475  	PTHREAD_TRACE(pthread_thread_terminate|DBG_FUNC_START, freeaddr, freesize, kthport, 0xff);
 476  
 477  	if ((freesize != (mach_vm_size_t)0) && (freeaddr != (mach_vm_offset_t)0)) {
 478  		if (pthread_kern->thread_get_tag(th) & THREAD_TAG_MAINTHREAD){
 479  			vm_map_t user_map = pthread_kern->current_map();
 480  			freesize = vm_map_trunc_page_mask((vm_map_offset_t)freesize - 1, vm_map_page_mask(user_map));
 481  			kret = mach_vm_behavior_set(user_map, freeaddr, freesize, VM_BEHAVIOR_REUSABLE);
 482  #if MACH_ASSERT
 483  			if (kret != KERN_SUCCESS && kret != KERN_INVALID_ADDRESS) {
 484  				os_log_error(OS_LOG_DEFAULT, "unable to make thread stack reusable (kr: %d)", kret);
 485  			}
 486  #endif
 487  			kret = kret ? kret : mach_vm_protect(user_map, freeaddr, freesize, FALSE, VM_PROT_NONE);
 488  			assert(kret == KERN_SUCCESS || kret == KERN_INVALID_ADDRESS);
 489  		} else {
 490  			kret = mach_vm_deallocate(pthread_kern->current_map(), freeaddr, freesize);
 491  			if (kret != KERN_SUCCESS) {
 492  				PTHREAD_TRACE(pthread_thread_terminate|DBG_FUNC_END, kret, 0, 0, 0);
 493  			}
 494  		}
 495  	}
 496  
 497  	if (pthread_kern->thread_will_park_or_terminate) {
 498  		pthread_kern->thread_will_park_or_terminate(th);
 499  	}
 500  	(void)thread_terminate(th);
 501  	if (sem != MACH_PORT_NULL) {
 502  		kret = pthread_kern->semaphore_signal_internal_trap(sem);
 503  		if (kret != KERN_SUCCESS) {
 504  			PTHREAD_TRACE(pthread_thread_terminate|DBG_FUNC_END, kret, 0, 0, 0);
 505  		}
 506  	}
 507  
 508  	if (kthport != MACH_PORT_NULL) {
 509  		pthread_kern->mach_port_deallocate(pthread_kern->task_get_ipcspace(current_task()), kthport);
 510  	}
 511  
 512  	PTHREAD_TRACE(pthread_thread_terminate|DBG_FUNC_END, 0, 0, 0, 0);
 513  
 514  	pthread_kern->thread_exception_return();
 515  	__builtin_unreachable();
 516  }
 517  
 518  /**
 519   * bsdthread_register system call.  Performs per-process setup.  Responsible for
 520   * returning capabilitiy bits to userspace and receiving userspace function addresses.
 521   */
 522  int
 523  _bsdthread_register(struct proc *p,
 524  		    user_addr_t threadstart,
 525  		    user_addr_t wqthread,
 526  		    int pthsize,
 527  		    user_addr_t pthread_init_data,
 528  		    user_addr_t pthread_init_data_size,
 529  		    uint64_t dispatchqueue_offset,
 530  		    int32_t *retval)
 531  {
 532  	struct _pthread_registration_data data = {};
 533  	uint32_t max_tsd_offset;
 534  	kern_return_t kr;
 535  	size_t pthread_init_sz = 0;
 536  
 537  	/* syscall randomizer test can pass bogus values */
 538  	if (pthsize < 0 || pthsize > MAX_PTHREAD_SIZE) {
 539  		return(EINVAL);
 540  	}
 541  	/*
 542  	 * if we have pthread_init_data, then we use that and target_concptr
 543  	 * (which is an offset) get data.
 544  	 */
 545  	if (pthread_init_data != 0) {
 546  		if (pthread_init_data_size < sizeof(data.version)) {
 547  			return EINVAL;
 548  		}
 549  		pthread_init_sz = MIN(sizeof(data), (size_t)pthread_init_data_size);
 550  		int ret = copyin(pthread_init_data, &data, pthread_init_sz);
 551  		if (ret) {
 552  			return ret;
 553  		}
 554  		if (data.version != (size_t)pthread_init_data_size) {
 555  			return EINVAL;
 556  		}
 557  	} else {
 558  		data.dispatch_queue_offset = dispatchqueue_offset;
 559  	}
 560  
 561  	/* We have to do this before proc_get_register so that it resets after fork */
 562  	mach_vm_offset_t stackaddr = stack_addr_hint(p, pthread_kern->current_map());
 563  	pthread_kern->proc_set_stack_addr_hint(p, (user_addr_t)stackaddr);
 564  
 565  	/* prevent multiple registrations */
 566  	if (pthread_kern->proc_get_register(p) != 0) {
 567  		return(EINVAL);
 568  	}
 569  
 570  	pthread_kern->proc_set_threadstart(p, threadstart);
 571  	pthread_kern->proc_set_wqthread(p, wqthread);
 572  	pthread_kern->proc_set_pthsize(p, pthsize);
 573  	pthread_kern->proc_set_register(p);
 574  
 575  	uint32_t tsd_slot_sz = proc_is64bit(p) ? sizeof(uint64_t) : sizeof(uint32_t);
 576  	if ((uint32_t)pthsize >= tsd_slot_sz &&
 577  			data.tsd_offset <= (uint32_t)(pthsize - tsd_slot_sz)) {
 578  		max_tsd_offset = ((uint32_t)pthsize - data.tsd_offset - tsd_slot_sz);
 579  	} else {
 580  		data.tsd_offset = 0;
 581  		max_tsd_offset = 0;
 582  	}
 583  	pthread_kern->proc_set_pthread_tsd_offset(p, data.tsd_offset);
 584  
 585  	if (data.dispatch_queue_offset > max_tsd_offset) {
 586  		data.dispatch_queue_offset = 0;
 587  	}
 588  	pthread_kern->proc_set_dispatchqueue_offset(p, data.dispatch_queue_offset);
 589  
 590  	if (pthread_kern->proc_set_return_to_kernel_offset) {
 591  		if (data.return_to_kernel_offset > max_tsd_offset) {
 592  			data.return_to_kernel_offset = 0;
 593  		}
 594  		pthread_kern->proc_set_return_to_kernel_offset(p,
 595  				data.return_to_kernel_offset);
 596  	}
 597  
 598  	if (pthread_kern->proc_set_mach_thread_self_tsd_offset) {
 599  		if (data.mach_thread_self_offset > max_tsd_offset) {
 600  			data.mach_thread_self_offset = 0;
 601  		}
 602  		pthread_kern->proc_set_mach_thread_self_tsd_offset(p,
 603  				data.mach_thread_self_offset);
 604  	}
 605  
 606  	if (pthread_init_data != 0) {
 607  		/* Outgoing data that userspace expects as a reply */
 608  		data.version = sizeof(struct _pthread_registration_data);
 609  		data.main_qos = _pthread_unspecified_priority();
 610  
 611  		if (pthread_kern->qos_main_thread_active()) {
 612  			mach_msg_type_number_t nqos = THREAD_QOS_POLICY_COUNT;
 613  			thread_qos_policy_data_t qos;
 614  			boolean_t gd = FALSE;
 615  
 616  			kr = pthread_kern->thread_policy_get(current_thread(),
 617  					THREAD_QOS_POLICY, (thread_policy_t)&qos, &nqos, &gd);
 618  			if (kr != KERN_SUCCESS || qos.qos_tier == THREAD_QOS_UNSPECIFIED) {
 619  				/*
 620  				 * Unspecified threads means the kernel wants us
 621  				 * to impose legacy upon the thread.
 622  				 */
 623  				qos.qos_tier = THREAD_QOS_LEGACY;
 624  				qos.tier_importance = 0;
 625  
 626  				kr = pthread_kern->thread_policy_set_internal(current_thread(),
 627  						THREAD_QOS_POLICY, (thread_policy_t)&qos,
 628  						THREAD_QOS_POLICY_COUNT);
 629  			}
 630  
 631  			if (kr == KERN_SUCCESS) {
 632  				data.main_qos = _pthread_priority_make_from_thread_qos(
 633  						qos.qos_tier, 0, 0);
 634  			}
 635  		}
 636  
 637  		data.stack_addr_hint = stackaddr;
 638  		data.mutex_default_policy = pthread_mutex_default_policy;
 639  
 640  		kr = copyout(&data, pthread_init_data, pthread_init_sz);
 641  		if (kr != KERN_SUCCESS) {
 642  			return EINVAL;
 643  		}
 644  	}
 645  
 646  	/* return the supported feature set as the return value. */
 647  	*retval = PTHREAD_FEATURE_SUPPORTED;
 648  
 649  	return(0);
 650  }
 651  
 652  
 653  #pragma mark - Workqueue Thread Support
 654  
 655  static mach_vm_size_t
 656  workq_thread_allocsize(proc_t p, vm_map_t wq_map,
 657  		mach_vm_size_t *guardsize_out)
 658  {
 659  	mach_vm_size_t guardsize = vm_map_page_size(wq_map);
 660  	mach_vm_size_t pthread_size = vm_map_round_page_mask(
 661  			pthread_kern->proc_get_pthsize(p) + PTHREAD_T_OFFSET,
 662  			vm_map_page_mask(wq_map));
 663  	if (guardsize_out) *guardsize_out = guardsize;
 664  	return guardsize + PTH_DEFAULT_STACKSIZE + pthread_size;
 665  }
 666  
 667  int
 668  workq_create_threadstack(proc_t p, vm_map_t vmap, mach_vm_offset_t *out_addr)
 669  {
 670  	mach_vm_offset_t stackaddr = pthread_kern->proc_get_stack_addr_hint(p);
 671  	mach_vm_size_t guardsize, th_allocsize;
 672  	kern_return_t kret;
 673  
 674  	th_allocsize = workq_thread_allocsize(p, vmap, &guardsize);
 675  	kret = mach_vm_map(vmap, &stackaddr, th_allocsize, page_size - 1,
 676  			VM_MAKE_TAG(VM_MEMORY_STACK) | VM_FLAGS_ANYWHERE, NULL, 0, FALSE,
 677  			VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
 678  
 679  	if (kret != KERN_SUCCESS) {
 680  		kret = mach_vm_allocate(vmap, &stackaddr, th_allocsize,
 681  				VM_MAKE_TAG(VM_MEMORY_STACK) | VM_FLAGS_ANYWHERE);
 682  	}
 683  
 684  	if (kret != KERN_SUCCESS) {
 685  		goto fail;
 686  	}
 687  
 688  	/*
 689  	 * The guard page is at the lowest address
 690  	 * The stack base is the highest address
 691  	 */
 692  	kret = mach_vm_protect(vmap, stackaddr, guardsize, FALSE, VM_PROT_NONE);
 693  	if (kret != KERN_SUCCESS) {
 694  		goto fail_vm_deallocate;
 695  	}
 696  
 697  	if (out_addr) {
 698  		*out_addr = stackaddr;
 699  	}
 700  	return 0;
 701  
 702  fail_vm_deallocate:
 703  	(void)mach_vm_deallocate(vmap, stackaddr, th_allocsize);
 704  fail:
 705  	return kret;
 706  }
 707  
 708  int
 709  workq_destroy_threadstack(proc_t p, vm_map_t vmap, mach_vm_offset_t stackaddr)
 710  {
 711  	return mach_vm_deallocate(vmap, stackaddr,
 712  			workq_thread_allocsize(p, vmap, NULL));
 713  }
 714  
 715  void
 716  workq_markfree_threadstack(proc_t OS_UNUSED p, thread_t OS_UNUSED th,
 717  		vm_map_t vmap, user_addr_t stackaddr)
 718  {
 719  	// Keep this in sync with workq_setup_thread()
 720  	const vm_size_t       guardsize = vm_map_page_size(vmap);
 721  	const user_addr_t     freeaddr = (user_addr_t)stackaddr + guardsize;
 722  	const vm_map_offset_t freesize = vm_map_trunc_page_mask(
 723  			(PTH_DEFAULT_STACKSIZE + guardsize + PTHREAD_T_OFFSET) - 1,
 724  			vm_map_page_mask(vmap)) - guardsize;
 725  
 726  	__assert_only kern_return_t kr = mach_vm_behavior_set(vmap, freeaddr,
 727  			freesize, VM_BEHAVIOR_REUSABLE);
 728  #if MACH_ASSERT
 729  	if (kr != KERN_SUCCESS && kr != KERN_INVALID_ADDRESS) {
 730  		os_log_error(OS_LOG_DEFAULT, "unable to make thread stack reusable (kr: %d)", kr);
 731  	}
 732  #endif
 733  }
 734  
 735  struct workq_thread_addrs {
 736  	user_addr_t self;
 737  	user_addr_t stack_bottom;
 738  	user_addr_t stack_top;
 739  };
 740  
 741  static inline void
 742  workq_thread_set_top_addr(struct workq_thread_addrs *th_addrs, user_addr_t addr)
 743  {
 744  	th_addrs->stack_top = (addr & -C_WORKQ_STK_ALIGN);
 745  }
 746  
 747  static void
 748  workq_thread_get_addrs(vm_map_t map, user_addr_t stackaddr,
 749  					   struct workq_thread_addrs *th_addrs)
 750  {
 751  	const vm_size_t guardsize = vm_map_page_size(map);
 752  
 753  	th_addrs->self = (user_addr_t)(stackaddr + PTH_DEFAULT_STACKSIZE +
 754  			guardsize + PTHREAD_T_OFFSET);
 755  	workq_thread_set_top_addr(th_addrs, th_addrs->self);
 756  	th_addrs->stack_bottom = (user_addr_t)(stackaddr + guardsize);
 757  }
 758  
 759  static inline void
 760  workq_set_register_state(proc_t p, thread_t th,
 761  		struct workq_thread_addrs *addrs, mach_port_name_t kport,
 762  		user_addr_t kevent_list, uint32_t upcall_flags, int kevent_count)
 763  {
 764  	user_addr_t wqstart_fnptr = pthread_kern->proc_get_wqthread(p);
 765  	if (!wqstart_fnptr) {
 766  		panic("workqueue thread start function pointer is NULL");
 767  	}
 768  
 769  #if defined(__i386__) || defined(__x86_64__)
 770  	if (proc_is64bit_data(p) == 0) {
 771  		x86_thread_state32_t state = {
 772  			.eip = (unsigned int)wqstart_fnptr,
 773  			.eax = /* arg0 */ (unsigned int)addrs->self,
 774  			.ebx = /* arg1 */ (unsigned int)kport,
 775  			.ecx = /* arg2 */ (unsigned int)addrs->stack_bottom,
 776  			.edx = /* arg3 */ (unsigned int)kevent_list,
 777  			.edi = /* arg4 */ (unsigned int)upcall_flags,
 778  			.esi = /* arg5 */ (unsigned int)kevent_count,
 779  
 780  			.esp = (int)((vm_offset_t)addrs->stack_top),
 781  		};
 782  
 783  		int error = pthread_kern->thread_set_wq_state32(th, (thread_state_t)&state);
 784  		if (error != KERN_SUCCESS) {
 785  			panic(__func__ ": thread_set_wq_state failed: %d", error);
 786  		}
 787  	} else {
 788  		x86_thread_state64_t state64 = {
 789  			// x86-64 already passes all the arguments in registers, so we just put them in their final place here
 790  			.rip = (uint64_t)wqstart_fnptr,
 791  			.rdi = (uint64_t)addrs->self,
 792  			.rsi = (uint64_t)kport,
 793  			.rdx = (uint64_t)addrs->stack_bottom,
 794  			.rcx = (uint64_t)kevent_list,
 795  			.r8  = (uint64_t)upcall_flags,
 796  			.r9  = (uint64_t)kevent_count,
 797  
 798  			.rsp = (uint64_t)(addrs->stack_top)
 799  		};
 800  
 801  		int error = pthread_kern->thread_set_wq_state64(th, (thread_state_t)&state64);
 802  		if (error != KERN_SUCCESS) {
 803  			panic(__func__ ": thread_set_wq_state failed: %d", error);
 804  		}
 805  	}
 806  #elif defined(__arm__) || defined(__arm64__)
 807  	if (!proc_is64bit_data(p)) {
 808  		arm_thread_state_t state = {
 809  			.pc = (int)wqstart_fnptr,
 810  			.r[0] = (unsigned int)addrs->self,
 811  			.r[1] = (unsigned int)kport,
 812  			.r[2] = (unsigned int)addrs->stack_bottom,
 813  			.r[3] = (unsigned int)kevent_list,
 814  			// will be pushed onto the stack as arg4/5
 815  			.r[4] = (unsigned int)upcall_flags,
 816  			.r[5] = (unsigned int)kevent_count,
 817  
 818  			.sp = (int)(addrs->stack_top)
 819  		};
 820  
 821  		int error = pthread_kern->thread_set_wq_state32(th, (thread_state_t)&state);
 822  		if (error != KERN_SUCCESS) {
 823  			panic(__func__ ": thread_set_wq_state failed: %d", error);
 824  		}
 825  	} else {
 826  #if defined(__arm64__)
 827  		arm_thread_state64_t state = {
 828  			.pc = (uint64_t)wqstart_fnptr,
 829  			.x[0] = (uint64_t)addrs->self,
 830  			.x[1] = (uint64_t)kport,
 831  			.x[2] = (uint64_t)addrs->stack_bottom,
 832  			.x[3] = (uint64_t)kevent_list,
 833  			.x[4] = (uint64_t)upcall_flags,
 834  			.x[5] = (uint64_t)kevent_count,
 835  
 836  			.sp = (uint64_t)((vm_offset_t)addrs->stack_top),
 837  		};
 838  
 839  		int error = pthread_kern->thread_set_wq_state64(th, (thread_state_t)&state);
 840  		if (error != KERN_SUCCESS) {
 841  			panic(__func__ ": thread_set_wq_state failed: %d", error);
 842  		}
 843  #else /* defined(__arm64__) */
 844  		panic("Shouldn't have a 64-bit thread on a 32-bit kernel...");
 845  #endif /* defined(__arm64__) */
 846  	}
 847  #else
 848  #error setup_wqthread  not defined for this architecture
 849  #endif
 850  }
 851  
 852  static inline int
 853  workq_kevent(proc_t p, struct workq_thread_addrs *th_addrs,
 854  		user_addr_t eventlist, int nevents, int kevent_flags,
 855  		user_addr_t *kevent_list_out, int *kevent_count_out)
 856  {
 857  	int ret;
 858  
 859  	user_addr_t kevent_list = th_addrs->self -
 860  			WQ_KEVENT_LIST_LEN * sizeof(struct kevent_qos_s);
 861  	user_addr_t data_buf = kevent_list - WQ_KEVENT_DATA_SIZE;
 862  	user_size_t data_available = WQ_KEVENT_DATA_SIZE;
 863  
 864  	ret = pthread_kern->kevent_workq_internal(p, eventlist, nevents,
 865  			kevent_list, WQ_KEVENT_LIST_LEN,
 866  			data_buf, &data_available,
 867  			kevent_flags, kevent_count_out);
 868  
 869  	// squash any errors into just empty output
 870  	if (ret != 0 || *kevent_count_out == -1) {
 871  		*kevent_list_out = NULL;
 872  		*kevent_count_out = 0;
 873  		return ret;
 874  	}
 875  
 876  	workq_thread_set_top_addr(th_addrs, data_buf + data_available);
 877  	*kevent_list_out = kevent_list;
 878  	return ret;
 879  }
 880  
 881  /**
 882   * configures initial thread stack/registers to jump into:
 883   * _pthread_wqthread(pthread_t self, mach_port_t kport, void *stackaddr, void *keventlist, int upcall_flags, int nkevents);
 884   * to get there we jump through assembily stubs in pthread_asm.s.  Those
 885   * routines setup a stack frame, using the current stack pointer, and marshall
 886   * arguments from registers to the stack as required by the ABI.
 887   *
 888   * One odd thing we do here is to start the pthread_t 4k below what would be the
 889   * top of the stack otherwise.  This is because usually only the first 4k of the
 890   * pthread_t will be used and so we want to put it on the same 16k page as the
 891   * top of the stack to save memory.
 892   *
 893   * When we are done the stack will look like:
 894   * |-----------| th_stackaddr + th_allocsize
 895   * |pthread_t  | th_stackaddr + DEFAULT_STACKSIZE + guardsize + PTHREAD_STACK_OFFSET
 896   * |kevent list| optionally - at most WQ_KEVENT_LIST_LEN events
 897   * |kevent data| optionally - at most WQ_KEVENT_DATA_SIZE bytes
 898   * |stack gap  | bottom aligned to 16 bytes
 899   * |   STACK   |
 900   * |     ⇓     |
 901   * |           |
 902   * |guard page | guardsize
 903   * |-----------| th_stackaddr
 904   */
 905  __attribute__((noreturn,noinline))
 906  void
 907  workq_setup_thread(proc_t p, thread_t th, vm_map_t map, user_addr_t stackaddr,
 908  		mach_port_name_t kport, int th_qos __unused, int setup_flags, int upcall_flags)
 909  {
 910  	struct workq_thread_addrs th_addrs;
 911  	bool first_use = (setup_flags & WQ_SETUP_FIRST_USE);
 912  	user_addr_t kevent_list = NULL;
 913  	int kevent_count = 0;
 914  
 915  	workq_thread_get_addrs(map, stackaddr, &th_addrs);
 916  
 917  	if (first_use) {
 918  		uint32_t tsd_offset = pthread_kern->proc_get_pthread_tsd_offset(p);
 919  		if (tsd_offset) {
 920  			mach_vm_offset_t th_tsd_base = th_addrs.self + tsd_offset;
 921  			kern_return_t kret = pthread_kern->thread_set_tsd_base(th,
 922  					th_tsd_base);
 923  			if (kret == KERN_SUCCESS) {
 924  				upcall_flags |= WQ_FLAG_THREAD_TSD_BASE_SET;
 925  			}
 926  		}
 927  
 928  		/*
 929  		 * Pre-fault the first page of the new thread's stack and the page that will
 930  		 * contain the pthread_t structure.
 931  		 */
 932  		vm_map_offset_t mask = vm_map_page_mask(map);
 933  		vm_map_offset_t th_page = vm_map_trunc_page_mask(th_addrs.self, mask);
 934  		vm_map_offset_t stk_page = vm_map_trunc_page_mask(th_addrs.stack_top - 1, mask);
 935  		if (th_page != stk_page) {
 936  			vm_fault(map, stk_page, VM_PROT_READ | VM_PROT_WRITE, FALSE, THREAD_UNINT, NULL, 0);
 937  		}
 938  		vm_fault(map, th_page, VM_PROT_READ | VM_PROT_WRITE, FALSE, THREAD_UNINT, NULL, 0);
 939  	}
 940  
 941  	if (setup_flags & WQ_SETUP_EXIT_THREAD) {
 942  		kevent_count = WORKQ_EXIT_THREAD_NKEVENT;
 943  	} else if (upcall_flags & WQ_FLAG_THREAD_KEVENT) {
 944  		unsigned int flags = KEVENT_FLAG_STACK_DATA | KEVENT_FLAG_IMMEDIATE;
 945  		workq_kevent(p, &th_addrs, NULL, 0, flags, &kevent_list, &kevent_count);
 946  	}
 947  
 948  	workq_set_register_state(p, th, &th_addrs, kport,
 949  			kevent_list, upcall_flags, kevent_count);
 950  
 951  	if (first_use) {
 952  		pthread_kern->thread_bootstrap_return();
 953  	} else {
 954  		pthread_kern->unix_syscall_return(EJUSTRETURN);
 955  	}
 956  	__builtin_unreachable();
 957  }
 958  
 959  int
 960  workq_handle_stack_events(proc_t p, thread_t th, vm_map_t map,
 961  		user_addr_t stackaddr, mach_port_name_t kport,
 962  		user_addr_t events, int nevents, int upcall_flags)
 963  {
 964  	struct workq_thread_addrs th_addrs;
 965  	user_addr_t kevent_list = NULL;
 966  	int kevent_count = 0, error;
 967  	__assert_only kern_return_t kr;
 968  
 969  	workq_thread_get_addrs(map, stackaddr, &th_addrs);
 970  
 971  	unsigned int flags = KEVENT_FLAG_STACK_DATA | KEVENT_FLAG_IMMEDIATE |
 972  			KEVENT_FLAG_PARKING;
 973  	error = workq_kevent(p, &th_addrs, events, nevents, flags,
 974  			&kevent_list, &kevent_count);
 975  
 976  	if (error || kevent_count == 0) {
 977  		return error;
 978  	}
 979  
 980  	kr = pthread_kern->thread_set_voucher_name(MACH_PORT_NULL);
 981  	assert(kr == KERN_SUCCESS);
 982  
 983  	workq_set_register_state(p, th, &th_addrs, kport,
 984  			kevent_list, upcall_flags, kevent_count);
 985  
 986  	pthread_kern->unix_syscall_return(EJUSTRETURN);
 987  	__builtin_unreachable();
 988  }
 989  
 990  int
 991  _thread_selfid(__unused struct proc *p, uint64_t *retval)
 992  {
 993  	thread_t thread = current_thread();
 994  	*retval = thread_tid(thread);
 995  	return KERN_SUCCESS;
 996  }
 997  
 998  void
 999  _pthread_init(void)
1000  {
1001  	pthread_lck_grp_attr = lck_grp_attr_alloc_init();
1002  	pthread_lck_grp = lck_grp_alloc_init("pthread", pthread_lck_grp_attr);
1003  
1004  	/*
1005  	 * allocate the lock attribute for pthread synchronizers
1006  	 */
1007  	pthread_lck_attr = lck_attr_alloc_init();
1008  	pthread_list_mlock = lck_mtx_alloc_init(pthread_lck_grp, pthread_lck_attr);
1009  
1010  	pth_global_hashinit();
1011  	psynch_thcall = thread_call_allocate(psynch_wq_cleanup, NULL);
1012  	psynch_zoneinit();
1013  
1014  	int policy_bootarg;
1015  	if (PE_parse_boot_argn("pthread_mutex_default_policy", &policy_bootarg, sizeof(policy_bootarg))) {
1016  		pthread_mutex_default_policy = policy_bootarg;
1017  	}
1018  
1019  	sysctl_register_oid(&sysctl__kern_pthread_mutex_default_policy);
1020  }