   1  /*
   2   * Copyright (c) 2015-2020 Apple Inc. All rights reserved.
   3   *
   4   * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5   *
   6   * This file contains Original Code and/or Modifications of Original Code
   7   * as defined in and that are subject to the Apple Public Source License
   8   * Version 2.0 (the 'License'). You may not use this file except in
   9   * compliance with the License. The rights granted to you under the License
  10   * may not be used to create, or enable the creation or redistribution of,
  11   * unlawful or unlicensed copies of an Apple operating system, or to
  12   * circumvent, violate, or enable the circumvention or violation of, any
  13   * terms of an Apple operating system software license agreement.
  14   *
  15   * Please obtain a copy of the License at
  16   * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17   *
  18   * The Original Code and all software distributed under the License are
  19   * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20   * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21   * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22   * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23   * Please see the License for the specific language governing rights and
  24   * limitations under the License.
  25   *
  26   * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27   */
  28  /*
  29   * @OSF_FREE_COPYRIGHT@
  30   */
  31  /*
  32   * Mach Operating System
  33   * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
  34   * All Rights Reserved.
  35   *
  36   * Permission to use, copy, modify and distribute this software and its
  37   * documentation is hereby granted, provided that both the copyright
  38   * notice and this permission notice appear in all copies of the
  39   * software, derivative works or modified versions, and any portions
  40   * thereof, and that both notices appear in supporting documentation.
  41   *
  42   * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  43   * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
  44   * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  45   *
  46   * Carnegie Mellon requests users of this software to return to
  47   *
  48   *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  49   *  School of Computer Science
  50   *  Carnegie Mellon University
  51   *  Pittsburgh PA 15213-3890
  52   *
  53   * any improvements or extensions that they make and grant Carnegie Mellon
  54   * the rights to redistribute these changes.
  55   */
  56  
  57  /*
  58   * un-comment the following lines to debug the link/prepost tables
  59   * NOTE: this expands each element by ~40 bytes
  60   */
  61  //#define KEEP_WAITQ_LINK_STATS
  62  //#define KEEP_WAITQ_PREPOST_STATS
  63  
  64  #include <kern/ast.h>
  65  #include <kern/backtrace.h>
  66  #include <kern/kern_types.h>
  67  #include <kern/ltable.h>
  68  #include <kern/mach_param.h>
  69  #include <kern/percpu.h>
  70  #include <kern/queue.h>
  71  #include <kern/sched_prim.h>
  72  #include <kern/simple_lock.h>
  73  #include <kern/spl.h>
  74  #include <kern/waitq.h>
  75  #include <kern/zalloc.h>
  76  #include <kern/policy_internal.h>
  77  #include <kern/turnstile.h>
  78  
  79  #include <os/hash.h>
  80  #include <libkern/OSAtomic.h>
  81  #include <mach/sync_policy.h>
  82  #include <vm/vm_kern.h>
  83  
  84  #include <sys/kdebug.h>
  85  
  86  #if defined(KEEP_WAITQ_LINK_STATS) || defined(KEEP_WAITQ_PREPOST_STATS)
  87  #  if !CONFIG_LTABLE_STATS
  88  #    error "You must configure LTABLE_STATS to use WAITQ_[LINK|PREPOST]_STATS"
  89  #  endif
  90  #  if !CONFIG_WAITQ_STATS
  91  #    error "You must configure WAITQ_STATS to use WAITQ_[LINK|PREPOST]_STATS"
  92  #  endif
  93  #endif
  94  
  95  #ifdef __DARLING__
  96  #include <darlingserver/duct-tape/log.h>
  97  #define wqdbg(fmt, ...) dtape_log_debug("WQ[%s]: " fmt, __func__, ## __VA_ARGS__)
  98  #define wqdbg_v(fmt, ...) wqdbg(fmt, ## __VA_ARGS__)
  99  #define wqinfo(fmt, ...) dtape_log_info("WQ[%s]: " fmt, __func__, ## __VA_ARGS__)
 100  #define wqerr(fmt, ...) dtape_log_error("WQ[%s]: " fmt, __func__, ## __VA_ARGS__)
 101  #else
 102  #if CONFIG_WAITQ_DEBUG
 103  #define wqdbg(fmt, ...) \
 104  	printf("WQ[%s]:  " fmt "\n", __func__, ## __VA_ARGS__)
 105  #else
 106  #define wqdbg(fmt, ...) do { } while (0)
 107  #endif
 108  
 109  #ifdef WAITQ_VERBOSE_DEBUG
 110  #define wqdbg_v(fmt, ...) \
 111  	printf("WQ[v:%s]:  " fmt "\n", __func__, ## __VA_ARGS__)
 112  #else
 113  #define wqdbg_v(fmt, ...) do { } while (0)
 114  #endif
 115  
 116  #define wqinfo(fmt, ...) \
 117  	printf("WQ[%s]: " fmt "\n", __func__,  ## __VA_ARGS__)
 118  
 119  #define wqerr(fmt, ...) \
 120  	printf("WQ[%s] ERROR: " fmt "\n", __func__, ## __VA_ARGS__)
 121  #endif // __DARLING__
 122  
 123  /*
 124   * file-static functions / data
 125   */
 126  static thread_t waitq_select_one_locked(struct waitq *waitq, event64_t event,
 127      uint64_t *reserved_preposts,
 128      int priority, spl_t *spl);
 129  
 130  static kern_return_t waitq_select_thread_locked(struct waitq *waitq,
 131      event64_t event,
 132      thread_t thread, spl_t *spl);
 133  
 134  ZONE_DECLARE(waitq_set_zone, "waitq sets",
 135      sizeof(struct waitq_set), ZC_NOENCRYPT);
 136  
 137  /* waitq prepost cache */
 138  #define WQP_CACHE_MAX   50
 139  struct wqp_cache {
 140  	uint64_t        head;
 141  	unsigned int    avail;
 142  };
 143  static struct wqp_cache PERCPU_DATA(wqp_cache);
 144  
 145  #define P2ROUNDUP(x, align) (-(-((uint32_t)(x)) & -(align)))
 146  #define ROUNDDOWN(x, y)  (((x)/(y))*(y))
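       /*
        * A quick worked sketch (illustrative only, kept out of the build):
        * P2ROUNDUP requires 'align' to be a power of two and relies on
        * two's-complement negation; ROUNDDOWN is plain truncating division.
        */
       #if 0
       static_assert(P2ROUNDUP(1, 8) == 8, "rounds up to the next multiple");
       static_assert(P2ROUNDUP(8, 8) == 8, "exact multiples are unchanged");
       static_assert(ROUNDDOWN(13, 4) == 12, "truncates toward zero");
       #endif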
 147  
 148  
 149  #if CONFIG_LTABLE_STATS || CONFIG_WAITQ_STATS
 150  static __inline__ void waitq_grab_backtrace(uintptr_t bt[NWAITQ_BTFRAMES], int skip);
 151  #endif
 152  
 153  LCK_GRP_DECLARE(waitq_lck_grp, "waitq");
 154  
 155  #ifdef __DARLING__
 156  extern void waitq_lock_init(struct waitq* wq);
 157  #else
 158  #if __arm64__
 159  
 160  #define waitq_lock_to(wq, to) \
 161  	(hw_lock_bit_to(&(wq)->waitq_interlock, LCK_ILOCK, to, &waitq_lck_grp))
 162  
 163  #define waitq_lock_unlock(wq) \
 164  	(hw_unlock_bit(&(wq)->waitq_interlock, LCK_ILOCK))
 165  
 166  #define waitq_lock_init(wq) \
 167  	(wq->waitq_interlock = 0)
 168  
 169  #else
 170  
 171  #define waitq_lock_to(wq, to) \
 172  	(hw_lock_to(&(wq)->waitq_interlock, to, &waitq_lck_grp))
 173  
 174  #define waitq_lock_unlock(wq) \
 175  	(hw_lock_unlock(&(wq)->waitq_interlock))
 176  
 177  #define waitq_lock_init(wq) \
 178  	(hw_lock_init(&(wq)->waitq_interlock))
 179  
 180  #endif  /* __arm64__ */
 181  #endif // __DARLING__
 182  
 183  /*
 184   * Prepost callback function for specially marked waitq sets
 185   * (prepost alternative)
 186   */
 187  extern void waitq_set__CALLING_PREPOST_HOOK__(waitq_set_prepost_hook_t *ctx);
 188  
 189  #define DEFAULT_MIN_FREE_TABLE_ELEM    100
 190  static uint32_t g_min_free_table_elem;
 191  static uint32_t g_min_free_cache;
 192  
 193  
 194  /* ----------------------------------------------------------------------
 195   *
 196   * SetID Link Table Implementation
 197   *
 198   * ---------------------------------------------------------------------- */
 199  static struct link_table g_wqlinktable;
 200  
 201  enum wq_link_type {
 202  	WQL_ALL     = -1,
 203  	WQL_FREE    = LT_FREE,
 204  	WQL_WQS     = LT_ELEM,
 205  	WQL_LINK    = LT_LINK,
 206  };
 207  
 208  struct waitq_link {
 209  	struct lt_elem wqte;
 210  
 211  	union {
 212  		/* wqt_type == WQL_WQS (LT_ELEM) */
 213  		struct {
 214  			struct waitq_set *wql_set;
 215  			/* uint64_t          sl_prepost_id; */
 216  		} wql_wqs;
 217  
 218  		/* wqt_type == WQL_LINK (LT_LINK) */
 219  		struct {
 220  			uint64_t          left_setid;
 221  			uint64_t          right_setid;
 222  		} wql_link;
 223  	};
 224  #ifdef KEEP_WAITQ_LINK_STATS
 225  	thread_t  sl_alloc_th;
 226  	task_t    sl_alloc_task;
 227  	uintptr_t sl_alloc_bt[NWAITQ_BTFRAMES];
 228  	uint64_t  sl_alloc_ts;
 229  	uintptr_t sl_invalidate_bt[NWAITQ_BTFRAMES];
 230  	uint64_t  sl_invalidate_ts;
 231  	uintptr_t sl_mkvalid_bt[NWAITQ_BTFRAMES];
 232  	uint64_t  sl_mkvalid_ts;
 233  	uint64_t  sl_free_ts;
 234  #endif
 235  };
 236  #if !defined(KEEP_WAITQ_LINK_STATS)
 237  static_assert((sizeof(struct waitq_link) & (sizeof(struct waitq_link) - 1)) == 0,
 238      "waitq_link struct must be a power of two!");
 239  #endif
 240  
 241  #define wql_refcnt(link) \
 242  	(lt_bits_refcnt((link)->wqte.lt_bits))
 243  
 244  #define wql_type(link) \
 245  	(lt_bits_type((link)->wqte.lt_bits))
 246  
 247  #define wql_mkvalid(link) \
 248  	do { \
 249  	        lt_elem_mkvalid(&(link)->wqte); \
 250  	        wql_do_mkvalid_stats(&(link)->wqte); \
 251  	} while (0)
 252  
 253  #define wql_is_valid(link) \
 254  	lt_bits_valid((link)->wqte.lt_bits)
 255  
 256  #define wql_setid wqte.lt_id
 257  
 258  #define WQL_WQS_POISON         ((void *)(0xf00df00d))
 259  #define WQL_LINK_POISON        (0x0bad0badffffffffull)
 260  
 261  static void
 262  wql_poison(struct link_table *table, struct lt_elem *elem)
 263  {
 264  	struct waitq_link *link = (struct waitq_link *)elem;
 265  	(void)table;
 266  
 267  	switch (wql_type(link)) {
 268  	case WQL_WQS:
 269  		link->wql_wqs.wql_set = WQL_WQS_POISON;
 270  		break;
 271  	case WQL_LINK:
 272  		link->wql_link.left_setid = WQL_LINK_POISON;
 273  		link->wql_link.right_setid = WQL_LINK_POISON;
 274  		break;
 275  	default:
 276  		break;
 277  	}
 278  #ifdef KEEP_WAITQ_LINK_STATS
 279  	memset(link->sl_alloc_bt, 0, sizeof(link->sl_alloc_bt));
 280  	link->sl_alloc_ts = 0;
 281  	memset(link->sl_mkvalid_bt, 0, sizeof(link->sl_mkvalid_bt));
 282  	link->sl_mkvalid_ts = 0;
 283  
 284  	link->sl_alloc_th = THREAD_NULL;
 285  	/* leave the sl_alloc_task in place for debugging */
 286  
 287  	link->sl_free_ts = mach_absolute_time();
 288  #endif
 289  }
 290  
 291  #ifdef KEEP_WAITQ_LINK_STATS
 292  static __inline__ void
 293  wql_do_alloc_stats(struct lt_elem *elem)
 294  {
 295  	if (elem) {
 296  		struct waitq_link *link = (struct waitq_link *)elem;
 297  		memset(link->sl_alloc_bt, 0, sizeof(link->sl_alloc_bt));
 298  		waitq_grab_backtrace(link->sl_alloc_bt, 0);
 299  		link->sl_alloc_th = current_thread();
 300  		link->sl_alloc_task = current_task();
 301  
 302  		assert(link->sl_alloc_ts == 0);
 303  		link->sl_alloc_ts = mach_absolute_time();
 304  
 305  		memset(link->sl_invalidate_bt, 0, sizeof(link->sl_invalidate_bt));
 306  		link->sl_invalidate_ts = 0;
 307  	}
 308  }
 309  
 310  static __inline__ void
 311  wql_do_invalidate_stats(struct lt_elem *elem)
 312  {
 313  	struct waitq_link *link = (struct waitq_link *)elem;
 314  
 315  	if (!elem) {
 316  		return;
 317  	}
 318  
 319  	assert(link->sl_mkvalid_ts > 0);
 320  
 321  	memset(link->sl_invalidate_bt, 0, sizeof(link->sl_invalidate_bt));
 322  	link->sl_invalidate_ts = mach_absolute_time();
 323  	waitq_grab_backtrace(link->sl_invalidate_bt, 0);
 324  }
 325  
 326  static __inline__ void
 327  wql_do_mkvalid_stats(struct lt_elem *elem)
 328  {
 329  	struct waitq_link *link = (struct waitq_link *)elem;
 330  
 331  	if (!elem) {
 332  		return;
 333  	}
 334  
 335  	memset(link->sl_mkvalid_bt, 0, sizeof(link->sl_mkvalid_bt));
 336  	link->sl_mkvalid_ts = mach_absolute_time();
 337  	waitq_grab_backtrace(link->sl_mkvalid_bt, 0);
 338  }
 339  #else
 340  #define wql_do_alloc_stats(e)
 341  #define wql_do_invalidate_stats(e)
 342  #define wql_do_mkvalid_stats(e)
 343  #endif /* KEEP_WAITQ_LINK_STATS */
 344  
 345  static void
 346  wql_init(void)
 347  {
 348  	uint32_t tablesz = 0, max_links = 0;
 349  
 350  	if (PE_parse_boot_argn("wql_tsize", &tablesz, sizeof(tablesz)) != TRUE) {
 351  		tablesz = (uint32_t)g_lt_max_tbl_size;
 352  	}
 353  
 354  	tablesz = P2ROUNDUP(tablesz, PAGE_SIZE);
 355  	max_links = tablesz / sizeof(struct waitq_link);
 356  	assert(max_links > 0 && tablesz > 0);
 357  
 358  	/* we have a restricted index range */
 359  	if (max_links > (LT_IDX_MAX + 1)) {
 360  		max_links = LT_IDX_MAX + 1;
 361  	}
 362  
 363  	wqinfo("init linktable with max:%d elements (%d bytes)",
 364  	    max_links, tablesz);
 365  	ltable_init(&g_wqlinktable, "wqslab.wql", max_links,
 366  	    sizeof(struct waitq_link), wql_poison);
 367  }
 368  
 369  static void
 370  wql_ensure_free_space(void)
 371  {
 372  	if (g_wqlinktable.nelem - g_wqlinktable.used_elem < g_min_free_table_elem) {
 373  		/*
 374  		 * we don't hold locks on these values, so check for underflow
 375  		 */
 376  		if (g_wqlinktable.used_elem <= g_wqlinktable.nelem) {
 377  			wqdbg_v("Forcing table growth: nelem=%d, used=%d, min_free=%d",
 378  			    g_wqlinktable.nelem, g_wqlinktable.used_elem,
 379  			    g_min_free_table_elem);
 380  			ltable_grow(&g_wqlinktable, g_min_free_table_elem);
 381  		}
 382  	}
 383  }
 384  
 385  static struct waitq_link *
 386  wql_alloc_link(int type)
 387  {
 388  	struct lt_elem *elem;
 389  
 390  	elem = ltable_alloc_elem(&g_wqlinktable, type, 1, 0);
 391  	wql_do_alloc_stats(elem);
 392  	return (struct waitq_link *)elem;
 393  }
 394  
 395  static void
 396  wql_realloc_link(struct waitq_link *link, int type)
 397  {
 398  	ltable_realloc_elem(&g_wqlinktable, &link->wqte, type);
 399  #ifdef KEEP_WAITQ_LINK_STATS
 400  	memset(link->sl_alloc_bt, 0, sizeof(link->sl_alloc_bt));
 401  	link->sl_alloc_ts = 0;
 402  	wql_do_alloc_stats(&link->wqte);
 403  
 404  	memset(link->sl_invalidate_bt, 0, sizeof(link->sl_invalidate_bt));
 405  	link->sl_invalidate_ts = 0;
 406  #endif
 407  }
 408  
 409  static void
 410  wql_invalidate(struct waitq_link *link)
 411  {
 412  	lt_elem_invalidate(&link->wqte);
 413  	wql_do_invalidate_stats(&link->wqte);
 414  }
 415  
 416  static struct waitq_link *
 417  wql_get_link(uint64_t setid)
 418  {
 419  	struct lt_elem *elem;
 420  
 421  	elem = ltable_get_elem(&g_wqlinktable, setid);
 422  	return (struct waitq_link *)elem;
 423  }
 424  
 425  static void
 426  wql_put_link(struct waitq_link *link)
 427  {
 428  	if (!link) {
 429  		return;
 430  	}
 431  	ltable_put_elem(&g_wqlinktable, (struct lt_elem *)link);
 432  }
 433  
 434  static struct waitq_link *
 435  wql_get_reserved(uint64_t setid, int type)
 436  {
 437  	struct lt_elem *elem;
 438  
 439  	elem = lt_elem_list_first(&g_wqlinktable, setid);
 440  	if (!elem) {
 441  		return NULL;
 442  	}
 443  	ltable_realloc_elem(&g_wqlinktable, elem, type);
 444  	return (struct waitq_link *)elem;
 445  }
 446  
 447  
 448  static inline int waitq_maybe_remove_link(struct waitq *waitq,
 449      uint64_t setid,
 450      struct waitq_link *parent,
 451      struct waitq_link *left,
 452      struct waitq_link *right);
 453  
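       /*
        * Walk flavors for walk_waitq_links() below: ONE_LEVEL visits only the
        * link tree rooted at the given setid; FULL_DAG additionally recurses
        * into each member waitq set (taking each set's lock along the way);
        * FULL_DAG_UNLOCKED recurses the same way but takes no set locks.
        */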
 454  enum {
 455  	LINK_WALK_ONE_LEVEL = 0,
 456  	LINK_WALK_FULL_DAG  = 1,
 457  	LINK_WALK_FULL_DAG_UNLOCKED = 2,
 458  };
 459  
 460  typedef int (*wql_callback_func)(struct waitq *waitq, void *ctx,
 461      struct waitq_link *link);
 462  
 463  /**
 464   * walk_waitq_links: walk all table elements (of type 'link_type') pointed to by 'setid'
 465   *
 466   * Conditions:
 467   *	waitq is locked (or NULL)
 468   *	'setid' is managed by 'waitq'
 469   *		this could be direct (waitq->waitq_set_id == setid)
 470   *		OR indirect (setid is the left/right ID in a LINK chain,
 471   *		             whose root is waitq->waitq_set_id)
 472   *
 473   * Notes:
 474   *	This function uses recursion to walk the set of table elements
 475   *	pointed to by 'setid'. For each element encountered, 'cb' will be
 476   *	called. If non-zero, the return value of this callback function can
 477   *	early-out of the table walk.
 478   *
 479   *	For each link element encountered, the function takes a reference to
 480   *	it. The reference is dropped only after the callback and any recursion
 481   *	has completed.
 482   *
 483   *	The assumed table/link/tree structure:
 484   *                   'setid'
 485   *                   /    \
 486   *                  /      \
 487   *              L(LINK)     R(LINK)
 488   *               /\             /\
 489   *              /  \           /  \
 490   *             /    \       Rl(*)  Rr(*)
 491   *         Ll(*)  Lr(*)      /\    /\
 492   *           /\     /\    ... ... ... ...
 493   *        ...  ... ... ...
 494   *                    \
  495   *                    WQS(wqset_q.waitq_set_id == Sx)
  496   *                    [waitq set is a member of setid, 'Sx']
 497   *
 498   *                    'Sx'
 499   *                   /    \
 500   *                  /      \
 501   *              L(LINK)     R(LINK)
 502   *               /\             /\
 503   *             ... ...        ... ...
 504   *
 505   *	The basic algorithm is as follows:
 506   *	*) take a reference to the table object pointed to by 'setid'
 507   *	*) if appropriate, call 'cb' (potentially early-out on non-zero return)
 508   *	*) if the link object points to a waitq set, and the walk type
 509   *	   is 'FULL_DAG' (full directed-acyclic-graph), then try to lock
 510   *	   the associated waitq set object and recursively walk all sets to
 511   *	   which that set belongs. This is a DFS of the tree structure.
 512   *	*) recurse down the left side of the tree (following the
 513   *	   'left_setid' pointer in the link object
 514   *	*) recurse down the right side of the tree (following the
 515   *	   'right_setid' pointer in the link object
 516   */
 517  static __attribute__((noinline))
 518  int
 519  walk_waitq_links(int walk_type, struct waitq *waitq,
 520      uint64_t setid, int link_type,
 521      void *ctx, wql_callback_func cb)
 522  {
 523  	struct waitq_link *link;
 524  	uint64_t nextid;
 525  	int wqltype;
 526  
 527  	link = wql_get_link(setid);
 528  
 529  	/* invalid link */
 530  	if (!link) {
 531  		return WQ_ITERATE_CONTINUE;
 532  	}
 533  
 534  	setid = nextid = 0;
 535  	wqltype = wql_type(link);
 536  	if (wqltype == WQL_LINK) {
 537  		setid  = link->wql_link.left_setid;
 538  		nextid = link->wql_link.right_setid;
 539  	}
 540  
 541  	/*
  542  	 * Make the callback only on the specified link_type (or on all links).
 543  	 * Note that after the callback, the link object may be
 544  	 * invalid. The only valid thing we can do is put our
 545  	 * reference to it (which may put it back on the free list)
 546  	 */
 547  	if (link_type == WQL_ALL || link_type == wqltype) {
 548  		/* allow the callback to early-out */
 549  		int ret = cb(waitq, ctx, link);
 550  		if (ret != WQ_ITERATE_CONTINUE) {
 551  			wql_put_link(link);
 552  			return ret;
 553  		}
 554  	}
 555  
 556  	if (wqltype == WQL_WQS &&
 557  	    (walk_type == LINK_WALK_FULL_DAG ||
 558  	    walk_type == LINK_WALK_FULL_DAG_UNLOCKED)) {
 559  		/*
 560  		 * Recurse down any sets to which this wait queue set was
 561  		 * added.  We do this just before we put our reference to
 562  		 * the link object (which may free it).
 563  		 */
 564  		struct waitq_set *wqset = link->wql_wqs.wql_set;
 565  		int ret = WQ_ITERATE_CONTINUE;
 566  		int should_unlock = 0;
 567  		uint64_t wqset_setid = 0;
 568  
 569  		if (waitq_set_is_valid(wqset) && walk_type == LINK_WALK_FULL_DAG) {
 570  			assert(!waitq_irq_safe(&wqset->wqset_q));
 571  			waitq_set_lock(wqset);
 572  			should_unlock = 1;
 573  		}
 574  
 575  		/*
 576  		 * verify the linked waitq set as it could have been
 577  		 * invalidated before we grabbed the lock!
 578  		 */
 579  		if (wqset->wqset_id != link->wql_setid.id) {
 580  			/* This is the bottom of the tree: just get out */
 581  			if (should_unlock) {
 582  				waitq_set_unlock(wqset);
 583  			}
 584  			wql_put_link(link);
 585  			return WQ_ITERATE_CONTINUE;
 586  		}
 587  
 588  		wqset_setid = wqset->wqset_q.waitq_set_id;
 589  
 590  		if (wqset_setid > 0) {
 591  			ret = walk_waitq_links(walk_type, &wqset->wqset_q,
 592  			    wqset_setid, link_type, ctx, cb);
 593  		}
 594  		if (should_unlock) {
 595  			waitq_set_unlock(wqset);
 596  		}
 597  		if (ret != WQ_ITERATE_CONTINUE) {
 598  			wql_put_link(link);
 599  			return ret;
 600  		}
 601  	}
 602  
 603  	wql_put_link(link);
 604  
 605  	/* recurse down left side of the tree */
 606  	if (setid) {
 607  		int ret = walk_waitq_links(walk_type, waitq, setid, link_type, ctx, cb);
 608  		if (ret != WQ_ITERATE_CONTINUE) {
 609  			return ret;
 610  		}
 611  	}
 612  
 613  	/* recurse down right side of the tree */
 614  	if (nextid) {
 615  		return walk_waitq_links(walk_type, waitq, nextid, link_type, ctx, cb);
 616  	}
 617  
 618  	return WQ_ITERATE_CONTINUE;
 619  }
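       /*
        * A minimal usage sketch (illustrative only, kept out of the build):
        * count the waitq sets reachable one level out from a locked waitq.
        * 'count_wqs_cb' is a hypothetical name, not part of this file.
        */
       #if 0
       static int
       count_wqs_cb(struct waitq *waitq, void *ctx, struct waitq_link *link)
       {
       	(void)waitq;
       	if (wql_type(link) == WQL_WQS) {
       		*(int *)ctx += 1;
       	}
       	/* returning anything else would early-out the walk */
       	return WQ_ITERATE_CONTINUE;
       }

       /* caller, with 'waitq' locked: */
       int nsets = 0;
       walk_waitq_links(LINK_WALK_ONE_LEVEL, waitq, waitq->waitq_set_id,
           WQL_WQS, &nsets, count_wqs_cb);
       #endif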
 620  
 621  /* ----------------------------------------------------------------------
 622   *
 623   * Prepost Link Table Implementation
 624   *
 625   * ---------------------------------------------------------------------- */
 626  static struct link_table g_prepost_table;
 627  
 628  enum wq_prepost_type {
 629  	WQP_FREE  = LT_FREE,
 630  	WQP_WQ    = LT_ELEM,
 631  	WQP_POST  = LT_LINK,
 632  };
 633  
 634  struct wq_prepost {
 635  	struct lt_elem wqte;
 636  
 637  	union {
 638  		/* wqt_type == WQP_WQ (LT_ELEM) */
 639  		struct {
 640  			struct waitq *wqp_wq_ptr;
 641  		} wqp_wq;
 642  		/* wqt_type == WQP_POST (LT_LINK) */
 643  		struct {
 644  			uint64_t      wqp_next_id;
 645  			uint64_t      wqp_wq_id;
 646  		} wqp_post;
 647  	};
 648  #ifdef KEEP_WAITQ_PREPOST_STATS
 649  	thread_t  wqp_alloc_th;
 650  	task_t    wqp_alloc_task;
 651  	uintptr_t wqp_alloc_bt[NWAITQ_BTFRAMES];
 652  #endif
 653  };
 654  #if !defined(KEEP_WAITQ_PREPOST_STATS)
 655  static_assert((sizeof(struct wq_prepost) & (sizeof(struct wq_prepost) - 1)) == 0,
 656      "wq_prepost struct must be a power of two!");
 657  #endif
 658  
 659  #define wqp_refcnt(wqp) \
 660  	(lt_bits_refcnt((wqp)->wqte.lt_bits))
 661  
 662  #define wqp_type(wqp) \
 663  	(lt_bits_type((wqp)->wqte.lt_bits))
 664  
 665  #define wqp_set_valid(wqp) \
 666  	lt_elem_mkvalid(&(wqp)->wqte)
 667  
 668  #define wqp_is_valid(wqp) \
 669  	lt_bits_valid((wqp)->wqte.lt_bits)
 670  
 671  #define wqp_prepostid wqte.lt_id
 672  
 673  #define WQP_WQ_POISON              (0x0bad0badffffffffull)
 674  #define WQP_POST_POISON            (0xf00df00df00df00d)
 675  
 676  static void
 677  wqp_poison(struct link_table *table, struct lt_elem *elem)
 678  {
 679  	struct wq_prepost *wqp = (struct wq_prepost *)elem;
 680  	(void)table;
 681  
 682  	switch (wqp_type(wqp)) {
 683  	case WQP_WQ:
 684  		break;
 685  	case WQP_POST:
 686  		wqp->wqp_post.wqp_next_id = WQP_POST_POISON;
 687  		wqp->wqp_post.wqp_wq_id = WQP_POST_POISON;
 688  		break;
 689  	default:
 690  		break;
 691  	}
 692  }
 693  
 694  #ifdef KEEP_WAITQ_PREPOST_STATS
 695  static __inline__ void
 696  wqp_do_alloc_stats(struct lt_elem *elem)
 697  {
 698  	if (!elem) {
 699  		return;
 700  	}
 701  
 702  	struct wq_prepost *wqp = (struct wq_prepost *)elem;
  703  	uintptr_t alloc_bt[NWAITQ_BTFRAMES]; /* sized to match wqp_alloc_bt */
 704  
 705  	waitq_grab_backtrace(alloc_bt, NWAITQ_BTFRAMES);
 706  
  707  	/* be sure to take stats for _all_ allocated objects */
 708  	for (;;) {
 709  		memcpy(wqp->wqp_alloc_bt, alloc_bt, sizeof(alloc_bt));
 710  		wqp->wqp_alloc_th = current_thread();
 711  		wqp->wqp_alloc_task = current_task();
 712  		wqp = (struct wq_prepost *)lt_elem_list_next(&g_prepost_table, &wqp->wqte);
 713  		if (!wqp) {
 714  			break;
 715  		}
 716  	}
 717  }
 718  #else
 719  #define wqp_do_alloc_stats(e)
  721  #endif /* KEEP_WAITQ_PREPOST_STATS */
 721  
 722  static void
 723  wqp_init(void)
 724  {
 725  	uint32_t tablesz = 0, max_wqp = 0;
 726  
 727  	if (PE_parse_boot_argn("wqp_tsize", &tablesz, sizeof(tablesz)) != TRUE) {
 728  		tablesz = (uint32_t)g_lt_max_tbl_size;
 729  	}
 730  
 731  	tablesz = P2ROUNDUP(tablesz, PAGE_SIZE);
 732  	max_wqp = tablesz / sizeof(struct wq_prepost);
 733  	assert(max_wqp > 0 && tablesz > 0);
 734  
 735  	/* we have a restricted index range */
 736  	if (max_wqp > (LT_IDX_MAX + 1)) {
 737  		max_wqp = LT_IDX_MAX + 1;
 738  	}
 739  
 740  	wqinfo("init prepost table with max:%d elements (%d bytes)",
 741  	    max_wqp, tablesz);
 742  	ltable_init(&g_prepost_table, "wqslab.prepost", max_wqp,
 743  	    sizeof(struct wq_prepost), wqp_poison);
 744  }
 745  
 746  /*
 747   * Refill the per-CPU cache.
 748   */
 749  static void
 750  wq_prepost_refill_cpu_cache(uint32_t nalloc)
 751  {
 752  #ifndef __DARLING__
 753  	struct lt_elem *new_head, *old_head;
 754  	struct wqp_cache *cache;
 755  
 756  	/* require preemption enabled to allocate elements */
 757  	if (get_preemption_level() != 0) {
 758  		return;
 759  	}
 760  
 761  	new_head = ltable_alloc_elem(&g_prepost_table,
 762  	    LT_RESERVED, nalloc, 1);
 763  	if (new_head == NULL) {
 764  		return;
 765  	}
 766  
 767  	disable_preemption();
 768  	cache = PERCPU_GET(wqp_cache);
 769  
 770  	/* check once more before putting these elements on the list */
 771  	if (cache->avail >= WQP_CACHE_MAX) {
 772  		lt_elem_list_release(&g_prepost_table, new_head, LT_RESERVED);
 773  		enable_preemption();
 774  		return;
 775  	}
 776  
 777  	cache->avail += nalloc;
 778  	if (cache->head == 0 || cache->head == LT_IDX_MAX) {
 779  		cache->head = new_head->lt_id.id;
 780  		goto out;
 781  	}
 782  
 783  	old_head = lt_elem_list_first(&g_prepost_table, cache->head);
 784  	(void)lt_elem_list_link(&g_prepost_table, new_head, old_head);
 785  	cache->head = new_head->lt_id.id;
 786  
 787  out:
 788  	enable_preemption();
 789  	return;
 790  #endif // __DARLING__
 791  }
 792  
 793  static void
 794  wq_prepost_ensure_free_space(void)
 795  {
 796  	uint32_t free_elem;
 797  	uint32_t min_free;
 798  	struct wqp_cache *cache;
 799  
 800  	if (g_min_free_cache == 0) {
 801  		g_min_free_cache = (WQP_CACHE_MAX * ml_wait_max_cpus());
 802  	}
 803  
 804  #ifndef __DARLING__
 805  	/*
 806  	 * Ensure that we always have a pool of per-CPU prepost elements
 807  	 */
 808  	disable_preemption();
 809  	cache = PERCPU_GET(wqp_cache);
 810  	free_elem = cache->avail;
 811  	enable_preemption();
 812  
 813  	if (free_elem < (WQP_CACHE_MAX / 3)) {
 814  		wq_prepost_refill_cpu_cache(WQP_CACHE_MAX - free_elem);
 815  	}
 816  #endif // __DARLING__
 817  
 818  	/*
 819  	 * Now ensure that we have a sufficient amount of free table space
 820  	 */
 821  	free_elem = g_prepost_table.nelem - g_prepost_table.used_elem;
 822  	min_free = g_min_free_table_elem + g_min_free_cache;
 823  	if (free_elem < min_free) {
 824  		/*
 825  		 * we don't hold locks on these values, so check for underflow
 826  		 */
 827  		if (g_prepost_table.used_elem <= g_prepost_table.nelem) {
 828  			wqdbg_v("Forcing table growth: nelem=%d, used=%d, min_free=%d+%d",
 829  			    g_prepost_table.nelem, g_prepost_table.used_elem,
 830  			    g_min_free_table_elem, g_min_free_cache);
 831  			ltable_grow(&g_prepost_table, min_free);
 832  		}
 833  	}
 834  }
 835  
 836  static struct wq_prepost *
 837  wq_prepost_alloc(int type, int nelem)
 838  {
 839  	struct lt_elem *elem;
 840  	struct wq_prepost *wqp;
 841  	struct wqp_cache *cache;
 842  
 843  	if (type != LT_RESERVED) {
 844  		goto do_alloc;
 845  	}
 846  	if (nelem == 0) {
 847  		return NULL;
 848  	}
 849  
 850  #ifndef __DARLING__
 851  	/*
 852  	 * First try to grab the elements from the per-CPU cache if we are
 853  	 * allocating RESERVED elements
 854  	 */
 855  	disable_preemption();
 856  	cache = PERCPU_GET(wqp_cache);
 857  	if (nelem <= (int)cache->avail) {
 858  		struct lt_elem *first, *next = NULL;
 859  		int nalloc = nelem;
 860  
 861  		cache->avail -= nelem;
 862  
 863  		/* grab the first element */
 864  		first = lt_elem_list_first(&g_prepost_table, cache->head);
 865  
 866  		/* find the last element and re-adjust the cache head */
 867  		for (elem = first; elem != NULL && nalloc > 0; elem = next) {
 868  			next = lt_elem_list_next(&g_prepost_table, elem);
 869  			if (--nalloc == 0) {
 870  				/* terminate the allocated list */
 871  				elem->lt_next_idx = LT_IDX_MAX;
 872  				break;
 873  			}
 874  		}
 875  		assert(nalloc == 0);
 876  		if (!next) {
 877  			cache->head = LT_IDX_MAX;
 878  		} else {
 879  			cache->head = next->lt_id.id;
 880  		}
  881  		/* assert that we don't have mismatched bookkeeping */
 882  		assert(!(cache->head == LT_IDX_MAX && cache->avail > 0));
 883  		enable_preemption();
 884  		elem = first;
 885  		goto out;
 886  	}
 887  	enable_preemption();
 888  #endif // __DARLING__
 889  
 890  do_alloc:
 891  	/* fall-back to standard table allocation */
 892  	elem = ltable_alloc_elem(&g_prepost_table, type, nelem, 0);
 893  	if (!elem) {
 894  		return NULL;
 895  	}
 896  
 897  out:
 898  	wqp = (struct wq_prepost *)elem;
 899  	wqp_do_alloc_stats(elem);
 900  	return wqp;
 901  }
 902  
 903  static void
 904  wq_prepost_invalidate(struct wq_prepost *wqp)
 905  {
 906  	lt_elem_invalidate(&wqp->wqte);
 907  }
 908  
 909  static struct wq_prepost *
 910  wq_prepost_get(uint64_t wqp_id)
 911  {
 912  	struct lt_elem *elem;
 913  
 914  	elem = ltable_get_elem(&g_prepost_table, wqp_id);
 915  	return (struct wq_prepost *)elem;
 916  }
 917  
 918  static void
 919  wq_prepost_put(struct wq_prepost *wqp)
 920  {
 921  	ltable_put_elem(&g_prepost_table, (struct lt_elem *)wqp);
 922  }
 923  
 924  static int
 925  wq_prepost_rlink(struct wq_prepost *parent, struct wq_prepost *child)
 926  {
 927  	return lt_elem_list_link(&g_prepost_table, &parent->wqte, &child->wqte);
 928  }
 929  
 930  static struct wq_prepost *
 931  wq_prepost_get_rnext(struct wq_prepost *head)
 932  {
 933  	struct lt_elem *elem;
 934  	struct wq_prepost *wqp;
 935  	uint64_t id;
 936  
 937  	elem = lt_elem_list_next(&g_prepost_table, &head->wqte);
 938  	if (!elem) {
 939  		return NULL;
 940  	}
 941  	id = elem->lt_id.id;
 942  	elem = ltable_get_elem(&g_prepost_table, id);
 943  
 944  	if (!elem) {
 945  		return NULL;
 946  	}
 947  	wqp = (struct wq_prepost *)elem;
 948  	if (elem->lt_id.id != id ||
 949  	    wqp_type(wqp) != WQP_POST ||
 950  	    wqp->wqp_post.wqp_next_id != head->wqp_prepostid.id) {
 951  		ltable_put_elem(&g_prepost_table, elem);
 952  		return NULL;
 953  	}
 954  
 955  	return wqp;
 956  }
 957  
 958  static void
 959  wq_prepost_reset_rnext(struct wq_prepost *wqp)
 960  {
 961  	(void)lt_elem_list_break(&g_prepost_table, &wqp->wqte);
 962  }
 963  
 964  
 965  /**
 966   * remove 'wqp' from the prepost list on 'wqset'
 967   *
 968   * Conditions:
 969   *	wqset is locked
 970   *	caller holds a reference on wqp (and is responsible to release it)
 971   *
 972   * Result:
 973   *	wqp is invalidated, wqset is potentially updated with a new
 974   *	prepost ID, and the next element of the prepost list may be
 975   *	consumed as well (if the list contained only 2 objects)
 976   */
 977  static int
 978  wq_prepost_remove(struct waitq_set *wqset,
 979      struct wq_prepost *wqp)
 980  {
 981  	int more_posts = 1;
 982  	uint64_t next_id = wqp->wqp_post.wqp_next_id;
 983  	uint64_t wqp_id = wqp->wqp_prepostid.id;
 984  	struct wq_prepost *prev_wqp, *next_wqp;
 985  
 986  	assert(wqp_type(wqp) == WQP_POST);
 987  	assert(wqset->wqset_q.waitq_prepost == 1);
 988  
 989  	if (next_id == wqp_id) {
 990  		/* the list is singular and becoming empty */
 991  		wqset->wqset_prepost_id = 0;
 992  		more_posts = 0;
 993  		goto out;
 994  	}
 995  
 996  	prev_wqp = wq_prepost_get_rnext(wqp);
 997  	assert(prev_wqp != NULL);
 998  	assert(prev_wqp->wqp_post.wqp_next_id == wqp_id);
 999  	assert(prev_wqp->wqp_prepostid.id != wqp_id);
1000  	assert(wqp_type(prev_wqp) == WQP_POST);
1001  
1002  	if (prev_wqp->wqp_prepostid.id == next_id) {
1003  		/*
1004  		 * There are two items in the list, and we're removing one. We
1005  		 * only need to keep the WQP_WQ pointer from 'prev_wqp'
1006  		 */
1007  		wqset->wqset_prepost_id = prev_wqp->wqp_post.wqp_wq_id;
1008  		wq_prepost_invalidate(prev_wqp);
1009  		wq_prepost_put(prev_wqp);
1010  		more_posts = 0;
1011  		goto out;
1012  	}
1013  
1014  	/* prev->next = next */
1015  	prev_wqp->wqp_post.wqp_next_id = next_id;
1016  
1017  	/* next->prev = prev */
1018  	next_wqp = wq_prepost_get(next_id);
1019  	assert(next_wqp != NULL);
1020  	assert(next_wqp != wqp);
1021  	assert(next_wqp != prev_wqp);
1022  	assert(wqp_type(next_wqp) == WQP_POST);
1023  
1024  	wq_prepost_reset_rnext(next_wqp);
1025  	wq_prepost_rlink(next_wqp, prev_wqp);
1026  
1027  	/* If we remove the head of the list, update the wqset */
1028  	if (wqp_id == wqset->wqset_prepost_id) {
1029  		wqset->wqset_prepost_id = next_id;
1030  	}
1031  
1032  	wq_prepost_put(prev_wqp);
1033  	wq_prepost_put(next_wqp);
1034  
1035  out:
1036  	wq_prepost_reset_rnext(wqp);
1037  	wq_prepost_invalidate(wqp);
1038  	return more_posts;
1039  }
1040  
1041  static struct wq_prepost *
1042  wq_prepost_rfirst(uint64_t id)
1043  {
1044  	struct lt_elem *elem;
1045  	elem = lt_elem_list_first(&g_prepost_table, id);
1046  	wqp_do_alloc_stats(elem);
1047  	return (struct wq_prepost *)(void *)elem;
1048  }
1049  
1050  static struct wq_prepost *
1051  wq_prepost_rpop(uint64_t *id, int type)
1052  {
1053  	struct lt_elem *elem;
1054  	elem = lt_elem_list_pop(&g_prepost_table, id, type);
1055  	wqp_do_alloc_stats(elem);
1056  	return (struct wq_prepost *)(void *)elem;
1057  }
1058  
1059  static void
1060  wq_prepost_release_rlist(struct wq_prepost *wqp)
1061  {
1062  	int nelem = 0;
1063  	struct wqp_cache *cache;
1064  	struct lt_elem *elem;
1065  
1066  	if (!wqp) {
1067  		return;
1068  	}
1069  
1070  	elem = &wqp->wqte;
1071  
1072  #ifndef __DARLING__
1073  	/*
1074  	 * These are reserved elements: release them back to the per-cpu pool
1075  	 * if our cache is running low.
1076  	 */
1077  	disable_preemption();
1078  	cache = PERCPU_GET(wqp_cache);
1079  	if (cache->avail < WQP_CACHE_MAX) {
1080  		struct lt_elem *tmp = NULL;
1081  		if (cache->head != LT_IDX_MAX) {
1082  			tmp = lt_elem_list_first(&g_prepost_table, cache->head);
1083  		}
1084  		nelem = lt_elem_list_link(&g_prepost_table, elem, tmp);
1085  		cache->head = elem->lt_id.id;
1086  		cache->avail += nelem;
1087  		enable_preemption();
1088  		return;
1089  	}
1090  	enable_preemption();
1091  #endif // __DARLING__
1092  
1093  	/* release these elements back to the main table */
1094  	nelem = lt_elem_list_release(&g_prepost_table, elem, LT_RESERVED);
1095  
1096  #if CONFIG_WAITQ_STATS
1097  	g_prepost_table.nreserved_releases += 1;
1098  	OSDecrementAtomic64(&g_prepost_table.nreservations);
1099  #endif
1100  }
1101  
1102  typedef int (*wqp_callback_func)(struct waitq_set *wqset,
1103      void *ctx,
1104      struct wq_prepost *wqp,
1105      struct waitq *waitq);
1106  
1107  /**
1108   * iterate over a chain of preposts associated with a waitq set.
1109   *
1110   * Conditions:
1111   *	wqset is locked
1112   *
1113   * Notes:
1114   *	This loop performs automatic prepost chain management / culling, and
1115   *	may reset or adjust the waitq set's prepost ID pointer. If you don't
1116   *	want this extra processing, you can use wq_prepost_iterate().
1117   */
1118  static int
1119  wq_prepost_foreach_locked(struct waitq_set *wqset,
1120      void *ctx, wqp_callback_func cb)
1121  {
1122  	int ret = WQ_ITERATE_SUCCESS;
1123  	struct wq_prepost *wqp, *tmp_wqp;
1124  
1125  	assert(cb != NULL);
1126  
1127  	if (!wqset || !waitq_set_maybe_preposted(wqset)) {
1128  		return WQ_ITERATE_SUCCESS;
1129  	}
1130  
1131  restart:
1132  	wqp = wq_prepost_get(wqset->wqset_prepost_id);
1133  	if (!wqp) {
1134  		/*
1135  		 * The prepost object is no longer valid, reset the waitq
1136  		 * set's prepost id.
1137  		 */
1138  		wqset->wqset_prepost_id = 0;
1139  		return WQ_ITERATE_SUCCESS;
1140  	}
1141  
1142  	if (wqp_type(wqp) == WQP_WQ) {
1143  		uint64_t __assert_only wqp_id = wqp->wqp_prepostid.id;
1144  
1145  		ret = cb(wqset, ctx, wqp, wqp->wqp_wq.wqp_wq_ptr);
1146  
1147  		switch (ret) {
1148  		case WQ_ITERATE_INVALIDATE_CONTINUE:
1149  			/* the caller wants to remove the only prepost here */
1150  			assert(wqp_id == wqset->wqset_prepost_id);
1151  			wqset->wqset_prepost_id = 0;
1152  			OS_FALLTHROUGH;
1153  		case WQ_ITERATE_CONTINUE:
1154  			wq_prepost_put(wqp);
1155  			ret = WQ_ITERATE_SUCCESS;
1156  			break;
1157  		case WQ_ITERATE_RESTART:
1158  			wq_prepost_put(wqp);
1159  			OS_FALLTHROUGH;
1160  		case WQ_ITERATE_DROPPED:
1161  			goto restart;
1162  		default:
1163  			wq_prepost_put(wqp);
1164  			break;
1165  		}
1166  		return ret;
1167  	}
1168  
1169  	assert(wqp->wqp_prepostid.id == wqset->wqset_prepost_id);
1170  	assert(wqp_type(wqp) == WQP_POST);
1171  
1172  	/*
1173  	 * At this point we know we have a list of POST objects.
1174  	 * Grab a handle to the last element in the list and start
1175  	 * the iteration.
1176  	 */
1177  	tmp_wqp = wq_prepost_get_rnext(wqp);
1178  	assert(tmp_wqp != NULL && wqp_type(tmp_wqp) == WQP_POST);
1179  
1180  	uint64_t last_id = tmp_wqp->wqp_prepostid.id;
1181  	wq_prepost_put(tmp_wqp);
1182  
1183  	ret = WQ_ITERATE_SUCCESS;
1184  	for (;;) {
1185  		uint64_t wqp_id, first_id, next_id;
1186  
1187  		wqp_id = wqp->wqp_prepostid.id;
1188  		first_id = wqset->wqset_prepost_id;
1189  		next_id = wqp->wqp_post.wqp_next_id;
1190  
1191  		/* grab the WQP_WQ object this _POST points to */
1192  		tmp_wqp = wq_prepost_get(wqp->wqp_post.wqp_wq_id);
1193  		if (!tmp_wqp) {
1194  			/*
1195  			 * This WQP_POST object points to an invalid
1196  			 * WQP_WQ object - remove the POST object from
1197  			 * the list.
1198  			 */
1199  			if (wq_prepost_remove(wqset, wqp) == 0) {
1200  				wq_prepost_put(wqp);
1201  				goto restart;
1202  			}
1203  			goto next_prepost;
1204  		}
1205  		assert(wqp_type(tmp_wqp) == WQP_WQ);
1206  		/*
1207  		 * make the callback: note that this could remove 'wqp' or
1208  		 * drop the lock on our waitq set. We need to re-validate
1209  		 * our state when this function returns.
1210  		 */
1211  		ret = cb(wqset, ctx, wqp, tmp_wqp->wqp_wq.wqp_wq_ptr);
1212  		wq_prepost_put(tmp_wqp);
1213  
1214  		switch (ret) {
1215  		case WQ_ITERATE_CONTINUE:
1216  			/* continue iteration */
1217  			break;
1218  		case WQ_ITERATE_INVALIDATE_CONTINUE:
1219  			assert(next_id == wqp->wqp_post.wqp_next_id);
1220  			if (wq_prepost_remove(wqset, wqp) == 0) {
1221  				wq_prepost_put(wqp);
1222  				goto restart;
1223  			}
1224  			goto next_prepost;
1225  		case WQ_ITERATE_RESTART:
1226  			wq_prepost_put(wqp);
1227  			OS_FALLTHROUGH;
1228  		case WQ_ITERATE_DROPPED:
1229  			/* the callback dropped the ref to wqp: just restart */
1230  			goto restart;
1231  		default:
1232  			/* break out of the iteration for some other reason */
1233  			goto finish_prepost_foreach;
1234  		}
1235  
1236  		/*
1237  		 * the set lock may have been dropped during callback,
1238  		 * if something looks different, restart the prepost iteration
1239  		 */
1240  		if (!wqp_is_valid(wqp) ||
1241  		    (wqp->wqp_post.wqp_next_id != next_id) ||
1242  		    wqset->wqset_prepost_id != first_id) {
1243  			wq_prepost_put(wqp);
1244  			goto restart;
1245  		}
1246  
1247  next_prepost:
1248  		/* this was the last object in the list */
1249  		if (wqp_id == last_id) {
1250  			break;
1251  		}
1252  
1253  		/* get the next object */
1254  		tmp_wqp = wq_prepost_get(next_id);
1255  		if (!tmp_wqp) {
1256  			/*
1257  			 * At this point we've already checked our state
1258  			 * after the callback (which may have dropped the set
1259  			 * lock). If we find an invalid member of the list
1260  			 * then something is wrong.
1261  			 */
1262  			panic("Invalid WQP_POST member 0x%llx in waitq set "
1263  			    "0x%llx prepost list (first:%llx, "
1264  			    "wqp:%p)",
1265  			    next_id, wqset->wqset_id, first_id, wqp);
1266  		}
1267  		wq_prepost_put(wqp);
1268  		wqp = tmp_wqp;
1269  
1270  		assert(wqp_type(wqp) == WQP_POST);
1271  	}
1272  
1273  finish_prepost_foreach:
1274  	wq_prepost_put(wqp);
1275  	if (ret == WQ_ITERATE_CONTINUE) {
1276  		ret = WQ_ITERATE_SUCCESS;
1277  	}
1278  
1279  	return ret;
1280  }
1281  
1282  /**
1283   * Perform a simple loop over a chain of prepost objects
1284   *
1285   * Conditions:
1286   *	If 'prepost_id' is associated with a waitq (set) then that object must
1287   *	be locked before calling this function.
1288   *	Callback function, 'cb', must be able to handle a NULL wqset pointer
1289   *	and a NULL waitq pointer!
1290   *
1291   * Notes:
1292   *	This prepost chain iteration will _not_ automatically adjust any chain
1293   *	element or linkage. This is the responsibility of the caller! If you
1294   *	want automatic prepost chain management (at a cost of extra CPU time),
1295   *	you can use: wq_prepost_foreach_locked().
1296   */
1297  static int
1298  wq_prepost_iterate(uint64_t prepost_id,
1299      void *ctx, wqp_callback_func cb)
1300  {
1301  	int ret;
1302  	struct wq_prepost *wqp;
1303  
1304  	if (!prepost_id) {
1305  		return WQ_ITERATE_SUCCESS;
1306  	}
1307  
1308  	wqp = wq_prepost_get(prepost_id);
1309  	if (!wqp) {
1310  		return WQ_ITERATE_SUCCESS;
1311  	}
1312  
1313  	if (wqp_type(wqp) == WQP_WQ) {
1314  		ret = WQ_ITERATE_SUCCESS;
1315  		if (cb) {
1316  			ret = cb(NULL, ctx, wqp, wqp->wqp_wq.wqp_wq_ptr);
1317  		}
1318  
1319  		if (ret != WQ_ITERATE_DROPPED) {
1320  			wq_prepost_put(wqp);
1321  		}
1322  		return ret;
1323  	}
1324  
1325  	assert(wqp->wqp_prepostid.id == prepost_id);
1326  	assert(wqp_type(wqp) == WQP_POST);
1327  
1328  	/* at this point we know we have a list of POST objects */
1329  	uint64_t next_id;
1330  
1331  	ret = WQ_ITERATE_CONTINUE;
1332  	do {
1333  		struct wq_prepost *tmp_wqp;
1334  		struct waitq *wq = NULL;
1335  
1336  		next_id = wqp->wqp_post.wqp_next_id;
1337  
1338  		/* grab the WQP_WQ object this _POST points to */
1339  		tmp_wqp = wq_prepost_get(wqp->wqp_post.wqp_wq_id);
1340  		if (tmp_wqp) {
1341  			assert(wqp_type(tmp_wqp) == WQP_WQ);
1342  			wq = tmp_wqp->wqp_wq.wqp_wq_ptr;
1343  		}
1344  
1345  		if (cb) {
1346  			ret = cb(NULL, ctx, wqp, wq);
1347  		}
1348  		if (tmp_wqp) {
1349  			wq_prepost_put(tmp_wqp);
1350  		}
1351  
1352  		if (ret != WQ_ITERATE_CONTINUE) {
1353  			break;
1354  		}
1355  
1356  		tmp_wqp = wq_prepost_get(next_id);
1357  		if (!tmp_wqp) {
1358  			/*
1359  			 * the chain is broken: nothing we can do here besides
1360  			 * bail from the iteration.
1361  			 */
1362  			ret = WQ_ITERATE_ABORTED;
1363  			break;
1364  		}
1365  
1366  		wq_prepost_put(wqp);
1367  		wqp = tmp_wqp;
1368  
1369  		assert(wqp_type(wqp) == WQP_POST);
1370  	} while (next_id != prepost_id);
1371  
1372  	if (ret != WQ_ITERATE_DROPPED) {
1373  		wq_prepost_put(wqp);
1374  	}
1375  
1376  	if (ret == WQ_ITERATE_CONTINUE) {
1377  		ret = WQ_ITERATE_SUCCESS;
1378  	}
1379  	return ret;
1380  }
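       /*
        * A minimal wq_prepost_iterate() callback sketch (illustrative only,
        * kept out of the build). Per the contract above it must tolerate a
        * NULL wqset and a NULL waitq; 'dump_prepost_cb' is a hypothetical
        * name, not part of this file.
        */
       #if 0
       static int
       dump_prepost_cb(struct waitq_set *wqset, void *ctx,
           struct wq_prepost *wqp, struct waitq *waitq)
       {
       	(void)wqset;
       	(void)ctx;
       	wqdbg("prepost 0x%llx (waitq %s)", wqp->wqp_prepostid.id,
       	    waitq ? "valid" : "NULL");
       	return WQ_ITERATE_CONTINUE;
       }
       #endif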
1381  
1382  
1383  struct _is_posted_ctx {
1384  	struct waitq *posting_wq;
1385  	int did_prepost;
1386  };
1387  
1388  static int
1389  wq_is_preposted_on_set_cb(struct waitq_set *wqset, void *ctx,
1390      struct wq_prepost *wqp, struct waitq *waitq)
1391  {
1392  	struct _is_posted_ctx *pctx = (struct _is_posted_ctx *)ctx;
1393  
1394  	(void)wqset;
1395  	(void)wqp;
1396  
1397  	/*
 1398  	 * Don't early-out; run through the _entire_ list.
 1399  	 * This lets the iteration cull invalid elements, keeping their
 1400  	 * number to a minimum.
1400  	 */
1401  	if (pctx->posting_wq == waitq) {
1402  		pctx->did_prepost = 1;
1403  	}
1404  
1405  	return WQ_ITERATE_CONTINUE;
1406  }
1407  
1408  
1409  /**
1410   * checks if 'waitq' has already preposted on 'wqset'
1411   *
1412   * Parameters:
1413   *	waitq    The waitq that's preposting
1414   *	wqset    The set onto which waitq may be preposted
1415   *
1416   * Conditions:
1417   *	both waitq and wqset are locked
1418   *
1419   * Returns non-zero if 'waitq' has already preposted to 'wqset'
1420   */
1421  static int
1422  wq_is_preposted_on_set(struct waitq *waitq, struct waitq_set *wqset)
1423  {
1424  	int ret;
1425  	struct _is_posted_ctx pctx;
1426  
1427  	/*
1428  	 * If the set's only prepost matches the waitq's prepost ID,
1429  	 * then it obviously already preposted to the set.
1430  	 */
1431  	if (waitq->waitq_prepost_id != 0 &&
1432  	    wqset->wqset_prepost_id == waitq->waitq_prepost_id) {
1433  		return 1;
1434  	}
1435  
1436  	/* use full prepost iteration: always trim the list */
1437  	pctx.posting_wq = waitq;
1438  	pctx.did_prepost = 0;
1439  	ret = wq_prepost_foreach_locked(wqset, (void *)&pctx,
1440  	    wq_is_preposted_on_set_cb);
1441  	return pctx.did_prepost;
1442  }
1443  
1444  static struct wq_prepost *
1445  wq_get_prepost_obj(uint64_t *reserved, int type)
1446  {
1447  	struct wq_prepost *wqp = NULL;
1448  	/*
 1449  	 * Don't fail just because the caller doesn't have enough
 1450  	 * reservations; we've kept a low-water mark on the prepost table,
 1451  	 * so there should be some elements available for us.
1452  	 */
1453  	if (reserved && *reserved) {
1454  		wqp = wq_prepost_rpop(reserved, type);
1455  		assert(wqp->wqte.lt_id.idx < g_prepost_table.nelem);
1456  	} else {
1457  		/*
1458  		 * TODO: if in interrupt context, grab from a special
1459  		 *       region / reserved list!
1460  		 */
1461  		wqp = wq_prepost_alloc(type, 1);
1462  	}
1463  
1464  	if (wqp == NULL) {
1465  		panic("Couldn't allocate prepost object!");
1466  	}
1467  	return wqp;
1468  }
1469  
1470  
1471  /**
1472   * prepost a waitq onto a waitq set
1473   *
1474   * Parameters:
1475   *	wqset    The set onto which waitq will be preposted
1476   *	waitq    The waitq that's preposting
1477   *	reserved List (lt_elem_list_ style) of pre-allocated prepost elements
1478   *	         Could be NULL
1479   *
1480   * Conditions:
1481   *	both wqset and waitq are locked
1482   *
1483   * Notes:
1484   *	If reserved is NULL, this may block on prepost table growth.
1485   */
1486  static void
1487  wq_prepost_do_post_locked(struct waitq_set *wqset,
1488      struct waitq *waitq,
1489      uint64_t *reserved)
1490  {
1491  	struct wq_prepost *wqp_post, *wqp_head, *wqp_tail;
1492  
1493  	assert(waitq_held(waitq) && waitq_held(&wqset->wqset_q));
1494  
1495  	/*
1496  	 * nothing to do if it's already preposted:
1497  	 * note that this also culls any invalid prepost objects
1498  	 */
1499  	if (wq_is_preposted_on_set(waitq, wqset)) {
1500  		return;
1501  	}
1502  
1503  	assert(waitqs_is_linked(wqset));
1504  
1505  	/*
1506  	 * This function is called because an event is being posted to 'waitq'.
1507  	 * We need a prepost object associated with this queue. Allocate one
1508  	 * now if the waitq isn't already associated with one.
1509  	 */
1510  	if (waitq->waitq_prepost_id == 0) {
1511  		struct wq_prepost *wqp;
1512  		wqp = wq_get_prepost_obj(reserved, WQP_WQ);
1513  		wqp->wqp_wq.wqp_wq_ptr = waitq;
1514  		wqp_set_valid(wqp);
1515  		waitq->waitq_prepost_id = wqp->wqp_prepostid.id;
1516  		wq_prepost_put(wqp);
1517  	}
1518  
1519  #if CONFIG_LTABLE_STATS
1520  	g_prepost_table.npreposts += 1;
1521  #endif
1522  
1523  	wqdbg_v("preposting waitq %p (0x%llx) to set 0x%llx",
1524  	    (void *)VM_KERNEL_UNSLIDE_OR_PERM(waitq),
1525  	    waitq->waitq_prepost_id, wqset->wqset_id);
1526  
1527  	if (wqset->wqset_prepost_id == 0) {
1528  		/* the set has no previous preposts */
1529  		wqset->wqset_prepost_id = waitq->waitq_prepost_id;
1530  		return;
1531  	}
1532  
1533  	wqp_head = wq_prepost_get(wqset->wqset_prepost_id);
1534  	if (!wqp_head) {
1535  		/* the previous prepost has become invalid */
1536  		wqset->wqset_prepost_id = waitq->waitq_prepost_id;
1537  		return;
1538  	}
1539  
1540  	assert(wqp_head->wqp_prepostid.id == wqset->wqset_prepost_id);
1541  
1542  	/*
1543  	 * If we get here, we're going to need at least one new wq_prepost
1544  	 * object. If the previous wqset_prepost_id points to a WQP_WQ, we
1545  	 * actually need to allocate 2 wq_prepost objects because the WQP_WQ
1546  	 * is tied to the waitq and shared across all sets.
1547  	 */
1548  	wqp_post = wq_get_prepost_obj(reserved, WQP_POST);
1549  
1550  	wqp_post->wqp_post.wqp_wq_id = waitq->waitq_prepost_id;
1551  	wqdbg_v("POST 0x%llx :: WQ 0x%llx", wqp_post->wqp_prepostid.id,
1552  	    waitq->waitq_prepost_id);
1553  
1554  	if (wqp_type(wqp_head) == WQP_WQ) {
1555  		/*
1556  		 * We must replace the wqset_prepost_id with a pointer
1557  		 * to two new WQP_POST objects
1558  		 */
1559  		uint64_t wqp_id = wqp_head->wqp_prepostid.id;
1560  		wqdbg_v("set 0x%llx previous had 1 WQ prepost (0x%llx): "
1561  		    "replacing with two POST preposts",
1562  		    wqset->wqset_id, wqp_id);
1563  
1564  		/* drop the old reference */
1565  		wq_prepost_put(wqp_head);
1566  
1567  		/* grab another new object (the 2nd of two) */
1568  		wqp_head = wq_get_prepost_obj(reserved, WQP_POST);
1569  
1570  		/* point this one to the original WQP_WQ object */
1571  		wqp_head->wqp_post.wqp_wq_id = wqp_id;
1572  		wqdbg_v("POST 0x%llx :: WQ 0x%llx",
1573  		    wqp_head->wqp_prepostid.id, wqp_id);
1574  
1575  		/* link it to the new wqp_post object allocated earlier */
1576  		wqp_head->wqp_post.wqp_next_id = wqp_post->wqp_prepostid.id;
 1577  		/* make the list doubly-linked and circular */
1578  		wq_prepost_rlink(wqp_head, wqp_post);
1579  
1580  		/*
1581  		 * Finish setting up the new prepost: point it back to the
1582  		 * POST object we allocated to replace the original wqset
1583  		 * WQ prepost object
1584  		 */
1585  		wqp_post->wqp_post.wqp_next_id = wqp_head->wqp_prepostid.id;
1586  		wq_prepost_rlink(wqp_post, wqp_head);
1587  
1588  		/* mark objects valid, and reset the wqset prepost list head */
1589  		wqp_set_valid(wqp_head);
1590  		wqp_set_valid(wqp_post);
1591  		wqset->wqset_prepost_id = wqp_head->wqp_prepostid.id;
1592  
1593  		/* release both references */
1594  		wq_prepost_put(wqp_head);
1595  		wq_prepost_put(wqp_post);
1596  
1597  		wqdbg_v("set 0x%llx: 0x%llx/0x%llx -> 0x%llx/0x%llx -> 0x%llx",
1598  		    wqset->wqset_id, wqset->wqset_prepost_id,
1599  		    wqp_head->wqp_prepostid.id, wqp_head->wqp_post.wqp_next_id,
1600  		    wqp_post->wqp_prepostid.id,
1601  		    wqp_post->wqp_post.wqp_next_id);
1602  		return;
1603  	}
1604  
1605  	assert(wqp_type(wqp_head) == WQP_POST);
1606  
1607  	/*
1608  	 * Add the new prepost to the end of the prepost list
1609  	 */
1610  	wqp_tail = wq_prepost_get_rnext(wqp_head);
1611  	assert(wqp_tail != NULL);
1612  	assert(wqp_tail->wqp_post.wqp_next_id == wqset->wqset_prepost_id);
1613  
1614  	/*
1615  	 * link the head to the new tail
1616  	 * NOTE: this needs to happen first in case wqp_tail == wqp_head
1617  	 */
1618  	wq_prepost_reset_rnext(wqp_head);
1619  	wq_prepost_rlink(wqp_head, wqp_post);
1620  
1621  	/* point the new object to the list head, and list tail */
1622  	wqp_post->wqp_post.wqp_next_id = wqp_head->wqp_prepostid.id;
1623  	wq_prepost_rlink(wqp_post, wqp_tail);
1624  
1625  	/* point the last item in the waitq set's list to the new object */
1626  	wqp_tail->wqp_post.wqp_next_id = wqp_post->wqp_prepostid.id;
1627  
1628  	wqp_set_valid(wqp_post);
1629  
1630  	wq_prepost_put(wqp_head);
1631  	wq_prepost_put(wqp_tail);
1632  	wq_prepost_put(wqp_post);
1633  
1634  	wqdbg_v("set 0x%llx (wqp:0x%llx) last_prepost:0x%llx, "
1635  	    "new_prepost:0x%llx->0x%llx", wqset->wqset_id,
1636  	    wqset->wqset_prepost_id, wqp_head->wqp_prepostid.id,
1637  	    wqp_post->wqp_prepostid.id, wqp_post->wqp_post.wqp_next_id);
1638  
1639  	return;
1640  }
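       /*
        * Resulting list shapes, in sketch form: a set's first prepost is a
        * bare WQP_WQ reference; posting a second waitq upgrades it into a
        * circular list of WQP_POST objects, each pointing at a (shared)
        * WQP_WQ:
        *
        *   wqset_prepost_id -> POST(a) -> POST(b) -> ... -> back to POST(a)
        *                         |          |
        *                       WQ(a)      WQ(b)
        */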
1641  
1642  
1643  /* ----------------------------------------------------------------------
1644   *
1645   * Stats collection / reporting
1646   *
1647   * ---------------------------------------------------------------------- */
1648  #if CONFIG_LTABLE_STATS && CONFIG_WAITQ_STATS
1649  static void
1650  wq_table_stats(struct link_table *table, struct wq_table_stats *stats)
1651  {
1652  	stats->version = WAITQ_STATS_VERSION;
1653  	stats->table_elements = table->nelem;
1654  	stats->table_used_elems = table->used_elem;
1655  	stats->table_elem_sz = table->elem_sz;
1656  	stats->table_slabs = table->nslabs;
1657  	stats->table_slab_sz = table->slab_sz;
1658  
1659  	stats->table_num_allocs = table->nallocs;
1660  	stats->table_num_preposts = table->npreposts;
1661  	stats->table_num_reservations = table->nreservations;
1662  
1663  	stats->table_max_used = table->max_used;
1664  	stats->table_avg_used = table->avg_used;
1665  	stats->table_max_reservations = table->max_reservations;
1666  	stats->table_avg_reservations = table->avg_reservations;
1667  }
1668  
1669  void
1670  waitq_link_stats(struct wq_table_stats *stats)
1671  {
1672  	if (!stats) {
1673  		return;
1674  	}
1675  	wq_table_stats(&g_wqlinktable, stats);
1676  }
1677  
1678  void
1679  waitq_prepost_stats(struct wq_table_stats *stats)
1680  {
1681  	wq_table_stats(&g_prepost_table, stats);
1682  }
1683  #endif
1684  
1685  
1686  /* ----------------------------------------------------------------------
1687   *
1688   * Global Wait Queues
1689   *
1690   * ---------------------------------------------------------------------- */
1691  
1692  static struct waitq g_boot_waitq;
1693  static struct waitq *global_waitqs = &g_boot_waitq;
1694  static uint32_t g_num_waitqs = 1;
1695  
1696  /*
1697   * Zero out the used MSBs of the event.
1698   */
1699  #define _CAST_TO_EVENT_MASK(event)   ((uintptr_t)(event) & ((1ul << _EVENT_MASK_BITS) - 1ul))
1700  
1701  static __inline__ uint32_t
1702  waitq_hash(char *key, size_t length)
1703  {
1704  	uint32_t hash = os_hash_jenkins(key, length);
1705  
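       	/* assumes g_num_waitqs is a power of two (bootstrap rounds it up),
       	 * so the mask below is a cheap modulo */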
1706  	hash &= (g_num_waitqs - 1);
1707  	return hash;
1708  }
1709  
1710  /* return a global waitq pointer corresponding to the given event */
1711  struct waitq *
1712  _global_eventq(char *event, size_t event_length)
1713  {
1714  	return &global_waitqs[waitq_hash(event, event_length)];
1715  }
1716  
1717  /* return an indexed global waitq pointer */
1718  struct waitq *
1719  global_waitq(int index)
1720  {
1721  	return &global_waitqs[index % g_num_waitqs];
1722  }
1723  
1724  
1725  #if CONFIG_LTABLE_STATS || CONFIG_WAITQ_STATS
1726  /* this global is for lldb */
1727  const uint32_t g_nwaitq_btframes = NWAITQ_BTFRAMES;
1728  
1729  static __inline__ void
1730  waitq_grab_backtrace(uintptr_t bt[NWAITQ_BTFRAMES], int skip)
1731  {
 1732  	if (skip < 0) {
 1733  		skip = 0;
 1734  	}
 1735  	uintptr_t buf[NWAITQ_BTFRAMES + skip];
1736  	memset(buf, 0, (NWAITQ_BTFRAMES + skip) * sizeof(uintptr_t));
1737  	backtrace(buf, g_nwaitq_btframes + skip, NULL);
1738  	memcpy(&bt[0], &buf[skip], NWAITQ_BTFRAMES * sizeof(uintptr_t));
1739  }
1740  #else /* no stats */
1741  #define waitq_grab_backtrace(...)
1742  #endif
1743  
1744  #if CONFIG_WAITQ_STATS
1745  
1746  struct wq_stats g_boot_stats;
1747  struct wq_stats *g_waitq_stats = &g_boot_stats;
1748  
1749  static __inline__ struct wq_stats *
1750  waitq_global_stats(struct waitq *waitq)
1751  {
1752  	struct wq_stats *wqs;
1753  	uint32_t idx;
1754  
1755  	if (!waitq_is_global(waitq)) {
1756  		return NULL;
1757  	}
1758  
1759  	idx = (uint32_t)(((uintptr_t)waitq - (uintptr_t)global_waitqs) / sizeof(*waitq));
1760  	assert(idx < g_num_waitqs);
1761  	wqs = &g_waitq_stats[idx];
1762  	return wqs;
1763  }
1764  
1765  static __inline__ void
1766  waitq_stats_count_wait(struct waitq *waitq)
1767  {
1768  	struct wq_stats *wqs = waitq_global_stats(waitq);
1769  	if (wqs != NULL) {
1770  		wqs->waits++;
1771  		waitq_grab_backtrace(wqs->last_wait, 2);
1772  	}
1773  }
1774  
1775  static __inline__ void
1776  waitq_stats_count_wakeup(struct waitq *waitq)
1777  {
1778  	struct wq_stats *wqs = waitq_global_stats(waitq);
1779  	if (wqs != NULL) {
1780  		wqs->wakeups++;
1781  		waitq_grab_backtrace(wqs->last_wakeup, 2);
1782  	}
1783  }
1784  
1785  static __inline__ void
1786  waitq_stats_count_clear_wakeup(struct waitq *waitq)
1787  {
1788  	struct wq_stats *wqs = waitq_global_stats(waitq);
1789  	if (wqs != NULL) {
1790  		wqs->wakeups++;
1791  		wqs->clears++;
1792  		waitq_grab_backtrace(wqs->last_wakeup, 2);
1793  	}
1794  }
1795  
1796  static __inline__ void
1797  waitq_stats_count_fail(struct waitq *waitq)
1798  {
1799  	struct wq_stats *wqs = waitq_global_stats(waitq);
1800  	if (wqs != NULL) {
1801  		wqs->failed_wakeups++;
1802  		waitq_grab_backtrace(wqs->last_failed_wakeup, 2);
1803  	}
1804  }
1805  #else /* !CONFIG_WAITQ_STATS */
1806  #define waitq_stats_count_wait(q)         do { } while (0)
1807  #define waitq_stats_count_wakeup(q)       do { } while (0)
1808  #define waitq_stats_count_clear_wakeup(q) do { } while (0)
1809  #define waitq_stats_count_fail(q)         do { } while (0)
1810  #endif
1811  
1812  int
1813  waitq_is_valid(struct waitq *waitq)
1814  {
1815  	return (waitq != NULL) && waitq->waitq_isvalid;
1816  }
1817  
1818  int
1819  waitq_set_is_valid(struct waitq_set *wqset)
1820  {
1821  	return (wqset != NULL) && wqset->wqset_q.waitq_isvalid && waitqs_is_set(wqset);
1822  }
1823  
1824  int
1825  waitq_is_global(struct waitq *waitq)
1826  {
1827  	if (waitq >= global_waitqs && waitq < global_waitqs + g_num_waitqs) {
1828  		return 1;
1829  	}
1830  	return 0;
1831  }
1832  
1833  int
1834  waitq_irq_safe(struct waitq *waitq)
1835  {
1836  	/* global wait queues have this bit set on initialization */
1837  	return waitq->waitq_irq;
1838  }
1839  
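/*
 * Return true if 'wq' has no waiters. The check dispatches on the queue
 * flavor: a turnstile waitq keeps waiters in a priority queue, a turnstile
 * proxy defers to its donated turnstile (trivially empty if none has been
 * donated), and everything else uses the regular FIFO/LIFO queue.
 */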
1840  static inline bool
1841  waitq_empty(struct waitq *wq)
1842  {
1843  	if (waitq_is_turnstile_queue(wq)) {
1844  		return priority_queue_empty(&wq->waitq_prio_queue);
1845  	} else if (waitq_is_turnstile_proxy(wq)) {
1846  		struct turnstile *ts = wq->waitq_ts;
1847  		return ts == TURNSTILE_NULL ||
1848  		       priority_queue_empty(&ts->ts_waitq.waitq_prio_queue);
1849  	} else {
1850  		return queue_empty(&wq->waitq_queue);
1851  	}
1852  }
1853  
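/*
 * Return the irq-safe queue that actually holds waiters for 'waitq':
 * a turnstile proxy's waiters live on the donated turnstile's embedded
 * waitq (NULL if no turnstile has been donated yet); any other
 * non-irq-safe waitq hashes to one of the global event queues.
 */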
1854  static struct waitq *
1855  waitq_get_safeq(struct waitq *waitq)
1856  {
1857  	/* Check if it's a port waitq */
1858  	if (waitq_is_turnstile_proxy(waitq)) {
1859  		struct turnstile *ts = waitq->waitq_ts;
1860  		return ts ? &ts->ts_waitq : NULL;
1861  	}
1862  	return global_eventq(waitq);
1863  }
1864  
1865  static uint32_t
1866  waitq_hash_size(void)
1867  {
1868  	uint32_t hsize, queues;
1869  
1870  	if (PE_parse_boot_argn("wqsize", &hsize, sizeof(hsize))) {
1871  		return hsize;
1872  	}
1873  
1874  	queues = thread_max / 5;
1875  	hsize = P2ROUNDUP(queues * sizeof(struct waitq), PAGE_SIZE);
1876  
1877  	return hsize;
1878  }
1879  
1880  /*
1881   * Since the priority ordered waitq uses basepri as the
1882   * ordering key, assert that this value fits in a uint8_t.
1883   */
1884  static_assert(MAXPRI <= UINT8_MAX);
1885  
1886  static inline void
1887  waitq_thread_insert(struct waitq *wq,
1888      thread_t thread, boolean_t fifo)
1889  {
1890  	if (waitq_is_turnstile_queue(wq)) {
1891  		turnstile_stats_update(0, TSU_TURNSTILE_BLOCK_COUNT, NULL);
1892  		turnstile_waitq_add_thread_priority_queue(wq, thread);
1893  	} else {
1894  		turnstile_stats_update(0, TSU_REGULAR_WAITQ_BLOCK_COUNT, NULL);
1895  		if (fifo) {
1896  			enqueue_tail(&wq->waitq_queue, &thread->wait_links);
1897  		} else {
1898  			enqueue_head(&wq->waitq_queue, &thread->wait_links);
1899  		}
1900  	}
1901  }
1902  
1903  static inline void
1904  waitq_thread_remove(struct waitq *wq,
1905      thread_t thread)
1906  {
1907  	if (waitq_is_turnstile_queue(wq)) {
1908  		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
1909  		    (TURNSTILE_CODE(TURNSTILE_HEAP_OPERATIONS, (THREAD_REMOVED_FROM_TURNSTILE_WAITQ))) | DBG_FUNC_NONE,
1910  		    VM_KERNEL_UNSLIDE_OR_PERM(waitq_to_turnstile(wq)),
1911  		    thread_tid(thread),
1912  		    0, 0, 0);
1913  		priority_queue_remove(&wq->waitq_prio_queue, &thread->wait_prioq_links);
1914  	} else {
1915  		remqueue(&(thread->wait_links));
1916  	}
1917  }
1918  
1919  void
1920  waitq_bootstrap(void)
1921  {
1922  	kern_return_t kret;
1923  	uint32_t whsize, qsz, tmp32;
1924  
1925  	g_min_free_table_elem = DEFAULT_MIN_FREE_TABLE_ELEM;
1926  	if (PE_parse_boot_argn("wqt_min_free", &tmp32, sizeof(tmp32)) == TRUE) {
1927  		g_min_free_table_elem = tmp32;
1928  	}
1929  	wqdbg("Minimum free table elements: %d", g_min_free_table_elem);
1930  
1931  	/*
1932  	 * Determine the amount of memory we're willing to reserve for
1933  	 * the waitqueue hash table
1934  	 */
1935  	whsize = waitq_hash_size();
1936  
1937  	/* Determine the number of waitqueues we can fit. */
1938  	qsz = sizeof(struct waitq);
1939  	whsize = ROUNDDOWN(whsize, qsz);
1940  	g_num_waitqs = whsize / qsz;
1941  
1942  	/*
1943  	 * The hash algorithm requires that this be a power of 2, so we
1944  	 * just mask off all the low-order bits.
1945  	 */
1946  	for (uint32_t i = 0; i < 31; i++) {
1947  		uint32_t bit = (1 << i);
1948  		if ((g_num_waitqs & bit) == g_num_waitqs) {
1949  			break;
1950  		}
1951  		g_num_waitqs &= ~bit;
1952  	}
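	/* e.g. room for 1000 waitqs is rounded down to 512, the highest power of two */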
1953  	assert(g_num_waitqs > 0);
1954  
1955  	/* Now determine how much memory we really need. */
1956  	whsize = P2ROUNDUP(g_num_waitqs * qsz, PAGE_SIZE);
1957  
1958  	wqdbg("allocating %d global queues  (%d bytes)", g_num_waitqs, whsize);
1959  	kret = kernel_memory_allocate(kernel_map, (vm_offset_t *)&global_waitqs,
1960  	    whsize, 0, KMA_KOBJECT | KMA_NOPAGEWAIT, VM_KERN_MEMORY_WAITQ);
1961  	if (kret != KERN_SUCCESS || global_waitqs == NULL) {
1962  		panic("kernel_memory_allocate() failed to alloc global_waitqs"
1963  		    ", error: %d, whsize: 0x%x", kret, whsize);
1964  	}
1965  
1966  #if CONFIG_WAITQ_STATS
1967  	whsize = P2ROUNDUP(g_num_waitqs * sizeof(struct wq_stats), PAGE_SIZE);
1968  	kret = kernel_memory_allocate(kernel_map, (vm_offset_t *)&g_waitq_stats,
1969  	    whsize, 0, KMA_KOBJECT | KMA_NOPAGEWAIT, VM_KERN_MEMORY_WAITQ);
1970  	if (kret != KERN_SUCCESS || g_waitq_stats == NULL) {
1971  		panic("kernel_memory_allocate() failed to alloc g_waitq_stats"
1972  		    ", error: %d, whsize: 0x%x", kret, whsize);
1973  	}
1974  	memset(g_waitq_stats, 0, whsize);
1975  #endif
1976  
1977  	for (uint32_t i = 0; i < g_num_waitqs; i++) {
1978  		waitq_init(&global_waitqs[i], SYNC_POLICY_FIFO | SYNC_POLICY_DISABLE_IRQ);
1979  	}
1980  
1981  	/* initialize the global waitq link table */
1982  	wql_init();
1983  
1984  	/* initialize the global waitq prepost table */
1985  	wqp_init();
1986  }
1987  
1988  
1989  /* ----------------------------------------------------------------------
1990   *
1991   * Wait Queue Implementation
1992   *
1993   * ---------------------------------------------------------------------- */
1994  
1995  #ifndef __DARLING__
1996  /*
1997   * Double the standard lock timeout, because wait queues tend
1998   * to iterate over a number of threads - locking each.  If there is
1999   * a problem with a thread lock, it normally times out at the wait
2000   * queue level first, hiding the real problem.
2001   */
2002  /* For x86, the hardware timeout is in TSC units. */
2003  #if defined(__i386__) || defined(__x86_64__)
2004  #define hwLockTimeOut LockTimeOutTSC
2005  #else
2006  #define hwLockTimeOut LockTimeOut
2007  #endif
2008  
2009  void
2010  waitq_lock(struct waitq *wq)
2011  {
2012  	if (__improbable(waitq_lock_to(wq,
2013  	    hwLockTimeOut * 2) == 0)) {
2014  		boolean_t wql_acquired = FALSE;
2015  
2016  		while (machine_timeout_suspended()) {
2017  			mp_enable_preemption();
2018  			wql_acquired = waitq_lock_to(wq,
2019  			    hwLockTimeOut * 2);
2020  			if (wql_acquired) {
2021  				break;
2022  			}
2023  		}
2024  		if (wql_acquired == FALSE) {
2025  			panic("waitq deadlock - waitq=%p, cpu=%d\n",
2026  			    wq, cpu_number());
2027  		}
2028  	}
2029  #if defined(__x86_64__)
2030  	pltrace(FALSE);
2031  #endif
2032  	assert(waitq_held(wq));
2033  }
2034  
2035  void
2036  waitq_unlock(struct waitq *wq)
2037  {
2038  	assert(waitq_held(wq));
2039  #if defined(__x86_64__)
2040  	pltrace(TRUE);
2041  #endif
2042  	waitq_lock_unlock(wq);
2043  }
2044  #endif // __DARLING__
2045  
2046  
2047  /**
2048   * clear the thread-related waitq state
2049   *
2050   * Conditions:
2051   *	'thread' is locked
2052   */
2053  static inline void
2054  thread_clear_waitq_state(thread_t thread)
2055  {
2056  	thread->waitq = NULL;
2057  	thread->wait_event = NO_EVENT64;
2058  	thread->at_safe_point = FALSE;
2059  }
2060  
2061  
2062  typedef thread_t (*waitq_select_cb)(void *ctx, struct waitq *waitq,
2063      int is_global, thread_t thread);
2064  
2065  struct waitq_select_args {
2066  	/* input parameters */
2067  	struct waitq    *posted_waitq;
2068  	struct waitq    *waitq;
2069  	event64_t        event;
2070  	waitq_select_cb  select_cb;
2071  	void            *select_ctx;
2072  	int             priority;
2073  
2074  	uint64_t        *reserved_preposts;
2075  
2076  	/* output parameters */
2077  	queue_t       threadq;
2078  	int           max_threads;
2079  	int          *nthreads;
2080  	spl_t        *spl;
2081  };
2082  
2083  static void do_waitq_select_n_locked(struct waitq_select_args *args);
2084  
2085  /**
2086   * callback invoked once for every waitq set to which a waitq belongs
2087   *
2088   * Conditions:
2089   *	ctx->posted_waitq is locked
2090   *	'link' points to a valid waitq set
2091   *
2092   * Notes:
2093   *	Takes the waitq set lock on the set pointed to by 'link'
2094   *	Calls do_waitq_select_n_locked() which could recurse back into
2095   *	this function if the waitq set is a member of other sets.
2096   *	If no threads were selected, it preposts the input waitq
2097   *	onto the waitq set pointed to by 'link'.
2098   */
2099  static int
2100  waitq_select_walk_cb(struct waitq *waitq, void *ctx,
2101      struct waitq_link *link)
2102  {
2103  	int ret = WQ_ITERATE_CONTINUE;
2104  	struct waitq_select_args args = *((struct waitq_select_args *)ctx);
2105  	struct waitq_set *wqset;
2106  
2107  	(void)waitq;
2108  	assert(wql_type(link) == WQL_WQS);
2109  
2110  	wqset = link->wql_wqs.wql_set;
2111  	args.waitq = &wqset->wqset_q;
2112  
2113  	assert(!waitq_irq_safe(waitq));
2114  	assert(!waitq_irq_safe(&wqset->wqset_q));
2115  
2116  	waitq_set_lock(wqset);
2117  	/*
2118  	 * verify that the link wasn't invalidated just before
2119  	 * we were able to take the lock.
2120  	 */
2121  	if (wqset->wqset_id != link->wql_setid.id) {
2122  		goto out_unlock;
2123  	}
2124  
2125  	assert(waitqs_is_linked(wqset));
2126  
2127  	/*
2128  	 * Find any threads waiting on this wait queue set,
2129  	 * and recurse into any waitq set to which this set belongs.
2130  	 */
2131  	do_waitq_select_n_locked(&args);
2132  
2133  	if (*args.nthreads > 0 || (args.threadq && !queue_empty(args.threadq))) {
2134  		/* at least 1 thread was selected and returned: don't prepost */
2135  		if (args.max_threads > 0 && *args.nthreads >= args.max_threads) {
2136  			/* break out of the setid walk */
2137  			ret = WQ_ITERATE_FOUND;
2138  		}
2139  	} else if (args.event == NO_EVENT64) {
2140  		/*
2141  		 * No thread selected: prepost 'waitq' to 'wqset'
2142  		 * if wqset can handle preposts and the event is set to 0.
2143  		 * We also make sure to not post waitq sets to other sets.
2144  		 *
2145  		 * If the set doesn't support preposts, but does support
2146  		 * prepost callout/hook interaction, invoke the predefined
2147  		 * callout function and pass the set's 'prepost_hook.' This
2148  		 * could potentially release another thread to handle events.
2149  		 */
2150  		if (waitq_set_can_prepost(wqset)) {
2151  			wq_prepost_do_post_locked(
2152  				wqset, waitq, args.reserved_preposts);
2153  		} else if (waitq_set_has_prepost_hook(wqset)) {
2154  			waitq_set_prepost_hook_t *hook = wqset->wqset_prepost_hook;
2155  
2156  			/*
2157  			 * When calling out to the prepost hook,
2158  			 * we drop the waitq lock, to allow for the kevent
2159   * subsystem to call into the waitq subsystem again,
2160  			 * without risking a deadlock.
2161  			 *
2162  			 * However, we need to guard against wqset going away,
2163  			 * so we increment the prepost hook use count
2164  			 * while the lock is dropped.
2165  			 *
2166  			 * This lets waitq_set_deinit() know to wait for the
2167  			 * prepost hook call to be done before it can proceed.
2168  			 *
2169  			 * Note: we need to keep preemption disabled the whole
2170  			 * time as waitq_set_deinit will spin on this.
2171  			 */
2172  
2173  			disable_preemption();
2174  			os_atomic_add(hook, (uint16_t)1, relaxed);
2175  			waitq_set_unlock(wqset);
2176  
2177  			waitq_set__CALLING_PREPOST_HOOK__(hook);
2178  
2179  			/* Note: after this decrement, the wqset may be deallocated */
2180  			os_atomic_add(hook, (uint16_t)-1, relaxed);
2181  			enable_preemption();
2182  			return ret;
2183  		}
2184  	}
2185  
2186  out_unlock:
2187  	waitq_set_unlock(wqset);
2188  	return ret;
2189  }
2190  
2191  /**
2192   * Routine to iterate over the waitq for non-priority ordered waitqs
2193   *
2194   * Conditions:
2195   *	args->waitq (and args->posted_waitq) is locked
2196   *
2197   * Notes:
2198   *	Uses the optional select callback function to refine the selection
2199   *	of one or more threads from a waitq. The select callback is invoked
2200   *	once for every thread that is found to be waiting on the input args->waitq.
2201   *
2202   *	If one or more threads are selected, this may disable interrupts.
2203   *	The previous interrupt state is returned in args->spl and should
2204   *	be used in a call to splx() if threads are returned to the caller.
2205   */
2206  static thread_t
2207  waitq_queue_iterate_locked(struct waitq *safeq, struct waitq *waitq,
2208      spl_t spl, struct waitq_select_args *args,
2209      uint32_t *remaining_eventmask)
2210  {
2211  	int max_threads = args->max_threads;
2212  	int *nthreads = args->nthreads;
2213  	thread_t thread = THREAD_NULL;
2214  	thread_t first_thread = THREAD_NULL;
2215  
2216  	qe_foreach_element_safe(thread, &safeq->waitq_queue, wait_links) {
2217  		thread_t t = THREAD_NULL;
2218  		assert_thread_magic(thread);
2219  
2220  		/*
2221  		 * For non-priority ordered waitqs, we allow multiple events to be
2222  		 * mux'ed into the same waitq. Also safeqs may contain threads from
2223  		 * multiple waitqs. Only pick threads that match the
2224  		 * requested wait event.
2225  		 */
2226  		if (thread->waitq == waitq && thread->wait_event == args->event) {
2227  			t = thread;
2228  			if (first_thread == THREAD_NULL) {
2229  				first_thread = thread;
2230  			}
2231  
2232  		/* allow the caller to further refine the selection */
2233  			if (args->select_cb) {
2234  				t = args->select_cb(args->select_ctx, waitq,
2235  				    waitq_is_global(waitq), thread);
2236  			}
2237  			if (t != THREAD_NULL) {
2238  				*nthreads += 1;
2239  				if (args->threadq) {
2240  					/* if output queue, add locked thread to it */
2241  					if (*nthreads == 1) {
2242  						*(args->spl) = (safeq != waitq) ? spl : splsched();
2243  					}
2244  					thread_lock(t);
2245  					thread_clear_waitq_state(t);
2246  					re_queue_tail(args->threadq, &t->wait_links);
2247  				}
2248  				/* only enqueue up to 'max' threads */
2249  				if (*nthreads >= max_threads && max_threads > 0) {
2250  					break;
2251  				}
2252  			}
2253  		}
2254  		/* thread wasn't selected, so track its event */
2255  		if (t == THREAD_NULL) {
2256  			*remaining_eventmask |= (thread->waitq != safeq) ?
2257  			    _CAST_TO_EVENT_MASK(thread->waitq) : _CAST_TO_EVENT_MASK(thread->wait_event);
2258  		}
2259  	}
2260  
2261  	return first_thread;
2262  }
2263  
2264  /**
2265   * Routine to iterate and remove threads from priority ordered waitqs
2266   *
2267   * Conditions:
2268   *	args->waitq (and args->posted_waitq) is locked
2269   *
2270   * Notes:
2271   *	The priority ordered waitqs only support maximum priority element removal.
2272   *
2273   *	Also, the implementation makes sure that all threads in a priority ordered
2274   *	waitq are waiting on the same wait event. This is not necessarily true for
2275   *	non-priority ordered waitqs. If one or more threads are selected, this may
2276   *	disable interrupts. The previous interrupt state is returned in args->spl
2277   *	and should be used in a call to splx() if threads are returned to the caller.
2278   *
2279   *	In the future, we could support priority ordered waitqs with multiple wait
2280   *	events in the same queue. The way to implement that would be to keep removing
2281   *	elements from the waitq and if the event does not match the requested one,
2282   *	add it to a local list. This local list of elements needs to be re-inserted
2283   *	into the priority queue at the end and the select_cb return value &
2284   *	remaining_eventmask would need to be handled appropriately. The implementation
2285   *	is not very efficient but would work functionally.
2286   */
2287  static thread_t
2288  waitq_prioq_iterate_locked(struct waitq *safeq, struct waitq *waitq,
2289      spl_t spl, struct waitq_select_args *args,
2290      uint32_t *remaining_eventmask)
2291  {
2292  	int max_threads = args->max_threads;
2293  	int *nthreads = args->nthreads;
2294  	thread_t first_thread = THREAD_NULL;
2295  	thread_t thread = THREAD_NULL;
2296  
2297  	/*
2298  	 * The waitq select routines need to handle two cases:
2299  	 * Case 1: Peek at maximum priority thread in the waitq (remove_op = 0)
2300  	 *         Get the maximum priority thread from the waitq without removing it.
2301  	 *         In that case args->threadq == NULL and max_threads == 1.
2302  	 * Case 2: Remove 'n' highest priority threads from waitq (remove_op = 1)
2303  	 *         Get max_threads (if available) while removing them from the waitq.
2304  	 *         In that case args->threadq != NULL and max_threads is one of {-1, 1}.
2305  	 *
2306  	 * The only possible values for remaining_eventmask for the priority queue
2307  	 * waitq are either 0 (for the remove all threads case) or the original
2308  	 * safeq->waitq_eventmask (for the lookup/remove one thread cases).
2309  	 */
2310  	*remaining_eventmask = safeq->waitq_eventmask;
2311  	boolean_t remove_op = !!(args->threadq);
2312  
2313  	while ((max_threads <= 0) || (*nthreads < max_threads)) {
2314  		if (priority_queue_empty(&(safeq->waitq_prio_queue))) {
2315  			*remaining_eventmask = 0;
2316  			break;
2317  		}
2318  
2319  		if (remove_op) {
2320  			thread = priority_queue_remove_max(&safeq->waitq_prio_queue,
2321  			    struct thread, wait_prioq_links);
2322  		} else {
2323  			/* For the peek operation, the only valid value for max_threads is 1 */
2324  			assert(max_threads == 1);
2325  			thread = priority_queue_max(&safeq->waitq_prio_queue,
2326  			    struct thread, wait_prioq_links);
2327  		}
2328  		/*
2329  		 * Ensure the wait event matches since priority ordered waitqs do not
2330  		 * support multiple events in the same waitq.
2331  		 */
2332  		assert((thread->waitq == waitq) && (thread->wait_event == args->event));
2333  
2334  		if (args->select_cb) {
2335  			/*
2336  			 * Call the select_cb passed into the waitq_select args. The callback
2337  			 * updates the select_ctx with information about the highest priority
2338  			 * thread which is eventually used by the caller.
2339  			 */
2340  			thread_t __assert_only ret_thread = args->select_cb(args->select_ctx, waitq,
2341  			    waitq_is_global(waitq), thread);
2342  			if (!remove_op) {
2343  				/* For the peek operation, the thread should not be selected for addition */
2344  				assert(ret_thread == THREAD_NULL);
2345  			} else {
2346  				/*
2347  				 * For the remove operation, the select routine should always return a valid
2348  				 * thread for priority waitqs. Since all threads in a prioq are equally
2349  				 * eligible, it should match the thread removed from the prioq. If this
2350  				 * invariant changes, the implementation would need to handle the
2351  				 * remaining_eventmask here correctly.
2352  				 */
2353  				assert(ret_thread == thread);
2354  			}
2355  		}
2356  
2357  		if (first_thread == THREAD_NULL) {
2358  			first_thread = thread;
2359  			/*
2360  			 * turnstile_kernel_update_inheritor_on_wake_locked will lock
2361  			 * first_thread, so call it before locking it.
2362  			 */
2363  			if (args->priority == WAITQ_PROMOTE_ON_WAKE && first_thread != THREAD_NULL && waitq_is_turnstile_queue(safeq)) {
2364  				turnstile_kernel_update_inheritor_on_wake_locked(waitq_to_turnstile(safeq), (turnstile_inheritor_t)first_thread, TURNSTILE_INHERITOR_THREAD);
2365  			}
2366  		}
2367  
2368  		/* For the peek operation, break out early */
2369  		if (!remove_op) {
2370  			break;
2371  		}
2372  
2373  		/* Add the thread to the result thread list */
2374  		*nthreads += 1;
2375  		if (*nthreads == 1) {
2376  			*(args->spl) = (safeq != waitq) ? spl : splsched();
2377  		}
2378  		thread_lock(thread);
2379  		thread_clear_waitq_state(thread);
2380  		enqueue_tail(args->threadq, &(thread->wait_links));
2381  	}
2382  
2383  	return first_thread;
2384  }
2385  
2386  /**
2387   * generic thread selection from a waitq (and sets to which the waitq belongs)
2388   *
2389   * Conditions:
2390   *	args->waitq (and args->posted_waitq) is locked
2391   *
2392   * Notes:
2393   *	Uses the optional select callback function to refine the selection
2394   *	of one or more threads from a waitq and any set to which the waitq
2395   *	belongs. The select callback is invoked once for every thread that
2396   *	is found to be waiting on the input args->waitq.
2397   *
2398   *	If one or more threads are selected, this may disable interrupts.
2399   *	The previous interrupt state is returned in args->spl and should
2400   *	be used in a call to splx() if threads are returned to the caller.
2401   */
2402  static void
2403  do_waitq_select_n_locked(struct waitq_select_args *args)
2404  {
2405  	struct waitq *waitq = args->waitq;
2406  	int max_threads = args->max_threads;
2407  	thread_t first_thread = THREAD_NULL;
2408  	struct waitq *safeq;
2409  	uint32_t remaining_eventmask = 0;
2410  	uint32_t eventmask;
2411  	int *nthreads = args->nthreads;
2412  	spl_t spl = 0;
2413  
2414  	assert(max_threads != 0);
2415  
2416  	if (!waitq_irq_safe(waitq)) {
2417  		/* JMM - add flag to waitq to avoid global lookup if no waiters */
2418  		eventmask = _CAST_TO_EVENT_MASK(waitq);
2419  		safeq = waitq_get_safeq(waitq);
2420  		if (safeq == NULL) {
2421  			/*
2422  			 * in the WQT_TSPROXY case, if there's no turnstile,
2423  			 * there's no queue and no waiters, so we can move straight
2424  			 * to the waitq set recursion
2425  			 */
2426  			goto handle_waitq_set;
2427  		}
2428  
2429  		if (*nthreads == 0) {
2430  			spl = splsched();
2431  		}
2432  		waitq_lock(safeq);
2433  	} else {
2434  		eventmask = _CAST_TO_EVENT_MASK(args->event);
2435  		safeq = waitq;
2436  	}
2437  
2438  	/*
2439  	 * If the safeq doesn't have an eventmask (not global) or the event
2440  	 * we're looking for IS set in its eventmask, then scan the threads
2441  	 * in that queue for ones that match the original <waitq,event> pair.
2442  	 */
2443  	if (!waitq_is_global(safeq) ||
2444  	    (safeq->waitq_eventmask & eventmask) == eventmask) {
2445  		if (waitq_is_turnstile_queue(safeq)) {
2446  			first_thread = waitq_prioq_iterate_locked(safeq, waitq,
2447  			    spl, args,
2448  			    &remaining_eventmask);
2449  		} else {
2450  			first_thread = waitq_queue_iterate_locked(safeq, waitq,
2451  			    spl, args,
2452  			    &remaining_eventmask);
2453  		}
2454  
2455  		/*
2456  		 * Update the eventmask of global queues we just scanned:
2457  		 * - If we selected all the threads in the queue, we can clear its
2458  		 *   eventmask.
2459  		 *
2460  		 * - If we didn't find enough threads to fill our needs, then we can
2461  		 *   assume we looked at every thread in the queue and the mask we
2462  		 *   computed is complete - so reset it.
2463  		 */
2464  		if (waitq_is_global(safeq)) {
2465  			if (waitq_empty(safeq)) {
2466  				safeq->waitq_eventmask = 0;
2467  			} else if (max_threads < 0 || *nthreads < max_threads) {
2468  				safeq->waitq_eventmask = remaining_eventmask;
2469  			}
2470  		}
2471  	}
2472  
2473  	/*
2474  	 * Grab the first thread in the queue if no other thread was selected.
2475  	 * We can guarantee that no one has manipulated this thread because
2476  	 * it's waiting on the given waitq, and we have that waitq locked.
2477  	 */
2478  	if (*nthreads == 0 && first_thread != THREAD_NULL && args->threadq) {
2479  		/* we know this is the first (and only) thread */
2480  		++(*nthreads);
2481  		*(args->spl) = (safeq != waitq) ? spl : splsched();
2482  
2483  		thread_lock(first_thread);
2484  		thread_clear_waitq_state(first_thread);
2485  		waitq_thread_remove(safeq, first_thread);
2486  		enqueue_tail(args->threadq, &(first_thread->wait_links));
2487  
2488  		/* update the eventmask on [now] empty global queues */
2489  		if (waitq_is_global(safeq) && waitq_empty(safeq)) {
2490  			safeq->waitq_eventmask = 0;
2491  		}
2492  	}
2493  
2494  	/* unlock the safe queue if we locked one above */
2495  	if (safeq != waitq) {
2496  		waitq_unlock(safeq);
2497  		if (*nthreads == 0) {
2498  			splx(spl);
2499  		}
2500  	}
2501  
2502  	if (max_threads > 0 && *nthreads >= max_threads) {
2503  		return;
2504  	}
2505  
2506  handle_waitq_set:
2507  	/*
2508  	 * wait queues that are not in any sets
2509  	 * are the bottom of the recursion
2510  	 */
2511  	if (!waitq->waitq_set_id) {
2512  		return;
2513  	}
2514  
2515  	/* check to see if the set ID for this wait queue is valid */
2516  	struct waitq_link *link = wql_get_link(waitq->waitq_set_id);
2517  	if (!link) {
2518  		/* the waitq set to which this waitq belonged has been invalidated */
2519  		waitq->waitq_set_id = 0;
2520  		return;
2521  	}
2522  
2523  	wql_put_link(link);
2524  
2525  	/*
2526  	 * If this waitq is a member of any wait queue sets, we need to look
2527  	 * for waiting thread(s) in any of those sets, and prepost all sets that
2528  	 * don't have active waiters.
2529  	 *
2530  	 * Note that we do a local walk of this waitq's links - we manually
2531   * recurse down wait queue sets with a non-zero wqset_q.waitq_set_id
2532  	 */
2533  	(void)walk_waitq_links(LINK_WALK_ONE_LEVEL, waitq, waitq->waitq_set_id,
2534  	    WQL_WQS, (void *)args, waitq_select_walk_cb);
2535  }
2536  
2537  /**
2538   * main entry point for thread selection from a waitq
2539   *
2540   * Conditions:
2541   *	waitq is locked
2542   *
2543   * Returns:
2544   *	The number of threads waiting on 'waitq' for 'event' which have
2545   *	been placed onto the input 'threadq'
2546   *
2547   * Notes:
2548   *	The 'select_cb' function is invoked for every thread found waiting on
2549   *	'waitq' for 'event'. The thread is _not_ locked upon callback
2550   *	invocation. This parameter may be NULL.
2551   *
2552   *	If one or more threads are returned in 'threadq' then the caller is
2553   *	responsible to call splx() using the returned 'spl' value. Each
2554   *	returned thread is locked.
2555   */
2556  static __inline__ int
2557  waitq_select_n_locked(struct waitq *waitq,
2558      event64_t event,
2559      waitq_select_cb select_cb,
2560      void *select_ctx,
2561      uint64_t *reserved_preposts,
2562      queue_t threadq,
2563      int max_threads, spl_t *spl,
2564      int priority)
2565  {
2566  	int nthreads = 0;
2567  
2568  	struct waitq_select_args args = {
2569  		.posted_waitq = waitq,
2570  		.waitq = waitq,
2571  		.event = event,
2572  		.select_cb = select_cb,
2573  		.select_ctx = select_ctx,
2574  		.priority = priority,
2575  		.reserved_preposts = reserved_preposts,
2576  		.threadq = threadq,
2577  		.max_threads = max_threads,
2578  		.nthreads = &nthreads,
2579  		.spl = spl,
2580  	};
2581  
2582  	do_waitq_select_n_locked(&args);
2583  	return nthreads;
2584  }
2585  
2586  /**
2587   * select from a waitq a single thread waiting for a given event
2588   *
2589   * Conditions:
2590   *	'waitq' is locked
2591   *
2592   * Returns:
2593   *	A locked thread that's been removed from the waitq, but has not
2594   *	yet been put on a run queue. Caller is responsible to call splx
2595   *	with the '*spl' value.
2596   */
2597  static thread_t
2598  waitq_select_one_locked(struct waitq *waitq, event64_t event,
2599      uint64_t *reserved_preposts,
2600      int priority, spl_t *spl)
2601  {
2602  	int nthreads;
2603  	queue_head_t threadq;
2604  
2605  	queue_init(&threadq);
2606  
2607  	nthreads = waitq_select_n_locked(waitq, event, NULL, NULL,
2608  	    reserved_preposts, &threadq, 1, spl, priority);
2609  
2610  	/* if we selected a thread, return it (still locked) */
2611  	if (!queue_empty(&threadq)) {
2612  		thread_t t;
2613  		queue_entry_t qe = dequeue_head(&threadq);
2614  		t = qe_element(qe, struct thread, wait_links);
2615  		assert(queue_empty(&threadq)); /* there should be 1 entry */
2616  		/* t has been locked and removed from all queues */
2617  		return t;
2618  	}
2619  
2620  	return THREAD_NULL;
2621  }
2622  
2623  struct select_thread_ctx {
2624  	thread_t      thread;
2625  	event64_t     event;
2626  	spl_t        *spl;
2627  };
2628  
2629  /**
2630   * link walk callback invoked once for each set to which a waitq belongs
2631   *
2632   * Conditions:
2633   *	initial waitq is locked
2634   *	ctx->thread is unlocked
2635   *
2636   * Notes:
2637   *	This may disable interrupts and early-out of the full DAG link walk by
2638   *	returning KERN_ALREADY_IN_SET. In this case, the returned thread has
2639   * been removed from the waitq, its waitq state has been reset, and the
2640   *	caller is responsible to call splx() with the returned interrupt state
2641   *	in ctx->spl.
2642   */
2643  static int
2644  waitq_select_thread_cb(struct waitq *waitq, void *ctx,
2645      struct waitq_link *link)
2646  {
2647  	struct select_thread_ctx *stctx = (struct select_thread_ctx *)ctx;
2648  	struct waitq_set *wqset;
2649  	struct waitq *wqsetq;
2650  	struct waitq *safeq;
2651  	spl_t s;
2652  
2653  	(void)waitq;
2654  
2655  	thread_t thread = stctx->thread;
2656  	event64_t event = stctx->event;
2657  
2658  	if (wql_type(link) != WQL_WQS) {
2659  		return WQ_ITERATE_CONTINUE;
2660  	}
2661  
2662  	wqset = link->wql_wqs.wql_set;
2663  	wqsetq = &wqset->wqset_q;
2664  
2665  	assert(!waitq_irq_safe(waitq));
2666  	assert(!waitq_irq_safe(wqsetq));
2667  
2668  	waitq_set_lock(wqset);
2669  
2670  	s = splsched();
2671  
2672  	/* find and lock the interrupt-safe waitq the thread is thought to be on */
2673  	safeq = waitq_get_safeq(wqsetq);
2674  	waitq_lock(safeq);
2675  
2676  	thread_lock(thread);
2677  
2678  	if ((thread->waitq == wqsetq) && (thread->wait_event == event)) {
2679  		waitq_thread_remove(wqsetq, thread);
2680  		if (waitq_empty(safeq)) {
2681  			safeq->waitq_eventmask = 0;
2682  		}
2683  		thread_clear_waitq_state(thread);
2684  		waitq_unlock(safeq);
2685  		waitq_set_unlock(wqset);
2686  		/*
2687  		 * thread still locked,
2688  		 * return non-zero to break out of WQS walk
2689  		 */
2690  		*(stctx->spl) = s;
2691  		return WQ_ITERATE_FOUND;
2692  	}
2693  
2694  	thread_unlock(thread);
2695  	waitq_set_unlock(wqset);
2696  	waitq_unlock(safeq);
2697  	splx(s);
2698  
2699  	return WQ_ITERATE_CONTINUE;
2700  }
2701  
2702  /**
2703   * returns KERN_SUCCESS and locks 'thread' if-and-only-if 'thread' is waiting
2704   * on 'waitq' (or any set to which waitq belongs) for 'event'
2705   *
2706   * Conditions:
2707   *	'waitq' is locked
2708   *	'thread' is unlocked
2709   */
2710  static kern_return_t
2711  waitq_select_thread_locked(struct waitq *waitq,
2712      event64_t event,
2713      thread_t thread, spl_t *spl)
2714  {
2715  	struct waitq *safeq;
2716  	struct waitq_link *link;
2717  	struct select_thread_ctx ctx;
2718  	kern_return_t kr;
2719  	spl_t s;
2720  
2721  	/* Find and lock the interrupts disabled queue the thread is actually on */
2722  	if (!waitq_irq_safe(waitq)) {
2723  		safeq = waitq_get_safeq(waitq);
2724  		if (safeq == NULL) {
2725  			/*
2726  			 * in the WQT_TSPROXY case, if there's no turnstile,
2727  			 * there's no queue and no waiters, so we can move straight
2728  			 * to the waitq set recursion
2729  			 */
2730  			goto handle_waitq_set;
2731  		}
2732  
2733  		s = splsched();
2734  		waitq_lock(safeq);
2735  	} else {
2736  		s = splsched();
2737  		safeq = waitq;
2738  	}
2739  
2740  	thread_lock(thread);
2741  
2742  	if ((thread->waitq == waitq) && (thread->wait_event == event)) {
2743  		waitq_thread_remove(safeq, thread);
2744  		if (waitq_empty(safeq)) {
2745  			safeq->waitq_eventmask = 0;
2746  		}
2747  		thread_clear_waitq_state(thread);
2748  		*spl = s;
2749  		/* thread still locked */
2750  		return KERN_SUCCESS;
2751  	}
2752  
2753  	thread_unlock(thread);
2754  
2755  	if (safeq != waitq) {
2756  		waitq_unlock(safeq);
2757  	}
2758  
2759  	splx(s);
2760  
2761  handle_waitq_set:
2762  	if (!waitq->waitq_set_id) {
2763  		return KERN_NOT_WAITING;
2764  	}
2765  
2766  	/* check to see if the set ID for this wait queue is valid */
2767  	link = wql_get_link(waitq->waitq_set_id);
2768  	if (!link) {
2769  		/* the waitq set to which this waitq belonged has been invalidated */
2770  		waitq->waitq_set_id = 0;
2771  		return KERN_NOT_WAITING;
2772  	}
2773  
2774  	/*
2775  	 * The thread may be waiting on a wait queue set to which
2776  	 * the input 'waitq' belongs. Go look for the thread in
2777  	 * all wait queue sets. If it's there, we'll remove it
2778  	 * because it's equivalent to waiting directly on the input waitq.
2779  	 */
2780  	ctx.thread = thread;
2781  	ctx.event = event;
2782  	ctx.spl = spl;
2783  	kr = walk_waitq_links(LINK_WALK_FULL_DAG, waitq, waitq->waitq_set_id,
2784  	    WQL_WQS, (void *)&ctx, waitq_select_thread_cb);
2785  
2786  	wql_put_link(link);
2787  
2788  	/* we found a thread, return success */
2789  	if (kr == WQ_ITERATE_FOUND) {
2790  		return KERN_SUCCESS;
2791  	}
2792  
2793  	return KERN_NOT_WAITING;
2794  }
2795  
2796  static int
2797  prepost_exists_cb(struct waitq_set __unused *wqset,
2798      void __unused *ctx,
2799      struct wq_prepost __unused *wqp,
2800      struct waitq __unused *waitq)
2801  {
2802  	/* if we get here, then we know that there is a valid prepost object! */
2803  	return WQ_ITERATE_FOUND;
2804  }
2805  
2806  /**
2807   * declare a thread's intent to wait on 'waitq' for 'wait_event'
2808   *
2809   * Conditions:
2810   *	'waitq' is locked
2811   */
2812  wait_result_t
2813  waitq_assert_wait64_locked(struct waitq *waitq,
2814      event64_t wait_event,
2815      wait_interrupt_t interruptible,
2816      wait_timeout_urgency_t urgency,
2817      uint64_t deadline,
2818      uint64_t leeway,
2819      thread_t thread)
2820  {
2821  	wait_result_t wait_result;
2822  	int realtime = 0;
2823  	struct waitq *safeq;
2824  	uintptr_t eventmask;
2825  	spl_t s;
2826  
2827  
2828  	/*
2829  	 * Warning: Do _not_ place debugging print statements here.
2830  	 *          The waitq is locked!
2831  	 */
2832  	assert(!thread->started || thread == current_thread());
2833  
2834  	if (thread->waitq != NULL) {
2835  		panic("thread already waiting on %p", thread->waitq);
2836  	}
2837  
2838  	if (waitq_is_set(waitq)) {
2839  		struct waitq_set *wqset = (struct waitq_set *)waitq;
2840  		/*
2841  		 * early-out if the thread is waiting on a wait queue set
2842  		 * that has already been pre-posted.
2843  		 */
2844  		if (wait_event == NO_EVENT64 && waitq_set_maybe_preposted(wqset)) {
2845  			int ret;
2846  			/*
2847  			 * Run through the list of potential preposts. Because
2848  			 * this is a hot path, we short-circuit the iteration
2849  			 * if we find just one prepost object.
2850  			 */
2851  			ret = wq_prepost_foreach_locked(wqset, NULL,
2852  			    prepost_exists_cb);
2853  			if (ret == WQ_ITERATE_FOUND) {
2854  				s = splsched();
2855  				thread_lock(thread);
2856  				thread->wait_result = THREAD_AWAKENED;
2857  				thread_unlock(thread);
2858  				splx(s);
2859  				return THREAD_AWAKENED;
2860  			}
2861  		}
2862  	}
2863  
2864  	s = splsched();
2865  
2866  	/*
2867  	 * If already dealing with an irq safe wait queue, we are all set.
2868  	 * Otherwise, determine a global queue to use and lock it.
2869  	 */
2870  	if (!waitq_irq_safe(waitq)) {
2871  		safeq = waitq_get_safeq(waitq);
2872  		if (__improbable(safeq == NULL)) {
2873  			panic("Trying to assert_wait on a turnstile proxy "
2874  			    "that hasn't been donated one (waitq: %p)", waitq);
2875  		}
2876  		eventmask = _CAST_TO_EVENT_MASK(waitq);
2877  		waitq_lock(safeq);
2878  	} else {
2879  		safeq = waitq;
2880  		eventmask = _CAST_TO_EVENT_MASK(wait_event);
2881  	}
2882  
2883  	/* lock the thread now that we have the irq-safe waitq locked */
2884  	thread_lock(thread);
2885  
2886  	/*
2887  	 * Realtime threads get priority for wait queue placements.
2888  	 * This allows wait_queue_wakeup_one to prefer a waiting
2889  	 * realtime thread, similar in principle to performing
2890  	 * a wait_queue_wakeup_all and allowing scheduler prioritization
2891  	 * to run the realtime thread, but without causing the
2892  	 * lock contention of that scenario.
2893  	 */
2894  	if (thread->sched_pri >= BASEPRI_REALTIME) {
2895  		realtime = 1;
2896  	}
2897  
2898  	/*
2899  	 * This is the extent to which we currently take scheduling attributes
2900   * into account.  If the thread is vm privileged, we stick it at
2901  	 * the front of the queue.  Later, these queues will honor the policy
2902  	 * value set at waitq_init time.
2903  	 */
2904  	wait_result = thread_mark_wait_locked(thread, interruptible);
2905  	/* thread->wait_result has been set */
2906  	if (wait_result == THREAD_WAITING) {
2907  		if (!safeq->waitq_fifo
2908  		    || (thread->options & TH_OPT_VMPRIV) || realtime) {
2909  			waitq_thread_insert(safeq, thread, false);
2910  		} else {
2911  			waitq_thread_insert(safeq, thread, true);
2912  		}
2913  
2914  		/* mark the event and real waitq, even if enqueued on a global safeq */
2915  		thread->wait_event = wait_event;
2916  		thread->waitq = waitq;
2917  
2918  		if (deadline != 0) {
2919  			boolean_t act;
2920  
2921  			act = timer_call_enter_with_leeway(&thread->wait_timer,
2922  			    NULL,
2923  			    deadline, leeway,
2924  			    urgency, FALSE);
2925  			if (!act) {
2926  				thread->wait_timer_active++;
2927  			}
2928  			thread->wait_timer_is_set = TRUE;
2929  		}
2930  
2931  		if (waitq_is_global(safeq)) {
2932  			safeq->waitq_eventmask |= eventmask;
2933  		}
2934  
2935  		waitq_stats_count_wait(waitq);
2936  	}
2937  
2938  	/* unlock the thread */
2939  	thread_unlock(thread);
2940  
2941  	/* update the inheritor's thread priority if the waitq is embedded in turnstile */
2942  	if (waitq_is_turnstile_queue(safeq) && wait_result == THREAD_WAITING) {
2943  		turnstile_recompute_priority_locked(waitq_to_turnstile(safeq));
2944  		turnstile_update_inheritor_locked(waitq_to_turnstile(safeq));
2945  	}
2946  
2947  	/* unlock the safeq if we locked it here */
2948  	if (safeq != waitq) {
2949  		waitq_unlock(safeq);
2950  	}
2951  
2952  	splx(s);
2953  
2954  	return wait_result;
2955  }
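
/*
 * Sketch of the canonical wait pattern built on the routine above
 * (simplified and not compiled here; real callers typically go through
 * assert_wait()/thread_block() in sched_prim.c):
 *
 *	waitq_lock(wq);
 *	wait_result_t wr = waitq_assert_wait64_locked(wq,
 *	    CAST_EVENT64_T(event), THREAD_UNINT,
 *	    TIMEOUT_URGENCY_SYS_NORMAL, TIMEOUT_WAIT_FOREVER,
 *	    TIMEOUT_NO_LEEWAY, current_thread());
 *	waitq_unlock(wq);
 *	if (wr == THREAD_WAITING) {
 *		wr = thread_block(THREAD_CONTINUE_NULL);
 *	}
 */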
2956  
2957  /**
2958   * remove 'thread' from its current blocking state on 'waitq'
2959   *
2960   * Conditions:
2961   *	'thread' is locked
2962   *
2963   * Notes:
2964   *	This function is primarily used by clear_wait_internal in
2965   *	sched_prim.c from the thread timer wakeup path
2966   *	(i.e. the thread was waiting on 'waitq' with a timeout that expired)
2967   */
2968  int
2969  waitq_pull_thread_locked(struct waitq *waitq, thread_t thread)
2970  {
2971  	struct waitq *safeq;
2972  
2973  	assert_thread_magic(thread);
2974  	assert(thread->waitq == waitq);
2975  
2976  	/* Find the interrupts disabled queue thread is waiting on */
2977  	if (!waitq_irq_safe(waitq)) {
2978  		safeq = waitq_get_safeq(waitq);
2979  		if (__improbable(safeq == NULL)) {
2980  			panic("Trying to clear_wait on a turnstile proxy "
2981  			    "that hasn't been donated one (waitq: %p)", waitq);
2982  		}
2983  	} else {
2984  		safeq = waitq;
2985  	}
2986  
2987  	/* thread is already locked so have to try for the waitq lock */
2988  	if (!waitq_lock_try(safeq)) {
2989  		return 0;
2990  	}
2991  
2992  	waitq_thread_remove(safeq, thread);
2993  	thread_clear_waitq_state(thread);
2994  	waitq_stats_count_clear_wakeup(waitq);
2995  
2996  	/* clear the global event mask if this was the last thread there! */
2997  	if (waitq_is_global(safeq) && waitq_empty(safeq)) {
2998  		safeq->waitq_eventmask = 0;
2999  		/* JMM - also mark no-waiters on waitq (if not the same as the safeq) */
3000  	}
3001  
3002  	waitq_unlock(safeq);
3003  
3004  	return 1;
3005  }
3006  
3007  
3008  static __inline__
3009  void
3010  maybe_adjust_thread_pri(thread_t   thread,
3011      int        priority,
3012      __kdebug_only struct waitq *waitq)
3013  {
3014  	/*
3015  	 * If the caller is requesting the waitq subsystem to promote the
3016  	 * priority of the awoken thread, then boost the thread's priority to
3017  	 * the default WAITQ_BOOST_PRIORITY (if it's not already equal or
3018  	 * higher priority).  This boost must be removed via a call to
3019  	 * waitq_clear_promotion_locked before the thread waits again.
3020  	 *
3021  	 * WAITQ_PROMOTE_PRIORITY is -2.
3022  	 * Anything above 0 represents a mutex promotion.
3023  	 * The default 'no action' value is -1.
3024  	 * TODO: define this in a header
3025  	 */
3026  	if (priority == WAITQ_PROMOTE_PRIORITY) {
3027  		uintptr_t trace_waitq = 0;
3028  		if (__improbable(kdebug_enable)) {
3029  			trace_waitq = VM_KERNEL_UNSLIDE_OR_PERM(waitq);
3030  		}
3031  
3032  		sched_thread_promote_reason(thread, TH_SFLAG_WAITQ_PROMOTED, trace_waitq);
3033  	}
3034  }
3035  
3036  /*
3037   * Clear a potential thread priority promotion from a waitq wakeup
3038   * with WAITQ_PROMOTE_PRIORITY.
3039   *
3040   * This must be called on the thread which was woken up with TH_SFLAG_WAITQ_PROMOTED.
3041   */
3042  void
3043  waitq_clear_promotion_locked(struct waitq *waitq, thread_t thread)
3044  {
3045  	spl_t s;
3046  
3047  	assert(waitq_held(waitq));
3048  	assert(thread != THREAD_NULL);
3049  	assert(thread == current_thread());
3050  
3051  	/* This flag is only cleared by the thread itself, so safe to check outside lock */
3052  	if ((thread->sched_flags & TH_SFLAG_WAITQ_PROMOTED) != TH_SFLAG_WAITQ_PROMOTED) {
3053  		return;
3054  	}
3055  
3056  	if (!waitq_irq_safe(waitq)) {
3057  		s = splsched();
3058  	}
3059  	thread_lock(thread);
3060  
3061  	sched_thread_unpromote_reason(thread, TH_SFLAG_WAITQ_PROMOTED, 0);
3062  
3063  	thread_unlock(thread);
3064  	if (!waitq_irq_safe(waitq)) {
3065  		splx(s);
3066  	}
3067  }
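
/*
 * Illustrative pairing (sketch, 'wq' and 'event' hypothetical): a wakeup
 * issued with WAITQ_PROMOTE_PRIORITY must be balanced by the woken thread
 * stripping its own boost before it blocks again:
 *
 *	// waker, with 'wq' locked:
 *	waitq_wakeup64_one_locked(wq, event, THREAD_AWAKENED,
 *	    NULL, WAITQ_PROMOTE_PRIORITY, WAITQ_UNLOCK, WQ_OPTION_NONE);
 *
 *	// woken thread, once the critical work is done:
 *	waitq_lock(wq);
 *	waitq_clear_promotion_locked(wq, current_thread());
 *	waitq_unlock(wq);
 */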
3068  
3069  /**
3070   * wakeup all threads waiting on 'waitq' for 'wake_event'
3071   *
3072   * Conditions:
3073   *	'waitq' is locked
3074   *
3075   * Notes:
3076   *	May temporarily disable and re-enable interrupts
3077   *	and re-adjust thread priority of each awoken thread.
3078   *
3079   *	If the input 'lock_state' == WAITQ_UNLOCK then the waitq will have
3080   *	been unlocked before calling thread_go() on any returned threads, and
3081   *	is guaranteed to be unlocked upon function return.
3082   */
3083  kern_return_t
3084  waitq_wakeup64_all_locked(struct waitq *waitq,
3085      event64_t wake_event,
3086      wait_result_t result,
3087      uint64_t *reserved_preposts,
3088      int priority,
3089      waitq_lock_state_t lock_state)
3090  {
3091  	kern_return_t ret;
3092  	thread_t thread;
3093  	spl_t th_spl;
3094  	int nthreads;
3095  	queue_head_t wakeup_queue;
3096  
3097  	assert(waitq_held(waitq));
3098  	queue_init(&wakeup_queue);
3099  
3100  	nthreads = waitq_select_n_locked(waitq, wake_event, NULL, NULL,
3101  	    reserved_preposts,
3102  	    &wakeup_queue, -1, &th_spl, priority);
3103  
3104  	/* set each thread running */
3105  	ret = KERN_NOT_WAITING;
3106  
3107  #if CONFIG_WAITQ_STATS
3108  	qe_foreach_element(thread, &wakeup_queue, wait_links)
3109  		waitq_stats_count_wakeup(waitq);
3110  #endif
3111  	if (lock_state == WAITQ_UNLOCK) {
3112  		waitq_unlock(waitq);
3113  	}
3114  
3115  	qe_foreach_element_safe(thread, &wakeup_queue, wait_links) {
3116  		assert_thread_magic(thread);
3117  		remqueue(&thread->wait_links);
3118  		maybe_adjust_thread_pri(thread, priority, waitq);
3119  		ret = thread_go(thread, result, WQ_OPTION_NONE);
3120  		assert(ret == KERN_SUCCESS);
3121  		thread_unlock(thread);
3122  	}
3123  	if (nthreads > 0) {
3124  		splx(th_spl);
3125  	} else {
3126  		waitq_stats_count_fail(waitq);
3127  	}
3128  
3129  	return ret;
3130  }
3131  
3132  /**
3133   * wakeup one thread waiting on 'waitq' for 'wake_event'
3134   *
3135   * Conditions:
3136   *	'waitq' is locked
3137   *
3138   * Notes:
3139   *	May temporarily disable and re-enable interrupts.
3140   */
3141  kern_return_t
3142  waitq_wakeup64_one_locked(struct waitq *waitq,
3143      event64_t wake_event,
3144      wait_result_t result,
3145      uint64_t *reserved_preposts,
3146      int priority,
3147      waitq_lock_state_t lock_state,
3148      waitq_options_t option)
3149  {
3150  	thread_t thread;
3151  	spl_t th_spl;
3152  
3153  	assert(waitq_held(waitq));
3154  
3155  	thread = waitq_select_one_locked(waitq, wake_event,
3156  	    reserved_preposts,
3157  	    priority, &th_spl);
3158  
3159  	if (thread != THREAD_NULL) {
3160  		waitq_stats_count_wakeup(waitq);
3161  	} else {
3162  		waitq_stats_count_fail(waitq);
3163  	}
3164  
3165  	if (lock_state == WAITQ_UNLOCK) {
3166  		waitq_unlock(waitq);
3167  	}
3168  
3169  	if (thread != THREAD_NULL) {
3170  		maybe_adjust_thread_pri(thread, priority, waitq);
3171  		kern_return_t ret = thread_go(thread, result, option);
3172  		assert(ret == KERN_SUCCESS);
3173  		thread_unlock(thread);
3174  		splx(th_spl);
3175  		return ret;
3176  	}
3177  
3178  	return KERN_NOT_WAITING;
3179  }
3180  
3181  /**
3182   * wakeup one thread waiting on 'waitq' for 'wake_event'
3183   *
3184   * Conditions:
3185   *	'waitq' is locked
3186   *
3187   * Returns:
3188   *	A locked, runnable thread.
3189   *	If return value is non-NULL, interrupts have also
3190   *	been disabled, and the caller is responsible to call
3191   *	splx() with the returned '*spl' value.
3192   */
3193  thread_t
3194  waitq_wakeup64_identify_locked(struct waitq     *waitq,
3195      event64_t        wake_event,
3196      wait_result_t    result,
3197      spl_t            *spl,
3198      uint64_t         *reserved_preposts,
3199      int              priority,
3200      waitq_lock_state_t lock_state)
3201  {
3202  	thread_t thread;
3203  
3204  	assert(waitq_held(waitq));
3205  
3206  	thread = waitq_select_one_locked(waitq, wake_event,
3207  	    reserved_preposts,
3208  	    priority, spl);
3209  
3210  	if (thread != THREAD_NULL) {
3211  		waitq_stats_count_wakeup(waitq);
3212  	} else {
3213  		waitq_stats_count_fail(waitq);
3214  	}
3215  
3216  	if (lock_state == WAITQ_UNLOCK) {
3217  		waitq_unlock(waitq);
3218  	}
3219  
3220  	if (thread != THREAD_NULL) {
3221  		kern_return_t __assert_only ret;
3222  		ret = thread_go(thread, result, WQ_OPTION_NONE);
3223  		assert(ret == KERN_SUCCESS);
3224  	}
3225  
3226  	return thread; /* locked if not NULL (caller responsible for spl) */
3227  }
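
/*
 * Usage sketch (illustrative; assumes WAITQ_ALL_PRIORITIES, the default
 * "no promotion" value): the caller owns the returned thread's lock and
 * the saved interrupt state:
 *
 *	spl_t spl;
 *	thread_t t = waitq_wakeup64_identify_locked(wq, event,
 *	    THREAD_AWAKENED, &spl, NULL, WAITQ_ALL_PRIORITIES, WAITQ_UNLOCK);
 *	if (t != THREAD_NULL) {
 *		// ... inspect 't' while it is still locked ...
 *		thread_unlock(t);
 *		splx(spl);
 *	}
 */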
3228  
3229  /**
3230   * wakeup a specific thread iff it's waiting on 'waitq' for 'wake_event'
3231   *
3232   * Conditions:
3233   *	'waitq' is locked
3234   *	'thread' is unlocked
3235   *
3236   * Notes:
3237   *	May temporarily disable and re-enable interrupts
3238   *
3239   *	If the input lock_state == WAITQ_UNLOCK then the waitq will have been
3240   *	unlocked before calling thread_go() if 'thread' is to be awoken, and
3241   *	is guaranteed to be unlocked upon function return.
3242   */
3243  kern_return_t
3244  waitq_wakeup64_thread_locked(struct waitq *waitq,
3245      event64_t wake_event,
3246      thread_t thread,
3247      wait_result_t result,
3248      waitq_lock_state_t lock_state)
3249  {
3250  	kern_return_t ret;
3251  	spl_t th_spl;
3252  
3253  	assert(waitq_held(waitq));
3254  	assert_thread_magic(thread);
3255  
3256  	/*
3257  	 * See if the thread was still waiting there.  If so, it got
3258  	 * dequeued and returned locked.
3259  	 */
3260  	ret = waitq_select_thread_locked(waitq, wake_event, thread, &th_spl);
3261  
3262  	if (ret == KERN_SUCCESS) {
3263  		waitq_stats_count_wakeup(waitq);
3264  	} else {
3265  		waitq_stats_count_fail(waitq);
3266  	}
3267  
3268  	if (lock_state == WAITQ_UNLOCK) {
3269  		waitq_unlock(waitq);
3270  	}
3271  
3272  	if (ret != KERN_SUCCESS) {
3273  		return KERN_NOT_WAITING;
3274  	}
3275  
3276  	ret = thread_go(thread, result, WQ_OPTION_NONE);
3277  	assert(ret == KERN_SUCCESS);
3278  	thread_unlock(thread);
3279  	splx(th_spl);
3280  
3281  	return ret;
3282  }
3283  
3284  
3285  
3286  /* ----------------------------------------------------------------------
3287   *
3288   * In-Kernel API
3289   *
3290   * ---------------------------------------------------------------------- */
3291  
3292  /**
3293   * initialize a waitq object
3294   */
3295  kern_return_t
3296  waitq_init(struct waitq *waitq, int policy)
3297  {
3298  	assert(waitq != NULL);
3299  
3300  	/* only FIFO and LIFO for now */
3301  	if ((policy & SYNC_POLICY_FIXED_PRIORITY) != 0) {
3302  		return KERN_INVALID_ARGUMENT;
3303  	}
3304  
3305  	waitq->waitq_fifo = ((policy & SYNC_POLICY_REVERSED) == 0);
3306  	waitq->waitq_irq = !!(policy & SYNC_POLICY_DISABLE_IRQ);
3307  	waitq->waitq_prepost = 0;
3308  	if (policy & SYNC_POLICY_TURNSTILE_PROXY) {
3309  		waitq->waitq_type = WQT_TSPROXY;
3310  	} else {
3311  		waitq->waitq_type = WQT_QUEUE;
3312  	}
3313  	waitq->waitq_turnstile = !!(policy & SYNC_POLICY_TURNSTILE);
3314  	waitq->waitq_eventmask = 0;
3315  
3316  	waitq->waitq_set_id = 0;
3317  	waitq->waitq_prepost_id = 0;
3318  
3319  	waitq_lock_init(waitq);
3320  	if (waitq_is_turnstile_queue(waitq)) {
3321  		/* For turnstile, initialize it as a priority queue */
3322  		priority_queue_init(&waitq->waitq_prio_queue);
3323  		assert(waitq->waitq_fifo == 0);
3324  	} else if (policy & SYNC_POLICY_TURNSTILE_PROXY) {
3325  		waitq->waitq_ts = TURNSTILE_NULL;
3326  		waitq->waitq_tspriv = NULL;
3327  	} else {
3328  		queue_init(&waitq->waitq_queue);
3329  	}
3330  
3331  	waitq->waitq_isvalid = 1;
3332  	return KERN_SUCCESS;
3333  }
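
/*
 * Example (illustrative): an embedded FIFO waitq that may be touched from
 * interrupt context, later torn down with waitq_deinit():
 *
 *	struct waitq my_wq;
 *	waitq_init(&my_wq, SYNC_POLICY_FIFO | SYNC_POLICY_DISABLE_IRQ);
 *	...
 *	waitq_deinit(&my_wq);
 */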
3334  
3335  struct wq_unlink_ctx {
3336  	struct waitq *unlink_wq;
3337  	struct waitq_set *unlink_wqset;
3338  };
3339  
3340  static int waitq_unlink_prepost_cb(struct waitq_set __unused *wqset, void *ctx,
3341      struct wq_prepost *wqp, struct waitq *waitq);
3342  
3343  /**
3344   * walk_waitq_links callback to invalidate 'link' parameter
3345   *
3346   * Conditions:
3347   *	Called from walk_waitq_links.
3348   *	Note that unlink other callbacks, this one make no assumptions about
3349   *	Note that unlike other callbacks, this one makes no assumptions about
3350   *	the 'waitq' parameter; specifically, it does not have to be locked or
3351   */
3352  static int
3353  waitq_unlink_all_cb(struct waitq *waitq, void *ctx,
3354      struct waitq_link *link)
3355  {
3356  	(void)waitq;
3357  	(void)ctx;
3358  	if (wql_type(link) == WQL_LINK && wql_is_valid(link)) {
3359  		wql_invalidate(link);
3360  	}
3361  
3362  	if (wql_type(link) == WQL_WQS) {
3363  		struct waitq_set *wqset;
3364  		struct wq_unlink_ctx ulctx;
3365  
3366  		/*
3367  		 * When destroying the waitq, take the time to clear out any
3368  		 * preposts it may have made. This could potentially save time
3369  		 * on the IPC send path which would otherwise have to iterate
3370  		 * over lots of dead port preposts.
3371  		 */
3372  		if (waitq->waitq_prepost_id == 0) {
3373  			goto out;
3374  		}
3375  
3376  		wqset = link->wql_wqs.wql_set;
3377  		assert(wqset != NULL);
3378  		assert(!waitq_irq_safe(&wqset->wqset_q));
3379  
3380  		waitq_set_lock(wqset);
3381  
3382  		if (!waitq_set_is_valid(wqset)) {
3383  			/* someone raced us to teardown */
3384  			goto out_unlock;
3385  		}
3386  		if (!waitq_set_maybe_preposted(wqset)) {
3387  			goto out_unlock;
3388  		}
3389  
3390  		ulctx.unlink_wq = waitq;
3391  		ulctx.unlink_wqset = wqset;
3392  		(void)wq_prepost_iterate(wqset->wqset_prepost_id, &ulctx,
3393  		    waitq_unlink_prepost_cb);
3394  out_unlock:
3395  		waitq_set_unlock(wqset);
3396  	}
3397  
3398  out:
3399  	return WQ_ITERATE_CONTINUE;
3400  }
3401  
3402  
3403  /**
3404   * cleanup any link/prepost table resources associated with a waitq
3405   */
3406  void
3407  waitq_deinit(struct waitq *waitq)
3408  {
3409  	spl_t s;
3410  
3411  	assert(waitq);
3412  	if (!waitq_is_valid(waitq)) {
3413  		return;
3414  	}
3415  
3416  	if (!waitq_is_queue(waitq) && !waitq_is_turnstile_proxy(waitq)) {
3417  		return;
3418  	}
3419  
3420  	if (waitq_irq_safe(waitq)) {
3421  		s = splsched();
3422  	}
3423  	waitq_lock(waitq);
3424  
3425  	if (waitq_valid(waitq)) {
3426  		waitq->waitq_isvalid = 0;
3427  		if (!waitq_irq_safe(waitq)) {
3428  			waitq_unlink_all_unlock(waitq);
3429  			/* waitq unlocked and set links deallocated */
3430  			goto out;
3431  		}
3432  	}
3433  
3434  	waitq_unlock(waitq);
3435  	if (waitq_irq_safe(waitq)) {
3436  		splx(s);
3437  	}
3438  
3439  out:
3440  #if MACH_ASSERT
3441  	if (waitq_is_turnstile_queue(waitq)) {
3442  		assert(priority_queue_empty(&waitq->waitq_prio_queue));
3443  	} else if (waitq_is_turnstile_proxy(waitq)) {
3444  		assert(waitq->waitq_ts == TURNSTILE_NULL);
3445  	} else {
3446  		assert(queue_empty(&waitq->waitq_queue));
3447  	}
3448  #else
3449  	(void)0;
3450  #endif // MACH_ASSERT
3451  }
3452  
3453  void
3454  waitq_invalidate_locked(struct waitq *waitq)
3455  {
3456  	assert(waitq_held(waitq));
3457  	assert(waitq_is_valid(waitq));
3458  	waitq->waitq_isvalid = 0;
3459  }
3460  
3461  /**
3462   * invalidate the given wq_prepost object
3463   *
3464   * Conditions:
3465   *	Called from wq_prepost_iterate (_not_ from wq_prepost_foreach_locked!)
3466   */
3467  static int
3468  wqset_clear_prepost_chain_cb(struct waitq_set __unused *wqset,
3469      void __unused *ctx,
3470      struct wq_prepost *wqp,
3471      struct waitq __unused *waitq)
3472  {
3473  	if (wqp_type(wqp) == WQP_POST) {
3474  		wq_prepost_invalidate(wqp);
3475  	}
3476  	return WQ_ITERATE_CONTINUE;
3477  }
3478  
3479  
3480  /**
3481   * allocate and initialize a waitq set object
3482   *
3483   * Conditions:
3484   *	may block
3485   *
3486   * Returns:
3487   *	allocated / initialized waitq_set object.
3488   *	the waitq_set object returned does not have
3489   *	a waitq_link associated.
3490   *
3491   *	NULL on failure
3492   */
3493  struct waitq_set *
3494  waitq_set_alloc(int policy, waitq_set_prepost_hook_t *prepost_hook)
3495  {
3496  	struct waitq_set *wqset;
3497  
3498  	wqset = (struct waitq_set *)zalloc(waitq_set_zone);
3499  	if (!wqset) {
3500  		panic("Can't allocate a new waitq set from zone %p", waitq_set_zone);
3501  	}
3502  
3503  	kern_return_t ret;
3504  	ret = waitq_set_init(wqset, policy, NULL, prepost_hook);
3505  	if (ret != KERN_SUCCESS) {
3506  		zfree(waitq_set_zone, wqset);
3507  		wqset = NULL;
3508  	}
3509  
3510  	return wqset;
3511  }
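
/*
 * Example (illustrative): allocating a preposting set. Note that with
 * SYNC_POLICY_PREPOST no prepost_hook may be passed (see waitq_set_init):
 *
 *	struct waitq_set *wqs;
 *	wqs = waitq_set_alloc(SYNC_POLICY_FIFO | SYNC_POLICY_PREPOST, NULL);
 */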
3512  
3513  /**
3514   * initialize a waitq set object
3515   *
3516   * if no 'reserved_link' object is passed
3517   * the waitq_link will be lazily allocated
3518   * on demand through waitq_set_lazy_init_link.
3519   */
3520  kern_return_t
3521  waitq_set_init(struct waitq_set *wqset,
3522      int policy, uint64_t *reserved_link,
3523      waitq_set_prepost_hook_t *prepost_hook)
3524  {
3525  	struct waitq_link *link;
3526  	kern_return_t ret;
3527  
3528  	memset(wqset, 0, sizeof(*wqset));
3529  
3530  	ret = waitq_init(&wqset->wqset_q, policy);
3531  	if (ret != KERN_SUCCESS) {
3532  		return ret;
3533  	}
3534  
3535  	wqset->wqset_q.waitq_type = WQT_SET;
3536  	if (policy & SYNC_POLICY_PREPOST) {
3537  		wqset->wqset_q.waitq_prepost = 1;
3538  		wqset->wqset_prepost_id = 0;
3539  		assert(prepost_hook == NULL);
3540  	} else {
3541  		wqset->wqset_q.waitq_prepost = 0;
3542  		wqset->wqset_prepost_hook = prepost_hook;
3543  	}
3544  
3545  	if (reserved_link && *reserved_link != 0) {
3546  		link = wql_get_reserved(*reserved_link, WQL_WQS);
3547  
3548  		if (!link) {
3549  			panic("Can't allocate link object for waitq set: %p", wqset);
3550  		}
3551  
3552  		/* always consume the caller's reference */
3553  		*reserved_link = 0;
3554  
3555  		link->wql_wqs.wql_set = wqset;
3556  		wql_mkvalid(link);
3557  
3558  		wqset->wqset_id = link->wql_setid.id;
3559  		wql_put_link(link);
3560  	} else {
3561  		/*
3562  		 * Lazy allocate the link only when an actual id is needed.
3563  		 */
3564  		wqset->wqset_id = WQSET_NOT_LINKED;
3565  	}
3566  
3567  	return KERN_SUCCESS;
3568  }
3569  
3570  #if DEVELOPMENT || DEBUG
3571  
3572  int
3573  sysctl_helper_waitq_set_nelem(void)
3574  {
3575  	return ltable_nelem(&g_wqlinktable);
3576  }
3577  
3578  #endif
3579  
3580  /**
3581   * initialize a waitq set link.
3582   *
3583   * Conditions:
3584   *	may block
3585   *	locks and unlocks the waitq set lock
3586   *
3587   */
3588  void
3589  waitq_set_lazy_init_link(struct waitq_set *wqset)
3590  {
3591  	struct waitq_link *link;
3592  
3593  	assert(get_preemption_level() == 0 && waitq_wait_possible(current_thread()));
3594  
3595  	waitq_set_lock(wqset);
3596  	if (!waitq_set_should_lazy_init_link(wqset)) {
3597  		waitq_set_unlock(wqset);
3598  		return;
3599  	}
3600  
3601  	assert(wqset->wqset_id == WQSET_NOT_LINKED);
3602  	waitq_set_unlock(wqset);
3603  
3604  	link = wql_alloc_link(WQL_WQS);
3605  	if (!link) {
3606  		panic("Can't allocate link object for waitq set: %p", wqset);
3607  	}
3608  
3609  	link->wql_wqs.wql_set = wqset;
3610  
3611  	waitq_set_lock(wqset);
3612  	if (waitq_set_should_lazy_init_link(wqset)) {
3613  		wql_mkvalid(link);
3614  		wqset->wqset_id = link->wql_setid.id;
3615  	}
3616  
3617  	assert(wqset->wqset_id != 0);
3618  	assert(wqset->wqset_id != WQSET_NOT_LINKED);
3619  
3620  	waitq_set_unlock(wqset);
3621  
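	/*
	 * If another thread linked the set while the lock was dropped, the
	 * link allocated above was never marked valid; in that case the
	 * wql_put_link() below drops the last reference and should free it.
	 * Otherwise it merely drops our allocation reference on the
	 * now-valid link.
	 */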
3622  	wql_put_link(link);
3623  
3624  	return;
3625  }
3626  
3627  /**
3628   * checks if a waitq set needs to be linked.
3629   *
3630   */
3631  boolean_t
3632  waitq_set_should_lazy_init_link(struct waitq_set *wqset)
3633  {
3634  	if (waitqs_is_linked(wqset) || wqset->wqset_id == 0) {
3635  		return FALSE;
3636  	}
3637  	return TRUE;
3638  }
3639  
3640  /**
3641   * clear out / release any resources associated with a waitq set
3642   *
3643   * Conditions:
3644   *	may block
3645   * Note:
3646   *	This will render the waitq set invalid, and it must
3647   *	be re-initialized with waitq_set_init before it can be used again
3648   */
3649  void
3650  waitq_set_deinit(struct waitq_set *wqset)
3651  {
3652  	struct waitq_link *link = NULL;
3653  	uint64_t set_id, prepost_id;
3654  
3655  	if (!waitqs_is_set(wqset)) {
3656  		panic("trying to de-initialize an invalid wqset @%p", wqset);
3657  	}
3658  
3659  	assert(!waitq_irq_safe(&wqset->wqset_q));
3660  
3661  	waitq_set_lock(wqset);
3662  
3663  	if (waitq_set_has_prepost_hook(wqset)) {
3664  		waitq_set_prepost_hook_t *hook = wqset->wqset_prepost_hook;
3665  		/*
3666  		 * If the wqset_prepost_hook value is non-zero,
3667  		 * then another core is currently posting to this waitq set
3668  		 * and we need to wait for it to finish what it's doing.
3669  		 */
3670  		while (os_atomic_load(hook, relaxed) != 0) {
3671  			waitq_set_unlock(wqset);
3672  			delay(1);
3673  			waitq_set_lock(wqset);
3674  		}
3675  	}
3676  
3677  	set_id = wqset->wqset_id;
3678  
3679  	if (waitqs_is_linked(wqset) || set_id == 0) {
3680  		/* grab the set's link object */
3681  		link = wql_get_link(set_id);
3682  		if (link) {
3683  			wql_invalidate(link);
3684  		}
3685  		/* someone raced us to deinit */
3686  		if (!link || wqset->wqset_id != set_id || set_id != link->wql_setid.id) {
3687  			if (link) {
3688  				wql_put_link(link);
3689  			}
3690  			waitq_set_unlock(wqset);
3691  			return;
3692  		}
3693  
3694  		/* the link should be a valid link object at this point */
3695  		assert(link != NULL && wql_type(link) == WQL_WQS);
3696  
3697  		wqset->wqset_id = 0;
3698  	}
3699  
3700  	/*
3701  	 * This set may have a lot of preposts, or may have been a member of
3702  	 * many other sets. To minimize spinlock hold times, we clear out the
3703  	 * waitq set data structure under the lock-hold, but don't clear any
3704  	 * table objects. We keep handles to the prepost and set linkage
3705  	 * objects and free those outside the critical section.
3706  	 */
3707  	prepost_id = 0;
3708  	if (wqset->wqset_q.waitq_prepost && wqset->wqset_prepost_id) {
3709  		assert(link != NULL);
3710  		prepost_id = wqset->wqset_prepost_id;
3711  	}
3712  	/* else { TODO: notify kqueue subsystem? } */
3713  	wqset->wqset_prepost_id = 0;
3714  
3715  	wqset->wqset_q.waitq_fifo = 0;
3716  	wqset->wqset_q.waitq_prepost = 0;
3717  	wqset->wqset_q.waitq_isvalid = 0;
3718  
3719  	/* don't clear the 'waitq_irq' bit: it's used in locking! */
3720  	wqset->wqset_q.waitq_eventmask = 0;
3721  
3722  	waitq_unlink_all_unlock(&wqset->wqset_q);
3723  	/* wqset->wqset_q unlocked and set links deallocated */
3724  
3725  
3726  	if (link) {
3727  		/*
3728  		 * walk_waitq_links may race with us for access to the waitq set.
3729  		 * If walk_waitq_links has a reference to the set, then we should wait
3730  		 * until the link's refcount goes to 1 (our reference) before we exit
3731  		 * this function. That way we ensure that the waitq set memory will
3732  		 * remain valid even though it's been cleared out.
3733  		 */
3734  		while (wql_refcnt(link) > 1) {
3735  			delay(1);
3736  		}
3737  		wql_put_link(link);
3738  	}
3739  
3740  	/* drop / unlink all the prepost table objects */
3741  	/* JMM - can this happen before the delay? */
3742  	if (prepost_id) {
3743  		(void)wq_prepost_iterate(prepost_id, NULL,
3744  		    wqset_clear_prepost_chain_cb);
3745  	}
3746  }
3747  
3748  /**
3749   * de-initialize and free an allocated waitq set object
3750   *
3751   * Conditions:
3752   *	may block
3753   */
3754  kern_return_t
3755  waitq_set_free(struct waitq_set *wqset)
3756  {
3757  	waitq_set_deinit(wqset);
3758  
3759  	memset(wqset, 0, sizeof(*wqset));
3760  	zfree(waitq_set_zone, wqset);
3761  
3762  	return KERN_SUCCESS;
3763  }
3764  
3765  #if DEVELOPMENT || DEBUG
3766  #if CONFIG_WAITQ_DEBUG
3767  /**
3768   * return the set ID of 'wqset'
3769   */
3770  uint64_t
3771  wqset_id(struct waitq_set *wqset)
3772  {
3773  	if (!wqset) {
3774  		return 0;
3775  	}
3776  
3777  	assert(waitqs_is_set(wqset));
3778  
3779  	if (!waitqs_is_linked(wqset)) {
3780  		waitq_set_lazy_init_link(wqset);
3781  	}
3782  
3783  	return wqset->wqset_id;
3784  }
3785  
3786  /**
3787   * returns a pointer to the waitq object embedded in 'wqset'
3788   */
3789  struct waitq *
3790  wqset_waitq(struct waitq_set *wqset)
3791  {
3792  	if (!wqset) {
3793  		return NULL;
3794  	}
3795  
3796  	assert(waitqs_is_set(wqset));
3797  
3798  	return &wqset->wqset_q;
3799  }
3800  #endif /* CONFIG_WAITQ_DEBUG */
3801  #endif /* DEVELOPMENT || DEBUG */
3802  
3803  
3804  /**
3805   * clear all preposts originating from 'waitq'
3806   *
3807   * Conditions:
3808   *	'waitq' locked
3809   *	may (rarely) spin waiting for another on-core thread to
3810   *	release the last reference to the waitq's prepost link object
3811   *
3812   * NOTE:
3813   *	If this function needs to spin, it will drop the waitq lock!
3814   *	The return value of the function indicates whether or not this
3815   *	happened: 1 == lock was dropped, 0 == lock held
3816   */
3817  int
3818  waitq_clear_prepost_locked(struct waitq *waitq)
3819  {
3820  	struct wq_prepost *wqp;
3821  	int dropped_lock = 0;
3822  
3823  	assert(!waitq_irq_safe(waitq));
3824  
3825  	if (waitq->waitq_prepost_id == 0) {
3826  		return 0;
3827  	}
3828  
3829  	wqp = wq_prepost_get(waitq->waitq_prepost_id);
3830  	waitq->waitq_prepost_id = 0;
3831  	if (wqp) {
3832  		uint64_t wqp_id = wqp->wqp_prepostid.id;
3833  		wqdbg_v("invalidate prepost 0x%llx (refcnt:%d)",
3834  		    wqp->wqp_prepostid.id, wqp_refcnt(wqp));
3835  		wq_prepost_invalidate(wqp);
3836  		while (wqp_refcnt(wqp) > 1) {
3837  			/*
3838  			 * Some other thread must have raced us to grab a link
3839  			 * object reference before we invalidated it. This
3840  			 * means that they are probably trying to access the
3841  			 * waitq to which the prepost object points. We need
3842  			 * to wait here until the other thread drops their
3843  			 * reference. We know that no one else can get a
3844  			 * reference (the object has been invalidated), and
3845  			 * that prepost references are short-lived (dropped on
3846  			 * a call to wq_prepost_put). We also know that no one
3847  			 * blocks while holding a reference; therefore the
3848  			 * other reference holder must be on-core. We'll just
3849  			 * sit and wait for the other reference to be dropped.
3850  			 */
3851  			disable_preemption();
3852  
3853  			waitq_unlock(waitq);
3854  			dropped_lock = 1;
3855  			/*
3856  			 * don't yield here, just spin and assume the other
3857  			 * consumer is already on core...
3858  			 */
3859  			delay(1);
3860  
3861  			waitq_lock(waitq);
3862  
3863  			enable_preemption();
3864  		}
3865  		if (wqp_refcnt(wqp) > 0 && wqp->wqp_prepostid.id == wqp_id) {
3866  			wq_prepost_put(wqp);
3867  		}
3868  	}
3869  
3870  	return dropped_lock;
3871  }
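
/*
 * Illustrative usage sketch (comment only, not part of the build):
 * because this function can drop the lock, callers that depend on waitq
 * state should re-validate when a drop is reported. 'wq' is a placeholder:
 *
 *	waitq_lock(wq);
 *	if (waitq_clear_prepost_locked(wq) != 0) {
 *		// lock was dropped and re-acquired: re-check state
 *		if (!waitq_valid(wq)) {
 *			waitq_unlock(wq);
 *			return;
 *		}
 *	}
 */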
3872  
3873  /**
3874   * clear all preposts originating from 'waitq'
3875   *
3876   * Conditions:
3877   *	'waitq' is not locked
3878   *	may disable and re-enable interrupts
3879   */
3880  void
3881  waitq_clear_prepost(struct waitq *waitq)
3882  {
3883  	assert(waitq_valid(waitq));
3884  	assert(!waitq_irq_safe(waitq));
3885  
3886  	waitq_lock(waitq);
3887  	/* it doesn't matter to us if the lock is dropped here */
3888  	(void)waitq_clear_prepost_locked(waitq);
3889  	waitq_unlock(waitq);
3890  }
3891  
3892  /**
3893   * return the waitq's prepost object ID (allocate if necessary)
3894   *
3895   * Conditions:
3896   *	'waitq' is unlocked
3897   */
3898  uint64_t
3899  waitq_get_prepost_id(struct waitq *waitq)
3900  {
3901  	struct wq_prepost *wqp;
3902  	uint64_t wqp_id = 0;
3903  
3904  	if (!waitq_valid(waitq)) {
3905  		return 0;
3906  	}
3907  
3908  	assert(!waitq_irq_safe(waitq));
3909  
3910  	waitq_lock(waitq);
3911  
3912  	if (!waitq_valid(waitq)) {
3913  		goto out_unlock;
3914  	}
3915  
3916  	if (waitq->waitq_prepost_id) {
3917  		wqp_id = waitq->waitq_prepost_id;
3918  		goto out_unlock;
3919  	}
3920  
3921  	/* don't hold a spinlock while allocating a prepost object */
3922  	waitq_unlock(waitq);
3923  
3924  	wqp = wq_prepost_alloc(WQP_WQ, 1);
3925  	if (!wqp) {
3926  		return 0;
3927  	}
3928  
3929  	/* re-acquire the waitq lock */
3930  	waitq_lock(waitq);
3931  
3932  	if (!waitq_valid(waitq)) {
3933  		wq_prepost_put(wqp);
3934  		wqp_id = 0;
3935  		goto out_unlock;
3936  	}
3937  
3938  	if (waitq->waitq_prepost_id) {
3939  		/* we were beat by someone else */
3940  		wq_prepost_put(wqp);
3941  		wqp_id = waitq->waitq_prepost_id;
3942  		goto out_unlock;
3943  	}
3944  
3945  	wqp->wqp_wq.wqp_wq_ptr = waitq;
3946  
3947  	wqp_set_valid(wqp);
3948  	wqp_id = wqp->wqp_prepostid.id;
3949  	waitq->waitq_prepost_id = wqp_id;
3950  
3951  	wq_prepost_put(wqp);
3952  
3953  out_unlock:
3954  	waitq_unlock(waitq);
3955  
3956  	return wqp_id;
3957  }
3958  
3959  
3960  static int
3961  waitq_inset_cb(struct waitq *waitq, void *ctx, struct waitq_link *link)
3962  {
3963  	uint64_t setid = *(uint64_t *)ctx;
3964  	int wqltype = wql_type(link);
3965  	(void)waitq;
3966  	if (wqltype == WQL_WQS && link->wql_setid.id == setid) {
3967  		wqdbg_v("  waitq already in set 0x%llx", setid);
3968  		return WQ_ITERATE_FOUND;
3969  	} else if (wqltype == WQL_LINK) {
3970  		/*
3971  		 * break out early if we see a link that points to the setid
3972  		 * in question. This saves us a step in the
3973  		 * iteration/recursion
3974  		 */
3975  		wqdbg_v("  waitq already in set 0x%llx (WQL_LINK)", setid);
3976  		if (link->wql_link.left_setid == setid ||
3977  		    link->wql_link.right_setid == setid) {
3978  			return WQ_ITERATE_FOUND;
3979  		}
3980  	}
3981  
3982  	return WQ_ITERATE_CONTINUE;
3983  }
3984  
3985  /**
3986   * determine if 'waitq' is a member of 'wqset'
3987   *
3988   * Conditions:
3989   *	neither 'waitq' nor 'wqset' is locked
3990   *	may disable and re-enable interrupts while locking 'waitq'
3991   */
3992  boolean_t
3993  waitq_member(struct waitq *waitq, struct waitq_set *wqset)
3994  {
3995  	kern_return_t kr = WQ_ITERATE_SUCCESS;
3996  	uint64_t setid;
3997  
3998  	if (!waitq_valid(waitq)) {
3999  		panic("Invalid waitq: %p", waitq);
4000  	}
4001  	assert(!waitq_irq_safe(waitq));
4002  
4003  	if (!waitqs_is_set(wqset)) {
4004  		return FALSE;
4005  	}
4006  
4007  	waitq_lock(waitq);
4008  
4009  	if (!waitqs_is_linked(wqset)) {
4010  		goto out_unlock;
4011  	}
4012  
4013  	setid = wqset->wqset_id;
4014  
4015  	/* fast path: most waitqs are members of only 1 set */
4016  	if (waitq->waitq_set_id == setid) {
4017  		waitq_unlock(waitq);
4018  		return TRUE;
4019  	}
4020  
4021  	/* walk the link table and look for the Set ID of wqset */
4022  	kr = walk_waitq_links(LINK_WALK_ONE_LEVEL, waitq, waitq->waitq_set_id,
4023  	    WQL_ALL, (void *)&setid, waitq_inset_cb);
4024  
4025  out_unlock:
4026  	waitq_unlock(waitq);
4027  	return kr == WQ_ITERATE_FOUND;
4028  }
4029  
4030  /**
4031   * Returns true if the given waitq is a member of at least 1 set
4032   */
4033  boolean_t
4034  waitq_in_set(struct waitq *waitq)
4035  {
4036  	struct waitq_link *link;
4037  	boolean_t inset = FALSE;
4038  
4039  	if (waitq_irq_safe(waitq)) {
4040  		return FALSE;
4041  	}
4042  
4043  	waitq_lock(waitq);
4044  
4045  	if (!waitq->waitq_set_id) {
4046  		goto out_unlock;
4047  	}
4048  
4049  	link = wql_get_link(waitq->waitq_set_id);
4050  	if (link) {
4051  		/* if we get here, the waitq is in _at_least_one_ set */
4052  		inset = TRUE;
4053  		wql_put_link(link);
4054  	} else {
4055  		/* we can just optimize this for next time */
4056  		waitq->waitq_set_id = 0;
4057  	}
4058  
4059  out_unlock:
4060  	waitq_unlock(waitq);
4061  	return inset;
4062  }
4063  
4064  
4065  /**
4066   * pre-allocate a waitq link structure from the link table
4067   *
4068   * Conditions:
4069   *	'waitq' is not locked
4070   *	may (rarely) block if link table needs to grow
4071   */
4072  uint64_t
4073  waitq_link_reserve(struct waitq *waitq)
4074  {
4075  	struct waitq_link *link;
4076  	uint64_t reserved_id = 0;
4077  
4078  	assert(get_preemption_level() == 0 && waitq_wait_possible(current_thread()));
4079  
4080  	/*
4081  	 * We've asserted that the caller can block, so we enforce a
4082  	 * minimum-free table element policy here.
4083  	 */
4084  	wql_ensure_free_space();
4085  
4086  	(void)waitq;
4087  	link = wql_alloc_link(LT_RESERVED);
4088  	if (!link) {
4089  		return 0;
4090  	}
4091  
4092  	reserved_id = link->wql_setid.id;
4093  
4094  	return reserved_id;
4095  }
4096  
4097  /**
4098   * release a pre-allocated waitq link structure
4099   */
4100  void
4101  waitq_link_release(uint64_t id)
4102  {
4103  	struct waitq_link *link;
4104  
4105  	if (id == 0) {
4106  		return;
4107  	}
4108  
4109  	link = wql_get_reserved(id, WQL_LINK);
4110  	if (!link) {
4111  		return;
4112  	}
4113  
4114  	/*
4115  	 * if we successfully got a link object, then we know
4116  	 * it's not been marked valid, and can be released with
4117  	 * a standard wql_put_link() which should free the element.
4118  	 */
4119  	wql_put_link(link);
4120  #if CONFIG_LTABLE_STATS
4121  	g_wqlinktable.nreserved_releases += 1;
4122  #endif
4123  }
4124  
4125  /**
4126   * link 'waitq' to the set identified by 'setid' using the 'link' structure
4127   *
4128   * Conditions:
4129   *	'waitq' is locked
4130   *	caller should have a reference to the 'link' object
4131   */
4132  static kern_return_t
4133  waitq_link_internal(struct waitq *waitq,
4134      uint64_t setid, struct waitq_link *link)
4135  {
4136  	struct waitq_link *qlink;
4137  	kern_return_t kr;
4138  
4139  	assert(waitq_held(waitq));
4140  	assert(setid != 0);
4141  	assert(setid != WQSET_NOT_LINKED);
4142  
4143  	/*
4144  	 * If the waitq_set_id field is empty, then this waitq is not
4145  	 * a member of any set. All we have to do is update the
4146  	 * field.
4147  	 */
4148  	if (!waitq->waitq_set_id) {
4149  		waitq->waitq_set_id = setid;
4150  		return KERN_SUCCESS;
4151  	}
4152  
4153  	qlink = wql_get_link(waitq->waitq_set_id);
4154  	if (!qlink) {
4155  		/*
4156  		 * The set to which this wait queue belonged has been
4157  		 * destroyed / invalidated. We can re-use the waitq field.
4158  		 */
4159  		waitq->waitq_set_id = setid;
4160  		return KERN_SUCCESS;
4161  	}
4162  	wql_put_link(qlink);
4163  
4164  	/*
4165  	 * Check to see if it's already a member of the set.
4166  	 *
4167  	 * TODO: check for cycles!
4168  	 */
4169  	kr = walk_waitq_links(LINK_WALK_ONE_LEVEL, waitq, waitq->waitq_set_id,
4170  	    WQL_ALL, (void *)&setid, waitq_inset_cb);
4171  	if (kr == WQ_ITERATE_FOUND) {
4172  		return KERN_ALREADY_IN_SET;
4173  	}
4174  
4175  	/*
4176  	 * This wait queue is a member of at least one set already,
4177  	 * and _not_ a member of the given set. Use our previously
4178  	 * allocated link object, and hook it up to the wait queue.
4179  	 * Note that it's possible that one or more of the wait queue sets to
4180  	 * which the wait queue belongs was invalidated before we allocated
4181  	 * this link object. That's OK because the next time we use that
4182  	 * object we'll just ignore it.
4183  	 */
4184  	link->wql_link.left_setid = setid;
4185  	link->wql_link.right_setid = waitq->waitq_set_id;
4186  	wql_mkvalid(link);
4187  
4188  	waitq->waitq_set_id = link->wql_setid.id;
4189  
4190  	return KERN_SUCCESS;
4191  }
4192  
4193  /**
4194   * link 'waitq' to 'wqset'
4195   *
4196   * Conditions:
4197   *	if 'lock_state' contains WAITQ_SHOULD_LOCK, 'waitq' must be unlocked.
4198   *	Otherwise, 'waitq' must be locked.
4199   *
4200   *	may (rarely) block on link table allocation if the table has to grow,
4201   *	and no 'reserved_link' object is passed.
4202   *
4203   *	may block and acquire wqset lock if the wqset passed has no link.
4204   *
4205   * Notes:
4206   *	The caller can guarantee that this function will never block by
4207   *	- pre-allocating a link table object and passing its ID in 'reserved_link'
4208   *      - and pre-allocating the waitq set link by calling waitq_set_lazy_init_link.
4209   *      It is not possible to provide a reserved_link without having also linked
4210   *	the wqset.
4211   */
4212  kern_return_t
4213  waitq_link(struct waitq *waitq, struct waitq_set *wqset,
4214      waitq_lock_state_t lock_state, uint64_t *reserved_link)
4215  {
4216  	kern_return_t kr;
4217  	struct waitq_link *link;
4218  	int should_lock = (lock_state == WAITQ_SHOULD_LOCK);
4219  
4220  	if (!waitq_valid(waitq) || waitq_irq_safe(waitq)) {
4221  		panic("Invalid waitq: %p", waitq);
4222  	}
4223  
4224  	if (!waitqs_is_set(wqset)) {
4225  		return KERN_INVALID_ARGUMENT;
4226  	}
4227  
4228  	if (!reserved_link || *reserved_link == 0) {
4229  		if (!waitqs_is_linked(wqset)) {
4230  			waitq_set_lazy_init_link(wqset);
4231  		}
4232  	}
4233  
4234  	wqdbg_v("Link waitq %p to wqset 0x%llx",
4235  	    (void *)VM_KERNEL_UNSLIDE_OR_PERM(waitq), wqset->wqset_id);
4236  
4237  	/*
4238  	 * We _might_ need a new link object here, so we'll grab outside
4239  	 * the lock because the alloc call _might_ block.
4240  	 *
4241  	 * If the caller reserved a link beforehand, then wql_get_link
4242  	 * is guaranteed not to block because the caller holds an extra
4243  	 * reference to the link which, in turn, holds a reference to the
4244  	 * link table.
4245  	 */
4246  	if (reserved_link && *reserved_link != 0) {
4247  		link = wql_get_reserved(*reserved_link, WQL_LINK);
4248  		/* always consume the caller's reference */
4249  		*reserved_link = 0;
4250  	} else {
4251  		link = wql_alloc_link(WQL_LINK);
4252  	}
4253  	if (!link) {
4254  		return KERN_NO_SPACE;
4255  	}
4256  
4257  	if (should_lock) {
4258  		waitq_lock(waitq);
4259  	}
4260  
4261  	kr = waitq_link_internal(waitq, wqset->wqset_id, link);
4262  
4263  	if (should_lock) {
4264  		waitq_unlock(waitq);
4265  	}
4266  
4267  	wql_put_link(link);
4268  
4269  	return kr;
4270  }
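
/*
 * Illustrative usage sketch (comment only, not part of the build): the
 * never-blocking pattern described in the Notes above. 'wq', 'wqset',
 * 'kr' and 'rlink' are placeholders:
 *
 *	uint64_t rlink;
 *
 *	if (!waitqs_is_linked(wqset))
 *		waitq_set_lazy_init_link(wqset);	// may block
 *	rlink = waitq_link_reserve(wq);			// may block
 *	...take other (spin)locks...
 *	kr = waitq_link(wq, wqset, WAITQ_SHOULD_LOCK, &rlink);
 *	waitq_link_release(rlink);	// no-op if the link was consumed
 */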
4271  
4272  /**
4273   * helper: unlink 'waitq' from waitq set identified by 'setid'
4274   *         this function also prunes invalid objects from the tree
4275   *
4276   * Conditions:
4277   *	MUST be called from walk_waitq_links link table walk
4278   *	'waitq' is locked
4279   *
4280   * Notes:
4281   *	This is a helper function which compresses the link table by culling
4282   *	unused or unnecessary links. See comments below for different
4283   *	scenarios.
4284   */
4285  static inline int
4286  waitq_maybe_remove_link(struct waitq *waitq,
4287      uint64_t setid,
4288      struct waitq_link *parent,
4289      struct waitq_link *left,
4290      struct waitq_link *right)
4291  {
4292  	uint64_t *wq_setid = &waitq->waitq_set_id;
4293  
4294  	/*
4295  	 * There are two scenarios:
4296  	 *
4297  	 * Scenario 1:
4298  	 * --------------------------------------------------------------------
4299  	 * waitq->waitq_set_id == parent
4300  	 *
4301  	 *         parent(LINK)
4302  	 *           /    \
4303  	 *          /      \
4304  	 *         /        \
4305  	 *  L(LINK/WQS_l)   R(LINK/WQS_r)
4306  	 *
4307  	 * In this scenario, we assert that the original waitq points to the
4308  	 * parent link we were passed in.  If WQS_l (or WQS_r) is the waitq
4309  	 * set we're looking for, we can set the corresponding parent
4310  	 * link id (left or right) to 0.  To compress the tree, we can reset the
4311  	 * waitq_set_id of the original waitq to point to the side of the
4312  	 * parent that is still valid. We then discard the parent link object.
4313  	 */
4314  	if (*wq_setid == parent->wql_setid.id) {
4315  		if (!left && !right) {
4316  			/* completely invalid children */
4317  			wql_invalidate(parent);
4318  			wqdbg_v("S1, L+R");
4319  			*wq_setid = 0;
4320  			return WQ_ITERATE_INVALID;
4321  		} else if (!left || left->wql_setid.id == setid) {
4322  			/*
4323  			 * if the left side matches, we know it points either to the
4324  			 * WQS we're unlinking, or to an invalid object:
4325  			 * no need to invalidate it
4326  			 */
4327  			*wq_setid = right ? right->wql_setid.id : 0;
4328  			wql_invalidate(parent);
4329  			wqdbg_v("S1, L");
4330  			return left ? WQ_ITERATE_UNLINKED : WQ_ITERATE_INVALID;
4331  		} else if (!right || right->wql_setid.id == setid) {
4332  			/*
4333  			 * if the right side matches, we know it points either to the
4334  			 * WQS we're unlinking, or to an invalid object:
4335  			 * no need to invalidate it
4336  			 */
4337  			*wq_setid = left ? left->wql_setid.id : 0;
4338  			wql_invalidate(parent);
4339  			wqdbg_v("S1, R");
4340  			return right ? WQ_ITERATE_UNLINKED : WQ_ITERATE_INVALID;
4341  		}
4342  	}
4343  
4344  	/*
4345  	 * the tree walk starts at the top-of-tree and moves down,
4346  	 * so these are safe asserts.
4347  	 */
4348  	assert(left || right); /* one of them has to be valid at this point */
4349  
4350  	/*
4351  	 * Scenario 2:
4352  	 * --------------------------------------------------------------------
4353  	 * waitq->waitq_set_id == ... (OR parent)
4354  	 *
4355  	 *                    ...
4356  	 *                     |
4357  	 *                   parent
4358  	 *                   /    \
4359  	 *                  /      \
4360  	 *              L(LINK)     R(LINK)
4361  	 *               /\             /\
4362  	 *              /  \           /  \
4363  	 *             /    \       Rl(*)  Rr(*)
4364  	 *         Ll(WQS)  Lr(WQS)
4365  	 *
4366  	 * In this scenario, a leaf node of either the left or right side
4367  	 * could be the wait queue set we're looking to unlink. We also handle
4368  	 * the case where one of these links is invalid.  If a leaf node is
4369  	 * invalid or it's the set we're looking for, we can safely remove the
4370  	 * middle link (left or right) and point the parent link directly to
4371  	 * the remaining leaf node.
4372  	 */
4373  	if (left && wql_type(left) == WQL_LINK) {
4374  		uint64_t Ll, Lr;
4375  		struct waitq_link *linkLl, *linkLr;
4376  		assert(left->wql_setid.id != setid);
4377  		Ll = left->wql_link.left_setid;
4378  		Lr = left->wql_link.right_setid;
4379  		linkLl = wql_get_link(Ll);
4380  		linkLr = wql_get_link(Lr);
4381  		if (!linkLl && !linkLr) {
4382  			/*
4383  			 * The left object points to two invalid objects!
4384  			 * We can invalidate the left w/o touching the parent.
4385  			 */
4386  			wql_invalidate(left);
4387  			wqdbg_v("S2, Ll+Lr");
4388  			return WQ_ITERATE_INVALID;
4389  		} else if (!linkLl || Ll == setid) {
4390  			/* Ll is invalid and/or the wait queue set we're looking for */
4391  			parent->wql_link.left_setid = Lr;
4392  			wql_invalidate(left);
4393  			wql_put_link(linkLl);
4394  			wql_put_link(linkLr);
4395  			wqdbg_v("S2, Ll");
4396  			return linkLl ? WQ_ITERATE_UNLINKED : WQ_ITERATE_INVALID;
4397  		} else if (!linkLr || Lr == setid) {
4398  			/* Lr is invalid and/or the wait queue set we're looking for */
4399  			parent->wql_link.left_setid = Ll;
4400  			wql_invalidate(left);
4401  			wql_put_link(linkLr);
4402  			wql_put_link(linkLl);
4403  			wqdbg_v("S2, Lr");
4404  			return linkLr ? WQ_ITERATE_UNLINKED : WQ_ITERATE_INVALID;
4405  		}
4406  		wql_put_link(linkLl);
4407  		wql_put_link(linkLr);
4408  	}
4409  
4410  	if (right && wql_type(right) == WQL_LINK) {
4411  		uint64_t Rl, Rr;
4412  		struct waitq_link *linkRl, *linkRr;
4413  		assert(right->wql_setid.id != setid);
4414  		Rl = right->wql_link.left_setid;
4415  		Rr = right->wql_link.right_setid;
4416  		linkRl = wql_get_link(Rl);
4417  		linkRr = wql_get_link(Rr);
4418  		if (!linkRl && !linkRr) {
4419  			/*
4420  			 * The right object points to two invalid objects!
4421  			 * We can invalidate the right w/o touching the parent.
4422  			 */
4423  			wql_invalidate(right);
4424  			wqdbg_v("S2, Rl+Rr");
4425  			return WQ_ITERATE_INVALID;
4426  		} else if (!linkRl || Rl == setid) {
4427  			/* Rl is invalid and/or the wait queue set we're looking for */
4428  			parent->wql_link.right_setid = Rr;
4429  			wql_invalidate(right);
4430  			wql_put_link(linkRl);
4431  			wql_put_link(linkRr);
4432  			wqdbg_v("S2, Rl");
4433  			return linkRl ? WQ_ITERATE_UNLINKED : WQ_ITERATE_INVALID;
4434  		} else if (!linkRr || Rr == setid) {
4435  			/* Rr is invalid and/or the wait queue set we're looking for */
4436  			parent->wql_link.right_setid = Rl;
4437  			wql_invalidate(right);
4438  			wql_put_link(linkRl);
4439  			wql_put_link(linkRr);
4440  			wqdbg_v("S2, Rr");
4441  			return linkRr ? WQ_ITERATE_UNLINKED : WQ_ITERATE_INVALID;
4442  		}
4443  		wql_put_link(linkRl);
4444  		wql_put_link(linkRr);
4445  	}
4446  
4447  	return WQ_ITERATE_CONTINUE;
4448  }
4449  
4450  /**
4451   * link table walk callback that unlinks 'waitq' from 'ctx->setid'
4452   *
4453   * Conditions:
4454   *	called from walk_waitq_links
4455   *	'waitq' is locked
4456   *
4457   * Notes:
4458   *	uses waitq_maybe_remove_link() to compress the linktable and
4459   *	perform the actual unlinking
4460   */
4461  static int
4462  waitq_unlink_cb(struct waitq *waitq, void *ctx,
4463      struct waitq_link *link)
4464  {
4465  	uint64_t setid = *((uint64_t *)ctx);
4466  	struct waitq_link *right, *left;
4467  	int ret = 0;
4468  
4469  	if (wql_type(link) != WQL_LINK) {
4470  		return WQ_ITERATE_CONTINUE;
4471  	}
4472  
4473  	do {
4474  		left  = wql_get_link(link->wql_link.left_setid);
4475  		right = wql_get_link(link->wql_link.right_setid);
4476  
4477  		ret = waitq_maybe_remove_link(waitq, setid, link, left, right);
4478  
4479  		wql_put_link(left);
4480  		wql_put_link(right);
4481  
4482  		if (!wql_is_valid(link)) {
4483  			return WQ_ITERATE_INVALID;
4484  		}
4485  		/* A ret value of UNLINKED will break us out of table walk */
4486  	} while (ret == WQ_ITERATE_INVALID);
4487  
4488  	return ret;
4489  }
4490  
4491  
4492  /**
4493   * undo/remove a prepost from 'ctx' (waitq) to 'wqset'
4494   *
4495   * Conditions:
4496   *	Called from wq_prepost_foreach_locked OR wq_prepost_iterate
4497   *	'wqset' may be NULL
4498   *	(ctx)->unlink_wqset is locked
4499   */
4500  static int
4501  waitq_unlink_prepost_cb(struct waitq_set __unused *wqset, void *ctx,
4502      struct wq_prepost *wqp, struct waitq *waitq)
4503  {
4504  	struct wq_unlink_ctx *ulctx = (struct wq_unlink_ctx *)ctx;
4505  
4506  	if (waitq != ulctx->unlink_wq) {
4507  		return WQ_ITERATE_CONTINUE;
4508  	}
4509  
4510  	if (wqp_type(wqp) == WQP_WQ &&
4511  	    wqp->wqp_prepostid.id == ulctx->unlink_wqset->wqset_prepost_id) {
4512  		/* this is the only prepost on this wait queue set */
4513  		wqdbg_v("unlink wqp (WQ) 0x%llx", wqp->wqp_prepostid.id);
4514  		ulctx->unlink_wqset->wqset_prepost_id = 0;
4515  		return WQ_ITERATE_BREAK;
4516  	}
4517  
4518  	assert(wqp_type(wqp) == WQP_POST);
4519  
4520  	/*
4521  	 * The prepost object 'wqp' points to a waitq which should no longer
4522  	 * be preposted to 'ulctx->unlink_wqset'. We can remove the prepost
4523  	 * object from the list and break out of the iteration. Using the
4524  	 * context object in this way allows this same callback function to be
4525  	 * used from both wq_prepost_foreach_locked and wq_prepost_iterate.
4526  	 */
4527  	wq_prepost_remove(ulctx->unlink_wqset, wqp);
4528  	return WQ_ITERATE_BREAK;
4529  }
4530  
4531  /**
4532   * unlink 'waitq' from 'wqset'
4533   *
4534   * Conditions:
4535   *	'waitq' is locked
4536   *	'wqset' is _not_ locked
4537   *	may (rarely) spin in prepost clear and drop/re-acquire 'waitq' lock
4538   *	(see waitq_clear_prepost_locked)
4539   */
4540  static kern_return_t
4541  waitq_unlink_locked(struct waitq *waitq,
4542      struct waitq_set *wqset)
4543  {
4544  	uint64_t setid;
4545  	kern_return_t kr;
4546  
4547  	assert(!waitq_irq_safe(waitq));
4548  
4549  	if (waitq->waitq_set_id == 0) {
4550  		/*
4551  		 * TODO:
4552  		 * it doesn't belong to anyone, and it has a prepost object?
4553  		 * This is an artifact of not cleaning up after kqueues when
4554  		 * they prepost into select sets...
4555  		 */
4556  		if (waitq->waitq_prepost_id != 0) {
4557  			(void)waitq_clear_prepost_locked(waitq);
4558  		}
4559  		return KERN_NOT_IN_SET;
4560  	}
4561  
4562  	if (!waitqs_is_linked(wqset)) {
4563  		/*
4564  		 * No link has been allocated for the wqset,
4565  		 * so no waitq could have been linked to it.
4566  		 */
4567  		return KERN_NOT_IN_SET;
4568  	}
4569  
4570  	setid = wqset->wqset_id;
4571  
4572  	if (waitq->waitq_set_id == setid) {
4573  		waitq->waitq_set_id = 0;
4574  		/*
4575  		 * This was the only set to which the waitq belonged: we can
4576  		 * safely release the waitq's prepost object. It doesn't
4577  		 * matter if this function drops and re-acquires the lock
4578  		 * because we're not manipulating waitq state any more.
4579  		 */
4580  		(void)waitq_clear_prepost_locked(waitq);
4581  		return KERN_SUCCESS;
4582  	}
4583  
4584  	/*
4585  	 * The waitq was a member of more than 1 set, so we need to
4586  	 * handle potentially compressing the link table, and
4587  	 * adjusting the waitq->waitq_set_id value.
4588  	 *
4589  	 * Note: we can't free the waitq's associated prepost object (if any)
4590  	 *       because it may be in use by the one or more _other_ sets to
4591  	 *       which this queue belongs.
4592  	 *
4593  	 * Note: This function only handles a single level of the queue linkage.
4594  	 *       Removing a waitq from a set to which it does not directly
4595  	 *       belong is undefined. For example, if a waitq belonged to set
4596  	 *       A, and set A belonged to set B, you can't remove the waitq
4597  	 *       from set B.
4598  	 */
4599  	kr = walk_waitq_links(LINK_WALK_ONE_LEVEL, waitq, waitq->waitq_set_id,
4600  	    WQL_LINK, (void *)&setid, waitq_unlink_cb);
4601  
4602  	if (kr == WQ_ITERATE_UNLINKED) {
4603  		struct wq_unlink_ctx ulctx;
4604  
4605  		kr = KERN_SUCCESS; /* found it and dis-associated it */
4606  
4607  		/* don't look for preposts if it's not prepost-enabled */
4608  		if (!wqset->wqset_q.waitq_prepost) {
4609  			goto out;
4610  		}
4611  
4612  		assert(!waitq_irq_safe(&wqset->wqset_q));
4613  
4614  		waitq_set_lock(wqset);
4615  		/*
4616  		 * clear out any prepost from waitq into wqset
4617  		 * TODO: this could be more efficient than a linear search of
4618  		 *       the waitq set's prepost list.
4619  		 */
4620  		ulctx.unlink_wq = waitq;
4621  		ulctx.unlink_wqset = wqset;
4622  		(void)wq_prepost_iterate(wqset->wqset_prepost_id, (void *)&ulctx,
4623  		    waitq_unlink_prepost_cb);
4624  		waitq_set_unlock(wqset);
4625  	} else {
4626  		kr = KERN_NOT_IN_SET; /* waitq is _not_ associated with wqset */
4627  	}
4628  
4629  out:
4630  	return kr;
4631  }
4632  
4633  /**
4634   * unlink 'waitq' from 'wqset'
4635   *
4636   * Conditions:
4637   *	neither 'waitq' nor 'wqset' is locked
4638   *	may disable and re-enable interrupts
4639   *	may (rarely) spin in prepost clear
4640   *	(see waitq_clear_prepost_locked)
4641   */
4642  kern_return_t
4643  waitq_unlink(struct waitq *waitq, struct waitq_set *wqset)
4644  {
4645  	kern_return_t kr = KERN_SUCCESS;
4646  
4647  	assert(waitqs_is_set(wqset));
4648  
4649  	/*
4650  	 * we allow the waitq to be invalid because the caller may be trying
4651  	 * to clear out old/dirty state
4652  	 */
4653  	if (!waitq_valid(waitq)) {
4654  		return KERN_INVALID_ARGUMENT;
4655  	}
4656  
4657  	wqdbg_v("unlink waitq %p from set 0x%llx",
4658  	    (void *)VM_KERNEL_UNSLIDE_OR_PERM(waitq), wqset->wqset_id);
4659  
4660  	assert(!waitq_irq_safe(waitq));
4661  
4662  	waitq_lock(waitq);
4663  
4664  	kr = waitq_unlink_locked(waitq, wqset);
4665  
4666  	waitq_unlock(waitq);
4667  	return kr;
4668  }
4669  
4670  /**
4671   * unlink a waitq from a waitq set, but reference the waitq by its prepost ID
4672   *
4673   * Conditions:
4674   *	'wqset' is unlocked
4675   *	wqp_id may be valid or invalid
4676   */
4677  void
4678  waitq_unlink_by_prepost_id(uint64_t wqp_id, struct waitq_set *wqset)
4679  {
4680  	struct wq_prepost *wqp;
4681  
4682  	disable_preemption();
4683  	wqp = wq_prepost_get(wqp_id);
4684  	if (wqp) {
4685  		struct waitq *wq;
4686  
4687  		wq = wqp->wqp_wq.wqp_wq_ptr;
4688  
4689  		/*
4690  		 * lock the waitq, then release our prepost ID reference, then
4691  		 * unlink the waitq from the wqset: this ensures that we don't
4692  		 * hold a prepost ID reference during the unlink, but we also
4693  		 * complete the unlink operation atomically to avoid a race
4694  		 * with waitq_unlink[_all].
4695  		 */
4696  		assert(!waitq_irq_safe(wq));
4697  
4698  		waitq_lock(wq);
4699  		wq_prepost_put(wqp);
4700  
4701  		if (!waitq_valid(wq)) {
4702  			/* someone already tore down this waitq! */
4703  			waitq_unlock(wq);
4704  			enable_preemption();
4705  			return;
4706  		}
4707  
4708  		/* this _may_ drop the wq lock, but that's OK */
4709  		waitq_unlink_locked(wq, wqset);
4710  
4711  		waitq_unlock(wq);
4712  	}
4713  	enable_preemption();
4714  	return;
4715  }
4716  
4717  
4718  /**
4719   * reference and lock a waitq by its prepost ID
4720   *
4721   * Conditions:
4722   *	wqp_id may be valid or invalid
4723   *
4724   * Returns:
4725   *	a locked waitq if wqp_id was valid
4726   *	NULL on failure
4727   */
4728  struct waitq *
4729  waitq_lock_by_prepost_id(uint64_t wqp_id)
4730  {
4731  	struct waitq *wq = NULL;
4732  	struct wq_prepost *wqp;
4733  
4734  	disable_preemption();
4735  	wqp = wq_prepost_get(wqp_id);
4736  	if (wqp) {
4737  		wq = wqp->wqp_wq.wqp_wq_ptr;
4738  
4739  		assert(!waitq_irq_safe(wq));
4740  
4741  		waitq_lock(wq);
4742  		wq_prepost_put(wqp);
4743  
4744  		if (!waitq_valid(wq)) {
4745  			/* someone already tore down this waitq! */
4746  			waitq_unlock(wq);
4747  			enable_preemption();
4748  			return NULL;
4749  		}
4750  	}
4751  	enable_preemption();
4752  	return wq;
4753  }
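
/*
 * Illustrative usage sketch (comment only, not part of the build):
 * resolving a prepost ID saved earlier (e.g. from waitq_get_prepost_id())
 * back into a locked, validated waitq. 'wq' and 'wqp_id' are placeholders:
 *
 *	struct waitq *wq = waitq_lock_by_prepost_id(wqp_id);
 *
 *	if (wq != NULL) {
 *		...operate on the locked, valid 'wq'...
 *		waitq_unlock(wq);
 *	}
 */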
4754  
4755  
4756  /**
4757   * unlink 'waitq' from all sets to which it belongs
4758   *
4759   * Conditions:
4760   *	'waitq' is locked on entry
4761   *	returns with waitq lock dropped
4762   *
4763   * Notes:
4764   *	may (rarely) spin (see waitq_clear_prepost_locked)
4765   */
4766  kern_return_t
4767  waitq_unlink_all_unlock(struct waitq *waitq)
4768  {
4769  	uint64_t old_set_id = 0;
4770  	wqdbg_v("unlink waitq %p from all sets",
4771  	    (void *)VM_KERNEL_UNSLIDE_OR_PERM(waitq));
4772  	assert(!waitq_irq_safe(waitq));
4773  
4774  	/* it's not a member of any sets */
4775  	if (waitq->waitq_set_id == 0) {
4776  		waitq_unlock(waitq);
4777  		return KERN_SUCCESS;
4778  	}
4779  
4780  	old_set_id = waitq->waitq_set_id;
4781  	waitq->waitq_set_id = 0;
4782  
4783  	/*
4784  	 * invalidate the prepost entry for this waitq.
4785  	 * This may drop and re-acquire the waitq lock, but that's OK because
4786  	 * if it was added to another set and preposted to that set in the
4787  	 * time we drop the lock, the state will remain consistent.
4788  	 */
4789  	(void)waitq_clear_prepost_locked(waitq);
4790  
4791  	waitq_unlock(waitq);
4792  
4793  	if (old_set_id) {
4794  		/*
4795  		 * Walk the link table and invalidate each LINK object that
4796  		 * used to connect this waitq to one or more sets: this works
4797  		 * because WQL_LINK objects are private to each wait queue
4798  		 */
4799  		(void)walk_waitq_links(LINK_WALK_ONE_LEVEL, waitq, old_set_id,
4800  		    WQL_LINK, NULL, waitq_unlink_all_cb);
4801  	}
4802  
4803  	return KERN_SUCCESS;
4804  }
4805  
4806  /**
4807   * unlink 'waitq' from all sets to which it belongs
4808   *
4809   * Conditions:
4810   *	'waitq' is not locked
4811   *	may disable and re-enable interrupts
4812   *	may (rarely) spin
4813   *	(see waitq_unlink_all_locked, waitq_clear_prepost_locked)
4814   */
4815  kern_return_t
4816  waitq_unlink_all(struct waitq *waitq)
4817  {
4818  	kern_return_t kr = KERN_SUCCESS;
4819  
4820  	if (!waitq_valid(waitq)) {
4821  		panic("Invalid waitq: %p", waitq);
4822  	}
4823  
4824  	assert(!waitq_irq_safe(waitq));
4825  	waitq_lock(waitq);
4826  	if (!waitq_valid(waitq)) {
4827  		waitq_unlock(waitq);
4828  		return KERN_SUCCESS;
4829  	}
4830  
4831  	kr = waitq_unlink_all_unlock(waitq);
4832  	/* waitq unlocked and set links deallocated */
4833  
4834  	return kr;
4835  }
4836  
4837  
4838  /**
4839   * unlink all waitqs from 'wqset'
4840   *
4841   * Conditions:
4842   *	'wqset' is locked on entry
4843   *	'wqset' is unlocked on exit and spl is restored
4844   *
4845   * Note:
4846   *	may (rarely) spin/block (see waitq_clear_prepost_locked)
4847   */
4848  kern_return_t
4849  waitq_set_unlink_all_unlock(struct waitq_set *wqset)
4850  {
4851  	struct waitq_link *link;
4852  	uint64_t prepost_id;
4853  
4854  	wqdbg_v("unlink all queues from set 0x%llx", wqset->wqset_id);
4855  
4856  	/*
4857  	 * This operation does not require interaction with any of the set's
4858  	 * constituent wait queues. All we have to do is invalidate the SetID
4859  	 */
4860  
4861  	if (waitqs_is_linked(wqset)) {
4862  		/* invalidate and re-alloc the link object first */
4863  		link = wql_get_link(wqset->wqset_id);
4864  
4865  		/* we may have raced with a waitq_set_deinit: handle this */
4866  		if (!link) {
4867  			waitq_set_unlock(wqset);
4868  			return KERN_SUCCESS;
4869  		}
4870  
4871  		wql_invalidate(link);
4872  
4873  		/* re-alloc the object to get a new generation ID */
4874  		wql_realloc_link(link, WQL_WQS);
4875  		link->wql_wqs.wql_set = wqset;
4876  
4877  		wqset->wqset_id = link->wql_setid.id;
4878  		wql_mkvalid(link);
4879  		wql_put_link(link);
4880  	}
4881  
4882  	/* clear any preposts attached to this set */
4883  	prepost_id = 0;
4884  	if (wqset->wqset_q.waitq_prepost && wqset->wqset_prepost_id) {
4885  		prepost_id = wqset->wqset_prepost_id;
4886  	}
4887  	/* else { TODO: notify kqueue subsystem? } */
4888  	wqset->wqset_prepost_id = 0;
4889  
4890  	/*
4891  	 * clear set linkage and prepost object associated with this set:
4892  	 * waitq sets may prepost to other sets if, for example, they are
4893  	 * associated with a kqueue which is in a select set.
4894  	 *
4895  	 * This releases all the set link objects
4896  	 * (links to other sets to which this set was previously added)
4897  	 */
4898  	waitq_unlink_all_unlock(&wqset->wqset_q);
4899  	/* wqset->wqset_q unlocked */
4900  
4901  	/* drop / unlink all the prepost table objects */
4902  	if (prepost_id) {
4903  		(void)wq_prepost_iterate(prepost_id, NULL,
4904  		    wqset_clear_prepost_chain_cb);
4905  	}
4906  
4907  	return KERN_SUCCESS;
4908  }
4909  
4910  /**
4911   * unlink all waitqs from 'wqset'
4912   *
4913   * Conditions:
4914   *	'wqset' is not locked
4915   *	may (rarely) spin/block (see waitq_clear_prepost_locked)
4916   */
4917  kern_return_t
4918  waitq_set_unlink_all(struct waitq_set *wqset)
4919  {
4920  	assert(waitqs_is_set(wqset));
4921  	assert(!waitq_irq_safe(&wqset->wqset_q));
4922  
4923  	waitq_set_lock(wqset);
4924  	return waitq_set_unlink_all_unlock(wqset);
4925  	/* wqset unlocked and set links and preposts deallocated */
4926  }
4927  
4928  static int
4929  waitq_prepost_reserve_cb(struct waitq *waitq, void *ctx,
4930      struct waitq_link *link)
4931  {
4932  	uint32_t *num = (uint32_t *)ctx;
4933  	(void)waitq;
4934  
4935  	/*
4936  	 * In the worst case, we'll have to allocate 2 prepost objects
4937  	 * per waitq set (if the set was already preposted by another
4938  	 * waitq).
4939  	 */
4940  	if (wql_type(link) == WQL_WQS) {
4941  		/*
4942  		 * check to see if the associated waitq actually supports
4943  		 * preposting
4944  		 */
4945  		if (waitq_set_can_prepost(link->wql_wqs.wql_set)) {
4946  			*num += 2;
4947  		}
4948  	}
4949  	return WQ_ITERATE_CONTINUE;
4950  }
4951  
4952  static int
4953  waitq_alloc_prepost_reservation(int nalloc, struct waitq *waitq,
4954      int *did_unlock, struct wq_prepost **wqp)
4955  {
4956  	struct wq_prepost *tmp;
4957  	struct wqp_cache *cache;
4958  
4959  	*did_unlock = 0;
4960  
4961  	/*
4962  	 * Before we unlock the waitq, check the per-processor prepost object
4963  	 * cache to see if there's enough there for us. If so, do the
4964  	 * allocation, keep the lock and save an entire iteration over the set
4965  	 * linkage!
4966  	 */
4967  	if (waitq) {
4968  #ifndef __DARLING__
4969  		disable_preemption();
4970  		cache = PERCPU_GET(wqp_cache);
4971  		if (nalloc <= (int)cache->avail) {
4972  			goto do_alloc;
4973  		}
4974  		enable_preemption();
4975  #endif // __DARLING__
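		/*
		 * Under __DARLING__ the per-CPU cache fast path above is
		 * compiled out, so this path always unlocks the waitq and
		 * allocates from the prepost table.
		 */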
4976  
4977  		/* unlock the waitq to perform the allocation */
4978  		*did_unlock = 1;
4979  		waitq_unlock(waitq);
4980  	}
4981  
4982  do_alloc:
4983  	tmp = wq_prepost_alloc(LT_RESERVED, nalloc);
4984  	if (!tmp) {
4985  		panic("Couldn't reserve %d preposts for waitq @%p (wqp@%p)",
4986  		    nalloc, waitq, *wqp);
4987  	}
4988  	if (*wqp) {
4989  		/* link the two lists */
4990  		int __assert_only rc;
4991  		rc = wq_prepost_rlink(tmp, *wqp);
4992  		assert(rc == nalloc);
4993  	}
4994  	*wqp = tmp;
4995  
4996  	/*
4997  	 * If the caller can block, then enforce a minimum-free table element
4998  	 * policy here. This helps ensure that we will have enough prepost
4999  	 * objects for callers such as selwakeup() that can be called with
5000  	 * spin locks held.
5001  	 */
5002  	if (get_preemption_level() == 0) {
5003  		wq_prepost_ensure_free_space();
5004  	}
5005  
5006  	if (waitq) {
5007  		if (*did_unlock == 0) {
5008  			/* decrement the preemption count if alloc from cache */
5009  			enable_preemption();
5010  		} else {
5011  			/* otherwise: re-lock the waitq */
5012  			waitq_lock(waitq);
5013  		}
5014  	}
5015  
5016  	return nalloc;
5017  }
5018  
5019  static int
5020  waitq_count_prepost_reservation(struct waitq *waitq, int extra, int keep_locked)
5021  {
5022  	int npreposts = 0;
5023  
5024  	/*
5025  	 * If the waitq is not currently part of a set, and we're not asked to
5026  	 * keep the waitq locked then we'll want to have 3 in reserve
5027  	 * keep the waitq locked, then we'll want to have 3 in reserve
5028  	 * just in case it becomes part of a set while we unlock and reserve.
5029  	 */
5030  	if (waitq->waitq_set_id == 0) {
5031  		npreposts = 3;
5032  	} else {
5033  		/* this queue has never been preposted before */
5034  		if (waitq->waitq_prepost_id == 0) {
5035  			npreposts = 3;
5036  		}
5037  
5038  		/*
5039  		 * Walk the set of table linkages associated with this waitq
5040  		 * and count the worst-case number of prepost objects that
5041  		 * may be needed during a wakeup_all. We can walk this without
5042  		 * locking each set along the way because the table-based IDs
5043  		 * disconnect us from the set pointers themselves, and the
5044  		 * table walking is careful to read the setid values only once.
5045  		 * Locking each set up the chain also doesn't guarantee that
5046  		 * their membership won't change between the time we unlock
5047  		 * that set and when we actually go to prepost, so our
5048  		 * situation is no worse than before and we've alleviated lock
5049  		 * contention on any sets to which this waitq belongs.
5050  		 */
5051  		(void)walk_waitq_links(LINK_WALK_FULL_DAG_UNLOCKED,
5052  		    waitq, waitq->waitq_set_id,
5053  		    WQL_WQS, (void *)&npreposts,
5054  		    waitq_prepost_reserve_cb);
5055  	}
5056  
5057  	if (extra > 0) {
5058  		npreposts += extra;
5059  	}
5060  
5061  	if (npreposts == 0 && !keep_locked) {
5062  		/*
5063  		 * If we get here, we were asked to reserve some prepost
5064  		 * objects for a waitq that's previously preposted, and is not
5065  		 * currently a member of any sets. We have also been
5066  		 * instructed to unlock the waitq when we're done. In this
5067  		 * case, we pre-allocated enough reserved objects to handle
5068  		 * the case where the waitq gets added to a single set when
5069  		 * the lock is released.
5070  		 */
5071  		npreposts = 3;
5072  	}
5073  
5074  	return npreposts;
5075  }
5076  
5077  
5078  /**
5079   * pre-allocate prepost objects for 'waitq'
5080   *
5081   * Conditions:
5082   *	'waitq' is not locked
5083   *
5084   * Returns:
5085   *	panic on error
5086   *
5087   *	0 on success, '*reserved' is set to the head of a singly-linked
5088   *	list of pre-allocated prepost objects.
5089   *
5090   * Notes:
5091   *	If 'lock_state' is WAITQ_KEEP_LOCKED, this function performs the pre-allocation
5092   *	atomically and returns 'waitq' locked.
5093   *
5094   *	This function attempts to pre-allocate precisely enough prepost
5095   *	objects based on the current set membership of 'waitq'. If the
5096   *	operation is performed atomically, then the caller
5097   *	is guaranteed to have enough pre-allocated prepost object to avoid
5098   *	any (rare) blocking in the wakeup path.
5099   */
5100  uint64_t
5101  waitq_prepost_reserve(struct waitq *waitq, int extra,
5102      waitq_lock_state_t lock_state)
5103  {
5104  	uint64_t reserved = 0;
5105  	uint64_t prev_setid = 0, prev_prepostid = 0;
5106  	struct wq_prepost *wqp = NULL;
5107  	int nalloc = 0, npreposts = 0;
5108  	int keep_locked = (lock_state == WAITQ_KEEP_LOCKED);
5109  	int unlocked = 0;
5110  
5111  	wqdbg_v("Attempting to reserve prepost linkages for waitq %p (extra:%d)",
5112  	    (void *)VM_KERNEL_UNSLIDE_OR_PERM(waitq), extra);
5113  
5114  	if (waitq == NULL && extra > 0) {
5115  		/*
5116  		 * Simple prepost object allocation:
5117  		 * we'll add 2 more because the waitq might need an object,
5118  		 * and the set itself may need a new POST object in addition
5119  		 * to the number of preposts requested by the caller
5120  		 */
5121  		nalloc = waitq_alloc_prepost_reservation(extra + 2, NULL,
5122  		    &unlocked, &wqp);
5123  		assert(nalloc == extra + 2);
5124  		return wqp->wqp_prepostid.id;
5125  	}
5126  
5127  	assert(lock_state == WAITQ_KEEP_LOCKED || lock_state == WAITQ_UNLOCK);
5128  
5129  	assert(!waitq_irq_safe(waitq));
5130  
5131  	waitq_lock(waitq);
5132  
5133  	/* remember the set ID that we started with */
5134  	prev_setid = waitq->waitq_set_id;
5135  	prev_prepostid = waitq->waitq_prepost_id;
5136  
5137  	/*
5138  	 * If the waitq is not part of a set, and we're asked to
5139  	 * keep the set locked, then we don't have to reserve
5140  	 * anything!
5141  	 */
5142  	if (prev_setid == 0 && keep_locked) {
5143  		goto out;
5144  	}
5145  
5146  	npreposts = waitq_count_prepost_reservation(waitq, extra, keep_locked);
5147  
5148  	/* nothing for us to do! */
5149  	if (npreposts == 0) {
5150  		if (keep_locked) {
5151  			goto out;
5152  		}
5153  		goto out_unlock;
5154  	}
5155  
5156  try_alloc:
5157  	/* this _may_ unlock and relock the waitq! */
5158  	nalloc = waitq_alloc_prepost_reservation(npreposts, waitq,
5159  	    &unlocked, &wqp);
5160  
5161  	if (!unlocked) {
5162  		/* allocation held the waitq lock: we're done! */
5163  		if (keep_locked) {
5164  			goto out;
5165  		}
5166  		goto out_unlock;
5167  	}
5168  
5169  	/*
5170  	 * Before we return, if the allocation had to unlock the waitq, we
5171  	 * must check one more time to see if we have enough. If not, we'll
5172  	 * try to allocate the difference. If the caller requests it, we'll
5173  	 * also leave the waitq locked so that the use of the pre-allocated
5174  	 * prepost objects can be guaranteed to be enough if a wakeup_all is
5175  	 * performed before unlocking the waitq.
5176  	 */
5177  
5178  	/*
5179  	 * If the waitq is no longer associated with a set, or if the waitq's
5180  	 * set/prepostid has not changed since we first walked its linkage,
5181  	 * we're done.
5182  	 */
5183  	if ((waitq->waitq_set_id == 0) ||
5184  	    (waitq->waitq_set_id == prev_setid &&
5185  	    waitq->waitq_prepost_id == prev_prepostid)) {
5186  		if (keep_locked) {
5187  			goto out;
5188  		}
5189  		goto out_unlock;
5190  	}
5191  
5192  	npreposts = waitq_count_prepost_reservation(waitq, extra, keep_locked);
5193  
5194  	if (npreposts > nalloc) {
5195  		prev_setid = waitq->waitq_set_id;
5196  		prev_prepostid = waitq->waitq_prepost_id;
5197  		npreposts = npreposts - nalloc; /* only allocate the diff */
5198  		goto try_alloc;
5199  	}
5200  
5201  	if (keep_locked) {
5202  		goto out;
5203  	}
5204  
5205  out_unlock:
5206  	waitq_unlock(waitq);
5207  out:
5208  	if (wqp) {
5209  		reserved = wqp->wqp_prepostid.id;
5210  	}
5211  
5212  	return reserved;
5213  }
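
/*
 * Illustrative usage sketch (comment only, not part of the build): the
 * reserve / wakeup / release pattern used by wakeup paths that must not
 * block while holding the waitq lock (waitq_wakeup64_one() below follows
 * this shape). 'wq' and 'reserved' are placeholders:
 *
 *	uint64_t reserved;
 *
 *	reserved = waitq_prepost_reserve(wq, 0, WAITQ_KEEP_LOCKED);
 *	// 'wq' is returned locked, with enough preposts for a wakeup_all
 *	...wakeup / prepost work that consumes from 'reserved'...
 *	waitq_unlock(wq);
 *	waitq_prepost_release_reserve(reserved);
 */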
5214  
5215  /**
5216   * release a linked list of prepost objects allocated via _prepost_reserve
5217   *
5218   * Conditions:
5219   *	may (rarely) spin waiting for prepost table growth memcpy
5220   */
5221  void
5222  waitq_prepost_release_reserve(uint64_t id)
5223  {
5224  	struct wq_prepost *wqp;
5225  
5226  	wqdbg_v("releasing reserved preposts starting at: 0x%llx", id);
5227  
5228  	wqp = wq_prepost_rfirst(id);
5229  	if (!wqp) {
5230  		return;
5231  	}
5232  
5233  	wq_prepost_release_rlist(wqp);
5234  }
5235  
5236  
5237  /**
5238   * clear all preposts from 'wqset'
5239   *
5240   * Conditions:
5241   *	'wqset' is not locked
5242   */
5243  void
5244  waitq_set_clear_preposts(struct waitq_set *wqset)
5245  {
5246  	uint64_t prepost_id;
5247  	spl_t spl;
5248  
5249  	assert(waitqs_is_set(wqset));
5250  
5251  	if (!wqset->wqset_q.waitq_prepost || !wqset->wqset_prepost_id) {
5252  		return;
5253  	}
5254  
5255  	wqdbg_v("Clearing all preposted queues on waitq_set: 0x%llx",
5256  	    wqset->wqset_id);
5257  
5258  	if (waitq_irq_safe(&wqset->wqset_q)) {
5259  		spl = splsched();
5260  	}
5261  	waitq_set_lock(wqset);
5262  	prepost_id = wqset->wqset_prepost_id;
5263  	wqset->wqset_prepost_id = 0;
5264  	waitq_set_unlock(wqset);
5265  	if (waitq_irq_safe(&wqset->wqset_q)) {
5266  		splx(spl);
5267  	}
5268  
5269  	/* drop / unlink all the prepost table objects */
5270  	if (prepost_id) {
5271  		(void)wq_prepost_iterate(prepost_id, NULL,
5272  		    wqset_clear_prepost_chain_cb);
5273  	}
5274  }
5275  
5276  
5277  /* ----------------------------------------------------------------------
5278   *
5279   * Iteration: waitq -> sets / waitq_set -> preposts
5280   *
5281   * ---------------------------------------------------------------------- */
5282  
5283  struct wq_it_ctx {
5284  	void *input;
5285  	void *ctx;
5286  	waitq_iterator_t it;
5287  };
5288  
5289  static int
5290  waitq_iterate_sets_cb(struct waitq *waitq, void *ctx,
5291      struct waitq_link *link)
5292  {
5293  	struct wq_it_ctx *wctx = (struct wq_it_ctx *)(ctx);
5294  	struct waitq_set *wqset;
5295  	int ret;
5296  
5297  	(void)waitq;
5298  	assert(!waitq_irq_safe(waitq));
5299  	assert(wql_type(link) == WQL_WQS);
5300  
5301  	/*
5302  	 * the waitq is locked, so we can just take the set lock
5303  	 * and call the iterator function
5304  	 */
5305  	wqset = link->wql_wqs.wql_set;
5306  	assert(wqset != NULL);
5307  	assert(!waitq_irq_safe(&wqset->wqset_q));
5308  	waitq_set_lock(wqset);
5309  
5310  	ret = wctx->it(wctx->ctx, (struct waitq *)wctx->input, wqset);
5311  
5312  	waitq_set_unlock(wqset);
5313  	return ret;
5314  }
5315  
5316  /**
5317   * call external iterator function for each prepost object in wqset
5318   *
5319   * Conditions:
5320   *	Called from wq_prepost_foreach_locked
5321   *	(wqset locked, waitq _not_ locked)
5322   */
5323  static int
5324  wqset_iterate_prepost_cb(struct waitq_set *wqset, void *ctx,
5325      struct wq_prepost *wqp, struct waitq *waitq)
5326  {
5327  	struct wq_it_ctx *wctx = (struct wq_it_ctx *)(ctx);
5328  	uint64_t wqp_id;
5329  	int ret;
5330  
5331  	(void)wqp;
5332  
5333  	/*
5334  	 * This is a bit tricky. The 'wqset' is locked, but the 'waitq' is not.
5335  	 * Taking the 'waitq' lock is a lock order violation, so we need to be
5336  	 * careful. We must also realize that we may have taken a reference to
5337  	 * the 'wqp' just as the associated waitq was being torn down (or
5338  	 * clearing all its preposts) - see waitq_clear_prepost_locked(). If
5339  	 * the 'wqp' is valid and we can get the waitq lock, then we are good
5340  	 * to go. If not, we need to back off, check that the 'wqp' hasn't
5341  	 * been invalidated, and try to re-take the locks.
5342  	 */
5343  	assert(!waitq_irq_safe(waitq));
5344  
5345  	if (waitq_lock_try(waitq)) {
5346  		goto call_iterator;
5347  	}
5348  
5349  	if (!wqp_is_valid(wqp)) {
5350  		return WQ_ITERATE_RESTART;
5351  	}
5352  
5353  	/* We are passed a prepost object with a reference on it. If neither
5354  	 * the waitq set nor the waitq require interrupts disabled, then we
5355  	 * may block on the delay(1) call below. We can't hold a prepost
5356  	 * object reference while blocking, so we have to give that up as well
5357  	 * and re-acquire it when we come back.
5358  	 */
5359  	wqp_id = wqp->wqp_prepostid.id;
5360  	wq_prepost_put(wqp);
5361  	waitq_set_unlock(wqset);
5362  	wqdbg_v("dropped set:%p lock waiting for wqp:%p (0x%llx -> wq:%p)",
5363  	    wqset, wqp, wqp_id, waitq);
5364  	delay(1);
5365  	waitq_set_lock(wqset);
5366  	wqp = wq_prepost_get(wqp_id);
5367  	if (!wqp) {
5368  		/* someone cleared preposts while we slept! */
5369  		return WQ_ITERATE_DROPPED;
5370  	}
5371  
5372  	/*
5373  	 * TODO:
5374  	 * This differs slightly from the logic in ipc_mqueue.c:
5375  	 * ipc_mqueue_receive_on_thread(). There, if the waitq lock
5376  	 * can't be obtained, the prepost link is placed on the back of
5377  	 * the chain, and the iteration starts from the beginning. Here,
5378  	 * we just restart from the beginning.
5379  	 */
5380  	return WQ_ITERATE_RESTART;
5381  
5382  call_iterator:
5383  	if (!wqp_is_valid(wqp)) {
5384  		ret = WQ_ITERATE_RESTART;
5385  		goto out_unlock;
5386  	}
5387  
5388  	/* call the external callback */
5389  	ret = wctx->it(wctx->ctx, waitq, wqset);
5390  
5391  	if (ret == WQ_ITERATE_BREAK_KEEP_LOCKED) {
5392  		ret = WQ_ITERATE_BREAK;
5393  		goto out;
5394  	}
5395  
5396  out_unlock:
5397  	waitq_unlock(waitq);
5398  out:
5399  	return ret;
5400  }
5401  
5402  /**
5403   * iterator over all sets to which the given waitq has been linked
5404   *
5405   * Conditions:
5406   *      'waitq' is locked
5407   */
5408  int
5409  waitq_iterate_sets(struct waitq *waitq, void *ctx, waitq_iterator_t it)
5410  {
5411  	int ret;
5412  	struct wq_it_ctx wctx = {
5413  		.input = (void *)waitq,
5414  		.ctx = ctx,
5415  		.it = it,
5416  	};
5417  	if (!it || !waitq) {
5418  		return KERN_INVALID_ARGUMENT;
5419  	}
5420  
5421  	ret = walk_waitq_links(LINK_WALK_ONE_LEVEL, waitq, waitq->waitq_set_id,
5422  	    WQL_WQS, (void *)&wctx, waitq_iterate_sets_cb);
5423  	if (ret == WQ_ITERATE_CONTINUE) {
5424  		ret = WQ_ITERATE_SUCCESS;
5425  	}
5426  	return ret;
5427  }
5428  
5429  /**
5430   * iterator over all preposts in the given wqset
5431   *
5432   * Conditions:
5433   *      'wqset' is locked
5434   */
5435  int
5436  waitq_set_iterate_preposts(struct waitq_set *wqset,
5437      void *ctx, waitq_iterator_t it)
5438  {
5439  	struct wq_it_ctx wctx = {
5440  		.input = (void *)wqset,
5441  		.ctx = ctx,
5442  		.it = it,
5443  	};
5444  	if (!it || !wqset) {
5445  		return WQ_ITERATE_INVALID;
5446  	}
5447  
5448  	assert(waitq_held(&wqset->wqset_q));
5449  
5450  	return wq_prepost_foreach_locked(wqset, (void *)&wctx,
5451  	           wqset_iterate_prepost_cb);
5452  }
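/*
 * Illustrative sketch: walking a set's preposted waitqs. The names are
 * hypothetical. Note that wqset_iterate_prepost_cb() above acquires the
 * member waitq's lock before invoking the callback and drops it after,
 * so the callback sees both 'wqset' and 'waitq' locked.
 */
#if 0 /* example only */
static int
example_prepost_cb(void *ctx, struct waitq *waitq, struct waitq_set *wqset)
{
	(void)ctx;
	(void)wqset;
	wqdbg_v("set has prepost from waitq:%p", waitq);
	return WQ_ITERATE_CONTINUE;
}

static void
example_walk_preposts(struct waitq_set *wqset)
{
	waitq_set_lock(wqset);          /* condition: 'wqset' is locked */
	(void)waitq_set_iterate_preposts(wqset, NULL, example_prepost_cb);
	waitq_set_unlock(wqset);
}
#endif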
5453  
5454  
5455  /* ----------------------------------------------------------------------
5456   *
5457   * Higher-level APIs
5458   *
5459   * ---------------------------------------------------------------------- */
5460  
5461  
5462  /**
5463   * declare a thread's intent to wait on 'waitq' for 'wait_event'
5464   *
5465   * Conditions:
5466   *	'waitq' is not locked
5467   */
5468  wait_result_t
5469  waitq_assert_wait64(struct waitq *waitq,
5470      event64_t wait_event,
5471      wait_interrupt_t interruptible,
5472      uint64_t deadline)
5473  {
5474  	thread_t thread = current_thread();
5475  	wait_result_t ret;
5476  	spl_t s;
5477  
5478  	if (!waitq_valid(waitq)) {
5479  		panic("Invalid waitq: %p", waitq);
5480  	}
5481  
5482  	if (waitq_irq_safe(waitq)) {
5483  		s = splsched();
5484  	}
5485  
5486  	waitq_lock(waitq);
5487  	ret = waitq_assert_wait64_locked(waitq, wait_event, interruptible,
5488  	    TIMEOUT_URGENCY_SYS_NORMAL,
5489  	    deadline, TIMEOUT_NO_LEEWAY, thread);
5490  	waitq_unlock(waitq);
5491  
5492  	if (waitq_irq_safe(waitq)) {
5493  		splx(s);
5494  	}
5495  
5496  	return ret;
5497  }
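/*
 * Illustrative sketch: the canonical assert-wait/block pattern built on
 * waitq_assert_wait64(). 'example_obj' is a hypothetical event cookie;
 * a deadline of 0 means wait with no timeout.
 */
#if 0 /* example only */
static wait_result_t
example_wait(struct waitq *wq, void *example_obj)
{
	wait_result_t wr;

	wr = waitq_assert_wait64(wq, CAST_EVENT64_T(example_obj),
	    THREAD_UNINT, 0 /* no deadline */);
	if (wr == THREAD_WAITING) {
		/* actually sleep; a waitq_wakeup64_*() call wakes us */
		wr = thread_block(THREAD_CONTINUE_NULL);
	}
	return wr;                      /* e.g. THREAD_AWAKENED */
}
#endif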
5498  
5499  /**
5500   * declare a thread's intent to wait on 'waitq' for 'wait_event'
5501   *
5502   * Conditions:
5503   *	'waitq' is not locked
5504   *	will disable and re-enable interrupts while locking current_thread()
5505   */
5506  wait_result_t
5507  waitq_assert_wait64_leeway(struct waitq *waitq,
5508      event64_t wait_event,
5509      wait_interrupt_t interruptible,
5510      wait_timeout_urgency_t urgency,
5511      uint64_t deadline,
5512      uint64_t leeway)
5513  {
5514  	wait_result_t ret;
5515  	thread_t thread = current_thread();
5516  	spl_t s;
5517  
5518  	if (!waitq_valid(waitq)) {
5519  		panic("Invalid waitq: %p", waitq);
5520  	}
5521  
5522  	if (waitq_irq_safe(waitq)) {
5523  		s = splsched();
5524  	}
5525  
5526  	waitq_lock(waitq);
5527  	ret = waitq_assert_wait64_locked(waitq, wait_event, interruptible,
5528  	    urgency, deadline, leeway, thread);
5529  	waitq_unlock(waitq);
5530  
5531  	if (waitq_irq_safe(waitq)) {
5532  		splx(s);
5533  	}
5534  
5535  	return ret;
5536  }
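/*
 * Illustrative sketch: a timed wait with timer coalescing. The 10ms
 * deadline and 1ms leeway are arbitrary example values; this assumes the
 * kern/clock.h conversion routines are visible in this translation unit.
 */
#if 0 /* example only */
static wait_result_t
example_timed_wait(struct waitq *wq, void *example_obj)
{
	uint64_t deadline, leeway;
	wait_result_t wr;

	clock_interval_to_deadline(10, NSEC_PER_MSEC, &deadline);
	nanoseconds_to_absolutetime(1 * NSEC_PER_MSEC, &leeway);

	wr = waitq_assert_wait64_leeway(wq, CAST_EVENT64_T(example_obj),
	    THREAD_INTERRUPTIBLE, TIMEOUT_URGENCY_SYS_NORMAL,
	    deadline, leeway);
	if (wr == THREAD_WAITING) {
		wr = thread_block(THREAD_CONTINUE_NULL);
	}
	return wr;              /* THREAD_TIMED_OUT once past deadline+leeway */
}
#endif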
5537  
5538  /**
5539   * wake up a single thread that is waiting on 'waitq' for a given event
5540   *
5541   * Conditions:
5542   *	'waitq' is not locked
5543   *	may (rarely) block if 'waitq' is non-global and a member of 1 or more sets
5544   *	may disable and re-enable interrupts
5545   *
5546   * Notes:
5547   *	will _not_ block if waitq is global (or not a member of any set)
5548   */
5549  kern_return_t
5550  waitq_wakeup64_one(struct waitq *waitq, event64_t wake_event,
5551      wait_result_t result, int priority)
5552  {
5553  	kern_return_t kr;
5554  	uint64_t reserved_preposts = 0;
5555  	spl_t spl;
5556  
5557  	if (!waitq_valid(waitq)) {
5558  		panic("Invalid waitq: %p", waitq);
5559  	}
5560  
5561  	if (!waitq_irq_safe(waitq)) {
5562  		/* reserve preposts in addition to locking the waitq */
5563  		reserved_preposts = waitq_prepost_reserve(waitq, 0, WAITQ_KEEP_LOCKED);
5564  	} else {
5565  		spl = splsched();
5566  		waitq_lock(waitq);
5567  	}
5568  
5569  	/* waitq is locked at this point; WAITQ_UNLOCK has the callee drop it */
5570  	kr = waitq_wakeup64_one_locked(waitq, wake_event, result,
5571  	    &reserved_preposts, priority, WAITQ_UNLOCK, WQ_OPTION_NONE);
5572  
5573  	if (waitq_irq_safe(waitq)) {
5574  		splx(spl);
5575  	}
5576  
5577  	/* release any left-over prepost object (won't block/lock anything) */
5578  	waitq_prepost_release_reserve(reserved_preposts);
5579  
5580  	return kr;
5581  }
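/*
 * Illustrative sketch: waking exactly one waiter. WAITQ_ALL_PRIORITIES
 * disables priority-based selection; KERN_NOT_WAITING means no thread
 * was waiting for the event (a prepost may be left for member sets).
 * 'wq' and 'example_obj' are hypothetical.
 */
#if 0 /* example only */
	kern_return_t kr;

	kr = waitq_wakeup64_one(wq, CAST_EVENT64_T(example_obj),
	    THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
	assert(kr == KERN_SUCCESS || kr == KERN_NOT_WAITING);
#endif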
5582  
5583  /**
5584   * wake up all threads that are waiting on 'waitq' for a given event
5585   *
5586   * Conditions:
5587   *	'waitq' is not locked
5588   *	may (rarely) block if 'waitq' is non-global and a member of 1 or more sets
5589   *	may disable and re-enable interrupts
5590   *
5591   * Notes:
5592   *	will _not_ block if waitq is global (or not a member of any set)
5593   */
5594  kern_return_t
5595  waitq_wakeup64_all(struct waitq *waitq,
5596      event64_t wake_event,
5597      wait_result_t result,
5598      int priority)
5599  {
5600  	kern_return_t ret;
5601  	uint64_t reserved_preposts = 0;
5602  	spl_t s;
5603  
5604  	if (!waitq_valid(waitq)) {
5605  		panic("Invalid waitq: %p", waitq);
5606  	}
5607  
5608  	if (!waitq_irq_safe(waitq)) {
5609  		/* reserve preposts in addition to locking waitq */
5610  		reserved_preposts = waitq_prepost_reserve(waitq, 0,
5611  		    WAITQ_KEEP_LOCKED);
5612  	} else {
5613  		s = splsched();
5614  		waitq_lock(waitq);
5615  	}
5616  
5617  	ret = waitq_wakeup64_all_locked(waitq, wake_event, result,
5618  	    &reserved_preposts, priority,
5619  	    WAITQ_UNLOCK);
5620  
5621  	if (waitq_irq_safe(waitq)) {
5622  		splx(s);
5623  	}
5624  
5625  	waitq_prepost_release_reserve(reserved_preposts);
5626  
5627  	return ret;
5628  }
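/*
 * Illustrative sketch: broadcasting to every thread waiting for the
 * event, with the same hypothetical names as the example above.
 */
#if 0 /* example only */
	(void)waitq_wakeup64_all(wq, CAST_EVENT64_T(example_obj),
	    THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
#endif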
5629  
5630  /**
5631   * wake up a specific thread iff it's waiting on 'waitq' for 'wake_event'
5632   *
5633   * Conditions:
5634   *	'waitq' is not locked
5635   *
5636   * Notes:
5637   *	May temporarily disable and re-enable interrupts
5638   */
5639  kern_return_t
5640  waitq_wakeup64_thread(struct waitq *waitq,
5641      event64_t wake_event,
5642      thread_t thread,
5643      wait_result_t result)
5644  {
5645  	kern_return_t ret;
5646  	spl_t s, th_spl;
5647  
5648  	if (!waitq_valid(waitq)) {
5649  		panic("Invalid waitq: %p", waitq);
5650  	}
5651  
5652  	if (waitq_irq_safe(waitq)) {
5653  		s = splsched();
5654  	}
5655  	waitq_lock(waitq);
5656  
5657  	ret = waitq_select_thread_locked(waitq, wake_event, thread, &th_spl);
5658  	/* on success, returns 'thread' locked */
5659  
5660  	waitq_unlock(waitq);
5661  
5662  	if (ret == KERN_SUCCESS) {
5663  		ret = thread_go(thread, result, WQ_OPTION_NONE);
5664  		assert(ret == KERN_SUCCESS);
5665  		thread_unlock(thread);
5666  		splx(th_spl);
5667  		waitq_stats_count_wakeup(waitq);
5668  	} else {
5669  		ret = KERN_NOT_WAITING;
5670  		waitq_stats_count_fail(waitq);
5671  	}
5672  
5673  	if (waitq_irq_safe(waitq)) {
5674  		splx(s);
5675  	}
5676  
5677  	return ret;
5678  }
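/*
 * Illustrative sketch: a targeted wakeup. Unlike waitq_wakeup64_one(),
 * this succeeds only if 'thread' itself is waiting on 'wq' for the
 * event; any other waiter is left untouched. Names are hypothetical.
 */
#if 0 /* example only */
static boolean_t
example_wake_thread(struct waitq *wq, void *example_obj, thread_t thread)
{
	kern_return_t kr;

	kr = waitq_wakeup64_thread(wq, CAST_EVENT64_T(example_obj),
	    thread, THREAD_AWAKENED);
	return kr == KERN_SUCCESS;      /* KERN_NOT_WAITING otherwise */
}
#endif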
5679  
5680  /**
5681   * wake up a single thread that is waiting on 'waitq' for a given event,
5682   * and return a +1 reference to that thread;
5683   * returns THREAD_NULL if no thread was waiting
5684   *
5685   * Conditions:
5686   *	'waitq' is not locked
5687   *	may (rarely) block if 'waitq' is non-global and a member of 1 or more sets
5688   *	may disable and re-enable interrupts
5689   *
5690   * Notes:
5691   *	will _not_ block if waitq is global (or not a member of any set)
5692   */
5693  thread_t
5694  waitq_wakeup64_identify(struct waitq    *waitq,
5695      event64_t       wake_event,
5696      wait_result_t   result,
5697      int             priority)
5698  {
5699  	uint64_t reserved_preposts = 0;
5700  	spl_t thread_spl = 0;
5701  	thread_t thread;
5702  	spl_t spl;
5703  
5704  	if (!waitq_valid(waitq)) {
5705  		panic("Invalid waitq: %p", waitq);
5706  	}
5707  
5708  	if (!waitq_irq_safe(waitq)) {
5709  		/* reserve preposts in addition to locking waitq */
5710  		reserved_preposts = waitq_prepost_reserve(waitq, 0, WAITQ_KEEP_LOCKED);
5711  	} else {
5712  		spl = splsched();
5713  		waitq_lock(waitq);
5714  	}
5715  
5716  	thread = waitq_wakeup64_identify_locked(waitq, wake_event, result,
5717  	    &thread_spl, &reserved_preposts,
5718  	    priority, WAITQ_UNLOCK);
5719  	/* waitq is unlocked, thread is locked */
5720  
5721  	if (thread != THREAD_NULL) {
5722  		thread_reference(thread);
5723  		thread_unlock(thread);
5724  		splx(thread_spl);
5725  	}
5726  
5727  	if (waitq_irq_safe(waitq)) {
5728  		splx(spl);
5729  	}
5730  
5731  	/* release any left-over prepost object (won't block/lock anything) */
5732  	waitq_prepost_release_reserve(reserved_preposts);
5733  
5734  	/* returns +1 ref to running thread or THREAD_NULL */
5735  	return thread;
5736  }
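/*
 * Illustrative sketch: consuming the +1 reference handed back by
 * waitq_wakeup64_identify(). The caller owns that reference and must
 * drop it with thread_deallocate(). Names are hypothetical.
 */
#if 0 /* example only */
static void
example_wake_and_inspect(struct waitq *wq, void *example_obj)
{
	thread_t thread;

	thread = waitq_wakeup64_identify(wq, CAST_EVENT64_T(example_obj),
	    THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
	if (thread != THREAD_NULL) {
		/* 'thread' is runnable; safe to inspect via the reference */
		thread_deallocate(thread);  /* drop the +1 reference */
	}
}
#endif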