/* osfmk/vm/vm_phantom_cache.c */
  1  /*
  2   * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
  3   *
  4   * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  5   *
  6   * This file contains Original Code and/or Modifications of Original Code
  7   * as defined in and that are subject to the Apple Public Source License
  8   * Version 2.0 (the 'License'). You may not use this file except in
  9   * compliance with the License. The rights granted to you under the License
 10   * may not be used to create, or enable the creation or redistribution of,
 11   * unlawful or unlicensed copies of an Apple operating system, or to
 12   * circumvent, violate, or enable the circumvention or violation of, any
 13   * terms of an Apple operating system software license agreement.
 14   *
 15   * Please obtain a copy of the License at
 16   * http://www.opensource.apple.com/apsl/ and read it before using this file.
 17   *
 18   * The Original Code and all software distributed under the License are
 19   * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 20   * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 21   * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 22   * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 23   * Please see the License for the specific language governing rights and
 24   * limitations under the License.
 25   *
 26   * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 27   */
 28  
 29  #include <vm/vm_page.h>
 30  #include <vm/vm_object.h>
 31  #include <vm/vm_kern.h>
 32  #include <vm/vm_pageout.h>
 33  #include <vm/vm_phantom_cache.h>
 34  #include <vm/vm_compressor.h>
 35  
 36  
/* Length of one sampling/evaluation period. */
uint32_t phantom_cache_eval_period_in_msecs = 250;
/*
 * Per-period re-read counts above which we consider the file cache to be
 * thrashing.  The SSD threshold is higher because re-reads are cheaper there.
 */
uint32_t phantom_cache_thrashing_threshold_ssd = 1000;
#if !XNU_TARGET_OS_OSX
uint32_t phantom_cache_thrashing_threshold = 500;
#else /* !XNU_TARGET_OS_OSX */
uint32_t phantom_cache_thrashing_threshold = 50;
#endif /* !XNU_TARGET_OS_OSX */

/*
 * Number of consecutive thrashing periods required before
 * vm_phantom_cache_check_pressure() returns true.
 */
#if !XNU_TARGET_OS_OSX
unsigned phantom_cache_contiguous_periods = 4;
#else /* !XNU_TARGET_OS_OSX */
unsigned phantom_cache_contiguous_periods = 2;
#endif /* !XNU_TARGET_OS_OSX */

/* Timestamp of the start of the current evaluation period. */
clock_sec_t     pc_start_of_eval_period_sec = 0;
clock_nsec_t    pc_start_of_eval_period_nsec = 0;
/* Set to force the next vm_phantom_cache_check_pressure() to start a new period. */
boolean_t       pc_need_eval_reset = FALSE;

/* One bit per recent sampling period. Bit 0 = current period. */
uint32_t        pc_history = 0;

/* Counters accumulated over the current sampling period (atomically updated). */
uint32_t        sample_period_ghost_added_count = 0;
uint32_t        sample_period_ghost_added_count_ssd = 0;
uint32_t        sample_period_ghost_found_count = 0;
uint32_t        sample_period_ghost_found_count_ssd = 0;

/*
 * Next id handed to an object on its first ghost insertion.  0 means
 * "no phantom entries for this object"; on 32-bit wrap we restart above
 * the low range to reduce collisions with ids still in the cache.
 */
uint32_t        vm_phantom_object_id = 1;
#define         VM_PHANTOM_OBJECT_ID_AFTER_WRAP 1000000

/* The ghost-entry ring itself; entry 0 is reserved as the hash-chain terminator. */
vm_ghost_t      vm_phantom_cache;
/* Next ring slot to use (LRU replacement cursor); starts at 1, never 0. */
uint32_t        vm_phantom_cache_nindx = 1;
/* Ring size (power of two); 0 means the cache is disabled. */
uint32_t        vm_phantom_cache_num_entries = 0;
uint32_t        vm_phantom_cache_size;

/* Hash buckets hold ring indices; chains link through g_next_index. */
typedef uint32_t        vm_phantom_hash_entry_t;
vm_phantom_hash_entry_t *vm_phantom_cache_hash;
uint32_t        vm_phantom_cache_hash_size;
uint32_t        vm_ghost_hash_mask;             /* Mask for hash function */
uint32_t        vm_ghost_bucket_hash;           /* Basic bucket hash */


/* One bit per page within a ghost entry (VM_GHOST_PAGES_PER_ENTRY == 4). */
int pg_masks[4] = {
	0x1, 0x2, 0x4, 0x8
};


/* Bucket index for an (object id, ghost-granular offset) pair. */
#define vm_phantom_hash(obj_id, offset) (\
	        ( (natural_t)((uintptr_t)obj_id * vm_ghost_bucket_hash) + (offset ^ vm_ghost_bucket_hash)) & vm_ghost_hash_mask)


/* Lifetime statistics; unlocked, so counts are approximate under concurrency. */
struct phantom_cache_stats {
	uint32_t        pcs_wrapped;
	uint32_t        pcs_added_page_to_entry;
	uint32_t        pcs_added_new_entry;
	uint32_t        pcs_replaced_entry;

	uint32_t        pcs_lookup_found_page_in_cache;
	uint32_t        pcs_lookup_entry_not_in_cache;
	uint32_t        pcs_lookup_page_not_in_entry;

	uint32_t        pcs_updated_phantom_state;
} phantom_cache_stats;
103  
104  
105  
106  void
107  vm_phantom_cache_init()
108  {
109  	unsigned int    num_entries;
110  	unsigned int    log1;
111  	unsigned int    size;
112  
113  	if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
114  		return;
115  	}
116  #if !XNU_TARGET_OS_OSX
117  	num_entries = (uint32_t)(((max_mem / PAGE_SIZE) / 10) / VM_GHOST_PAGES_PER_ENTRY);
118  #else /* !XNU_TARGET_OS_OSX */
119  	num_entries = (uint32_t)(((max_mem / PAGE_SIZE) / 4) / VM_GHOST_PAGES_PER_ENTRY);
120  #endif /* !XNU_TARGET_OS_OSX */
121  	vm_phantom_cache_num_entries = 1;
122  
123  	while (vm_phantom_cache_num_entries < num_entries) {
124  		vm_phantom_cache_num_entries <<= 1;
125  	}
126  
127  	/*
128  	 * We index this with g_next_index, so don't exceed the width of that bitfield.
129  	 */
130  	if (vm_phantom_cache_num_entries > (1 << VM_GHOST_INDEX_BITS)) {
131  		vm_phantom_cache_num_entries = (1 << VM_GHOST_INDEX_BITS);
132  	}
133  
134  	vm_phantom_cache_size = sizeof(struct vm_ghost) * vm_phantom_cache_num_entries;
135  	vm_phantom_cache_hash_size = sizeof(vm_phantom_hash_entry_t) * vm_phantom_cache_num_entries;
136  
137  	if (kernel_memory_allocate(kernel_map, (vm_offset_t *)(&vm_phantom_cache), vm_phantom_cache_size, 0, KMA_KOBJECT | KMA_PERMANENT, VM_KERN_MEMORY_PHANTOM_CACHE) != KERN_SUCCESS) {
138  		panic("vm_phantom_cache_init: kernel_memory_allocate failed\n");
139  	}
140  	bzero(vm_phantom_cache, vm_phantom_cache_size);
141  
142  	if (kernel_memory_allocate(kernel_map, (vm_offset_t *)(&vm_phantom_cache_hash), vm_phantom_cache_hash_size, 0, KMA_KOBJECT | KMA_PERMANENT, VM_KERN_MEMORY_PHANTOM_CACHE) != KERN_SUCCESS) {
143  		panic("vm_phantom_cache_init: kernel_memory_allocate failed\n");
144  	}
145  	bzero(vm_phantom_cache_hash, vm_phantom_cache_hash_size);
146  
147  
148  	vm_ghost_hash_mask = vm_phantom_cache_num_entries - 1;
149  
150  	/*
151  	 *	Calculate object_id shift value for hashing algorithm:
152  	 *		O = log2(sizeof(struct vm_object))
153  	 *		B = log2(vm_page_bucket_count)
154  	 *	        hash shifts the object_id left by
155  	 *		B/2 - O
156  	 */
157  	size = vm_phantom_cache_num_entries;
158  	for (log1 = 0; size > 1; log1++) {
159  		size /= 2;
160  	}
161  
162  	vm_ghost_bucket_hash = 1 << ((log1 + 1) >> 1);          /* Get (ceiling of sqrt of table size) */
163  	vm_ghost_bucket_hash |= 1 << ((log1 + 1) >> 2);         /* Get (ceiling of quadroot of table size) */
164  	vm_ghost_bucket_hash |= 1;                              /* Set bit and add 1 - always must be 1 to insure unique series */
165  
166  	if (vm_ghost_hash_mask & vm_phantom_cache_num_entries) {
167  		printf("vm_phantom_cache_init: WARNING -- strange page hash\n");
168  	}
169  }
170  
171  
/*
 * vm_phantom_cache_add_ghost:
 *
 * Record that page 'm' is being evicted from the file cache so a later
 * re-read can be recognized as a phantom-cache hit.  If the page's object
 * already has a ghost entry covering this offset, just set the page's bit
 * in it; otherwise claim the next slot in the LRU ring, evicting whatever
 * entry lived there.  Caller must hold vm_page_queue_lock and the object
 * lock exclusively (asserted below).
 */
void
vm_phantom_cache_add_ghost(vm_page_t m)
{
	vm_ghost_t      vpce;
	vm_object_t     object;
	int             ghost_index;
	int             pg_mask;
	boolean_t       isSSD = FALSE;
	vm_phantom_hash_entry_t ghost_hash_index;

	object = VM_PAGE_OBJECT(m);

	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	vm_object_lock_assert_exclusive(object);

	/* cache disabled (compressor inactive) */
	if (vm_phantom_cache_num_entries == 0) {
		return;
	}

	/* which of the 4 pages covered by one ghost entry this page is */
	pg_mask = pg_masks[(m->vmp_offset >> PAGE_SHIFT) & VM_GHOST_PAGE_MASK];

	if (object->phantom_object_id == 0) {
		/*
		 * First ghost for this object: remember whether its backing
		 * store is an SSD (drives which thrashing threshold applies)
		 * and assign it a phantom id.
		 */
		vnode_pager_get_isSSD(object->pager, &isSSD);

		if (isSSD == TRUE) {
			object->phantom_isssd = TRUE;
		}

		object->phantom_object_id = vm_phantom_object_id++;

		/* on 32-bit wrap, restart above the low id range */
		if (vm_phantom_object_id == 0) {
			vm_phantom_object_id = VM_PHANTOM_OBJECT_ID_AFTER_WRAP;
		}
	} else {
		/* pg_mask of 0 matches the entry regardless of pages held */
		if ((vpce = vm_phantom_cache_lookup_ghost(m, 0))) {
			vpce->g_pages_held |= pg_mask;

			phantom_cache_stats.pcs_added_page_to_entry++;
			goto done;
		}
	}
	/*
	 * if we're here then the vm_ghost_t of this vm_page_t
	 * is not present in the phantom cache... take the next
	 * available entry in the LRU first evicting the existing
	 * entry if we've wrapped the ring
	 */
	ghost_index = vm_phantom_cache_nindx++;

	/* slot 0 is the hash-chain terminator, so the cursor wraps to 1 */
	if (vm_phantom_cache_nindx == vm_phantom_cache_num_entries) {
		vm_phantom_cache_nindx = 1;

		phantom_cache_stats.pcs_wrapped++;
	}
	vpce = &vm_phantom_cache[ghost_index];

	if (vpce->g_obj_id) {
		/*
		 * we're going to replace an existing entry
		 * so first remove it from the hash
		 */
		vm_ghost_t      nvpce;

		ghost_hash_index = vm_phantom_hash(vpce->g_obj_id, vpce->g_obj_offset);

		nvpce = &vm_phantom_cache[vm_phantom_cache_hash[ghost_hash_index]];

		if (nvpce == vpce) {
			/* victim is at the head of its chain */
			vm_phantom_cache_hash[ghost_hash_index] = vpce->g_next_index;
		} else {
			/* walk the chain until the predecessor of the victim */
			for (;;) {
				if (nvpce->g_next_index == 0) {
					panic("didn't find ghost in hash\n");
				}

				if (&vm_phantom_cache[nvpce->g_next_index] == vpce) {
					nvpce->g_next_index = vpce->g_next_index;
					break;
				}
				nvpce = &vm_phantom_cache[nvpce->g_next_index];
			}
		}
		phantom_cache_stats.pcs_replaced_entry++;
	} else {
		phantom_cache_stats.pcs_added_new_entry++;
	}

	/* fill in the entry and push it onto the head of its hash chain */
	vpce->g_pages_held = pg_mask;
	vpce->g_obj_offset = (m->vmp_offset >> (PAGE_SHIFT + VM_GHOST_PAGE_SHIFT)) & VM_GHOST_OFFSET_MASK;
	vpce->g_obj_id = object->phantom_object_id;

	ghost_hash_index = vm_phantom_hash(vpce->g_obj_id, vpce->g_obj_offset);
	vpce->g_next_index = vm_phantom_cache_hash[ghost_hash_index];
	vm_phantom_cache_hash[ghost_hash_index] = ghost_index;

done:
	/* reached on both the merge path and the new-entry path */
	vm_pageout_vminfo.vm_phantom_cache_added_ghost++;

	if (object->phantom_isssd) {
		OSAddAtomic(1, &sample_period_ghost_added_count_ssd);
	} else {
		OSAddAtomic(1, &sample_period_ghost_added_count);
	}
}
276  
277  
278  vm_ghost_t
279  vm_phantom_cache_lookup_ghost(vm_page_t m, uint32_t pg_mask)
280  {
281  	uint64_t        g_obj_offset;
282  	uint32_t        g_obj_id;
283  	uint32_t        ghost_index;
284  	vm_object_t     object;
285  
286  	object = VM_PAGE_OBJECT(m);
287  
288  	if ((g_obj_id = object->phantom_object_id) == 0) {
289  		/*
290  		 * no entries in phantom cache for this object
291  		 */
292  		return NULL;
293  	}
294  	g_obj_offset = (m->vmp_offset >> (PAGE_SHIFT + VM_GHOST_PAGE_SHIFT)) & VM_GHOST_OFFSET_MASK;
295  
296  	ghost_index = vm_phantom_cache_hash[vm_phantom_hash(g_obj_id, g_obj_offset)];
297  
298  	while (ghost_index) {
299  		vm_ghost_t      vpce;
300  
301  		vpce = &vm_phantom_cache[ghost_index];
302  
303  		if (vpce->g_obj_id == g_obj_id && vpce->g_obj_offset == g_obj_offset) {
304  			if (pg_mask == 0 || (vpce->g_pages_held & pg_mask)) {
305  				phantom_cache_stats.pcs_lookup_found_page_in_cache++;
306  
307  				return vpce;
308  			}
309  			phantom_cache_stats.pcs_lookup_page_not_in_entry++;
310  
311  			return NULL;
312  		}
313  		ghost_index = vpce->g_next_index;
314  	}
315  	phantom_cache_stats.pcs_lookup_entry_not_in_cache++;
316  
317  	return NULL;
318  }
319  
320  
321  
322  void
323  vm_phantom_cache_update(vm_page_t m)
324  {
325  	int             pg_mask;
326  	vm_ghost_t      vpce;
327  	vm_object_t     object;
328  
329  	object = VM_PAGE_OBJECT(m);
330  
331  	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
332  	vm_object_lock_assert_exclusive(object);
333  
334  	if (vm_phantom_cache_num_entries == 0) {
335  		return;
336  	}
337  
338  	pg_mask = pg_masks[(m->vmp_offset >> PAGE_SHIFT) & VM_GHOST_PAGE_MASK];
339  
340  	if ((vpce = vm_phantom_cache_lookup_ghost(m, pg_mask))) {
341  		vpce->g_pages_held &= ~pg_mask;
342  
343  		phantom_cache_stats.pcs_updated_phantom_state++;
344  		vm_pageout_vminfo.vm_phantom_cache_found_ghost++;
345  
346  		if (object->phantom_isssd) {
347  			OSAddAtomic(1, &sample_period_ghost_found_count_ssd);
348  		} else {
349  			OSAddAtomic(1, &sample_period_ghost_found_count);
350  		}
351  	}
352  }
353  
354  
#define PHANTOM_CACHE_DEBUG     1

#if     PHANTOM_CACHE_DEBUG

/* Ring index of the slot the *next* completed period will be written to. */
int     sample_period_ghost_counts_indx = 0;

/* Ring buffer of per-period counters for the last 256 sampling periods. */
struct {
	uint32_t        added;
	uint32_t        found;
	uint32_t        added_ssd;
	uint32_t        found_ssd;
	uint32_t        elapsed_ms;
	boolean_t       pressure_detected;
} sample_period_ghost_counts[256];

#endif
371  
372  /*
373   * Determine if the file cache is thrashing from sampling interval statistics.
374   *
375   * Pages added to the phantom cache = pages evicted from the file cache.
376   * Pages found in the phantom cache = reads of pages that were recently evicted.
377   * Threshold is the latency-dependent number of reads we consider thrashing.
378   */
379  static boolean_t
380  is_thrashing(uint32_t added, uint32_t found, uint32_t threshold)
381  {
382  	/* Ignore normal activity below the threshold. */
383  	if (added < threshold || found < threshold) {
384  		return FALSE;
385  	}
386  
387  	/*
388  	 * When thrashing in a way that we can mitigate, most of the pages read
389  	 * into the file cache were recently evicted, and 'found' will be close
390  	 * to 'added'.
391  	 *
392  	 * When replacing the current working set because a new app is
393  	 * launched, we see very high read traffic with sporadic phantom cache
394  	 * hits.
395  	 *
396  	 * This is not thrashing, or freeing up memory wouldn't help much
397  	 * anyway.
398  	 */
399  	if (found < added / 2) {
400  		return FALSE;
401  	}
402  
403  	return TRUE;
404  }
405  
406  /*
407   * the following function is never called
408   * from multiple threads simultaneously due
409   * to a condition variable used to serialize
410   * at the compressor level... thus no need
411   * to provide locking for the sample processing
412   */
413  boolean_t
414  vm_phantom_cache_check_pressure()
415  {
416  	clock_sec_t     cur_ts_sec;
417  	clock_nsec_t    cur_ts_nsec;
418  	uint64_t        elapsed_msecs_in_eval;
419  	boolean_t       pressure_detected = FALSE;
420  
421  	clock_get_system_nanotime(&cur_ts_sec, &cur_ts_nsec);
422  
423  	elapsed_msecs_in_eval = vm_compressor_compute_elapsed_msecs(cur_ts_sec, cur_ts_nsec, pc_start_of_eval_period_sec, pc_start_of_eval_period_nsec);
424  
425  	/*
426  	 * Reset evaluation period after phantom_cache_eval_period_in_msecs or
427  	 * whenever vm_phantom_cache_restart_sample has been called.
428  	 */
429  	if (elapsed_msecs_in_eval >= phantom_cache_eval_period_in_msecs) {
430  		pc_need_eval_reset = TRUE;
431  	}
432  
433  	if (pc_need_eval_reset == TRUE) {
434  #if PHANTOM_CACHE_DEBUG
435  		/*
436  		 * maintain some info about the last 256 sample periods
437  		 */
438  		sample_period_ghost_counts[sample_period_ghost_counts_indx].added = sample_period_ghost_added_count;
439  		sample_period_ghost_counts[sample_period_ghost_counts_indx].found = sample_period_ghost_found_count;
440  		sample_period_ghost_counts[sample_period_ghost_counts_indx].added_ssd = sample_period_ghost_added_count_ssd;
441  		sample_period_ghost_counts[sample_period_ghost_counts_indx].found_ssd = sample_period_ghost_found_count_ssd;
442  		sample_period_ghost_counts[sample_period_ghost_counts_indx].elapsed_ms = (uint32_t)elapsed_msecs_in_eval;
443  
444  		sample_period_ghost_counts_indx++;
445  
446  		if (sample_period_ghost_counts_indx >= 256) {
447  			sample_period_ghost_counts_indx = 0;
448  		}
449  #endif
450  		sample_period_ghost_added_count = 0;
451  		sample_period_ghost_found_count = 0;
452  		sample_period_ghost_added_count_ssd = 0;
453  		sample_period_ghost_found_count_ssd = 0;
454  
455  		pc_start_of_eval_period_sec = cur_ts_sec;
456  		pc_start_of_eval_period_nsec = cur_ts_nsec;
457  		pc_history <<= 1;
458  		pc_need_eval_reset = FALSE;
459  	} else {
460  		/*
461  		 * Since the trashing rate is really a function of the read latency of the disk
462  		 * we have to consider both the SSD and spinning disk case since the file cache
463  		 * could be backed by either or even both flavors.  When the object is first
464  		 * assigned a phantom_object_id, we query the pager to determine if the backing
465  		 * backing media is an SSD and remember that answer in the vm_object.  We use
466  		 * that info to maintains counts for both the SSD and spinning disk cases.
467  		 */
468  		if (is_thrashing(sample_period_ghost_added_count,
469  		    sample_period_ghost_found_count,
470  		    phantom_cache_thrashing_threshold) ||
471  		    is_thrashing(sample_period_ghost_added_count_ssd,
472  		    sample_period_ghost_found_count_ssd,
473  		    phantom_cache_thrashing_threshold_ssd)) {
474  			/* Thrashing in the current period: Set bit 0. */
475  			pc_history |= 1;
476  		}
477  	}
478  
479  	/*
480  	 * Declare pressure_detected after phantom_cache_contiguous_periods.
481  	 *
482  	 * Create a bitmask with the N low bits set. These bits must all be set
483  	 * in pc_history. The high bits of pc_history are ignored.
484  	 */
485  	uint32_t bitmask = (1u << phantom_cache_contiguous_periods) - 1;
486  	if ((pc_history & bitmask) == bitmask) {
487  		pressure_detected = TRUE;
488  	}
489  
490  	if (vm_page_external_count > ((AVAILABLE_MEMORY) * 50) / 100) {
491  		pressure_detected = FALSE;
492  	}
493  
494  #if PHANTOM_CACHE_DEBUG
495  	sample_period_ghost_counts[sample_period_ghost_counts_indx].pressure_detected = pressure_detected;
496  #endif
497  	return pressure_detected;
498  }
499  
/*
 * Restart the current sampling because conditions have changed significantly,
 * and we don't want to react to old data.
 *
 * This function can be called from any thread.
 */
void
vm_phantom_cache_restart_sample(void)
{
	/*
	 * Just raise the flag; the next vm_phantom_cache_check_pressure()
	 * call (serialized at the compressor level) performs the reset.
	 */
	pc_need_eval_reset = TRUE;
}