--- sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c	(revision 272005)
+++ sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c	(working copy)
@@ -34,6 +34,7 @@ __FBSDID("$FreeBSD$");
 #include
 #include
 #include
+#include
 #include
 #include
@@ -133,13 +134,6 @@ kmem_size(void)
 	return (kmem_size_val);
 }
 
-uint64_t
-kmem_used(void)
-{
-
-	return (vmem_size(kmem_arena, VMEM_ALLOC));
-}
-
 static int
 kmem_std_constructor(void *mem, int size __unused, void *private, int flags)
 {
@@ -228,12 +222,45 @@ kmem_cache_reap_now(kmem_cache_t *cache)
 }
 
 void
+kmem_cache_reap(kmem_cache_t *cache, uint64_t maxfree)
+{
+
+	if (cache->kc_zone != NULL &&
+	    uma_zone_free_size(cache->kc_zone) > maxfree)
+		zone_drain(cache->kc_zone);
+}
+
+void
 kmem_reap(void)
 {
 	uma_reclaim();
 }
+
+uint64_t
+kmem_cache_free_size(kmem_cache_t *cache)
+{
+	uint64_t cachefree;
+
+	cachefree = (cache->kc_zone == NULL) ? 0 :
+	    uma_zone_free_size(cache->kc_zone);
+
+	/*
+	 * Manual probe as the return fbt probe never fires due to
+	 * compiler tail call optimisation.
+	 */
+	DTRACE_PROBE2(kmem_cache_free_size, char *, cache->kc_name, uint64_t,
+	    cachefree);
+
+	return (cachefree);
+}
+
 #else
 void
+kmem_cache_reap(kmem_cache_t *cache, uint64_t maxfree)
+{
+}
+
+void
 kmem_cache_reap_now(kmem_cache_t *cache __unused)
 {
 }
@@ -242,6 +269,12 @@ void
 kmem_reap(void)
 {
 }
+
+uint64_t
+kmem_cache_free_size(kmem_cache_t *cache)
+{
+	return (0);
+}
 #endif
 
 int
--- sys/cddl/compat/opensolaris/sys/kmem.h	(revision 272005)
+++ sys/cddl/compat/opensolaris/sys/kmem.h	(working copy)
@@ -44,7 +44,7 @@ MALLOC_DECLARE(M_SOLARIS);
 #define	POINTER_INVALIDATE(pp)	(*(pp) = (void *)((uintptr_t)(*(pp)) | 0x1))
 
 #define	KM_SLEEP		M_WAITOK
-#define	KM_PUSHPAGE		M_WAITOK
+#define	KM_PUSHPAGE		M_WAITOK|M_USE_RESERVE
 #define	KM_NOSLEEP		M_NOWAIT
 #define	KM_NODEBUG		M_NODUMP
 #define	KM_NORMALPRI		0
@@ -66,7 +66,6 @@ typedef struct kmem_cache {
 void *zfs_kmem_alloc(size_t size, int kmflags);
 void zfs_kmem_free(void *buf, size_t size);
 uint64_t kmem_size(void);
-uint64_t kmem_used(void);
 kmem_cache_t *kmem_cache_create(char *name, size_t bufsize, size_t align,
     int (*constructor)(void *, void *, int), void (*destructor)(void *, void *),
     void (*reclaim)(void *) __unused, void *private, vmem_t *vmp, int cflags);
@@ -74,10 +73,15 @@ void kmem_cache_destroy(kmem_cache_t *cache);
 void *kmem_cache_alloc(kmem_cache_t *cache, int flags);
 void kmem_cache_free(kmem_cache_t *cache, void *buf);
 void kmem_cache_reap_now(kmem_cache_t *cache);
+void kmem_cache_reap(kmem_cache_t *cache, uint64_t maxfree);
 void kmem_reap(void);
+uint64_t kmem_cache_free_size(kmem_cache_t *cache);
 int kmem_debugging(void);
 void *calloc(size_t n, size_t s);
 
+#define	freemem				(cnt.v_free_count + cnt.v_cache_count)
+#define	minfree				cnt.v_free_min
+#define	heap_arena			kmem_arena
 #define	kmem_alloc(size, kmflags)	zfs_kmem_alloc((size), (kmflags))
 #define	kmem_zalloc(size, kmflags)	zfs_kmem_alloc((size), (kmflags) | M_ZERO)
 #define	kmem_free(buf, size)		zfs_kmem_free((buf), (size))
--- sys/cddl/compat/opensolaris/sys/param.h	(revision 272005)
+++ sys/cddl/compat/opensolaris/sys/param.h	(working copy)
@@ -36,6 +36,7 @@
 
 #ifdef _KERNEL
 #define	ptob(x)			((uint64_t)(x) << PAGE_SHIFT)
+#define	btop(x)			((uint64_t)(x) >> PAGE_SHIFT)
 #endif
 
 #endif
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c	(revision 272005)
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c	(working copy)
@@ -138,6 +138,7 @@
 #include
 #include
+#include
 
 #ifdef illumos
 #ifndef _KERNEL
@@ -159,6 +160,15 @@ typedef enum arc_reclaim_strategy {
 	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
 } arc_reclaim_strategy_t;
 
+typedef enum arc_cache_reclaim_strategy {
+	ARC_CACHE_RECLAIM_NOW,		/* Immediate reclaim strategy */
+	ARC_CACHE_RECLAIM_SIZE,		/* Free size reclaim strategy */
+	ARC_CACHE_RECLAIM_FORCE,	/* Forced immediate reclaim strategy */
+} arc_cache_reclaim_strategy_t;
+
+/* When the last cache reclaim was processed. */
+static clock_t cache_reclaim_last = 0;
+
 /*
  * The number of iterations through arc_evict_*() before we
  * drop & reacquire the lock.
@@ -193,9 +203,6 @@ extern int zfs_prefetch_disable;
  */
 static boolean_t arc_warm;
 
-/*
- * These tunables are for performance analysis.
- */
 uint64_t zfs_arc_max;
 uint64_t zfs_arc_min;
 uint64_t zfs_arc_meta_limit = 0;
@@ -204,7 +211,27 @@ int zfs_arc_shrink_shift = 0;
 int zfs_arc_p_min_shift = 0;
 int zfs_disable_dup_eviction = 0;
 uint64_t zfs_arc_average_blocksize = 8 * 1024;	/* 8KB */
+u_int zfs_arc_free_target = 0;
+u_int zfs_arc_cache_target = 0;
+int zfs_arc_cache_period = 10;
+int zfs_arc_cache_partial = 0;
+int zfs_arc_cache_free_period = 300;
+uint64_t zfs_arc_cache_free_max = (1 << 24);	/* 16MB */
 
+#ifdef _KERNEL
+static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS);
+static int sysctl_vfs_zfs_arc_cache_target(SYSCTL_HANDLER_ARGS);
+
+static void
+arc_target_init(void *unused __unused)
+{
+
+	zfs_arc_free_target = vm_pageout_wakeup_thresh;
+	zfs_arc_cache_target = (vm_pageout_wakeup_thresh / 2) * 3;
+}
+SYSINIT(arc_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
+    arc_target_init, NULL);
+
 TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max);
 TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min);
 TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
@@ -217,7 +244,75 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN,
     &zfs_arc_average_blocksize, 0, "ARC average blocksize");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_cache_reclaim_period, CTLFLAG_RWTUN,
+    &zfs_arc_cache_period, 0,
+    "Min number of seconds between ARC cache reclaims");
+SYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_cache_reclaim_partial, CTLFLAG_RWTUN,
+    &zfs_arc_cache_partial, 0,
+    "Enable ARC to perform partial cache reclaims");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_cache_free_max, CTLFLAG_RWTUN,
+    &zfs_arc_cache_free_max, 0,
+    "Maximum free bytes in an ARC cache zone before reclaim will be triggered");
+SYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_cache_free_period, CTLFLAG_RWTUN,
+    &zfs_arc_cache_free_period, 0,
+    "Min number of seconds between ARC free size based cache reclaims");
+/*
+ * We don't have a tunable for these sysctls due to their dependency on
+ * pagedaemon initialisation.
+ */
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
+    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int),
+    sysctl_vfs_zfs_arc_free_target, "IU",
+    "Desired number of free pages below which ARC triggers reclaim");
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_cache_target,
+    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int),
+    sysctl_vfs_zfs_arc_cache_target, "IU",
+    "Desired number of free pages below which ARC triggers cache reclaim");
+
+static int
+sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS)
+{
+	u_int val;
+	int err;
+
+	val = zfs_arc_free_target;
+	err = sysctl_handle_int(oidp, &val, 0, req);
+	if (err != 0 || req->newptr == NULL)
+		return (err);
+
+	if (val < minfree)
+		return (EINVAL);
+	if (val > cnt.v_page_count)
+		return (EINVAL);
+
+	zfs_arc_free_target = val;
+
+	return (0);
+}
+
+static int
+sysctl_vfs_zfs_arc_cache_target(SYSCTL_HANDLER_ARGS)
+{
+	u_int val;
+	int err;
+
+	val = zfs_arc_cache_target;
+	err = sysctl_handle_int(oidp, &val, 0, req);
+	if (err != 0 || req->newptr == NULL)
+		return (err);
+
+	if (val < minfree)
+		return (EINVAL);
+	if (val > cnt.v_page_count)
+		return (EINVAL);
+
+	zfs_arc_cache_target = val;
+
+	return (0);
+}
+#endif
+
 /*
  * Note that buffers can be in one of 6 states:
  * ARC_anon - anonymous			(discussed below)
@@ -592,6 +687,13 @@ static void arc_evict_ghost(arc_state_t *state, ui
 static void arc_buf_watch(arc_buf_t *buf);
 #endif /* illumos */
 
+static uint64_t arc_cache_free(void);
+static boolean_t arc_cache_reclaim_needed(uint64_t size);
+static boolean_t arc_cache_reclaim(uint64_t size,
+    arc_cache_reclaim_strategy_t strat);
+static boolean_t arc_cache_reclaim_strat(kmem_cache_t *cache, uint64_t size,
+    arc_cache_reclaim_strategy_t strat);
+
 static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
 
 #define	GHOST_STATE(state)	\
@@ -2421,6 +2523,7 @@ arc_flush(spa_t *spa)
 void
 arc_shrink(void)
 {
+
 	if (arc_c > arc_c_min) {
 		uint64_t to_free;
 
@@ -2429,6 +2532,9 @@ arc_shrink(void)
 #else
 		to_free = arc_c >> arc_shrink_shift;
 #endif
+		DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t,
+		    arc_c_min, uint64_t, arc_p, uint64_t, to_free);
+
 		if (arc_c > arc_c_min + to_free)
 			atomic_add_64(&arc_c, -to_free);
 		else
@@ -2439,12 +2545,19 @@ arc_shrink(void)
 		arc_c = MAX(arc_size, arc_c_min);
 		if (arc_p > arc_c)
 			arc_p = (arc_c >> 1);
+
+		DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t,
+		    arc_p);
+
 		ASSERT(arc_c >= arc_c_min);
 		ASSERT((int64_t)arc_p >= 0);
 	}
 
-	if (arc_size > arc_c)
+	if (arc_size > arc_c) {
+		DTRACE_PROBE2(arc__shrink_adjust, uint64_t, arc_size,
+		    uint64_t, arc_c);
 		arc_adjust();
+	}
 }
 
 static int needfree = 0;
@@ -2454,16 +2567,26 @@ arc_reclaim_needed(void)
 {
 
 #ifdef _KERNEL
+	if (arc_size <= arc_c_min) {
+		DTRACE_PROBE2(arc__reclaim_min, uint64_t, arc_size,
+		    uint64_t, arc_c_min);
+		return (0);
+	}
 
-	if (needfree)
+	if (needfree) {
+		DTRACE_PROBE(arc__reclaim_needfree);
 		return (1);
+	}
 
 	/*
 	 * Cooperate with pagedaemon when it's time for it to scan
 	 * and reclaim some pages.
 	 */
-	if (vm_paging_needed())
+	if (freemem < zfs_arc_free_target) {
+		DTRACE_PROBE2(arc__reclaim_freemem, uint64_t,
+		    freemem, uint64_t, zfs_arc_free_target);
 		return (1);
+	}
 
 #ifdef sun
 	/*
@@ -2491,8 +2614,19 @@ arc_reclaim_needed(void)
 	if (availrmem < swapfs_minfree + swapfs_reserve + extra)
 		return (1);
 
-#if defined(__i386)
 	/*
+	 * Check that we have enough availrmem that memory locking (e.g., via
+	 * mlock(3C) or memcntl(2)) can still succeed.  (pages_pp_maximum
+	 * stores the number of pages that cannot be locked; when availrmem
+	 * drops below pages_pp_maximum, page locking mechanisms such as
+	 * page_pp_lock() will fail.)
+	 */
+	if (availrmem <= pages_pp_maximum)
+		return (1);
+
+#endif	/* sun */
+#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
+	/*
 	 * If we're on an i386 platform, it's possible that we'll exhaust the
 	 * kernel heap space before we ever run out of available physical
 	 * memory.  Most checks of the size of the heap_area compare against
@@ -2503,26 +2637,43 @@ arc_reclaim_needed(void)
 	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
 	 * free)
 	 */
-	if (btop(vmem_size(heap_arena, VMEM_FREE)) <
-	    (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
+	if (vmem_size(heap_arena, VMEM_FREE) <
+	    (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2)) {
+		DTRACE_PROBE2(arc__reclaim_used, uint64_t,
+		    vmem_size(heap_arena, VMEM_FREE), uint64_t,
+		    (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2);
 		return (1);
+	}
 #endif
-#else	/* !sun */
-	if (kmem_used() > (kmem_size() * 3) / 4)
+#ifdef sun
+	/*
+	 * If zio data pages are being allocated out of a separate heap segment,
+	 * then enforce that the size of available vmem for this arena remains
+	 * above about 1/16th free.
+	 *
+	 * Note: The 1/16th arena free requirement was put in place
+	 * to aggressively evict memory from the arc in order to avoid
+	 * memory fragmentation issues.
+	 */
+	if (zio_arena != NULL &&
+	    vmem_size(zio_arena, VMEM_FREE) <
+	    (vmem_size(zio_arena, VMEM_ALLOC) >> 4))
 		return (1);
 #endif	/* sun */
-
-#else
+#else	/* _KERNEL */
 	if (spa_get_random(100) == 0)
 		return (1);
-#endif
+#endif	/* _KERNEL */
+	DTRACE_PROBE(arc__reclaim_no);
+
 	return (0);
 }
 
 extern kmem_cache_t	*zio_buf_cache[];
 extern kmem_cache_t	*zio_data_buf_cache[];
+extern kmem_cache_t	*range_seg_cache;
 
-static void
+static void __noinline
 arc_kmem_reap_now(arc_reclaim_strategy_t strat)
 {
 	size_t			i;
@@ -2529,6 +2680,7 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat)
 	kmem_cache_t		*prev_cache = NULL;
 	kmem_cache_t		*prev_data_cache = NULL;
 
+	DTRACE_PROBE(arc__kmem_reap_start);
 #ifdef _KERNEL
 	if (arc_meta_used >= arc_meta_limit) {
 		/*
@@ -2537,7 +2689,7 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat)
 		 */
 		dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
 	}
-#if defined(__i386)
+#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
 	/*
 	 * Reclaim unused memory from all kmem caches.
 	 */
@@ -2552,20 +2704,131 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat)
 	if (strat == ARC_RECLAIM_AGGR)
 		arc_shrink();
 
+	(void) arc_cache_reclaim(0, ARC_CACHE_RECLAIM_FORCE);
+
+#ifdef sun
+	/*
+	 * Ask the vmem arena to reclaim unused memory from its
+	 * quantum caches.
+	 */
+	if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR)
+		vmem_qcache_reap(zio_arena);
+#endif
+out:
+	DTRACE_PROBE(arc__kmem_reap_end);
+}
+
+
+static boolean_t
+arc_cache_reclaim_needed(uint64_t size)
+{
+
+	if (zfs_arc_cache_target && freemem < zfs_arc_cache_target + btop(size)) {
+		DTRACE_PROBE1(arc_cache_reclaim_needed, int, B_TRUE);
+		return (B_TRUE);
+	}
+
+	DTRACE_PROBE1(arc_cache_reclaim_needed, int, B_FALSE);
+	return (B_FALSE);
+}
+
+static boolean_t
+arc_cache_reclaim_strat(kmem_cache_t *cache, uint64_t size,
+    arc_cache_reclaim_strategy_t strat)
+{
+
+	switch(strat) {
+	case ARC_CACHE_RECLAIM_NOW:
+	case ARC_CACHE_RECLAIM_FORCE:
+		kmem_cache_reap_now(cache);
+		if (zfs_arc_cache_partial && strat != ARC_CACHE_RECLAIM_FORCE &&
+		    !arc_cache_reclaim_needed(size)) {
+			return (B_TRUE);
+		}
+		break;
+	default:
+		kmem_cache_reap(cache, zfs_arc_cache_free_max);
+	}
+
+	return (B_FALSE);
+}
+
+static boolean_t
+arc_cache_reclaim(uint64_t size, arc_cache_reclaim_strategy_t strat)
+{
+	int i;
+	clock_t now;
+	kmem_cache_t *prev_cache = NULL, *prev_data_cache = NULL;
+
+	if (strat != ARC_CACHE_RECLAIM_FORCE) {
+		now = ddi_get_lbolt();
+		DTRACE_PROBE3(arc_cache_reclaim_test, int, strat, int64_t, now,
+		    int64_t, cache_reclaim_last);
+		if (now - cache_reclaim_last < (zfs_arc_cache_period * hz))
+			return (B_FALSE);
+	}
+
+	DTRACE_PROBE1(arc_cache_reclaim, int, strat);
+
 	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
 		if (zio_buf_cache[i] != prev_cache) {
 			prev_cache = zio_buf_cache[i];
-			kmem_cache_reap_now(zio_buf_cache[i]);
+			if (arc_cache_reclaim_strat(zio_buf_cache[i], size,
+			    strat)) {
+				return (B_TRUE);
+			}
+
 		}
 		if (zio_data_buf_cache[i] != prev_data_cache) {
 			prev_data_cache = zio_data_buf_cache[i];
-			kmem_cache_reap_now(zio_data_buf_cache[i]);
+			if (arc_cache_reclaim_strat(zio_data_buf_cache[i],
+			    size, strat))
+				return (B_TRUE);
 		}
 	}
-	kmem_cache_reap_now(buf_cache);
-	kmem_cache_reap_now(hdr_cache);
+	if (arc_cache_reclaim_strat(range_seg_cache, size, strat))
+		return (B_TRUE);
+
+	if (arc_cache_reclaim_strat(buf_cache, size, strat))
+		return (B_TRUE);
+
+	arc_cache_reclaim_strat(hdr_cache, size, strat);
+
+	cache_reclaim_last = ddi_get_lbolt();
+
+	if (arc_cache_reclaim_needed(size))
+		return (B_FALSE);
+
+	return (B_TRUE);
 }
 
+static uint64_t
+arc_cache_free(void)
+{
+	int i;
+	uint64_t cachefree;
+	kmem_cache_t *prev_cache, *prev_data_cache;
+
+	cachefree = kmem_cache_free_size(buf_cache) +
+	    kmem_cache_free_size(hdr_cache);
+
+	prev_cache = NULL;
+	prev_data_cache = NULL;
+
+	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
+		if (zio_buf_cache[i] != prev_cache) {
+			prev_cache = zio_buf_cache[i];
+			cachefree += kmem_cache_free_size(zio_buf_cache[i]);
+		}
+		if (zio_data_buf_cache[i] != prev_data_cache) {
+			prev_data_cache = zio_data_buf_cache[i];
+			cachefree += kmem_cache_free_size(zio_data_buf_cache[i]);
+		}
+	}
+
+	return (cachefree);
+}
+
 static void
 arc_reclaim_thread(void *dummy __unused)
 {
@@ -2577,10 +2840,28 @@ arc_reclaim_thread(void *dummy __unused)
 
 	mutex_enter(&arc_reclaim_thr_lock);
 	while (arc_thread_exit == 0) {
+		DTRACE_PROBE(arc__reclaim_thread);
+		if (arc_cache_reclaim_needed(0)) {
+			(void) arc_cache_reclaim(0, ARC_CACHE_RECLAIM_NOW);
+		} else {
+			clock_t now;
+
+			now = ddi_get_lbolt();
+			DTRACE_PROBE2(arc__reclaim_check, int64_t, now,
+			    int64_t, cache_reclaim_last);
+			if (now - cache_reclaim_last >
+			    (zfs_arc_cache_free_period * hz)) {
+				(void) arc_cache_reclaim(0,
+				    ARC_CACHE_RECLAIM_SIZE);
+			}
+		}
+
 		if (arc_reclaim_needed()) {
-
+			DTRACE_PROBE1(arc__caches_free, uint64_t,
+			    arc_cache_free());
 			if (arc_no_grow) {
 				if (last_reclaim == ARC_RECLAIM_CONS) {
+					DTRACE_PROBE(arc__reclaim_aggr_no_grow);
 					last_reclaim = ARC_RECLAIM_AGGR;
 				} else {
 					last_reclaim = ARC_RECLAIM_CONS;
@@ -2588,6 +2869,7 @@ arc_reclaim_thread(void *dummy __unused)
 			} else {
 				arc_no_grow = TRUE;
 				last_reclaim = ARC_RECLAIM_AGGR;
+				DTRACE_PROBE(arc__reclaim_aggr);
 				membar_producer();
 			}
 
@@ -2602,6 +2884,7 @@ arc_reclaim_thread(void *dummy __unused)
 			 */
 			arc_no_grow = TRUE;
 			last_reclaim = ARC_RECLAIM_AGGR;
+			DTRACE_PROBE(arc__reclaim_aggr_needfree);
 		}
 		arc_kmem_reap_now(last_reclaim);
 		arc_warm = B_TRUE;
@@ -2618,6 +2901,7 @@ arc_reclaim_thread(void *dummy __unused)
 #ifdef _KERNEL
 		if (needfree) {
 			needfree = 0;
+			DTRACE_PROBE(arc__clear_needfree);
 			wakeup(&needfree);
 		}
 #endif
@@ -2692,6 +2976,7 @@ arc_adapt(int bytes, arc_state_t *state)
 	 * cache size, increment the target cache size
 	 */
 	if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
+		DTRACE_PROBE1(arc__inc_adapt, int, bytes);
 		atomic_add_64(&arc_c, (int64_t)bytes);
 		if (arc_c > arc_c_max)
 			arc_c = arc_c_max;
@@ -2713,20 +2998,6 @@ arc_evict_needed(arc_buf_contents_t type)
 	if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
 		return (1);
 
-#ifdef sun
-#ifdef _KERNEL
-	/*
-	 * If zio data pages are being allocated out of a separate heap segment,
-	 * then enforce that the size of available vmem for this area remains
-	 * above about 1/32nd free.
-	 */
-	if (type == ARC_BUFC_DATA && zio_arena != NULL &&
-	    vmem_size(zio_arena, VMEM_FREE) <
-	    (vmem_size(zio_arena, VMEM_ALLOC) >> 5))
-		return (1);
-#endif
-#endif	/* sun */
-
 	if (arc_reclaim_needed())
 		return (1);
 
@@ -2763,6 +3034,9 @@ arc_get_data_buf(arc_buf_t *buf)
 	uint64_t		size = buf->b_hdr->b_size;
 	arc_buf_contents_t	type = buf->b_hdr->b_type;
 
+	if (arc_cache_reclaim_needed(size))
+		(void) arc_cache_reclaim(size, ARC_CACHE_RECLAIM_NOW);
+
 	arc_adapt(size, state);
 
 	/*
@@ -3885,20 +4159,16 @@ static int
 arc_memory_throttle(uint64_t reserve, uint64_t txg)
 {
 #ifdef _KERNEL
-	uint64_t available_memory =
-	    ptoa((uintmax_t)cnt.v_free_count + cnt.v_cache_count);
+	uint64_t available_memory = ptob(freemem);
 	static uint64_t page_load = 0;
 	static uint64_t last_txg = 0;
 
-#ifdef sun
-#if defined(__i386)
+#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
 	available_memory =
-	    MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
+	    MIN(available_memory, ptob(vmem_size(heap_arena, VMEM_FREE)));
 #endif
-#endif	/* sun */
-
-	if (cnt.v_free_count + cnt.v_cache_count >
-	    (uint64_t)physmem * arc_lotsfree_percent / 100)
+	if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100)
 		return (0);
 
 	if (txg > last_txg) {
@@ -3911,7 +4181,7 @@ arc_memory_throttle(uint64_t reserve, uint64_t txg
 	 * continue to let page writes occur as quickly as possible.
 	 */
 	if (curproc == pageproc) {
-		if (page_load > available_memory / 4)
+		if (page_load > MAX(ptob(minfree), available_memory) / 4)
 			return (SET_ERROR(ERESTART));
 		/* Note: reserve is inflated, so we deflate */
 		page_load += reserve / 8;
@@ -3939,8 +4209,10 @@ arc_tempreserve_space(uint64_t reserve, uint64_t t
 	int error;
 	uint64_t anon_size;
 
-	if (reserve > arc_c/4 && !arc_no_grow)
+	if (reserve > arc_c/4 && !arc_no_grow) {
 		arc_c = MIN(arc_c_max, reserve * 4);
+		DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c);
+	}
 	if (reserve > arc_c)
 		return (SET_ERROR(ENOMEM));
 
@@ -3994,6 +4266,7 @@ arc_lowmem(void *arg __unused, int howto __unused)
 	mutex_enter(&arc_lowmem_lock);
 	mutex_enter(&arc_reclaim_thr_lock);
 	needfree = 1;
+	DTRACE_PROBE(arc__needfree);
 	cv_signal(&arc_reclaim_thr_cv);
 
 	/*
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c	(revision 272005)
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c	(working copy)
@@ -33,7 +33,7 @@
 #include
 #include
 
-static kmem_cache_t *range_seg_cache;
+kmem_cache_t *range_seg_cache;
 
 void
 range_tree_init(void)
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c	(revision 272005)
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c	(working copy)
@@ -312,6 +312,8 @@ vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
 	ASSERT(MUTEX_HELD(&vq->vq_lock));
 	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
 	avl_add(&vq->vq_class[zio->io_priority].vqc_queued_tree, zio);
+	DTRACE_PROBE3(vdev_queue, vdev_queue_t *, vq, zio_t *, zio, uint64_t,
+	    avl_numnodes(&vq->vq_class[zio->io_priority].vqc_queued_tree));
 
 #ifdef illumos
 	mutex_enter(&spa->spa_iokstat_lock);
--- sys/vm/uma.h	(revision 272005)
+++ sys/vm/uma.h	(working copy)
@@ -636,6 +636,17 @@ int uma_zone_exhausted(uma_zone_t zone);
 int uma_zone_exhausted_nolock(uma_zone_t zone);
 
 /*
+ * Used to determine the amount of memory consumed by a zone's free space.
+ *
+ * Arguments:
+ *	zone  The zone to determine the free space of.
+ *
+ * Returns:
+ *	uint64_t  The amount of memory consumed by the zone's free space.
+ */
+uint64_t uma_zone_free_size(uma_zone_t zone);
+
+/*
  * Common UMA_ZONE_PCPU zones.
  */
 extern uma_zone_t pcpu_zone_64;
--- sys/vm/uma_core.c	(revision 272005)
+++ sys/vm/uma_core.c	(working copy)
@@ -3312,6 +3312,33 @@ uma_print_zone(uma_zone_t zone)
 	}
 }
 
+uint64_t
+uma_zone_free_size(uma_zone_t zone)
+{
+	int cpu;
+	uint64_t cachefree;
+	uma_bucket_t bucket;
+	uma_cache_t cache;
+
+	cachefree = 0;
+
+	ZONE_LOCK(zone);
+	LIST_FOREACH(bucket, &zone->uz_buckets, ub_link)
+		cachefree += (uint64_t)bucket->ub_cnt;
+
+	CPU_FOREACH(cpu) {
+		cache = &zone->uz_cpu[cpu];
+		if (cache->uc_allocbucket != NULL)
+			cachefree += (uint64_t)cache->uc_allocbucket->ub_cnt;
+		if (cache->uc_freebucket != NULL)
+			cachefree += (uint64_t)cache->uc_freebucket->ub_cnt;
+	}
+	cachefree *= zone->uz_size;
+	ZONE_UNLOCK(zone);
+
+	return (cachefree);
+}
+
 #ifdef DDB
 /*
  * Generate statistics across both the zone and its per-cpu cache's.  Return
--- sys/vm/vm_pageout.c	(revision 272005)
+++ sys/vm/vm_pageout.c	(working copy)
@@ -76,6 +76,7 @@ __FBSDID("$FreeBSD$");
 #include "opt_vm.h"
+#include "opt_kdtrace.h"
 #include
 #include
 #include
@@ -89,6 +90,7 @@ __FBSDID("$FreeBSD$");
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -115,10 +117,14 @@ __FBSDID("$FreeBSD$");
 
 /* the kernel process "vm_pageout"*/
 static void vm_pageout(void);
+static void vm_pageout_init(void);
 static int vm_pageout_clean(vm_page_t);
 static void vm_pageout_scan(struct vm_domain *vmd, int pass);
 static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int pass);
+SYSINIT(pagedaemon_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, vm_pageout_init,
+    NULL);
+
 struct proc *pageproc;
 
 static struct kproc_desc page_kp = {
@@ -126,9 +132,13 @@ static struct kproc_desc page_kp = {
 	vm_pageout,
 	&pageproc
 };
-SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start,
+SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start,
     &page_kp);
 
+SDT_PROVIDER_DEFINE(vm);
+SDT_PROBE_DEFINE(vm, , , vm__lowmem_cache);
+SDT_PROBE_DEFINE(vm, , , vm__lowmem_scan);
+
 #if !defined(NO_SWAPPING)
 /* the kernel process "vm_daemon"*/
 static void vm_daemon(void);
@@ -663,6 +673,7 @@ vm_pageout_grow_cache(int tries, vm_paddr_t low, v
 	 * may acquire locks and/or sleep, so they can only be invoked
 	 * when "tries" is greater than zero.
 	 */
+	SDT_PROBE0(vm, , , vm__lowmem_cache);
 	EVENTHANDLER_INVOKE(vm_lowmem, 0);
 
 	/*
@@ -921,10 +932,11 @@ vm_pageout_scan(struct vm_domain *vmd, int pass)
 	 * some.  We rate limit to avoid thrashing.
 	 */
 	if (vmd == &vm_dom[0] && pass > 0 &&
-	    lowmem_ticks + (lowmem_period * hz) < ticks) {
+	    (ticks - lowmem_ticks) / hz >= lowmem_period) {
 		/*
 		 * Decrease registered cache sizes.
 		 */
+		SDT_PROBE0(vm, , , vm__lowmem_scan);
 		EVENTHANDLER_INVOKE(vm_lowmem, 0);
 		/*
 		 * We do this explicitly after the caches have been
@@ -1650,15 +1662,11 @@ vm_pageout_worker(void *arg)
 }
 
 /*
- * vm_pageout is the high level pageout daemon.
+ * vm_pageout_init initialises basic pageout daemon settings.
  */
 static void
-vm_pageout(void)
+vm_pageout_init(void)
 {
-#if MAXMEMDOM > 1
-	int error, i;
-#endif
-
 	/*
 	 * Initialize some paging parameters.
 	 */
@@ -1704,7 +1712,18 @@ static void
 	/* XXX does not really belong here */
 	if (vm_page_max_wired == 0)
 		vm_page_max_wired = cnt.v_free_count / 3;
+}
 
+/*
+ * vm_pageout is the high level pageout daemon.
+ */
+static void
+vm_pageout(void)
+{
+#if MAXMEMDOM > 1
+	int error, i;
+#endif
+
 	swap_pager_swap_init();
 #if MAXMEMDOM > 1
 	for (i = 1; i < vm_ndomains; i++) {
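
Note for reviewers/testers (not part of the patch): the new vfs.zfs.arc_free_target and vfs.zfs.arc_cache_target OIDs added above are plain u_int sysctls, so the handlers can be exercised from userland with sysctl(8) or sysctlbyname(3). A minimal sketch under that assumption; the program below is illustrative only and is not included in the change:

#include <sys/types.h>
#include <sys/sysctl.h>

#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	u_int target;
	size_t len = sizeof(target);

	/* Read the pagedaemon-derived default installed by arc_target_init(). */
	if (sysctlbyname("vfs.zfs.arc_free_target", &target, &len,
	    NULL, 0) == -1) {
		perror("sysctlbyname(vfs.zfs.arc_free_target)");
		return (EXIT_FAILURE);
	}
	printf("vfs.zfs.arc_free_target: %u pages\n", target);
	return (EXIT_SUCCESS);
}

The same value can be changed from the shell with "sysctl vfs.zfs.arc_free_target=<pages>"; per sysctl_vfs_zfs_arc_free_target() above, values below minfree or above cnt.v_page_count are rejected with EINVAL.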