--- arc.c	(revision 271435)
+++ arc.c	(working copy)
@@ -138,6 +138,7 @@
 #include
 
 #include
+#include
 
 #ifdef illumos
 #ifndef _KERNEL
@@ -193,9 +194,6 @@
  */
 static boolean_t arc_warm;
 
-/*
- * These tunables are for performance analysis.
- */
 uint64_t zfs_arc_max;
 uint64_t zfs_arc_min;
 uint64_t zfs_arc_meta_limit = 0;
@@ -204,7 +202,21 @@
 int zfs_arc_p_min_shift = 0;
 int zfs_disable_dup_eviction = 0;
 uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
+u_int zfs_arc_free_target = (1 << 16); /* default before pagedaemon init only */
+int zfs_arc_reclaim_cache_free = 1;
 
+static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS);
+
+#ifdef _KERNEL
+static void
+arc_free_target_init(void *unused __unused)
+{
+
+	zfs_arc_free_target = cnt.v_free_target;
+}
+SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
+    arc_free_target_init, NULL);
+
 TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max);
 TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min);
 TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
@@ -217,7 +229,40 @@
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN,
     &zfs_arc_average_blocksize, 0,
     "ARC average blocksize");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_reclaim_cache_free, CTLFLAG_RWTUN,
+    &zfs_arc_reclaim_cache_free, 0,
+    "ARC treats cached pages as free");
+/*
+ * We don't have a tunable for arc_free_target due to the dependency on
+ * pagedaemon initialisation.
+ */
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
+    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int),
+    sysctl_vfs_zfs_arc_free_target, "IU",
+    "Desired number of free pages below which ARC triggers reclaim");
+static int
+sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS)
+{
+	u_int val;
+	int err;
+
+	val = zfs_arc_free_target;
+	err = sysctl_handle_int(oidp, &val, 0, req);
+	if (err != 0 || req->newptr == NULL)
+		return (err);
+
+	if (val < minfree)
+		return (EINVAL);
+	if (val > cnt.v_page_count)
+		return (EINVAL);
+
+	zfs_arc_free_target = val;
+
+	return (0);
+}
+#endif
+
 
 /*
  * Note that buffers can be in one of 6 states:
  * ARC_anon - anonymous (discussed below)
@@ -2421,6 +2466,7 @@
 void
 arc_shrink(void)
 {
+
 	if (arc_c > arc_c_min) {
 		uint64_t to_free;
 
@@ -2429,6 +2475,9 @@
 #else
 		to_free = arc_c >> arc_shrink_shift;
 #endif
+		DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t,
+		    arc_c_min, uint64_t, arc_p, uint64_t, to_free);
+
 		if (arc_c > arc_c_min + to_free)
 			atomic_add_64(&arc_c, -to_free);
 		else
@@ -2439,12 +2488,19 @@
 			arc_c = MAX(arc_size, arc_c_min);
 		if (arc_p > arc_c)
 			arc_p = (arc_c >> 1);
+
+		DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t,
+		    arc_p);
+
 		ASSERT(arc_c >= arc_c_min);
 		ASSERT((int64_t)arc_p >= 0);
 	}
 
-	if (arc_size > arc_c)
+	if (arc_size > arc_c) {
+		DTRACE_PROBE2(arc__shrink_adjust, uint64_t, arc_size,
+		    uint64_t, arc_c);
 		arc_adjust();
+	}
 }
 
 static int needfree = 0;
@@ -2452,18 +2508,51 @@
 static int
 arc_reclaim_needed(void)
 {
+	u_int fm;
 
 #ifdef _KERNEL
 
-	if (needfree)
+	/*
+	 * First check to see if dirty_data_max needs adjusting. Do not allow
+	 * the dirty data amount per pool to exceed free, non-swappable RAM.
+	 * Subject this to the original test of the max_max limit and a minimum
+	 * of 16MB.
+	 */
+	zfs_dirty_data_max = ptob(cnt.v_free_count) - ptob(cnt.v_free_min);
+	if (zfs_dirty_data_max <= 1 << 24) {
+		zfs_dirty_data_max = 1 << 24;
+	}
+	zfs_dirty_data_max = MIN(zfs_dirty_data_max, ptob(physmem) *
+	    zfs_dirty_data_max_percent / 100);
+	zfs_dirty_data_max = MIN(zfs_dirty_data_max, zfs_dirty_data_max_max);
+
+
+	if (arc_size <= arc_c_min) {
+		DTRACE_PROBE2(arc__reclaim_min, uint64_t, arc_size,
+		    uint64_t, arc_c_min);
+		return (0);
+	}
+
+	if (needfree) {
+		DTRACE_PROBE(arc__reclaim_needfree);
 		return (1);
+	}
 
 	/*
 	 * Cooperate with pagedaemon when it's time for it to scan
 	 * and reclaim some pages.
 	 */
-	if (vm_paging_needed())
+	if (zfs_arc_reclaim_cache_free == 0)
+		fm = cnt.v_free_count;
+	else
+		fm = freemem;
+
+	if (fm < zfs_arc_free_target) {
+		DTRACE_PROBE3(arc__reclaim_freemem, uint64_t,
+		    fm, uint64_t, zfs_arc_free_target,
+		    int, zfs_arc_reclaim_cache_free);
 		return (1);
+	}
 
 #ifdef sun
 	/*
@@ -2491,8 +2580,19 @@
 	if (availrmem < swapfs_minfree + swapfs_reserve + extra)
 		return (1);
 
-#if defined(__i386)
 	/*
+	 * Check that we have enough availrmem that memory locking (e.g., via
+	 * mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum
+	 * stores the number of pages that cannot be locked; when availrmem
+	 * drops below pages_pp_maximum, page locking mechanisms such as
+	 * page_pp_lock() will fail.)
+	 */
+	if (availrmem <= pages_pp_maximum)
+		return (1);
+
+#endif	/* sun */
+#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
+	/*
 	 * If we're on an i386 platform, it's possible that we'll exhaust the
 	 * kernel heap space before we ever run out of available physical
 	 * memory.  Most checks of the size of the heap_area compare against
@@ -2503,32 +2603,49 @@
 	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
 	 * free)
 	 */
-	if (btop(vmem_size(heap_arena, VMEM_FREE)) <
-	    (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
+	if (vmem_size(heap_arena, VMEM_FREE) <
+	    (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2)) {
+		DTRACE_PROBE2(arc__reclaim_used, uint64_t,
+		    vmem_size(heap_arena, VMEM_FREE), uint64_t,
+		    (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2);
 		return (1);
+	}
 #endif
-#else	/* !sun */
-	if (kmem_used() > (kmem_size() * 3) / 4)
+#ifdef sun
+	/*
+	 * If zio data pages are being allocated out of a separate heap segment,
+	 * then enforce that the size of available vmem for this arena remains
+	 * above about 1/16th free.
+	 *
+	 * Note: The 1/16th arena free requirement was put in place
+	 * to aggressively evict memory from the arc in order to avoid
+	 * memory fragmentation issues.
+	 */
+	if (zio_arena != NULL &&
+	    vmem_size(zio_arena, VMEM_FREE) <
+	    (vmem_size(zio_arena, VMEM_ALLOC) >> 4))
 		return (1);
 #endif	/* sun */
-
-#else
+#else	/* _KERNEL */
 	if (spa_get_random(100) == 0)
 		return (1);
-#endif
+#endif	/* _KERNEL */
+	DTRACE_PROBE(arc__reclaim_no);
+
 	return (0);
 }
 
 extern kmem_cache_t	*zio_buf_cache[];
 extern kmem_cache_t	*zio_data_buf_cache[];
 
-static void
+static void __used
 arc_kmem_reap_now(arc_reclaim_strategy_t strat)
 {
 	size_t			i;
 	kmem_cache_t		*prev_cache = NULL;
 	kmem_cache_t		*prev_data_cache = NULL;
 
+	DTRACE_PROBE(arc__kmem_reap_start);
 #ifdef _KERNEL
 	if (arc_meta_used >= arc_meta_limit) {
 		/*
@@ -2564,6 +2681,16 @@
 	}
 	kmem_cache_reap_now(buf_cache);
 	kmem_cache_reap_now(hdr_cache);
+
+#ifdef sun
+	/*
+	 * Ask the vmem arena to reclaim unused memory from its
+	 * quantum caches.
+	 */
+	if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR)
+		vmem_qcache_reap(zio_arena);
+#endif
+	DTRACE_PROBE(arc__kmem_reap_end);
 }
 
 static void
@@ -2581,6 +2708,7 @@
 
 		if (arc_no_grow) {
 			if (last_reclaim == ARC_RECLAIM_CONS) {
+				DTRACE_PROBE(arc__reclaim_aggr_no_grow);
 				last_reclaim = ARC_RECLAIM_AGGR;
 			} else {
 				last_reclaim = ARC_RECLAIM_CONS;
@@ -2588,6 +2716,7 @@
 		} else {
 			arc_no_grow = TRUE;
 			last_reclaim = ARC_RECLAIM_AGGR;
+			DTRACE_PROBE(arc__reclaim_aggr);
 			membar_producer();
 		}
 
@@ -2602,6 +2731,7 @@
 			 */
 			arc_no_grow = TRUE;
 			last_reclaim = ARC_RECLAIM_AGGR;
+			DTRACE_PROBE(arc__reclaim_aggr_needfree);
 		}
 		arc_kmem_reap_now(last_reclaim);
 		arc_warm = B_TRUE;
@@ -2618,6 +2748,7 @@
 #ifdef _KERNEL
 		if (needfree) {
 			needfree = 0;
+			DTRACE_PROBE(arc__clear_needfree);
 			wakeup(&needfree);
 		}
 #endif
@@ -2692,6 +2823,7 @@
 	 * cache size, increment the target cache size
 	 */
 	if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
+		DTRACE_PROBE1(arc__inc_adapt, int, bytes);
 		atomic_add_64(&arc_c, (int64_t)bytes);
 		if (arc_c > arc_c_max)
 			arc_c = arc_c_max;
@@ -2713,20 +2845,6 @@
 	if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
 		return (1);
 
-#ifdef sun
-#ifdef _KERNEL
-	/*
-	 * If zio data pages are being allocated out of a separate heap segment,
-	 * then enforce that the size of available vmem for this area remains
-	 * above about 1/32nd free.
-	 */
-	if (type == ARC_BUFC_DATA && zio_arena != NULL &&
-	    vmem_size(zio_arena, VMEM_FREE) <
-	    (vmem_size(zio_arena, VMEM_ALLOC) >> 5))
-		return (1);
-#endif
-#endif	/* sun */
-
 	if (arc_reclaim_needed())
 		return (1);
 
@@ -3885,20 +4003,16 @@
 arc_memory_throttle(uint64_t reserve, uint64_t txg)
 {
 #ifdef _KERNEL
-	uint64_t available_memory =
-	    ptoa((uintmax_t)cnt.v_free_count + cnt.v_cache_count);
+	uint64_t available_memory = ptob(freemem);
 	static uint64_t page_load = 0;
 	static uint64_t last_txg = 0;
 
-#ifdef sun
-#if defined(__i386)
+#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
 	available_memory =
-	    MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
+	    MIN(available_memory, ptob(vmem_size(heap_arena, VMEM_FREE)));
 #endif
-#endif	/* sun */
 
-	if (cnt.v_free_count + cnt.v_cache_count >
-	    (uint64_t)physmem * arc_lotsfree_percent / 100)
+	if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100)
 		return (0);
 
 	if (txg > last_txg) {
@@ -3911,7 +4025,7 @@
 	 * continue to let page writes occur as quickly as possible.
 	 */
 	if (curproc == pageproc) {
-		if (page_load > available_memory / 4)
+		if (page_load > MAX(ptob(minfree), available_memory) / 4)
 			return (SET_ERROR(ERESTART));
 		/* Note: reserve is inflated, so we deflate */
 		page_load += reserve / 8;
@@ -3939,8 +4053,10 @@
 	int error;
 	uint64_t anon_size;
 
-	if (reserve > arc_c/4 && !arc_no_grow)
+	if (reserve > arc_c/4 && !arc_no_grow) {
 		arc_c = MIN(arc_c_max, reserve * 4);
+		DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c);
+	}
 	if (reserve > arc_c)
 		return (SET_ERROR(ENOMEM));
 
@@ -3994,6 +4110,7 @@
 	mutex_enter(&arc_lowmem_lock);
 	mutex_enter(&arc_reclaim_thr_lock);
 	needfree = 1;
+	DTRACE_PROBE(arc__needfree);
 	cv_signal(&arc_reclaim_thr_cv);
 
 	/*
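
For reference, the arc_free_target handler added above only accepts values between minfree and cnt.v_page_count, so the knob can be inspected and adjusted at runtime. Below is a minimal userland sketch, not part of the patch, showing one way to read and optionally raise vfs.zfs.arc_free_target with sysctlbyname(3); the argument handling and error reporting are illustrative only.

/*
 * Userland sketch (not part of the patch): query vfs.zfs.arc_free_target
 * and, if a value is given on the command line, try to set it.  The
 * patched kernel's handler rejects out-of-range values with EINVAL.
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <err.h>
#include <stdio.h>
#include <stdlib.h>

int
main(int argc, char **argv)
{
	u_int cur, want;
	size_t len = sizeof(cur);

	if (sysctlbyname("vfs.zfs.arc_free_target", &cur, &len, NULL, 0) != 0)
		err(1, "sysctlbyname(vfs.zfs.arc_free_target)");
	printf("arc_free_target: %u pages\n", cur);

	if (argc > 1) {
		want = (u_int)strtoul(argv[1], NULL, 0);
		/* The kernel handler validates the range for us. */
		if (sysctlbyname("vfs.zfs.arc_free_target", NULL, NULL,
		    &want, sizeof(want)) != 0)
			err(1, "set vfs.zfs.arc_free_target=%u", want);
	}
	return (0);
}

The same adjustment can of course be made from the shell with sysctl(8) once a patched kernel is running.
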
--- dmu_tx.c	(revision 271435)
+++ dmu_tx.c	(working copy)
@@ -1063,6 +1063,13 @@
 	ASSERT3U(dirty, <, zfs_dirty_data_max);
 
 	now = gethrtime();
+	/*
+	 * Because of dynamic dirty_data_max sizing in arc.c, it is possible
+	 * for it to be exactly equal to dirty.  Prevent a divide-by-zero
+	 * panic in the unlikely event that happens to be the case.
+	 */
+	if ((zfs_dirty_data_max - dirty) == 0)
+		zfs_dirty_data_max++;
 	min_tx_time = zfs_delay_scale *
 	    (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
 	if (now > tx->tx_start + min_tx_time)
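
To illustrate why that guard matters, here is a small standalone sketch, not part of the patch and using simplified stand-in names, that mirrors the delay computation in dmu_tx_delay(): when the dynamically resized dirty maximum momentarily equals the dirty byte count, the divisor would be zero without the one-byte bump.

/*
 * Illustrative sketch only: mimics the delay formula above with
 * simplified local names; it is not the kernel code.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t
delay_nanos(uint64_t dirty, uint64_t dirty_max, uint64_t delay_min_bytes,
    uint64_t delay_scale)
{
	/*
	 * dirty_max can momentarily equal dirty when it is resized
	 * dynamically; bump the local copy by one byte so the divisor
	 * below never becomes zero.
	 */
	if ((dirty_max - dirty) == 0)
		dirty_max++;

	return (delay_scale * (dirty - delay_min_bytes) /
	    (dirty_max - dirty));
}

int
main(void)
{
	/* dirty == dirty_max: without the guard this would divide by zero. */
	printf("%ju\n", (uintmax_t)delay_nanos(1ULL << 24, 1ULL << 24,
	    1ULL << 20, 500000));
	return (0);
}
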