Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
===================================================================
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c	(revision 289078)
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c	(working copy)
@@ -216,6 +216,15 @@
 extern boolean_t zfs_prefetch_disable;
 
 /*
+ * KD 2015-02-10
+ * We have to be able to test for UMA use inside the arc allocator.
+ * NOTE: Defined elsewhere; DO NOT MODIFY HERE!
+ */
+extern int zio_use_uma;
+extern int zfs_dynamic_write_buffer;
+
+
+/*
  * The arc has filled available memory and has now warmed up.
  */
 static boolean_t arc_warm;
@@ -233,7 +242,15 @@
 int zfs_disable_dup_eviction = 0;
 uint64_t zfs_arc_average_blocksize = 8 * 1024;	/* 8KB */
 u_int zfs_arc_free_target = 0;
+u_int zfs_arc_wakeup_pager = 0;
+u_int zfs_arc_wakeup_delay = 500;
 
+#define	WAKE_PAGER
+#ifdef	WAKE_PAGER
+#define	WAKE_PAGER_CONSTANT	10 / 9	/* Pager wakeup threshold */
+static	int arc_init_done = 0;		/* We know arc_warm is valid */
+#endif	/* WAKE_PAGER */
+
 static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS);
 static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS);
 
@@ -242,7 +259,10 @@
 arc_free_target_init(void *unused __unused)
 {
 
-	zfs_arc_free_target = vm_pageout_wakeup_thresh;
+	zfs_arc_free_target = vm_pageout_wakeup_thresh + ((cnt.v_free_target - vm_pageout_wakeup_thresh) / 2);
+#ifdef	WAKE_PAGER
+	zfs_arc_wakeup_pager = zfs_arc_free_target * WAKE_PAGER_CONSTANT;
+#endif	/* WAKE_PAGER */
 }
 SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE,
     SI_ORDER_ANY, arc_free_target_init, NULL);
@@ -264,7 +284,15 @@
 SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW,
     &arc_shrink_shift, 0,
     "log2(fraction of arc to reclaim)");
-
+SYSCTL_INT(_vfs_zfs, OID_AUTO, dynamic_write_buffer, CTLFLAG_RWTUN,
+    &zfs_dynamic_write_buffer, 0,
+    "Dynamically restrict dirty data when memory is low");
+#ifdef	WAKE_PAGER
+SYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_wakeup_pager, CTLFLAG_RWTUN,
+    &zfs_arc_wakeup_pager, 0, "Wake the pagedaemon when free pages fall below this count");
+SYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_wakeup_delay, CTLFLAG_RWTUN,
+    &zfs_arc_wakeup_delay, 0, "Wake the pagedaemon at most once per this many milliseconds");
+#endif	/* WAKE_PAGER */
 /*
  * We don't have a tunable for arc_free_target due to the dependency on
  * pagedaemon initialisation.
@@ -291,6 +319,9 @@
 		return (EINVAL);
 	zfs_arc_free_target = val;
+#ifdef	WAKE_PAGER
+	zfs_arc_wakeup_pager = zfs_arc_free_target * WAKE_PAGER_CONSTANT;
+#endif	/* WAKE_PAGER */
 
 	return (0);
 }
@@ -3367,6 +3398,11 @@
 int64_t arc_swapfs_reserve = 64;
 
 /*
+ * File-local flag used to bypass the low-memory event handler (arc_lowmem).
+ */
+static unsigned int arc_no_wake_event = 0;
+
+/*
  * Return the amount of memory that can be consumed before reclaim will be
  * needed.  Positive if there is sufficient free memory, negative indicates
  * the amount of memory that needs to be freed up.
@@ -3379,6 +3415,10 @@
 	free_memory_reason_t r = FMR_UNKNOWN;
 
 #ifdef _KERNEL
+#ifdef	WAKE_PAGER
+	sbintime_t now;
+	static sbintime_t last_pagedaemon_wake = 0;
+#endif	/* WAKE_PAGER */
 	if (needfree > 0) {
 		n = PAGESIZE * (-needfree);
 		if (n < lowest) {
@@ -3397,6 +3437,26 @@
 		r = FMR_LOTSFREE;
 	}
 
+#ifdef	WAKE_PAGER
+/*
+ * If free memory is below the ARC wakeup threshold and enough time has passed
+ * since we last woke the pager, wake it again.  Do not do this before the ARC warms up.
+ */
+	if ((arc_init_done) &&
+	    (((int64_t)freemem - zfs_arc_wakeup_pager) < 0) &&
+	    (arc_warm == B_TRUE)
+	   ) {
+		now = getsbinuptime();
+		if ((now - last_pagedaemon_wake) / SBT_1MS > zfs_arc_wakeup_delay) {
+			last_pagedaemon_wake = now;
+			arc_no_wake_event++;	/* Set bypass flag for ARC */
+			DTRACE_PROBE(arc__wake_pagedaemon);
+			pagedaemon_wakeup();	/* Wake the pager */
+		}
+	}
+
+#endif	/* WAKE_PAGER */
+
 #ifdef sun
 	/*
 	 * check that we're out of range of the pageout scanner.  It starts to
@@ -3505,6 +3565,7 @@
 	last_free_memory = lowest;
 	last_free_reason = r;
 	DTRACE_PROBE2(arc__available_memory, int64_t, lowest, int, r);
+
 	return (lowest);
 }
 
@@ -3524,6 +3585,43 @@
 extern kmem_cache_t	*zio_data_buf_cache[];
 extern kmem_cache_t	*range_seg_cache;
 
+/*
+ * Used by arc_kmem_reap_now() and consider_reaping_arc_caches()
+ * to limit the time spent reaping.
+ *
+ * The arc_reaping_in_progress flag is a (somewhat racy) leftover from a
+ * previous version of this code that could trigger multiple ARC cache
+ * reapings in parallel, which should be avoided to reduce lock
+ * contention.  It hasn't been removed yet, to encourage further
+ * experimenting.
+ */
+static unsigned int arc_reaping_in_progress = 0;
+static unsigned int arc_pagedaemon_ignore = 0;
+static sbintime_t last_reaping = 0;
+
+static void __noinline
+reap_arc_caches(void)
+{
+	size_t		i;
+	kmem_cache_t	*prev_cache = NULL;
+	kmem_cache_t	*prev_data_cache = NULL;
+
+	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
+		if (zio_buf_cache[i] != prev_cache) {
+			prev_cache = zio_buf_cache[i];
+			kmem_cache_reap_now(zio_buf_cache[i]);
+		}
+		if (zio_data_buf_cache[i] != prev_data_cache) {
+			prev_data_cache = zio_data_buf_cache[i];
+			kmem_cache_reap_now(zio_data_buf_cache[i]);
+		}
+	}
+	kmem_cache_reap_now(buf_cache);
+	kmem_cache_reap_now(hdr_full_cache);
+	kmem_cache_reap_now(hdr_l2only_cache);
+	kmem_cache_reap_now(range_seg_cache);
+}
+
 static __noinline void
 arc_kmem_reap_now(void)
 {
@@ -3532,6 +3630,8 @@
 	kmem_cache_t		*prev_data_cache = NULL;
 
 	DTRACE_PROBE(arc__kmem_reap_start);
+	arc_reaping_in_progress++;
+
 #ifdef _KERNEL
 	if (arc_meta_used >= arc_meta_limit) {
 		/*
@@ -3548,20 +3648,7 @@
 #endif
 #endif
 
-	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
-		if (zio_buf_cache[i] != prev_cache) {
-			prev_cache = zio_buf_cache[i];
-			kmem_cache_reap_now(zio_buf_cache[i]);
-		}
-		if (zio_data_buf_cache[i] != prev_data_cache) {
-			prev_data_cache = zio_data_buf_cache[i];
-			kmem_cache_reap_now(zio_data_buf_cache[i]);
-		}
-	}
-	kmem_cache_reap_now(buf_cache);
-	kmem_cache_reap_now(hdr_full_cache);
-	kmem_cache_reap_now(hdr_l2only_cache);
-	kmem_cache_reap_now(range_seg_cache);
+	reap_arc_caches();
 
 #ifdef sun
 	if (zio_arena != NULL) {
@@ -3572,10 +3659,51 @@
 		vmem_qcache_reap(zio_arena);
 	}
 #endif
+#ifdef _KERNEL
+	last_reaping = getsbinuptime();
+#endif
+	arc_reaping_in_progress = 0;
 	DTRACE_PROBE(arc__kmem_reap_end);
 }
 
+/*
+ * Declared writable to allow resetting it.
+ * XXX: Should probably be a uint64 and integrated with kstat.
+ */
+static unsigned int arc_cache_reapings_skipped = 0;
+SYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_cache_reapings_skipped, CTLFLAG_RW,
+    &arc_cache_reapings_skipped, 0, "Number of times the ARC caches have not been reaped due to the reap delay");
+
+static unsigned int min_arc_reap_delay = 200;
+SYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_reap_delay_min, CTLFLAG_RW,
+    &min_arc_reap_delay, 200, "Minimum delay between ARC cache reapings (milliseconds)");
+
+static void __noinline
+consider_reaping_arc_caches(void)
+{
+#ifdef _KERNEL
+	sbintime_t now;
+
+	if (arc_reaping_in_progress)
+	{
+		/* Already reaping in another thread. */
+		arc_cache_reapings_skipped++;
+		return;
+	}
+
+	now = getsbinuptime();
+	if ((now - last_reaping) / SBT_1MS < min_arc_reap_delay)
+	{
+		/* Too soon to reap again. */
+		arc_cache_reapings_skipped++;
+		return;
+	}
+#endif
+	arc_kmem_reap_now();
+}
+
+/*
  * Threads can block in arc_get_data_buf() waiting for this thread to evict
  * enough data and signal them to proceed.  When this happens, the threads in
  * arc_get_data_buf() are sleeping while holding the hash lock for their
@@ -3617,8 +3745,6 @@
 		 */
 		growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
 
-		arc_kmem_reap_now();
-
 		/*
 		 * If we are still low on memory, shrink the ARC
 		 * so that we have arc_shrink_min free space.
@@ -3692,6 +3818,12 @@
 	while (!arc_user_evicts_thread_exit) {
 		mutex_exit(&arc_user_evicts_lock);
 
+		/*
+		 * Consider reaping the ARC caches at least once per
+		 * second, but more often when signalled under pressure.
+		 */
+		consider_reaping_arc_caches();
+
 		arc_do_user_evicts();
 
 		/*
@@ -5243,7 +5375,10 @@
 static void
 arc_lowmem(void *arg __unused, int howto __unused)
 {
-
+	if (arc_no_wake_event) {	/* Don't do it if we woke the pager */
+		arc_no_wake_event = 0;	/* Just clear the flag */
+		return;
+	}
 	mutex_enter(&arc_reclaim_lock);
 	/* XXX: Memory deficit should be passed as argument. */
 	needfree = btoc(arc_c >> arc_shrink_shift);
@@ -5491,6 +5626,9 @@
 	printf("             in /boot/loader.conf.\n");
 	}
 #endif
+#ifdef	WAKE_PAGER
+	arc_init_done++;	/* For anyone who wants to know */
+#endif	/* WAKE_PAGER */
 }
 
 void
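A note on the unparenthesized WAKE_PAGER_CONSTANT macro used above: because it expands textually, "zfs_arc_free_target * WAKE_PAGER_CONSTANT" becomes "zfs_arc_free_target * 10 / 9", which multiplies before it divides and so yields roughly 111% of the target; written as "(10 / 9)" the integer division would truncate to 1 and the safety margin would vanish. Below is a minimal user-space sketch of the threshold arithmetic, with hypothetical page counts standing in for vm_pageout_wakeup_thresh and cnt.v_free_target; it is illustration only and not part of the patch.

#include <stdio.h>

#define	WAKE_PAGER_CONSTANT	10 / 9	/* same expansion as the patch */

int
main(void)
{
	unsigned int wakeup_thresh = 25000;	/* hypothetical vm_pageout_wakeup_thresh */
	unsigned int free_target = 40000;	/* hypothetical cnt.v_free_target */
	unsigned int arc_free_target, arc_wakeup_pager;

	/* Halfway between the pageout wakeup threshold and the free target. */
	arc_free_target = wakeup_thresh + ((free_target - wakeup_thresh) / 2);

	/* Expands to arc_free_target * 10 / 9, i.e. about 111% of the target. */
	arc_wakeup_pager = arc_free_target * WAKE_PAGER_CONSTANT;

	printf("arc_free_target=%u arc_wakeup_pager=%u\n",
	    arc_free_target, arc_wakeup_pager);
	return (0);
}

The effect is that the pagedaemon is woken, at most once per vfs.zfs.arc_wakeup_delay milliseconds, slightly before free memory falls to the point where the ARC itself would begin reclaiming.
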
Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
===================================================================
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c	(revision 289078)
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c	(working copy)
@@ -42,6 +42,8 @@
 typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
     uint64_t arg1, uint64_t arg2);
 
+extern int zio_use_uma;			/* Needs to be visible; DO NOT MODIFY! */
+int zfs_dynamic_write_buffer = 1;	/* Dynamically tune writes */
 
 dmu_tx_t *
 dmu_tx_create_dd(dsl_dir_t *dd)
@@ -1060,7 +1062,7 @@
 {
 	dsl_pool_t *dp = tx->tx_pool;
 	uint64_t delay_min_bytes =
-	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
+	    zfs_dirty_data_max_internal * zfs_delay_min_dirty_percent / 100;
 	hrtime_t wakeup, min_tx_time, now;
 
 	if (dirty <= delay_min_bytes)
@@ -1072,11 +1074,11 @@
 	 * have to handle the case of it being >= the max, which could
 	 * cause a divide-by-zero if it's == the max.
 	 */
-	ASSERT3U(dirty, <, zfs_dirty_data_max);
+	ASSERT3U(dirty, <, zfs_dirty_data_max_internal);
 
 	now = gethrtime();
 	min_tx_time = zfs_delay_scale *
-	    (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
+	    (dirty - delay_min_bytes) / (zfs_dirty_data_max_internal - dirty);
 	if (now > tx->tx_start + min_tx_time)
 		return;
@@ -1281,6 +1283,7 @@
 dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
 {
 	int err;
+	static uint64_t last_max;
 
 	ASSERT(tx->tx_txg == 0);
 	ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT ||
@@ -1293,6 +1296,42 @@
 	if (txg_how == TXG_WAITED)
 		tx->tx_waited = B_TRUE;
 
+#ifdef _KERNEL
+	/*
+	 * KD 2014-09-22
+	 * If UMA is enabled it can only return a previously-used block
+	 * of identical size to what it had out before.  If it's not the
+	 * same size it will allocate a new one.  This is a problem because
+	 * dirty_data_max is the total dirty write data allowed out at any
+	 * given time, but with UMA on that can multiply by the number of
+	 * different block sizes (!!) requested in terms of free RAM that
+	 * is left allocated but unused.  For this reason never allow
+	 * dirty_data_max to exceed the difference between the paging
+	 * threshold and the current free memory, with a minimum of 256MB.
+	 * This throttles "burst" allocations and prevents the system from
+	 * choking during times of high write I/O demand.
+	 *
+	 * We allow this to be turned off if you want with
+	 * "vfs.zfs.dynamic_write_buffer=0", which can be done in real time.
+	 *
+	 * Note that we work on the zfs_dirty_data_max_internal variable,
+	 * because the user may set zfs_dirty_data_max himself and we
+	 * must honor that as a hard cap so it remains a usable tunable value.
+	 */
+	if (zio_use_uma && zfs_dynamic_write_buffer) {
+		zfs_dirty_data_max_internal = 1 << 28;
+		zfs_dirty_data_max_internal = MAX(zfs_dirty_data_max_internal, ptob(cnt.v_free_count - cnt.v_free_target));
+		zfs_dirty_data_max_internal = MIN(zfs_dirty_data_max_internal, zfs_dirty_data_max);
+		zfs_dirty_data_max_internal = MIN(zfs_dirty_data_max_internal, zfs_dirty_data_max_max);
+		if (last_max != (zfs_dirty_data_max_internal / (1024 * 1024))) {
+			last_max = zfs_dirty_data_max_internal / (1024 * 1024);
+			DTRACE_PROBE1(dmu__tx_dirty, uint64_t, last_max);
+		}
+	} else {
+		zfs_dirty_data_max_internal = zfs_dirty_data_max;
+	}
+#endif /* _KERNEL */
+
 	while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
 		dmu_tx_unassign(tx);
 
@@ -1323,7 +1362,7 @@
 	 * space.
 	 */
 	mutex_enter(&dp->dp_lock);
-	while (dp->dp_dirty_total >= zfs_dirty_data_max)
+	while (dp->dp_dirty_total >= zfs_dirty_data_max_internal)
 		cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
 	uint64_t dirty = dp->dp_dirty_total;
 	mutex_exit(&dp->dp_lock);
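To make the clamp in dmu_tx_assign() above concrete, here is a small user-space sketch of the same computation, using a hypothetical 4 KB page size and made-up page counts in place of ptob() and the cnt structure; it is illustration only and not part of the patch.

#include <stdint.h>
#include <stdio.h>

#define	MAX(a, b)	((a) > (b) ? (a) : (b))
#define	MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	uint64_t page_size = 4096;		/* hypothetical PAGE_SIZE */
	uint64_t v_free_count = 200000;		/* hypothetical free pages */
	uint64_t v_free_target = 120000;	/* hypothetical paging target */
	uint64_t dirty_data_max = 3ULL << 30;	/* tunable set to 3 GB */
	uint64_t dirty_data_max_max = 4ULL << 30;
	uint64_t dirty_max_internal;

	/* Start at the 256 MB floor. */
	dirty_max_internal = 1ULL << 28;
	/* Grow to the free memory above the paging target, if that is larger. */
	dirty_max_internal = MAX(dirty_max_internal,
	    page_size * (v_free_count - v_free_target));
	/* Never exceed the user-visible tunables. */
	dirty_max_internal = MIN(dirty_max_internal, dirty_data_max);
	dirty_max_internal = MIN(dirty_max_internal, dirty_data_max_max);

	printf("effective dirty data limit: %ju MB\n",
	    (uintmax_t)(dirty_max_internal / (1024 * 1024)));
	return (0);
}

With these numbers the limit works out to about 312 MB: free memory above the paging target raises it past the 256 MB floor, while the 3 GB sysctl setting still acts as the hard cap whenever free memory is abundant.
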
Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
===================================================================
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c	(revision 289078)
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c	(working copy)
@@ -98,8 +98,11 @@
 /*
  * zfs_dirty_data_max will be set to zfs_dirty_data_max_percent% of all memory,
  * capped at zfs_dirty_data_max_max.  It can also be overridden in /etc/system.
+ * We also tune it dynamically when memory is low, honoring any value set via
+ * sysctl, so internal comparisons are made against zfs_dirty_data_max_internal.
  */
 uint64_t zfs_dirty_data_max;
+uint64_t zfs_dirty_data_max_internal;
 uint64_t zfs_dirty_data_max_max = 4ULL * 1024 * 1024 * 1024;
 int zfs_dirty_data_max_percent = 10;
 
@@ -557,7 +560,7 @@
 	 * Note: we signal even when increasing dp_dirty_total.
 	 * This ensures forward progress -- each thread wakes the next waiter.
 	 */
-	if (dp->dp_dirty_total <= zfs_dirty_data_max)
+	if (dp->dp_dirty_total <= zfs_dirty_data_max_internal)
 		cv_signal(&dp->dp_spaceavail_cv);
 }
 
@@ -736,7 +739,7 @@
 dsl_pool_need_dirty_delay(dsl_pool_t *dp)
 {
 	uint64_t delay_min_bytes =
-	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
+	    zfs_dirty_data_max_internal * zfs_delay_min_dirty_percent / 100;
 	boolean_t rv;
 
 	mutex_enter(&dp->dp_lock);
Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h
===================================================================
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h	(revision 289078)
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h	(working copy)
@@ -50,6 +50,7 @@
 struct dsl_scan;
 
 extern uint64_t zfs_dirty_data_max;
+extern uint64_t zfs_dirty_data_max_internal;
 extern uint64_t zfs_dirty_data_max_max;
 extern uint64_t zfs_dirty_data_sync;
 extern int zfs_dirty_data_max_percent;
Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
===================================================================
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c	(revision 289078)
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c	(working copy)
@@ -43,10 +43,13 @@
 SYSCTL_DECL(_vfs_zfs);
 SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
 
+/* KD 2015-07-15  Changed storage class to "int" from "static int" because we
+ * reference this as an extern elsewhere.
+ */
 #if defined(__amd64__)
-static int zio_use_uma = 1;
+int zio_use_uma = 1;
 #else
-static int zio_use_uma = 0;
+int zio_use_uma = 0;
 #endif
 TUNABLE_INT("vfs.zfs.zio.use_uma", &zio_use_uma);
 SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0,