FreeBSD Bugzilla – Attachment 164051 Details for
Bug 187594
[zfs] [patch] ZFS ARC behavior problem and fix
Home
|
New
|
Browse
|
Search
|
[?]
|
Reports
|
Help
|
New Account
|
Log In
Remember
[x]
|
Forgot Password
Login:
[x]
[patch]
Update of patch related to other bug report (see comment for details)
patch-10.2-STABLE-r289078 (text/plain), 15.87 KB, created by
karl
on 2015-12-10 15:48:51 UTC
(
hide
)
Description:
Update of patch related to other bug report (see comment for details)
Filename:
MIME Type:
Creator:
karl
Created:
2015-12-10 15:48:51 UTC
Size:
15.87 KB
patch
obsolete
>Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c >=================================================================== >--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c (revision 289078) >+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c (working copy) >@@ -216,6 +216,15 @@ static int arc_dead; > extern boolean_t zfs_prefetch_disable; > > /* >+ * KD 2015-02-10 >+ * We have to be able to test for UIO use inside the arc allocator. >+ * NOTE: DO NOT MODIFY HERE! >+ */ >+extern int zio_use_uma; >+extern int zfs_dynamic_write_buffer; >+ >+ >+/* > * The arc has filled available memory and has now warmed up. > */ > static boolean_t arc_warm; >@@ -233,7 +242,15 @@ int zfs_arc_p_min_shift = 0; > int zfs_disable_dup_eviction = 0; > uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */ > u_int zfs_arc_free_target = 0; >+u_int zfs_arc_wakeup_pager = 0; >+u_int zfs_arc_wakeup_delay = 500; > >+#define WAKE_PAGER >+#ifdef WAKE_PAGER >+#define WAKE_PAGER_CONSTANT 10 / 9 /* Pager wakeup threshold */ >+static int arc_init_done = 0; /* We know arc_warm is valid */ >+#endif /* WAKE_PAGER */ >+ > static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS); > static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS); > >@@ -242,7 +259,10 @@ static void > arc_free_target_init(void *unused __unused) > { > >- zfs_arc_free_target = vm_pageout_wakeup_thresh; >+ zfs_arc_free_target = vm_pageout_wakeup_thresh + ((cnt.v_free_target - vm_pageout_wakeup_thresh) / 2); >+#ifdef WAKE_PAGER >+ zfs_arc_wakeup_pager = zfs_arc_free_target * WAKE_PAGER_CONSTANT; >+#endif /* WAKE_PAGER */ > } > SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY, > arc_free_target_init, NULL); >@@ -264,7 +284,15 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_block > SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW, > &arc_shrink_shift, 0, > "log2(fraction of arc to reclaim)"); >- >+SYSCTL_INT(_vfs_zfs, OID_AUTO, dynamic_write_buffer, CTLFLAG_RWTUN, >+ 
&zfs_dynamic_write_buffer, 0, >+ "Dynamically restrict dirty data when memory is low"); >+#ifdef WAKE_PAGER >+SYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_wakeup_pager, CTLFLAG_RWTUN, >+ &zfs_arc_wakeup_pager, 0, "Wake VM below this number of pages"); >+SYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_wakeup_delay, CTLFLAG_RWTUN, >+ &zfs_arc_wakeup_delay, 0, "May wake up VM once this number of MS"); >+#endif /* WAKE_PAGER */ > /* > * We don't have a tunable for arc_free_target due to the dependency on > * pagedaemon initialisation. >@@ -291,6 +319,9 @@ sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS > return (EINVAL); > > zfs_arc_free_target = val; >+#ifdef WAKE_PAGER >+ zfs_arc_wakeup_pager = zfs_arc_free_target * WAKE_PAGER_CONSTANT; >+#endif /* WAKE_PAGER */ > > return (0); > } >@@ -3367,6 +3398,11 @@ int64_t arc_pages_pp_reserve = 64; > int64_t arc_swapfs_reserve = 64; > > /* >+ * Declare file-local static for event processor bypass >+ */ >+static unsigned int arc_no_wake_event = 0; >+ >+/* > * Return the amount of memory that can be consumed before reclaim will be > * needed. Positive if there is sufficient free memory, negative indicates > * the amount of memory that needs to be freed up. >@@ -3379,6 +3415,10 @@ arc_available_memory(void) > free_memory_reason_t r = FMR_UNKNOWN; > > #ifdef _KERNEL >+#ifdef WAKE_PAGER >+ sbintime_t now; >+ static sbintime_t last_pagedaemon_wake = 0; >+#endif /* WAKE_PAGER */ > if (needfree > 0) { > n = PAGESIZE * (-needfree); > if (n < lowest) { >@@ -3397,6 +3437,26 @@ arc_available_memory(void) > r = FMR_LOTSFREE; > } > >+#ifdef WAKE_PAGER >+/* >+ * If memory is less than the ARC wakeup threshold and time has expired since >+ * the last time we woke the pager... Do not execute until the ARC warms up. 
>+ */ >+ if ((arc_init_done) && >+ (((int64_t) freemem - zfs_arc_wakeup_pager) < 0) && >+ (arc_warm == B_TRUE) >+ ) { >+ now = getsbinuptime(); >+ if ((now - last_pagedaemon_wake) / SBT_1MS > zfs_arc_wakeup_delay) { >+ last_pagedaemon_wake = now; >+ arc_no_wake_event++; /* Set bypass flag for ARC */ >+ DTRACE_PROBE(arc__wake_pagedaemon); >+ pagedaemon_wakeup(); /* Wake the pager */ >+ } >+ } >+ >+#endif /* WAKE_PAGER */ >+ > #ifdef sun > /* > * check that we're out of range of the pageout scanner. It starts to >@@ -3505,6 +3565,7 @@ arc_available_memory(void) > last_free_memory = lowest; > last_free_reason = r; > DTRACE_PROBE2(arc__available_memory, int64_t, lowest, int, r); >+ > return (lowest); > } > >@@ -3524,6 +3585,43 @@ extern kmem_cache_t *zio_buf_cache[]; > extern kmem_cache_t *zio_data_buf_cache[]; > extern kmem_cache_t *range_seg_cache; > >+/* >+ * Used by arc_kmem_reap_now() and consider_reaping_arc_caches() >+ * to limit the time spent reaping. >+ * >+ * The arc_reaping_in_progress is a (somewhat racy) left over from a >+ * previous version of this code which could trigger multiple ARC cache >+ * reapings in parallel which should be avoided to reduce lock >+ * contention. It hasn't been removed yet to encourage further >+ * experimenting. 
>+ */ >+static unsigned int arc_reaping_in_progress = 0; >+static unsigned int arc_pagedaemon_ignore = 0; >+static sbintime_t last_reaping = 0; >+ >+static void __noinline >+reap_arc_caches(void) >+{ >+ size_t i; >+ kmem_cache_t *prev_cache = NULL; >+ kmem_cache_t *prev_data_cache = NULL; >+ >+ for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { >+ if (zio_buf_cache[i] != prev_cache) { >+ prev_cache = zio_buf_cache[i]; >+ kmem_cache_reap_now(zio_buf_cache[i]); >+ } >+ if (zio_data_buf_cache[i] != prev_data_cache) { >+ prev_data_cache = zio_data_buf_cache[i]; >+ kmem_cache_reap_now(zio_data_buf_cache[i]); >+ } >+ } >+ kmem_cache_reap_now(buf_cache); >+ kmem_cache_reap_now(hdr_full_cache); >+ kmem_cache_reap_now(hdr_l2only_cache); >+ kmem_cache_reap_now(range_seg_cache); >+} >+ > static __noinline void > arc_kmem_reap_now(void) > { >@@ -3532,6 +3630,8 @@ arc_kmem_reap_now(void) > kmem_cache_t *prev_data_cache = NULL; > > DTRACE_PROBE(arc__kmem_reap_start); >+ arc_reaping_in_progress++; >+ > #ifdef _KERNEL > if (arc_meta_used >= arc_meta_limit) { > /* >@@ -3548,20 +3648,7 @@ arc_kmem_reap_now(void) > #endif > #endif > >- for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { >- if (zio_buf_cache[i] != prev_cache) { >- prev_cache = zio_buf_cache[i]; >- kmem_cache_reap_now(zio_buf_cache[i]); >- } >- if (zio_data_buf_cache[i] != prev_data_cache) { >- prev_data_cache = zio_data_buf_cache[i]; >- kmem_cache_reap_now(zio_data_buf_cache[i]); >- } >- } >- kmem_cache_reap_now(buf_cache); >- kmem_cache_reap_now(hdr_full_cache); >- kmem_cache_reap_now(hdr_l2only_cache); >- kmem_cache_reap_now(range_seg_cache); >+ reap_arc_caches(); > > #ifdef sun > if (zio_arena != NULL) { >@@ -3572,10 +3659,51 @@ arc_kmem_reap_now(void) > vmem_qcache_reap(zio_arena); > } > #endif >+#ifdef _KERNEL >+ last_reaping = getsbinuptime(); >+#endif >+ arc_reaping_in_progress = 0; > DTRACE_PROBE(arc__kmem_reap_end); > } > >+ > /* >+ * Declared writable to allow resetting it. 
>+ * XXX: Should probably be a uint64 and integrated with kstat. >+ */ >+static unsigned int arc_cache_reapings_skipped = 0; >+SYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_cache_reapings_skipped, CTLFLAG_RW, >+ &arc_cache_reapings_skipped, 0, "Number of times the ARC caches have not been reaped due to the reap delay"); >+ >+static unsigned int min_arc_reap_delay = 200; >+SYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_reap_delay_min, CTLFLAG_RW, >+ &min_arc_reap_delay, 200, "Minimum delay between ARC cache reapings (milliseconds)"); >+ >+static void __noinline >+consider_reaping_arc_caches(void) >+{ >+#ifdef _KERNEL >+ sbintime_t now; >+ >+ if (arc_reaping_in_progress) >+ { >+ /* Already reaping in another thread. */ >+ arc_cache_reapings_skipped++; >+ return; >+ } >+ >+ now = getsbinuptime(); >+ if ((now - last_reaping) / SBT_1MS < min_arc_reap_delay) >+ { >+ /* Too soon to reap again. */ >+ arc_cache_reapings_skipped++; >+ return; >+ } >+#endif >+ arc_kmem_reap_now(); >+} >+ >+/* > * Threads can block in arc_get_data_buf() waiting for this thread to evict > * enough data and signal them to proceed. When this happens, the threads in > * arc_get_data_buf() are sleeping while holding the hash lock for their >@@ -3617,8 +3745,6 @@ arc_reclaim_thread(void *dummy __unused) > */ > growtime = ddi_get_lbolt() + (arc_grow_retry * hz); > >- arc_kmem_reap_now(); >- > /* > * If we are still low on memory, shrink the ARC > * so that we have arc_shrink_min free space. >@@ -3692,6 +3818,12 @@ arc_user_evicts_thread(void *dummy __unused) > while (!arc_user_evicts_thread_exit) { > mutex_exit(&arc_user_evicts_lock); > >+ /* >+ * Consider reaping the ARC caches at least once per >+ * second, but more often when signalled under pressure. 
>+ */ >+ consider_reaping_arc_caches(); >+ > arc_do_user_evicts(); > > /* >@@ -5243,7 +5375,10 @@ static eventhandler_tag arc_event_lowmem = NULL; > static void > arc_lowmem(void *arg __unused, int howto __unused) > { >- >+ if (arc_no_wake_event) { /* Don't do it if we woke the pager */ >+ arc_no_wake_event = 0; /* Just clear the flag */ >+ return; >+ } > mutex_enter(&arc_reclaim_lock); > /* XXX: Memory deficit should be passed as argument. */ > needfree = btoc(arc_c >> arc_shrink_shift); >@@ -5491,6 +5626,9 @@ arc_init(void) > printf(" in /boot/loader.conf.\n"); > } > #endif >+#ifdef WAKE_PAGER >+ arc_init_done++; /* For anyone who wants to know */ >+#endif /* WAKE_PAGER */ > } > > void >Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c >=================================================================== >--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c (revision 289078) >+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c (working copy) >@@ -42,6 +42,8 @@ > typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn, > uint64_t arg1, uint64_t arg2); > >+extern int zio_use_uma; /* Needs to be visible; DO NOT MODIFY! */ >+int zfs_dynamic_write_buffer = 1; /* Dynamically tune writes */ > > dmu_tx_t * > dmu_tx_create_dd(dsl_dir_t *dd) >@@ -1060,7 +1062,7 @@ dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty) > { > dsl_pool_t *dp = tx->tx_pool; > uint64_t delay_min_bytes = >- zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; >+ zfs_dirty_data_max_internal * zfs_delay_min_dirty_percent / 100; > hrtime_t wakeup, min_tx_time, now; > > if (dirty <= delay_min_bytes) >@@ -1072,11 +1074,16 @@ dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty) > * have to handle the case of it being >= the max, which could > * cause a divide-by-zero if it's == the max. 
> */ >- ASSERT3U(dirty, <, zfs_dirty_data_max); >+ ASSERT3U(dirty, <, zfs_dirty_data_max_internal); > > now = gethrtime(); >- min_tx_time = zfs_delay_scale * >- (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty); >+ if (dirty >= zfs_dirty_data_max_internal) {/* No scaling if overcommitted */ >+ min_tx_time = zfs_delay_scale * >+ (dirty - delay_min_bytes); >+ } else { >+ min_tx_time = zfs_delay_scale * >+ (dirty - delay_min_bytes) / (zfs_dirty_data_max_internal - dirty); >+ } > if (now > tx->tx_start + min_tx_time) > return; > >@@ -1281,6 +1288,7 @@ int > dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how) > { > int err; >+ static uint64_t last_max; > > ASSERT(tx->tx_txg == 0); > ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT || >@@ -1293,6 +1301,42 @@ dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how) > if (txg_how == TXG_WAITED) > tx->tx_waited = B_TRUE; > >+#ifdef _KERNEL >+ /* >+ * KD 2014-09-22 >+ * If UMA is enabled it can only return a previously-used block >+ * of identical size to what it had out before. If it's not the >+ * same size it will allocate a new one. This is a problem because >+ * dirty_data_max is the total dirty write data allowed out at any >+ * given time, but with UMA on that can multiply by the number of >+ * different block sizes (!!) requested in terms of free RAM that >+ * is left allocated but unused. For this reason never allow >+ * dirty_data_max to exceed the difference between the paging >+ * threshold and the current free memory, with a minimum of 256MB. >+ * This throttles "burst" allocations and prevents the system from >+ * choking during times of high write I/O demand. >+ * >+ * We allow this to be turned off if you want with >+ * "vfs.zfs.dynamic_write_buffer=0", which can be done in real time. >+ * >+ * Note that we work on the zfs_dirty_data_max_internal variable, >+ * because the user may set zfs_dirty_data_max himself and we >+ * must honor that as a hard cap so it remains a usable tunable value. 
>+ */ >+ if (zio_use_uma & zfs_dynamic_write_buffer) { >+ zfs_dirty_data_max_internal = 1 << 28; >+ zfs_dirty_data_max_internal = MAX(zfs_dirty_data_max_internal, ptob(cnt.v_free_count - cnt.v_free_target)); >+ zfs_dirty_data_max_internal = MIN(zfs_dirty_data_max_internal, zfs_dirty_data_max); >+ zfs_dirty_data_max_internal = MIN(zfs_dirty_data_max_internal, zfs_dirty_data_max_max); >+ if (last_max != (zfs_dirty_data_max_internal / (1024 * 1024))) { >+ last_max = zfs_dirty_data_max_internal / (1024 * 1024); >+ DTRACE_PROBE1(dmu__tx_dirty, uint64_t, last_max); >+ } >+ } else { >+ zfs_dirty_data_max_internal = zfs_dirty_data_max; >+ } >+#endif /* _KERNEL */ >+ > while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) { > dmu_tx_unassign(tx); > >@@ -1323,7 +1367,7 @@ dmu_tx_wait(dmu_tx_t *tx) > * space. > */ > mutex_enter(&dp->dp_lock); >- while (dp->dp_dirty_total >= zfs_dirty_data_max) >+ while (dp->dp_dirty_total >= zfs_dirty_data_max_internal) > cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock); > uint64_t dirty = dp->dp_dirty_total; > mutex_exit(&dp->dp_lock); >Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c >=================================================================== >--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c (revision 289078) >+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c (working copy) >@@ -98,8 +98,11 @@ > /* > * zfs_dirty_data_max will be set to zfs_dirty_data_max_percent% of all memory, > * capped at zfs_dirty_data_max_max. It can also be overridden in /etc/system. >+ * We also dynamically tune during low memory, honoring the sysctl set, so >+ * internal comparisons are against zfs_dirty_data_max_internal. 
> */ > uint64_t zfs_dirty_data_max; >+uint64_t zfs_dirty_data_max_internal; > uint64_t zfs_dirty_data_max_max = 4ULL * 1024 * 1024 * 1024; > int zfs_dirty_data_max_percent = 10; > >@@ -557,7 +560,7 @@ dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta > * Note: we signal even when increasing dp_dirty_total. > * This ensures forward progress -- each thread wakes the next waiter. > */ >- if (dp->dp_dirty_total <= zfs_dirty_data_max) >+ if (dp->dp_dirty_total <= zfs_dirty_data_max_internal) > cv_signal(&dp->dp_spaceavail_cv); > } > >@@ -736,7 +739,7 @@ boolean_t > dsl_pool_need_dirty_delay(dsl_pool_t *dp) > { > uint64_t delay_min_bytes = >- zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; >+ zfs_dirty_data_max_internal * zfs_delay_min_dirty_percent / 100; > boolean_t rv; > > mutex_enter(&dp->dp_lock); >Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h >=================================================================== >--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h (revision 289078) >+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h (working copy) >@@ -50,6 +50,7 @@ struct dmu_tx; > struct dsl_scan; > > extern uint64_t zfs_dirty_data_max; >+extern uint64_t zfs_dirty_data_max_internal; > extern uint64_t zfs_dirty_data_max_max; > extern uint64_t zfs_dirty_data_sync; > extern int zfs_dirty_data_max_percent; >Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c >=================================================================== >--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c (revision 289078) >+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c (working copy) >@@ -43,10 +43,13 @@ > > SYSCTL_DECL(_vfs_zfs); > SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO"); >+/* KD 2015-07-15 Change class to "int" from "static int" as we reference >+ * this as an extern elsewhere >+ */ > #if defined(__amd64__) >-static int zio_use_uma = 1; >+int zio_use_uma = 1; > #else >-static int 
zio_use_uma = 0; >+int zio_use_uma = 0; > #endif > TUNABLE_INT("vfs.zfs.zio.use_uma", &zio_use_uma); > SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0,
You cannot view the attachment while viewing its details because your browser does not support IFRAMEs.
View the attachment on a separate page
.
View Attachment As Diff
View Attachment As Raw
Actions:
View
|
Diff
Attachments on
bug 187594
:
140882
|
140883
|
140884
|
140885
|
140886
|
140887
|
140888
|
140889
|
140890
|
140891
|
140892
|
146178
|
146203
|
146249
|
146251
|
146287
|
146300
|
146373
|
146423
|
146424
|
146456
|
146816
|
146817
|
146851
|
146852
|
146854
|
146859
|
146861
|
146946
|
146947
|
146948
|
146949
|
147014
|
147068
|
147069
|
147070
|
147265
|
147274
|
147275
|
147276
|
147286
|
147459
|
147607
|
147609
|
147733
|
147738
|
147754
|
147815
|
152852
|
158809
|
159207
|
159688
|
159859
|
159905
|
161691
|
161692
|
161943
| 164051 |
174197
|
174198
|
174231
|
174232
|
174254
|
186818