FreeBSD Bugzilla – Attachment 186818 Details for Bug 187594: [zfs] [patch] ZFS ARC behavior problem and fix
Home
|
New
|
Browse
|
Search
|
[?]
|
Reports
|
Help
|
New Account
|
Log In
Remember
[x]
|
Forgot Password
Login:
[x]
[patch]
Patch against r324056 (11.1-STABLE) w/Phabricator D7538 improvements
patch-D7538-11-1-r324056 (text/plain), 11.06 KB, created by karl on 2017-09-30 17:33:02 UTC
(
hide
)
Description:
Patch against r324056 (11.1-STABLE) w/Phabricator D7538 improvements
Filename: patch-D7538-11-1-r324056
MIME Type: text/plain
Creator: karl
Created: 2017-09-30 17:33:02 UTC
Size: 11.06 KB
patch
obsolete
>Index: cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c >=================================================================== >--- cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c (revision 324056) >+++ cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c (working copy) >@@ -362,7 +362,16 @@ int zfs_arc_no_grow_shift = 0; > int zfs_arc_p_min_shift = 0; > uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */ > u_int zfs_arc_free_target = 0; >+u_int zfs_arc_wakeup_pager = 0; >+u_int zfs_arc_wakeup_delay = 500; /* 500ms between pager wakeups min */ > >+#define WAKE_PAGER >+#ifdef WAKE_PAGER >+#define FREE_TARGET_CONSTANT 10 / 8 /* Target above pageout_wakeup_thresh */ >+static int arc_init_done = 0; /* After arc_warm is valid */ >+extern void pagedaemon_wakeup(void); >+#endif >+ > /* Absolute min for arc min / max is 16MB. */ > static uint64_t arc_abs_min = 16 << 20; > >@@ -379,7 +388,9 @@ static void > arc_free_target_init(void *unused __unused) > { > >- zfs_arc_free_target = vm_pageout_wakeup_thresh; >+ zfs_arc_free_target = vm_pageout_wakeup_thresh + (vm_pageout_wakeup_thresh / 5); >+ zfs_arc_wakeup_pager = vm_pageout_wakeup_thresh + ((vm_cnt.v_free_target - vm_pageout_wakeup_thresh) / 2); >+ > } > SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY, > arc_free_target_init, NULL); >@@ -4225,13 +4236,21 @@ int64_t arc_pages_pp_reserve = 64; > */ > int64_t arc_swapfs_reserve = 64; > >+typedef enum free_memory_measure_t { >+ FMM_EXCLUDE_ZONE_CACHE, >+ FMM_INCLUDE_ZONE_CACHE >+} free_memory_measure_t; >+ > /* > * Return the amount of memory that can be consumed before reclaim will be > * needed. Positive if there is sufficient free memory, negative indicates > * the amount of memory that needs to be freed up. 
> */ >+ >+static int64_t arc_check_uma_cache(int64_t lowest); >+ > static int64_t >-arc_available_memory(void) >+arc_available_memory(free_memory_measure_t zone_measure) > { > int64_t lowest = INT64_MAX; > int64_t n; >@@ -4238,7 +4257,16 @@ static int64_t > free_memory_reason_t r = FMR_UNKNOWN; > > #ifdef _KERNEL >+#ifdef WAKE_PAGER >+ sbintime_t now; >+ static sbintime_t last_pagedaemon_wake = 0; >+#endif /* WAKE_PAGER */ >+ > if (needfree > 0) { >+ n = (int64_t)vm_cnt.v_free_target - (int64_t)vm_cnt.v_free_count; >+ needfree = n > 0 ? n : 0; >+ } >+ if (needfree > 0) { > n = PAGESIZE * (-needfree); > if (n < lowest) { > lowest = n; >@@ -4246,11 +4274,41 @@ static int64_t > } > } > >+#ifdef WAKE_PAGER >+/* >+ * When arc is initialized then check to see if we're in a VM "warming" zone, >+ * and if so then wake the pager -- the intent being to demote inactive pages. >+ */ >+ if (arc_init_done) { >+ now = getsbinuptime(); >+ if ((now - last_pagedaemon_wake) / SBT_1MS > zfs_arc_wakeup_delay) { >+ last_pagedaemon_wake = now; >+#ifdef REAP_ARC >+ arc_no_wake_event++; /* Set bypass flag for ARC */ >+#endif >+ if ( ( ((int64_t) freemem - zfs_arc_wakeup_pager) < 0) && (arc_warm == B_TRUE) ) { >+#ifdef REAP_ARC >+ arc_kmem_reap_now(0); /* Reap caches if we're close */ >+#endif >+ DTRACE_PROBE(arc__wake_pagedaemon); >+ (void) pagedaemon_wakeup(); /* Wake the pager */ >+#ifdef REAP_ARC >+ } else { >+ if ( ((int64_t) freemem - vm_cnt.v_free_target) < 0) { >+ arc_kmem_reap_now(1); /* Reap one cache if lots of memory */ >+ DTRACE_PROBE2(arc__reap_one, int, zfs_arc_last_slab, int, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); >+ } >+#endif >+ } >+ } >+ } >+ >+#endif /* WAKE_PAGER */ > /* > * Cooperate with pagedaemon when it's time for it to scan > * and reclaim some pages. 
> */ >- n = PAGESIZE * ((int64_t)freemem - zfs_arc_free_target); >+ n = PAGESIZE * ((int64_t)freemem - (int64_t)zfs_arc_free_target - (int64_t)vm_cnt.v_free_reserved); > if (n < lowest) { > lowest = n; > r = FMR_LOTSFREE; >@@ -4355,6 +4413,16 @@ static int64_t > } > } > >+ /* Some memory can be held in zone cache elements; in that case >+ * the ARC is not under memory pressure and can grow. >+ * zone_measure == FMM_INCLUDE_ZONE_CACHE requests this check. >+ */ >+ if (lowest < 0 && zone_measure == FMM_INCLUDE_ZONE_CACHE) { >+ lowest = arc_check_uma_cache(lowest); >+ if (lowest >= 0) >+ r = FMR_UNKNOWN; >+ } >+ > #else /* _KERNEL */ > /* Every 100 calls, free a small amount */ > if (spa_get_random(100) == 0) >@@ -4376,7 +4444,7 @@ static int64_t > static boolean_t > arc_reclaim_needed(void) > { >- return (arc_available_memory() < 0); >+ return (arc_available_memory(FMM_INCLUDE_ZONE_CACHE) < 0); > } > > extern kmem_cache_t *zio_buf_cache[]; >@@ -4436,6 +4504,66 @@ arc_kmem_reap_now(void) > DTRACE_PROBE(arc__kmem_reap_end); > } > >+int sysctl_drain_cache = 1; >+SYSCTL_INT(_vfs_zfs, OID_AUTO, drain_uma_cache, CTLFLAG_RW, &sysctl_drain_cache, 0, "drain per-CPU UMA cache"); >+ >+ >+#ifdef _KERNEL >+static int64_t >+arc_check_uma_cache(int64_t lowest) >+{ >+ int iter = 4; >+ int step = 1 << (SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT - 3); >+ int n = (SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT) - 1; >+ >+ while (n >= 0) { >+ lowest += uma_zone_get_free_size(zio_data_buf_cache[n]->kc_zone); >+ if (lowest >= 0) >+ return lowest; >+ n -= step; >+ if(--iter == 0) { >+ if (step > 1) step >>= 1; >+ iter = 4; >+ } >+ } >+ return lowest; >+} >+#endif >+ >+static void >+arc_drain_uma_cache(uint64_t target) >+{ >+ int iter = 4; >+ int step = 1 << (SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT - 3); >+ int n = (SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT) - 1; >+ size_t free_size; >+ >+ DTRACE_PROBE2(arc__drain_uma_cache_start, uint64_t, target, uint64_t, (uint64_t)vm_cnt.v_free_count * PAGESIZE); >+#ifdef _KERNEL >+ 
free_size = (uint64_t)vm_cnt.v_free_count * PAGESIZE; >+ if (target <= free_size) >+ return; >+ while (n >= 0) { >+ free_size = uma_zone_get_free_size(zio_data_buf_cache[n]->kc_zone); >+ if (free_size) { >+ if (sysctl_drain_cache) >+ uma_reclaim_zone_cache(zio_data_buf_cache[n]->kc_zone); >+ kmem_cache_reap_now(zio_data_buf_cache[n]); >+ DTRACE_PROBE3(arc__drain_uma_cache_zone, char *, zio_data_buf_cache[n]->kc_name, size_t, free_size, uint64_t, (uint64_t)vm_cnt.v_free_count * PAGESIZE); >+ free_size = (uint64_t)vm_cnt.v_free_count * PAGESIZE; >+ if (target <= free_size) >+ break; >+ } >+ n -= step; >+ if(--iter == 0) { >+ if (step > 1) step >>= 1; >+ iter = 4; >+ } >+ } >+#endif >+ DTRACE_PROBE(arc__drain_uma_cache_end); >+} >+ > /* > * Threads can block in arc_get_data_impl() waiting for this thread to evict > * enough data and signal them to proceed. When this happens, the threads in >@@ -4487,7 +4615,8 @@ arc_reclaim_thread(void *dummy __unused) > */ > evicted = arc_adjust(); > >- int64_t free_memory = arc_available_memory(); >+ int64_t free_memory = arc_available_memory(FMM_EXCLUDE_ZONE_CACHE); >+ DTRACE_PROBE2(arc__reclaim_adj, uint64_t, evicted, int64_t, free_memory); > if (free_memory < 0) { > > arc_no_grow = B_TRUE; >@@ -4499,21 +4628,35 @@ arc_reclaim_thread(void *dummy __unused) > */ > growtime = gethrtime() + SEC2NSEC(arc_grow_retry); > >+#ifdef _KERNEL >+ if (arc_check_uma_cache(free_memory) >= 0) >+ arc_drain_uma_cache((uint64_t)freemem * PAGESIZE - free_memory); >+#else > arc_kmem_reap_now(); >- >+#endif >+ > /* > * If we are still low on memory, shrink the ARC > * so that we have arc_shrink_min free space. 
> */ >- free_memory = arc_available_memory(); >+ free_memory = arc_available_memory(FMM_EXCLUDE_ZONE_CACHE); > > int64_t to_free = > (arc_c >> arc_shrink_shift) - free_memory; >+ DTRACE_PROBE3(arc__reclaim_tst, int64_t, to_free, int64_t, free_memory, long, needfree); > if (to_free > 0) { > #ifdef _KERNEL > to_free = MAX(to_free, ptob(needfree)); >+ uint64_t free_target = >+ (uint64_t)freemem * PAGESIZE - free_memory; > #endif > arc_shrink(to_free); >+#ifdef _KERNEL >+ arc_drain_uma_cache(free_target); >+#else >+ arc_kmem_reap_now(); >+#endif >+ DTRACE_PROBE(arc__reclaim_shr); > } > } else if (free_memory < arc_c >> arc_no_grow_shift) { > arc_no_grow = B_TRUE; >@@ -6308,20 +6451,14 @@ static eventhandler_tag arc_event_lowmem = NULL; > static void > arc_lowmem(void *arg __unused, int howto __unused) > { >+ int64_t n; > > mutex_enter(&arc_reclaim_lock); > /* XXX: Memory deficit should be passed as argument. */ >- needfree = btoc(arc_c >> arc_shrink_shift); >+ n = (int64_t)vm_cnt.v_free_target - (int64_t)vm_cnt.v_free_count; >+ needfree = (n>0) ? n : vm_cnt.v_free_target >> 8; > DTRACE_PROBE(arc__needfree); > cv_signal(&arc_reclaim_thread_cv); >- >- /* >- * It is unsafe to block here in arbitrary threads, because we can come >- * here from ARC itself and may hold ARC locks and thus risk a deadlock >- * with ARC reclaim thread. 
>- */ >- if (curproc == pageproc) >- (void) cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock); > mutex_exit(&arc_reclaim_lock); > } > #endif >@@ -6632,6 +6769,9 @@ arc_init(void) > printf(" in /boot/loader.conf.\n"); > } > #endif >+#ifdef WAKE_PAGER >+ arc_init_done++; >+#endif > } > > void >Index: vm/uma.h >=================================================================== >--- vm/uma.h (revision 324056) >+++ vm/uma.h (working copy) >@@ -448,6 +448,16 @@ void uma_startup2(void); > void uma_reclaim(void); > > /* >+ * Reclaims unused per-CPU cache memory from the specified zone >+ * >+ * Arguments: >+ * zone The zone for cleanup >+ * Returns: >+ * None >+ */ >+void uma_reclaim_zone_cache(uma_zone_t zone); >+ >+/* > * Sets the alignment mask to be used for all zones requesting cache > * alignment. Should be called by MD boot code prior to starting VM/UMA. > * >@@ -545,6 +555,18 @@ void uma_zone_set_maxaction(uma_zone_t zone, uma_m > int uma_zone_get_cur(uma_zone_t zone); > > /* >+ * Obtains the approximate current size of items free in a zone >+ * >+ * Arguments: >+ * zone The zone to obtain the current free size from >+ * >+ * Return: >+ * size_t The approximate current size of items free in a zone >+ */ >+size_t uma_zone_get_free_size(uma_zone_t zone); >+ >+ >+/* > * The following two routines (uma_zone_set_init/fini) > * are used to set the backend init/fini pair which acts on an > * object as it becomes allocated and is placed in a slab within >Index: vm/uma_core.c >=================================================================== >--- vm/uma_core.c (revision 324056) >+++ vm/uma_core.c (working copy) >@@ -2987,6 +2987,39 @@ uma_zone_get_cur(uma_zone_t zone) > } > > /* See uma.h */ >+size_t >+uma_zone_get_free_size(uma_zone_t zone) >+{ >+ uma_klink_t kl; >+ uma_bucket_t bucket; >+ int64_t nitems; >+ u_int i; >+ >+ ZONE_LOCK(zone); >+ nitems = 0; >+ if(!(zone->uz_flags & UMA_ZONE_SECONDARY)) { >+ LIST_FOREACH(kl, &zone->uz_kegs, kl_link) { >+ nitems += 
kl->kl_keg->uk_free; >+ } >+ } >+ CPU_FOREACH(i) { >+ /* >+ * See the comment in sysctl_vm_zone_stats() regarding the >+ * safety of accessing the per-cpu caches. With the zone lock >+ * held, it is safe, but can potentially result in stale data. >+ */ >+ bucket = zone->uz_cpu[i].uc_allocbucket; >+ if (bucket != NULL) >+ nitems += bucket->ub_cnt; >+ bucket = zone->uz_cpu[i].uc_freebucket; >+ if (bucket != NULL) >+ nitems += bucket->ub_cnt; >+ } >+ ZONE_UNLOCK(zone); >+ return (nitems * zone->uz_size); >+} >+ >+/* See uma.h */ > void > uma_zone_set_init(uma_zone_t zone, uma_init uminit) > { >@@ -3152,6 +3185,14 @@ uma_prealloc(uma_zone_t zone, int items) > } > > /* See uma.h */ >+void >+uma_reclaim_zone_cache(uma_zone_t zone) >+{ >+ bucket_enable(); >+ cache_drain_safe(zone); >+} >+ >+/* See uma.h */ > static void > uma_reclaim_locked(bool kmem_danger) > {
You cannot view the attachment while viewing its details because your browser does not support IFRAMEs.
View the attachment on a separate page.
View Attachment As Diff
View Attachment As Raw
Actions:
View
|
Diff
Attachments on
bug 187594
:
140882
|
140883
|
140884
|
140885
|
140886
|
140887
|
140888
|
140889
|
140890
|
140891
|
140892
|
146178
|
146203
|
146249
|
146251
|
146287
|
146300
|
146373
|
146423
|
146424
|
146456
|
146816
|
146817
|
146851
|
146852
|
146854
|
146859
|
146861
|
146946
|
146947
|
146948
|
146949
|
147014
|
147068
|
147069
|
147070
|
147265
|
147274
|
147275
|
147276
|
147286
|
147459
|
147607
|
147609
|
147733
|
147738
|
147754
|
147815
|
152852
|
158809
|
159207
|
159688
|
159859
|
159905
|
161691
|
161692
|
161943
|
164051
|
174197
|
174198
|
174231
|
174232
|
174254
| 186818