--- cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c	(revision 324056)
+++ cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c	(working copy)
@@ -362,7 +362,16 @@
 int zfs_arc_no_grow_shift = 0;
 int zfs_arc_p_min_shift = 0;
 uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
 u_int zfs_arc_free_target = 0;
+u_int zfs_arc_wakeup_pager = 0;
+u_int zfs_arc_wakeup_delay = 500;	/* Minimum 500 ms between pager wakeups */
+#define	WAKE_PAGER
+#ifdef	WAKE_PAGER
+#define	FREE_TARGET_CONSTANT	10 / 8	/* Target above pageout_wakeup_thresh */
+static int arc_init_done = 0;		/* Set once arc_warm is valid */
+extern void pagedaemon_wakeup(void);
+#endif
+
 
 /* Absolute min for arc min / max is 16MB. */
 static uint64_t arc_abs_min = 16 << 20;
@@ -379,7 +388,9 @@ static void
 arc_free_target_init(void *unused __unused)
 {
 
-	zfs_arc_free_target = vm_pageout_wakeup_thresh;
+	zfs_arc_free_target = vm_pageout_wakeup_thresh + (vm_pageout_wakeup_thresh / 5);
+	zfs_arc_wakeup_pager = vm_pageout_wakeup_thresh + ((vm_cnt.v_free_target - vm_pageout_wakeup_thresh) / 2);
+
 }
 SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE,
     SI_ORDER_ANY, arc_free_target_init, NULL);
@@ -4225,13 +4236,21 @@ int64_t arc_pages_pp_reserve = 64;
  */
 int64_t arc_swapfs_reserve = 64;
 
+typedef enum free_memory_measure_t {
+	FMM_EXCLUDE_ZONE_CACHE,
+	FMM_INCLUDE_ZONE_CACHE
+} free_memory_measure_t;
+
 /*
  * Return the amount of memory that can be consumed before reclaim will be
  * needed.  Positive if there is sufficient free memory, negative indicates
  * the amount of memory that needs to be freed up.
  */
+
+static int64_t arc_check_uma_cache(int64_t lowest);
+
 static int64_t
-arc_available_memory(void)
+arc_available_memory(free_memory_measure_t zone_measure)
 {
 	int64_t lowest = INT64_MAX;
 	int64_t n;
@@ -4238,7 +4257,16 @@ static int64_t
 	free_memory_reason_t r = FMR_UNKNOWN;
 
 #ifdef _KERNEL
+#ifdef WAKE_PAGER
+	sbintime_t now;
+	static sbintime_t last_pagedaemon_wake = 0;
+#endif /* WAKE_PAGER */
+	if (needfree > 0) {
+		n = (int64_t)vm_cnt.v_free_target - (int64_t)vm_cnt.v_free_count;
+		needfree = n > 0 ? n : 0;
+	}
+
 	if (needfree > 0) {
 		n = PAGESIZE * (-needfree);
 		if (n < lowest) {
 			lowest = n;
@@ -4246,11 +4274,41 @@ static int64_t
 		}
 	}
 
+#ifdef WAKE_PAGER
+/*
+ * Once the ARC is initialized, check whether we are in the VM "warming" zone;
+ * if so, wake the pager -- the intent being to demote inactive pages.
+ */
+	if (arc_init_done) {
+		now = getsbinuptime();
+		if ((now - last_pagedaemon_wake) / SBT_1MS > zfs_arc_wakeup_delay) {
+			last_pagedaemon_wake = now;
+#ifdef REAP_ARC
+			arc_no_wake_event++;	/* Set bypass flag for ARC */
+#endif
+			if ((((int64_t)freemem - zfs_arc_wakeup_pager) < 0) && (arc_warm == B_TRUE)) {
+#ifdef REAP_ARC
+				arc_kmem_reap_now(0);	/* Reap caches if we're close */
+#endif
+				DTRACE_PROBE(arc__wake_pagedaemon);
+				(void) pagedaemon_wakeup();	/* Wake the pager */
+#ifdef REAP_ARC
+			} else {
+				if (((int64_t)freemem - vm_cnt.v_free_target) < 0) {
+					arc_kmem_reap_now(1);	/* Reap one cache if lots of memory */
+					DTRACE_PROBE2(arc__reap_one, int, zfs_arc_last_slab, int, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+				}
+#endif
+			}
+		}
+	}
+
+#endif /* WAKE_PAGER */
 	/*
 	 * Cooperate with pagedaemon when it's time for it to scan
 	 * and reclaim some pages.
 	 */
-	n = PAGESIZE * ((int64_t)freemem - zfs_arc_free_target);
+	n = PAGESIZE * ((int64_t)freemem - (int64_t)zfs_arc_free_target - (int64_t)vm_cnt.v_free_reserved);
 	if (n < lowest) {
 		lowest = n;
 		r = FMR_LOTSFREE;
@@ -4355,6 +4413,16 @@ static int64_t
 		}
 	}
 
+	/* Some memory can be sitting in UMA zone cache elements; in that
+	 * case the ARC is not under memory pressure and can still grow.
+	 * zone_measure == FMM_INCLUDE_ZONE_CACHE flags this.
+	 */
+	if (lowest < 0 && zone_measure == FMM_INCLUDE_ZONE_CACHE) {
+		lowest = arc_check_uma_cache(lowest);
+		if (lowest >= 0)
+			r = FMR_UNKNOWN;
+	}
+
 #else	/* _KERNEL */
 	/* Every 100 calls, free a small amount */
 	if (spa_get_random(100) == 0)
@@ -4376,7 +4444,7 @@ static int64_t
 static boolean_t
 arc_reclaim_needed(void)
 {
-	return (arc_available_memory() < 0);
+	return (arc_available_memory(FMM_INCLUDE_ZONE_CACHE) < 0);
 }
 
 extern kmem_cache_t	*zio_buf_cache[];
@@ -4436,6 +4504,66 @@ arc_kmem_reap_now(void)
 	DTRACE_PROBE(arc__kmem_reap_end);
 }
 
+int sysctl_drain_cache = 1;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, drain_uma_cache, CTLFLAG_RW, &sysctl_drain_cache, 0, "drain per-CPU UMA cache");
+
+
+#ifdef _KERNEL
+static int64_t
+arc_check_uma_cache(int64_t lowest)
+{
+	int iter = 4;
+	int step = 1 << (SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT - 3);
+	int n = (SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT) - 1;
+
+	while (n >= 0) {
+		lowest += uma_zone_get_free_size(zio_data_buf_cache[n]->kc_zone);
+		if (lowest >= 0)
+			return lowest;
+		n -= step;
+		if (--iter == 0) {
+			if (step > 1) step >>= 1;
+			iter = 4;
+		}
+	}
+	return lowest;
+}
+#endif
+
+static void
+arc_drain_uma_cache(uint64_t target)
+{
+	int iter = 4;
+	int step = 1 << (SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT - 3);
+	int n = (SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT) - 1;
+	size_t free_size;
+
+	DTRACE_PROBE2(arc__drain_uma_cache_start, uint64_t, target, uint64_t, (uint64_t)vm_cnt.v_free_count * PAGESIZE);
+#ifdef _KERNEL
+	free_size = (uint64_t)vm_cnt.v_free_count * PAGESIZE;
+	if (target <= free_size)
+		return;
+	while (n >= 0) {
+		free_size = uma_zone_get_free_size(zio_data_buf_cache[n]->kc_zone);
+		if (free_size) {
+			if (sysctl_drain_cache)
+				uma_reclaim_zone_cache(zio_data_buf_cache[n]->kc_zone);
+			kmem_cache_reap_now(zio_data_buf_cache[n]);
+			DTRACE_PROBE3(arc__drain_uma_cache_zone, char *, zio_data_buf_cache[n]->kc_name, size_t, free_size, uint64_t, (uint64_t)vm_cnt.v_free_count * PAGESIZE);
+			free_size = (uint64_t)vm_cnt.v_free_count * PAGESIZE;
+			if (target <= free_size)
+				break;
+		}
+		n -= step;
+		if (--iter == 0) {
+			if (step > 1) step >>= 1;
+			iter = 4;
+		}
+	}
+#endif
+	DTRACE_PROBE(arc__drain_uma_cache_end);
+}
+
 /*
  * Threads can block in arc_get_data_impl() waiting for this thread to evict
  * enough data and signal them to proceed. When this happens, the threads in
@@ -4487,7 +4615,8 @@ arc_reclaim_thread(void *dummy __unused)
 		 */
 		evicted = arc_adjust();
 
-		int64_t free_memory = arc_available_memory();
+		int64_t free_memory = arc_available_memory(FMM_EXCLUDE_ZONE_CACHE);
+		DTRACE_PROBE2(arc__reclaim_adj, uint64_t, evicted, int64_t, free_memory);
 		if (free_memory < 0) {
 
 			arc_no_grow = B_TRUE;
@@ -4499,21 +4628,35 @@ arc_reclaim_thread(void *dummy __unused)
 			 */
 			growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
 
+#ifdef _KERNEL
+			if (arc_check_uma_cache(free_memory) >= 0)
+				arc_drain_uma_cache((uint64_t)freemem * PAGESIZE - free_memory);
+#else
 			arc_kmem_reap_now();
-
+#endif
+
 			/*
 			 * If we are still low on memory, shrink the ARC
 			 * so that we have arc_shrink_min free space.
 			 */
-			free_memory = arc_available_memory();
+			free_memory = arc_available_memory(FMM_EXCLUDE_ZONE_CACHE);
 
 			int64_t to_free = (arc_c >> arc_shrink_shift) - free_memory;
+			DTRACE_PROBE3(arc__reclaim_tst, int64_t, to_free, int64_t, free_memory, long, needfree);
 			if (to_free > 0) {
 #ifdef _KERNEL
 				to_free = MAX(to_free, ptob(needfree));
+				uint64_t free_target =
+				    (uint64_t)freemem * PAGESIZE - free_memory;
 #endif
 				arc_shrink(to_free);
+#ifdef _KERNEL
+				arc_drain_uma_cache(free_target);
+#else
+				arc_kmem_reap_now();
+#endif
+				DTRACE_PROBE(arc__reclaim_shr);
 			}
 		} else if (free_memory < arc_c >> arc_no_grow_shift) {
 			arc_no_grow = B_TRUE;
@@ -6308,20 +6451,14 @@ static eventhandler_tag arc_event_lowmem = NULL;
 static void
 arc_lowmem(void *arg __unused, int howto __unused)
 {
+	int64_t n;
 
 	mutex_enter(&arc_reclaim_lock);
 	/* XXX: Memory deficit should be passed as argument. */
-	needfree = btoc(arc_c >> arc_shrink_shift);
+	n = (int64_t)vm_cnt.v_free_target - (int64_t)vm_cnt.v_free_count;
+	needfree = (n > 0) ? n : vm_cnt.v_free_target >> 8;
 	DTRACE_PROBE(arc__needfree);
 	cv_signal(&arc_reclaim_thread_cv);
-
-	/*
-	 * It is unsafe to block here in arbitrary threads, because we can come
-	 * here from ARC itself and may hold ARC locks and thus risk a deadlock
-	 * with ARC reclaim thread.
-	 */
-	if (curproc == pageproc)
-		(void) cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock);
 	mutex_exit(&arc_reclaim_lock);
 }
 #endif
@@ -6632,6 +6769,9 @@ arc_init(void)
 		printf(" in /boot/loader.conf.\n");
 	}
 #endif
+#ifdef WAKE_PAGER
+	arc_init_done++;
+#endif
 }
 
 void
--- vm/uma.h	(revision 324056)
+++ vm/uma.h	(working copy)
@@ -448,6 +448,16 @@ void uma_startup2(void);
 void uma_reclaim(void);
 
 /*
+ * Reclaims unused per-CPU cache memory from the specified zone
+ *
+ * Arguments:
+ *	zone  The zone to drain
+ * Returns:
+ *	None
+ */
+void uma_reclaim_zone_cache(uma_zone_t zone);
+
+/*
  * Sets the alignment mask to be used for all zones requesting cache
  * alignment.  Should be called by MD boot code prior to starting VM/UMA.
  *
@@ -545,6 +555,18 @@ void uma_zone_set_maxaction(uma_zone_t zone, uma_m
 int uma_zone_get_cur(uma_zone_t zone);
 
 /*
+ * Obtains the approximate total size, in bytes, of the free items in a zone
+ *
+ * Arguments:
+ *	zone  The zone to examine
+ *
+ * Return:
+ *	size_t  The approximate total size of the zone's free items
+ */
+size_t uma_zone_get_free_size(uma_zone_t zone);
+
+
+/*
  * The following two routines (uma_zone_set_init/fini)
  * are used to set the backend init/fini pair which acts on an
  * object as it becomes allocated and is placed in a slab within
--- vm/uma_core.c	(revision 324056)
+++ vm/uma_core.c	(working copy)
@@ -2987,6 +2987,39 @@ uma_zone_get_cur(uma_zone_t zone)
 }
 
 /* See uma.h */
+size_t
+uma_zone_get_free_size(uma_zone_t zone)
+{
+	uma_klink_t kl;
+	uma_bucket_t bucket;
+	int64_t nitems;
+	u_int i;
+
+	ZONE_LOCK(zone);
+	nitems = 0;
+	if (!(zone->uz_flags & UMA_ZONE_SECONDARY)) {
+		LIST_FOREACH(kl, &zone->uz_kegs, kl_link) {
+			nitems += kl->kl_keg->uk_free;
+		}
+	}
+	CPU_FOREACH(i) {
+		/*
+		 * See the comment in sysctl_vm_zone_stats() regarding the
+		 * safety of accessing the per-cpu caches. With the zone lock
+		 * held, it is safe, but can potentially result in stale data.
+		 */
+		bucket = zone->uz_cpu[i].uc_allocbucket;
+		if (bucket != NULL)
+			nitems += bucket->ub_cnt;
+		bucket = zone->uz_cpu[i].uc_freebucket;
+		if (bucket != NULL)
+			nitems += bucket->ub_cnt;
+	}
+	ZONE_UNLOCK(zone);
+	return (nitems * zone->uz_size);
+}
+
+/* See uma.h */
 void
 uma_zone_set_init(uma_zone_t zone, uma_init uminit)
 {
@@ -3152,6 +3185,14 @@ uma_prealloc(uma_zone_t zone, int items)
 }
 
 /* See uma.h */
+void
+uma_reclaim_zone_cache(uma_zone_t zone)
+{
+	bucket_enable();
+	cache_drain_safe(zone);
+}
+
+/* See uma.h */
 static void
 uma_reclaim_locked(bool kmem_danger)
 {
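
Reviewer note (not part of the patch): the least obvious detail above is the stride that arc_check_uma_cache() and arc_drain_uma_cache() use to walk zio_data_buf_cache[]. The standalone sketch below only reproduces that index walk and prints which slots would be probed; the SPA_MINBLOCKSHIFT/SPA_MAXBLOCKSHIFT values (512-byte and 16 MB blocks) are assumptions taken from the stock ZFS headers, so adjust them if your tree differs. The walk starts at the largest zone and halves the stride every four probes, so the biggest (most profitable) caches are examined and drained first, with finer coverage of the smaller zones only as the walk proceeds.

/* uma_walk_demo.c -- illustrative only; mirrors the loop shape in the patch. */
#include <stdio.h>

#define	SPA_MINBLOCKSHIFT	9			/* 512-byte minimum block (assumed) */
#define	SPA_MAXBLOCKSHIFT	24			/* 16 MB maximum block (assumed) */
#define	SPA_MAXBLOCKSIZE	(1 << SPA_MAXBLOCKSHIFT)

int
main(void)
{
	int iter = 4;
	int step = 1 << (SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT - 3);
	int n = (SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT) - 1;

	while (n >= 0) {
		/* Index n corresponds to a (n + 1) * 512-byte zio data cache. */
		printf("probe zio_data_buf_cache[%5d] (%6d KB blocks)\n",
		    n, ((n + 1) << SPA_MINBLOCKSHIFT) >> 10);
		n -= step;
		if (--iter == 0) {		/* every 4 probes, tighten the stride */
			if (step > 1)
				step >>= 1;
			iter = 4;
		}
	}
	return (0);
}

Compiled with a plain cc, it prints a few dozen probes ending at index 0 (the 512-byte cache), which is the same order the kernel code uses when deciding how much cached UMA memory is reclaimable and when draining toward the free-page target.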