Index: sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c
===================================================================
--- sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c	(revision 271017)
+++ sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c	(working copy)
@@ -132,12 +132,25 @@ kmem_size(void)
 }
 
 uint64_t
-kmem_used(void)
+kmem_map_used(void)
 {
 
 	return (kmem_map->size);
 }
 
+uint64_t
+kmem_map_free(void)
+{
+	uint64_t size;
+
+	vm_map_lock_read(kmem_map);
+	size = kmem_map->root != NULL ? kmem_map->root->max_free :
+	    kmem_map->max_offset - kmem_map->min_offset;
+	vm_map_unlock_read(kmem_map);
+
+	return (size);
+}
+
 static int
 kmem_std_constructor(void *mem, int size __unused, void *private, int flags)
 {
Index: sys/cddl/compat/opensolaris/sys/kmem.h
===================================================================
--- sys/cddl/compat/opensolaris/sys/kmem.h	(revision 271017)
+++ sys/cddl/compat/opensolaris/sys/kmem.h	(working copy)
@@ -66,7 +66,8 @@ typedef struct kmem_cache {
 void *zfs_kmem_alloc(size_t size, int kmflags);
 void zfs_kmem_free(void *buf, size_t size);
 uint64_t kmem_size(void);
-uint64_t kmem_used(void);
+uint64_t kmem_map_used(void);
+uint64_t kmem_map_free(void);
 kmem_cache_t *kmem_cache_create(char *name, size_t bufsize, size_t align,
     int (*constructor)(void *, void *, int), void (*destructor)(void *, void *),
     void (*reclaim)(void *) __unused, void *private, vmem_t *vmp, int cflags);
@@ -78,6 +79,8 @@ void kmem_reap(void);
 int kmem_debugging(void);
 void *calloc(size_t n, size_t s);
 
+#define	freemem			(cnt.v_free_count + cnt.v_cache_count)
+#define	minfree			cnt.v_free_min
 #define kmem_alloc(size, kmflags)	zfs_kmem_alloc((size), (kmflags))
 #define kmem_zalloc(size, kmflags)	zfs_kmem_alloc((size), (kmflags) | M_ZERO)
 #define kmem_free(buf, size)		zfs_kmem_free((buf), (size))
Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
===================================================================
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c	(revision 271017)
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c	(working copy)
@@ -193,9 +193,6 @@ extern int zfs_prefetch_disable;
  */
 static boolean_t arc_warm;
 
-/*
- * These tunables are for performance analysis.
- */
 uint64_t zfs_arc_max;
 uint64_t zfs_arc_min;
 uint64_t zfs_arc_meta_limit = 0;
@@ -203,7 +200,20 @@ int zfs_arc_grow_retry = 0;
 int zfs_arc_shrink_shift = 0;
 int zfs_arc_p_min_shift = 0;
 int zfs_disable_dup_eviction = 0;
+u_int zfs_arc_free_target = (1 << 19); /* default before pagedaemon init only */
+static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS);
+
+#ifdef _KERNEL
+static void
+arc_free_target_init(void *unused __unused)
+{
+
+	zfs_arc_free_target = cnt.v_free_reserved + cnt.v_cache_min;
+}
+SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
+    arc_free_target_init, NULL);
+
 TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max);
 TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min);
 TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
@@ -214,6 +224,37 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_
     "Minimum ARC size");
 
 /*
+ * We don't have a tunable for arc_free_target due to the dependency on
+ * pagedaemon initialisation.
+ */
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
+    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int),
+    sysctl_vfs_zfs_arc_free_target, "IU",
+    "Desired number of free pages below which ARC triggers reclaim");
+
+static int
+sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS)
+{
+	u_int val;
+	int err;
+
+	val = zfs_arc_free_target;
+	err = sysctl_handle_int(oidp, &val, 0, req);
+	if (err != 0 || req->newptr == NULL)
+		return (err);
+
+	if (val < minfree)
+		return (EINVAL);
+	if (val > cnt.v_page_count)
+		return (EINVAL);
+
+	zfs_arc_free_target = val;
+
+	return (0);
+}
+#endif
+
+/*
  * Note that buffers can be in one of 6 states:
  *	ARC_anon - anonymous (discussed below)
  *	ARC_mru - recently used, currently cached
@@ -2405,9 +2446,12 @@ arc_flush(spa_t *spa)
 void
 arc_shrink(void)
 {
+
 	if (arc_c > arc_c_min) {
 		uint64_t to_free;
 
+		DTRACE_PROBE2(arc__shrink, uint64_t, arc_c, uint64_t,
+		    arc_c_min);
 #ifdef _KERNEL
 		to_free = arc_c >> arc_shrink_shift;
 #else
@@ -2427,8 +2471,11 @@ arc_shrink(void)
 		ASSERT((int64_t)arc_p >= 0);
 	}
 
-	if (arc_size > arc_c)
+	if (arc_size > arc_c) {
+		DTRACE_PROBE2(arc__shrink_adjust, uint64_t, arc_size,
+		    uint64_t, arc_c);
 		arc_adjust();
+	}
 }
 
 static int needfree = 0;
@@ -2439,15 +2486,20 @@ arc_reclaim_needed(void)
 
 #ifdef _KERNEL
 
-	if (needfree)
+	if (needfree) {
+		DTRACE_PROBE(arc__reclaim_needfree);
 		return (1);
+	}
 
 	/*
 	 * Cooperate with pagedaemon when it's time for it to scan
 	 * and reclaim some pages.
 	 */
-	if (vm_paging_needed())
+	if (freemem < zfs_arc_free_target) {
+		DTRACE_PROBE2(arc__reclaim_freetarget, uint64_t,
+		    freemem, uint64_t, zfs_arc_free_target);
 		return (1);
+	}
 
 #ifdef sun
 	/*
@@ -2487,19 +2539,41 @@ arc_reclaim_needed(void)
	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
	 * free)
	 */
-	if (btop(vmem_size(heap_arena, VMEM_FREE)) <
-	    (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
+	if (vmem_size(heap_arena, VMEM_FREE) <
+	    (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2))
 		return (1);
 #endif
+
+	/*
+	 * If zio data pages are being allocated out of a separate heap segment,
+	 * then enforce that the size of available vmem for this arena remains
+	 * above about 1/16th free.
+	 *
+	 * Note: The 1/16th arena free requirement was put in place
+	 * to aggressively evict memory from the arc in order to avoid
+	 * memory fragmentation issues.
+	 */
+	if (zio_arena != NULL &&
+	    vmem_size(zio_arena, VMEM_FREE) <
+	    (vmem_size(zio_arena, VMEM_ALLOC) >> 4))
+		return (1);
 #else	/* !sun */
-	if (kmem_used() > (kmem_size() * 3) / 4)
+#ifndef UMA_MD_SMALL_ALLOC
+	/* i386 has KVA limits that the raw page counts above don't consider */
+	if (kmem_map_used() > (kmem_size() * 3) / 4) {
+		DTRACE_PROBE2(arc__reclaim_used, uint64_t,
+		    kmem_map_used(), uint64_t, (kmem_size() * 3) / 4);
 		return (1);
+	}
+#endif
 #endif	/* sun */
 
-#else
+#else	/* !_KERNEL */
 	if (spa_get_random(100) == 0)
 		return (1);
-#endif
+#endif	/* _KERNEL */
+	DTRACE_PROBE(arc__reclaim_no);
+
 	return (0);
 }
 
@@ -2697,20 +2771,6 @@ arc_evict_needed(arc_buf_contents_t type)
 	if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
 		return (1);
 
-#ifdef sun
-#ifdef _KERNEL
-	/*
-	 * If zio data pages are being allocated out of a separate heap segment,
-	 * then enforce that the size of available vmem for this area remains
-	 * above about 1/32nd free.
-	 */
-	if (type == ARC_BUFC_DATA && zio_arena != NULL &&
-	    vmem_size(zio_arena, VMEM_FREE) <
-	    (vmem_size(zio_arena, VMEM_ALLOC) >> 5))
-		return (1);
-#endif
-#endif	/* sun */
-
 	if (arc_reclaim_needed())
 		return (1);
 
@@ -3876,8 +3936,7 @@ static int
 arc_memory_throttle(uint64_t reserve, uint64_t txg)
 {
 #ifdef _KERNEL
-	uint64_t available_memory =
-	    ptoa((uintmax_t)cnt.v_free_count + cnt.v_cache_count);
+	uint64_t available_memory = ptob(freemem);
 	static uint64_t page_load = 0;
 	static uint64_t last_txg = 0;
 
@@ -3886,10 +3945,13 @@ arc_memory_throttle(uint64_t reserve, uint64_t txg
 	available_memory =
 	    MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
 #endif
+#else	/* sun */
+#ifndef UMA_MD_SMALL_ALLOC
+	available_memory = MIN(available_memory, kmem_map_free());
+#endif
 #endif	/* sun */
 
-	if (cnt.v_free_count + cnt.v_cache_count >
-	    (uint64_t)physmem * arc_lotsfree_percent / 100)
+	if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100)
 		return (0);
 
 	if (txg > last_txg) {
@@ -3902,7 +3964,7 @@ arc_memory_throttle(uint64_t reserve, uint64_t txg
 	 * continue to let page writes occur as quickly as possible.
 	 */
 	if (curproc == pageproc) {
-		if (page_load > available_memory / 4)
+		if (page_load > MAX(ptob(minfree), available_memory) / 4)
 			return (SET_ERROR(ERESTART));
 		/* Note: reserve is inflated, so we deflate */
 		page_load += reserve / 8;
Index: sys/vm/vm_pageout.c
===================================================================
--- sys/vm/vm_pageout.c	(revision 271017)
+++ sys/vm/vm_pageout.c	(working copy)
@@ -112,9 +112,13 @@ __FBSDID("$FreeBSD$");
 
 /* the kernel process "vm_pageout"*/
 static void vm_pageout(void);
+static void vm_pageout_init(void);
 static int vm_pageout_clean(vm_page_t);
 static void vm_pageout_scan(int pass);
 
+SYSINIT(pagedaemon_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, vm_pageout_init,
+    NULL);
+
 struct proc *pageproc;
 
 static struct kproc_desc page_kp = {
@@ -122,7 +126,7 @@ static struct kproc_desc page_kp = {
 	vm_pageout,
 	&pageproc
 };
-SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start,
+SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start,
     &page_kp);
 
 #if !defined(NO_SWAPPING)
@@ -1506,13 +1510,11 @@ vm_pageout_page_stats()
 }
 
 /*
- * vm_pageout is the high level pageout daemon.
+ * vm_pageout_init initialises basic pageout daemon settings.
  */
 static void
-vm_pageout()
+vm_pageout_init()
 {
-	int error, pass;
-
 	/*
 	 * Initialize some paging parameters.
 	 */
@@ -1579,7 +1581,16 @@ static void
 		vm_pageout_stats_interval = 5;
 	if (vm_pageout_full_stats_interval == 0)
 		vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4;
+}
 
+/*
+ * vm_pageout is the high level pageout daemon.
+ */
+static void
+vm_pageout()
+{
+	int error, pass;
+
 	swap_pager_swap_init();
 	pass = 0;
 	/*
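Not part of the patch: below is a minimal userland C sketch of the reclaim test the diff introduces in arc_reclaim_needed(), for readers who want to play with the thresholds. The variables are stand-ins for the kernel state the patch consults (freemem, the pagedaemon-derived zfs_arc_free_target, and the kmem map usage checked on KVA-limited platforms such as i386); the numbers in main() are made-up examples, not defaults.

#include <stdint.h>
#include <stdio.h>

/* Stand-ins for kernel counters; not the real cnt/kmem_map symbols. */
static uint64_t freemem;		/* v_free_count + v_cache_count */
static uint64_t zfs_arc_free_target;	/* v_free_reserved + v_cache_min */
static uint64_t kmem_used_bytes;	/* kmem_map_used() */
static uint64_t kmem_total_bytes;	/* kmem_size() */

/*
 * Sketch of the post-patch decision: reclaim when free pages fall
 * below the target, or (without UMA_MD_SMALL_ALLOC, e.g. i386) when
 * the kmem map is more than 3/4 full.
 */
static int
arc_reclaim_needed_sketch(void)
{

	if (freemem < zfs_arc_free_target)
		return (1);
	if (kmem_used_bytes > (kmem_total_bytes * 3) / 4)
		return (1);
	return (0);
}

int
main(void)
{

	freemem = 20000;		/* hypothetical page counts */
	zfs_arc_free_target = 32768;
	kmem_used_bytes = 1ULL << 28;	/* hypothetical KVA usage */
	kmem_total_bytes = 1ULL << 30;
	printf("reclaim needed: %d\n", arc_reclaim_needed_sketch());
	return (0);
}

With the sample numbers above the sketch reports that reclaim is needed, since free pages (20000) sit below the target (32768). On a live system running the patched code, the target is read and written through the vfs.zfs.arc_free_target sysctl that the patch adds; the handler rejects values below minfree or above the total page count.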