*** arc.c.orig Sun Mar 23 14:55:30 2014 --- arc.c Mon Aug 25 21:28:14 2014 *************** *** 18,23 **** --- 18,93 ---- * * CDDL HEADER END */ + + /* Karl Denninger (karl@denninger.net), 8/25/2014, FreeBSD-specific + * + * If "NEWRECLAIM" is defined, change the "low memory" warning that causes + * the ARC cache to be pared down. The reason for the change is that the + * apparent attempted algorithm is to start evicting ARC cache when free + * pages fall below 25% of installed RAM. This maps reasonably well to how + * Solaris is documented to behave; when "lotsfree" is invaded ZFS is told + * to pare down. + * + * The problem is that on FreeBSD machines the system doesn't appear to be + * getting what the authors of the original code thought they were looking at + * with its test -- or at least not what Solaris did -- and as a result that + * test never triggers. That leaves the only reclaim trigger as the "paging + * needed" status flag, and by the time that trips the system is already + * in low-memory trouble. This can lead to severe pathological behavior + * under the following scenario: + * - The system starts to page and ARC is evicted. + * - The system stops paging as ARC's eviction drops wired RAM a bit. + * - ARC starts increasing its allocation again, and wired memory grows. + * - A new image is activated, and the system once again attempts to page. + * - ARC starts to be evicted again. + * - Back to the second step (the system stops paging), and so on. + * + * Note that ZFS's ARC default (unless you override it in /boot/loader.conf) + * is to allow the ARC cache to grab nearly all of free RAM, provided nobody + * else needs it. That would be ok if we evicted cache when required. + * + * Unfortunately the system can get into a state where it never + * manages to page anything of materiality back in, because if there is active + * I/O the ARC will start grabbing space once again as soon as the memory + * contention state drops. 
For this reason the "paging is occurring" flag + * should be the **last resort** condition for ARC eviction; you want to + * (as Solaris does) start when there is material free RAM left BUT the + * vm system thinks it needs to be active to steal pages back in the attempt + * to never get into the condition where you're potentially paging off + * executables in favor of leaving disk cache allocated. + * + * To fix this we change how we look at low memory, declaring one new + * runtime tunable and one status flag. + * + * The new sysctls are: + * vfs.zfs.arc_freepages (free pages required to call RAM "sufficient") + * vfs.zfs.arc_shrink_needed (shows "1" if we're asking for shrinking the ARC) + * + * vfs.zfs.arc_freepages is initialized from vm.v_free_target. + * This should ensure that we allow the VM system to steal pages, + * but pare the cache before we suspend processes attempting to get more + * memory, thereby avoiding "stalls." You can set this higher if you wish, + * but doing so may cause the cache to pare back while the VM system + * remains willing to allow "inactive" pages to accumulate. The challenge + * is that image activation can force things into the page space on a + * repeated basis if you allow this level to be too small (the above + * pathological behavior); the defaults should avoid that behavior but the + * sysctl is exposed should your workload require adjustment. + * + * If we're using this check for low memory we are replacing the previous + * ones, including the oddball "random" reclaim that appears to fire far + * more often than it should. We still trigger if the system pages. + * + * If you turn on NEWRECLAIM_DEBUG then the kernel will print on the console + * status messages when the reclaim status trips on and off, along with the + * page count aggregate that triggered it (and the free space) for each + * event. + */ + + #define NEWRECLAIM + #undef NEWRECLAIM_DEBUG + + /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. 
All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. *************** *** 139,144 **** --- 209,228 ---- #include + #ifdef NEWRECLAIM + #ifdef __FreeBSD__ + #include + #include + /* + * struct cnt was renamed to vm_cnt in -head (11-current) at __FreeBSD_version 1100016; check for it + */ + #if __FreeBSD_version < 1100016 + #define vm_cnt cnt + #endif /* __FreeBSD_version */ + + #endif /* __FreeBSD__ */ + #endif /* NEWRECLAIM */ + #ifdef illumos #ifndef _KERNEL /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */ *************** *** 203,218 **** --- 287,322 ---- int zfs_arc_shrink_shift = 0; int zfs_arc_p_min_shift = 0; int zfs_disable_dup_eviction = 0; + #ifdef NEWRECLAIM + #ifdef __FreeBSD__ + static int freepages = 0; /* Free-page threshold for requesting ARC shrinkage; when 0 it is (re)initialized from vm_cnt.v_free_target */ + static int shrink_needed = 0; /* Status: 1 while the low-memory check asks for ARC shrinkage, else 0 */ + #endif /* __FreeBSD__ */ + #endif /* NEWRECLAIM */ TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max); TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min); TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit); + #ifdef NEWRECLAIM + #ifdef __FreeBSD__ + TUNABLE_INT("vfs.zfs.arc_freepages", &freepages); + TUNABLE_INT("vfs.zfs.arc_shrink_needed", &shrink_needed); /* NOTE(review): shrink_needed is a status flag (sysctl is CTLFLAG_RD) that the reclaim check overwrites; a loader tunable for it looks unintended -- confirm */ + #endif /* __FreeBSD__ */ + #endif /* NEWRECLAIM */ + SYSCTL_DECL(_vfs_zfs); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0, "Maximum ARC size"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0, "Minimum ARC size"); + #ifdef NEWRECLAIM + #ifdef __FreeBSD__ + SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_freepages, CTLFLAG_RWTUN, &freepages, 0, "ARC Free RAM Pages Required"); + SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_needed, CTLFLAG_RD, &shrink_needed, 0, "ARC Memory Constrained (0 = no, 1 = yes)"); + #endif /* __FreeBSD__ */ + #endif /* NEWRECLAIM */ + /* * Note that buffers can be in one of 6 states: * ARC_anon - anonymous (discussed below) *************** *** 2438,2443 **** --- 2542,2551 ---- { #ifdef _KERNEL + #ifdef 
NEWRECLAIM_DEBUG + static int xval = -1; + static int oldfreepages = 0; + #endif /* NEWRECLAIM_DEBUG */ if (needfree) return (1); *************** *** 2476,2481 **** --- 2584,2590 ---- return (1); #if defined(__i386) + /* * If we're on an i386 platform, it's possible that we'll exhaust the * kernel heap space before we ever run out of available physical *************** *** 2492,2502 **** return (1); #endif #else /* !sun */ if (kmem_used() > (kmem_size() * 3) / 4) return (1); #endif /* sun */ - #else if (spa_get_random(100) == 0) return (1); #endif --- 2601,2661 ---- return (1); #endif #else /* !sun */ + + #ifdef NEWRECLAIM + #ifdef __FreeBSD__ + /* + * Implement the new tunable free RAM algorithm. We check the free pages + * against the minimum specified target (freepages). If free pages fall + * below it we ask for ARC cache shrinkage. If this is defined + * on a FreeBSD system the older checks are not performed. + * + * Check first to see if we need to init freepages, then test. + */ + if (!freepages) { /* If zero then (re)init */ + freepages = vm_cnt.v_free_target; + #ifdef NEWRECLAIM_DEBUG + printf("ZFS ARC: Default vfs.zfs.arc_freepages to [%u]\n", freepages); + #endif /* NEWRECLAIM_DEBUG */ + } + #ifdef NEWRECLAIM_DEBUG + if (freepages != oldfreepages) { + printf("ZFS ARC: Low RAM page change to [%d], [%d] pages, [%d] free\n", freepages, vm_cnt.v_page_count, vm_cnt.v_free_count); + oldfreepages = freepages; + } + #endif /* NEWRECLAIM_DEBUG */ + /* + * Now decide whether the ARC cache status is "ok": shrinkage is requested + * when the free page count is below the freepages requirement. 
+ */ + + if (vm_cnt.v_free_count < freepages) { + #ifdef NEWRECLAIM_DEBUG + if (xval != 1) { + printf("ZFS ARC: RECLAIM total %u, free %u, free pct (%u), reserved (%u)\n", vm_cnt.v_page_count, vm_cnt.v_free_count, ((vm_cnt.v_free_count * 100) / vm_cnt.v_page_count), freepages); + xval = 1; + } + #endif /* NEWRECLAIM_DEBUG */ + shrink_needed = 1; + return(1); + } else { + #ifdef NEWRECLAIM_DEBUG + if (xval != 0) { + printf("ZFS ARC: NORMAL total %u, free %u, free pct (%u), reserved (%u)\n", vm_cnt.v_page_count, vm_cnt.v_free_count, ((vm_cnt.v_free_count * 100) / vm_cnt.v_page_count), freepages); + xval = 0; + } + #endif /* NEWRECLAIM_DEBUG */ + shrink_needed = 0; + return(0); + } + + #endif /* __FreeBSD__ */ + #endif /* NEWRECLAIM */ + if (kmem_used() > (kmem_size() * 3) / 4) return (1); #endif /* sun */ if (spa_get_random(100) == 0) return (1); #endif