(-)sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c (-16 / +68 lines)
Lines 190-195 static int arc_dead;
 extern int zfs_prefetch_disable;
 
 /*
+ * KD 2015-02-10
+ * We have to be able to test for UIO use inside the arc allocator.
+ * NOTE: DO NOT MODIFY HERE!
+ */
+extern int zio_use_uma;
+extern int zfs_dynamic_write_buffer;
+
+
+/*
  * The arc has filled available memory and has now warmed up.
  */
 static boolean_t arc_warm;
Lines 212-218 static void
 arc_free_target_init(void *unused __unused)
 {
 
-	zfs_arc_free_target = vm_pageout_wakeup_thresh;
+	zfs_arc_free_target = vm_pageout_wakeup_thresh + ((vm_cnt.v_free_target - vm_pageout_wakeup_thresh) / 2);
 }
 SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
     arc_free_target_init, NULL);
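
The hunk above raises the ARC's free-memory target from the pageout wakeup threshold to the midpoint between that threshold and the VM's free-page target, so the ARC starts shedding memory before the pagedaemon is forced awake. A standalone sketch of the arithmetic (page counts are hypothetical; the real values are computed by the VM at boot):

	unsigned int wakeup_thresh = 13107;	/* stand-in for vm_pageout_wakeup_thresh */
	unsigned int free_target = 26214;	/* stand-in for vm_cnt.v_free_target */
	unsigned int arc_free_target;

	arc_free_target = wakeup_thresh + ((free_target - wakeup_thresh) / 2);
	/* arc_free_target == 19660 pages: reclaim begins halfway between
	 * the two thresholds rather than right at the wakeup point. */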
Lines 233-239 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_block
 SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW,
     &arc_shrink_shift, 0,
     "log2(fraction of arc to reclaim)");
-
+SYSCTL_INT(_vfs_zfs, OID_AUTO, dynamic_write_buffer, CTLFLAG_RWTUN,
+    &zfs_dynamic_write_buffer, 0,
+    "Dynamically restrict dirty data when memory is low");
 /*
  * We don't have a tunable for arc_free_target due to the dependency on
  * pagedaemon initialisation.
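
Because the new knob is declared with CTLFLAG_RWTUN, it can be set as a loader tunable and also changed at runtime (sysctl vfs.zfs.dynamic_write_buffer=0 disables the dynamic write-buffer scaling implemented in dmu_tx.c below).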
Lines 2645-2650 extern kmem_cache_t *zio_buf_cache[];
 extern kmem_cache_t	*zio_data_buf_cache[];
 extern kmem_cache_t	*range_seg_cache;
 
+
+static void __used
+reap_arc_caches(void)
+{
+	size_t		i;
+	kmem_cache_t	*prev_cache = NULL;
+	kmem_cache_t	*prev_data_cache = NULL;
+
+	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
+		if (zio_buf_cache[i] != prev_cache) {
+			prev_cache = zio_buf_cache[i];
+			kmem_cache_reap_now(zio_buf_cache[i]);
+		}
+		if (zio_data_buf_cache[i] != prev_data_cache) {
+			prev_data_cache = zio_data_buf_cache[i];
+			kmem_cache_reap_now(zio_data_buf_cache[i]);
+		}
+	}
+	kmem_cache_reap_now(buf_cache);
+	kmem_cache_reap_now(hdr_full_cache);
+	kmem_cache_reap_now(hdr_l2only_cache);
+	kmem_cache_reap_now(range_seg_cache);
+}
+
 static __noinline void
 arc_kmem_reap_now(void)
 {
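
The prev_cache/prev_data_cache tests in reap_arc_caches() matter because consecutive slots of zio_buf_cache[] can alias the same kmem cache (ZFS points sizes without a dedicated cache at the next larger one), so a naive loop would reap the same cache repeatedly. A minimal userland analogue of the skip logic (hypothetical names, not part of the patch):

	#include <stdio.h>

	struct cache { const char *name; };

	int
	main(void)
	{
		struct cache c4k = { "4K" }, c8k = { "8K" }, c16k = { "16K" };
		/* consecutive slots sharing a cache, as in zio_buf_cache[] */
		struct cache *caches[] = { &c4k, &c4k, &c8k, &c8k, &c8k, &c16k };
		struct cache *prev = NULL;
		size_t i;

		for (i = 0; i < sizeof(caches) / sizeof(caches[0]); i++) {
			if (caches[i] != prev) {	/* same pointer test as the patch */
				prev = caches[i];
				printf("reap %s\n", caches[i]->name);
			}
		}
		return (0);	/* prints: reap 4K, reap 8K, reap 16K */
	}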
Lines 2676-2695 arc_kmem_reap_now(void)
 #endif
 #endif
 
-	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
-		if (zio_buf_cache[i] != prev_cache) {
-			prev_cache = zio_buf_cache[i];
-			kmem_cache_reap_now(zio_buf_cache[i]);
-		}
-		if (zio_data_buf_cache[i] != prev_data_cache) {
-			prev_data_cache = zio_data_buf_cache[i];
-			kmem_cache_reap_now(zio_data_buf_cache[i]);
-		}
-	}
-	kmem_cache_reap_now(buf_cache);
-	kmem_cache_reap_now(hdr_full_cache);
-	kmem_cache_reap_now(hdr_l2only_cache);
-	kmem_cache_reap_now(range_seg_cache);
+	reap_arc_caches();
 
 #ifdef illumos
 	if (zio_arena != NULL) {
Lines 2707-2718 arc_reclaim_thread(void *dummy __unused)
 {
 	clock_t			growtime = 0;
 	callb_cpr_t		cpr;
+	int			autoreap = 0;
 
 	CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
 
 	mutex_enter(&arc_reclaim_thr_lock);
 	while (arc_thread_exit == 0) {
+
+#ifdef _KERNEL
+/* KD 2015-02-10
+ * Protect against UMA free memory bloat.  We already do this on a low-memory
+ * basis in the allocator; it has to happen there rather than here due to
+ * response time considerations.  Make the call here once every 10 passes as
+ * well; this reclaims unused UMA buffers every 10 seconds on an idle system
+ * and more frequently if the reclaim thread gets woken up by low RAM
+ * conditions.
+ */
+		if ((zio_use_uma) && (autoreap++ == 10)) {
+			autoreap = 0;
+			DTRACE_PROBE(arc__reclaim_timed_reap);
+			reap_arc_caches();
+		}
+#endif /* _KERNEL */
+
 		int64_t free_memory = arc_available_memory();
 		if (free_memory < 0) {
 
 			arc_no_grow = B_TRUE;
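
The reclaim thread of this vintage sleeps roughly one second per pass, which is how the comment's "every 10 seconds on an idle system" arises. One detail of the post-increment gate is worth seeing explicitly (a sketch, derived from the expression itself):

	int autoreap = 0, pass;

	for (pass = 1; pass <= 22; pass++) {
		if (autoreap++ == 10) {	/* true when autoreap was 10 */
			autoreap = 0;
			/* fires on passes 11 and 22, i.e. every 11th pass,
			 * one more than the comment's "every 10 passes" */
		}
	}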
Lines 2899-2904 arc_get_data_buf(arc_buf_t *buf)
 			arc_space_consume(size, ARC_SPACE_DATA);
 		} else {
 			ASSERT(type == ARC_BUFC_DATA);
+#ifdef _KERNEL
+/* KD 2015-02-10
+ * It would be nice if we could leave this to the arc_reclaim thread.
+ * Unfortunately we cannot; the test has to be done here as well, because
+ * under heavy I/O demand we can grab enough RAM fast enough to induce
+ * nasty oscillation problems.  Fortunately we only need to call this when
+ * the system is under reasonably-severe memory stress.
+ */
+			if (zio_use_uma && (ptob(vm_cnt.v_free_count) + size < ptob(vm_cnt.v_free_target))) {
+				DTRACE_PROBE3(arc__alloc_lowmem_reap, int, vm_cnt.v_free_count, int, size, int, vm_cnt.v_free_target);
+				reap_arc_caches();
+			}
+#endif /* _KERNEL */
 			buf->b_data = zio_data_buf_alloc(size);
 			ARCSTAT_INCR(arcstat_data_size, size);
 			atomic_add_64(&arc_size, size);
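
The allocator-side trigger compares bytes, not pages: ptob() scales a page count to bytes, so the test asks whether free memory would still sit below the VM's free target once this buffer is taken. Worked numbers (all hypothetical, assuming 4 KB pages):

	unsigned long page_size = 4096;		/* what ptob() multiplies by here */
	unsigned long v_free_count = 60000;	/* ~234 MB free */
	unsigned long v_free_target = 65536;	/* 256 MB target */
	unsigned long size = 131072;		/* a 128 KB ARC data buffer */

	if (v_free_count * page_size + size < v_free_target * page_size) {
		/* 245760000 + 131072 < 268435456: memory-stressed,
		 * so reap now instead of waiting for the reclaim thread */
	}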
(-)sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c (-4 / +43 lines)
Lines 42-47
 typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
     uint64_t arg1, uint64_t arg2);
 
+extern	int	zio_use_uma;	/* Needs to be visible; DO NOT MODIFY! */
+int	zfs_dynamic_write_buffer = 1;	/* Dynamically tune writes */
 
 dmu_tx_t *
 dmu_tx_create_dd(dsl_dir_t *dd)
Lines 1058-1064 dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
 {
 	dsl_pool_t *dp = tx->tx_pool;
 	uint64_t delay_min_bytes =
-	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
+	    zfs_dirty_data_max_internal * zfs_delay_min_dirty_percent / 100;
 	hrtime_t wakeup, min_tx_time, now;
 
 	if (dirty <= delay_min_bytes)
Lines 1070-1080 dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
 	 * have to handle the case of it being >= the max, which could
 	 * cause a divide-by-zero if it's == the max.
 	 */
-	ASSERT3U(dirty, <, zfs_dirty_data_max);
+	ASSERT3U(dirty, <, zfs_dirty_data_max_internal);
 
 	now = gethrtime();
 	min_tx_time = zfs_delay_scale *
-	    (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
+	    (dirty - delay_min_bytes) / (zfs_dirty_data_max_internal - dirty);
 	if (now > tx->tx_start + min_tx_time)
 		return;
 
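
With the substitution above, the write throttle keys off the dynamic cap instead of the static tunable. A worked pass through the formula, using the stock defaults of the era (zfs_delay_scale = 500000, zfs_delay_min_dirty_percent = 60) and a hypothetical 512 MB internal cap:

	uint64_t max_internal = 512ULL << 20;		/* hypothetical dynamic cap */
	uint64_t delay_min = max_internal * 60 / 100;	/* 322122547 bytes */
	uint64_t dirty = 400ULL << 20;			/* 400 MB outstanding */
	uint64_t scale = 500000;

	uint64_t min_tx_time = scale * (dirty - delay_min) / (max_internal - dirty);
	/* = 500000 * 97307853 / 117440512 = 414285 ns: each assignment is
	 * held back ~0.4 ms, climbing steeply as dirty data nears the cap. */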
Lines 1279-1284 int
 dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
 {
 	int err;
+	static	uint64_t	last_max;
 
 	ASSERT(tx->tx_txg == 0);
 	ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT ||
1291
	if (txg_how == TXG_WAITED)
1294
	if (txg_how == TXG_WAITED)
1292
		tx->tx_waited = B_TRUE;
1295
		tx->tx_waited = B_TRUE;
1293
1296
1297
#ifdef _KERNEL
1298
	/*
1299
	 * KD 2014-09-22
1300
	 * If UMA is enabled it can only return a previously-used block
1301
	 * of identical size to what it had out before.  If it's not the
1302
	 * same size it will allocate a new one.  This is a problem because
1303
	 * dirty_data_max is the total dirty write data allowed out at any
1304
	 * given time, but with UMA on that can multiply by the number of
1305
	 * different block sizes (!!) requested in terms of free RAM that
1306
	 * is left allocated but unused.  For this reason never allow
1307
	 * dirty_data_max to exceed the difference between the paging
1308
	 * threshold and the current free memory, with a minimum of 256MB.
1309
	 * This throttles "burst" allocations and prevents the system from
1310
	 * choking during times of high write I/O demand.
1311
	 *
1312
	 * We allow this to be turned off if you want with
1313
	 * "vfs.zfs_dynamic_write_buffer=0", which can be done in real time.
1314
	 *
1315
	 * Note that we work on the zfs_dirty_data_max_internal variable,
1316
	 * because the user may set zfs_dirty_data_max himself and we must
1317
	 * must honor that as a hard cap so it remains a usable tunable value.
1318
	 */
1319
	if (zio_use_uma & zfs_dynamic_write_buffer) {
1320
		zfs_dirty_data_max_internal = 1 << 28;
1321
		zfs_dirty_data_max_internal = MAX(zfs_dirty_data_max_internal, ptob(vm_cnt.v_free_count - vm_cnt.v_free_target));
1322
		zfs_dirty_data_max_internal = MIN(zfs_dirty_data_max_internal, zfs_dirty_data_max);
1323
		zfs_dirty_data_max_internal = MIN(zfs_dirty_data_max_internal, zfs_dirty_data_max_max);
1324
		if (last_max != (zfs_dirty_data_max_internal / (1024 * 1024))) {
1325
			last_max = zfs_dirty_data_max_internal / (1024 * 1024);
1326
			DTRACE_PROBE1(dmu__tx_dirty, uint64_t, last_max);
1327
		}
1328
	} else {
1329
		zfs_dirty_data_max_internal = zfs_dirty_data_max;
1330
	}
1331
#endif /* _KERNEL */
1332
1294
	while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
1333
	while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
1295
		dmu_tx_unassign(tx);
1334
		dmu_tx_unassign(tx);
1296
1335
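
The clamp sequence settles zfs_dirty_data_max_internal in a single pass: a 256 MB floor, raised to however much RAM sits above the paging target, then bounded by the user's cap and the compiled-in ceiling. A worked sketch (hypothetical figures; MAX/MIN as in sys/param.h):

	uint64_t dirty_max = 3ULL << 30;		/* user tunable: 3 GB */
	uint64_t dirty_max_max = 4ULL << 30;		/* absolute ceiling: 4 GB */
	uint64_t free_above_target = 512ULL << 20;	/* ptob(v_free_count - v_free_target) */
	uint64_t internal;

	internal = 1 << 28;				/* 256 MB floor */
	internal = MAX(internal, free_above_target);	/* -> 512 MB */
	internal = MIN(internal, dirty_max);		/* user cap holds */
	internal = MIN(internal, dirty_max_max);	/* still 512 MB */

Note that when v_free_count drops below v_free_target the unsigned subtraction wraps to a huge value; it is the subsequent MIN() pair that pulls the result back down to the user's cap.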
Lines 1321-1327 dmu_tx_wait(dmu_tx_t *tx)
 		 * space.
 		 */
 		mutex_enter(&dp->dp_lock);
-		while (dp->dp_dirty_total >= zfs_dirty_data_max)
+		while (dp->dp_dirty_total >= zfs_dirty_data_max_internal)
 			cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
 		uint64_t dirty = dp->dp_dirty_total;
 		mutex_exit(&dp->dp_lock);
(-)sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c (-2 / +5 lines)
Lines 97-104
 /*
  * zfs_dirty_data_max will be set to zfs_dirty_data_max_percent% of all memory,
  * capped at zfs_dirty_data_max_max.  It can also be overridden in /etc/system.
+ * We also dynamically tune during low memory, honoring the sysctl setting, so
+ * internal comparisons are against zfs_dirty_data_max_internal.
  */
 uint64_t zfs_dirty_data_max;
+uint64_t zfs_dirty_data_max_internal;
 uint64_t zfs_dirty_data_max_max = 4ULL * 1024 * 1024 * 1024;
 int zfs_dirty_data_max_percent = 10;
 
Lines 547-553 dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta)
 	 * Note: we signal even when increasing dp_dirty_total.
 	 * This ensures forward progress -- each thread wakes the next waiter.
 	 */
-	if (dp->dp_dirty_total <= zfs_dirty_data_max)
+	if (dp->dp_dirty_total <= zfs_dirty_data_max_internal)
 		cv_signal(&dp->dp_spaceavail_cv);
 }
 
Lines 726-732 boolean_t
 dsl_pool_need_dirty_delay(dsl_pool_t *dp)
 {
 	uint64_t delay_min_bytes =
-	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
+	    zfs_dirty_data_max_internal * zfs_delay_min_dirty_percent / 100;
 	boolean_t rv;
 
 	mutex_enter(&dp->dp_lock);
Lines 50-55 struct dmu_tx; Link Here
50
struct dsl_scan;
50
struct dsl_scan;
51
51
52
extern uint64_t zfs_dirty_data_max;
52
extern uint64_t zfs_dirty_data_max;
53
extern uint64_t zfs_dirty_data_max_internal;
53
extern uint64_t zfs_dirty_data_max_max;
54
extern uint64_t zfs_dirty_data_max_max;
54
extern uint64_t zfs_dirty_data_sync;
55
extern uint64_t zfs_dirty_data_sync;
55
extern int zfs_dirty_data_max_percent;
56
extern int zfs_dirty_data_max_percent;
(-)sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c (-2 / +5 lines)
Lines 43-52 Link Here
43
43
44
SYSCTL_DECL(_vfs_zfs);
44
SYSCTL_DECL(_vfs_zfs);
45
SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
45
SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
46
/* KD 2015-07-15 Change class to "int" from "static int" as we reference 
47
 * this as an extern elsewhere
48
 */
46
#if defined(__amd64__)
49
#if defined(__amd64__)
47
static int zio_use_uma = 1;
50
int zio_use_uma = 1;
48
#else
51
#else
49
static int zio_use_uma = 0;
52
int zio_use_uma = 0;
50
#endif
53
#endif
51
TUNABLE_INT("vfs.zfs.zio.use_uma", &zio_use_uma);
54
TUNABLE_INT("vfs.zfs.zio.use_uma", &zio_use_uma);
52
SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0,
55
SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0,

Return to bug 187594