
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c (-16 / +67 lines)
@@ -216,6 +216,15 @@
 extern int zfs_prefetch_disable;
 
 /*
+ * KD 2015-02-10
+ * We have to be able to test for UIO use inside the arc allocator.
+ * NOTE: DO NOT MODIFY HERE!
+ */
+extern int zio_use_uma;
+extern int zfs_dynamic_write_buffer;
+
+
+/*
  * The arc has filled available memory and has now warmed up.
  */
 static boolean_t arc_warm;
@@ -242,7 +251,7 @@
 arc_free_target_init(void *unused __unused)
 {
 
-	zfs_arc_free_target = vm_pageout_wakeup_thresh;
+	zfs_arc_free_target = vm_pageout_wakeup_thresh + ((cnt.v_free_target - vm_pageout_wakeup_thresh) / 2);
 }
 SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
     arc_free_target_init, NULL);
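
The new initializer parks the ARC's free-memory target halfway between the pagedaemon wakeup threshold and the pager's free-page goal, so the ARC starts shrinking before the pager has to run. A minimal userland sketch of the arithmetic, with invented page counts (the real values come from vm_pageout_wakeup_thresh and cnt.v_free_target at boot):

#include <stdio.h>

int
main(void)
{
	/* Hypothetical page counts, for illustration only. */
	unsigned long wakeup_thresh = 21000;	/* vm_pageout_wakeup_thresh */
	unsigned long free_target = 56000;	/* cnt.v_free_target */

	/* Old: reclaim only at the pager wakeup point. */
	unsigned long old_target = wakeup_thresh;
	/* New: split the difference so the ARC backs off first. */
	unsigned long new_target = wakeup_thresh +
	    ((free_target - wakeup_thresh) / 2);

	printf("old target %lu pages, new target %lu pages\n",
	    old_target, new_target);
	return (0);
}
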
@@ -261,7 +270,9 @@
 SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW,
     &arc_shrink_shift, 0,
     "log2(fraction of arc to reclaim)");
-
+SYSCTL_INT(_vfs_zfs, OID_AUTO, dynamic_write_buffer, CTLFLAG_RWTUN,
+    &zfs_dynamic_write_buffer, 0,
+    "Dynamically restrict dirty data when memory is low");
 /*
  * We don't have a tunable for arc_free_target due to the dependency on
  * pagedaemon initialisation.
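
Since the knob is registered with CTLFLAG_RWTUN under the vfs.zfs node, the resulting OID should be vfs.zfs.dynamic_write_buffer, adjustable at runtime (`sysctl vfs.zfs.dynamic_write_buffer=0`) as well as from /boot/loader.conf. Note that the KD comment in dmu_tx.c below spells it "vfs.zfs_dynamic_write_buffer", which does not match the OID this declaration actually creates.
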
@@ -3518,6 +3529,29 @@
 extern kmem_cache_t	*zio_data_buf_cache[];
 extern kmem_cache_t	*range_seg_cache;
 
+static void __used
+reap_arc_caches()
+{
+	size_t		i;
+	kmem_cache_t	*prev_cache = NULL;
+	kmem_cache_t	*prev_data_cache = NULL;
+
+	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
+		if (zio_buf_cache[i] != prev_cache) {
+			prev_cache = zio_buf_cache[i];
+			kmem_cache_reap_now(zio_buf_cache[i]);
+		}
+		if (zio_data_buf_cache[i] != prev_data_cache) {
+			prev_data_cache = zio_data_buf_cache[i];
+			kmem_cache_reap_now(zio_data_buf_cache[i]);
+		}
+	}
+	kmem_cache_reap_now(buf_cache);
+	kmem_cache_reap_now(hdr_full_cache);
+	kmem_cache_reap_now(hdr_l2only_cache);
+	kmem_cache_reap_now(range_seg_cache);
+}
+
 static __noinline void
 arc_kmem_reap_now(void)
 {
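
The prev_cache/prev_data_cache checks exist because consecutive zio size classes commonly alias one underlying kmem cache, so a naive loop would reap the same cache repeatedly. A userland sketch of the skip-duplicates idiom, with stand-in string objects and a counter replacing the real caches and kmem_cache_reap_now() (names here are illustrative, not from the patch):

#include <stdio.h>

static int reaps;

static void
reap_now(const char *cache)
{
	reaps++;
	printf("reaping %s\n", cache);
}

int
main(void)
{
	/* Distinct objects so pointer equality models cache aliasing. */
	static char c512[] = "zio_512", c1024[] = "zio_1024",
	    c4096[] = "zio_4096";
	char *zio_buf_cache[] = { c512, c512, c1024, c1024, c1024, c4096 };
	char *prev_cache = NULL;
	size_t i;

	for (i = 0; i < sizeof(zio_buf_cache) / sizeof(zio_buf_cache[0]); i++) {
		if (zio_buf_cache[i] != prev_cache) {	/* pointer compare */
			prev_cache = zio_buf_cache[i];
			reap_now(zio_buf_cache[i]);
		}
	}
	printf("%d reaps for %zu slots\n", reaps, i);	/* 3, not 6 */
	return (0);
}
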
@@ -3542,20 +3576,7 @@
 #endif
 #endif
 
-	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
-		if (zio_buf_cache[i] != prev_cache) {
-			prev_cache = zio_buf_cache[i];
-			kmem_cache_reap_now(zio_buf_cache[i]);
-		}
-		if (zio_data_buf_cache[i] != prev_data_cache) {
-			prev_data_cache = zio_data_buf_cache[i];
-			kmem_cache_reap_now(zio_data_buf_cache[i]);
-		}
-	}
-	kmem_cache_reap_now(buf_cache);
-	kmem_cache_reap_now(hdr_full_cache);
-	kmem_cache_reap_now(hdr_l2only_cache);
-	kmem_cache_reap_now(range_seg_cache);
+	reap_arc_caches();
 
 #ifdef illumos
 	if (zio_arena != NULL) {
@@ -3590,11 +3611,28 @@
 {
 	clock_t			growtime = 0;
 	callb_cpr_t		cpr;
+	int			autoreap = 0;
 
 	CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG);
 
 	mutex_enter(&arc_reclaim_lock);
 	while (!arc_reclaim_thread_exit) {
+#ifdef _KERNEL
+/* KD 2015-02-10
+ * Protect against UMA free memory bloat.  We already do this on a low-memory
+ * basis in the allocator; it has to happen there rather than here due to
+ * response time considerations.  Make the call here once every 10 passes as
+ * well; this reclaims unused UMA buffers every 10 seconds on an idle system
+ * and more frequently if the reclaim thread gets woken up by low RAM
+ * conditions.
+ */
+		if ((zio_use_uma) && (autoreap++ == 10)) {
+			autoreap = 0;
+			DTRACE_PROBE(arc__reclaim_timed_reap);
+			reap_arc_caches();
+		}
+#endif /* _KERNEL */
+
 		int64_t free_memory = arc_available_memory();
 		uint64_t evicted = 0;
 
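
With the reclaim thread's normal one-second cadence, the counter fires the timed reap roughly every ten passes; strictly, `autoreap++ == 10` is true on the eleventh pass after each reset, so "every 10 seconds" in the comment is approximate. A quick standalone check of the counter logic:

#include <stdio.h>

int
main(void)
{
	int autoreap = 0;
	int pass;

	for (pass = 1; pass <= 33; pass++) {
		if (autoreap++ == 10) {		/* same test as the patch */
			autoreap = 0;
			printf("timed reap on pass %d\n", pass);
		}
	}
	/* Prints passes 11, 22 and 33: an 11-pass period, near the
	 * stated 10. */
	return (0);
}
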
@@ -3860,6 +3899,19 @@
 		arc_space_consume(size, ARC_SPACE_META);
 	} else {
 		ASSERT(type == ARC_BUFC_DATA);
+#ifdef _KERNEL
+/* KD 2015-02-10
+ * It would be nice if we could leave this to the arc_reclaim thread.
+ * Unfortunately we cannot; the test has to be done here as well, because
+ * under heavy I/O demand we can grab enough RAM fast enough to induce
+ * nasty oscillation problems.  Fortunately we only need to call this when
+ * the system is under reasonably-severe memory stress.
+ */
+		if (zio_use_uma && (ptob(cnt.v_free_count) + size < ptob(cnt.v_free_target))) {
+			DTRACE_PROBE3(arc__alloc_lowmem_reap, int, cnt.v_free_count, int, size, int, cnt.v_free_target);
+			reap_arc_caches();
+		}
+#endif /* _KERNEL */
 		buf->b_data = zio_data_buf_alloc(size);
 		arc_space_consume(size, ARC_SPACE_DATA);
 	}
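
The allocator-side test reaps only when, even counting the buffer about to be allocated as if it were free, the system would still sit below the pager's free-page target. A userland rendering of that condition, with ptob() and the vm counters mocked up using hypothetical values (not from the patch):

#include <stdio.h>

#define	PAGE_SIZE	4096UL
#define	ptob(pages)	((pages) * PAGE_SIZE)	/* pages-to-bytes */

int
main(void)
{
	unsigned long v_free_count = 30000;	/* hypothetical free pages */
	unsigned long v_free_target = 32000;	/* hypothetical pager target */
	unsigned long size = 131072;		/* pending ARC data buffer */

	if (ptob(v_free_count) + size < ptob(v_free_target))
		printf("severe memory stress: reap before allocating\n");
	else
		printf("enough headroom: skip the reap\n");
	return (0);
}
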
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c (-4 / +43 lines)
@@ -42,6 +42,8 @@
 typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
     uint64_t arg1, uint64_t arg2);
 
+extern	int	zio_use_uma;	/* Needs to be visible; DO NOT MODIFY! */
+int	zfs_dynamic_write_buffer = 1;	/* Dynamically tune writes */
 
 dmu_tx_t *
 dmu_tx_create_dd(dsl_dir_t *dd)
@@ -1060,7 +1062,7 @@
 {
 	dsl_pool_t *dp = tx->tx_pool;
 	uint64_t delay_min_bytes =
-	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
+	    zfs_dirty_data_max_internal * zfs_delay_min_dirty_percent / 100;
 	hrtime_t wakeup, min_tx_time, now;
 
 	if (dirty <= delay_min_bytes)
@@ -1072,11 +1074,11 @@
 	 * have to handle the case of it being >= the max, which could
 	 * cause a divide-by-zero if it's == the max.
 	 */
-	ASSERT3U(dirty, <, zfs_dirty_data_max);
+	ASSERT3U(dirty, <, zfs_dirty_data_max_internal);
 
 	now = gethrtime();
 	min_tx_time = zfs_delay_scale *
-	    (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
+	    (dirty - delay_min_bytes) / (zfs_dirty_data_max_internal - dirty);
 	if (now > tx->tx_start + min_tx_time)
 		return;
 
@@ -1281,6 +1283,7 @@
 dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
 {
 	int err;
+	static	uint64_t	last_max;
 
 	ASSERT(tx->tx_txg == 0);
 	ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT ||
@@ -1293,6 +1296,42 @@
 	if (txg_how == TXG_WAITED)
 		tx->tx_waited = B_TRUE;
 
+#ifdef _KERNEL
+	/*
+	 * KD 2014-09-22
+	 * If UMA is enabled it can only return a previously-used block
+	 * of identical size to what it had out before.  If it's not the
+	 * same size it will allocate a new one.  This is a problem because
+	 * dirty_data_max is the total dirty write data allowed out at any
+	 * given time, but with UMA on that can multiply by the number of
+	 * different block sizes (!!) requested in terms of free RAM that
+	 * is left allocated but unused.  For this reason never allow
+	 * dirty_data_max to exceed the difference between the paging
+	 * threshold and the current free memory, with a minimum of 256MB.
+	 * This throttles "burst" allocations and prevents the system from
+	 * choking during times of high write I/O demand.
+	 *
+	 * We allow this to be turned off if you want with
+	 * "vfs.zfs_dynamic_write_buffer=0", which can be done in real time.
+	 *
+	 * Note that we work on the zfs_dirty_data_max_internal variable,
+	 * because the user may set zfs_dirty_data_max himself and we
+	 * must honor that as a hard cap so it remains a usable tunable value.
+	 */
+	if (zio_use_uma & zfs_dynamic_write_buffer) {
+		zfs_dirty_data_max_internal = 1 << 28;
+		zfs_dirty_data_max_internal = MAX(zfs_dirty_data_max_internal, ptob(cnt.v_free_count - cnt.v_free_target));
+		zfs_dirty_data_max_internal = MIN(zfs_dirty_data_max_internal, zfs_dirty_data_max);
+		zfs_dirty_data_max_internal = MIN(zfs_dirty_data_max_internal, zfs_dirty_data_max_max);
+		if (last_max != (zfs_dirty_data_max_internal / (1024 * 1024))) {
+			last_max = zfs_dirty_data_max_internal / (1024 * 1024);
+			DTRACE_PROBE1(dmu__tx_dirty, uint64_t, last_max);
+		}
+	} else {
+		zfs_dirty_data_max_internal = zfs_dirty_data_max;
+	}
+#endif /* _KERNEL */
+
 	while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
 		dmu_tx_unassign(tx);
 
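
The clamp chain reads most naturally bottom-up: start from a 256 MB floor, let the buffer grow with free-memory headroom, then respect both the operator's cap and the absolute cap. Two reviewer observations: `zio_use_uma & zfs_dynamic_write_buffer` uses bitwise AND, which only works because both flags are strictly 0/1; and when v_free_count drops below v_free_target the unsigned subtraction wraps to a huge value, with the MIN() caps silently reining it in. A userland walk-through using invented counters (the kernel reads these from cnt):

#include <stdio.h>
#include <stdint.h>

#define	MAX(a, b)	((a) > (b) ? (a) : (b))
#define	MIN(a, b)	((a) < (b) ? (a) : (b))
#define	PAGE_SIZE	4096ULL
#define	ptob(pages)	((pages) * PAGE_SIZE)

int
main(void)
{
	uint64_t v_free_count = 200000;			/* free pages, invented */
	uint64_t v_free_target = 60000;			/* pager goal, invented */
	uint64_t zfs_dirty_data_max = 1ULL << 32;	/* operator cap */
	uint64_t zfs_dirty_data_max_max = 4ULL << 30;	/* absolute cap */

	uint64_t v = 1 << 28;				/* 256 MB floor */
	v = MAX(v, ptob(v_free_count - v_free_target));	/* free headroom */
	v = MIN(v, zfs_dirty_data_max);			/* honor the sysctl */
	v = MIN(v, zfs_dirty_data_max_max);		/* honor the hard cap */

	printf("zfs_dirty_data_max_internal = %llu MB\n",
	    (unsigned long long)(v / (1024 * 1024)));	/* 546 MB here */
	return (0);
}
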
@@ -1323,7 +1362,7 @@
 		 * space.
 		 */
 		mutex_enter(&dp->dp_lock);
-		while (dp->dp_dirty_total >= zfs_dirty_data_max)
+		while (dp->dp_dirty_total >= zfs_dirty_data_max_internal)
 			cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
 		uint64_t dirty = dp->dp_dirty_total;
 		mutex_exit(&dp->dp_lock);
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c (-2 / +5 lines)
@@ -98,8 +98,11 @@
 /*
  * zfs_dirty_data_max will be set to zfs_dirty_data_max_percent% of all memory,
  * capped at zfs_dirty_data_max_max.  It can also be overridden in /etc/system.
+ * We also dynamically tune during low memory, honoring the sysctl set, so
+ * internal comparisons are against zfs_dirty_data_max_internal.
  */
 uint64_t zfs_dirty_data_max;
+uint64_t zfs_dirty_data_max_internal;
 uint64_t zfs_dirty_data_max_max = 4ULL * 1024 * 1024 * 1024;
 int zfs_dirty_data_max_percent = 10;
 
@@ -553,7 +556,7 @@
 	 * Note: we signal even when increasing dp_dirty_total.
 	 * This ensures forward progress -- each thread wakes the next waiter.
 	 */
-	if (dp->dp_dirty_total <= zfs_dirty_data_max)
+	if (dp->dp_dirty_total <= zfs_dirty_data_max_internal)
 		cv_signal(&dp->dp_spaceavail_cv);
 }
 
@@ -732,7 +735,7 @@
 dsl_pool_need_dirty_delay(dsl_pool_t *dp)
 {
 	uint64_t delay_min_bytes =
-	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
+	    zfs_dirty_data_max_internal * zfs_delay_min_dirty_percent / 100;
 	boolean_t rv;
 
 	mutex_enter(&dp->dp_lock);
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h (+1 lines)
@@ -50,6 +50,7 @@
 struct dsl_scan;
 
 extern uint64_t zfs_dirty_data_max;
+extern uint64_t zfs_dirty_data_max_internal;
 extern uint64_t zfs_dirty_data_max_max;
 extern uint64_t zfs_dirty_data_sync;
 extern int zfs_dirty_data_max_percent;
sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c (-2 / +5 lines)
@@ -43,10 +43,13 @@
 
 SYSCTL_DECL(_vfs_zfs);
 SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
+/* KD 2015-07-15 Change class to "int" from "static int" as we reference
+ * this as an extern elsewhere
+ */
 #if defined(__amd64__)
-static int zio_use_uma = 1;
+int zio_use_uma = 1;
 #else
-static int zio_use_uma = 0;
+int zio_use_uma = 0;
 #endif
 SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0,
     "Use uma(9) for ZIO allocations");
