Patch for bug 187594

(-)sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c (-18 / +99 lines)
Lines 216-221
 extern boolean_t zfs_prefetch_disable;
 
 /*
+ * KD 2015-02-10
+ * We have to be able to test for UIO use inside the arc allocator.
+ * NOTE: DO NOT MODIFY HERE!
+ */
+extern int zio_use_uma;
+extern int zfs_dynamic_write_buffer;
+
+
+/*
  * The arc has filled available memory and has now warmed up.
  */
 static boolean_t arc_warm;
Lines 242-248
 arc_free_target_init(void *unused __unused)
 {
 
-	zfs_arc_free_target = vm_pageout_wakeup_thresh;
+	zfs_arc_free_target = vm_pageout_wakeup_thresh + ((cnt.v_free_target - vm_pageout_wakeup_thresh) / 2);
 }
 SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
     arc_free_target_init, NULL);
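
The hunk above raises the ARC's free-memory target from the pageout wakeup threshold to the midpoint between that threshold and the VM free target, so the ARC starts giving memory back before the pagedaemon has to wake up. A minimal sketch of the arithmetic, with made-up page counts (illustration only; the real inputs are vm_pageout_wakeup_thresh and cnt.v_free_target at SYSINIT time):

/* Illustration only -- not part of the patch; all values are assumed. */
#include <stdio.h>

int
main(void)
{
	unsigned wakeup_thresh = 12000;	/* assumed pageout wakeup threshold, in pages */
	unsigned free_target = 36000;	/* assumed VM free target, in pages */

	/* Old setting: start reclaiming only at the pagedaemon wakeup point. */
	unsigned old_arc_target = wakeup_thresh;			/* 12000 */

	/* New setting: start reclaiming halfway to the VM free target. */
	unsigned new_arc_target =
	    wakeup_thresh + (free_target - wakeup_thresh) / 2;		/* 24000 */

	printf("old %u, new %u pages\n", old_arc_target, new_arc_target);
	return (0);
}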
Lines 264-270
 SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW,
     &arc_shrink_shift, 0,
     "log2(fraction of arc to reclaim)");
 
+SYSCTL_INT(_vfs_zfs, OID_AUTO, dynamic_write_buffer, CTLFLAG_RWTUN,
+    &zfs_dynamic_write_buffer, 0,
+    "Dynamically restrict dirty data when memory is low");
 /*
  * We don't have a tunable for arc_free_target due to the dependency on
  * pagedaemon initialisation.
Lines 3524-3529
 extern kmem_cache_t	*zio_data_buf_cache[];
 extern kmem_cache_t	*range_seg_cache;
 
+/*
+ * Used by arc_kmem_reap_now() and consider_reaping_arc_caches()
+ * to limit the time spent reaping.
+ *
+ * arc_reaping_in_progress is a (somewhat racy) leftover from a
+ * previous version of this code which could trigger multiple ARC cache
+ * reapings in parallel, which should be avoided to reduce lock
+ * contention.  It hasn't been removed yet, to encourage further
+ * experimenting.
+ */
+static unsigned int arc_reaping_in_progress = 0;
+static sbintime_t last_reaping = 0;
+
+static void __noinline
+reap_arc_caches(void)
+{
+	size_t		i;
+	kmem_cache_t	*prev_cache = NULL;
+	kmem_cache_t	*prev_data_cache = NULL;
+
+	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
+		if (zio_buf_cache[i] != prev_cache) {
+			prev_cache = zio_buf_cache[i];
+			kmem_cache_reap_now(zio_buf_cache[i]);
+		}
+		if (zio_data_buf_cache[i] != prev_data_cache) {
+			prev_data_cache = zio_data_buf_cache[i];
+			kmem_cache_reap_now(zio_data_buf_cache[i]);
+		}
+	}
+	kmem_cache_reap_now(buf_cache);
+	kmem_cache_reap_now(hdr_full_cache);
+	kmem_cache_reap_now(hdr_l2only_cache);
+	kmem_cache_reap_now(range_seg_cache);
+}
+
 static __noinline void
 arc_kmem_reap_now(void)
 {
Lines 3532-3537
 	kmem_cache_t		*prev_data_cache = NULL;
 
 	DTRACE_PROBE(arc__kmem_reap_start);
+	arc_reaping_in_progress++;
+
 #ifdef _KERNEL
 	if (arc_meta_used >= arc_meta_limit) {
 		/*
Lines 3548-3567
 #endif
 #endif
 
-	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
-		if (zio_buf_cache[i] != prev_cache) {
-			prev_cache = zio_buf_cache[i];
-			kmem_cache_reap_now(zio_buf_cache[i]);
-		}
-		if (zio_data_buf_cache[i] != prev_data_cache) {
-			prev_data_cache = zio_data_buf_cache[i];
-			kmem_cache_reap_now(zio_data_buf_cache[i]);
-		}
-	}
-	kmem_cache_reap_now(buf_cache);
-	kmem_cache_reap_now(hdr_full_cache);
-	kmem_cache_reap_now(hdr_l2only_cache);
-	kmem_cache_reap_now(range_seg_cache);
+	reap_arc_caches();
 
 #ifdef sun
 	if (zio_arena != NULL) {
Lines 3572-3581
 		vmem_qcache_reap(zio_arena);
 	}
 #endif
+#ifdef _KERNEL
+	last_reaping = getsbinuptime();
+#endif
+	arc_reaping_in_progress = 0;
 	DTRACE_PROBE(arc__kmem_reap_end);
 }
 
+
 /*
+ * Declared writable to allow resetting it.
+ * XXX: Should probably be a uint64 and integrated with kstat.
+ */
+static unsigned int arc_cache_reapings_skipped = 0;
+SYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_cache_reapings_skipped, CTLFLAG_RW,
+    &arc_cache_reapings_skipped, 0, "Number of times the ARC caches have not been reaped due to the reap delay");
+
+static unsigned int min_arc_reap_delay = 200;
+SYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_reap_delay_min, CTLFLAG_RW,
+    &min_arc_reap_delay, 200, "Minimum delay between ARC cache reapings (milliseconds)");
+
+static void __noinline
+consider_reaping_arc_caches(void)
+{
+#ifdef _KERNEL
+	sbintime_t now;
+
+	if (arc_reaping_in_progress)
+	{
+		/* Already reaping in another thread. */
+		arc_cache_reapings_skipped++;
+		return;
+	}
+
+	now = getsbinuptime();
+	if ((now - last_reaping) / SBT_1MS < min_arc_reap_delay)
+	{
+		/* Too soon to reap again. */
+		arc_cache_reapings_skipped++;
+		return;
+	}
+#endif
+	arc_kmem_reap_now();
+}
+
+/*
  * Threads can block in arc_get_data_buf() waiting for this thread to evict
  * enough data and signal them to proceed. When this happens, the threads in
  * arc_get_data_buf() are sleeping while holding the hash lock for their
Lines 3617-3624
 			 */
 			growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
 
-			arc_kmem_reap_now();
-
 			/*
 			 * If we are still low on memory, shrink the ARC
 			 * so that we have arc_shrink_min free space.
Lines 3692-3697
 	while (!arc_user_evicts_thread_exit) {
 		mutex_exit(&arc_user_evicts_lock);
 
+		/*
+		 * Consider reaping the ARC caches at least once per
+		 * second, but more often when signalled under pressure.
+		 */
+		consider_reaping_arc_caches();
+
 		arc_do_user_evicts();
 
 		/*
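
Taken together, the arc.c hunks split the cache reap out into reap_arc_caches(), call it from the user-evicts thread via consider_reaping_arc_caches(), and rate-limit it: a reap is skipped (and counted in vfs.zfs.arc_cache_reapings_skipped) when another reap appears to be in progress or when fewer than vfs.zfs.arc_reap_delay_min milliseconds (default 200) have elapsed since the last one. A small userland sketch of that timing check, with assumed timestamps already converted to milliseconds (in the kernel the values are sbintime_t and the division by SBT_1MS performs the conversion):

/* Illustration only -- not part of the patch; mirrors the delay check above. */
#include <stdio.h>
#include <stdint.h>

static uint64_t last_reaping_ms;		/* when the last reap ran */
static unsigned reapings_skipped;		/* counts skipped attempts */
static const unsigned min_reap_delay_ms = 200;	/* default vfs.zfs.arc_reap_delay_min */

static int
should_reap(uint64_t now_ms)
{
	if (now_ms - last_reaping_ms < min_reap_delay_ms) {
		reapings_skipped++;		/* too soon since the last reap */
		return (0);
	}
	last_reaping_ms = now_ms;
	return (1);
}

int
main(void)
{
	int a, b, c;

	/* Attempts at 1000, 1120 and 1250 ms: only the middle one is skipped. */
	a = should_reap(1000);
	b = should_reap(1120);
	c = should_reap(1250);
	printf("%d %d %d, skipped=%u\n", a, b, c, reapings_skipped);
	return (0);
}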
(-)sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c (-4 / +43 lines)
Lines 42-47
 typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
     uint64_t arg1, uint64_t arg2);
 
+extern	int	zio_use_uma;	/* Needs to be visible; DO NOT MODIFY! */
+int	zfs_dynamic_write_buffer = 1;	/* Dynamically tune writes */
 
 dmu_tx_t *
 dmu_tx_create_dd(dsl_dir_t *dd)
Lines 1060-1066
 {
 	dsl_pool_t *dp = tx->tx_pool;
 	uint64_t delay_min_bytes =
-	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
+	    zfs_dirty_data_max_internal * zfs_delay_min_dirty_percent / 100;
 	hrtime_t wakeup, min_tx_time, now;
 
 	if (dirty <= delay_min_bytes)
Lines 1072-1082
 	 * have to handle the case of it being >= the max, which could
 	 * cause a divide-by-zero if it's == the max.
 	 */
-	ASSERT3U(dirty, <, zfs_dirty_data_max);
+	ASSERT3U(dirty, <, zfs_dirty_data_max_internal);
 
 	now = gethrtime();
 	min_tx_time = zfs_delay_scale *
-	    (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
+	    (dirty - delay_min_bytes) / (zfs_dirty_data_max_internal - dirty);
 	if (now > tx->tx_start + min_tx_time)
 		return;
 
Lines 1281-1286
 dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
 {
 	int err;
+	static	uint64_t	last_max;
 
 	ASSERT(tx->tx_txg == 0);
 	ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT ||
Lines 1293-1298
 	if (txg_how == TXG_WAITED)
 		tx->tx_waited = B_TRUE;
 
+#ifdef _KERNEL
+	/*
+	 * KD 2014-09-22
+	 * If UMA is enabled it can only return a previously-used block
+	 * of identical size to what it had out before.  If it's not the
+	 * same size it will allocate a new one.  This is a problem because
+	 * dirty_data_max is the total dirty write data allowed out at any
+	 * given time, but with UMA on that can multiply by the number of
+	 * different block sizes (!!) requested in terms of free RAM that
+	 * is left allocated but unused.  For this reason never allow
+	 * dirty_data_max to exceed the difference between the paging
+	 * threshold and the current free memory, with a minimum of 256MB.
+	 * This throttles "burst" allocations and prevents the system from
+	 * choking during times of high write I/O demand.
+	 *
+	 * We allow this to be turned off if you want with
+	 * "vfs.zfs.dynamic_write_buffer=0", which can be done in real time.
+	 *
+	 * Note that we work on the zfs_dirty_data_max_internal variable,
+	 * because the user may set zfs_dirty_data_max himself and we must
+	 * honor that as a hard cap so it remains a usable tunable value.
+	 */
+	if (zio_use_uma & zfs_dynamic_write_buffer) {
+		zfs_dirty_data_max_internal = 1 << 28;
+		zfs_dirty_data_max_internal = MAX(zfs_dirty_data_max_internal, ptob(cnt.v_free_count - cnt.v_free_target));
+		zfs_dirty_data_max_internal = MIN(zfs_dirty_data_max_internal, zfs_dirty_data_max);
+		zfs_dirty_data_max_internal = MIN(zfs_dirty_data_max_internal, zfs_dirty_data_max_max);
+		if (last_max != (zfs_dirty_data_max_internal / (1024 * 1024))) {
+			last_max = zfs_dirty_data_max_internal / (1024 * 1024);
+			DTRACE_PROBE1(dmu__tx_dirty, uint64_t, last_max);
+		}
+	} else {
+		zfs_dirty_data_max_internal = zfs_dirty_data_max;
+	}
+#endif /* _KERNEL */
+
 	while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
 		dmu_tx_unassign(tx);
 
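
The clamp above starts from a 256MB floor, raises it to however much free memory currently sits above the VM free target, and then caps the result at both the user-visible zfs_dirty_data_max and the absolute zfs_dirty_data_max_max. A worked sketch with assumed numbers (illustration only; ptob() is modelled here as a 4KB page multiply, and all inputs are stand-ins):

/* Illustration only -- not part of the patch; all inputs are assumed values. */
#include <stdio.h>
#include <stdint.h>

#define PAGE_BYTES	4096ULL			/* stand-in for ptob(1) */
#define MAX(a, b)	((a) > (b) ? (a) : (b))
#define MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	uint64_t v_free_count = 300000;		/* assumed free pages */
	uint64_t v_free_target = 100000;	/* assumed VM free target, pages */
	uint64_t dirty_data_max = 4ULL << 30;	/* assumed user cap: 4GB */
	uint64_t dirty_data_max_max = 4ULL << 30;
	uint64_t internal;

	internal = 1 << 28;			/* 256MB floor */
	internal = MAX(internal, (v_free_count - v_free_target) * PAGE_BYTES);
	internal = MIN(internal, dirty_data_max);	/* never exceed the user's setting */
	internal = MIN(internal, dirty_data_max_max);

	/* 200000 pages of headroom * 4KB = 819200000 bytes, i.e. about 781 MB. */
	printf("zfs_dirty_data_max_internal = %ju MB\n",
	    (uintmax_t)(internal >> 20));
	return (0);
}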
Lines 1323-1329
 		 * space.
 		 */
 		mutex_enter(&dp->dp_lock);
-		while (dp->dp_dirty_total >= zfs_dirty_data_max)
+		while (dp->dp_dirty_total >= zfs_dirty_data_max_internal)
 			cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
 		uint64_t dirty = dp->dp_dirty_total;
 		mutex_exit(&dp->dp_lock);
(-)sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c (-2 / +5 lines)
Lines 98-105
 /*
  * zfs_dirty_data_max will be set to zfs_dirty_data_max_percent% of all memory,
  * capped at zfs_dirty_data_max_max.  It can also be overridden in /etc/system.
+ * We also dynamically tune during low memory, honoring the sysctl set, so
+ * internal comparisons are against zfs_dirty_data_max_internal.
  */
 uint64_t zfs_dirty_data_max;
+uint64_t zfs_dirty_data_max_internal;
 uint64_t zfs_dirty_data_max_max = 4ULL * 1024 * 1024 * 1024;
 int zfs_dirty_data_max_percent = 10;
 
Lines 557-563
 	 * Note: we signal even when increasing dp_dirty_total.
 	 * This ensures forward progress -- each thread wakes the next waiter.
 	 */
-	if (dp->dp_dirty_total <= zfs_dirty_data_max)
+	if (dp->dp_dirty_total <= zfs_dirty_data_max_internal)
 		cv_signal(&dp->dp_spaceavail_cv);
 }
 
Lines 736-742
 dsl_pool_need_dirty_delay(dsl_pool_t *dp)
 {
 	uint64_t delay_min_bytes =
-	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
+	    zfs_dirty_data_max_internal * zfs_delay_min_dirty_percent / 100;
 	boolean_t rv;
 
 	mutex_enter(&dp->dp_lock);
(-)sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h (+1 lines)
Lines 50-55
 struct dsl_scan;
 
 extern uint64_t zfs_dirty_data_max;
+extern uint64_t zfs_dirty_data_max_internal;
 extern uint64_t zfs_dirty_data_max_max;
 extern uint64_t zfs_dirty_data_sync;
 extern int zfs_dirty_data_max_percent;
(-)sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c (-2 / +5 lines)
Lines 43-52
 
 SYSCTL_DECL(_vfs_zfs);
 SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
+/*
+ * KD 2015-07-15
+ * Change the storage class from "static int" to "int" as we reference
+ * this as an extern elsewhere.
+ */
 #if defined(__amd64__)
-static int zio_use_uma = 1;
+int zio_use_uma = 1;
 #else
-static int zio_use_uma = 0;
+int zio_use_uma = 0;
 #endif
 TUNABLE_INT("vfs.zfs.zio.use_uma", &zio_use_uma);
 SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0,
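
A note on the flags above: vfs.zfs.zio.use_uma is CTLFLAG_RDTUN, so it can only be set as a boot-time loader tunable and is read-only afterwards, while vfs.zfs.dynamic_write_buffer (added in arc.c with CTLFLAG_RWTUN) can be set at boot or changed at runtime; setting it to 0 falls back to the fixed zfs_dirty_data_max behaviour, per the else branch in dmu_tx_assign() above.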
