Patch diff for bug 187594
cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c (-15 / +67 lines)
@@ -190,6 +190,15 @@ static int arc_dead;
 extern int zfs_prefetch_disable;
 
 /*
+ * KD 2015-02-10
+ * We have to be able to test for UIO use inside the arc allocator.
+ * NOTE: DO NOT MODIFY HERE!
+ */
+extern int zio_use_uma;
+extern int zfs_dynamic_write_buffer;
+
+
+/*
  * The arc has filled available memory and has now warmed up.
  */
 static boolean_t arc_warm;
@@ -212,7 +221,7 @@ static void
 arc_free_target_init(void *unused __unused)
 {
 
-	zfs_arc_free_target = vm_pageout_wakeup_thresh;
+	zfs_arc_free_target = vm_pageout_wakeup_thresh + ((cnt.v_free_target - vm_pageout_wakeup_thresh) / 2);
 }
 SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
     arc_free_target_init, NULL);
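
A worked example of the changed initializer (the page counts are invented for illustration, not taken from the patch): with vm_pageout_wakeup_thresh = 10000 pages and cnt.v_free_target = 30000 pages, the old code left zfs_arc_free_target at 10000 pages, while the new code sets it to 10000 + (30000 - 10000) / 2 = 20000 pages, halfway between the pageout wakeup threshold and the free-page target, so ARC reclaim is triggered before free memory falls all the way to the pageout wakeup threshold.
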
@@ -233,7 +242,9 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_block
 SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW,
     &arc_shrink_shift, 0,
     "log2(fraction of arc to reclaim)");
-
+SYSCTL_INT(_vfs_zfs, OID_AUTO, dynamic_write_buffer, CTLFLAG_RWTUN,
+    &zfs_dynamic_write_buffer, 0,
+    "Dynamically restrict dirty data when memory is low");
 /*
  * We don't have a tunable for arc_free_target due to the dependency on
  * pagedaemon initialisation.
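
The new knob is declared with CTLFLAG_RWTUN, so it can be set as a boot-time tunable or changed at runtime; under the usual mapping of the _vfs_zfs tree that should be vfs.zfs.dynamic_write_buffer (the comment in dmu_tx.c below refers to it as "vfs.zfs_dynamic_write_buffer"). A minimal userland check of the value, assuming that OID name; this program is illustrative only and not part of the patch:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int val;
	size_t len = sizeof(val);

	if (sysctlbyname("vfs.zfs.dynamic_write_buffer", &val, &len, NULL, 0) == -1) {
		perror("sysctlbyname");
		return (1);
	}
	printf("vfs.zfs.dynamic_write_buffer = %d\n", val);
	return (0);
}
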
@@ -2635,6 +2646,28 @@ extern kmem_cache_t *zio_buf_cache[];
 extern kmem_cache_t	*zio_data_buf_cache[];
 extern kmem_cache_t	*range_seg_cache;
 
+static void __used
+reap_arc_caches()
+{
+	size_t          i;
+	kmem_cache_t            *prev_cache = NULL;
+	kmem_cache_t            *prev_data_cache = NULL;
+
+	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
+		if (zio_buf_cache[i] != prev_cache) {
+			prev_cache = zio_buf_cache[i];
+			kmem_cache_reap_now(zio_buf_cache[i]);
+		}
+		if (zio_data_buf_cache[i] != prev_data_cache) {
+			prev_data_cache = zio_data_buf_cache[i];
+			kmem_cache_reap_now(zio_data_buf_cache[i]);
+		}
+	}
+	kmem_cache_reap_now(buf_cache);
+	kmem_cache_reap_now(hdr_cache);
+	kmem_cache_reap_now(range_seg_cache);
+}
+
 static void __noinline
 arc_kmem_reap_now(arc_reclaim_strategy_t strat)
 {
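
The prev_cache / prev_data_cache checks matter because several block sizes can share one backing kmem cache, so adjacent entries of zio_buf_cache[] and zio_data_buf_cache[] often hold the same pointer. A minimal userland sketch of that duplicate-suppression pattern (the names and the reap() stub are illustrative, not part of the patch):

#include <stdio.h>

struct cache { const char *name; };

static void
reap(struct cache *c)
{
	printf("reaping %s\n", c->name);
}

int
main(void)
{
	struct cache small = { "small-block sizes" };
	struct cache large = { "large-block sizes" };
	/* consecutive size buckets frequently map to the same cache */
	struct cache *caches[] = { &small, &small, &small, &large, &large };
	struct cache *prev = NULL;
	size_t i;

	for (i = 0; i < sizeof(caches) / sizeof(caches[0]); i++) {
		if (caches[i] != prev) {	/* skip repeats of the same cache */
			prev = caches[i];
			reap(caches[i]);
		}
	}
	return (0);
}
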
@@ -2666,19 +2699,7 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat)
 	if (strat == ARC_RECLAIM_AGGR)
 		arc_shrink();
 
-	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
-		if (zio_buf_cache[i] != prev_cache) {
-			prev_cache = zio_buf_cache[i];
-			kmem_cache_reap_now(zio_buf_cache[i]);
-		}
-		if (zio_data_buf_cache[i] != prev_data_cache) {
-			prev_data_cache = zio_data_buf_cache[i];
-			kmem_cache_reap_now(zio_data_buf_cache[i]);
-		}
-	}
-	kmem_cache_reap_now(buf_cache);
-	kmem_cache_reap_now(hdr_cache);
-	kmem_cache_reap_now(range_seg_cache);
+	reap_arc_caches();
 
 #ifdef sun
 	/*
@@ -2697,11 +2718,29 @@ arc_reclaim_thread(void *dummy __unused)
 	clock_t			growtime = 0;
 	arc_reclaim_strategy_t	last_reclaim = ARC_RECLAIM_CONS;
 	callb_cpr_t		cpr;
+	int			autoreap = 0;
 
 	CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
 
 	mutex_enter(&arc_reclaim_thr_lock);
 	while (arc_thread_exit == 0) {
+
+#ifdef _KERNEL
+/* KD 2015-02-10
+ * Protect against UMA free memory bloat.  We already do this on a low-memory
+ * basis in the allocator; it has to happen there rather than here due to
+ * response time considerations.  Make the call here once every 10 passes as
+ * well; this reclaims unused UMA buffers every 10 seconds on an idle system
+ * and more frequently if the reclaim thread gets woken up by low RAM
+ * conditions.
+ */
+		if ((zio_use_uma) && (autoreap++ == 10)) {
+			autoreap = 0;
+			DTRACE_PROBE(arc__reclaim_timed_reap);
+			reap_arc_caches();
+		}
+#endif /* _KERNEL */
+
 		if (arc_reclaim_needed()) {
 
 			if (arc_no_grow) {
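
The comment above relies on the reclaim loop waking roughly once per second, so the counter-based reap fires about every ten seconds on an idle system and sooner when low-memory wakeups occur. A standalone sketch of that cadence (illustrative only, not part of the patch):

#include <stdio.h>

int
main(void)
{
	int autoreap = 0, pass;

	for (pass = 1; pass <= 25; pass++) {	/* each pass is roughly one second of idle time */
		if (autoreap++ == 10) {		/* post-increment: fires once the counter has reached 10 */
			autoreap = 0;
			printf("pass %d: timed reap\n", pass);
		}
	}
	return (0);
}
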
@@ -2889,6 +2928,19 @@ arc_get_data_buf(arc_buf_t *buf)
 			arc_space_consume(size, ARC_SPACE_DATA);
 		} else {
 			ASSERT(type == ARC_BUFC_DATA);
+#ifdef _KERNEL
+/* KD 2015-02-10
+ * It would be nice if we could leave this to the arc_reclaim thread.
+ * Unfortunately we cannot; the test has to be done here as well, because
+ * under heavy I/O demand we can grab enough RAM fast enough to induce
+ * nasty oscillation problems.  Fortunately we only need to call this when
+ * the system is under reasonably-severe memory stress.
+ */
+			if (zio_use_uma && (ptob(cnt.v_free_count) + size < ptob(cnt.v_free_target))) {
+				DTRACE_PROBE3(arc__alloc_lowmem_reap, int, cnt.v_free_count, int, size, int, cnt.v_free_target);
+				reap_arc_caches();
+			}
+#endif /* _KERNEL */
 			buf->b_data = zio_data_buf_alloc(size);
 			ARCSTAT_INCR(arcstat_data_size, size);
 			atomic_add_64(&arc_size, size);
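
The low-memory test added to arc_get_data_buf() reduces to a single predicate: reap the caches when handing out the requested buffer, on top of the pages currently free, would still leave the system short of its free-page target. A standalone restatement under assumed userland types (in the kernel, ptob() and the counters come from the VM layer; this is not part of the patch):

#include <stdbool.h>
#include <stddef.h>

#define EXAMPLE_PAGE_SIZE	4096UL		/* illustrative; the kernel uses the real page size */
#define ptob(pages)		((pages) * EXAMPLE_PAGE_SIZE)

/*
 * Mirror of the test above: true when "size" more bytes of buffer, even
 * counted as if they were still free, leave free memory below the target.
 */
bool
alloc_should_reap(unsigned long v_free_count, unsigned long v_free_target,
    size_t size)
{
	return (ptob(v_free_count) + size < ptob(v_free_target));
}
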
cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c (-4 / +43 lines)
@@ -42,6 +42,8 @@
 typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
     uint64_t arg1, uint64_t arg2);
 
+extern	int	zio_use_uma;	/* Needs to be visible; DO NOT MODIFY! */
+int	zfs_dynamic_write_buffer = 1;	/* Dynamically tune writes */
 
 dmu_tx_t *
 dmu_tx_create_dd(dsl_dir_t *dd)
@@ -1058,7 +1060,7 @@ dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
 {
 	dsl_pool_t *dp = tx->tx_pool;
 	uint64_t delay_min_bytes =
-	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
+	    zfs_dirty_data_max_internal * zfs_delay_min_dirty_percent / 100;
 	hrtime_t wakeup, min_tx_time, now;
 
 	if (dirty <= delay_min_bytes)
@@ -1070,11 +1072,11 @@ dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
 	 * have to handle the case of it being >= the max, which could
 	 * cause a divide-by-zero if it's == the max.
 	 */
-	ASSERT3U(dirty, <, zfs_dirty_data_max);
+	ASSERT3U(dirty, <, zfs_dirty_data_max_internal);
 
 	now = gethrtime();
 	min_tx_time = zfs_delay_scale *
-	    (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
+	    (dirty - delay_min_bytes) / (zfs_dirty_data_max_internal - dirty);
 	if (now > tx->tx_start + min_tx_time)
 		return;
 
@@ -1279,6 +1281,7 @@ int
 dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
 {
 	int err;
+	static	uint64_t	last_max;
 
 	ASSERT(tx->tx_txg == 0);
 	ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT ||
@@ -1291,6 +1294,42 @@ dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
 	if (txg_how == TXG_WAITED)
 		tx->tx_waited = B_TRUE;
 
+#ifdef _KERNEL
+	/*
+	 * KD 2014-09-22
+	 * If UMA is enabled it can only return a previously-used block
+	 * of identical size to what it had out before.  If it's not the
+	 * same size it will allocate a new one.  This is a problem because
+	 * dirty_data_max is the total dirty write data allowed out at any
+	 * given time, but with UMA on that can multiply by the number of
+	 * different block sizes (!!) requested in terms of free RAM that
+	 * is left allocated but unused.  For this reason never allow
+	 * dirty_data_max to exceed the difference between the paging
+	 * threshold and the current free memory, with a minimum of 256MB.
+	 * This throttles "burst" allocations and prevents the system from
+	 * choking during times of high write I/O demand.
+	 *
+	 * We allow this to be turned off if you want with
+	 * "vfs.zfs_dynamic_write_buffer=0", which can be done in real time.
+	 *
+	 * Note that we work on the zfs_dirty_data_max_internal variable,
+	 * because the user may set zfs_dirty_data_max himself and we must
+	 * must honor that as a hard cap so it remains a usable tunable value.
+	 */
+	if (zio_use_uma & zfs_dynamic_write_buffer) {
+		zfs_dirty_data_max_internal = 1 << 28;
+		zfs_dirty_data_max_internal = MAX(zfs_dirty_data_max_internal, ptob(cnt.v_free_count - cnt.v_free_target));
+		zfs_dirty_data_max_internal = MIN(zfs_dirty_data_max_internal, zfs_dirty_data_max);
+		zfs_dirty_data_max_internal = MIN(zfs_dirty_data_max_internal, zfs_dirty_data_max_max);
+		if (last_max != (zfs_dirty_data_max_internal / (1024 * 1024))) {
+			last_max = zfs_dirty_data_max_internal / (1024 * 1024);
+			DTRACE_PROBE1(dmu__tx_dirty, uint64_t, last_max);
+		}
+	} else {
+		zfs_dirty_data_max_internal = zfs_dirty_data_max;
+	}
+#endif /* _KERNEL */
+
 	while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
 		dmu_tx_unassign(tx);
 
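
The clamping above works out to: start from a 256MB floor, raise it to the free-memory headroom above the paging target when that is larger, and never let it exceed either the user-set zfs_dirty_data_max or the absolute zfs_dirty_data_max_max. A userland sketch of that computation (illustrative only; MAX/MIN and the byte quantities stand in for the kernel versions, and the explicit headroom guard is an addition for this sketch, the patch subtracts the raw page counts directly):

#include <stdint.h>

#define MAX(a, b)	((a) > (b) ? (a) : (b))
#define MIN(a, b)	((a) < (b) ? (a) : (b))

uint64_t
dynamic_dirty_data_max(uint64_t free_bytes, uint64_t target_bytes,
    uint64_t dirty_data_max, uint64_t dirty_data_max_max)
{
	uint64_t v = UINT64_C(1) << 28;		/* 256MB floor */

	if (free_bytes > target_bytes)		/* headroom above the paging target */
		v = MAX(v, free_bytes - target_bytes);
	v = MIN(v, dirty_data_max);		/* honor the user-visible hard cap */
	v = MIN(v, dirty_data_max_max);		/* and the absolute ceiling */
	return (v);
}
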
@@ -1321,7 +1360,7 @@ dmu_tx_wait(dmu_tx_t *tx)
 		 * space.
 		 */
 		mutex_enter(&dp->dp_lock);
-		while (dp->dp_dirty_total >= zfs_dirty_data_max)
+		while (dp->dp_dirty_total >= zfs_dirty_data_max_internal)
 			cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
 		uint64_t dirty = dp->dp_dirty_total;
 		mutex_exit(&dp->dp_lock);
cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c (-2 / +5 lines)
@@ -97,8 +97,11 @@
 /*
  * zfs_dirty_data_max will be set to zfs_dirty_data_max_percent% of all memory,
  * capped at zfs_dirty_data_max_max.  It can also be overridden in /etc/system.
+ * We also dynamically tune during low memory, honoring the sysctl set, so
+ * internal comparisons are against zfs_dirty_data_max_internal.
  */
 uint64_t zfs_dirty_data_max;
+uint64_t zfs_dirty_data_max_internal;
 uint64_t zfs_dirty_data_max_max = 4ULL * 1024 * 1024 * 1024;
 int zfs_dirty_data_max_percent = 10;
 
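
Taken together with the dmu_tx.c change, the user-visible zfs_dirty_data_max stays a hard cap that the administrator can still tune, while dmu_tx_delay(), dmu_tx_wait(), dsl_pool_dirty_delta() and dsl_pool_need_dirty_delay() all compare against the possibly smaller zfs_dirty_data_max_internal that dmu_tx_assign() recomputes when memory is tight.
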
@@ -547,7 +550,7 @@ dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta
 	 * Note: we signal even when increasing dp_dirty_total.
 	 * This ensures forward progress -- each thread wakes the next waiter.
 	 */
-	if (dp->dp_dirty_total <= zfs_dirty_data_max)
+	if (dp->dp_dirty_total <= zfs_dirty_data_max_internal)
 		cv_signal(&dp->dp_spaceavail_cv);
 }
 
@@ -726,7 +729,7 @@ boolean_t
 dsl_pool_need_dirty_delay(dsl_pool_t *dp)
 {
 	uint64_t delay_min_bytes =
-	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
+	    zfs_dirty_data_max_internal * zfs_delay_min_dirty_percent / 100;
 	boolean_t rv;
 
 	mutex_enter(&dp->dp_lock);
cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h (+1 lines)
@@ -50,6 +50,7 @@ struct dmu_tx;
 struct dsl_scan;
 
 extern uint64_t zfs_dirty_data_max;
+extern uint64_t zfs_dirty_data_max_internal;
 extern uint64_t zfs_dirty_data_max_max;
 extern uint64_t zfs_dirty_data_sync;
 extern int zfs_dirty_data_max_percent;
