Patch for bug 187594

arc.c (-39 / +156 lines)

Lines 138-143
 #include <sys/sdt.h>
 
 #include <vm/vm_pageout.h>
+#include <machine/vmparam.h>
 
 #ifdef illumos
 #ifndef _KERNEL

Lines 193-201
  */
 static boolean_t arc_warm;
 
-/*
- * These tunables are for performance analysis.
- */
 uint64_t zfs_arc_max;
 uint64_t zfs_arc_min;
 uint64_t zfs_arc_meta_limit = 0;

Lines 204-210
 int zfs_arc_p_min_shift = 0;
 int zfs_disable_dup_eviction = 0;
 uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
+u_int zfs_arc_free_target = (1 << 16); /* default before pagedaemon init only */
+int zfs_arc_reclaim_cache_free = 1;
 
+static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS);
+
+#ifdef _KERNEL
+static void
+arc_free_target_init(void *unused __unused)
+{
+
+	zfs_arc_free_target = cnt.v_free_target;
+}
+SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
+    arc_free_target_init, NULL);
+
 TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max);
 TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min);
 TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);

Lines 217-223
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN,
     &zfs_arc_average_blocksize, 0,
     "ARC average blocksize");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_reclaim_cache_free, CTLFLAG_RWTUN,
+    &zfs_arc_reclaim_cache_free, 0,
+    "ARC treats cached pages as free");
+/*
+ * We don't have a tunable for arc_free_target due to the dependency on
+ * pagedaemon initialisation.
+ */
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
+    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int),
+    sysctl_vfs_zfs_arc_free_target, "IU",
+    "Desired number of free pages below which ARC triggers reclaim");
 
+static int
+sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS)
+{
+	u_int val;
+	int err;
+
+	val = zfs_arc_free_target;
+	err = sysctl_handle_int(oidp, &val, 0, req);
+	if (err != 0 || req->newptr == NULL)
+		return (err);
+
+	if (val < minfree)
+		return (EINVAL);
+	if (val > cnt.v_page_count)
+		return (EINVAL);
+
+	zfs_arc_free_target = val;
+
+	return (0);
+}
+#endif
+
 /*
  * Note that buffers can be in one of 6 states:
  *	ARC_anon	- anonymous (discussed below)
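
[Note, not part of the patch] Once the handler above is in place, the new
vfs.zfs.arc_free_target OID can be exercised from user space; a minimal C
sketch using sysctlbyname(3) is below.  The 10% bump is an arbitrary example
value, and out-of-range writes (below minfree or above v_page_count) are
rejected with EINVAL by the handler.

/*
 * Illustrative only -- not part of the patch.  Reads and optionally
 * raises vfs.zfs.arc_free_target.
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	u_int target;
	size_t len = sizeof(target);

	if (sysctlbyname("vfs.zfs.arc_free_target", &target, &len,
	    NULL, 0) != 0) {
		perror("sysctlbyname(get)");
		return (1);
	}
	printf("vfs.zfs.arc_free_target = %u pages\n", target);

	target += target / 10;		/* arbitrary example: raise by 10% */
	if (sysctlbyname("vfs.zfs.arc_free_target", NULL, NULL,
	    &target, sizeof(target)) != 0)
		perror("sysctlbyname(set)");	/* needs root */
	return (0);
}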

Lines 2421-2426
 void
 arc_shrink(void)
 {
+
 	if (arc_c > arc_c_min) {
 		uint64_t to_free;
 

Lines 2429-2434
 #else
 		to_free = arc_c >> arc_shrink_shift;
 #endif
+		DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t,
+			arc_c_min, uint64_t, arc_p, uint64_t, to_free);
+
 		if (arc_c > arc_c_min + to_free)
 			atomic_add_64(&arc_c, -to_free);
 		else

Lines 2439-2450
 			arc_c = MAX(arc_size, arc_c_min);
 		if (arc_p > arc_c)
 			arc_p = (arc_c >> 1);
+
+		DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t,
+			arc_p);
+
 		ASSERT(arc_c >= arc_c_min);
 		ASSERT((int64_t)arc_p >= 0);
 	}
 
-	if (arc_size > arc_c)
+	if (arc_size > arc_c) {
+		DTRACE_PROBE2(arc__shrink_adjust, uint64_t, arc_size,
+			uint64_t, arc_c);
 		arc_adjust();
+	}
 }
 
 static int needfree = 0;

Lines 2452-2469
 static int
 arc_reclaim_needed(void)
 {
+	u_int fm;
 
 #ifdef _KERNEL
 
-	if (needfree)
+	/*
+	 * First check to see if dirty_data_max needs adjusting.  Do not allow
+	 * the dirty data amount per pool to exceed free, non-swappable RAM.
+	 * Subject this to the original test of the max_max limit and a minimum
+	 * of 16MB.
+	 */
+	zfs_dirty_data_max = ptob(cnt.v_free_count) - ptob(cnt.v_free_min);
+	if (zfs_dirty_data_max <= 1 << 24) {
+		zfs_dirty_data_max = 1 << 24;
+	}
+	zfs_dirty_data_max = MIN(zfs_dirty_data_max, ptob(physmem) *
+	    zfs_dirty_data_max_percent / 100);
+	zfs_dirty_data_max = MIN(zfs_dirty_data_max, zfs_dirty_data_max_max);
+
+	if (arc_size <= arc_c_min) {
+		DTRACE_PROBE2(arc__reclaim_min, uint64_t, arc_size,
+		    uint64_t, arc_c_min);
+		return (0);
+	}
+
+	if (needfree) {
+		DTRACE_PROBE(arc__reclaim_needfree);
 		return (1);
+	}
 
 	/*
 	 * Cooperate with pagedaemon when it's time for it to scan
 	 * and reclaim some pages.
 	 */
-	if (vm_paging_needed())
+	if (zfs_arc_reclaim_cache_free == 0)
+		fm = cnt.v_free_count;
+	else
+		fm = freemem;
+
+	if (fm < zfs_arc_free_target) {
+		DTRACE_PROBE3(arc__reclaim_freemem, uint64_t,
+		    fm, uint64_t, zfs_arc_free_target,
+		    int, zfs_arc_reclaim_cache_free);
 		return (1);
+	}
 
 #ifdef sun
 	/*
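
[Note, not part of the patch] The dirty_data_max sizing added above is a
clamp chain; the stand-alone sketch below shows the arithmetic with invented
page counts (4 KiB pages, made-up v_free_count/v_free_min/physmem values),
with ptob() approximated as multiplication by the page size.

/*
 * Simplified model of the zfs_dirty_data_max sizing above.  All inputs
 * are invented example values.
 */
#include <stdint.h>
#include <stdio.h>

#define	EX_PAGE_SIZE	4096ULL
#define	EX_MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	uint64_t v_free_count = 200000;		/* free pages (example) */
	uint64_t v_free_min = 12000;		/* reserved minimum (example) */
	uint64_t physmem = 2097152;		/* pages, i.e. 8 GiB (example) */
	uint64_t max_percent = 10;		/* stands in for zfs_dirty_data_max_percent */
	uint64_t max_max = 4ULL << 30;		/* stands in for zfs_dirty_data_max_max */
	uint64_t dirty_max;

	/* Free, non-swappable RAM, with a 16 MB floor... */
	dirty_max = (v_free_count - v_free_min) * EX_PAGE_SIZE;
	if (dirty_max <= 1ULL << 24)
		dirty_max = 1ULL << 24;
	/* ...then re-apply the original percentage and absolute caps. */
	dirty_max = EX_MIN(dirty_max, physmem * EX_PAGE_SIZE * max_percent / 100);
	dirty_max = EX_MIN(dirty_max, max_max);
	printf("zfs_dirty_data_max -> %ju bytes\n", (uintmax_t)dirty_max);
	return (0);
}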

Lines 2491-2498
 	if (availrmem < swapfs_minfree + swapfs_reserve + extra)
 		return (1);
 
-#if defined(__i386)
 	/*
+	 * Check that we have enough availrmem that memory locking (e.g., via
+	 * mlock(3C) or memcntl(2)) can still succeed.  (pages_pp_maximum
+	 * stores the number of pages that cannot be locked; when availrmem
+	 * drops below pages_pp_maximum, page locking mechanisms such as
+	 * page_pp_lock() will fail.)
+	 */
+	if (availrmem <= pages_pp_maximum)
+		return (1);
+
+#endif	/* sun */
+#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
+	/*
 	 * If we're on an i386 platform, it's possible that we'll exhaust the
 	 * kernel heap space before we ever run out of available physical
 	 * memory.  Most checks of the size of the heap_area compare against

Lines 2503-2534
 	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
 	 * free)
 	 */
-	if (btop(vmem_size(heap_arena, VMEM_FREE)) <
-	    (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
+	if (vmem_size(heap_arena, VMEM_FREE) <
+	    (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2)) {
+		DTRACE_PROBE2(arc__reclaim_used, uint64_t,
+		    vmem_size(heap_arena, VMEM_FREE), uint64_t,
+		    (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2);
 		return (1);
+	}
 #endif
-#else	/* !sun */
-	if (kmem_used() > (kmem_size() * 3) / 4)
+#ifdef sun
+	/*
+	 * If zio data pages are being allocated out of a separate heap segment,
+	 * then enforce that the size of available vmem for this arena remains
+	 * above about 1/16th free.
+	 *
+	 * Note: The 1/16th arena free requirement was put in place
+	 * to aggressively evict memory from the arc in order to avoid
+	 * memory fragmentation issues.
+	 */
+	if (zio_arena != NULL &&
+	    vmem_size(zio_arena, VMEM_FREE) <
+	    (vmem_size(zio_arena, VMEM_ALLOC) >> 4))
 		return (1);
 #endif	/* sun */
 
-#else
+#else	/* _KERNEL */
 	if (spa_get_random(100) == 0)
 		return (1);
-#endif
+#endif	/* _KERNEL */
+	DTRACE_PROBE(arc__reclaim_no);
+
 	return (0);
 }
 
 extern kmem_cache_t	*zio_buf_cache[];
 extern kmem_cache_t	*zio_data_buf_cache[];
 
-static void
+static void __used
 arc_kmem_reap_now(arc_reclaim_strategy_t strat)
 {
 	size_t			i;
 	kmem_cache_t		*prev_cache = NULL;
 	kmem_cache_t		*prev_data_cache = NULL;
 
+	DTRACE_PROBE(arc__kmem_reap_start);
 #ifdef _KERNEL
 	if (arc_meta_used >= arc_meta_limit) {
 		/*

Lines 2564-2569
 	}
 	kmem_cache_reap_now(buf_cache);
 	kmem_cache_reap_now(hdr_cache);
+
+#ifdef sun
+	/*
+	 * Ask the vmem arena to reclaim unused memory from its
+	 * quantum caches.
+	 */
+	if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR)
+		vmem_qcache_reap(zio_arena);
+#endif
+	DTRACE_PROBE(arc__kmem_reap_end);
 }
 
 static void

Lines 2581-2586
 
 			if (arc_no_grow) {
 				if (last_reclaim == ARC_RECLAIM_CONS) {
+					DTRACE_PROBE(arc__reclaim_aggr_no_grow);
 					last_reclaim = ARC_RECLAIM_AGGR;
 				} else {
 					last_reclaim = ARC_RECLAIM_CONS;

Lines 2588-2593
 			} else {
 				arc_no_grow = TRUE;
 				last_reclaim = ARC_RECLAIM_AGGR;
+				DTRACE_PROBE(arc__reclaim_aggr);
 				membar_producer();
 			}
 

Lines 2602-2607
 				 */
 				arc_no_grow = TRUE;
 				last_reclaim = ARC_RECLAIM_AGGR;
+				DTRACE_PROBE(arc__reclaim_aggr_needfree);
 			}
 			arc_kmem_reap_now(last_reclaim);
 			arc_warm = B_TRUE;

Lines 2618-2623
 #ifdef _KERNEL
 		if (needfree) {
 			needfree = 0;
+			DTRACE_PROBE(arc__clear_needfree);
 			wakeup(&needfree);
 		}
 #endif

Lines 2692-2697
 	 * cache size, increment the target cache size
 	 */
 	if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
+		DTRACE_PROBE1(arc__inc_adapt, int, bytes);
 		atomic_add_64(&arc_c, (int64_t)bytes);
 		if (arc_c > arc_c_max)
 			arc_c = arc_c_max;

Lines 2713-2732
 	if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
 		return (1);
 
-#ifdef sun
-#ifdef _KERNEL
-	/*
-	 * If zio data pages are being allocated out of a separate heap segment,
-	 * then enforce that the size of available vmem for this area remains
-	 * above about 1/32nd free.
-	 */
-	if (type == ARC_BUFC_DATA && zio_arena != NULL &&
-	    vmem_size(zio_arena, VMEM_FREE) <
-	    (vmem_size(zio_arena, VMEM_ALLOC) >> 5))
-		return (1);
-#endif
-#endif	/* sun */
-
 	if (arc_reclaim_needed())
 		return (1);
 

Lines 3885-3904
 arc_memory_throttle(uint64_t reserve, uint64_t txg)
 {
 #ifdef _KERNEL
-	uint64_t available_memory =
-	    ptoa((uintmax_t)cnt.v_free_count + cnt.v_cache_count);
+	uint64_t available_memory = ptob(freemem);
 	static uint64_t page_load = 0;
 	static uint64_t last_txg = 0;
 
-#ifdef sun
-#if defined(__i386)
+#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
 	available_memory =
-	    MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
+	    MIN(available_memory, ptob(vmem_size(heap_arena, VMEM_FREE)));
 #endif
-#endif	/* sun */
 
-	if (cnt.v_free_count + cnt.v_cache_count >
-	    (uint64_t)physmem * arc_lotsfree_percent / 100)
+	if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100)
 		return (0);
 
 	if (txg > last_txg) {

Lines 3911-3917
 	 * continue to let page writes occur as quickly as possible.
 	 */
 	if (curproc == pageproc) {
-		if (page_load > available_memory / 4)
+		if (page_load > MAX(ptob(minfree), available_memory) / 4)
 			return (SET_ERROR(ERESTART));
 		/* Note: reserve is inflated, so we deflate */
 		page_load += reserve / 8;

Lines 3939-3946
 	int error;
 	uint64_t anon_size;
 
-	if (reserve > arc_c/4 && !arc_no_grow)
+	if (reserve > arc_c/4 && !arc_no_grow) {
 		arc_c = MIN(arc_c_max, reserve * 4);
+		DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c);
+	}
 	if (reserve > arc_c)
 		return (SET_ERROR(ENOMEM));
 

Lines 3994-3999
 	mutex_enter(&arc_lowmem_lock);
 	mutex_enter(&arc_reclaim_thr_lock);
 	needfree = 1;
+	DTRACE_PROBE(arc__needfree);
 	cv_signal(&arc_reclaim_thr_cv);
 
 	/*

dmu_tx.c (+8 lines)

Lines 1063-1068
 	ASSERT3U(dirty, <, zfs_dirty_data_max);
 
 	now = gethrtime();
+	/*
+	 * Because of dynamic dirty_data_max sizing in arc.c, it is possible
+	 * for it to be exactly equal to dirty.  Prevent divide-by-zero panic
+	 * in the unlikely event that happens to be the case.
+	 */
+	if (!(zfs_dirty_data_max - dirty)) {
+		zfs_dirty_data_max++;
+	}
 	min_tx_time = zfs_delay_scale *
 	    (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
 	if (now > tx->tx_start + min_tx_time)
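
[Note, not part of the patch] The dmu_tx.c guard above matters because the
delay formula divides by (zfs_dirty_data_max - dirty); a minimal stand-alone
sketch with invented byte counts follows -- none of these numbers are ZFS
defaults.

/*
 * Minimal sketch of the guarded delay calculation above; every value
 * here is an invented example.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t zfs_delay_scale = 500000;		/* example scale */
	uint64_t zfs_dirty_data_max = 1ULL << 30;	/* 1 GiB (example) */
	uint64_t delay_min_bytes = (1ULL << 30) / 2;	/* example threshold */
	uint64_t dirty = 1ULL << 30;			/* fully dirty: worst case */
	uint64_t min_tx_time;

	/* Without this the division below would fault when dirty == max. */
	if (!(zfs_dirty_data_max - dirty))
		zfs_dirty_data_max++;
	min_tx_time = zfs_delay_scale *
	    (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
	printf("min_tx_time = %ju\n", (uintmax_t)min_tx_time);
	return (0);
}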