The patch below makes geli use a dedicated uma zone, with a number of items
kept in reserve, for unauthenticated writes to onetime devices, so that the
geli worker no longer has to sleep in malloc(M_WAITOK) while servicing the
very swap writes that are supposed to free up memory.

Without it, swapping onto a geli onetime provider can get stuck like this:

(kgdb) where
#0  sched_switch (td=0xfffff800055bf9a0, newtd=0xfffff80002341000,
    flags=<optimized out>) at /usr/src/sys/kern/sched_ule.c:1969
#1  0xffffffff80962635 in mi_switch (flags=<optimized out>, newtd=0x0)
    at /usr/src/sys/kern/kern_synch.c:455
#2  0xffffffff809aaa3a in sleepq_wait (wchan=0x0, pri=0)
    at /usr/src/sys/kern/subr_sleepqueue.c:637
#3  0xffffffff80962038 in _sleep (ident=<optimized out>, lock=<optimized out>,
    priority=<optimized out>, wmesg=0xffffffff80e826ee "vmwait", sbt=0,
    pr=<optimized out>, flags=<optimized out>)
    at /usr/src/sys/kern/kern_synch.c:229
#4  0xffffffff80c1ac6b in vm_wait () at /usr/src/sys/vm/vm_page.c:2705
#5  0xffffffff80c06a9f in kmem_back (object=0xffffffff8144d6f0,
    addr=18446741874805047296, size=69632, flags=<optimized out>)
    at /usr/src/sys/vm/vm_kern.c:356
#6  0xffffffff80c068d2 in kmem_malloc (vmem=0xffffffff813aa500, size=69632,
    flags=2) at /usr/src/sys/vm/vm_kern.c:316
#7  0xffffffff80bfd7d6 in uma_large_malloc (size=69632, wait=2)
    at /usr/src/sys/vm/uma_core.c:1106
#8  0xffffffff8092f614 in malloc (size=<optimized out>,
    mtp=0xffffffff81b4d520, flags=0) at /usr/src/sys/kern/kern_malloc.c:513
#9  0xffffffff81b4ab99 in g_eli_crypto_run (wr=0xfffff80002560040,
    bp=0xfffff80008a86d90)
    at /usr/src/sys/modules/geom/geom_eli/../../../geom/eli/g_eli_privacy.c:262
#10 0xffffffff81b3e860 in g_eli_worker (arg=0xfffff80002560040)
    at /usr/src/sys/modules/geom/geom_eli/../../../geom/eli/g_eli.c:565
#11 0xffffffff80910f5c in fork_exit (callout=0xffffffff81b3e0b0 <g_eli_worker>,
    arg=0xfffff80002560040, frame=0xfffffe005005ec00)
    at /usr/src/sys/kern/kern_fork.c:1034
#12 0xffffffff80c33f0e in fork_trampoline ()
    at /usr/src/sys/amd64/amd64/exception.S:611
#13 0x0000000000000000 in ?? ()

At that point the system is far below its free page targets
(v_free_count = 2 against v_free_min = 1609 and v_free_severe = 990):

(kgdb) p vm_cnt
$16 = {v_swtch = 0, v_trap = 0, v_syscall = 0, v_intr = 0, v_soft = 0,
  v_vm_faults = 0, v_io_faults = 0, v_cow_faults = 0, v_cow_optim = 0,
  v_zfod = 0, v_ozfod = 0, v_swapin = 0, v_swapout = 0, v_swappgsin = 0,
  v_swappgsout = 0, v_vnodein = 0, v_vnodeout = 0, v_vnodepgsin = 0,
  v_vnodepgsout = 0, v_intrans = 0, v_reactivated = 0, v_pdwakeups = 22197,
  v_pdpages = 0, v_tcached = 0, v_dfree = 0, v_pfree = 0, v_tfree = 0,
  v_page_size = 4096, v_page_count = 247688, v_free_reserved = 372,
  v_free_target = 5320, v_free_min = 1609, v_free_count = 2,
  v_wire_count = 140735, v_active_count = 96194, v_inactive_target = 7980,
  v_inactive_count = 10756, v_cache_count = 0, v_pageout_free_min = 34,
  v_interrupt_free_min = 2, v_free_severe = 990, v_forks = 0, v_vforks = 0,
  v_rforks = 0, v_kthreads = 0, v_forkpages = 0, v_vforkpages = 0,
  v_rforkpages = 0, v_kthreadpages = 0, v_spare = 0xffffffff8144d5ac}

I/O size distributions seen on the swap provider during a -j4 build
with 1 GB of RAM:

  gpt/swap-ada1.eli BIO_WRITE
           value  ------------- Distribution ------------- count
          < 4096 |                                         0
            4096 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@             4965848
            8192 |@@@@@                                    943980
           12288 |@@                                       362668
           16384 |@                                        161485
           20480 |@                                        120939
           24576 |                                         87827
           28672 |                                         57402
           32768 |                                         40470
           36864 |                                         42243
           40960 |                                         28543
           45056 |                                         20347
           49152 |                                         15235
           53248 |                                         13450
           57344 |                                         9535
           61440 |                                         9952
           65536 |@                                        179360
           69632 |                                         0

  gpt/swap-ada1.eli BIO_READ
           value  ------------- Distribution ------------- count
          < 4096 |                                         0
            4096 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 4645114
            8192 |                                         0
           12288 |                                         0
           16384 |                                         3446
           20480 |                                         0
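Background on the mechanism, in case the UMA reserve API isn't fresh in mind:
the sketch below is illustrative only (the zone name, item size and reserve
count are made up here; the patch derives them from g_eli_zone_item_size and
the kern.geom.eli.uma_reserve tunable).  Items covered by uma_zone_reserve()
are only handed out to callers that pass M_USE_RESERVE, so the onetime/swap
write path can still make progress when the page daemon cannot supply fresh
pages.

#include <sys/param.h>
#include <sys/malloc.h>
#include <vm/uma.h>

static uma_zone_t example_zone;

#define	EXAMPLE_ITEM_SIZE	65536	/* made-up fixed item size */
#define	EXAMPLE_RESERVE		8	/* made-up number of reserved items */

static void
example_zone_setup(void)
{

	example_zone = uma_zcreate("example", EXAMPLE_ITEM_SIZE,
	    NULL, NULL, NULL, NULL, 0, UMA_ZONE_NOFREE);
	/* Back the reserve with real memory up front. */
	uma_prealloc(example_zone, EXAMPLE_RESERVE);
	/* Keep EXAMPLE_RESERVE items for M_USE_RESERVE callers only. */
	uma_zone_reserve(example_zone, EXAMPLE_RESERVE);
}

static void *
example_alloc_for_pageout(void)
{

	/* Never sleeps; may dip into the reserved items. */
	return (uma_zalloc(example_zone, M_NOWAIT | M_USE_RESERVE));
}

static void
example_free(void *item)
{

	uma_zfree(example_zone, item);
}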
---
 sys/geom/eli/g_eli.c         | 58 +++++++++++++++++++++++++++++--
 sys/geom/eli/g_eli.h         |  4 +++
 sys/geom/eli/g_eli_privacy.c | 81 +++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 135 insertions(+), 8 deletions(-)

--- b/sys/geom/eli/g_eli.c
+++ b/sys/geom/eli/g_eli.c
@@ -82,6 +82,33 @@ u_int g_eli_batch = 0;
 SYSCTL_UINT(_kern_geom_eli, OID_AUTO, batch, CTLFLAG_RWTUN, &g_eli_batch, 0,
     "Use crypto operations batching");
 
+uma_zone_t g_eli_zone;
+static u_int g_eli_uma_reserve = 1;
+SYSCTL_UINT(_kern_geom_eli, OID_AUTO, uma_reserve, CTLFLAG_RDTUN,
+    &g_eli_uma_reserve, 0, "Items to pre-allocate in dedicated uma zone "
+    "and reserve for unauthenticated writes to onetime disks");
+
+u_int g_eli_all_writes_use_uma = 0;
+SYSCTL_UINT(_kern_geom_eli, OID_AUTO, use_uma_for_all_writes, CTLFLAG_RDTUN,
+    &g_eli_all_writes_use_uma, 0, "Use the dedicated uma zone for all "
+    "unauthenticated writes that fit, not just for writes to onetime "
+    "providers. May reduce write latency but also increases memory usage.");
+
+/*
+ * Items in the dedicated uma zone have a fixed size and need
+ * to be big enough for all bio lengths for which it is being used.
+ *
+ * g_eli_max_uma_bio_length is the largest amount of data geli can receive
+ * in a row and fit into an uma zone item, it is used to calculate
+ * g_eli_zone_item_size which additionally accounts for the encryption
+ * overhead, which depends on the number of sectors.
+ */
+u_int g_eli_max_uma_bio_length = MAXPHYS;
+SYSCTL_UINT(_kern_geom_eli, OID_AUTO, max_uma_bio_length, CTLFLAG_RDTUN,
+    &g_eli_max_uma_bio_length, 0,
+    "Maximum bio length for which the uma zone is being used.");
+u_int g_eli_zone_item_size;
+
 /*
  * Passphrase cached during boot, in order to be more user-friendly if
  * there are multiple providers using the same passphrase.
@@ -246,8 +273,8 @@ g_eli_write_done(struct bio *bp)
 	pbp->bio_inbed++;
 	if (pbp->bio_inbed < pbp->bio_children)
 		return;
-	free(pbp->bio_driver2, M_ELI);
-	pbp->bio_driver2 = NULL;
+	sc = pbp->bio_to->geom->softc;
+	g_eli_free_driver2(pbp, sc->sc_flags);
 	if (pbp->bio_error != 0) {
 		G_ELI_LOGREQ(0, pbp, "%s() failed (error=%d)", __func__,
 		    pbp->bio_error);
@@ -258,7 +285,6 @@ g_eli_write_done(struct bio *bp)
 	/*
 	 * Write is finished, send it up.
 	 */
-	sc = pbp->bio_to->geom->softc;
 	g_io_deliver(pbp, pbp->bio_error);
 	if (sc != NULL)
 		atomic_subtract_int(&sc->sc_inflight, 1);
@@ -1254,6 +1280,31 @@
 static void
 g_eli_init(struct g_class *mp)
 {
+	/*
+	 * Calculate uma zone item size based on the largest bio length
+	 * we need to use it for.
+	 *
+	 * 512 bytes is the smallest sector size supported and results in the
+	 * largest overhead. If larger sectors are being used, we'll just waste
+	 * a bit more memory.
+	 *
+	 * We currently only use the zone for unauthenticated writes,
+	 * otherwise the item size would have to be even bigger.
+	 */
+	g_eli_zone_item_size = (g_eli_max_uma_bio_length + \
+	    (g_eli_max_uma_bio_length / 512) * \
+	    (sizeof(struct cryptop) + sizeof(struct cryptodesc)));
+
+	G_ELI_DEBUG(3, "Using uma zone item size %d for max bio length %d",
+	    g_eli_zone_item_size, g_eli_max_uma_bio_length);
+	g_eli_zone = uma_zcreate("g_eli", g_eli_zone_item_size, NULL, NULL,
+	    NULL, NULL, 0, UMA_ZONE_NOFREE);
+	if (0 < g_eli_uma_reserve) {
+		/* Increase the chances that items are available when needed. */
+		uma_prealloc(g_eli_zone, g_eli_uma_reserve);
+		uma_zone_reserve(g_eli_zone, g_eli_uma_reserve);
+	}
+
 	g_eli_pre_sync = EVENTHANDLER_REGISTER(shutdown_pre_sync,
 	    g_eli_shutdown_pre_sync, mp, SHUTDOWN_PRI_FIRST);
 	if (g_eli_pre_sync == NULL)
@@ -1264,6 +1315,7 @@
 static void
 g_eli_fini(struct g_class *mp)
 {
+	uma_zdestroy(g_eli_zone);
 	if (g_eli_pre_sync != NULL)
 		EVENTHANDLER_DEREGISTER(shutdown_pre_sync, g_eli_pre_sync);
 }
--- b/sys/geom/eli/g_eli.h
+++ b/sys/geom/eli/g_eli.h
@@ -139,6 +139,9 @@
 #define	G_ELI_CRYPTO_SW		2
 
 #ifdef _KERNEL
+extern u_int g_eli_max_uma_bio_length;
+extern u_int g_eli_zone_item_size;
+
 extern int g_eli_debug;
 extern u_int g_eli_overwrites;
 extern u_int g_eli_batch;
@@ -705,5 +708,6 @@ void g_eli_key_init(struct g_eli_softc *sc);
 void g_eli_key_destroy(struct g_eli_softc *sc);
 uint8_t *g_eli_key_hold(struct g_eli_softc *sc, off_t offset, size_t blocksize);
 void g_eli_key_drop(struct g_eli_softc *sc, uint8_t *rawkey);
+void g_eli_free_driver2(struct bio *bp, int sc_flags);
 #endif
 #endif	/* !_G_ELI_H_ */
--- b/sys/geom/eli/g_eli_privacy.c
+++ b/sys/geom/eli/g_eli_privacy.c
@@ -49,6 +49,9 @@ __FBSDID("$FreeBSD$");
 #include <geom/eli/g_eli.h>
 #include <geom/eli/pkcs5v2.h>
 
+extern u_int g_eli_all_writes_use_uma;
+extern uma_zone_t g_eli_zone;
+
 /*
  * Code paths:
  * BIO_READ:
@@ -59,6 +62,76 @@ __FBSDID("$FreeBSD$");
 
 MALLOC_DECLARE(M_ELI);
 
+static int
+g_eli_use_uma_zone(const struct bio *bp, int sc_flags)
+{
+
+	return (bp->bio_cmd == BIO_WRITE &&
+	    bp->bio_length <= g_eli_max_uma_bio_length &&
+	    ((sc_flags & G_ELI_FLAG_ONETIME) != 0 || g_eli_all_writes_use_uma) &&
+	    (sc_flags & G_ELI_FLAG_AUTH) == 0);
+}
+
+static void *
+g_eli_alloc_driver2(const struct bio *bp, size_t size, int sc_flags)
+{
+	void *p;
+
+	/*
+	 * While requests from the authentication code path should
+	 * be handled correctly (by never using the uma zone) they
+	 * currently aren't expected to occur at all.
+	 */
+	KASSERT((sc_flags & G_ELI_FLAG_AUTH) == 0,
+	    ("g_eli_alloc_driver2() used for authenticated write"));
+
+	if (g_eli_use_uma_zone(bp, sc_flags)) {
+		int uma_flags;
+
+		KASSERT(size <= g_eli_zone_item_size,
+		    ("Insufficient g_eli_zone_item_size %u < %u",
+		     (unsigned)g_eli_zone_item_size, (unsigned)size));
+		/*
+		 * Writes to onetime providers are likely to originate
+		 * from the page daemon, therefore we try to get the
+		 * memory a bit harder for them to prevent vm deadlocks.
+		 */
+		if ((sc_flags & G_ELI_FLAG_ONETIME) != 0)
+			uma_flags = M_NOWAIT|M_USE_RESERVE;
+		else
+			uma_flags = M_WAITOK;

+		G_ELI_DEBUG(3, "Using uma zone for size: %jd, bio_length: %jd",
+		    size, bp->bio_length);
+		while (NULL == (p = uma_zalloc(g_eli_zone, uma_flags))) {
+			/* Only reachable for onetime providers */
+			pause("g_eli:uma", min(hz/1000, 1));
+		}
+	} else {
+		G_ELI_DEBUG(3, "Using malloc for size: %jd, bio_length: %jd",
+		    size, bp->bio_length);
+		p = malloc(size, M_ELI, M_WAITOK);
+	}
+
+	return (p);
+}
+
+void
+g_eli_free_driver2(struct bio *bp, int sc_flags)
+{
+
+	if (g_eli_use_uma_zone(bp, sc_flags)) {
+		G_ELI_DEBUG(3, "Using uma_free for bio_length: %jd",
+		    bp->bio_length);
+		uma_zfree(g_eli_zone, bp->bio_driver2);
+	} else {
+		G_ELI_DEBUG(3, "Using free for bio_length: %jd",
+		    bp->bio_length);
+		free(bp->bio_driver2, M_ELI);
+	}
+	bp->bio_driver2 = NULL;
+}
+
 /*
  * The function is called after we read and decrypt data.
  *
@@ -94,8 +167,7 @@ g_eli_crypto_read_done(struct cryptop *crp)
 	 */
 	if (bp->bio_inbed < bp->bio_children)
 		return (0);
-	free(bp->bio_driver2, M_ELI);
-	bp->bio_driver2 = NULL;
+	g_eli_free_driver2(bp, sc->sc_flags);
 	if (bp->bio_error != 0) {
 		G_ELI_LOGREQ(0, bp, "Crypto READ request failed (error=%d).",
 		    bp->bio_error);
@@ -153,8 +225,7 @@ g_eli_crypto_write_done(struct cryptop *crp)
 	if (bp->bio_error != 0) {
 		G_ELI_LOGREQ(0, bp, "Crypto WRITE request failed (error=%d).",
 		    bp->bio_error);
-		free(bp->bio_driver2, M_ELI);
-		bp->bio_driver2 = NULL;
+		g_eli_free_driver2(bp, sc->sc_flags);
 		g_destroy_bio(cbp);
 		g_io_deliver(bp, bp->bio_error);
 		atomic_subtract_int(&sc->sc_inflight, 1);
@@ -259,7 +330,7 @@ g_eli_crypto_run(struct g_eli_worker *wr, struct bio *bp)
 	 */
 	if (bp->bio_cmd == BIO_WRITE)
 		size += bp->bio_length;
-	p = malloc(size, M_ELI, M_WAITOK);
+	p = g_eli_alloc_driver2(bp, size, sc->sc_flags);
 
 	bp->bio_inbed = 0;
 	bp->bio_children = nsec;
--

The follow-up patch below creates the zone with UMA_ZONE_OFFPAGE|UMA_ZONE_HASH
to work around https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=218911 (see
the XXX comment):

---
 sys/geom/eli/g_eli.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

--- b/sys/geom/eli/g_eli.c
+++ b/sys/geom/eli/g_eli.c
@@ -1297,8 +1297,15 @@ g_eli_init(struct g_class *mp)
 
 	G_ELI_DEBUG(3, "Using uma zone item size %d for max bio length %d",
 	    g_eli_zone_item_size, g_eli_max_uma_bio_length);
+
+	/*
+	 * XXX: Setting UMA_ZONE_OFFPAGE|UMA_ZONE_HASH instead of
+	 * letting uma decide itself works around:
+	 * https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=218911
+	 * Once this is fixed, no flags are needed anymore.
+	 */
 	g_eli_zone = uma_zcreate("g_eli", g_eli_zone_item_size, NULL, NULL,
-	    NULL, NULL, 0, UMA_ZONE_NOFREE);
+	    NULL, NULL, UMA_ZONE_OFFPAGE|UMA_ZONE_HASH, UMA_ZONE_NOFREE);
 	if (0 < g_eli_uma_reserve) {
 		/* Increase the chances that items are available when needed. */
 		uma_prealloc(g_eli_zone, g_eli_uma_reserve);
--
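For a sense of scale: with the default g_eli_max_uma_bio_length of MAXPHYS
(128 KiB on amd64 at the time) and the worst-case 512-byte sector size assumed
in g_eli_init(), each zone item holds the 131072 payload bytes plus room for
256 cryptop/cryptodesc pairs.  A stand-alone restatement of that formula,
purely for illustration (example_item_size() is not part of the patch):

#include <sys/param.h>			/* MAXPHYS, u_int */
#include <opencrypto/cryptodev.h>	/* struct cryptop, struct cryptodesc */

/*
 * Mirrors the g_eli_zone_item_size calculation: payload bytes plus one
 * cryptop/cryptodesc pair per worst-case 512-byte sector.
 */
static u_int
example_item_size(u_int max_bio_length)
{

	return (max_bio_length + (max_bio_length / 512) *
	    (sizeof(struct cryptop) + sizeof(struct cryptodesc)));
}

example_item_size(MAXPHYS) is the value reported by the G_ELI_DEBUG(3, ...)
line in g_eli_init() above when the tunables are left at their defaults.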