We have a process which randomly gets stuck in uninterruptible sleep. The process is a large scale ping and SNMP poller, using libpcap, raw ICMP socket, and UDP socket. If sendto(2) fails, we call usleep(50000) and retry. We suspect (but can't confirm) that the process is stuck in this sleep. We have not been able to reliably reproduce this problem. It has happened on a customer's server and in our test lab. The process is still currently stuck in this state and we can provide additional debug info if requested. # freebsd-version -ku; uname -apKU 10.2-RELEASE-p3 10.2-RELEASE-p3 FreeBSD shed163.akips.com 10.2-RELEASE-p3 FreeBSD 10.2-RELEASE-p3 #0 r287883: Thu Sep 17 05:59:12 UTC 2015 root@shed31.akips.com:/usr/obj/usr/src/sys/GENERIC amd64 amd64 1002000 1002000 # top -b last pid: 27815; load averages: 1.79, 1.29, 1.23 up 12+18:46:53 14:50:31 107 processes: 4 running, 98 sleeping, 5 zombie Mem: 723M Active, 183M Inact, 2937M Wired, 2400K Cache, 435M Buf, 106M Free ARC: 2496M Total, 134M MFU, 2342M MRU, 100K Anon, 11M Header, 10M Other Swap: 4096M Total, 524M Used, 3572M Free, 12% Inuse PID USERNAME THR PRI NICE SIZE RES STATE C TIME WCPU COMMAND 20499 akips 1 31 0 117M 71800K RUN 1 5:21 6.05% perl 19844 akips 1 23 0 70948K 21456K RUN 0 11:41 4.05% perl 19845 akips 1 22 0 750M 45900K select 0 9:53 3.08% nm-db 20500 akips 1 21 0 802M 104M select 1 1:06 1.46% nm-db 24516 akips 1 20 0 74020K 5352K select 0 70:06 1.07% nm-httpd 24464 akips 1 20 0 73948K 3308K RUN 0 41:37 0.59% nm-timed 24467 root 1 20 0 421M 63696K select 1 52:24 0.29% nm-joatd 24525 akips 1 20 0 104M 21600K select 0 85:04 0.20% nm-flow-collector 24528 akips 1 4 0 782M 77196K 0 222:50 0.00% nm-poller 24511 akips 1 20 0 123M 3512K select 0 18:14 0.00% nm-msgd 24538 akips 1 20 0 98M 28432K select 1 18:05 0.00% nm-flow-meter 24510 akips 1 20 0 750M 269M nanslp 0 17:34 0.00% nm-poller-helper 24536 akips 1 20 0 98M 27008K select 0 15:44 0.00% nm-flow-meter 24512 root 1 20 0 21948K 2652K nanslp 1 13:45 0.00% top 24537 akips 1 20 0 92408K 18668K select 0 6:37 0.00% nm-flow-meter 24527 akips 1 20 0 750M 43756K select 0 4:58 0.00% nm-snmp-trapd 24544 akips 1 20 0 88312K 13080K select 1 3:59 0.00% nm-flow-meter 24493 akips 1 20 0 78056K 3652K select 0 3:56 0.00% nm-fifo-tee # procstat -kk 24528 PID TID COMM TDNAME KSTACK 24528 100360 nm-poller - mi_switch+0xe1 sleepq_timedwait_sig+0x8b _cv_timedwait_sig_sbt+0x18b seltdwait+0xa4 kern_select+0x8fa sys_select+0x54 amd64_syscall+0x357 Xfast_syscall+0xfb # ps -o user,pid,ppid,pending,caught,ignored,blocked,stat,wchan 24528 USER PID PPID PENDING CAUGHT IGNORED BLOCKED STAT WCHAN akips 24528 1 0 f008568b 8408000 60085003 Ds - # ps -o pid,paddr 24528 PID PADDR 24528 fffff8009d563000 (kgdb) p *(struct proc *)0xfffff8009d563000 $1 = {p_list = {le_next = 0xfffff800194339d0, le_prev = 0xfffff800133da000}, p_threads = { tqh_first = 0xfffff8009d566000, tqh_last = 0xfffff8009d566010}, p_slock = {lock_object = { lo_name = 0xffffffff80fc8265 "process slock", lo_flags = 720896, lo_data = 0, lo_witness = 0x0}, mtx_lock = 4}, p_ucred = 0xfffff80013daf300, p_fd = 0xfffff800c19f8000, p_fdtol = 0x0, p_stats = 0xfffff80013c1b200, p_limit = 0xfffff800ac7d2400, p_limco = {c_links = { le = {le_next = 0x0, le_prev = 0x0}, sle = {sle_next = 0x0}, tqe = {tqe_next = 0x0, tqe_prev = 0x0}}, c_time = 0, c_precision = 0, c_arg = 0x0, c_func = 0, c_lock = 0xfffff8009d563100, c_flags = 0, c_iflags = 0, c_cpu = 0}, p_sigacts = 0xfffff800320aa000, p_flag = 268435712, p_flag2 = 0, p_state = PRS_NORMAL, p_pid = 24528, p_hash = {le_next = 0x0, le_prev = 0xfffffe00006f1e80}, p_pglist = {le_next = 0x0, le_prev = 0xfffff8007bd35090}, p_pptr = 0xfffff800029724e8, p_sibling = { le_next = 0xfffff800194339d0, le_prev = 0xfffff800029725e0}, p_children = {lh_first = 0x0}, p_mtx = {lock_object = {lo_name = 0xffffffff80fc8258 "process lock", lo_flags = 21168128, lo_data = 0, lo_witness = 0x0}, mtx_lock = 4}, p_ksi = 0xfffff80013301070, p_sigqueue = { sq_signals = {__bits = {0, 0, 0, 0}}, sq_kill = {__bits = {0, 0, 0, 0}}, sq_list = { tqh_first = 0x0, tqh_last = 0xfffff8009d563148}, sq_proc = 0xfffff8009d563000, sq_flags = 1}, p_oppid = 0, p_vmspace = 0xfffff801062e7380, p_swtick = 686191978, p_realtimer = {it_interval = { tv_sec = 0, tv_usec = 0}, it_value = {tv_sec = 0, tv_usec = 0}}, p_ru = {ru_utime = { tv_sec = 0, tv_usec = 0}, ru_stime = {tv_sec = 0, tv_usec = 0}, ru_maxrss = 0, ru_ixrss = 0, ru_idrss = 0, ru_isrss = 0, ru_minflt = 0, ru_majflt = 0, ru_nswap = 0, ru_inblock = 0, ru_oublock = 0, ru_msgsnd = 0, ru_msgrcv = 0, ru_nsignals = 0, ru_nvcsw = 0, ru_nivcsw = 0}, p_rux = {rux_runtime = 22692949475677, rux_uticks = 290352, rux_sticks = 1363204, rux_iticks = 0, rux_uu = 2347747468, rux_su = 11022676790, rux_tu = 13370422433}, p_crux = {rux_runtime = 0, rux_uticks = 0, rux_sticks = 0, rux_iticks = 0, rux_uu = 0, rux_su = 0, rux_tu = 0}, p_profthreads = 0, p_exitthreads = 0, p_traceflag = 0, p_tracevp = 0x0, p_tracecred = 0x0, p_textvp = 0xfffff800cfa9d588, p_lock = 0, p_sigiolst = {slh_first = 0x0}, p_sigparent = 20, p_sig = 0, p_code = 0, p_stops = 0, p_stype = 0, p_step = 0 '\0', p_pfsflags = 0 '\0', p_nlminfo = 0x0, p_aioinfo = 0x0, p_singlethread = 0x0, p_suspcount = 0, p_xthread = 0x0, p_boundary_count = 0, p_pendingcnt = 0, p_itimers = 0x0, p_procdesc = 0x0, p_magic = 3203398350, p_osrel = 1002000, p_comm = "nm-poller\000\000\000\000\000\000\000\000\000\000", p_pgrp = 0xfffff8007bd35080, p_sysent = 0xffffffff814d4388, p_args = 0xfffff80002b73360, p_cpulimit = 9223372036854775807, p_nice = 0 '\0', p_fibnum = 0, p_xstat = 0, p_klist = { kl_list = {slh_first = 0x0}, kl_lock = 0xffffffff80907d90 <knlist_mtx_lock>, kl_unlock = 0xffffffff80907dd0 <knlist_mtx_unlock>, kl_assert_locked = 0xffffffff80907e10 <knlist_mtx_assert_locked>, kl_assert_unlocked = 0xffffffff80907e20 <knlist_mtx_assert_unlocked>, kl_lockarg = 0xfffff8009d563100}, p_numthreads = 1, p_md = {md_ldt = 0x0, md_ldt_sd = { sd_lolimit = 0, sd_lobase = 0, sd_type = 0, sd_dpl = 0, sd_p = 0, sd_hilimit = 0, sd_xx0 = 0, sd_gran = 0, sd_hibase = 0, sd_xx1 = 0, sd_mbz = 0, sd_xx2 = 0}}, p_itcallout = {c_links = { le = {le_next = 0x0, le_prev = 0x0}, sle = {sle_next = 0x0}, tqe = {tqe_next = 0x0, tqe_prev = 0x0}}, c_time = 0, c_precision = 0, c_arg = 0x0, c_func = 0, c_lock = 0xfffff8009d563100, c_flags = 0, c_iflags = 0, c_cpu = 0}, p_acflag = 1, p_peers = 0x0, p_leader = 0xfffff8009d563000, p_emuldata = 0x0, p_label = 0x0, p_sched = 0xfffff8009d5634e8, p_ktr = {stqh_first = 0x0, stqh_last = 0xfffff8009d563448}, p_mqnotifier = {lh_first = 0x0}, p_dtrace = 0xfffff8008855f100, p_pwait = {cv_description = 0xffffffff80fc8c59 "ppwait", cv_waiters = 0}, p_dbgwait = {cv_description = 0xffffffff80fc8c60 "dbgwait", cv_waiters = 0}, p_prev_runtime = 0, p_racct = 0x0, p_throttled = 0 '\0', p_orphan = {le_next = 0x0, le_prev = 0x0}, p_orphans = {lh_first = 0x0}, p_treeflag = 0, p_reaper = 0xfffff800029724e8, p_reaplist = {lh_first = 0x0}, p_reapsibling = {le_next = 0xfffff800194339d0, le_prev = 0xfffff800133da4d0}, p_reapsubtree = 16} (kgdb) p/x *(((struct proc *)0xfffff8009d563000)->p_threads.tqh_first) $2 = {td_lock = 0xffffffff815cb380, td_proc = 0xfffff8009d563000, td_plist = {tqe_next = 0x0, tqe_prev = 0xfffff8009d563010}, td_runq = {tqe_next = 0x0, tqe_prev = 0xffffffff815cb5c8}, td_slpq = {tqe_next = 0x0, tqe_prev = 0xfffff800027cdc80}, td_lockq = {tqe_next = 0x0, tqe_prev = 0xfffffe0122b4e638}, td_hash = {le_next = 0x0, le_prev = 0xfffffe0000866040}, td_cpuset = 0xfffff8000296c3a8, td_sel = 0xfffff80106647300, td_sleepqueue = 0xfffff800027cdc80, td_turnstile = 0xfffff8009d5b5a80, td_rlqe = 0xfffff80013434d70, td_umtxq = 0xfffff80013c2cb80, td_tid = 0x18808, td_sigqueue = {sq_signals = {__bits = {0x0, 0x0, 0x0, 0x0}}, sq_kill = { __bits = {0x0, 0x0, 0x0, 0x0}}, sq_list = {tqh_first = 0x0, tqh_last = 0xfffff8009d5660b8}, sq_proc = 0xfffff8009d563000, sq_flags = 0x1}, td_lend_user_pri = 0xff, td_flags = 0x14, td_inhibitors = 0x2, td_pflags = 0x0, td_dupfd = 0x0, td_sqqueue = 0x0, td_wchan = 0x0, td_wmesg = 0x0, td_lastcpu = 0x0, td_oncpu = 0xff, td_owepreempt = 0x0, td_tsqueue = 0x0, td_locks = 0xca3f, td_rw_rlocks = 0x0, td_lk_slocks = 0x0, td_stopsched = 0x0, td_blocked = 0x0, td_lockname = 0x0, td_contested = {lh_first = 0x0}, td_sleeplocks = 0x0, td_intr_nesting_level = 0x0, td_pinned = 0x0, td_ucred = 0xfffff80013daf300, td_estcpu = 0x0, td_slptick = 0x0, td_blktick = 0x0, td_swvoltick = 0x36cff98d, td_cow = 0x22, td_ru = {ru_utime = { tv_sec = 0x0, tv_usec = 0x0}, ru_stime = {tv_sec = 0x0, tv_usec = 0x0}, ru_maxrss = 0x37b40, ru_ixrss = 0x2f4f0180, ru_idrss = 0xc3b0abce0, ru_isrss = 0xc9d9a00, ru_minflt = 0x20b07, ru_majflt = 0x3df, ru_nswap = 0x0, ru_inblock = 0x0, ru_oublock = 0x1, ru_msgsnd = 0x4e8632d, ru_msgrcv = 0x2385fa8, ru_nsignals = 0x0, ru_nvcsw = 0x2d34637, ru_nivcsw = 0xa450a3}, td_rux = { rux_runtime = 0x14a39d31415d, rux_uticks = 0x46e30, rux_sticks = 0x14cd04, rux_iticks = 0x0, rux_uu = 0x8befc16a, rux_su = 0x29100b336, rux_tu = 0x31cf074a1}, td_incruntime = 0x0, td_runtime = 0x14a39d31415d, td_pticks = 0x0, td_sticks = 0x0, td_iticks = 0x0, td_uticks = 0x0, td_intrval = 0x0, td_oldsigmask = {__bits = {0x0, 0x0, 0x0, 0x0}}, td_generation = 0x37796da, td_sigstk = {ss_sp = 0x0, ss_size = 0x0, ss_flags = 0x4}, td_xsig = 0x0, td_profil_addr = 0x0, td_profil_ticks = 0x0, td_name = {0x6e, 0x6d, 0x2d, 0x70, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x0 <repeats 11 times>}, td_fpop = 0x0, td_dbgflags = 0x0, td_dbgksi = {ksi_link = { tqe_next = 0x0, tqe_prev = 0x0}, ksi_info = {si_signo = 0x0, si_errno = 0x0, si_code = 0x0, si_pid = 0x0, si_uid = 0x0, si_status = 0x0, si_addr = 0x0, si_value = {sival_int = 0x0, sival_ptr = 0x0, sigval_int = 0x0, sigval_ptr = 0x0}, _reason = {_fault = {_trapno = 0x0}, _timer = {_timerid = 0x0, _overrun = 0x0}, _mesgq = {_mqd = 0x0}, _poll = {_band = 0x0}, __spare__ = {__spare1__ = 0x0, __spare2__ = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}}}}, ksi_flags = 0x0, ksi_sigq = 0x0}, td_ng_outbound = 0x0, td_osd = {osd_nslots = 0x0, osd_slots = 0x0, osd_next = {le_next = 0x0, le_prev = 0x0}}, td_map_def_user = 0x0, td_dbg_forked = 0x0, td_vp_reserv = 0x0, td_no_sleeping = 0x0, td_dom_rr_idx = 0x0, td_sigmask = { __bits = {0x60085003, 0x0, 0x0, 0x0}}, td_rqindex = 0x1a, td_base_pri = 0x68, td_priority = 0x68, td_pri_class = 0x3, td_user_pri = 0x7f, td_base_user_pri = 0x7f, td_pcb = 0xfffffe0122aa4cc0, td_state = 0x1, td_retval = {0x0, 0x0}, td_slpcallout = {c_links = { le = {le_next = 0x0, le_prev = 0xfffffe0000880840}, sle = {sle_next = 0x0}, tqe = { tqe_next = 0x0, tqe_prev = 0xfffffe0000880840}}, c_time = 0xe082d08b24e81, c_precision = 0x12381, c_arg = 0xfffff8009d566000, c_func = 0xffffffff809901f0, c_lock = 0x0, c_flags = 0x0, c_iflags = 0x110, c_cpu = 0x1}, td_frame = 0xfffffe0122aa4c00, td_kstack_obj = 0xfffff8009d709700, td_kstack = 0xfffffe0122aa1000, td_kstack_pages = 0x4, td_critnest = 0x1, td_md = {md_spinlock_count = 0x1, md_saved_flags = 0x246, md_spurflt_addr = 0x801018000}, td_sched = 0xfffff8009d566470, td_ar = 0x0, td_lprof = {{ lh_first = 0x0}, {lh_first = 0x0}}, td_dtrace = 0xfffff8001394db00, td_errno = 0x0, td_vnet = 0x0, td_vnet_lpush = 0x0, td_intr_frame = 0x0, td_rfppwait_p = 0xfffff8001334c4e8, td_ma = 0x0, td_ma_cnt = 0x0, td_su = 0x0}
will this be fixed in 10.3-RELEASE?
(In reply to me+freebsd from comment #1) There is a patch in https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=200992 what was applied to 10.3. If this patch fixes this case, as well ?
(In reply to Kurt Jaeger from comment #2) Not sure yet. I upgraded and will keep an eye on it.
> Not sure yet. I upgraded and will keep an eye on it. It's been going ok so far. (For my specific case — riak on digital ocean.)
We have been running 10.3 since RC3 in our lab, and have had approx 60 servers running 10.3-RELEASE since 19th April 2016 without issue.