Running with this custom kernel config on 2 socket 96 core ThunderX at packet.net: nooptions VIMAGE options ALT_BREAK_TO_DEBUGGER options DEBUG_LOCKS options DEBUG_VFS_LOCKS options DIAGNOSTIC Entered the debugger via <CR> ~ ^B. 'c' command to continue remained in the debugger, and it eventually reset: root@arm64-kernel-test:~ # KDB: enter: Break to debugger [ thread pid 11 tid 100004 ] Stopped at 0 db> bt Tracing pid 11 tid 100004 td 0xfffffd0007cb9a80 db_trace_self() at db_stack_trace+0xec pc = 0xffff0000005f7c58 lr = 0xffff000000084690 sp = 0xffff000aa498fef0 fp = 0xffff000aa498ff20 db_stack_trace() at db_command+0x224 pc = 0xffff000000084690 lr = 0xffff00000008431c sp = 0xffff000aa498ff30 fp = 0xffff000aa4990010 db_command() at db_command_loop+0x60 pc = 0xffff00000008431c lr = 0xffff0000000840dc sp = 0xffff000aa4990020 fp = 0xffff000aa4990040 db_command_loop() at db_trap+0xf4 pc = 0xffff0000000840dc lr = 0xffff0000000871b0 sp = 0xffff000aa4990050 fp = 0xffff000aa4990270 db_trap() at kdb_trap+0x190 pc = 0xffff0000000871b0 lr = 0xffff00000035ad74 sp = 0xffff000aa4990280 fp = 0xffff000aa49902e0 kdb_trap() at do_el1h_sync+0x90 pc = 0xffff00000035ad74 lr = 0xffff0000006119c4 sp = 0xffff000aa49902f0 fp = 0xffff000aa4990320 do_el1h_sync() at handle_el1h_sync+0x74 pc = 0xffff0000006119c4 lr = 0xffff0000005fa074 sp = 0xffff000aa4990330 fp = 0xffff000aa4990440 handle_el1h_sync() at kdb_alt_break_internal+0x1a4 pc = 0xffff0000005fa074 lr = 0xffff00000035a65c sp = 0xffff000aa4990450 fp = 0xffff000aa49904f0 kdb_alt_break_internal() at kdb_alt_break+0xc pc = 0xffff00000035a65c lr = 0xffff00000035a4a8 sp = 0xffff000aa4990500 fp = 0xffff000aa4990500 kdb_alt_break() at uart_intr_rxready+0x88 pc = 0xffff00000035a4a8 lr = 0xffff00000019538c sp = 0xffff000aa4990510 fp = 0xffff000aa4990530 uart_intr_rxready() at uart_intr+0xfc pc = 0xffff00000019538c lr = 0xffff000000196078 sp = 0xffff000aa4990540 fp = 0xffff000aa4990570 uart_intr() at intr_event_handle+0xa8 pc = 0xffff000000196078 lr = 0xffff0000002e15a8 sp = 0xffff000aa4990580 fp = 0xffff000aa49905d0 intr_event_handle() at intr_isrc_dispatch+0x5c pc = 0xffff0000002e15a8 lr = 0xffff00000063aed8 sp = 0xffff000aa49905e0 fp = 0xffff000aa49905f0 intr_isrc_dispatch() at arm_gic_v3_intr+0x138 pc = 0xffff00000063aed8 lr = 0xffff0000005fd3e8 sp = 0xffff000aa4990600 fp = 0xffff000aa4990650 arm_gic_v3_intr() at intr_irq_handler+0x68 pc = 0xffff0000005fd3e8 lr = 0xffff00000063ad40 sp = 0xffff000aa4990660 fp = 0xffff000aa4990680 intr_irq_handler() at handle_el1h_irq+0x70 pc = 0xffff00000063ad40 lr = 0xffff0000005fa130 sp = 0xffff000aa4990690 fp = 0xffff000aa49907a0 handle_el1h_irq() at cpu_idle+0x3c pc = 0xffff0000005fa130 lr = 0xffff000000601484 sp = 0xffff000aa49907b0 fp = 0xffff000aa4990840 cpu_idle() at sched_idletd+0xe4 pc = 0xffff000000601484 lr = 0xffff000000345368 sp = 0xffff000aa4990850 fp = 0xffff000aa4990910 sched_idletd() at fork_exit+0x7c pc = 0xffff000000345368 lr = 0xffff0000002de724 sp = 0xffff000aa4990920 fp = 0xffff000aa4990950 fork_exit() at fork_trampoline+0x10 pc = 0xffff0000002de724 lr = 0xffff0000006117a4 sp = 0xffff000aa4990960 fp = 0x0000000000000000 db> c [ thread pid 11 tid 100004 ] Stopped at $d.2+0x3: undefined 00010102 db> c [ thread pid 11 tid 100004 ] Stopped at $d.2+0x7: undefined 00000000 db> c [ thread pid 11 tid 100004 ] Stopped at $d.9+0x1: undefined 00000000 db> continue timeout stopping cpus [ thread pid 11 tid 100004 ] Stopped at $d.9+0x5: undefined 00b70002 db> timeout stopping cpus panic: acquiring blockable sleep lock with spinlock or critical section held (sleep mutex) pmap @ /root/freebsd/sys/arm64/arm64/pmap.c:4711 cpuid = 62 time = 1511807367 KDB: stack backtrace: db_trace_self() at db_trace_self_wrapper+0x28 pc = 0xffff0000005f7c58 lr = 0xffff000000087048 sp = 0xffff000ba8894ba0 fp = 0xffff000ba8894db0 db_trace_self_wrapper() at vpanic+0x184 pc = 0xffff000000087048 lr = 0xffff00000031a750 sp = 0xffff000ba8894dc0 fp = 0xffff000ba8894e40 vpanic() at kassert_panic+0x158 pc = 0xffff00000031a750 lr = 0xffff00000031a5c8 sp = 0xffff000ba8894e50 fp = 0xffff000ba8894f10 kassert_panic() at witness_checkorder+0x140 pc = 0xffff00000031a5c8 lr = 0xffff000000378a6c sp = 0xffff000ba8894f20 fp = 0xffff000ba8894f90 witness_checkorder() at __mtx_lock_flags+0xb0 pc = 0xffff000000378a6c lr = 0xffff0000002fb5e8 sp = 0xffff000ba8894fa0 fp = 0xffff000ba8894fe0 __mtx_lock_flags() at pmap_fault+0x40 pc = 0xffff0000002fb5e8 lr = 0xffff00000060fae0 sp = 0xffff000ba8894ff0 fp = 0xffff000ba8895010 pmap_fault() at data_abort+0xb8 pc = 0xffff00000060fae0 lr = 0xffff000000611be8 sp = 0xffff000ba8895020 fp = 0xffff000ba88950d0 data_abort() at do_el1h_sync+0xf8 pc = 0xffff000000611be8 lr = 0xffff000000611a2c sp = 0xffff000ba88950e0 fp = 0xffff000ba8895110 do_el1h_sync() at handle_el1h_sync+0x74 pc = 0xffff000000611a2c lr = 0xffff0000005fa074 sp = 0xffff000ba8895120 fp = 0xffff000ba8895230 handle_el1h_sync() at vfp_save_state+0x4c pc = 0xffff0000005fa074 lr = 0xffff000000612e90 sp = 0xffff000ba8895240 fp = 0xffff000ba88952d0 vfp_save_state() at savectx+0x50 pc = 0xffff000000612e90 lr = 0xffff000000611858 sp = 0xffff000ba88952e0 fp = 0xffff000ba88952f0 savectx() at arm_gic_v3_intr+0x100 pc = 0xffff000000611858 lr = 0xffff0000005fd3b0 sp = 0xffff000ba8895300 fp = 0xffff000ba8895350 arm_gic_v3_intr() at intr_irq_handler+0x68 pc = 0xffff0000005fd3b0 lr = 0xffff00000063ad40 sp = 0xffff000ba8895360 fp = 0xffff000ba8895380 intr_irq_handler() at handle_el1h_irq+0x70 pc = 0xffff00000063ad40 lr = 0xffff0000005fa130 sp = 0xffff000ba8895390 fp = 0xffff000ba88954a0 handle_el1h_irq() at uma_zalloc_arg+0x488 pc = 0xffff0000005fa130 lr = 0xffff0000005a30d4 sp = 0xffff000ba88954b0 fp = 0xffff000ba88955a0 uma_zalloc_arg() at namei+0xdc pc = 0xffff0000005a30d4 lr = 0xffff0000003cded4 sp = 0xffff000ba88955b0 fp = 0xffff000ba8895680 namei() at kern_statat+0x9c pc = 0xffff0000003cded4 lr = 0xffff0000003e5f40 sp = 0xffff000ba8895690 fp = 0xffff000ba88958a0 kern_statat() at sys_fstatat+0x2c pc = 0xffff0000003e5f40 lr = 0xffff0000003e6534 sp = 0xffff000ba88958b0 fp = 0xffff000ba88959a0 sys_fstatat() at do_el0_sync+0x890 pc = 0xffff0000003e6534 lr = 0xffff000000612614 sp = 0xffff000ba88959b0 fp = 0xffff000ba8895a70 do_el0_sync() at handle_el0_sync+0x74 pc = 0xffff000000612614 lr = 0xffff0000005fa1f4 sp = 0xffff000ba8895a80 fp = 0xffff000ba8895b90 handle_el0_sync() at 0x4029de18 pc = 0xffff0000005fa1f4 lr = 0x000000004029de18 sp = 0xffff000ba8895ba0 fp = 0x0000ffffffffe100 Uptime: 9m46s
A commit references this bug: Author: andrew Date: Tue Nov 28 11:04:47 UTC 2017 New revision: 326312 URL: https://svnweb.freebsd.org/changeset/base/326312 Log: When we exit the kernel debugger having entered because of a breakpoint instruction we need to jump over the instruction. Without this we will execute the same instruction again and enter into the debugger again. PR: 223917 Reported by: emaste MFC after: 1 week Sponsored by: DARPA, AFRL Changes: head/sys/arm64/arm64/trap.c
The original problem seems to be solved, but at r331355 with probably-unrelated WIP on a Pine64 the console is unresponsive after 'c': root@:~ # sysctl debug.kdb.enter=1 debug.kdb.enter:KDB: enter: sysctl debug.kdb.enter [ thread pid 561 tid 100111 ] Stopped at 0x402319b4: undefined 54000042 db> c [ nothing ]
Running GENERIC + BREAK_TO_DEBUGGER on pine64 here Sending break and hitting c in kdb does works. Using sysctl debug.kdb.enter=1 is when I have problems : # sysctl debug.kdb.enter=1 debug.kdb.enter:KDB: enter: sysctl debug.kdb.enter [ thread pid 30 tid 100098 ] Stopped at 0x40232974: undefined 54000042 db> show thread 100098 Thread 100098 at 0xfffffd000788ea80: proc (pid 31): 0xfffffd0007880a50 name: sysctl stack: 0xffff0000402da000-0xffff0000402ddfff flags: 0x4 pflags: 0 state: RUNNING (CPU 1) priority: 120 container lock: sched lock 1 (0xffff000000af1100) last voluntary switch: 11 ms ago db> show thread 0xfffffd000788ea80 Thread 100098 at 0xfffffd000788ea80: proc (pid 31): 0xfffffd0007880a50 name: sysctl stack: 0xffff0000402da000-0xffff0000402ddfff flags: 0x4 pflags: 0 state: RUNNING (CPU 1) priority: 120 container lock: sched lock 1 (0xffff000000af1100) last voluntary switch: 11 ms ago db> c (HANG but not really) Sending BREAK : ~KDB: enter: Break to debugger [ thread pid 11 tid 100004 ] Stopped at 0x4 db> show thread 0xfffffd000788ea80 Thread 100098 at 0xfffffd000788ea80: proc (pid 31): 0xfffffd0007880a50 name: sysctl stack: 0xffff0000402da000-0xffff0000402ddfff flags: 0x4 pflags: 0 state: INACTIVE priority: 203 container lock: sched lock 0 (0xffff000000af0400) last voluntary switch: 1005 ms ago last involuntary switch: 1003 ms ago db> kill 2 31 Can't find process with pid 31 Adding some printfs almost everywhere along the path I've found that we do go back to userland but it seems that sysctl is block in sys_ioctl Side note: I have no problem on my overdrive 1000
It seems only related to the uart console as ssh session are still up after 'c'
(In reply to Emmanuel Vadot from comment #3) Behaviour confirmed on my pine64: I can 'c'ontinue after breaking via CR ~ ^B, not after sysctl debug.kdb.enter=1.
Also another CR ~ ^B will re-enter ddb while the console appears hung.
Seems to work for me on a packet.net ThunderX: login: KDB: enter: sysctl debug.kdb.enter [ thread pid 1123 tid 100976 ] Stopped at 0x40427bf4: undefined 54000042 db> c login: KDB: enter: sysctl debug.kdb.enter [ thread pid 1125 tid 100976 ] Stopped at 0x40427bf4: undefined 54000042 db> c vnic1: Interface stopped DISTRIBUTING, possible flapping login:
(In reply to Mark Johnston from comment #7) Interesting - I just tried the same thing on my pine64, and sysctl debug.kdb.enter=1 from the ssh session works fine (i.e., breaks to the debugger and 'c' works as expected).
(In reply to Ed Maste from comment #8) I don't see any problems with sysctl debug.kdb.enter=1 from the serial console either.
Looks like a race; this works on the serial console: # (sleep 0.1; sysctl debug.kdb.enter=1 >/dev/null) &
A commit in branch main references this bug: URL: https://cgit.FreeBSD.org/src/commit/?id=7e7f7beee732810d3afcc83828341ac3e139b5bd commit 7e7f7beee732810d3afcc83828341ac3e139b5bd Author: Mitchell Horne <mhorne@FreeBSD.org> AuthorDate: 2021-03-10 14:57:12 +0000 Commit: Mitchell Horne <mhorne@FreeBSD.org> CommitDate: 2021-03-10 15:04:42 +0000 ns8250: don't drop IER_TXRDY on bus_grab/ungrab It has been observed that some systems are often unable to resume from ddb after entering with debug.kdb.enter=1. Checking the status further shows the terminal is blocked waiting in tty_drain(), but it never makes progress in clearing the output queue, because sc->sc_txbusy is high. I noticed that when entering polling mode for the debugger, IER_TXRDY is set in the failure case. Since this bit is never tracked by the softc, it will not be restored by ns8250_bus_ungrab(). This creates a race in which a TX interrupt can be lost, creating the hang described above. Ensuring that this bit is restored is enough to prevent this, and resume from ddb as expected. The solution is to track this bit in the sc->ier field, for the same lifetime that TX interrupts are enabled. PR: 223917, 240122 Reviewed by: imp, manu Tested by: bz MFC after: 5 days Sponsored by: The FreeBSD Foundation Differential Revision: https://reviews.freebsd.org/D29130 sys/dev/uart/uart_dev_ns8250.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-)
A commit in branch stable/13 references this bug: URL: https://cgit.FreeBSD.org/src/commit/?id=17d301f7b59f49c52983fe0957208dddf40b1232 commit 17d301f7b59f49c52983fe0957208dddf40b1232 Author: Mitchell Horne <mhorne@FreeBSD.org> AuthorDate: 2021-03-10 14:57:12 +0000 Commit: Mitchell Horne <mhorne@FreeBSD.org> CommitDate: 2021-03-15 14:22:17 +0000 ns8250: don't drop IER_TXRDY on bus_grab/ungrab It has been observed that some systems are often unable to resume from ddb after entering with debug.kdb.enter=1. Checking the status further shows the terminal is blocked waiting in tty_drain(), but it never makes progress in clearing the output queue, because sc->sc_txbusy is high. I noticed that when entering polling mode for the debugger, IER_TXRDY is set in the failure case. Since this bit is never tracked by the softc, it will not be restored by ns8250_bus_ungrab(). This creates a race in which a TX interrupt can be lost, creating the hang described above. Ensuring that this bit is restored is enough to prevent this, and resume from ddb as expected. The solution is to track this bit in the sc->ier field, for the same lifetime that TX interrupts are enabled. PR: 223917, 240122 Sponsored by: The FreeBSD Foundation (cherry picked from commit 7e7f7beee732810d3afcc83828341ac3e139b5bd) sys/dev/uart/uart_dev_ns8250.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-)
A commit in branch stable/12 references this bug: URL: https://cgit.FreeBSD.org/src/commit/?id=a54c346ff3e80ff8f2f3d0ec56b5374a7dc34429 commit a54c346ff3e80ff8f2f3d0ec56b5374a7dc34429 Author: Mitchell Horne <mhorne@FreeBSD.org> AuthorDate: 2021-03-10 14:57:12 +0000 Commit: Mitchell Horne <mhorne@FreeBSD.org> CommitDate: 2021-03-16 17:56:03 +0000 ns8250: don't drop IER_TXRDY on bus_grab/ungrab It has been observed that some systems are often unable to resume from ddb after entering with debug.kdb.enter=1. Checking the status further shows the terminal is blocked waiting in tty_drain(), but it never makes progress in clearing the output queue, because sc->sc_txbusy is high. I noticed that when entering polling mode for the debugger, IER_TXRDY is set in the failure case. Since this bit is never tracked by the softc, it will not be restored by ns8250_bus_ungrab(). This creates a race in which a TX interrupt can be lost, creating the hang described above. Ensuring that this bit is restored is enough to prevent this, and resume from ddb as expected. The solution is to track this bit in the sc->ier field, for the same lifetime that TX interrupts are enabled. PR: 223917, 240122 Sponsored by: The FreeBSD Foundation (cherry picked from commit 7e7f7beee732810d3afcc83828341ac3e139b5bd) sys/dev/uart/uart_dev_ns8250.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-)