Bug 253318 - lldb: aborted when debugging named
Summary: lldb: aborted when debugging named
Status: Open
Alias: None
Product: Base System
Classification: Unclassified
Component: bin (show other bugs)
Version: CURRENT
Hardware: Any Any
: --- Affects Only Me
Assignee: freebsd-bugs (Nobody)
URL:
Keywords:
Depends on:
Blocks:
 
Reported: 2021-02-07 04:59 UTC by Xin LI
Modified: 2021-02-12 20:11 UTC (History)
2 users (show)

See Also:


Attachments

Note You need to log in before you can comment on or make changes to this bug.
Description Xin LI freebsd_committer freebsd_triage 2021-02-07 04:59:51 UTC
Backtrace:

* thread #1, name = 'lldb', stop reason = signal SIGABRT
  * frame #0: 0x00000008055ea1fa libc.so.7`__sys_thr_kill at thr_kill.S:4
    frame #1: 0x000000080555fc64 libc.so.7`__raise(s=6) at raise.c:52:10
    frame #2: 0x0000000805613ad6 libc.so.7`abort at abort.c:79:8
    frame #3: 0x0000000805488d69 libcxxrt.so.1`report_failure(err=<unavailable>, thrown_exception=0x000000080628f368) at exception.cc:719:5
    frame #4: 0x000000080545f4ac libc++.so.1`std::__1::__throw_system_error(ev=11, what_arg="") at system_error.cpp:287:5
    frame #5: 0x000000080545361d libc++.so.1`std::__1::mutex::lock(this=<unavailable>) at mutex.cpp:35:9
    frame #6: 0x0000000002137a29 lldb`::PrintAsync() [inlined] lock_guard at __mutex_base:91:27
    frame #7: 0x0000000002137a21 lldb`::PrintAsync() at Editline.cpp:1465
    frame #8: 0x00000000020cee36 lldb`::PrintAsync() at IOHandler.cpp:127:14
    frame #9: 0x00000000020f5e66 lldb`::~StreamAsynchronousIO() [inlined] Flush at StreamAsynchronousIO.cpp:28:16
    frame #10: 0x00000000020f5e2e lldb`::~StreamAsynchronousIO() at StreamAsynchronousIO.cpp:23
    frame #11: 0x00000000024a0e8c lldb`::PrintWarning() [inlined] __release_shared at memory:3341:9
    frame #12: 0x00000000024a0e78 lldb`::PrintWarning() [inlined] __release_shared at memory:3383
    frame #13: 0x00000000024a0e78 lldb`::PrintWarning() [inlined] ~shared_ptr at memory:4098
    frame #14: 0x00000000024a0e68 lldb`::PrintWarning() at Process.cpp:5837
    frame #15: 0x00000000024a1110 lldb`::PrintWarningUnsupportedLanguage() at Process.cpp:5864:5
    frame #16: 0x00000000024d8f3f lldb`::FrameSelectedCallback() at Thread.cpp:335:19
    frame #17: 0x00000000024d8cbb lldb`::GetSelectedFrame() at Thread.cpp:274:3
    frame #18: 0x000000000247bad0 lldb`::SetTargetPtr() at ExecutionContext.cpp:527:56
    frame #19: 0x0000000002160125 lldb`::HandleCompletion() [inlined] UpdateExecutionContext at CommandInterpreter.cpp:2685:19
    frame #20: 0x00000000021600fe lldb`::HandleCompletion() at CommandInterpreter.cpp:1854
    frame #21: 0x0000000002135a81 lldb`::TabCommand() at Editline.cpp:988:3
    frame #22: 0x0000000002138b6c lldb`::__invoke() [inlined] operator() at Editline.cpp:1144:45
    frame #23: 0x0000000002138b57 lldb`::__invoke() at Editline.cpp:1142
    frame #24: 0x0000000805261741 libedit.so.8`el_wgets(el=0x00000008062e6600, nread=0x00007fffffffe0b4) at read.c:537:12
    frame #25: 0x00000000021375c2 lldb`::GetLine() at Editline.cpp:1407:16
    frame #26: 0x00000000020cfa56 lldb`::GetLine() at IOHandler.cpp:323:29
    frame #27: 0x00000000020d08ae lldb`::Run() at IOHandler.cpp:554:11
    frame #28: 0x00000000020b7f5f lldb`::RunIOHandlers() at Debugger.cpp:857:16
    frame #29: 0x0000000002163869 lldb`::RunCommandInterpreter() at CommandInterpreter.cpp:2984:16
    frame #30: 0x0000000001e2b21c lldb`::RunCommandInterpreter() at SBDebugger.cpp:1172:42
    frame #31: 0x0000000001def2ef lldb`::MainLoop() at Driver.cpp:672:18
    frame #32: 0x0000000001df0aa3 lldb`main at Driver.cpp:910:26
    frame #33: 0x0000000001decaff lldb`_start(ap=<unavailable>, cleanup=<unavailable>) at crt1_c.c:75:7

frame #1: 0x000000080555fc64 libc.so.7`__raise(s=6) at raise.c:52:10
   49  
   50           if (__sys_thr_self(&id) == -1)
   51                   return (-1);
-> 52           return (__sys_thr_kill(id, s));
   53   }
(lldb) up
frame #2: 0x0000000805613ad6 libc.so.7`abort at abort.c:79:8
   76           (void)__libc_sigaction(SIGABRT, &act, NULL);
   77           sigdelset(&act.sa_mask, SIGABRT);
   78           (void)__libc_sigprocmask(SIG_SETMASK, &act.sa_mask, NULL);
-> 79           (void)raise(SIGABRT);
   80           exit(1);
   81   }
(lldb) up
frame #3: 0x0000000805488d69 libcxxrt.so.1`report_failure(err=<unavailable>, thrown_exception=0x000000080628f368) at exception.cc:719:5
   716  #endif
   717                  case _URC_END_OF_STACK:
   718                          __cxa_begin_catch (&(thrown_exception->unwindHeader));
-> 719                          std::terminate();
   720                          fprintf(stderr, "Terminating due to uncaught exception %p", 
   721                                          static_cast<void*>(thrown_exception));
   722                          thrown_exception = realExceptionFromException(thrown_exception);
(lldb) up
frame #4: 0x000000080545f4ac libc++.so.1`std::__1::__throw_system_error(ev=11, what_arg="") at system_error.cpp:287:5
   284  __throw_system_error(int ev, const char* what_arg)
   285  {
   286  #ifndef _LIBCPP_NO_EXCEPTIONS
-> 287      throw system_error(error_code(ev, system_category()), what_arg);
   288  #else
   289      (void)ev;
   290      (void)what_arg;
(lldb) up
frame #5: 0x000000080545361d libc++.so.1`std::__1::mutex::lock(this=<unavailable>) at mutex.cpp:35:9
   32   {
   33       int ec = __libcpp_mutex_lock(&__m_);
   34       if (ec)
-> 35           __throw_system_error(ec, "mutex lock failed");
   36   }
   37  
   38   bool
(lldb) up
frame #6: 0x0000000002137a29 lldb`::PrintAsync() [inlined] lock_guard at __mutex_base:91:27
   88  
   89       _LIBCPP_NODISCARD_EXT _LIBCPP_INLINE_VISIBILITY
   90       explicit lock_guard(mutex_type& __m) _LIBCPP_THREAD_SAFETY_ANNOTATION(acquire_capability(__m))
-> 91           : __m_(__m) {__m_.lock();}
   92  
   93       _LIBCPP_NODISCARD_EXT _LIBCPP_INLINE_VISIBILITY
   94       lock_guard(mutex_type& __m, adopt_lock_t) _LIBCPP_THREAD_SAFETY_ANNOTATION(requires_capability(__m))
Comment 1 Dimitry Andric freebsd_committer freebsd_triage 2021-02-07 16:33:37 UTC
Any idea how to reproduce this? It appears that mutex::lock() just calls pthread_mutex_lock() and it fails somehow:

00000000000b5380 <std::__1::mutex::lock()@@Base>:
   b5380:       55                      push   %rbp
   b5381:       48 89 e5                mov    %rsp,%rbp
   b5384:       e8 67 e7 00 00          callq  c3af0 <pthread_mutex_lock@plt>
   b5389:       85 c0                   test   %eax,%eax
   b538b:       75 02                   jne    b538f <std::__1::mutex::lock()@@Base+0xf>
   b538d:       5d                      pop    %rbp
   b538e:       c3                      retq
   b538f:       48 8d 35 e8 a0 f8 ff    lea    -0x75f18(%rip),%rsi        # 3f47e <typeinfo name for std::bad_any_cast@@Base-0x8df>
   b5396:       89 c7                   mov    %eax,%edi
   b5398:       e8 d3 d2 00 00          callq  c2670 <std::__1::__throw_system_error(int, char const*)@plt>

So in frame 5, can you print the value of 'ec' ? pthread_mutex_lock(3) says:

ERRORS
     The pthread_mutex_lock() function will fail if:

     [EINVAL]            The value specified by mutex is invalid.

     [EDEADLK]           A deadlock would occur if the thread blocked waiting
                         for mutex.

     [EOWNERDEAD]        The argument mutex points to a robust mutex and the
                         process containing the previous owning thread
                         terminated while holding the mutex lock.  The lock
                         was granted to the caller and it is up to the new
                         owner to make the state consistent.

     [ENOTRECOVERABLE]   The state protected by the mutex is not recoverable.

I wonder what the actual error is...
Comment 2 Ed Maste freebsd_committer freebsd_triage 2021-02-08 01:25:04 UTC
Was this reproducible or just occurred once?

If you have a moment can you try upstream lldb built from source (https://lldb.llvm.org/resources/build.html)? A lot of work is going into lldb FreeBSD support upstream so we're well positioned to work on them if they still happen there.
Comment 3 Xin LI freebsd_committer freebsd_triage 2021-02-08 01:35:44 UTC
It would take about a day for me to catch the offending program (more context at https://gitlab.isc.org/isc-projects/bind9/-/issues/2465 if you are interested) to enter the state that I need to debug, so it would take some time for me to try to reproduce the lldb issue.

What I did was fairly basic, basically I have a runaway BIND instance, which after receiving a SIGTERM won't exit, and I was attaching to the process and trying to get a backtrace for all threads.

Or in other words, I did:

sudo lldb --attach-pid `pgrep named` `which named`

and were going to do a "thread backtrace all", while doing that I might have pressed <tab> and lldb dumped core.

===

Should I build a lldb version with -g and -O0 just in case, by the way?
Comment 4 Ed Maste freebsd_committer freebsd_triage 2021-02-08 02:17:04 UTC
(In reply to Xin LI from comment #3)
> Should I build a lldb version with -g and -O0 just in case, by the way?

I think cmake setting `-DCMAKE_BUILD_TYPE=Debug` should do the right thing.
Comment 5 Dimitry Andric freebsd_committer freebsd_triage 2021-02-08 20:09:51 UTC
Looking at the stack trace, it seems the line editor was processing a tab keypress (while being in libedit via el_wgets), and then the autocompletion stuff got some error, which it tried to print via PrintAsync.

That uses a non-recursive mutex 'm_output_mutex' in the EditLine class, which is also locked just before calling into el_wgets(). So I think it's getting a EDEADLK, as described in lib/libthr/thread/thr_mutex.c:

   885  static int
   886  mutex_self_lock(struct pthread_mutex *m, const struct timespec *abstime)
   887  {
   888          struct timespec ts1, ts2;
   889          int ret;
   890
   891          switch (PMUTEX_TYPE(m->m_flags)) {
   892          case PTHREAD_MUTEX_ERRORCHECK:
   893          case PTHREAD_MUTEX_ADAPTIVE_NP:.
   894                  if (abstime) {
...
   904                  } else {
   905                          /*
   906                           * POSIX specifies that mutexes should return
   907                           * EDEADLK if a recursive lock is detected.
   908                           */
   909                          ret = EDEADLK;
   910                  }

I think this is an upstream bug. But it would be handy to have an easier reproduction scenario.
Comment 6 Xin LI freebsd_committer freebsd_triage 2021-02-12 16:47:46 UTC
(In reply to Dimitry Andric from comment #1)
The error was 11 (EDEADLK).
Comment 7 Xin LI freebsd_committer freebsd_triage 2021-02-12 16:49:24 UTC
(In reply to Ed Maste from comment #2)
Yes, this is reliably reproducible; the current known reproduction way would be to attach on named and type thread<tab>.

I'll see if I can find a smaller test case, as I need about a day or so to have named to enter the state.
Comment 8 Xin LI freebsd_committer freebsd_triage 2021-02-12 20:11:21 UTC
I was able to reliably reproduce the issue with the following test program:

<--<--<--<--<--<--<--<--<--<--<--<--<--<--

#include <sys/cdefs.h>

#include <string.h>
#include <pthread.h>
#include <unistd.h>

#define	WORKERS	9

void *
worker(void *p)
{
	for (;;)
		sleep(10);
}

int
main(int argc, char **argv)
{
	int i;
	pthread_t thr[WORKERS];

	memset(&thr, 0, sizeof(thr));

	for (i=0; i<WORKERS; i++) {
		pthread_create(&thr[i], NULL, worker, NULL);
	}

	for (i=0; i<WORKERS; i++) {
		pthread_join(thr[i], NULL);
	}

	/* NOTREACHED */
	return (0);
}

-->-->-->-->-->-->-->-->-->-->-->-->-->-->

Run the test program, then lldb attach to it, and type "thread<tab>" would crash lldb with the same crash.