Bug 95339

Summary: [libexec] [patch] rtld is thread-unsafe. fixes for dlopen mt behavior
Product: Base System Reporter: Kostik Belousov <kostikbel>
Component: binAssignee: Konstantin Belousov <kib>
Status: Closed FIXED    
Severity: Affects Only Me CC: ararslan, emaste, markj
Priority: Normal    
Version: Unspecified   
Hardware: Any   
OS: Any   

Description Kostik Belousov 2006-04-05 08:50:13 UTC
dlopen/dlclose rtld operations are not thread safe. Dynamic loader can
be easily segfaulted or trapped into assertion with parallel invocations
of rtld operations from multiple threads. Test case and fix for problems
exposed by the test are attached.

Fix: 

Please note that patch forces dlopen for dso to fail if another
thread run finalizers from the same dso at the time of the call.

joerg at britannica dot bec dot de does not like the patch and
promised to supply better fix, but still not available.--Wr2a3R3u0lSRqCrFwXuJGpNzYAomUy9IGkUm1DBLRAAh2b3Z
Content-Type: text/plain; name="file.diff"
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment; filename="file.diff"

Index: libexec/rtld-elf/rtld.c
===================================================================
RCS file: /usr/local/arch/ncvs/src/libexec/rtld-elf/rtld.c,v
retrieving revision 1.112
diff -u -r1.112 rtld.c
--- libexec/rtld-elf/rtld.c	24 Dec 2005 15:37:30 -0000	1.112
+++ libexec/rtld-elf/rtld.c	23 Mar 2006 10:44:27 -0000
@@ -105,7 +105,7 @@
 static int load_preload_objects(void);
 static Obj_Entry *load_object(const char *, const Obj_Entry *);
 static Obj_Entry *obj_from_addr(const void *);
-static void objlist_call_fini(Objlist *);
+static void objlist_call_fini(Objlist *, int *lockstate, unsigned long *gen);
 static void objlist_call_init(Objlist *);
 static void objlist_clear(Objlist *);
 static Objlist_Entry *objlist_find(Objlist *, const Obj_Entry *);
@@ -165,6 +165,7 @@
   STAILQ_HEAD_INITIALIZER(list_main);
 static Objlist list_fini =	/* Objects needing fini() calls */
   STAILQ_HEAD_INITIALIZER(list_fini);
+static unsigned long list_fini_gen = 0;
 
 static Elf_Sym sym_zero;	/* For resolving undefined weak refs. */
 
@@ -1168,8 +1169,10 @@
 	objlist_push_tail(list, obj);
 
     /* Add the object to the global fini list in the reverse order. */
-    if (obj->fini != (Elf_Addr)NULL)
+    if (obj->fini != (Elf_Addr)NULL) {
 	objlist_push_head(&list_fini, obj);
+	list_fini_gen++;
+    }
 }
 
 #ifndef FPTR_TARGET
@@ -1362,21 +1365,30 @@
  * non-NULL fini functions.
  */
 static void
-objlist_call_fini(Objlist *list)
+objlist_call_fini(Objlist *list, int *lockstate, unsigned long *gen)
 {
-    Objlist_Entry *elm;
+    Objlist_Entry *elm, *elm_tmp;
     char *saved_msg;
+    unsigned long saved_gen = *gen;
 
     /*
      * Preserve the current error message since a fini function might
      * call into the dynamic linker and overwrite it.
      */
     saved_msg = errmsg_save();
-    STAILQ_FOREACH(elm, list, link) {
+ again:
+    STAILQ_FOREACH_SAFE(elm, list, link, elm_tmp) {
 	if (elm->obj->refcount == 0) {
 	    dbg("calling fini function for %s at %p", elm->obj->path,
 	        (void *)elm->obj->fini);
+	    elm->obj->fini_now = true;
+	    wlock_release(rtld_bind_lock, *lockstate);
 	    call_initfini_pointer(elm->obj, elm->obj->fini);
+	    *lockstate = wlock_acquire(rtld_bind_lock);
+	    if (*gen != saved_gen) {
+		    saved_gen = *gen;
+		    goto again;
+	    }
 	}
     }
     errmsg_restore(saved_msg);
@@ -1390,7 +1402,7 @@
 static void
 objlist_call_init(Objlist *list)
 {
-    Objlist_Entry *elm;
+    Objlist_Entry *elm, *elm_tmp;
     char *saved_msg;
 
     /*
@@ -1398,7 +1410,7 @@
      * call into the dynamic linker and overwrite it.
      */
     saved_msg = errmsg_save();
-    STAILQ_FOREACH(elm, list, link) {
+    STAILQ_FOREACH_SAFE(elm, list, link, elm_tmp) {
 	dbg("calling init function for %s at %p", elm->obj->path,
 	    (void *)elm->obj->init);
 	call_initfini_pointer(elm->obj, elm->obj->init);
@@ -1563,15 +1575,18 @@
 rtld_exit(void)
 {
     Obj_Entry *obj;
+    int lockstate;
 
     dbg("rtld_exit()");
     /* Clear all the reference counts so the fini functions will be called. */
+    lockstate = wlock_acquire(rtld_bind_lock);
     for (obj = obj_list;  obj != NULL;  obj = obj->next)
 	obj->refcount = 0;
-    objlist_call_fini(&list_fini);
+    objlist_call_fini(&list_fini, &lockstate, &list_fini_gen);
     /* No need to remove the items from the list, since we are exiting. */
     if (!libmap_disable)
         lm_fini();
+    wlock_release(rtld_bind_lock, lockstate);
 }
 
 static void *
@@ -1685,10 +1700,10 @@
 	 * The object is no longer referenced, so we must unload it.
 	 * First, call the fini functions with no locks held.
 	 */
-	wlock_release(rtld_bind_lock, lockstate);
-	objlist_call_fini(&list_fini);
-	lockstate = wlock_acquire(rtld_bind_lock);
+	objlist_call_fini(&list_fini, &lockstate, &list_fini_gen);
+
 	objlist_remove_unref(&list_fini);
+	list_fini_gen++;
 
 	/* Finish cleaning up the newly-unreferenced objects. */
 	GDB_STATE(RT_DELETE,&root->linkmap);
@@ -1741,9 +1756,9 @@
     if (ld_tracing != NULL)
 	environ = (char **)*get_program_var_addr("environ");
 
+    lockstate = wlock_acquire(rtld_bind_lock);
     objlist_init(&initlist);
 
-    lockstate = wlock_acquire(rtld_bind_lock);
     GDB_STATE(RT_ADD,NULL);
 
     old_obj_tail = obj_tail;
@@ -1755,7 +1770,10 @@
 	obj = load_object(name, obj_main);
     }
 
-    if (obj) {
+    if (obj && obj->fini_now) {
+	obj = NULL;
+	_rtld_error("%s is running finalizers now", name);
+    } else if (obj) {
 	obj->dl_refcount++;
 	if (mode & RTLD_GLOBAL && objlist_find(&list_global, obj) == NULL)
 	    objlist_push_tail(&list_global, obj);
Index: libexec/rtld-elf/rtld.h
===================================================================
RCS file: /usr/local/arch/ncvs/src/libexec/rtld-elf/rtld.h,v
retrieving revision 1.37
diff -u -r1.37 rtld.h
--- libexec/rtld-elf/rtld.h	18 Dec 2005 19:43:32 -0000	1.37
+++ libexec/rtld-elf/rtld.h	23 Mar 2006 10:44:27 -0000
@@ -210,6 +210,7 @@
     bool jmpslots_done;		/* Already have relocated the jump slots */
     bool init_done;		/* Already have added object to init list */
     bool tls_done;		/* Already allocated offset for static TLS */
+    bool fini_now;		/* Finalizer for dso currently runs */
 
     struct link_map linkmap;	/* for GDB and dlinfo() */
     Objlist dldags;		/* Object belongs to these dlopened DAGs (%) */
How-To-Repeat: Use the following test by Kazuaki Oda <kaakun at highway dot ne dot jp>
to see errant behaviour:

#include <err.h>
#include <dlfcn.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define NTHREADS	10

void *func(void *dummy);

int main(void)
{
    pthread_t tids[NTHREADS];
    int error;
    int i;

    for (i = 0; i < NTHREADS; i++) {
	error = pthread_create(&tids[i], NULL, func, NULL);
	if (error)
	    errc(1, error, "pthread_create");
    }

    for (;;)
	sleep(1);

    /* NOTREACHED */

    exit(0);
}

void *func(void *dummy)
{
    void *h;
    unsigned long c = 0;

    for (;;) {
	    if ((h = dlopen("/usr/lib/libm.so", RTLD_NOW)) == NULL) {
		    fprintf(stderr, "dlopen: %s\n", dlerror());
		    continue;
	    }
	    if (dlclose(h) == -1)
		    fprintf(stderr, "dlclose: %s\n", dlerror());
	    if (c++ == 10000) {
		    printf(".\n");
		    c = 0;
	    }
    }

    /* NOTREACHED */

    return (NULL);
}
Comment 1 Mark Linimon 2008-07-16 17:17:39 UTC
----- Forwarded message from Oleg Dolgov <agile@sunbay.com> -----

Hi,

I'am able to reproduce this bug even with patched files
(rtld.c, rev 1.124, rtld.h, rev 1.38)

...
dlopen: /usr/lib/libm.so is running finalizers now
dlopen: (null)
dlopen: (null)
dlopen: (null)
dlopen: (null)
dlopen: (null)
dlopen: (null)
dlopen: /usr/lib/libm.so is running finalizers now
Segmentation fault (core dumped)
deimos# gdb test2 test2.core
GNU gdb 6.1.1 [FreeBSD]
Copyright 2004 Free Software Foundation, Inc.
GDB is free software, covered by the GNU General Public License, and you are
welcome to change it and/or distribute copies of it under certain conditions.
Type "show copying" to see the conditions.
There is absolutely no warranty for GDB.  Type "show warranty" for details.
This GDB was configured as "amd64-marcel-freebsd"...
Core was generated by `test2'.
Program terminated with signal 11, Segmentation fault.
Reading symbols from /lib/libthr.so.3...done.
Loaded symbols for /lib/libthr.so.3
Reading symbols from /lib/libc.so.7...done.
Loaded symbols for /lib/libc.so.7
Reading symbols from /usr/lib/libm.so...done.
Loaded symbols for /usr/lib/libm.so
Reading symbols from /libexec/ld-elf.so.1...done.
Loaded symbols for /libexec/ld-elf.so.1
#0  0x0000000800509abb in _rtld_thread_init () from /libexec/ld-elf.so.1
[New Thread 0x800a01f80 (LWP 100202)]
[New Thread 0x800a01e10 (LWP 100201)]
[New Thread 0x800a01ca0 (LWP 100200)]
[New Thread 0x800a01b30 (LWP 100199)]
[New Thread 0x800a019c0 (LWP 100198)]
[New Thread 0x800a01850 (LWP 100197)]
[New Thread 0x800a016e0 (LWP 100196)]
[New Thread 0x800a01570 (LWP 100194)]
[New Thread 0x800a01400 (LWP 100160)]
[New Thread 0x800a01290 (LWP 100150)]
[New Thread 0x800a01120 (LWP 100184)]
(gdb) info threads
  11 Thread 0x800a01120 (LWP 100184)  0x000000080080ac5c in nanosleep () 
  from /lib/libc.so.7
  10 Thread 0x800a01290 (LWP 100150)  0x0000000800798e5c in _umtx_op () from 
/lib/libc.so.7
  9 Thread 0x800a01400 (LWP 100160)  0x0000000800798e5c in _umtx_op () from 
/lib/libc.so.7
* 8 Thread 0x800a01570 (LWP 100194)  0x0000000800509abb in _rtld_thread_init 
() from /libexec/ld-elf.so.1
  7 Thread 0x800a016e0 (LWP 100196)  0x0000000800798e5c in _umtx_op () from 
/lib/libc.so.7
  6 Thread 0x800a01850 (LWP 100197)  0x000000080050ac0c in _rtld_thread_init 
  () from /libexec/ld-elf.so.1
  5 Thread 0x800a019c0 (LWP 100198)  0x0000000800798e5c in _umtx_op () from 
/lib/libc.so.7
  4 Thread 0x800a01b30 (LWP 100199)  0x0000000800798e5c in _umtx_op () from 
/lib/libc.so.7
  3 Thread 0x800a01ca0 (LWP 100200)  0x0000000800798e5c in _umtx_op () from 
/lib/libc.so.7
  2 Thread 0x800a01e10 (LWP 100201)  0x0000000800825e4c in write () from 
/lib/libc.so.7
  1 Thread 0x800a01f80 (LWP 100202)  0x0000000800798e5c in _umtx_op () from 
/lib/libc.so.7
(gdb) bt
#0  0x0000000800509abb in _rtld_thread_init () from /libexec/ld-elf.so.1
#1  0x0000000800519f39 in _rtld_thread_init () from /libexec/ld-elf.so.1
#2  0x0000000800509e09 in _rtld_thread_init () from /libexec/ld-elf.so.1
#3  0x000000080050731c in dlsym () from /libexec/ld-elf.so.1
#4  0x0000000800507c76 in dlopen () from /libexec/ld-elf.so.1
#5  0x00000000004008c3 in func ()
#6  0x000000080063ab98 in pthread_getprio () from /lib/libthr.so.3
#7  0x0000000000000000 in ?? ()
Cannot access memory at address 0x7fffff7fd000
(gdb) t 6
[Switching to thread 6 (Thread 0x800a01850 (LWP 100197))]#0  
0x000000080050ac0c in _rtld_thread_init ()
   from /libexec/ld-elf.so.1
(gdb) bt
#0  0x000000080050ac0c in _rtld_thread_init () from /libexec/ld-elf.so.1
#1  0x0000000800504c6c in dl_iterate_phdr () from /libexec/ld-elf.so.1
#2  0x00000008005075ac in dlclose () from /libexec/ld-elf.so.1
#3  0x00000000004008f7 in func ()
#4  0x000000080063ab98 in pthread_getprio () from /lib/libthr.so.3
#5  0x0000000000000000 in ?? ()
Cannot access memory at address 0x7fffff3fb000

p.s. seems that bug (bin/123932: amd(8) core dumps while load high) related 
to this.

p.p.s. FreeBSD 7.0-RELEASE (SCHED_ULE, SMP), amd64, 2xcpu 4-core, 6 Gb RAM.

----- End forwarded message -----
Comment 2 Oleg 2008-08-29 22:47:58 UTC
My investigation shows that patch is correct, there other place which
also thread-unsafe, see kern/126950 (rtld malloc is thread-unsafe).
Comment 3 Eitan Adler freebsd_committer freebsd_triage 2018-05-20 23:57:15 UTC
For bugs matching the following conditions:
- Status == In Progress
- Assignee == "bugs@FreeBSD.org"
- Last Modified Year <= 2017

Do
- Set Status to "Open"
Comment 4 Mark Johnston freebsd_committer freebsd_triage 2018-06-04 14:18:45 UTC
I believe all the issues around thread safety in rtld have been addressed over time. Any issues encountered in 2018 certainly deserve a new PR.
Comment 5 Alex Arslan 2021-04-07 20:36:42 UTC
It seems that issues persist in `dlerror` in the presence of threads. See https://github.com/JuliaLang/julia/issues/39582 where this was encountered with Julia on FreeBSD 11.
Comment 6 Konstantin Belousov freebsd_committer freebsd_triage 2021-04-08 04:52:42 UTC
(In reply to Alex Arslan from comment #5)
It would be better to create a new PR if only for tracking.

The preliminary version of the fix for the issue as I understand it, is
available at https://reviews.freebsd.org/D29633
Comment 7 commit-hook freebsd_committer freebsd_triage 2021-04-10 14:35:34 UTC
A commit in branch main references this bug:

URL: https://cgit.FreeBSD.org/src/commit/?id=4d9128da54f8f8e2a29190ffb18880c4f116a205

commit 4d9128da54f8f8e2a29190ffb18880c4f116a205
Author:     Konstantin Belousov <kib@FreeBSD.org>
AuthorDate: 2021-04-07 22:02:33 +0000
Commit:     Konstantin Belousov <kib@FreeBSD.org>
CommitDate: 2021-04-10 14:33:34 +0000

    rtld: make dlerror() thread-local

    PR:     95339
    Discussed with: arichardson
    Sponsored by:   The FreeBSD Foundation
    MFC after:      1 week
    Differential revision:  https://reviews.freebsd.org/D29633

 lib/libthr/thread/thr_private.h |  4 ++++
 lib/libthr/thread/thr_rtld.c    | 21 +++++++++++++++++++++
 libexec/rtld-elf/rtld.c         | 38 ++++++++++++++++++++------------------
 libexec/rtld-elf/rtld_lock.c    | 33 +++++++++++++++++++++++++++++++++
 libexec/rtld-elf/rtld_lock.h    |  7 ++++++-
 5 files changed, 84 insertions(+), 19 deletions(-)