--- b/contrib/gdb/gdb/gdbthread.h +++ b/contrib/gdb/gdb/gdbthread.h @@ -75,6 +75,8 @@ struct thread_info struct private_thread_info *private; }; +extern int thread_list_empty (void); + /* Create an empty thread list, or empty the existing one. */ extern void init_thread_list (void); --- b/contrib/gdb/gdb/infrun.c +++ b/contrib/gdb/gdb/infrun.c @@ -384,9 +384,22 @@ follow_inferior_reset_breakpoints (void) insert_breakpoints (); } +void +clear_step_resume_breakpoint_thread (void) +{ + if (step_resume_breakpoint) + step_resume_breakpoint->thread = -1; +} + +void +clear_step_resume_breakpoint (void) +{ + step_resume_breakpoint = NULL; +} + /* EXECD_PATHNAME is assumed to be non-NULL. */ -static void +void follow_exec (int pid, char *execd_pathname) { int saved_pid = pid; @@ -1648,7 +1661,8 @@ handle_inferior_event (struct execution_control_state *ecs) /* This causes the eventpoints and symbol table to be reset. Must do this now, before trying to determine whether to stop. */ - follow_exec (PIDGET (inferior_ptid), pending_follow.execd_pathname); + target_follow_exec (PIDGET (inferior_ptid), + pending_follow.execd_pathname); xfree (pending_follow.execd_pathname); stop_pc = read_pc_pid (ecs->ptid); --- b/contrib/gdb/gdb/objfiles.c +++ b/contrib/gdb/gdb/objfiles.c @@ -482,11 +482,11 @@ free_all_objfiles (void) { struct objfile *objfile, *temp; + clear_symtab_users (); ALL_OBJFILES_SAFE (objfile, temp) { free_objfile (objfile); } - clear_symtab_users (); } /* Relocate OBJFILE to NEW_OFFSETS. There should be OBJFILE->NUM_SECTIONS --- b/contrib/gdb/gdb/target.c +++ b/contrib/gdb/gdb/target.c @@ -1307,6 +1307,52 @@ target_async_mask (int mask) } /* Look through the list of possible targets for a target that can + follow forks. */ + +int +target_follow_fork (int follow_child) +{ + struct target_ops *t; + + for (t = current_target.beneath; t != NULL; t = t->beneath) + { + if (t->to_follow_fork != NULL) + { + int retval = t->to_follow_fork (t, follow_child); + if (targetdebug) + fprintf_unfiltered (gdb_stdlog, "target_follow_fork (%d) = %d\n", + follow_child, retval); + return retval; + } + } + + /* Some target returned a fork event, but did not know how to follow it. */ + internal_error (__FILE__, __LINE__, + "could not find a target to follow fork"); +} + +void +target_follow_exec (int pid, char *execd_pathname) +{ + struct target_ops *t; + + for (t = current_target.beneath; t != NULL; t = t->beneath) + { + if (t->to_follow_exec != NULL) + { + t->to_follow_exec (pid, execd_pathname); + if (targetdebug) + fprintf_unfiltered (gdb_stdlog, "target_follow_exec (%d, %s)\n", + pid, execd_pathname); + return; + } + } + + /* If target does not specify a follow_exec handler, call the default. */ + follow_exec (pid, execd_pathname); +} + +/* Look through the list of possible targets for a target that can execute a run or attach command without any other data. This is used to locate the default process stratum. 
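The new target_follow_fork and target_follow_exec entry points above both use GDB's usual delegation idiom: walk the stack of strata starting at current_target.beneath and let the first target that implements the hook handle the event. The standalone sketch below models only that idiom; toy_target_ops, the toy_* names and the three-layer stack are invented for illustration and are not GDB's real data structures.

#include <stdio.h>
#include <stddef.h>

/* Hypothetical, stripped-down stand-in for GDB's target_ops stack.  */
struct toy_target_ops
{
  const char *name;
  struct toy_target_ops *beneath;	/* next stratum down */
  int (*to_follow_fork) (struct toy_target_ops *, int);
};

static int
native_follow_fork (struct toy_target_ops *ops, int follow_child)
{
  printf ("%s handles follow_fork (follow_child=%d)\n",
	  ops->name, follow_child);
  return 0;
}

/* Walk the stack; the first stratum that defines the hook wins.  */
static int
toy_target_follow_fork (struct toy_target_ops *top, int follow_child)
{
  struct toy_target_ops *t;

  for (t = top->beneath; t != NULL; t = t->beneath)
    if (t->to_follow_fork != NULL)
      return t->to_follow_fork (t, follow_child);
  return -1;	/* no stratum knows how to follow a fork */
}

int
main (void)
{
  struct toy_target_ops native = { "native", NULL, native_follow_fork };
  struct toy_target_ops threads = { "threads", &native, NULL };
  struct toy_target_ops top = { "current", &threads, NULL };

  return toy_target_follow_fork (&top, 1);
}

The real target.c code in the hunk above differs mainly in that it raises internal_error() when no stratum can follow a fork, and falls back to follow_exec() when no stratum provides to_follow_exec.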
@@ -2159,9 +2205,9 @@ debug_to_remove_vfork_catchpoint (int pid) } static int -debug_to_follow_fork (int follow_child) +debug_to_follow_fork (struct target_ops* ops, int follow_child) { - int retval = debug_target.to_follow_fork (follow_child); + int retval = debug_target.to_follow_fork (ops, follow_child); fprintf_unfiltered (gdb_stdlog, "target_follow_fork (%d) = %d\n", follow_child, retval); --- b/contrib/gdb/gdb/target.h +++ b/contrib/gdb/gdb/target.h @@ -362,7 +362,8 @@ struct target_ops int (*to_remove_fork_catchpoint) (int); int (*to_insert_vfork_catchpoint) (int); int (*to_remove_vfork_catchpoint) (int); - int (*to_follow_fork) (int); + int (*to_follow_fork) (struct target_ops*, int); + void (*to_follow_exec) (int, char*); int (*to_insert_exec_catchpoint) (int); int (*to_remove_exec_catchpoint) (int); int (*to_reported_exec_events_per_exec_call) (void); @@ -761,8 +762,7 @@ extern void target_load (char *arg, int from_tty); This function returns 1 if the inferior should not be resumed (i.e. there is another event pending). */ -#define target_follow_fork(follow_child) \ - (*current_target.to_follow_fork) (follow_child) +int target_follow_fork (int follow_child); /* On some targets, we can catch an inferior exec event when it occurs. These functions insert/remove an already-created @@ -1248,4 +1248,6 @@ extern void push_remote_target (char *name, int from_tty); /* Blank target vector entries are initialized to target_ignore. */ void target_ignore (void); +void target_follow_exec (int pid, char *execd_pathname); + #endif /* !defined (TARGET_H) */ --- b/contrib/gdb/gdb/thread.c +++ b/contrib/gdb/gdb/thread.c @@ -65,6 +65,12 @@ static void restore_current_thread (ptid_t); static void switch_to_thread (ptid_t ptid); static void prune_threads (void); +int +thread_list_empty () +{ + return thread_list == NULL; +} + void delete_step_resume_breakpoint (void *arg) { --- b/gnu/usr.bin/gdb/arch/amd64/Makefile +++ b/gnu/usr.bin/gdb/arch/amd64/Makefile @@ -2,7 +2,7 @@ GENSRCS+= xm.h .if !defined(GDB_CROSS_DEBUGGER) -LIBSRCS+= fbsd-proc.c fbsd-threads.c gcore.c +LIBSRCS+= fbsd-nat.c fbsd-proc.c fbsd-threads.c gcore.c LIBSRCS+= amd64-nat.c amd64bsd-nat.c amd64fbsd-nat.c .endif LIBSRCS+= solib.c solib-svr4.c --- b/gnu/usr.bin/gdb/arch/amd64/init.c +++ b/gnu/usr.bin/gdb/arch/amd64/init.c @@ -115,6 +115,7 @@ extern initialize_file_ftype _initialize_tui_out; extern initialize_file_ftype _initialize_tui_regs; extern initialize_file_ftype _initialize_tui_stack; extern initialize_file_ftype _initialize_tui_win; +extern initialize_file_ftype _initialize_fbsdnat; void initialize_all_files (void) { @@ -231,4 +232,5 @@ initialize_all_files (void) _initialize_tui_regs (); _initialize_tui_stack (); _initialize_tui_win (); + _initialize_fbsdnat (); } --- b/gnu/usr.bin/gdb/arch/arm/Makefile +++ b/gnu/usr.bin/gdb/arch/arm/Makefile @@ -1,7 +1,7 @@ # $FreeBSD$ GENSRCS+= xm.h -LIBSRCS+= armfbsd-nat.c +LIBSRCS+= armfbsd-nat.c fbsd-nat.c LIBSRCS+= arm-tdep.c armfbsd-tdep.c solib.c solib-svr4.c .if !defined(GDB_CROSS_DEBUGGER) LIBSRCS+= fbsd-threads.c --- b/gnu/usr.bin/gdb/arch/arm/init.c +++ b/gnu/usr.bin/gdb/arch/arm/init.c @@ -113,6 +113,7 @@ extern initialize_file_ftype _initialize_tui_out; extern initialize_file_ftype _initialize_tui_regs; extern initialize_file_ftype _initialize_tui_stack; extern initialize_file_ftype _initialize_tui_win; +extern initialize_file_ftype _initialize_fbsdnat; void initialize_all_files (void) { @@ -225,4 +226,5 @@ initialize_all_files (void) _initialize_tui_regs (); 
_initialize_tui_stack (); _initialize_tui_win (); + _initialize_fbsdnat (); } --- b/gnu/usr.bin/gdb/arch/i386/Makefile +++ b/gnu/usr.bin/gdb/arch/i386/Makefile @@ -2,7 +2,7 @@ GENSRCS+= xm.h .if !defined(GDB_CROSS_DEBUGGER) -LIBSRCS+= fbsd-proc.c fbsd-threads.c gcore.c +LIBSRCS+= fbsd-nat.c fbsd-proc.c fbsd-threads.c gcore.c LIBSRCS+= i386-nat.c i386bsd-nat.c i386fbsd-nat.c .endif LIBSRCS+= solib.c solib-svr4.c --- b/gnu/usr.bin/gdb/arch/i386/init.c +++ b/gnu/usr.bin/gdb/arch/i386/init.c @@ -116,6 +116,7 @@ extern initialize_file_ftype _initialize_tui_out; extern initialize_file_ftype _initialize_tui_regs; extern initialize_file_ftype _initialize_tui_stack; extern initialize_file_ftype _initialize_tui_win; +extern initialize_file_ftype _initialize_fbsdnat; void initialize_all_files (void) { @@ -233,4 +234,5 @@ initialize_all_files (void) _initialize_tui_regs (); _initialize_tui_stack (); _initialize_tui_win (); + _initialize_fbsdnat (); } --- b/gnu/usr.bin/gdb/arch/ia64/Makefile +++ b/gnu/usr.bin/gdb/arch/ia64/Makefile @@ -1,7 +1,7 @@ # $FreeBSD$ .if !defined(GDB_CROSS_DEBUGGER) -LIBSRCS+= fbsd-proc.c fbsd-threads.c gcore.c +LIBSRCS+= fbsd-nat.c fbsd-proc.c fbsd-threads.c gcore.c LIBSRCS+= ia64-fbsd-nat.c .endif LIBSRCS+= solib.c solib-svr4.c --- b/gnu/usr.bin/gdb/arch/ia64/init.c +++ b/gnu/usr.bin/gdb/arch/ia64/init.c @@ -113,6 +113,7 @@ extern initialize_file_ftype _initialize_tui_out; extern initialize_file_ftype _initialize_tui_regs; extern initialize_file_ftype _initialize_tui_stack; extern initialize_file_ftype _initialize_tui_win; +extern initialize_file_ftype _initialize_fbsdnat; void initialize_all_files (void) { @@ -227,4 +228,5 @@ initialize_all_files (void) _initialize_tui_regs (); _initialize_tui_stack (); _initialize_tui_win (); + _initialize_fbsdnat (); } --- b/gnu/usr.bin/gdb/arch/mips/Makefile +++ b/gnu/usr.bin/gdb/arch/mips/Makefile @@ -4,7 +4,7 @@ # XXX Should set DEFAULT_BFD_VEC based on target. 
# .if !defined(GDB_CROSS_DEBUGGER) -LIBSRCS+= mipsfbsd-nat.c fbsd-threads.c +LIBSRCS+= fbsd-nat.c mipsfbsd-nat.c fbsd-threads.c .endif LIBSRCS+= solib.c solib-svr4.c LIBSRCS+= mips-tdep.c mipsfbsd-tdep.c fbsd-proc.c --- b/gnu/usr.bin/gdb/arch/mips/init.c +++ b/gnu/usr.bin/gdb/arch/mips/init.c @@ -112,6 +112,7 @@ extern initialize_file_ftype _initialize_tui_out; extern initialize_file_ftype _initialize_tui_regs; extern initialize_file_ftype _initialize_tui_stack; extern initialize_file_ftype _initialize_tui_win; +extern initialize_file_ftype _initialize_fbsdnat; void initialize_all_files (void) { @@ -230,4 +231,5 @@ initialize_all_files (void) _initialize_tui_regs (); _initialize_tui_stack (); _initialize_tui_win (); + _initialize_fbsdnat (); } --- b/gnu/usr.bin/gdb/arch/powerpc/Makefile +++ b/gnu/usr.bin/gdb/arch/powerpc/Makefile @@ -1,7 +1,7 @@ # $FreeBSD$ .if !defined(GDB_CROSS_DEBUGGER) -LIBSRCS+= fbsd-proc.c fbsd-threads.c gcore.c +LIBSRCS+= fbsd-nat.c fbsd-proc.c fbsd-threads.c gcore.c LIBSRCS+= ppcfbsd-nat.c .endif LIBSRCS+= solib.c solib-svr4.c --- b/gnu/usr.bin/gdb/arch/powerpc/init.c +++ b/gnu/usr.bin/gdb/arch/powerpc/init.c @@ -113,6 +113,7 @@ extern initialize_file_ftype _initialize_tui_out; extern initialize_file_ftype _initialize_tui_regs; extern initialize_file_ftype _initialize_tui_stack; extern initialize_file_ftype _initialize_tui_win; +extern initialize_file_ftype _initialize_fbsdnat; void initialize_all_files (void) { @@ -227,4 +228,5 @@ initialize_all_files (void) _initialize_tui_regs (); _initialize_tui_stack (); _initialize_tui_win (); + _initialize_fbsdnat (); } --- b/gnu/usr.bin/gdb/arch/powerpc64/Makefile +++ b/gnu/usr.bin/gdb/arch/powerpc64/Makefile @@ -1,7 +1,7 @@ # $FreeBSD$ .if !defined(GDB_CROSS_DEBUGGER) -LIBSRCS+= fbsd-proc.c fbsd-threads.c gcore.c +LIBSRCS+= fbsd-nat.c fbsd-proc.c fbsd-threads.c gcore.c LIBSRCS+= ppcfbsd-nat.c .endif LIBSRCS+= solib.c solib-svr4.c --- b/gnu/usr.bin/gdb/arch/powerpc64/init.c +++ b/gnu/usr.bin/gdb/arch/powerpc64/init.c @@ -113,6 +113,7 @@ extern initialize_file_ftype _initialize_tui_out; extern initialize_file_ftype _initialize_tui_regs; extern initialize_file_ftype _initialize_tui_stack; extern initialize_file_ftype _initialize_tui_win; +extern initialize_file_ftype _initialize_fbsdnat; void initialize_all_files (void) { @@ -227,4 +228,5 @@ initialize_all_files (void) _initialize_tui_regs (); _initialize_tui_stack (); _initialize_tui_win (); + _initialize_fbsdnat (); } --- b/gnu/usr.bin/gdb/arch/sparc64/init.c +++ b/gnu/usr.bin/gdb/arch/sparc64/init.c @@ -114,6 +114,7 @@ extern initialize_file_ftype _initialize_tui_out; extern initialize_file_ftype _initialize_tui_regs; extern initialize_file_ftype _initialize_tui_stack; extern initialize_file_ftype _initialize_tui_win; +extern initialize_file_ftype _initialize_fbsdnat; void initialize_all_files (void) { @@ -229,4 +230,5 @@ initialize_all_files (void) _initialize_tui_regs (); _initialize_tui_stack (); _initialize_tui_win (); + _initialize_fbsdnat (); } --- /dev/null +++ b/gnu/usr.bin/gdb/libgdb/fbsd-nat.c @@ -0,0 +1,342 @@ +/* Native-dependent code for FreeBSD. + + Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc. + + This file is part of GDB. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, + Boston, MA 02110-1301, USA. */ + +#include "defs.h" +#include "inferior.h" +#include "symfile.h" +#include "gdbcore.h" +#include "gdbthread.h" +#include "gdb_assert.h" +#include +#include +#include + +extern struct target_ops child_ops; +void clear_step_resume_breakpoint (void); +void clear_step_resume_breakpoint_thread (void); +void (*reactivate_threads) (char*) = NULL; +void (*disable_threads) (void) = NULL; + +static void (*mourn_inferior_beneath) (void); +static void (*detach_beneath) (char *args, int from_tty); +static ptid_t (*wait_beneath) (ptid_t ptid, + struct target_waitstatus *ourstatus); +int follow_event_pid = 0; + +/* Return a the name of file that can be opened to get the symbols for + the child process identified by PID. */ + +char * +fbsd_pid_to_exec_file (int pid) +{ + size_t len = MAXPATHLEN; + char *buf = xcalloc (len, sizeof (char)); + char *path; + +#ifdef KERN_PROC_PATHNAME + int mib[4]; + + mib[0] = CTL_KERN; + mib[1] = KERN_PROC; + mib[2] = KERN_PROC_PATHNAME; + mib[3] = pid; + if (sysctl (mib, 4, buf, &len, NULL, 0) == 0) + return buf; +#endif + + path = xstrprintf ("/proc/%d/file", pid); + if (readlink (path, buf, MAXPATHLEN) == -1) + { + xfree (buf); + buf = NULL; + } + + xfree (path); + return buf; +} + +/* Wait for the child specified by PTID to do something. Return the + process ID of the child, or MINUS_ONE_PTID in case of error; store + the status in *OURSTATUS. */ + +static ptid_t +inf_ptrace_wait (ptid_t ptid, struct target_waitstatus *ourstatus) +{ + pid_t pid; + int status, save_errno; + + do + { + set_sigint_trap (); + set_sigio_trap (); + do + { + pid = waitpid (PIDGET (ptid), &status, 0); + save_errno = errno; + } + while (pid == -1 && errno == EINTR); + + clear_sigio_trap (); + clear_sigint_trap (); + + if (pid == -1) + { + fprintf_unfiltered (gdb_stderr, + _("Child process unexpectedly missing: %s.\n"), + safe_strerror (save_errno)); + + /* Claim it exited with unknown signal. */ + ourstatus->kind = TARGET_WAITKIND_SIGNALLED; + ourstatus->value.sig = TARGET_SIGNAL_UNKNOWN; + return minus_one_ptid; + } + + /* Ignore terminated detached child processes. */ + if (!WIFSTOPPED (status) && pid != PIDGET (inferior_ptid)) + pid = -1; + } + while (pid == -1); + + store_waitstatus (ourstatus, status); + return pid_to_ptid (pid); +} + +static ptid_t +fbsd_wait (ptid_t ptid, struct target_waitstatus *ourstatus) +{ + long lwp; + struct ptrace_lwpinfo lwpinfo; + struct target_waitstatus stat; + ptid_t ret; + static ptid_t forking_child = {0,0,0}; + + ret = wait_beneath (ptid, ourstatus); + + if (PIDGET (ret) >= 0 && ourstatus->kind == TARGET_WAITKIND_STOPPED && + (ourstatus->value.sig == TARGET_SIGNAL_TRAP || + ourstatus->value.sig == TARGET_SIGNAL_STOP) && + (ptrace(PT_LWPINFO, PIDGET (ret), (caddr_t)&lwpinfo, + sizeof lwpinfo) == 0)) + { + if (lwpinfo.pl_flags & PL_FLAG_CHILD) + { + /* Leave the child in a stopped state until we get a fork event in + the parent. That's when we decide which process to follow. 
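fbsd_wait above classifies stops by asking the kernel for per-LWP state with ptrace(PT_LWPINFO) after fork tracing has been switched on with PT_FOLLOW_FORK. A minimal userland sketch of that kernel facility, independent of GDB, is shown below; it assumes a FreeBSD kernel that reports PL_FLAG_FORKED/PL_FLAG_CHILD and fills in pl_child_pid, and it simply detaches from both processes once the fork has been observed.

#include <sys/types.h>
#include <sys/ptrace.h>
#include <sys/wait.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int
main (void)
{
  struct ptrace_lwpinfo pl;
  pid_t pid, wpid;
  int events, status;

  pid = fork ();
  if (pid == 0)
    {
      /* Traced child: stop once, then fork a grandchild.  */
      ptrace (PT_TRACE_ME, 0, NULL, 0);
      raise (SIGSTOP);
      if (fork () == 0)
	_exit (0);
      _exit (0);
    }

  waitpid (pid, &status, 0);			/* initial SIGSTOP */
  ptrace (PT_FOLLOW_FORK, pid, NULL, 1);	/* enable fork events */
  ptrace (PT_CONTINUE, pid, (caddr_t) 1, 0);

  /* Expect one stop in the parent and one in the new child.  */
  for (events = 0; events < 2; )
    {
      wpid = waitpid (-1, &status, 0);
      if (wpid == -1)
	break;
      if (!WIFSTOPPED (status))
	continue;
      if (ptrace (PT_LWPINFO, wpid, (caddr_t) &pl, sizeof (pl)) != 0)
	break;
      if (pl.pl_flags & PL_FLAG_FORKED)
	printf ("parent %d forked child %d\n", wpid, (int) pl.pl_child_pid);
      if (pl.pl_flags & PL_FLAG_CHILD)
	printf ("child %d stopped after fork\n", wpid);
      if (pl.pl_flags & (PL_FLAG_FORKED | PL_FLAG_CHILD))
	events++;
      ptrace (PT_DETACH, wpid, (caddr_t) 1, 0);
    }
  return 0;
}

fbsd_wait performs the same classification, but instead of detaching it translates the flags into TARGET_WAITKIND_FORKED, TARGET_WAITKIND_IGNORE or TARGET_WAITKIND_EXECD for the core of GDB.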
*/ + ourstatus->kind = TARGET_WAITKIND_IGNORE; + forking_child = ret; + } + else if (lwpinfo.pl_flags & PL_FLAG_FORKED) + { + /* We'd better be in the middle of processing a fork() event. */ + gdb_assert (!ptid_equal (forking_child, null_ptid)); + ourstatus->kind = TARGET_WAITKIND_FORKED; + ourstatus->value.related_pid = lwpinfo.pl_child_pid; + forking_child = null_ptid; + } + else if (lwpinfo.pl_flags & PL_FLAG_EXEC && + PIDGET (ret) == follow_event_pid) + { + ourstatus->kind = TARGET_WAITKIND_EXECD; + ourstatus->value.execd_pathname = + xstrdup (fbsd_pid_to_exec_file (PIDGET (ret))); + } + } + + return ret; +} + +static void +fbsd_enable_event_reporting (int pid) +{ +#ifdef PT_FOLLOW_FORK + follow_event_pid = pid; + if (ptrace(PT_FOLLOW_FORK, pid, 0, 1) < 0) + error (_("Cannot follow fork on this target.")); +#endif +} + +static void +fbsd_post_attach (int pid) +{ + fbsd_enable_event_reporting (pid); +} + +static void +fbsd_post_startup_inferior (ptid_t ptid) +{ + fbsd_enable_event_reporting (PIDGET (ptid)); +} + +int +fbsd_follow_fork (struct target_ops *ops, int follow_child) +{ + ptid_t last_ptid, ret, child_ptid; + struct target_waitstatus last_status; + int parent_pid, child_pid; + struct target_waitstatus ourstatus; + + get_last_target_status (&last_ptid, &last_status); + parent_pid = PIDGET (last_ptid); + child_pid = last_status.value.related_pid; + + if (follow_child) + { + detach_breakpoints (child_pid); + remove_breakpoints (); + child_ptid = pid_to_ptid (child_pid); + + target_detach (NULL, 0); + inferior_ptid = child_ptid; + + /* Reinstall ourselves, since we might have been removed in + target_detach (which does other necessary cleanup). */ + push_target (ops); + + /* Need to restore some of the actions done by the threaded detach */ + if (reactivate_threads) + { + reactivate_threads (fbsd_pid_to_exec_file (child_pid)); + reactivate_threads = NULL; + } + + /* Reset breakpoints in the child as appropriate. */ + clear_step_resume_breakpoint_thread (); + follow_inferior_reset_breakpoints (); + + /* Enable fork/exec event reporting for the child. */ + fbsd_enable_event_reporting (child_pid); + } + else /* Follow parent */ + { + /* Before detaching from the child, remove all breakpoints from + it. (This won't actually modify the breakpoint list, but will + physically remove the breakpoints from the child.) */ + detach_breakpoints (child_pid); + ptrace (PT_DETACH, child_pid, (caddr_t) 1, 0); + } + + return 0; +} + +/* EXECD_PATHNAME is assumed to be non-NULL. */ + +static void +fbsd_follow_exec (int pid, char *execd_pathname) +{ + struct target_waitstatus status; + ptid_t ret = inferior_ptid; + + /* This is an exec event that we actually wish to pay attention to. + Refresh our symbol table to the newly exec'd program, remove any + momentary bp's, etc. + + If there are breakpoints, they aren't really inserted now, + since the exec() transformed our inferior into a fresh set + of instructions. + + We want to preserve symbolic breakpoints on the list, since + we have hopes that they can be reset after the new a.out's + symbol table is read. + + However, any "raw" breakpoints must be removed from the list + (e.g., the solib bp's), since their address is probably invalid + now. + + And, we DON'T want to call delete_breakpoints() here, since + that may write the bp's "shadow contents" (the instruction + value that was overwritten witha TRAP instruction). Since + we now have a new a.out, those shadow contents aren't valid. 
*/ + update_breakpoints_after_exec (); + + /* If there was one, it's gone now. We cannot truly step-to-next + statement through an exec(). */ + clear_step_resume_breakpoint (); + step_range_start = 0; + step_range_end = 0; + + /* What is this a.out's name? */ + printf_unfiltered (_("Executing new program: %s\n"), execd_pathname); + + /* We've followed the inferior through an exec. Therefore, the + inferior has essentially been killed & reborn. */ + + gdb_flush (gdb_stdout); + + /* Disable thread library */ + if (disable_threads) + { + disable_threads (); + disable_threads = NULL; + } + + generic_mourn_inferior (); + inferior_ptid = ret; + + /* That a.out is now the one to use. */ + exec_file_attach (execd_pathname, 0); + + /* And also is where symbols can be found. */ + symbol_file_add_main (execd_pathname, 0); + + /* Reset the shared library package. This ensures that we get + a shlib event when the child reaches "_start", at which point + the dld will have had a chance to initialize the child. */ +#if defined(SOLIB_RESTART) + SOLIB_RESTART (); +#endif +#ifdef SOLIB_CREATE_INFERIOR_HOOK + SOLIB_CREATE_INFERIOR_HOOK (PIDGET (inferior_ptid)); +#else + solib_create_inferior_hook (); +#endif + + /* Reinsert all breakpoints. (Those which were symbolic have + been reset to the proper address in the new a.out, thanks + to symbol_file_command...) */ + insert_breakpoints (); +} + +static void fbsd_mourn_inferior (void) +{ + follow_event_pid = 0; + mourn_inferior_beneath (); +} + +static void fbsd_detach (char *args, int from_tty) +{ + follow_event_pid = 0; + detach_beneath (args, from_tty); +} + +void +_initialize_fbsdnat (void) +{ + wait_beneath = inf_ptrace_wait; + detach_beneath = child_ops.to_detach; + mourn_inferior_beneath = child_ops.to_mourn_inferior; + child_ops.to_wait = fbsd_wait; + child_ops.to_detach = fbsd_detach; + child_ops.to_mourn_inferior = fbsd_mourn_inferior; + child_ops.to_post_attach = fbsd_post_attach; + child_ops.to_post_startup_inferior = fbsd_post_startup_inferior; + child_ops.to_follow_fork = fbsd_follow_fork; + child_ops.to_follow_exec = fbsd_follow_exec; +} --- b/gnu/usr.bin/gdb/libgdb/fbsd-threads.c +++ b/gnu/usr.bin/gdb/libgdb/fbsd-threads.c @@ -68,6 +68,9 @@ extern struct target_ops core_ops; /* Pointer to the next function on the objfile event chain. */ static void (*target_new_objfile_chain) (struct objfile *objfile); + +/* Non-zero while processing thread library re-activation after fork() */ +static int fbsd_forking; /* Non-zero if there is a thread module */ static int fbsd_thread_present; @@ -154,6 +157,10 @@ static int fbsd_thread_alive (ptid_t ptid); static void attach_thread (ptid_t ptid, const td_thrhandle_t *th_p, const td_thrinfo_t *ti_p, int verbose); static void fbsd_thread_detach (char *args, int from_tty); +extern void (*reactivate_threads) (char*); +extern void (*disable_threads) (void); +static void fbsd_thread_activate (void); +static void fbsd_thread_deactivate (void); /* Building process ids. */ @@ -405,15 +412,50 @@ disable_thread_event_reporting (void) td_death_bp_addr = 0; } +static void +fbsd_thread_reactivate_after_fork (char *pathname) +{ + fbsd_forking = 1; + + /* That a.out is now the one to use. */ + exec_file_attach (pathname, 0); + + /* And also is where symbols can be found. 
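Once _initialize_fbsdnat has hooked the child target, GDB's standard follow-fork machinery becomes usable on FreeBSD. A hypothetical session might look like the fragment below; the program names are invented and intermediate output is omitted, but the final message is the one printed by fbsd_follow_exec above.

(gdb) set follow-fork-mode child
(gdb) run ./forker
...
Executing new program: /bin/ls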
*/ + symbol_file_add_main (pathname, 0); + push_target (&fbsd_thread_ops); + +#ifdef SOLIB_CREATE_INFERIOR_HOOK + SOLIB_CREATE_INFERIOR_HOOK (PIDGET (inferior_ptid)); +#else + solib_create_inferior_hook (); +#endif + fbsd_forking = 0; +} + +static void +fbsd_thread_disable_after_exec (void) +{ + if (fbsd_thread_active) + fbsd_thread_deactivate (); + + unpush_target (&fbsd_thread_ops); +} + static void fbsd_thread_activate (void) { fbsd_thread_active = 1; + reactivate_threads = fbsd_thread_reactivate_after_fork; + disable_threads = fbsd_thread_disable_after_exec; init_thread_list(); if (fbsd_thread_core == 0) enable_thread_event_reporting (); - fbsd_thread_find_new_threads (); - get_current_thread (); + + if (!fbsd_forking) + { + fbsd_thread_find_new_threads (); + get_current_thread (); + } } static void @@ -626,7 +668,7 @@ fbsd_thread_resume (ptid_t ptid, int step, enum target_signal signo) } lwp = GET_LWP (work_ptid); - if (lwp == 0) + if (lwp == 0 && GET_THREAD (work_ptid) != 0) { /* check user thread */ ret = td_ta_map_id2thr_p (thread_agent, GET_THREAD(work_ptid), &th); @@ -790,6 +832,9 @@ fbsd_thread_wait (ptid_t ptid, struct target_waitstatus *ourstatus) ret = child_ops.to_wait (ptid, ourstatus); if (GET_PID(ret) >= 0 && ourstatus->kind == TARGET_WAITKIND_STOPPED) { + if (thread_list_empty ()) + fbsd_thread_find_new_threads (); + lwp = get_current_lwp (GET_PID(ret)); ret = thread_from_lwp (BUILD_LWP(lwp, GET_PID(ret)), &th, &ti); @@ -1065,6 +1110,9 @@ fbsd_thread_create_inferior (char *exec_file, char *allargs, char **env) static void fbsd_thread_post_startup_inferior (ptid_t ptid) { + if (child_ops.to_post_startup_inferior) + child_ops.to_post_startup_inferior (ptid); + if (fbsd_thread_present && !fbsd_thread_active) { /* The child process is now the actual multi-threaded --- b/share/man/man9/fpu_kern.9 +++ b/share/man/man9/fpu_kern.9 @@ -120,6 +120,16 @@ could be used from both kernel thread and syscall contexts. The .Fn fpu_kern_leave function correctly handles such contexts. +.It Dv FPU_KERN_NOCTX +Avoid nesting save area. +If the flag is specified, the +.Fa ctx +must be passed as +.Va NULL . +The flag should only be used for really short code blocks +which can be executed in a critical section. +It avoids the need to allocate the FPU context by the cost +of increased system latency. .El .El .Pp --- b/sys/amd64/amd64/fpu.c +++ b/sys/amd64/amd64/fpu.c @@ -348,7 +348,7 @@ fpuexit(struct thread *td) stop_emulating(); fpusave(curpcb->pcb_save); start_emulating(); - PCPU_SET(fpcurthread, 0); + PCPU_SET(fpcurthread, NULL); } critical_exit(); } @@ -603,6 +603,8 @@ fpudna(void) { critical_enter(); + KASSERT((curpcb->pcb_flags & PCB_FPUNOSAVE) == 0, + ("fpudna while in fpu_kern_enter(FPU_KERN_NOCTX)")); if (PCPU_GET(fpcurthread) == curthread) { printf("fpudna: fpcurthread == curthread %d times\n", ++err_count); @@ -636,7 +638,8 @@ fpudna(void) * fpu_initialstate, to ignite the XSAVEOPT * tracking engine. 
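The fpu_kern.9 text above spells out the contract for the new flag: ctx must be NULL, the work must be very short, and everything between enter and leave runs inside a critical section. A minimal kernel-side sketch of a caller follows, assuming amd64 and the flag added by this patch; short_fpu_block is a hypothetical helper and the actual SIMD work is elided.

#include <sys/param.h>
#include <sys/proc.h>
#include <machine/fpu.h>

/* Hypothetical helper: a very short FPU/SIMD computation.  */
static void
short_fpu_block(void)
{

	/*
	 * FPU_KERN_NOCTX: no save area is allocated, ctx must be NULL,
	 * and we stay in a critical section until fpu_kern_leave(), so
	 * no sleeping and no long-running loops in between.
	 */
	fpu_kern_enter(curthread, NULL, FPU_KERN_NOCTX);

	/* ... a handful of SSE/AVX instructions ... */

	fpu_kern_leave(curthread, NULL);
}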
*/ - bcopy(fpu_initialstate, curpcb->pcb_save, cpu_max_ext_state_size); + bcopy(fpu_initialstate, curpcb->pcb_save, + cpu_max_ext_state_size); fpurestore(curpcb->pcb_save); if (curpcb->pcb_initial_fpucw != __INITIAL_FPUCW__) fldcw(curpcb->pcb_initial_fpucw); @@ -934,11 +937,36 @@ fpu_kern_enter(struct thread *td, struct fpu_kern_ctx *ctx, u_int flags) { struct pcb *pcb; + KASSERT((flags & FPU_KERN_NOCTX) != 0 || ctx != NULL, + ("ctx is required when !FPU_KERN_NOCTX")); + pcb = td->td_pcb; + KASSERT((pcb->pcb_flags & PCB_FPUNOSAVE) == 0, + ("recursive fpu_kern_enter while in PCB_FPUNOSAVE state")); + if ((flags & FPU_KERN_NOCTX) != 0) { + critical_enter(); + stop_emulating(); + if (curthread == PCPU_GET(fpcurthread)) { + fpusave(curpcb->pcb_save); + PCPU_SET(fpcurthread, NULL); + } else { + KASSERT(PCPU_GET(fpcurthread) == NULL, + ("invalid fpcurthread")); + } + + /* + * This breaks XSAVEOPT tracker, but + * PCB_FPUNOSAVE state is supposed to never need to + * save FPU context at all. + */ + fpurestore(fpu_initialstate); + set_pcb_flags(pcb, PCB_KERNFPU | PCB_FPUNOSAVE | + PCB_FPUINITDONE); + return (0); + } if ((flags & FPU_KERN_KTHR) != 0 && is_fpu_kern_thread(0)) { ctx->flags = FPU_KERN_CTX_DUMMY; return (0); } - pcb = td->td_pcb; KASSERT(!PCB_USER_FPU(pcb) || pcb->pcb_save == get_pcb_user_save_pcb(pcb), ("mangled pcb_save")); ctx->flags = 0; @@ -957,15 +985,26 @@ fpu_kern_leave(struct thread *td, struct fpu_kern_ctx *ctx) { struct pcb *pcb; + pcb = td->td_pcb; + if ((pcb->pcb_flags & PCB_FPUNOSAVE) != 0) { + KASSERT(ctx == NULL, ("non-null ctx after FPU_KERN_NOCTX")); + KASSERT(PCPU_GET(fpcurthread) == NULL, + ("non-NULL fpcurthread for PCB_FPUNOSAVE")); + CRITICAL_ASSERT(td); + clear_pcb_flags(pcb, PCB_FPUNOSAVE | PCB_FPUINITDONE); + start_emulating(); + critical_exit(); + goto restore_flags; + } if (is_fpu_kern_thread(0) && (ctx->flags & FPU_KERN_CTX_DUMMY) != 0) return (0); KASSERT((ctx->flags & FPU_KERN_CTX_DUMMY) == 0, ("dummy ctx")); - pcb = td->td_pcb; critical_enter(); if (curthread == PCPU_GET(fpcurthread)) fpudrop(); critical_exit(); pcb->pcb_save = ctx->prev; +restore_flags: if (pcb->pcb_save == get_pcb_user_save_pcb(pcb)) { if ((pcb->pcb_flags & PCB_USERFPUINITDONE) != 0) { set_pcb_flags(pcb, PCB_FPUINITDONE); --- b/sys/amd64/amd64/initcpu.c +++ b/sys/amd64/amd64/initcpu.c @@ -88,6 +88,11 @@ static void init_amd(void) { + if (CPUID_TO_FAMILY(cpu_id) == 0x9) { + if ((cpu_feature2 & CPUID2_HV) == 0) + wrmsr(MSR_HWCR, rdmsr(MSR_HWCR) | (1 << 6)); + } + /* * Work around Erratum 721 for Family 10h and 12h processors. 
* These processors may incorrectly update the stack pointer --- b/sys/amd64/amd64/mp_machdep.c +++ b/sys/amd64/amd64/mp_machdep.c @@ -58,6 +58,12 @@ __FBSDID("$FreeBSD$"); #include #include +#include "opt_ddb.h" +#ifdef DDB +#include +#include +#endif + #include #include #include @@ -1415,6 +1421,13 @@ ipi_nmi_handler() cpustop_handler(); return (0); } + +#ifdef DDB +static int ddb_migrate_cpu = -1; +static int ddb_orig_cpu = -1; +static jmp_buf ddb_migrate_buf; +void db_command_loop(void); +#endif /* * Handle an IPI_STOP by saving our current context and spinning until we @@ -1429,6 +1442,9 @@ cpustop_handler(void) savectx(&stoppcbs[cpu]); +#ifdef DDB +migration_exited: +#endif /* Indicate that we are stopped */ CPU_SET_ATOMIC(cpu, &stopped_cpus); @@ -1436,6 +1452,21 @@ cpustop_handler(void) while (!CPU_ISSET(cpu, &started_cpus)) ia32_pause(); +#ifdef DDB + if (ddb_migrate_cpu == cpu) { + if (setjmp(ddb_migrate_buf)) { + db_printf("leaving cpu %d\n", cpu); + ddb_migrate_cpu = -1; + CPU_CLR_ATOMIC(cpu, &started_cpus); + CPU_SET_ATOMIC(ddb_orig_cpu, &started_cpus); + goto migration_exited; + } + db_printf("current cpu %d\n", cpu); + db_command_loop(); + panic("continued from migrated\n"); + } +#endif + CPU_CLR_ATOMIC(cpu, &started_cpus); CPU_CLR_ATOMIC(cpu, &stopped_cpus); @@ -1449,6 +1480,50 @@ cpustop_handler(void) } } +#ifdef DDB +DB_COMMAND(cpuret, db_cpuret) +{ + + if (ddb_migrate_cpu == -1) { + db_printf("not migrated\n"); + return; + } + longjmp(ddb_migrate_buf, 1); +} + +DB_COMMAND(cpu, db_cpu) +{ + int mcpu, currcpu; + + if (ddb_migrate_cpu != -1) { + db_printf("already migrated, return to orig cpu first\n"); + return; + } + if (!have_addr) { + db_printf("specify cpu to migrate\n"); + return; + } + mcpu = (int)addr; + if (mcpu < 0 || mcpu >= mp_ncpus) { + db_printf("cpu %d does not exist\n", mcpu); + return; + } + + ddb_migrate_cpu = mcpu; + currcpu = PCPU_GET(cpuid); + ddb_orig_cpu = cpu; + savectx(&stoppcbs[currcpu]); + CPU_CLR_ATOMIC(currcpu, &started_cpus); + CPU_SET_ATOMIC(currcpu, &stopped_cpus); + CPU_SET_ATOMIC(mcpu, &started_cpus); + while (!CPU_ISSET(currcpu, &started_cpus)) + ia32_pause(); + CPU_CLR_ATOMIC(currcpu, &started_cpus); + CPU_CLR_ATOMIC(currcpu, &stopped_cpus); + db_printf("current cpu %d\n", currcpu); +} +#endif + /* * Handle an IPI_SUSPEND by saving our current context and spinning until we * are resumed. 
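The new ddb commands let the operator run the DDB command loop on another stopped CPU and then return to the original one. Based only on the messages printed by the code above, a session could look roughly like this (prompts and interleaving are approximate):

db> cpu 2
current cpu 2
db> trace
...
db> cpuret
leaving cpu 2
current cpu 0
db>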
--- b/sys/amd64/include/fpu.h +++ b/sys/amd64/include/fpu.h @@ -85,6 +85,7 @@ void fpu_save_area_reset(struct savefpu *fsa); #define FPU_KERN_NORMAL 0x0000 #define FPU_KERN_NOWAIT 0x0001 #define FPU_KERN_KTHR 0x0002 +#define FPU_KERN_NOCTX 0x0004 #endif --- b/sys/amd64/include/pcb.h +++ b/sys/amd64/include/pcb.h @@ -79,6 +79,7 @@ struct pcb { #define PCB_FPUINITDONE 0x08 /* fpu state is initialized */ #define PCB_USERFPUINITDONE 0x10 /* fpu user state is initialized */ #define PCB_32BIT 0x40 /* process has 32 bit context (segs etc) */ +#define PCB_FPUNOSAVE 0x80 /* no save area for current FPU ctx */ uint16_t pcb_initial_fpucw; --- b/sys/dev/random/ivy.c +++ b/sys/dev/random/ivy.c @@ -58,7 +58,8 @@ static int random_ivy_read(void *, int); static struct random_hardware_source random_ivy = { .ident = "Hardware, Intel Secure Key RNG", .source = RANDOM_PURE_RDRAND, - .read = random_ivy_read + .read = random_ivy_read, + .entropy_cdev_name = "ivy", }; static inline int --- b/sys/dev/random/live_entropy_sources.c +++ b/sys/dev/random/live_entropy_sources.c @@ -28,6 +28,7 @@ #include __FBSDID("$FreeBSD$"); +#include #include #include #include @@ -38,6 +39,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include @@ -57,6 +59,41 @@ static struct les_head sources = LIST_HEAD_INITIALIZER(sources); */ static struct sx les_lock; /* need a sleepable lock */ +static int +entropy_read(struct cdev *dev, struct uio *uio, int flags) +{ + uint8_t buf[HARVESTSIZE]; + struct random_hardware_source *rsource; + ssize_t resid; + int c, error; + + sx_slock(&les_lock); + rsource = dev->si_drv1; + if (rsource == NULL) { + error = ENXIO; + } else { + error = 0; + resid = uio->uio_resid; + while (uio->uio_resid > 0) { + c = rsource->read(buf, sizeof(buf)); + if (c > 0) + error = uiomove(buf, c, uio); + if (error != 0 || c == 0) + break; + } + if (resid != uio->uio_resid) + error = 0; + } + sx_sunlock(&les_lock); + return (error); +} + +static struct cdevsw entropy_cdevsw = { + .d_version = D_VERSION, + .d_read = entropy_read, + .d_name = "entropy", +}; + void live_entropy_source_register(struct random_hardware_source *rsource) { @@ -66,8 +103,13 @@ live_entropy_source_register(struct random_hardware_source *rsource) les = malloc(sizeof(struct live_entropy_sources), M_ENTROPY, M_WAITOK); les->rsource = rsource; + les->dev = make_dev_credf(MAKEDEV_ETERNAL_KLD | MAKEDEV_WAITOK | + MAKEDEV_CHECKNAME, &entropy_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, + 0400, "entropy/%s", rsource->entropy_cdev_name); sx_xlock(&les_lock); + if (les->dev != NULL) + les->dev->si_drv1 = rsource; LIST_INSERT_HEAD(&sources, les, entries); sx_xunlock(&les_lock); } @@ -76,18 +118,27 @@ void live_entropy_source_deregister(struct random_hardware_source *rsource) { struct live_entropy_sources *les = NULL; + struct cdev *dev; KASSERT(rsource != NULL, ("invalid input to %s", __func__)); + dev = NULL; sx_xlock(&les_lock); - LIST_FOREACH(les, &sources, entries) + LIST_FOREACH(les, &sources, entries) { if (les->rsource == rsource) { LIST_REMOVE(les, entries); break; } + } + if (les != NULL) { + dev = les->dev; + if (dev != NULL) + dev->si_drv1 = NULL; + } sx_xunlock(&les_lock); - if (les != NULL) - free(les, M_ENTROPY); + if (dev != NULL) + destroy_dev(dev); + free(les, M_ENTROPY); } static int --- b/sys/dev/random/live_entropy_sources.h +++ b/sys/dev/random/live_entropy_sources.h @@ -38,6 +38,7 @@ struct live_entropy_sources { LIST_ENTRY(live_entropy_sources) entries; /* list of providers */ struct random_hardware_source *rsource; /* 
associated random adaptor */ + struct cdev *dev; }; extern struct mtx live_mtx; --- b/sys/dev/random/nehemiah.c +++ b/sys/dev/random/nehemiah.c @@ -55,7 +55,8 @@ static int random_nehemiah_read(void *, int); static struct random_hardware_source random_nehemiah = { .ident = "Hardware, VIA Nehemiah Padlock RNG", .source = RANDOM_PURE_NEHEMIAH, - .read = random_nehemiah_read + .read = random_nehemiah_read, + .entropy_cdev_name = "nehemiah", }; /* TODO: now that the Davies-Meyer hash is gone and we only use --- b/sys/dev/random/randomdev.h +++ b/sys/dev/random/randomdev.h @@ -55,6 +55,7 @@ struct random_adaptor { struct random_hardware_source { const char *ident; + const char *entropy_cdev_name; enum esource source; random_read_func_t *read; }; --- b/sys/fs/nullfs/null_subr.c +++ b/sys/fs/nullfs/null_subr.c @@ -251,6 +251,7 @@ null_nodeget(mp, lowervp, vpp) vp->v_type = lowervp->v_type; vp->v_data = xp; vp->v_vnlock = lowervp->v_vnlock; + vp->v_vflag = lowervp->v_vflag & VV_ROOT; error = insmntque1(vp, mp, null_insmntque_dtr, xp); if (error != 0) return (error); --- b/sys/fs/tmpfs/tmpfs.h +++ b/sys/fs/tmpfs/tmpfs.h @@ -51,7 +51,8 @@ #include #include #include -#include +#include +#include MALLOC_DECLARE(M_TMPFSMNT); MALLOC_DECLARE(M_TMPFSNAME); --- b/sys/fs/tmpfs/tmpfs_subr.c +++ b/sys/fs/tmpfs/tmpfs_subr.c @@ -53,9 +53,10 @@ __FBSDID("$FreeBSD$"); #include #include #include -#include #include +#include #include +#include #include #include --- b/sys/fs/tmpfs/tmpfs_vnops.c +++ b/sys/fs/tmpfs/tmpfs_vnops.c @@ -45,19 +45,12 @@ __FBSDID("$FreeBSD$"); #include #include #include -#include #include #include #include #include #include -#include -#include -#include -#include -#include - #include #include --- b/sys/i386/i386/initcpu.c +++ b/sys/i386/i386/initcpu.c @@ -651,6 +651,32 @@ init_transmeta(void) } #endif +static void +init_amd(void) +{ + +#ifdef CPU_ATHLON_SSE_HACK + /* + * Sometimes the BIOS doesn't enable SSE instructions. + * According to AMD document 20734, the mobile Duron, the + * (mobile) Athlon 4 and the Athlon MP support SSE. These + * correspond to cpu_id 0x66X or 0x67X. + */ + if ((cpu_feature & CPUID_XMM) == 0 && ((cpu_id & ~0xf) == 0x660 || + (cpu_id & ~0xf) == 0x670 || (cpu_id & ~0xf) == 0x680)) { + u_int regs[4]; + + wrmsr(MSR_HWCR, rdmsr(MSR_HWCR) & ~0x08000); + do_cpuid(1, regs); + cpu_feature = regs[3]; + } +#endif + if (CPUID_TO_FAMILY(cpu_id) == 0x9) { + if ((cpu_feature2 & CPUID2_HV) == 0) + wrmsr(MSR_HWCR, rdmsr(MSR_HWCR) | (1 << 6)); + } +} + /* * Initialize CR4 (Control register 4) to enable SSE instructions. */ @@ -725,26 +751,9 @@ initializecpu(void) break; } break; -#ifdef CPU_ATHLON_SSE_HACK case CPU_VENDOR_AMD: - /* - * Sometimes the BIOS doesn't enable SSE instructions. - * According to AMD document 20734, the mobile - * Duron, the (mobile) Athlon 4 and the Athlon MP - * support SSE. These correspond to cpu_id 0x66X - * or 0x67X. - */ - if ((cpu_feature & CPUID_XMM) == 0 && - ((cpu_id & ~0xf) == 0x660 || - (cpu_id & ~0xf) == 0x670 || - (cpu_id & ~0xf) == 0x680)) { - u_int regs[4]; - wrmsr(MSR_HWCR, rdmsr(MSR_HWCR) & ~0x08000); - do_cpuid(1, regs); - cpu_feature = regs[3]; - } + init_amd(); break; -#endif case CPU_VENDOR_CENTAUR: init_via(); break; --- b/sys/i386/i386/pmap.c +++ b/sys/i386/i386/pmap.c @@ -3477,17 +3477,21 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m, PMAP_LOCK(pmap); sched_pin(); - /* - * In the case that a page table page is not - * resident, we are creating it here. 
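With the live_entropy_sources changes above, every registered hardware source also appears as a root-only (0400) character device named after its entropy_cdev_name, for example /dev/entropy/ivy or /dev/entropy/nehemiah. A small userland sketch of pulling raw bytes from one of them follows; the device path assumes the Ivy Bridge RDRAND source registered above is present on the machine.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	unsigned char buf[16];
	ssize_t i, n;
	int fd;

	fd = open("/dev/entropy/ivy", O_RDONLY);	/* root only */
	if (fd == -1) {
		perror("open");
		return (1);
	}
	n = read(fd, buf, sizeof(buf));
	if (n > 0) {
		for (i = 0; i < n; i++)
			printf("%02x", buf[i]);
		putchar('\n');
	}
	close(fd);
	return (0);
}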
- */ - if (va < VM_MAXUSER_ADDRESS) { + pde = pmap_pde(pmap, va); + if ((*pde & PG_PS) != 0) { + /* PG_V is asserted by pmap_demote_pde */ + pmap_demote_pde(pmap, pde, va); + if (va < VM_MAXUSER_ADDRESS) { + mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); + mpte->wire_count++; + } + } else if (va < VM_MAXUSER_ADDRESS) { + /* + * In the case that a page table page is not resident, + * we are creating it here. + */ mpte = pmap_allocpte(pmap, va, M_WAITOK); } - - pde = pmap_pde(pmap, va); - if ((*pde & PG_PS) != 0) - panic("pmap_enter: attempted pmap_enter on 4MB page"); pte = pmap_pte_quick(pmap, va); /* --- b/sys/kern/kern_descrip.c +++ b/sys/kern/kern_descrip.c @@ -314,20 +314,25 @@ struct getdtablesize_args { int dummy; }; #endif -/* ARGSUSED */ + int sys_getdtablesize(struct thread *td, struct getdtablesize_args *uap) { - struct proc *p = td->td_proc; + struct proc *p; uint64_t lim; + int maxfd, res; + p = td->td_proc; PROC_LOCK(p); - td->td_retval[0] = - min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc); + res = lim_cur(p, RLIMIT_NOFILE); lim = racct_get_limit(td->td_proc, RACCT_NOFILE); PROC_UNLOCK(p); - if (lim < td->td_retval[0]) - td->td_retval[0] = lim; + maxfd = maxfilesperproc; + if (maxfd > res) + maxfd = res; + if (maxfd > lim) + maxfd = lim; + td->td_retval[0] = maxfd; return (0); } @@ -775,13 +780,8 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg) static int getmaxfd(struct proc *p) { - int maxfd; - - PROC_LOCK(p); - maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc); - PROC_UNLOCK(p); - return (maxfd); + return (imin(lim_cur_unlocked(p, RLIMIT_NOFILE), maxfilesperproc)); } /* --- b/sys/kern/kern_proc.c +++ b/sys/kern/kern_proc.c @@ -141,6 +141,10 @@ uma_zone_t proc_zone; int kstack_pages = KSTACK_PAGES; SYSCTL_INT(_kern, OID_AUTO, kstack_pages, CTLFLAG_RD, &kstack_pages, 0, "Kernel stack size in pages"); +static int vmmap_skip_res_cnt = 1; +SYSCTL_INT(_kern, OID_AUTO, proc_vmmap_skip_resident_count, CTLFLAG_RW, + &vmmap_skip_res_cnt, 0, + "Skip calculation of the pages resident count in kern.proc.vmmap"); CTASSERT(sizeof(struct kinfo_proc) == KINFO_PROC_SIZE); #ifdef COMPAT_FREEBSD32 @@ -2136,15 +2140,19 @@ int kern_proc_vmmap_out(struct proc *p, struct sbuf *sb) { vm_map_entry_t entry, tmp_entry; - unsigned int last_timestamp; + struct vattr va; + vm_map_t map; + vm_page_t m; + vm_object_t obj, tobj, lobj; char *fullpath, *freepath; struct kinfo_vmentry *kve; - struct vattr va; struct ucred *cred; - int error; struct vnode *vp; struct vmspace *vm; - vm_map_t map; + vm_pindex_t pindex; + vm_offset_t addr, clp; + unsigned int last_timestamp; + int error; PROC_LOCK_ASSERT(p, MA_OWNED); @@ -2162,44 +2170,57 @@ kern_proc_vmmap_out(struct proc *p, struct sbuf *sb) vm_map_lock_read(map); for (entry = map->header.next; entry != &map->header; entry = entry->next) { - vm_object_t obj, tobj, lobj; - vm_offset_t addr; - vm_paddr_t locked_pa; - int mincoreinfo; - if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) continue; bzero(kve, sizeof(*kve)); kve->kve_private_resident = 0; + kve->kve_resident = 0; obj = entry->object.vm_object; if (obj != NULL) { - VM_OBJECT_RLOCK(obj); + for (tobj = obj; tobj != NULL; + tobj = tobj->backing_object) { + VM_OBJECT_RLOCK(tobj); + lobj = tobj; + } if (obj->shadow_count == 1) kve->kve_private_resident = obj->resident_page_count; - } - kve->kve_resident = 0; - addr = entry->start; - while (addr < entry->end) { - locked_pa = 0; - mincoreinfo = pmap_mincore(map->pmap, addr, &locked_pa); - if (locked_pa != 0) - 
vm_page_unlock(PHYS_TO_VM_PAGE(locked_pa)); - if (mincoreinfo & MINCORE_INCORE) - kve->kve_resident++; - if (mincoreinfo & MINCORE_SUPER) - kve->kve_flags |= KVME_FLAG_SUPER; - addr += PAGE_SIZE; - } - - for (lobj = tobj = obj; tobj; tobj = tobj->backing_object) { - if (tobj != obj) - VM_OBJECT_RLOCK(tobj); - if (lobj != obj) - VM_OBJECT_RUNLOCK(lobj); - lobj = tobj; + if (vmmap_skip_res_cnt) + goto skip_resident_count; + for (addr = entry->start; addr < entry->end;) { + pindex = OFF_TO_IDX(entry->offset + addr - + entry->start); + for (tobj = obj;;) { + m = vm_page_lookup(tobj, pindex); + if (m != NULL) + break; + if (tobj->backing_object == NULL) + break; + pindex += OFF_TO_IDX( + tobj->backing_object_offset); + tobj = tobj->backing_object; + } + if (m == NULL) { + addr += PAGE_SIZE; + continue; + } + if (m->psind != 0) + kve->kve_flags |= KVME_FLAG_SUPER; + clp = addr + pagesizes[m->psind] <= entry->end ? + pagesizes[m->psind] : entry->end - addr; + kve->kve_resident += clp / PAGE_SIZE; + addr += pagesizes[m->psind]; + } +skip_resident_count: + for (tobj = obj; tobj != NULL; + tobj = tobj->backing_object) { + if (tobj != obj && tobj != lobj) + VM_OBJECT_RUNLOCK(tobj); + } + } else { + lobj = NULL; } kve->kve_start = entry->start; @@ -2229,7 +2250,7 @@ kern_proc_vmmap_out(struct proc *p, struct sbuf *sb) freepath = NULL; fullpath = ""; - if (lobj) { + if (lobj != NULL) { vp = NULL; switch (lobj->type) { case OBJT_DEFAULT: --- b/sys/kern/kern_resource.c +++ b/sys/kern/kern_resource.c @@ -1212,6 +1212,17 @@ lim_cur(struct proc *p, int which) return (rl.rlim_cur); } +rlim_t +lim_cur_unlocked(struct proc *p, int which) +{ + struct rlimit rl; + + PROC_LOCK(p); + lim_rlimit(p, which, &rl); + PROC_UNLOCK(p); + return (rl.rlim_cur); +} + /* * Return a copy of the entire rlimit structure for the system limit * specified by 'which' in the rlimit structure pointed to by 'rlp'. --- b/sys/kern/sys_pipe.c +++ b/sys/kern/sys_pipe.c @@ -1293,13 +1293,13 @@ pipe_write(fp, uio, active_cred, flags, td) } /* - * Don't return EPIPE if I/O was successful + * Don't return EPIPE if any byte was written. + * EINTR and other interrupts are handled by generic I/O layer. + * Do not pretend that I/O succeeded for obvious user error + * like EFAULT. */ - if ((wpipe->pipe_buffer.cnt == 0) && - (uio->uio_resid == 0) && - (error == EPIPE)) { + if (uio->uio_resid != orig_resid && error == EPIPE) error = 0; - } if (error == 0) vfs_timestamp(&wpipe->pipe_mtime); --- b/sys/kern/vfs_vnops.c +++ b/sys/kern/vfs_vnops.c @@ -2177,12 +2177,10 @@ vn_utimes_perm(struct vnode *vp, struct vattr *vap, struct ucred *cred, { int error; - error = VOP_ACCESSX(vp, VWRITE_ATTRIBUTES, cred, td); - /* - * From utimes(2): - * Grant permission if the caller is the owner of the file or - * the super-user. If the time pointer is null, then write + * Grant permission if the caller is the owner of the file, or + * the super-user, or has ACL_WRITE_ATTRIBUTES permission on + * on the file. If the time pointer is null, then write * permission on the file is also sufficient. * * From NFSv4.1, draft 21, 6.2.1.3.1, Discussion of Mask Attributes: @@ -2190,6 +2188,7 @@ vn_utimes_perm(struct vnode *vp, struct vattr *vap, struct ucred *cred, * will be allowed to set the times [..] to the current * server time. 
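The reworked resident-page accounting above is what ends up in each kinfo_vmentry returned by the kern.proc.vmmap sysctl, and the per-page walk can be skipped entirely through the new kern.proc_vmmap_skip_resident_count knob. A sketch of a consumer using libutil's kinfo_getvmmap(3) to print kve_resident and the superpage flag for each mapping of a given pid (compile with -lutil):

#include <sys/types.h>
#include <sys/user.h>
#include <libutil.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int
main(int argc, char **argv)
{
	struct kinfo_vmentry *kv;
	int i, cnt;

	if (argc != 2)
		return (1);
	kv = kinfo_getvmmap(atoi(argv[1]), &cnt);
	if (kv == NULL)
		return (1);
	for (i = 0; i < cnt; i++)
		printf("%#jx-%#jx resident %ju pages%s\n",
		    (uintmax_t)kv[i].kve_start, (uintmax_t)kv[i].kve_end,
		    (uintmax_t)kv[i].kve_resident,
		    (kv[i].kve_flags & KVME_FLAG_SUPER) ? " [super]" : "");
	free(kv);
	return (0);
}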
*/ + error = VOP_ACCESSX(vp, VWRITE_ATTRIBUTES, cred, td); if (error != 0 && (vap->va_vaflags & VA_UTIMES_NULL) != 0) error = VOP_ACCESS(vp, VWRITE, cred, td); return (error); --- b/sys/sys/resourcevar.h +++ b/sys/sys/resourcevar.h @@ -128,6 +128,7 @@ struct plimit *lim_alloc(void); void lim_copy(struct plimit *dst, struct plimit *src); rlim_t lim_cur(struct proc *p, int which); +rlim_t lim_cur_unlocked(struct proc *p, int which); void lim_fork(struct proc *p1, struct proc *p2); void lim_free(struct plimit *limp); struct plimit --- b/sys/vm/vm_fault.c +++ b/sys/vm/vm_fault.c @@ -1252,30 +1252,59 @@ vm_fault_copy_entry(vm_map_t dst_map, vm_map_t src_map, src_object = src_entry->object.vm_object; src_pindex = OFF_TO_IDX(src_entry->offset); + KASSERT(upgrade || dst_entry->object.vm_object == NULL, + ("vm_fault_copy_entry: vm_object not NULL")); if (upgrade && (dst_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) { dst_object = src_object; vm_object_reference(dst_object); } else { /* - * Create the top-level object for the destination entry. (Doesn't - * actually shadow anything - we copy the pages directly.) + * Create the top-level object for the destination + * entry. (Doesn't actually shadow anything - we copy + * the pages directly.) */ - dst_object = vm_object_allocate(OBJT_DEFAULT, - OFF_TO_IDX(dst_entry->end - dst_entry->start)); + vm_object_shadow(&dst_entry->object.vm_object, + &dst_entry->offset, OFF_TO_IDX(dst_entry->end - + dst_entry->start)); + dst_object = dst_entry->object.vm_object; #if VM_NRESERVLEVEL > 0 - dst_object->flags |= OBJ_COLORED; - dst_object->pg_color = atop(dst_entry->start); + if (dst_object != src_object) { + dst_object->flags |= OBJ_COLORED; + dst_object->pg_color = atop(dst_entry->start); + } #endif + + /* + * If not an upgrade, then enter the mappings in the + * pmap as read and/or execute accesses. Otherwise, + * enter them as write accesses. + * + * A writeable large page mapping is only created if + * all of the constituent small page mappings are + * modified. Marking PTEs as modified on inception + * allows promotion to happen without taking + * potentially large number of soft faults. + */ + access &= ~VM_PROT_WRITE; } + /* + * dst_entry->offset is either left unchanged in the upgrade + * case, or vm_object_shadow takes care of recalculating the + * offset depending on creation of the new object. + */ - VM_OBJECT_WLOCK(dst_object); - KASSERT(upgrade || dst_entry->object.vm_object == NULL, - ("vm_fault_copy_entry: vm_object not NULL")); - if (src_object != dst_object) { - dst_entry->object.vm_object = dst_object; - dst_entry->offset = 0; - dst_object->charge = dst_entry->end - dst_entry->start; + /* + * This can only happen for upgrade case, due to src_object + * reference bump above, and it means that all pages are + * private already. + */ + if (dst_object == src_object && + (src_entry->protection & VM_PROT_WRITE) == 0) { + KASSERT(upgrade, ("XXX")); + goto uncow; } + + VM_OBJECT_WLOCK(dst_object); if (fork_charge != NULL) { KASSERT(dst_entry->cred == NULL, ("vm_fault_copy_entry: leaked swp charge")); @@ -1290,19 +1319,6 @@ vm_fault_copy_entry(vm_map_t dst_map, vm_map_t src_map, } /* - * If not an upgrade, then enter the mappings in the pmap as - * read and/or execute accesses. Otherwise, enter them as - * write accesses. - * - * A writeable large page mapping is only created if all of - * the constituent small page mappings are modified. 
Marking - * PTEs as modified on inception allows promotion to happen - * without taking potentially large number of soft faults. - */ - if (!upgrade) - access &= ~VM_PROT_WRITE; - - /* * Loop through all of the virtual pages within the entry's * range, copying each page from the source object to the * destination object. Since the source is wired, those pages @@ -1408,6 +1424,7 @@ again: } VM_OBJECT_WUNLOCK(dst_object); if (upgrade) { +uncow: dst_entry->eflags &= ~(MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY); vm_object_deallocate(src_object); } --- b/sys/vm/vm_object.c +++ b/sys/vm/vm_object.c @@ -2096,17 +2096,19 @@ boolean_t vm_object_coalesce(vm_object_t prev_object, vm_ooffset_t prev_offset, vm_size_t prev_size, vm_size_t next_size, boolean_t reserved) { - vm_pindex_t next_pindex; + vm_object_t shadow_object; + vm_page_t m; + vm_pindex_t next_pindex, pi; + boolean_t ret; if (prev_object == NULL) return (TRUE); + ret = FALSE; VM_OBJECT_WLOCK(prev_object); if ((prev_object->type != OBJT_DEFAULT && prev_object->type != OBJT_SWAP) || - (prev_object->flags & OBJ_TMPFS) != 0) { - VM_OBJECT_WUNLOCK(prev_object); - return (FALSE); - } + (prev_object->flags & OBJ_TMPFS) != 0) + goto out; /* * Try to collapse the object first @@ -2114,24 +2116,61 @@ vm_object_coalesce(vm_object_t prev_object, vm_ooffset_t prev_offset, vm_object_collapse(prev_object); /* - * Can't coalesce if: . more than one reference . paged out . shadows - * another object . has a copy elsewhere (any of which mean that the - * pages not mapped to prev_entry may be in use anyway) + * Can't coalesce if shadows another object, which means that + * the pages not mapped to prev_entry may be in use anyway. */ - if (prev_object->backing_object != NULL) { - VM_OBJECT_WUNLOCK(prev_object); - return (FALSE); - } + if (prev_object->backing_object != NULL) + goto out; prev_size >>= PAGE_SHIFT; next_size >>= PAGE_SHIFT; next_pindex = OFF_TO_IDX(prev_offset) + prev_size; - if ((prev_object->ref_count > 1) && - (prev_object->size != next_pindex)) { - VM_OBJECT_WUNLOCK(prev_object); - return (FALSE); + /* + * If object has more than one reference or is larger than the + * end of the previous mapping, still allow coalescing map + * entries for the case when this is due to other mappings of + * the object into the current address space. + */ + if (prev_object->ref_count > 1 && prev_object->size != next_pindex) { + /* + * Only one mapping allowed, otherwise coalesce could + * result in the contradictory content in the regions. + */ + if ((prev_object->flags & OBJ_ONEMAPPING) == 0) + goto out; + + /* No pages in the region, either resident ... */ + m = vm_page_find_least(prev_object, next_pindex); + if (m != NULL && m->pindex < next_pindex + next_size) + goto out; + /* ... or swapped out. */ + if (prev_object->type == OBJT_SWAP) { + for (pi = next_pindex; pi < next_pindex + next_size; + pi++) { + if (vm_pager_has_page(prev_object, pi, NULL, + NULL)) + goto out; + } + } + + /* + * Region must be not shadowed, otherwise the + * instantiated page in the our (backing) object could + * leak to the shadow. + */ + LIST_FOREACH(shadow_object, &prev_object->shadow_head, + shadow_list) { + KASSERT(shadow_object->backing_object == prev_object, + ("corrupted shadow")); + if (shadow_object->backing_object_offset < + next_pindex + next_size && + shadow_object->backing_object_offset + + shadow_object->size > next_pindex) + goto out; + } } + ret = TRUE; /* * Account for the charge. 
@@ -2183,8 +2222,9 @@ vm_object_coalesce(vm_object_t prev_object, vm_ooffset_t prev_offset, if (next_pindex + next_size > prev_object->size) prev_object->size = next_pindex + next_size; +out: VM_OBJECT_WUNLOCK(prev_object); - return (TRUE); + return (ret); } void --- b/sys/vm/vm_page.h +++ b/sys/vm/vm_page.h @@ -227,6 +227,7 @@ struct vm_domain { long vmd_segs; /* bitmask of the segments */ boolean_t vmd_oom; int vmd_pass; /* local pagedaemon pass */ + int vmd_oom_seq; struct vm_page vmd_marker; /* marker for pagedaemon private use */ }; --- b/sys/vm/vm_pageout.c +++ b/sys/vm/vm_pageout.c @@ -117,7 +117,8 @@ __FBSDID("$FreeBSD$"); static void vm_pageout(void); static int vm_pageout_clean(vm_page_t); static void vm_pageout_scan(struct vm_domain *vmd, int pass); -static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int pass); +static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage, + int starting_page_shortage); struct proc *pageproc; @@ -147,6 +148,7 @@ int vm_pages_needed; /* Event on which pageout daemon sleeps */ int vm_pageout_deficit; /* Estimated number of pages deficit */ int vm_pageout_pages_needed; /* flag saying that the pageout daemon needs pages */ int vm_pageout_wakeup_thresh; +static int vm_pageout_oom_seq = 24; #if !defined(NO_SWAPPING) static int vm_pageout_req_swapout; /* XXX */ @@ -206,6 +208,10 @@ static int pageout_lock_miss; SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss, CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout"); +SYSCTL_INT(_vm, OID_AUTO, pageout_oom_seq, + CTLFLAG_RW, &vm_pageout_oom_seq, 0, + "side-to-side calls to oom detector to start OOM"); + #define VM_PAGEOUT_PAGE_COUNT 16 int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT; @@ -910,7 +916,8 @@ vm_pageout_scan(struct vm_domain *vmd, int pass) vm_page_t m, next; struct vm_pagequeue *pq; vm_object_t object; - int act_delta, addl_page_shortage, deficit, maxscan, page_shortage; + int act_delta, addl_page_shortage, deficit, maxscan; + int page_shortage, starting_page_shortage; int vnodes_skipped = 0; int maxlaunder; int lockmode; @@ -951,6 +958,7 @@ vm_pageout_scan(struct vm_domain *vmd, int pass) page_shortage = vm_paging_target() + deficit; } else page_shortage = deficit = 0; + starting_page_shortage = page_shortage; /* * maxlaunder limits the number of dirty pages we flush per scan. @@ -1309,6 +1317,15 @@ relock_queues: vm_pagequeue_unlock(pq); /* + * If we are critically low on one of RAM or swap and low on + * the other, kill the largest process. However, we avoid + * doing this on the first pass in order to give ourselves a + * chance to flush out dirty vnode-backed pages and to allow + * active pages to be moved to the inactive queue and reclaimed. + */ + vm_pageout_mightbe_oom(vmd, page_shortage, starting_page_shortage); + + /* * Compute the number of pages we want to try to move from the * active queue to the inactive queue. */ @@ -1431,15 +1448,6 @@ relock_queues: vm_req_vmdaemon(VM_SWAP_NORMAL); #endif } - - /* - * If we are critically low on one of RAM or swap and low on - * the other, kill the largest process. However, we avoid - * doing this on the first pass in order to give ourselves a - * chance to flush out dirty vnode-backed pages and to allow - * active pages to be moved to the inactive queue and reclaimed. - */ - vm_pageout_mightbe_oom(vmd, pass); } static int vm_pageout_oom_vote; @@ -1450,18 +1458,36 @@ static int vm_pageout_oom_vote; * failed to reach free target is premature. 
 */
 static void
-vm_pageout_mightbe_oom(struct vm_domain *vmd, int pass)
+vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage,
+    int starting_page_shortage)
 {
 	int old_vote;
 
-	if (pass <= 1 || !((swap_pager_avail < 64 && vm_page_count_min()) ||
-	    (swap_pager_full && vm_paging_target() > 0))) {
+	if (starting_page_shortage <= 0 || starting_page_shortage !=
+	    page_shortage) {
+#if 0
+		if (vmd->vmd_oom_seq != 0)
+			printf("CLR oom_seq %d ps %d sps %d\n", vmd->vmd_oom_seq, page_shortage, starting_page_shortage);
+#endif
+		vmd->vmd_oom_seq = 0;
+	} else
+		vmd->vmd_oom_seq++;
+	if (vmd->vmd_oom_seq < vm_pageout_oom_seq) {
 		if (vmd->vmd_oom) {
 			vmd->vmd_oom = FALSE;
 			atomic_subtract_int(&vm_pageout_oom_vote, 1);
 		}
 		return;
 	}
+#if 0
+printf("OOM oom_seq %d ps %d sps %d\n", vmd->vmd_oom_seq, page_shortage, starting_page_shortage);
+#endif
+
+	/*
+	 * Do not follow the call sequence until OOM condition is
+	 * cleared.
+	 */
+	vmd->vmd_oom_seq = 0;
 	if (vmd->vmd_oom)
 		return;
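The vm.pageout_oom_seq sysctl introduced above (default 24) sets how many consecutive scans with an unrelieved page shortage must pass before this function votes for an OOM kill, so raising it makes the OOM decision more patient on systems that reclaim memory slowly. A small sketch of reading and raising the knob with sysctlbyname (setting requires root; equivalent to running sysctl vm.pageout_oom_seq=120):

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	size_t len;
	int newval = 120, oldval;

	len = sizeof(oldval);
	if (sysctlbyname("vm.pageout_oom_seq", &oldval, &len,
	    &newval, sizeof(newval)) != 0) {
		perror("sysctlbyname");
		return (1);
	}
	printf("vm.pageout_oom_seq: %d -> %d\n", oldval, newval);
	return (0);
}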