[Midnightbsd-cvs] src [9944] trunk/sys/kern: sync with FreeBSD 10-stable

laffer1 at midnightbsd.org
Fri May 25 16:46:52 EDT 2018


Revision: 9944
          http://svnweb.midnightbsd.org/src/?rev=9944
Author:   laffer1
Date:     2018-05-25 16:46:51 -0400 (Fri, 25 May 2018)
Log Message:
-----------
Sync with FreeBSD 10-stable

Modified Paths:
--------------
    trunk/sys/kern/kern_cons.c
    trunk/sys/kern/kern_cpu.c
    trunk/sys/kern/kern_cpuset.c
    trunk/sys/kern/kern_descrip.c
    trunk/sys/kern/kern_environment.c
    trunk/sys/kern/kern_et.c
    trunk/sys/kern/kern_event.c
    trunk/sys/kern/kern_exec.c
    trunk/sys/kern/kern_exit.c
    trunk/sys/kern/kern_fail.c
    trunk/sys/kern/kern_fork.c
    trunk/sys/kern/kern_gzio.c
    trunk/sys/kern/kern_hhook.c
    trunk/sys/kern/kern_intr.c
    trunk/sys/kern/kern_jail.c
    trunk/sys/kern/kern_khelp.c
    trunk/sys/kern/kern_kthread.c
    trunk/sys/kern/kern_ktr.c

Modified: trunk/sys/kern/kern_cons.c
===================================================================
--- trunk/sys/kern/kern_cons.c	2018-05-25 20:39:59 UTC (rev 9943)
+++ trunk/sys/kern/kern_cons.c	2018-05-25 20:46:51 UTC (rev 9944)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1988 University of Utah.
  * Copyright (c) 1991 The Regents of the University of California.
@@ -38,9 +39,10 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/kern_cons.c 283333 2015-05-23 22:34:25Z ian $");
 
 #include "opt_ddb.h"
+#include "opt_syscons.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -155,6 +157,13 @@
 	 * Make the best console the preferred console.
 	 */
 	cnselect(best_cn);
+
+#ifdef EARLY_PRINTF
+	/*
+	 * Release early console.
+	 */
+	early_putc = NULL;
+#endif
 }
 
 void
@@ -432,10 +441,8 @@
 		case '\b':
 		case '\177':
 			if (lp > cp) {
-				if (visible) {
-					cnputc(c);
-					cnputs(" \b");
-				}
+				if (visible)
+					cnputs("\b \b");
 				lp--;
 			}
 			continue;
@@ -466,6 +473,15 @@
 	struct consdev *cn;
 	char *cp;
 
+#ifdef EARLY_PRINTF
+	if (early_putc != NULL) {
+		if (c == '\n')
+			early_putc('\r');
+		early_putc(c);
+		return;
+	}
+#endif
+
 	if (cn_mute || c == '\0')
 		return;
 	STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) {
@@ -497,6 +513,13 @@
 	int unlock_reqd = 0;
 
 	if (use_cnputs_mtx) {
+	  	/*
+		 * NOTE: Debug prints and/or witness printouts in
+		 * console driver clients can cause the "cnputs_mtx"
+		 * mutex to recurse. Simply return if that happens.
+		 */
+		if (mtx_owned(&cnputs_mtx))
+			return;
 		mtx_lock_spin(&cnputs_mtx);
 		unlock_reqd = 1;
 	}
@@ -641,3 +664,63 @@
 
 #endif
 
+/*
+ * Temporary support for sc(4) to vt(4) transition.
+ */
+static unsigned vty_prefer;
+static char vty_name[16];
+SYSCTL_STRING(_kern, OID_AUTO, vty, CTLFLAG_RDTUN, vty_name, 0,
+    "Console vty driver");
+
+int
+vty_enabled(unsigned vty)
+{
+	static unsigned vty_selected = 0;
+
+	if (vty_selected == 0) {
+		TUNABLE_STR_FETCH("kern.vty", vty_name, sizeof(vty_name));
+		do {
+#if defined(DEV_SC)
+			if (strcmp(vty_name, "sc") == 0) {
+				vty_selected = VTY_SC;
+				break;
+			}
+#endif
+#if defined(DEV_VT)
+			if (strcmp(vty_name, "vt") == 0) {
+				vty_selected = VTY_VT;
+				break;
+			}
+#endif
+			if (vty_prefer != 0) {
+				vty_selected = vty_prefer;
+				break;
+			}
+#if defined(DEV_SC)
+			vty_selected = VTY_SC;
+#elif defined(DEV_VT)
+			vty_selected = VTY_VT;
+#endif
+		} while (0);
+
+		if (vty_selected == VTY_VT)
+			strcpy(vty_name, "vt");
+		else if (vty_selected == VTY_SC)
+			strcpy(vty_name, "sc");
+	}
+	return ((vty_selected & vty) != 0);
+}
+
+void
+vty_set_preferred(unsigned vty)
+{
+
+	vty_prefer = vty;
+#if !defined(DEV_SC)
+	vty_prefer &= ~VTY_SC;
+#endif
+#if !defined(DEV_VT)
+	vty_prefer &= ~VTY_VT;
+#endif
+}
+

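The kern_cons.c changes above do three things: with EARLY_PRINTF, console output is routed through early_putc until cninit() releases it; cnputs() now returns instead of recursing on cnputs_mtx; and a new kern.vty tunable arbitrates the sc(4)-to-vt(4) transition, fetched once and cached in vty_selected. Below is a minimal, runnable userspace sketch of that select-once fallthrough pattern; pick_vty() and the VTY_* values are illustrative stand-ins, not taken from this commit.

/*
 * Userspace sketch of the kern.vty selection logic added above.
 * Only the do { ... } while (0) fallthrough and the one-time
 * caching mirror the committed code; names are illustrative.
 */
#include <stdio.h>
#include <string.h>

#define VTY_SC	0x01
#define VTY_VT	0x02

static unsigned
pick_vty(const char *tunable, unsigned prefer)
{
	static unsigned selected;	/* cached after the first call */

	if (selected == 0) {
		do {
			if (strcmp(tunable, "sc") == 0) {
				selected = VTY_SC;
				break;
			}
			if (strcmp(tunable, "vt") == 0) {
				selected = VTY_VT;
				break;
			}
			if (prefer != 0) {
				selected = prefer;
				break;
			}
			selected = VTY_SC;	/* build-time default */
		} while (0);
	}
	return (selected);
}

int
main(void)
{
	/* e.g. loader.conf sets kern.vty="vt" */
	printf("selected=%#x\n", pick_vty("vt", 0));
	return (0);
}

With both drivers compiled in, each case can break out of the loop early, and the explicit tunable always wins over the preference recorded by vty_set_preferred().
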
Modified: trunk/sys/kern/kern_cpu.c
===================================================================
--- trunk/sys/kern/kern_cpu.c	2018-05-25 20:39:59 UTC (rev 9943)
+++ trunk/sys/kern/kern_cpu.c	2018-05-25 20:46:51 UTC (rev 9944)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 2004-2007 Nate Lawson (SDG)
  * All rights reserved.
@@ -25,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/kern_cpu.c 266165 2014-05-15 18:07:35Z cperciva $");
 
 #include <sys/param.h>
 #include <sys/bus.h>
@@ -135,7 +136,8 @@
 static int		cf_verbose;
 TUNABLE_INT("debug.cpufreq.lowest", &cf_lowest_freq);
 TUNABLE_INT("debug.cpufreq.verbose", &cf_verbose);
-static SYSCTL_NODE(_debug, OID_AUTO, cpufreq, CTLFLAG_RD, NULL, "cpufreq debugging");
+static SYSCTL_NODE(_debug, OID_AUTO, cpufreq, CTLFLAG_RD, NULL,
+    "cpufreq debugging");
 SYSCTL_INT(_debug_cpufreq, OID_AUTO, lowest, CTLFLAG_RW, &cf_lowest_freq, 1,
     "Don't provide levels below this frequency.");
 SYSCTL_INT(_debug_cpufreq, OID_AUTO, verbose, CTLFLAG_RW, &cf_verbose, 1,
@@ -267,7 +269,7 @@
 	 * switching the main CPU.  XXXTODO: Need to think more about how to
 	 * handle having different CPUs at different frequencies.  
 	 */
-	if (mp_ncpus > 1 && !smp_active) {
+	if (mp_ncpus > 1 && !smp_started) {
 		device_printf(dev, "rejecting change, SMP not started yet\n");
 		error = ENXIO;
 		goto out;
@@ -1037,6 +1039,7 @@
 	if (cf_dev == NULL) {
 		device_printf(dev,
 	"warning: cpufreq_unregister called with no cpufreq device active\n");
+		free(devs, M_TEMP);
 		return (0);
 	}
 	cfcount = 0;

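Two functional fixes ride along with the style cleanups in kern_cpu.c: the SMP guard now tests smp_started rather than the retired smp_active, and cpufreq_unregister() frees the previously obtained device array before its early return, closing a small memory leak. A runnable userspace sketch of that early-return leak pattern follows; get_children() is an illustrative stand-in for the kernel's device_get_children(), not kernel API.

/*
 * Sketch of the leak fixed above: once the children array has been
 * allocated, every return path must free it, including the early
 * "nothing to do" path.
 */
#include <stdio.h>
#include <stdlib.h>

static int
get_children(int **listp, int *countp)
{
	*countp = 4;
	*listp = calloc(*countp, sizeof(int));
	return (*listp == NULL ? -1 : 0);
}

static int
example_unregister(int have_cf_dev)
{
	int *devs, devcount;

	if (get_children(&devs, &devcount) != 0)
		return (-1);
	if (!have_cf_dev) {
		fprintf(stderr, "no cpufreq device active\n");
		free(devs);	/* the fix: free before the early return */
		return (0);
	}
	/* ... scan devs for attached cpufreq children ... */
	free(devs);
	return (0);
}

int
main(void)
{
	return (example_unregister(0));
}
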
Modified: trunk/sys/kern/kern_cpuset.c
===================================================================
--- trunk/sys/kern/kern_cpuset.c	2018-05-25 20:39:59 UTC (rev 9943)
+++ trunk/sys/kern/kern_cpuset.c	2018-05-25 20:46:51 UTC (rev 9944)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 2008,  Jeffrey Roberson <jeff at freebsd.org>
  * All rights reserved.
@@ -29,7 +30,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/kern_cpuset.c 273736 2014-10-27 14:38:00Z hselasky $");
 
 #include "opt_ddb.h"
 
@@ -110,7 +111,7 @@
 
 /* Return the size of cpuset_t at the kernel level */
 SYSCTL_INT(_kern_sched, OID_AUTO, cpusetsize, CTLFLAG_RD,
-	0, sizeof(cpuset_t), "sizeof(cpuset_t)");
+    SYSCTL_NULL_INT_PTR, sizeof(cpuset_t), "sizeof(cpuset_t)");
 
 cpuset_t *cpuset_root;
 
@@ -303,7 +304,7 @@
  * empty as well as RDONLY flags.
  */
 static int
-cpuset_testupdate(struct cpuset *set, cpuset_t *mask)
+cpuset_testupdate(struct cpuset *set, cpuset_t *mask, int check_mask)
 {
 	struct cpuset *nset;
 	cpuset_t newmask;
@@ -312,13 +313,16 @@
 	mtx_assert(&cpuset_lock, MA_OWNED);
 	if (set->cs_flags & CPU_SET_RDONLY)
 		return (EPERM);
-	if (!CPU_OVERLAP(&set->cs_mask, mask))
-		return (EDEADLK);
-	CPU_COPY(&set->cs_mask, &newmask);
-	CPU_AND(&newmask, mask);
+	if (check_mask) {
+		if (!CPU_OVERLAP(&set->cs_mask, mask))
+			return (EDEADLK);
+		CPU_COPY(&set->cs_mask, &newmask);
+		CPU_AND(&newmask, mask);
+	} else
+		CPU_COPY(mask, &newmask);
 	error = 0;
 	LIST_FOREACH(nset, &set->cs_children, cs_siblings) 
-		if ((error = cpuset_testupdate(nset, &newmask)) != 0)
+		if ((error = cpuset_testupdate(nset, &newmask, 1)) != 0)
 			break;
 	return (error);
 }
@@ -370,11 +374,11 @@
 	if (root && !CPU_SUBSET(&root->cs_mask, mask))
 		return (EINVAL);
 	mtx_lock_spin(&cpuset_lock);
-	error = cpuset_testupdate(set, mask);
+	error = cpuset_testupdate(set, mask, 0);
 	if (error)
 		goto out;
+	CPU_COPY(mask, &set->cs_mask);
 	cpuset_update(set, mask);
-	CPU_COPY(mask, &set->cs_mask);
 out:
 	mtx_unlock_spin(&cpuset_lock);
 
@@ -618,26 +622,6 @@
 }
 
 /*
- * Calculate the ffs() of the cpuset.
- */
-int
-cpusetobj_ffs(const cpuset_t *set)
-{
-	size_t i;
-	int cbit;
-
-	cbit = 0;
-	for (i = 0; i < _NCPUWORDS; i++) {
-		if (set->__bits[i] != 0) {
-			cbit = ffsl(set->__bits[i]);
-			cbit += i * _NCPUBITS;
-			break;
-		}
-	}
-	return (cbit);
-}
-
-/*
  * Return a string representing a valid layout for a cpuset_t object.
  * It expects an incoming buffer at least sized as CPUSETBUFSIZ.
  */
@@ -651,12 +635,12 @@
 	bytesp = 0;
 	bufsiz = CPUSETBUFSIZ;
 
-	for (i = _NCPUWORDS - 1; i > 0; i--) {
-		bytesp = snprintf(tbuf, bufsiz, "%lx, ", set->__bits[i]);
+	for (i = 0; i < (_NCPUWORDS - 1); i++) {
+		bytesp = snprintf(tbuf, bufsiz, "%lx,", set->__bits[i]);
 		bufsiz -= bytesp;
 		tbuf += bytesp;
 	}
-	snprintf(tbuf, bufsiz, "%lx", set->__bits[0]);
+	snprintf(tbuf, bufsiz, "%lx", set->__bits[_NCPUWORDS - 1]);
 	return (buf);
 }
 
@@ -682,16 +666,16 @@
 		return (-1);
 
 	CPU_ZERO(set);
-	for (i = nwords - 1; i > 0; i--) {
-		ret = sscanf(buf, "%lx, ", &set->__bits[i]);
+	for (i = 0; i < (nwords - 1); i++) {
+		ret = sscanf(buf, "%lx,", &set->__bits[i]);
 		if (ret == 0 || ret == -1)
 			return (-1);
-		buf = strstr(buf, " ");
+		buf = strstr(buf, ",");
 		if (buf == NULL)
 			return (-1);
 		buf++;
 	}
-	ret = sscanf(buf, "%lx", &set->__bits[0]);
+	ret = sscanf(buf, "%lx", &set->__bits[nwords - 1]);
 	if (ret == 0 || ret == -1)
 		return (-1);
 	return (0);
@@ -900,6 +884,7 @@
 	cpuwhich_t	which;
 	id_t		id;
 	cpusetid_t	*setid;
+};
 #endif
 int
 sys_cpuset_getid(struct thread *td, struct cpuset_getid_args *uap)
@@ -1146,10 +1131,27 @@
 }
 
 #ifdef DDB
+void
+ddb_display_cpuset(const cpuset_t *set)
+{
+	int cpu, once;
+
+	for (once = 0, cpu = 0; cpu < CPU_SETSIZE; cpu++) {
+		if (CPU_ISSET(cpu, set)) {
+			if (once == 0) {
+				db_printf("%d", cpu);
+				once = 1;
+			} else  
+				db_printf(",%d", cpu);
+		}
+	}
+	if (once == 0)
+		db_printf("<none>");
+}
+
 DB_SHOW_COMMAND(cpusets, db_show_cpusets)
 {
 	struct cpuset *set;
-	int cpu, once;
 
 	LIST_FOREACH(set, &cpuset_ids, cs_link) {
 		db_printf("set=%p id=%-6u ref=%-6d flags=0x%04x parent id=%d\n",
@@ -1156,15 +1158,7 @@
 		    set, set->cs_id, set->cs_ref, set->cs_flags,
 		    (set->cs_parent != NULL) ? set->cs_parent->cs_id : 0);
 		db_printf("  mask=");
-		for (once = 0, cpu = 0; cpu < CPU_SETSIZE; cpu++) {
-			if (CPU_ISSET(cpu, &set->cs_mask)) {
-				if (once == 0) {
-					db_printf("%d", cpu);
-					once = 1;
-				} else  
-					db_printf(",%d", cpu);
-			}
-		}
+		ddb_display_cpuset(&set->cs_mask);
 		db_printf("\n");
 		if (db_pager_quit)
 			break;

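Besides adding the missing `};` to the cpuset_getid_args declaration and factoring the DDB mask printer out into ddb_display_cpuset(), this file reverses the word order of the textual cpuset representation: cpusetobj_strprint() and cpusetobj_strscan() now walk the words lowest-index first, separated by bare commas, and the unused cpusetobj_ffs() helper is dropped. The following runnable sketch mirrors the new formatting loop; NWORDS and the buffer size stand in for the kernel's _NCPUWORDS and CPUSETBUFSIZ.

/*
 * Userspace sketch of the new cpusetobj_strprint() layout: words
 * are emitted lowest-index first with "%lx," and the last word is
 * printed without a trailing separator.
 */
#include <stdio.h>

#define NWORDS	4

static char *
strprint(char *buf, size_t bufsiz, const unsigned long *bits)
{
	char *tbuf = buf;
	size_t bytesp, i;

	for (i = 0; i < NWORDS - 1; i++) {
		bytesp = snprintf(tbuf, bufsiz, "%lx,", bits[i]);
		bufsiz -= bytesp;
		tbuf += bytesp;
	}
	snprintf(tbuf, bufsiz, "%lx", bits[NWORDS - 1]);
	return (buf);
}

int
main(void)
{
	unsigned long bits[NWORDS] = { 0xff, 0x0, 0x1, 0x0 };
	char buf[128];

	/* prints "ff,0,1,0" */
	printf("%s\n", strprint(buf, sizeof(buf), bits));
	return (0);
}
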
Modified: trunk/sys/kern/kern_descrip.c
===================================================================
--- trunk/sys/kern/kern_descrip.c	2018-05-25 20:39:59 UTC (rev 9943)
+++ trunk/sys/kern/kern_descrip.c	2018-05-25 20:46:51 UTC (rev 9944)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1982, 1986, 1989, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
@@ -35,7 +36,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/kern_descrip.c 321020 2017-07-15 17:25:40Z dchagin $");
 
 #include "opt_capsicum.h"
 #include "opt_compat.h"
@@ -46,9 +47,10 @@
 #include <sys/param.h>
 #include <sys/systm.h>
 
-#include <sys/capability.h>
+#include <sys/capsicum.h>
 #include <sys/conf.h>
 #include <sys/domain.h>
+#include <sys/fail.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
@@ -55,6 +57,7 @@
 #include <sys/filio.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
+#include <sys/ksem.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
@@ -103,41 +106,55 @@
 
 static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table");
 static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader",
-		     "file desc to leader structures");
+    "file desc to leader structures");
 static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");
+MALLOC_DEFINE(M_FILECAPS, "filecaps", "descriptor capabilities");
 
 MALLOC_DECLARE(M_FADVISE);
 
 static uma_zone_t file_zone;
 
+void	(*ksem_info)(struct ksem *ks, char *path, size_t size, uint32_t *value);
 
-/* Flags for do_dup() */
-#define DUP_FIXED	0x1	/* Force fixed allocation */
-#define DUP_FCNTL	0x2	/* fcntl()-style errors */
-#define	DUP_CLOEXEC	0x4	/* Atomically set FD_CLOEXEC. */
-
-static int do_dup(struct thread *td, int flags, int old, int new,
-    register_t *retval);
-static int	fd_first_free(struct filedesc *, int, int);
-static int	fd_last_used(struct filedesc *, int, int);
-static void	fdgrowtable(struct filedesc *, int);
+static int	closefp(struct filedesc *fdp, int fd, struct file *fp,
+		    struct thread *td, int holdleaders);
+static int	fd_first_free(struct filedesc *fdp, int low, int size);
+static int	fd_last_used(struct filedesc *fdp, int size);
+static void	fdgrowtable(struct filedesc *fdp, int nfd);
+static void	fdgrowtable_exp(struct filedesc *fdp, int nfd);
 static void	fdunused(struct filedesc *fdp, int fd);
 static void	fdused(struct filedesc *fdp, int fd);
-static int	fill_vnode_info(struct vnode *vp, struct kinfo_file *kif);
-static int	fill_socket_info(struct socket *so, struct kinfo_file *kif);
-static int	fill_pts_info(struct tty *tp, struct kinfo_file *kif);
 static int	fill_pipe_info(struct pipe *pi, struct kinfo_file *kif);
 static int	fill_procdesc_info(struct procdesc *pdp,
-    struct kinfo_file *kif);
+		    struct kinfo_file *kif);
+static int	fill_pts_info(struct tty *tp, struct kinfo_file *kif);
+static int	fill_sem_info(struct file *fp, struct kinfo_file *kif);
 static int	fill_shm_info(struct file *fp, struct kinfo_file *kif);
+static int	fill_socket_info(struct socket *so, struct kinfo_file *kif);
+static int	fill_vnode_info(struct vnode *vp, struct kinfo_file *kif);
+static int	getmaxfd(struct proc *p);
 
 /*
- * A process is initially started out with NDFILE descriptors stored within
- * this structure, selected to be enough for typical applications based on
- * the historical limit of 20 open files (and the usage of descriptors by
- * shells).  If these descriptors are exhausted, a larger descriptor table
- * may be allocated, up to a process' resource limit; the internal arrays
- * are then unused.
+ * Each process has:
+ *
+ * - An array of open file descriptors (fd_ofiles)
+ * - An array of file flags (fd_ofileflags)
+ * - A bitmap recording which descriptors are in use (fd_map)
+ *
+ * A process starts out with NDFILE descriptors.  The value of NDFILE has
+ * been selected based the historical limit of 20 open files, and an
+ * assumption that the majority of processes, especially short-lived
+ * processes like shells, will never need more.
+ *
+ * If this initial allocation is exhausted, a larger descriptor table and
+ * map are allocated dynamically, and the pointers in the process's struct
+ * filedesc are updated to point to those.  This is repeated every time
+ * the process runs out of file descriptors (provided it hasn't hit its
+ * resource limit).
+ *
+ * Since threads may hold references to individual descriptor table
+ * entries, the tables are never freed.  Instead, they are placed on a
+ * linked list and freed only when the struct filedesc is released.
  */
 #define NDFILE		20
 #define NDSLOTSIZE	sizeof(NDSLOTTYPE)
@@ -147,34 +164,22 @@
 #define	NDSLOTS(x)	(((x) + NDENTRIES - 1) / NDENTRIES)
 
 /*
- * Storage required per open file descriptor.
+ * SLIST entry used to keep track of ofiles which must be reclaimed when
+ * the process exits.
  */
-#define OFILESIZE (sizeof(struct file *) + sizeof(char))
-
-/*
- * Storage to hold unused ofiles that need to be reclaimed.
- */
 struct freetable {
-	struct file	**ft_table;
+	struct filedescent *ft_table;
 	SLIST_ENTRY(freetable) ft_next;
 };
 
 /*
- * Basic allocation of descriptors:
- * one of the above, plus arrays for NDFILE descriptors.
+ * Initial allocation: a filedesc structure + the head of SLIST used to
+ * keep track of old ofiles + enough space for NDFILE descriptors.
  */
 struct filedesc0 {
-	struct	filedesc fd_fd;
-	/*
-	 * ofiles which need to be reclaimed on free.
-	 */
-	SLIST_HEAD(,freetable) fd_free;
-	/*
-	 * These arrays are used when the number of open files is
-	 * <= NDFILE, and are then pointed to by the pointers above.
-	 */
-	struct	file *fd_dfiles[NDFILE];
-	char	fd_dfileflags[NDFILE];
+	struct filedesc fd_fd;
+	SLIST_HEAD(, freetable) fd_free;
+	struct	filedescent fd_dfiles[NDFILE];
 	NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)];
 };
 
@@ -183,14 +188,15 @@
  */
 volatile int openfiles;			/* actual number of open files */
 struct mtx sigio_lock;		/* mtx to protect pointers to sigio */
-void	(*mq_fdclose)(struct thread *td, int fd, struct file *fp);
+void (*mq_fdclose)(struct thread *td, int fd, struct file *fp);
 
 /* A mutex to protect the association between a proc and filedesc. */
-static struct mtx	fdesc_mtx;
+static struct mtx fdesc_mtx;
 
 /*
- * Find the first zero bit in the given bitmap, starting at low and not
- * exceeding size - 1.
+ * If low >= size, just return low. Otherwise find the first zero bit in the
+ * given bitmap, starting at low and not exceeding size - 1. Return size if
+ * not found.
  */
 static int
 fd_first_free(struct filedesc *fdp, int low, int size)
@@ -216,19 +222,16 @@
 }
 
 /*
- * Find the highest non-zero bit in the given bitmap, starting at low and
- * not exceeding size - 1.
+ * Find the highest non-zero bit in the given bitmap, starting at 0 and
+ * not exceeding size - 1. Return -1 if not found.
  */
 static int
-fd_last_used(struct filedesc *fdp, int low, int size)
+fd_last_used(struct filedesc *fdp, int size)
 {
 	NDSLOTTYPE *map = fdp->fd_map;
 	NDSLOTTYPE mask;
 	int off, minoff;
 
-	if (low >= size)
-		return (-1);
-
 	off = NDSLOT(size);
 	if (size % NDENTRIES) {
 		mask = ~(~(NDSLOTTYPE)0 << (size % NDENTRIES));
@@ -236,17 +239,21 @@
 			return (off * NDENTRIES + flsl(mask) - 1);
 		--off;
 	}
-	for (minoff = NDSLOT(low); off >= minoff; --off)
+	for (minoff = NDSLOT(0); off >= minoff; --off)
 		if (map[off] != 0)
 			return (off * NDENTRIES + flsl(map[off]) - 1);
-	return (low - 1);
+	return (-1);
 }
 
 static int
 fdisused(struct filedesc *fdp, int fd)
 {
-        KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
-            ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles));
+
+	FILEDESC_LOCK_ASSERT(fdp);
+
+	KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
+	    ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles));
+
 	return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0);
 }
 
@@ -258,9 +265,9 @@
 {
 
 	FILEDESC_XLOCK_ASSERT(fdp);
-	KASSERT(!fdisused(fdp, fd),
-	    ("fd already used"));
 
+	KASSERT(!fdisused(fdp, fd), ("fd=%d is already used", fd));
+
 	fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd);
 	if (fd > fdp->fd_lastfile)
 		fdp->fd_lastfile = fd;
@@ -276,19 +283,58 @@
 {
 
 	FILEDESC_XLOCK_ASSERT(fdp);
-	KASSERT(fdisused(fdp, fd),
-	    ("fd is already unused"));
-	KASSERT(fdp->fd_ofiles[fd] == NULL,
-	    ("fd is still in use"));
 
+	KASSERT(fdisused(fdp, fd), ("fd=%d is already unused", fd));
+	KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
+	    ("fd=%d is still in use", fd));
+
 	fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd);
 	if (fd < fdp->fd_freefile)
 		fdp->fd_freefile = fd;
 	if (fd == fdp->fd_lastfile)
-		fdp->fd_lastfile = fd_last_used(fdp, 0, fd);
+		fdp->fd_lastfile = fd_last_used(fdp, fd);
 }
 
 /*
+ * Free a file descriptor.
+ *
+ * Avoid some work if fdp is about to be destroyed.
+ */
+static inline void
+_fdfree(struct filedesc *fdp, int fd, int last)
+{
+	struct filedescent *fde;
+
+	fde = &fdp->fd_ofiles[fd];
+#ifdef CAPABILITIES
+	if (!last)
+		seq_write_begin(&fde->fde_seq);
+#endif
+	filecaps_free(&fde->fde_caps);
+	if (last)
+		return;
+	bzero(fde, fde_change_size);
+	fdunused(fdp, fd);
+#ifdef CAPABILITIES
+	seq_write_end(&fde->fde_seq);
+#endif
+}
+
+static inline void
+fdfree(struct filedesc *fdp, int fd)
+{
+
+	_fdfree(fdp, fd, 0);
+}
+
+static inline void
+fdfree_last(struct filedesc *fdp, int fd)
+{
+
+	_fdfree(fdp, fd, 1);
+}
+
+/*
  * System calls on descriptors.
  */
 #ifndef _SYS_SYSPROTO_H_
@@ -364,15 +410,21 @@
 int
 sys_fcntl(struct thread *td, struct fcntl_args *uap)
 {
+
+	return (kern_fcntl_freebsd(td, uap->fd, uap->cmd, uap->arg));
+}
+
+int
+kern_fcntl_freebsd(struct thread *td, int fd, int cmd, long arg)
+{
 	struct flock fl;
 	struct __oflock ofl;
-	intptr_t arg;
-	int error;
-	int cmd;
+	intptr_t arg1;
+	int error, newcmd;
 
 	error = 0;
-	cmd = uap->cmd;
-	switch (uap->cmd) {
+	newcmd = cmd;
+	switch (cmd) {
 	case F_OGETLK:
 	case F_OSETLK:
 	case F_OSETLKW:
@@ -379,7 +431,7 @@
 		/*
 		 * Convert old flock structure to new.
 		 */
-		error = copyin((void *)(intptr_t)uap->arg, &ofl, sizeof(ofl));
+		error = copyin((void *)(intptr_t)arg, &ofl, sizeof(ofl));
 		fl.l_start = ofl.l_start;
 		fl.l_len = ofl.l_len;
 		fl.l_pid = ofl.l_pid;
@@ -387,96 +439,62 @@
 		fl.l_whence = ofl.l_whence;
 		fl.l_sysid = 0;
 
-		switch (uap->cmd) {
+		switch (cmd) {
 		case F_OGETLK:
-		    cmd = F_GETLK;
-		    break;
+			newcmd = F_GETLK;
+			break;
 		case F_OSETLK:
-		    cmd = F_SETLK;
-		    break;
+			newcmd = F_SETLK;
+			break;
 		case F_OSETLKW:
-		    cmd = F_SETLKW;
-		    break;
+			newcmd = F_SETLKW;
+			break;
 		}
-		arg = (intptr_t)&fl;
+		arg1 = (intptr_t)&fl;
 		break;
-        case F_GETLK:
-        case F_SETLK:
-        case F_SETLKW:
+	case F_GETLK:
+	case F_SETLK:
+	case F_SETLKW:
 	case F_SETLK_REMOTE:
-                error = copyin((void *)(intptr_t)uap->arg, &fl, sizeof(fl));
-                arg = (intptr_t)&fl;
-                break;
+		error = copyin((void *)(intptr_t)arg, &fl, sizeof(fl));
+		arg1 = (intptr_t)&fl;
+		break;
 	default:
-		arg = uap->arg;
+		arg1 = arg;
 		break;
 	}
 	if (error)
 		return (error);
-	error = kern_fcntl(td, uap->fd, cmd, arg);
+	error = kern_fcntl(td, fd, newcmd, arg1);
 	if (error)
 		return (error);
-	if (uap->cmd == F_OGETLK) {
+	if (cmd == F_OGETLK) {
 		ofl.l_start = fl.l_start;
 		ofl.l_len = fl.l_len;
 		ofl.l_pid = fl.l_pid;
 		ofl.l_type = fl.l_type;
 		ofl.l_whence = fl.l_whence;
-		error = copyout(&ofl, (void *)(intptr_t)uap->arg, sizeof(ofl));
-	} else if (uap->cmd == F_GETLK) {
-		error = copyout(&fl, (void *)(intptr_t)uap->arg, sizeof(fl));
+		error = copyout(&ofl, (void *)(intptr_t)arg, sizeof(ofl));
+	} else if (cmd == F_GETLK) {
+		error = copyout(&fl, (void *)(intptr_t)arg, sizeof(fl));
 	}
 	return (error);
 }
 
-static inline struct file *
-fdtofp(int fd, struct filedesc *fdp)
-{
-	struct file *fp;
-
-	FILEDESC_LOCK_ASSERT(fdp);
-	if ((unsigned)fd >= fdp->fd_nfiles ||
-	    (fp = fdp->fd_ofiles[fd]) == NULL)
-		return (NULL);
-	return (fp);
-}
-
-static inline int
-fdunwrap(int fd, cap_rights_t rights, struct filedesc *fdp, struct file **fpp)
-{
-
-	*fpp = fdtofp(fd, fdp);
-	if (*fpp == NULL)
-		return (EBADF);
-
-#ifdef CAPABILITIES
-	if ((*fpp)->f_type == DTYPE_CAPABILITY) {
-		int err = cap_funwrap(*fpp, rights, fpp);
-		if (err != 0) {
-			*fpp = NULL;
-			return (err);
-		}
-	}
-#endif /* CAPABILITIES */
-	return (0);
-}
-
 int
 kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
 {
 	struct filedesc *fdp;
 	struct flock *flp;
-	struct file *fp;
+	struct file *fp, *fp2;
+	struct filedescent *fde;
 	struct proc *p;
-	char *pop;
 	struct vnode *vp;
+	cap_rights_t rights;
 	int error, flg, tmp;
-	int vfslocked;
-	u_int old, new;
 	uint64_t bsize;
 	off_t foffset;
 
-	vfslocked = 0;
 	error = 0;
 	flg = F_POSIX;
 	p = td->td_proc;
@@ -507,49 +525,44 @@
 
 	case F_GETFD:
 		FILEDESC_SLOCK(fdp);
-		if ((fp = fdtofp(fd, fdp)) == NULL) {
+		if ((fp = fget_locked(fdp, fd)) == NULL) {
 			FILEDESC_SUNLOCK(fdp);
 			error = EBADF;
 			break;
 		}
-		pop = &fdp->fd_ofileflags[fd];
-		td->td_retval[0] = (*pop & UF_EXCLOSE) ? FD_CLOEXEC : 0;
+		fde = &fdp->fd_ofiles[fd];
+		td->td_retval[0] =
+		    (fde->fde_flags & UF_EXCLOSE) ? FD_CLOEXEC : 0;
 		FILEDESC_SUNLOCK(fdp);
 		break;
 
 	case F_SETFD:
 		FILEDESC_XLOCK(fdp);
-		if ((fp = fdtofp(fd, fdp)) == NULL) {
+		if ((fp = fget_locked(fdp, fd)) == NULL) {
 			FILEDESC_XUNLOCK(fdp);
 			error = EBADF;
 			break;
 		}
-		pop = &fdp->fd_ofileflags[fd];
-		*pop = (*pop &~ UF_EXCLOSE) |
+		fde = &fdp->fd_ofiles[fd];
+		fde->fde_flags = (fde->fde_flags & ~UF_EXCLOSE) |
 		    (arg & FD_CLOEXEC ? UF_EXCLOSE : 0);
 		FILEDESC_XUNLOCK(fdp);
 		break;
 
 	case F_GETFL:
-		FILEDESC_SLOCK(fdp);
-		error = fdunwrap(fd, CAP_FCNTL, fdp, &fp);
-		if (error != 0) {
-			FILEDESC_SUNLOCK(fdp);
+		error = fget_unlocked(fdp, fd,
+		    cap_rights_init(&rights, CAP_FCNTL), F_GETFL, &fp, NULL);
+		if (error != 0)
 			break;
-		}
 		td->td_retval[0] = OFLAGS(fp->f_flag);
-		FILEDESC_SUNLOCK(fdp);
+		fdrop(fp, td);
 		break;
 
 	case F_SETFL:
-		FILEDESC_SLOCK(fdp);
-		error = fdunwrap(fd, CAP_FCNTL, fdp, &fp);
-		if (error != 0) {
-			FILEDESC_SUNLOCK(fdp);
+		error = fget_unlocked(fdp, fd,
+		    cap_rights_init(&rights, CAP_FCNTL), F_SETFL, &fp, NULL);
+		if (error != 0)
 			break;
-		}
-		fhold(fp);
-		FILEDESC_SUNLOCK(fdp);
 		do {
 			tmp = flg = fp->f_flag;
 			tmp &= ~FCNTLFLAGS;
@@ -557,7 +570,7 @@
 		} while(atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0);
 		tmp = fp->f_flag & FNONBLOCK;
 		error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
-		if (error) {
+		if (error != 0) {
 			fdrop(fp, td);
 			break;
 		}
@@ -574,14 +587,10 @@
 		break;
 
 	case F_GETOWN:
-		FILEDESC_SLOCK(fdp);
-		error = fdunwrap(fd, CAP_FCNTL, fdp, &fp);
-		if (error != 0) {
-			FILEDESC_SUNLOCK(fdp);
+		error = fget_unlocked(fdp, fd,
+		    cap_rights_init(&rights, CAP_FCNTL), F_GETOWN, &fp, NULL);
+		if (error != 0)
 			break;
-		}
-		fhold(fp);
-		FILEDESC_SUNLOCK(fdp);
 		error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td);
 		if (error == 0)
 			td->td_retval[0] = tmp;
@@ -589,14 +598,10 @@
 		break;
 
 	case F_SETOWN:
-		FILEDESC_SLOCK(fdp);
-		error = fdunwrap(fd, CAP_FCNTL, fdp, &fp);
-		if (error != 0) {
-			FILEDESC_SUNLOCK(fdp);
+		error = fget_unlocked(fdp, fd,
+		    cap_rights_init(&rights, CAP_FCNTL), F_SETOWN, &fp, NULL);
+		if (error != 0)
 			break;
-		}
-		fhold(fp);
-		FILEDESC_SUNLOCK(fdp);
 		tmp = arg;
 		error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td);
 		fdrop(fp, td);
@@ -615,17 +620,16 @@
 
 	case F_SETLK:
 	do_setlk:
-		FILEDESC_SLOCK(fdp);
-		error = fdunwrap(fd, CAP_FLOCK, fdp, &fp);
-		if (error != 0) {
-			FILEDESC_SUNLOCK(fdp);
+		cap_rights_init(&rights, CAP_FLOCK);
+		error = fget_unlocked(fdp, fd, &rights, 0, &fp, NULL);
+		if (error != 0)
 			break;
-		}
 		if (fp->f_type != DTYPE_VNODE) {
-			FILEDESC_SUNLOCK(fdp);
 			error = EBADF;
+			fdrop(fp, td);
 			break;
 		}
+
 		flp = (struct flock *)arg;
 		if (flp->l_whence == SEEK_CUR) {
 			foffset = foffset_get(fp);
@@ -634,18 +638,13 @@
 			     foffset > OFF_MAX - flp->l_start)) {
 				FILEDESC_SUNLOCK(fdp);
 				error = EOVERFLOW;
+				fdrop(fp, td);
 				break;
 			}
 			flp->l_start += foffset;
 		}
 
-		/*
-		 * VOP_ADVLOCK() may block.
-		 */
-		fhold(fp);
-		FILEDESC_SUNLOCK(fdp);
 		vp = fp->f_vnode;
-		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 		switch (flp->l_type) {
 		case F_RDLCK:
 			if ((fp->f_flag & FREAD) == 0) {
@@ -689,44 +688,60 @@
 			error = EINVAL;
 			break;
 		}
-		VFS_UNLOCK_GIANT(vfslocked);
-		vfslocked = 0;
-		/* Check for race with close */
-		FILEDESC_SLOCK(fdp);
-		if ((unsigned) fd >= fdp->fd_nfiles ||
-		    fp != fdp->fd_ofiles[fd]) {
-			FILEDESC_SUNLOCK(fdp);
+		if (error != 0 || flp->l_type == F_UNLCK ||
+		    flp->l_type == F_UNLCKSYS) {
+			fdrop(fp, td);
+			break;
+		}
+
+		/*
+		 * Check for a race with close.
+		 *
+		 * The vnode is now advisory locked (or unlocked, but this case
+		 * is not really important) as the caller requested.
+		 * We had to drop the filedesc lock, so we need to recheck if
+		 * the descriptor is still valid, because if it was closed
+		 * in the meantime we need to remove advisory lock from the
+		 * vnode - close on any descriptor leading to an advisory
+		 * locked vnode, removes that lock.
+		 * We will return 0 on purpose in that case, as the result of
+		 * successful advisory lock might have been externally visible
+		 * already. This is fine - effectively we pretend to the caller
+		 * that the closing thread was a bit slower and that the
+		 * advisory lock succeeded before the close.
+		 */
+		error = fget_unlocked(fdp, fd, &rights, 0, &fp2, NULL);
+		if (error != 0) {
+			fdrop(fp, td);
+			break;
+		}
+		if (fp != fp2) {
 			flp->l_whence = SEEK_SET;
 			flp->l_start = 0;
 			flp->l_len = 0;
 			flp->l_type = F_UNLCK;
-			vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 			(void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
-					   F_UNLCK, flp, F_POSIX);
-			VFS_UNLOCK_GIANT(vfslocked);
-			vfslocked = 0;
-		} else
-			FILEDESC_SUNLOCK(fdp);
+			    F_UNLCK, flp, F_POSIX);
+		}
 		fdrop(fp, td);
+		fdrop(fp2, td);
 		break;
 
 	case F_GETLK:
-		FILEDESC_SLOCK(fdp);
-		error = fdunwrap(fd, CAP_FLOCK, fdp, &fp);
-		if (error != 0) {
-			FILEDESC_SUNLOCK(fdp);
+		error = fget_unlocked(fdp, fd,
+		    cap_rights_init(&rights, CAP_FLOCK), 0, &fp, NULL);
+		if (error != 0)
 			break;
-		}
 		if (fp->f_type != DTYPE_VNODE) {
-			FILEDESC_SUNLOCK(fdp);
 			error = EBADF;
+			fdrop(fp, td);
 			break;
 		}
 		flp = (struct flock *)arg;
 		if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK &&
 		    flp->l_type != F_UNLCK) {
-			FILEDESC_SUNLOCK(fdp);
 			error = EINVAL;
+			fdrop(fp, td);
 			break;
 		}
 		if (flp->l_whence == SEEK_CUR) {
@@ -734,24 +749,17 @@
 			if ((flp->l_start > 0 &&
 			    foffset > OFF_MAX - flp->l_start) ||
 			    (flp->l_start < 0 &&
-			     foffset < OFF_MIN - flp->l_start)) {
+			    foffset < OFF_MIN - flp->l_start)) {
 				FILEDESC_SUNLOCK(fdp);
 				error = EOVERFLOW;
+				fdrop(fp, td);
 				break;
 			}
 			flp->l_start += foffset;
 		}
-		/*
-		 * VOP_ADVLOCK() may block.
-		 */
-		fhold(fp);
-		FILEDESC_SUNLOCK(fdp);
 		vp = fp->f_vnode;
-		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 		error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp,
 		    F_POSIX);
-		VFS_UNLOCK_GIANT(vfslocked);
-		vfslocked = 0;
 		fdrop(fp, td);
 		break;
 
@@ -759,41 +767,32 @@
 		arg = arg ? 128 * 1024: 0;
 		/* FALLTHROUGH */
 	case F_READAHEAD:
-		FILEDESC_SLOCK(fdp);
-		if ((fp = fdtofp(fd, fdp)) == NULL) {
-			FILEDESC_SUNLOCK(fdp);
-			error = EBADF;
+		error = fget_unlocked(fdp, fd, NULL, 0, &fp, NULL);
+		if (error != 0)
 			break;
-		}
 		if (fp->f_type != DTYPE_VNODE) {
-			FILEDESC_SUNLOCK(fdp);
+			fdrop(fp, td);
 			error = EBADF;
 			break;
 		}
-		fhold(fp);
-		FILEDESC_SUNLOCK(fdp);
-		if (arg != 0) {
-			vp = fp->f_vnode;
-			vfslocked = VFS_LOCK_GIANT(vp->v_mount);
-			error = vn_lock(vp, LK_SHARED);
-			if (error != 0)
-				goto readahead_vnlock_fail;
+		vp = fp->f_vnode;
+		/*
+		 * Exclusive lock synchronizes against f_seqcount reads and
+		 * writes in sequential_heuristic().
+		 */
+		error = vn_lock(vp, LK_EXCLUSIVE);
+		if (error != 0) {
+			fdrop(fp, td);
+			break;
+		}
+		if (arg >= 0) {
 			bsize = fp->f_vnode->v_mount->mnt_stat.f_iosize;
-			VOP_UNLOCK(vp, 0);
 			fp->f_seqcount = (arg + bsize - 1) / bsize;
-			do {
-				new = old = fp->f_flag;
-				new |= FRDAHEAD;
-			} while (!atomic_cmpset_rel_int(&fp->f_flag, old, new));
-readahead_vnlock_fail:
-			VFS_UNLOCK_GIANT(vfslocked);
-			vfslocked = 0;
+			atomic_set_int(&fp->f_flag, FRDAHEAD);
 		} else {
-			do {
-				new = old = fp->f_flag;
-				new &= ~FRDAHEAD;
-			} while (!atomic_cmpset_rel_int(&fp->f_flag, old, new));
+			atomic_clear_int(&fp->f_flag, FRDAHEAD);
 		}
+		VOP_UNLOCK(vp, 0);
 		fdrop(fp, td);
 		break;
 
@@ -801,22 +800,34 @@
 		error = EINVAL;
 		break;
 	}
-	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
+static int
+getmaxfd(struct proc *p)
+{
+	int maxfd;
+
+	PROC_LOCK(p);
+	maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
+	PROC_UNLOCK(p);
+
+	return (maxfd);
+}
+
 /*
  * Common code for dup, dup2, fcntl(F_DUPFD) and fcntl(F_DUP2FD).
  */
-static int
+int
 do_dup(struct thread *td, int flags, int old, int new,
     register_t *retval)
 {
 	struct filedesc *fdp;
+	struct filedescent *oldfde, *newfde;
 	struct proc *p;
 	struct file *fp;
 	struct file *delfp;
-	int error, holdleaders, maxfd;
+	int error, maxfd;
 
 	p = td->td_proc;
 	fdp = p->p_fd;
@@ -830,33 +841,30 @@
 		return (EBADF);
 	if (new < 0)
 		return (flags & DUP_FCNTL ? EINVAL : EBADF);
-	PROC_LOCK(p);
-	maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
-	PROC_UNLOCK(p);
+	maxfd = getmaxfd(p);
 	if (new >= maxfd)
 		return (flags & DUP_FCNTL ? EINVAL : EBADF);
 
 	FILEDESC_XLOCK(fdp);
-	if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL) {
+	if (fget_locked(fdp, old) == NULL) {
 		FILEDESC_XUNLOCK(fdp);
 		return (EBADF);
 	}
+	oldfde = &fdp->fd_ofiles[old];
 	if (flags & DUP_FIXED && old == new) {
 		*retval = new;
 		if (flags & DUP_CLOEXEC)
-			fdp->fd_ofileflags[new] |= UF_EXCLOSE;
+			fdp->fd_ofiles[new].fde_flags |= UF_EXCLOSE;
 		FILEDESC_XUNLOCK(fdp);
 		return (0);
 	}
-	fp = fdp->fd_ofiles[old];
+	fp = oldfde->fde_file;
 	fhold(fp);
 
 	/*
 	 * If the caller specified a file descriptor, make sure the file
 	 * table is large enough to hold it, and grab it.  Otherwise, just
-	 * allocate a new descriptor the usual way.  Since the filedesc
-	 * lock may be temporarily dropped in the process, we have to look
-	 * out for a race.
+	 * allocate a new descriptor the usual way.
 	 */
 	if (flags & DUP_FIXED) {
 		if (new >= fdp->fd_nfiles) {
@@ -869,18 +877,22 @@
 			 * the limit on the size of the file descriptor table.
 			 */
 #ifdef RACCT
-			PROC_LOCK(p);
-			error = racct_set(p, RACCT_NOFILE, new + 1);
-			PROC_UNLOCK(p);
-			if (error != 0) {
-				FILEDESC_XUNLOCK(fdp);
-				fdrop(fp, td);
-				return (EMFILE);
+			if (racct_enable) {
+				PROC_LOCK(p);
+				error = racct_set(p, RACCT_NOFILE, new + 1);
+				PROC_UNLOCK(p);
+				if (error != 0) {
+					FILEDESC_XUNLOCK(fdp);
+					fdrop(fp, td);
+					return (EMFILE);
+				}
 			}
 #endif
-			fdgrowtable(fdp, new + 1);
+			fdgrowtable_exp(fdp, new + 1);
+			oldfde = &fdp->fd_ofiles[old];
 		}
-		if (fdp->fd_ofiles[new] == NULL)
+		newfde = &fdp->fd_ofiles[new];
+		if (newfde->fde_file == NULL)
 			fdused(fdp, new);
 	} else {
 		if ((error = fdalloc(td, new, &new)) != 0) {
@@ -888,82 +900,39 @@
 			fdrop(fp, td);
 			return (error);
 		}
+		newfde = &fdp->fd_ofiles[new];
 	}
 
-	/*
-	 * If the old file changed out from under us then treat it as a
-	 * bad file descriptor.  Userland should do its own locking to
-	 * avoid this case.
-	 */
-	if (fdp->fd_ofiles[old] != fp) {
-		/* we've allocated a descriptor which we won't use */
-		if (fdp->fd_ofiles[new] == NULL)
-			fdunused(fdp, new);
-		FILEDESC_XUNLOCK(fdp);
-		fdrop(fp, td);
-		return (EBADF);
-	}
-	KASSERT(old != new,
-	    ("new fd is same as old"));
+	KASSERT(fp == oldfde->fde_file, ("old fd has been modified"));
+	KASSERT(old != new, ("new fd is same as old"));
 
-	/*
-	 * Save info on the descriptor being overwritten.  We cannot close
-	 * it without introducing an ownership race for the slot, since we
-	 * need to drop the filedesc lock to call closef().
-	 *
-	 * XXX this duplicates parts of close().
-	 */
-	delfp = fdp->fd_ofiles[new];
-	holdleaders = 0;
-	if (delfp != NULL) {
-		if (td->td_proc->p_fdtol != NULL) {
-			/*
-			 * Ask fdfree() to sleep to ensure that all relevant
-			 * process leaders can be traversed in closef().
-			 */
-			fdp->fd_holdleaderscount++;
-			holdleaders = 1;
-		}
-	}
+	delfp = newfde->fde_file;
 
 	/*
-	 * Duplicate the source descriptor
+	 * Duplicate the source descriptor.
 	 */
-	fdp->fd_ofiles[new] = fp;
+#ifdef CAPABILITIES
+	seq_write_begin(&newfde->fde_seq);
+#endif
+	filecaps_free(&newfde->fde_caps);
+	memcpy(newfde, oldfde, fde_change_size);
+	filecaps_copy(&oldfde->fde_caps, &newfde->fde_caps);
 	if ((flags & DUP_CLOEXEC) != 0)
-		fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] | UF_EXCLOSE;
+		newfde->fde_flags = oldfde->fde_flags | UF_EXCLOSE;
 	else
-		fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] & ~UF_EXCLOSE;
-	if (new > fdp->fd_lastfile)
-		fdp->fd_lastfile = new;
+		newfde->fde_flags = oldfde->fde_flags & ~UF_EXCLOSE;
+#ifdef CAPABILITIES
+	seq_write_end(&newfde->fde_seq);
+#endif
 	*retval = new;
 
-	/*
-	 * If we dup'd over a valid file, we now own the reference to it
-	 * and must dispose of it using closef() semantics (as if a
-	 * close() were performed on it).
-	 *
-	 * XXX this duplicates parts of close().
-	 */
 	if (delfp != NULL) {
-		knote_fdclose(td, new);
-		if (delfp->f_type == DTYPE_MQUEUE)
-			mq_fdclose(td, new, delfp);
-		FILEDESC_XUNLOCK(fdp);
-		(void) closef(delfp, td);
-		if (holdleaders) {
-			FILEDESC_XLOCK(fdp);
-			fdp->fd_holdleaderscount--;
-			if (fdp->fd_holdleaderscount == 0 &&
-			    fdp->fd_holdleaderswakeup != 0) {
-				fdp->fd_holdleaderswakeup = 0;
-				wakeup(&fdp->fd_holdleaderscount);
-			}
-			FILEDESC_XUNLOCK(fdp);
-		}
+		(void) closefp(fdp, new, delfp, td, 1);
+		/* closefp() drops the FILEDESC lock for us. */
 	} else {
 		FILEDESC_XUNLOCK(fdp);
 	}
+
 	return (0);
 }
 
@@ -988,13 +957,13 @@
 		struct pgrp *pg = (sigio)->sio_pgrp;
 		PGRP_LOCK(pg);
 		SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio,
-			     sigio, sio_pgsigio);
+			    sigio, sio_pgsigio);
 		PGRP_UNLOCK(pg);
 	} else {
 		struct proc *p = (sigio)->sio_proc;
 		PROC_LOCK(p);
 		SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio,
-			     sigio, sio_pgsigio);
+			    sigio, sio_pgsigio);
 		PROC_UNLOCK(p);
 	}
 	SIGIO_UNLOCK();
@@ -1187,56 +1156,27 @@
 }
 
 /*
- * Close a file descriptor.
+ * Function drops the filedesc lock on return.
  */
-#ifndef _SYS_SYSPROTO_H_
-struct close_args {
-	int     fd;
-};
-#endif
-/* ARGSUSED */
-int
-sys_close(td, uap)
-	struct thread *td;
-	struct close_args *uap;
+static int
+closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td,
+    int holdleaders)
 {
-
-	return (kern_close(td, uap->fd));
-}
-
-int
-kern_close(td, fd)
-	struct thread *td;
-	int fd;
-{
-	struct filedesc *fdp;
-	struct file *fp, *fp_object;
 	int error;
-	int holdleaders;
 
-	error = 0;
-	holdleaders = 0;
-	fdp = td->td_proc->p_fd;
+	FILEDESC_XLOCK_ASSERT(fdp);
 
-	AUDIT_SYSCLOSE(td, fd);
-
-	FILEDESC_XLOCK(fdp);
-	if ((unsigned)fd >= fdp->fd_nfiles ||
-	    (fp = fdp->fd_ofiles[fd]) == NULL) {
-		FILEDESC_XUNLOCK(fdp);
-		return (EBADF);
+	if (holdleaders) {
+		if (td->td_proc->p_fdtol != NULL) {
+			/*
+			 * Ask fdfree() to sleep to ensure that all relevant
+			 * process leaders can be traversed in closef().
+			 */
+			fdp->fd_holdleaderscount++;
+		} else {
+			holdleaders = 0;
+		}
 	}
-	fdp->fd_ofiles[fd] = NULL;
-	fdp->fd_ofileflags[fd] = 0;
-	fdunused(fdp, fd);
-	if (td->td_proc->p_fdtol != NULL) {
-		/*
-		 * Ask fdfree() to sleep to ensure that all relevant
-		 * process leaders can be traversed in closef().
-		 */
-		fdp->fd_holdleaderscount++;
-		holdleaders = 1;
-	}
 
 	/*
 	 * We now hold the fp reference that used to be owned by the
@@ -1247,12 +1187,10 @@
 	knote_fdclose(td, fd);
 
 	/*
-	 * When we're closing an fd with a capability, we need to notify
-	 * mqueue if the underlying object is of type mqueue.
+	 * We need to notify mqueue if the object is of type mqueue.
 	 */
-	(void)cap_funwrap(fp, 0, &fp_object);
-	if (fp_object->f_type == DTYPE_MQUEUE)
-		mq_fdclose(td, fd, fp_object);
+	if (fp->f_type == DTYPE_MQUEUE)
+		mq_fdclose(td, fd, fp);
 	FILEDESC_XUNLOCK(fdp);
 
 	error = closef(fp, td);
@@ -1270,6 +1208,43 @@
 }
 
 /*
+ * Close a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct close_args {
+	int     fd;
+};
+#endif
+/* ARGSUSED */
+int
+sys_close(struct thread *td, struct close_args *uap)
+{
+
+	return (kern_close(td, uap->fd));
+}
+
+int
+kern_close(struct thread *td, int fd)
+{
+	struct filedesc *fdp;
+	struct file *fp;
+
+	fdp = td->td_proc->p_fd;
+
+	AUDIT_SYSCLOSE(td, fd);
+
+	FILEDESC_XLOCK(fdp);
+	if ((fp = fget_locked(fdp, fd)) == NULL) {
+		FILEDESC_XUNLOCK(fdp);
+		return (EBADF);
+	}
+	fdfree(fdp, fd);
+
+	/* closefp() drops the FILEDESC lock for us. */
+	return (closefp(fdp, fd, fp, td, 1));
+}
+
+/*
  * Close open file descriptors.
  */
 #ifndef _SYS_SYSPROTO_H_
@@ -1294,8 +1269,8 @@
 	if (uap->lowfd < 0)
 		uap->lowfd = 0;
 	FILEDESC_SLOCK(fdp);
-	for (fd = uap->lowfd; fd < fdp->fd_nfiles; fd++) {
-		if (fdp->fd_ofiles[fd] != NULL) {
+	for (fd = uap->lowfd; fd <= fdp->fd_lastfile; fd++) {
+		if (fdp->fd_ofiles[fd].fde_file != NULL) {
 			FILEDESC_SUNLOCK(fdp);
 			(void)kern_close(td, fd);
 			FILEDESC_SLOCK(fdp);
@@ -1358,11 +1333,13 @@
 kern_fstat(struct thread *td, int fd, struct stat *sbp)
 {
 	struct file *fp;
+	cap_rights_t rights;
 	int error;
 
 	AUDIT_ARG_FD(fd);
 
-	if ((error = fget(td, fd, CAP_FSTAT, &fp)) != 0)
+	error = fget(td, fd, cap_rights_init(&rights, CAP_FSTAT), &fp);
+	if (error != 0)
 		return (error);
 
 	AUDIT_ARG_FILE(td->td_proc, fp);
@@ -1416,9 +1393,11 @@
 {
 	struct file *fp;
 	struct vnode *vp;
+	cap_rights_t rights;
 	int error;
 
-	if ((error = fget(td, uap->fd, CAP_FPATHCONF, &fp)) != 0)
+	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FPATHCONF), &fp);
+	if (error != 0)
 		return (error);
 
 	/* If asynchronous I/O is available, it works for all descriptors. */
@@ -1428,18 +1407,15 @@
 	}
 	vp = fp->f_vnode;
 	if (vp != NULL) {
-		int vfslocked;
-		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 		vn_lock(vp, LK_SHARED | LK_RETRY);
 		error = VOP_PATHCONF(vp, uap->name, td->td_retval);
 		VOP_UNLOCK(vp, 0);
-		VFS_UNLOCK_GIANT(vfslocked);
 	} else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) {
 		if (uap->name != _PC_PIPE_BUF) {
 			error = EINVAL;
 		} else {
 			td->td_retval[0] = PIPE_BUF;
-		error = 0;
+			error = 0;
 		}
 	} else {
 		error = EOPNOTSUPP;
@@ -1450,80 +1426,190 @@
 }
 
 /*
- * Grow the file table to accomodate (at least) nfd descriptors.  This may
- * block and drop the filedesc lock, but it will reacquire it before
- * returning.
+ * Initialize filecaps structure.
  */
+void
+filecaps_init(struct filecaps *fcaps)
+{
+
+	bzero(fcaps, sizeof(*fcaps));
+	fcaps->fc_nioctls = -1;
+}
+
+/*
+ * Copy filecaps structure allocating memory for ioctls array if needed.
+ */
+void
+filecaps_copy(const struct filecaps *src, struct filecaps *dst)
+{
+	size_t size;
+
+	*dst = *src;
+	if (src->fc_ioctls != NULL) {
+		KASSERT(src->fc_nioctls > 0,
+		    ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls));
+
+		size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls;
+		dst->fc_ioctls = malloc(size, M_FILECAPS, M_WAITOK);
+		bcopy(src->fc_ioctls, dst->fc_ioctls, size);
+	}
+}
+
+/*
+ * Move filecaps structure to the new place and clear the old place.
+ */
+void
+filecaps_move(struct filecaps *src, struct filecaps *dst)
+{
+
+	*dst = *src;
+	bzero(src, sizeof(*src));
+}
+
+/*
+ * Fill the given filecaps structure with full rights.
+ */
 static void
+filecaps_fill(struct filecaps *fcaps)
+{
+
+	CAP_ALL(&fcaps->fc_rights);
+	fcaps->fc_ioctls = NULL;
+	fcaps->fc_nioctls = -1;
+	fcaps->fc_fcntls = CAP_FCNTL_ALL;
+}
+
+/*
+ * Free memory allocated within filecaps structure.
+ */
+void
+filecaps_free(struct filecaps *fcaps)
+{
+
+	free(fcaps->fc_ioctls, M_FILECAPS);
+	bzero(fcaps, sizeof(*fcaps));
+}
+
+/*
+ * Validate the given filecaps structure.
+ */
+static void
+filecaps_validate(const struct filecaps *fcaps, const char *func)
+{
+
+	KASSERT(cap_rights_is_valid(&fcaps->fc_rights),
+	    ("%s: invalid rights", func));
+	KASSERT((fcaps->fc_fcntls & ~CAP_FCNTL_ALL) == 0,
+	    ("%s: invalid fcntls", func));
+	KASSERT(fcaps->fc_fcntls == 0 ||
+	    cap_rights_is_set(&fcaps->fc_rights, CAP_FCNTL),
+	    ("%s: fcntls without CAP_FCNTL", func));
+	KASSERT(fcaps->fc_ioctls != NULL ? fcaps->fc_nioctls > 0 :
+	    (fcaps->fc_nioctls == -1 || fcaps->fc_nioctls == 0),
+	    ("%s: invalid ioctls", func));
+	KASSERT(fcaps->fc_nioctls == 0 ||
+	    cap_rights_is_set(&fcaps->fc_rights, CAP_IOCTL),
+	    ("%s: ioctls without CAP_IOCTL", func));
+}
+
+static void
+fdgrowtable_exp(struct filedesc *fdp, int nfd)
+{
+	int nfd1;
+
+	FILEDESC_XLOCK_ASSERT(fdp);
+
+	nfd1 = fdp->fd_nfiles * 2;
+	if (nfd1 < nfd)
+		nfd1 = nfd;
+	fdgrowtable(fdp, nfd1);
+}
+
+/*
+ * Grow the file table to accommodate (at least) nfd descriptors.
+ */
+static void
 fdgrowtable(struct filedesc *fdp, int nfd)
 {
 	struct filedesc0 *fdp0;
-	struct freetable *fo;
-	struct file **ntable;
-	struct file **otable;
-	char *nfileflags;
+	struct freetable *ft;
+	struct filedescent *ntable;
+	struct filedescent *otable;
 	int nnfiles, onfiles;
-	NDSLOTTYPE *nmap;
+	NDSLOTTYPE *nmap, *omap;
 
 	FILEDESC_XLOCK_ASSERT(fdp);
 
-	KASSERT(fdp->fd_nfiles > 0,
-	    ("zero-length file table"));
+	KASSERT(fdp->fd_nfiles > 0, ("zero-length file table"));
 
+	/* save old values */
+	onfiles = fdp->fd_nfiles;
+	otable = fdp->fd_ofiles;
+	omap = fdp->fd_map;
+
 	/* compute the size of the new table */
-	onfiles = fdp->fd_nfiles;
 	nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */
 	if (nnfiles <= onfiles)
 		/* the table is already large enough */
 		return;
 
-	/* allocate a new table and (if required) new bitmaps */
-	FILEDESC_XUNLOCK(fdp);
-	ntable = malloc((nnfiles * OFILESIZE) + sizeof(struct freetable),
+	/*
+	 * Allocate a new table.  We need enough space for the
+	 * file entries themselves and the struct freetable we will use
+	 * when we decommission the table and place it on the freelist.
+	 * We place the struct freetable in the middle so we don't have
+	 * to worry about padding.
+	 */
+	ntable = malloc(nnfiles * sizeof(ntable[0]) + sizeof(struct freetable),
 	    M_FILEDESC, M_ZERO | M_WAITOK);
-	nfileflags = (char *)&ntable[nnfiles];
-	if (NDSLOTS(nnfiles) > NDSLOTS(onfiles))
-		nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE,
-		    M_FILEDESC, M_ZERO | M_WAITOK);
-	else
-		nmap = NULL;
-	FILEDESC_XLOCK(fdp);
+	/* copy the old data over and point at the new tables */
+	memcpy(ntable, otable, onfiles * sizeof(*otable));
+	fdp->fd_ofiles = ntable;
 
 	/*
-	 * We now have new tables ready to go.  Since we dropped the
-	 * filedesc lock to call malloc(), watch out for a race.
+	 * Allocate a new map only if the old is not large enough.  It will
+	 * grow at a slower rate than the table as it can map more
+	 * entries than the table can hold.
 	 */
-	onfiles = fdp->fd_nfiles;
-	if (onfiles >= nnfiles) {
-		/* we lost the race, but that's OK */
-		free(ntable, M_FILEDESC);
-		if (nmap != NULL)
-			free(nmap, M_FILEDESC);
-		return;
+	if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) {
+		nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE, M_FILEDESC,
+		    M_ZERO | M_WAITOK);
+		/* copy over the old data and update the pointer */
+		memcpy(nmap, omap, NDSLOTS(onfiles) * sizeof(*omap));
+		fdp->fd_map = nmap;
 	}
-	bcopy(fdp->fd_ofiles, ntable, onfiles * sizeof(*ntable));
-	bcopy(fdp->fd_ofileflags, nfileflags, onfiles);
-	otable = fdp->fd_ofiles;
-	fdp->fd_ofileflags = nfileflags;
-	fdp->fd_ofiles = ntable;
+
 	/*
-	 * We must preserve ofiles until the process exits because we can't
-	 * be certain that no threads have references to the old table via
-	 * _fget().
+	 * In order to have a valid pattern for fget_unlocked()
+	 * fdp->fd_nfiles must be the last member to be updated, otherwise
+	 * fget_unlocked() consumers may reference a new, higher value for
+	 * fdp->fd_nfiles before to access the fdp->fd_ofiles array,
+	 * resulting in OOB accesses.
 	 */
+	atomic_store_rel_int(&fdp->fd_nfiles, nnfiles);
+
+	/*
+	 * Do not free the old file table, as some threads may still
+	 * reference entries within it.  Instead, place it on a freelist
+	 * which will be processed when the struct filedesc is released.
+	 *
+	 * Note that if onfiles == NDFILE, we're dealing with the original
+	 * static allocation contained within (struct filedesc0 *)fdp,
+	 * which must not be freed.
+	 */
 	if (onfiles > NDFILE) {
-		fo = (struct freetable *)&otable[onfiles];
+		ft = (struct freetable *)&otable[onfiles];
 		fdp0 = (struct filedesc0 *)fdp;
-		fo->ft_table = otable;
-		SLIST_INSERT_HEAD(&fdp0->fd_free, fo, ft_next);
+		ft->ft_table = otable;
+		SLIST_INSERT_HEAD(&fdp0->fd_free, ft, ft_next);
 	}
-	if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) {
-		bcopy(fdp->fd_map, nmap, NDSLOTS(onfiles) * sizeof(*nmap));
-		if (NDSLOTS(onfiles) > NDSLOTS(NDFILE))
-			free(fdp->fd_map, M_FILEDESC);
-		fdp->fd_map = nmap;
-	}
-	fdp->fd_nfiles = nnfiles;
+	/*
+	 * The map does not have the same possibility of threads still
+	 * holding references to it.  So always free it as long as it
+	 * does not reference the original static allocation.
+	 */
+	if (NDSLOTS(onfiles) > NDSLOTS(NDFILE))
+		free(omap, M_FILEDESC);
 }
 
 /*
@@ -1534,7 +1620,7 @@
 {
 	struct proc *p = td->td_proc;
 	struct filedesc *fdp = p->p_fd;
-	int fd = -1, maxfd;
+	int fd = -1, maxfd, allocfd;
 #ifdef RACCT
 	int error;
 #endif
@@ -1544,30 +1630,31 @@
 	if (fdp->fd_freefile > minfd)
 		minfd = fdp->fd_freefile;
 
-	PROC_LOCK(p);
-	maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
-	PROC_UNLOCK(p);
+	maxfd = getmaxfd(p);
 
 	/*
-	 * Search the bitmap for a free descriptor.  If none is found, try
-	 * to grow the file table.  Keep at it until we either get a file
-	 * descriptor or run into process or system limits; fdgrowtable()
-	 * may drop the filedesc lock, so we're in a race.
+	 * Search the bitmap for a free descriptor starting at minfd.
+	 * If none is found, grow the file table.
 	 */
-	for (;;) {
-		fd = fd_first_free(fdp, minfd, fdp->fd_nfiles);
-		if (fd >= maxfd)
-			return (EMFILE);
-		if (fd < fdp->fd_nfiles)
-			break;
+	fd = fd_first_free(fdp, minfd, fdp->fd_nfiles);
+	if (fd >= maxfd)
+		return (EMFILE);
+	if (fd >= fdp->fd_nfiles) {
+		allocfd = min(fd * 2, maxfd);
 #ifdef RACCT
-		PROC_LOCK(p);
-		error = racct_set(p, RACCT_NOFILE, min(fdp->fd_nfiles * 2, maxfd));
-		PROC_UNLOCK(p);
-		if (error != 0)
-			return (EMFILE);
+		if (racct_enable) {
+			PROC_LOCK(p);
+			error = racct_set(p, RACCT_NOFILE, allocfd);
+			PROC_UNLOCK(p);
+			if (error != 0)
+				return (EMFILE);
+		}
 #endif
-		fdgrowtable(fdp, min(fdp->fd_nfiles * 2, maxfd));
+		/*
+		 * fd is already equal to first free descriptor >= minfd, so
+		 * we only need to grow the table and we are done.
+		 */
+		fdgrowtable_exp(fdp, allocfd);
 	}
 
 	/*
@@ -1574,11 +1661,13 @@
 	 * Perform some sanity checks, then mark the file descriptor as
 	 * used and return it to the caller.
 	 */
+	KASSERT(fd >= 0 && fd < min(maxfd, fdp->fd_nfiles),
+	    ("invalid descriptor %d", fd));
 	KASSERT(!fdisused(fdp, fd),
 	    ("fd_first_free() returned non-free descriptor"));
-	KASSERT(fdp->fd_ofiles[fd] == NULL,
-	    ("free descriptor isn't"));
-	fdp->fd_ofileflags[fd] = 0; /* XXX needed? */
+	KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
+	    ("file descriptor isn't free"));
+	KASSERT(fdp->fd_ofiles[fd].fde_flags == 0, ("file flags are set"));
 	fdused(fdp, fd);
 	*result = fd;
 	return (0);
@@ -1585,6 +1674,34 @@
 }
 
 /*
+ * Allocate n file descriptors for the process.
+ */
+int
+fdallocn(struct thread *td, int minfd, int *fds, int n)
+{
+	struct proc *p = td->td_proc;
+	struct filedesc *fdp = p->p_fd;
+	int i;
+
+	FILEDESC_XLOCK_ASSERT(fdp);
+
+	if (!fdavail(td, n))
+		return (EMFILE);
+
+	for (i = 0; i < n; i++)
+		if (fdalloc(td, 0, &fds[i]) != 0)
+			break;
+
+	if (i < n) {
+		for (i--; i >= 0; i--)
+			fdunused(fdp, fds[i]);
+		return (EMFILE);
+	}
+
+	return (0);
+}
+
+/*
  * Check to see whether n user file descriptors are available to the process
  * p.
  */
@@ -1593,7 +1710,6 @@
 {
 	struct proc *p = td->td_proc;
 	struct filedesc *fdp = td->td_proc->p_fd;
-	struct file **fpp;
 	int i, lim, last;
 
 	FILEDESC_LOCK_ASSERT(fdp);
@@ -1603,15 +1719,12 @@
 	 *      call racct_add() from there instead of dealing with containers
 	 *      here.
 	 */
-	PROC_LOCK(p);
-	lim = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
-	PROC_UNLOCK(p);
+	lim = getmaxfd(p);
 	if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0)
 		return (1);
 	last = min(fdp->fd_nfiles, lim);
-	fpp = &fdp->fd_ofiles[fdp->fd_freefile];
-	for (i = last - fdp->fd_freefile; --i >= 0; fpp++) {
-		if (*fpp == NULL && --n <= 0)
+	for (i = fdp->fd_freefile; i < last; i++) {
+		if (fdp->fd_ofiles[i].fde_file == NULL && --n <= 0)
 			return (1);
 	}
 	return (0);
@@ -1618,7 +1731,7 @@
 }
 
 /*
- * Create a new open file structure and allocate a file decriptor for the
+ * Create a new open file structure and allocate a file descriptor for the
  * process that refers to it.  We add one reference to the file for the
  * descriptor table and one reference for resultfp. This is to prevent us
  * being preempted and the entry in the descriptor table closed after we
@@ -1634,7 +1747,7 @@
 	if (error)
 		return (error);		/* no reference held on error */
 
-	error = finstall(td, fp, &fd, flags);
+	error = finstall(td, fp, &fd, flags, NULL);
 	if (error) {
 		fdrop(fp, td);		/* one reference (fp only) */
 		return (error);
@@ -1668,8 +1781,8 @@
 	    priv_check(td, PRIV_MAXFILES) != 0) ||
 	    openfiles >= maxfiles) {
 		if (ppsratecheck(&lastfail, &curfail, 1)) {
-			printf("kern.maxfiles limit exceeded by uid %i, "
-			    "please see tuning(7).\n", td->td_ucred->cr_ruid);
+			printf("kern.maxfiles limit exceeded by uid %i, (%s) "
+			    "please see tuning(7).\n", td->td_ucred->cr_ruid, td->td_proc->p_comm);
 		}
 		return (ENFILE);
 	}
@@ -1688,13 +1801,17 @@
  * Install a file in a file descriptor table.
  */
 int
-finstall(struct thread *td, struct file *fp, int *fd, int flags)
+finstall(struct thread *td, struct file *fp, int *fd, int flags,
+    struct filecaps *fcaps)
 {
 	struct filedesc *fdp = td->td_proc->p_fd;
+	struct filedescent *fde;
 	int error;
 
 	KASSERT(fd != NULL, ("%s: fd == NULL", __func__));
 	KASSERT(fp != NULL, ("%s: fp == NULL", __func__));
+	if (fcaps != NULL)
+		filecaps_validate(fcaps, __func__);
 
 	FILEDESC_XLOCK(fdp);
 	if ((error = fdalloc(td, 0, fd))) {
@@ -1702,9 +1819,20 @@
 		return (error);
 	}
 	fhold(fp);
-	fdp->fd_ofiles[*fd] = fp;
+	fde = &fdp->fd_ofiles[*fd];
+#ifdef CAPABILITIES
+	seq_write_begin(&fde->fde_seq);
+#endif
+	fde->fde_file = fp;
 	if ((flags & O_CLOEXEC) != 0)
-		fdp->fd_ofileflags[*fd] |= UF_EXCLOSE;
+		fde->fde_flags |= UF_EXCLOSE;
+	if (fcaps != NULL)
+		filecaps_move(fcaps, &fde->fde_caps);
+	else
+		filecaps_fill(&fde->fde_caps);
+#ifdef CAPABILITIES
+	seq_write_end(&fde->fde_seq);
+#endif
 	FILEDESC_XUNLOCK(fdp);
 	return (0);
 }
@@ -1721,7 +1849,7 @@
 	newfdp = malloc(sizeof *newfdp, M_FILEDESC, M_WAITOK | M_ZERO);
 	FILEDESC_LOCK_INIT(&newfdp->fd_fd);
 	if (fdp != NULL) {
-		FILEDESC_XLOCK(fdp);
+		FILEDESC_SLOCK(fdp);
 		newfdp->fd_fd.fd_cdir = fdp->fd_cdir;
 		if (newfdp->fd_fd.fd_cdir)
 			VREF(newfdp->fd_fd.fd_cdir);
@@ -1731,7 +1859,7 @@
 		newfdp->fd_fd.fd_jdir = fdp->fd_jdir;
 		if (newfdp->fd_fd.fd_jdir)
 			VREF(newfdp->fd_fd.fd_jdir);
-		FILEDESC_XUNLOCK(fdp);
+		FILEDESC_SUNLOCK(fdp);
 	}
 
 	/* Create the file descriptor table. */
@@ -1739,7 +1867,6 @@
 	newfdp->fd_fd.fd_holdcnt = 1;
 	newfdp->fd_fd.fd_cmask = CMASK;
 	newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles;
-	newfdp->fd_fd.fd_ofileflags = newfdp->fd_dfileflags;
 	newfdp->fd_fd.fd_nfiles = NDFILE;
 	newfdp->fd_fd.fd_map = newfdp->fd_dmap;
 	newfdp->fd_fd.fd_lastfile = -1;
@@ -1798,19 +1925,17 @@
  * Unshare a filedesc structure, if necessary by making a copy
  */
 void
-fdunshare(struct proc *p, struct thread *td)
+fdunshare(struct thread *td)
 {
+	struct filedesc *tmp;
+	struct proc *p = td->td_proc;
 
-	FILEDESC_XLOCK(p->p_fd);
-	if (p->p_fd->fd_refcnt > 1) {
-		struct filedesc *tmp;
+	if (p->p_fd->fd_refcnt == 1)
+		return;
 
-		FILEDESC_XUNLOCK(p->p_fd);
-		tmp = fdcopy(p->p_fd);
-		fdfree(td);
-		p->p_fd = tmp;
-	} else
-		FILEDESC_XUNLOCK(p->p_fd);
+	tmp = fdcopy(p->p_fd);
+	fdescfree(td);
+	p->p_fd = tmp;
 }
 
 /*
@@ -1821,6 +1946,7 @@
 fdcopy(struct filedesc *fdp)
 {
 	struct filedesc *newfdp;
+	struct filedescent *nfde, *ofde;
 	int i;
 
 	/* Certain daemons might not have file descriptors. */
@@ -1839,12 +1965,14 @@
 	/* copy all passable descriptors (i.e. not kqueue) */
 	newfdp->fd_freefile = -1;
 	for (i = 0; i <= fdp->fd_lastfile; ++i) {
+		ofde = &fdp->fd_ofiles[i];
 		if (fdisused(fdp, i) &&
-		    (fdp->fd_ofiles[i]->f_ops->fo_flags & DFLAG_PASSABLE) &&
-		    fdp->fd_ofiles[i]->f_ops != &badfileops) {
-			newfdp->fd_ofiles[i] = fdp->fd_ofiles[i];
-			newfdp->fd_ofileflags[i] = fdp->fd_ofileflags[i];
-			fhold(newfdp->fd_ofiles[i]);
+		    (ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) &&
+		    ofde->fde_file->f_ops != &badfileops) {
+			nfde = &newfdp->fd_ofiles[i];
+			*nfde = *ofde;
+			filecaps_copy(&ofde->fde_caps, &nfde->fde_caps);
+			fhold(nfde->fde_file);
 			newfdp->fd_lastfile = i;
 		} else {
 			if (newfdp->fd_freefile == -1)
@@ -1854,9 +1982,10 @@
 	newfdp->fd_cmask = fdp->fd_cmask;
 	FILEDESC_SUNLOCK(fdp);
 	FILEDESC_XLOCK(newfdp);
-	for (i = 0; i <= newfdp->fd_lastfile; ++i)
-		if (newfdp->fd_ofiles[i] != NULL)
+	for (i = 0; i <= newfdp->fd_lastfile; ++i) {
+		if (newfdp->fd_ofiles[i].fde_file != NULL)
 			fdused(newfdp, i);
+	}
 	if (newfdp->fd_freefile == -1)
 		newfdp->fd_freefile = i;
 	FILEDESC_XUNLOCK(newfdp);
@@ -1867,11 +1996,10 @@
  * Release a filedesc structure.
  */
 void
-fdfree(struct thread *td)
+fdescfree(struct thread *td)
 {
 	struct filedesc *fdp;
-	struct file **fpp;
-	int i, locked;
+	int i;
 	struct filedesc_to_leader *fdtol;
 	struct file *fp;
 	struct vnode *cdir, *jdir, *rdir, *vp;
@@ -1883,9 +2011,11 @@
 		return;
 
 #ifdef RACCT
-	PROC_LOCK(td->td_proc);
-	racct_set(td->td_proc, RACCT_NOFILE, 0);
-	PROC_UNLOCK(td->td_proc);
+	if (racct_enable) {
+		PROC_LOCK(td->td_proc);
+		racct_set(td->td_proc, RACCT_NOFILE, 0);
+		PROC_UNLOCK(td->td_proc);
+	}
 #endif
 
 	/* Check for special need to clear POSIX style locks */
@@ -1893,17 +2023,14 @@
 	if (fdtol != NULL) {
 		FILEDESC_XLOCK(fdp);
 		KASSERT(fdtol->fdl_refcount > 0,
-			("filedesc_to_refcount botch: fdl_refcount=%d",
-			 fdtol->fdl_refcount));
+		    ("filedesc_to_refcount botch: fdl_refcount=%d",
+		    fdtol->fdl_refcount));
 		if (fdtol->fdl_refcount == 1 &&
 		    (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
-			for (i = 0, fpp = fdp->fd_ofiles;
-			     i <= fdp->fd_lastfile;
-			     i++, fpp++) {
-				if (*fpp == NULL ||
-				    (*fpp)->f_type != DTYPE_VNODE)
+			for (i = 0; i <= fdp->fd_lastfile; i++) {
+				fp = fdp->fd_ofiles[i].fde_file;
+				if (fp == NULL || fp->f_type != DTYPE_VNODE)
 					continue;
-				fp = *fpp;
 				fhold(fp);
 				FILEDESC_XUNLOCK(fdp);
 				lf.l_whence = SEEK_SET;
@@ -1911,17 +2038,11 @@
 				lf.l_len = 0;
 				lf.l_type = F_UNLCK;
 				vp = fp->f_vnode;
-				locked = VFS_LOCK_GIANT(vp->v_mount);
 				(void) VOP_ADVLOCK(vp,
-						   (caddr_t)td->td_proc->
-						   p_leader,
-						   F_UNLCK,
-						   &lf,
-						   F_POSIX);
-				VFS_UNLOCK_GIANT(locked);
+				    (caddr_t)td->td_proc->p_leader, F_UNLCK,
+				    &lf, F_POSIX);
 				FILEDESC_XLOCK(fdp);
 				fdrop(fp, td);
-				fpp = fdp->fd_ofiles + i;
 			}
 		}
 	retry:
@@ -1960,36 +2081,18 @@
 		if (fdtol != NULL)
 			free(fdtol, M_FILEDESC_TO_LEADER);
 	}
-	FILEDESC_XLOCK(fdp);
-	i = --fdp->fd_refcnt;
-	FILEDESC_XUNLOCK(fdp);
-	if (i > 0)
-		return;
 
-	fpp = fdp->fd_ofiles;
-	for (i = fdp->fd_lastfile; i-- >= 0; fpp++) {
-		if (*fpp) {
-			FILEDESC_XLOCK(fdp);
-			fp = *fpp;
-			*fpp = NULL;
-			FILEDESC_XUNLOCK(fdp);
-			(void) closef(fp, td);
-		}
-	}
-	FILEDESC_XLOCK(fdp);
-
-	/* XXX This should happen earlier. */
 	mtx_lock(&fdesc_mtx);
 	td->td_proc->p_fd = NULL;
 	mtx_unlock(&fdesc_mtx);
 
-	if (fdp->fd_nfiles > NDFILE)
-		free(fdp->fd_ofiles, M_FILEDESC);
-	if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE))
-		free(fdp->fd_map, M_FILEDESC);
+	FILEDESC_XLOCK(fdp);
+	i = --fdp->fd_refcnt;
+	if (i > 0) {
+		FILEDESC_XUNLOCK(fdp);
+		return;
+	}
 
-	fdp->fd_nfiles = 0;
-
 	cdir = fdp->fd_cdir;
 	fdp->fd_cdir = NULL;
 	rdir = fdp->fd_rdir;
@@ -1998,21 +2101,25 @@
 	fdp->fd_jdir = NULL;
 	FILEDESC_XUNLOCK(fdp);
 
-	if (cdir) {
-		locked = VFS_LOCK_GIANT(cdir->v_mount);
+	for (i = 0; i <= fdp->fd_lastfile; i++) {
+		fp = fdp->fd_ofiles[i].fde_file;
+		if (fp != NULL) {
+			fdfree_last(fdp, i);
+			(void) closef(fp, td);
+		}
+	}
+
+	if (fdp->fd_nfiles > NDFILE)
+		free(fdp->fd_ofiles, M_FILEDESC);
+	if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE))
+		free(fdp->fd_map, M_FILEDESC);
+
+	if (cdir != NULL)
 		vrele(cdir);
-		VFS_UNLOCK_GIANT(locked);
-	}
-	if (rdir) {
-		locked = VFS_LOCK_GIANT(rdir->v_mount);
+	if (rdir != NULL)
 		vrele(rdir);
-		VFS_UNLOCK_GIANT(locked);
-	}
-	if (jdir) {
-		locked = VFS_LOCK_GIANT(jdir->v_mount);
+	if (jdir != NULL)
 		vrele(jdir);
-		VFS_UNLOCK_GIANT(locked);
-	}
 
 	fddrop(fdp);
 }
@@ -2045,33 +2152,23 @@
 setugidsafety(struct thread *td)
 {
 	struct filedesc *fdp;
+	struct file *fp;
 	int i;
 
-	/* Certain daemons might not have file descriptors. */
 	fdp = td->td_proc->p_fd;
-	if (fdp == NULL)
-		return;
-
-	/*
-	 * Note: fdp->fd_ofiles may be reallocated out from under us while
-	 * we are blocked in a close.  Be careful!
-	 */
+	KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
 	FILEDESC_XLOCK(fdp);
 	for (i = 0; i <= fdp->fd_lastfile; i++) {
 		if (i > 2)
 			break;
-		if (fdp->fd_ofiles[i] && is_unsafe(fdp->fd_ofiles[i])) {
-			struct file *fp;
-
+		fp = fdp->fd_ofiles[i].fde_file;
+		if (fp != NULL && is_unsafe(fp)) {
 			knote_fdclose(td, i);
 			/*
 			 * NULL-out descriptor prior to close to avoid
 			 * a race while close blocks.
 			 */
-			fp = fdp->fd_ofiles[i];
-			fdp->fd_ofiles[i] = NULL;
-			fdp->fd_ofileflags[i] = 0;
-			fdunused(fdp, i);
+			fdfree(fdp, i);
 			FILEDESC_XUNLOCK(fdp);
 			(void) closef(fp, td);
 			FILEDESC_XLOCK(fdp);
@@ -2088,13 +2185,13 @@
  * file descriptor out from under the thread creating the file object.
  */
 void
-fdclose(struct filedesc *fdp, struct file *fp, int idx, struct thread *td)
+fdclose(struct thread *td, struct file *fp, int idx)
 {
+	struct filedesc *fdp = td->td_proc->p_fd;
 
 	FILEDESC_XLOCK(fdp);
-	if (fdp->fd_ofiles[idx] == fp) {
-		fdp->fd_ofiles[idx] = NULL;
-		fdunused(fdp, idx);
+	if (fdp->fd_ofiles[idx].fde_file == fp) {
+		fdfree(fdp, idx);
 		FILEDESC_XUNLOCK(fdp);
 		fdrop(fp, td);
 	} else
@@ -2108,38 +2205,21 @@
 fdcloseexec(struct thread *td)
 {
 	struct filedesc *fdp;
+	struct filedescent *fde;
+	struct file *fp;
 	int i;
 
-	/* Certain daemons might not have file descriptors. */
 	fdp = td->td_proc->p_fd;
-	if (fdp == NULL)
-		return;
-
+	KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
 	FILEDESC_XLOCK(fdp);
-
-	/*
-	 * We cannot cache fd_ofiles or fd_ofileflags since operations
-	 * may block and rip them out from under us.
-	 */
 	for (i = 0; i <= fdp->fd_lastfile; i++) {
-		if (fdp->fd_ofiles[i] != NULL &&
-		    (fdp->fd_ofiles[i]->f_type == DTYPE_MQUEUE ||
-		    (fdp->fd_ofileflags[i] & UF_EXCLOSE))) {
-			struct file *fp;
-
-			knote_fdclose(td, i);
-			/*
-			 * NULL-out descriptor prior to close to avoid
-			 * a race while close blocks.
-			 */
-			fp = fdp->fd_ofiles[i];
-			fdp->fd_ofiles[i] = NULL;
-			fdp->fd_ofileflags[i] = 0;
-			fdunused(fdp, i);
-			if (fp->f_type == DTYPE_MQUEUE)
-				mq_fdclose(td, i, fp);
-			FILEDESC_XUNLOCK(fdp);
-			(void) closef(fp, td);
+		fde = &fdp->fd_ofiles[i];
+		fp = fde->fde_file;
+		if (fp != NULL && (fp->f_type == DTYPE_MQUEUE ||
+		    (fde->fde_flags & UF_EXCLOSE))) {
+			fdfree(fdp, i);
+			(void) closefp(fdp, i, fp, td, 0);
+			/* closefp() drops the FILEDESC lock. */
 			FILEDESC_XLOCK(fdp);
 		}
 	}
@@ -2161,13 +2241,11 @@
 	int i, error, devnull;
 
 	fdp = td->td_proc->p_fd;
-	if (fdp == NULL)
-		return (0);
 	KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
 	devnull = -1;
 	error = 0;
 	for (i = 0; i < 3; i++) {
-		if (fdp->fd_ofiles[i] != NULL)
+		if (fdp->fd_ofiles[i].fde_file != NULL)
 			continue;
 		if (devnull < 0) {
 			save = td->td_retval[0];
@@ -2202,7 +2280,6 @@
 	struct flock lf;
 	struct filedesc_to_leader *fdtol;
 	struct filedesc *fdp;
-	struct file *fp_object;
 
 	/*
 	 * POSIX record locking dictates that any close releases ALL
@@ -2215,16 +2292,9 @@
 	 * NULL thread pointer when there really is no owning
 	 * context that might have locks, or the locks will be
 	 * leaked.
-	 *
-	 * If this is a capability, we do lock processing under the underlying
-	 * node, not the capability itself.
 	 */
-	(void)cap_funwrap(fp, 0, &fp_object);
-	if ((fp_object->f_type == DTYPE_VNODE) && (td != NULL)) {
-		int vfslocked;
-
-		vp = fp_object->f_vnode;
-		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+	if (fp->f_type == DTYPE_VNODE && td != NULL) {
+		vp = fp->f_vnode;
 		if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
 			lf.l_whence = SEEK_SET;
 			lf.l_start = 0;
@@ -2231,7 +2301,7 @@
 			lf.l_len = 0;
 			lf.l_type = F_UNLCK;
 			(void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader,
-					   F_UNLCK, &lf, F_POSIX);
+			    F_UNLCK, &lf, F_POSIX);
 		}
 		fdtol = td->td_proc->p_fdtol;
 		if (fdtol != NULL) {
@@ -2242,10 +2312,10 @@
 			fdp = td->td_proc->p_fd;
 			FILEDESC_XLOCK(fdp);
 			for (fdtol = fdtol->fdl_next;
-			     fdtol != td->td_proc->p_fdtol;
-			     fdtol = fdtol->fdl_next) {
+			    fdtol != td->td_proc->p_fdtol;
+			    fdtol = fdtol->fdl_next) {
 				if ((fdtol->fdl_leader->p_flag &
-				     P_ADVLOCK) == 0)
+				    P_ADVLOCK) == 0)
 					continue;
 				fdtol->fdl_holdcount++;
 				FILEDESC_XUNLOCK(fdp);
@@ -2253,10 +2323,10 @@
 				lf.l_start = 0;
 				lf.l_len = 0;
 				lf.l_type = F_UNLCK;
-				vp = fp_object->f_vnode;
+				vp = fp->f_vnode;
 				(void) VOP_ADVLOCK(vp,
-						   (caddr_t)fdtol->fdl_leader,
-						   F_UNLCK, &lf, F_POSIX);
+				    (caddr_t)fdtol->fdl_leader, F_UNLCK, &lf,
+				    F_POSIX);
 				FILEDESC_XLOCK(fdp);
 				fdtol->fdl_holdcount--;
 				if (fdtol->fdl_holdcount == 0 &&
@@ -2267,7 +2337,6 @@
 			}
 			FILEDESC_XUNLOCK(fdp);
 		}
-		VFS_UNLOCK_GIANT(vfslocked);
 	}
 	return (fdrop(fp, td));
 }
@@ -2288,15 +2357,28 @@
 	atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops);
 }
 
-struct file *
-fget_unlocked(struct filedesc *fdp, int fd)
+int
+fget_unlocked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
+    int needfcntl, struct file **fpp, cap_rights_t *haverightsp)
 {
+#ifdef CAPABILITIES
+	struct filedescent fde;
+#endif
 	struct file *fp;
 	u_int count;
+#ifdef CAPABILITIES
+	seq_t seq;
+	cap_rights_t haverights;
+	int error;
+#endif
 
-	if (fd < 0 || fd >= fdp->fd_nfiles)
-		return (NULL);
 	/*
+	 * Load fd_nfiles with acquire semantics so that reads of the
+	 * fd_ofiles table cannot be reordered before the bounds check.
+	 */
+	if (fd < 0 || fd >= atomic_load_acq_int(&fdp->fd_nfiles))
+		return (EBADF);
+	/*
 	 * Fetch the descriptor locklessly.  We avoid fdrop() races by
 	 * never raising a refcount above 0.  To accomplish this we have
 	 * to use a cmpset loop rather than an atomic_add.  The descriptor
@@ -2305,9 +2387,32 @@
 	 * due to preemption.
 	 */
 	for (;;) {
-		fp = fdp->fd_ofiles[fd];
+#ifdef CAPABILITIES
+		seq = seq_read(fd_seq(fdp, fd));
+		fde = fdp->fd_ofiles[fd];
+		if (!seq_consistent(fd_seq(fdp, fd), seq)) {
+			cpu_spinwait();
+			continue;
+		}
+		fp = fde.fde_file;
+#else
+		fp = fdp->fd_ofiles[fd].fde_file;
+#endif
 		if (fp == NULL)
-			break;
+			return (EBADF);
+#ifdef CAPABILITIES
+		haverights = *cap_rights_fde(&fde);
+		if (needrightsp != NULL) {
+			error = cap_check(&haverights, needrightsp);
+			if (error != 0)
+				return (error);
+			if (cap_rights_is_set(needrightsp, CAP_FCNTL)) {
+				error = cap_fcntl_check_fde(&fde, needfcntl);
+				if (error != 0)
+					return (error);
+			}
+		}
+#endif
 		count = fp->f_count;
 		if (count == 0)
 			continue;
@@ -2317,12 +2422,23 @@
 		 */
 		if (atomic_cmpset_acq_int(&fp->f_count, count, count + 1) != 1)
 			continue;
-		if (fp == fdp->fd_ofiles[fd])
+#ifdef	CAPABILITIES
+		if (seq_consistent_nomb(fd_seq(fdp, fd), seq))
+#else
+		if (fp == fdp->fd_ofiles[fd].fde_file)
+#endif
 			break;
 		fdrop(fp, curthread);
 	}
-
-	return (fp);
+	*fpp = fp;
+	if (haverightsp != NULL) {
+#ifdef CAPABILITIES
+		*haverightsp = haverights;
+#else
+		CAP_ALL(haverightsp);
+#endif
+	}
+	return (0);
 }
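
The cmpset loop in fget_unlocked() only ever increments f_count when it
is already non-zero, so a file in the final stages of fdrop() can never
be revived.  A sketch of that rule with C11 atomics (a hypothetical obj
type, not the kernel API):

#include <stdatomic.h>
#include <stdbool.h>

struct obj {
	atomic_uint refs;	/* 0 means the object is being torn down */
};

static bool
obj_try_hold(struct obj *o)
{
	unsigned int old;

	old = atomic_load(&o->refs);
	for (;;) {
		if (old == 0)
			return (false);	/* too late, redo the fd lookup */
		/* Compare-and-swap so that 0 -> 1 can never happen here. */
		if (atomic_compare_exchange_weak(&o->refs, &old, old + 1))
			return (true);
	}
}

Even after a successful hold, the code above re-validates, via
seq_consistent_nomb() or a plain pointer comparison, that the table slot
still names the same file, and drops the reference and retries if not.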
 
 /*
@@ -2332,33 +2448,33 @@
  * If the descriptor doesn't exist or doesn't match 'flags', EBADF is
  * returned.
  *
- * If the FGET_GETCAP flag is set, the capability itself will be returned.
- * Calling _fget() with FGET_GETCAP on a non-capability will return EINVAL.
- * Otherwise, if the file is a capability, its rights will be checked against
- * the capability rights mask, and if successful, the object will be unwrapped.
+ * File's rights will be checked against the capability rights mask.
  *
- * If an error occured the non-zero error is returned and *fpp is set to
+ * If an error occurred the non-zero error is returned and *fpp is set to
  * NULL.  Otherwise *fpp is held and set and zero is returned.  Caller is
  * responsible for fdrop().
  */
-#define	FGET_GETCAP	0x00000001
 static __inline int
 _fget(struct thread *td, int fd, struct file **fpp, int flags,
-    cap_rights_t needrights, cap_rights_t *haverightsp, u_char *maxprotp,
-    int fget_flags)
+    cap_rights_t *needrightsp, u_char *maxprotp)
 {
 	struct filedesc *fdp;
 	struct file *fp;
-#ifdef CAPABILITIES
-	struct file *fp_fromcap;
-#endif
+	cap_rights_t haverights, needrights;
 	int error;
 
 	*fpp = NULL;
 	if (td == NULL || (fdp = td->td_proc->p_fd) == NULL)
 		return (EBADF);
-	if ((fp = fget_unlocked(fdp, fd)) == NULL)
-		return (EBADF);
+	if (needrightsp != NULL)
+		needrights = *needrightsp;
+	else
+		cap_rights_init(&needrights);
+	if (maxprotp != NULL)
+		cap_rights_set(&needrights, CAP_MMAP);
+	error = fget_unlocked(fdp, fd, &needrights, 0, &fp, &haverights);
+	if (error != 0)
+		return (error);
 	if (fp->f_ops == &badfileops) {
 		fdrop(fp, td);
 		return (EBADF);
@@ -2366,50 +2482,11 @@
 
 #ifdef CAPABILITIES
 	/*
-	 * If this is a capability, what rights does it have?
+	 * If requested, convert capability rights to access flags.
 	 */
-	if (haverightsp != NULL) {
-		if (fp->f_type == DTYPE_CAPABILITY)
-			*haverightsp = cap_rights(fp);
-		else
-			*haverightsp = CAP_MASK_VALID;
-	}
-
-	/*
-	 * If a capability has been requested, return the capability directly.
-	 * Otherwise, check capability rights, extract the underlying object,
-	 * and check its access flags.
-	 */
-	if (fget_flags & FGET_GETCAP) {
-		if (fp->f_type != DTYPE_CAPABILITY) {
-			fdrop(fp, td);
-			return (EINVAL);
-		}
-	} else {
-		if (maxprotp == NULL)
-			error = cap_funwrap(fp, needrights, &fp_fromcap);
-		else
-			error = cap_funwrap_mmap(fp, needrights, maxprotp,
-			    &fp_fromcap);
-		if (error != 0) {
-			fdrop(fp, td);
-			return (error);
-		}
-
-		/*
-		 * If we've unwrapped a file, drop the original capability
-		 * and hold the new descriptor.  fp after this point refers to
-		 * the actual (unwrapped) object, not the capability.
-		 */
-		if (fp != fp_fromcap) {
-			fhold(fp_fromcap);
-			fdrop(fp, td);
-			fp = fp_fromcap;
-		}
-	}
+	if (maxprotp != NULL)
+		*maxprotp = cap_rights_to_vmprot(&haverights);
 #else /* !CAPABILITIES */
-	KASSERT(fp->f_type != DTYPE_CAPABILITY,
-	    ("%s: saw capability", __func__));
 	if (maxprotp != NULL)
 		*maxprotp = VM_PROT_ALL;
 #endif /* CAPABILITIES */
@@ -2445,48 +2522,35 @@
 }
 
 int
-fget(struct thread *td, int fd, cap_rights_t rights, struct file **fpp)
+fget(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
 {
 
-	return(_fget(td, fd, fpp, 0, rights, NULL, NULL, 0));
+	return(_fget(td, fd, fpp, 0, rightsp, NULL));
 }
 
 int
-fget_mmap(struct thread *td, int fd, cap_rights_t rights, u_char *maxprotp,
+fget_mmap(struct thread *td, int fd, cap_rights_t *rightsp, u_char *maxprotp,
     struct file **fpp)
 {
 
-	return (_fget(td, fd, fpp, 0, rights, NULL, maxprotp, 0));
+	return (_fget(td, fd, fpp, 0, rightsp, maxprotp));
 }
 
 int
-fget_read(struct thread *td, int fd, cap_rights_t rights, struct file **fpp)
+fget_read(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
 {
 
-	return(_fget(td, fd, fpp, FREAD, rights, NULL, NULL, 0));
+	return(_fget(td, fd, fpp, FREAD, rightsp, NULL));
 }
 
 int
-fget_write(struct thread *td, int fd, cap_rights_t rights, struct file **fpp)
+fget_write(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
 {
 
-	return (_fget(td, fd, fpp, FWRITE, rights, NULL, NULL, 0));
+	return (_fget(td, fd, fpp, FWRITE, rightsp, NULL));
 }
 
 /*
- * Unlike the other fget() calls, which accept and check capability rights
- * but never return capabilities, fgetcap() returns the capability but doesn't
- * check capability rights.
- */
-int
-fgetcap(struct thread *td, int fd, struct file **fpp)
-{
-
-	return (_fget(td, fd, fpp, 0, 0, NULL, NULL, FGET_GETCAP));
-}
-
-
-/*
  * Like fget() but loads the underlying vnode, or returns an error if the
  * descriptor does not represent a vnode.  Note that pipes use vnodes but
  * never have VM objects.  The returned vnode will be vref()'d.
@@ -2494,15 +2558,15 @@
  * XXX: what about the unused flags ?
  */
 static __inline int
-_fgetvp(struct thread *td, int fd, int flags, cap_rights_t needrights,
-    cap_rights_t *haverightsp, struct vnode **vpp)
+_fgetvp(struct thread *td, int fd, int flags, cap_rights_t *needrightsp,
+    struct vnode **vpp)
 {
 	struct file *fp;
 	int error;
 
 	*vpp = NULL;
-	if ((error = _fget(td, fd, &fp, flags, needrights, haverightsp,
-	    NULL, 0)) != 0)
+	error = _fget(td, fd, &fp, flags, needrightsp, NULL);
+	if (error != 0)
 		return (error);
 	if (fp->f_vnode == NULL) {
 		error = EINVAL;
@@ -2516,40 +2580,68 @@
 }
 
 int
-fgetvp(struct thread *td, int fd, cap_rights_t rights, struct vnode **vpp)
+fgetvp(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
 {
 
-	return (_fgetvp(td, fd, 0, rights, NULL, vpp));
+	return (_fgetvp(td, fd, 0, rightsp, vpp));
 }
 
 int
-fgetvp_rights(struct thread *td, int fd, cap_rights_t need, cap_rights_t *have,
-    struct vnode **vpp)
+fgetvp_rights(struct thread *td, int fd, cap_rights_t *needrightsp,
+    struct filecaps *havecaps, struct vnode **vpp)
 {
-	return (_fgetvp(td, fd, 0, need, have, vpp));
+	struct filedesc *fdp;
+	struct file *fp;
+#ifdef CAPABILITIES
+	int error;
+#endif
+
+	if (td == NULL || (fdp = td->td_proc->p_fd) == NULL)
+		return (EBADF);
+
+	fp = fget_locked(fdp, fd);
+	if (fp == NULL || fp->f_ops == &badfileops)
+		return (EBADF);
+
+#ifdef CAPABILITIES
+	if (needrightsp != NULL) {
+		error = cap_check(cap_rights(fdp, fd), needrightsp);
+		if (error != 0)
+			return (error);
+	}
+#endif
+
+	if (fp->f_vnode == NULL)
+		return (EINVAL);
+
+	*vpp = fp->f_vnode;
+	vref(*vpp);
+	filecaps_copy(&fdp->fd_ofiles[fd].fde_caps, havecaps);
+
+	return (0);
 }
 
 int
-fgetvp_read(struct thread *td, int fd, cap_rights_t rights, struct vnode **vpp)
+fgetvp_read(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
 {
 
-	return (_fgetvp(td, fd, FREAD, rights, NULL, vpp));
+	return (_fgetvp(td, fd, FREAD, rightsp, vpp));
 }
 
 int
-fgetvp_exec(struct thread *td, int fd, cap_rights_t rights, struct vnode **vpp)
+fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
 {
 
-	return (_fgetvp(td, fd, FEXEC, rights, NULL, vpp));
+	return (_fgetvp(td, fd, FEXEC, rightsp, vpp));
 }
 
 #ifdef notyet
 int
-fgetvp_write(struct thread *td, int fd, cap_rights_t rights,
+fgetvp_write(struct thread *td, int fd, cap_rights_t *rightsp,
     struct vnode **vpp)
 {
 
-	return (_fgetvp(td, fd, FWRITE, rights, NULL, vpp));
+	return (_fgetvp(td, fd, FWRITE, rightsp, vpp));
 }
 #endif
 
@@ -2565,7 +2657,7 @@
  * during use.
  */
 int
-fgetsock(struct thread *td, int fd, cap_rights_t rights, struct socket **spp,
+fgetsock(struct thread *td, int fd, cap_rights_t *rightsp, struct socket **spp,
     u_int *fflagp)
 {
 	struct file *fp;
@@ -2574,7 +2666,7 @@
 	*spp = NULL;
 	if (fflagp != NULL)
 		*fflagp = 0;
-	if ((error = _fget(td, fd, &fp, 0, rights, NULL, NULL, 0)) != 0)
+	if ((error = _fget(td, fd, &fp, 0, rightsp, NULL)) != 0)
 		return (error);
 	if (fp->f_type != DTYPE_SOCKET) {
 		error = ENOTSOCK;
@@ -2610,9 +2702,6 @@
 
 /*
  * Handle the last reference to a file being closed.
- *
- * No special capability handling here, as the capability's fo_close will run
- * instead of the object here, and perform any necessary drop on the object.
  */
 int
 _fdrop(struct file *fp, struct thread *td)
@@ -2651,10 +2740,11 @@
 	struct file *fp;
 	struct vnode *vp;
 	struct flock lf;
-	int vfslocked;
+	cap_rights_t rights;
 	int error;
 
-	if ((error = fget(td, uap->fd, CAP_FLOCK, &fp)) != 0)
+	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FLOCK), &fp);
+	if (error != 0)
 		return (error);
 	if (fp->f_type != DTYPE_VNODE) {
 		fdrop(fp, td);
@@ -2662,7 +2752,6 @@
 	}
 
 	vp = fp->f_vnode;
-	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	lf.l_whence = SEEK_SET;
 	lf.l_start = 0;
 	lf.l_len = 0;
@@ -2685,7 +2774,6 @@
 	    (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT);
 done2:
 	fdrop(fp, td);
-	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 /*
@@ -2692,11 +2780,16 @@
  * Duplicate the specified descriptor to a free descriptor.
  */
 int
-dupfdopen(struct thread *td, struct filedesc *fdp, int indx, int dfd, int mode, int error)
+dupfdopen(struct thread *td, struct filedesc *fdp, int dfd, int mode,
+    int openerror, int *indxp)
 {
-	struct file *wfp;
+	struct filedescent *newfde, *oldfde;
 	struct file *fp;
+	int error, indx;
 
+	KASSERT(openerror == ENODEV || openerror == ENXIO,
+	    ("unexpected error %d in %s", openerror, __func__));
+
 	/*
 	 * If the to-be-dup'd fd number is greater than the allowed number
 	 * of file descriptors, or the fd to be dup'd has already been
@@ -2703,12 +2796,17 @@
 	 * closed, then reject.
 	 */
 	FILEDESC_XLOCK(fdp);
-	if (dfd < 0 || dfd >= fdp->fd_nfiles ||
-	    (wfp = fdp->fd_ofiles[dfd]) == NULL) {
+	if ((fp = fget_locked(fdp, dfd)) == NULL) {
 		FILEDESC_XUNLOCK(fdp);
 		return (EBADF);
 	}
 
+	error = fdalloc(td, 0, &indx);
+	if (error != 0) {
+		FILEDESC_XUNLOCK(fdp);
+		return (error);
+	}
+
 	/*
 	 * There are two cases of interest here.
 	 *
@@ -2716,61 +2814,50 @@
 	 *
 	 * For ENXIO steal away the file structure from (dfd) and store it in
 	 * (indx).  (dfd) is effectively closed by this operation.
-	 *
-	 * Any other error code is just returned.
 	 */
-	switch (error) {
+	switch (openerror) {
 	case ENODEV:
 		/*
 		 * Check that the mode the file is being opened for is a
 		 * subset of the mode of the existing descriptor.
 		 */
-		if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag) {
+		if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) {
+			fdunused(fdp, indx);
 			FILEDESC_XUNLOCK(fdp);
 			return (EACCES);
 		}
-		fp = fdp->fd_ofiles[indx];
-		fdp->fd_ofiles[indx] = wfp;
-		fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
-		if (fp == NULL)
-			fdused(fdp, indx);
-		fhold(wfp);
-		FILEDESC_XUNLOCK(fdp);
-		if (fp != NULL)
-			/*
-			 * We now own the reference to fp that the ofiles[]
-			 * array used to own.  Release it.
-			 */
-			fdrop(fp, td);
-		return (0);
-
+		fhold(fp);
+		newfde = &fdp->fd_ofiles[indx];
+		oldfde = &fdp->fd_ofiles[dfd];
+#ifdef CAPABILITIES
+		seq_write_begin(&newfde->fde_seq);
+#endif
+		memcpy(newfde, oldfde, fde_change_size);
+		filecaps_copy(&oldfde->fde_caps, &newfde->fde_caps);
+#ifdef CAPABILITIES
+		seq_write_end(&newfde->fde_seq);
+#endif
+		break;
 	case ENXIO:
 		/*
 		 * Steal away the file pointer from dfd and stuff it into indx.
 		 */
-		fp = fdp->fd_ofiles[indx];
-		fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd];
-		fdp->fd_ofiles[dfd] = NULL;
-		fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
-		fdp->fd_ofileflags[dfd] = 0;
+		newfde = &fdp->fd_ofiles[indx];
+		oldfde = &fdp->fd_ofiles[dfd];
+#ifdef CAPABILITIES
+		seq_write_begin(&newfde->fde_seq);
+#endif
+		memcpy(newfde, oldfde, fde_change_size);
+		bzero(oldfde, fde_change_size);
 		fdunused(fdp, dfd);
-		if (fp == NULL)
-			fdused(fdp, indx);
-		FILEDESC_XUNLOCK(fdp);
-
-		/*
-		 * We now own the reference to fp that the ofiles[] array
-		 * used to own.  Release it.
-		 */
-		if (fp != NULL)
-			fdrop(fp, td);
-		return (0);
-
-	default:
-		FILEDESC_XUNLOCK(fdp);
-		return (error);
+#ifdef CAPABILITIES
+		seq_write_end(&newfde->fde_seq);
+#endif
+		break;
 	}
-	/* NOTREACHED */
+	FILEDESC_XUNLOCK(fdp);
+	*indxp = indx;
+	return (0);
 }
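
dupfdopen() backs the classic /dev/fd behavior: a device open that fails
with ENODEV is converted into a dup of the already-open descriptor, and
ENXIO steals the file from the source slot.  From userland the ENODEV
path looks like this, assuming a stock devfs /dev/fd/0 node rather than
a mounted fdescfs:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fd;

	/* Behaves like dup(0), subject to the FREAD/FWRITE subset check. */
	fd = open("/dev/fd/0", O_RDONLY);
	if (fd == -1) {
		perror("open");
		return (1);
	}
	printf("descriptor %d now aliases stdin\n", fd);
	close(fd);
	return (0);
}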
 
 /*
@@ -2846,8 +2933,7 @@
 	struct filedesc_to_leader *fdtol;
 
 	fdtol = malloc(sizeof(struct filedesc_to_leader),
-	       M_FILEDESC_TO_LEADER,
-	       M_WAITOK);
+	    M_FILEDESC_TO_LEADER, M_WAITOK);
 	fdtol->fdl_refcount = 1;
 	fdtol->fdl_holdcount = 0;
 	fdtol->fdl_wakeup = 0;
@@ -2919,8 +3005,8 @@
 		if (fdp == NULL)
 			continue;
 		FILEDESC_SLOCK(fdp);
-		for (n = 0; fdp->fd_refcnt > 0 && n < fdp->fd_nfiles; ++n) {
-			if ((fp = fdp->fd_ofiles[n]) == NULL)
+		for (n = 0; fdp->fd_refcnt > 0 && n <= fdp->fd_lastfile; ++n) {
+			if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
 				continue;
 			xf.xf_fd = n;
 			xf.xf_file = fp;
@@ -2944,7 +3030,7 @@
 	return (error);
 }
 
-SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD,
+SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE,
     0, 0, sysctl_kern_file, "S,xfile", "Entire file table");
 
 #ifdef KINFO_OFILE_SIZE
@@ -2958,7 +3044,6 @@
 {
 	int error;
 	char *fullpath, *freepath;
-	int vfslocked;
 
 	bzero(kif, sizeof(*kif));
 	kif->kf_structsize = sizeof(*kif);
@@ -2984,9 +3069,7 @@
 	fullpath = "-";
 	FILEDESC_SUNLOCK(fdp);
 	vn_fullpath(curthread, vp, &fullpath, &freepath);
-	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	vrele(vp);
-	VFS_UNLOCK_GIANT(vfslocked);
 	strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path));
 	if (freepath != NULL)
 		free(freepath, M_TEMP);
@@ -3008,13 +3091,13 @@
 	struct shmfd *shmfd;
 	struct socket *so;
 	struct vnode *vp;
+	struct ksem *ks;
 	struct file *fp;
 	struct proc *p;
 	struct tty *tp;
-	int vfslocked;
 
 	name = (int *)arg1;
-	error = pget((pid_t)name[0], PGET_CANDEBUG, &p);
+	error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
 	if (error != 0)
 		return (error);
 	fdp = fdhold(p);
@@ -3032,11 +3115,12 @@
 	if (fdp->fd_jdir != NULL)
 		export_vnode_for_osysctl(fdp->fd_jdir, KF_FD_TYPE_JAIL, kif,
 				fdp, req);
-	for (i = 0; i < fdp->fd_nfiles; i++) {
-		if ((fp = fdp->fd_ofiles[i]) == NULL)
+	for (i = 0; fdp->fd_refcnt > 0 && i <= fdp->fd_lastfile; i++) {
+		if ((fp = fdp->fd_ofiles[i].fde_file) == NULL)
 			continue;
 		bzero(kif, sizeof(*kif));
 		kif->kf_structsize = sizeof(*kif);
+		ks = NULL;
 		vp = NULL;
 		so = NULL;
 		tp = NULL;
@@ -3043,21 +3127,6 @@
 		shmfd = NULL;
 		kif->kf_fd = i;
 
-#ifdef CAPABILITIES
-		/*
-		 * When reporting a capability, most fields will be from the
-		 * underlying object, but do mark as a capability. With
-		 * ofiledesc, we don't have a field to export the cap_rights_t,
-		 * but we do with the new filedesc.
-		 */
-		if (fp->f_type == DTYPE_CAPABILITY) {
-			kif->kf_flags |= KF_FLAG_CAPABILITY;
-			(void)cap_funwrap(fp, 0, &fp);
-		}
-#else
-		KASSERT(fp->f_type != DTYPE_CAPABILITY,
-		    ("sysctl_kern_proc_ofiledesc: saw capability"));
-#endif
 		switch (fp->f_type) {
 		case DTYPE_VNODE:
 			kif->kf_type = KF_TYPE_VNODE;
@@ -3097,6 +3166,7 @@
 
 		case DTYPE_SEM:
 			kif->kf_type = KF_TYPE_SEM;
+			ks = fp->f_data;
 			break;
 
 		case DTYPE_PTS:
@@ -3175,9 +3245,7 @@
 			fullpath = "-";
 			FILEDESC_SUNLOCK(fdp);
 			vn_fullpath(curthread, vp, &fullpath, &freepath);
-			vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 			vrele(vp);
-			VFS_UNLOCK_GIANT(vfslocked);
 			strlcpy(kif->kf_path, fullpath,
 			    sizeof(kif->kf_path));
 			if (freepath != NULL)
@@ -3208,6 +3276,8 @@
 		}
 		if (shmfd != NULL)
 			shm_path(shmfd, kif->kf_path, sizeof(kif->kf_path));
+		if (ks != NULL && ksem_info != NULL)
+			ksem_info(ks, kif->kf_path, sizeof(kif->kf_path), NULL);
 		error = SYSCTL_OUT(req, kif, sizeof(*kif));
 		if (error)
 			break;
@@ -3218,8 +3288,9 @@
 	return (0);
 }
 
-static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc, CTLFLAG_RD,
-    sysctl_kern_proc_ofiledesc, "Process ofiledesc entries");
+static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc,
+    CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_ofiledesc,
+    "Process ofiledesc entries");
 #endif	/* COMPAT_FREEBSD7 */
 
 #ifdef KINFO_FILE_SIZE
@@ -3226,10 +3297,17 @@
 CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE);
 #endif
 
+struct export_fd_buf {
+	struct filedesc		*fdp;
+	struct sbuf 		*sb;
+	ssize_t			remainder;
+	struct kinfo_file	kif;
+	int			flags;
+};
+
 static int
 export_fd_to_sb(void *data, int type, int fd, int fflags, int refcnt,
-    int64_t offset, int fd_is_cap, cap_rights_t fd_cap_rights,
-    struct kinfo_file *kif, struct sbuf *sb, ssize_t *remainder)
+    int64_t offset, cap_rights_t *rightsp, struct export_fd_buf *efbuf)
 {
 	struct {
 		int	fflag;
@@ -3252,21 +3330,26 @@
 		{ O_TRUNC, KF_FLAG_TRUNC }
 	};
 #define	NFFLAGS	(sizeof(fflags_table) / sizeof(*fflags_table))
+	struct kinfo_file *kif;
 	struct vnode *vp;
-	int error, vfslocked;
+	int error, locked;
 	unsigned int i;
 
-	if (*remainder == 0)
+	if (efbuf->remainder == 0)
 		return (0);
+	kif = &efbuf->kif;
 	bzero(kif, sizeof(*kif));
+	locked = efbuf->fdp != NULL;
 	switch (type) {
 	case KF_TYPE_FIFO:
 	case KF_TYPE_VNODE:
+		if (locked) {
+			FILEDESC_SUNLOCK(efbuf->fdp);
+			locked = 0;
+		}
 		vp = (struct vnode *)data;
 		error = fill_vnode_info(vp, kif);
-		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 		vrele(vp);
-		VFS_UNLOCK_GIANT(vfslocked);
 		break;
 	case KF_TYPE_SOCKET:
 		error = fill_socket_info((struct socket *)data, kif);
@@ -3280,6 +3363,9 @@
 	case KF_TYPE_PROCDESC:
 		error = fill_procdesc_info((struct procdesc *)data, kif);
 		break;
+	case KF_TYPE_SEM:
+		error = fill_sem_info((struct file *)data, kif);
+		break;
 	case KF_TYPE_SHM:
 		error = fill_shm_info((struct file *)data, kif);
 		break;
@@ -3295,27 +3381,36 @@
 	for (i = 0; i < NFFLAGS; i++)
 		if (fflags & fflags_table[i].fflag)
 			kif->kf_flags |=  fflags_table[i].kf_fflag;
-	if (fd_is_cap)
-		kif->kf_flags |= KF_FLAG_CAPABILITY;
-	if (fd_is_cap)
-		kif->kf_cap_rights = fd_cap_rights;
+	if (rightsp != NULL)
+		kif->kf_cap_rights = *rightsp;
+	else
+		cap_rights_init(&kif->kf_cap_rights);
 	kif->kf_fd = fd;
 	kif->kf_type = type;
 	kif->kf_ref_count = refcnt;
 	kif->kf_offset = offset;
-	/* Pack record size down */
-	kif->kf_structsize = offsetof(struct kinfo_file, kf_path) +
-	    strlen(kif->kf_path) + 1;
+	if ((efbuf->flags & KERN_FILEDESC_PACK_KINFO) != 0)
+		/* Pack record size down */
+		kif->kf_structsize = offsetof(struct kinfo_file, kf_path) +
+		    strlen(kif->kf_path) + 1;
+	else
+		kif->kf_structsize = sizeof(*kif);
 	kif->kf_structsize = roundup(kif->kf_structsize, sizeof(uint64_t));
-	if (*remainder != -1) {
-		if (*remainder < kif->kf_structsize) {
+	if (efbuf->remainder != -1) {
+		if (efbuf->remainder < kif->kf_structsize) {
 			/* Terminate export. */
-			*remainder = 0;
+			efbuf->remainder = 0;
+			if (efbuf->fdp != NULL && !locked)
+				FILEDESC_SLOCK(efbuf->fdp);
 			return (0);
 		}
-		*remainder -= kif->kf_structsize;
+		efbuf->remainder -= kif->kf_structsize;
 	}
-	error = sbuf_bcat(sb, kif, kif->kf_structsize);
+	if (locked)
+		FILEDESC_SUNLOCK(efbuf->fdp);
+	error = sbuf_bcat(efbuf->sb, kif, kif->kf_structsize) == 0 ? 0 : ENOMEM;
+	if (efbuf->fdp != NULL)
+		FILEDESC_SLOCK(efbuf->fdp);
 	return (error);
 }
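
The export_fd_buf conversion concentrates the lock juggling in one
place: the filedesc shared lock is dropped around anything that may
sleep (the vrele() of a vnode, the sbuf append) and re-taken before
returning, so the caller's loop always finds it held.  Condensed, the
pattern looks like this (a hypothetical helper; the real function also
charges the record against efbuf->remainder):

static int
export_one(struct export_fd_buf *efbuf, const void *rec, size_t len)
{
	int error;

	if (efbuf->fdp != NULL)
		FILEDESC_SUNLOCK(efbuf->fdp);	/* sbuf_bcat() may sleep */
	error = sbuf_bcat(efbuf->sb, rec, len) == 0 ? 0 : ENOMEM;
	if (efbuf->fdp != NULL)
		FILEDESC_SLOCK(efbuf->fdp);	/* caller expects it held */
	return (error);
}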
 
@@ -3325,22 +3420,21 @@
  * Takes a locked proc as argument, and returns with the proc unlocked.
  */
 int
-kern_proc_filedesc_out(struct proc *p,  struct sbuf *sb, ssize_t maxlen)
+kern_proc_filedesc_out(struct proc *p,  struct sbuf *sb, ssize_t maxlen,
+    int flags)
 {
 	struct file *fp;
 	struct filedesc *fdp;
-	struct kinfo_file *kif;
+	struct export_fd_buf *efbuf;
 	struct vnode *cttyvp, *textvp, *tracevp;
 	int64_t offset;
 	void *data;
-	ssize_t remainder;
 	int error, i;
-	int fd_is_cap, type, refcnt, fflags;
-	cap_rights_t fd_cap_rights;
+	int type, refcnt, fflags;
+	cap_rights_t rights;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
-	remainder = maxlen;
 	/* ktrace vnode */
 	tracevp = p->p_tracevp;
 	if (tracevp != NULL)
@@ -3358,68 +3452,54 @@
 	}
 	fdp = fdhold(p);
 	PROC_UNLOCK(p);
-	kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK);
+	efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK);
+	efbuf->fdp = NULL;
+	efbuf->sb = sb;
+	efbuf->remainder = maxlen;
+	efbuf->flags = flags;
 	if (tracevp != NULL)
 		export_fd_to_sb(tracevp, KF_TYPE_VNODE, KF_FD_TYPE_TRACE,
-		    FREAD | FWRITE, -1, -1, 0, 0, kif, sb, &remainder);
+		    FREAD | FWRITE, -1, -1, NULL, efbuf);
 	if (textvp != NULL)
 		export_fd_to_sb(textvp, KF_TYPE_VNODE, KF_FD_TYPE_TEXT,
-		    FREAD, -1, -1, 0, 0, kif, sb, &remainder);
+		    FREAD, -1, -1, NULL, efbuf);
 	if (cttyvp != NULL)
 		export_fd_to_sb(cttyvp, KF_TYPE_VNODE, KF_FD_TYPE_CTTY,
-		    FREAD | FWRITE, -1, -1, 0, 0, kif, sb, &remainder);
+		    FREAD | FWRITE, -1, -1, NULL, efbuf);
 	error = 0;
 	if (fdp == NULL)
 		goto fail;
+	efbuf->fdp = fdp;
 	FILEDESC_SLOCK(fdp);
 	/* working directory */
 	if (fdp->fd_cdir != NULL) {
 		vref(fdp->fd_cdir);
 		data = fdp->fd_cdir;
-		FILEDESC_SUNLOCK(fdp);
 		export_fd_to_sb(data, KF_TYPE_VNODE, KF_FD_TYPE_CWD,
-		    FREAD, -1, -1, 0, 0, kif, sb, &remainder);
-		FILEDESC_SLOCK(fdp);
+		    FREAD, -1, -1, NULL, efbuf);
 	}
 	/* root directory */
 	if (fdp->fd_rdir != NULL) {
 		vref(fdp->fd_rdir);
 		data = fdp->fd_rdir;
-		FILEDESC_SUNLOCK(fdp);
 		export_fd_to_sb(data, KF_TYPE_VNODE, KF_FD_TYPE_ROOT,
-		    FREAD, -1, -1, 0, 0, kif, sb, &remainder);
-		FILEDESC_SLOCK(fdp);
+		    FREAD, -1, -1, NULL, efbuf);
 	}
 	/* jail directory */
 	if (fdp->fd_jdir != NULL) {
 		vref(fdp->fd_jdir);
 		data = fdp->fd_jdir;
-		FILEDESC_SUNLOCK(fdp);
 		export_fd_to_sb(data, KF_TYPE_VNODE, KF_FD_TYPE_JAIL,
-		    FREAD, -1, -1, 0, 0, kif, sb, &remainder);
-		FILEDESC_SLOCK(fdp);
+		    FREAD, -1, -1, NULL, efbuf);
 	}
-	for (i = 0; i < fdp->fd_nfiles; i++) {
-		if ((fp = fdp->fd_ofiles[i]) == NULL)
+	for (i = 0; fdp->fd_refcnt > 0 && i <= fdp->fd_lastfile; i++) {
+		if ((fp = fdp->fd_ofiles[i].fde_file) == NULL)
 			continue;
 		data = NULL;
-		fd_is_cap = 0;
-		fd_cap_rights = 0;
-
 #ifdef CAPABILITIES
-		/*
-		 * When reporting a capability, most fields will be from the
-		 * underlying object, but do mark as a capability and export
-		 * the capability rights mask.
-		 */
-		if (fp->f_type == DTYPE_CAPABILITY) {
-			fd_is_cap = 1;
-			fd_cap_rights = cap_rights(fp);
-			(void)cap_funwrap(fp, 0, &fp);
-		}
+		rights = *cap_rights(fdp, i);
 #else /* !CAPABILITIES */
-		KASSERT(fp->f_type != DTYPE_CAPABILITY,
-		    ("sysctl_kern_proc_filedesc: saw capability"));
+		cap_rights_init(&rights);
 #endif
 		switch (fp->f_type) {
 		case DTYPE_VNODE:
@@ -3463,6 +3543,7 @@
 
 		case DTYPE_SEM:
 			type = KF_TYPE_SEM;
+			data = fp;
 			break;
 
 		case DTYPE_PTS:
@@ -3491,20 +3572,15 @@
 		 * re-validate and re-evaluate its properties when
 		 * the loop continues.
 		 */
-		if (type == KF_TYPE_VNODE || type == KF_TYPE_FIFO)
-			FILEDESC_SUNLOCK(fdp);
 		error = export_fd_to_sb(data, type, i, fflags, refcnt,
-		    offset, fd_is_cap, fd_cap_rights, kif, sb, &remainder);
-		if (type == KF_TYPE_VNODE || type == KF_TYPE_FIFO)
-			FILEDESC_SLOCK(fdp);
-		if (error)
+		    offset, &rights, efbuf);
+		if (error != 0)
 			break;
 	}
 	FILEDESC_SUNLOCK(fdp);
+	fddrop(fdp);
 fail:
-	if (fdp != NULL)
-		fddrop(fdp);
-	free(kif, M_TEMP);
+	free(efbuf, M_TEMP);
 	return (error);
 }
 
@@ -3524,13 +3600,14 @@
 	name = (int *)arg1;
 
 	sbuf_new_for_sysctl(&sb, NULL, FILEDESC_SBUF_SIZE, req);
-	error = pget((pid_t)name[0], PGET_CANDEBUG, &p);
+	error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
 	if (error != 0) {
 		sbuf_delete(&sb);
 		return (error);
 	}
 	maxlen = req->oldptr != NULL ? req->oldlen : -1;
-	error = kern_proc_filedesc_out(p, &sb, maxlen);
+	error = kern_proc_filedesc_out(p, &sb, maxlen,
+	    KERN_FILEDESC_PACK_KINFO);
 	error2 = sbuf_finish(&sb);
 	sbuf_delete(&sb);
 	return (error != 0 ? error : error2);
@@ -3568,12 +3645,30 @@
 	return (KF_VTYPE_UNKNOWN);
 }
 
+static inline void
+vn_fill_junk(struct kinfo_file *kif)
+{
+	size_t len, olen;
+
+	/*
+	 * Simulate vn_fullpath returning changing values for a given
+	 * vp during e.g. coredump.
+	 */
+	len = (arc4random() % (sizeof(kif->kf_path) - 2)) + 1;
+	olen = strlen(kif->kf_path);
+	if (len < olen)
+		strcpy(&kif->kf_path[len - 1], "$");
+	else
+		for (; olen < len; olen++)
+			strcpy(&kif->kf_path[olen], "A");
+}
+
 static int
 fill_vnode_info(struct vnode *vp, struct kinfo_file *kif)
 {
 	struct vattr va;
 	char *fullpath, *freepath;
-	int error, vfslocked;
+	int error;
 
 	if (vp == NULL)
 		return (1);
@@ -3587,16 +3682,18 @@
 	if (freepath != NULL)
 		free(freepath, M_TEMP);
 
+	KFAIL_POINT_CODE(DEBUG_FP, fill_kinfo_vnode__random_path,
+		vn_fill_junk(kif);
+	);
+
 	/*
 	 * Retrieve vnode attributes.
 	 */
 	va.va_fsid = VNOVAL;
 	va.va_rdev = NODEV;
-	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 	error = VOP_GETATTR(vp, &va, curthread->td_ucred);
 	VOP_UNLOCK(vp, 0);
-	VFS_UNLOCK_GIANT(vfslocked);
 	if (error != 0)
 		return (error);
 	if (va.va_fsid != VNOVAL)
@@ -3699,6 +3796,25 @@
 }
 
 static int
+fill_sem_info(struct file *fp, struct kinfo_file *kif)
+{
+	struct thread *td;
+	struct stat sb;
+
+	td = curthread;
+	if (fp->f_data == NULL)
+		return (1);
+	if (fo_stat(fp, &sb, td->td_ucred, td) != 0)
+		return (1);
+	if (ksem_info == NULL)
+		return (1);
+	ksem_info(fp->f_data, kif->kf_path, sizeof(kif->kf_path),
+	    &kif->kf_un.kf_sem.kf_sem_value);
+	kif->kf_un.kf_sem.kf_sem_mode = sb.st_mode;
+	return (0);
+}
+
+static int
 fill_shm_info(struct file *fp, struct kinfo_file *kif)
 {
 	struct thread *td;
@@ -3715,8 +3831,9 @@
 	return (0);
 }
 
-static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc, CTLFLAG_RD,
-    sysctl_kern_proc_filedesc, "Process filedesc entries");
+static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc,
+    CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_filedesc,
+    "Process filedesc entries");
 
 #ifdef DDB
 /*
@@ -3771,8 +3888,8 @@
 		fdp = p->p_fd;
 		if (fdp == NULL)
 			continue;
-		for (n = 0; n < fdp->fd_nfiles; n++) {
-			if (fp == fdp->fd_ofiles[n])
+		for (n = 0; n <= fdp->fd_lastfile; n++) {
+			if (fp == fdp->fd_ofiles[n].fde_file)
 				return (p);
 		}
 	}
@@ -3821,8 +3938,8 @@
 			continue;
 		if ((fdp = p->p_fd) == NULL)
 			continue;
-		for (n = 0; n < fdp->fd_nfiles; ++n) {
-			if ((fp = fdp->fd_ofiles[n]) == NULL)
+		for (n = 0; n <= fdp->fd_lastfile; ++n) {
+			if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
 				continue;
 			db_print_file(fp, header);
 			header = 0;
@@ -3924,6 +4041,15 @@
 	return (EBADF);
 }
 
+static int
+badfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
+    struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
+    int kflags, struct thread *td)
+{
+
+	return (EBADF);
+}
+
 struct fileops badfileops = {
 	.fo_read = badfo_readwrite,
 	.fo_write = badfo_readwrite,
@@ -3935,6 +4061,7 @@
 	.fo_close = badfo_close,
 	.fo_chmod = badfo_chmod,
 	.fo_chown = badfo_chown,
+	.fo_sendfile = badfo_sendfile,
 };
 
 int
@@ -3953,6 +4080,15 @@
 	return (EINVAL);
 }
 
+int
+invfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
+    struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
+    int kflags, struct thread *td)
+{
+
+	return (EINVAL);
+}
+
 /*-------------------------------------------------------------------*/
 
 /*

Modified: trunk/sys/kern/kern_environment.c
===================================================================
--- trunk/sys/kern/kern_environment.c	2018-05-25 20:39:59 UTC (rev 9943)
+++ trunk/sys/kern/kern_environment.c	2018-05-25 20:46:51 UTC (rev 9944)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1998 Michael Smith
  * All rights reserved.
@@ -35,7 +36,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/kern_environment.c 294683 2016-01-24 21:04:06Z ian $");
 
 #include <sys/types.h>
 #include <sys/param.h>
@@ -210,12 +211,44 @@
 	return (error);
 }
 
+/*
+ * Populate the initial kernel environment.
+ *
+ * This is called very early in MD startup, either to provide a copy of the
+ * environment obtained from a boot loader, or to provide an empty buffer into
+ * which MD code can store an initial environment using kern_setenv() calls.
+ *
+ * If the global envmode is 1, the environment is initialized from the global
+ * static_env[], regardless of the arguments passed.  This implements the env
+ * keyword described in config(5).  In this case env_pos is set to env_len,
+ * causing kern_setenv() to return -1 (if len > 0) or panic (if len == 0) until
+ * the dynamic environment is available.  The envmode and static_env variables
+ * are defined in env.c which is generated by config(8).
+ *
+ * If len is non-zero, the caller is providing an empty buffer.  The caller will
+ * subsequently use kern_setenv() to add up to len bytes of initial environment
+ * before the dynamic environment is available.
+ *
+ * If len is zero, the caller is providing a pre-loaded buffer containing
+ * environment strings.  Additional strings cannot be added until the dynamic
+ * environment is available.  The memory pointed to must remain stable at least
+ * until sysinit runs init_dynamic_kenv().  If no initial environment is
+ * available from the boot loader, passing a NULL pointer allows the static_env
+ * to be installed if it is configured.
+ */
 void
 init_static_kenv(char *buf, size_t len)
 {
-	kern_envp = buf;
-	env_len = len;
-	env_pos = 0;
+
+	if (envmode == 1) {
+		kern_envp = static_env;
+		env_len = len;
+		env_pos = len;
+	} else {
+		kern_envp = buf;
+		env_len = len;
+		env_pos = 0;
+	}
 }
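
The buffer handed to init_static_kenv() is a sequence of NUL-terminated
"name=value" strings closed off by an empty string.  A userland walk
over that layout, with made-up entry names (the kernel iterates with
kernenv_next()):

#include <stdio.h>
#include <string.h>

/* Two entries; the literal's implicit final NUL forms the empty string
 * that terminates the list. */
static const char demo_env[] =
    "example.tunable=1\0"
    "example.mode=quiet\0";

int
main(void)
{
	const char *cp;

	for (cp = demo_env; *cp != '\0'; cp += strlen(cp) + 1)
		printf("%s\n", cp);
	return (0);
}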
 
 /*
@@ -231,20 +264,23 @@
 	kenvp = malloc((KENV_SIZE + 1) * sizeof(char *), M_KENV,
 		M_WAITOK | M_ZERO);
 	i = 0;
-	for (cp = kern_envp; cp != NULL; cp = kernenv_next(cp)) {
-		len = strlen(cp) + 1;
-		if (len > KENV_MNAMELEN + 1 + KENV_MVALLEN + 1) {
-			printf("WARNING: too long kenv string, ignoring %s\n",
-			    cp);
-			continue;
+	if (kern_envp && *kern_envp != '\0') {
+		for (cp = kern_envp; cp != NULL; cp = kernenv_next(cp)) {
+			len = strlen(cp) + 1;
+			if (len > KENV_MNAMELEN + 1 + KENV_MVALLEN + 1) {
+				printf(
+				"WARNING: too long kenv string, ignoring %s\n",
+				    cp);
+				continue;
+			}
+			if (i < KENV_SIZE) {
+				kenvp[i] = malloc(len, M_KENV, M_WAITOK);
+				strcpy(kenvp[i++], cp);
+			} else
+				printf(
+				"WARNING: too many kenv strings, ignoring %s\n",
+				    cp);
 		}
-		if (i < KENV_SIZE) {
-			kenvp[i] = malloc(len, M_KENV, M_WAITOK);
-			strcpy(kenvp[i++], cp);
-		} else
-			printf(
-			    "WARNING: too many kenv strings, ignoring %s\n",
-			    cp);
 	}
 	kenvp[i] = NULL;
 
@@ -312,20 +348,12 @@
 getenv(const char *name)
 {
 	char buf[KENV_MNAMELEN + 1 + KENV_MVALLEN + 1];
-	char *ret, *cp;
-	int len;
+	char *ret;
 
 	if (dynamic_kenv) {
-		mtx_lock(&kenv_lock);
-		cp = _getenv_dynamic(name, NULL);
-		if (cp != NULL) {
-			strcpy(buf, cp);
-			mtx_unlock(&kenv_lock);
-			len = strlen(buf) + 1;
-			ret = malloc(len, M_KENV, M_WAITOK);
-			strcpy(ret, buf);
+		if (getenv_string(name, buf, sizeof(buf))) {
+			ret = strdup(buf, M_KENV);
 		} else {
-			mtx_unlock(&kenv_lock);
 			ret = NULL;
 			WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
 			    "getenv");
@@ -455,15 +483,20 @@
 int
 getenv_string(const char *name, char *data, int size)
 {
-	char *tmp;
+	char *cp;
 
-	tmp = getenv(name);
-	if (tmp != NULL) {
-		strlcpy(data, tmp, size);
-		freeenv(tmp);
-		return (1);
-	} else
-		return (0);
+	if (dynamic_kenv) {
+		mtx_lock(&kenv_lock);
+		cp = _getenv_dynamic(name, NULL);
+		if (cp != NULL)
+			strlcpy(data, cp, size);
+		mtx_unlock(&kenv_lock);
+	} else {
+		cp = _getenv_static(name);
+		if (cp != NULL)
+			strlcpy(data, cp, size);
+	}
+	return (cp != NULL);
 }
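
Because getenv_string() now searches the static environment directly
when the dynamic kenv is not yet set up, early boot code can query
tunables without allocating.  A sketch of a kernel-side caller, where
"example.mode" is a made-up tunable name:

	char val[KENV_MVALLEN + 1];

	if (getenv_string("example.mode", val, sizeof(val)))
		printf("example.mode=%s\n", val);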
 
 /*
@@ -532,18 +565,15 @@
 int
 getenv_quad(const char *name, quad_t *data)
 {
-	char	*value;
+	char	value[KENV_MNAMELEN + 1 + KENV_MVALLEN + 1];
 	char	*vtp;
 	quad_t	iv;
 
-	value = getenv(name);
-	if (value == NULL)
+	if (!getenv_string(name, value, sizeof(value)))
 		return (0);
 	iv = strtoq(value, &vtp, 0);
-	if (vtp == value || (vtp[0] != '\0' && vtp[1] != '\0')) {
-		freeenv(value);
+	if (vtp == value || (vtp[0] != '\0' && vtp[1] != '\0'))
 		return (0);
-	}
 	switch (vtp[0]) {
 	case 't': case 'T':
 		iv *= 1024;
@@ -556,11 +586,9 @@
 	case '\0':
 		break;
 	default:
-		freeenv(value);
 		return (0);
 	}
 	*data = iv;
-	freeenv(value);
 	return (1);
 }
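
getenv_quad() keeps its parsing rules but now reads from a stack buffer:
a number in any strtoq(3) base plus at most one scale suffix, where the
switch falls through so that each step multiplies by another 1024 ('t'
ends up scaling by 1024^4).  A standalone userland model of the parser:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static int
parse_scaled(const char *value, int64_t *data)
{
	char *vtp;
	int64_t iv;

	iv = strtoll(value, &vtp, 0);
	if (vtp == value || (vtp[0] != '\0' && vtp[1] != '\0'))
		return (0);		/* no digits, or trailing junk */
	switch (vtp[0]) {
	case 't': case 'T':
		iv *= 1024;
		/* FALLTHROUGH */
	case 'g': case 'G':
		iv *= 1024;
		/* FALLTHROUGH */
	case 'm': case 'M':
		iv *= 1024;
		/* FALLTHROUGH */
	case 'k': case 'K':
		iv *= 1024;
		/* FALLTHROUGH */
	case '\0':
		break;
	default:
		return (0);
	}
	*data = iv;
	return (1);
}

int
main(void)
{
	int64_t v;

	if (parse_scaled("512m", &v))
		printf("512m -> %lld\n", (long long)v);	/* 536870912 */
	return (0);
}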
 

Modified: trunk/sys/kern/kern_et.c
===================================================================
--- trunk/sys/kern/kern_et.c	2018-05-25 20:39:59 UTC (rev 9943)
+++ trunk/sys/kern/kern_et.c	2018-05-25 20:46:51 UTC (rev 9944)
@@ -1,5 +1,6 @@
+/* $MidnightBSD$ */
 /*-
- * Copyright (c) 2010 Alexander Motin <mav at FreeBSD.org>
+ * Copyright (c) 2010-2013 Alexander Motin <mav at FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -25,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/kern_et.c 266347 2014-05-17 20:10:12Z ian $");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
@@ -34,6 +35,8 @@
 #include <sys/queue.h>
 #include <sys/timeet.h>
 
+#include "opt_timer.h"
+
 SLIST_HEAD(et_eventtimers_list, eventtimer);
 static struct et_eventtimers_list eventtimers = SLIST_HEAD_INITIALIZER(et_eventtimers);
 
@@ -62,6 +65,7 @@
 			    et->et_quality);
 		}
 	}
+	KASSERT(et->et_start, ("et_register: timer has no start function"));
 	et->et_sysctl = SYSCTL_ADD_NODE(NULL,
 	    SYSCTL_STATIC_CHILDREN(_kern_eventtimer_et), OID_AUTO, et->et_name,
 	    CTLFLAG_RW, 0, "event timer description");
@@ -112,6 +116,20 @@
 }
 
 /*
+ * Change the frequency of the given timer.  If it is the active timer,
+ * reconfigure it on all CPUs (reschedules all current events based on the new
+ * timer frequency).
+ */
+void
+et_change_frequency(struct eventtimer *et, uint64_t newfreq)
+{
+
+#ifndef NO_EVENTTIMERS
+	cpu_et_frequency(et, newfreq);
+#endif
+}
+
+/*
  * Find free event timer hardware with specified parameters.
  */
 struct eventtimer *
@@ -159,43 +177,29 @@
  * period - period of subsequent periodic ticks.
  */
 int
-et_start(struct eventtimer *et,
-    struct bintime *first, struct bintime *period)
+et_start(struct eventtimer *et, sbintime_t first, sbintime_t period)
 {
 
 	if (!et->et_active)
 		return (ENXIO);
-	if (first == NULL && period == NULL)
-		return (EINVAL);
-	if ((et->et_flags & ET_FLAGS_PERIODIC) == 0 &&
-	    period != NULL)
-		return (ENODEV);
-	if ((et->et_flags & ET_FLAGS_ONESHOT) == 0 &&
-	    period == NULL)
-		return (ENODEV);
-	if (first != NULL) {
-		if (first->sec < et->et_min_period.sec ||
-		    (first->sec == et->et_min_period.sec &&
-		     first->frac < et->et_min_period.frac))
-		        first = &et->et_min_period;
-		if (first->sec > et->et_max_period.sec ||
-		    (first->sec == et->et_max_period.sec &&
-		     first->frac > et->et_max_period.frac))
-		        first = &et->et_max_period;
+	KASSERT(period >= 0, ("et_start: negative period"));
+	KASSERT((et->et_flags & ET_FLAGS_PERIODIC) || period == 0,
+		("et_start: period specified for oneshot-only timer"));
+	KASSERT((et->et_flags & ET_FLAGS_ONESHOT) || period != 0,
+		("et_start: period not specified for periodic-only timer"));
+	if (period != 0) {
+		if (period < et->et_min_period)
+			period = et->et_min_period;
+		else if (period > et->et_max_period)
+			period = et->et_max_period;
 	}
-	if (period != NULL) {
-		if (period->sec < et->et_min_period.sec ||
-		    (period->sec == et->et_min_period.sec &&
-		     period->frac < et->et_min_period.frac))
-		        period = &et->et_min_period;
-		if (period->sec > et->et_max_period.sec ||
-		    (period->sec == et->et_max_period.sec &&
-		     period->frac > et->et_max_period.frac))
-		        period = &et->et_max_period;
+	if (period == 0 || first != 0) {
+		if (first < et->et_min_period)
+			first = et->et_min_period;
+		else if (first > et->et_max_period)
+			first = et->et_max_period;
 	}
-	if (et->et_start)
-		return (et->et_start(et, first, period));
-	return (0);
+	return (et->et_start(et, first, period));
 }
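
With sbintime_t (a 32.32 fixed-point count of seconds) replacing the
struct bintime pointers, callers now pass plain values: a nonzero period
selects periodic mode, a period of 0 selects one-shot, and out-of-range
values are clamped to the timer's limits rather than rejected.  A hedged
usage sketch, with et taken to be an already-selected event timer:

	int error;

	/* Periodic: a tick every millisecond, first tick one period out. */
	error = et_start(et, SBT_1MS, SBT_1MS);

	/* One-shot: fire once, 50 ms from now. */
	error = et_start(et, 50 * SBT_1MS, 0);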
 
 /* Stop event timer hardware. */

Modified: trunk/sys/kern/kern_event.c
===================================================================
--- trunk/sys/kern/kern_event.c	2018-05-25 20:39:59 UTC (rev 9943)
+++ trunk/sys/kern/kern_event.c	2018-05-25 20:46:51 UTC (rev 9944)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon at FreeBSD.org>
  * Copyright 2004 John-Mark Gurney <jmg at FreeBSD.org>
@@ -27,16 +28,17 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/kern_event.c 320293 2017-06-23 19:04:40Z kib $");
 
 #include "opt_ktrace.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
-#include <sys/capability.h>
+#include <sys/capsicum.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
+#include <sys/rwlock.h>
 #include <sys/proc.h>
 #include <sys/malloc.h>
 #include <sys/unistd.h>
@@ -51,7 +53,6 @@
 #include <sys/eventvar.h>
 #include <sys/poll.h>
 #include <sys/protosw.h>
-#include <sys/resourcevar.h>
 #include <sys/sigio.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
@@ -65,6 +66,7 @@
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
+#include <machine/atomic.h>
 
 #include <vm/uma.h>
 
@@ -126,6 +128,7 @@
 	.fo_close = kqueue_close,
 	.fo_chmod = invfo_chmod,
 	.fo_chown = invfo_chown,
+	.fo_sendfile = invfo_sendfile,
 };
 
 static int 	knote_attach(struct knote *kn, struct kqueue *kq);
@@ -182,9 +185,9 @@
 };
 
 static uma_zone_t	knote_zone;
-static int 		kq_ncallouts = 0;
-static int 		kq_calloutmax = (4 * 1024);
-SYSCTL_INT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
+static unsigned int	kq_ncallouts = 0;
+static unsigned int 	kq_calloutmax = 4 * 1024;
+SYSCTL_UINT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
     &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue");
 
 /* XXX - ensure not KN_INFLUX?? */
@@ -362,11 +365,21 @@
 	kn->kn_flags |= EV_CLEAR;		/* automatically set */
 
 	/*
-	 * internal flag indicating registration done by kernel
+	 * Internal flag indicating registration done by kernel for the
+	 * purposes of getting a NOTE_CHILD notification.
 	 */
-	if (kn->kn_flags & EV_FLAG1) {
+	if (kn->kn_flags & EV_FLAG2) {
+		kn->kn_flags &= ~EV_FLAG2;
 		kn->kn_data = kn->kn_sdata;		/* ppid */
 		kn->kn_fflags = NOTE_CHILD;
+		kn->kn_sfflags &= ~NOTE_EXIT;
+		immediate = 1; /* Force immediate activation of child note. */
+	}
+	/*
+	 * Internal flag indicating registration done by kernel (for other than
+	 * NOTE_CHILD).
+	 */
+	if (kn->kn_flags & EV_FLAG1) {
 		kn->kn_flags &= ~EV_FLAG1;
 	}
 
@@ -374,9 +387,10 @@
 		knlist_add(&p->p_klist, kn, 1);
 
 	/*
-	 * Immediately activate any exit notes if the target process is a
-	 * zombie.  This is necessary to handle the case where the target
-	 * process, e.g. a child, dies before the kevent is registered.
+	 * Immediately activate any child notes or, in the case of a zombie
+	 * target process, exit notes.  The latter is necessary to handle the
+	 * case where the target process, e.g. a child, dies before the kevent
+	 * is registered.
 	 */
 	if (immediate && filt_proc(kn, NOTE_EXIT))
 		KNOTE_ACTIVATE(kn, 0);
@@ -430,8 +444,11 @@
 		if (!(kn->kn_status & KN_DETACHED))
 			knlist_remove_inevent(&p->p_klist, kn);
 		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
-		kn->kn_data = p->p_xstat;
 		kn->kn_ptr.p_proc = NULL;
+		if (kn->kn_fflags & NOTE_EXIT)
+			kn->kn_data = p->p_xstat;
+		if (kn->kn_fflags == 0)
+			kn->kn_flags |= EV_DROP;
 		return (1);
 	}
 
@@ -463,7 +480,7 @@
 			continue;
 		kq = kn->kn_kq;
 		KQ_LOCK(kq);
-		if ((kn->kn_status & KN_INFLUX) == KN_INFLUX) {
+		if ((kn->kn_status & (KN_INFLUX | KN_SCAN)) == KN_INFLUX) {
 			KQ_UNLOCK(kq);
 			continue;
 		}
@@ -473,7 +490,7 @@
 		 */
 		if ((kn->kn_sfflags & NOTE_TRACK) == 0) {
 			kn->kn_status |= KN_HASKQLOCK;
-			if (kn->kn_fop->f_event(kn, NOTE_FORK | pid))
+			if (kn->kn_fop->f_event(kn, NOTE_FORK))
 				KNOTE_ACTIVATE(kn, 1);
 			kn->kn_status &= ~KN_HASKQLOCK;
 			KQ_UNLOCK(kq);
@@ -482,7 +499,7 @@
 
 		/*
 		 * The NOTE_TRACK case. In addition to the activation
-		 * of the event, we need to register new event to
+		 * of the event, we need to register new events to
 		 * track the child. Drop the locks in preparation for
 		 * the call to kqueue_register().
 		 */
@@ -491,20 +508,39 @@
 		list->kl_unlock(list->kl_lockarg);
 
 		/*
-		 * Activate existing knote and register a knote with
+		 * Activate existing knote and register tracking knotes with
 		 * new process.
+		 *
+		 * First register a knote to get just the child notice. This
+		 * must be a separate note from a potential NOTE_EXIT
+		 * notification since both NOTE_CHILD and NOTE_EXIT are defined
+		 * to use the data field (in conflicting ways).
 		 */
 		kev.ident = pid;
 		kev.filter = kn->kn_filter;
+		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_ONESHOT | EV_FLAG2;
+		kev.fflags = kn->kn_sfflags;
+		kev.data = kn->kn_id;		/* parent */
+		kev.udata = kn->kn_kevent.udata;/* preserve udata */
+		error = kqueue_register(kq, &kev, NULL, 0);
+		if (error)
+			kn->kn_fflags |= NOTE_TRACKERR;
+
+		/*
+		 * Then register another knote to track other potential events
+		 * from the new process.
+		 */
+		kev.ident = pid;
+		kev.filter = kn->kn_filter;
 		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
 		kev.fflags = kn->kn_sfflags;
 		kev.data = kn->kn_id;		/* parent */
 		kev.udata = kn->kn_kevent.udata;/* preserve udata */
 		error = kqueue_register(kq, &kev, NULL, 0);
-		if (kn->kn_fop->f_event(kn, NOTE_FORK | pid))
-			KNOTE_ACTIVATE(kn, 0);
 		if (error)
 			kn->kn_fflags |= NOTE_TRACKERR;
+		if (kn->kn_fop->f_event(kn, NOTE_FORK))
+			KNOTE_ACTIVATE(kn, 0);
 		KQ_LOCK(kq);
 		kn->kn_status &= ~KN_INFLUX;
 		KQ_UNLOCK_FLUX(kq);
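
The fork-tracking rework registers two knotes on the new child: a
one-shot EV_FLAG2 note that delivers only NOTE_CHILD (whose data field
carries the parent pid) and a separate EV_FLAG1 note for everything
else, because NOTE_CHILD and NOTE_EXIT both want the data field.  The
userland interface is unchanged; a minimal NOTE_TRACK consumer for
illustration:

#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct kevent kev;
	int kq;

	kq = kqueue();
	/* Watch ourselves; NOTE_TRACK extends the watch to new children. */
	EV_SET(&kev, getpid(), EVFILT_PROC, EV_ADD,
	    NOTE_FORK | NOTE_TRACK | NOTE_EXIT, 0, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1) {
		perror("kevent");
		return (1);
	}
	if (fork() == 0)
		_exit(0);
	for (;;) {
		if (kevent(kq, NULL, 0, &kev, 1, NULL) != 1)
			break;
		if (kev.fflags & NOTE_FORK)
			printf("pid %d forked\n", (int)kev.ident);
		if (kev.fflags & NOTE_CHILD)
			printf("child %d of parent %d\n", (int)kev.ident,
			    (int)kev.data);
		if (kev.fflags & NOTE_EXIT) {
			printf("pid %d exited\n", (int)kev.ident);
			break;
		}
	}
	return (0);
}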
@@ -517,64 +553,127 @@
  * XXX: EVFILT_TIMER should perhaps live in kern_time.c beside the
  * interval timer support code.
  */
-static int
-timertoticks(intptr_t data)
+
+#define NOTE_TIMER_PRECMASK						\
+    (NOTE_SECONDS | NOTE_MSECONDS | NOTE_USECONDS | NOTE_NSECONDS)
+
+static sbintime_t
+timer2sbintime(intptr_t data, int flags)
 {
-	struct timeval tv;
-	int tticks;
+	int64_t secs;
 
-	tv.tv_sec = data / 1000;
-	tv.tv_usec = (data % 1000) * 1000;
-	tticks = tvtohz(&tv);
-
-	return tticks;
+	/*
+	 * Macros for converting to the fractional second portion of an
+	 * sbintime_t using 64-bit multiplication to improve precision.
+	 */
+#define NS_TO_SBT(ns) (((ns) * (((uint64_t)1 << 63) / 500000000)) >> 32)
+#define US_TO_SBT(us) (((us) * (((uint64_t)1 << 63) / 500000)) >> 32)
+#define MS_TO_SBT(ms) (((ms) * (((uint64_t)1 << 63) / 500)) >> 32)
+	switch (flags & NOTE_TIMER_PRECMASK) {
+	case NOTE_SECONDS:
+#ifdef __LP64__
+		if (data > (SBT_MAX / SBT_1S))
+			return (SBT_MAX);
+#endif
+		return ((sbintime_t)data << 32);
+	case NOTE_MSECONDS: /* FALLTHROUGH */
+	case 0:
+		if (data >= 1000) {
+			secs = data / 1000;
+#ifdef __LP64__
+			if (secs > (SBT_MAX / SBT_1S))
+				return (SBT_MAX);
+#endif
+			return (secs << 32 | MS_TO_SBT(data % 1000));
+		}
+		return (MS_TO_SBT(data));
+	case NOTE_USECONDS:
+		if (data >= 1000000) {
+			secs = data / 1000000;
+#ifdef __LP64__
+			if (secs > (SBT_MAX / SBT_1S))
+				return (SBT_MAX);
+#endif
+			return (secs << 32 | US_TO_SBT(data % 1000000));
+		}
+		return (US_TO_SBT(data));
+	case NOTE_NSECONDS:
+		if (data >= 1000000000) {
+			secs = data / 1000000000;
+#ifdef __LP64__
+			if (secs > (SBT_MAX / SBT_1S))
+				return (SBT_MAX);
+#endif
+			return (secs << 32 | NS_TO_SBT(data % 1000000000));
+		}
+		return (NS_TO_SBT(data));
+	default:
+		break;
+	}
+	return (-1);
 }
 
+struct kq_timer_cb_data {
+	struct callout c;
+	sbintime_t next;	/* next timer event fires at */
+	sbintime_t to;		/* precalculated timer period */
+};
+
 static void
 filt_timerexpire(void *knx)
 {
-	struct knote *kn = knx;
-	struct callout *calloutp;
+	struct knote *kn;
+	struct kq_timer_cb_data *kc;
 
+	kn = knx;
 	kn->kn_data++;
 	KNOTE_ACTIVATE(kn, 0);	/* XXX - handle locking */
 
-	/*
-	 * timertoticks() uses tvtohz() which always adds 1 to allow
-	 * for the time until the next clock interrupt being strictly
-	 * less than 1 clock tick.  We don't want that here since we
-	 * want to appear to be in sync with the clock interrupt even
-	 * when we're delayed.
-	 */
-	if ((kn->kn_flags & EV_ONESHOT) != EV_ONESHOT) {
-		calloutp = (struct callout *)kn->kn_hook;
-		callout_reset_curcpu(calloutp, timertoticks(kn->kn_sdata) - 1,
-		    filt_timerexpire, kn);
-	}
+	if ((kn->kn_flags & EV_ONESHOT) != 0)
+		return;
+
+	kc = kn->kn_ptr.p_v;
+	kc->next += kc->to;
+	callout_reset_sbt_on(&kc->c, kc->next, 0, filt_timerexpire, kn,
+	    PCPU_GET(cpuid), C_ABSOLUTE);
 }
 
 /*
- * data contains amount of time to sleep, in milliseconds
+ * data contains amount of time to sleep
  */
 static int
 filt_timerattach(struct knote *kn)
 {
-	struct callout *calloutp;
+	struct kq_timer_cb_data *kc;
+	sbintime_t to;
+	unsigned int ncallouts;
 
-	atomic_add_int(&kq_ncallouts, 1);
+	if (kn->kn_sdata < 0)
+		return (EINVAL);
+	if (kn->kn_sdata == 0 && (kn->kn_flags & EV_ONESHOT) == 0)
+		kn->kn_sdata = 1;
+	/* Only precision units are supported in flags so far. */
+	if ((kn->kn_sfflags & ~NOTE_TIMER_PRECMASK) != 0)
+		return (EINVAL);
 
-	if (kq_ncallouts >= kq_calloutmax) {
-		atomic_add_int(&kq_ncallouts, -1);
-		return (ENOMEM);
-	}
+	to = timer2sbintime(kn->kn_sdata, kn->kn_sfflags);
+	if (to < 0)
+		return (EINVAL);
 
+	do {
+		ncallouts = kq_ncallouts;
+		if (ncallouts >= kq_calloutmax)
+			return (ENOMEM);
+	} while (!atomic_cmpset_int(&kq_ncallouts, ncallouts, ncallouts + 1));
+
 	kn->kn_flags |= EV_CLEAR;		/* automatically set */
-	kn->kn_status &= ~KN_DETACHED;		/* knlist_add usually sets it */
-	calloutp = malloc(sizeof(*calloutp), M_KQUEUE, M_WAITOK);
-	callout_init(calloutp, CALLOUT_MPSAFE);
-	kn->kn_hook = calloutp;
-	callout_reset_curcpu(calloutp, timertoticks(kn->kn_sdata),
-	    filt_timerexpire, kn);
+	kn->kn_status &= ~KN_DETACHED;		/* knlist_add clears it */
+	kn->kn_ptr.p_v = kc = malloc(sizeof(*kc), M_KQUEUE, M_WAITOK);
+	callout_init(&kc->c, 1);
+	kc->next = to + sbinuptime();
+	kc->to = to;
+	callout_reset_sbt_on(&kc->c, kc->next, 0, filt_timerexpire, kn,
+	    PCPU_GET(cpuid), C_ABSOLUTE);
 
 	return (0);
 }
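
Illustrative note: timer2sbintime() above packs whole seconds into the
upper 32 bits of an sbintime_t and uses 64-bit multiplication for the
fraction.  A minimal userspace sketch of the same arithmetic follows; the
MS_TO_SBT macro is copied from the hunk, while the main() driver and the
1.5 s sample value are invented for illustration.

#include <stdint.h>
#include <stdio.h>

typedef int64_t sbintime_t;	/* 32.32 fixed point, as in <sys/time.h> */

/* Copied from the hunk above: milliseconds to a fractional sbintime_t. */
#define MS_TO_SBT(ms) (((ms) * (((uint64_t)1 << 63) / 500)) >> 32)

int
main(void)
{
	int64_t data = 1500;	/* an example kevent timer period, 1.5 s */
	sbintime_t sbt;

	if (data >= 1000)
		sbt = ((data / 1000) << 32) | MS_TO_SBT(data % 1000);
	else
		sbt = MS_TO_SBT(data);
	/* The fraction is in units of 2^-32 s. */
	printf("sbt = 0x%016jx (%.6f s)\n", (uintmax_t)sbt,
	    (double)sbt / 4294967296.0);
	return (0);
}

Computing the reciprocal against 1 << 63 keeps 31 extra bits of precision
before the final shift; the result truncates, so 500 ms lands one ulp
below 0x80000000.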
@@ -582,13 +681,15 @@
 static void
 filt_timerdetach(struct knote *kn)
 {
-	struct callout *calloutp;
+	struct kq_timer_cb_data *kc;
+	unsigned int old;
 
-	calloutp = (struct callout *)kn->kn_hook;
-	callout_drain(calloutp);
-	free(calloutp, M_KQUEUE);
-	atomic_add_int(&kq_ncallouts, -1);
-	kn->kn_status |= KN_DETACHED;	/* knlist_remove usually clears it */
+	kc = kn->kn_ptr.p_v;
+	callout_drain(&kc->c);
+	free(kc, M_KQUEUE);
+	old = atomic_fetchadd_int(&kq_ncallouts, -1);
+	KASSERT(old > 0, ("Number of callouts cannot become negative"));
+	kn->kn_status |= KN_DETACHED;	/* knlist_remove sets it */
 }
 
 static int
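
Illustrative note: the attach path above reserves a slot in kq_ncallouts
with a compare-and-set loop, making the limit check and the increment one
atomic step, while the detach path asserts the fetchadd never underflows.
The same pattern in portable C11; the counter name and limit are invented
for the sketch.

#include <assert.h>
#include <errno.h>
#include <stdatomic.h>

static atomic_uint ncallouts;			/* stands in for kq_ncallouts */
static const unsigned int calloutmax = 4096;	/* arbitrary example limit */

/* Bounded lock-free increment: check and bump happen atomically. */
static int
ncallouts_inc(void)
{
	unsigned int n;

	do {
		n = atomic_load(&ncallouts);
		if (n >= calloutmax)
			return (ENOMEM);
	} while (!atomic_compare_exchange_weak(&ncallouts, &n, n + 1));
	return (0);
}

/* Decrement; as in the KASSERT above, the old value must be positive. */
static void
ncallouts_dec(void)
{
	unsigned int old;

	old = atomic_fetch_sub(&ncallouts, 1);
	assert(old > 0);
	(void)old;
}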
@@ -689,41 +790,33 @@
 int
 sys_kqueue(struct thread *td, struct kqueue_args *uap)
 {
+
+	return (kern_kqueue(td, 0));
+}
+
+int
+kern_kqueue(struct thread *td, int flags)
+{
 	struct filedesc *fdp;
 	struct kqueue *kq;
 	struct file *fp;
-	struct proc *p;
-	struct ucred *cred;
 	int fd, error;
 
-	p = td->td_proc;
-	cred = td->td_ucred;
-	crhold(cred);
-	PROC_LOCK(p);
-	if (!chgkqcnt(cred->cr_ruidinfo, 1, lim_cur(td->td_proc,
-	    RLIMIT_KQUEUES))) {
-		PROC_UNLOCK(p);
-		crfree(cred);
-		return (EMFILE);
-	}
-	PROC_UNLOCK(p);
-
-	fdp = p->p_fd;
-	error = falloc(td, &fp, &fd, 0);
+	fdp = td->td_proc->p_fd;
+	error = falloc(td, &fp, &fd, flags);
 	if (error)
 		goto done2;
 
-	/* An extra reference on `nfp' has been held for us by falloc(). */
+	/* An extra reference on `fp' has been held for us by falloc(). */
 	kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO);
 	mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF|MTX_DUPOK);
 	TAILQ_INIT(&kq->kq_head);
 	kq->kq_fdp = fdp;
-	kq->kq_cred = cred;
 	knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock);
 	TASK_INIT(&kq->kq_task, 0, kqueue_task, kq);
 
 	FILEDESC_XLOCK(fdp);
-	SLIST_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list);
+	TAILQ_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list);
 	FILEDESC_XUNLOCK(fdp);
 
 	finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops);
@@ -731,13 +824,20 @@
 
 	td->td_retval[0] = fd;
 done2:
-	if (error != 0) {
-		chgkqcnt(cred->cr_ruidinfo, -1, 0);
-		crfree(cred);
-	}
 	return (error);
 }
 
+#ifdef KTRACE
+static size_t
+kev_iovlen(int n, u_int kgio)
+{
+
+	if (n < 0 || n >= kgio / sizeof(struct kevent))
+		return (kgio);
+	return (n * sizeof(struct kevent));
+}
+#endif
+
 #ifndef _SYS_SYSPROTO_H_
 struct kevent_args {
 	int	fd;
@@ -761,6 +861,7 @@
 	struct iovec ktriov;
 	struct uio *ktruioin = NULL;
 	struct uio *ktruioout = NULL;
+	u_int kgio;
 #endif
 
 	if (uap->timeout != NULL) {
@@ -773,13 +874,15 @@
 
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_GENIO)) {
+		kgio = ktr_geniosize;
 		ktriov.iov_base = uap->changelist;
-		ktriov.iov_len = uap->nchanges * sizeof(struct kevent);
+		ktriov.iov_len = kev_iovlen(uap->nchanges, kgio);
 		ktruio = (struct uio){ .uio_iov = &ktriov, .uio_iovcnt = 1,
 		    .uio_segflg = UIO_USERSPACE, .uio_rw = UIO_READ,
 		    .uio_td = td };
 		ktruioin = cloneuio(&ktruio);
 		ktriov.iov_base = uap->eventlist;
+		ktriov.iov_len = kev_iovlen(uap->nevents, kgio);
-		ktriov.iov_len = uap->nevents * sizeof(struct kevent);
 		ktruioout = cloneuio(&ktruio);
 	}
@@ -790,9 +893,9 @@
 
 #ifdef KTRACE
 	if (ktruioin != NULL) {
-		ktruioin->uio_resid = uap->nchanges * sizeof(struct kevent);
+		ktruioin->uio_resid = kev_iovlen(uap->nchanges, kgio);
 		ktrgenio(uap->fd, UIO_WRITE, ktruioin, 0);
-		ktruioout->uio_resid = td->td_retval[0] * sizeof(struct kevent);
+		ktruioout->uio_resid = kev_iovlen(td->td_retval[0], kgio);
 		ktrgenio(uap->fd, UIO_READ, ktruioout, error);
 	}
 #endif
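
Illustrative note: kev_iovlen(), added above, keeps ktrace from recording
more than ktr_geniosize bytes of kevent traffic even when the caller
passes a huge or negative count.  A standalone sketch, with the kevent
size and the ktr_geniosize value assumed:

#include <stddef.h>
#include <stdio.h>

#define	KEV_SIZE	32U	/* sizeof(struct kevent), 64-bit ABI assumed */

/* Same shape as the kev_iovlen() added above. */
static size_t
kev_iovlen(int n, unsigned int kgio)
{
	if (n < 0 || (unsigned int)n >= kgio / KEV_SIZE)
		return (kgio);
	return ((size_t)n * KEV_SIZE);
}

int
main(void)
{
	unsigned int kgio = 4096;	/* hypothetical ktr_geniosize */

	printf("%zu\n", kev_iovlen(10, kgio));	 /* 320: within the cap */
	printf("%zu\n", kev_iovlen(1000, kgio)); /* 4096: clamped */
	printf("%zu\n", kev_iovlen(-1, kgio));	 /* 4096: bogus count */
	return (0);
}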
@@ -840,16 +943,37 @@
 kern_kevent(struct thread *td, int fd, int nchanges, int nevents,
     struct kevent_copyops *k_ops, const struct timespec *timeout)
 {
+	cap_rights_t rights;
+	struct file *fp;
+	int error;
+
+	cap_rights_init(&rights);
+	if (nchanges > 0)
+		cap_rights_set(&rights, CAP_KQUEUE_CHANGE);
+	if (nevents > 0)
+		cap_rights_set(&rights, CAP_KQUEUE_EVENT);
+	error = fget(td, fd, &rights, &fp);
+	if (error != 0)
+		return (error);
+
+	error = kern_kevent_fp(td, fp, nchanges, nevents, k_ops, timeout);
+	fdrop(fp, td);
+
+	return (error);
+}
+
+int
+kern_kevent_fp(struct thread *td, struct file *fp, int nchanges, int nevents,
+    struct kevent_copyops *k_ops, const struct timespec *timeout)
+{
 	struct kevent keva[KQ_NEVENTS];
 	struct kevent *kevp, *changes;
 	struct kqueue *kq;
-	struct file *fp;
 	int i, n, nerrors, error;
 
-	if ((error = fget(td, fd, CAP_POST_EVENT, &fp)) != 0)
+	error = kqueue_acquire(fp, &kq);
+	if (error != 0)
 		return (error);
-	if ((error = kqueue_acquire(fp, &kq)) != 0)
-		goto done_norel;
 
 	nerrors = 0;
 
@@ -889,8 +1013,6 @@
 	error = kqueue_scan(kq, nevents, k_ops, timeout, keva, td);
 done:
 	kqueue_release(kq, 0);
-done_norel:
-	fdrop(fp, td);
 	return (error);
 }
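
Illustrative note: kern_kevent() now requests only the rights the call
actually needs, CAP_KQUEUE_CHANGE when there are changes to register and
CAP_KQUEUE_EVENT when events are to be collected.  From userspace a
kqueue descriptor can be restricted the same way before entering
capability mode; this sketch assumes the matching userlevel Capsicum API
of the same vintage.

#include <sys/types.h>
#include <sys/capsicum.h>
#include <sys/event.h>
#include <err.h>
#include <unistd.h>

int
main(void)
{
	cap_rights_t rights;
	int kq;

	if ((kq = kqueue()) == -1)
		err(1, "kqueue");
	/* Permit registering changes and collecting events, nothing more. */
	cap_rights_init(&rights, CAP_KQUEUE_CHANGE, CAP_KQUEUE_EVENT);
	if (cap_rights_limit(kq, &rights) == -1)
		err(1, "cap_rights_limit");
	if (cap_enter() == -1)
		err(1, "cap_enter");
	/* kevent(2) on kq keeps working here; most other calls will not. */
	close(kq);
	return (0);
}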
 
@@ -984,13 +1106,15 @@
 	struct filterops *fops;
 	struct file *fp;
 	struct knote *kn, *tkn;
+	cap_rights_t rights;
 	int error, filt, event;
-	int haskqglobal;
+	int haskqglobal, filedesc_unlock;
 
 	fp = NULL;
 	kn = NULL;
 	error = 0;
 	haskqglobal = 0;
+	filedesc_unlock = 0;
 
 	filt = kev->filter;
 	fops = kqueue_fo_find(filt);
@@ -1002,7 +1126,11 @@
 findkn:
 	if (fops->f_isfd) {
 		KASSERT(td != NULL, ("td is NULL"));
-		error = fget(td, kev->ident, CAP_POLL_EVENT, &fp);
+		if (kev->ident > INT_MAX)
+			error = EBADF;
+		else
+			error = fget(td, kev->ident,
+			    cap_rights_init(&rights, CAP_EVENT), &fp);
 		if (error)
 			goto done;
 
@@ -1019,7 +1147,7 @@
 
 		if (fp->f_type == DTYPE_KQUEUE) {
 			/*
-			 * if we add some inteligence about what we are doing,
+			 * If we add some intelligence about what we are doing,
 			 * we should be able to support events on ourselves.
 			 * We need to know when we are doing this to prevent
 			 * getting both the knlist lock and the kq lock since
@@ -1030,6 +1158,13 @@
 				goto done;
 			}
 
+			/*
+			 * Pre-lock the filedesc before the global
+			 * lock mutex, see the comment in
+			 * kqueue_close().
+			 */
+			FILEDESC_XLOCK(td->td_proc->p_fd);
+			filedesc_unlock = 1;
 			KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
 		}
 
@@ -1044,7 +1179,18 @@
 			kqueue_expand(kq, fops, kev->ident, waitok);
 
 		KQ_LOCK(kq);
-		if (kq->kq_knhashmask != 0) {
+
+		/*
+		 * If possible, find an existing knote to use for this kevent.
+		 */
+		if (kev->filter == EVFILT_PROC &&
+		    (kev->flags & (EV_FLAG1 | EV_FLAG2)) != 0) {
+			/* This is an internal creation of a process tracking
+			 * note. Don't attempt to coalesce this with an
+			 * existing note.
+			 */
+			;
+		} else if (kq->kq_knhashmask != 0) {
 			struct klist *list;
 
 			list = &kq->kq_knhash[
@@ -1056,9 +1202,13 @@
 		}
 	}
 
-	/* knote is in the process of changing, wait for it to stablize. */
+	/* knote is in the process of changing, wait for it to stabilize. */
 	if (kn != NULL && (kn->kn_status & KN_INFLUX) == KN_INFLUX) {
 		KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
+		if (filedesc_unlock) {
+			FILEDESC_XUNLOCK(td->td_proc->p_fd);
+			filedesc_unlock = 0;
+		}
 		kq->kq_state |= KQ_FLUXWAIT;
 		msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqflxwt", 0);
 		if (fp != NULL) {
@@ -1134,7 +1284,7 @@
 	 * but doing so will not reset any filter which has already been
 	 * triggered.
 	 */
-	kn->kn_status |= KN_INFLUX;
+	kn->kn_status |= KN_INFLUX | KN_SCAN;
 	KQ_UNLOCK(kq);
 	KN_LIST_LOCK(kn);
 	kn->kn_kevent.udata = kev->udata;
@@ -1157,7 +1307,7 @@
 	KQ_LOCK(kq);
 	if (event)
 		KNOTE_ACTIVATE(kn, 1);
-	kn->kn_status &= ~KN_INFLUX;
+	kn->kn_status &= ~(KN_INFLUX | KN_SCAN);
 	KN_LIST_UNLOCK(kn);
 
 	if ((kev->flags & EV_DISABLE) &&
@@ -1175,6 +1325,8 @@
 
 done:
 	KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
+	if (filedesc_unlock)
+		FILEDESC_XUNLOCK(td->td_proc->p_fd);
 	if (fp != NULL)
 		fdrop(fp, td);
 	if (tkn != NULL)
@@ -1338,10 +1490,9 @@
     const struct timespec *tsp, struct kevent *keva, struct thread *td)
 {
 	struct kevent *kevp;
-	struct timeval atv, rtv, ttv;
 	struct knote *kn, *marker;
-	int count, timeout, nkev, error, influx;
-	int haskqglobal, touch;
+	sbintime_t asbt, rsbt;
+	int count, error, haskqglobal, influx, nkev, touch;
 
 	count = maxevents;
 	nkev = 0;
@@ -1351,24 +1502,29 @@
 	if (maxevents == 0)
 		goto done_nl;
 
+	rsbt = 0;
 	if (tsp != NULL) {
-		TIMESPEC_TO_TIMEVAL(&atv, tsp);
-		if (itimerfix(&atv)) {
+		if (tsp->tv_sec < 0 || tsp->tv_nsec < 0 ||
+		    tsp->tv_nsec >= 1000000000) {
 			error = EINVAL;
 			goto done_nl;
 		}
-		if (tsp->tv_sec == 0 && tsp->tv_nsec == 0)
-			timeout = -1;
-		else
-			timeout = atv.tv_sec > 24 * 60 * 60 ?
-			    24 * 60 * 60 * hz : tvtohz(&atv);
-		getmicrouptime(&rtv);
-		timevaladd(&atv, &rtv);
-	} else {
-		atv.tv_sec = 0;
-		atv.tv_usec = 0;
-		timeout = 0;
-	}
+		if (timespecisset(tsp)) {
+			if (tsp->tv_sec <= INT32_MAX) {
+				rsbt = tstosbt(*tsp);
+				if (TIMESEL(&asbt, rsbt))
+					asbt += tc_tick_sbt;
+				if (asbt <= SBT_MAX - rsbt)
+					asbt += rsbt;
+				else
+					asbt = 0;
+				rsbt >>= tc_precexp;
+			} else
+				asbt = 0;
+		} else
+			asbt = -1;
+	} else
+		asbt = 0;
 	marker = knote_alloc(1);
 	if (marker == NULL) {
 		error = ENOMEM;
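
Illustrative note: kqueue_scan() now validates the timespec itself and
converts it to an sbintime_t deadline instead of clock ticks, so timeouts
are no longer rounded to hz granularity.  A tstosbt()-style sketch of the
validation and conversion; the helper name is invented, and as in the
hunk above, seconds beyond INT32_MAX mean an unbounded sleep rather than
an error.

#include <errno.h>
#include <stdint.h>

typedef int64_t sbintime_t;	/* 32.32 fixed point, as in <sys/time.h> */

/* Validation and conversion in the style of kqueue_scan()/tstosbt(). */
static int
ts_to_sbt(int64_t sec, long nsec, sbintime_t *sbtp)
{
	if (sec < 0 || nsec < 0 || nsec >= 1000000000L)
		return (EINVAL);	/* the same rejects as kqueue_scan */
	if (sec > INT32_MAX) {
		*sbtp = 0;		/* kernel: sleep without a deadline */
		return (0);
	}
	*sbtp = ((sbintime_t)sec << 32) +
	    (sbintime_t)(((uint64_t)nsec *
	    (((uint64_t)1 << 63) / 500000000)) >> 32);
	return (0);
}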
@@ -1376,28 +1532,16 @@
 	}
 	marker->kn_status = KN_MARKER;
 	KQ_LOCK(kq);
-	goto start;
 
 retry:
-	if (atv.tv_sec || atv.tv_usec) {
-		getmicrouptime(&rtv);
-		if (timevalcmp(&rtv, &atv, >=))
-			goto done;
-		ttv = atv;
-		timevalsub(&ttv, &rtv);
-		timeout = ttv.tv_sec > 24 * 60 * 60 ?
-			24 * 60 * 60 * hz : tvtohz(&ttv);
-	}
-
-start:
 	kevp = keva;
 	if (kq->kq_count == 0) {
-		if (timeout < 0) {
+		if (asbt == -1) {
 			error = EWOULDBLOCK;
 		} else {
 			kq->kq_state |= KQ_SLEEP;
-			error = msleep(kq, &kq->kq_lock, PSOCK | PCATCH,
-			    "kqread", timeout);
+			error = msleep_sbt(kq, &kq->kq_lock, PSOCK | PCATCH,
+			    "kqread", asbt, rsbt, C_ABSOLUTE);
 		}
 		if (error == 0)
 			goto retry;
@@ -1442,7 +1586,7 @@
 		KASSERT((kn->kn_status & KN_INFLUX) == 0,
 		    ("KN_INFLUX set when not suppose to be"));
 
-		if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) {
+		if ((kn->kn_flags & EV_DROP) == EV_DROP) {
 			kn->kn_status &= ~KN_QUEUED;
 			kn->kn_status |= KN_INFLUX;
 			kq->kq_count--;
@@ -1451,6 +1595,20 @@
 			 * We don't need to lock the list since we've marked
 			 * it _INFLUX.
 			 */
+			if (!(kn->kn_status & KN_DETACHED))
+				kn->kn_fop->f_detach(kn);
+			knote_drop(kn, td);
+			KQ_LOCK(kq);
+			continue;
+		} else if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) {
+			kn->kn_status &= ~KN_QUEUED;
+			kn->kn_status |= KN_INFLUX;
+			kq->kq_count--;
+			KQ_UNLOCK(kq);
+			/*
+			 * We don't need to lock the list since we've marked
+			 * it _INFLUX.
+			 */
 			*kevp = kn->kn_kevent;
 			if (!(kn->kn_status & KN_DETACHED))
 				kn->kn_fop->f_detach(kn);
@@ -1458,7 +1616,7 @@
 			KQ_LOCK(kq);
 			kn = NULL;
 		} else {
-			kn->kn_status |= KN_INFLUX;
+			kn->kn_status |= KN_INFLUX | KN_SCAN;
 			KQ_UNLOCK(kq);
 			if ((kn->kn_status & KN_KQUEUE) == KN_KQUEUE)
 				KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
@@ -1467,7 +1625,8 @@
 				KQ_LOCK(kq);
 				KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
 				kn->kn_status &=
-				    ~(KN_QUEUED | KN_ACTIVE | KN_INFLUX);
+				    ~(KN_QUEUED | KN_ACTIVE | KN_INFLUX |
+				    KN_SCAN);
 				kq->kq_count--;
 				KN_LIST_UNLOCK(kn);
 				influx = 1;
@@ -1481,7 +1640,7 @@
 				*kevp = kn->kn_kevent;
 			KQ_LOCK(kq);
 			KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
-			if (kn->kn_flags & (EV_CLEAR |  EV_DISPATCH)) {
+			if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) {
 				/*
 				 * Manually clear knotes that weren't
 				 * 'touch'ed.
@@ -1497,7 +1656,7 @@
 			} else
 				TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
 			
-			kn->kn_status &= ~(KN_INFLUX);
+			kn->kn_status &= ~(KN_INFLUX | KN_SCAN);
 			KN_LIST_UNLOCK(kn);
 			influx = 1;
 		}
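
Illustrative note: together with filt_timerexpire() rescheduling at an
absolute time (kc->next += kc->to with C_ABSOLUTE) so that periodic
timers do not accumulate drift, these changes surface to userland as the
NOTE_*SECONDS precision flags.  A small consumer of the interface
(sketch; assumes a system where this change, which introduces
NOTE_MSECONDS, is present):

#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <err.h>
#include <inttypes.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct kevent kev;
	int kq;

	if ((kq = kqueue()) == -1)
		err(1, "kqueue");
	/* One-shot 250 ms timer; NOTE_MSECONDS is also the default unit. */
	EV_SET(&kev, 1, EVFILT_TIMER, EV_ADD | EV_ONESHOT, NOTE_MSECONDS,
	    250, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent register");
	if (kevent(kq, NULL, 0, &kev, 1, NULL) == -1)
		err(1, "kevent wait");
	printf("timer %ju fired, expirations=%jd\n",
	    (uintmax_t)kev.ident, (intmax_t)kev.data);
	close(kq);
	return (0);
}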
@@ -1662,10 +1821,12 @@
 	struct knote *kn;
 	int i;
 	int error;
+	int filedesc_unlock;
 
 	if ((error = kqueue_acquire(fp, &kq)))
 		return error;
 
+	filedesc_unlock = 0;
 	KQ_LOCK(kq);
 
 	KASSERT((kq->kq_state & KQ_CLOSING) != KQ_CLOSING,
@@ -1727,9 +1888,20 @@
 
 	KQ_UNLOCK(kq);
 
-	FILEDESC_XLOCK(fdp);
-	SLIST_REMOVE(&fdp->fd_kqlist, kq, kqueue, kq_list);
-	FILEDESC_XUNLOCK(fdp);
+	/*
+	 * We could be called due to the knote_drop() doing fdrop(),
+	 * called from kqueue_register().  In this case the global
+	 * lock is owned, and filedesc sx is locked before, to not
+	 * take the sleepable lock after non-sleepable.
+	 */
+	if (!sx_xlocked(FILEDESC_LOCK(fdp))) {
+		FILEDESC_XLOCK(fdp);
+		filedesc_unlock = 1;
+	} else
+		filedesc_unlock = 0;
+	TAILQ_REMOVE(&fdp->fd_kqlist, kq, kq_list);
+	if (filedesc_unlock)
+		FILEDESC_XUNLOCK(fdp);
 
 	seldrain(&kq->kq_sel);
 	knlist_destroy(&kq->kq_sel.si_note);
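
Illustrative note: kqueue_close() may be entered with the filedesc lock
already held (via knote_drop()'s fdrop() from kqueue_register()), so it
only takes, and later releases, the lock when it does not already own it.
The conditional-ownership pattern, sketched against FreeBSD's sx(9) API:

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/sx.h>

/*
 * Sketch: take an sx lock only if this thread does not already hold it
 * exclusively, and remember whether to release it afterwards.
 */
static void
remove_entry_locked(struct sx *lk /* , ... */)
{
	int unlock;

	if (!sx_xlocked(lk)) {
		sx_xlock(lk);
		unlock = 1;
	} else
		unlock = 0;
	/* ... manipulate the structure the lock protects ... */
	if (unlock)
		sx_xunlock(lk);
}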
@@ -1742,8 +1914,6 @@
 		free(kq->kq_knlist, M_KQUEUE);
 
 	funsetown(&kq->kq_sigio);
-	chgkqcnt(kq->kq_cred->cr_ruidinfo, -1, 0);
-	crfree(kq->kq_cred);
 	free(kq, M_KQUEUE);
 	fp->f_data = NULL;
 
@@ -1783,8 +1953,9 @@
 knote(struct knlist *list, long hint, int lockflags)
 {
 	struct kqueue *kq;
-	struct knote *kn;
+	struct knote *kn, *tkn;
 	int error;
+	bool own_influx;
 
 	if (list == NULL)
 		return;
@@ -1795,37 +1966,44 @@
 		list->kl_lock(list->kl_lockarg); 
 
 	/*
-	 * If we unlock the list lock (and set KN_INFLUX), we can eliminate
-	 * the kqueue scheduling, but this will introduce four
-	 * lock/unlock's for each knote to test.  If we do, continue to use
-	 * SLIST_FOREACH, SLIST_FOREACH_SAFE is not safe in our case, it is
-	 * only safe if you want to remove the current item, which we are
-	 * not doing.
+	 * If we unlock the list lock (and set KN_INFLUX), we can
+	 * eliminate the kqueue scheduling, but this will introduce
+	 * four lock/unlock's for each knote to test.  Also, marker
+	 * would be needed to keep iteration position, since filters
+	 * or other threads could remove events.
 	 */
-	SLIST_FOREACH(kn, &list->kl_list, kn_selnext) {
+	SLIST_FOREACH_SAFE(kn, &list->kl_list, kn_selnext, tkn) {
 		kq = kn->kn_kq;
-		if ((kn->kn_status & KN_INFLUX) != KN_INFLUX) {
+		KQ_LOCK(kq);
+		if ((kn->kn_status & (KN_INFLUX | KN_SCAN)) == KN_INFLUX) {
+			/*
+			 * Do not process the influx notes, except for
+			 * the influx coming from the kq unlock in the
+			 * kqueue_scan().  In the later case, we do
+			 * not interfere with the scan, since the code
+			 * fragment in kqueue_scan() locks the knlist,
+			 * and cannot proceed until we finished.
+			 */
+			KQ_UNLOCK(kq);
+		} else if ((lockflags & KNF_NOKQLOCK) != 0) {
+			own_influx = (kn->kn_status & KN_INFLUX) == 0;
+			if (own_influx)
+				kn->kn_status |= KN_INFLUX;
+			KQ_UNLOCK(kq);
+			error = kn->kn_fop->f_event(kn, hint);
 			KQ_LOCK(kq);
-			if ((kn->kn_status & KN_INFLUX) == KN_INFLUX) {
-				KQ_UNLOCK(kq);
-			} else if ((lockflags & KNF_NOKQLOCK) != 0) {
-				kn->kn_status |= KN_INFLUX;
-				KQ_UNLOCK(kq);
-				error = kn->kn_fop->f_event(kn, hint);
-				KQ_LOCK(kq);
+			if (own_influx)
 				kn->kn_status &= ~KN_INFLUX;
-				if (error)
-					KNOTE_ACTIVATE(kn, 1);
-				KQ_UNLOCK_FLUX(kq);
-			} else {
-				kn->kn_status |= KN_HASKQLOCK;
-				if (kn->kn_fop->f_event(kn, hint))
-					KNOTE_ACTIVATE(kn, 1);
-				kn->kn_status &= ~KN_HASKQLOCK;
-				KQ_UNLOCK(kq);
-			}
+			if (error)
+				KNOTE_ACTIVATE(kn, 1);
+			KQ_UNLOCK_FLUX(kq);
+		} else {
+			kn->kn_status |= KN_HASKQLOCK;
+			if (kn->kn_fop->f_event(kn, hint))
+				KNOTE_ACTIVATE(kn, 1);
+			kn->kn_status &= ~KN_HASKQLOCK;
+			KQ_UNLOCK(kq);
 		}
-		kq = NULL;
 	}
 	if ((lockflags & KNF_LISTLOCKED) == 0)
 		list->kl_unlock(list->kl_lockarg); 
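
Illustrative note: the knote() rewrite above switches to
SLIST_FOREACH_SAFE with a saved successor, so a filter or another thread
may drop the current knote without breaking the walk.  The pattern in
isolation, with an invented node type:

#include <sys/queue.h>
#include <stdlib.h>

struct node {
	int			val;
	SLIST_ENTRY(node)	link;
};
SLIST_HEAD(nodehead, node);

/*
 * The _SAFE variant caches the successor (tmp) before the body runs,
 * so removing and freeing the current element is harmless.
 */
static void
prune_even(struct nodehead *head)
{
	struct node *n, *tmp;

	SLIST_FOREACH_SAFE(n, head, link, tmp) {
		if ((n->val & 1) == 0) {
			SLIST_REMOVE(head, n, node, link);
			free(n);
		}
	}
}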
@@ -1875,7 +2053,7 @@
 }
 
 /*
- * remove all knotes from a specified klist
+ * remove knote from the specified knlist
  */
 void
 knlist_remove(struct knlist *knl, struct knote *kn, int islocked)
@@ -1885,7 +2063,7 @@
 }
 
 /*
- * remove knote from a specified klist while in f_event handler.
+ * remove knote from the specified knlist while in f_event handler.
  */
 void
 knlist_remove_inevent(struct knlist *knl, struct knote *kn)
@@ -1898,13 +2076,14 @@
 int
 knlist_empty(struct knlist *knl)
 {
+
 	KNL_ASSERT_LOCKED(knl);
-	return SLIST_EMPTY(&knl->kl_list);
+	return (SLIST_EMPTY(&knl->kl_list));
 }
 
-static struct mtx	knlist_lock;
+static struct mtx knlist_lock;
 MTX_SYSINIT(knlist_lock, &knlist_lock, "knlist lock for lockless objects",
-	MTX_DEF);
+    MTX_DEF);
 static void knlist_mtx_lock(void *arg);
 static void knlist_mtx_unlock(void *arg);
 
@@ -1911,6 +2090,7 @@
 static void
 knlist_mtx_lock(void *arg)
 {
+
 	mtx_lock((struct mtx *)arg);
 }
 
@@ -1917,6 +2097,7 @@
 static void
 knlist_mtx_unlock(void *arg)
 {
+
 	mtx_unlock((struct mtx *)arg);
 }
 
@@ -1923,6 +2104,7 @@
 static void
 knlist_mtx_assert_locked(void *arg)
 {
+
 	mtx_assert((struct mtx *)arg, MA_OWNED);
 }
 
@@ -1929,9 +2111,38 @@
 static void
 knlist_mtx_assert_unlocked(void *arg)
 {
+
 	mtx_assert((struct mtx *)arg, MA_NOTOWNED);
 }
 
+static void
+knlist_rw_rlock(void *arg)
+{
+
+	rw_rlock((struct rwlock *)arg);
+}
+
+static void
+knlist_rw_runlock(void *arg)
+{
+
+	rw_runlock((struct rwlock *)arg);
+}
+
+static void
+knlist_rw_assert_locked(void *arg)
+{
+
+	rw_assert((struct rwlock *)arg, RA_LOCKED);
+}
+
+static void
+knlist_rw_assert_unlocked(void *arg)
+{
+
+	rw_assert((struct rwlock *)arg, RA_UNLOCKED);
+}
+
 void
 knlist_init(struct knlist *knl, void *lock, void (*kl_lock)(void *),
     void (*kl_unlock)(void *),
@@ -1971,20 +2182,19 @@
 }
 
 void
+knlist_init_rw_reader(struct knlist *knl, struct rwlock *lock)
+{
+
+	knlist_init(knl, lock, knlist_rw_rlock, knlist_rw_runlock,
+	    knlist_rw_assert_locked, knlist_rw_assert_unlocked);
+}
+
+void
 knlist_destroy(struct knlist *knl)
 {
 
-#ifdef INVARIANTS
-	/*
-	 * if we run across this error, we need to find the offending
-	 * driver and have it call knlist_clear.
-	 */
-	if (!SLIST_EMPTY(&knl->kl_list))
-		printf("WARNING: destroying knlist w/ knotes on it!\n");
-#endif
-
-	knl->kl_lockarg = knl->kl_lock = knl->kl_unlock = NULL;
-	SLIST_INIT(&knl->kl_list);
+	KASSERT(KNLIST_EMPTY(knl),
+	    ("destroying knlist %p with knotes on it", knl));
 }
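
Illustrative note: the new knlist_init_rw_reader() lets a subsystem back
its knote list with the read side of an rwlock instead of a mutex.  A
hypothetical kernel-side user; the structure and names are invented for
the sketch.

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/rwlock.h>
#include <sys/selinfo.h>
#include <sys/event.h>

/* Hypothetical driver state; names are illustrative. */
struct foo_softc {
	struct rwlock	sc_lock;
	struct selinfo	sc_sel;
};

static void
foo_init(struct foo_softc *sc)
{
	rw_init(&sc->sc_lock, "foosc");
	/* Walking and firing the notes now needs only a read lock. */
	knlist_init_rw_reader(&sc->sc_sel.si_note, &sc->sc_lock);
}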
 
 /*
@@ -2066,7 +2276,7 @@
 	 * We shouldn't have to worry about new kevents appearing on fd
 	 * since filedesc is locked.
 	 */
-	SLIST_FOREACH(kq, &fdp->fd_kqlist, kq_list) {
+	TAILQ_FOREACH(kq, &fdp->fd_kqlist, kq_list) {
 		KQ_LOCK(kq);
 
 again:
@@ -2103,17 +2313,15 @@
 
 	if (kn->kn_fop->f_isfd) {
 		if (kn->kn_id >= kq->kq_knlistsize)
-			return ENOMEM;
+			return (ENOMEM);
 		list = &kq->kq_knlist[kn->kn_id];
 	} else {
 		if (kq->kq_knhash == NULL)
-			return ENOMEM;
+			return (ENOMEM);
 		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
 	}
-
 	SLIST_INSERT_HEAD(list, kn, kn_link);
-
-	return 0;
+	return (0);
 }
 
 /*
@@ -2212,19 +2420,19 @@
 {
 	struct kqueue *kq;
 	struct file *fp;
+	cap_rights_t rights;
 	int error;
 
-	if ((error = fget(td, fd, CAP_POST_EVENT, &fp)) != 0)
+	error = fget(td, fd, cap_rights_init(&rights, CAP_KQUEUE_CHANGE), &fp);
+	if (error != 0)
 		return (error);
 	if ((error = kqueue_acquire(fp, &kq)) != 0)
 		goto noacquire;
 
 	error = kqueue_register(kq, kev, td, waitok);
-
 	kqueue_release(kq, 0);
 
 noacquire:
 	fdrop(fp, td);
-
-	return error;
+	return (error);
 }

Modified: trunk/sys/kern/kern_exec.c
===================================================================
--- trunk/sys/kern/kern_exec.c	2018-05-25 20:39:59 UTC (rev 9943)
+++ trunk/sys/kern/kern_exec.c	2018-05-25 20:46:51 UTC (rev 9944)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1993, David Greenman
  * All rights reserved.
@@ -25,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/kern_exec.c 330067 2018-02-27 14:45:55Z avg $");
 
 #include "opt_capsicum.h"
 #include "opt_hwpmc_hooks.h"
@@ -34,9 +35,9 @@
 #include "opt_vm.h"
 
 #include <sys/param.h>
-#include <sys/capability.h>
+#include <sys/capsicum.h>
 #include <sys/systm.h>
-#include <sys/capability.h>
+#include <sys/capsicum.h>
 #include <sys/eventhandler.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
@@ -55,8 +56,10 @@
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/pioctl.h>
+#include <sys/ptrace.h>
 #include <sys/namei.h>
 #include <sys/resourcevar.h>
+#include <sys/rwlock.h>
 #include <sys/sched.h>
 #include <sys/sdt.h>
 #include <sys/sf_buf.h>
@@ -95,15 +98,22 @@
 #endif
 
 SDT_PROVIDER_DECLARE(proc);
-SDT_PROBE_DEFINE(proc, kernel, , exec, exec);
-SDT_PROBE_ARGTYPE(proc, kernel, , exec, 0, "char *");
-SDT_PROBE_DEFINE(proc, kernel, , exec_failure, exec-failure);
-SDT_PROBE_ARGTYPE(proc, kernel, , exec_failure, 0, "int");
-SDT_PROBE_DEFINE(proc, kernel, , exec_success, exec-success);
-SDT_PROBE_ARGTYPE(proc, kernel, , exec_success, 0, "char *");
+SDT_PROBE_DEFINE1(proc, , , exec, "char *");
+SDT_PROBE_DEFINE1(proc, , , exec__failure, "int");
+SDT_PROBE_DEFINE1(proc, , , exec__success, "char *");
 
 MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments");
 
+int coredump_pack_fileinfo = 1;
+SYSCTL_INT(_kern, OID_AUTO, coredump_pack_fileinfo, CTLFLAG_RWTUN,
+    &coredump_pack_fileinfo, 0,
+    "Enable file path packing in 'procstat -f' coredump notes");
+
+int coredump_pack_vmmapinfo = 1;
+SYSCTL_INT(_kern, OID_AUTO, coredump_pack_vmmapinfo, CTLFLAG_RWTUN,
+    &coredump_pack_vmmapinfo, 0,
+    "Enable file path packing in 'procstat -v' coredump notes");
+
 static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS);
 static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS);
 static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS);
@@ -125,6 +135,11 @@
 SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW, 
     &ps_arg_cache_limit, 0, "");
 
+static int disallow_high_osrel;
+SYSCTL_INT(_kern, OID_AUTO, disallow_high_osrel, CTLFLAG_RW,
+    &disallow_high_osrel, 0,
+    "Disallow execution of binaries built for higher version of the world");
+
 static int map_at_zero = 0;
 TUNABLE_INT("security.bsd.map_at_zero", &map_at_zero);
 SYSCTL_INT(_security_bsd, OID_AUTO, map_at_zero, CTLFLAG_RW, &map_at_zero, 0,
@@ -193,21 +208,20 @@
 #endif
 
 int
-sys_execve(td, uap)
-	struct thread *td;
-	struct execve_args /* {
-		char *fname;
-		char **argv;
-		char **envv;
-	} */ *uap;
+sys_execve(struct thread *td, struct execve_args *uap)
 {
+	struct image_args args;
+	struct vmspace *oldvmspace;
 	int error;
-	struct image_args args;
 
+	error = pre_execve(td, &oldvmspace);
+	if (error != 0)
+		return (error);
 	error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE,
 	    uap->argv, uap->envv);
 	if (error == 0)
 		error = kern_execve(td, &args, NULL);
+	post_execve(td, error, oldvmspace);
 	return (error);
 }
 
@@ -221,9 +235,13 @@
 int
 sys_fexecve(struct thread *td, struct fexecve_args *uap)
 {
+	struct image_args args;
+	struct vmspace *oldvmspace;
 	int error;
-	struct image_args args;
 
+	error = pre_execve(td, &oldvmspace);
+	if (error != 0)
+		return (error);
 	error = exec_copyin_args(&args, NULL, UIO_SYSSPACE,
 	    uap->argv, uap->envv);
 	if (error == 0) {
@@ -230,6 +248,7 @@
 		args.fd = uap->fd;
 		error = kern_execve(td, &args, NULL);
 	}
+	post_execve(td, error, oldvmspace);
 	return (error);
 }
 
@@ -243,23 +262,21 @@
 #endif
 
 int
-sys___mac_execve(td, uap)
-	struct thread *td;
-	struct __mac_execve_args /* {
-		char *fname;
-		char **argv;
-		char **envv;
-		struct mac *mac_p;
-	} */ *uap;
+sys___mac_execve(struct thread *td, struct __mac_execve_args *uap)
 {
 #ifdef MAC
+	struct image_args args;
+	struct vmspace *oldvmspace;
 	int error;
-	struct image_args args;
 
+	error = pre_execve(td, &oldvmspace);
+	if (error != 0)
+		return (error);
 	error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE,
 	    uap->argv, uap->envv);
 	if (error == 0)
 		error = kern_execve(td, &args, uap->mac_p);
+	post_execve(td, error, oldvmspace);
 	return (error);
 #else
 	return (ENOSYS);
@@ -266,39 +283,35 @@
 #endif
 }
 
-/*
- * XXX: kern_execve has the astonishing property of not always returning to
- * the caller.  If sufficiently bad things happen during the call to
- * do_execve(), it can end up calling exit1(); as a result, callers must
- * avoid doing anything which they might need to undo (e.g., allocating
- * memory).
- */
 int
-kern_execve(td, args, mac_p)
-	struct thread *td;
-	struct image_args *args;
-	struct mac *mac_p;
+pre_execve(struct thread *td, struct vmspace **oldvmspace)
 {
-	struct proc *p = td->td_proc;
+	struct proc *p;
 	int error;
 
-	AUDIT_ARG_ARGV(args->begin_argv, args->argc,
-	    args->begin_envv - args->begin_argv);
-	AUDIT_ARG_ENVV(args->begin_envv, args->envc,
-	    args->endp - args->begin_envv);
-	if (p->p_flag & P_HADTHREADS) {
+	KASSERT(td == curthread, ("non-current thread %p", td));
+	error = 0;
+	p = td->td_proc;
+	if ((p->p_flag & P_HADTHREADS) != 0) {
 		PROC_LOCK(p);
-		if (thread_single(SINGLE_BOUNDARY)) {
-			PROC_UNLOCK(p);
-	       		exec_free_args(args);
-			return (ERESTART);	/* Try again later. */
-		}
+		if (thread_single(p, SINGLE_BOUNDARY) != 0)
+			error = ERESTART;
 		PROC_UNLOCK(p);
 	}
+	KASSERT(error != 0 || (td->td_pflags & TDP_EXECVMSPC) == 0,
+	    ("nested execve"));
+	*oldvmspace = p->p_vmspace;
+	return (error);
+}
 
-	error = do_execve(td, args, mac_p);
+void
+post_execve(struct thread *td, int error, struct vmspace *oldvmspace)
+{
+	struct proc *p;
 
-	if (p->p_flag & P_HADTHREADS) {
+	KASSERT(td == curthread, ("non-current thread %p", td));
+	p = td->td_proc;
+	if ((p->p_flag & P_HADTHREADS) != 0) {
 		PROC_LOCK(p);
 		/*
 		 * If success, we upgrade to SINGLE_EXIT state to
@@ -305,13 +318,35 @@
 		 * force other threads to suicide.
 		 */
 		if (error == 0)
-			thread_single(SINGLE_EXIT);
+			thread_single(p, SINGLE_EXIT);
 		else
-			thread_single_end();
+			thread_single_end(p, SINGLE_BOUNDARY);
 		PROC_UNLOCK(p);
 	}
+	if ((td->td_pflags & TDP_EXECVMSPC) != 0) {
+		KASSERT(p->p_vmspace != oldvmspace,
+		    ("oldvmspace still used"));
+		vmspace_free(oldvmspace);
+		td->td_pflags &= ~TDP_EXECVMSPC;
+	}
+}
 
-	return (error);
+/*
+ * XXX: kern_execve has the astonishing property of not always returning to
+ * the caller.  If sufficiently bad things happen during the call to
+ * do_execve(), it can end up calling exit1(); as a result, callers must
+ * avoid doing anything which they might need to undo (e.g., allocating
+ * memory).
+ */
+int
+kern_execve(struct thread *td, struct image_args *args, struct mac *mac_p)
+{
+
+	AUDIT_ARG_ARGV(args->begin_argv, args->argc,
+	    args->begin_envv - args->begin_argv);
+	AUDIT_ARG_ENVV(args->begin_envv, args->envc,
+	    args->endp - args->begin_envv);
+	return (do_execve(td, args, mac_p));
 }
 
 /*
@@ -326,8 +361,8 @@
 {
 	struct proc *p = td->td_proc;
 	struct nameidata nd;
-	struct ucred *newcred = NULL, *oldcred;
-	struct uidinfo *euip;
+	struct ucred *oldcred;
+	struct uidinfo *euip = NULL;
 	register_t *stack_base;
 	int error, i;
 	struct image_params image_params, *imgp;
@@ -334,14 +369,14 @@
 	struct vattr attr;
 	int (*img_first)(struct image_params *);
 	struct pargs *oldargs = NULL, *newargs = NULL;
-	struct sigacts *oldsigacts, *newsigacts;
+	struct sigacts *oldsigacts = NULL, *newsigacts = NULL;
 #ifdef KTRACE
 	struct vnode *tracevp = NULL;
 	struct ucred *tracecred = NULL;
 #endif
-	struct vnode *textvp = NULL, *binvp = NULL;
+	struct vnode *oldtextvp = NULL, *newtextvp;
+	cap_rights_t rights;
 	int credential_changing;
-	int vfslocked;
 	int textset;
 #ifdef MAC
 	struct label *interpvplabel = NULL;
@@ -352,7 +387,6 @@
 #endif
 	static const char fexecv_proc_title[] = "(fexecv)";
 
-	vfslocked = 0;
 	imgp = &image_params;
 
 	/*
@@ -371,29 +405,11 @@
 	/*
 	 * Initialize part of the common data
 	 */
+	bzero(imgp, sizeof(*imgp));
 	imgp->proc = p;
-	imgp->execlabel = NULL;
 	imgp->attr = &attr;
-	imgp->entry_addr = 0;
-	imgp->reloc_base = 0;
-	imgp->vmspace_destroyed = 0;
-	imgp->interpreted = 0;
-	imgp->opened = 0;
-	imgp->interpreter_name = NULL;
-	imgp->auxargs = NULL;
-	imgp->vp = NULL;
-	imgp->object = NULL;
-	imgp->firstpage = NULL;
-	imgp->ps_strings = 0;
-	imgp->auxarg_size = 0;
 	imgp->args = args;
-	imgp->execpath = imgp->freepath = NULL;
-	imgp->execpathp = 0;
-	imgp->canary = 0;
-	imgp->canarylen = 0;
-	imgp->pagesizes = 0;
-	imgp->pagesizeslen = 0;
-	imgp->stack_prot = 0;
+	oldcred = p->p_ucred;
 
 #ifdef MAC
 	error = mac_execve_enter(imgp, mac_p);
@@ -401,11 +417,9 @@
 		goto exec_fail;
 #endif
 
-	imgp->image_header = NULL;
-
 	/*
 	 * Translate the file name. namei() returns a vnode pointer
-	 *	in ni_vp amoung other things.
+	 *	in ni_vp among other things.
 	 *
 	 * XXXAUDIT: It would be desirable to also audit the name of the
 	 * interpreter if this is an interpreted binary.
@@ -412,10 +426,10 @@
 	 */
 	if (args->fname != NULL) {
 		NDINIT(&nd, LOOKUP, ISOPEN | LOCKLEAF | FOLLOW | SAVENAME
-		    | MPSAFE | AUDITVNODE1, UIO_SYSSPACE, args->fname, td);
+		    | AUDITVNODE1, UIO_SYSSPACE, args->fname, td);
 	}
 
-	SDT_PROBE(proc, kernel, , exec, args->fname, 0, 0, 0, 0 );
+	SDT_PROBE1(proc, , , exec, args->fname);
 
 interpret:
 	if (args->fname != NULL) {
@@ -435,24 +449,20 @@
 		if (error)
 			goto exec_fail;
 
-		vfslocked = NDHASGIANT(&nd);
-		binvp  = nd.ni_vp;
-		imgp->vp = binvp;
+		newtextvp = nd.ni_vp;
+		imgp->vp = newtextvp;
 	} else {
 		AUDIT_ARG_FD(args->fd);
 		/*
-		 * Some might argue that CAP_READ and/or CAP_MMAP should also
-		 * be required here; such arguments will be entertained.
-		 *
 		 * Descriptors opened only with O_EXEC or O_RDONLY are allowed.
 		 */
-		error = fgetvp_exec(td, args->fd, CAP_FEXECVE, &binvp);
+		error = fgetvp_exec(td, args->fd,
+		    cap_rights_init(&rights, CAP_FEXECVE), &newtextvp);
 		if (error)
 			goto exec_fail;
-		vfslocked = VFS_LOCK_GIANT(binvp->v_mount);
-		vn_lock(binvp, LK_EXCLUSIVE | LK_RETRY);
-		AUDIT_ARG_VNODE1(binvp);
-		imgp->vp = binvp;
+		vn_lock(newtextvp, LK_EXCLUSIVE | LK_RETRY);
+		AUDIT_ARG_VNODE1(newtextvp);
+		imgp->vp = newtextvp;
 	}
 
 	/*
@@ -481,7 +491,101 @@
 		goto exec_fail_dealloc;
 
 	imgp->proc->p_osrel = 0;
+
 	/*
+	 * Implement image setuid/setgid.
+	 *
+	 * Determine new credentials before attempting image activators
+	 * so that it can be used by process_exec handlers to determine
+	 * credential/setid changes.
+	 *
+	 * Don't honor setuid/setgid if the filesystem prohibits it or if
+	 * the process is being traced.
+	 *
+	 * We disable setuid/setgid/etc in capability mode on the basis
+	 * that most setugid applications are not written with that
+	 * environment in mind, and will therefore almost certainly operate
+	 * incorrectly. In principle there's no reason that setugid
+	 * applications might not be useful in capability mode, so we may want
+	 * to reconsider this conservative design choice in the future.
+	 *
+	 * XXXMAC: For the time being, use NOSUID to also prohibit
+	 * transitions on the file system.
+	 */
+	credential_changing = 0;
+	credential_changing |= (attr.va_mode & S_ISUID) &&
+	    oldcred->cr_uid != attr.va_uid;
+	credential_changing |= (attr.va_mode & S_ISGID) &&
+	    oldcred->cr_gid != attr.va_gid;
+#ifdef MAC
+	will_transition = mac_vnode_execve_will_transition(oldcred, imgp->vp,
+	    interpvplabel, imgp);
+	credential_changing |= will_transition;
+#endif
+
+	if (credential_changing &&
+#ifdef CAPABILITY_MODE
+	    ((oldcred->cr_flags & CRED_FLAG_CAPMODE) == 0) &&
+#endif
+	    (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 &&
+	    (p->p_flag & P_TRACED) == 0) {
+		imgp->credential_setid = true;
+		VOP_UNLOCK(imgp->vp, 0);
+		imgp->newcred = crdup(oldcred);
+		if (attr.va_mode & S_ISUID) {
+			euip = uifind(attr.va_uid);
+			change_euid(imgp->newcred, euip);
+		}
+		vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
+		if (attr.va_mode & S_ISGID)
+			change_egid(imgp->newcred, attr.va_gid);
+		/*
+		 * Implement correct POSIX saved-id behavior.
+		 *
+		 * XXXMAC: Note that the current logic will save the
+		 * uid and gid if a MAC domain transition occurs, even
+		 * though maybe it shouldn't.
+		 */
+		change_svuid(imgp->newcred, imgp->newcred->cr_uid);
+		change_svgid(imgp->newcred, imgp->newcred->cr_gid);
+	} else {
+		/*
+		 * Implement correct POSIX saved-id behavior.
+		 *
+		 * XXX: It's not clear that the existing behavior is
+		 * POSIX-compliant.  A number of sources indicate that the
+		 * saved uid/gid should only be updated if the new ruid is
+		 * not equal to the old ruid, or the new euid is not equal
+		 * to the old euid and the new euid is not equal to the old
+		 * ruid.  The FreeBSD code always updates the saved uid/gid.
+		 * Also, this code uses the new (replaced) euid and egid as
+		 * the source, which may or may not be the right ones to use.
+		 */
+		if (oldcred->cr_svuid != oldcred->cr_uid ||
+		    oldcred->cr_svgid != oldcred->cr_gid) {
+			VOP_UNLOCK(imgp->vp, 0);
+			imgp->newcred = crdup(oldcred);
+			vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
+			change_svuid(imgp->newcred, imgp->newcred->cr_uid);
+			change_svgid(imgp->newcred, imgp->newcred->cr_gid);
+		}
+	}
+	/* The new credentials are installed into the process later. */
+
+	/*
+	 * Do the best to calculate the full path to the image file.
+	 */
+	if (args->fname != NULL && args->fname[0] == '/')
+		imgp->execpath = args->fname;
+	else {
+		VOP_UNLOCK(imgp->vp, 0);
+		if (vn_fullpath(td, imgp->vp, &imgp->execpath,
+		    &imgp->freepath) != 0)
+			imgp->execpath = args->fname;
+		vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
+	}
+
+	/*
 	 *	If the current process has a special image activator it
 	 *	wants to try first, call it.   For example, emulating shell
 	 *	scripts differently.
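
Illustrative note: moving the set-id logic ahead of the image activators
means the new credentials are known before any activator runs.  The
predicate deciding whether credentials will change, reduced to a
standalone sketch (the MAC transition case is omitted):

#include <sys/types.h>
#include <sys/stat.h>
#include <stdbool.h>

/* Will executing this image change credentials?  (MAC case omitted.) */
static bool
cred_would_change(mode_t mode, uid_t va_uid, gid_t va_gid,
    uid_t cr_uid, gid_t cr_gid)
{
	bool changing = false;

	if ((mode & S_ISUID) != 0 && cr_uid != va_uid)
		changing = true;
	if ((mode & S_ISGID) != 0 && cr_gid != va_gid)
		changing = true;
	return (changing);
}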
@@ -529,19 +633,25 @@
 		if (args->fname != NULL)
 			NDFREE(&nd, NDF_ONLY_PNBUF);
 #ifdef MAC
-		mac_execve_interpreter_enter(binvp, &interpvplabel);
+		mac_execve_interpreter_enter(newtextvp, &interpvplabel);
 #endif
 		if (imgp->opened) {
-			VOP_CLOSE(binvp, FREAD, td->td_ucred, td);
+			VOP_CLOSE(newtextvp, FREAD, td->td_ucred, td);
 			imgp->opened = 0;
 		}
-		vput(binvp);
+		vput(newtextvp);
 		vm_object_deallocate(imgp->object);
 		imgp->object = NULL;
-		VFS_UNLOCK_GIANT(vfslocked);
-		vfslocked = 0;
+		imgp->credential_setid = false;
+		if (imgp->newcred != NULL) {
+			crfree(imgp->newcred);
+			imgp->newcred = NULL;
+		}
+		imgp->execpath = NULL;
+		free(imgp->freepath, M_TEMP);
+		imgp->freepath = NULL;
 		/* set new name to that of the interpreter */
-		NDINIT(&nd, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME | MPSAFE,
+		NDINIT(&nd, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME,
 		    UIO_SYSSPACE, imgp->interpreter_name, td);
 		args->fname = imgp->interpreter_name;
 		goto interpret;
@@ -553,13 +663,14 @@
 	 */
 	VOP_UNLOCK(imgp->vp, 0);
 
-	/*
-	 * Do the best to calculate the full path to the image file.
-	 */
-	if (imgp->auxargs != NULL &&
-	    ((args->fname != NULL && args->fname[0] == '/') ||
-	     vn_fullpath(td, imgp->vp, &imgp->execpath, &imgp->freepath) != 0))
-		imgp->execpath = args->fname;
+	if (disallow_high_osrel &&
+	    P_OSREL_MAJOR(p->p_osrel) > P_OSREL_MAJOR(__FreeBSD_version)) {
+		error = ENOEXEC;
+		uprintf("Osrel %d for image %s too high\n", p->p_osrel,
+		    imgp->execpath != NULL ? imgp->execpath : "<unresolved>");
+		vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
+		goto exec_fail_dealloc;
+	}
 
 	/*
 	 * Copy out strings (args and env) and initialize stack base
@@ -583,13 +694,13 @@
 	 * For security and other reasons, the file descriptor table cannot
 	 * be shared after an exec.
 	 */
-	fdunshare(p, td);
+	fdunshare(td);
+	/* close files on exec */
+	fdcloseexec(td);
 
 	/*
 	 * Malloc things before we need locks.
 	 */
-	newcred = crget();
-	euip = uifind(attr.va_uid);
 	i = imgp->args->begin_envv - imgp->args->begin_argv;
 	/* Cache arguments if they fit inside our allowance */
 	if (ps_arg_cache_limit >= i + sizeof(struct pargs)) {
@@ -597,13 +708,6 @@
 		bcopy(imgp->args->begin_argv, newargs->ar_args, i);
 	}
 
-	/* close files on exec */
-	fdcloseexec(td);
-	vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
-
-	/* Get a reference to the vnode prior to locking the proc */
-	VREF(binvp);
-
 	/*
 	 * For security and other reasons, signal handlers cannot
 	 * be shared after an exec. The new process gets a copy of the old
@@ -610,18 +714,17 @@
 	 * handlers. In execsigs(), the new process will have its signals
 	 * reset.
 	 */
-	PROC_LOCK(p);
-	oldcred = crcopysafe(p, newcred);
 	if (sigacts_shared(p->p_sigacts)) {
 		oldsigacts = p->p_sigacts;
-		PROC_UNLOCK(p);
 		newsigacts = sigacts_alloc();
 		sigacts_copy(newsigacts, oldsigacts);
-		PROC_LOCK(p);
+	}
+
+	vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
+
+	PROC_LOCK(p);
+	if (oldsigacts)
 		p->p_sigacts = newsigacts;
-	} else
-		oldsigacts = NULL;
-
 	/* Stop profiling */
 	stopprofclock(p);
 
@@ -633,7 +736,7 @@
 	if (args->fname)
 		bcopy(nd.ni_cnd.cn_nameptr, p->p_comm,
 		    min(nd.ni_cnd.cn_namelen, MAXCOMLEN));
-	else if (vn_commname(binvp, p->p_comm, sizeof(p->p_comm)) != 0)
+	else if (vn_commname(newtextvp, p->p_comm, sizeof(p->p_comm)) != 0)
 		bcopy(fexecv_proc_title, p->p_comm, sizeof(fexecv_proc_title));
 	bcopy(p->p_comm, td->td_name, sizeof(td->td_name));
 #ifdef KTR
@@ -645,44 +748,19 @@
 	 * it that it now has its own resources back
 	 */
 	p->p_flag |= P_EXEC;
-	if (p->p_pptr && (p->p_flag & P_PPWAIT)) {
+	if ((p->p_flag2 & P2_NOTRACE_EXEC) == 0)
+		p->p_flag2 &= ~P2_NOTRACE;
+	if (p->p_flag & P_PPWAIT) {
 		p->p_flag &= ~(P_PPWAIT | P_PPTRACE);
 		cv_broadcast(&p->p_pwait);
+		/* STOPs are no longer ignored, arrange for AST */
+		signotify(td);
 	}
 
 	/*
-	 * Implement image setuid/setgid.
-	 *
-	 * Don't honor setuid/setgid if the filesystem prohibits it or if
-	 * the process is being traced.
-	 *
-	 * We disable setuid/setgid/etc in compatibility mode on the basis
-	 * that most setugid applications are not written with that
-	 * environment in mind, and will therefore almost certainly operate
-	 * incorrectly. In principle there's no reason that setugid
-	 * applications might not be useful in capability mode, so we may want
-	 * to reconsider this conservative design choice in the future.
-	 *
-	 * XXXMAC: For the time being, use NOSUID to also prohibit
-	 * transitions on the file system.
+	 * Implement image setuid/setgid installation.
 	 */
-	credential_changing = 0;
-	credential_changing |= (attr.va_mode & S_ISUID) && oldcred->cr_uid !=
-	    attr.va_uid;
-	credential_changing |= (attr.va_mode & S_ISGID) && oldcred->cr_gid !=
-	    attr.va_gid;
-#ifdef MAC
-	will_transition = mac_vnode_execve_will_transition(oldcred, imgp->vp,
-	    interpvplabel, imgp);
-	credential_changing |= will_transition;
-#endif
-
-	if (credential_changing &&
-#ifdef CAPABILITY_MODE
-	    ((oldcred->cr_flags & CRED_FLAG_CAPMODE) == 0) &&
-#endif
-	    (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 &&
-	    (p->p_flag & P_TRACED) == 0) {
+	if (imgp->credential_setid) {
 		/*
 		 * Turn off syscall tracing for set-id programs, except for
 		 * root.  Record any set-id flags first to make sure that
@@ -710,63 +788,34 @@
 		error = fdcheckstd(td);
 		vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
 		if (error != 0)
-			goto done1;
+			goto exec_fail_dealloc;
 		PROC_LOCK(p);
-		/*
-		 * Set the new credentials.
-		 */
-		if (attr.va_mode & S_ISUID)
-			change_euid(newcred, euip);
-		if (attr.va_mode & S_ISGID)
-			change_egid(newcred, attr.va_gid);
 #ifdef MAC
 		if (will_transition) {
-			mac_vnode_execve_transition(oldcred, newcred, imgp->vp,
-			    interpvplabel, imgp);
+			mac_vnode_execve_transition(oldcred, imgp->newcred,
+			    imgp->vp, interpvplabel, imgp);
 		}
 #endif
-		/*
-		 * Implement correct POSIX saved-id behavior.
-		 *
-		 * XXXMAC: Note that the current logic will save the
-		 * uid and gid if a MAC domain transition occurs, even
-		 * though maybe it shouldn't.
-		 */
-		change_svuid(newcred, newcred->cr_uid);
-		change_svgid(newcred, newcred->cr_gid);
-		p->p_ucred = newcred;
-		newcred = NULL;
 	} else {
 		if (oldcred->cr_uid == oldcred->cr_ruid &&
 		    oldcred->cr_gid == oldcred->cr_rgid)
 			p->p_flag &= ~P_SUGID;
-		/*
-		 * Implement correct POSIX saved-id behavior.
-		 *
-		 * XXX: It's not clear that the existing behavior is
-		 * POSIX-compliant.  A number of sources indicate that the
-		 * saved uid/gid should only be updated if the new ruid is
-		 * not equal to the old ruid, or the new euid is not equal
-		 * to the old euid and the new euid is not equal to the old
-		 * ruid.  The FreeBSD code always updates the saved uid/gid.
-		 * Also, this code uses the new (replaced) euid and egid as
-		 * the source, which may or may not be the right ones to use.
-		 */
-		if (oldcred->cr_svuid != oldcred->cr_uid ||
-		    oldcred->cr_svgid != oldcred->cr_gid) {
-			change_svuid(newcred, newcred->cr_uid);
-			change_svgid(newcred, newcred->cr_gid);
-			p->p_ucred = newcred;
-			newcred = NULL;
-		}
 	}
+	/*
+	 * Set the new credentials.
+	 */
+	if (imgp->newcred != NULL) {
+		proc_set_cred(p, imgp->newcred);
+		crfree(oldcred);
+		oldcred = NULL;
+	}
 
 	/*
-	 * Store the vp for use in procfs.  This vnode was referenced prior
-	 * to locking the proc lock.
+	 * Store the vp for use in procfs.  This vnode was referenced by namei
+	 * or fgetvp_exec.
 	 */
-	textvp = p->p_textvp;
-	p->p_textvp = binvp;
+	oldtextvp = p->p_textvp;
+	p->p_textvp = newtextvp;
 
 #ifdef KDTRACE_HOOKS
 	/*
@@ -828,53 +877,9 @@
 
 	vfs_mark_atime(imgp->vp, td->td_ucred);
 
-	SDT_PROBE(proc, kernel, , exec_success, args->fname, 0, 0, 0, 0);
+	SDT_PROBE1(proc, , , exec__success, args->fname);
 
-done1:
-	/*
-	 * Free any resources malloc'd earlier that we didn't use.
-	 */
-	uifree(euip);
-	if (newcred == NULL)
-		crfree(oldcred);
-	else
-		crfree(newcred);
-	VOP_UNLOCK(imgp->vp, 0);
-
-	/*
-	 * Handle deferred decrement of ref counts.
-	 */
-	if (textvp != NULL) {
-		int tvfslocked;
-
-		tvfslocked = VFS_LOCK_GIANT(textvp->v_mount);
-		vrele(textvp);
-		VFS_UNLOCK_GIANT(tvfslocked);
-	}
-	if (binvp && error != 0)
-		vrele(binvp);
-#ifdef KTRACE
-	if (tracevp != NULL) {
-		int tvfslocked;
-
-		tvfslocked = VFS_LOCK_GIANT(tracevp->v_mount);
-		vrele(tracevp);
-		VFS_UNLOCK_GIANT(tvfslocked);
-	}
-	if (tracecred != NULL)
-		crfree(tracecred);
-#endif
-	vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
-	pargs_drop(oldargs);
-	pargs_drop(newargs);
-	if (oldsigacts != NULL)
-		sigacts_free(oldsigacts);
-
 exec_fail_dealloc:
-
-	/*
-	 * free various allocated resources
-	 */
 	if (imgp->firstpage != NULL)
 		exec_unmap_first_page(imgp);
 
@@ -883,7 +888,10 @@
 			NDFREE(&nd, NDF_ONLY_PNBUF);
 		if (imgp->opened)
 			VOP_CLOSE(imgp->vp, FREAD, td->td_ucred, td);
-		vput(imgp->vp);
+		if (error != 0)
+			vput(imgp->vp);
+		else
+			VOP_UNLOCK(imgp->vp, 0);
 	}
 
 	if (imgp->object != NULL)
@@ -893,7 +901,8 @@
 
 	if (error == 0) {
 		PROC_LOCK(p);
-		td->td_dbgflags |= TDB_EXEC;
+		if (p->p_ptevents & PTRACE_EXEC)
+			td->td_dbgflags |= TDB_EXEC;
 		PROC_UNLOCK(p);
 
 		/*
@@ -901,25 +910,43 @@
 		 * the S_EXEC bit set.
 		 */
 		STOPEVENT(p, S_EXEC, 0);
-		goto done2;
+	} else {
+exec_fail:
+		/* we're done here, clear P_INEXEC */
+		PROC_LOCK(p);
+		p->p_flag &= ~P_INEXEC;
+		PROC_UNLOCK(p);
+
+		SDT_PROBE1(proc, , , exec__failure, error);
 	}
 
-exec_fail:
-	/* we're done here, clear P_INEXEC */
-	PROC_LOCK(p);
-	p->p_flag &= ~P_INEXEC;
-	PROC_UNLOCK(p);
+	if (imgp->newcred != NULL && oldcred != NULL)
+		crfree(imgp->newcred);
 
-	SDT_PROBE(proc, kernel, , exec_failure, error, 0, 0, 0, 0);
-
-done2:
 #ifdef MAC
 	mac_execve_exit(imgp);
 	mac_execve_interpreter_exit(interpvplabel);
 #endif
-	VFS_UNLOCK_GIANT(vfslocked);
 	exec_free_args(args);
 
+	/*
+	 * Handle deferred decrement of ref counts.
+	 */
+	if (oldtextvp != NULL)
+		vrele(oldtextvp);
+#ifdef KTRACE
+	if (tracevp != NULL)
+		vrele(tracevp);
+	if (tracecred != NULL)
+		crfree(tracecred);
+#endif
+	pargs_drop(oldargs);
+	pargs_drop(newargs);
+	if (oldsigacts != NULL)
+		sigacts_free(oldsigacts);
+	if (euip != NULL)
+		uifree(euip);
+
 	if (error && imgp->vmspace_destroyed) {
 		/* sorry, the process is already gone; exit gracefully */
 		exit1(td, W_EXITCODE(0, SIGABRT));
@@ -949,7 +976,7 @@
 	object = imgp->vp->v_object;
 	if (object == NULL)
 		return (EACCES);
-	VM_OBJECT_LOCK(object);
+	VM_OBJECT_WLOCK(object);
 #if VM_NRESERVLEVEL > 0
 	if ((object->flags & OBJ_COLORED) == 0) {
 		object->flags |= OBJ_COLORED;
@@ -956,7 +983,7 @@
 		object->pg_color = 0;
 	}
 #endif
-	ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
+	ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL);
 	if (ma[0]->valid != VM_PAGE_BITS_ALL) {
 		initial_pagein = VM_INITIAL_PAGEIN;
 		if (initial_pagein > object->size)
@@ -965,9 +992,8 @@
 			if ((ma[i] = vm_page_next(ma[i - 1])) != NULL) {
 				if (ma[i]->valid)
 					break;
-				if ((ma[i]->oflags & VPO_BUSY) || ma[i]->busy)
+				if (!vm_page_tryxbusy(ma[i]))
 					break;
-				vm_page_busy(ma[i]);
 			} else {
 				ma[i] = vm_page_alloc(object, i,
 				    VM_ALLOC_NORMAL | VM_ALLOC_IFNOTCACHED);
@@ -984,15 +1010,16 @@
 				vm_page_free(ma[0]);
 				vm_page_unlock(ma[0]);
 			}
-			VM_OBJECT_UNLOCK(object);
+			VM_OBJECT_WUNLOCK(object);
 			return (EIO);
 		}
 	}
+	vm_page_xunbusy(ma[0]);
 	vm_page_lock(ma[0]);
 	vm_page_hold(ma[0]);
+	vm_page_activate(ma[0]);
 	vm_page_unlock(ma[0]);
-	vm_page_wakeup(ma[0]);
-	VM_OBJECT_UNLOCK(object);
+	VM_OBJECT_WUNLOCK(object);
 
 	imgp->firstpage = sf_buf_alloc(ma[0], 0);
 	imgp->image_header = (char *)sf_buf_kva(imgp->firstpage);
@@ -1017,9 +1044,9 @@
 }
 
 /*
- * Destroy old address space, and allocate a new stack
- *	The new stack is only SGROWSIZ large because it is grown
- *	automatically in trap.c.
+ * Destroy old address space, and allocate a new stack.
+ *	The new stack is only sgrowsiz large because it is grown
+ *	automatically on a page fault.
  */
 int
 exec_new_vmspace(imgp, sv)
@@ -1030,6 +1057,7 @@
 	struct proc *p = imgp->proc;
 	struct vmspace *vmspace = p->p_vmspace;
 	vm_object_t obj;
+	struct rlimit rlim_stack;
 	vm_offset_t sv_minuser, stack_addr;
 	vm_map_t map;
 	u_long ssiz;
@@ -1055,6 +1083,10 @@
 		shmexit(vmspace);
 		pmap_remove_pages(vmspace_pmap(vmspace));
 		vm_map_remove(map, vm_map_min(map), vm_map_max(map));
+		/* An exec terminates mlockall(MCL_FUTURE). */
+		vm_map_lock(map);
+		vm_map_modflags(map, 0, MAP_WIREFUTURE);
+		vm_map_unlock(map);
 	} else {
 		error = vmspace_exec(p, sv_minuser, sv->sv_maxuser);
 		if (error)
@@ -1069,31 +1101,42 @@
 		vm_object_reference(obj);
 		error = vm_map_fixed(map, obj, 0,
 		    sv->sv_shared_page_base, sv->sv_shared_page_len,
-		    VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_ALL,
-		    MAP_COPY_ON_WRITE | MAP_ACC_NO_CHARGE);
-		if (error) {
+		    VM_PROT_READ | VM_PROT_EXECUTE,
+		    VM_PROT_READ | VM_PROT_EXECUTE,
+		    MAP_INHERIT_SHARE | MAP_ACC_NO_CHARGE);
+		if (error != KERN_SUCCESS) {
 			vm_object_deallocate(obj);
-			return (error);
+			return (vm_mmap_to_errno(error));
 		}
 	}
 
 	/* Allocate a new stack */
-	if (sv->sv_maxssiz != NULL)
+	if (imgp->stack_sz != 0) {
+		ssiz = trunc_page(imgp->stack_sz);
+		PROC_LOCK(p);
+		lim_rlimit(p, RLIMIT_STACK, &rlim_stack);
+		PROC_UNLOCK(p);
+		if (ssiz > rlim_stack.rlim_max)
+			ssiz = rlim_stack.rlim_max;
+		if (ssiz > rlim_stack.rlim_cur) {
+			rlim_stack.rlim_cur = ssiz;
+			kern_setrlimit(curthread, RLIMIT_STACK, &rlim_stack);
+		}
+	} else if (sv->sv_maxssiz != NULL) {
 		ssiz = *sv->sv_maxssiz;
-	else
+	} else {
 		ssiz = maxssiz;
+	}
 	stack_addr = sv->sv_usrstack - ssiz;
 	error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz,
 	    obj != NULL && imgp->stack_prot != 0 ? imgp->stack_prot :
-		sv->sv_stackprot,
-	    VM_PROT_ALL, MAP_STACK_GROWS_DOWN);
-	if (error)
-		return (error);
+	    sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_DOWN);
+	if (error != KERN_SUCCESS)
+		return (vm_mmap_to_errno(error));
 
 #ifdef __ia64__
 	/* Allocate a new register stack */
-	stack_addr = IA64_BACKINGSTORE;
-	error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz,
+	error = vm_map_stack(map, IA64_BACKINGSTORE, (vm_size_t)ssiz,
 	    sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_UP);
 	if (error)
 		return (error);
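
Illustrative note: exec_new_vmspace() now honors a stack size requested
by the image, truncated to a page boundary, capped at the RLIMIT_STACK
hard limit, and raising the soft limit when necessary.  The selection
logic as a sketch, with the page size assumed and the two sysent
fallbacks collapsed into one:

#include <sys/resource.h>
#include <stdint.h>

#define	PAGE_MASK	((uint64_t)4095)	/* 4 KiB pages, assumed */
#define	trunc_page(x)	((x) & ~PAGE_MASK)

/* The ssiz selection from exec_new_vmspace(), reduced to one helper. */
static rlim_t
pick_stack_size(uint64_t img_stack_sz, struct rlimit *rl, rlim_t maxssiz)
{
	rlim_t ssiz;

	if (img_stack_sz != 0) {
		ssiz = trunc_page(img_stack_sz);
		if (ssiz > rl->rlim_max)
			ssiz = rl->rlim_max;
		if (ssiz > rl->rlim_cur)
			rl->rlim_cur = ssiz;	/* kernel: kern_setrlimit() */
	} else {
		ssiz = maxssiz;
	}
	return (ssiz);
}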
@@ -1104,7 +1147,7 @@
 	 * process stack so we can check the stack rlimit.
 	 */
 	vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT;
-	vmspace->vm_maxsaddr = (char *)sv->sv_usrstack - ssiz;
+	vmspace->vm_maxsaddr = (char *)stack_addr;
 
 	return (0);
 }
@@ -1117,7 +1160,7 @@
 exec_copyin_args(struct image_args *args, char *fname,
     enum uio_seg segflg, char **argv, char **envv)
 {
-	char *argp, *envp;
+	u_long argp, envp;
 	int error;
 	size_t length;
 
@@ -1153,13 +1196,17 @@
 	/*
 	 * extract arguments first
 	 */
-	while ((argp = (caddr_t) (intptr_t) fuword(argv++))) {
-		if (argp == (caddr_t) -1) {
+	for (;;) {
+		error = fueword(argv++, &argp);
+		if (error == -1) {
 			error = EFAULT;
 			goto err_exit;
 		}
-		if ((error = copyinstr(argp, args->endp,
-		    args->stringspace, &length))) {
+		if (argp == 0)
+			break;
+		error = copyinstr((void *)(uintptr_t)argp, args->endp,
+		    args->stringspace, &length);
+		if (error != 0) {
 			if (error == ENAMETOOLONG) 
 				error = E2BIG;
 			goto err_exit;
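
Illustrative note: switching from fuword() to fueword() separates the
fetched word from the error indication, so a user pointer that happens to
equal (void *)-1 is no longer mistaken for EFAULT.  A sketch of the
vector walk; fetch_word() is a hypothetical stand-in for fueword(9).

#include <errno.h>

/*
 * Hypothetical stand-in for fueword(9): returns -1 on fault and stores
 * the fetched word through valp.
 */
extern int fetch_word(const void *uaddr, unsigned long *valp);

/* Count entries of a NULL-terminated user vector, faulting safely. */
static int
count_strings(char **uvec, int *countp)
{
	unsigned long p;
	int n;

	for (n = 0;; n++) {
		if (fetch_word(uvec + n, &p) == -1)
			return (EFAULT);
		if (p == 0)
			break;		/* NULL terminator ends the vector */
	}
	*countp = n;
	return (0);
}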
@@ -1175,13 +1222,17 @@
 	 * extract environment strings
 	 */
 	if (envv) {
-		while ((envp = (caddr_t)(intptr_t)fuword(envv++))) {
-			if (envp == (caddr_t)-1) {
+		for (;;) {
+			error = fueword(envv++, &envp);
+			if (error == -1) {
 				error = EFAULT;
 				goto err_exit;
 			}
-			if ((error = copyinstr(envp, args->endp,
-			    args->stringspace, &length))) {
+			if (envp == 0)
+				break;
+			error = copyinstr((void *)(uintptr_t)envp,
+			    args->endp, args->stringspace, &length);
+			if (error != 0) {
 				if (error == ENAMETOOLONG)
 					error = E2BIG;
 				goto err_exit;
@@ -1208,7 +1259,7 @@
 exec_alloc_args(struct image_args *args)
 {
 
-	args->buf = (char *)kmem_alloc_wait(exec_map, PATH_MAX + ARG_MAX);
+	args->buf = (char *)kmap_alloc_wait(exec_map, PATH_MAX + ARG_MAX);
 	return (args->buf != NULL ? 0 : ENOMEM);
 }
 
@@ -1217,7 +1268,7 @@
 {
 
 	if (args->buf != NULL) {
-		kmem_free_wakeup(exec_map, (vm_offset_t)args->buf,
+		kmap_free_wakeup(exec_map, (vm_offset_t)args->buf,
 		    PATH_MAX + ARG_MAX);
 		args->buf = NULL;
 	}
@@ -1238,7 +1289,8 @@
 {
 	int argc, envc;
 	char **vectp;
-	char *stringp, *destp;
+	char *stringp;
+	uintptr_t destp;
 	register_t *stack_base;
 	struct ps_strings *arginfo;
 	struct proc *p;
@@ -1262,26 +1314,24 @@
 		if (p->p_sysent->sv_szsigcode != NULL)
 			szsigcode = *(p->p_sysent->sv_szsigcode);
 	}
-	destp =	(caddr_t)arginfo - szsigcode - SPARE_USRSPACE -
-	    roundup(execpath_len, sizeof(char *)) -
-	    roundup(sizeof(canary), sizeof(char *)) -
-	    roundup(szps, sizeof(char *)) -
-	    roundup((ARG_MAX - imgp->args->stringspace), sizeof(char *));
+	destp =	(uintptr_t)arginfo;
 
 	/*
 	 * install sigcode
 	 */
-	if (szsigcode != 0)
-		copyout(p->p_sysent->sv_sigcode, ((caddr_t)arginfo -
-		    szsigcode), szsigcode);
+	if (szsigcode != 0) {
+		destp -= szsigcode;
+		destp = rounddown2(destp, sizeof(void *));
+		copyout(p->p_sysent->sv_sigcode, (void *)destp, szsigcode);
+	}
 
 	/*
 	 * Copy the image path for the rtld.
 	 */
 	if (execpath_len != 0) {
-		imgp->execpathp = (uintptr_t)arginfo - szsigcode - execpath_len;
-		copyout(imgp->execpath, (void *)imgp->execpathp,
-		    execpath_len);
+		destp -= execpath_len;
+		imgp->execpathp = destp;
+		copyout(imgp->execpath, (void *)destp, execpath_len);
 	}
 
 	/*
@@ -1288,19 +1338,23 @@
 	 * Prepare the canary for SSP.
 	 */
 	arc4rand(canary, sizeof(canary), 0);
-	imgp->canary = (uintptr_t)arginfo - szsigcode - execpath_len -
-	    sizeof(canary);
-	copyout(canary, (void *)imgp->canary, sizeof(canary));
+	destp -= sizeof(canary);
+	imgp->canary = destp;
+	copyout(canary, (void *)destp, sizeof(canary));
 	imgp->canarylen = sizeof(canary);
 
 	/*
 	 * Prepare the pagesizes array.
 	 */
-	imgp->pagesizes = (uintptr_t)arginfo - szsigcode - execpath_len -
-	    roundup(sizeof(canary), sizeof(char *)) - szps;
-	copyout(pagesizes, (void *)imgp->pagesizes, szps);
+	destp -= szps;
+	destp = rounddown2(destp, sizeof(void *));
+	imgp->pagesizes = destp;
+	copyout(pagesizes, (void *)destp, szps);
 	imgp->pagesizeslen = szps;
 
+	destp -= ARG_MAX - imgp->args->stringspace;
+	destp = rounddown2(destp, sizeof(void *));
+
 	/*
 	 * If we have a valid auxargs ptr, prepare some room
 	 * on the stack.
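
Illustrative note: the rewritten layout code carves each region out of
the stack image top-down, subtracting its length from destp and
re-aligning with rounddown2() wherever pointers will be stored.  The
discipline as a sketch; memcpy() stands in for copyout().

#include <stdint.h>
#include <string.h>

#define	rounddown2(x, y) ((x) & ~((uintptr_t)(y) - 1))	/* y: power of 2 */

/* Carve one region below destp, pointer-aligning it when requested. */
static uintptr_t
carve(uintptr_t destp, const void *src, size_t len, int align)
{
	destp -= len;
	if (align)
		destp = rounddown2(destp, sizeof(void *));
	memcpy((void *)destp, src, len);	/* kernel: copyout() */
	return (destp);
}

Each region (sigcode, execpath, canary, pagesizes, the argument strings)
is laid down this way, one below the other, before the argv/envv pointer
vectors are placed.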
@@ -1325,8 +1379,8 @@
 		 * The '+ 2' is for the null pointers at the end of each of
 		 * the arg and env vector sets
 		 */
-		vectp = (char **)(destp - (imgp->args->argc + imgp->args->envc + 2) *
-		    sizeof(char *));
+		vectp = (char **)(destp - (imgp->args->argc + imgp->args->envc
+		    + 2) * sizeof(char *));
 	}
 
 	/*
@@ -1341,7 +1395,7 @@
 	/*
 	 * Copy out strings - arguments and environment.
 	 */
-	copyout(stringp, destp, ARG_MAX - imgp->args->stringspace);
+	copyout(stringp, (void *)destp, ARG_MAX - imgp->args->stringspace);
 
 	/*
 	 * Fill in "ps_strings" struct for ps, w, etc.
@@ -1468,8 +1522,6 @@
 		for (es = execsw; *es; es++)
 			count++;
 	newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK);
-	if (newexecsw == NULL)
-		return (ENOMEM);
 	xs = newexecsw;
 	if (execsw)
 		for (es = execsw; *es; es++)
@@ -1502,8 +1554,6 @@
 		if (*es != execsw_arg)
 			count++;
 	newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK);
-	if (newexecsw == NULL)
-		return (ENOMEM);
 	xs = newexecsw;
 	for (es = execsw; *es; es++)
 		if (*es != execsw_arg)

Modified: trunk/sys/kern/kern_exit.c
===================================================================
--- trunk/sys/kern/kern_exit.c	2018-05-25 20:39:59 UTC (rev 9943)
+++ trunk/sys/kern/kern_exit.c	2018-05-25 20:46:51 UTC (rev 9944)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1982, 1986, 1989, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
@@ -35,7 +36,7 @@
  */
 
 #include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/kern_exit.c 310585 2016-12-26 10:16:05Z kib $");
 
 #include "opt_compat.h"
 #include "opt_kdtrace.h"
@@ -45,7 +46,7 @@
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/sysproto.h>
-#include <sys/capability.h>
+#include <sys/capsicum.h>
 #include <sys/eventhandler.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
@@ -73,6 +74,7 @@
 #include <sys/sdt.h>
 #include <sys/shm.h>
 #include <sys/sem.h>
+#include <sys/umtx.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
@@ -94,22 +96,78 @@
 #endif
 
 SDT_PROVIDER_DECLARE(proc);
-SDT_PROBE_DEFINE(proc, kernel, , exit, exit);
-SDT_PROBE_ARGTYPE(proc, kernel, , exit, 0, "int");
+SDT_PROBE_DEFINE1(proc, , , exit, "int");
 
 /* Hook for NFS teardown procedure. */
 void (*nlminfo_release_p)(struct proc *p);
 
+struct proc *
+proc_realparent(struct proc *child)
+{
+	struct proc *p, *parent;
+
+	sx_assert(&proctree_lock, SX_LOCKED);
+	if ((child->p_treeflag & P_TREE_ORPHANED) == 0) {
+		if (child->p_oppid == 0 ||
+		    child->p_pptr->p_pid == child->p_oppid)
+			parent = child->p_pptr;
+		else
+			parent = initproc;
+		return (parent);
+	}
+	for (p = child; (p->p_treeflag & P_TREE_FIRST_ORPHAN) == 0;) {
+		/* Cannot use LIST_PREV(), since the list head is not known. */
+		p = __containerof(p->p_orphan.le_prev, struct proc,
+		    p_orphan.le_next);
+		KASSERT((p->p_treeflag & P_TREE_ORPHANED) != 0,
+		    ("missing P_ORPHAN %p", p));
+	}
+	parent = __containerof(p->p_orphan.le_prev, struct proc,
+	    p_orphans.lh_first);
+	return (parent);
+}
+
+void
+reaper_abandon_children(struct proc *p, bool exiting)
+{
+	struct proc *p1, *p2, *ptmp;
+
+	sx_assert(&proctree_lock, SX_LOCKED);
+	KASSERT(p != initproc, ("reaper_abandon_children for initproc"));
+	if ((p->p_treeflag & P_TREE_REAPER) == 0)
+		return;
+	p1 = p->p_reaper;
+	LIST_FOREACH_SAFE(p2, &p->p_reaplist, p_reapsibling, ptmp) {
+		LIST_REMOVE(p2, p_reapsibling);
+		p2->p_reaper = p1;
+		p2->p_reapsubtree = p->p_reapsubtree;
+		LIST_INSERT_HEAD(&p1->p_reaplist, p2, p_reapsibling);
+		if (exiting && p2->p_pptr == p) {
+			PROC_LOCK(p2);
+			proc_reparent(p2, p1);
+			PROC_UNLOCK(p2);
+		}
+	}
+	KASSERT(LIST_EMPTY(&p->p_reaplist), ("p_reaplist not empty"));
+	p->p_treeflag &= ~P_TREE_REAPER;
+}
+
 static void
 clear_orphan(struct proc *p)
 {
+	struct proc *p1;
 
-	PROC_LOCK_ASSERT(p, MA_OWNED);
-
-	if (p->p_flag & P_ORPHAN) {
-		LIST_REMOVE(p, p_orphan);
-		p->p_flag &= ~P_ORPHAN;
+	sx_assert(&proctree_lock, SA_XLOCKED);
+	if ((p->p_treeflag & P_TREE_ORPHANED) == 0)
+		return;
+	if ((p->p_treeflag & P_TREE_FIRST_ORPHAN) != 0) {
+		p1 = LIST_NEXT(p, p_orphan);
+		if (p1 != NULL)
+			p1->p_treeflag |= P_TREE_FIRST_ORPHAN;
+		p->p_treeflag &= ~P_TREE_FIRST_ORPHAN;
 	}
+	LIST_REMOVE(p, p_orphan);
+	p->p_treeflag &= ~P_TREE_ORPHANED;
 }
 
 /*
@@ -131,11 +189,9 @@
 void
 exit1(struct thread *td, int rv)
 {
-	struct proc *p, *nq, *q;
-	struct vnode *vtmp;
+	struct proc *p, *nq, *q, *t;
+	struct thread *tdt;
 	struct vnode *ttyvp = NULL;
-	struct plimit *plim;
-	int locked;
 
 	mtx_assert(&Giant, MA_NOTOWNED);
 
@@ -152,17 +208,24 @@
 	}
 
 	/*
+	 * Deref SU mp, since the thread does not return to userspace.
+	 */
+	if (softdep_ast_cleanup != NULL)
+		softdep_ast_cleanup();
+
+	/*
 	 * MUST abort all other threads before proceeding past here.
 	 */
 	PROC_LOCK(p);
+	/*
+	 * First check if some other thread or external request got
+	 * here before us.  If so, act appropriately: exit or suspend.
+	 * We must ensure that stop requests are handled before we set
+	 * P_WEXIT.
+	 */
+	thread_suspend_check(0);
 	while (p->p_flag & P_HADTHREADS) {
 		/*
-		 * First check if some other thread got here before us..
-		 * if so, act apropriatly, (exit or suspend);
-		 */
-		thread_suspend_check(0);
-
-		/*
 		 * Kill off the other threads. This requires
 		 * some co-operation from other parts of the kernel
 		 * so it may not be instantaneous.  With this state set
@@ -179,13 +242,19 @@
 		 * re-check all suspension request, the thread should
 		 * either be suspended there or exit.
 		 */
-		if (! thread_single(SINGLE_EXIT))
+		if (!thread_single(p, SINGLE_EXIT))
+			/*
+			 * All other activity in this process is now
+			 * stopped.  Threading support has been turned
+			 * off.
+			 */
 			break;
-
 		/*
-		 * All other activity in this process is now stopped.
-		 * Threading support has been turned off.
+		 * Recheck for new stop or suspend requests which
+		 * might appear while process lock was dropped in
+		 * thread_single().
 		 */
+		thread_suspend_check(0);
 	}
 	KASSERT(p->p_numthreads == 1,
 	    ("exit1: proc %p exiting with %d threads", p, p->p_numthreads));
@@ -268,6 +337,7 @@
 	rv = p->p_xstat;	/* Event handler could change exit status */
 	stopprofclock(p);
 	p->p_flag &= ~(P_TRACED | P_PPWAIT | P_PPTRACE);
+	p->p_ptevents = 0;
 
 	/*
 	 * Stop the real interval timer.  If the handler is currently
@@ -298,7 +368,7 @@
 	 * Close open files and release open-file table.
 	 * This may block!
 	 */
-	fdfree(td);
+	fdescfree(td);
 
 	/*
 	 * If this thread tickled GEOM, we need to wait for the giggling to
@@ -381,21 +451,16 @@
 	/*
 	 * Release reference to text vnode
 	 */
-	if ((vtmp = p->p_textvp) != NULL) {
+	if (p->p_textvp != NULL) {
+		vrele(p->p_textvp);
 		p->p_textvp = NULL;
-		locked = VFS_LOCK_GIANT(vtmp->v_mount);
-		vrele(vtmp);
-		VFS_UNLOCK_GIANT(locked);
 	}
 
 	/*
 	 * Release our limits structure.
 	 */
-	PROC_LOCK(p);
-	plim = p->p_limit;
+	lim_free(p->p_limit);
 	p->p_limit = NULL;
-	PROC_UNLOCK(p);
-	lim_free(plim);
 
 	tidhash_remove(td);
 
@@ -420,25 +485,40 @@
 	WITNESS_WARN(WARN_PANIC, NULL, "process (pid %d) exiting", p->p_pid);
 
 	/*
-	 * Reparent all of our children to init.
+	 * Reparent all children processes:
+	 * - traced ones to the original parent (or init if we are that parent)
+	 * - the rest to init
 	 */
 	sx_xlock(&proctree_lock);
 	q = LIST_FIRST(&p->p_children);
 	if (q != NULL)		/* only need this if any child is S_ZOMB */
-		wakeup(initproc);
+		wakeup(q->p_reaper);
 	for (; q != NULL; q = nq) {
 		nq = LIST_NEXT(q, p_sibling);
 		PROC_LOCK(q);
-		proc_reparent(q, initproc);
 		q->p_sigparent = SIGCHLD;
-		/*
-		 * Traced processes are killed
-		 * since their existence means someone is screwing up.
-		 */
-		if (q->p_flag & P_TRACED) {
-			struct thread *temp;
 
+		if (!(q->p_flag & P_TRACED)) {
+			proc_reparent(q, q->p_reaper);
+			if (q->p_state == PRS_ZOMBIE) {
+				PROC_LOCK(q->p_reaper);
+				pksignal(q->p_reaper, SIGCHLD, q->p_ksi);
+				PROC_UNLOCK(q->p_reaper);
+			}
+		} else {
 			/*
+			 * Traced processes are killed since their existence
+			 * means someone is screwing up.
+			 */
+			t = proc_realparent(q);
+			if (t == p) {
+				proc_reparent(q, q->p_reaper);
+			} else {
+				PROC_LOCK(t);
+				proc_reparent(q, t);
+				PROC_UNLOCK(t);
+			}
+			/*
 			 * Since q was found on our children list, the
 			 * proc_reparent() call moved q to the orphan
 			 * list due to present P_TRACED flag. Clear
@@ -446,8 +526,12 @@
 			 */
 			clear_orphan(q);
 			q->p_flag &= ~(P_TRACED | P_STOPPED_TRACE);
-			FOREACH_THREAD_IN_PROC(q, temp)
-				temp->td_dbgflags &= ~TDB_SUSPEND;
+			q->p_flag2 &= ~P2_PTRACE_FSTP;
+			q->p_ptevents = 0;
+			FOREACH_THREAD_IN_PROC(q, tdt) {
+				tdt->td_dbgflags &= ~(TDB_SUSPEND | TDB_XSIG |
+				    TDB_FSTP);
+			}
 			kern_psignal(q, SIGKILL);
 		}
 		PROC_UNLOCK(q);
@@ -458,6 +542,8 @@
 	 */
 	while ((q = LIST_FIRST(&p->p_orphans)) != NULL) {
 		PROC_LOCK(q);
+		CTR2(KTR_PTRACE, "exit: pid %d, clearing orphan %d", p->p_pid,
+		    q->p_pid);
 		clear_orphan(q);
 		PROC_UNLOCK(q);
 	}
@@ -489,7 +575,7 @@
 		reason = CLD_DUMPED;
 	else if (WIFSIGNALED(rv))
 		reason = CLD_KILLED;
-	SDT_PROBE(proc, kernel, , exit, reason, 0, 0, 0, 0);
+	SDT_PROBE1(proc, , , exit, reason);
 #endif
 
 	/*
@@ -523,7 +609,7 @@
 			mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx);
 			pp = p->p_pptr;
 			PROC_UNLOCK(pp);
-			proc_reparent(p, initproc);
+			proc_reparent(p, p->p_reaper);
 			p->p_sigparent = SIGCHLD;
 			PROC_LOCK(p->p_pptr);
 
@@ -536,8 +622,8 @@
 		} else
 			mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx);
 
-		if (p->p_pptr == initproc)
-			kern_psignal(p->p_pptr, SIGCHLD);
+		if (p->p_pptr == p->p_reaper || p->p_pptr == initproc)
+			childproc_exited(p);
 		else if (p->p_sigparent != 0) {
 			if (p->p_sigparent == SIGCHLD)
 				childproc_exited(p);
@@ -571,6 +657,7 @@
 	wakeup(p->p_pptr);
 	cv_broadcast(&p->p_pwait);
 	sched_exit(p->p_pptr, td);
+	umtx_thread_exit(td);
 	PROC_SLOCK(p);
 	p->p_state = PRS_ZOMBIE;
 	PROC_UNLOCK(p->p_pptr);
@@ -584,7 +671,9 @@
 	/*
 	 * Save our children's rusage information in our exit rusage.
 	 */
+	PROC_STATLOCK(p);
 	ruadd(&p->p_ru, &p->p_rux, &p->p_stats->p_cru, &p->p_crux);
+	PROC_STATUNLOCK(p);
 
 	/*
 	 * Make sure the scheduler takes this thread out of its tables etc.
@@ -702,9 +791,9 @@
 	else
 		rup = NULL;
 	error = kern_wait(td, uap->pid, &status, uap->options, rup);
-	if (uap->status != NULL && error == 0)
+	if (uap->status != NULL && error == 0 && td->td_retval[0] != 0)
 		error = copyout(&status, uap->status, sizeof(status));
-	if (uap->rusage != NULL && error == 0)
+	if (uap->rusage != NULL && error == 0 && td->td_retval[0] != 0)
 		error = copyout(&ru, uap->rusage, sizeof(struct rusage));
 	return (error);
 }
@@ -713,10 +802,10 @@
 sys_wait6(struct thread *td, struct wait6_args *uap)
 {
 	struct __wrusage wru, *wrup;
-	siginfo_t  si, *sip;
-	int error, status;
+	siginfo_t si, *sip;
 	idtype_t idtype;
 	id_t id;
+	int error, status;
 
 	idtype = uap->idtype;
 	id = uap->id;
@@ -738,9 +827,9 @@
 	 */
 	error = kern_wait6(td, idtype, id, &status, uap->options, wrup, sip);
 
-	if (uap->status != NULL && error == 0)
+	if (uap->status != NULL && error == 0 && td->td_retval[0] != 0)
 		error = copyout(&status, uap->status, sizeof(status));
-	if (uap->wrusage != NULL && error == 0)
+	if (uap->wrusage != NULL && error == 0 && td->td_retval[0] != 0)
 		error = copyout(&wru, uap->wrusage, sizeof(wru));
 	if (uap->info != NULL && error == 0)
 		error = copyout(&si, uap->info, sizeof(si));
@@ -781,14 +870,19 @@
 	PROC_LOCK(q);
 	sigqueue_take(p->p_ksi);
 	PROC_UNLOCK(q);
-	PROC_UNLOCK(p);
 
 	/*
 	 * If we got the child via a ptrace 'attach', we need to give it back
 	 * to the old parent.
 	 */
-	if (p->p_oppid && (t = pfind(p->p_oppid)) != NULL) {
+	if (p->p_oppid != 0 && p->p_oppid != p->p_pptr->p_pid) {
+		PROC_UNLOCK(p);
+		t = proc_realparent(p);
+		PROC_LOCK(t);
 		PROC_LOCK(p);
+		CTR2(KTR_PTRACE,
+		    "wait: traced child %d moved back to parent %d", p->p_pid,
+		    t->p_pid);
 		proc_reparent(p, t);
 		p->p_oppid = 0;
 		PROC_UNLOCK(p);
@@ -799,6 +893,8 @@
 		sx_xunlock(&proctree_lock);
 		return;
 	}
+	p->p_oppid = 0;
+	PROC_UNLOCK(p);
 
 	/*
 	 * Remove other references to this process to ensure we have an
@@ -808,6 +904,8 @@
 	LIST_REMOVE(p, p_list);	/* off zombproc */
 	sx_xunlock(&allproc_lock);
 	LIST_REMOVE(p, p_sibling);
+	reaper_abandon_children(p, true);
+	LIST_REMOVE(p, p_reapsibling);
 	PROC_LOCK(p);
 	clear_orphan(p);
 	PROC_UNLOCK(p);
@@ -838,9 +936,11 @@
 	 * Destroy resource accounting information associated with the process.
 	 */
 #ifdef RACCT
-	PROC_LOCK(p);
-	racct_sub(p, RACCT_NPROC, 1);
-	PROC_UNLOCK(p);
+	if (racct_enable) {
+		PROC_LOCK(p);
+		racct_sub(p, RACCT_NPROC, 1);
+		PROC_UNLOCK(p);
+	}
 #endif
 	racct_proc_exit(p);
 
@@ -848,7 +948,7 @@
 	 * Free credentials, arguments, and sigacts.
 	 */
 	crfree(p->p_ucred);
-	p->p_ucred = NULL;
+	proc_set_cred(p, NULL);
 	pargs_drop(p->p_args);
 	p->p_args = NULL;
 	sigacts_free(p->p_sigacts);
@@ -870,21 +970,18 @@
 	KASSERT(FIRST_THREAD_IN_PROC(p),
 	    ("proc_reap: no residual thread!"));
 	uma_zfree(proc_zone, p);
-	sx_xlock(&allproc_lock);
-	nprocs--;
-	sx_xunlock(&allproc_lock);
+	atomic_add_int(&nprocs, -1);
 }
 
 static int
 proc_to_reap(struct thread *td, struct proc *p, idtype_t idtype, id_t id,
-    int *status, int options, struct __wrusage *wrusage, siginfo_t *siginfo)
+    int *status, int options, struct __wrusage *wrusage, siginfo_t *siginfo,
+    int check_only)
 {
-	struct proc *q;
 	struct rusage *rup;
 
 	sx_assert(&proctree_lock, SA_XLOCKED);
 
-	q = td->td_proc;
 	PROC_LOCK(p);
 
 	switch (idtype) {
@@ -921,8 +1018,7 @@
 		}
 		break;
 	case P_JAILID:
-		if (p->p_ucred->cr_prison == NULL ||
-		    (p->p_ucred->cr_prison->pr_id != (int)id)) {
+		if (p->p_ucred->cr_prison->pr_id != (int)id) {
 			PROC_UNLOCK(p);
 			return (0);
 		}
@@ -935,7 +1031,6 @@
 	default:
 		PROC_UNLOCK(p);
 		return (0);
-		break;
 	}
 
 	if (p_canwait(td, p)) {
@@ -962,10 +1057,8 @@
 		return (0);
 	}
 
-	PROC_SLOCK(p);
-
 	if (siginfo != NULL) {
-		bzero (siginfo, sizeof (*siginfo));
+		bzero(siginfo, sizeof(*siginfo));
 		siginfo->si_errno = 0;
 
 		/*
@@ -980,16 +1073,19 @@
 		 *  This is still a rough estimate.  We will fix the
 		 *  cases TRAPPED, STOPPED, and CONTINUED later.
 		 */
-		if (WCOREDUMP(p->p_xstat))
+		if (WCOREDUMP(p->p_xstat)) {
 			siginfo->si_code = CLD_DUMPED;
-		else if (WIFSIGNALED(p->p_xstat))
+			siginfo->si_status = WTERMSIG(p->p_xstat);
+		} else if (WIFSIGNALED(p->p_xstat)) {
 			siginfo->si_code = CLD_KILLED;
-		else
+			siginfo->si_status = WTERMSIG(p->p_xstat);
+		} else {
 			siginfo->si_code = CLD_EXITED;
+			siginfo->si_status = WEXITSTATUS(p->p_xstat);
+		}
 
 		siginfo->si_pid = p->p_pid;
 		siginfo->si_uid = p->p_ucred->cr_uid;
-		siginfo->si_status = p->p_xstat;
 
 		/*
 		 * The si_addr field would be useful additional
@@ -1007,7 +1103,9 @@
 	if (wrusage != NULL) {
 		rup = &wrusage->wru_self;
 		*rup = p->p_ru;
+		PROC_STATLOCK(p);
 		calcru(p, &rup->ru_utime, &rup->ru_stime);
+		PROC_STATUNLOCK(p);
 
 		rup = &wrusage->wru_children;
 		*rup = p->p_stats->p_cru;
@@ -1014,11 +1112,11 @@
 		calccru(p, &rup->ru_utime, &rup->ru_stime);
 	}
 
-	if (p->p_state == PRS_ZOMBIE) {
+	if (p->p_state == PRS_ZOMBIE && !check_only) {
+		PROC_SLOCK(p);
 		proc_reap(td, p, status, options);
 		return (-1);
 	}
-	PROC_SUNLOCK(p);
 	PROC_UNLOCK(p);
 	return (1);
 }
@@ -1054,8 +1152,8 @@
 		wrup = NULL;
 
 	/*
-	 *  For backward compatibility we implicitly add flags WEXITED
-	 *  and WTRAPPED here.
+	 * For backward compatibility we implicitly add flags WEXITED
+	 * and WTRAPPED here.
 	 */
 	options |= WEXITED | WTRAPPED;
 	ret = kern_wait6(td, idtype, id, status, options, wrup, NULL);
@@ -1108,7 +1206,7 @@
 	sx_xlock(&proctree_lock);
 	LIST_FOREACH(p, &q->p_children, p_sibling) {
 		ret = proc_to_reap(td, p, idtype, id, status, options,
-		    wrusage, siginfo);
+		    wrusage, siginfo, 0);
 		if (ret == 0)
 			continue;
 		else if (ret == 1)
@@ -1142,6 +1240,10 @@
 				PROC_UNLOCK(q);
 			}
 
+			CTR4(KTR_PTRACE,
+	    "wait: returning trapped pid %d status %#x (xstat %d) xthread %d",
+			    p->p_pid, W_STOPCODE(p->p_xstat), p->p_xstat,
+			    p->p_xthread != NULL ? p->p_xthread->td_tid : -1);
 			PROC_UNLOCK(p);
 			return (0);
 		}
@@ -1206,15 +1308,17 @@
 	 * for.  By maintaining a list of orphans we allow the parent
 	 * to successfully wait until the child becomes a zombie.
 	 */
-	LIST_FOREACH(p, &q->p_orphans, p_orphan) {
-		ret = proc_to_reap(td, p, idtype, id, status, options,
-		    wrusage, siginfo);
-		if (ret == 0)
-			continue;
-		else if (ret == 1)
-			nfound++;
-		else
-			return (0);
+	if (nfound == 0) {
+		LIST_FOREACH(p, &q->p_orphans, p_orphan) {
+			ret = proc_to_reap(td, p, idtype, id, NULL, options,
+			    NULL, NULL, 1);
+			if (ret != 0) {
+				KASSERT(ret != -1, ("reaped an orphan (pid %d)",
+				    (int)td->td_retval[0]));
+				nfound++;
+				break;
+			}
+		}
 	}
 	if (nfound == 0) {
 		sx_xunlock(&proctree_lock);
@@ -1259,8 +1363,15 @@
 
 	clear_orphan(child);
 	if (child->p_flag & P_TRACED) {
-		LIST_INSERT_HEAD(&child->p_pptr->p_orphans, child, p_orphan);
-		child->p_flag |= P_ORPHAN;
+		if (LIST_EMPTY(&child->p_pptr->p_orphans)) {
+			child->p_treeflag |= P_TREE_FIRST_ORPHAN;
+			LIST_INSERT_HEAD(&child->p_pptr->p_orphans, child,
+			    p_orphan);
+		} else {
+			LIST_INSERT_AFTER(LIST_FIRST(&child->p_pptr->p_orphans),
+			    child, p_orphan);
+		}
+		child->p_treeflag |= P_TREE_ORPHANED;
 	}
 
 	child->p_pptr = parent;

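The new proc_realparent() above walks the orphan list backwards through
le_prev pointers: each element's le_prev points at the previous element's
le_next field (or at the head's lh_first for the first element), so
__containerof() can recover the enclosing struct without ever knowing the
list head. A self-contained userland sketch of the same trick, assuming a
local containerof macro in place of the kernel's __containerof():

#include <sys/queue.h>
#include <stddef.h>
#include <stdio.h>

/* Local stand-in for the kernel's __containerof() from <sys/cdefs.h>. */
#define	containerof(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct node {
	int		 id;
	LIST_ENTRY(node) link;
};

LIST_HEAD(nodehead, node);

int
main(void)
{
	struct nodehead head = LIST_HEAD_INITIALIZER(head);
	struct node a = { .id = 1 }, b = { .id = 2 }, *p;

	LIST_INSERT_HEAD(&head, &a, link);
	LIST_INSERT_AFTER(&a, &b, link);

	/*
	 * Step backwards from 'b': b.link.le_prev points at the
	 * previous element's le_next field, so containerof() recovers
	 * that element without touching the list head.
	 */
	p = containerof(b.link.le_prev, struct node, link.le_next);
	printf("previous of %d is %d\n", b.id, p->id);	/* prints 1 */
	return (0);
}

This is also why proc_reparent() now sets P_TREE_FIRST_ORPHAN on the head
element: the first orphan's le_prev points into the parent's p_orphans head
rather than at another proc, so the backwards walk needs a marker telling it
where to stop and switch to the lh_first containerof.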
Modified: trunk/sys/kern/kern_fail.c
===================================================================
--- trunk/sys/kern/kern_fail.c	2018-05-25 20:39:59 UTC (rev 9943)
+++ trunk/sys/kern/kern_fail.c	2018-05-25 20:46:51 UTC (rev 9944)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 2009 Isilon Inc http://www.isilon.com/
  *
@@ -50,7 +51,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/kern_fail.c 227293 2011-11-07 06:44:47Z ed $");
 
 #include <sys/ctype.h>
 #include <sys/errno.h>

Modified: trunk/sys/kern/kern_fork.c
===================================================================
--- trunk/sys/kern/kern_fork.c	2018-05-25 20:39:59 UTC (rev 9943)
+++ trunk/sys/kern/kern_fork.c	2018-05-25 20:46:51 UTC (rev 9944)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1982, 1986, 1989, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
@@ -35,7 +36,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/kern_fork.c 321020 2017-07-15 17:25:40Z dchagin $");
 
 #include "opt_kdtrace.h"
 #include "opt_ktrace.h"
@@ -59,6 +60,7 @@
 #include <sys/proc.h>
 #include <sys/procdesc.h>
 #include <sys/pioctl.h>
+#include <sys/ptrace.h>
 #include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
@@ -89,10 +91,7 @@
 #endif
 
 SDT_PROVIDER_DECLARE(proc);
-SDT_PROBE_DEFINE(proc, kernel, , create, create);
-SDT_PROBE_ARGTYPE(proc, kernel, , create, 0, "struct proc *");
-SDT_PROBE_ARGTYPE(proc, kernel, , create, 1, "struct proc *");
-SDT_PROBE_ARGTYPE(proc, kernel, , create, 2, "int");
+SDT_PROBE_DEFINE3(proc, , , create, "struct proc *", "struct proc *", "int");
 
 #ifndef _SYS_SYSPROTO_H_
 struct fork_args {
@@ -269,11 +268,21 @@
 		 * Scan the active and zombie procs to check whether this pid
 		 * is in use.  Remember the lowest pid that's greater
 		 * than trypid, so we can avoid checking for a while.
+		 *
+		 * Avoid reuse of the process group id, session id or
+		 * the reaper subtree id.  Note that for process group
+		 * and sessions, the amount of reserved pids is
+		 * limited by process limit.  For the subtree ids, the
+		 * id is kept reserved only while there is a
+		 * non-reaped process in the subtree, so amount of
+		 * reserved pids is limited by process limit times
+		 * two.
 		 */
 		p = LIST_FIRST(&allproc);
 again:
 		for (; p != NULL; p = LIST_NEXT(p, p_list)) {
 			while (p->p_pid == trypid ||
+			    p->p_reapsubtree == trypid ||
 			    (p->p_pgrp != NULL &&
 			    (p->p_pgrp->pg_id == trypid ||
 			    (p->p_session != NULL &&
@@ -325,7 +334,7 @@
 	if (((p1->p_flag & (P_HADTHREADS|P_SYSTEM)) == P_HADTHREADS) &&
 	    (flags & (RFCFDG | RFFDG))) {
 		PROC_LOCK(p1);
-		if (thread_single(SINGLE_BOUNDARY)) {
+		if (thread_single(p1, SINGLE_BOUNDARY)) {
 			PROC_UNLOCK(p1);
 			return (ERESTART);
 		}
@@ -342,7 +351,7 @@
 	if (flags & RFCFDG) {
 		struct filedesc *fdtmp;
 		fdtmp = fdinit(td->td_proc->p_fd);
-		fdfree(td);
+		fdescfree(td);
 		p1->p_fd = fdtmp;
 	}
 
@@ -349,14 +358,14 @@
 	/*
 	 * Unshare file descriptors (from parent).
 	 */
-	if (flags & RFFDG) 
-		fdunshare(p1, td);
+	if (flags & RFFDG)
+		fdunshare(td);
 
 fail:
 	if (((p1->p_flag & (P_HADTHREADS|P_SYSTEM)) == P_HADTHREADS) &&
 	    (flags & (RFCFDG | RFFDG))) {
 		PROC_LOCK(p1);
-		thread_single_end();
+		thread_single_end(p1, SINGLE_BOUNDARY);
 		PROC_UNLOCK(p1);
 	}
 	return (error);
@@ -378,12 +387,6 @@
 	p2_held = 0;
 	p1 = td->td_proc;
 
-	/*
-	 * Increment the nprocs resource before blocking can occur.  There
-	 * are hard-limits as to the number of processes that can run.
-	 */
-	nprocs++;
-
 	trypid = fork_findpid(flags);
 
 	sx_sunlock(&proctree_lock);
@@ -392,6 +395,7 @@
 	p2->p_pid = trypid;
 	AUDIT_ARG_PID(p2->p_pid);
 	LIST_INSERT_HEAD(&allproc, p2, p_list);
+	allproc_gen++;
 	LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash);
 	tidhash_add(td2);
 	PROC_LOCK(p2);
@@ -401,14 +405,17 @@
 
 	bcopy(&p1->p_startcopy, &p2->p_startcopy,
 	    __rangeof(struct proc, p_startcopy, p_endcopy));
+	p2->p_elf_machine = p1->p_elf_machine;
+	p2->p_elf_flags = p1->p_elf_flags;
 	pargs_hold(p2->p_args);
 	PROC_UNLOCK(p1);
 
 	bzero(&p2->p_startzero,
 	    __rangeof(struct proc, p_startzero, p_endzero));
+	p2->p_treeflag = 0;
+	p2->p_filemon = NULL;
+	p2->p_ptevents = 0;
 
-	p2->p_ucred = crhold(td->td_ucred);
-
 	/* Tell the prison that we exist. */
 	prison_proc_hold(p2->p_ucred->cr_prison);
 
@@ -465,6 +472,8 @@
 
 	bzero(&td2->td_startzero,
 	    __rangeof(struct thread, td_startzero, td_endzero));
+	td2->td_su = NULL;
+	td2->td_sleeptimo = 0;
 
 	bcopy(&td->td_startcopy, &td2->td_startcopy,
 	    __rangeof(struct thread, td_startcopy, td_endcopy));
@@ -471,9 +480,10 @@
 
 	bcopy(&p2->p_comm, &td2->td_name, sizeof(td2->td_name));
 	td2->td_sigstk = td->td_sigstk;
-	td2->td_sigmask = td->td_sigmask;
 	td2->td_flags = TDF_INMEM;
 	td2->td_lend_user_pri = PRI_MAX;
+	td2->td_dbg_sc_code = td->td_dbg_sc_code;
+	td2->td_dbg_sc_narg = td->td_dbg_sc_narg;
 
 #ifdef VIMAGE
 	td2->td_vnet = NULL;
@@ -492,6 +502,7 @@
 	 * Increase reference counts on shared objects.
 	 */
 	p2->p_flag = P_INMEM;
+	p2->p_flag2 = p1->p_flag2 & (P2_NOTRACE | P2_NOTRACE_EXEC);
 	p2->p_swtick = ticks;
 	if (p1->p_flag & P_PROFIL)
 		startprofclock(p2);
@@ -515,6 +526,11 @@
 	p2->p_fd = fd;
 	p2->p_fdtol = fdtol;
 
+	if (p1->p_flag2 & P2_INHERIT_PROTECTED) {
+		p2->p_flag |= P_PROTECTED;
+		p2->p_flag2 |= P2_INHERIT_PROTECTED;
+	}
+
 	/*
 	 * p_limit is copy-on-write.  Bump its refcount.
 	 */
@@ -613,12 +629,22 @@
 	 * of init.  This effectively disassociates the child from the
 	 * parent.
 	 */
-	if (flags & RFNOWAIT)
-		pptr = initproc;
-	else
+	if ((flags & RFNOWAIT) != 0) {
+		pptr = p1->p_reaper;
+		p2->p_reaper = pptr;
+	} else {
+		p2->p_reaper = (p1->p_treeflag & P_TREE_REAPER) != 0 ?
+		    p1 : p1->p_reaper;
 		pptr = p1;
+	}
 	p2->p_pptr = pptr;
 	LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling);
+	LIST_INIT(&p2->p_reaplist);
+	LIST_INSERT_HEAD(&p2->p_reaper->p_reaplist, p2, p_reapsibling);
+	if (p2->p_reaper == p1)
+		p2->p_reapsubtree = p2->p_pid;
+	else
+	p2->p_reapsubtree = p1->p_reapsubtree;
 	sx_xunlock(&proctree_lock);
 
 	/* Inform accounting that we have forked. */
@@ -681,16 +707,15 @@
 
 #ifdef KDTRACE_HOOKS
 	/*
-	 * Tell the DTrace fasttrap provider about the new process
-	 * if it has registered an interest. We have to do this only after
-	 * p_state is PRS_NORMAL since the fasttrap module will use pfind()
-	 * later on.
+	 * Tell the DTrace fasttrap provider about the new process so that any
+	 * tracepoints inherited from the parent can be removed. We have to do
+	 * this only after p_state is PRS_NORMAL since the fasttrap module will
+	 * use pfind() later on.
 	 */
-	if (dtrace_fasttrap_fork)
+	if ((flags & RFMEM) == 0 && dtrace_fasttrap_fork)
 		dtrace_fasttrap_fork(p1, p2);
 #endif
-	if ((p1->p_flag & (P_TRACED | P_FOLLOWFORK)) == (P_TRACED |
-	    P_FOLLOWFORK)) {
+	if (p1->p_ptevents & PTRACE_FORK) {
 		/*
 		 * Arrange for debugger to receive the fork event.
 		 *
@@ -707,6 +732,7 @@
 	if (flags & RFPPWAIT) {
 		td->td_pflags |= TDP_RFPPWAIT;
 		td->td_rfppwait_p = p2;
+		td->td_dbgflags |= TDB_VFORK;
 	}
 	PROC_UNLOCK(p2);
 	if ((flags & RFSTOPPED) == 0) {
@@ -730,7 +756,7 @@
 	 * Tell any interested parties about the new process.
 	 */
 	knote_fork(&p1->p_klist, p2->p_pid);
-	SDT_PROBE(proc, kernel, , create, p2, p1, flags, 0, 0);
+	SDT_PROBE3(proc, , , create, p2, p1, flags);
 
 	/*
 	 * Wait until debugger is attached to child.
@@ -747,18 +773,16 @@
 fork1(struct thread *td, int flags, int pages, struct proc **procp,
     int *procdescp, int pdflags)
 {
-	struct proc *p1;
-	struct proc *newproc;
-	int ok;
+	struct proc *p1, *newproc;
 	struct thread *td2;
 	struct vmspace *vm2;
+#ifdef PROCDESC
+	struct file *fp_procdesc;
+#endif
 	vm_ooffset_t mem_charged;
-	int error;
+	int error, nprocs_new, ok;
 	static int curfail;
 	static struct timeval lastfail;
-#ifdef PROCDESC
-	struct file *fp_procdesc = NULL;
-#endif
 
 	/* Check for the undefined or unimplemented flags. */
 	if ((flags & ~(RFFLAGS | RFTSIGFLAGS(RFTSIGMASK))) != 0)
@@ -800,7 +824,38 @@
 	}
 
 #ifdef PROCDESC
+	fp_procdesc = NULL;
+#endif
+	newproc = NULL;
+	vm2 = NULL;
+
 	/*
+	 * Increment the nprocs resource before allocations occur.
+	 * Although process entries are dynamically created, we still
+	 * keep a global limit on the maximum number we will
+	 * create. There are hard-limits as to the number of processes
+	 * that can run, established by the KVA and memory usage for
+	 * the process data.
+	 *
+	 * Don't allow a nonprivileged user to use the last ten
+	 * processes; don't let root exceed the limit.
+	 */
+	nprocs_new = atomic_fetchadd_int(&nprocs, 1) + 1;
+	if ((nprocs_new >= maxproc - 10 && priv_check_cred(td->td_ucred,
+	    PRIV_MAXPROC, 0) != 0) || nprocs_new >= maxproc) {
+		sx_xlock(&allproc_lock);
+		if (ppsratecheck(&lastfail, &curfail, 1)) {
+			printf("maxproc limit exceeded by uid %u (pid %d); "
+			    "see tuning(7) and login.conf(5)\n",
+			    td->td_ucred->cr_ruid, p1->p_pid);
+		}
+		sx_xunlock(&allproc_lock);
+		error = EAGAIN;
+		goto fail1;
+	}
+
+#ifdef PROCDESC
+	/*
 	 * If required, create a process descriptor in the parent first; we
 	 * will abandon it if something goes wrong. We don't finit() until
 	 * later.
@@ -808,12 +863,11 @@
 	if (flags & RFPROCDESC) {
 		error = falloc(td, &fp_procdesc, procdescp, 0);
 		if (error != 0)
-			return (error);
+			goto fail1;
 	}
 #endif
 
 	mem_charged = 0;
-	vm2 = NULL;
 	if (pages == 0)
 		pages = KSTACK_PAGES;
 	/* Allocate new proc. */
@@ -823,7 +877,7 @@
 		td2 = thread_alloc(pages);
 		if (td2 == NULL) {
 			error = ENOMEM;
-			goto fail1;
+			goto fail2;
 		}
 		proc_linkup(newproc, td2);
 	} else {
@@ -832,7 +886,7 @@
 				vm_thread_dispose(td2);
 			if (!thread_alloc_stack(td2, pages)) {
 				error = ENOMEM;
-				goto fail1;
+				goto fail2;
 			}
 		}
 	}
@@ -841,18 +895,18 @@
 		vm2 = vmspace_fork(p1->p_vmspace, &mem_charged);
 		if (vm2 == NULL) {
 			error = ENOMEM;
-			goto fail1;
+			goto fail2;
 		}
 		if (!swap_reserve(mem_charged)) {
 			/*
 			 * The swap reservation failed. The accounting
 			 * from the entries of the copied vm2 will be
-			 * substracted in vmspace_free(), so force the
+			 * subtracted in vmspace_free(), so force the
 			 * reservation there.
 			 */
 			swap_reserve_force(mem_charged);
 			error = ENOMEM;
-			goto fail1;
+			goto fail2;
 		}
 	} else
 		vm2 = NULL;
@@ -861,7 +915,7 @@
 	 * XXX: This is ugly; when we copy resource usage, we need to bump
 	 *      per-cred resource counters.
 	 */
-	newproc->p_ucred = p1->p_ucred;
+	proc_set_cred_init(newproc, crhold(td->td_ucred));
 
 	/*
 	 * Initialize resource accounting for the child process.
@@ -880,20 +934,7 @@
 
 	/* We have to lock the process tree while we look for a pid. */
 	sx_slock(&proctree_lock);
-
-	/*
-	 * Although process entries are dynamically created, we still keep
-	 * a global limit on the maximum number we will create.  Don't allow
-	 * a nonprivileged user to use the last ten processes; don't let root
-	 * exceed the limit. The variable nprocs is the current number of
-	 * processes, maxproc is the limit.
-	 */
 	sx_xlock(&allproc_lock);
-	if ((nprocs >= maxproc - 10 && priv_check_cred(td->td_ucred,
-	    PRIV_MAXPROC, 0) != 0) || nprocs >= maxproc) {
-		error = EAGAIN;
-		goto fail;
-	}
 
 	/*
 	 * Increment the count of procs running with this uid. Don't allow
@@ -918,8 +959,10 @@
 		 */
 		*procp = newproc;
 #ifdef PROCDESC
-		if (flags & RFPROCDESC)
+		if (flags & RFPROCDESC) {
 			procdesc_finit(newproc->p_procdesc, fp_procdesc);
+			fdrop(fp_procdesc, td);
+		}
 #endif
 		racct_proc_fork_done(newproc);
 		return (0);
@@ -926,11 +969,7 @@
 	}
 
 	error = EAGAIN;
-fail:
 	sx_sunlock(&proctree_lock);
-	if (ppsratecheck(&lastfail, &curfail, 1))
-		printf("maxproc limit exceeded by uid %u (pid %d); see tuning(7) and login.conf(5)\n",
-		    td->td_ucred->cr_ruid, p1->p_pid);
 	sx_xunlock(&allproc_lock);
 #ifdef MAC
 	mac_proc_destroy(newproc);
@@ -937,13 +976,19 @@
 #endif
 	racct_proc_exit(newproc);
 fail1:
+	crfree(newproc->p_ucred);
+	newproc->p_ucred = NULL;
+fail2:
 	if (vm2 != NULL)
 		vmspace_free(vm2);
 	uma_zfree(proc_zone, newproc);
 #ifdef PROCDESC
-	if (((flags & RFPROCDESC) != 0) && (fp_procdesc != NULL))
+	if ((flags & RFPROCDESC) != 0 && fp_procdesc != NULL) {
+		fdclose(td, fp_procdesc, *procdescp);
 		fdrop(fp_procdesc, td);
+	}
 #endif
+	atomic_add_int(&nprocs, -1);
 	pause("fork", hz / 2);
 	return (error);
 }
@@ -994,7 +1039,7 @@
 	if (p->p_flag & P_KTHREAD) {
 		printf("Kernel thread \"%s\" (pid %d) exited prematurely.\n",
 		    td->td_name, p->p_pid);
-		kproc_exit(0);
+		kthread_exit();
 	}
 	mtx_assert(&Giant, MA_NOTOWNED);
 
@@ -1004,9 +1049,9 @@
 
 /*
  * Simplified back end of syscall(), used when returning from fork()
- * directly into user mode.  Giant is not held on entry, and must not
- * be held on return.  This function is passed in to fork_exit() as the
- * first parameter and is called when returning to a new userland process.
+ * directly into user mode.  This function is passed in to fork_exit()
+ * as the first parameter and is called when returning to a new
+ * userland process.
  */
 void
 fork_return(struct thread *td, struct trapframe *frame)
@@ -1013,24 +1058,25 @@
 {
 	struct proc *p, *dbg;
 
+	p = td->td_proc;
 	if (td->td_dbgflags & TDB_STOPATFORK) {
-		p = td->td_proc;
 		sx_xlock(&proctree_lock);
 		PROC_LOCK(p);
-		if ((p->p_pptr->p_flag & (P_TRACED | P_FOLLOWFORK)) ==
-		    (P_TRACED | P_FOLLOWFORK)) {
+		if (p->p_pptr->p_ptevents & PTRACE_FORK) {
 			/*
 			 * If debugger still wants auto-attach for the
 			 * parent's children, do it now.
 			 */
 			dbg = p->p_pptr->p_pptr;
-			p->p_flag |= P_TRACED;
-			p->p_oppid = p->p_pptr->p_pid;
+			proc_set_traced(p, true);
+			CTR2(KTR_PTRACE,
+		    "fork_return: attaching to new child pid %d: oppid %d",
+			    p->p_pid, p->p_oppid);
 			proc_reparent(p, dbg);
 			sx_xunlock(&proctree_lock);
-			td->td_dbgflags |= TDB_CHILD;
-			ptracestop(td, SIGSTOP);
-			td->td_dbgflags &= ~TDB_CHILD;
+			td->td_dbgflags |= TDB_CHILD | TDB_SCX | TDB_FSTP;
+			ptracestop(td, SIGSTOP, NULL);
+			td->td_dbgflags &= ~(TDB_CHILD | TDB_SCX);
 		} else {
 			/*
 			 * ... otherwise clear the request.
@@ -1040,6 +1086,19 @@
 			cv_broadcast(&p->p_dbgwait);
 		}
 		PROC_UNLOCK(p);
+	} else if (p->p_flag & P_TRACED || td->td_dbgflags & TDB_BORN) {
+ 		/*
+		 * This is the start of a new thread in a traced
+		 * process.  Report a system call exit event.
+		 */
+		PROC_LOCK(p);
+		td->td_dbgflags |= TDB_SCX;
+		_STOPEVENT(p, S_SCX, td->td_dbg_sc_code);
+		if ((p->p_ptevents & PTRACE_SCX) != 0 ||
+		    (td->td_dbgflags & TDB_BORN) != 0)
+			ptracestop(td, SIGTRAP, NULL);
+		td->td_dbgflags &= ~(TDB_SCX | TDB_BORN);
+		PROC_UNLOCK(p);
 	}
 
 	userret(td, frame);
@@ -1048,5 +1107,4 @@
 	if (KTRPOINT(td, KTR_SYSRET))
 		ktrsysret(SYS_fork, 0, 0);
 #endif
-	mtx_assert(&Giant, MA_NOTOWNED);
 }

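fork1() now reserves its process slot with atomic_fetchadd_int() before any
allocation and releases the reservation on every failure path, instead of
incrementing nprocs under allproc_lock. A userland sketch of that
reserve-check-rollback pattern using C11 atomics; nprocs and maxproc mirror
the kernel globals, and try_fork() is a hypothetical driver:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int nprocs;
static const int maxproc = 4;

static int
try_fork(void)
{
	int nprocs_new;

	/* Reserve a slot first; fetch_add returns the old value. */
	nprocs_new = atomic_fetch_add(&nprocs, 1) + 1;
	if (nprocs_new >= maxproc) {
		atomic_fetch_add(&nprocs, -1);	/* undo the reservation */
		return (-1);			/* EAGAIN in the kernel */
	}
	/* ... allocate proc/thread/vmspace; roll back on any failure ... */
	return (0);
}

int
main(void)
{
	for (int i = 0; i < 6; i++)
		printf("fork %d -> %d\n", i, try_fork());
	return (0);
}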
Modified: trunk/sys/kern/kern_gzio.c
===================================================================
--- trunk/sys/kern/kern_gzio.c	2018-05-25 20:39:59 UTC (rev 9943)
+++ trunk/sys/kern/kern_gzio.c	2018-05-25 20:46:51 UTC (rev 9944)
@@ -1,5 +1,6 @@
+/* $MidnightBSD$ */
 /*
- * $Id: kern_gzio.c,v 1.2 2012-10-09 04:08:16 laffer1 Exp $
+ * $Id: kern_gzio.c,v 1.6 2008-10-18 22:54:45 lbazinet Exp $
  *
  * core_gzip.c -- gzip routines used in compressing user process cores
  *
@@ -12,7 +13,7 @@
  *
  */
 
-/* @(#) $MidnightBSD$ */
+/* @(#) $FreeBSD: stable/10/sys/kern/kern_gzio.c 241896 2012-10-22 17:50:54Z kib $ */
 
 #include <sys/param.h>
 #include <sys/proc.h>
@@ -219,7 +220,6 @@
     off_t curoff;
     size_t resid;
     int error;
-    int vfslocked;
 
     if (s == NULL || s->mode != 'w') return Z_STREAM_ERROR;
 
@@ -232,11 +232,9 @@
         if (s->stream.avail_out == 0) {
 
             s->stream.next_out = s->outbuf;
-            vfslocked = VFS_LOCK_GIANT(s->file->v_mount);
             error = vn_rdwr_inchunks(UIO_WRITE, s->file, s->outbuf, Z_BUFSIZE,
                         curoff, UIO_SYSSPACE, IO_UNIT,
                         curproc->p_ucred, NOCRED, &resid, curthread);
-            VFS_UNLOCK_GIANT(vfslocked);
             if (error) {
                 log(LOG_ERR, "gzwrite: vn_rdwr return %d\n", error);
                 curoff += Z_BUFSIZE - resid;
@@ -274,7 +272,6 @@
     gz_stream *s = (gz_stream*)file;
     off_t curoff = s->outoff;
     size_t resid;
-    int vfslocked = 0;
     int error;
 
     if (s == NULL || s->mode != 'w') return Z_STREAM_ERROR;
@@ -289,11 +286,9 @@
         len = Z_BUFSIZE - s->stream.avail_out;
 
         if (len != 0) {
-            vfslocked = VFS_LOCK_GIANT(s->file->v_mount);
             error = vn_rdwr_inchunks(UIO_WRITE, s->file, s->outbuf, len, curoff,
                         UIO_SYSSPACE, IO_UNIT, curproc->p_ucred,
                         NOCRED, &resid, curthread);
-            VFS_UNLOCK_GIANT(vfslocked);
 	    if (error) {
                 s->z_err = Z_ERRNO;
                 s->outoff = curoff + len - resid;

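The gzio hunks drop the VFS_LOCK_GIANT() wrappers but keep the curoff/resid
bookkeeping: the output offset advances only by what vn_rdwr_inchunks()
actually wrote ("curoff += len - resid"). A userland analogue of that
accounting; write_at() is a hypothetical helper, not a kernel routine:

#include <sys/types.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Advance *offp only by the bytes actually written. */
static ssize_t
write_at(int fd, const char *buf, size_t len, off_t *offp)
{
	size_t done = 0;
	ssize_t n;

	while (done < len) {
		n = pwrite(fd, buf + done, len - done, *offp + done);
		if (n <= 0)
			break;		/* caller observes a short write */
		done += (size_t)n;
	}
	*offp += done;
	return ((ssize_t)done);
}

int
main(void)
{
	off_t off = 0;
	int fd;

	fd = open("/tmp/chunk.out", O_CREAT | O_WRONLY | O_TRUNC, 0644);
	if (fd < 0)
		return (1);
	write_at(fd, "hello", 5, &off);
	printf("offset now %lld\n", (long long)off);
	close(fd);
	return (0);
}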
Modified: trunk/sys/kern/kern_hhook.c
===================================================================
--- trunk/sys/kern/kern_hhook.c	2018-05-25 20:39:59 UTC (rev 9943)
+++ trunk/sys/kern/kern_hhook.c	2018-05-25 20:46:51 UTC (rev 9944)
@@ -1,5 +1,6 @@
+/* $MidnightBSD$ */
 /*-
- * Copyright (c) 2010 Lawrence Stewart <lstewart at freebsd.org>
+ * Copyright (c) 2010,2013 Lawrence Stewart <lstewart at freebsd.org>
  * Copyright (c) 2010 The FreeBSD Foundation
  * All rights reserved.
  *
@@ -35,7 +36,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/kern_hhook.c 251787 2013-06-15 10:08:34Z lstewart $");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
@@ -61,15 +62,20 @@
 static MALLOC_DEFINE(M_HHOOK, "hhook", "Helper hooks are linked off hhook_head lists");
 
 LIST_HEAD(hhookheadhead, hhook_head);
-VNET_DEFINE(struct hhookheadhead, hhook_head_list);
-#define	V_hhook_head_list VNET(hhook_head_list)
+struct hhookheadhead hhook_head_list;
+VNET_DEFINE(struct hhookheadhead, hhook_vhead_list);
+#define	V_hhook_vhead_list VNET(hhook_vhead_list)
 
 static struct mtx hhook_head_list_lock;
 MTX_SYSINIT(hhookheadlistlock, &hhook_head_list_lock, "hhook_head list lock",
     MTX_DEF);
 
+/* Protected by hhook_head_list_lock. */
+static uint32_t n_hhookheads;
+
 /* Private function prototypes. */
 static void hhook_head_destroy(struct hhook_head *hhh);
+void khelp_new_hhook_registered(struct hhook_head *hhh, uint32_t flags);
 
 #define	HHHLIST_LOCK() mtx_lock(&hhook_head_list_lock)
 #define	HHHLIST_UNLOCK() mtx_unlock(&hhook_head_list_lock)
@@ -164,22 +170,72 @@
 }
 
 /*
- * Lookup a helper hook point and register a new helper hook function with it.
+ * Register a helper hook function with a helper hook point (including all
+ * virtual instances of the hook point if it is virtualised).
+ *
+ * The logic is unfortunately far more complex than for
+ * hhook_remove_hook_lookup() because hhook_add_hook() can call malloc() with
+ * M_WAITOK and thus we cannot call hhook_add_hook() with the
+ * hhook_head_list_lock held.
+ *
+ * The logic assembles an array of hhook_head structs that correspond to the
+ * helper hook point being hooked and bumps the refcount on each (all done with
+ * the hhook_head_list_lock held). The hhook_head_list_lock is then dropped, and
+ * hhook_add_hook() is called and the refcount dropped for each hhook_head
+ * struct in the array.
  */
 int
 hhook_add_hook_lookup(struct hookinfo *hki, uint32_t flags)
 {
-	struct hhook_head *hhh;
-	int error;
+	struct hhook_head **heads_to_hook, *hhh;
+	int error, i, n_heads_to_hook;
 
-	hhh = hhook_head_get(hki->hook_type, hki->hook_id);
+tryagain:
+	error = i = 0;
+	/*
+	 * Accessing n_hhookheads without hhook_head_list_lock held opens up a
+	 * race with hhook_head_register() which we are unlikely to lose, but
+	 * nonetheless have to cope with - hence the complex goto logic.
+	 */
+	n_heads_to_hook = n_hhookheads;
+	heads_to_hook = malloc(n_heads_to_hook * sizeof(struct hhook_head *),
+	    M_HHOOK, flags & HHOOK_WAITOK ? M_WAITOK : M_NOWAIT);
+	if (heads_to_hook == NULL)
+		return (ENOMEM);
 
-	if (hhh == NULL)
-		return (ENOENT);
+	HHHLIST_LOCK();
+	LIST_FOREACH(hhh, &hhook_head_list, hhh_next) {
+		if (hhh->hhh_type == hki->hook_type &&
+		    hhh->hhh_id == hki->hook_id) {
+			if (i < n_heads_to_hook) {
+				heads_to_hook[i] = hhh;
+				refcount_acquire(&heads_to_hook[i]->hhh_refcount);
+				i++;
+			} else {
+				/*
+				 * We raced with hhook_head_register() which
+				 * inserted a hhook_head that we need to hook
+				 * but did not malloc space for. Abort this run
+				 * and try again.
+				 */
+				for (i--; i >= 0; i--)
+					refcount_release(&heads_to_hook[i]->hhh_refcount);
+				free(heads_to_hook, M_HHOOK);
+				HHHLIST_UNLOCK();
+				goto tryagain;
+			}
+		}
+	}
+	HHHLIST_UNLOCK();
 
-	error = hhook_add_hook(hhh, hki, flags);
-	hhook_head_release(hhh);
+	for (i--; i >= 0; i--) {
+		if (!error)
+			error = hhook_add_hook(heads_to_hook[i], hki, flags);
+		refcount_release(&heads_to_hook[i]->hhh_refcount);
+	}
 
+	free(heads_to_hook, M_HHOOK);
+
 	return (error);
 }
 
@@ -210,7 +266,8 @@
 }
 
 /*
- * Lookup a helper hook point and remove a helper hook function from it.
+ * Remove a helper hook function from a helper hook point (including all
+ * virtual instances of the hook point if it is virtualised).
  */
 int
 hhook_remove_hook_lookup(struct hookinfo *hki)
@@ -217,14 +274,14 @@
 {
 	struct hhook_head *hhh;
 
-	hhh = hhook_head_get(hki->hook_type, hki->hook_id);
+	HHHLIST_LOCK();
+	LIST_FOREACH(hhh, &hhook_head_list, hhh_next) {
+		if (hhh->hhh_type == hki->hook_type &&
+		    hhh->hhh_id == hki->hook_id)
+			hhook_remove_hook(hhh, hki);
+	}
+	HHHLIST_UNLOCK();
 
-	if (hhh == NULL)
-		return (ENOENT);
-
-	hhook_remove_hook(hhh, hki);
-	hhook_head_release(hhh);
-
 	return (0);
 }
 
@@ -245,13 +302,6 @@
 		return (EEXIST);
 	}
 
-	/* XXXLAS: Need to implement support for non-virtualised hooks. */
-	if ((flags & HHOOK_HEADISINVNET) == 0) {
-		printf("%s: only vnet-style virtualised hooks can be used\n",
-		    __func__);
-		return (EINVAL);
-	}
-
 	tmphhh = malloc(sizeof(struct hhook_head), M_HHOOK,
 	    M_ZERO | ((flags & HHOOK_WAITOK) ? M_WAITOK : M_NOWAIT));
 
@@ -263,23 +313,28 @@
 	tmphhh->hhh_nhooks = 0;
 	STAILQ_INIT(&tmphhh->hhh_hooks);
 	HHH_LOCK_INIT(tmphhh);
+	refcount_init(&tmphhh->hhh_refcount, 1);
 
-	if (hhh != NULL)
-		refcount_init(&tmphhh->hhh_refcount, 1);
-	else
-		refcount_init(&tmphhh->hhh_refcount, 0);
-
+	HHHLIST_LOCK();
 	if (flags & HHOOK_HEADISINVNET) {
 		tmphhh->hhh_flags |= HHH_ISINVNET;
-		HHHLIST_LOCK();
-		LIST_INSERT_HEAD(&V_hhook_head_list, tmphhh, hhh_next);
-		HHHLIST_UNLOCK();
-	} else {
-		/* XXXLAS: Add tmphhh to the non-virtualised list. */
+#ifdef VIMAGE
+		KASSERT(curvnet != NULL, ("curvnet is NULL"));
+		tmphhh->hhh_vid = (uintptr_t)curvnet;
+		LIST_INSERT_HEAD(&V_hhook_vhead_list, tmphhh, hhh_vnext);
+#endif
 	}
+	LIST_INSERT_HEAD(&hhook_head_list, tmphhh, hhh_next);
+	n_hhookheads++;
+	HHHLIST_UNLOCK();
 
-	*hhh = tmphhh;
+	khelp_new_hhook_registered(tmphhh, flags);
 
+	if (hhh != NULL)
+		*hhh = tmphhh;
+	else
+		refcount_release(&tmphhh->hhh_refcount);
+
 	return (0);
 }
 
@@ -289,8 +344,13 @@
 	struct hhook *tmp, *tmp2;
 
 	HHHLIST_LOCK_ASSERT();
+	KASSERT(n_hhookheads > 0, ("n_hhookheads should be > 0"));
 
 	LIST_REMOVE(hhh, hhh_next);
+#ifdef VIMAGE
+	if (hhook_head_is_virtualised(hhh) == HHOOK_HEADISINVNET)
+		LIST_REMOVE(hhh, hhh_vnext);
+#endif
 	HHH_WLOCK(hhh);
 	STAILQ_FOREACH_SAFE(tmp, &hhh->hhh_hooks, hhk_next, tmp2)
 		free(tmp, M_HHOOK);
@@ -297,6 +357,7 @@
 	HHH_WUNLOCK(hhh);
 	HHH_LOCK_DESTROY(hhh);
 	free(hhh, M_HHOOK);
+	n_hhookheads--;
 }
 
 /*
@@ -348,10 +409,17 @@
 {
 	struct hhook_head *hhh;
 
-	/* XXXLAS: Pick hhook_head_list based on hhook_head flags. */
 	HHHLIST_LOCK();
-	LIST_FOREACH(hhh, &V_hhook_head_list, hhh_next) {
+	LIST_FOREACH(hhh, &hhook_head_list, hhh_next) {
 		if (hhh->hhh_type == hhook_type && hhh->hhh_id == hhook_id) {
+#ifdef VIMAGE
+			if (hhook_head_is_virtualised(hhh) ==
+			    HHOOK_HEADISINVNET) {
+				KASSERT(curvnet != NULL, ("curvnet is NULL"));
+				if (hhh->hhh_vid != (uintptr_t)curvnet)
+					continue;
+			}
+#endif
 			refcount_acquire(&hhh->hhh_refcount);
 			break;
 		}
@@ -413,7 +481,7 @@
 hhook_vnet_init(const void *unused __unused)
 {
 
-	LIST_INIT(&V_hhook_head_list);
+	LIST_INIT(&V_hhook_vhead_list);
 }
 
 /*
@@ -430,7 +498,7 @@
 	 * subsystem should have already called hhook_head_deregister().
 	 */
 	HHHLIST_LOCK();
-	LIST_FOREACH_SAFE(hhh, &V_hhook_head_list, hhh_next, tmphhh) {
+	LIST_FOREACH_SAFE(hhh, &V_hhook_vhead_list, hhh_vnext, tmphhh) {
 		printf("%s: hhook_head type=%d, id=%d cleanup required\n",
 		    __func__, hhh->hhh_type, hhh->hhh_id);
 		hhook_head_destroy(hhh);
@@ -440,9 +508,9 @@
 
 
 /*
- * When a vnet is created and being initialised, init the V_hhook_head_list.
+ * When a vnet is created and being initialised, init the V_hhook_vhead_list.
  */
-VNET_SYSINIT(hhook_vnet_init, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST,
+VNET_SYSINIT(hhook_vnet_init, SI_SUB_MBUF, SI_ORDER_FIRST,
     hhook_vnet_init, NULL);
 
 /*
@@ -450,5 +518,5 @@
  * points to clean up on vnet tear down, but in case the KPI is misused,
  * provide a function to clean up and free memory for a vnet being destroyed.
  */
-VNET_SYSUNINIT(hhook_vnet_uninit, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST,
+VNET_SYSUNINIT(hhook_vnet_uninit, SI_SUB_MBUF, SI_ORDER_ANY,
     hhook_vnet_uninit, NULL);

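hhook_add_hook_lookup() above sizes its array from n_hhookheads read without
the lock (so the M_WAITOK allocation can sleep), then re-walks the list under
hhook_head_list_lock and, if hhook_head_register() raced in more heads than
were allocated for, drops everything and retries. A rough userland sketch of
that snapshot-and-retry pattern under assumed names (entry,
snapshot_entries):

#include <pthread.h>
#include <stdlib.h>

struct entry {
	struct entry *next;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct entry *list_head;
static int n_entries;			/* protected by list_lock */

static struct entry **
snapshot_entries(int *countp)
{
	struct entry **snap, *e;
	int i, n;

tryagain:
	n = n_entries;			/* unlocked read: may go stale */
	snap = malloc(n * sizeof(*snap));	/* may sleep; no lock held */
	if (snap == NULL)
		return (NULL);
	i = 0;
	pthread_mutex_lock(&list_lock);
	for (e = list_head; e != NULL; e = e->next) {
		if (i == n) {
			/* Lost the race: the list grew. Start over. */
			pthread_mutex_unlock(&list_lock);
			free(snap);
			goto tryagain;
		}
		snap[i++] = e;
	}
	pthread_mutex_unlock(&list_lock);
	*countp = i;
	return (snap);
}

int
main(void)
{
	static struct entry a, b;
	int n = 0;

	a.next = &b;
	list_head = &a;
	n_entries = 2;
	free(snapshot_entries(&n));
	return (n == 2 ? 0 : 1);
}

The kernel version additionally takes a reference on each snapshotted
hhook_head so hhook_add_hook(), which may also sleep, can run safely after
the list lock is dropped.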
Modified: trunk/sys/kern/kern_intr.c
===================================================================
--- trunk/sys/kern/kern_intr.c	2018-05-25 20:39:59 UTC (rev 9943)
+++ trunk/sys/kern/kern_intr.c	2018-05-25 20:46:51 UTC (rev 9944)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1997, Stefan Esser <se at freebsd.org>
  * All rights reserved.
@@ -25,9 +26,10 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/kern_intr.c 272946 2014-10-11 17:49:51Z kib $");
 
 #include "opt_ddb.h"
+#include "opt_kstack_usage_prof.h"
 
 #include <sys/param.h>
 #include <sys/bus.h>
@@ -336,7 +338,7 @@
 			if (ie->ie_cpu == NOCPU)
 				CPU_COPY(cpuset_root, &mask);
 			else
-				CPU_SET(cpu, &mask);
+				CPU_SET(ie->ie_cpu, &mask);
 			id = ie->ie_thread->it_thread->td_tid;
 			mtx_unlock(&ie->ie_lock);
 			(void)cpuset_setthread(id, &mask);
@@ -626,7 +628,7 @@
 		mtx_lock(&ie->ie_lock);
 		it->it_event = ie;
 		ih->ih_thread = it;
-		ithread_update(it); // XXX - do we really need this?!?!?
+		ithread_update(it); /* XXX - do we really need this?!?!? */
 	} else { /* Create the global per-event thread if we need one. */
 		while (ie->ie_thread == NULL && handler != NULL) {
 			if (ie->ie_flags & IE_ADDING_THREAD)
@@ -698,9 +700,9 @@
 	 * description at that point.  If one is not found, find the
 	 * end of the name to use as the insertion point.
 	 */
-	start = index(ih->ih_name, ':');
+	start = strchr(ih->ih_name, ':');
 	if (start == NULL)
-		start = index(ih->ih_name, 0);
+		start = strchr(ih->ih_name, 0);
 
 	/*
 	 * See if there is enough remaining room in the string for the
@@ -841,7 +843,7 @@
 		 * again and remove this handler if it has already passed
 		 * it on the list.
 		 */
-		ie->ie_thread->it_need = 1;
+		atomic_store_rel_int(&ie->ie_thread->it_need, 1);
 	} else
 		TAILQ_REMOVE(&ie->ie_handlers, handler, ih_next);
 	thread_unlock(ie->ie_thread->it_thread);
@@ -901,7 +903,7 @@
 		    p->p_pid, td->td_name);
 		entropy.event = (uintptr_t)ie;
 		entropy.td = ctd;
-		random_harvest(&entropy, sizeof(entropy), 2, 0,
+		random_harvest(&entropy, sizeof(entropy), 2,
 		    RANDOM_INTERRUPT);
 	}
 
@@ -912,7 +914,7 @@
 	 * running.  Then, lock the thread and see if we actually need to
 	 * put it on the runqueue.
 	 */
-	it->it_need = 1;
+	atomic_store_rel_int(&it->it_need, 1);
 	thread_lock(td);
 	if (TD_AWAITING_INTR(td)) {
 		CTR3(KTR_INTR, "%s: schedule pid %d (%s)", __func__, p->p_pid,
@@ -990,7 +992,7 @@
 		 * again and remove this handler if it has already passed
 		 * it on the list.
 		 */
-		it->it_need = 1;
+		atomic_store_rel_int(&it->it_need, 1);
 	} else
 		TAILQ_REMOVE(&ie->ie_handlers, handler, ih_next);
 	thread_unlock(it->it_thread);
@@ -1055,7 +1057,7 @@
 		    p->p_pid, td->td_name);
 		entropy.event = (uintptr_t)ie;
 		entropy.td = ctd;
-		random_harvest(&entropy, sizeof(entropy), 2, 0,
+		random_harvest(&entropy, sizeof(entropy), 2,
 		    RANDOM_INTERRUPT);
 	}
 
@@ -1066,7 +1068,7 @@
 	 * running.  Then, lock the thread and see if we actually need to
 	 * put it on the runqueue.
 	 */
-	it->it_need = 1;
+	atomic_store_rel_int(&it->it_need, 1);
 	thread_lock(td);
 	if (TD_AWAITING_INTR(td)) {
 		CTR3(KTR_INTR, "%s: schedule pid %d (%s)", __func__, p->p_pid,
@@ -1103,7 +1105,6 @@
 swi_add(struct intr_event **eventp, const char *name, driver_intr_t handler,
 	    void *arg, int pri, enum intr_type flags, void **cookiep)
 {
-	struct thread *td;
 	struct intr_event *ie;
 	int error;
 
@@ -1125,15 +1126,7 @@
 	}
 	error = intr_event_add_handler(ie, name, NULL, handler, arg,
 	    PI_SWI(pri), flags, cookiep);
-	if (error)
-		return (error);
-	if (pri == SWI_CLOCK) {
-		td = ie->ie_thread->it_thread;
-		thread_lock(td);
-		td->td_flags |= TDF_NOLOAD;
-		thread_unlock(td);
-	}
-	return (0);
+	return (error);
 }
 
 /*
@@ -1155,8 +1148,8 @@
 		    curproc->p_pid, curthread->td_name);
 		entropy.event = (uintptr_t)ih;
 		entropy.td = curthread;
-		random_harvest(&entropy, sizeof(entropy), 1, 0,
-		    RANDOM_INTERRUPT);
+		random_harvest(&entropy, sizeof(entropy), 1,
+		    RANDOM_SWI);
 	}
 
 	/*
@@ -1256,7 +1249,7 @@
 		 * interrupt threads always invoke all of their handlers.
 		 */
 		if (ie->ie_flags & IE_SOFT) {
-			if (!ih->ih_need)
+			if (atomic_load_acq_int(&ih->ih_need) == 0)
 				continue;
 			else
 				atomic_store_rel_int(&ih->ih_need, 0);
@@ -1358,7 +1351,7 @@
 		 * we are running, it will set it_need to note that we
 		 * should make another pass.
 		 */
-		while (ithd->it_need) {
+		while (atomic_load_acq_int(&ithd->it_need) != 0) {
 			/*
 			 * This might need a full read and write barrier
 			 * to make sure that this write posts before any
@@ -1377,7 +1370,8 @@
 		 * set again, so we have to check it again.
 		 */
 		thread_lock(td);
-		if (!ithd->it_need && !(ithd->it_flags & (IT_DEAD | IT_WAIT))) {
+		if ((atomic_load_acq_int(&ithd->it_need) == 0) &&
+		    !(ithd->it_flags & (IT_DEAD | IT_WAIT))) {
 			TD_SET_IWAIT(td);
 			ie->ie_count = 0;
 			mi_switch(SW_VOL | SWT_IWAIT, NULL);
@@ -1415,6 +1409,10 @@
 
 	td = curthread;
 
+#ifdef KSTACK_USAGE_PROF
+	intr_prof_stack_use(td, frame);
+#endif
+
 	/* An interrupt with no event or handlers is a stray interrupt. */
 	if (ie == NULL || TAILQ_EMPTY(&ie->ie_handlers))
 		return (EINVAL);
@@ -1538,7 +1536,7 @@
 		 * we are running, it will set it_need to note that we
 		 * should make another pass.
 		 */
-		while (ithd->it_need) {
+		while (atomic_load_acq_int(&ithd->it_need) != 0) {
 			/*
 			 * This might need a full read and write barrier
 			 * to make sure that this write posts before any
@@ -1560,7 +1558,8 @@
 		 * set again, so we have to check it again.
 		 */
 		thread_lock(td);
-		if (!ithd->it_need && !(ithd->it_flags & (IT_DEAD | IT_WAIT))) {
+		if ((atomic_load_acq_int(&ithd->it_need) == 0) &&
+		    !(ithd->it_flags & (IT_DEAD | IT_WAIT))) {
 			TD_SET_IWAIT(td);
 			ie->ie_count = 0;
 			mi_switch(SW_VOL | SWT_IWAIT, NULL);
@@ -1856,8 +1855,8 @@
 	struct intr_event *ie;
 	int all, verbose;
 
-	verbose = index(modif, 'v') != NULL;
-	all = index(modif, 'a') != NULL;
+	verbose = strchr(modif, 'v') != NULL;
+	all = strchr(modif, 'a') != NULL;
 	TAILQ_FOREACH(ie, &event_list, ie_list) {
 		if (!all && TAILQ_EMPTY(&ie->ie_handlers))
 			continue;
@@ -1902,6 +1901,24 @@
 static int
 sysctl_intrcnt(SYSCTL_HANDLER_ARGS)
 {
+#ifdef SCTL_MASK32
+	uint32_t *intrcnt32;
+	unsigned i;
+	int error;
+
+	if (req->flags & SCTL_MASK32) {
+		if (!req->oldptr)
+			return (sysctl_handle_opaque(oidp, NULL, sintrcnt / 2, req));
+		intrcnt32 = malloc(sintrcnt / 2, M_TEMP, M_NOWAIT);
+		if (intrcnt32 == NULL)
+			return (ENOMEM);
+		for (i = 0; i < sintrcnt / sizeof (u_long); i++)
+			intrcnt32[i] = intrcnt[i];
+		error = sysctl_handle_opaque(oidp, intrcnt32, sintrcnt / 2, req);
+		free(intrcnt32, M_TEMP);
+		return (error);
+	}
+#endif
 	return (sysctl_handle_opaque(oidp, intrcnt, sintrcnt, req));
 }
 

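The it_need conversions above pair atomic_store_rel_int() on the scheduling
side with atomic_load_acq_int() in the ithread loop, so everything written
before the flag is raised is visible once the flag is observed. A C11
userland analogue of that release/acquire pairing; 'work' and 'need' are
illustrative names, not kernel fields:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static int work;
static atomic_int need;

static void *
ithread(void *arg)
{
	(void)arg;
	/* Acquire load: once 'need' reads 1, the write to 'work' is visible. */
	while (atomic_load_explicit(&need, memory_order_acquire) == 0)
		;	/* spin; the kernel parks in mi_switch() instead */
	printf("handled work %d\n", work);	/* guaranteed to print 42 */
	return (NULL);
}

int
main(void)
{
	pthread_t td;

	pthread_create(&td, NULL, ithread, NULL);
	work = 42;	/* ordered before the release store below */
	atomic_store_explicit(&need, 1, memory_order_release);
	pthread_join(td, NULL);
	return (0);
}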
Modified: trunk/sys/kern/kern_jail.c
===================================================================
--- trunk/sys/kern/kern_jail.c	2018-05-25 20:39:59 UTC (rev 9943)
+++ trunk/sys/kern/kern_jail.c	2018-05-25 20:46:51 UTC (rev 9944)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1999 Poul-Henning Kamp.
  * Copyright (c) 2008 Bjoern A. Zeeb.
@@ -27,7 +28,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/kern_jail.c 302234 2016-06-27 21:50:30Z bdrewery $");
 
 #include "opt_compat.h"
 #include "opt_ddb.h"
@@ -206,6 +207,10 @@
 	"allow.mount.nullfs",
 	"allow.mount.zfs",
 	"allow.mount.procfs",
+	"allow.mount.tmpfs",
+	"allow.mount.fdescfs",
+	"allow.mount.linprocfs",
+	"allow.mount.linsysfs",
 };
 const size_t pr_allow_names_size = sizeof(pr_allow_names);
 
@@ -221,6 +226,10 @@
 	"allow.mount.nonullfs",
 	"allow.mount.nozfs",
 	"allow.mount.noprocfs",
+	"allow.mount.notmpfs",
+	"allow.mount.nofdescfs",
+	"allow.mount.nolinprocfs",
+	"allow.mount.nolinsysfs",
 };
 const size_t pr_allow_nonames_size = sizeof(pr_allow_nonames);
 
@@ -234,6 +243,19 @@
 static unsigned jail_max_af_ips = 255;
 #endif
 
+/*
+ * Initialize the parts of prison0 that can't be static-initialized with
+ * constants.  This is called from proc0_init() after creating thread0 cpuset.
+ */
+void
+prison0_init(void)
+{
+
+	prison0.pr_cpuset = cpuset_ref(thread0.td_cpuset);
+	prison0.pr_osreldate = osreldate;
+	strlcpy(prison0.pr_osrelease, osrelease, sizeof(prison0.pr_osrelease));
+}
+
 #ifdef INET
 static int
 qcmp_v4(const void *ip1, const void *ip2)
@@ -311,7 +333,7 @@
 		j.version = j0.version;
 		j.path = j0.path;
 		j.hostname = j0.hostname;
-		j.ip4s = j0.ip_number;
+		j.ip4s = htonl(j0.ip_number);	/* jail_v0 is host order */
 		break;
 	}
 
@@ -533,17 +555,18 @@
 	struct prison *pr, *deadpr, *mypr, *ppr, *tpr;
 	struct vnode *root;
 	char *domain, *errmsg, *host, *name, *namelc, *p, *path, *uuid;
-	char *g_path;
+	char *g_path, *osrelstr;
 #if defined(INET) || defined(INET6)
 	struct prison *tppr;
 	void *op;
 #endif
 	unsigned long hid;
-	size_t namelen, onamelen;
-	int created, cuflags, descend, enforce, error, errmsg_len, errmsg_pos;
+	size_t namelen, onamelen, pnamelen;
+	int born, created, cuflags, descend, enforce;
+	int error, errmsg_len, errmsg_pos;
 	int gotchildmax, gotenforce, gothid, gotrsnum, gotslevel;
 	int fi, jid, jsys, len, level;
-	int childmax, rsnum, slevel, vfslocked;
+	int childmax, osreldt, rsnum, slevel;
 	int fullpath_disabled;
 #if defined(INET) || defined(INET6)
 	int ii, ij;
@@ -563,7 +586,7 @@
 		error = priv_check(td, PRIV_JAIL_ATTACH);
 	if (error)
 		return (error);
-	mypr = ppr = td->td_ucred->cr_prison;
+	mypr = td->td_ucred->cr_prison;
 	if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0)
 		return (EPERM);
 	if (flags & ~JAIL_SET_MASK)
@@ -590,6 +613,13 @@
 #endif
 	g_path = NULL;
 
+	cuflags = flags & (JAIL_CREATE | JAIL_UPDATE);
+	if (!cuflags) {
+		error = EINVAL;
+		vfs_opterror(opts, "no valid operation (create or update)");
+		goto done_errmsg;
+	}
+
 	error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
 	if (error == ENOENT)
 		jid = 0;
@@ -797,7 +827,7 @@
 #ifdef INET
 	error = vfs_getopt(opts, "ip4.addr", &op, &ip4s);
 	if (error == ENOENT)
-		ip4s = (pr_flags & PR_IP4_DISABLE) ? 0 : -1;
+		ip4s = 0;
 	else if (error != 0)
 		goto done_free;
 	else if (ip4s & (sizeof(*ip4) - 1)) {
@@ -855,7 +885,7 @@
 #ifdef INET6
 	error = vfs_getopt(opts, "ip6.addr", &op, &ip6s);
 	if (error == ENOENT)
-		ip6s = (pr_flags & PR_IP6_DISABLE) ? 0 : -1;
+		ip6s = 0;
 	else if (error != 0)
 		goto done_free;
 	else if (ip6s & (sizeof(*ip6) - 1)) {
@@ -903,6 +933,46 @@
 	}
 #endif
 
+	error = vfs_getopt(opts, "osrelease", (void **)&osrelstr, &len);
+	if (error == ENOENT)
+		osrelstr = NULL;
+	else if (error != 0)
+		goto done_free;
+	else {
+		if (flags & JAIL_UPDATE) {
+			error = EINVAL;
+			vfs_opterror(opts,
+			    "osrelease cannot be changed after creation");
+			goto done_errmsg;
+		}
+		if (len == 0 || len >= OSRELEASELEN) {
+			error = EINVAL;
+			vfs_opterror(opts,
+			    "osrelease string must be 1-%d bytes long",
+			    OSRELEASELEN - 1);
+			goto done_errmsg;
+		}
+	}
+
+	error = vfs_copyopt(opts, "osreldate", &osreldt, sizeof(osreldt));
+	if (error == ENOENT)
+		osreldt = 0;
+	else if (error != 0)
+		goto done_free;
+	else {
+		if (flags & JAIL_UPDATE) {
+			error = EINVAL;
+			vfs_opterror(opts,
+			    "osreldate cannot be changed after creation");
+			goto done_errmsg;
+		}
+		if (osreldt == 0) {
+			error = EINVAL;
+			vfs_opterror(opts, "osreldate cannot be 0");
+			goto done_errmsg;
+		}
+	}
+
 	fullpath_disabled = 0;
 	root = NULL;
 	error = vfs_getopt(opts, "path", (void **)&path, &len);
@@ -921,12 +991,11 @@
 			error = EINVAL;
 			goto done_free;
 		}
-		NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE, UIO_SYSSPACE,
+		NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE,
 		    path, td);
 		error = namei(&nd);
 		if (error)
 			goto done_free;
-		vfslocked = NDHASGIANT(&nd);
 		root = nd.ni_vp;
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		g_path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
@@ -941,22 +1010,20 @@
 				path = NULL;
 		} else {
 			/* exit on other errors */
-			VFS_UNLOCK_GIANT(vfslocked);
 			goto done_free;
 		}
 		if (root->v_type != VDIR) {
 			error = ENOTDIR;
 			vput(root);
-			VFS_UNLOCK_GIANT(vfslocked);
 			goto done_free;
 		}
 		VOP_UNLOCK(root, 0);
-		VFS_UNLOCK_GIANT(vfslocked);
 		if (fullpath_disabled) {
 			/* Leave room for a real-root full pathname. */
 			if (len + (path[0] == '/' && strcmp(mypr->pr_path, "/")
 			    ? strlen(mypr->pr_path) : 0) > MAXPATHLEN) {
 				error = ENAMETOOLONG;
+				vrele(root);
 				goto done_free;
 			}
 		}
@@ -963,36 +1030,11 @@
 	}
 
 	/*
-	 * Grab the allprison lock before letting modules check their
-	 * parameters.  Once we have it, do not let go so we'll have a
-	 * consistent view of the OSD list.
-	 */
-	sx_xlock(&allprison_lock);
-	error = osd_jail_call(NULL, PR_METHOD_CHECK, opts);
-	if (error)
-		goto done_unlock_list;
-
-	/* By now, all parameters should have been noted. */
-	TAILQ_FOREACH(opt, opts, link) {
-		if (!opt->seen && strcmp(opt->name, "errmsg")) {
-			error = EINVAL;
-			vfs_opterror(opts, "unknown parameter: %s", opt->name);
-			goto done_unlock_list;
-		}
-	}
-
-	/*
-	 * See if we are creating a new record or updating an existing one.
+	 * Find the specified jail, or at least its parent.
 	 * This abuses the file error codes ENOENT and EEXIST.
 	 */
-	cuflags = flags & (JAIL_CREATE | JAIL_UPDATE);
-	if (!cuflags) {
-		error = EINVAL;
-		vfs_opterror(opts, "no valid operation (create or update)");
-		goto done_unlock_list;
-	}
 	pr = NULL;
-	namelc = NULL;
+	ppr = mypr;
 	if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) {
 		namelc = strrchr(name, '.');
 		jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10);
@@ -999,6 +1041,7 @@
 		if (*p != '\0')
 			jid = 0;
 	}
+	sx_xlock(&allprison_lock);
 	if (jid != 0) {
 		/*
 		 * See if a requested jid already exists.  There is an
@@ -1064,6 +1107,7 @@
 	 * and updates keyed by the name itself (where the name must exist
 	 * because that is the jail being updated).
 	 */
+	namelc = NULL;
 	if (name != NULL) {
 		namelc = strrchr(name, '.');
 		if (namelc == NULL)
@@ -1074,7 +1118,6 @@
 			 * parent and child names, and make sure the parent
 			 * exists or matches an already found jail.
 			 */
-			*namelc = '\0';
 			if (pr != NULL) {
 				if (strncmp(name, ppr->pr_name, namelc - name)
 				    || ppr->pr_name[namelc - name] != '\0') {
@@ -1085,6 +1128,7 @@
 					goto done_unlock_list;
 				}
 			} else {
+				*namelc = '\0';
 				ppr = prison_find_name(mypr, name);
 				if (ppr == NULL) {
 					error = ENOENT;
@@ -1093,17 +1137,18 @@
 					goto done_unlock_list;
 				}
 				mtx_unlock(&ppr->pr_mtx);
+				*namelc = '.';
 			}
-			name = ++namelc;
+			namelc++;
 		}
-		if (name[0] != '\0') {
-			namelen =
+		if (namelc[0] != '\0') {
+			pnamelen =
 			    (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
  name_again:
 			deadpr = NULL;
 			FOREACH_PRISON_CHILD(ppr, tpr) {
 				if (tpr != pr && tpr->pr_ref > 0 &&
-				    !strcmp(tpr->pr_name + namelen, name)) {
+				    !strcmp(tpr->pr_name + pnamelen, namelc)) {
 					if (pr == NULL &&
 					    cuflags != JAIL_CREATE) {
 						mtx_lock(&tpr->pr_mtx);
@@ -1177,10 +1222,11 @@
 			}
 		created = 1;
 		mtx_lock(&ppr->pr_mtx);
-		if (ppr->pr_ref == 0 || (ppr->pr_flags & PR_REMOVE)) {
+		if (ppr->pr_ref == 0) {
 			mtx_unlock(&ppr->pr_mtx);
 			error = ENOENT;
-			vfs_opterror(opts, "parent jail went away!");
+			vfs_opterror(opts, "jail \"%s\" not found",
+			    prison_name(mypr, ppr));
 			goto done_unlock_list;
 		}
 		ppr->pr_ref++;
@@ -1234,8 +1280,8 @@
 		pr->pr_id = jid;
 
 		/* Set some default values, and inherit some from the parent. */
-		if (name == NULL)
-			name = "";
+		if (namelc == NULL)
+			namelc = "";
 		if (path == NULL) {
 			path = "/";
 			root = mypr->pr_root;
@@ -1290,8 +1336,15 @@
 		pr->pr_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS;
 		pr->pr_devfs_rsnum = ppr->pr_devfs_rsnum;
 
+		pr->pr_osreldate = osreldt ? osreldt : ppr->pr_osreldate;
+		if (osrelstr == NULL)
+		    strcpy(pr->pr_osrelease, ppr->pr_osrelease);
+		else
+		    strcpy(pr->pr_osrelease, osrelstr);
+
 		LIST_INIT(&pr->pr_children);
 		mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF | MTX_DUPOK);
+		TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
 
 #ifdef VIMAGE
 		/* Allocate a new vnet if specified. */
@@ -1311,7 +1364,7 @@
 		mtx_lock(&pr->pr_mtx);
 		/*
 		 * New prisons do not yet have a reference, because we do not
-		 * want other to see the incomplete prison once the
+		 * want others to see the incomplete prison once the
 		 * allprison_lock is downgraded.
 		 */
 	} else {
@@ -1525,13 +1578,13 @@
 	}
 #endif
 	onamelen = namelen = 0;
-	if (name != NULL) {
+	if (namelc != NULL) {
 		/* Give a default name of the jid.  Also allow the name to be
 		 * explicitly the jid - but not any other number, and only in
 		 * normal form (no leading zero/etc).
 		 */
-		if (name[0] == '\0')
-			snprintf(name = numbuf, sizeof(numbuf), "%d", jid);
+		if (namelc[0] == '\0')
+			snprintf(namelc = numbuf, sizeof(numbuf), "%d", jid);
 		else if ((strtoul(namelc, &p, 10) != jid ||
 			  namelc[0] < '1' || namelc[0] > '9') && *p == '\0') {
 			error = EINVAL;
@@ -1543,9 +1596,10 @@
 		 * Make sure the name isn't too long for the prison or its
 		 * children.
 		 */
-		onamelen = strlen(pr->pr_name);
-		namelen = strlen(name);
-		if (strlen(ppr->pr_name) + namelen + 2 > sizeof(pr->pr_name)) {
+		pnamelen = (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
+		onamelen = strlen(pr->pr_name + pnamelen);
+		namelen = strlen(namelc);
+		if (pnamelen + namelen + 1 > sizeof(pr->pr_name)) {
 			error = ENAMETOOLONG;
 			goto done_deref_locked;
 		}
@@ -1562,6 +1616,30 @@
 		goto done_deref_locked;
 	}
 
+	/*
+	 * Let modules check their parameters.  This requires unlocking and
+	 * then re-locking the prison, but this is still a valid state as long
+	 * as allprison_lock remains xlocked.
+	 */
+	mtx_unlock(&pr->pr_mtx);
+	error = osd_jail_call(pr, PR_METHOD_CHECK, opts);
+	if (error != 0) {
+		prison_deref(pr, created
+		    ? PD_LIST_XLOCKED
+		    : PD_DEREF | PD_LIST_XLOCKED);
+		goto done_releroot;
+	}
+	mtx_lock(&pr->pr_mtx);
+
+	/* At this point, all valid parameters should have been noted. */
+	TAILQ_FOREACH(opt, opts, link) {
+		if (!opt->seen && strcmp(opt->name, "errmsg")) {
+			error = EINVAL;
+			vfs_opterror(opts, "unknown parameter: %s", opt->name);
+			goto done_deref_locked;
+		}
+	}
+
 	/* Set the parameters of the prison. */
 #ifdef INET
 	redo_ip4 = 0;
@@ -1635,12 +1713,12 @@
 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
 			tpr->pr_devfs_rsnum = rsnum;
 	}
-	if (name != NULL) {
+	if (namelc != NULL) {
 		if (ppr == &prison0)
-			strlcpy(pr->pr_name, name, sizeof(pr->pr_name));
+			strlcpy(pr->pr_name, namelc, sizeof(pr->pr_name));
 		else
 			snprintf(pr->pr_name, sizeof(pr->pr_name), "%s.%s",
-			    ppr->pr_name, name);
+			    ppr->pr_name, namelc);
 		/* Change this component of child names. */
 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
 			bcopy(tpr->pr_name + onamelen, tpr->pr_name + namelen,
@@ -1718,6 +1796,7 @@
 	 * for now, so new ones will remain unseen until after the module
 	 * handlers have completed.
 	 */
+	born = pr->pr_uref == 0;
 	if (!created && (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags))) {
 		if (pr_flags & PR_PERSIST) {
 			pr->pr_ref++;
@@ -1731,7 +1810,7 @@
 	mtx_unlock(&pr->pr_mtx);
 
 #ifdef RACCT
-	if (created)
+	if (racct_enable && created)
 		prison_racct_attach(pr);
 #endif
 
@@ -1787,15 +1866,20 @@
 
 	/* Let the modules do their work. */
 	sx_downgrade(&allprison_lock);
-	if (created) {
+	if (born) {
 		error = osd_jail_call(pr, PR_METHOD_CREATE, opts);
 		if (error) {
-			prison_deref(pr, PD_LIST_SLOCKED);
+			(void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL);
+			prison_deref(pr, created
+			    ? PD_LIST_SLOCKED
+			    : PD_DEREF | PD_LIST_SLOCKED);
 			goto done_errmsg;
 		}
 	}
 	error = osd_jail_call(pr, PR_METHOD_SET, opts);
 	if (error) {
+		if (born)
+			(void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL);
 		prison_deref(pr, created
 		    ? PD_LIST_SLOCKED
 		    : PD_DEREF | PD_LIST_SLOCKED);
@@ -1815,10 +1899,12 @@
 	}
 
 #ifdef RACCT
-	if (!created) {
-		sx_sunlock(&allprison_lock);
+	if (racct_enable && !created) {
+		if (!(flags & JAIL_ATTACH))
+			sx_sunlock(&allprison_lock);
 		prison_racct_modify(pr);
-		sx_slock(&allprison_lock);
+		if (!(flags & JAIL_ATTACH))
+			sx_slock(&allprison_lock);
 	}
 #endif
 
@@ -1845,7 +1931,7 @@
 			sx_sunlock(&allprison_lock);
 	}
 
-	goto done_errmsg;
+	goto done_free;
 
  done_deref_locked:
 	prison_deref(pr, created
@@ -1855,26 +1941,21 @@
  done_unlock_list:
 	sx_xunlock(&allprison_lock);
  done_releroot:
-	if (root != NULL) {
-		vfslocked = VFS_LOCK_GIANT(root->v_mount);
+	if (root != NULL)
 		vrele(root);
-		VFS_UNLOCK_GIANT(vfslocked);
-	}
  done_errmsg:
 	if (error) {
-		vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len);
-		if (errmsg_len > 0) {
+		if (vfs_getopt(opts, "errmsg", (void **)&errmsg,
+		    &errmsg_len) == 0 && errmsg_len > 0) {
 			errmsg_pos = 2 * vfs_getopt_pos(opts, "errmsg") + 1;
-			if (errmsg_pos > 0) {
-				if (optuio->uio_segflg == UIO_SYSSPACE)
-					bcopy(errmsg,
-					   optuio->uio_iov[errmsg_pos].iov_base,
-					   errmsg_len);
-				else
-					copyout(errmsg,
-					   optuio->uio_iov[errmsg_pos].iov_base,
-					   errmsg_len);
-			}
+			if (optuio->uio_segflg == UIO_SYSSPACE)
+				bcopy(errmsg,
+				    optuio->uio_iov[errmsg_pos].iov_base,
+				    errmsg_len);
+			else
+				copyout(errmsg,
+				    optuio->uio_iov[errmsg_pos].iov_base,
+				    errmsg_len);
 		}
 	}
  done_free:
@@ -2130,6 +2211,13 @@
 	error = vfs_setopt(opts, "nodying", &i, sizeof(i));
 	if (error != 0 && error != ENOENT)
 		goto done_deref;
+	error = vfs_setopt(opts, "osreldate", &pr->pr_osreldate,
+	    sizeof(pr->pr_osreldate));
+	if (error != 0 && error != ENOENT)
+		goto done_deref;
+	error = vfs_setopts(opts, "osrelease", pr->pr_osrelease);
+	if (error != 0 && error != ENOENT)
+		goto done_deref;
 
 	/* Get the module parameters. */
 	mtx_unlock(&pr->pr_mtx);
@@ -2221,7 +2309,6 @@
 
 	/* Remove all descendants of this prison, then remove this prison. */
 	pr->pr_ref++;
-	pr->pr_flags |= PR_REMOVE;
 	if (!LIST_EMPTY(&pr->pr_children)) {
 		mtx_unlock(&pr->pr_mtx);
 		lpr = NULL;
@@ -2230,7 +2317,6 @@
 			if (cpr->pr_ref > 0) {
 				tpr = cpr;
 				cpr->pr_ref++;
-				cpr->pr_flags |= PR_REMOVE;
 			} else {
 				/* Already removed - do not do it again. */
 				tpr = NULL;
@@ -2314,7 +2400,14 @@
 	if (error)
 		return (error);
 
-	sx_slock(&allprison_lock);
+	/*
+	 * Start with exclusive hold on allprison_lock to ensure that a possible
+	 * PR_METHOD_REMOVE call isn't concurrent with jail_set or jail_remove.
+	 * But then immediately downgrade it since we don't need to stop
+	 * readers.
+	 */
+	sx_xlock(&allprison_lock);
+	sx_downgrade(&allprison_lock);
 	pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
 	if (pr == NULL) {
 		sx_sunlock(&allprison_lock);
@@ -2337,10 +2430,9 @@
 static int
 do_jail_attach(struct thread *td, struct prison *pr)
 {
-	struct prison *ppr;
 	struct proc *p;
 	struct ucred *newcred, *oldcred;
-	int vfslocked, error;
+	int error;
 
 	/*
 	 * XXX: Note that there is a slight race here if two threads
@@ -2365,13 +2457,11 @@
 	/*
 	 * Reparent the newly attached process to this jail.
 	 */
-	ppr = td->td_ucred->cr_prison;
 	p = td->td_proc;
 	error = cpuset_setproc_update_set(p, pr->pr_cpuset);
 	if (error)
 		goto e_revert_osd;
 
-	vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
 	vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY);
 	if ((error = change_dir(pr->pr_root, td)) != 0)
 		goto e_unlock;
@@ -2381,30 +2471,27 @@
 #endif
 	VOP_UNLOCK(pr->pr_root, 0);
 	if ((error = change_root(pr->pr_root, td)))
-		goto e_unlock_giant;
-	VFS_UNLOCK_GIANT(vfslocked);
+		goto e_revert_osd;
 
 	newcred = crget();
 	PROC_LOCK(p);
-	oldcred = p->p_ucred;
+	oldcred = crcopysafe(p, newcred);
+	newcred->cr_prison = pr;
+	proc_set_cred(p, newcred);
 	setsugid(p);
-	crcopy(newcred, oldcred);
-	newcred->cr_prison = pr;
-	p->p_ucred = newcred;
 	PROC_UNLOCK(p);
 #ifdef RACCT
 	racct_proc_ucred_changed(p, oldcred, newcred);
 #endif
+	prison_deref(oldcred->cr_prison, PD_DEREF | PD_DEUREF);
 	crfree(oldcred);
-	prison_deref(ppr, PD_DEREF | PD_DEUREF);
 	return (0);
+
  e_unlock:
 	VOP_UNLOCK(pr->pr_root, 0);
- e_unlock_giant:
-	VFS_UNLOCK_GIANT(vfslocked);
  e_revert_osd:
 	/* Tell modules this thread is still in its old jail after all. */
-	(void)osd_jail_call(ppr, PR_METHOD_ATTACH, td);
+	(void)osd_jail_call(td->td_ucred->cr_prison, PR_METHOD_ATTACH, td);
 	prison_deref(pr, PD_DEREF | PD_DEUREF);
 	return (error);
 }
@@ -2513,16 +2600,13 @@
 void
 prison_free_locked(struct prison *pr)
 {
+	int ref;
 
 	mtx_assert(&pr->pr_mtx, MA_OWNED);
-	pr->pr_ref--;
-	if (pr->pr_ref == 0) {
-		mtx_unlock(&pr->pr_mtx);
-		TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
+	ref = --pr->pr_ref;
+	mtx_unlock(&pr->pr_mtx);
+	if (ref == 0)
 		taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
-		return;
-	}
-	mtx_unlock(&pr->pr_mtx);
 }
 
 void
@@ -2533,11 +2617,19 @@
 	prison_free_locked(pr);
 }
 
+/*
+ * Complete a call to either prison_free or prison_proc_free.
+ */
 static void
 prison_complete(void *context, int pending)
 {
+	struct prison *pr = context;
 
-	prison_deref((struct prison *)context, 0);
+	sx_xlock(&allprison_lock);
+	mtx_lock(&pr->pr_mtx);
+	prison_deref(pr, pr->pr_uref
+	    ? PD_DEREF | PD_DEUREF | PD_LOCKED | PD_LIST_XLOCKED
+	    : PD_LOCKED | PD_LIST_XLOCKED);
 }
 
 /*
@@ -2550,20 +2642,48 @@
 prison_deref(struct prison *pr, int flags)
 {
 	struct prison *ppr, *tpr;
-	int vfslocked;
+	int ref, lasturef;
 
 	if (!(flags & PD_LOCKED))
 		mtx_lock(&pr->pr_mtx);
 	for (;;) {
 		if (flags & PD_DEUREF) {
+			KASSERT(pr->pr_uref > 0,
+			    ("prison_deref PD_DEUREF on a dead prison (jid=%d)",
+			     pr->pr_id));
 			pr->pr_uref--;
+			lasturef = pr->pr_uref == 0;
+			if (lasturef)
+				pr->pr_ref++;
 			KASSERT(prison0.pr_uref != 0, ("prison0 pr_uref=0"));
+		} else
+			lasturef = 0;
+		if (flags & PD_DEREF) {
+			KASSERT(pr->pr_ref > 0,
+			    ("prison_deref PD_DEREF on a dead prison (jid=%d)",
+			     pr->pr_id));
+			pr->pr_ref--;
 		}
-		if (flags & PD_DEREF)
-			pr->pr_ref--;
+		ref = pr->pr_ref;
+		mtx_unlock(&pr->pr_mtx);
+
+		/*
+		 * Tell the modules if the last user reference was removed
+		 * (even if it sticks around in a dying state).
+		 */
+		if (lasturef) {
+			if (!(flags & (PD_LIST_SLOCKED | PD_LIST_XLOCKED))) {
+				sx_xlock(&allprison_lock);
+				flags |= PD_LIST_XLOCKED;
+			}
+			(void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL);
+			mtx_lock(&pr->pr_mtx);
+			ref = --pr->pr_ref;
+			mtx_unlock(&pr->pr_mtx);
+		}
+
 		/* If the prison still has references, nothing else to do. */
-		if (pr->pr_ref > 0) {
-			mtx_unlock(&pr->pr_mtx);
+		if (ref > 0) {
 			if (flags & PD_LIST_SLOCKED)
 				sx_sunlock(&allprison_lock);
 			else if (flags & PD_LIST_XLOCKED)
@@ -2571,7 +2691,6 @@
 			return;
 		}
 
-		mtx_unlock(&pr->pr_mtx);
 		if (flags & PD_LIST_SLOCKED) {
 			if (!sx_try_upgrade(&allprison_lock)) {
 				sx_sunlock(&allprison_lock);
@@ -2591,11 +2710,8 @@
 		if (pr->pr_vnet != ppr->pr_vnet)
 			vnet_destroy(pr->pr_vnet);
 #endif
-		if (pr->pr_root != NULL) {
-			vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
+		if (pr->pr_root != NULL)
 			vrele(pr->pr_root);
-			VFS_UNLOCK_GIANT(vfslocked);
-		}
 		mtx_destroy(&pr->pr_mtx);
 #ifdef INET
 		free(pr->pr_ip4, M_PRISON);
@@ -2607,7 +2723,8 @@
 			cpuset_rel(pr->pr_cpuset);
 		osd_jail_exit(pr);
 #ifdef RACCT
-		prison_racct_detach(pr);
+		if (racct_enable)
+			prison_racct_detach(pr);
 #endif
 		free(pr, M_PRISON);
 
@@ -2655,7 +2772,20 @@
 	mtx_lock(&pr->pr_mtx);
 	KASSERT(pr->pr_uref > 0,
 	    ("Trying to kill a process in a dead prison (jid=%d)", pr->pr_id));
-	prison_deref(pr, PD_DEUREF | PD_LOCKED);
+	if (pr->pr_uref > 1)
+		pr->pr_uref--;
+	else {
+		/*
+		 * Don't remove the last user reference in this context, which
+		 * is expected to be a process that is not only locked, but
+		 * also half dead.
+		 */
+		pr->pr_ref++;
+		mtx_unlock(&pr->pr_mtx);
+		taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
+		return;
+	}
+	mtx_unlock(&pr->pr_mtx);
 }
 
 
@@ -3062,7 +3192,7 @@
 				ii++;
 				continue;
 			}
-			switch (ij >= ppr->pr_ip4s ? -1 :
+			switch (ij >= ppr->pr_ip6s ? -1 :
 				qcmp_v6(&pr->pr_ip6[ii], &ppr->pr_ip6[ij])) {
 			case -1:
 				bcopy(pr->pr_ip6 + ii + 1, pr->pr_ip6 + ii,
@@ -3901,6 +4031,13 @@
 	case PRIV_VFS_SETGID:
 	case PRIV_VFS_STAT:
 	case PRIV_VFS_STICKYFILE:
+
+		/*
+		 * As in the non-jail case, non-root users are expected to be
+		 * able to read kernel/physical memory (provided /dev/[k]mem
+		 * exists in the jail and they have permission to access it).
+		 */
+	case PRIV_KMEM_READ:
 		return (0);
 
 		/*
@@ -3936,7 +4073,7 @@
 		return (0);
 
 		/*
-		 * Allow jailed root to set certian IPv4/6 (option) headers.
+		 * Allow jailed root to set certain IPv4/6 (option) headers.
 		 */
 	case PRIV_NETINET_SETHDROPTS:
 		return (0);
@@ -4173,11 +4310,11 @@
 #if defined(INET) || defined(INET6)
 SYSCTL_UINT(_security_jail, OID_AUTO, jail_max_af_ips, CTLFLAG_RW,
     &jail_max_af_ips, 0,
-    "Number of IP addresses a jail may have at most per address family");
+    "Number of IP addresses a jail may have at most per address family (deprecated)");
 #endif
 
 /*
- * Default parameters for jail(2) compatability.  For historical reasons,
+ * Default parameters for jail(2) compatibility.  For historical reasons,
  * the sysctl names have varying similarity to the parameter names.  Prisons
  * just see their own parameters, and can't change them.
  */
@@ -4213,43 +4350,59 @@
 SYSCTL_PROC(_security_jail, OID_AUTO, set_hostname_allowed,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     NULL, PR_ALLOW_SET_HOSTNAME, sysctl_jail_default_allow, "I",
-    "Processes in jail can set their hostnames");
+    "Processes in jail can set their hostnames (deprecated)");
 SYSCTL_PROC(_security_jail, OID_AUTO, socket_unixiproute_only,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     (void *)1, PR_ALLOW_SOCKET_AF, sysctl_jail_default_allow, "I",
-    "Processes in jail are limited to creating UNIX/IP/route sockets only");
+    "Processes in jail are limited to creating UNIX/IP/route sockets only (deprecated)");
 SYSCTL_PROC(_security_jail, OID_AUTO, sysvipc_allowed,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     NULL, PR_ALLOW_SYSVIPC, sysctl_jail_default_allow, "I",
-    "Processes in jail can use System V IPC primitives");
+    "Processes in jail can use System V IPC primitives (deprecated)");
 SYSCTL_PROC(_security_jail, OID_AUTO, allow_raw_sockets,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     NULL, PR_ALLOW_RAW_SOCKETS, sysctl_jail_default_allow, "I",
-    "Prison root can create raw sockets");
+    "Prison root can create raw sockets (deprecated)");
 SYSCTL_PROC(_security_jail, OID_AUTO, chflags_allowed,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     NULL, PR_ALLOW_CHFLAGS, sysctl_jail_default_allow, "I",
-    "Processes in jail can alter system file flags");
+    "Processes in jail can alter system file flags (deprecated)");
 SYSCTL_PROC(_security_jail, OID_AUTO, mount_allowed,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     NULL, PR_ALLOW_MOUNT, sysctl_jail_default_allow, "I",
-    "Processes in jail can mount/unmount jail-friendly file systems");
+    "Processes in jail can mount/unmount jail-friendly file systems (deprecated)");
 SYSCTL_PROC(_security_jail, OID_AUTO, mount_devfs_allowed,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     NULL, PR_ALLOW_MOUNT_DEVFS, sysctl_jail_default_allow, "I",
-    "Processes in jail can mount the devfs file system");
+    "Processes in jail can mount the devfs file system (deprecated)");
+SYSCTL_PROC(_security_jail, OID_AUTO, mount_fdescfs_allowed,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+    NULL, PR_ALLOW_MOUNT_FDESCFS, sysctl_jail_default_allow, "I",
+    "Processes in jail can mount the fdescfs file system (deprecated)");
 SYSCTL_PROC(_security_jail, OID_AUTO, mount_nullfs_allowed,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     NULL, PR_ALLOW_MOUNT_NULLFS, sysctl_jail_default_allow, "I",
-    "Processes in jail can mount the nullfs file system");
+    "Processes in jail can mount the nullfs file system (deprecated)");
 SYSCTL_PROC(_security_jail, OID_AUTO, mount_procfs_allowed,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     NULL, PR_ALLOW_MOUNT_PROCFS, sysctl_jail_default_allow, "I",
-    "Processes in jail can mount the procfs file system");
+    "Processes in jail can mount the procfs file system (deprecated)");
+SYSCTL_PROC(_security_jail, OID_AUTO, mount_linprocfs_allowed,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+    NULL, PR_ALLOW_MOUNT_LINPROCFS, sysctl_jail_default_allow, "I",
+    "Processes in jail can mount the linprocfs file system (deprecated)");
+SYSCTL_PROC(_security_jail, OID_AUTO, mount_linsysfs_allowed,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+    NULL, PR_ALLOW_MOUNT_LINSYSFS, sysctl_jail_default_allow, "I",
+    "Processes in jail can mount the linsysfs file system (deprecated)");
+SYSCTL_PROC(_security_jail, OID_AUTO, mount_tmpfs_allowed,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+    NULL, PR_ALLOW_MOUNT_TMPFS, sysctl_jail_default_allow, "I",
+    "Processes in jail can mount the tmpfs file system (deprecated)");
 SYSCTL_PROC(_security_jail, OID_AUTO, mount_zfs_allowed,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     NULL, PR_ALLOW_MOUNT_ZFS, sysctl_jail_default_allow, "I",
-    "Processes in jail can mount the zfs file system");
+    "Processes in jail can mount the zfs file system (deprecated)");
 
 static int
 sysctl_jail_default_level(SYSCTL_HANDLER_ARGS)
@@ -4270,13 +4423,13 @@
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     &jail_default_enforce_statfs, offsetof(struct prison, pr_enforce_statfs),
     sysctl_jail_default_level, "I",
-    "Processes in jail cannot see all mounted file systems");
+    "Processes in jail cannot see all mounted file systems (deprecated)");
 
 SYSCTL_PROC(_security_jail, OID_AUTO, devfs_ruleset,
     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
     &jail_default_devfs_rsnum, offsetof(struct prison, pr_devfs_rsnum),
     sysctl_jail_default_level, "I",
-    "Ruleset for the devfs filesystem in jail");
+    "Ruleset for the devfs filesystem in jail (deprecated)");
 
 /*
  * Nodes to describe jail parameters.  Maximum length of string parameters
@@ -4318,6 +4471,10 @@
 	return (0);
 }
 
+/*
+ * CTLFLAG_RDTUN in the following indicates jail parameters that can be set at
+ * jail creation time but cannot be changed in an existing jail.
+ */
 SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail ID");
 SYSCTL_JAIL_PARAM(, parent, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail parent ID");
 SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name");
@@ -4324,6 +4481,10 @@
 SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RDTUN, MAXPATHLEN, "Jail root path");
 SYSCTL_JAIL_PARAM(, securelevel, CTLTYPE_INT | CTLFLAG_RW,
     "I", "Jail secure level");
+SYSCTL_JAIL_PARAM(, osreldate, CTLTYPE_INT | CTLFLAG_RDTUN, "I", 
+    "Jail value for kern.osreldate and uname -K");
+SYSCTL_JAIL_PARAM_STRING(, osrelease, CTLFLAG_RDTUN, OSRELEASELEN, 
+    "Jail value for kern.osrelease and uname -r");
 SYSCTL_JAIL_PARAM(, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW,
     "I", "Jail cannot see all mounted file systems");
 SYSCTL_JAIL_PARAM(, devfs_ruleset, CTLTYPE_INT | CTLFLAG_RW,
@@ -4394,13 +4555,22 @@
     "B", "Jail may mount/unmount jail-friendly file systems in general");
 SYSCTL_JAIL_PARAM(_allow_mount, devfs, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Jail may mount the devfs file system");
+SYSCTL_JAIL_PARAM(_allow_mount, fdescfs, CTLTYPE_INT | CTLFLAG_RW,
+    "B", "Jail may mount the fdescfs file system");
 SYSCTL_JAIL_PARAM(_allow_mount, nullfs, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Jail may mount the nullfs file system");
 SYSCTL_JAIL_PARAM(_allow_mount, procfs, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Jail may mount the procfs file system");
+SYSCTL_JAIL_PARAM(_allow_mount, linprocfs, CTLTYPE_INT | CTLFLAG_RW,
+    "B", "Jail may mount the linprocfs file system");
+SYSCTL_JAIL_PARAM(_allow_mount, linsysfs, CTLTYPE_INT | CTLFLAG_RW,
+    "B", "Jail may mount the linsysfs file system");
+SYSCTL_JAIL_PARAM(_allow_mount, tmpfs, CTLTYPE_INT | CTLFLAG_RW,
+    "B", "Jail may mount the tmpfs file system");
 SYSCTL_JAIL_PARAM(_allow_mount, zfs, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Jail may mount the zfs file system");
 
+#ifdef RACCT
 void
 prison_racct_foreach(void (*callback)(struct racct *racct,
     void *arg2, void *arg3), void *arg2, void *arg3)
@@ -4407,6 +4577,8 @@
 {
 	struct prison_racct *prr;
 
+	ASSERT_RACCT_ENABLED();
+
 	sx_slock(&allprison_lock);
 	LIST_FOREACH(prr, &allprison_racct, prr_next)
 		(callback)(prr->prr_racct, arg2, arg3);
@@ -4418,6 +4590,7 @@
 {
 	struct prison_racct *prr;
 
+	ASSERT_RACCT_ENABLED();
 	sx_assert(&allprison_lock, SA_XLOCKED);
 
 	if (name[0] == '\0' || strlen(name) >= MAXHOSTNAMELEN)
@@ -4448,6 +4621,8 @@
 {
 	struct prison_racct *prr;
 
+	ASSERT_RACCT_ENABLED();
+
 	sx_xlock(&allprison_lock);
 	prr = prison_racct_find_locked(name);
 	sx_xunlock(&allprison_lock);
@@ -4458,6 +4633,8 @@
 prison_racct_hold(struct prison_racct *prr)
 {
 
+	ASSERT_RACCT_ENABLED();
+
 	refcount_acquire(&prr->prr_refcount);
 }
 
@@ -4465,6 +4642,7 @@
 prison_racct_free_locked(struct prison_racct *prr)
 {
 
+	ASSERT_RACCT_ENABLED();
 	sx_assert(&allprison_lock, SA_XLOCKED);
 
 	if (refcount_release(&prr->prr_refcount)) {
@@ -4479,6 +4657,7 @@
 {
 	int old;
 
+	ASSERT_RACCT_ENABLED();
 	sx_assert(&allprison_lock, SA_UNLOCKED);
 
 	old = prr->prr_refcount;
@@ -4490,12 +4669,12 @@
 	sx_xunlock(&allprison_lock);
 }
 
-#ifdef RACCT
 static void
 prison_racct_attach(struct prison *pr)
 {
 	struct prison_racct *prr;
 
+	ASSERT_RACCT_ENABLED();
 	sx_assert(&allprison_lock, SA_XLOCKED);
 
 	prr = prison_racct_find_locked(pr->pr_name);
@@ -4515,6 +4694,8 @@
 	struct ucred *cred;
 	struct prison_racct *oldprr;
 
+	ASSERT_RACCT_ENABLED();
+
 	sx_slock(&allproc_lock);
 	sx_xlock(&allprison_lock);
 
@@ -4554,6 +4735,7 @@
 prison_racct_detach(struct prison *pr)
 {
 
+	ASSERT_RACCT_ENABLED();
 	sx_assert(&allprison_lock, SA_UNLOCKED);
 
 	if (pr->pr_prison_racct == NULL)

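The jail changes above revolve around two per-prison counters: pr_ref counts all references while pr_uref counts user-visible (process) references, and final teardown is deferred to a taskqueue task (prison_complete) once the last reference goes away. The following is an illustrative userspace sketch only, not kernel code: all names are invented, and it models just the pattern of the last dropped reference triggering destruction outside the lock.

/*
 * Hypothetical model of the pr_ref/pr_uref pattern; not from the patch.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct obj {
	pthread_mutex_t	mtx;
	int		ref;	/* total references, like pr_ref */
	int		uref;	/* "user" references, like pr_uref */
};

/* Drop a user reference; each user reference implies a total reference. */
static void
obj_deuref(struct obj *o)
{
	int last;

	pthread_mutex_lock(&o->mtx);
	o->uref--;
	o->ref--;
	last = (o->ref == 0);
	pthread_mutex_unlock(&o->mtx);
	if (last) {
		/* The kernel would enqueue prison_complete() here. */
		printf("last reference dropped; tearing down\n");
		pthread_mutex_destroy(&o->mtx);
		free(o);
	}
}

int
main(void)
{
	struct obj *o;

	o = malloc(sizeof(*o));
	pthread_mutex_init(&o->mtx, NULL);
	o->ref = 1;
	o->uref = 1;
	obj_deuref(o);
	return (0);
}
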
Modified: trunk/sys/kern/kern_khelp.c
===================================================================
--- trunk/sys/kern/kern_khelp.c	2018-05-25 20:39:59 UTC (rev 9943)
+++ trunk/sys/kern/kern_khelp.c	2018-05-25 20:46:51 UTC (rev 9944)
@@ -1,5 +1,6 @@
+/* $MidnightBSD$ */
 /*-
- * Copyright (c) 2010 Lawrence Stewart <lstewart at freebsd.org>
+ * Copyright (c) 2010,2013 Lawrence Stewart <lstewart at freebsd.org>
  * Copyright (c) 2010 The FreeBSD Foundation
  * All rights reserved.
  *
@@ -35,12 +36,11 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/kern_khelp.c 251778 2013-06-15 06:45:17Z lstewart $");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/hhook.h>
-#include <sys/jail.h>
 #include <sys/khelp.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
@@ -52,8 +52,6 @@
 #include <sys/rwlock.h>
 #include <sys/systm.h>
 
-#include <net/vnet.h>
-
 static struct rwlock khelp_list_lock;
 RW_SYSINIT(khelplistlock, &khelp_list_lock, "helper list lock");
 
@@ -61,6 +59,7 @@
 
 /* Private function prototypes. */
 static inline void khelp_remove_osd(struct helper *h, struct osd *hosd);
+void khelp_new_hhook_registered(struct hhook_head *hhh, uint32_t flags);
 
 #define	KHELP_LIST_WLOCK() rw_wlock(&khelp_list_lock)
 #define	KHELP_LIST_WUNLOCK() rw_wunlock(&khelp_list_lock)
@@ -74,33 +73,32 @@
 	struct helper *tmph;
 	int error, i, inserted;
 
-	error = 0;
-	inserted = 0;
+	error = inserted = 0;
 	refcount_init(&h->h_refcount, 0);
 	h->h_id = osd_register(OSD_KHELP, NULL, NULL);
 
 	/* It's only safe to add the hooks after osd_register(). */
-	if (h->h_nhooks > 0) {
-		for (i = 0; i < h->h_nhooks && !error; i++) {
-			/* We don't require the module to assign hook_helper. */
-			h->h_hooks[i].hook_helper = h;
-			error = khelp_add_hhook(&h->h_hooks[i], HHOOK_NOWAIT);
-		}
-
-		if (error) {
-			for (i--; i >= 0; i--)
-				khelp_remove_hhook(&h->h_hooks[i]);
-
-			osd_deregister(OSD_KHELP, h->h_id);
-		}
+	for (i = 0; i < h->h_nhooks && !error; i++) {
+		/* We don't require the module to assign hook_helper. */
+		h->h_hooks[i].hook_helper = h;
+		error = hhook_add_hook_lookup(&h->h_hooks[i], HHOOK_WAITOK);
+		if (error)
+			printf("%s: \"%s\" khelp module unable to "
+			    "hook type %d id %d due to error %d\n", __func__,
+			    h->h_name, h->h_hooks[i].hook_type,
+			    h->h_hooks[i].hook_id, error);
 	}
 
-	if (!error) {
+	if (error) {
+		for (i--; i >= 0; i--)
+			hhook_remove_hook_lookup(&h->h_hooks[i]);
+		osd_deregister(OSD_KHELP, h->h_id);
+	} else {
 		KHELP_LIST_WLOCK();
 		/*
 		 * Keep list of helpers sorted in descending h_id order. Due to
 		 * the way osd_set() works, a sorted list ensures
-		 * init_helper_osd() will operate with improved efficiency.
+		 * khelp_init_osd() will operate with improved efficiency.
 		 */
 		TAILQ_FOREACH(tmph, &helpers, h_next) {
 			if (tmph->h_id < h->h_id) {
@@ -124,8 +122,6 @@
 	struct helper *tmph;
 	int error, i;
 
-	error = 0;
-
 	KHELP_LIST_WLOCK();
 	if (h->h_refcount > 0)
 		error = EBUSY;
@@ -142,10 +138,8 @@
 	KHELP_LIST_WUNLOCK();
 
 	if (!error) {
-		if (h->h_nhooks > 0) {
-			for (i = 0; i < h->h_nhooks; i++)
-				khelp_remove_hhook(&h->h_hooks[i]);
-		}
+		for (i = 0; i < h->h_nhooks; i++)
+			hhook_remove_hook_lookup(&h->h_hooks[i]);
 		osd_deregister(OSD_KHELP, h->h_id);
 	}
 
@@ -263,29 +257,14 @@
 int
 khelp_add_hhook(struct hookinfo *hki, uint32_t flags)
 {
-	VNET_ITERATOR_DECL(vnet_iter);
 	int error;
 
-	error = 0;
-
 	/*
-	 * XXXLAS: If a helper is dynamically adding a helper hook function at
-	 * runtime using this function, we should update the helper's h_hooks
-	 * struct member to include the additional hookinfo struct.
+	 * XXXLAS: Should probably include the functionality to update the
+	 * helper's h_hooks struct member.
 	 */
+	error = hhook_add_hook_lookup(hki, flags);
 
-	VNET_LIST_RLOCK_NOSLEEP();
-	VNET_FOREACH(vnet_iter) {
-		CURVNET_SET(vnet_iter);
-		error = hhook_add_hook_lookup(hki, flags);
-		CURVNET_RESTORE();
-#ifdef VIMAGE
-		if (error)
-			break;
-#endif
-	}
-	VNET_LIST_RUNLOCK_NOSLEEP();
-
 	return (error);
 }
 
@@ -292,32 +271,47 @@
 int
 khelp_remove_hhook(struct hookinfo *hki)
 {
-	VNET_ITERATOR_DECL(vnet_iter);
 	int error;
 
-	error = 0;
-
 	/*
-	 * XXXLAS: If a helper is dynamically removing a helper hook function at
-	 * runtime using this function, we should update the helper's h_hooks
-	 * struct member to remove the defunct hookinfo struct.
+	 * XXXLAS: Should probably include the functionality to update the
+	 * helper's h_hooks struct member.
 	 */
+	error = hhook_remove_hook_lookup(hki);
 
-	VNET_LIST_RLOCK_NOSLEEP();
-	VNET_FOREACH(vnet_iter) {
-		CURVNET_SET(vnet_iter);
-		error = hhook_remove_hook_lookup(hki);
-		CURVNET_RESTORE();
-#ifdef VIMAGE
-		if (error)
-			break;
-#endif
-	}
-	VNET_LIST_RUNLOCK_NOSLEEP();
-
 	return (error);
 }
 
+/*
+ * Private KPI between hhook and khelp that allows khelp modules to insert hook
+ * functions into hhook points which register after the modules were loaded.
+ */
+void
+khelp_new_hhook_registered(struct hhook_head *hhh, uint32_t flags)
+{
+	struct helper *h;
+	int error, i;
+
+	KHELP_LIST_RLOCK();
+	TAILQ_FOREACH(h, &helpers, h_next) {
+		for (i = 0; i < h->h_nhooks; i++) {
+			if (hhh->hhh_type != h->h_hooks[i].hook_type ||
+			    hhh->hhh_id != h->h_hooks[i].hook_id)
+				continue;
+			error = hhook_add_hook(hhh, &h->h_hooks[i], flags);
+			if (error) {
+				printf("%s: \"%s\" khelp module unable to "
+				    "hook type %d id %d due to error %d\n",
+				    __func__, h->h_name,
+				    h->h_hooks[i].hook_type,
+				    h->h_hooks[i].hook_id, error);
+				error = 0;
+			}
+		}
+	}
+	KHELP_LIST_RUNLOCK();
+}
+
 int
 khelp_modevent(module_t mod, int event_type, void *data)
 {
@@ -377,95 +371,3 @@
 
 	return (error);
 }
-
-/*
- * This function is called in two separate situations:
- *
- * - When the kernel is booting, it is called directly by the SYSINIT framework
- * to allow Khelp modules which were compiled into the kernel or loaded by the
- * boot loader to insert their non-virtualised hook functions into the kernel.
- *
- * - When the kernel is booting or a vnet is created, this function is also
- * called indirectly through khelp_vnet_init() by the vnet initialisation code.
- * In this situation, Khelp modules are able to insert their virtualised hook
- * functions into the virtualised hook points in the vnet which is being
- * initialised. In the case where the kernel is not compiled with "options
- * VIMAGE", this step is still run once at boot, but the hook functions get
- * transparently inserted into the standard unvirtualised network stack.
- */
-static void
-khelp_init(const void *vnet)
-{
-	struct helper *h;
-	int error, i, vinit;
-	int32_t htype, hid;
-
-	error = 0;
-	vinit = vnet != NULL;
-
-	KHELP_LIST_RLOCK();
-	TAILQ_FOREACH(h, &helpers, h_next) {
-		for (i = 0; i < h->h_nhooks && !error; i++) {
-			htype = h->h_hooks[i].hook_type;
-			hid = h->h_hooks[i].hook_id;
-
-			/*
-			 * If we're doing a virtualised init (vinit != 0) and
-			 * the hook point is virtualised, or we're doing a plain
-			 * sysinit at boot and the hook point is not
-			 * virtualised, insert the hook.
-			 */
-			if ((hhook_head_is_virtualised_lookup(htype, hid) ==
-			    HHOOK_HEADISINVNET && vinit) ||
-			    (!hhook_head_is_virtualised_lookup(htype, hid) &&
-			    !vinit)) {
-				error = hhook_add_hook_lookup(&h->h_hooks[i],
-				    HHOOK_NOWAIT);
-			}
-		}
-
-		if (error) {
-			 /* Remove any helper's hooks we successfully added. */
-			for (i--; i >= 0; i--)
-				hhook_remove_hook_lookup(&h->h_hooks[i]);
-
-			printf("%s: Failed to add hooks for helper \"%s\" (%p)",
-				__func__, h->h_name, h);
-			if (vinit)
-				    printf(" to vnet %p.\n", vnet);
-			else
-				printf(".\n");
-
-			error = 0;
-		}
-	}
-	KHELP_LIST_RUNLOCK();
-}
-
-/*
- * Vnet created and being initialised.
- */
-static void
-khelp_vnet_init(const void *unused __unused)
-{
-
-	khelp_init(TD_TO_VNET(curthread));
-}
-
-
-/*
- * As the kernel boots, allow Khelp modules which were compiled into the kernel
- * or loaded by the boot loader to insert their non-virtualised hook functions
- * into the kernel.
- */
-SYSINIT(khelp_init, SI_SUB_PROTO_END, SI_ORDER_FIRST, khelp_init, NULL);
-
-/*
- * When a vnet is created and being initialised, we need to insert the helper
- * hook functions for all currently registered Khelp modules into the vnet's
- * helper hook points.  The hhook KPI provides a mechanism for subsystems which
- * export helper hook points to clean up on vnet shutdown, so we don't need a
- * VNET_SYSUNINIT for Khelp.
- */
-VNET_SYSINIT(khelp_vnet_init, SI_SUB_PROTO_END, SI_ORDER_FIRST,
-    khelp_vnet_init, NULL);

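The new khelp_new_hhook_registered() above closes a gap: hook points that register after a khelp module has loaded now pick up that module's matching hook functions. Below is a minimal, hypothetical userspace model of that late-registration walk; the names are invented and this is not the hhook KPI.

/*
 * Sketch: when a hook point appears late, walk the already-loaded
 * helpers and attach any hook whose (type, id) pair matches.
 */
#include <stdio.h>

struct hookinfo {
	int	hook_type;
	int	hook_id;
	void	(*fn)(void);
};

static void
hello(void)
{
	printf("hook fired\n");
}

/* Helpers that loaded before the hook point existed. */
static struct hookinfo helpers[] = {
	{ 1, 10, NULL },
	{ 2, 20, NULL },
};

/* Called when a new hook point (type, id) registers late. */
static void
new_hook_point_registered(int type, int id)
{
	unsigned i;

	for (i = 0; i < sizeof(helpers) / sizeof(helpers[0]); i++)
		if (helpers[i].hook_type == type && helpers[i].hook_id == id)
			helpers[i].fn = hello;	/* attach the waiting hook */
}

int
main(void)
{
	new_hook_point_registered(1, 10);
	if (helpers[0].fn != NULL)
		helpers[0].fn();
	return (0);
}
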
Modified: trunk/sys/kern/kern_kthread.c
===================================================================
--- trunk/sys/kern/kern_kthread.c	2018-05-25 20:39:59 UTC (rev 9943)
+++ trunk/sys/kern/kern_kthread.c	2018-05-25 20:46:51 UTC (rev 9944)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1999 Peter Wemm <peter at FreeBSD.org>
  * All rights reserved.
@@ -25,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/kern_kthread.c 304905 2016-08-27 11:45:05Z kib $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -38,6 +39,7 @@
 #include <sys/rwlock.h>
 #include <sys/signalvar.h>
 #include <sys/sx.h>
+#include <sys/umtx.h>
 #include <sys/unistd.h>
 #include <sys/wait.h>
 #include <sys/sched.h>
@@ -257,12 +259,8 @@
 		panic("kthread_add called too soon");
 
 	/* If no process supplied, put it on proc0 */
-	if (p == NULL) {
+	if (p == NULL)
 		p = &proc0;
-		oldtd = &thread0;
-	} else {
-		oldtd = FIRST_THREAD_IN_PROC(p);
-	}
 
 	/* Initialize our new td  */
 	newtd = thread_alloc(pages);
@@ -269,9 +267,13 @@
 	if (newtd == NULL)
 		return (ENOMEM);
 
+	PROC_LOCK(p);
+	oldtd = FIRST_THREAD_IN_PROC(p);
+
 	bzero(&newtd->td_startzero,
 	    __rangeof(struct thread, td_startzero, td_endzero));
-/* XXX check if we should zero. */
+	newtd->td_su = NULL;
+	newtd->td_sleeptimo = 0;
 	bcopy(&oldtd->td_startcopy, &newtd->td_startcopy,
 	    __rangeof(struct thread, td_startcopy, td_endcopy));
 
@@ -293,9 +295,7 @@
 	newtd->td_ucred = crhold(p->p_ucred);
 
 	/* this code almost the same as create_thread() in kern_thr.c */
-	PROC_LOCK(p);
 	p->p_flag |= P_HADTHREADS;
-	newtd->td_sigmask = oldtd->td_sigmask; /* XXX dubious */
 	thread_link(newtd, p);
 	thread_lock(oldtd);
 	/* let the scheduler know about these things. */
@@ -324,11 +324,13 @@
 kthread_exit(void)
 {
 	struct proc *p;
+	struct thread *td;
 
-	p = curthread->td_proc;
+	td = curthread;
+	p = td->td_proc;
 
 	/* A module may be waiting for us to exit. */
-	wakeup(curthread);
+	wakeup(td);
 
 	/*
 	 * The last exiting thread in a kernel process must tear down
@@ -341,8 +343,10 @@
 		rw_wunlock(&tidhash_lock);
 		kproc_exit(0);
 	}
-	LIST_REMOVE(curthread, td_hash);
+	LIST_REMOVE(td, td_hash);
 	rw_wunlock(&tidhash_lock);
+	umtx_thread_exit(td);
+	tdsigcleanup(td);
 	PROC_SLOCK(p);
 	thread_exit();
 }
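
The kthread_add() change above is a locking fix: FIRST_THREAD_IN_PROC() is now sampled with PROC_LOCK() held, so the template thread whose td_startcopy range is copied cannot change underfoot. A trivial, hypothetical userspace analogue of snapshotting shared state only under its lock (invented names, not kernel API):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t proc_mtx = PTHREAD_MUTEX_INITIALIZER;
static int template_state = 42;	/* stands in for the template thread */

static int
clone_from_template(void)
{
	int copy;

	pthread_mutex_lock(&proc_mtx);
	copy = template_state;	/* safe: taken while the "proc" is locked */
	pthread_mutex_unlock(&proc_mtx);
	return (copy);
}

int
main(void)
{
	printf("cloned state %d\n", clone_from_template());
	return (0);
}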

Modified: trunk/sys/kern/kern_ktr.c
===================================================================
--- trunk/sys/kern/kern_ktr.c	2018-05-25 20:39:59 UTC (rev 9943)
+++ trunk/sys/kern/kern_ktr.c	2018-05-25 20:46:51 UTC (rev 9944)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 2000 John Baldwin <jhb at FreeBSD.org>
  * All rights reserved.
@@ -10,9 +11,6 @@
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
- * 3. Neither the name of the author nor the names of any co-contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
@@ -33,7 +31,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/kern_ktr.c 293853 2016-01-13 21:38:52Z marius $");
 
 #include "opt_ddb.h"
 #include "opt_ktr.h"
@@ -47,15 +45,16 @@
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/libkern.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
 #include <sys/proc.h>
+#include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/time.h>
 
 #include <machine/cpu.h>
-#ifdef __sparc64__
-#include <machine/ktr.h>
-#endif
 
 #ifdef DDB
 #include <ddb/ddb.h>
@@ -62,14 +61,25 @@
 #include <ddb/db_output.h>
 #endif
 
+#ifndef KTR_BOOT_ENTRIES
+#define	KTR_BOOT_ENTRIES	1024
+#endif
+
 #ifndef KTR_ENTRIES
 #define	KTR_ENTRIES	1024
 #endif
 
+/* Limit the allocations to something manageable. */
+#define	KTR_ENTRIES_MAX	(8 * 1024 * 1024)
+
 #ifndef KTR_MASK
 #define	KTR_MASK	(0)
 #endif
 
+#ifndef KTR_CPUMASK
+#define	KTR_CPUMASK	CPUSET_FSET
+#endif
+
 #ifndef KTR_TIME
 #define	KTR_TIME	get_cyclecount()
 #endif
@@ -78,41 +88,36 @@
 #define	KTR_CPU		PCPU_GET(cpuid)
 #endif
 
+static MALLOC_DEFINE(M_KTR, "KTR", "KTR");
+
 FEATURE(ktr, "Kernel support for KTR kernel tracing facility");
 
-static SYSCTL_NODE(_debug, OID_AUTO, ktr, CTLFLAG_RD, 0, "KTR options");
+volatile int	ktr_idx = 0;
+int	ktr_mask = KTR_MASK;
+int	ktr_compile = KTR_COMPILE;
+int	ktr_entries = KTR_BOOT_ENTRIES;
+int	ktr_version = KTR_VERSION;
+struct	ktr_entry ktr_buf_init[KTR_BOOT_ENTRIES];
+struct	ktr_entry *ktr_buf = ktr_buf_init;
+cpuset_t ktr_cpumask = CPUSET_T_INITIALIZER(KTR_CPUMASK);
+static char ktr_cpumask_str[CPUSETBUFSIZ];
 
-int	ktr_mask = KTR_MASK;
 TUNABLE_INT("debug.ktr.mask", &ktr_mask);
-SYSCTL_INT(_debug_ktr, OID_AUTO, mask, CTLFLAG_RW,
-    &ktr_mask, 0, "Bitmask of KTR event classes for which logging is enabled");
 
-int	ktr_compile = KTR_COMPILE;
-SYSCTL_INT(_debug_ktr, OID_AUTO, compile, CTLFLAG_RD,
-    &ktr_compile, 0, "Bitmask of KTR event classes compiled into the kernel");
+TUNABLE_STR("debug.ktr.cpumask", ktr_cpumask_str, sizeof(ktr_cpumask_str));
 
-int	ktr_entries = KTR_ENTRIES;
-SYSCTL_INT(_debug_ktr, OID_AUTO, entries, CTLFLAG_RD,
-    &ktr_entries, 0, "Number of entries in the KTR buffer");
+static SYSCTL_NODE(_debug, OID_AUTO, ktr, CTLFLAG_RD, 0, "KTR options");
 
-int	ktr_version = KTR_VERSION;
 SYSCTL_INT(_debug_ktr, OID_AUTO, version, CTLFLAG_RD,
     &ktr_version, 0, "Version of the KTR interface");
 
-cpuset_t ktr_cpumask;
-static char ktr_cpumask_str[CPUSETBUFSIZ];
-TUNABLE_STR("debug.ktr.cpumask", ktr_cpumask_str, sizeof(ktr_cpumask_str));
+SYSCTL_UINT(_debug_ktr, OID_AUTO, compile, CTLFLAG_RD,
+    &ktr_compile, 0, "Bitmask of KTR event classes compiled into the kernel");
 
 static void
 ktr_cpumask_initializer(void *dummy __unused)
 {
 
-	CPU_FILL(&ktr_cpumask);
-#ifdef KTR_CPUMASK
-	if (cpusetobj_strscan(&ktr_cpumask, KTR_CPUMASK) == -1)
-		CPU_FILL(&ktr_cpumask);
-#endif
-
 	/*
 	 * TUNABLE_STR() runs with SI_ORDER_MIDDLE priority, thus it must be
 	 * already set, if necessary.
@@ -147,9 +152,6 @@
     sysctl_debug_ktr_cpumask, "S",
     "Bitmask of CPUs on which KTR logging is enabled");
 
-volatile int	ktr_idx = 0;
-struct	ktr_entry ktr_buf[KTR_ENTRIES];
-
 static int
 sysctl_debug_ktr_clear(SYSCTL_HANDLER_ARGS)
 {
@@ -161,7 +163,7 @@
 		return (error);
 
 	if (clear) {
-		bzero(ktr_buf, sizeof(ktr_buf));
+		bzero(ktr_buf, sizeof(*ktr_buf) * ktr_entries);
 		ktr_idx = 0;
 	}
 
@@ -170,6 +172,94 @@
 SYSCTL_PROC(_debug_ktr, OID_AUTO, clear, CTLTYPE_INT|CTLFLAG_RW, 0, 0,
     sysctl_debug_ktr_clear, "I", "Clear KTR Buffer");
 
+/*
+ * This is a sysctl proc so that it is serialized as !MPSAFE along with
+ * the other ktr sysctl procs.
+ */
+static int
+sysctl_debug_ktr_mask(SYSCTL_HANDLER_ARGS)
+{
+	int mask, error;
+
+	mask = ktr_mask;
+	error = sysctl_handle_int(oidp, &mask, 0, req);
+	if (error || !req->newptr)
+		return (error);
+	ktr_mask = mask;
+	return (error);
+}
+
+SYSCTL_PROC(_debug_ktr, OID_AUTO, mask, CTLTYPE_UINT|CTLFLAG_RW, 0, 0,
+    sysctl_debug_ktr_mask, "IU",
+    "Bitmask of KTR event classes for which logging is enabled");
+
+#if KTR_ENTRIES > KTR_BOOT_ENTRIES
+/*
+ * A simplified version of sysctl_debug_ktr_entries.
+ * No need to care about SMP, scheduling, etc.
+ */
+static void
+ktr_entries_initializer(void *dummy __unused)
+{
+	int mask;
+
+	/* Temporarily disable ktr in case malloc() is being traced. */
+	mask = ktr_mask;
+	ktr_mask = 0;
+	ktr_buf = malloc(sizeof(*ktr_buf) * KTR_ENTRIES, M_KTR,
+	    M_WAITOK | M_ZERO);
+	memcpy(ktr_buf, ktr_buf_init + ktr_idx,
+	    (KTR_BOOT_ENTRIES - ktr_idx) * sizeof(*ktr_buf));
+	if (ktr_idx != 0)
+		memcpy(ktr_buf + KTR_BOOT_ENTRIES - ktr_idx, ktr_buf_init,
+		    ktr_idx * sizeof(*ktr_buf));
+	ktr_entries = KTR_ENTRIES;
+	ktr_mask = mask;
+}
+SYSINIT(ktr_entries_initializer, SI_SUB_KMEM, SI_ORDER_ANY,
+    ktr_entries_initializer, NULL);
+#endif
+
+static int
+sysctl_debug_ktr_entries(SYSCTL_HANDLER_ARGS)
+{
+	int entries, error, mask;
+	struct ktr_entry *buf, *oldbuf;
+
+	entries = ktr_entries;
+	error = sysctl_handle_int(oidp, &entries, 0, req);
+	if (error || !req->newptr)
+		return (error);
+	if (entries > KTR_ENTRIES_MAX)
+		return (ERANGE);
+	/* Disable ktr temporarily. */
+	mask = ktr_mask;
+	atomic_store_rel_int(&ktr_mask, 0);
+	/* Wait for threads to go idle. */
+	if ((error = quiesce_all_cpus("ktrent", PCATCH)) != 0) {
+		ktr_mask = mask;
+		return (error);
+	}
+	if (ktr_buf != ktr_buf_init)
+		oldbuf = ktr_buf;
+	else
+		oldbuf = NULL;
+	/* Allocate a new buffer. */
+	buf = malloc(sizeof(*buf) * entries, M_KTR, M_WAITOK | M_ZERO);
+	/* Install the new buffer and restart ktr. */
+	ktr_buf = buf;
+	ktr_entries = entries;
+	ktr_idx = 0;
+	atomic_store_rel_int(&ktr_mask, mask);
+	if (oldbuf != NULL)
+		free(oldbuf, M_KTR);
+
+	return (error);
+}
+
+SYSCTL_PROC(_debug_ktr, OID_AUTO, entries, CTLTYPE_INT|CTLFLAG_RW, 0, 0,
+    sysctl_debug_ktr_entries, "I", "Number of entries in the KTR buffer");
+
 #ifdef KTR_VERBOSE
 int	ktr_verbose = KTR_VERBOSE;
 TUNABLE_INT("debug.ktr.verbose", &ktr_verbose);
@@ -251,7 +341,7 @@
 
 	if (panicstr)
 		return;
-	if ((ktr_mask & mask) == 0)
+	if ((ktr_mask & mask) == 0 || ktr_buf == NULL)
 		return;
 	cpu = KTR_CPU;
 	if (!CPU_ISSET(cpu, &ktr_cpumask))
@@ -283,7 +373,7 @@
 	{
 		do {
 			saveindex = ktr_idx;
-			newindex = (saveindex + 1) % KTR_ENTRIES;
+			newindex = (saveindex + 1) % ktr_entries;
 		} while (atomic_cmpset_rel_int(&ktr_idx, saveindex, newindex) == 0);
 		entry = &ktr_buf[saveindex];
 	}
@@ -338,12 +428,12 @@
 DB_SHOW_COMMAND(ktr, db_ktr_all)
 {
 	
-	tstate.cur = (ktr_idx - 1) % KTR_ENTRIES;
+	tstate.cur = (ktr_idx - 1) % ktr_entries;
 	tstate.first = -1;
 	db_ktr_verbose = 0;
-	db_ktr_verbose |= (index(modif, 'v') != NULL) ? 2 : 0;
-	db_ktr_verbose |= (index(modif, 'V') != NULL) ? 1 : 0; /* just timestap please */
-	if (index(modif, 'a') != NULL) {
+	db_ktr_verbose |= (strchr(modif, 'v') != NULL) ? 2 : 0;
+	db_ktr_verbose |= (strchr(modif, 'V') != NULL) ? 1 : 0; /* just the timestamp please */
+	if (strchr(modif, 'a') != NULL) {
 		db_disable_pager();
 		while (cncheckc() != -1)
 			if (db_mach_vtrace() == 0)
@@ -360,7 +450,7 @@
 {
 	struct ktr_entry	*kp;
 
-	if (tstate.cur == tstate.first) {
+	if (tstate.cur == tstate.first || ktr_buf == NULL) {
 		db_printf("--- End of trace buffer ---\n");
 		return (0);
 	}
@@ -392,7 +482,7 @@
 		tstate.first = tstate.cur;
 
 	if (--tstate.cur < 0)
-		tstate.cur = KTR_ENTRIES - 1;
+		tstate.cur = ktr_entries - 1;
 
 	return (1);
 }


