[Midnightbsd-cvs] src [9954] trunk/sys/kern: sync with freebsd

laffer1 at midnightbsd.org laffer1 at midnightbsd.org
Sat May 26 10:25:55 EDT 2018


Revision: 9954
          http://svnweb.midnightbsd.org/src/?rev=9954
Author:   laffer1
Date:     2018-05-26 10:25:55 -0400 (Sat, 26 May 2018)
Log Message:
-----------
 Sync sys/kern uipc_* sources with FreeBSD stable/10.

Modified Paths:
--------------
    trunk/sys/kern/uipc_accf.c
    trunk/sys/kern/uipc_debug.c
    trunk/sys/kern/uipc_domain.c
    trunk/sys/kern/uipc_mbuf.c
    trunk/sys/kern/uipc_mbuf2.c
    trunk/sys/kern/uipc_mqueue.c
    trunk/sys/kern/uipc_sem.c
    trunk/sys/kern/uipc_shm.c
    trunk/sys/kern/uipc_sockbuf.c
    trunk/sys/kern/uipc_socket.c
    trunk/sys/kern/uipc_syscalls.c
    trunk/sys/kern/uipc_usrreq.c

Modified: trunk/sys/kern/uipc_accf.c
===================================================================
--- trunk/sys/kern/uipc_accf.c	2018-05-26 14:25:17 UTC (rev 9953)
+++ trunk/sys/kern/uipc_accf.c	2018-05-26 14:25:55 UTC (rev 9954)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 2000 Paycounter, Inc.
  * Copyright (c) 2005 Robert N. M. Watson
@@ -27,7 +28,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/uipc_accf.c 201145 2009-12-28 22:56:30Z antoine $");
 
 #define ACCEPT_FILTER_MOD
 

Modified: trunk/sys/kern/uipc_debug.c
===================================================================
--- trunk/sys/kern/uipc_debug.c	2018-05-26 14:25:17 UTC (rev 9953)
+++ trunk/sys/kern/uipc_debug.c	2018-05-26 14:25:55 UTC (rev 9954)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 2007 Robert N. M. Watson
  * All rights reserved.
@@ -29,7 +30,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/uipc_debug.c 255138 2013-09-01 23:34:53Z davide $");
 
 #include "opt_ddb.h"
 
@@ -411,7 +412,7 @@
 	db_print_indent(indent);
 	db_printf("sb_ctl: %u   ", sb->sb_ctl);
 	db_printf("sb_lowat: %d   ", sb->sb_lowat);
-	db_printf("sb_timeo: %d\n", sb->sb_timeo);
+	db_printf("sb_timeo: %jd\n", sb->sb_timeo);
 
 	db_print_indent(indent);
 	db_printf("sb_flags: 0x%x (", sb->sb_flags);

Modified: trunk/sys/kern/uipc_domain.c
===================================================================
--- trunk/sys/kern/uipc_domain.c	2018-05-26 14:25:17 UTC (rev 9953)
+++ trunk/sys/kern/uipc_domain.c	2018-05-26 14:25:55 UTC (rev 9954)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1982, 1986, 1993
  *	The Regents of the University of California.  All rights reserved.
@@ -30,7 +31,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/uipc_domain.c 314667 2017-03-04 13:03:31Z avg $");
 
 #include <sys/param.h>
 #include <sys/socket.h>
@@ -46,8 +47,6 @@
 
 #include <net/vnet.h>
 
-#include <vm/uma.h>
-
 /*
  * System initialization
  *
@@ -138,8 +137,10 @@
 #define DEFAULT(foo, bar)	if ((foo) == NULL)  (foo) = (bar)
 	DEFAULT(pu->pru_accept, pru_accept_notsupp);
 	DEFAULT(pu->pru_bind, pru_bind_notsupp);
+	DEFAULT(pu->pru_bindat, pru_bindat_notsupp);
 	DEFAULT(pu->pru_connect, pru_connect_notsupp);
 	DEFAULT(pu->pru_connect2, pru_connect2_notsupp);
+	DEFAULT(pu->pru_connectat, pru_connectat_notsupp);
 	DEFAULT(pu->pru_control, pru_control_notsupp);
 	DEFAULT(pu->pru_disconnect, pru_disconnect_notsupp);
 	DEFAULT(pu->pru_listen, pru_listen_notsupp);
@@ -239,33 +240,16 @@
 	mtx_unlock(&dom_mtx);
 }
 
-static void
-socket_zone_change(void *tag)
-{
-
-	uma_zone_set_max(socket_zone, maxsockets);
-}
-
 /* ARGSUSED*/
 static void
 domaininit(void *dummy)
 {
 
-	/*
-	 * Before we do any setup, make sure to initialize the
-	 * zone allocator we get struct sockets from.
-	 */
-	socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL,
-	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
-	uma_zone_set_max(socket_zone, maxsockets);
-	EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL,
-		EVENTHANDLER_PRI_FIRST);
-
 	if (max_linkhdr < 16)		/* XXX */
 		max_linkhdr = 16;
 
-	callout_init(&pffast_callout, CALLOUT_MPSAFE);
-	callout_init(&pfslow_callout, CALLOUT_MPSAFE);
+	callout_init(&pffast_callout, 1);
+	callout_init(&pfslow_callout, 1);
 
 	mtx_lock(&dom_mtx);
 	KASSERT(domain_init_status == 0, ("domaininit called too late!"));
@@ -287,6 +271,17 @@
 	callout_reset(&pfslow_callout, 1, pfslowtimo, NULL);
 }
 
+struct domain *
+pffinddomain(int family)
+{
+	struct domain *dp;
+
+	for (dp = domains; dp != NULL; dp = dp->dom_next)
+		if (dp->dom_family == family)
+			return (dp);
+	return (NULL);
+}
+
 struct protosw *
 pffindtype(int family, int type)
 {
@@ -293,15 +288,14 @@
 	struct domain *dp;
 	struct protosw *pr;
 
-	for (dp = domains; dp; dp = dp->dom_next)
-		if (dp->dom_family == family)
-			goto found;
-	return (0);
-found:
+	dp = pffinddomain(family);
+	if (dp == NULL)
+		return (NULL);
+
 	for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
 		if (pr->pr_type && pr->pr_type == type)
 			return (pr);
-	return (0);
+	return (NULL);
 }
 
 struct protosw *
@@ -309,21 +303,22 @@
 {
 	struct domain *dp;
 	struct protosw *pr;
-	struct protosw *maybe = 0;
+	struct protosw *maybe;
 
+	maybe = NULL;
 	if (family == 0)
-		return (0);
-	for (dp = domains; dp; dp = dp->dom_next)
-		if (dp->dom_family == family)
-			goto found;
-	return (0);
-found:
+		return (NULL);
+
+	dp = pffinddomain(family);
+	if (dp == NULL)
+		return (NULL);
+
 	for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) {
 		if ((pr->pr_protocol == protocol) && (pr->pr_type == type))
 			return (pr);
 
 		if (type == SOCK_RAW && pr->pr_type == SOCK_RAW &&
-		    pr->pr_protocol == 0 && maybe == (struct protosw *)0)
+		    pr->pr_protocol == 0 && maybe == NULL)
 			maybe = pr;
 	}
 	return (maybe);
@@ -351,12 +346,10 @@
 		return (ENXIO);
 
 	/* Try to find the specified domain based on the family. */
-	for (dp = domains; dp; dp = dp->dom_next)
-		if (dp->dom_family == family)
-			goto found;
-	return (EPFNOSUPPORT);
+	dp = pffinddomain(family);
+	if (dp == NULL)
+		return (EPFNOSUPPORT);
 
-found:
 	/* Initialize backpointer to struct domain. */
 	npr->pr_domain = dp;
 	fpr = NULL;
@@ -422,12 +415,10 @@
 		return (EPROTOTYPE);
 
 	/* Try to find the specified domain based on the family type. */
-	for (dp = domains; dp; dp = dp->dom_next)
-		if (dp->dom_family == family)
-			goto found;
-	return (EPFNOSUPPORT);
+	dp = pffinddomain(family);
+	if (dp == NULL)
+		return (EPFNOSUPPORT);
 
-found:
 	dpr = NULL;
 
 	/* Lock out everyone else while we are manipulating the protosw. */

Modified: trunk/sys/kern/uipc_mbuf.c
===================================================================
--- trunk/sys/kern/uipc_mbuf.c	2018-05-26 14:25:17 UTC (rev 9953)
+++ trunk/sys/kern/uipc_mbuf.c	2018-05-26 14:25:55 UTC (rev 9954)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1982, 1986, 1988, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
@@ -30,7 +31,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/uipc_mbuf.c 308376 2016-11-06 16:44:33Z avos $");
 
 #include "opt_param.h"
 #include "opt_mbuf_stress_test.h"
@@ -85,6 +86,77 @@
 #endif
 
 /*
+ * Ensure the correct size of various mbuf parameters.  It could be off due
+ * to compiler-induced padding and alignment artifacts.
+ */
+CTASSERT(sizeof(struct mbuf) == MSIZE);
+CTASSERT(MSIZE - offsetof(struct mbuf, m_dat) == MLEN);
+CTASSERT(MSIZE - offsetof(struct mbuf, m_pktdat) == MHLEN);
+
+/*
+ * m_get2() allocates minimum mbuf that would fit "size" argument.
+ */
+struct mbuf *
+m_get2(int size, int how, short type, int flags)
+{
+	struct mb_args args;
+	struct mbuf *m, *n;
+
+	args.flags = flags;
+	args.type = type;
+
+	if (size <= MHLEN || (size <= MLEN && (flags & M_PKTHDR) == 0))
+		return (uma_zalloc_arg(zone_mbuf, &args, how));
+	if (size <= MCLBYTES)
+		return (uma_zalloc_arg(zone_pack, &args, how));
+
+	if (size > MJUMPAGESIZE)
+		return (NULL);
+
+	m = uma_zalloc_arg(zone_mbuf, &args, how);
+	if (m == NULL)
+		return (NULL);
+
+	n = uma_zalloc_arg(zone_jumbop, m, how);
+	if (n == NULL) {
+		uma_zfree(zone_mbuf, m);
+		return (NULL);
+	}
+
+	return (m);
+}
+
+/*
+ * m_getjcl() returns an mbuf with a cluster of the specified size attached.
+ * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES.
+ */
+struct mbuf *
+m_getjcl(int how, short type, int flags, int size)
+{
+	struct mb_args args;
+	struct mbuf *m, *n;
+	uma_zone_t zone;
+
+	if (size == MCLBYTES)
+		return m_getcl(how, type, flags);
+
+	args.flags = flags;
+	args.type = type;
+
+	m = uma_zalloc_arg(zone_mbuf, &args, how);
+	if (m == NULL)
+		return (NULL);
+
+	zone = m_getzone(size);
+	n = uma_zalloc_arg(zone, m, how);
+	if (n == NULL) {
+		uma_zfree(zone_mbuf, m);
+		return (NULL);
+	}
+	return (m);
+}
+
+/*
  * Allocate a given length worth of mbufs and/or clusters (whatever fits
  * best) and return a pointer to the top of the allocated chain.  If an
  * existing mbuf chain is provided, then we will append the new chain
@@ -182,25 +254,31 @@
  * Returns:
  *    Nothing.
  */
-void
+int
 m_extadd(struct mbuf *mb, caddr_t buf, u_int size,
-    void (*freef)(void *, void *), void *arg1, void *arg2, int flags, int type)
+    int (*freef)(struct mbuf *, void *, void *), void *arg1, void *arg2,
+    int flags, int type, int wait)
 {
 	KASSERT(type != EXT_CLUSTER, ("%s: EXT_CLUSTER not allowed", __func__));
 
 	if (type != EXT_EXTREF)
-		mb->m_ext.ref_cnt = (u_int *)uma_zalloc(zone_ext_refcnt, M_NOWAIT);
-	if (mb->m_ext.ref_cnt != NULL) {
-		*(mb->m_ext.ref_cnt) = 1;
-		mb->m_flags |= (M_EXT | flags);
-		mb->m_ext.ext_buf = buf;
-		mb->m_data = mb->m_ext.ext_buf;
-		mb->m_ext.ext_size = size;
-		mb->m_ext.ext_free = freef;
-		mb->m_ext.ext_arg1 = arg1;
-		mb->m_ext.ext_arg2 = arg2;
-		mb->m_ext.ext_type = type;
-        }
+		mb->m_ext.ref_cnt = uma_zalloc(zone_ext_refcnt, wait);
+
+	if (mb->m_ext.ref_cnt == NULL)
+		return (ENOMEM);
+
+	*(mb->m_ext.ref_cnt) = 1;
+	mb->m_flags |= (M_EXT | flags);
+	mb->m_ext.ext_buf = buf;
+	mb->m_data = mb->m_ext.ext_buf;
+	mb->m_ext.ext_size = size;
+	mb->m_ext.ext_free = freef;
+	mb->m_ext.ext_arg1 = arg1;
+	mb->m_ext.ext_arg2 = arg2;
+	mb->m_ext.ext_type = type;
+	mb->m_ext.ext_flags = 0;
+
+	return (0);
 }
 
 /*
@@ -215,12 +293,11 @@
 	KASSERT((m->m_flags & M_EXT) == M_EXT, ("%s: M_EXT not set", __func__));
 	KASSERT(m->m_ext.ref_cnt != NULL, ("%s: ref_cnt not set", __func__));
 
-
 	/*
 	 * check if the header is embedded in the cluster
-	 */     
+	 */
 	skipmbuf = (m->m_flags & M_NOFREE);
-	
+
 	/* Free attached storage if this mbuf is the only reference to it. */
 	if (*(m->m_ext.ref_cnt) == 1 ||
 	    atomic_fetchadd_int(m->m_ext.ref_cnt, -1) == 1) {
@@ -253,7 +330,7 @@
 		case EXT_EXTREF:
 			KASSERT(m->m_ext.ext_free != NULL,
 				("%s: ext_free not set", __func__));
-			(*(m->m_ext.ext_free))(m->m_ext.ext_arg1,
+			(void)(*(m->m_ext.ext_free))(m, m->m_ext.ext_arg1,
 			    m->m_ext.ext_arg2);
 			break;
 		default:
@@ -275,6 +352,7 @@
 	m->m_ext.ref_cnt = NULL;
 	m->m_ext.ext_size = 0;
 	m->m_ext.ext_type = 0;
+	m->m_ext.ext_flags = 0;
 	m->m_flags &= ~M_EXT;
 	uma_zfree(zone_mbuf, m);
 }
@@ -301,6 +379,7 @@
 	n->m_ext.ext_size = m->m_ext.ext_size;
 	n->m_ext.ref_cnt = m->m_ext.ref_cnt;
 	n->m_ext.ext_type = m->m_ext.ext_type;
+	n->m_ext.ext_flags = m->m_ext.ext_flags;
 	n->m_flags |= M_EXT;
 	n->m_flags |= m->m_flags & M_RDONLY;
 }
@@ -327,7 +406,7 @@
 			m_freem(m->m_nextpkt);
 			m->m_nextpkt = NULL;
 		}
-		m->m_flags = m->m_flags & (M_EXT|M_RDONLY|M_FREELIST|M_NOFREE);
+		m->m_flags = m->m_flags & (M_EXT|M_RDONLY|M_NOFREE);
 	}
 }
 
@@ -368,11 +447,6 @@
 			M_SANITY_ACTION("m_data outside mbuf data range right");
 		if ((caddr_t)m->m_data + m->m_len > b)
 			M_SANITY_ACTION("m_data + m_len exeeds mbuf space");
-		if ((m->m_flags & M_PKTHDR) && m->m_pkthdr.header) {
-			if ((caddr_t)m->m_pkthdr.header < a ||
-			    (caddr_t)m->m_pkthdr.header > b)
-				M_SANITY_ACTION("m_pkthdr.header outside mbuf data range");
-		}
 
 		/* m->m_nextpkt may only be set on first mbuf in chain. */
 		if (m != m0 && m->m_nextpkt != NULL) {
@@ -462,8 +536,8 @@
 #if 0
 	/*
 	 * The mbuf allocator only initializes the pkthdr
-	 * when the mbuf is allocated with MGETHDR. Many users
-	 * (e.g. m_copy*, m_prepend) use MGET and then
+	 * when the mbuf is allocated with m_gethdr(). Many users
+	 * (e.g. m_copy*, m_prepend) use m_get() and then
 	 * smash the pkthdr as needed causing these
 	 * assertions to trip.  For now just disable them.
 	 */
@@ -495,15 +569,15 @@
 	struct mbuf *mn;
 
 	if (m->m_flags & M_PKTHDR)
-		MGETHDR(mn, how, m->m_type);
+		mn = m_gethdr(how, m->m_type);
 	else
-		MGET(mn, how, m->m_type);
+		mn = m_get(how, m->m_type);
 	if (mn == NULL) {
 		m_freem(m);
 		return (NULL);
 	}
 	if (m->m_flags & M_PKTHDR)
-		M_MOVE_PKTHDR(mn, m);
+		m_move_pkthdr(mn, m);
 	mn->m_next = m;
 	m = mn;
 	if(m->m_flags & M_PKTHDR) {
@@ -520,7 +594,7 @@
 /*
  * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
  * continuing for "len" bytes.  If len is M_COPYALL, copy to end of mbuf.
- * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller.
+ * The wait parameter is a choice of M_WAITOK/M_NOWAIT from caller.
  * Note that the copy is read-only, because clusters are not copied,
  * only their reference counts are incremented.
  */
@@ -553,9 +627,9 @@
 			break;
 		}
 		if (copyhdr)
-			MGETHDR(n, wait, m->m_type);
+			n = m_gethdr(wait, m->m_type);
 		else
-			MGET(n, wait, m->m_type);
+			n = m_get(wait, m->m_type);
 		*np = n;
 		if (n == NULL)
 			goto nospace;
@@ -581,13 +655,10 @@
 		m = m->m_next;
 		np = &n->m_next;
 	}
-	if (top == NULL)
-		mbstat.m_mcfail++;	/* XXX: No consistency. */
 
 	return (top);
 nospace:
 	m_freem(top);
-	mbstat.m_mcfail++;	/* XXX: No consistency. */
 	return (NULL);
 }
 
@@ -679,7 +750,6 @@
 			return NULL;
 		bcopy(&buf, mm->m_ext.ext_buf, mm->m_len);
 		mm->m_data = mm->m_ext.ext_buf;
-		mm->m_pkthdr.header = NULL;
 	}
 	if (prep && !(mm->m_flags & M_EXT) && len > M_LEADINGSPACE(mm)) {
 		bcopy(mm->m_data, &buf, mm->m_len);
@@ -690,7 +760,6 @@
 		       mm->m_ext.ext_size - mm->m_len, mm->m_len);
 		mm->m_data = (caddr_t)mm->m_ext.ext_buf +
 			      mm->m_ext.ext_size - mm->m_len;
-		mm->m_pkthdr.header = NULL;
 	}
 
 	/* Append/prepend as many mbuf (clusters) as necessary to fit len. */
@@ -754,7 +823,7 @@
 	struct mbuf *top, *n, *o;
 
 	MBUF_CHECKSLEEP(how);
-	MGET(n, how, m->m_type);
+	n = m_get(how, m->m_type);
 	top = n;
 	if (n == NULL)
 		goto nospace;
@@ -772,7 +841,7 @@
 
 	m = m->m_next;
 	while (m) {
-		MGET(o, how, m->m_type);
+		o = m_get(how, m->m_type);
 		if (o == NULL)
 			goto nospace;
 
@@ -792,7 +861,6 @@
 	return top;
 nospace:
 	m_freem(top);
-	mbstat.m_mcfail++;	/* XXX: No consistency. */ 
 	return (NULL);
 }
 
@@ -867,6 +935,7 @@
 			}
 			if ((n->m_flags & M_EXT) == 0)
 				nsize = MHLEN;
+			n->m_flags &= ~M_RDONLY;
 		}
 		n->m_len = 0;
 
@@ -896,7 +965,6 @@
 
 nospace:
 	m_freem(top);
-	mbstat.m_mcfail++;	/* XXX: No consistency. */
 	return (NULL);
 }
 
@@ -911,8 +979,8 @@
 	while (m->m_next)
 		m = m->m_next;
 	while (n) {
-		if (m->m_flags & M_EXT ||
-		    m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
+		if (!M_WRITABLE(m) ||
+		    M_TRAILINGSPACE(m) < n->m_len) {
 			/* just join the two chains */
 			m->m_next = n;
 			return;
@@ -1000,8 +1068,8 @@
 
 /*
  * Rearange an mbuf chain so that len bytes are contiguous
- * and in the data area of an mbuf (so that mtod and dtom
- * will work for a structure of size len).  Returns the resulting
+ * and in the data area of an mbuf (so that mtod will work
+ * for a structure of size len).  Returns the resulting
  * mbuf chain on success, frees it and returns null on failure.
  * If there is room, it will add up to max_protohdr-len extra bytes to the
  * contiguous region in an attempt to avoid being called next time.
@@ -1028,12 +1096,11 @@
 	} else {
 		if (len > MHLEN)
 			goto bad;
-		MGET(m, M_DONTWAIT, n->m_type);
+		m = m_get(M_NOWAIT, n->m_type);
 		if (m == NULL)
 			goto bad;
-		m->m_len = 0;
 		if (n->m_flags & M_PKTHDR)
-			M_MOVE_PKTHDR(m, n);
+			m_move_pkthdr(m, n);
 	}
 	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
 	do {
@@ -1057,7 +1124,6 @@
 	return (m);
 bad:
 	m_freem(n);
-	mbstat.m_mpfail++;	/* XXX: No consistency. */
 	return (NULL);
 }
 
@@ -1076,12 +1142,11 @@
 
 	if (len > (MHLEN - dstoff))
 		goto bad;
-	MGET(m, M_DONTWAIT, n->m_type);
+	m = m_get(M_NOWAIT, n->m_type);
 	if (m == NULL)
 		goto bad;
-	m->m_len = 0;
 	if (n->m_flags & M_PKTHDR)
-		M_MOVE_PKTHDR(m, n);
+		m_move_pkthdr(m, n);
 	m->m_data += dstoff;
 	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
 	do {
@@ -1131,13 +1196,23 @@
 	if (m == NULL)
 		return (NULL);
 	remain = m->m_len - len;
-	if (m0->m_flags & M_PKTHDR) {
-		MGETHDR(n, wait, m0->m_type);
+	if (m0->m_flags & M_PKTHDR && remain == 0) {
+		n = m_gethdr(wait, m0->m_type);
 		if (n == NULL)
 			return (NULL);
+		n->m_next = m->m_next;
+		m->m_next = NULL;
 		n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
 		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
 		m0->m_pkthdr.len = len0;
+		return (n);
+	} else if (m0->m_flags & M_PKTHDR) {
+		n = m_gethdr(wait, m0->m_type);
+		if (n == NULL)
+			return (NULL);
+		n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
+		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
+		m0->m_pkthdr.len = len0;
 		if (m->m_flags & M_EXT)
 			goto extpacket;
 		if (remain > MHLEN) {
@@ -1158,7 +1233,7 @@
 		m->m_next = NULL;
 		return (n);
 	} else {
-		MGET(n, wait, m->m_type);
+		n = m_get(wait, m->m_type);
 		if (n == NULL)
 			return (NULL);
 		M_ALIGN(n, remain);
@@ -1195,10 +1270,10 @@
 	while (totlen > 0) {
 		if (top == NULL) {	/* First one, must be PKTHDR */
 			if (totlen + off >= MINCLSIZE) {
-				m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
+				m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 				len = MCLBYTES;
 			} else {
-				m = m_gethdr(M_DONTWAIT, MT_DATA);
+				m = m_gethdr(M_NOWAIT, MT_DATA);
 				len = MHLEN;
 
 				/* Place initial small packet/header at end of mbuf */
@@ -1213,10 +1288,10 @@
 			m->m_pkthdr.len = totlen;
 		} else {
 			if (totlen + off >= MINCLSIZE) {
-				m = m_getcl(M_DONTWAIT, MT_DATA, 0);
+				m = m_getcl(M_NOWAIT, MT_DATA, 0);
 				len = MCLBYTES;
 			} else {
-				m = m_get(M_DONTWAIT, MT_DATA);
+				m = m_get(M_NOWAIT, MT_DATA);
 				len = MLEN;
 			}
 			if (m == NULL) {
@@ -1260,7 +1335,7 @@
 		off -= mlen;
 		totlen += mlen;
 		if (m->m_next == NULL) {
-			n = m_get(M_DONTWAIT, m->m_type);
+			n = m_get(M_NOWAIT, m->m_type);
 			if (n == NULL)
 				goto out;
 			bzero(mtod(n, caddr_t), MLEN);
@@ -1284,7 +1359,7 @@
 		if (len == 0)
 			break;
 		if (m->m_next == NULL) {
-			n = m_get(M_DONTWAIT, m->m_type);
+			n = m_get(M_NOWAIT, m->m_type);
 			if (n == NULL)
 				break;
 			n->m_len = min(MLEN, len);
@@ -1328,7 +1403,7 @@
 		 * Allocate a new mbuf; could check space
 		 * and allocate a cluster instead.
 		 */
-		n = m_get(M_DONTWAIT, m->m_type);
+		n = m_get(M_NOWAIT, m->m_type);
 		if (n == NULL)
 			break;
 		n->m_len = min(MLEN, remainder);
@@ -1584,7 +1659,7 @@
 		n = m->m_next;
 		if (n == NULL)
 			break;
-		if ((m->m_flags & M_RDONLY) == 0 &&
+		if (M_WRITABLE(m) &&
 		    n->m_len < M_TRAILINGSPACE(m)) {
 			bcopy(mtod(n, void *), mtod(m, char *) + m->m_len,
 				n->m_len);
@@ -1811,14 +1886,22 @@
 void
 m_align(struct mbuf *m, int len)
 {
+#ifdef INVARIANTS
+	const char *msg = "%s: not a virgin mbuf";
+#endif
 	int adjust;
 
-	if (m->m_flags & M_EXT)
+	if (m->m_flags & M_EXT) {
+		KASSERT(m->m_data == m->m_ext.ext_buf, (msg, __func__));
 		adjust = m->m_ext.ext_size - len;
-	else if (m->m_flags & M_PKTHDR)
+	} else if (m->m_flags & M_PKTHDR) {
+		KASSERT(m->m_data == m->m_pktdat, (msg, __func__));
 		adjust = MHLEN - len;
-	else
+	} else {
+		KASSERT(m->m_data == m->m_dat, (msg, __func__));
 		adjust = MLEN - len;
+	}
+
 	m->m_data += adjust &~ (sizeof(long)-1);
 }
 
@@ -1900,43 +1983,23 @@
 		}
 
 		/*
-		 * Allocate new space to hold the copy...
+		 * Allocate new space to hold the copy and copy the data.
+		 * We deal with jumbo mbufs (i.e. m_len > MCLBYTES) by
+		 * splitting them into clusters.  We could just malloc a
+		 * buffer and make it external but too many device drivers
+		 * don't know how to break up the non-contiguous memory when
+		 * doing DMA.
 		 */
-		/* XXX why can M_PKTHDR be set past the first mbuf? */
-		if (mprev == NULL && (m->m_flags & M_PKTHDR)) {
-			/*
-			 * NB: if a packet header is present we must
-			 * allocate the mbuf separately from any cluster
-			 * because M_MOVE_PKTHDR will smash the data
-			 * pointer and drop the M_EXT marker.
-			 */
-			MGETHDR(n, how, m->m_type);
-			if (n == NULL) {
-				m_freem(m0);
-				return (NULL);
-			}
-			M_MOVE_PKTHDR(n, m);
-			MCLGET(n, how);
-			if ((n->m_flags & M_EXT) == 0) {
-				m_free(n);
-				m_freem(m0);
-				return (NULL);
-			}
-		} else {
-			n = m_getcl(how, m->m_type, m->m_flags);
-			if (n == NULL) {
-				m_freem(m0);
-				return (NULL);
-			}
+		n = m_getcl(how, m->m_type, m->m_flags & M_COPYFLAGS);
+		if (n == NULL) {
+			m_freem(m0);
+			return (NULL);
 		}
-		/*
-		 * ... and copy the data.  We deal with jumbo mbufs
-		 * (i.e. m_len > MCLBYTES) by splitting them into
-		 * clusters.  We could just malloc a buffer and make
-		 * it external but too many device drivers don't know
-		 * how to break up the non-contiguous memory when
-		 * doing DMA.
-		 */
+		if (m->m_flags & M_PKTHDR) {
+			KASSERT(mprev == NULL, ("%s: m0 %p, m %p has M_PKTHDR",
+			    __func__, m0, m));
+			m_move_pkthdr(n, m);
+		}
 		len = m->m_len;
 		off = 0;
 		mfirst = n;
@@ -1957,7 +2020,7 @@
 				break;
 			off += cc;
 
-			n = m_getcl(how, m->m_type, m->m_flags);
+			n = m_getcl(how, m->m_type, m->m_flags & M_COPYFLAGS);
 			if (n == NULL) {
 				m_freem(mfirst);
 				m_freem(m0);

Modified: trunk/sys/kern/uipc_mbuf2.c
===================================================================
--- trunk/sys/kern/uipc_mbuf2.c	2018-05-26 14:25:17 UTC (rev 9953)
+++ trunk/sys/kern/uipc_mbuf2.c	2018-05-26 14:25:55 UTC (rev 9954)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*	$KAME: uipc_mbuf2.c,v 1.31 2001/11/28 11:08:53 itojun Exp $	*/
 /*	$NetBSD: uipc_mbuf.c,v 1.40 1999/04/01 00:23:25 thorpej Exp $	*/
 
@@ -61,7 +62,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/uipc_mbuf2.c 312442 2017-01-19 23:44:05Z rpokala $");
 
 /*#define PULLDOWN_DEBUG*/
 
@@ -141,7 +142,7 @@
 	 *      Ideally, the requirement should only be (iii).
 	 *
 	 * If we're writable, we're sure we're writable, because the ref. count
-	 * cannot increase from 1, as that would require posession of mbuf
+	 * cannot increase from 1, as that would require possession of mbuf
 	 * n by someone else (which is impossible). However, if we're _not_
 	 * writable, we may eventually become writable )if the ref. count drops
 	 * to 1), but we'll fail to notice it unless we re-evaluate
@@ -161,7 +162,7 @@
 	 * the target data is on <n, off>.
 	 * if we got enough data on the mbuf "n", we're done.
 	 */
-	if ((off == 0 || offp) && len <= n->m_len - off && writable)
+	if ((off == 0 || offp) && len <= n->m_len - off)
 		goto ok;
 
 	/*
@@ -171,7 +172,7 @@
 	 * chop the current mbuf into two pieces, set off to 0.
 	 */
 	if (len <= n->m_len - off) {
-		o = m_dup1(n, off, n->m_len - off, M_DONTWAIT);
+		o = m_dup1(n, off, n->m_len - off, M_NOWAIT);
 		if (o == NULL) {
 			m_freem(m);
 			return NULL;	/* ENOBUFS */
@@ -231,9 +232,9 @@
 	 * on both end.
 	 */
 	if (len > MLEN)
-		o = m_getcl(M_DONTWAIT, m->m_type, 0);
+		o = m_getcl(M_NOWAIT, m->m_type, 0);
 	else
-		o = m_get(M_DONTWAIT, m->m_type);
+		o = m_get(M_NOWAIT, m->m_type);
 	if (!o) {
 		m_freem(m);
 		return NULL;	/* ENOBUFS */

Modified: trunk/sys/kern/uipc_mqueue.c
===================================================================
--- trunk/sys/kern/uipc_mqueue.c	2018-05-26 14:25:17 UTC (rev 9953)
+++ trunk/sys/kern/uipc_mqueue.c	2018-05-26 14:25:55 UTC (rev 9954)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 2005 David Xu <davidxu at freebsd.org>
  * All rights reserved.
@@ -43,8 +44,9 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/uipc_mqueue.c 325783 2017-11-13 23:21:17Z jamie $");
 
+#include "opt_capsicum.h"
 #include "opt_compat.h"
 
 #include <sys/param.h>
@@ -51,8 +53,9 @@
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/limits.h>
+#include <sys/malloc.h>
 #include <sys/buf.h>
-#include <sys/capability.h>
+#include <sys/capsicum.h>
 #include <sys/dirent.h>
 #include <sys/event.h>
 #include <sys/eventhandler.h>
@@ -59,8 +62,8 @@
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
+#include <sys/jail.h>
 #include <sys/lock.h>
-#include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/mount.h>
 #include <sys/mqueue.h>
@@ -130,6 +133,7 @@
 	LIST_HEAD(,mqfs_node)	mn_children;
 	LIST_ENTRY(mqfs_node)	mn_sibling;
 	LIST_HEAD(,mqfs_vdata)	mn_vnodes;
+	const void		*mn_pr_root;
 	int			mn_refcount;
 	mqfs_type_t		mn_type;
 	int			mn_deleted;
@@ -217,6 +221,7 @@
 static uma_zone_t		mqnoti_zone;
 static struct vop_vector	mqfs_vnodeops;
 static struct fileops		mqueueops;
+static unsigned			mqfs_osd_jail_slot;
 
 /*
  * Directory structure construction and manipulation
@@ -234,6 +239,7 @@
 static void	mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn);
 static void	mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn);
 static int	mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn);
+static int	mqfs_prison_remove(void *obj, void *data);
 
 /*
  * Message queue construction and maniplation
@@ -434,6 +440,7 @@
 
 	node = mqnode_alloc();
 	strncpy(node->mn_name, name, namelen);
+	node->mn_pr_root = cred->cr_prison->pr_root;
 	node->mn_type = nodetype;
 	node->mn_refcount = 1;
 	vfs_timestamp(&node->mn_birth);
@@ -582,7 +589,6 @@
 	mp->mnt_data = &mqfs_data;
 	MNT_ILOCK(mp);
 	mp->mnt_flag |= MNT_LOCAL;
-	mp->mnt_kern_flag |= MNTK_MPSAFE;
 	MNT_IUNLOCK(mp);
 	vfs_getnewfsid(mp);
 
@@ -643,6 +649,9 @@
 {
 	struct mqfs_node *root;
 	struct mqfs_info *mi;
+	osd_method_t methods[PR_MAXMETHOD] = {
+	    [PR_METHOD_REMOVE] = mqfs_prison_remove,
+	};
 
 	mqnode_zone = uma_zcreate("mqnode", sizeof(struct mqfs_node),
 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
@@ -669,6 +678,7 @@
 	    EVENTHANDLER_PRI_ANY);
 	mq_fdclose = mqueue_fdclose;
 	p31b_setcfg(CTL_P1003_1B_MESSAGE_PASSING, _POSIX_MESSAGE_PASSING);
+	mqfs_osd_jail_slot = osd_jail_register(NULL, methods);
 	return (0);
 }
 
@@ -682,6 +692,7 @@
 
 	if (!unloadable)
 		return (EOPNOTSUPP);
+	osd_jail_deregister(mqfs_osd_jail_slot);
 	EVENTHANDLER_DEREGISTER(process_exit, exit_tag);
 	mi = &mqfs_data;
 	mqfs_destroy(mi->mi_root);
@@ -703,7 +714,9 @@
 {
 	struct vnode *vp = (struct vnode *)context;
 
-	vrecycle(vp, curthread);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+	vrecycle(vp);
+	VOP_UNLOCK(vp, 0);
 	vdrop(vp);
 }
 
@@ -799,13 +812,17 @@
  * Search a directory entry
  */
 static struct mqfs_node *
-mqfs_search(struct mqfs_node *pd, const char *name, int len)
+mqfs_search(struct mqfs_node *pd, const char *name, int len, struct ucred *cred)
 {
 	struct mqfs_node *pn;
+	const void *pr_root;
 
 	sx_assert(&pd->mn_info->mi_lock, SX_LOCKED);
+	pr_root = cred->cr_prison->pr_root;
 	LIST_FOREACH(pn, &pd->mn_children, mn_sibling) {
-		if (strncmp(pn->mn_name, name, len) == 0 &&
+		/* Only match names within the same prison root directory */
+		if ((pn->mn_pr_root == NULL || pn->mn_pr_root == pr_root) &&
+		    strncmp(pn->mn_name, name, len) == 0 &&
 		    pn->mn_name[len] == '\0')
 			return (pn);
 	}
@@ -877,7 +894,7 @@
 
 	/* named node */
 	sx_xlock(&mqfs->mi_lock);
-	pn = mqfs_search(pd, pname, namelen);
+	pn = mqfs_search(pd, pname, namelen, cnp->cn_cred);
 	if (pn != NULL)
 		mqnode_addref(pn);
 	sx_xunlock(&mqfs->mi_lock);
@@ -1065,7 +1082,7 @@
 	struct mqfs_node *pn = VTON(ap->a_vp);
 
 	if (pn->mn_deleted)
-		vrecycle(ap->a_vp, ap->a_td);
+		vrecycle(ap->a_vp);
 	return (0);
 }
 
@@ -1362,6 +1379,7 @@
 	struct mqfs_node *pn;
 	struct dirent entry;
 	struct uio *uio;
+	const void *pr_root;
 	int *tmp_ncookies = NULL;
 	off_t offset;
 	int error, i;
@@ -1386,10 +1404,18 @@
 	error = 0;
 	offset = 0;
 
+	pr_root = ap->a_cred->cr_prison->pr_root;
 	sx_xlock(&mi->mi_lock);
 
 	LIST_FOREACH(pn, &pd->mn_children, mn_sibling) {
 		entry.d_reclen = sizeof(entry);
+
+		/*
+		 * Only show names within the same prison root directory
+		 * (or not associated with a prison, e.g. "." and "..").
+		 */
+		if (pn->mn_pr_root != NULL && pn->mn_pr_root != pr_root)
+			continue;
 		if (!pn->mn_fileno)
 			mqfs_fileno_alloc(mi, pn);
 		entry.d_fileno = pn->mn_fileno;
@@ -1523,6 +1549,38 @@
 #endif /* notyet */
 
 /*
+ * See if this prison root is obsolete, and clean up associated queues if it is.
+ */
+static int
+mqfs_prison_remove(void *obj, void *data __unused)
+{
+	const struct prison *pr = obj;
+	const struct prison *tpr;
+	struct mqfs_node *pn, *tpn;
+	int found;
+
+	found = 0;
+	TAILQ_FOREACH(tpr, &allprison, pr_list) {
+		if (tpr->pr_root == pr->pr_root && tpr != pr && tpr->pr_ref > 0)
+			found = 1;
+	}
+	if (!found) {
+		/*
+		 * No jails are rooted in this directory anymore,
+		 * so no queues should be either.
+		 */
+		sx_xlock(&mqfs_data.mi_lock);
+		LIST_FOREACH_SAFE(pn, &mqfs_data.mi_root->mn_children,
+		    mn_sibling, tpn) {
+			if (pn->mn_pr_root == pr->pr_root)
+				(void)do_unlink(pn, curthread->td_ucred);
+		}
+		sx_xunlock(&mqfs_data.mi_lock);
+	}
+	return (0);
+}
+
+/*
  * Allocate a message queue
  */
 static struct mqueue *
@@ -1974,7 +2032,7 @@
 	 * characters. 
 	 */
 	len = strlen(path);
-	if (len < 2  || path[0] != '/' || index(path + 1, '/') != NULL)
+	if (len < 2 || path[0] != '/' || strchr(path + 1, '/') != NULL)
 		return (EINVAL);
 
 	error = falloc(td, &fp, &fd, O_CLOEXEC);
@@ -1982,7 +2040,7 @@
 		return (error);
 
 	sx_xlock(&mqfs_data.mi_lock);
-	pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1);
+	pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1, td->td_ucred);
 	if (pn == NULL) {
 		if (!(flags & O_CREAT)) {
 			error = ENOENT;
@@ -2021,7 +2079,7 @@
 
 	if (error) {
 		sx_xunlock(&mqfs_data.mi_lock);
-		fdclose(fdp, fp, fd, td);
+		fdclose(td, fp, fd);
 		fdrop(fp, td);
 		return (error);
 	}
@@ -2046,7 +2104,7 @@
 	struct mq_attr attr;
 	int flags, error;
 
-	if ((uap->flags & O_ACCMODE) == O_ACCMODE)
+	if ((uap->flags & O_ACCMODE) == O_ACCMODE || uap->flags & O_EXEC)
 		return (EINVAL);
 	flags = FFLAGS(uap->flags);
 	if ((flags & O_CREAT) != 0 && uap->attr != NULL) {
@@ -2073,11 +2131,11 @@
 		return (error);
 
 	len = strlen(path);
-	if (len < 2  || path[0] != '/' || index(path + 1, '/') != NULL)
+	if (len < 2 || path[0] != '/' || strchr(path + 1, '/') != NULL)
 		return (EINVAL);
 
 	sx_xlock(&mqfs_data.mi_lock);
-	pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1);
+	pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1, td->td_ucred);
 	if (pn != NULL)
 		error = do_unlink(pn, td->td_ucred);
 	else
@@ -2086,19 +2144,19 @@
 	return (error);
 }
 
-typedef int (*_fgetf)(struct thread *, int, cap_rights_t, struct file **);
+typedef int (*_fgetf)(struct thread *, int, cap_rights_t *, struct file **);
 
 /*
  * Get message queue by giving file slot
  */
 static int
-_getmq(struct thread *td, int fd, cap_rights_t rights, _fgetf func,
+_getmq(struct thread *td, int fd, cap_rights_t *rightsp, _fgetf func,
        struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq)
 {
 	struct mqfs_node *pn;
 	int error;
 
-	error = func(td, fd, rights, fpp);
+	error = func(td, fd, rightsp, fpp);
 	if (error)
 		return (error);
 	if (&mqueueops != (*fpp)->f_ops) {
@@ -2117,7 +2175,10 @@
 getmq(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn,
 	struct mqueue **pmq)
 {
-	return _getmq(td, fd, CAP_POLL_EVENT, fget, fpp, ppn, pmq);
+	cap_rights_t rights;
+
+	return _getmq(td, fd, cap_rights_init(&rights, CAP_EVENT), fget,
+	    fpp, ppn, pmq);
 }
 
 static __inline int
@@ -2124,7 +2185,10 @@
 getmq_read(struct thread *td, int fd, struct file **fpp,
 	 struct mqfs_node **ppn, struct mqueue **pmq)
 {
-	return _getmq(td, fd, CAP_READ, fget_read, fpp, ppn, pmq);
+	cap_rights_t rights;
+
+	return _getmq(td, fd, cap_rights_init(&rights, CAP_READ), fget_read,
+	    fpp, ppn, pmq);
 }
 
 static __inline int
@@ -2131,7 +2195,10 @@
 getmq_write(struct thread *td, int fd, struct file **fpp,
 	struct mqfs_node **ppn, struct mqueue **pmq)
 {
-	return _getmq(td, fd, CAP_WRITE, fget_write, fpp, ppn, pmq);
+	cap_rights_t rights;
+
+	return _getmq(td, fd, cap_rights_init(&rights, CAP_WRITE), fget_write,
+	    fpp, ppn, pmq);
 }
 
 static int
@@ -2177,10 +2244,10 @@
 	}
 	error = kern_kmq_setattr(td, uap->mqd, uap->attr != NULL ? &attr : NULL,
 	    &oattr);
-	if (error != 0)
-		return (error);
-	if (uap->oattr != NULL)
+	if (error == 0 && uap->oattr != NULL) {
+		bzero(oattr.__reserved, sizeof(oattr.__reserved));
 		error = copyout(&oattr, uap->oattr, sizeof(oattr));
+	}
 	return (error);
 }
 
@@ -2235,10 +2302,12 @@
 	return (error);
 }
 
-int
-sys_kmq_notify(struct thread *td, struct kmq_notify_args *uap)
+static int
+kern_kmq_notify(struct thread *td, int mqd, struct sigevent *sigev)
 {
-	struct sigevent ev;
+#ifdef CAPABILITIES
+	cap_rights_t rights;
+#endif
 	struct filedesc *fdp;
 	struct proc *p;
 	struct mqueue *mq;
@@ -2246,37 +2315,37 @@
 	struct mqueue_notifier *nt, *newnt = NULL;
 	int error;
 
-	p = td->td_proc;
-	fdp = td->td_proc->p_fd;
-	if (uap->sigev) {
-		error = copyin(uap->sigev, &ev, sizeof(ev));
-		if (error)
-			return (error);
-		if (ev.sigev_notify != SIGEV_SIGNAL &&
-		    ev.sigev_notify != SIGEV_THREAD_ID &&
-		    ev.sigev_notify != SIGEV_NONE)
+	if (sigev != NULL) {
+		if (sigev->sigev_notify != SIGEV_SIGNAL &&
+		    sigev->sigev_notify != SIGEV_THREAD_ID &&
+		    sigev->sigev_notify != SIGEV_NONE)
 			return (EINVAL);
-		if ((ev.sigev_notify == SIGEV_SIGNAL ||
-		     ev.sigev_notify == SIGEV_THREAD_ID) &&
-			!_SIG_VALID(ev.sigev_signo))
+		if ((sigev->sigev_notify == SIGEV_SIGNAL ||
+		    sigev->sigev_notify == SIGEV_THREAD_ID) &&
+		    !_SIG_VALID(sigev->sigev_signo))
 			return (EINVAL);
 	}
-	error = getmq(td, uap->mqd, &fp, NULL, &mq);
+	p = td->td_proc;
+	fdp = td->td_proc->p_fd;
+	error = getmq(td, mqd, &fp, NULL, &mq);
 	if (error)
 		return (error);
 again:
 	FILEDESC_SLOCK(fdp);
-	fp2 = fget_locked(fdp, uap->mqd);
+	fp2 = fget_locked(fdp, mqd);
 	if (fp2 == NULL) {
 		FILEDESC_SUNLOCK(fdp);
 		error = EBADF;
 		goto out;
 	}
-	error = cap_funwrap(fp2, CAP_POLL_EVENT, &fp2);
+#ifdef CAPABILITIES
+	error = cap_check(cap_rights(fdp, mqd),
+	    cap_rights_init(&rights, CAP_EVENT));
 	if (error) {
 		FILEDESC_SUNLOCK(fdp);
 		goto out;
 	}
+#endif
 	if (fp2 != fp) {
 		FILEDESC_SUNLOCK(fdp);
 		error = EBADF;
@@ -2284,12 +2353,12 @@
 	}
 	mtx_lock(&mq->mq_mutex);
 	FILEDESC_SUNLOCK(fdp);
-	if (uap->sigev != NULL) {
+	if (sigev != NULL) {
 		if (mq->mq_notifier != NULL) {
 			error = EBUSY;
 		} else {
 			PROC_LOCK(p);
-			nt = notifier_search(p, uap->mqd);
+			nt = notifier_search(p, mqd);
 			if (nt == NULL) {
 				if (newnt == NULL) {
 					PROC_UNLOCK(p);
@@ -2312,10 +2381,10 @@
 				nt->nt_ksi.ksi_flags |= KSI_INS | KSI_EXT;
 				nt->nt_ksi.ksi_code = SI_MESGQ;
 				nt->nt_proc = p;
-				nt->nt_ksi.ksi_mqd = uap->mqd;
+				nt->nt_ksi.ksi_mqd = mqd;
 				notifier_insert(p, nt);
 			}
-			nt->nt_sigev = ev;
+			nt->nt_sigev = *sigev;
 			mq->mq_notifier = nt;
 			PROC_UNLOCK(p);
 			/*
@@ -2328,7 +2397,7 @@
 				mqueue_send_notification(mq);
 		}
 	} else {
-		notifier_remove(p, mq, uap->mqd);
+		notifier_remove(p, mq, mqd);
 	}
 	mtx_unlock(&mq->mq_mutex);
 
@@ -2339,6 +2408,23 @@
 	return (error);
 }
 
+int
+sys_kmq_notify(struct thread *td, struct kmq_notify_args *uap)
+{
+	struct sigevent ev, *evp;
+	int error;
+
+	if (uap->sigev == NULL) {
+		evp = NULL;
+	} else {
+		error = copyin(uap->sigev, &ev, sizeof(ev));
+		if (error != 0)
+			return (error);
+		evp = &ev;
+	}
+	return (kern_kmq_notify(td, uap->mqd, evp));
+}
+
 static void
 mqueue_fdclose(struct thread *td, int fd, struct file *fp)
 {
@@ -2582,7 +2668,8 @@
 	.fo_stat		= mqf_stat,
 	.fo_chmod		= mqf_chmod,
 	.fo_chown		= mqf_chown,
-	.fo_close		= mqf_close
+	.fo_close		= mqf_close,
+	.fo_sendfile		= invfo_sendfile,
 };
 
 static struct vop_vector mqfs_vnodeops = {
@@ -2635,6 +2722,7 @@
 #ifdef COMPAT_FREEBSD32
 #include <compat/freebsd32/freebsd32.h>
 #include <compat/freebsd32/freebsd32_proto.h>
+#include <compat/freebsd32/freebsd32_signal.h>
 #include <compat/freebsd32/freebsd32_syscall.h>
 #include <compat/freebsd32/freebsd32_util.h>
 
@@ -2665,7 +2753,7 @@
 	struct mq_attr32 attr32;
 	int flags, error;
 
-	if ((uap->flags & O_ACCMODE) == O_ACCMODE)
+	if ((uap->flags & O_ACCMODE) == O_ACCMODE || uap->flags & O_EXEC)
 		return (EINVAL);
 	flags = FFLAGS(uap->flags);
 	if ((flags & O_CREAT) != 0 && uap->attr != NULL) {
@@ -2693,10 +2781,9 @@
 	}
 	error = kern_kmq_setattr(td, uap->mqd, uap->attr != NULL ? &attr : NULL,
 	    &oattr);
-	if (error != 0)
-		return (error);
-	if (uap->oattr != NULL) {
+	if (error == 0 && uap->oattr != NULL) {
 		mq_attr_to32(&oattr, &oattr32);
+		bzero(oattr32.__reserved, sizeof(oattr32.__reserved));
 		error = copyout(&oattr32, uap->oattr, sizeof(oattr32));
 	}
 	return (error);
@@ -2713,7 +2800,7 @@
 	int error;
 	int waitok;
 
-	error = getmq_read(td, uap->mqd, &fp, NULL, &mq);
+	error = getmq_write(td, uap->mqd, &fp, NULL, &mq);
 	if (error)
 		return (error);
 	if (uap->abs_timeout != NULL) {
@@ -2742,7 +2829,7 @@
 	struct timespec *abs_timeout, ets;
 	int error, waitok;
 
-	error = getmq_write(td, uap->mqd, &fp, NULL, &mq);
+	error = getmq_read(td, uap->mqd, &fp, NULL, &mq);
 	if (error)
 		return (error);
 	if (uap->abs_timeout != NULL) {
@@ -2761,12 +2848,33 @@
 	return (error);
 }
 
+int
+freebsd32_kmq_notify(struct thread *td, struct freebsd32_kmq_notify_args *uap)
+{
+	struct sigevent ev, *evp;
+	struct sigevent32 ev32;
+	int error;
+
+	if (uap->sigev == NULL) {
+		evp = NULL;
+	} else {
+		error = copyin(uap->sigev, &ev32, sizeof(ev32));
+		if (error != 0)
+			return (error);
+		error = convert_sigevent32(&ev32, &ev);
+		if (error != 0)
+			return (error);
+		evp = &ev;
+	}
+	return (kern_kmq_notify(td, uap->mqd, evp));
+}
+
 static struct syscall_helper_data mq32_syscalls[] = {
 	SYSCALL32_INIT_HELPER(freebsd32_kmq_open),
 	SYSCALL32_INIT_HELPER(freebsd32_kmq_setattr),
 	SYSCALL32_INIT_HELPER(freebsd32_kmq_timedsend),
 	SYSCALL32_INIT_HELPER(freebsd32_kmq_timedreceive),
-	SYSCALL32_INIT_HELPER_COMPAT(kmq_notify),
+	SYSCALL32_INIT_HELPER(freebsd32_kmq_notify),
 	SYSCALL32_INIT_HELPER_COMPAT(kmq_unlink),
 	SYSCALL_INIT_LAST
 };

Modified: trunk/sys/kern/uipc_sem.c
===================================================================
--- trunk/sys/kern/uipc_sem.c	2018-05-26 14:25:17 UTC (rev 9953)
+++ trunk/sys/kern/uipc_sem.c	2018-05-26 14:25:55 UTC (rev 9954)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 2002 Alfred Perlstein <alfred at FreeBSD.org>
  * Copyright (c) 2003-2005 SPARTA, Inc.
@@ -32,18 +33,19 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/uipc_sem.c 325783 2017-11-13 23:21:17Z jamie $");
 
 #include "opt_compat.h"
 #include "opt_posix.h"
 
 #include <sys/param.h>
-#include <sys/capability.h>
+#include <sys/capsicum.h>
 #include <sys/condvar.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/fnv_hash.h>
+#include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/ksem.h>
 #include <sys/lock.h>
@@ -71,7 +73,6 @@
  * TODO
  *
  * - Resource limits?
- * - Update fstat(1)
  * - Replace global sem_lock with mtx_pool locks?
  * - Add a MAC check_create() hook for creating new named semaphores.
  */
@@ -117,7 +118,7 @@
 		    semid_t *semidp, mode_t mode, unsigned int value,
 		    int flags, int compat32);
 static void	ksem_drop(struct ksem *ks);
-static int	ksem_get(struct thread *td, semid_t id, cap_rights_t rights,
+static int	ksem_get(struct thread *td, semid_t id, cap_rights_t *rightsp,
     struct file **fpp);
 static struct ksem *ksem_hold(struct ksem *ks);
 static void	ksem_insert(char *path, Fnv32_t fnv, struct ksem *ks);
@@ -150,6 +151,7 @@
 	.fo_close = ksem_closef,
 	.fo_chmod = ksem_chmod,
 	.fo_chown = ksem_chown,
+	.fo_sendfile = invfo_sendfile,
 	.fo_flags = DFLAG_PASSABLE
 };
 
@@ -407,6 +409,7 @@
 	map->km_path = path;
 	map->km_fnv = fnv;
 	map->km_ksem = ksem_hold(ks);
+	ks->ks_path = path;
 	LIST_INSERT_HEAD(KSEM_HASH(fnv), map, km_link);
 }
 
@@ -428,6 +431,7 @@
 			error = ksem_access(map->km_ksem, ucred);
 			if (error)
 				return (error);
+			map->km_ksem->ks_path = NULL;
 			LIST_REMOVE(map, km_link);
 			ksem_drop(map->km_ksem);
 			free(map->km_path, M_KSEM);
@@ -439,6 +443,32 @@
 	return (ENOENT);
 }
 
+static void
+ksem_info_impl(struct ksem *ks, char *path, size_t size, uint32_t *value)
+{
+	const char *ks_path, *pr_path;
+	size_t pr_pathlen;
+
+	if (ks->ks_path == NULL)
+		return;
+	sx_slock(&ksem_dict_lock);
+	ks_path = ks->ks_path;
+	if (ks_path != NULL) {
+		pr_path = curthread->td_ucred->cr_prison->pr_path;
+		if (strcmp(pr_path, "/") != 0) {
+			/* Return the jail-rooted pathname. */
+			pr_pathlen = strlen(pr_path);
+			if (strncmp(ks_path, pr_path, pr_pathlen) == 0 &&
+			    ks_path[pr_pathlen] == '/')
+				ks_path += pr_pathlen;
+		}
+		strlcpy(path, ks_path, size);
+	}
+	if (value != NULL)
+		*value = ks->ks_value;
+	sx_sunlock(&ksem_dict_lock);
+}
+
 static int
 ksem_create_copyout_semid(struct thread *td, semid_t *semidp, int fd,
     int compat32)
@@ -477,6 +507,8 @@
 	struct ksem *ks;
 	struct file *fp;
 	char *path;
+	const char *pr_path;
+	size_t pr_pathlen;
 	Fnv32_t fnv;
 	int error, fd;
 
@@ -499,7 +531,7 @@
 	 */
 	error = ksem_create_copyout_semid(td, semidp, fd, compat32);
 	if (error) {
-		fdclose(fdp, fp, fd, td);
+		fdclose(td, fp, fd);
 		fdrop(fp, td);
 		return (error);
 	}
@@ -513,13 +545,19 @@
 			ks->ks_flags |= KS_ANONYMOUS;
 	} else {
 		path = malloc(MAXPATHLEN, M_KSEM, M_WAITOK);
-		error = copyinstr(name, path, MAXPATHLEN, NULL);
+		pr_path = td->td_ucred->cr_prison->pr_path;
 
+		/* Construct a full pathname for jailed callers. */
+		pr_pathlen = strcmp(pr_path, "/") == 0 ? 0
+		    : strlcpy(path, pr_path, MAXPATHLEN);
+		error = copyinstr(name, path + pr_pathlen,
+		    MAXPATHLEN - pr_pathlen, NULL);
+
 		/* Require paths to start with a '/' character. */
-		if (error == 0 && path[0] != '/')
+		if (error == 0 && path[pr_pathlen] != '/')
 			error = EINVAL;
 		if (error) {
-			fdclose(fdp, fp, fd, td);
+			fdclose(td, fp, fd);
 			fdrop(fp, td);
 			free(path, M_KSEM);
 			return (error);
@@ -570,7 +608,7 @@
 
 	if (error) {
 		KASSERT(ks == NULL, ("ksem_create error with a ksem"));
-		fdclose(fdp, fp, fd, td);
+		fdclose(td, fp, fd);
 		fdrop(fp, td);
 		return (error);
 	}
@@ -584,13 +622,14 @@
 }
 
 static int
-ksem_get(struct thread *td, semid_t id, cap_rights_t rights, struct file **fpp)
+ksem_get(struct thread *td, semid_t id, cap_rights_t *rightsp,
+    struct file **fpp)
 {
 	struct ksem *ks;
 	struct file *fp;
 	int error;
 
-	error = fget(td, id, rights, &fp);
+	error = fget(td, id, rightsp, &fp);
 	if (error)
 		return (EINVAL);
 	if (fp->f_type != DTYPE_SEM) {
@@ -651,11 +690,17 @@
 sys_ksem_unlink(struct thread *td, struct ksem_unlink_args *uap)
 {
 	char *path;
+	const char *pr_path;
+	size_t pr_pathlen;
 	Fnv32_t fnv;
 	int error;
 
 	path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
-	error = copyinstr(uap->name, path, MAXPATHLEN, NULL);
+	pr_path = td->td_ucred->cr_prison->pr_path;
+	pr_pathlen = strcmp(pr_path, "/") == 0 ? 0
+	    : strlcpy(path, pr_path, MAXPATHLEN);
+	error = copyinstr(uap->name, path + pr_pathlen, MAXPATHLEN - pr_pathlen,
+	    NULL);
 	if (error) {
 		free(path, M_TEMP);
 		return (error);
@@ -704,11 +749,13 @@
 int
 sys_ksem_post(struct thread *td, struct ksem_post_args *uap)
 {
+	cap_rights_t rights;
 	struct file *fp;
 	struct ksem *ks;
 	int error;
 
-	error = ksem_get(td, uap->id, CAP_SEM_POST, &fp);
+	error = ksem_get(td, uap->id,
+	    cap_rights_init(&rights, CAP_SEM_POST), &fp);
 	if (error)
 		return (error);
 	ks = fp->f_data;
@@ -793,12 +840,13 @@
 {
 	struct timespec ts1, ts2;
 	struct timeval tv;
+	cap_rights_t rights;
 	struct file *fp;
 	struct ksem *ks;
 	int error;
 
 	DP((">>> kern_sem_wait entered! pid=%d\n", (int)td->td_proc->p_pid));
-	error = ksem_get(td, id, CAP_SEM_WAIT, &fp);
+	error = ksem_get(td, id, cap_rights_init(&rights, CAP_SEM_WAIT), &fp);
 	if (error)
 		return (error);
 	ks = fp->f_data;
@@ -860,11 +908,13 @@
 int
 sys_ksem_getvalue(struct thread *td, struct ksem_getvalue_args *uap)
 {
+	cap_rights_t rights;
 	struct file *fp;
 	struct ksem *ks;
 	int error, val;
 
-	error = ksem_get(td, uap->id, CAP_SEM_GETVALUE, &fp);
+	error = ksem_get(td, uap->id,
+	    cap_rights_init(&rights, CAP_SEM_GETVALUE), &fp);
 	if (error)
 		return (error);
 	ks = fp->f_data;
@@ -1014,6 +1064,7 @@
 	p31b_setcfg(CTL_P1003_1B_SEMAPHORES, 200112L);
 	p31b_setcfg(CTL_P1003_1B_SEM_NSEMS_MAX, SEM_MAX);
 	p31b_setcfg(CTL_P1003_1B_SEM_VALUE_MAX, SEM_VALUE_MAX);
+	ksem_info = ksem_info_impl;
 
 	error = syscall_helper_register(ksem_syscalls);
 	if (error)
@@ -1035,6 +1086,7 @@
 #endif
 	syscall_helper_unregister(ksem_syscalls);
 
+	ksem_info = NULL;
 	p31b_setcfg(CTL_P1003_1B_SEMAPHORES, 0);
 	hashdestroy(ksem_dictionary, M_KSEM, ksem_hash);
 	sx_destroy(&ksem_dict_lock);

Modified: trunk/sys/kern/uipc_shm.c
===================================================================
--- trunk/sys/kern/uipc_shm.c	2018-05-26 14:25:17 UTC (rev 9953)
+++ trunk/sys/kern/uipc_shm.c	2018-05-26 14:25:55 UTC (rev 9954)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 2006, 2011 Robert N. M. Watson
  * All rights reserved.
@@ -42,17 +43,23 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/uipc_shm.c 325783 2017-11-13 23:21:17Z jamie $");
 
 #include "opt_capsicum.h"
+#include "opt_ktrace.h"
 
 #include <sys/param.h>
-#include <sys/capability.h>
+#include <sys/capsicum.h>
+#include <sys/conf.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/fnv_hash.h>
 #include <sys/kernel.h>
+#include <sys/uio.h>
+#include <sys/signal.h>
+#include <sys/jail.h>
+#include <sys/ktrace.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mman.h>
@@ -61,6 +68,7 @@
 #include <sys/proc.h>
 #include <sys/refcount.h>
 #include <sys/resourcevar.h>
+#include <sys/rwlock.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
@@ -68,6 +76,7 @@
 #include <sys/sx.h>
 #include <sys/time.h>
 #include <sys/vnode.h>
+#include <sys/unistd.h>
 
 #include <security/mac/mac_framework.h>
 
@@ -95,12 +104,14 @@
 static struct sx shm_dict_lock;
 static struct mtx shm_timestamp_lock;
 static u_long shm_hash;
+static struct unrhdr *shm_ino_unr;
+static dev_t shm_dev_ino;
 
 #define	SHM_HASH(fnv)	(&shm_dictionary[(fnv) & shm_hash])
 
 static int	shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags);
 static struct shmfd *shm_alloc(struct ucred *ucred, mode_t mode);
-static void	shm_dict_init(void *arg);
+static void	shm_init(void *arg);
 static void	shm_drop(struct shmfd *shmfd);
 static struct shmfd *shm_hold(struct shmfd *shmfd);
 static void	shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd);
@@ -118,6 +129,7 @@
 static fo_close_t	shm_close;
 static fo_chmod_t	shm_chmod;
 static fo_chown_t	shm_chown;
+static fo_seek_t	shm_seek;
 
 /* File descriptor operations. */
 static struct fileops shm_ops = {
@@ -131,17 +143,170 @@
 	.fo_close = shm_close,
 	.fo_chmod = shm_chmod,
 	.fo_chown = shm_chown,
-	.fo_flags = DFLAG_PASSABLE
+	.fo_sendfile = vn_sendfile,
+	.fo_seek = shm_seek,
+	.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
 };
 
 FEATURE(posix_shm, "POSIX shared memory");
 
 static int
+uiomove_object_page(vm_object_t obj, size_t len, struct uio *uio)
+{
+	vm_page_t m;
+	vm_pindex_t idx;
+	size_t tlen;
+	int error, offset, rv;
+
+	idx = OFF_TO_IDX(uio->uio_offset);
+	offset = uio->uio_offset & PAGE_MASK;
+	tlen = MIN(PAGE_SIZE - offset, len);
+
+	VM_OBJECT_WLOCK(obj);
+
+	/*
+	 * Parallel reads of the page content from disk are prevented
+	 * by exclusive busy.
+	 *
+	 * Although the tmpfs vnode lock is held here, it is
+	 * nonetheless safe to sleep waiting for a free page.  The
+	 * pageout daemon does not need to acquire the tmpfs vnode
+	 * lock to page out tobj's pages because tobj is a OBJT_SWAP
+	 * type object.
+	 */
+	m = vm_page_grab(obj, idx, VM_ALLOC_NORMAL);
+	if (m->valid != VM_PAGE_BITS_ALL) {
+		if (vm_pager_has_page(obj, idx, NULL, NULL)) {
+			rv = vm_pager_get_pages(obj, &m, 1, 0);
+			m = vm_page_lookup(obj, idx);
+			if (m == NULL) {
+				printf(
+		    "uiomove_object: vm_obj %p idx %jd null lookup rv %d\n",
+				    obj, idx, rv);
+				VM_OBJECT_WUNLOCK(obj);
+				return (EIO);
+			}
+			if (rv != VM_PAGER_OK) {
+				printf(
+	    "uiomove_object: vm_obj %p idx %jd valid %x pager error %d\n",
+				    obj, idx, m->valid, rv);
+				vm_page_lock(m);
+				vm_page_free(m);
+				vm_page_unlock(m);
+				VM_OBJECT_WUNLOCK(obj);
+				return (EIO);
+			}
+		} else
+			vm_page_zero_invalid(m, TRUE);
+	}
+	vm_page_xunbusy(m);
+	vm_page_lock(m);
+	vm_page_hold(m);
+	if (m->queue == PQ_NONE) {
+		vm_page_deactivate(m);
+	} else {
+		/* Requeue to maintain LRU ordering. */
+		vm_page_requeue(m);
+	}
+	vm_page_unlock(m);
+	VM_OBJECT_WUNLOCK(obj);
+	error = uiomove_fromphys(&m, offset, tlen, uio);
+	if (uio->uio_rw == UIO_WRITE && error == 0) {
+		VM_OBJECT_WLOCK(obj);
+		vm_page_dirty(m);
+		vm_pager_page_unswapped(m);
+		VM_OBJECT_WUNLOCK(obj);
+	}
+	vm_page_lock(m);
+	vm_page_unhold(m);
+	vm_page_unlock(m);
+
+	return (error);
+}
+
+int
+uiomove_object(vm_object_t obj, off_t obj_size, struct uio *uio)
+{
+	ssize_t resid;
+	size_t len;
+	int error;
+
+	error = 0;
+	while ((resid = uio->uio_resid) > 0) {
+		if (obj_size <= uio->uio_offset)
+			break;
+		len = MIN(obj_size - uio->uio_offset, resid);
+		if (len == 0)
+			break;
+		error = uiomove_object_page(obj, len, uio);
+		if (error != 0 || resid == uio->uio_resid)
+			break;
+	}
+	return (error);
+}
+
+static int
+shm_seek(struct file *fp, off_t offset, int whence, struct thread *td)
+{
+	struct shmfd *shmfd;
+	off_t foffset;
+	int error;
+
+	shmfd = fp->f_data;
+	foffset = foffset_lock(fp, 0);
+	error = 0;
+	switch (whence) {
+	case L_INCR:
+		if (foffset < 0 ||
+		    (offset > 0 && foffset > OFF_MAX - offset)) {
+			error = EOVERFLOW;
+			break;
+		}
+		offset += foffset;
+		break;
+	case L_XTND:
+		if (offset > 0 && shmfd->shm_size > OFF_MAX - offset) {
+			error = EOVERFLOW;
+			break;
+		}
+		offset += shmfd->shm_size;
+		break;
+	case L_SET:
+		break;
+	default:
+		error = EINVAL;
+	}
+	if (error == 0) {
+		if (offset < 0 || offset > shmfd->shm_size)
+			error = EINVAL;
+		else
+			*(off_t *)(td->td_retval) = offset;
+	}
+	foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0);
+	return (error);
+}
+
+static int
 shm_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
+	struct shmfd *shmfd;
+	void *rl_cookie;
+	int error;
 
-	return (EOPNOTSUPP);
+	shmfd = fp->f_data;
+#ifdef MAC
+	error = mac_posixshm_check_read(active_cred, fp->f_cred, shmfd);
+	if (error)
+		return (error);
+#endif
+	foffset_lock_uio(fp, uio, flags);
+	rl_cookie = rangelock_rlock(&shmfd->shm_rl, uio->uio_offset,
+	    uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx);
+	error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio);
+	rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
+	foffset_unlock_uio(fp, uio, flags);
+	return (error);
 }
 
 static int
@@ -148,8 +313,29 @@
 shm_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
+	struct shmfd *shmfd;
+	void *rl_cookie;
+	int error;
 
-	return (EOPNOTSUPP);
+	shmfd = fp->f_data;
+#ifdef MAC
+	error = mac_posixshm_check_write(active_cred, fp->f_cred, shmfd);
+	if (error)
+		return (error);
+#endif
+	foffset_lock_uio(fp, uio, flags);
+	if ((flags & FOF_OFFSET) == 0) {
+		rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX,
+		    &shmfd->shm_mtx);
+	} else {
+		rl_cookie = rangelock_wlock(&shmfd->shm_rl, uio->uio_offset,
+		    uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx);
+	}
+
+	error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio);
+	rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
+	foffset_unlock_uio(fp, uio, flags);
+	return (error);
 }
 
 static int
@@ -227,6 +413,8 @@
 	sb->st_uid = shmfd->shm_uid;
 	sb->st_gid = shmfd->shm_gid;
 	mtx_unlock(&shm_timestamp_lock);
+	sb->st_dev = shm_dev_ino;
+	sb->st_ino = shmfd->shm_ino;
 
 	return (0);
 }
@@ -252,10 +440,11 @@
 	vm_ooffset_t delta;
 	int base, rv;
 
+	KASSERT(length >= 0, ("shm_dotruncate: length < 0"));
 	object = shmfd->shm_object;
-	VM_OBJECT_LOCK(object);
+	VM_OBJECT_WLOCK(object);
 	if (length == shmfd->shm_size) {
-		VM_OBJECT_UNLOCK(object);
+		VM_OBJECT_WUNLOCK(object);
 		return (0);
 	}
 	nobjsize = OFF_TO_IDX(length + PAGE_MASK);
@@ -267,7 +456,7 @@
 		 * object is mapped into the kernel.
 		 */
 		if (shmfd->shm_kmappings > 0) {
-			VM_OBJECT_UNLOCK(object);
+			VM_OBJECT_WUNLOCK(object);
 			return (EBUSY);
 		}
 
@@ -280,17 +469,14 @@
 retry:
 			m = vm_page_lookup(object, idx);
 			if (m != NULL) {
-				if ((m->oflags & VPO_BUSY) != 0 ||
-				    m->busy != 0) {
-					vm_page_sleep(m, "shmtrc");
+				if (vm_page_sleep_if_busy(m, "shmtrc"))
 					goto retry;
-				}
 			} else if (vm_pager_has_page(object, idx, NULL, NULL)) {
 				m = vm_page_alloc(object, idx, VM_ALLOC_NORMAL);
 				if (m == NULL) {
-					VM_OBJECT_UNLOCK(object);
+					VM_OBJECT_WUNLOCK(object);
 					VM_WAIT;
-					VM_OBJECT_LOCK(object);
+					VM_OBJECT_WLOCK(object);
 					goto retry;
 				} else if (m->valid != VM_PAGE_BITS_ALL) {
 					ma[0] = m;
@@ -304,11 +490,11 @@
 				if (rv == VM_PAGER_OK) {
 					vm_page_deactivate(m);
 					vm_page_unlock(m);
-					vm_page_wakeup(m);
+					vm_page_xunbusy(m);
 				} else {
 					vm_page_free(m);
 					vm_page_unlock(m);
-					VM_OBJECT_UNLOCK(object);
+					VM_OBJECT_WUNLOCK(object);
 					return (EIO);
 				}
 			}
@@ -320,7 +506,7 @@
 				vm_pager_page_unswapped(m);
 			}
 		}
-		delta = ptoa(object->size - nobjsize);
+		delta = IDX_TO_OFF(object->size - nobjsize);
 
 		/* Toss in memory pages. */
 		if (nobjsize < object->size)
@@ -335,10 +521,10 @@
 		swap_release_by_cred(delta, object->cred);
 		object->charge -= delta;
 	} else {
-		/* Attempt to reserve the swap */
-		delta = ptoa(nobjsize - object->size);
+		/* Try to reserve additional swap space. */
+		delta = IDX_TO_OFF(nobjsize - object->size);
 		if (!swap_reserve_by_cred(delta, object->cred)) {
-			VM_OBJECT_UNLOCK(object);
+			VM_OBJECT_WUNLOCK(object);
 			return (ENOMEM);
 		}
 		object->charge += delta;
@@ -349,7 +535,7 @@
 	shmfd->shm_mtime = shmfd->shm_ctime;
 	mtx_unlock(&shm_timestamp_lock);
 	object->size = nobjsize;
-	VM_OBJECT_UNLOCK(object);
+	VM_OBJECT_WUNLOCK(object);
 	return (0);
 }
 
@@ -361,6 +547,7 @@
 shm_alloc(struct ucred *ucred, mode_t mode)
 {
 	struct shmfd *shmfd;
+	int ino;
 
 	shmfd = malloc(sizeof(*shmfd), M_SHMFD, M_WAITOK | M_ZERO);
 	shmfd->shm_size = 0;
@@ -370,14 +557,22 @@
 	shmfd->shm_object = vm_pager_allocate(OBJT_DEFAULT, NULL,
 	    shmfd->shm_size, VM_PROT_DEFAULT, 0, ucred);
 	KASSERT(shmfd->shm_object != NULL, ("shm_create: vm_pager_allocate"));
-	VM_OBJECT_LOCK(shmfd->shm_object);
+	shmfd->shm_object->pg_color = 0;
+	VM_OBJECT_WLOCK(shmfd->shm_object);
 	vm_object_clear_flag(shmfd->shm_object, OBJ_ONEMAPPING);
-	vm_object_set_flag(shmfd->shm_object, OBJ_NOSPLIT);
-	VM_OBJECT_UNLOCK(shmfd->shm_object);
+	vm_object_set_flag(shmfd->shm_object, OBJ_COLORED | OBJ_NOSPLIT);
+	VM_OBJECT_WUNLOCK(shmfd->shm_object);
 	vfs_timestamp(&shmfd->shm_birthtime);
 	shmfd->shm_atime = shmfd->shm_mtime = shmfd->shm_ctime =
 	    shmfd->shm_birthtime;
+	ino = alloc_unr(shm_ino_unr);
+	if (ino == -1)
+		shmfd->shm_ino = 0;
+	else
+		shmfd->shm_ino = ino;
 	refcount_init(&shmfd->shm_refs, 1);
+	mtx_init(&shmfd->shm_mtx, "shmrl", NULL, MTX_DEF);
+	rangelock_init(&shmfd->shm_rl);
 #ifdef MAC
 	mac_posixshm_init(shmfd);
 	mac_posixshm_create(ucred, shmfd);
@@ -402,7 +597,11 @@
 #ifdef MAC
 		mac_posixshm_destroy(shmfd);
 #endif
+		rangelock_destroy(&shmfd->shm_rl);
+		mtx_destroy(&shmfd->shm_mtx);
 		vm_object_deallocate(shmfd->shm_object);
+		if (shmfd->shm_ino != 0)
+			free_unr(shm_ino_unr, shmfd->shm_ino);
 		free(shmfd, M_SHMFD);
 	}
 }
@@ -435,14 +634,18 @@
  * the mappings in a hash table.
  */
 static void
-shm_dict_init(void *arg)
+shm_init(void *arg)
 {
 
 	mtx_init(&shm_timestamp_lock, "shm timestamps", NULL, MTX_DEF);
 	sx_init(&shm_dict_lock, "shm dictionary");
 	shm_dictionary = hashinit(1024, M_SHMFD, &shm_hash);
+	shm_ino_unr = new_unrhdr(1, INT32_MAX, NULL);
+	KASSERT(shm_ino_unr != NULL, ("shm fake inodes not initialized"));
+	shm_dev_ino = devfs_alloc_cdp_inode();
+	KASSERT(shm_dev_ino > 0, ("shm dev inode not initialized"));
 }
-SYSINIT(shm_dict_init, SI_SUB_SYSV_SHM, SI_ORDER_ANY, shm_dict_init, NULL);
+SYSINIT(shm_init, SI_SUB_SYSV_SHM, SI_ORDER_ANY, shm_init, NULL);
 
 static struct shmfd *
 shm_lookup(char *path, Fnv32_t fnv)
@@ -511,6 +714,8 @@
 	struct shmfd *shmfd;
 	struct file *fp;
 	char *path;
+	const char *pr_path;
+	size_t pr_pathlen;
 	Fnv32_t fnv;
 	mode_t cmode;
 	int fd, error;
@@ -527,7 +732,7 @@
 	    (uap->flags & O_ACCMODE) != O_RDWR)
 		return (EINVAL);
 
-	if ((uap->flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC)) != 0)
+	if ((uap->flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC | O_CLOEXEC)) != 0)
 		return (EINVAL);
 
 	fdp = td->td_proc->p_fd;
@@ -541,7 +746,7 @@
 	if (uap->path == SHM_ANON) {
 		/* A read-only anonymous object is pointless. */
 		if ((uap->flags & O_ACCMODE) == O_RDONLY) {
-			fdclose(fdp, fp, fd, td);
+			fdclose(td, fp, fd);
 			fdrop(fp, td);
 			return (EINVAL);
 		}
@@ -548,13 +753,22 @@
 		shmfd = shm_alloc(td->td_ucred, cmode);
 	} else {
 		path = malloc(MAXPATHLEN, M_SHMFD, M_WAITOK);
-		error = copyinstr(uap->path, path, MAXPATHLEN, NULL);
+		pr_path = td->td_ucred->cr_prison->pr_path;
 
+		/* Construct a full pathname for jailed callers. */
+		pr_pathlen = strcmp(pr_path, "/") == 0 ? 0
+		    : strlcpy(path, pr_path, MAXPATHLEN);
+		error = copyinstr(uap->path, path + pr_pathlen,
+		    MAXPATHLEN - pr_pathlen, NULL);
+#ifdef KTRACE
+		if (error == 0 && KTRPOINT(curthread, KTR_NAMEI))
+			ktrnamei(path);
+#endif
 		/* Require paths to start with a '/' character. */
-		if (error == 0 && path[0] != '/')
+		if (error == 0 && path[pr_pathlen] != '/')
 			error = EINVAL;
 		if (error) {
-			fdclose(fdp, fp, fd, td);
+			fdclose(td, fp, fd);
 			fdrop(fp, td);
 			free(path, M_SHMFD);
 			return (error);
@@ -620,7 +834,7 @@
 		sx_xunlock(&shm_dict_lock);
 
 		if (error) {
-			fdclose(fdp, fp, fd, td);
+			fdclose(td, fp, fd);
 			fdrop(fp, td);
 			return (error);
 		}
@@ -638,16 +852,25 @@
 sys_shm_unlink(struct thread *td, struct shm_unlink_args *uap)
 {
 	char *path;
+	const char *pr_path;
+	size_t pr_pathlen;
 	Fnv32_t fnv;
 	int error;
 
 	path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
-	error = copyinstr(uap->path, path, MAXPATHLEN, NULL);
+	pr_path = td->td_ucred->cr_prison->pr_path;
+	pr_pathlen = strcmp(pr_path, "/") == 0 ? 0
+	    : strlcpy(path, pr_path, MAXPATHLEN);
+	error = copyinstr(uap->path, path + pr_pathlen, MAXPATHLEN - pr_pathlen,
+	    NULL);
 	if (error) {
 		free(path, M_TEMP);
 		return (error);
 	}
-
+#ifdef KTRACE
+	if (KTRPOINT(curthread, KTR_NAMEI))
+		ktrnamei(path);
+#endif
 	fnv = fnv_32_str(path, FNV1_32_INIT);
 	sx_xlock(&shm_dict_lock);
 	error = shm_remove(path, fnv, td->td_ucred);
@@ -757,7 +980,7 @@
 		return (EINVAL);
 	shmfd = fp->f_data;
 	obj = shmfd->shm_object;
-	VM_OBJECT_LOCK(obj);
+	VM_OBJECT_WLOCK(obj);
 	/*
 	 * XXXRW: This validation is probably insufficient, and subject to
 	 * sign errors.  It should be fixed.
@@ -764,13 +987,13 @@
 	 */
 	if (offset >= shmfd->shm_size ||
 	    offset + size > round_page(shmfd->shm_size)) {
-		VM_OBJECT_UNLOCK(obj);
+		VM_OBJECT_WUNLOCK(obj);
 		return (EINVAL);
 	}
 
 	shmfd->shm_kmappings++;
 	vm_object_reference_locked(obj);
-	VM_OBJECT_UNLOCK(obj);
+	VM_OBJECT_WUNLOCK(obj);
 
 	/* Map the object into the kernel_map and wire it. */
 	kva = vm_map_min(kernel_map);
@@ -777,8 +1000,8 @@
 	ofs = offset & PAGE_MASK;
 	offset = trunc_page(offset);
 	size = round_page(size + ofs);
-	rv = vm_map_find(kernel_map, obj, offset, &kva, size,
-	    VMFS_ALIGNED_SPACE, VM_PROT_READ | VM_PROT_WRITE,
+	rv = vm_map_find(kernel_map, obj, offset, &kva, size, 0,
+	    VMFS_OPTIMAL_SPACE, VM_PROT_READ | VM_PROT_WRITE,
 	    VM_PROT_READ | VM_PROT_WRITE, 0);
 	if (rv == KERN_SUCCESS) {
 		rv = vm_map_wire(kernel_map, kva, kva + size,
@@ -792,9 +1015,9 @@
 		vm_object_deallocate(obj);
 
 	/* On failure, drop our mapping reference. */
-	VM_OBJECT_LOCK(obj);
+	VM_OBJECT_WLOCK(obj);
 	shmfd->shm_kmappings--;
-	VM_OBJECT_UNLOCK(obj);
+	VM_OBJECT_WUNLOCK(obj);
 
 	return (vm_mmap_to_errno(rv));
 }
@@ -836,10 +1059,10 @@
 	if (obj != shmfd->shm_object)
 		return (EINVAL);
 	vm_map_remove(map, kva, kva + size);
-	VM_OBJECT_LOCK(obj);
+	VM_OBJECT_WLOCK(obj);
 	KASSERT(shmfd->shm_kmappings > 0, ("shm_unmap: object not mapped"));
 	shmfd->shm_kmappings--;
-	VM_OBJECT_UNLOCK(obj);
+	VM_OBJECT_WUNLOCK(obj);
 	return (0);
 }
 
@@ -846,11 +1069,23 @@
 void
 shm_path(struct shmfd *shmfd, char *path, size_t size)
 {
+	const char *shm_path, *pr_path;
+	size_t pr_pathlen;
 
 	if (shmfd->shm_path == NULL)
 		return;
 	sx_slock(&shm_dict_lock);
-	if (shmfd->shm_path != NULL)
-		strlcpy(path, shmfd->shm_path, size);
+	shm_path = shmfd->shm_path;
+	if (shm_path != NULL) {
+		pr_path = curthread->td_ucred->cr_prison->pr_path;
+		if (strcmp(pr_path, "/") != 0) {
+			/* Return the jail-rooted pathname. */
+			pr_pathlen = strlen(pr_path);
+			if (strncmp(shm_path, pr_path, pr_pathlen) == 0 &&
+			    shm_path[pr_pathlen] == '/')
+				shm_path += pr_pathlen;
+		}
+		strlcpy(path, shm_path, size);
+	}
 	sx_sunlock(&shm_dict_lock);
 }

Modified: trunk/sys/kern/uipc_sockbuf.c
===================================================================
--- trunk/sys/kern/uipc_sockbuf.c	2018-05-26 14:25:17 UTC (rev 9953)
+++ trunk/sys/kern/uipc_sockbuf.c	2018-05-26 14:25:55 UTC (rev 9954)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1982, 1986, 1988, 1990, 1993
  *	The Regents of the University of California.  All rights reserved.
@@ -30,7 +31,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/uipc_sockbuf.c 274043 2014-11-03 12:38:29Z hselasky $");
 
 #include "opt_param.h"
 
@@ -65,7 +66,7 @@
 
 static	u_long sb_efficiency = 8;	/* parameter for sbreserve() */
 
-static void	sbdrop_internal(struct sockbuf *sb, int len);
+static struct mbuf	*sbcut_internal(struct sockbuf *sb, int len);
 static void	sbflush_internal(struct sockbuf *sb);
 
 /*
@@ -127,9 +128,9 @@
 	SOCKBUF_LOCK_ASSERT(sb);
 
 	sb->sb_flags |= SB_WAIT;
-	return (msleep(&sb->sb_cc, &sb->sb_mtx,
+	return (msleep_sbt(&sb->sb_cc, &sb->sb_mtx,
 	    (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait",
-	    sb->sb_timeo));
+	    sb->sb_timeo, 0, 0));
 }
 
 int
@@ -188,7 +189,7 @@
 	}
 	KNOTE_LOCKED(&sb->sb_sel.si_note, 0);
 	if (sb->sb_upcall != NULL) {
-		ret = sb->sb_upcall(so, sb->sb_upcallarg, M_DONTWAIT);
+		ret = sb->sb_upcall(so, sb->sb_upcallarg, M_NOWAIT);
 		if (ret == SU_ISCONNECTED) {
 			KASSERT(sb == &so->so_rcv,
 			    ("SO_SND upcall returned SU_ISCONNECTED"));
@@ -528,6 +529,9 @@
 
 	SBLASTMBUFCHK(sb);
 
+	/* Remove all packet headers and mbuf tags to get a pure data chain. */
+	m_demote(m, 1);
+	
 	sbcompress(sb, m, sb->sb_mbtail);
 
 	sb->sb_lastrecord = sb->sb_mb;
@@ -617,40 +621,23 @@
 	SOCKBUF_UNLOCK(sb);
 }
 
-/*
- * Append address and data, and optionally, control (ancillary) data to the
- * receive queue of a socket.  If present, m0 must include a packet header
- * with total length.  Returns 0 if no space in sockbuf or insufficient
- * mbufs.
- */
-int
-sbappendaddr_locked(struct sockbuf *sb, const struct sockaddr *asa,
-    struct mbuf *m0, struct mbuf *control)
+/* Helper routine that appends data, control, and address to a sockbuf. */
+static int
+sbappendaddr_locked_internal(struct sockbuf *sb, const struct sockaddr *asa,
+    struct mbuf *m0, struct mbuf *control, struct mbuf *ctrl_last)
 {
 	struct mbuf *m, *n, *nlast;
-	int space = asa->sa_len;
-
-	SOCKBUF_LOCK_ASSERT(sb);
-
-	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
-		panic("sbappendaddr_locked");
-	if (m0)
-		space += m0->m_pkthdr.len;
-	space += m_length(control, &n);
-
-	if (space > sbspace(sb))
-		return (0);
 #if MSIZE <= 256
 	if (asa->sa_len > MLEN)
 		return (0);
 #endif
-	MGET(m, M_DONTWAIT, MT_SONAME);
-	if (m == 0)
+	m = m_get(M_NOWAIT, MT_SONAME);
+	if (m == NULL)
 		return (0);
 	m->m_len = asa->sa_len;
 	bcopy(asa, mtod(m, caddr_t), asa->sa_len);
-	if (n)
-		n->m_next = m0;		/* concatenate data to control */
+	if (ctrl_last)
+		ctrl_last->m_next = m0;	/* concatenate data to control */
 	else
 		control = m0;
 	m->m_next = control;
@@ -674,6 +661,50 @@
  * mbufs.
  */
 int
+sbappendaddr_locked(struct sockbuf *sb, const struct sockaddr *asa,
+    struct mbuf *m0, struct mbuf *control)
+{
+	struct mbuf *ctrl_last;
+	int space = asa->sa_len;	/* start the tally with the address length */
+
+	SOCKBUF_LOCK_ASSERT(sb);
+
+	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
+		panic("sbappendaddr_locked");	/* data chain must carry a packet header */
+	if (m0)
+		space += m0->m_pkthdr.len;	/* add the data length from the header */
+	space += m_length(control, &ctrl_last);	/* add control length; ctrl_last goes to the helper */
+
+	if (space > sbspace(sb))
+		return (0);	/* address + data + control would not fit */
+	return (sbappendaddr_locked_internal(sb, asa, m0, control, ctrl_last));
+}
+
+/*
+ * Append address and data, and optionally, control (ancillary) data to the
+ * receive queue of a socket.  If present, m0 must include a packet header
+ * with total length.  Returns 0 if insufficient mbufs.  Does not validate space
+ * on the receiving sockbuf.
+ */
+int
+sbappendaddr_nospacecheck_locked(struct sockbuf *sb, const struct sockaddr *asa,
+    struct mbuf *m0, struct mbuf *control)
+{
+	struct mbuf *ctrl_last;
+
+	SOCKBUF_LOCK_ASSERT(sb);
+
+	ctrl_last = (control == NULL) ? NULL : m_last(control);	/* NULL when there is no control chain */
+	return (sbappendaddr_locked_internal(sb, asa, m0, control, ctrl_last));	/* no sbspace() test on this path */
+}
+
+/*
+ * Append address and data, and optionally, control (ancillary) data to the
+ * receive queue of a socket.  If present, m0 must include a packet header
+ * with total length.  Returns 0 if no space in sockbuf or insufficient
+ * mbufs.
+ */
+int
 sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa,
     struct mbuf *m0, struct mbuf *control)
 {
@@ -815,7 +846,7 @@
 		 */
 		if (!sb->sb_cc && (sb->sb_mb == NULL || sb->sb_mb->m_len))
 			break;
-		sbdrop_internal(sb, (int)sb->sb_cc);
+		m_freem(sbcut_internal(sb, (int)sb->sb_cc));
 	}
 	if (sb->sb_cc || sb->sb_mb || sb->sb_mbcnt)
 		panic("sbflush_internal: cc %u || mb %p || mbcnt %u",
@@ -840,15 +871,16 @@
 }
 
 /*
- * Drop data from (the front of) a sockbuf.
+ * Cut data from (the front of) a sockbuf.
  */
-static void
-sbdrop_internal(struct sockbuf *sb, int len)
+static struct mbuf *
+sbcut_internal(struct sockbuf *sb, int len)
 {
-	struct mbuf *m;
-	struct mbuf *next;
+	struct mbuf *m, *n, *next, *mfree;
 
 	next = (m = sb->sb_mb) ? m->m_nextpkt : 0;
+	mfree = NULL;
+
 	while (len > 0) {
 		if (m == 0) {
 			if (next == 0)
@@ -869,11 +901,17 @@
 		}
 		len -= m->m_len;
 		sbfree(sb, m);
-		m = m_free(m);
+		n = m->m_next;
+		m->m_next = mfree;
+		mfree = m;
+		m = n;
 	}
 	while (m && m->m_len == 0) {
 		sbfree(sb, m);
-		m = m_free(m);
+		n = m->m_next;
+		m->m_next = mfree;
+		mfree = m;
+		m = n;
 	}
 	if (m) {
 		sb->sb_mb = m;
@@ -891,6 +929,8 @@
 	} else if (m->m_nextpkt == NULL) {
 		sb->sb_lastrecord = m;
 	}
+
+	return (mfree);
 }
 
 /*
@@ -901,17 +941,31 @@
 {
 
 	SOCKBUF_LOCK_ASSERT(sb);
+	m_freem(sbcut_internal(sb, len));
+}
 
-	sbdrop_internal(sb, len);
+/*
+ * Drop data from (the front of) a sockbuf,
+ * and return it to caller.
+ */
+struct mbuf *
+sbcut_locked(struct sockbuf *sb, int len)
+{
+
+	SOCKBUF_LOCK_ASSERT(sb);
+	return (sbcut_internal(sb, len));	/* the cut chain is returned, not freed here */
 }
 
 void
 sbdrop(struct sockbuf *sb, int len)
 {
+	struct mbuf *mfree;
 
 	SOCKBUF_LOCK(sb);
-	sbdrop_locked(sb, len);
+	mfree = sbcut_internal(sb, len);
 	SOCKBUF_UNLOCK(sb);
+
+	m_freem(mfree);
 }
 
 /*
@@ -939,6 +993,13 @@
 	/* Return closest mbuf in chain for current offset. */
 	*moff = off - sb->sb_sndptroff;
 	m = ret = sb->sb_sndptr ? sb->sb_sndptr : sb->sb_mb;
+	if (*moff == m->m_len) {
+		*moff = 0;
+		sb->sb_sndptroff += m->m_len;
+		m = ret = m->m_next;
+		KASSERT(ret->m_len > 0,
+		    ("mbuf %p in sockbuf %p chain has no valid data", ret, sb));
+	}
 
 	/* Advance by len to be as close as possible for the next transmit. */
 	for (off = off - sb->sb_sndptroff + len - 1;
@@ -955,6 +1016,37 @@
 }
 
 /*
+ * Return the first mbuf and the mbuf data offset for the provided
+ * send offset without changing the "sb_sndptroff" field.
+ */
+struct mbuf *
+sbsndmbuf(struct sockbuf *sb, u_int off, u_int *moff)
+{
+	struct mbuf *m;
+
+	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb is NULL", __func__));
+
+	/*
+	 * With no cached send pointer, or with "off" below the stored offset
+	 * (which happens on retransmits), start the walk from "sb_mb":
+	 */
+	if (sb->sb_sndptr == NULL || sb->sb_sndptroff > off) {
+		m = sb->sb_mb;
+	} else {
+		m = sb->sb_sndptr;
+		off -= sb->sb_sndptroff;	/* make off relative to sb_sndptr */
+	}
+	while (off > 0 && m != NULL) {
+		if (off < m->m_len)
+			break;	/* off falls inside this mbuf */
+		off -= m->m_len;	/* skip this mbuf entirely */
+		m = m->m_next;
+	}
+	*moff = off;	/* offset left over within the returned mbuf */
+	return (m);	/* NULL if the chain ran out before off was consumed */
+}
+
+/*
  * Drop a record off the front of a sockbuf and move the next record to the
  * front.
  */
@@ -1002,9 +1094,9 @@
 	if (CMSG_SPACE((u_int)size) > MCLBYTES)
 		return ((struct mbuf *) NULL);
 	if (CMSG_SPACE((u_int)size) > MLEN)
-		m = m_getcl(M_DONTWAIT, MT_CONTROL, 0);
+		m = m_getcl(M_NOWAIT, MT_CONTROL, 0);
 	else
-		m = m_get(M_DONTWAIT, MT_CONTROL);
+		m = m_get(M_NOWAIT, MT_CONTROL);
 	if (m == NULL)
 		return ((struct mbuf *) NULL);
 	cp = mtod(m, struct cmsghdr *);
@@ -1053,4 +1145,4 @@
 SYSCTL_OID(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLTYPE_ULONG|CTLFLAG_RW,
     &sb_max, 0, sysctl_handle_sb_max, "LU", "Maximum socket buffer size");
 SYSCTL_ULONG(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW,
-    &sb_efficiency, 0, "");
+    &sb_efficiency, 0, "Socket buffer size waste factor");

Modified: trunk/sys/kern/uipc_socket.c
===================================================================
--- trunk/sys/kern/uipc_socket.c	2018-05-26 14:25:17 UTC (rev 9953)
+++ trunk/sys/kern/uipc_socket.c	2018-05-26 14:25:55 UTC (rev 9954)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1982, 1986, 1988, 1990, 1993
  *	The Regents of the University of California.
@@ -92,7 +93,7 @@
  * from a listen queue to a file descriptor, in order to prevent garbage
  * collection of the socket at an untimely moment.  For a number of reasons,
  * these interfaces are not preferred, and should be avoided.
- * 
+ *
  * NOTE: With regard to VNETs the general rule is that callers do not set
  * curvnet. Exceptions to this rule include soabort(), sodisconnect(),
  * sofree() (and with that sorele(), sotryfree()), as well as sonewconn()
@@ -101,11 +102,10 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/uipc_socket.c 305261 2016-09-02 00:14:28Z markj $");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
-#include "opt_zero.h"
 #include "opt_compat.h"
 
 #include <sys/param.h>
@@ -135,6 +135,8 @@
 #include <sys/sysctl.h>
 #include <sys/uio.h>
 #include <sys/jail.h>
+#include <sys/syslog.h>
+#include <netinet/in.h>
 
 #include <net/vnet.h>
 
@@ -173,11 +175,8 @@
 	.f_event = filt_sowrite,
 };
 
-uma_zone_t socket_zone;
 so_gen_t	so_gencnt;	/* generation count for sockets */
 
-int	maxsockets;
-
 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
 
@@ -185,26 +184,42 @@
 	VNET_ASSERT(curvnet != NULL,					\
 	    ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so)));
 
+/*
+ * Limit on the number of connections in the listen queue waiting
+ * for accept(2).
+ * NB: The original sysctl somaxconn is still available but hidden
+ * to prevent confusion about the actual purpose of this number.
+ */
 static int somaxconn = SOMAXCONN;
-static int sysctl_somaxconn(SYSCTL_HANDLER_ARGS);
-/* XXX: we dont have SYSCTL_USHORT */
-SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW,
-    0, sizeof(int), sysctl_somaxconn, "I", "Maximum pending socket connection "
-    "queue size");
+
+static int
+sysctl_somaxconn(SYSCTL_HANDLER_ARGS)
+{
+	int error;
+	int val;
+
+	val = somaxconn;	/* operate on a copy so a rejected value leaves somaxconn alone */
+	error = sysctl_handle_int(oidp, &val, 0, req);
+	if (error || !req->newptr )
+		return (error);	/* handler failed, or no new value was supplied */
+
+	if (val < 1 || val > USHRT_MAX)
+		return (EINVAL);	/* only 1..USHRT_MAX is accepted */
+
+	somaxconn = val;	/* commit the validated value */
+	return (0);
+}
+SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue, CTLTYPE_UINT | CTLFLAG_RW,
+    0, sizeof(int), sysctl_somaxconn, "I",
+    "Maximum listen socket pending connection accept queue size");
+SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
+    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP,
+    0, sizeof(int), sysctl_somaxconn, "I",
+    "Maximum listen socket pending connection accept queue size (compat)");
+
 static int numopensockets;
 SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
     &numopensockets, 0, "Number of open sockets");
-#ifdef ZERO_COPY_SOCKETS
-/* These aren't static because they're used in other files. */
-int so_zero_copy_send = 1;
-int so_zero_copy_receive = 1;
-SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
-    "Zero copy controls");
-SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
-    &so_zero_copy_receive, 0, "Enable zero copy receive");
-SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
-    &so_zero_copy_send, 0, "Enable zero copy send");
-#endif /* ZERO_COPY_SOCKETS */
 
 /*
  * accept_mtx locks down per-socket fields relating to accept queues.  See
@@ -227,6 +242,46 @@
 SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");
 
 /*
+ * Initialize the socket subsystem and set up the socket
+ * memory allocator.
+ */
+static uma_zone_t socket_zone;
+int	maxsockets;
+
+static void
+socket_zone_change(void *tag)
+{
+
+	maxsockets = uma_zone_set_max(socket_zone, maxsockets);	/* push the limit to the zone; keep the value it returns */
+}
+
+static void
+socket_init(void *tag)
+{
+
+	socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL,
+	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+	maxsockets = uma_zone_set_max(socket_zone, maxsockets);	/* cap the zone; keep the value it returns */
+	uma_zone_set_warning(socket_zone, "kern.ipc.maxsockets limit reached");
+	EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL,
+	    EVENTHANDLER_PRI_FIRST);	/* re-apply the cap whenever maxsockets_change is invoked */
+}
+SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL);
+
+/*
+ * Initialise maxsockets.  This SYSINIT must be run after
+ * tunable_mbinit().
+ */
+static void
+init_maxsockets(void *ignored)
+{
+
+	TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);	/* pick up the tunable, if one is set */
+	maxsockets = imax(maxsockets, maxfiles);	/* never start below maxfiles */
+}
+SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
+
+/*
  * Sysctl to get and set the maximum global sockets limit.  Notify protocols
  * of the change so that they can update their dependent limits as required.
  */
@@ -238,12 +293,9 @@
 	newmaxsockets = maxsockets;
 	error = sysctl_handle_int(oidp, &newmaxsockets, 0, req);
 	if (error == 0 && req->newptr) {
-		if (newmaxsockets > maxsockets) {
+		if (newmaxsockets > maxsockets &&
+		    newmaxsockets <= maxfiles) {
 			maxsockets = newmaxsockets;
-			if (maxsockets > ((maxfiles / 4) * 3)) {
-				maxfiles = (maxsockets * 5) / 4;
-				maxfilesperproc = (maxfiles * 9) / 10;
-			}
 			EVENTHANDLER_INVOKE(maxsockets_change);
 		} else
 			error = EINVAL;
@@ -250,25 +302,11 @@
 	}
 	return (error);
 }
-
 SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW,
     &maxsockets, 0, sysctl_maxsockets, "IU",
     "Maximum number of sockets avaliable");
 
 /*
- * Initialise maxsockets.  This SYSINIT must be run after
- * tunable_mbinit().
- */
-static void
-init_maxsockets(void *ignored)
-{
-
-	TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
-	maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters));
-}
-SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
-
-/*
  * Socket operation routines.  These routines are called by the routines in
  * sys_socket.c or from a system process, and implement the semantics of
  * socket operations by switching out to the protocol specific routines.
@@ -374,7 +412,16 @@
 	else
 		prp = pffindtype(dom, type);
 
-	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL ||
+	if (prp == NULL) {
+		/* No support for domain. */
+		if (pffinddomain(dom) == NULL)
+			return (EAFNOSUPPORT);
+		/* No support for socket type. */
+		if (proto == 0 && type != 0)
+			return (EPROTOTYPE);
+		return (EPROTONOSUPPORT);
+	}
+	if (prp->pr_usrreqs->pru_attach == NULL ||
 	    prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
 		return (EPROTONOSUPPORT);
 
@@ -440,6 +487,10 @@
 struct socket *
 sonewconn(struct socket *head, int connstatus)
 {
+	static struct timeval lastover;
+	static struct timeval overinterval = { 60, 0 };
+	static int overcount;
+
 	struct socket *so;
 	int over;
 
@@ -447,16 +498,32 @@
 	over = (head->so_qlen > 3 * head->so_qlimit / 2);
 	ACCEPT_UNLOCK();
 #ifdef REGRESSION
-	if (regression_sonewconn_earlytest && over)
+	if (regression_sonewconn_earlytest && over) {
 #else
-	if (over)
+	if (over) {
 #endif
+		overcount++;
+
+		if (ratecheck(&lastover, &overinterval)) {
+			log(LOG_DEBUG, "%s: pcb %p: Listen queue overflow: "
+			    "%i already in queue awaiting acceptance "
+			    "(%d occurrences)\n",
+			    __func__, head->so_pcb, head->so_qlen, overcount);
+
+			overcount = 0;
+		}
+
 		return (NULL);
+	}
 	VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p",
 	    __func__, __LINE__, head));
 	so = soalloc(head->so_vnet);
-	if (so == NULL)
+	if (so == NULL) {
+		log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: "
+		    "limit reached or out of memory\n",
+		    __func__, head->so_pcb);
 		return (NULL);
+	}
 	if ((head->so_options & SO_ACCEPTFILTER) != 0)
 		connstatus = 0;
 	so->so_head = head;
@@ -473,11 +540,18 @@
 	knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
 	knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
 	VNET_SO_ASSERT(head);
-	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) ||
-	    (*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
+	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
 		sodealloc(so);
+		log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
+		    __func__, head->so_pcb);
 		return (NULL);
 	}
+	if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
+		sodealloc(so);
+		log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n",
+		    __func__, head->so_pcb);
+		return (NULL);
+	}
 	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
 	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
 	so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
@@ -486,6 +560,20 @@
 	so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
 	so->so_state |= connstatus;
 	ACCEPT_LOCK();
+	/*
+	 * The accept socket may be tearing down but we just
+	 * won a race on the ACCEPT_LOCK.
+	 * However, if sctp_peeloff() is called on a 1-to-many
+	 * style socket, the SO_ACCEPTCONN doesn't need to be set.
+	 */
+	if (!(head->so_options & SO_ACCEPTCONN) &&
+	    ((head->so_proto->pr_protocol != IPPROTO_SCTP) ||
+	     (head->so_type != SOCK_SEQPACKET))) {
+		SOCK_LOCK(so);
+		so->so_head = NULL;
+		sofree(so);		/* NB: returns ACCEPT_UNLOCK'ed. */
+		return (NULL);
+	}
 	if (connstatus) {
 		TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
 		so->so_qstate |= SQ_COMP;
@@ -529,9 +617,20 @@
 	CURVNET_SET(so->so_vnet);
 	error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td);
 	CURVNET_RESTORE();
-	return error;
+	return (error);
 }
 
+int
+sobindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+	int error;
+
+	CURVNET_SET(so->so_vnet);	/* call the protocol hook with the socket's vnet set */
+	error = (*so->so_proto->pr_usrreqs->pru_bindat)(fd, so, nam, td);	/* protocol's pru_bindat handler also receives fd */
+	CURVNET_RESTORE();
+	return (error);
+}
+
 /*
  * solisten() transitions a socket from a non-listening state to a listening
  * state, but can also be used to update the listen queue depth on an
@@ -552,7 +651,7 @@
 	CURVNET_SET(so->so_vnet);
 	error = (*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td);
 	CURVNET_RESTORE();
-	return error;
+	return (error);
 }
 
 int
@@ -634,15 +733,21 @@
 	    ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
 	    so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
 	if (so->so_options & SO_ACCEPTCONN) {
-		KASSERT((TAILQ_EMPTY(&so->so_comp)), ("sofree: so_comp populated"));
-		KASSERT((TAILQ_EMPTY(&so->so_incomp)), ("sofree: so_incomp populated"));
+		KASSERT((TAILQ_EMPTY(&so->so_comp)),
+		    ("sofree: so_comp populated"));
+		KASSERT((TAILQ_EMPTY(&so->so_incomp)),
+		    ("sofree: so_incomp populated"));
 	}
 	SOCK_UNLOCK(so);
 	ACCEPT_UNLOCK();
 
 	VNET_SO_ASSERT(so);
-	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
-		(*pr->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
+	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL) {
+		if (pr->pr_domain->dom_family == AF_LOCAL)
+			unp_dispose_so(so);
+		else
+			(*pr->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
+	}
 	if (pr->pr_usrreqs->pru_detach != NULL)
 		(*pr->pr_usrreqs->pru_detach)(so);
 
@@ -701,7 +806,8 @@
 				goto drop;
 			while (so->so_state & SS_ISCONNECTED) {
 				error = tsleep(&so->so_timeo,
-				    PSOCK | PCATCH, "soclos", so->so_linger * hz);
+				    PSOCK | PCATCH, "soclos",
+				    so->so_linger * hz);
 				if (error)
 					break;
 			}
@@ -711,9 +817,14 @@
 drop:
 	if (so->so_proto->pr_usrreqs->pru_close != NULL)
 		(*so->so_proto->pr_usrreqs->pru_close)(so);
+	ACCEPT_LOCK();
 	if (so->so_options & SO_ACCEPTCONN) {
 		struct socket *sp;
-		ACCEPT_LOCK();
+		/*
+		 * Prevent new additions to the accept queues due
+		 * to ACCEPT_LOCK races while we are draining them.
+		 */
+		so->so_options &= ~SO_ACCEPTCONN;
 		while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
 			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
 			so->so_incqlen--;
@@ -732,13 +843,15 @@
 			soabort(sp);
 			ACCEPT_LOCK();
 		}
-		ACCEPT_UNLOCK();
+		KASSERT((TAILQ_EMPTY(&so->so_comp)),
+		    ("%s: so_comp populated", __func__));
+		KASSERT((TAILQ_EMPTY(&so->so_incomp)),
+		    ("%s: so_incomp populated", __func__));
 	}
-	ACCEPT_LOCK();
 	SOCK_LOCK(so);
 	KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
 	so->so_state |= SS_NOFDREF;
-	sorele(so);
+	sorele(so);			/* NB: Returns with ACCEPT_UNLOCK(). */
 	CURVNET_RESTORE();
 	return (error);
 }
@@ -800,6 +913,13 @@
 int
 soconnect(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
+
+	return (soconnectat(AT_FDCWD, so, nam, td));
+}
+
+int
+soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
+{
 	int error;
 
 	if (so->so_options & SO_ACCEPTCONN)
@@ -821,7 +941,13 @@
 		 * biting us.
 		 */
 		so->so_error = 0;
-		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
+		if (fd == AT_FDCWD) {
+			error = (*so->so_proto->pr_usrreqs->pru_connect)(so,
+			    nam, td);
+		} else {
+			error = (*so->so_proto->pr_usrreqs->pru_connectat)(fd,
+			    so, nam, td);
+		}
 	}
 	CURVNET_RESTORE();
 
@@ -853,135 +979,6 @@
 	return (error);
 }
 
-#ifdef ZERO_COPY_SOCKETS
-struct so_zerocopy_stats{
-	int size_ok;
-	int align_ok;
-	int found_ifp;
-};
-struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
-#include <netinet/in.h>
-#include <net/route.h>
-#include <netinet/in_pcb.h>
-#include <vm/vm.h>
-#include <vm/vm_page.h>
-#include <vm/vm_object.h>
-
-/*
- * sosend_copyin() is only used if zero copy sockets are enabled.  Otherwise
- * sosend_dgram() and sosend_generic() use m_uiotombuf().
- * 
- * sosend_copyin() accepts a uio and prepares an mbuf chain holding part or
- * all of the data referenced by the uio.  If desired, it uses zero-copy.
- * *space will be updated to reflect data copied in.
- *
- * NB: If atomic I/O is requested, the caller must already have checked that
- * space can hold resid bytes.
- *
- * NB: In the event of an error, the caller may need to free the partial
- * chain pointed to by *mpp.  The contents of both *uio and *space may be
- * modified even in the case of an error.
- */
-static int
-sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space,
-    int flags)
-{
-	struct mbuf *m, **mp, *top;
-	long len;
-	ssize_t resid;
-	int error;
-#ifdef ZERO_COPY_SOCKETS
-	int cow_send;
-#endif
-
-	*retmp = top = NULL;
-	mp = ⊤
-	len = 0;
-	resid = uio->uio_resid;
-	error = 0;
-	do {
-#ifdef ZERO_COPY_SOCKETS
-		cow_send = 0;
-#endif /* ZERO_COPY_SOCKETS */
-		if (resid >= MINCLSIZE) {
-#ifdef ZERO_COPY_SOCKETS
-			if (top == NULL) {
-				m = m_gethdr(M_WAITOK, MT_DATA);
-				m->m_pkthdr.len = 0;
-				m->m_pkthdr.rcvif = NULL;
-			} else
-				m = m_get(M_WAITOK, MT_DATA);
-			if (so_zero_copy_send &&
-			    resid>=PAGE_SIZE &&
-			    *space>=PAGE_SIZE &&
-			    uio->uio_iov->iov_len>=PAGE_SIZE) {
-				so_zerocp_stats.size_ok++;
-				so_zerocp_stats.align_ok++;
-				cow_send = socow_setup(m, uio);
-				len = cow_send;
-			}
-			if (!cow_send) {
-				m_clget(m, M_WAITOK);
-				len = min(min(MCLBYTES, resid), *space);
-			}
-#else /* ZERO_COPY_SOCKETS */
-			if (top == NULL) {
-				m = m_getcl(M_WAIT, MT_DATA, M_PKTHDR);
-				m->m_pkthdr.len = 0;
-				m->m_pkthdr.rcvif = NULL;
-			} else
-				m = m_getcl(M_WAIT, MT_DATA, 0);
-			len = min(min(MCLBYTES, resid), *space);
-#endif /* ZERO_COPY_SOCKETS */
-		} else {
-			if (top == NULL) {
-				m = m_gethdr(M_WAIT, MT_DATA);
-				m->m_pkthdr.len = 0;
-				m->m_pkthdr.rcvif = NULL;
-
-				len = min(min(MHLEN, resid), *space);
-				/*
-				 * For datagram protocols, leave room
-				 * for protocol headers in first mbuf.
-				 */
-				if (atomic && m && len < MHLEN)
-					MH_ALIGN(m, len);
-			} else {
-				m = m_get(M_WAIT, MT_DATA);
-				len = min(min(MLEN, resid), *space);
-			}
-		}
-		if (m == NULL) {
-			error = ENOBUFS;
-			goto out;
-		}
-
-		*space -= len;
-#ifdef ZERO_COPY_SOCKETS
-		if (cow_send)
-			error = 0;
-		else
-#endif /* ZERO_COPY_SOCKETS */
-		error = uiomove(mtod(m, void *), (int)len, uio);
-		resid = uio->uio_resid;
-		m->m_len = len;
-		*mp = m;
-		top->m_pkthdr.len += len;
-		if (error)
-			goto out;
-		mp = &m->m_next;
-		if (resid <= 0) {
-			if (flags & MSG_EOR)
-				top->m_flags |= M_EOR;
-			break;
-		}
-	} while (*space > 0 && atomic);
-out:
-	*retmp = top;
-	return (error);
-}
-#endif /*ZERO_COPY_SOCKETS*/
-
 #define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
 
 int
@@ -991,13 +988,10 @@
 	long space;
 	ssize_t resid;
 	int clen = 0, error, dontroute;
-#ifdef ZERO_COPY_SOCKETS
-	int atomic = sosendallatonce(so) || top;
-#endif
 
-	KASSERT(so->so_type == SOCK_DGRAM, ("sodgram_send: !SOCK_DGRAM"));
+	KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
 	KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
-	    ("sodgram_send: !PR_ATOMIC"));
+	    ("sosend_dgram: !PR_ATOMIC"));
 
 	if (uio != NULL)
 		resid = uio->uio_resid;
@@ -1076,11 +1070,6 @@
 		if (flags & MSG_EOR)
 			top->m_flags |= M_EOR;
 	} else {
-#ifdef ZERO_COPY_SOCKETS
-		error = sosend_copyin(uio, &top, atomic, &space, flags);
-		if (error)
-			goto out;
-#else
 		/*
 		 * Copy the data from userland into a mbuf chain.
 		 * If no data is to be copied in, a single empty mbuf
@@ -1093,7 +1082,6 @@
 			goto out;
 		}
 		space -= resid - uio->uio_resid;
-#endif
 		resid = uio->uio_resid;
 	}
 	KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
@@ -1108,7 +1096,7 @@
 	}
 	/*
 	 * XXX all the SBS_CANTSENDMORE checks previously done could be out
-	 * of date.  We could have recieved a reset packet in an interrupt or
+	 * of date.  We could have received a reset packet in an interrupt or
 	 * maybe we slept while doing page faults in uiomove() etc.  We could
 	 * probably recheck again inside the locking protection here, but
 	 * there are probably other places that this also happens.  We must
@@ -1265,12 +1253,6 @@
 				if (flags & MSG_EOR)
 					top->m_flags |= M_EOR;
 			} else {
-#ifdef ZERO_COPY_SOCKETS
-				error = sosend_copyin(uio, &top, atomic,
-				    &space, flags);
-				if (error != 0)
-					goto release;
-#else
 				/*
 				 * Copy the data from userland into a mbuf
 				 * chain.  If no data is to be copied in,
@@ -1285,7 +1267,6 @@
 					goto release;
 				}
 				space -= resid - uio->uio_resid;
-#endif
 				resid = uio->uio_resid;
 			}
 			if (dontroute) {
@@ -1295,7 +1276,7 @@
 			}
 			/*
 			 * XXX all the SBS_CANTSENDMORE checks previously
-			 * done could be out of date.  We could have recieved
+			 * done could be out of date.  We could have received
 			 * a reset packet in an interrupt or maybe we slept
 			 * while doing page faults in uiomove() etc.  We
 			 * could probably recheck again inside the locking
@@ -1372,26 +1353,11 @@
 	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
 	VNET_SO_ASSERT(so);
 
-	m = m_get(M_WAIT, MT_DATA);
+	m = m_get(M_WAITOK, MT_DATA);
 	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
 	if (error)
 		goto bad;
 	do {
-#ifdef ZERO_COPY_SOCKETS
-		if (so_zero_copy_receive) {
-			int disposable;
-
-			if ((m->m_flags & M_EXT)
-			 && (m->m_ext.ext_type == EXT_DISPOSABLE))
-				disposable = 1;
-			else
-				disposable = 0;
-
-			error = uiomoveco(mtod(m, void *),
-					  min(uio->uio_resid, m->m_len),
-					  uio, disposable);
-		} else
-#endif /* ZERO_COPY_SOCKETS */
 		error = uiomove(mtod(m, void *),
 		    (int) min(uio->uio_resid, m->m_len), uio);
 		m = m_free(m);
@@ -1424,20 +1390,19 @@
 	else
 		sb->sb_mb = nextrecord;
 
-        /*
-         * Now update any dependent socket buffer fields to reflect the new
-         * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
+	/*
+	 * Now update any dependent socket buffer fields to reflect the new
+	 * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
 	 * addition of a second clause that takes care of the case where
 	 * sb_mb has been updated, but remains the last record.
-         */
-        if (sb->sb_mb == NULL) {
-                sb->sb_mbtail = NULL;
-                sb->sb_lastrecord = NULL;
-        } else if (sb->sb_mb->m_nextpkt == NULL)
-                sb->sb_lastrecord = sb->sb_mb;
+	 */
+	if (sb->sb_mb == NULL) {
+		sb->sb_mbtail = NULL;
+		sb->sb_lastrecord = NULL;
+	} else if (sb->sb_mb->m_nextpkt == NULL)
+		sb->sb_lastrecord = sb->sb_mb;
 }
 
-
 /*
  * Implement receive operations on a socket.  We depend on the way that
  * records are added to the sockbuf by sbappend.  In particular, each record
@@ -1626,7 +1591,7 @@
 				SOCKBUF_UNLOCK(&so->so_rcv);
 				VNET_SO_ASSERT(so);
 				error = (*pr->pr_domain->dom_externalize)
-				    (cm, controlp);
+				    (cm, controlp, flags);
 				SOCKBUF_LOCK(&so->so_rcv);
 			} else if (controlp != NULL)
 				*controlp = cm;
@@ -1687,7 +1652,7 @@
 		/*
 		 * If the type of mbuf has changed since the last mbuf
 		 * examined ('type'), end the receive operation.
-	 	 */
+		 */
 		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 		if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) {
 			if (type != m->m_type)
@@ -1715,21 +1680,6 @@
 			SBLASTRECORDCHK(&so->so_rcv);
 			SBLASTMBUFCHK(&so->so_rcv);
 			SOCKBUF_UNLOCK(&so->so_rcv);
-#ifdef ZERO_COPY_SOCKETS
-			if (so_zero_copy_receive) {
-				int disposable;
-
-				if ((m->m_flags & M_EXT)
-				 && (m->m_ext.ext_type == EXT_DISPOSABLE))
-					disposable = 1;
-				else
-					disposable = 0;
-
-				error = uiomoveco(mtod(m, char *) + moff,
-						  (int)len, uio,
-						  disposable);
-			} else
-#endif /* ZERO_COPY_SOCKETS */
 			error = uiomove(mtod(m, char *) + moff, (int)len, uio);
 			SOCKBUF_LOCK(&so->so_rcv);
 			if (error) {
@@ -1760,6 +1710,7 @@
 				nextrecord = m->m_nextpkt;
 				sbfree(&so->so_rcv, m);
 				if (mp != NULL) {
+					m->m_nextpkt = NULL;
 					*mp = m;
 					mp = &m->m_next;
 					so->so_rcv.sb_mb = m = m->m_next;
@@ -1780,26 +1731,26 @@
 					int copy_flag;
 
 					if (flags & MSG_DONTWAIT)
-						copy_flag = M_DONTWAIT;
+						copy_flag = M_NOWAIT;
 					else
 						copy_flag = M_WAIT;
-					if (copy_flag == M_WAIT)
+					if (copy_flag == M_WAITOK)
 						SOCKBUF_UNLOCK(&so->so_rcv);
 					*mp = m_copym(m, 0, len, copy_flag);
-					if (copy_flag == M_WAIT)
+					if (copy_flag == M_WAITOK)
 						SOCKBUF_LOCK(&so->so_rcv);
- 					if (*mp == NULL) {
- 						/*
- 						 * m_copym() couldn't
+					if (*mp == NULL) {
+						/*
+						 * m_copym() couldn't
 						 * allocate an mbuf.  Adjust
 						 * uio_resid back (it was
 						 * adjusted down by len
 						 * bytes, which we didn't end
 						 * up "copying" over).
- 						 */
- 						uio->uio_resid += len;
- 						break;
- 					}
+						 */
+						uio->uio_resid += len;
+						break;
+					}
 				}
 				m->m_data += len;
 				m->m_len -= len;
@@ -1832,7 +1783,8 @@
 		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
 		    !sosendallatonce(so) && nextrecord == NULL) {
 			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
-			if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE)
+			if (so->so_error ||
+			    so->so_rcv.sb_state & SBS_CANTRCVMORE)
 				break;
 			/*
 			 * Notify the protocol that some data has been
@@ -1915,6 +1867,7 @@
 
 /*
  * Optimized version of soreceive() for stream (TCP) sockets.
+ * XXXAO: (MSG_WAITALL | MSG_PEEK) isn't properly handled.
  */
 int
 soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio,
@@ -1993,7 +1946,7 @@
 
 	/* Socket buffer got some data that we shall deliver now. */
 	if (sb->sb_cc > 0 && !(flags & MSG_WAITALL) &&
-	    ((sb->sb_flags & SS_NBIO) ||
+	    ((so->so_state & SS_NBIO) ||
 	     (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
 	     sb->sb_cc >= sb->sb_lowat ||
 	     sb->sb_cc >= uio->uio_resid ||
@@ -2003,7 +1956,7 @@
 
 	/* On MSG_WAITALL we must wait until all data or error arrives. */
 	if ((flags & MSG_WAITALL) &&
-	    (sb->sb_cc >= uio->uio_resid || sb->sb_cc >= sb->sb_lowat))
+	    (sb->sb_cc >= uio->uio_resid || sb->sb_cc >= sb->sb_hiwat))
 		goto deliver;
 
 	/*
@@ -2029,7 +1982,11 @@
 	if (mp0 != NULL) {
 		/* Dequeue as many mbufs as possible. */
 		if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
-			for (*mp0 = m = sb->sb_mb;
+			if (*mp0 == NULL)
+				*mp0 = sb->sb_mb;
+			else
+				m_cat(*mp0, sb->sb_mb);
+			for (m = sb->sb_mb;
 			     m != NULL && m->m_len <= len;
 			     m = m->m_next) {
 				len -= m->m_len;
@@ -2037,10 +1994,11 @@
 				sbfree(sb, m);
 				n = m;
 			}
+			n->m_next = NULL;
 			sb->sb_mb = m;
+			sb->sb_lastrecord = sb->sb_mb;
 			if (sb->sb_mb == NULL)
 				SB_EMPTY_FIXUP(sb);
-			n->m_next = NULL;
 		}
 		/* Copy the remainder. */
 		if (len > 0) {
@@ -2047,13 +2005,13 @@
 			KASSERT(sb->sb_mb != NULL,
 			    ("%s: len > 0 && sb->sb_mb empty", __func__));
 
-			m = m_copym(sb->sb_mb, 0, len, M_DONTWAIT);
+			m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
 			if (m == NULL)
 				len = 0;	/* Don't flush data from sockbuf. */
 			else
-				uio->uio_resid -= m->m_len;
+				uio->uio_resid -= len;
 			if (*mp0 != NULL)
-				n->m_next = m;
+				m_cat(*mp0, m);
 			else
 				*mp0 = m;
 			if (*mp0 == NULL) {
@@ -2236,7 +2194,8 @@
 	 * Process one or more MT_CONTROL mbufs present before any data mbufs
 	 * in the first mbuf chain on the socket buffer.  We call into the
 	 * protocol to perform externalization (or freeing if controlp ==
-	 * NULL).
+	 * NULL). In some cases there can be only MT_CONTROL mbufs without
+	 * MT_DATA mbufs.
 	 */
 	if (m->m_type == MT_CONTROL) {
 		struct mbuf *cm = NULL, *cmn;
@@ -2254,7 +2213,7 @@
 			cm->m_next = NULL;
 			if (pr->pr_domain->dom_externalize != NULL) {
 				error = (*pr->pr_domain->dom_externalize)
-				    (cm, controlp);
+				    (cm, controlp, flags);
 			} else if (controlp != NULL)
 				*controlp = cm;
 			else
@@ -2266,8 +2225,8 @@
 			cm = cmn;
 		}
 	}
-	KASSERT(m->m_type == MT_DATA, ("soreceive_dgram: !data"));
-
+	KASSERT(m == NULL || m->m_type == MT_DATA,
+	    ("soreceive_dgram: !data"));
 	while (m != NULL && uio->uio_resid > 0) {
 		len = uio->uio_resid;
 		if (len > m->m_len)
@@ -2284,9 +2243,10 @@
 			m->m_len -= len;
 		}
 	}
-	if (m != NULL)
+	if (m != NULL) {
 		flags |= MSG_TRUNC;
-	m_freem(m);
+		m_freem(m);
+	}
 	if (flagsp != NULL)
 		*flagsp |= flags;
 	return (0);
@@ -2315,16 +2275,17 @@
 		return (EINVAL);
 
 	CURVNET_SET(so->so_vnet);
-	if (pr->pr_usrreqs->pru_flush != NULL) {
-	        (*pr->pr_usrreqs->pru_flush)(so, how);
-	}
+	if (pr->pr_usrreqs->pru_flush != NULL)
+		(*pr->pr_usrreqs->pru_flush)(so, how);
 	if (how != SHUT_WR)
 		sorflush(so);
 	if (how != SHUT_RD) {
 		error = (*pr->pr_usrreqs->pru_shutdown)(so);
+		wakeup(&so->so_timeo);
 		CURVNET_RESTORE();
 		return (error);
 	}
+	wakeup(&so->so_timeo);
 	CURVNET_RESTORE();
 	return (0);
 }
@@ -2334,7 +2295,7 @@
 {
 	struct sockbuf *sb = &so->so_rcv;
 	struct protosw *pr = so->so_proto;
-	struct sockbuf asb;
+	struct socket aso;
 
 	VNET_SO_ASSERT(so);
 
@@ -2359,8 +2320,9 @@
 	 * and mutex data unchanged.
 	 */
 	SOCKBUF_LOCK(sb);
-	bzero(&asb, offsetof(struct sockbuf, sb_startzero));
-	bcopy(&sb->sb_startzero, &asb.sb_startzero,
+	bzero(&aso, sizeof(aso));
+	aso.so_pcb = so->so_pcb;
+	bcopy(&sb->sb_startzero, &aso.so_rcv.sb_startzero,
 	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
 	bzero(&sb->sb_startzero,
 	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
@@ -2368,12 +2330,16 @@
 	sbunlock(sb);
 
 	/*
-	 * Dispose of special rights and flush the socket buffer.  Don't call
-	 * any unsafe routines (that rely on locks being initialized) on asb.
+	 * Dispose of special rights and flush the copied socket.  Don't call
+	 * any unsafe routines (that rely on locks being initialized) on aso.
 	 */
-	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
-		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
-	sbrelease_internal(&asb, so);
+	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL) {
+		if (pr->pr_domain->dom_family == AF_LOCAL)
+			unp_dispose_so(&aso);
+		else
+			(*pr->pr_domain->dom_dispose)(aso.so_rcv.sb_mb);
+	}
+	sbrelease_internal(&aso.so_rcv, so);
 }
 
 /*
@@ -2432,7 +2398,7 @@
 	int	error, optval;
 	struct	linger l;
 	struct	timeval tv;
-	u_long  val;
+	sbintime_t val;
 	uint32_t val32;
 #ifdef MAC
 	struct mac extmac;
@@ -2484,7 +2450,7 @@
 		case SO_NO_DDP:
 		case SO_NO_OFFLOAD:
 			error = sooptcopyin(sopt, &optval, sizeof optval,
-					    sizeof optval);
+			    sizeof optval);
 			if (error)
 				goto bad;
 			SOCK_LOCK(so);
@@ -2497,7 +2463,7 @@
 
 		case SO_SETFIB:
 			error = sooptcopyin(sopt, &optval, sizeof optval,
-					    sizeof optval);
+			    sizeof optval);
 			if (error)
 				goto bad;
 
@@ -2515,7 +2481,7 @@
 
 		case SO_USER_COOKIE:
 			error = sooptcopyin(sopt, &val32, sizeof val32,
-					    sizeof val32);
+			    sizeof val32);
 			if (error)
 				goto bad;
 			so->so_user_cookie = val32;
@@ -2526,7 +2492,7 @@
 		case SO_SNDLOWAT:
 		case SO_RCVLOWAT:
 			error = sooptcopyin(sopt, &optval, sizeof optval,
-					    sizeof optval);
+			    sizeof optval);
 			if (error)
 				goto bad;
 
@@ -2589,23 +2555,15 @@
 				    sizeof tv);
 			if (error)
 				goto bad;
-
-			/* assert(hz > 0); */
-			if (tv.tv_sec < 0 || tv.tv_sec > INT_MAX / hz ||
-			    tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
+			if (tv.tv_sec < 0 || tv.tv_usec < 0 ||
+			    tv.tv_usec >= 1000000) {
 				error = EDOM;
 				goto bad;
 			}
-			/* assert(tick > 0); */
-			/* assert(ULONG_MAX - INT_MAX >= 1000000); */
-			val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
-			if (val > INT_MAX) {
-				error = EDOM;
-				goto bad;
-			}
-			if (val == 0 && tv.tv_usec != 0)
-				val = 1;
-
+			if (tv.tv_sec > INT32_MAX)
+				val = SBT_MAX;
+			else
+				val = tvtosbt(tv);
 			switch (sopt->sopt_name) {
 			case SO_SNDTIMEO:
 				so->so_snd.sb_timeo = val;
@@ -2755,11 +2713,8 @@
 
 		case SO_SNDTIMEO:
 		case SO_RCVTIMEO:
-			optval = (sopt->sopt_name == SO_SNDTIMEO ?
-				  so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
-
-			tv.tv_sec = optval / hz;
-			tv.tv_usec = (optval % hz) * tick;
+			tv = sbttotv(sopt->sopt_name == SO_SNDTIMEO ?
+			    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
 #ifdef COMPAT_FREEBSD32
 			if (SV_CURPROC_FLAG(SV_ILP32)) {
 				struct timeval32 tv32;
@@ -2828,7 +2783,6 @@
 	return (error);
 }
 
-/* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
 int
 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
 {
@@ -2835,11 +2789,11 @@
 	struct mbuf *m, *m_prev;
 	int sopt_size = sopt->sopt_valsize;
 
-	MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA);
+	MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
 	if (m == NULL)
 		return ENOBUFS;
 	if (sopt_size > MLEN) {
-		MCLGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT);
+		MCLGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT);
 		if ((m->m_flags & M_EXT) == 0) {
 			m_free(m);
 			return ENOBUFS;
@@ -2853,14 +2807,14 @@
 	m_prev = m;
 
 	while (sopt_size) {
-		MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA);
+		MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
 		if (m == NULL) {
 			m_freem(*mp);
 			return ENOBUFS;
 		}
 		if (sopt_size > MLEN) {
-			MCLGET(m, sopt->sopt_td != NULL ? M_WAIT :
-			    M_DONTWAIT);
+			MCLGET(m, sopt->sopt_td != NULL ? M_WAITOK :
+			    M_NOWAIT);
 			if ((m->m_flags & M_EXT) == 0) {
 				m_freem(m);
 				m_freem(*mp);
@@ -2877,7 +2831,6 @@
 	return (0);
 }
 
-/* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
 int
 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
 {
@@ -2890,7 +2843,7 @@
 			int error;
 
 			error = copyin(sopt->sopt_val, mtod(m, char *),
-				       m->m_len);
+			    m->m_len);
 			if (error != 0) {
 				m_freem(m0);
 				return(error);
@@ -2906,7 +2859,6 @@
 	return (0);
 }
 
-/* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
 int
 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
 {
@@ -2920,7 +2872,7 @@
 			int error;
 
 			error = copyout(mtod(m, char *), sopt->sopt_val,
-				       m->m_len);
+			    m->m_len);
 			if (error != 0) {
 				m_freem(m0);
 				return(error);
@@ -2927,10 +2879,10 @@
 			}
 		} else
 			bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
-	       sopt->sopt_valsize -= m->m_len;
-	       sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
-	       valsize += m->m_len;
-	       m = m->m_next;
+		sopt->sopt_valsize -= m->m_len;
+		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
+		valsize += m->m_len;
+		m = m->m_next;
 	}
 	if (m != NULL) {
 		/* enough soopt buffer should be given from user-land */
@@ -3067,6 +3019,14 @@
 }
 
 int
+pru_bindat_notsupp(int fd, struct socket *so, struct sockaddr *nam,
+    struct thread *td)
+{
+
+	return EOPNOTSUPP;
+}
+
+int
 pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 
@@ -3074,6 +3034,14 @@
 }
 
 int
+pru_connectat_notsupp(int fd, struct socket *so, struct sockaddr *nam,
+    struct thread *td)
+{
+
+	return EOPNOTSUPP;
+}
+
+int
 pru_connect2_notsupp(struct socket *so1, struct socket *so2)
 {
 
@@ -3258,7 +3226,7 @@
 	struct socket *so = kn->kn_fp->f_data;
 
 	kn->kn_data = so->so_qlen;
-	return (! TAILQ_EMPTY(&so->so_comp));
+	return (!TAILQ_EMPTY(&so->so_comp));
 }
 
 int
@@ -3272,24 +3240,6 @@
 	return (0);
 }
 
-static int
-sysctl_somaxconn(SYSCTL_HANDLER_ARGS)
-{
-	int error;
-	int val;
-
-	val = somaxconn;
-	error = sysctl_handle_int(oidp, &val, 0, req);
-	if (error || !req->newptr )
-		return (error);
-
-	if (val < 1 || val > USHRT_MAX)
-		return (EINVAL);
-
-	somaxconn = val;
-	return (0);
-}
-
 /*
  * These functions are used by protocols to notify the socket layer (and its
  * consumers) of state changes in the sockets driven by protocol-side events.
@@ -3335,7 +3285,7 @@
 void
 soisconnected(struct socket *so)
 {
-	struct socket *head;	
+	struct socket *head;
 	int ret;
 
 restart:
@@ -3363,7 +3313,7 @@
 			    head->so_accf->so_accept_filter_arg);
 			so->so_options &= ~SO_ACCEPTFILTER;
 			ret = head->so_accf->so_accept_filter->accf_callback(so,
-			    head->so_accf->so_accept_filter_arg, M_DONTWAIT);
+			    head->so_accf->so_accept_filter_arg, M_NOWAIT);
 			if (ret == SU_ISCONNECTED)
 				soupcall_clear(so, SO_RCV);
 			SOCK_UNLOCK(so);
@@ -3440,7 +3390,7 @@
     int (*func)(struct socket *, void *, int), void *arg)
 {
 	struct sockbuf *sb;
-	
+
 	switch (which) {
 	case SO_RCV:
 		sb = &so->so_rcv;
@@ -3524,9 +3474,10 @@
  */
 
 void
-so_listeners_apply_all(struct socket *so, void (*func)(struct socket *, void *), void *arg)
+so_listeners_apply_all(struct socket *so, void (*func)(struct socket *, void *),
+    void *arg)
 {
-	
+
 	TAILQ_FOREACH(so, &so->so_comp, so_list)
 		func(so, arg);
 }
@@ -3646,6 +3597,7 @@
 void
 so_lock(struct socket *so)
 {
+
 	SOCK_LOCK(so);
 }
 
@@ -3652,5 +3604,6 @@
 void
 so_unlock(struct socket *so)
 {
+
 	SOCK_UNLOCK(so);
 }

Modified: trunk/sys/kern/uipc_syscalls.c
===================================================================
--- trunk/sys/kern/uipc_syscalls.c	2018-05-26 14:25:17 UTC (rev 9953)
+++ trunk/sys/kern/uipc_syscalls.c	2018-05-26 14:25:55 UTC (rev 9954)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1982, 1986, 1989, 1990, 1993
  *	The Regents of the University of California.  All rights reserved.
@@ -33,18 +34,18 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/uipc_syscalls.c 321021 2017-07-15 17:28:03Z dchagin $");
 
 #include "opt_capsicum.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
-#include "opt_sctp.h"
 #include "opt_compat.h"
 #include "opt_ktrace.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
-#include <sys/capability.h>
+#include <sys/capsicum.h>
+#include <sys/condvar.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
@@ -57,9 +58,11 @@
 #include <sys/file.h>
 #include <sys/filio.h>
 #include <sys/jail.h>
+#include <sys/mman.h>
 #include <sys/mount.h>
 #include <sys/mbuf.h>
 #include <sys/protosw.h>
+#include <sys/rwlock.h>
 #include <sys/sf_buf.h>
 #include <sys/sysent.h>
 #include <sys/socket.h>
@@ -85,77 +88,80 @@
 #include <vm/vm_param.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
-#include <vm/vm_pageout.h>
+#include <vm/vm_pager.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 
-#if defined(INET) || defined(INET6)
-#ifdef SCTP
-#include <netinet/sctp.h>
-#include <netinet/sctp_peeloff.h>
-#endif /* SCTP */
-#endif /* INET || INET6 */
+/*
+ * Flags for accept1() and kern_accept4(), in addition to SOCK_CLOEXEC
+ * and SOCK_NONBLOCK.
+ */
+#define	ACCEPT4_INHERIT	0x1
+#define	ACCEPT4_COMPAT	0x2
 
 static int sendit(struct thread *td, int s, struct msghdr *mp, int flags);
 static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp);
 
-static int accept1(struct thread *td, struct accept_args *uap, int compat);
-static int do_sendfile(struct thread *td, struct sendfile_args *uap, int compat);
+static int accept1(struct thread *td, int s, struct sockaddr *uname,
+		   socklen_t *anamelen, int flags);
+static int do_sendfile(struct thread *td, struct sendfile_args *uap,
+		   int compat);
 static int getsockname1(struct thread *td, struct getsockname_args *uap,
 			int compat);
 static int getpeername1(struct thread *td, struct getpeername_args *uap,
 			int compat);
 
+counter_u64_t sfstat[sizeof(struct sfstat) / sizeof(uint64_t)];
+
 /*
- * NSFBUFS-related variables and associated sysctls
+ * sendfile(2)-related variables and associated sysctls
  */
-int nsfbufs;
-int nsfbufspeak;
-int nsfbufsused;
+static SYSCTL_NODE(_kern_ipc, OID_AUTO, sendfile, CTLFLAG_RW, 0,
+    "sendfile(2) tunables");
+static int sfreadahead = 1;
+SYSCTL_INT(_kern_ipc_sendfile, OID_AUTO, readahead, CTLFLAG_RW,
+    &sfreadahead, 0, "Number of sendfile(2) read-ahead MAXBSIZE blocks");
 
-SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RDTUN, &nsfbufs, 0,
-    "Maximum number of sendfile(2) sf_bufs available");
-SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufspeak, CTLFLAG_RD, &nsfbufspeak, 0,
-    "Number of sendfile(2) sf_bufs at peak usage");
-SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufsused, CTLFLAG_RD, &nsfbufsused, 0,
-    "Number of sendfile(2) sf_bufs in use");
 
+static void
+sfstat_init(const void *unused)
+{
+
+	COUNTER_ARRAY_ALLOC(sfstat, sizeof(struct sfstat) / sizeof(uint64_t),
+	    M_WAITOK);
+}
+SYSINIT(sfstat, SI_SUB_MBUF, SI_ORDER_FIRST, sfstat_init, NULL);
+
+static int
+sfstat_sysctl(SYSCTL_HANDLER_ARGS)
+{
+	struct sfstat s;
+
+	COUNTER_ARRAY_COPY(sfstat, &s, sizeof(s) / sizeof(uint64_t));
+	if (req->newptr)
+		COUNTER_ARRAY_ZERO(sfstat, sizeof(s) / sizeof(uint64_t));
+	return (SYSCTL_OUT(req, &s, sizeof(s)));
+}
+SYSCTL_PROC(_kern_ipc, OID_AUTO, sfstat, CTLTYPE_OPAQUE | CTLFLAG_RW,
+    NULL, 0, sfstat_sysctl, "I", "sendfile statistics");
+
 /*
- * Convert a user file descriptor to a kernel file entry and check that, if
- * it is a capability, the right rights are present. A reference on the file
- * entry is held upon returning.
+ * Convert a user file descriptor to a kernel file entry and check if required
+ * capability rights are present.
+ * A reference on the file entry is held upon returning.
  */
-static int
-getsock_cap(struct filedesc *fdp, int fd, cap_rights_t rights,
+int
+getsock_cap(struct thread *td, int fd, cap_rights_t *rightsp,
     struct file **fpp, u_int *fflagp)
 {
 	struct file *fp;
-#ifdef CAPABILITIES
-	struct file *fp_fromcap;
 	int error;
-#endif
 
-	fp = NULL;
-	if ((fdp == NULL) || ((fp = fget_unlocked(fdp, fd)) == NULL))
-		return (EBADF);
-#ifdef CAPABILITIES
-	/*
-	 * If the file descriptor is for a capability, test rights and use
-	 * the file descriptor referenced by the capability.
-	 */
-	error = cap_funwrap(fp, rights, &fp_fromcap);
-	if (error) {
-		fdrop(fp, curthread);
+	error = fget_unlocked(td->td_proc->p_fd, fd, rightsp, 0, &fp, NULL);
+	if (error != 0)
 		return (error);
-	}
-	if (fp != fp_fromcap) {
-		fhold(fp_fromcap);
-		fdrop(fp, curthread);
-		fp = fp_fromcap;
-	}
-#endif /* CAPABILITIES */
 	if (fp->f_type != DTYPE_SOCKET) {
-		fdrop(fp, curthread);
+		fdrop(fp, td);
 		return (ENOTSOCK);
 	}
 	if (fflagp != NULL)
@@ -180,29 +186,42 @@
 		int	protocol;
 	} */ *uap;
 {
-	struct filedesc *fdp;
 	struct socket *so;
 	struct file *fp;
-	int fd, error;
+	int fd, error, type, oflag, fflag;
 
 	AUDIT_ARG_SOCKET(uap->domain, uap->type, uap->protocol);
+
+	type = uap->type;
+	oflag = 0;
+	fflag = 0;
+	if ((type & SOCK_CLOEXEC) != 0) {
+		type &= ~SOCK_CLOEXEC;
+		oflag |= O_CLOEXEC;
+	}
+	if ((type & SOCK_NONBLOCK) != 0) {
+		type &= ~SOCK_NONBLOCK;
+		fflag |= FNONBLOCK;
+	}
+
 #ifdef MAC
-	error = mac_socket_check_create(td->td_ucred, uap->domain, uap->type,
+	error = mac_socket_check_create(td->td_ucred, uap->domain, type,
 	    uap->protocol);
-	if (error)
+	if (error != 0)
 		return (error);
 #endif
-	fdp = td->td_proc->p_fd;
-	error = falloc(td, &fp, &fd, 0);
-	if (error)
+	error = falloc(td, &fp, &fd, oflag);
+	if (error != 0)
 		return (error);
 	/* An extra reference on `fp' has been held for us by falloc(). */
-	error = socreate(uap->domain, &so, uap->type, uap->protocol,
+	error = socreate(uap->domain, &so, type, uap->protocol,
 	    td->td_ucred, td);
-	if (error) {
-		fdclose(fdp, fp, fd, td);
+	if (error != 0) {
+		fdclose(td, fp, fd);
 	} else {
-		finit(fp, FREAD | FWRITE, DTYPE_SOCKET, so, &socketops);
+		finit(fp, FREAD | FWRITE | fflag, DTYPE_SOCKET, so, &socketops);
+		if ((fflag & FNONBLOCK) != 0)
+			(void) fo_ioctl(fp, FIONBIO, &fflag, td->td_ucred, td);
 		td->td_retval[0] = fd;
 	}
 	fdrop(fp, td);
@@ -222,27 +241,27 @@
 	struct sockaddr *sa;
 	int error;
 
-	if ((error = getsockaddr(&sa, uap->name, uap->namelen)) != 0)
-		return (error);
-
-	error = kern_bind(td, uap->s, sa);
-	free(sa, M_SONAME);
+	error = getsockaddr(&sa, uap->name, uap->namelen);
+	if (error == 0) {
+		error = kern_bind(td, uap->s, sa);
+		free(sa, M_SONAME);
+	}
 	return (error);
 }
 
-int
-kern_bind(td, fd, sa)
-	struct thread *td;
-	int fd;
-	struct sockaddr *sa;
+static int
+kern_bindat(struct thread *td, int dirfd, int fd, struct sockaddr *sa)
 {
 	struct socket *so;
 	struct file *fp;
+	cap_rights_t rights;
 	int error;
 
 	AUDIT_ARG_FD(fd);
-	error = getsock_cap(td->td_proc->p_fd, fd, CAP_BIND, &fp, NULL);
-	if (error)
+	AUDIT_ARG_SOCKADDR(td, dirfd, sa);
+	error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_BIND),
+	    &fp, NULL);
+	if (error != 0)
 		return (error);
 	so = fp->f_data;
 #ifdef KTRACE
@@ -251,15 +270,50 @@
 #endif
 #ifdef MAC
 	error = mac_socket_check_bind(td->td_ucred, so, sa);
-	if (error == 0)
+	if (error == 0) {
 #endif
-		error = sobind(so, sa, td);
+		if (dirfd == AT_FDCWD)
+			error = sobind(so, sa, td);
+		else
+			error = sobindat(dirfd, so, sa, td);
+#ifdef MAC
+	}
+#endif
 	fdrop(fp, td);
 	return (error);
 }
 
+int
+kern_bind(struct thread *td, int fd, struct sockaddr *sa)
+{
+
+	return (kern_bindat(td, AT_FDCWD, fd, sa));
+}
+
 /* ARGSUSED */
 int
+sys_bindat(td, uap)
+	struct thread *td;
+	struct bindat_args /* {
+		int	fd;
+		int	s;
+		caddr_t	name;
+		int	namelen;
+	} */ *uap;
+{
+	struct sockaddr *sa;
+	int error;
+
+	error = getsockaddr(&sa, uap->name, uap->namelen);
+	if (error == 0) {
+		error = kern_bindat(td, uap->fd, uap->s, sa);
+		free(sa, M_SONAME);
+	}
+	return (error);
+}
+
+/* ARGSUSED */
+int
 sys_listen(td, uap)
 	struct thread *td;
 	struct listen_args /* {
@@ -269,10 +323,12 @@
 {
 	struct socket *so;
 	struct file *fp;
+	cap_rights_t rights;
 	int error;
 
 	AUDIT_ARG_FD(uap->s);
-	error = getsock_cap(td->td_proc->p_fd, uap->s, CAP_LISTEN, &fp, NULL);
+	error = getsock_cap(td, uap->s, cap_rights_init(&rights, CAP_LISTEN),
+	    &fp, NULL);
 	if (error == 0) {
 		so = fp->f_data;
 #ifdef MAC
@@ -289,14 +345,12 @@
  * accept1()
  */
 static int
-accept1(td, uap, compat)
+accept1(td, s, uname, anamelen, flags)
 	struct thread *td;
-	struct accept_args /* {
-		int	s;
-		struct sockaddr	* __restrict name;
-		socklen_t	* __restrict anamelen;
-	} */ *uap;
-	int compat;
+	int s;
+	struct sockaddr *uname;
+	socklen_t *anamelen;
+	int flags;
 {
 	struct sockaddr *name;
 	socklen_t namelen;
@@ -303,38 +357,37 @@
 	struct file *fp;
 	int error;
 
-	if (uap->name == NULL)
-		return (kern_accept(td, uap->s, NULL, NULL, NULL));
+	if (uname == NULL)
+		return (kern_accept4(td, s, NULL, NULL, flags, NULL));
 
-	error = copyin(uap->anamelen, &namelen, sizeof (namelen));
-	if (error)
+	error = copyin(anamelen, &namelen, sizeof (namelen));
+	if (error != 0)
 		return (error);
 
-	error = kern_accept(td, uap->s, &name, &namelen, &fp);
+	error = kern_accept4(td, s, &name, &namelen, flags, &fp);
 
 	/*
 	 * return a namelen of zero for older code which might
 	 * ignore the return value from accept.
 	 */
-	if (error) {
-		(void) copyout(&namelen,
-		    uap->anamelen, sizeof(*uap->anamelen));
+	if (error != 0) {
+		(void) copyout(&namelen, anamelen, sizeof(*anamelen));
 		return (error);
 	}
 
-	if (error == 0 && name != NULL) {
+	if (error == 0 && uname != NULL) {
 #ifdef COMPAT_OLDSOCK
-		if (compat)
+		if (flags & ACCEPT4_COMPAT)
 			((struct osockaddr *)name)->sa_family =
 			    name->sa_family;
 #endif
-		error = copyout(name, uap->name, namelen);
+		error = copyout(name, uname, namelen);
 	}
 	if (error == 0)
-		error = copyout(&namelen, uap->anamelen,
+		error = copyout(&namelen, anamelen,
 		    sizeof(namelen));
-	if (error)
-		fdclose(td->td_proc->p_fd, fp, td->td_retval[0], td);
+	if (error != 0)
+		fdclose(td, fp, td->td_retval[0]);
 	fdrop(fp, td);
 	free(name, M_SONAME);
 	return (error);
@@ -344,26 +397,28 @@
 kern_accept(struct thread *td, int s, struct sockaddr **name,
     socklen_t *namelen, struct file **fp)
 {
-	struct filedesc *fdp;
+	return (kern_accept4(td, s, name, namelen, ACCEPT4_INHERIT, fp));
+}
+
+int
+kern_accept4(struct thread *td, int s, struct sockaddr **name,
+    socklen_t *namelen, int flags, struct file **fp)
+{
 	struct file *headfp, *nfp = NULL;
 	struct sockaddr *sa = NULL;
-	int error;
 	struct socket *head, *so;
-	int fd;
+	cap_rights_t rights;
 	u_int fflag;
 	pid_t pgid;
-	int tmp;
+	int error, fd, tmp;
 
-	if (name) {
+	if (name != NULL)
 		*name = NULL;
-		if (*namelen < 0)
-			return (EINVAL);
-	}
 
 	AUDIT_ARG_FD(s);
-	fdp = td->td_proc->p_fd;
-	error = getsock_cap(fdp, s, CAP_ACCEPT, &headfp, &fflag);
-	if (error)
+	error = getsock_cap(td, s, cap_rights_init(&rights, CAP_ACCEPT),
+	    &headfp, &fflag);
+	if (error != 0)
 		return (error);
 	head = headfp->f_data;
 	if ((head->so_options & SO_ACCEPTCONN) == 0) {
@@ -375,8 +430,8 @@
 	if (error != 0)
 		goto done;
 #endif
-	error = falloc(td, &nfp, &fd, 0);
-	if (error)
+	error = falloc(td, &nfp, &fd, (flags & SOCK_CLOEXEC) ? O_CLOEXEC : 0);
+	if (error != 0)
 		goto done;
 	ACCEPT_LOCK();
 	if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) {
@@ -391,7 +446,7 @@
 		}
 		error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH,
 		    "accept", 0);
-		if (error) {
+		if (error != 0) {
 			ACCEPT_UNLOCK();
 			goto noconnection;
 		}
@@ -416,7 +471,10 @@
 
 	TAILQ_REMOVE(&head->so_comp, so, so_list);
 	head->so_qlen--;
-	so->so_state |= (head->so_state & SS_NBIO);
+	if (flags & ACCEPT4_INHERIT)
+		so->so_state |= (head->so_state & SS_NBIO);
+	else
+		so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0;
 	so->so_qstate &= ~SQ_COMP;
 	so->so_head = NULL;
 
@@ -429,9 +487,15 @@
 	/* connection has been removed from the listen queue */
 	KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0);
 
-	pgid = fgetown(&head->so_sigio);
-	if (pgid != 0)
-		fsetown(pgid, &so->so_sigio);
+	if (flags & ACCEPT4_INHERIT) {
+		pgid = fgetown(&head->so_sigio);
+		if (pgid != 0)
+			fsetown(pgid, &so->so_sigio);
+	} else {
+		fflag &= ~(FNONBLOCK | FASYNC);
+		if (flags & SOCK_NONBLOCK)
+			fflag |= FNONBLOCK;
+	}
 
 	finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);
 	/* Sync socket nonblocking/async state with file flags */
@@ -441,7 +505,7 @@
 	(void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td);
 	sa = 0;
 	error = soaccept(so, &sa);
-	if (error) {
+	if (error != 0) {
 		/*
 		 * return a namelen of zero for older code which might
 		 * ignore the return value from accept.
@@ -455,6 +519,7 @@
 			*namelen = 0;
 		goto done;
 	}
+	AUDIT_ARG_SOCKADDR(td, AT_FDCWD, sa);
 	if (name) {
 		/* check sa_len before it is destroyed */
 		if (*namelen > sa->sa_len)
@@ -467,15 +532,14 @@
 		sa = NULL;
 	}
 noconnection:
-	if (sa)
-		free(sa, M_SONAME);
+	free(sa, M_SONAME);
 
 	/*
 	 * close the new descriptor, assuming someone hasn't ripped it
 	 * out from under us.
 	 */
-	if (error)
-		fdclose(fdp, nfp, fd, td);
+	if (error != 0)
+		fdclose(td, nfp, fd);
 
 	/*
 	 * Release explicitly held references before returning.  We return
@@ -501,9 +565,21 @@
 	struct accept_args *uap;
 {
 
-	return (accept1(td, uap, 0));
+	return (accept1(td, uap->s, uap->name, uap->anamelen, ACCEPT4_INHERIT));
 }
 
+int
+sys_accept4(td, uap)
+	struct thread *td;
+	struct accept4_args *uap;
+{
+
+	if (uap->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
+		return (EINVAL);
+
+	return (accept1(td, uap->s, uap->name, uap->anamelen, uap->flags));
+}
+
 #ifdef COMPAT_OLDSOCK
 int
 oaccept(td, uap)
@@ -511,7 +587,8 @@
 	struct accept_args *uap;
 {
 
-	return (accept1(td, uap, 1));
+	return (accept1(td, uap->s, uap->name, uap->anamelen,
+	    ACCEPT4_INHERIT | ACCEPT4_COMPAT));
 }
 #endif /* COMPAT_OLDSOCK */
 
@@ -529,29 +606,26 @@
 	int error;
 
 	error = getsockaddr(&sa, uap->name, uap->namelen);
-	if (error)
-		return (error);
-
-	error = kern_connect(td, uap->s, sa);
-	free(sa, M_SONAME);
+	if (error == 0) {
+		error = kern_connect(td, uap->s, sa);
+		free(sa, M_SONAME);
+	}
 	return (error);
 }
 
-
-int
-kern_connect(td, fd, sa)
-	struct thread *td;
-	int fd;
-	struct sockaddr *sa;
+static int
+kern_connectat(struct thread *td, int dirfd, int fd, struct sockaddr *sa)
 {
 	struct socket *so;
 	struct file *fp;
-	int error;
-	int interrupted = 0;
+	cap_rights_t rights;
+	int error, interrupted = 0;
 
 	AUDIT_ARG_FD(fd);
-	error = getsock_cap(td->td_proc->p_fd, fd, CAP_CONNECT, &fp, NULL);
-	if (error)
+	AUDIT_ARG_SOCKADDR(td, dirfd, sa);
+	error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_CONNECT),
+	    &fp, NULL);
+	if (error != 0)
 		return (error);
 	so = fp->f_data;
 	if (so->so_state & SS_ISCONNECTING) {
@@ -564,11 +638,14 @@
 #endif
 #ifdef MAC
 	error = mac_socket_check_connect(td->td_ucred, so, sa);
-	if (error)
+	if (error != 0)
 		goto bad;
 #endif
-	error = soconnect(so, sa, td);
-	if (error)
+	if (dirfd == AT_FDCWD)
+		error = soconnect(so, sa, td);
+	else
+		error = soconnectat(dirfd, so, sa, td);
+	if (error != 0)
 		goto bad;
 	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
 		error = EINPROGRESS;
@@ -578,7 +655,7 @@
 	while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
 		error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH,
 		    "connec", 0);
-		if (error) {
+		if (error != 0) {
 			if (error == EINTR || error == ERESTART)
 				interrupted = 1;
 			break;
@@ -600,41 +677,80 @@
 }
 
 int
+kern_connect(struct thread *td, int fd, struct sockaddr *sa)
+{
+
+	return (kern_connectat(td, AT_FDCWD, fd, sa));
+}
+
+/* ARGSUSED */
+int
+sys_connectat(td, uap)
+	struct thread *td;
+	struct connectat_args /* {
+		int	fd;
+		int	s;
+		caddr_t	name;
+		int	namelen;
+	} */ *uap;
+{
+	struct sockaddr *sa;
+	int error;
+
+	error = getsockaddr(&sa, uap->name, uap->namelen);
+	if (error == 0) {
+		error = kern_connectat(td, uap->fd, uap->s, sa);
+		free(sa, M_SONAME);
+	}
+	return (error);
+}
+
+int
 kern_socketpair(struct thread *td, int domain, int type, int protocol,
     int *rsv)
 {
-	struct filedesc *fdp = td->td_proc->p_fd;
 	struct file *fp1, *fp2;
 	struct socket *so1, *so2;
-	int fd, error;
+	int fd, error, oflag, fflag;
 
 	AUDIT_ARG_SOCKET(domain, type, protocol);
+
+	oflag = 0;
+	fflag = 0;
+	if ((type & SOCK_CLOEXEC) != 0) {
+		type &= ~SOCK_CLOEXEC;
+		oflag |= O_CLOEXEC;
+	}
+	if ((type & SOCK_NONBLOCK) != 0) {
+		type &= ~SOCK_NONBLOCK;
+		fflag |= FNONBLOCK;
+	}
 #ifdef MAC
 	/* We might want to have a separate check for socket pairs. */
 	error = mac_socket_check_create(td->td_ucred, domain, type,
 	    protocol);
-	if (error)
+	if (error != 0)
 		return (error);
 #endif
 	error = socreate(domain, &so1, type, protocol, td->td_ucred, td);
-	if (error)
+	if (error != 0)
 		return (error);
 	error = socreate(domain, &so2, type, protocol, td->td_ucred, td);
-	if (error)
+	if (error != 0)
 		goto free1;
 	/* On success extra reference to `fp1' and 'fp2' is set by falloc. */
-	error = falloc(td, &fp1, &fd, 0);
-	if (error)
+	error = falloc(td, &fp1, &fd, oflag);
+	if (error != 0)
 		goto free2;
 	rsv[0] = fd;
 	fp1->f_data = so1;	/* so1 already has ref count */
-	error = falloc(td, &fp2, &fd, 0);
-	if (error)
+	error = falloc(td, &fp2, &fd, oflag);
+	if (error != 0)
 		goto free3;
 	fp2->f_data = so2;	/* so2 already has ref count */
 	rsv[1] = fd;
 	error = soconnect2(so1, so2);
-	if (error)
+	if (error != 0)
 		goto free4;
 	if (type == SOCK_DGRAM) {
 		/*
@@ -641,19 +757,25 @@
 		 * Datagram socket connection is asymmetric.
 		 */
 		 error = soconnect2(so2, so1);
-		 if (error)
+		 if (error != 0)
 			goto free4;
 	}
-	finit(fp1, FREAD | FWRITE, DTYPE_SOCKET, fp1->f_data, &socketops);
-	finit(fp2, FREAD | FWRITE, DTYPE_SOCKET, fp2->f_data, &socketops);
+	finit(fp1, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp1->f_data,
+	    &socketops);
+	finit(fp2, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp2->f_data,
+	    &socketops);
+	if ((fflag & FNONBLOCK) != 0) {
+		(void) fo_ioctl(fp1, FIONBIO, &fflag, td->td_ucred, td);
+		(void) fo_ioctl(fp2, FIONBIO, &fflag, td->td_ucred, td);
+	}
 	fdrop(fp1, td);
 	fdrop(fp2, td);
 	return (0);
 free4:
-	fdclose(fdp, fp2, rsv[1], td);
+	fdclose(td, fp2, rsv[1]);
 	fdrop(fp2, td);
 free3:
-	fdclose(fdp, fp1, rsv[0], td);
+	fdclose(td, fp1, rsv[0]);
 	fdrop(fp1, td);
 free2:
 	if (so2 != NULL)
@@ -671,10 +793,10 @@
 
 	error = kern_socketpair(td, uap->domain, uap->type,
 	    uap->protocol, sv);
-	if (error)
+	if (error != 0)
 		return (error);
 	error = copyout(sv, uap->rsv, 2 * sizeof(int));
-	if (error) {
+	if (error != 0) {
 		(void)kern_close(td, sv[0]);
 		(void)kern_close(td, sv[1]);
 	}
@@ -699,7 +821,7 @@
 
 	if (mp->msg_name != NULL) {
 		error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
-		if (error) {
+		if (error != 0) {
 			to = NULL;
 			goto bad;
 		}
@@ -719,13 +841,13 @@
 		}
 		error = sockargs(&control, mp->msg_control,
 		    mp->msg_controllen, MT_CONTROL);
-		if (error)
+		if (error != 0)
 			goto bad;
 #ifdef COMPAT_OLDSOCK
 		if (mp->msg_flags == MSG_COMPAT) {
 			struct cmsghdr *cm;
 
-			M_PREPEND(control, sizeof(*cm), M_WAIT);
+			M_PREPEND(control, sizeof(*cm), M_WAITOK);
 			cm = mtod(control, struct cmsghdr *);
 			cm->cmsg_len = control->m_len;
 			cm->cmsg_level = SOL_SOCKET;
@@ -739,8 +861,7 @@
 	error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE);
 
 bad:
-	if (to)
-		free(to, M_SONAME);
+	free(to, M_SONAME);
 	return (error);
 }
 
@@ -757,19 +878,21 @@
 	struct uio auio;
 	struct iovec *iov;
 	struct socket *so;
-	int i, error;
-	ssize_t len;
 	cap_rights_t rights;
 #ifdef KTRACE
 	struct uio *ktruio = NULL;
 #endif
+	ssize_t len;
+	int i, error;
 
 	AUDIT_ARG_FD(s);
-	rights = CAP_WRITE;
-	if (mp->msg_name != NULL)
-		rights |= CAP_CONNECT;
-	error = getsock_cap(td->td_proc->p_fd, s, rights, &fp, NULL);
-	if (error)
+	cap_rights_init(&rights, CAP_SEND);
+	if (mp->msg_name != NULL) {
+		AUDIT_ARG_SOCKADDR(td, AT_FDCWD, mp->msg_name);
+		cap_rights_set(&rights, CAP_CONNECT);
+	}
+	error = getsock_cap(td, s, &rights, &fp, NULL);
+	if (error != 0)
 		return (error);
 	so = (struct socket *)fp->f_data;
 
@@ -781,11 +904,11 @@
 	if (mp->msg_name != NULL) {
 		error = mac_socket_check_connect(td->td_ucred, so,
 		    mp->msg_name);
-		if (error)
+		if (error != 0)
 			goto bad;
 	}
 	error = mac_socket_check_send(td->td_ucred, so);
-	if (error)
+	if (error != 0)
 		goto bad;
 #endif
 
@@ -809,7 +932,7 @@
 #endif
 	len = auio.uio_resid;
 	error = sosend(so, mp->msg_name, &auio, 0, control, flags, td);
-	if (error) {
+	if (error != 0) {
 		if (auio.uio_resid != len && (error == ERESTART ||
 		    error == EINTR || error == EWOULDBLOCK))
 			error = 0;
@@ -848,7 +971,6 @@
 {
 	struct msghdr msg;
 	struct iovec aiov;
-	int error;
 
 	msg.msg_name = uap->to;
 	msg.msg_namelen = uap->tolen;
@@ -860,8 +982,7 @@
 #endif
 	aiov.iov_base = uap->buf;
 	aiov.iov_len = uap->len;
-	error = sendit(td, uap->s, &msg, uap->flags);
-	return (error);
+	return (sendit(td, uap->s, &msg, uap->flags));
 }
 
 #ifdef COMPAT_OLDSOCK
@@ -877,7 +998,6 @@
 {
 	struct msghdr msg;
 	struct iovec aiov;
-	int error;
 
 	msg.msg_name = 0;
 	msg.msg_namelen = 0;
@@ -887,8 +1007,7 @@
 	aiov.iov_len = uap->len;
 	msg.msg_control = 0;
 	msg.msg_flags = 0;
-	error = sendit(td, uap->s, &msg, uap->flags);
-	return (error);
+	return (sendit(td, uap->s, &msg, uap->flags));
 }
 
 int
@@ -905,10 +1024,10 @@
 	int error;
 
 	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
-	if (error)
+	if (error != 0)
 		return (error);
 	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
-	if (error)
+	if (error != 0)
 		return (error);
 	msg.msg_iov = iov;
 	msg.msg_flags = MSG_COMPAT;
@@ -932,10 +1051,10 @@
 	int error;
 
 	error = copyin(uap->msg, &msg, sizeof (msg));
-	if (error)
+	if (error != 0)
 		return (error);
 	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
-	if (error)
+	if (error != 0)
 		return (error);
 	msg.msg_iov = iov;
 #ifdef COMPAT_OLDSOCK
@@ -956,30 +1075,31 @@
 {
 	struct uio auio;
 	struct iovec *iov;
-	int i;
-	ssize_t len;
-	int error;
-	struct mbuf *m, *control = 0;
+	struct mbuf *m, *control = NULL;
 	caddr_t ctlbuf;
 	struct file *fp;
 	struct socket *so;
-	struct sockaddr *fromsa = 0;
+	struct sockaddr *fromsa = NULL;
+	cap_rights_t rights;
 #ifdef KTRACE
 	struct uio *ktruio = NULL;
 #endif
+	ssize_t len;
+	int error, i;
 
 	if (controlp != NULL)
 		*controlp = NULL;
 
 	AUDIT_ARG_FD(s);
-	error = getsock_cap(td->td_proc->p_fd, s, CAP_READ, &fp, NULL);
-	if (error)
+	error = getsock_cap(td, s, cap_rights_init(&rights, CAP_RECV),
+	    &fp, NULL);
+	if (error != 0)
 		return (error);
 	so = fp->f_data;
 
 #ifdef MAC
 	error = mac_socket_check_receive(td->td_ucred, so);
-	if (error) {
+	if (error != 0) {
 		fdrop(fp, td);
 		return (error);
 	}
@@ -1004,14 +1124,16 @@
 		ktruio = cloneuio(&auio);
 #endif
 	len = auio.uio_resid;
-	error = soreceive(so, &fromsa, &auio, (struct mbuf **)0,
-	    (mp->msg_control || controlp) ? &control : (struct mbuf **)0,
+	error = soreceive(so, &fromsa, &auio, NULL,
+	    (mp->msg_control || controlp) ? &control : NULL,
 	    &mp->msg_flags);
-	if (error) {
+	if (error != 0) {
 		if (auio.uio_resid != len && (error == ERESTART ||
 		    error == EINTR || error == EWOULDBLOCK))
 			error = 0;
 	}
+	if (fromsa != NULL)
+		AUDIT_ARG_SOCKADDR(td, AT_FDCWD, fromsa);
 #ifdef KTRACE
 	if (ktruio != NULL) {
 		ktruio->uio_resid = len - auio.uio_resid;
@@ -1018,12 +1140,12 @@
 		ktrgenio(s, UIO_READ, ktruio, error);
 	}
 #endif
-	if (error)
+	if (error != 0)
 		goto out;
 	td->td_retval[0] = len - auio.uio_resid;
 	if (mp->msg_name) {
 		len = mp->msg_namelen;
-		if (len <= 0 || fromsa == 0)
+		if (len <= 0 || fromsa == NULL)
 			len = 0;
 		else {
 			/* save sa_len before it is destroyed by MSG_COMPAT */
@@ -1036,7 +1158,7 @@
 			if (fromseg == UIO_USERSPACE) {
 				error = copyout(fromsa, mp->msg_name,
 				    (unsigned)len);
-				if (error)
+				if (error != 0)
 					goto out;
 			} else
 				bcopy(fromsa, mp->msg_name, len);
@@ -1095,10 +1217,9 @@
 	if (fromsa && KTRPOINT(td, KTR_STRUCT))
 		ktrsockaddr(fromsa);
 #endif
-	if (fromsa)
-		free(fromsa, M_SONAME);
+	free(fromsa, M_SONAME);
 
-	if (error == 0 && controlp != NULL)  
+	if (error == 0 && controlp != NULL)
 		*controlp = control;
 	else  if (control)
 		m_freem(control);
@@ -1116,9 +1237,9 @@
 	int error;
 
 	error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL);
-	if (error)
+	if (error != 0)
 		return (error);
-	if (namelenp) {
+	if (namelenp != NULL) {
 		error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t));
 #ifdef COMPAT_OLDSOCK
 		if (mp->msg_flags & MSG_COMPAT)
@@ -1147,7 +1268,7 @@
 	if (uap->fromlenaddr) {
 		error = copyin(uap->fromlenaddr,
 		    &msg.msg_namelen, sizeof (msg.msg_namelen));
-		if (error)
+		if (error != 0)
 			goto done2;
 	} else {
 		msg.msg_namelen = 0;
@@ -1161,7 +1282,7 @@
 	msg.msg_flags = uap->flags;
 	error = recvit(td, uap->s, &msg, uap->fromlenaddr);
 done2:
-	return(error);
+	return (error);
 }
 
 #ifdef COMPAT_OLDSOCK
@@ -1189,7 +1310,6 @@
 {
 	struct msghdr msg;
 	struct iovec aiov;
-	int error;
 
 	msg.msg_name = 0;
 	msg.msg_namelen = 0;
@@ -1199,8 +1319,7 @@
 	aiov.iov_len = uap->len;
 	msg.msg_control = 0;
 	msg.msg_flags = uap->flags;
-	error = recvit(td, uap->s, &msg, NULL);
-	return (error);
+	return (recvit(td, uap->s, &msg, NULL));
 }
 
 /*
@@ -1222,10 +1341,10 @@
 	int error;
 
 	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
-	if (error)
+	if (error != 0)
 		return (error);
 	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
-	if (error)
+	if (error != 0)
 		return (error);
 	msg.msg_flags = uap->flags | MSG_COMPAT;
 	msg.msg_iov = iov;
@@ -1252,10 +1371,10 @@
 	int error;
 
 	error = copyin(uap->msg, &msg, sizeof (msg));
-	if (error)
+	if (error != 0)
 		return (error);
 	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
-	if (error)
+	if (error != 0)
 		return (error);
 	msg.msg_flags = uap->flags;
 #ifdef COMPAT_OLDSOCK
@@ -1283,11 +1402,12 @@
 {
 	struct socket *so;
 	struct file *fp;
+	cap_rights_t rights;
 	int error;
 
 	AUDIT_ARG_FD(uap->s);
-	error = getsock_cap(td->td_proc->p_fd, uap->s, CAP_SHUTDOWN, &fp,
-	    NULL);
+	error = getsock_cap(td, uap->s, cap_rights_init(&rights, CAP_SHUTDOWN),
+	    &fp, NULL);
 	if (error == 0) {
 		so = fp->f_data;
 		error = soshutdown(so, uap->how);
@@ -1323,10 +1443,11 @@
 	enum uio_seg valseg;
 	socklen_t valsize;
 {
-	int error;
 	struct socket *so;
 	struct file *fp;
 	struct sockopt sopt;
+	cap_rights_t rights;
+	int error;
 
 	if (val == NULL && valsize != 0)
 		return (EFAULT);
@@ -1350,7 +1471,8 @@
 	}
 
 	AUDIT_ARG_FD(s);
-	error = getsock_cap(td->td_proc->p_fd, s, CAP_SETSOCKOPT, &fp, NULL);
+	error = getsock_cap(td, s, cap_rights_init(&rights, CAP_SETSOCKOPT),
+	    &fp, NULL);
 	if (error == 0) {
 		so = fp->f_data;
 		error = sosetopt(so, &sopt);
@@ -1372,11 +1494,11 @@
 	} */ *uap;
 {
 	socklen_t valsize;
-	int	error;
+	int error;
 
 	if (uap->val) {
 		error = copyin(uap->avalsize, &valsize, sizeof (valsize));
-		if (error)
+		if (error != 0)
 			return (error);
 	}
 
@@ -1402,10 +1524,11 @@
 	enum uio_seg valseg;
 	socklen_t *valsize;
 {
+	struct socket *so;
+	struct file *fp;
+	struct sockopt sopt;
+	cap_rights_t rights;
 	int error;
-	struct  socket *so;
-	struct file *fp;
-	struct	sockopt sopt;
 
 	if (val == NULL)
 		*valsize = 0;
@@ -1429,7 +1552,8 @@
 	}
 
 	AUDIT_ARG_FD(s);
-	error = getsock_cap(td->td_proc->p_fd, s, CAP_GETSOCKOPT, &fp, NULL);
+	error = getsock_cap(td, s, cap_rights_init(&rights, CAP_GETSOCKOPT),
+	    &fp, NULL);
 	if (error == 0) {
 		so = fp->f_data;
 		error = sogetopt(so, &sopt);
@@ -1458,11 +1582,11 @@
 	int error;
 
 	error = copyin(uap->alen, &len, sizeof(len));
-	if (error)
+	if (error != 0)
 		return (error);
 
 	error = kern_getsockname(td, uap->fdes, &sa, &len);
-	if (error)
+	if (error != 0)
 		return (error);
 
 	if (len != 0) {
@@ -1484,15 +1608,14 @@
 {
 	struct socket *so;
 	struct file *fp;
+	cap_rights_t rights;
 	socklen_t len;
 	int error;
 
-	if (*alen < 0)
-		return (EINVAL);
-
 	AUDIT_ARG_FD(fd);
-	error = getsock_cap(td->td_proc->p_fd, fd, CAP_GETSOCKNAME, &fp, NULL);
-	if (error)
+	error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_GETSOCKNAME),
+	    &fp, NULL);
+	if (error != 0)
 		return (error);
 	so = fp->f_data;
 	*sa = NULL;
@@ -1499,7 +1622,7 @@
 	CURVNET_SET(so->so_vnet);
 	error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa);
 	CURVNET_RESTORE();
-	if (error)
+	if (error != 0)
 		goto bad;
 	if (*sa == NULL)
 		len = 0;
@@ -1512,7 +1635,7 @@
 #endif
 bad:
 	fdrop(fp, td);
-	if (error && *sa) {
+	if (error != 0 && *sa != NULL) {
 		free(*sa, M_SONAME);
 		*sa = NULL;
 	}
@@ -1558,11 +1681,11 @@
 	int error;
 
 	error = copyin(uap->alen, &len, sizeof (len));
-	if (error)
+	if (error != 0)
 		return (error);
 
 	error = kern_getpeername(td, uap->fdes, &sa, &len);
-	if (error)
+	if (error != 0)
 		return (error);
 
 	if (len != 0) {
@@ -1584,15 +1707,14 @@
 {
 	struct socket *so;
 	struct file *fp;
+	cap_rights_t rights;
 	socklen_t len;
 	int error;
 
-	if (*alen < 0)
-		return (EINVAL);
-
 	AUDIT_ARG_FD(fd);
-	error = getsock_cap(td->td_proc->p_fd, fd, CAP_GETPEERNAME, &fp, NULL);
-	if (error)
+	error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_GETPEERNAME),
+	    &fp, NULL);
+	if (error != 0)
 		return (error);
 	so = fp->f_data;
 	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
@@ -1603,7 +1725,7 @@
 	CURVNET_SET(so->so_vnet);
 	error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa);
 	CURVNET_RESTORE();
-	if (error)
+	if (error != 0)
 		goto bad;
 	if (*sa == NULL)
 		len = 0;
@@ -1615,7 +1737,7 @@
 		ktrsockaddr(*sa);
 #endif
 bad:
-	if (error && *sa) {
+	if (error != 0 && *sa != NULL) {
 		free(*sa, M_SONAME);
 		*sa = NULL;
 	}
@@ -1646,27 +1768,31 @@
 #endif /* COMPAT_OLDSOCK */
 
 int
-sockargs(struct mbuf **mp, char *buf, int buflen, int type)
+sockargs(mp, buf, buflen, type)
+	struct mbuf **mp;
+	caddr_t buf;
+	int buflen, type;
 {
 	struct sockaddr *sa;
 	struct mbuf *m;
 	int error;
 
-	if ((u_int)buflen > MLEN) {
+	if (buflen < 0)
+		return (EINVAL);
+
+	if (buflen > MLEN) {
 #ifdef COMPAT_OLDSOCK
-		if (type == MT_SONAME && (u_int)buflen <= 112)
+		if (type == MT_SONAME && buflen <= 112)
 			buflen = MLEN;		/* unix domain compat. hack */
 		else
 #endif
-			if ((u_int)buflen > MCLBYTES)
+			if (buflen > MCLBYTES)
 				return (EINVAL);
 	}
-	m = m_get(M_WAIT, type);
-	if ((u_int)buflen > MLEN)
-		MCLGET(m, M_WAIT);
+	m = m_get2(buflen, M_WAITOK, type, 0);
 	m->m_len = buflen;
-	error = copyin(buf, mtod(m, void *), buflen);
-	if (error)
+	error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
+	if (error != 0)
 		(void) m_free(m);
 	else {
 		*mp = m;
@@ -1698,7 +1824,7 @@
 		return (EINVAL);
 	sa = malloc(len, M_SONAME, M_WAITOK);
 	error = copyin(uaddr, sa, len);
-	if (error) {
+	if (error != 0) {
 		free(sa, M_SONAME);
 	} else {
 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
@@ -1711,19 +1837,17 @@
 	return (error);
 }
 
-#include <sys/condvar.h>
-
 struct sendfile_sync {
 	struct mtx	mtx;
 	struct cv	cv;
-	unsigned 	count;
+	unsigned	count;
 };
 
 /*
  * Detach mapped page and release resources back to the system.
  */
-void
-sf_buf_mext(void *addr, void *args)
+int
+sf_buf_mext(struct mbuf *mb, void *addr, void *args)
 {
 	vm_page_t m;
 	struct sendfile_sync *sfs;
@@ -1741,7 +1865,7 @@
 		vm_page_free(m);
 	vm_page_unlock(m);
 	if (addr == NULL)
-		return;
+		return (EXT_FREE_OK);
 	sfs = addr;
 	mtx_lock(&sfs->mtx);
 	KASSERT(sfs->count> 0, ("Sendfile sync botchup count == 0"));
@@ -1748,6 +1872,7 @@
 	if (--sfs->count == 0)
 		cv_signal(&sfs->cv);
 	mtx_unlock(&sfs->mtx);
+	return (EXT_FREE_OK);
 }
 
 /*
@@ -1773,33 +1898,54 @@
 {
 	struct sf_hdtr hdtr;
 	struct uio *hdr_uio, *trl_uio;
+	struct file *fp;
+	cap_rights_t rights;
 	int error;
 
+	/*
+	 * File offset must be non-negative.  If it goes beyond EOF
+	 * we send only the header/trailer and no payload data.
+	 */
+	if (uap->offset < 0)
+		return (EINVAL);
+
 	hdr_uio = trl_uio = NULL;
 
 	if (uap->hdtr != NULL) {
 		error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
-		if (error)
+		if (error != 0)
 			goto out;
 		if (hdtr.headers != NULL) {
 			error = copyinuio(hdtr.headers, hdtr.hdr_cnt, &hdr_uio);
-			if (error)
+			if (error != 0)
 				goto out;
 		}
 		if (hdtr.trailers != NULL) {
 			error = copyinuio(hdtr.trailers, hdtr.trl_cnt, &trl_uio);
-			if (error)
+			if (error != 0)
 				goto out;
 
 		}
 	}
 
-	error = kern_sendfile(td, uap, hdr_uio, trl_uio, compat);
+	AUDIT_ARG_FD(uap->fd);
+
+	/*
+	 * sendfile(2) can start at any offset within a file so we require
+	 * CAP_READ+CAP_SEEK = CAP_PREAD.
+	 */
+	if ((error = fget_read(td, uap->fd,
+	    cap_rights_init(&rights, CAP_PREAD), &fp)) != 0) {
+		goto out;
+	}
+
+	error = fo_sendfile(fp, uap->s, hdr_uio, trl_uio, uap->offset,
+	    uap->nbytes, uap->sbytes, uap->flags, compat ? SFK_COMPAT : 0, td);
+	fdrop(fp, td);
+
 out:
-	if (hdr_uio)
-		free(hdr_uio, M_IOV);
-	if (trl_uio)
-		free(trl_uio, M_IOV);
+	free(hdr_uio, M_IOV);
+	free(trl_uio, M_IOV);
 	return (error);
 }
 
@@ -1821,103 +1967,253 @@
 }
 #endif /* COMPAT_FREEBSD4 */
 
-int
-kern_sendfile(struct thread *td, struct sendfile_args *uap,
-    struct uio *hdr_uio, struct uio *trl_uio, int compat)
+static int
+sendfile_readpage(vm_object_t obj, struct vnode *vp, int nd,
+    off_t off, int xfsize, int bsize, struct thread *td, vm_page_t *res)
 {
-	struct file *sock_fp;
+	vm_page_t m;
+	vm_pindex_t pindex;
+	ssize_t resid;
+	int error, readahead, rv;
+
+	pindex = OFF_TO_IDX(off);
+	VM_OBJECT_WLOCK(obj);
+	m = vm_page_grab(obj, pindex, (vp != NULL ? VM_ALLOC_NOBUSY |
+	    VM_ALLOC_IGN_SBUSY : 0) | VM_ALLOC_WIRED | VM_ALLOC_NORMAL);
+
+	/*
+	 * Check if page is valid for what we need, otherwise initiate I/O.
+	 *
+	 * A non-zero nd argument prevents disk I/O; instead we
+	 * return to the caller the value it passed in nd.  In particular,
+	 * if we already turned some pages into mbufs, nd == EAGAIN
+	 * and the main function sends those pages off before we come
+	 * here again and block.
+	 */
+	if (m->valid != 0 && vm_page_is_valid(m, off & PAGE_MASK, xfsize)) {
+		if (vp == NULL)
+			vm_page_xunbusy(m);
+		VM_OBJECT_WUNLOCK(obj);
+		*res = m;
+		return (0);
+	} else if (nd != 0) {
+		if (vp == NULL)
+			vm_page_xunbusy(m);
+		error = nd;
+		goto free_page;
+	}
+
+	/*
+	 * Get the page from backing store.
+	 */
+	error = 0;
+	if (vp != NULL) {
+		VM_OBJECT_WUNLOCK(obj);
+		readahead = sfreadahead * MAXBSIZE;
+
+		/*
+		 * Use vn_rdwr() instead of the pager interface for
+		 * the vnode, to allow the read-ahead.
+		 *
+		 * XXXMAC: Because we don't have fp->f_cred here, we
+		 * pass in NOCRED.  This is probably wrong, but is
+		 * consistent with our original implementation.
+		 */
+		error = vn_rdwr(UIO_READ, vp, NULL, readahead, trunc_page(off),
+		    UIO_NOCOPY, IO_NODELOCKED | IO_VMIO | ((readahead /
+		    bsize) << IO_SEQSHIFT), td->td_ucred, NOCRED, &resid, td);
+		SFSTAT_INC(sf_iocnt);
+		VM_OBJECT_WLOCK(obj);
+	} else {
+		if (vm_pager_has_page(obj, pindex, NULL, NULL)) {
+			rv = vm_pager_get_pages(obj, &m, 1, 0);
+			SFSTAT_INC(sf_iocnt);
+			m = vm_page_lookup(obj, pindex);
+			if (m == NULL)
+				error = EIO;
+			else if (rv != VM_PAGER_OK) {
+				vm_page_lock(m);
+				vm_page_free(m);
+				vm_page_unlock(m);
+				m = NULL;
+				error = EIO;
+			}
+		} else {
+			pmap_zero_page(m);
+			m->valid = VM_PAGE_BITS_ALL;
+			m->dirty = 0;
+		}
+		if (m != NULL)
+			vm_page_xunbusy(m);
+	}
+	if (error == 0) {
+		*res = m;
+	} else if (m != NULL) {
+free_page:
+		vm_page_lock(m);
+		vm_page_unwire(m, 0);
+
+		/*
+		 * See if anyone else might know about this page.  If
+		 * not and it is not valid, then free it.
+		 */
+		if (m->wire_count == 0 && m->valid == 0 && !vm_page_busied(m))
+			vm_page_free(m);
+		vm_page_unlock(m);
+	}
+	KASSERT(error != 0 || (m->wire_count > 0 &&
+	    vm_page_is_valid(m, off & PAGE_MASK, xfsize)),
+	    ("wrong page state m %p off %#jx xfsize %d", m, (uintmax_t)off,
+	    xfsize));
+	VM_OBJECT_WUNLOCK(obj);
+	return (error);
+}
+
+static int
+sendfile_getobj(struct thread *td, struct file *fp, vm_object_t *obj_res,
+    struct vnode **vp_res, struct shmfd **shmfd_res, off_t *obj_size,
+    int *bsize)
+{
+	struct vattr va;
+	vm_object_t obj;
 	struct vnode *vp;
-	struct vm_object *obj = NULL;
-	struct socket *so = NULL;
-	struct mbuf *m = NULL;
-	struct sf_buf *sf;
-	struct vm_page *pg;
-	struct vattr va;
-	off_t off, xfsize, fsbytes = 0, sbytes = 0, rem = 0;
-	int error, hdrlen = 0, mnw = 0;
-	int vfslocked;
-	int bsize;
-	struct sendfile_sync *sfs = NULL;
+	struct shmfd *shmfd;
+	int error;
 
+	vp = *vp_res = NULL;
+	obj = NULL;
+	shmfd = *shmfd_res = NULL;
+	*bsize = 0;
+
 	/*
 	 * The file descriptor must be a regular file and have a
 	 * backing VM object.
-	 * File offset must be positive.  If it goes beyond EOF
-	 * we send only the header/trailer and no payload data.
 	 */
-	AUDIT_ARG_FD(uap->fd);
-	if ((error = fgetvp_read(td, uap->fd, CAP_READ, &vp)) != 0)
-		goto out;
-	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
-	vn_lock(vp, LK_SHARED | LK_RETRY);
-	if (vp->v_type == VREG) {
-		bsize = vp->v_mount->mnt_stat.f_iosize;
-		if (uap->nbytes == 0) {
-			error = VOP_GETATTR(vp, &va, td->td_ucred);
-			if (error != 0) {
-				VOP_UNLOCK(vp, 0);
-				VFS_UNLOCK_GIANT(vfslocked);
-				obj = NULL;
-				goto out;
-			}
-			rem = va.va_size;
-		} else
-			rem = uap->nbytes;
+	if (fp->f_type == DTYPE_VNODE) {
+		vp = fp->f_vnode;
+		vn_lock(vp, LK_SHARED | LK_RETRY);
+		if (vp->v_type != VREG) {
+			error = EINVAL;
+			goto out;
+		}
+		*bsize = vp->v_mount->mnt_stat.f_iosize;
+		error = VOP_GETATTR(vp, &va, td->td_ucred);
+		if (error != 0)
+			goto out;
+		*obj_size = va.va_size;
 		obj = vp->v_object;
-		if (obj != NULL) {
-			/*
-			 * Temporarily increase the backing VM
-			 * object's reference count so that a forced
-			 * reclamation of its vnode does not
-			 * immediately destroy it.
-			 */
-			VM_OBJECT_LOCK(obj);
-			if ((obj->flags & OBJ_DEAD) == 0) {
-				vm_object_reference_locked(obj);
-				VM_OBJECT_UNLOCK(obj);
-			} else {
-				VM_OBJECT_UNLOCK(obj);
-				obj = NULL;
-			}
+		if (obj == NULL) {
+			error = EINVAL;
+			goto out;
 		}
-	} else
-		bsize = 0;	/* silence gcc */
-	VOP_UNLOCK(vp, 0);
-	VFS_UNLOCK_GIANT(vfslocked);
-	if (obj == NULL) {
+	} else if (fp->f_type == DTYPE_SHM) {
+		error = 0;
+		shmfd = fp->f_data;
+		obj = shmfd->shm_object;
+		*obj_size = shmfd->shm_size;
+	} else {
 		error = EINVAL;
 		goto out;
 	}
-	if (uap->offset < 0) {
-		error = EINVAL;
+
+	VM_OBJECT_WLOCK(obj);
+	if ((obj->flags & OBJ_DEAD) != 0) {
+		VM_OBJECT_WUNLOCK(obj);
+		error = EBADF;
 		goto out;
 	}
 
 	/*
+	 * Temporarily increase the backing VM object's reference
+	 * count so that a forced reclamation of its vnode does not
+	 * immediately destroy it.
+	 */
+	vm_object_reference_locked(obj);
+	VM_OBJECT_WUNLOCK(obj);
+	*obj_res = obj;
+	*vp_res = vp;
+	*shmfd_res = shmfd;
+
+out:
+	if (vp != NULL)
+		VOP_UNLOCK(vp, 0);
+	return (error);
+}
+
+static int
+kern_sendfile_getsock(struct thread *td, int s, struct file **sock_fp,
+    struct socket **so)
+{
+	cap_rights_t rights;
+	int error;
+
+	*sock_fp = NULL;
+	*so = NULL;
+
+	/*
 	 * The socket must be a stream socket and connected.
-	 * Remember if it a blocking or non-blocking socket.
 	 */
-	if ((error = getsock_cap(td->td_proc->p_fd, uap->s, CAP_WRITE,
-	    &sock_fp, NULL)) != 0)
+	error = getsock_cap(td, s, cap_rights_init(&rights, CAP_SEND),
+	    sock_fp, NULL);
+	if (error != 0)
+		return (error);
+	*so = (*sock_fp)->f_data;
+	if ((*so)->so_type != SOCK_STREAM)
+		return (EINVAL);
+	if (((*so)->so_state & SS_ISCONNECTED) == 0)
+		return (ENOTCONN);
+	return (0);
+}
+
+int
+vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
+    struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
+    int kflags, struct thread *td)
+{
+	struct file *sock_fp;
+	struct vnode *vp;
+	struct vm_object *obj;
+	struct socket *so;
+	struct mbuf *m;
+	struct sf_buf *sf;
+	struct vm_page *pg;
+	struct shmfd *shmfd;
+	struct sendfile_sync *sfs;
+	struct vattr va;
+	off_t off, xfsize, fsbytes, sbytes, rem, obj_size;
+	int error, bsize, nd, hdrlen, mnw;
+	bool inflight_called;
+
+	pg = NULL;
+	obj = NULL;
+	so = NULL;
+	m = NULL;
+	sfs = NULL;
+	fsbytes = sbytes = 0;
+	hdrlen = mnw = 0;
+	rem = nbytes;
+	obj_size = 0;
+	inflight_called = false;
+
+	error = sendfile_getobj(td, fp, &obj, &vp, &shmfd, &obj_size, &bsize);
+	if (error != 0)
+		return (error);
+	if (rem == 0)
+		rem = obj_size;
+
+	error = kern_sendfile_getsock(td, sockfd, &sock_fp, &so);
+	if (error != 0)
 		goto out;
-	so = sock_fp->f_data;
-	if (so->so_type != SOCK_STREAM) {
-		error = EINVAL;
-		goto out;
-	}
-	if ((so->so_state & SS_ISCONNECTED) == 0) {
-		error = ENOTCONN;
-		goto out;
-	}
+
 	/*
 	 * Do not wait on memory allocations but return ENOMEM for
 	 * caller to retry later.
 	 * XXX: Experimental.
 	 */
-	if (uap->flags & SF_MNOWAIT)
+	if (flags & SF_MNOWAIT)
 		mnw = 1;
 
-	if (uap->flags & SF_SYNC) {
+	if (flags & SF_SYNC) {
 		sfs = malloc(sizeof *sfs, M_TEMP, M_WAITOK | M_ZERO);
 		mtx_init(&sfs->mtx, "sendfile", NULL, MTX_DEF);
 		cv_init(&sfs->cv, "sendfile");
@@ -1925,7 +2221,7 @@
 
 #ifdef MAC
 	error = mac_socket_check_send(td->td_ucred, so);
-	if (error)
+	if (error != 0)
 		goto out;
 #endif
 
@@ -1939,11 +2235,11 @@
 			 * the header.  If compat is specified subtract the
 			 * header size from nbytes.
 			 */
-			if (compat) {
-				if (uap->nbytes > hdr_uio->uio_resid)
-					uap->nbytes -= hdr_uio->uio_resid;
+			if (kflags & SFK_COMPAT) {
+				if (nbytes > hdr_uio->uio_resid)
+					nbytes -= hdr_uio->uio_resid;
 				else
-					uap->nbytes = 0;
+					nbytes = 0;
 			}
 			m = m_uiotombuf(hdr_uio, (mnw ? M_NOWAIT : M_WAITOK),
 			    0, 0, 0);
@@ -1974,14 +2270,14 @@
 	 * The outer loop checks the state and available space of the socket
 	 * and takes care of the overall progress.
 	 */
-	for (off = uap->offset; ; ) {
+	for (off = offset; ; ) {
 		struct mbuf *mtail;
 		int loopbytes;
 		int space;
 		int done;
 
-		if ((uap->nbytes != 0 && uap->nbytes == fsbytes) ||
-		    (uap->nbytes == 0 && va.va_size == fsbytes))
+		if ((nbytes != 0 && nbytes == fsbytes) ||
+		    (nbytes == 0 && obj_size == fsbytes))
 			break;
 
 		mtail = NULL;
@@ -2040,7 +2336,7 @@
 			 * been interrupted by a signal. If we've sent anything
 			 * then return bytes sent, otherwise return the error.
 			 */
-			if (error) {
+			if (error != 0) {
 				SOCKBUF_UNLOCK(&so->so_snd);
 				goto done;
 			}
@@ -2055,19 +2351,17 @@
 		 */
 		space -= hdrlen;
 
-		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
-		error = vn_lock(vp, LK_SHARED);
-		if (error != 0) {
-			VFS_UNLOCK_GIANT(vfslocked);
-			goto done;
+		if (vp != NULL) {
+			error = vn_lock(vp, LK_SHARED);
+			if (error != 0)
+				goto done;
+			error = VOP_GETATTR(vp, &va, td->td_ucred);
+			if (error != 0 || off >= va.va_size) {
+				VOP_UNLOCK(vp, 0);
+				goto done;
+			}
+			obj_size = va.va_size;
 		}
-		error = VOP_GETATTR(vp, &va, td->td_ucred);
-		if (error != 0 || off >= va.va_size) {
-			VOP_UNLOCK(vp, 0);
-			VFS_UNLOCK_GIANT(vfslocked);
-			goto done;
-		}
-		VFS_UNLOCK_GIANT(vfslocked);
 
 		/*
 		 * Loop and construct maximum sized mbuf chain to be bulk
@@ -2074,7 +2368,6 @@
 		 * dumped into socket buffer.
 		 */
 		while (space > loopbytes) {
-			vm_pindex_t pindex;
 			vm_offset_t pgoff;
 			struct mbuf *m0;
 
@@ -2084,11 +2377,10 @@
 			 * or the passed in nbytes.
 			 */
 			pgoff = (vm_offset_t)(off & PAGE_MASK);
-			if (uap->nbytes != 0)
-				rem = (uap->nbytes - fsbytes - loopbytes);
-			else
-				rem = va.va_size -
-				    uap->offset - fsbytes - loopbytes;
+			rem = obj_size - offset;
+			if (nbytes != 0)
+				rem = omin(rem, nbytes);
+			rem -= fsbytes + loopbytes;
 			xfsize = omin(PAGE_SIZE - pgoff, rem);
 			xfsize = omin(space - loopbytes, xfsize);
 			if (xfsize <= 0) {
@@ -2100,66 +2392,15 @@
 			 * Attempt to look up the page.  Allocate
 			 * if not found or wait and loop if busy.
 			 */
-			pindex = OFF_TO_IDX(off);
-			VM_OBJECT_LOCK(obj);
-			pg = vm_page_grab(obj, pindex, VM_ALLOC_NOBUSY |
-			    VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_RETRY);
-
-			/*
-			 * Check if page is valid for what we need,
-			 * otherwise initiate I/O.
-			 * If we already turned some pages into mbufs,
-			 * send them off before we come here again and
-			 * block.
-			 */
-			if (pg->valid && vm_page_is_valid(pg, pgoff, xfsize))
-				VM_OBJECT_UNLOCK(obj);
-			else if (m != NULL)
-				error = EAGAIN;	/* send what we already got */
-			else if (uap->flags & SF_NODISKIO)
-				error = EBUSY;
-			else {
-				ssize_t resid;
-
-				/*
-				 * Ensure that our page is still around
-				 * when the I/O completes.
-				 */
-				vm_page_io_start(pg);
-				VM_OBJECT_UNLOCK(obj);
-
-				/*
-				 * Get the page from backing store.
-				 * XXXMAC: Because we don't have fp->f_cred
-				 * here, we pass in NOCRED.  This is probably
-				 * wrong, but is consistent with our original
-				 * implementation.
-				 */
-				vfslocked = VFS_LOCK_GIANT(vp->v_mount);
-				error = vn_rdwr(UIO_READ, vp, NULL, MAXBSIZE,
-				    trunc_page(off), UIO_NOCOPY, IO_NODELOCKED |
-				    IO_VMIO | ((MAXBSIZE / bsize) << IO_SEQSHIFT),
-				    td->td_ucred, NOCRED, &resid, td);
-				VFS_UNLOCK_GIANT(vfslocked);
-				VM_OBJECT_LOCK(obj);
-				vm_page_io_finish(pg);
-				if (!error)
-					VM_OBJECT_UNLOCK(obj);
-				mbstat.sf_iocnt++;
-			}
-			if (error) {
-				vm_page_lock(pg);
-				vm_page_unwire(pg, 0);
-				/*
-				 * See if anyone else might know about
-				 * this page.  If not and it is not valid,
-				 * then free it.
-				 */
-				if (pg->wire_count == 0 && pg->valid == 0 &&
-				    pg->busy == 0 && !(pg->oflags & VPO_BUSY))
-					vm_page_free(pg);
-				vm_page_unlock(pg);
-				VM_OBJECT_UNLOCK(obj);
+			if (m != NULL)
+				nd = EAGAIN; /* send what we already got */
+			else if ((flags & SF_NODISKIO) != 0)
+				nd = EBUSY;
+			else
+				nd = 0;
+			error = sendfile_readpage(obj, vp, nd, off,
+			    xfsize, bsize, td, &pg);
+			if (error != 0) {
 				if (error == EAGAIN)
 					error = 0;	/* not a real error */
 				break;
@@ -2177,11 +2418,11 @@
 			sf = sf_buf_alloc(pg, (mnw || m != NULL) ? SFB_NOWAIT :
 			    SFB_CATCH);
 			if (sf == NULL) {
-				mbstat.sf_allocfail++;
+				SFSTAT_INC(sf_allocfail);
 				vm_page_lock(pg);
 				vm_page_unwire(pg, 0);
 				KASSERT(pg->object != NULL,
-				    ("kern_sendfile: object disappeared"));
+				    ("%s: object disappeared", __func__));
 				vm_page_unlock(pg);
 				if (m == NULL)
 					error = (mnw ? EAGAIN : EINTR);
@@ -2195,11 +2436,17 @@
 			m0 = m_get((mnw ? M_NOWAIT : M_WAITOK), MT_DATA);
 			if (m0 == NULL) {
 				error = (mnw ? EAGAIN : ENOBUFS);
-				sf_buf_mext(NULL, sf);
+				(void)sf_buf_mext(NULL, NULL, sf);
 				break;
 			}
-			MEXTADD(m0, sf_buf_kva(sf), PAGE_SIZE, sf_buf_mext,
-			    sfs, sf, M_RDONLY, EXT_SFBUF);
+			if (m_extadd(m0, (caddr_t )sf_buf_kva(sf), PAGE_SIZE,
+			    sf_buf_mext, sfs, sf, M_RDONLY, EXT_SFBUF,
+			    (mnw ? M_NOWAIT : M_WAITOK)) != 0) {
+				error = (mnw ? EAGAIN : ENOBUFS);
+				(void)sf_buf_mext(NULL, NULL, sf);
+				m_freem(m0);
+				break;
+			}
 			m0->m_data = (char *)sf_buf_kva(sf) + pgoff;
 			m0->m_len = xfsize;
 
@@ -2223,7 +2470,8 @@
 			}
 		}
 
-		VOP_UNLOCK(vp, 0);
+		if (vp != NULL)
+			VOP_UNLOCK(vp, 0);
 
 		/* Add the buffer chain to the socket buffer. */
 		if (m != NULL) {
@@ -2264,9 +2512,9 @@
 		}
 
 		/* Quit outer loop on error or when we're done. */
-		if (done) 
+		if (done)
 			break;
-		if (error)
+		if (error != 0)
 			goto done;
 	}
 
@@ -2275,7 +2523,7 @@
 	 */
 	if (trl_uio != NULL) {
 		sbunlock(&so->so_snd);
-		error = kern_writev(td, uap->s, trl_uio);
+		error = kern_writev(td, sockfd, trl_uio);
 		if (error == 0)
 			sbytes += td->td_retval[0];
 		goto out;
@@ -2291,16 +2539,11 @@
 	if (error == 0) {
 		td->td_retval[0] = 0;
 	}
-	if (uap->sbytes != NULL) {
-		copyout(&sbytes, uap->sbytes, sizeof(off_t));
+	if (sent != NULL) {
+		copyout(&sbytes, sent, sizeof(off_t));
 	}
 	if (obj != NULL)
 		vm_object_deallocate(obj);
-	if (vp != NULL) {
-		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
-		vrele(vp);
-		VFS_UNLOCK_GIANT(vfslocked);
-	}
 	if (so)
 		fdrop(sock_fp, td);
 	if (m)
@@ -2321,493 +2564,3 @@
 
 	return (error);
 }
-
-/*
- * SCTP syscalls.
- * Functionality only compiled in if SCTP is defined in the kernel Makefile,
- * otherwise all return EOPNOTSUPP.
- * XXX: We should make this loadable one day.
- */
-int
-sys_sctp_peeloff(td, uap)
-	struct thread *td;
-	struct sctp_peeloff_args /* {
-		int	sd;
-		caddr_t	name;
-	} */ *uap;
-{
-#if (defined(INET) || defined(INET6)) && defined(SCTP)
-	struct filedesc *fdp;
-	struct file *nfp = NULL;
-	int error;
-	struct socket *head, *so;
-	int fd;
-	u_int fflag;
-
-	fdp = td->td_proc->p_fd;
-	AUDIT_ARG_FD(uap->sd);
-	error = fgetsock(td, uap->sd, CAP_PEELOFF, &head, &fflag);
-	if (error)
-		goto done2;
-	if (head->so_proto->pr_protocol != IPPROTO_SCTP) {
-		error = EOPNOTSUPP;
-		goto done;
-	}
-	error = sctp_can_peel_off(head, (sctp_assoc_t)uap->name);
-	if (error)
-		goto done;
-	/*
-	 * At this point we know we do have a assoc to pull
-	 * we proceed to get the fd setup. This may block
-	 * but that is ok.
-	 */
-
-	error = falloc(td, &nfp, &fd, 0);
-	if (error)
-		goto done;
-	td->td_retval[0] = fd;
-
-	CURVNET_SET(head->so_vnet);
-	so = sonewconn(head, SS_ISCONNECTED);
-	if (so == NULL) 
-		goto noconnection;
-	/*
-	 * Before changing the flags on the socket, we have to bump the
-	 * reference count.  Otherwise, if the protocol calls sofree(),
-	 * the socket will be released due to a zero refcount.
-	 */
-        SOCK_LOCK(so);
-        soref(so);                      /* file descriptor reference */
-        SOCK_UNLOCK(so);
-
-	ACCEPT_LOCK();
-
-	TAILQ_REMOVE(&head->so_comp, so, so_list);
-	head->so_qlen--;
-	so->so_state |= (head->so_state & SS_NBIO);
-	so->so_state &= ~SS_NOFDREF;
-	so->so_qstate &= ~SQ_COMP;
-	so->so_head = NULL;
-	ACCEPT_UNLOCK();
-	finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);
-	error = sctp_do_peeloff(head, so, (sctp_assoc_t)uap->name);
-	if (error)
-		goto noconnection;
-	if (head->so_sigio != NULL)
-		fsetown(fgetown(&head->so_sigio), &so->so_sigio);
-
-noconnection:
-	/*
-	 * close the new descriptor, assuming someone hasn't ripped it
-	 * out from under us.
-	 */
-	if (error)
-		fdclose(fdp, nfp, fd, td);
-
-	/*
-	 * Release explicitly held references before returning.
-	 */
-	CURVNET_RESTORE();
-done:
-	if (nfp != NULL)
-		fdrop(nfp, td);
-	fputsock(head);
-done2:
-	return (error);
-#else  /* SCTP */
-	return (EOPNOTSUPP);
-#endif /* SCTP */
-}
-
-int
-sys_sctp_generic_sendmsg (td, uap)
-	struct thread *td;
-	struct sctp_generic_sendmsg_args /* {
-		int sd, 
-		caddr_t msg, 
-		int mlen, 
-		caddr_t to, 
-		__socklen_t tolen, 
-		struct sctp_sndrcvinfo *sinfo, 
-		int flags
-	} */ *uap;
-{
-#if (defined(INET) || defined(INET6)) && defined(SCTP)
-	struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL;
-	struct socket *so;
-	struct file *fp = NULL;
-	int error = 0, len;
-	struct sockaddr *to = NULL;
-#ifdef KTRACE
-	struct uio *ktruio = NULL;
-#endif
-	struct uio auio;
-	struct iovec iov[1];
-	cap_rights_t rights;
-
-	if (uap->sinfo) {
-		error = copyin(uap->sinfo, &sinfo, sizeof (sinfo));
-		if (error)
-			return (error);
-		u_sinfo = &sinfo;
-	}
-
-	rights = CAP_WRITE;
-	if (uap->tolen) {
-		error = getsockaddr(&to, uap->to, uap->tolen);
-		if (error) {
-			to = NULL;
-			goto sctp_bad2;
-		}
-		rights |= CAP_CONNECT;
-	}
-
-	AUDIT_ARG_FD(uap->sd);
-	error = getsock_cap(td->td_proc->p_fd, uap->sd, rights, &fp, NULL);
-	if (error)
-		goto sctp_bad;
-#ifdef KTRACE
-	if (to && (KTRPOINT(td, KTR_STRUCT)))
-		ktrsockaddr(to);
-#endif
-
-	iov[0].iov_base = uap->msg;
-	iov[0].iov_len = uap->mlen;
-
-	so = (struct socket *)fp->f_data;
-	if (so->so_proto->pr_protocol != IPPROTO_SCTP) {
-		error = EOPNOTSUPP;
-		goto sctp_bad;
-	}
-#ifdef MAC
-	error = mac_socket_check_send(td->td_ucred, so);
-	if (error)
-		goto sctp_bad;
-#endif /* MAC */
-
-	auio.uio_iov =  iov;
-	auio.uio_iovcnt = 1;
-	auio.uio_segflg = UIO_USERSPACE;
-	auio.uio_rw = UIO_WRITE;
-	auio.uio_td = td;
-	auio.uio_offset = 0;			/* XXX */
-	auio.uio_resid = 0;
-	len = auio.uio_resid = uap->mlen;
-	CURVNET_SET(so->so_vnet);
-	error = sctp_lower_sosend(so, to, &auio,
-		    (struct mbuf *)NULL, (struct mbuf *)NULL,
-		    uap->flags, u_sinfo, td);
-	CURVNET_RESTORE();
-	if (error) {
-		if (auio.uio_resid != len && (error == ERESTART ||
-		    error == EINTR || error == EWOULDBLOCK))
-			error = 0;
-		/* Generation of SIGPIPE can be controlled per socket. */
-		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
-		    !(uap->flags & MSG_NOSIGNAL)) {
-			PROC_LOCK(td->td_proc);
-			tdsignal(td, SIGPIPE);
-			PROC_UNLOCK(td->td_proc);
-		}
-	}
-	if (error == 0)
-		td->td_retval[0] = len - auio.uio_resid;
-#ifdef KTRACE
-	if (ktruio != NULL) {
-		ktruio->uio_resid = td->td_retval[0];
-		ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
-	}
-#endif /* KTRACE */
-sctp_bad:
-	if (fp)
-		fdrop(fp, td);
-sctp_bad2:
-	if (to)
-		free(to, M_SONAME);
-	return (error);
-#else  /* SCTP */
-	return (EOPNOTSUPP);
-#endif /* SCTP */
-}
-
-int
-sys_sctp_generic_sendmsg_iov(td, uap)
-	struct thread *td;
-	struct sctp_generic_sendmsg_iov_args /* {
-		int sd, 
-		struct iovec *iov, 
-		int iovlen, 
-		caddr_t to, 
-		__socklen_t tolen, 
-		struct sctp_sndrcvinfo *sinfo, 
-		int flags
-	} */ *uap;
-{
-#if (defined(INET) || defined(INET6)) && defined(SCTP)
-	struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL;
-	struct socket *so;
-	struct file *fp = NULL;
-	int error=0, i;
-	ssize_t len;
-	struct sockaddr *to = NULL;
-#ifdef KTRACE
-	struct uio *ktruio = NULL;
-#endif
-	struct uio auio;
-	struct iovec *iov, *tiov;
-	cap_rights_t rights;
-
-	if (uap->sinfo) {
-		error = copyin(uap->sinfo, &sinfo, sizeof (sinfo));
-		if (error)
-			return (error);
-		u_sinfo = &sinfo;
-	}
-	rights = CAP_WRITE;
-	if (uap->tolen) {
-		error = getsockaddr(&to, uap->to, uap->tolen);
-		if (error) {
-			to = NULL;
-			goto sctp_bad2;
-		}
-		rights |= CAP_CONNECT;
-	}
-
-	AUDIT_ARG_FD(uap->sd);
-	error = getsock_cap(td->td_proc->p_fd, uap->sd, rights, &fp, NULL);
-	if (error)
-		goto sctp_bad1;
-
-#ifdef COMPAT_FREEBSD32
-	if (SV_CURPROC_FLAG(SV_ILP32))
-		error = freebsd32_copyiniov((struct iovec32 *)uap->iov,
-		    uap->iovlen, &iov, EMSGSIZE);
-	else
-#endif
-		error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
-	if (error)
-		goto sctp_bad1;
-#ifdef KTRACE
-	if (to && (KTRPOINT(td, KTR_STRUCT)))
-		ktrsockaddr(to);
-#endif
-
-	so = (struct socket *)fp->f_data;
-	if (so->so_proto->pr_protocol != IPPROTO_SCTP) {
-		error = EOPNOTSUPP;
-		goto sctp_bad;
-	}
-#ifdef MAC
-	error = mac_socket_check_send(td->td_ucred, so);
-	if (error)
-		goto sctp_bad;
-#endif /* MAC */
-
-	auio.uio_iov = iov;
-	auio.uio_iovcnt = uap->iovlen;
-	auio.uio_segflg = UIO_USERSPACE;
-	auio.uio_rw = UIO_WRITE;
-	auio.uio_td = td;
-	auio.uio_offset = 0;			/* XXX */
-	auio.uio_resid = 0;
-	tiov = iov;
-	for (i = 0; i <uap->iovlen; i++, tiov++) {
-		if ((auio.uio_resid += tiov->iov_len) < 0) {
-			error = EINVAL;
-			goto sctp_bad;
-		}
-	}
-	len = auio.uio_resid;
-	CURVNET_SET(so->so_vnet);
-	error = sctp_lower_sosend(so, to, &auio,
-		    (struct mbuf *)NULL, (struct mbuf *)NULL,
-		    uap->flags, u_sinfo, td);
-	CURVNET_RESTORE();
-	if (error) {
-		if (auio.uio_resid != len && (error == ERESTART ||
-		    error == EINTR || error == EWOULDBLOCK))
-			error = 0;
-		/* Generation of SIGPIPE can be controlled per socket */
-		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
-		    !(uap->flags & MSG_NOSIGNAL)) {
-			PROC_LOCK(td->td_proc);
-			tdsignal(td, SIGPIPE);
-			PROC_UNLOCK(td->td_proc);
-		}
-	}
-	if (error == 0)
-		td->td_retval[0] = len - auio.uio_resid;
-#ifdef KTRACE
-	if (ktruio != NULL) {
-		ktruio->uio_resid = td->td_retval[0];
-		ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
-	}
-#endif /* KTRACE */
-sctp_bad:
-	free(iov, M_IOV);
-sctp_bad1:
-	if (fp)
-		fdrop(fp, td);
-sctp_bad2:
-	if (to)
-		free(to, M_SONAME);
-	return (error);
-#else  /* SCTP */
-	return (EOPNOTSUPP);
-#endif /* SCTP */
-}
-
-int
-sys_sctp_generic_recvmsg(td, uap)
-	struct thread *td;
-	struct sctp_generic_recvmsg_args /* {
-		int sd, 
-		struct iovec *iov, 
-		int iovlen,
-		struct sockaddr *from, 
-		__socklen_t *fromlenaddr,
-		struct sctp_sndrcvinfo *sinfo, 
-		int *msg_flags
-	} */ *uap;
-{
-#if (defined(INET) || defined(INET6)) && defined(SCTP)
-	uint8_t sockbufstore[256];
-	struct uio auio;
-	struct iovec *iov, *tiov;
-	struct sctp_sndrcvinfo sinfo;
-	struct socket *so;
-	struct file *fp = NULL;
-	struct sockaddr *fromsa;
-	int fromlen;
-	ssize_t len;
-	int i, msg_flags;
-	int error = 0;
-#ifdef KTRACE
-	struct uio *ktruio = NULL;
-#endif
-
-	AUDIT_ARG_FD(uap->sd);
-	error = getsock_cap(td->td_proc->p_fd, uap->sd, CAP_READ, &fp, NULL);
-	if (error) {
-		return (error);
-	}
-#ifdef COMPAT_FREEBSD32
-	if (SV_CURPROC_FLAG(SV_ILP32))
-		error = freebsd32_copyiniov((struct iovec32 *)uap->iov,
-		    uap->iovlen, &iov, EMSGSIZE);
-	else
-#endif
-		error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
-	if (error)
-		goto out1;
-
-	so = fp->f_data;
-	if (so->so_proto->pr_protocol != IPPROTO_SCTP) {
-		error = EOPNOTSUPP;
-		goto out;
-	}
-#ifdef MAC
-	error = mac_socket_check_receive(td->td_ucred, so);
-	if (error) {
-		goto out;
-	}
-#endif /* MAC */
-
-	if (uap->fromlenaddr) {
-		error = copyin(uap->fromlenaddr,
-		    &fromlen, sizeof (fromlen));
-		if (error) {
-			goto out;
-		}
-	} else {
-		fromlen = 0;
-	}
-	if (uap->msg_flags) {
-		error = copyin(uap->msg_flags, &msg_flags, sizeof (int));
-		if (error) {
-			goto out;
-		}
-	} else {
-		msg_flags = 0;
-	}
-	auio.uio_iov = iov;
-	auio.uio_iovcnt = uap->iovlen;
-  	auio.uio_segflg = UIO_USERSPACE;
-	auio.uio_rw = UIO_READ;
-	auio.uio_td = td;
-	auio.uio_offset = 0;			/* XXX */
-	auio.uio_resid = 0;
-	tiov = iov;
-	for (i = 0; i <uap->iovlen; i++, tiov++) {
-		if ((auio.uio_resid += tiov->iov_len) < 0) {
-			error = EINVAL;
-			goto out;
-		}
-	}
-	len = auio.uio_resid;
-	fromsa = (struct sockaddr *)sockbufstore;
-
-#ifdef KTRACE
-	if (KTRPOINT(td, KTR_GENIO))
-		ktruio = cloneuio(&auio);
-#endif /* KTRACE */
-	memset(&sinfo, 0, sizeof(struct sctp_sndrcvinfo));
-	CURVNET_SET(so->so_vnet);
-	error = sctp_sorecvmsg(so, &auio, (struct mbuf **)NULL,
-		    fromsa, fromlen, &msg_flags,
-		    (struct sctp_sndrcvinfo *)&sinfo, 1);
-	CURVNET_RESTORE();
-	if (error) {
-		if (auio.uio_resid != len && (error == ERESTART ||
-		    error == EINTR || error == EWOULDBLOCK))
-			error = 0;
-	} else {
-		if (uap->sinfo)
-			error = copyout(&sinfo, uap->sinfo, sizeof (sinfo));
-	}
-#ifdef KTRACE
-	if (ktruio != NULL) {
-		ktruio->uio_resid = len - auio.uio_resid;
-		ktrgenio(uap->sd, UIO_READ, ktruio, error);
-	}
-#endif /* KTRACE */
-	if (error)
-		goto out;
-	td->td_retval[0] = len - auio.uio_resid;
-
-	if (fromlen && uap->from) {
-		len = fromlen;
-		if (len <= 0 || fromsa == 0)
-			len = 0;
-		else {
-			len = MIN(len, fromsa->sa_len);
-			error = copyout(fromsa, uap->from, (size_t)len);
-			if (error)
-				goto out;
-		}
-		error = copyout(&len, uap->fromlenaddr, sizeof (socklen_t));
-		if (error) {
-			goto out;
-		}
-	}
-#ifdef KTRACE
-	if (KTRPOINT(td, KTR_STRUCT))
-		ktrsockaddr(fromsa);
-#endif
-	if (uap->msg_flags) {
-		error = copyout(&msg_flags, uap->msg_flags, sizeof (int));
-		if (error) {
-			goto out;
-		}
-	}
-out:
-	free(iov, M_IOV);
-out1:
-	if (fp) 
-		fdrop(fp, td);
-
-	return (error);
-#else  /* SCTP */
-	return (EOPNOTSUPP);
-#endif /* SCTP */
-}

Modified: trunk/sys/kern/uipc_usrreq.c
===================================================================
--- trunk/sys/kern/uipc_usrreq.c	2018-05-26 14:25:17 UTC (rev 9953)
+++ trunk/sys/kern/uipc_usrreq.c	2018-05-26 14:25:55 UTC (rev 9954)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1982, 1986, 1989, 1991, 1993
  *	The Regents of the University of California.
@@ -51,17 +52,17 @@
  *
  * TODO:
  *	RDM
- *	distinguish datagram size limits from flow control limits in SEQPACKET
  *	rethink name space problems
  *	need a proper out-of-band
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/uipc_usrreq.c 305261 2016-09-02 00:14:28Z markj $");
 
 #include "opt_ddb.h"
 
 #include <sys/param.h>
+#include <sys/capsicum.h>
 #include <sys/domain.h>
 #include <sys/fcntl.h>
 #include <sys/malloc.h>		/* XXX must be before <sys/file.h> */
@@ -101,6 +102,8 @@
 
 #include <vm/uma.h>
 
+MALLOC_DECLARE(M_FILECAPS);
+
 /*
  * Locking key:
  * (l)	Locked using list lock
@@ -271,6 +274,8 @@
 static int	uipc_ctloutput(struct socket *, struct sockopt *);
 static int	unp_connect(struct socket *, struct sockaddr *,
 		    struct thread *);
+static int	unp_connectat(int, struct socket *, struct sockaddr *,
+		    struct thread *);
 static int	unp_connect2(struct socket *so, struct socket *so2, int);
 static void	unp_disconnect(struct unpcb *unp, struct unpcb *unp2);
 static void	unp_dispose(struct mbuf *);
@@ -277,13 +282,13 @@
 static void	unp_shutdown(struct unpcb *);
 static void	unp_drop(struct unpcb *, int);
 static void	unp_gc(__unused void *, int);
-static void	unp_scan(struct mbuf *, void (*)(struct file *));
+static void	unp_scan(struct mbuf *, void (*)(struct filedescent **, int));
 static void	unp_discard(struct file *);
-static void	unp_freerights(struct file **, int);
+static void	unp_freerights(struct filedescent **, int);
 static void	unp_init(void);
 static int	unp_internalize(struct mbuf **, struct thread *);
 static void	unp_internalize_fp(struct file *);
-static int	unp_externalize(struct mbuf *, struct mbuf **);
+static int	unp_externalize(struct mbuf *, struct mbuf **, int);
 static int	unp_externalize_fp(struct file *);
 static struct mbuf	*unp_addsockcred(struct thread *, struct mbuf *);
 static void	unp_process_defers(void * __unused, int);
@@ -320,6 +325,7 @@
 	 */
 	.pr_flags =		PR_ADDR|PR_ATOMIC|PR_CONNREQUIRED|PR_WANTRCVD|
 				    PR_RIGHTS,
+	.pr_ctloutput =		&uipc_ctloutput,
 	.pr_usrreqs =		&uipc_usrreqs_seqpacket,
 },
 };
@@ -424,6 +430,8 @@
 	unp->unp_socket = so;
 	so->so_pcb = unp;
 	unp->unp_refcount = 1;
+	if (so->so_head != NULL)
+		unp->unp_flags |= UNP_NASCENT;
 
 	UNP_LIST_LOCK();
 	unp->unp_gencnt = ++unp_gencnt;
@@ -450,17 +458,21 @@
 }
 
 static int
-uipc_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
+uipc_bindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	struct sockaddr_un *soun = (struct sockaddr_un *)nam;
 	struct vattr vattr;
-	int error, namelen, vfslocked;
+	int error, namelen;
 	struct nameidata nd;
 	struct unpcb *unp;
 	struct vnode *vp;
 	struct mount *mp;
+	cap_rights_t rights;
 	char *buf;
 
+	if (nam->sa_family != AF_UNIX)
+		return (EAFNOSUPPORT);
+
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_bind: unp == NULL"));
 
@@ -496,15 +508,13 @@
 	buf[namelen] = 0;
 
 restart:
-	vfslocked = 0;
-	NDINIT(&nd, CREATE, MPSAFE | NOFOLLOW | LOCKPARENT | SAVENAME,
-	    UIO_SYSSPACE, buf, td);
+	NDINIT_ATRIGHTS(&nd, CREATE, NOFOLLOW | LOCKPARENT | SAVENAME | NOCACHE,
+	    UIO_SYSSPACE, buf, fd, cap_rights_init(&rights, CAP_BINDAT), td);
 /* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
 	error = namei(&nd);
 	if (error)
 		goto error;
 	vp = nd.ni_vp;
-	vfslocked = NDHASGIANT(&nd);
 	if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		if (nd.ni_dvp == vp)
@@ -519,7 +529,6 @@
 		error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH);
 		if (error)
 			goto error;
-		VFS_UNLOCK_GIANT(vfslocked);
 		goto restart;
 	}
 	VATTR_NULL(&vattr);
@@ -551,12 +560,10 @@
 	UNP_LINK_WUNLOCK();
 	VOP_UNLOCK(vp, 0);
 	vn_finished_write(mp);
-	VFS_UNLOCK_GIANT(vfslocked);
 	free(buf, M_TEMP);
 	return (0);
 
 error:
-	VFS_UNLOCK_GIANT(vfslocked);
 	UNP_PCB_LOCK(unp);
 	unp->unp_flags &= ~UNP_BINDING;
 	UNP_PCB_UNLOCK(unp);
@@ -565,6 +572,13 @@
 }
 
 static int
+uipc_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+
+	return (uipc_bindat(AT_FDCWD, so, nam, td));
+}
+
+static int
 uipc_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	int error;
@@ -576,6 +590,19 @@
 	return (error);
 }
 
+static int
+uipc_connectat(int fd, struct socket *so, struct sockaddr *nam,
+    struct thread *td)
+{
+	int error;
+
+	KASSERT(td == curthread, ("uipc_connectat: td != curthread"));
+	UNP_LINK_WLOCK();
+	error = unp_connectat(fd, so, nam, td);
+	UNP_LINK_WUNLOCK();
+	return (error);
+}
+
 static void
 uipc_close(struct socket *so)
 {
@@ -627,14 +654,22 @@
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_detach: unp == NULL"));
 
-	UNP_LINK_WLOCK();
+	vp = NULL;
+	local_unp_rights = 0;
+
 	UNP_LIST_LOCK();
-	UNP_PCB_LOCK(unp);
 	LIST_REMOVE(unp, unp_link);
 	unp->unp_gencnt = ++unp_gencnt;
 	--unp_count;
 	UNP_LIST_UNLOCK();
 
+	if ((unp->unp_flags & UNP_NASCENT) != 0) {
+		UNP_PCB_LOCK(unp);
+		goto teardown;
+	}
+	UNP_LINK_WLOCK();
+	UNP_PCB_LOCK(unp);
+
 	/*
 	 * XXXRW: Should assert vp->v_socket == so.
 	 */
@@ -662,6 +697,7 @@
 	}
 	local_unp_rights = unp_rights;
 	UNP_LINK_WUNLOCK();
+teardown:
 	unp->unp_socket->so_pcb = NULL;
 	saved_unp_addr = unp->unp_addr;
 	unp->unp_addr = NULL;
@@ -674,13 +710,8 @@
 		uma_zfree(unp_zone, unp);
 	} else
 		UNP_PCB_UNLOCK(unp);
-	if (vp) {
-		int vfslocked;
-
-		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+	if (vp)
 		vrele(vp);
-		VFS_UNLOCK_GIANT(vfslocked);
-	}
 	if (local_unp_rights)
 		taskqueue_enqueue_timeout(taskqueue_thread, &unp_gc_task, -1);
 }
@@ -772,7 +803,6 @@
 	struct unpcb *unp, *unp2;
 	struct socket *so2;
 	u_int mbcnt, sbcc;
-	u_long newhiwat;
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_rcvd: unp == NULL"));
@@ -794,6 +824,15 @@
 	mbcnt = so->so_rcv.sb_mbcnt;
 	sbcc = so->so_rcv.sb_cc;
 	SOCKBUF_UNLOCK(&so->so_rcv);
+	/*
+	 * There is a benign race condition at this point.  If we're planning to
+	 * clear SB_STOP, but uipc_send is called on the connected socket at
+	 * this instant, it might add data to the sockbuf and set SB_STOP.  Then
+	 * we would erroneously clear SB_STOP below, even though the sockbuf is
+	 * full.  The race is benign because the only ill effect is to allow the
+	 * sockbuf to exceed its size limit, and the size limits are not
+	 * strictly guaranteed anyway.
+	 */
 	UNP_PCB_LOCK(unp);
 	unp2 = unp->unp_conn;
 	if (unp2 == NULL) {
@@ -802,13 +841,9 @@
 	}
 	so2 = unp2->unp_socket;
 	SOCKBUF_LOCK(&so2->so_snd);
-	so2->so_snd.sb_mbmax += unp->unp_mbcnt - mbcnt;
-	newhiwat = so2->so_snd.sb_hiwat + unp->unp_cc - sbcc;
-	(void)chgsbsize(so2->so_cred->cr_uidinfo, &so2->so_snd.sb_hiwat,
-	    newhiwat, RLIM_INFINITY);
+	if (sbcc < so2->so_snd.sb_hiwat && mbcnt < so2->so_snd.sb_mbmax)
+		so2->so_snd.sb_flags &= ~SB_STOP;
 	sowwakeup_locked(so2);
-	unp->unp_mbcnt = mbcnt;
-	unp->unp_cc = sbcc;
 	UNP_PCB_UNLOCK(unp);
 	return (0);
 }
@@ -819,8 +854,7 @@
 {
 	struct unpcb *unp, *unp2;
 	struct socket *so2;
-	u_int mbcnt_delta, sbcc;
-	u_int newhiwat;
+	u_int mbcnt, sbcc;
 	int error = 0;
 
 	unp = sotounpcb(so);
@@ -875,7 +909,8 @@
 			from = &sun_noname;
 		so2 = unp2->unp_socket;
 		SOCKBUF_LOCK(&so2->so_rcv);
-		if (sbappendaddr_locked(&so2->so_rcv, from, m, control)) {
+		if (sbappendaddr_locked(&so2->so_rcv, from, m,
+		    control)) {
 			sorwakeup_locked(so2);
 			m = NULL;
 			control = NULL;
@@ -936,7 +971,8 @@
 		SOCKBUF_LOCK(&so2->so_rcv);
 		if (unp2->unp_flags & UNP_WANTCRED) {
 			/*
-			 * Credentials are passed only once on SOCK_STREAM.
+			 * Credentials are passed only once on SOCK_STREAM
+			 * and SOCK_SEQPACKET.
 			 */
 			unp2->unp_flags &= ~UNP_WANTCRED;
 			control = unp_addsockcred(td, control);
@@ -959,34 +995,34 @@
 			const struct sockaddr *from;
 
 			from = &sun_noname;
-			if (sbappendaddr_locked(&so2->so_rcv, from, m,
-			    control))
+			/*
+			 * Don't check for space available in so2->so_rcv.
+			 * Unix domain sockets only check for space in the
+			 * sending sockbuf, and that check is performed one
+			 * level up the stack.
+			 */
+			if (sbappendaddr_nospacecheck_locked(&so2->so_rcv,
+				from, m, control))
 				control = NULL;
 			break;
 			}
 		}
 
-		/*
-		 * XXXRW: While fine for SOCK_STREAM, this conflates maximum
-		 * datagram size and back-pressure for SOCK_SEQPACKET, which
-		 * can lead to undesired return of EMSGSIZE on send instead
-		 * of more desirable blocking.
-		 */
-		mbcnt_delta = so2->so_rcv.sb_mbcnt - unp2->unp_mbcnt;
-		unp2->unp_mbcnt = so2->so_rcv.sb_mbcnt;
+		mbcnt = so2->so_rcv.sb_mbcnt;
 		sbcc = so2->so_rcv.sb_cc;
 		sorwakeup_locked(so2);
 
+		/*
+		 * The PCB lock on unp2 protects the SB_STOP flag.  Without it,
+		 * it would be possible for uipc_rcvd to be called at this
+		 * point, drain the receiving sockbuf, clear SB_STOP, and then
+		 * we would set SB_STOP below.  That could lead to an empty
+		 * sockbuf having SB_STOP set
+		 */
 		SOCKBUF_LOCK(&so->so_snd);
-		if ((int)so->so_snd.sb_hiwat >= (int)(sbcc - unp2->unp_cc))
-			newhiwat = so->so_snd.sb_hiwat - (sbcc - unp2->unp_cc);
-		else
-			newhiwat = 0;
-		(void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat,
-		    newhiwat, RLIM_INFINITY);
-		so->so_snd.sb_mbmax -= mbcnt_delta;
+		if (sbcc >= so->so_snd.sb_hiwat || mbcnt >= so->so_snd.sb_mbmax)
+			so->so_snd.sb_flags |= SB_STOP;
 		SOCKBUF_UNLOCK(&so->so_snd);
-		unp2->unp_cc = sbcc;
 		UNP_PCB_UNLOCK(unp2);
 		m = NULL;
 		break;
@@ -1024,27 +1060,18 @@
 static int
 uipc_sense(struct socket *so, struct stat *sb)
 {
-	struct unpcb *unp, *unp2;
-	struct socket *so2;
+	struct unpcb *unp;
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_sense: unp == NULL"));
 
 	sb->st_blksize = so->so_snd.sb_hiwat;
-	UNP_LINK_RLOCK();
 	UNP_PCB_LOCK(unp);
-	unp2 = unp->unp_conn;
-	if ((so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET) &&
-	    unp2 != NULL) {
-		so2 = unp2->unp_socket;
-		sb->st_blksize += so2->so_rcv.sb_cc;
-	}
 	sb->st_dev = NODEV;
 	if (unp->unp_ino == 0)
 		unp->unp_ino = (++unp_ino == 0) ? ++unp_ino : unp_ino;
 	sb->st_ino = unp->unp_ino;
 	UNP_PCB_UNLOCK(unp);
-	UNP_LINK_RUNLOCK();
 	return (0);
 }
 
@@ -1090,7 +1117,9 @@
 	.pru_accept =		uipc_accept,
 	.pru_attach =		uipc_attach,
 	.pru_bind =		uipc_bind,
+	.pru_bindat =		uipc_bindat,
 	.pru_connect =		uipc_connect,
+	.pru_connectat =	uipc_connectat,
 	.pru_connect2 =		uipc_connect2,
 	.pru_detach =		uipc_detach,
 	.pru_disconnect =	uipc_disconnect,
@@ -1110,7 +1139,9 @@
 	.pru_accept =		uipc_accept,
 	.pru_attach =		uipc_attach,
 	.pru_bind =		uipc_bind,
+	.pru_bindat =		uipc_bindat,
 	.pru_connect =		uipc_connect,
+	.pru_connectat =	uipc_connectat,
 	.pru_connect2 =		uipc_connect2,
 	.pru_detach =		uipc_detach,
 	.pru_disconnect =	uipc_disconnect,
@@ -1130,7 +1161,9 @@
 	.pru_accept =		uipc_accept,
 	.pru_attach =		uipc_attach,
 	.pru_bind =		uipc_bind,
+	.pru_bindat =		uipc_bindat,
 	.pru_connect =		uipc_connect,
+	.pru_connectat =	uipc_connectat,
 	.pru_connect2 =		uipc_connect2,
 	.pru_detach =		uipc_detach,
 	.pru_disconnect =	uipc_disconnect,
@@ -1242,15 +1275,27 @@
 static int
 unp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
+
+	return (unp_connectat(AT_FDCWD, so, nam, td));
+}
+
+static int
+unp_connectat(int fd, struct socket *so, struct sockaddr *nam,
+    struct thread *td)
+{
 	struct sockaddr_un *soun = (struct sockaddr_un *)nam;
 	struct vnode *vp;
 	struct socket *so2, *so3;
 	struct unpcb *unp, *unp2, *unp3;
-	int error, len, vfslocked;
 	struct nameidata nd;
 	char buf[SOCK_MAXADDRLEN];
 	struct sockaddr *sa;
+	cap_rights_t rights;
+	int error, len;
 
+	if (nam->sa_family != AF_UNIX)
+		return (EAFNOSUPPORT);
+
 	UNP_LINK_WLOCK_ASSERT();
 
 	unp = sotounpcb(so);
@@ -1274,8 +1319,8 @@
 	UNP_PCB_UNLOCK(unp);
 
 	sa = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
-	NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | LOCKSHARED | LOCKLEAF,
-	    UIO_SYSSPACE, buf, td);
+	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF,
+	    UIO_SYSSPACE, buf, fd, cap_rights_init(&rights, CAP_CONNECTAT), td);
 	error = namei(&nd);
 	if (error)
 		vp = NULL;
@@ -1282,7 +1327,6 @@
 	else
 		vp = nd.ni_vp;
 	ASSERT_VOP_LOCKED(vp, "unp_connect");
-	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	if (error)
 		goto bad;
@@ -1299,7 +1343,6 @@
 	error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td);
 	if (error)
 		goto bad;
-	VFS_UNLOCK_GIANT(vfslocked);
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("unp_connect: unp == NULL"));
@@ -1342,7 +1385,7 @@
 		}
 
 		/*
-		 * The connecter's (client's) credentials are copied from its
+		 * The connector's (client's) credentials are copied from its
 		 * process structure at the time of connect() (which is now).
 		 */
 		cru2x(td->td_ucred, &unp3->unp_peercred);
@@ -1382,16 +1425,9 @@
 	UNP_PCB_UNLOCK(unp);
 bad2:
 	UNP_LINK_WUNLOCK();
-	if (vfslocked)
-		/* 
-		 * Giant has been previously acquired. This means filesystem
-		 * isn't MPSAFE.  Do it once again.
-		 */
-		mtx_lock(&Giant);
 bad:
 	if (vp != NULL)
 		vput(vp);
-	VFS_UNLOCK_GIANT(vfslocked);
 	free(sa, M_SONAME);
 	UNP_LINK_WLOCK();
 	UNP_PCB_LOCK(unp);
@@ -1417,6 +1453,7 @@
 
 	if (so2->so_type != so->so_type)
 		return (EPROTOTYPE);
+	unp2->unp_flags &= ~UNP_NASCENT;
 	unp->unp_conn = unp2;
 
 	switch (so->so_type) {
@@ -1660,31 +1697,33 @@
 }
 
 static void
-unp_freerights(struct file **rp, int fdcount)
+unp_freerights(struct filedescent **fdep, int fdcount)
 {
+	struct file *fp;
 	int i;
-	struct file *fp;
 
+	KASSERT(fdcount > 0, ("%s: fdcount %d", __func__, fdcount));
+
 	for (i = 0; i < fdcount; i++) {
-		fp = *rp;
-		*rp++ = NULL;
+		fp = fdep[i]->fde_file;
+		filecaps_free(&fdep[i]->fde_caps);
 		unp_discard(fp);
 	}
+	free(fdep[0], M_FILECAPS);
 }
 
 static int
-unp_externalize(struct mbuf *control, struct mbuf **controlp)
+unp_externalize(struct mbuf *control, struct mbuf **controlp, int flags)
 {
 	struct thread *td = curthread;		/* XXX */
 	struct cmsghdr *cm = mtod(control, struct cmsghdr *);
 	int i;
 	int *fdp;
-	struct file **rp;
-	struct file *fp;
+	struct filedesc *fdesc = td->td_proc->p_fd;
+	struct filedescent *fde, **fdep;
 	void *data;
 	socklen_t clen = control->m_len, datalen;
 	int error, newfds;
-	int f;
 	u_int newlen;
 
 	UNP_LINK_UNLOCK_ASSERT();
@@ -1701,22 +1740,17 @@
 		datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data;
 		if (cm->cmsg_level == SOL_SOCKET
 		    && cm->cmsg_type == SCM_RIGHTS) {
-			newfds = datalen / sizeof(struct file *);
-			rp = data;
+			newfds = datalen / sizeof(*fdep);
+			if (newfds == 0)
+				goto next;
+			fdep = data;
 
 			/* If we're not outputting the descriptors free them. */
 			if (error || controlp == NULL) {
-				unp_freerights(rp, newfds);
+				unp_freerights(fdep, newfds);
 				goto next;
 			}
-			FILEDESC_XLOCK(td->td_proc->p_fd);
-			/* if the new FD's will not fit free them.  */
-			if (!fdavail(td, newfds)) {
-				FILEDESC_XUNLOCK(td->td_proc->p_fd);
-				error = EMSGSIZE;
-				unp_freerights(rp, newfds);
-				goto next;
-			}
+			FILEDESC_XLOCK(fdesc);
 
 			/*
 			 * Now change each pointer to an fd in the global
@@ -1728,23 +1762,33 @@
 			*controlp = sbcreatecontrol(NULL, newlen,
 			    SCM_RIGHTS, SOL_SOCKET);
 			if (*controlp == NULL) {
-				FILEDESC_XUNLOCK(td->td_proc->p_fd);
+				FILEDESC_XUNLOCK(fdesc);
 				error = E2BIG;
-				unp_freerights(rp, newfds);
+				unp_freerights(fdep, newfds);
 				goto next;
 			}
 
 			fdp = (int *)
 			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
-			for (i = 0; i < newfds; i++) {
-				if (fdalloc(td, 0, &f))
-					panic("unp_externalize fdalloc failed");
-				fp = *rp++;
-				td->td_proc->p_fd->fd_ofiles[f] = fp;
-				unp_externalize_fp(fp);
-				*fdp++ = f;
+			if (fdallocn(td, 0, fdp, newfds) != 0) {
+				FILEDESC_XUNLOCK(td->td_proc->p_fd);
+				error = EMSGSIZE;
+				unp_freerights(fdep, newfds);
+				m_freem(*controlp);
+				*controlp = NULL;
+				goto next;
 			}
-			FILEDESC_XUNLOCK(td->td_proc->p_fd);
+			for (i = 0; i < newfds; i++, fdp++) {
+				fde = &fdesc->fd_ofiles[*fdp];
+				fde->fde_file = fdep[i]->fde_file;
+				filecaps_move(&fdep[i]->fde_caps,
+				    &fde->fde_caps);
+				if ((flags & MSG_CMSG_CLOEXEC) != 0)
+					fde->fde_flags |= UF_EXCLOSE;
+				unp_externalize_fp(fde->fde_file);
+			}
+			FILEDESC_XUNLOCK(fdesc);
+			free(fdep[0], M_FILECAPS);
 		} else {
 			/* We can just copy anything else across. */
 			if (error || controlp == NULL)
@@ -1796,6 +1840,7 @@
 	if (unp_zone == NULL)
 		panic("unp_init");
 	uma_zone_set_max(unp_zone, maxsockets);
+	uma_zone_set_warning(unp_zone, "kern.ipc.maxsockets limit reached");
 	EVENTHANDLER_REGISTER(maxsockets_change, unp_zone_change,
 	    NULL, EVENTHANDLER_PRI_ANY);
 	LIST_INIT(&unp_dhead);
@@ -1814,14 +1859,14 @@
 {
 	struct mbuf *control = *controlp;
 	struct proc *p = td->td_proc;
-	struct filedesc *fdescp = p->p_fd;
+	struct filedesc *fdesc = p->p_fd;
 	struct bintime *bt;
 	struct cmsghdr *cm = mtod(control, struct cmsghdr *);
 	struct cmsgcred *cmcred;
-	struct file **rp;
+	struct filedescent *fde, **fdep, *fdev;
 	struct file *fp;
 	struct timeval *tv;
-	int i, fd, *fdp;
+	int i, *fdp;
 	void *data;
 	socklen_t clen = control->m_len, datalen;
 	int error, oldfds;
@@ -1833,7 +1878,7 @@
 	*controlp = NULL;
 	while (cm != NULL) {
 		if (sizeof(*cm) > clen || cm->cmsg_level != SOL_SOCKET
-		    || cm->cmsg_len > clen) {
+		    || cm->cmsg_len > clen || cm->cmsg_len < sizeof(*cm)) {
 			error = EINVAL;
 			goto out;
 		}
@@ -1866,23 +1911,23 @@
 
 		case SCM_RIGHTS:
 			oldfds = datalen / sizeof (int);
+			if (oldfds == 0)
+				break;
 			/*
 			 * Check that all the FDs passed in refer to legal
 			 * files.  If not, reject the entire operation.
 			 */
 			fdp = data;
-			FILEDESC_SLOCK(fdescp);
-			for (i = 0; i < oldfds; i++) {
-				fd = *fdp++;
-				if ((unsigned)fd >= fdescp->fd_nfiles ||
-				    fdescp->fd_ofiles[fd] == NULL) {
-					FILEDESC_SUNLOCK(fdescp);
+			FILEDESC_SLOCK(fdesc);
+			for (i = 0; i < oldfds; i++, fdp++) {
+				fp = fget_locked(fdesc, *fdp);
+				if (fp == NULL) {
+					FILEDESC_SUNLOCK(fdesc);
 					error = EBADF;
 					goto out;
 				}
-				fp = fdescp->fd_ofiles[fd];
 				if (!(fp->f_ops->fo_flags & DFLAG_PASSABLE)) {
-					FILEDESC_SUNLOCK(fdescp);
+					FILEDESC_SUNLOCK(fdesc);
 					error = EOPNOTSUPP;
 					goto out;
 				}
@@ -1891,25 +1936,30 @@
 
 			/*
 			 * Now replace the integer FDs with pointers to the
-			 * associated global file table entry..
+			 * file structure and capability rights.
 			 */
-			newlen = oldfds * sizeof(struct file *);
+			newlen = oldfds * sizeof(fdep[0]);
 			*controlp = sbcreatecontrol(NULL, newlen,
 			    SCM_RIGHTS, SOL_SOCKET);
 			if (*controlp == NULL) {
-				FILEDESC_SUNLOCK(fdescp);
+				FILEDESC_SUNLOCK(fdesc);
 				error = E2BIG;
 				goto out;
 			}
 			fdp = data;
-			rp = (struct file **)
+			fdep = (struct filedescent **)
 			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
-			for (i = 0; i < oldfds; i++) {
-				fp = fdescp->fd_ofiles[*fdp++];
-				*rp++ = fp;
-				unp_internalize_fp(fp);
+			fdev = malloc(sizeof(*fdev) * oldfds, M_FILECAPS,
+			    M_WAITOK);
+			for (i = 0; i < oldfds; i++, fdev++, fdp++) {
+				fde = &fdesc->fd_ofiles[*fdp];
+				fdep[i] = fdev;
+				fdep[i]->fde_file = fde->fde_file;
+				filecaps_copy(&fde->fde_caps,
+				    &fdep[i]->fde_caps);
+				unp_internalize_fp(fdep[i]->fde_file);
 			}
-			FILEDESC_SUNLOCK(fdescp);
+			FILEDESC_SUNLOCK(fdesc);
 			break;
 
 		case SCM_TIMESTAMP:
@@ -2105,17 +2155,22 @@
 static int	unp_unreachable;
 
 static void
-unp_accessable(struct file *fp)
+unp_accessable(struct filedescent **fdep, int fdcount)
 {
 	struct unpcb *unp;
+	struct file *fp;
+	int i;
 
-	if ((unp = fptounp(fp)) == NULL)
-		return;
-	if (unp->unp_gcflag & UNPGC_REF)
-		return;
-	unp->unp_gcflag &= ~UNPGC_DEAD;
-	unp->unp_gcflag |= UNPGC_REF;
-	unp_marked++;
+	for (i = 0; i < fdcount; i++) {
+		fp = fdep[i]->fde_file;
+		if ((unp = fptounp(fp)) == NULL)
+			continue;
+		if (unp->unp_gcflag & UNPGC_REF)
+			continue;
+		unp->unp_gcflag &= ~UNPGC_DEAD;
+		unp->unp_gcflag |= UNPGC_REF;
+		unp_marked++;
+	}
 }
 
 static void
@@ -2146,9 +2201,11 @@
 	 * Mark all sockets we reference with RIGHTS.
 	 */
 	so = unp->unp_socket;
-	SOCKBUF_LOCK(&so->so_rcv);
-	unp_scan(so->so_rcv.sb_mb, unp_accessable);
-	SOCKBUF_UNLOCK(&so->so_rcv);
+	if ((unp->unp_gcflag & UNPGC_IGNORE_RIGHTS) == 0) {
+		SOCKBUF_LOCK(&so->so_rcv);
+		unp_scan(so->so_rcv.sb_mb, unp_accessable);
+		SOCKBUF_UNLOCK(&so->so_rcv);
+	}
 
 	/*
 	 * Mark all sockets in our accept queue.
@@ -2155,6 +2212,8 @@
 	 */
 	ACCEPT_LOCK();
 	TAILQ_FOREACH(soa, &so->so_comp, so_list) {
+		if ((sotounpcb(soa)->unp_gcflag & UNPGC_IGNORE_RIGHTS) != 0)
+			continue;
 		SOCKBUF_LOCK(&soa->so_rcv);
 		unp_scan(soa->so_rcv.sb_mb, unp_accessable);
 		SOCKBUF_UNLOCK(&soa->so_rcv);
@@ -2184,11 +2243,13 @@
 	unp_taskcount++;
 	UNP_LIST_LOCK();
 	/*
-	 * First clear all gc flags from previous runs.
+	 * First clear all gc flags from previous runs, apart from
+	 * UNPGC_IGNORE_RIGHTS.
 	 */
 	for (head = heads; *head != NULL; head++)
 		LIST_FOREACH(unp, *head, unp_link)
-			unp->unp_gcflag = 0;
+			unp->unp_gcflag =
+			    (unp->unp_gcflag & UNPGC_IGNORE_RIGHTS);
 
 	/*
 	 * Scan marking all reachable sockets with UNPGC_REF.  Once a socket
@@ -2262,19 +2323,31 @@
 {
 
 	if (m)
-		unp_scan(m, unp_discard);
+		unp_scan(m, unp_freerights);
 }
 
+/*
+ * Synchronize against unp_gc, which can trip over data as we are freeing it.
+ */
+void
+unp_dispose_so(struct socket *so)
+{
+	struct unpcb *unp;
+
+	unp = sotounpcb(so);
+	UNP_LIST_LOCK();
+	unp->unp_gcflag |= UNPGC_IGNORE_RIGHTS;
+	UNP_LIST_UNLOCK();
+	unp_dispose(so->so_rcv.sb_mb);
+}
+
 static void
-unp_scan(struct mbuf *m0, void (*op)(struct file *))
+unp_scan(struct mbuf *m0, void (*op)(struct filedescent **, int))
 {
 	struct mbuf *m;
-	struct file **rp;
 	struct cmsghdr *cm;
 	void *data;
-	int i;
 	socklen_t clen, datalen;
-	int qfds;
 
 	while (m0 != NULL) {
 		for (m = m0; m; m = m->m_next) {
@@ -2294,10 +2367,8 @@
 
 				if (cm->cmsg_level == SOL_SOCKET &&
 				    cm->cmsg_type == SCM_RIGHTS) {
-					qfds = datalen / sizeof (struct file *);
-					rp = data;
-					for (i = 0; i < qfds; i++)
-						(*op)(*rp++);
+					(*op)(data, datalen /
+					    sizeof(struct filedescent *));
 				}
 
 				if (CMSG_SPACE(datalen) < clen) {
@@ -2310,7 +2381,7 @@
 				}
 			}
 		}
-		m0 = m0->m_act;
+		m0 = m0->m_nextpkt;
 	}
 }
 
@@ -2443,7 +2514,7 @@
 	db_printf("unp_socket: %p   unp_vnode: %p\n", unp->unp_socket,
 	    unp->unp_vnode);
 
-	db_printf("unp_ino: %d   unp_conn: %p\n", unp->unp_ino,
+	db_printf("unp_ino: %ju   unp_conn: %p\n", (uintmax_t)unp->unp_ino,
 	    unp->unp_conn);
 
 	db_printf("unp_refs:\n");
@@ -2452,8 +2523,7 @@
 	/* XXXRW: Would be nice to print the full address, if any. */
 	db_printf("unp_addr: %p\n", unp->unp_addr);
 
-	db_printf("unp_cc: %d   unp_mbcnt: %d   unp_gencnt: %llu\n",
-	    unp->unp_cc, unp->unp_mbcnt,
+	db_printf("unp_gencnt: %llu\n",
 	    (unsigned long long)unp->unp_gencnt);
 
 	db_printf("unp_flags: %x (", unp->unp_flags);



More information about the Midnightbsd-cvs mailing list