[Midnightbsd-cvs] src [9957] trunk/sys/kern: sync with freebsd

Sat May 26 10:27:48 EDT 2018

Revision: 9957
          http://svnweb.midnightbsd.org/src/?rev=9957
Author:   laffer1
Date:     2018-05-26 10:27:48 -0400 (Sat, 26 May 2018)
Log Message:
-----------
 sync with freebsd

Modified Paths:
--------------
    trunk/sys/kern/sys_capability.c
    trunk/sys/kern/sys_generic.c
    trunk/sys/kern/sys_pipe.c
    trunk/sys/kern/sys_procdesc.c
    trunk/sys/kern/sys_process.c
    trunk/sys/kern/sys_socket.c

Modified: trunk/sys/kern/sys_capability.c
===================================================================

--- trunk/sys/kern/sys_capability.c	2018-05-26 14:27:13 UTC (rev 9956)
+++ trunk/sys/kern/sys_capability.c	2018-05-26 14:27:48 UTC (rev 9957)
@@ -1,11 +1,16 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 2008-2011 Robert N. M. Watson
  * Copyright (c) 2010-2011 Jonathan Anderson
+ * Copyright (c) 2012 FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed at the University of Cambridge Computer
  * Laboratory with support from a grant from Google, Inc.
  *
+ * Portions of this software were developed by Pawel Jakub Dawidek under
+ * sponsorship from the FreeBSD Foundation.
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
@@ -51,23 +56,28 @@
  * anonymous, rather than named, POSIX shared memory objects.
  */
 
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/10/sys/kern/sys_capability.c 302229 2016-06-27 21:25:01Z bdrewery $");
+
 #include "opt_capsicum.h"
+#include "opt_ktrace.h"
 
-#include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
-
 #include <sys/param.h>
-#include <sys/capability.h>
+#include <sys/capsicum.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/kernel.h>
+#include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
+#include <sys/syscallsubr.h>
 #include <sys/sysproto.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/ucred.h>
+#include <sys/uio.h>
+#include <sys/ktrace.h>
 
 #include <security/audit/audit.h>
 
@@ -96,7 +106,7 @@
 	oldcred = p->p_ucred;
 	crcopy(newcred, oldcred);
 	newcred->cr_flags |= CRED_FLAG_CAPMODE;
-	p->p_ucred = newcred;
+	proc_set_cred(p, newcred);
 	PROC_UNLOCK(p);
 	crfree(oldcred);
 	return (0);
@@ -110,7 +120,7 @@
 {
 	u_int i;
 
-	i = (IN_CAPABILITY_MODE(td)) ? 1 : 0;
+	i = IN_CAPABILITY_MODE(td) ? 1 : 0;
 	return (copyout(&i, uap->modep, sizeof(i)));
 }
 
@@ -136,85 +146,53 @@
 
 FEATURE(security_capabilities, "Capsicum Capabilities");
 
-/*
- * struct capability describes a capability, and is hung off of its struct
- * file f_data field.  cap_file and cap_rightss are static once hooked up, as
- * neither the object it references nor the rights it encapsulates are
- * permitted to change.
- */
-struct capability {
-	struct file	*cap_object;	/* Underlying object's file. */
-	struct file	*cap_file;	/* Back-pointer to cap's file. */
-	cap_rights_t	 cap_rights;	/* Mask of rights on object. */
-};
+MALLOC_DECLARE(M_FILECAPS);
 
+static inline int
+_cap_check(const cap_rights_t *havep, const cap_rights_t *needp,
+    enum ktr_cap_fail_type type)
+{
+	int i;
+
+	for (i = 0; i < nitems(havep->cr_rights); i++) {
+		if (!cap_rights_contains(havep, needp)) {
+#ifdef KTRACE
+			if (KTRPOINT(curthread, KTR_CAPFAIL))
+				ktrcapfail(type, needp, havep);
+#endif
+			return (ENOTCAPABLE);
+		}
+	}
+	return (0);
+}
+
 /*
- * Capabilities have a fileops vector, but in practice none should ever be
- * called except for fo_close, as the capability will normally not be
- * returned during a file descriptor lookup in the system call code.
+ * Test whether a capability grants the requested rights.
  */
-static fo_rdwr_t capability_read;
-static fo_rdwr_t capability_write;
-static fo_truncate_t capability_truncate;
-static fo_ioctl_t capability_ioctl;
-static fo_poll_t capability_poll;
-static fo_kqfilter_t capability_kqfilter;
-static fo_stat_t capability_stat;
-static fo_close_t capability_close;
-static fo_chmod_t capability_chmod;
-static fo_chown_t capability_chown;
-
-static struct fileops capability_ops = {
-	.fo_read = capability_read,
-	.fo_write = capability_write,
-	.fo_truncate = capability_truncate,
-	.fo_ioctl = capability_ioctl,
-	.fo_poll = capability_poll,
-	.fo_kqfilter = capability_kqfilter,
-	.fo_stat = capability_stat,
-	.fo_close = capability_close,
-	.fo_chmod = capability_chmod,
-	.fo_chown = capability_chown,
-	.fo_flags = DFLAG_PASSABLE,
-};
-
-static struct fileops capability_ops_unpassable = {
-	.fo_read = capability_read,
-	.fo_write = capability_write,
-	.fo_truncate = capability_truncate,
-	.fo_ioctl = capability_ioctl,
-	.fo_poll = capability_poll,
-	.fo_kqfilter = capability_kqfilter,
-	.fo_stat = capability_stat,
-	.fo_close = capability_close,
-	.fo_chmod = capability_chmod,
-	.fo_chown = capability_chown,
-	.fo_flags = 0,
-};
-
-static uma_zone_t capability_zone;
-
-static void
-capability_init(void *dummy __unused)
+int
+cap_check(const cap_rights_t *havep, const cap_rights_t *needp)
 {
 
-	capability_zone = uma_zcreate("capability", sizeof(struct capability),
-	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
-	if (capability_zone == NULL)
-		panic("capability_init: capability_zone not initialized");
+	return (_cap_check(havep, needp, CAPFAIL_NOTCAPABLE));
 }
-SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, capability_init, NULL);
 
 /*
- * Test whether a capability grants the requested rights.
+ * Convert capability rights into VM access flags.
  */
-static int
-cap_check(struct capability *c, cap_rights_t rights)
+u_char
+cap_rights_to_vmprot(cap_rights_t *havep)
 {
+	u_char maxprot;
 
-	if ((c->cap_rights | rights) != c->cap_rights)
-		return (ENOTCAPABLE);
-	return (0);
+	maxprot = VM_PROT_NONE;
+	if (cap_rights_is_set(havep, CAP_MMAP_R))
+		maxprot |= VM_PROT_READ;
+	if (cap_rights_is_set(havep, CAP_MMAP_W))
+		maxprot |= VM_PROT_WRITE;
+	if (cap_rights_is_set(havep, CAP_MMAP_X))
+		maxprot |= VM_PROT_EXECUTE;
+
+	return (maxprot);
 }
 
 /*
@@ -222,44 +200,83 @@
  * any other way, as we want to keep all capability permission evaluation in
  * this one file.
  */
-cap_rights_t
-cap_rights(struct file *fp_cap)
+
+cap_rights_t *
+cap_rights_fde(struct filedescent *fde)
 {
-	struct capability *c;
 
-	KASSERT(fp_cap->f_type == DTYPE_CAPABILITY,
-	    ("cap_rights: !capability"));
+	return (&fde->fde_rights);
+}
 
-	c = fp_cap->f_data;
-	return (c->cap_rights);
+cap_rights_t *
+cap_rights(struct filedesc *fdp, int fd)
+{
+
+	return (cap_rights_fde(&fdp->fd_ofiles[fd]));
 }
 
 /*
- * System call to create a new capability reference to either an existing
- * file object or an an existing capability.
+ * System call to limit rights of the given capability.
  */
 int
-sys_cap_new(struct thread *td, struct cap_new_args *uap)
+sys_cap_rights_limit(struct thread *td, struct cap_rights_limit_args *uap)
 {
-	int error, capfd;
-	int fd = uap->fd;
-	struct file *fp;
-	cap_rights_t rights = uap->rights;
+	struct filedesc *fdp;
+	cap_rights_t rights;
+	int error, fd, version;
 
+	cap_rights_init(&rights);
+
+	error = copyin(uap->rightsp, &rights, sizeof(rights.cr_rights[0]));
+	if (error != 0)
+		return (error);
+	version = CAPVER(&rights);
+	if (version != CAP_RIGHTS_VERSION_00)
+		return (EINVAL);
+
+	error = copyin(uap->rightsp, &rights,
+	    sizeof(rights.cr_rights[0]) * CAPARSIZE(&rights));
+	if (error != 0)
+		return (error);
+	/* Check for race. */
+	if (CAPVER(&rights) != version)
+		return (EINVAL);
+
+	if (!cap_rights_is_valid(&rights))
+		return (EINVAL);
+
+	if (version != CAP_RIGHTS_VERSION) {
+		rights.cr_rights[0] &= ~(0x3ULL << 62);
+		rights.cr_rights[0] |= ((uint64_t)CAP_RIGHTS_VERSION << 62);
+	}
+#ifdef KTRACE
+	if (KTRPOINT(td, KTR_STRUCT))
+		ktrcaprights(&rights);
+#endif
+
+	fd = uap->fd;
+
 	AUDIT_ARG_FD(fd);
-	AUDIT_ARG_RIGHTS(rights);
-	error = fget(td, fd, rights, &fp);
-	if (error)
-		return (error);
-	AUDIT_ARG_FILE(td->td_proc, fp);
-	error = kern_capwrap(td, fp, rights, &capfd);
-	/*
-	 * Release our reference to the file (kern_capwrap has held a reference
-	 * for the filedesc array).
-	 */
-	fdrop(fp, td);
-	if (error == 0)
-		td->td_retval[0] = capfd;
+	AUDIT_ARG_RIGHTS(&rights);
+
+	fdp = td->td_proc->p_fd;
+	FILEDESC_XLOCK(fdp);
+	if (fget_locked(fdp, fd) == NULL) {
+		FILEDESC_XUNLOCK(fdp);
+		return (EBADF);
+	}
+	error = _cap_check(cap_rights(fdp, fd), &rights, CAPFAIL_INCREASE);
+	if (error == 0) {
+		fdp->fd_ofiles[fd].fde_rights = rights;
+		if (!cap_rights_is_set(&rights, CAP_IOCTL)) {
+			free(fdp->fd_ofiles[fd].fde_ioctls, M_FILECAPS);
+			fdp->fd_ofiles[fd].fde_ioctls = NULL;
+			fdp->fd_ofiles[fd].fde_nioctls = 0;
+		}
+		if (!cap_rights_is_set(&rights, CAP_FCNTL))
+			fdp->fd_ofiles[fd].fde_fcntls = 0;
+	}
+	FILEDESC_XUNLOCK(fdp);
 	return (error);
 }
 
@@ -267,241 +284,297 @@
  * System call to query the rights mask associated with a capability.
  */
 int
-sys_cap_getrights(struct thread *td, struct cap_getrights_args *uap)
+sys___cap_rights_get(struct thread *td, struct __cap_rights_get_args *uap)
 {
-	struct capability *cp;
-	struct file *fp;
-	int error;
+	struct filedesc *fdp;
+	cap_rights_t rights;
+	int error, fd, i, n;
 
-	AUDIT_ARG_FD(uap->fd);
-	error = fgetcap(td, uap->fd, &fp);
-	if (error)
-		return (error);
-	cp = fp->f_data;
-	error = copyout(&cp->cap_rights, uap->rightsp, sizeof(*uap->rightsp));
-	fdrop(fp, td);
+	if (uap->version != CAP_RIGHTS_VERSION_00)
+		return (EINVAL);
+
+	fd = uap->fd;
+
+	AUDIT_ARG_FD(fd);
+
+	fdp = td->td_proc->p_fd;
+	FILEDESC_SLOCK(fdp);
+	if (fget_locked(fdp, fd) == NULL) {
+		FILEDESC_SUNLOCK(fdp);
+		return (EBADF);
+	}
+	rights = *cap_rights(fdp, fd);
+	FILEDESC_SUNLOCK(fdp);
+	n = uap->version + 2;
+	if (uap->version != CAPVER(&rights)) {
+		/*
+		 * For older versions we need to check if the descriptor
+		 * doesn't contain rights not understood by the caller.
+		 * If it does, we have to return an error.
+		 */
+		for (i = n; i < CAPARSIZE(&rights); i++) {
+			if ((rights.cr_rights[i] & ~(0x7FULL << 57)) != 0)
+				return (EINVAL);
+		}
+	}
+	error = copyout(&rights, uap->rightsp, sizeof(rights.cr_rights[0]) * n);
+#ifdef KTRACE
+	if (error == 0 && KTRPOINT(td, KTR_STRUCT))
+		ktrcaprights(&rights);
+#endif
 	return (error);
 }
 
 /*
- * Create a capability to wrap around an existing file.
+ * Test whether a capability grants the given ioctl command.
+ * If the descriptor doesn't have CAP_IOCTL, then its ioctls list is empty
+ * and ENOTCAPABLE will be returned.
  */
 int
-kern_capwrap(struct thread *td, struct file *fp, cap_rights_t rights,
-    int *capfdp)
+cap_ioctl_check(struct filedesc *fdp, int fd, u_long cmd)
 {
-	struct capability *cp, *cp_old;
-	struct file *fp_object, *fcapp;
-	int error;
+	u_long *cmds;
+	ssize_t ncmds;
+	long i;
 
-	if ((rights | CAP_MASK_VALID) != CAP_MASK_VALID)
-		return (EINVAL);
+	FILEDESC_LOCK_ASSERT(fdp);
+	KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
+	    ("%s: invalid fd=%d", __func__, fd));
 
-	/*
-	 * If a new capability is being derived from an existing capability,
-	 * then the new capability rights must be a subset of the existing
-	 * rights.
-	 */
-	if (fp->f_type == DTYPE_CAPABILITY) {
-		cp_old = fp->f_data;
-		if ((cp_old->cap_rights | rights) != cp_old->cap_rights)
-			return (ENOTCAPABLE);
+	ncmds = fdp->fd_ofiles[fd].fde_nioctls;
+	if (ncmds == -1)
+		return (0);
+
+	cmds = fdp->fd_ofiles[fd].fde_ioctls;
+	for (i = 0; i < ncmds; i++) {
+		if (cmds[i] == cmd)
+			return (0);
 	}
 
-	/*
-	 * Allocate a new file descriptor to hang the capability off of.
-	 */
-	error = falloc(td, &fcapp, capfdp, fp->f_flag);
-	if (error)
-		return (error);
-
-	/*
-	 * Rather than nesting capabilities, directly reference the object an
-	 * existing capability references.  There's nothing else interesting
-	 * to preserve for future use, as we've incorporated the previous
-	 * rights mask into the new one.  This prevents us from having to
-	 * deal with capability chains.
-	 */
-	if (fp->f_type == DTYPE_CAPABILITY)
-		fp_object = ((struct capability *)fp->f_data)->cap_object;
-	else
-		fp_object = fp;
-	fhold(fp_object);
-	cp = uma_zalloc(capability_zone, M_WAITOK | M_ZERO);
-	cp->cap_rights = rights;
-	cp->cap_object = fp_object;
-	cp->cap_file = fcapp;
-	if (fp->f_flag & DFLAG_PASSABLE)
-		finit(fcapp, fp->f_flag, DTYPE_CAPABILITY, cp,
-		    &capability_ops);
-	else
-		finit(fcapp, fp->f_flag, DTYPE_CAPABILITY, cp,
-		    &capability_ops_unpassable);
-
-	/*
-	 * Release our private reference (the proc filedesc still has one).
-	 */
-	fdrop(fcapp, td);
-	return (0);
+	return (ENOTCAPABLE);
 }
 
 /*
- * Given a file descriptor, test it against a capability rights mask and then
- * return the file descriptor on which to actually perform the requested
- * operation.  As long as the reference to fp_cap remains valid, the returned
- * pointer in *fp will remain valid, so no extra reference management is
- * required, and the caller should fdrop() fp_cap as normal when done with
- * both.
+ * Check if the current ioctls list can be replaced by the new one.
  */
-int
-cap_funwrap(struct file *fp_cap, cap_rights_t rights, struct file **fpp)
+static int
+cap_ioctl_limit_check(struct filedesc *fdp, int fd, const u_long *cmds,
+    size_t ncmds)
 {
-	struct capability *c;
-	int error;
+	u_long *ocmds;
+	ssize_t oncmds;
+	u_long i;
+	long j;
 
-	if (fp_cap->f_type != DTYPE_CAPABILITY) {
-		*fpp = fp_cap;
+	oncmds = fdp->fd_ofiles[fd].fde_nioctls;
+	if (oncmds == -1)
 		return (0);
+	if (oncmds < (ssize_t)ncmds)
+		return (ENOTCAPABLE);
+
+	ocmds = fdp->fd_ofiles[fd].fde_ioctls;
+	for (i = 0; i < ncmds; i++) {
+		for (j = 0; j < oncmds; j++) {
+			if (cmds[i] == ocmds[j])
+				break;
+		}
+		if (j == oncmds)
+			return (ENOTCAPABLE);
 	}
-	c = fp_cap->f_data;
-	error = cap_check(c, rights);
-	if (error)
-		return (error);
-	*fpp = c->cap_object;
+
 	return (0);
 }
 
-/*
- * Slightly different routine for memory mapping file descriptors: unwrap the
- * capability and check CAP_MMAP, but also return a bitmask representing the
- * maximum mapping rights the capability allows on the object.
- */
 int
-cap_funwrap_mmap(struct file *fp_cap, cap_rights_t rights, u_char *maxprotp,
-    struct file **fpp)
+kern_cap_ioctls_limit(struct thread *td, int fd, u_long *cmds, size_t ncmds)
 {
-	struct capability *c;
-	u_char maxprot;
+	struct filedesc *fdp;
+	u_long *ocmds;
 	int error;
 
-	if (fp_cap->f_type != DTYPE_CAPABILITY) {
-		*fpp = fp_cap;
-		*maxprotp = VM_PROT_ALL;
-		return (0);
+	AUDIT_ARG_FD(fd);
+
+	fdp = td->td_proc->p_fd;
+	FILEDESC_XLOCK(fdp);
+
+	if (fget_locked(fdp, fd) == NULL) {
+		error = EBADF;
+		goto out;
 	}
-	c = fp_cap->f_data;
-	error = cap_check(c, rights | CAP_MMAP);
-	if (error)
-		return (error);
-	*fpp = c->cap_object;
-	maxprot = 0;
-	if (c->cap_rights & CAP_READ)
-		maxprot |= VM_PROT_READ;
-	if (c->cap_rights & CAP_WRITE)
-		maxprot |= VM_PROT_WRITE;
-	if (c->cap_rights & CAP_MAPEXEC)
-		maxprot |= VM_PROT_EXECUTE;
-	*maxprotp = maxprot;
-	return (0);
-}
 
-/*
- * When a capability is closed, simply drop the reference on the underlying
- * object and free the capability.  fdrop() will handle the case where the
- * underlying object also needs to close, and the caller will have already
- * performed any object-specific lock or mqueue handling.
- */
-static int
-capability_close(struct file *fp, struct thread *td)
-{
-	struct capability *c;
-	struct file *fp_object;
+	error = cap_ioctl_limit_check(fdp, fd, cmds, ncmds);
+	if (error != 0)
+		goto out;
 
-	KASSERT(fp->f_type == DTYPE_CAPABILITY,
-	    ("capability_close: !capability"));
+	ocmds = fdp->fd_ofiles[fd].fde_ioctls;
+	fdp->fd_ofiles[fd].fde_ioctls = cmds;
+	fdp->fd_ofiles[fd].fde_nioctls = ncmds;
 
-	c = fp->f_data;
-	fp->f_ops = &badfileops;
-	fp->f_data = NULL;
-	fp_object = c->cap_object;
-	uma_zfree(capability_zone, c);
-	return (fdrop(fp_object, td));
+	cmds = ocmds;
+	error = 0;
+out:
+	FILEDESC_XUNLOCK(fdp);
+	free(cmds, M_FILECAPS);
+	return (error);
 }
 
-/*
- * In general, file descriptor operations should never make it to the
- * capability, only the underlying file descriptor operation vector, so panic
- * if we do turn up here.
- */
-static int
-capability_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
-    int flags, struct thread *td)
+int
+sys_cap_ioctls_limit(struct thread *td, struct cap_ioctls_limit_args *uap)
 {
+	u_long *cmds;
+	size_t ncmds;
+	int error;
 
-	panic("capability_read");
-}
+	ncmds = uap->ncmds;
 
-static int
-capability_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
-    int flags, struct thread *td)
-{
+	if (ncmds > 256)	/* XXX: Is 256 sane? */
+		return (EINVAL);
 
-	panic("capability_write");
+	if (ncmds == 0) {
+		cmds = NULL;
+	} else {
+		cmds = malloc(sizeof(cmds[0]) * ncmds, M_FILECAPS, M_WAITOK);
+		error = copyin(uap->cmds, cmds, sizeof(cmds[0]) * ncmds);
+		if (error != 0) {
+			free(cmds, M_FILECAPS);
+			return (error);
+		}
+	}
+
+	return (kern_cap_ioctls_limit(td, uap->fd, cmds, ncmds));
 }
 
-static int
-capability_truncate(struct file *fp, off_t length, struct ucred *active_cred,
-    struct thread *td)
+int
+sys_cap_ioctls_get(struct thread *td, struct cap_ioctls_get_args *uap)
 {
+	struct filedesc *fdp;
+	struct filedescent *fdep;
+	u_long *cmds;
+	size_t maxcmds;
+	int error, fd;
 
-	panic("capability_truncate");
-}
+	fd = uap->fd;
+	cmds = uap->cmds;
+	maxcmds = uap->maxcmds;
 
-static int
-capability_ioctl(struct file *fp, u_long com, void *data,
-    struct ucred *active_cred, struct thread *td)
-{
+	AUDIT_ARG_FD(fd);
 
-	panic("capability_ioctl");
-}
+	fdp = td->td_proc->p_fd;
+	FILEDESC_SLOCK(fdp);
 
-static int
-capability_poll(struct file *fp, int events, struct ucred *active_cred,
-    struct thread *td)
-{
+	if (fget_locked(fdp, fd) == NULL) {
+		error = EBADF;
+		goto out;
+	}
 
-	panic("capability_poll");
+	/*
+	 * If all ioctls are allowed (fde_nioctls == -1 && fde_ioctls == NULL)
+	 * the only sane thing we can do is to not populate the given array and
+	 * return CAP_IOCTLS_ALL.
+	 */
+
+	fdep = &fdp->fd_ofiles[fd];
+	if (cmds != NULL && fdep->fde_ioctls != NULL) {
+		error = copyout(fdep->fde_ioctls, cmds,
+		    sizeof(cmds[0]) * MIN(fdep->fde_nioctls, maxcmds));
+		if (error != 0)
+			goto out;
+	}
+	if (fdep->fde_nioctls == -1)
+		td->td_retval[0] = CAP_IOCTLS_ALL;
+	else
+		td->td_retval[0] = fdep->fde_nioctls;
+
+	error = 0;
+out:
+	FILEDESC_SUNLOCK(fdp);
+	return (error);
 }
 
-static int
-capability_kqfilter(struct file *fp, struct knote *kn)
+/*
+ * Test whether a capability grants the given fcntl command.
+ */
+int
+cap_fcntl_check_fde(struct filedescent *fde, int cmd)
 {
+	uint32_t fcntlcap;
 
-	panic("capability_kqfilter");
+	fcntlcap = (1 << cmd);
+	KASSERT((CAP_FCNTL_ALL & fcntlcap) != 0,
+	    ("Unsupported fcntl=%d.", cmd));
+
+	if ((fde->fde_fcntls & fcntlcap) != 0)
+		return (0);
+
+	return (ENOTCAPABLE);
 }
 
-static int
-capability_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
-    struct thread *td)
+int
+cap_fcntl_check(struct filedesc *fdp, int fd, int cmd)
 {
 
-	panic("capability_stat");
+	KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
+	    ("%s: invalid fd=%d", __func__, fd));
+
+	return (cap_fcntl_check_fde(&fdp->fd_ofiles[fd], cmd));
 }
 
 int
-capability_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
-    struct thread *td)
+sys_cap_fcntls_limit(struct thread *td, struct cap_fcntls_limit_args *uap)
 {
+	struct filedesc *fdp;
+	uint32_t fcntlrights;
+	int fd;
 
-	panic("capability_chmod");
+	fd = uap->fd;
+	fcntlrights = uap->fcntlrights;
+
+	AUDIT_ARG_FD(fd);
+	AUDIT_ARG_FCNTL_RIGHTS(fcntlrights);
+
+	if ((fcntlrights & ~CAP_FCNTL_ALL) != 0)
+		return (EINVAL);
+
+	fdp = td->td_proc->p_fd;
+	FILEDESC_XLOCK(fdp);
+
+	if (fget_locked(fdp, fd) == NULL) {
+		FILEDESC_XUNLOCK(fdp);
+		return (EBADF);
+	}
+
+	if ((fcntlrights & ~fdp->fd_ofiles[fd].fde_fcntls) != 0) {
+		FILEDESC_XUNLOCK(fdp);
+		return (ENOTCAPABLE);
+	}
+
+	fdp->fd_ofiles[fd].fde_fcntls = fcntlrights;
+	FILEDESC_XUNLOCK(fdp);
+
+	return (0);
 }
 
 int
-capability_chown(struct file *fp, uid_t uid, gid_t gid,
-    struct ucred *active_cred, struct thread *td)
+sys_cap_fcntls_get(struct thread *td, struct cap_fcntls_get_args *uap)
 {
+	struct filedesc *fdp;
+	uint32_t rights;
+	int fd;
 
-	panic("capability_chown");
+	fd = uap->fd;
+
+	AUDIT_ARG_FD(fd);
+
+	fdp = td->td_proc->p_fd;
+	FILEDESC_SLOCK(fdp);
+	if (fget_locked(fdp, fd) == NULL) {
+		FILEDESC_SUNLOCK(fdp);
+		return (EBADF);
+	}
+	rights = fdp->fd_ofiles[fd].fde_fcntls;
+	FILEDESC_SUNLOCK(fdp);
+
+	return (copyout(&rights, uap->fcntlrightsp, sizeof(rights)));
 }
 
 #else /* !CAPABILITIES */
@@ -510,8 +583,9 @@
  * Stub Capability functions for when options CAPABILITIES isn't compiled
  * into the kernel.
  */
+
 int
-sys_cap_new(struct thread *td, struct cap_new_args *uap)
+sys_cap_rights_limit(struct thread *td, struct cap_rights_limit_args *uap)
 {
 
 	return (ENOSYS);
@@ -518,7 +592,7 @@
 }
 
 int
-sys_cap_getrights(struct thread *td, struct cap_getrights_args *uap)
+sys___cap_rights_get(struct thread *td, struct __cap_rights_get_args *uap)
 {
 
 	return (ENOSYS);
@@ -525,27 +599,31 @@
 }
 
 int
-cap_funwrap(struct file *fp_cap, cap_rights_t rights, struct file **fpp)
+sys_cap_ioctls_limit(struct thread *td, struct cap_ioctls_limit_args *uap)
 {
 
-	KASSERT(fp_cap->f_type != DTYPE_CAPABILITY,
-	    ("cap_funwrap: saw capability"));
+	return (ENOSYS);
+}
 
-	*fpp = fp_cap;
-	return (0);
+int
+sys_cap_ioctls_get(struct thread *td, struct cap_ioctls_get_args *uap)
+{
+
+	return (ENOSYS);
 }
 
 int
-cap_funwrap_mmap(struct file *fp_cap, cap_rights_t rights, u_char *maxprotp,
-    struct file **fpp)
+sys_cap_fcntls_limit(struct thread *td, struct cap_fcntls_limit_args *uap)
 {
 
-	KASSERT(fp_cap->f_type != DTYPE_CAPABILITY,
-	    ("cap_funwrap_mmap: saw capability"));
+	return (ENOSYS);
+}
 
-	*fpp = fp_cap;
-	*maxprotp = VM_PROT_ALL;
-	return (0);
+int
+sys_cap_fcntls_get(struct thread *td, struct cap_fcntls_get_args *uap)
+{
+
+	return (ENOSYS);
 }
 
 #endif /* CAPABILITIES */

Modified: trunk/sys/kern/sys_generic.c
===================================================================
--- trunk/sys/kern/sys_generic.c	2018-05-26 14:27:13 UTC (rev 9956)
+++ trunk/sys/kern/sys_generic.c	2018-05-26 14:27:48 UTC (rev 9957)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
@@ -35,7 +36,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/sys_generic.c 315481 2017-03-18 12:39:24Z mmokhi $");
 
 #include "opt_capsicum.h"
 #include "opt_compat.h"
@@ -44,11 +45,12 @@
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/sysproto.h>
-#include <sys/capability.h>
+#include <sys/capsicum.h>
 #include <sys/filedesc.h>
 #include <sys/filio.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
+#include <sys/lock.h>
 #include <sys/proc.h>
 #include <sys/signalvar.h>
 #include <sys/socketvar.h>
@@ -74,9 +76,27 @@
 
 #include <security/audit/audit.h>
 
+/*
+ * The following macro defines how many bytes are allocated on the
+ * stack, instead of with malloc(), when copying IOCTL data structures
+ * between userspace and the kernel. Some IOCTLs with small data
+ * structures are used very frequently, and this small on-stack
+ * buffer gives a significant speedup for those requests. The value
+ * of this define should be greater than or equal to 64 bytes and
+ * should also be a power of two. The buffer is currently
+ * hard-aligned to an 8-byte boundary on the stack. This should
+ * currently be sufficient for all supported platforms.
+ */
+#define	SYS_IOCTL_SMALL_SIZE	128	/* bytes */
+#define	SYS_IOCTL_SMALL_ALIGN	8	/* bytes */
+
 int iosize_max_clamp = 1;
 SYSCTL_INT(_debug, OID_AUTO, iosize_max_clamp, CTLFLAG_RW,
     &iosize_max_clamp, 0, "Clamp max i/o size to INT_MAX");
+int devfs_iosize_max_clamp = 1;
+SYSCTL_INT(_debug, OID_AUTO, devfs_iosize_max_clamp, CTLFLAG_RW,
+    &devfs_iosize_max_clamp, 0, "Clamp max i/o size to INT_MAX for devices");
+
 /*
  * Assert that the return value of read(2) and write(2) syscalls fits
  * into a register.  If not, an architecture will need to provide the
@@ -102,7 +122,7 @@
 		    off_t, int);
 static void	doselwakeup(struct selinfo *, int);
 static void	seltdinit(struct thread *);
-static int	seltdwait(struct thread *, int);
+static int	seltdwait(struct thread *, sbintime_t, sbintime_t);
 static void	seltdclear(struct thread *);
 
 /*
@@ -242,9 +262,10 @@
 kern_readv(struct thread *td, int fd, struct uio *auio)
 {
 	struct file *fp;
+	cap_rights_t rights;
 	int error;
 
-	error = fget_read(td, fd, CAP_READ | CAP_SEEK, &fp);
+	error = fget_read(td, fd, cap_rights_init(&rights, CAP_READ), &fp);
 	if (error)
 		return (error);
 	error = dofileread(td, fd, fp, auio, (off_t)-1, 0);
@@ -285,14 +306,16 @@
 	off_t offset;
 {
 	struct file *fp;
+	cap_rights_t rights;
 	int error;
 
-	error = fget_read(td, fd, CAP_READ, &fp);
+	error = fget_read(td, fd, cap_rights_init(&rights, CAP_PREAD), &fp);
 	if (error)
 		return (error);
 	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
 		error = ESPIPE;
-	else if (offset < 0 && fp->f_vnode->v_type != VCHR)
+	else if (offset < 0 &&
+	    (fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR))
 		error = EINVAL;
 	else
 		error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET);
@@ -451,9 +474,10 @@
 kern_writev(struct thread *td, int fd, struct uio *auio)
 {
 	struct file *fp;
+	cap_rights_t rights;
 	int error;
 
-	error = fget_write(td, fd, CAP_WRITE | CAP_SEEK, &fp);
+	error = fget_write(td, fd, cap_rights_init(&rights, CAP_WRITE), &fp);
 	if (error)
 		return (error);
 	error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0);
@@ -494,14 +518,16 @@
 	off_t offset;
 {
 	struct file *fp;
+	cap_rights_t rights;
 	int error;
 
-	error = fget_write(td, fd, CAP_WRITE, &fp);
+	error = fget_write(td, fd, cap_rights_init(&rights, CAP_PWRITE), &fp);
 	if (error)
 		return (error);
 	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
 		error = ESPIPE;
-	else if (offset < 0 && fp->f_vnode->v_type != VCHR)
+	else if (offset < 0 &&
+	    (fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR))
 		error = EINVAL;
 	else
 		error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET);
@@ -574,12 +600,13 @@
 	off_t length;
 {
 	struct file *fp;
+	cap_rights_t rights;
 	int error;
 
 	AUDIT_ARG_FD(fd);
 	if (length < 0)
 		return (EINVAL);
-	error = fget(td, fd, CAP_FTRUNCATE, &fp);
+	error = fget(td, fd, cap_rights_init(&rights, CAP_FTRUNCATE), &fp);
 	if (error)
 		return (error);
 	AUDIT_ARG_FILE(td->td_proc, fp);
@@ -636,6 +663,7 @@
 int
 sys_ioctl(struct thread *td, struct ioctl_args *uap)
 {
+	u_char smalldata[SYS_IOCTL_SMALL_SIZE] __aligned(SYS_IOCTL_SMALL_ALIGN);
 	u_long com;
 	int arg, error;
 	u_int size;
@@ -670,17 +698,18 @@
 			arg = (intptr_t)uap->data;
 			data = (void *)&arg;
 			size = 0;
-		} else
-			data = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
+		} else {
+			if (size > SYS_IOCTL_SMALL_SIZE)
+				data = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
+			else
+				data = smalldata;
+		}
 	} else
 		data = (void *)&uap->data;
 	if (com & IOC_IN) {
 		error = copyin(uap->data, data, (u_int)size);
-		if (error) {
-			if (size > 0)
-				free(data, M_IOCTLOPS);
-			return (error);
-		}
+		if (error != 0)
+			goto out;
 	} else if (com & IOC_OUT) {
 		/*
 		 * Zero the buffer so the user always
@@ -694,7 +723,8 @@
 	if (error == 0 && (com & IOC_OUT))
 		error = copyout(data, uap->data, (u_int)size);
 
-	if (size > 0)
+out:
+	if (size > SYS_IOCTL_SMALL_SIZE)
 		free(data, M_IOCTLOPS);
 	return (error);
 }
@@ -704,28 +734,64 @@
 {
 	struct file *fp;
 	struct filedesc *fdp;
-	int error;
-	int tmp;
+#ifndef CAPABILITIES
+	cap_rights_t rights;
+#endif
+	int error, tmp, locked;
 
 	AUDIT_ARG_FD(fd);
 	AUDIT_ARG_CMD(com);
-	if ((error = fget(td, fd, CAP_IOCTL, &fp)) != 0)
-		return (error);
-	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
-		fdrop(fp, td);
-		return (EBADF);
-	}
+
 	fdp = td->td_proc->p_fd;
+
 	switch (com) {
 	case FIONCLEX:
+	case FIOCLEX:
 		FILEDESC_XLOCK(fdp);
-		fdp->fd_ofileflags[fd] &= ~UF_EXCLOSE;
-		FILEDESC_XUNLOCK(fdp);
+		locked = LA_XLOCKED;
+		break;
+	default:
+#ifdef CAPABILITIES
+		FILEDESC_SLOCK(fdp);
+		locked = LA_SLOCKED;
+#else
+		locked = LA_UNLOCKED;
+#endif
+		break;
+	}
+
+#ifdef CAPABILITIES
+	if ((fp = fget_locked(fdp, fd)) == NULL) {
+		error = EBADF;
 		goto out;
+	}
+	if ((error = cap_ioctl_check(fdp, fd, com)) != 0) {
+		fp = NULL;	/* fhold() was not called yet */
+		goto out;
+	}
+	fhold(fp);
+	if (locked == LA_SLOCKED) {
+		FILEDESC_SUNLOCK(fdp);
+		locked = LA_UNLOCKED;
+	}
+#else
+	error = fget(td, fd, cap_rights_init(&rights, CAP_IOCTL), &fp);
+	if (error != 0) {
+		fp = NULL;
+		goto out;
+	}
+#endif
+	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
+		error = EBADF;
+		goto out;
+	}
+
+	switch (com) {
+	case FIONCLEX:
+		fdp->fd_ofiles[fd].fde_flags &= ~UF_EXCLOSE;
+		goto out;
 	case FIOCLEX:
-		FILEDESC_XLOCK(fdp);
-		fdp->fd_ofileflags[fd] |= UF_EXCLOSE;
-		FILEDESC_XUNLOCK(fdp);
+		fdp->fd_ofiles[fd].fde_flags |= UF_EXCLOSE;
 		goto out;
 	case FIONBIO:
 		if ((tmp = *(int *)data))
@@ -745,7 +811,21 @@
 
 	error = fo_ioctl(fp, com, data, td->td_ucred, td);
 out:
-	fdrop(fp, td);
+	switch (locked) {
+	case LA_XLOCKED:
+		FILEDESC_XUNLOCK(fdp);
+		break;
+#ifdef CAPABILITIES
+	case LA_SLOCKED:
+		FILEDESC_SUNLOCK(fdp);
+		break;
+#endif
+	default:
+		FILEDESC_UNLOCK_ASSERT(fdp);
+		break;
+	}
+	if (fp != NULL)
+		fdrop(fp, td);
 	return (error);
 }
 
@@ -903,9 +983,10 @@
 	 */
 	fd_mask s_selbits[howmany(2048, NFDBITS)];
 	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
-	struct timeval atv, rtv, ttv;
-	int error, lf, ndu, timo;
+	struct timeval rtv;
+	sbintime_t asbt, precision, rsbt;
 	u_int nbufbytes, ncpbytes, ncpubytes, nfdbits;
+	int error, lf, ndu;
 
 	if (nd < 0)
 		return (EINVAL);
@@ -995,19 +1076,30 @@
 	if (nbufbytes != 0)
 		bzero(selbits, nbufbytes / 2);
 
+	precision = 0;
 	if (tvp != NULL) {
-		atv = *tvp;
-		if (itimerfix(&atv)) {
+		rtv = *tvp;
+		if (rtv.tv_sec < 0 || rtv.tv_usec < 0 ||
+		    rtv.tv_usec >= 1000000) {
 			error = EINVAL;
 			goto done;
 		}
-		getmicrouptime(&rtv);
-		timevaladd(&atv, &rtv);
-	} else {
-		atv.tv_sec = 0;
-		atv.tv_usec = 0;
-	}
-	timo = 0;
+		if (!timevalisset(&rtv))
+			asbt = 0;
+		else if (rtv.tv_sec <= INT32_MAX) {
+			rsbt = tvtosbt(rtv);
+			precision = rsbt;
+			precision >>= tc_precexp;
+			if (TIMESEL(&asbt, rsbt))
+				asbt += tc_tick_sbt;
+			if (asbt <= SBT_MAX - rsbt)
+				asbt += rsbt;
+			else
+				asbt = -1;
+		} else
+			asbt = -1;
+	} else
+		asbt = -1;
 	seltdinit(td);
 	/* Iterate until the timeout expires or descriptors become ready. */
 	for (;;) {
@@ -1014,16 +1106,7 @@
 		error = selscan(td, ibits, obits, nd);
 		if (error || td->td_retval[0] != 0)
 			break;
-		if (atv.tv_sec || atv.tv_usec) {
-			getmicrouptime(&rtv);
-			if (timevalcmp(&rtv, &atv, >=))
-				break;
-			ttv = atv;
-			timevalsub(&ttv, &rtv);
-			timo = ttv.tv_sec > 24 * 60 * 60 ?
-			    24 * 60 * 60 * hz : tvtohz(&ttv);
-		}
-		error = seltdwait(td, timo);
+		error = seltdwait(td, asbt, precision);
 		if (error)
 			break;
 		error = selrescan(td, ibits, obits);
@@ -1130,32 +1213,11 @@
 static __inline int
 getselfd_cap(struct filedesc *fdp, int fd, struct file **fpp)
 {
-	struct file *fp;
-#ifdef CAPABILITIES
-	struct file *fp_fromcap;
-	int error;
-#endif
+	cap_rights_t rights;
 
-	if ((fp = fget_unlocked(fdp, fd)) == NULL)
-		return (EBADF);
-#ifdef CAPABILITIES
-	/*
-	 * If the file descriptor is for a capability, test rights and use
-	 * the file descriptor references by the capability.
-	 */
-	error = cap_funwrap(fp, CAP_POLL_EVENT, &fp_fromcap);
-	if (error) {
-		fdrop(fp, curthread);
-		return (error);
-	}
-	if (fp != fp_fromcap) {
-		fhold(fp_fromcap);
-		fdrop(fp, curthread);
-		fp = fp_fromcap;
-	}
-#endif /* CAPABILITIES */
-	*fpp = fp;
-	return (0);
+	cap_rights_init(&rights, CAP_EVENT);
+
+	return (fget_unlocked(fdp, fd, &rights, 0, fpp, NULL));
 }
 
 /*
@@ -1241,26 +1303,60 @@
 	return (0);
 }
 
-#ifndef _SYS_SYSPROTO_H_
-struct poll_args {
-	struct pollfd *fds;
-	u_int	nfds;
-	int	timeout;
-};
-#endif
 int
-sys_poll(td, uap)
-	struct thread *td;
-	struct poll_args *uap;
+sys_poll(struct thread *td, struct poll_args *uap)
 {
+	struct timespec ts, *tsp;
+
+	if (uap->timeout != INFTIM) {
+		if (uap->timeout < 0)
+			return (EINVAL);
+		ts.tv_sec = uap->timeout / 1000;
+		ts.tv_nsec = (uap->timeout % 1000) * 1000000;
+		tsp = &ts;
+	} else
+		tsp = NULL;
+
+	return (kern_poll(td, uap->fds, uap->nfds, tsp, NULL));
+}
+
+int
+kern_poll(struct thread *td, struct pollfd *fds, u_int nfds,
+    struct timespec *tsp, sigset_t *uset)
+{
 	struct pollfd *bits;
 	struct pollfd smallbits[32];
-	struct timeval atv, rtv, ttv;
-	int error, timo;
-	u_int nfds;
+	sbintime_t sbt, precision, tmp;
+	time_t over;
+	struct timespec ts;
+	int error;
 	size_t ni;
 
-	nfds = uap->nfds;
+	precision = 0;
+	if (tsp != NULL) {
+		if (tsp->tv_sec < 0)
+			return (EINVAL);
+		if (tsp->tv_nsec < 0 || tsp->tv_nsec >= 1000000000)
+			return (EINVAL);
+		if (tsp->tv_sec == 0 && tsp->tv_nsec == 0)
+			sbt = 0;
+		else {
+			ts = *tsp;
+			if (ts.tv_sec > INT32_MAX / 2) {
+				over = ts.tv_sec - INT32_MAX / 2;
+				ts.tv_sec -= over;
+			} else
+				over = 0;
+			tmp = tstosbt(ts);
+			precision = tmp;
+			precision >>= tc_precexp;
+			if (TIMESEL(&sbt, tmp))
+				sbt += tc_tick_sbt;
+			sbt += tmp;
+		}
+	} else
+		sbt = -1;
+
 	if (nfds > maxfilesperproc && nfds > FD_SETSIZE) 
 		return (EINVAL);
 	ni = nfds * sizeof(struct pollfd);
@@ -1268,23 +1364,26 @@
 		bits = malloc(ni, M_TEMP, M_WAITOK);
 	else
 		bits = smallbits;
-	error = copyin(uap->fds, bits, ni);
+	error = copyin(fds, bits, ni);
 	if (error)
 		goto done;
-	if (uap->timeout != INFTIM) {
-		atv.tv_sec = uap->timeout / 1000;
-		atv.tv_usec = (uap->timeout % 1000) * 1000;
-		if (itimerfix(&atv)) {
-			error = EINVAL;
+
+	if (uset != NULL) {
+		error = kern_sigprocmask(td, SIG_SETMASK, uset,
+		    &td->td_oldsigmask, 0);
+		if (error)
 			goto done;
-		}
-		getmicrouptime(&rtv);
-		timevaladd(&atv, &rtv);
-	} else {
-		atv.tv_sec = 0;
-		atv.tv_usec = 0;
+		td->td_pflags |= TDP_OLDMASK;
+		/*
+		 * Make sure that ast() is called on return to
+		 * usermode and TDP_OLDMASK is cleared, restoring old
+		 * sigmask.
+		 */
+		thread_lock(td);
+		td->td_flags |= TDF_ASTPENDING;
+		thread_unlock(td);
 	}
-	timo = 0;
+
 	seltdinit(td);
 	/* Iterate until the timeout expires or descriptors become ready. */
 	for (;;) {
@@ -1291,16 +1390,7 @@
 		error = pollscan(td, bits, nfds);
 		if (error || td->td_retval[0] != 0)
 			break;
-		if (atv.tv_sec || atv.tv_usec) {
-			getmicrouptime(&rtv);
-			if (timevalcmp(&rtv, &atv, >=))
-				break;
-			ttv = atv;
-			timevalsub(&ttv, &rtv);
-			timo = ttv.tv_sec > 24 * 60 * 60 ?
-			    24 * 60 * 60 * hz : tvtohz(&ttv);
-		}
-		error = seltdwait(td, timo);
+		error = seltdwait(td, sbt, precision);
 		if (error)
 			break;
 		error = pollrescan(td);
@@ -1316,7 +1406,7 @@
 	if (error == EWOULDBLOCK)
 		error = 0;
 	if (error == 0) {
-		error = pollout(td, bits, uap->fds, nfds);
+		error = pollout(td, bits, fds, nfds);
 		if (error)
 			goto out;
 	}
@@ -1326,6 +1416,35 @@
 	return (error);
 }
 
+int
+sys_ppoll(struct thread *td, struct ppoll_args *uap)
+{
+	struct timespec ts, *tsp;
+	sigset_t set, *ssp;
+	int error;
+
+	if (uap->ts != NULL) {
+		error = copyin(uap->ts, &ts, sizeof(ts));
+		if (error)
+			return (error);
+		tsp = &ts;
+	} else
+		tsp = NULL;
+	if (uap->set != NULL) {
+		error = copyin(uap->set, &set, sizeof(set));
+		if (error)
+			return (error);
+		ssp = &set;
+	} else
+		ssp = NULL;
+	/*
+	 * fds is still a pointer to user space.  kern_poll() will
+	 * take care of copying that array into kernel space.
+	 */
+
+	return (kern_poll(td, uap->fds, uap->nfds, tsp, ssp));
+}
+
 static int
 pollrescan(struct thread *td)
 {
@@ -1336,6 +1455,9 @@
 	struct filedesc *fdp;
 	struct file *fp;
 	struct pollfd *fd;
+#ifdef CAPABILITIES
+	cap_rights_t rights;
+#endif
 	int n;
 
 	n = 0;
@@ -1349,13 +1471,15 @@
 		/* If the selinfo wasn't cleared the event didn't fire. */
 		if (si != NULL)
 			continue;
-		fp = fdp->fd_ofiles[fd->fd];
+		fp = fdp->fd_ofiles[fd->fd].fde_file;
 #ifdef CAPABILITIES
-		if ((fp == NULL)
-		    || (cap_funwrap(fp, CAP_POLL_EVENT, &fp) != 0)) {
+		if (fp == NULL ||
+		    cap_check(cap_rights(fdp, fd->fd),
+		    cap_rights_init(&rights, CAP_EVENT)) != 0)
 #else
-		if (fp == NULL) {
+		if (fp == NULL)
 #endif
+		{
 			fd->revents = POLLNVAL;
 			n++;
 			continue;
@@ -1408,25 +1532,29 @@
 	u_int nfd;
 {
 	struct filedesc *fdp = td->td_proc->p_fd;
-	int i;
 	struct file *fp;
-	int n = 0;
+#ifdef CAPABILITIES
+	cap_rights_t rights;
+#endif
+	int i, n = 0;
 
 	FILEDESC_SLOCK(fdp);
 	for (i = 0; i < nfd; i++, fds++) {
-		if (fds->fd >= fdp->fd_nfiles) {
+		if (fds->fd > fdp->fd_lastfile) {
 			fds->revents = POLLNVAL;
 			n++;
 		} else if (fds->fd < 0) {
 			fds->revents = 0;
 		} else {
-			fp = fdp->fd_ofiles[fds->fd];
+			fp = fdp->fd_ofiles[fds->fd].fde_file;
 #ifdef CAPABILITIES
-			if ((fp == NULL)
-			    || (cap_funwrap(fp, CAP_POLL_EVENT, &fp) != 0)) {
+			if (fp == NULL ||
+			    cap_check(cap_rights(fdp, fds->fd),
+			    cap_rights_init(&rights, CAP_EVENT)) != 0)
 #else
-			if (fp == NULL) {
+			if (fp == NULL)
 #endif
+			{
 				fds->revents = POLLNVAL;
 				n++;
 			} else {
@@ -1483,21 +1611,32 @@
 int
 selsocket(struct socket *so, int events, struct timeval *tvp, struct thread *td)
 {
-	struct timeval atv, rtv, ttv;
-	int error, timo;
+	struct timeval rtv;
+	sbintime_t asbt, precision, rsbt;
+	int error;
 
+	precision = 0;	/* stupid gcc! */
 	if (tvp != NULL) {
-		atv = *tvp;
-		if (itimerfix(&atv))
+		rtv = *tvp;
+		if (rtv.tv_sec < 0 || rtv.tv_usec < 0 || 
+		    rtv.tv_usec >= 1000000)
 			return (EINVAL);
-		getmicrouptime(&rtv);
-		timevaladd(&atv, &rtv);
-	} else {
-		atv.tv_sec = 0;
-		atv.tv_usec = 0;
-	}
-
-	timo = 0;
+		if (!timevalisset(&rtv))
+			asbt = 0;
+		else if (rtv.tv_sec <= INT32_MAX) {
+			rsbt = tvtosbt(rtv);
+			precision = rsbt;
+			precision >>= tc_precexp;
+			if (TIMESEL(&asbt, rsbt))
+				asbt += tc_tick_sbt;
+			if (asbt <= SBT_MAX - rsbt)
+				asbt += rsbt;
+			else
+				asbt = -1;
+		} else
+			asbt = -1;
+	} else
+		asbt = -1;
 	seltdinit(td);
 	/*
 	 * Iterate until the timeout expires or the socket becomes ready.
@@ -1508,22 +1647,11 @@
 		/* error here is actually the ready events. */
 		if (error)
 			return (0);
-		if (atv.tv_sec || atv.tv_usec) {
-			getmicrouptime(&rtv);
-			if (timevalcmp(&rtv, &atv, >=)) {
-				seltdclear(td);
-				return (EWOULDBLOCK);
-			}
-			ttv = atv;
-			timevalsub(&ttv, &rtv);
-			timo = ttv.tv_sec > 24 * 60 * 60 ?
-			    24 * 60 * 60 * hz : tvtohz(&ttv);
-		}
-		error = seltdwait(td, timo);
-		seltdclear(td);
+		error = seltdwait(td, asbt, precision);
 		if (error)
 			break;
 	}
+	seltdclear(td);
 	/* XXX Duplicates ncp/smb behavior. */
 	if (error == ERESTART)
 		error = 0;
@@ -1698,7 +1826,7 @@
 }
 
 static int
-seltdwait(struct thread *td, int timo)
+seltdwait(struct thread *td, sbintime_t sbt, sbintime_t precision)
 {
 	struct seltd *stp;
 	int error;
@@ -1717,8 +1845,11 @@
 		mtx_unlock(&stp->st_mtx);
 		return (0);
 	}
-	if (timo > 0)
-		error = cv_timedwait_sig(&stp->st_wait, &stp->st_mtx, timo);
+	if (sbt == 0)
+		error = EWOULDBLOCK;
+	else if (sbt != -1)
+		error = cv_timedwait_sig_sbt(&stp->st_wait, &stp->st_mtx,
+		    sbt, precision, C_ABSOLUTE);
 	else
 		error = cv_wait_sig(&stp->st_wait, &stp->st_mtx);
 	mtx_unlock(&stp->st_mtx);

Modified: trunk/sys/kern/sys_pipe.c
===================================================================
--- trunk/sys/kern/sys_pipe.c	2018-05-26 14:27:13 UTC (rev 9956)
+++ trunk/sys/kern/sys_pipe.c	2018-05-26 14:27:48 UTC (rev 9957)
@@ -1,5 +1,7 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1996 John S. Dyson
+ * Copyright (c) 2012 Giovanni Trematerra
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -89,7 +91,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/sys_pipe.c 321020 2017-07-15 17:25:40Z dchagin $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -128,8 +130,6 @@
 #include <vm/vm_page.h>
 #include <vm/uma.h>
 
-int	do_pipe(struct thread *td, int fildes[2], int flags);
-
 /*
  * Use this define if you want to disable *fancy* VM things.  Expect an
  * approx 30% decrease in transfer rate.  This could be useful for
@@ -137,6 +137,9 @@
  */
 /* #define PIPE_NODIRECT */
 
+#define PIPE_PEER(pipe)	\
+	(((pipe)->pipe_state & PIPE_NAMED) ? (pipe) : ((pipe)->pipe_peer))
+
 /*
  * interfaces to the outside world
  */
@@ -148,8 +151,10 @@
 static fo_kqfilter_t	pipe_kqfilter;
 static fo_stat_t	pipe_stat;
 static fo_close_t	pipe_close;
+static fo_chmod_t	pipe_chmod;
+static fo_chown_t	pipe_chown;
 
-static struct fileops pipeops = {
+struct fileops pipeops = {
 	.fo_read = pipe_read,
 	.fo_write = pipe_write,
 	.fo_truncate = pipe_truncate,
@@ -158,15 +163,23 @@
 	.fo_kqfilter = pipe_kqfilter,
 	.fo_stat = pipe_stat,
 	.fo_close = pipe_close,
-	.fo_chmod = invfo_chmod,
-	.fo_chown = invfo_chown,
+	.fo_chmod = pipe_chmod,
+	.fo_chown = pipe_chown,
+	.fo_sendfile = invfo_sendfile,
 	.fo_flags = DFLAG_PASSABLE
 };
 
 static void	filt_pipedetach(struct knote *kn);
+static void	filt_pipedetach_notsup(struct knote *kn);
+static int	filt_pipenotsup(struct knote *kn, long hint);
 static int	filt_piperead(struct knote *kn, long hint);
 static int	filt_pipewrite(struct knote *kn, long hint);
 
+static struct filterops pipe_nfiltops = {
+	.f_isfd = 1,
+	.f_detach = filt_pipedetach_notsup,
+	.f_event = filt_pipenotsup
+};
 static struct filterops pipe_rfiltops = {
 	.f_isfd = 1,
 	.f_detach = filt_pipedetach,
@@ -209,10 +222,10 @@
 static void pipeinit(void *dummy __unused);
 static void pipeclose(struct pipe *cpipe);
 static void pipe_free_kmem(struct pipe *cpipe);
-static int pipe_create(struct pipe *pipe, int backing);
+static void pipe_create(struct pipe *pipe, int backing);
+static void pipe_paircreate(struct thread *td, struct pipepair **p_pp);
 static __inline int pipelock(struct pipe *cpipe, int catch);
 static __inline void pipeunlock(struct pipe *cpipe);
-static __inline void pipeselwakeup(struct pipe *cpipe);
 #ifndef PIPE_NODIRECT
 static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio);
 static void pipe_destroy_write_buffer(struct pipe *wpipe);
@@ -303,7 +316,7 @@
 
 	pp = (struct pipepair *)mem;
 
-	mtx_init(&pp->pp_mtx, "pipe mutex", NULL, MTX_DEF | MTX_RECURSE);
+	mtx_init(&pp->pp_mtx, "pipe mutex", NULL, MTX_DEF);
 	return (0);
 }
 
@@ -319,26 +332,13 @@
 	mtx_destroy(&pp->pp_mtx);
 }
 
-/*
- * The pipe system call for the DTYPE_PIPE type of pipes.  If we fail, let
- * the zone pick up the pieces via pipeclose().
- */
-int
-kern_pipe(struct thread *td, int fildes[2])
+static void
+pipe_paircreate(struct thread *td, struct pipepair **p_pp)
 {
-	return (do_pipe(td, fildes, 0));
-}
-
-int
-do_pipe(struct thread *td, int fildes[2], int flags)
-{
-	struct filedesc *fdp = td->td_proc->p_fd;
-	struct file *rf, *wf;
 	struct pipepair *pp;
 	struct pipe *rpipe, *wpipe;
-	int fd, fflags, error;
 
-	pp = uma_zalloc(pipe_zone, M_WAITOK);
+	*p_pp = pp = uma_zalloc(pipe_zone, M_WAITOK);
 #ifdef MAC
 	/*
 	 * The MAC label is shared between the connected endpoints.  As a
@@ -355,16 +355,63 @@
 	knlist_init_mtx(&wpipe->pipe_sel.si_note, PIPE_MTX(wpipe));
 
 	/* Only the forward direction pipe is backed by default */
-	if ((error = pipe_create(rpipe, 1)) != 0 ||
-	    (error = pipe_create(wpipe, 0)) != 0) {
-		pipeclose(rpipe);
-		pipeclose(wpipe);
-		return (error);
-	}
+	pipe_create(rpipe, 1);
+	pipe_create(wpipe, 0);
 
 	rpipe->pipe_state |= PIPE_DIRECTOK;
 	wpipe->pipe_state |= PIPE_DIRECTOK;
+}
 
+void
+pipe_named_ctor(struct pipe **ppipe, struct thread *td)
+{
+	struct pipepair *pp;
+
+	pipe_paircreate(td, &pp);
+	pp->pp_rpipe.pipe_state |= PIPE_NAMED;
+	*ppipe = &pp->pp_rpipe;
+}
+
+void
+pipe_dtor(struct pipe *dpipe)
+{
+	struct pipe *peer;
+	ino_t ino;
+
+	ino = dpipe->pipe_ino;
+	peer = (dpipe->pipe_state & PIPE_NAMED) != 0 ? dpipe->pipe_peer : NULL;
+	funsetown(&dpipe->pipe_sigio);
+	pipeclose(dpipe);
+	if (peer != NULL) {
+		funsetown(&peer->pipe_sigio);
+		pipeclose(peer);
+	}
+	if (ino != 0 && ino != (ino_t)-1)
+		free_unr(pipeino_unr, ino);
+}
+
+/*
+ * The pipe system call for the DTYPE_PIPE type of pipes.  If we fail, let
+ * the zone pick up the pieces via pipeclose().
+ */
+int
+kern_pipe(struct thread *td, int fildes[2])
+{
+
+	return (kern_pipe2(td, fildes, 0));
+}
+
+int
+kern_pipe2(struct thread *td, int fildes[2], int flags)
+{
+	struct file *rf, *wf;
+	struct pipe *rpipe, *wpipe;
+	struct pipepair *pp;
+	int fd, fflags, error;
+
+	pipe_paircreate(td, &pp);
+	rpipe = &pp->pp_rpipe;
+	wpipe = &pp->pp_wpipe;
 	error = falloc(td, &rf, &fd, flags);
 	if (error) {
 		pipeclose(rpipe);
@@ -387,7 +434,7 @@
 	finit(rf, fflags, DTYPE_PIPE, rpipe, &pipeops);
 	error = falloc(td, &wf, &fd, flags);
 	if (error) {
-		fdclose(fdp, rf, fildes[0], td);
+		fdclose(td, rf, fildes[0]);
 		fdrop(rf, td);
 		/* rpipe has been closed by fdrop(). */
 		pipeclose(wpipe);
@@ -412,7 +459,7 @@
 	error = kern_pipe(td, fildes);
 	if (error)
 		return (error);
-	
+
 	td->td_retval[0] = fildes[0];
 	td->td_retval[1] = fildes[1];
 
@@ -419,6 +466,24 @@
 	return (0);
 }
 
+int
+sys_pipe2(struct thread *td, struct pipe2_args *uap)
+{
+	int error, fildes[2];
+
+	if (uap->flags & ~(O_CLOEXEC | O_NONBLOCK))
+		return (EINVAL);
+	error = kern_pipe2(td, fildes, uap->flags);
+	if (error)
+		return (error);
+	error = copyout(fildes, uap->fildes, 2 * sizeof(int));
+	if (error) {
+		(void)kern_close(td, fildes[0]);
+		(void)kern_close(td, fildes[1]);
+	}
+	return (error);
+}
+
 /*
  * Allocate kva for pipe circular buffer, the space is pageable
  * This routine will 'realloc' the size of a pipe safely, if it fails
@@ -447,7 +512,7 @@
 	buffer = (caddr_t) vm_map_min(pipe_map);
 
 	error = vm_map_find(pipe_map, NULL, 0,
-		(vm_offset_t *) &buffer, size, 1,
+		(vm_offset_t *) &buffer, size, 0, VMFS_ANY_SPACE,
 		VM_PROT_ALL, VM_PROT_ALL, 0);
 	if (error != KERN_SUCCESS) {
 		if ((cpipe->pipe_buffer.buffer == NULL) &&
@@ -545,7 +610,7 @@
 	}
 }
 
-static __inline void
+void
 pipeselwakeup(cpipe)
 	struct pipe *cpipe;
 {
@@ -565,24 +630,27 @@
  * Initialize and allocate VM and memory for pipe.  The structure
  * will start out zero'd from the ctor, so we just manage the kmem.
  */
-static int
+static void
 pipe_create(pipe, backing)
 	struct pipe *pipe;
 	int backing;
 {
-	int error;
 
 	if (backing) {
+		/*
+		 * Note that pipespace_new() can fail if the pipe map is
+		 * exhausted (as a result of too many pipes being created), but
+		 * we ignore the error as it is not fatal and could be provoked
+		 * by unprivileged users.  The only consequence is worse
+		 * performance with the given pipe.
+		 */
 		if (amountpipekva > maxpipekva / 2)
-			error = pipespace_new(pipe, SMALL_PIPE_SIZE);
+			(void)pipespace_new(pipe, SMALL_PIPE_SIZE);
 		else
-			error = pipespace_new(pipe, PIPE_SIZE);
-	} else {
-		/* If we're not backing this pipe, no need to do anything. */
-		error = 0;
+			(void)pipespace_new(pipe, PIPE_SIZE);
 	}
+
 	pipe->pipe_ino = -1;
-	return (error);
 }
 
 /* ARGSUSED */
@@ -594,11 +662,12 @@
 	struct thread *td;
 	int flags;
 {
-	struct pipe *rpipe = fp->f_data;
+	struct pipe *rpipe;
 	int error;
 	int nread = 0;
 	int size;
 
+	rpipe = fp->f_data;
 	PIPE_LOCK(rpipe);
 	++rpipe->pipe_busy;
 	error = pipelock(rpipe, 1);
@@ -675,7 +744,7 @@
 			rpipe->pipe_map.pos += size;
 			rpipe->pipe_map.cnt -= size;
 			if (rpipe->pipe_map.cnt == 0) {
-				rpipe->pipe_state &= ~PIPE_DIRECTW;
+				rpipe->pipe_state &= ~(PIPE_DIRECTW|PIPE_WANTW);
 				wakeup(rpipe);
 			}
 #endif
@@ -875,9 +944,10 @@
 retry:
 	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
 	error = pipelock(wpipe, 1);
-	if (wpipe->pipe_state & PIPE_EOF)
+	if (error != 0)
+		goto error1;
+	if ((wpipe->pipe_state & PIPE_EOF) != 0) {
 		error = EPIPE;
-	if (error) {
 		pipeunlock(wpipe);
 		goto error1;
 	}
@@ -938,6 +1008,7 @@
 			wakeup(wpipe);
 		}
 		pipeselwakeup(wpipe);
+		wpipe->pipe_state |= PIPE_WANTW;
 		pipeunlock(wpipe);
 		error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH,
 		    "pipdwt", 0);
@@ -978,8 +1049,7 @@
 	struct pipe *wpipe, *rpipe;
 
 	rpipe = fp->f_data;
-	wpipe = rpipe->pipe_peer;
-
+	wpipe = PIPE_PEER(rpipe);
 	PIPE_LOCK(rpipe);
 	error = pipelock(wpipe, 1);
 	if (error) {
@@ -1224,13 +1294,13 @@
 	}
 
 	/*
-	 * Don't return EPIPE if I/O was successful
+	 * Don't return EPIPE if any byte was written.
+	 * EINTR and other interrupts are handled by the generic I/O layer.
+	 * Do not pretend that I/O succeeded for an obvious user error
+	 * like EFAULT.
 	 */
-	if ((wpipe->pipe_buffer.cnt == 0) &&
-	    (uio->uio_resid == 0) &&
-	    (error == EPIPE)) {
+	if (uio->uio_resid != orig_resid && error == EPIPE)
 		error = 0;
-	}
 
 	if (error == 0)
 		vfs_timestamp(&wpipe->pipe_mtime);
@@ -1256,6 +1326,9 @@
 	struct thread *td;
 {
 
+	/* For named pipes call the vnode operation. */
+	if (fp->f_vnode != NULL)
+		return (vnops.fo_truncate(fp, length, active_cred, td));
 	return (EINVAL);
 }
 
@@ -1298,6 +1371,11 @@
 		break;
 
 	case FIONREAD:
+		if (!(fp->f_flag & FREAD)) {
+			*(int *)data = 0;
+			PIPE_UNLOCK(mpipe);
+			return (0);
+		}
 		if (mpipe->pipe_state & PIPE_DIRECTW)
 			*(int *)data = mpipe->pipe_map.cnt;
 		else
@@ -1340,14 +1418,16 @@
 	struct ucred *active_cred;
 	struct thread *td;
 {
-	struct pipe *rpipe = fp->f_data;
+	struct pipe *rpipe;
 	struct pipe *wpipe;
-	int revents = 0;
+	int levents, revents;
 #ifdef MAC
 	int error;
 #endif
 
-	wpipe = rpipe->pipe_peer;
+	revents = 0;
+	rpipe = fp->f_data;
+	wpipe = PIPE_PEER(rpipe);
 	PIPE_LOCK(rpipe);
 #ifdef MAC
 	error = mac_pipe_check_poll(active_cred, rpipe->pipe_pair);
@@ -1354,12 +1434,12 @@
 	if (error)
 		goto locked_error;
 #endif
-	if (events & (POLLIN | POLLRDNORM))
+	if (fp->f_flag & FREAD && events & (POLLIN | POLLRDNORM))
 		if ((rpipe->pipe_state & PIPE_DIRECTW) ||
 		    (rpipe->pipe_buffer.cnt > 0))
 			revents |= events & (POLLIN | POLLRDNORM);
 
-	if (events & (POLLOUT | POLLWRNORM))
+	if (fp->f_flag & FWRITE && events & (POLLOUT | POLLWRNORM))
 		if (wpipe->pipe_present != PIPE_ACTIVE ||
 		    (wpipe->pipe_state & PIPE_EOF) ||
 		    (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
@@ -1367,6 +1447,12 @@
 			 wpipe->pipe_buffer.size == 0)))
 			revents |= events & (POLLOUT | POLLWRNORM);
 
+	levents = events &
+	    (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM | POLLRDBAND);
+	if (rpipe->pipe_state & PIPE_NAMED && fp->f_flag & FREAD && levents &&
+	    fp->f_seqcount == rpipe->pipe_wgen)
+		events |= POLLINIGNEOF;
+
 	if ((events & POLLINIGNEOF) == 0) {
 		if (rpipe->pipe_state & PIPE_EOF) {
 			revents |= (events & (POLLIN | POLLRDNORM));
@@ -1377,13 +1463,13 @@
 	}
 
 	if (revents == 0) {
-		if (events & (POLLIN | POLLRDNORM)) {
+		if (fp->f_flag & FREAD && events & (POLLIN | POLLRDNORM)) {
 			selrecord(td, &rpipe->pipe_sel);
 			if (SEL_WAITING(&rpipe->pipe_sel))
 				rpipe->pipe_state |= PIPE_SEL;
 		}
 
-		if (events & (POLLOUT | POLLWRNORM)) {
+		if (fp->f_flag & FWRITE && events & (POLLOUT | POLLWRNORM)) {
 			selrecord(td, &wpipe->pipe_sel);
 			if (SEL_WAITING(&wpipe->pipe_sel))
 				wpipe->pipe_state |= PIPE_SEL;
@@ -1423,6 +1509,13 @@
 		return (error);
 	}
 #endif
+
+	/* For named pipes ask the underlying filesystem. */
+	if (pipe->pipe_state & PIPE_NAMED) {
+		PIPE_UNLOCK(pipe);
+		return (vnops.fo_stat(fp, ub, active_cred, td));
+	}
+
 	/*
 	 * Lazily allocate an inode number for the pipe.  Most pipe
 	 * users do not call fstat(2) on the pipe, which means that
@@ -1469,15 +1562,48 @@
 	struct file *fp;
 	struct thread *td;
 {
-	struct pipe *cpipe = fp->f_data;
 
+	if (fp->f_vnode != NULL) 
+		return vnops.fo_close(fp, td);
 	fp->f_ops = &badfileops;
+	pipe_dtor(fp->f_data);
 	fp->f_data = NULL;
-	funsetown(&cpipe->pipe_sigio);
-	pipeclose(cpipe);
 	return (0);
 }
 
+static int
+pipe_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td)
+{
+	struct pipe *cpipe;
+	int error;
+
+	cpipe = fp->f_data;
+	if (cpipe->pipe_state & PIPE_NAMED)
+		error = vn_chmod(fp, mode, active_cred, td);
+	else
+		error = invfo_chmod(fp, mode, active_cred, td);
+	return (error);
+}
+
+static int
+pipe_chown(fp, uid, gid, active_cred, td)
+	struct file *fp;
+	uid_t uid;
+	gid_t gid;
+	struct ucred *active_cred;
+	struct thread *td;
+{
+	struct pipe *cpipe;
+	int error;
+
+	cpipe = fp->f_data;
+	if (cpipe->pipe_state & PIPE_NAMED)
+		error = vn_chown(fp, uid, gid, active_cred, td);
+	else
+		error = invfo_chown(fp, uid, gid, active_cred, td);
+	return (error);
+}
+
 static void
 pipe_free_kmem(cpipe)
 	struct pipe *cpipe;
@@ -1511,7 +1637,6 @@
 {
 	struct pipepair *pp;
 	struct pipe *ppipe;
-	ino_t ino;
 
 	KASSERT(cpipe != NULL, ("pipeclose: cpipe == NULL"));
 
@@ -1570,12 +1695,6 @@
 	knlist_destroy(&cpipe->pipe_sel.si_note);
 
 	/*
-	 * Postpone the destroy of the fake inode number allocated for
-	 * our end, until pipe mtx is unlocked.
-	 */
-	ino = cpipe->pipe_ino;
-
-	/*
 	 * If both endpoints are now closed, release the memory for the
 	 * pipe pair.  If not, unlock.
 	 */
@@ -1587,9 +1706,6 @@
 		uma_zfree(pipe_zone, cpipe->pipe_pair);
 	} else
 		PIPE_UNLOCK(cpipe);
-
-	if (ino != 0 && ino != (ino_t)-1)
-		free_unr(pipeino_unr, ino);
 }
 
 /*ARGSUSED*/
@@ -1598,7 +1714,20 @@
 {
 	struct pipe *cpipe;
 
-	cpipe = kn->kn_fp->f_data;
+	/*
+	 * If a filter is requested that is not supported by this file
+	 * descriptor, don't return an error, but also don't ever generate an
+	 * event.
+	 */
+	if ((kn->kn_filter == EVFILT_READ) && !(fp->f_flag & FREAD)) {
+		kn->kn_fop = &pipe_nfiltops;
+		return (0);
+	}
+	if ((kn->kn_filter == EVFILT_WRITE) && !(fp->f_flag & FWRITE)) {
+		kn->kn_fop = &pipe_nfiltops;
+		return (0);
+	}
+	cpipe = fp->f_data;
 	PIPE_LOCK(cpipe);
 	switch (kn->kn_filter) {
 	case EVFILT_READ:
@@ -1611,7 +1740,7 @@
 			PIPE_UNLOCK(cpipe);
 			return (EPIPE);
 		}
-		cpipe = cpipe->pipe_peer;
+		cpipe = PIPE_PEER(cpipe);
 		break;
 	default:
 		PIPE_UNLOCK(cpipe);
@@ -1618,6 +1747,7 @@
 		return (EINVAL);
 	}
 
+	kn->kn_hook = cpipe; 
 	knlist_add(&cpipe->pipe_sel.si_note, kn, 1);
 	PIPE_UNLOCK(cpipe);
 	return (0);
@@ -1626,11 +1756,9 @@
 static void
 filt_pipedetach(struct knote *kn)
 {
-	struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;
+	struct pipe *cpipe = kn->kn_hook;
 
 	PIPE_LOCK(cpipe);
-	if (kn->kn_filter == EVFILT_WRITE)
-		cpipe = cpipe->pipe_peer;
 	knlist_remove(&cpipe->pipe_sel.si_note, kn, 1);
 	PIPE_UNLOCK(cpipe);
 }
@@ -1639,11 +1767,11 @@
 static int
 filt_piperead(struct knote *kn, long hint)
 {
-	struct pipe *rpipe = kn->kn_fp->f_data;
+	struct pipe *rpipe = kn->kn_hook;
 	struct pipe *wpipe = rpipe->pipe_peer;
 	int ret;
 
-	PIPE_LOCK(rpipe);
+	PIPE_LOCK_ASSERT(rpipe, MA_OWNED);
 	kn->kn_data = rpipe->pipe_buffer.cnt;
 	if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
 		kn->kn_data = rpipe->pipe_map.cnt;
@@ -1652,11 +1780,9 @@
 	    wpipe->pipe_present != PIPE_ACTIVE ||
 	    (wpipe->pipe_state & PIPE_EOF)) {
 		kn->kn_flags |= EV_EOF;
-		PIPE_UNLOCK(rpipe);
 		return (1);
 	}
 	ret = kn->kn_data > 0;
-	PIPE_UNLOCK(rpipe);
 	return ret;
 }
 
@@ -1664,15 +1790,14 @@
 static int
 filt_pipewrite(struct knote *kn, long hint)
 {
-	struct pipe *rpipe = kn->kn_fp->f_data;
-	struct pipe *wpipe = rpipe->pipe_peer;
-
-	PIPE_LOCK(rpipe);
+	struct pipe *wpipe;
+   
+	wpipe = kn->kn_hook;
+	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
 	if (wpipe->pipe_present != PIPE_ACTIVE ||
 	    (wpipe->pipe_state & PIPE_EOF)) {
 		kn->kn_data = 0;
 		kn->kn_flags |= EV_EOF;
-		PIPE_UNLOCK(rpipe);
 		return (1);
 	}
 	kn->kn_data = (wpipe->pipe_buffer.size > 0) ?
@@ -1680,6 +1805,18 @@
 	if (wpipe->pipe_state & PIPE_DIRECTW)
 		kn->kn_data = 0;
 
-	PIPE_UNLOCK(rpipe);
 	return (kn->kn_data >= PIPE_BUF);
 }
+
+static void
+filt_pipedetach_notsup(struct knote *kn)
+{
+
+}
+
+static int
+filt_pipenotsup(struct knote *kn, long hint)
+{
+
+	return (0);
+}

Modified: trunk/sys/kern/sys_procdesc.c
===================================================================
--- trunk/sys/kern/sys_procdesc.c	2018-05-26 14:27:13 UTC (rev 9956)
+++ trunk/sys/kern/sys_procdesc.c	2018-05-26 14:27:48 UTC (rev 9957)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 2009 Robert N. M. Watson
  * All rights reserved.
@@ -59,12 +60,12 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/sys_procdesc.c 280258 2015-03-19 13:37:36Z rwatson $");
 
 #include "opt_procdesc.h"
 
 #include <sys/param.h>
-#include <sys/capability.h>
+#include <sys/capsicum.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
@@ -113,6 +114,7 @@
 	.fo_close = procdesc_close,
 	.fo_chmod = procdesc_chmod,
 	.fo_chown = procdesc_chown,
+	.fo_sendfile = invfo_sendfile,
 	.fo_flags = DFLAG_PASSABLE,
 };
 
@@ -137,7 +139,7 @@
  * died.
  */
 int
-procdesc_find(struct thread *td, int fd, cap_rights_t rights,
+procdesc_find(struct thread *td, int fd, cap_rights_t *rightsp,
     struct proc **p)
 {
 	struct procdesc *pd;
@@ -144,7 +146,7 @@
 	struct file *fp;
 	int error;
 
-	error = fget(td, fd, rights, &fp);
+	error = fget(td, fd, rightsp, &fp);
 	if (error)
 		return (error);
 	if (fp->f_type != DTYPE_PROCDESC) {
@@ -184,12 +186,12 @@
  * Retrieve the PID associated with a process descriptor.
  */
 int
-kern_pdgetpid(struct thread *td, int fd, cap_rights_t rights, pid_t *pidp)
+kern_pdgetpid(struct thread *td, int fd, cap_rights_t *rightsp, pid_t *pidp)
 {
 	struct file *fp;
 	int error;
 
-	error = fget(td, fd, rights, &fp);
+	error = fget(td, fd, rightsp, &fp);
 	if (error)
 		return (error);
 	if (fp->f_type != DTYPE_PROCDESC) {
@@ -208,11 +210,13 @@
 int
 sys_pdgetpid(struct thread *td, struct pdgetpid_args *uap)
 {
+	cap_rights_t rights;
 	pid_t pid;
 	int error;
 
 	AUDIT_ARG_FD(uap->fd);
-	error = kern_pdgetpid(td, uap->fd, CAP_PDGETPID, &pid);
+	error = kern_pdgetpid(td, uap->fd,
+	    cap_rights_init(&rights, CAP_PDGETPID), &pid);
 	if (error == 0)
 		error = copyout(&pid, uap->pidp, sizeof(pid));
 	return (error);
@@ -333,12 +337,13 @@
 
 	pd = p->p_procdesc;
 	pd->pd_proc = NULL;
+	p->p_procdesc = NULL;
 	procdesc_free(pd);
 }
 
 /*
  * procdesc_close() - last close on a process descriptor.  If the process is
- * still running, terminate with SIGKILL (unless PD_DAEMON is set) and let
+ * still running, terminate with SIGKILL (unless PDF_DAEMON is set) and let
  * init(8) clean up the mess; if not, we have to clean up the zombie ourselves.
  */
 static int
@@ -358,14 +363,20 @@
 	pd->pd_flags |= PDF_CLOSED;
 	PROCDESC_UNLOCK(pd);
 	p = pd->pd_proc;
-	PROC_LOCK(p);
-	if (p->p_state == PRS_ZOMBIE) {
+	if (p == NULL) {
 		/*
+		 * This is the case where the process's exit status was already
+		 * collected and procdesc_reap() was already called.
+		 */
+		sx_xunlock(&proctree_lock);
+	} else if (p->p_state == PRS_ZOMBIE) {
+		/*
 		 * If the process is already dead and just awaiting reaping,
 		 * do that now.  This will release the process's reference to
 		 * the process descriptor when it calls back into
 		 * procdesc_reap().
 		 */
+		PROC_LOCK(p);
 		PROC_SLOCK(p);
 		proc_reap(curthread, p, NULL, 0);
 	} else {
@@ -376,6 +387,7 @@
 		 * process from its descriptor so that its exit status will
 		 * be reported normally.
 		 */
+		PROC_LOCK(p);
 		pd->pd_proc = NULL;
 		p->p_procdesc = NULL;
 		procdesc_free(pd);
@@ -386,7 +398,7 @@
 		 */
 		p->p_sigparent = SIGCHLD;
 		proc_reparent(p, initproc);
-		if ((pd->pd_flags & PD_DAEMON) == 0)
+		if ((pd->pd_flags & PDF_DAEMON) == 0)
 			kern_psignal(p, SIGKILL);
 		PROC_UNLOCK(p);
 		sx_xunlock(&proctree_lock);

Modified: trunk/sys/kern/sys_process.c
===================================================================
--- trunk/sys/kern/sys_process.c	2018-05-26 14:27:13 UTC (rev 9956)
+++ trunk/sys/kern/sys_process.c	2018-05-26 14:27:48 UTC (rev 9957)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1994, Sean Eric Fagan
  * All rights reserved.
@@ -30,7 +31,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/sys_process.c 328379 2018-01-24 21:48:39Z jhb $");
 
 #include "opt_compat.h"
 
@@ -41,9 +42,11 @@
 #include <sys/syscallsubr.h>
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
+#include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/vnode.h>
 #include <sys/ptrace.h>
+#include <sys/rwlock.h>
 #include <sys/sx.h>
 #include <sys/malloc.h>
 #include <sys/signalvar.h>
@@ -59,7 +62,6 @@
 #include <vm/vm_kern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
-#include <vm/vm_pager.h>
 #include <vm/vm_param.h>
 
 #ifdef COMPAT_FREEBSD32
@@ -94,7 +96,9 @@
 	sigset_t	pl_siglist;	/* LWP pending signal */
 	struct siginfo32 pl_siginfo;	/* siginfo for signal */
 	char	pl_tdname[MAXCOMLEN + 1];	/* LWP name. */
-	int	pl_child_pid;		/* New child pid */
+	pid_t	pl_child_pid;		/* New child pid */
+	u_int		pl_syscall_code;
+	u_int		pl_syscall_narg;
 };
 
 #endif
@@ -335,7 +339,7 @@
 	struct vnode *vp;
 	char *freepath, *fullpath;
 	u_int pathlen;
-	int error, index, vfslocked;
+	int error, index;
 
 	error = 0;
 	obj = NULL;
@@ -382,11 +386,10 @@
 
 		obj = entry->object.vm_object;
 		if (obj != NULL)
-			VM_OBJECT_LOCK(obj);
+			VM_OBJECT_RLOCK(obj);
 	} while (0);
 
 	vm_map_unlock_read(map);
-	vmspace_free(vm);
 
 	pve->pve_fsid = VNOVAL;
 	pve->pve_fileid = VNOVAL;
@@ -395,24 +398,23 @@
 		lobj = obj;
 		for (tobj = obj; tobj != NULL; tobj = tobj->backing_object) {
 			if (tobj != obj)
-				VM_OBJECT_LOCK(tobj);
+				VM_OBJECT_RLOCK(tobj);
 			if (lobj != obj)
-				VM_OBJECT_UNLOCK(lobj);
+				VM_OBJECT_RUNLOCK(lobj);
 			lobj = tobj;
 			pve->pve_offset += tobj->backing_object_offset;
 		}
-		vp = (lobj->type == OBJT_VNODE) ? lobj->handle : NULL;
+		vp = vm_object_vnode(lobj);
 		if (vp != NULL)
 			vref(vp);
 		if (lobj != obj)
-			VM_OBJECT_UNLOCK(lobj);
-		VM_OBJECT_UNLOCK(obj);
+			VM_OBJECT_RUNLOCK(lobj);
+		VM_OBJECT_RUNLOCK(obj);
 
 		if (vp != NULL) {
 			freepath = NULL;
 			fullpath = NULL;
 			vn_fullpath(td, vp, &fullpath, &freepath);
-			vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 			vn_lock(vp, LK_SHARED | LK_RETRY);
 			if (VOP_GETATTR(vp, &vattr, td->td_ucred) == 0) {
 				pve->pve_fileid = vattr.va_fileid;
@@ -419,7 +421,6 @@
 				pve->pve_fsid = vattr.va_fsid;
 			}
 			vput(vp);
-			VFS_UNLOCK_GIANT(vfslocked);
 
 			if (fullpath != NULL) {
 				pve->pve_pathlen = strlen(fullpath) + 1;
@@ -433,12 +434,16 @@
 				free(freepath, M_TEMP);
 		}
 	}
+	vmspace_free(vm);
+	if (error == 0)
+		CTR3(KTR_PTRACE, "PT_VM_ENTRY: pid %d, entry %d, start %p",
+		    p->p_pid, pve->pve_entry, pve->pve_start);
 
 	return (error);
 }
 
 #ifdef COMPAT_FREEBSD32
-static int      
+static int
 ptrace_vm_entry32(struct thread *td, struct proc *p,
     struct ptrace_vm_entry32 *pve32)
 {
@@ -470,6 +475,7 @@
     struct ptrace_lwpinfo32 *pl32)
 {
 
+	bzero(pl32, sizeof(*pl32));
 	pl32->pl_lwpid = pl->pl_lwpid;
 	pl32->pl_event = pl->pl_event;
 	pl32->pl_flags = pl->pl_flags;
@@ -478,6 +484,8 @@
 	siginfo_to_siginfo32(&pl->pl_siginfo, &pl32->pl_siginfo);
 	strcpy(pl32->pl_tdname, pl->pl_tdname);
 	pl32->pl_child_pid = pl->pl_child_pid;
+	pl32->pl_syscall_code = pl->pl_syscall_code;
+	pl32->pl_syscall_narg = pl->pl_syscall_narg;
 }
 #endif /* COMPAT_FREEBSD32 */
 
@@ -536,6 +544,7 @@
 		struct ptrace_lwpinfo32 pl32;
 		struct ptrace_vm_entry32 pve32;
 #endif
+		int ptevents;
 	} r;
 	void *addr;
 	int error = 0;
@@ -550,6 +559,7 @@
 	AUDIT_ARG_VALUE(uap->data);
 	addr = &r;
 	switch (uap->req) {
+	case PT_GET_EVENT_MASK:
 	case PT_GETREGS:
 	case PT_GETFPREGS:
 	case PT_GETDBREGS:
@@ -564,6 +574,12 @@
 	case PT_SETDBREGS:
 		error = COPYIN(uap->addr, &r.dbreg, sizeof r.dbreg);
 		break;
+	case PT_SET_EVENT_MASK:
+		if (uap->data != sizeof(r.ptevents))
+			error = EINVAL;
+		else
+			error = copyin(uap->addr, &r.ptevents, uap->data);
+		break;
 	case PT_IO:
 		error = COPYIN(uap->addr, &r.piod, sizeof r.piod);
 		break;
@@ -597,7 +613,12 @@
 	case PT_GETDBREGS:
 		error = COPYOUT(&r.dbreg, uap->addr, sizeof r.dbreg);
 		break;
+	case PT_GET_EVENT_MASK:
+		/* NB: The size in uap->data is validated in kern_ptrace(). */
+		error = copyout(&r.ptevents, uap->addr, uap->data);
+		break;
 	case PT_LWPINFO:
+		/* NB: The size in uap->data is validated in kern_ptrace(). */
 		error = copyout(&r.pl, uap->addr, uap->data);
 		break;
 	}
@@ -629,6 +650,18 @@
 #define	PROC_WRITE(w, t, a)	proc_write_ ## w (t, a)
 #endif
 
+void
+proc_set_traced(struct proc *p, bool stop)
+{
+
+	PROC_LOCK_ASSERT(p, MA_OWNED);
+	p->p_flag |= P_TRACED;
+	if (stop)
+		p->p_flag2 |= P2_PTRACE_FSTP;
+	p->p_ptevents = PTRACE_DEFAULT;
+	p->p_oppid = p->p_pptr->p_pid;
+}
+
 int
 kern_ptrace(struct thread *td, int req, pid_t pid, void *addr, int data)
 {
@@ -635,7 +668,7 @@
 	struct iovec iov;
 	struct uio uio;
 	struct proc *curp, *p, *pp;
-	struct thread *td2 = NULL;
+	struct thread *td2 = NULL, *td3;
 	struct ptrace_io_desc *piod = NULL;
 	struct ptrace_lwpinfo *pl;
 	int error, write, tmp, num;
@@ -660,6 +693,9 @@
 	case PT_TO_SCX:
 	case PT_SYSCALL:
 	case PT_FOLLOW_FORK:
+	case PT_LWP_EVENTS:
+	case PT_GET_EVENT_MASK:
+	case PT_SET_EVENT_MASK:
 	case PT_DETACH:
 		sx_xlock(&proctree_lock);
 		proctree_locked = 1;
@@ -737,12 +773,23 @@
 	 */
 	switch (req) {
 	case PT_TRACE_ME:
-		/* Always legal. */
+		/*
+		 * Always legal, when there is a parent process which
+		 * could trace us.  Otherwise, reject.
+		 */
+		if ((p->p_flag & P_TRACED) != 0) {
+			error = EBUSY;
+			goto fail;
+		}
+		if (p->p_pptr == initproc) {
+			error = EPERM;
+			goto fail;
+		}
 		break;
 
 	case PT_ATTACH:
 		/* Self */
-		if (p->p_pid == td->td_proc->p_pid) {
+		if (p == td->td_proc) {
 			error = EINVAL;
 			goto fail;
 		}
@@ -823,10 +870,10 @@
 	switch (req) {
 	case PT_TRACE_ME:
 		/* set my trace flag and "owner" so it can read/write me */
-		p->p_flag |= P_TRACED;
+		proc_set_traced(p, false);
 		if (p->p_flag & P_PPWAIT)
 			p->p_flag |= P_PPTRACE;
-		p->p_oppid = p->p_pptr->p_pid;
+		CTR1(KTR_PTRACE, "PT_TRACE_ME: pid %d", p->p_pid);
 		break;
 
 	case PT_ATTACH:
@@ -840,23 +887,30 @@
 		 * The old parent is remembered so we can put things back
 		 * on a "detach".
 		 */
-		p->p_flag |= P_TRACED;
-		p->p_oppid = p->p_pptr->p_pid;
+		proc_set_traced(p, true);
 		if (p->p_pptr != td->td_proc) {
 			proc_reparent(p, td->td_proc);
 		}
 		data = SIGSTOP;
+		CTR2(KTR_PTRACE, "PT_ATTACH: pid %d, oppid %d", p->p_pid,
+		    p->p_oppid);
 		goto sendsig;	/* in PT_CONTINUE below */
 
 	case PT_CLEARSTEP:
+		CTR2(KTR_PTRACE, "PT_CLEARSTEP: tid %d (pid %d)", td2->td_tid,
+		    p->p_pid);
 		error = ptrace_clear_single_step(td2);
 		break;
 
 	case PT_SETSTEP:
+		CTR2(KTR_PTRACE, "PT_SETSTEP: tid %d (pid %d)", td2->td_tid,
+		    p->p_pid);
 		error = ptrace_single_step(td2);
 		break;
 
 	case PT_SUSPEND:
+		CTR2(KTR_PTRACE, "PT_SUSPEND: tid %d (pid %d)", td2->td_tid,
+		    p->p_pid);
 		td2->td_dbgflags |= TDB_SUSPEND;
 		thread_lock(td2);
 		td2->td_flags |= TDF_NEEDSUSPCHK;
@@ -864,16 +918,57 @@
 		break;
 
 	case PT_RESUME:
+		CTR2(KTR_PTRACE, "PT_RESUME: tid %d (pid %d)", td2->td_tid,
+		    p->p_pid);
 		td2->td_dbgflags &= ~TDB_SUSPEND;
 		break;
 
 	case PT_FOLLOW_FORK:
+		CTR3(KTR_PTRACE, "PT_FOLLOW_FORK: pid %d %s -> %s", p->p_pid,
+		    p->p_ptevents & PTRACE_FORK ? "enabled" : "disabled",
+		    data ? "enabled" : "disabled");
 		if (data)
-			p->p_flag |= P_FOLLOWFORK;
+			p->p_ptevents |= PTRACE_FORK;
 		else
-			p->p_flag &= ~P_FOLLOWFORK;
+			p->p_ptevents &= ~PTRACE_FORK;
 		break;
 
+	case PT_LWP_EVENTS:
+		CTR3(KTR_PTRACE, "PT_LWP_EVENTS: pid %d %s -> %s", p->p_pid,
+		    p->p_ptevents & PTRACE_LWP ? "enabled" : "disabled",
+		    data ? "enabled" : "disabled");
+		if (data)
+			p->p_ptevents |= PTRACE_LWP;
+		else
+			p->p_ptevents &= ~PTRACE_LWP;
+		break;
+
+	case PT_GET_EVENT_MASK:
+		if (data != sizeof(p->p_ptevents)) {
+			error = EINVAL;
+			break;
+		}
+		CTR2(KTR_PTRACE, "PT_GET_EVENT_MASK: pid %d mask %#x", p->p_pid,
+		    p->p_ptevents);
+		*(int *)addr = p->p_ptevents;
+		break;
+
+	case PT_SET_EVENT_MASK:
+		if (data != sizeof(p->p_ptevents)) {
+			error = EINVAL;
+			break;
+		}
+		tmp = *(int *)addr;
+		if ((tmp & ~(PTRACE_EXEC | PTRACE_SCE | PTRACE_SCX |
+		    PTRACE_FORK | PTRACE_LWP | PTRACE_VFORK)) != 0) {
+			error = EINVAL;
+			break;
+		}
+		CTR3(KTR_PTRACE, "PT_SET_EVENT_MASK: pid %d mask %#x -> %#x",
+		    p->p_pid, p->p_ptevents, tmp);
+		p->p_ptevents = tmp;
+		break;
+		
 	case PT_STEP:
 	case PT_CONTINUE:
 	case PT_TO_SCE:
@@ -888,6 +983,8 @@
 
 		switch (req) {
 		case PT_STEP:
+			CTR3(KTR_PTRACE, "PT_STEP: tid %d (pid %d), sig = %d",
+			    td2->td_tid, p->p_pid, data);
 			error = ptrace_single_step(td2);
 			if (error)
 				goto out;
@@ -904,38 +1001,71 @@
 			}
 			switch (req) {
 			case PT_TO_SCE:
-				p->p_stops |= S_PT_SCE;
+				p->p_ptevents |= PTRACE_SCE;
+				CTR4(KTR_PTRACE,
+		    "PT_TO_SCE: pid %d, events = %#x, PC = %#lx, sig = %d",
+				    p->p_pid, p->p_ptevents,
+				    (u_long)(uintfptr_t)addr, data);
 				break;
 			case PT_TO_SCX:
-				p->p_stops |= S_PT_SCX;
+				p->p_ptevents |= PTRACE_SCX;
+				CTR4(KTR_PTRACE,
+		    "PT_TO_SCX: pid %d, events = %#x, PC = %#lx, sig = %d",
+				    p->p_pid, p->p_ptevents,
+				    (u_long)(uintfptr_t)addr, data);
 				break;
 			case PT_SYSCALL:
-				p->p_stops |= S_PT_SCE | S_PT_SCX;
+				p->p_ptevents |= PTRACE_SYSCALL;
+				CTR4(KTR_PTRACE,
+		    "PT_SYSCALL: pid %d, events = %#x, PC = %#lx, sig = %d",
+				    p->p_pid, p->p_ptevents,
+				    (u_long)(uintfptr_t)addr, data);
 				break;
+			case PT_CONTINUE:
+				CTR3(KTR_PTRACE,
+				    "PT_CONTINUE: pid %d, PC = %#lx, sig = %d",
+				    p->p_pid, (u_long)(uintfptr_t)addr, data);
+				break;
 			}
 			break;
 		case PT_DETACH:
-			/* reset process parent */
+			/*
+			 * Reset the process parent.
+			 *
+			 * NB: This clears P_TRACED before reparenting
+			 * a detached process back to its original
+			 * parent.  Otherwise the debuggee will be set
+			 * as an orphan of the debugger.
+			 */
+			p->p_flag &= ~(P_TRACED | P_WAITED);
 			if (p->p_oppid != p->p_pptr->p_pid) {
-				struct proc *pp;
-
 				PROC_LOCK(p->p_pptr);
 				sigqueue_take(p->p_ksi);
 				PROC_UNLOCK(p->p_pptr);
 
-				PROC_UNLOCK(p);
-				pp = pfind(p->p_oppid);
-				if (pp == NULL)
-					pp = initproc;
-				else
-					PROC_UNLOCK(pp);
-				PROC_LOCK(p);
+				pp = proc_realparent(p);
 				proc_reparent(p, pp);
 				if (pp == initproc)
 					p->p_sigparent = SIGCHLD;
+				CTR3(KTR_PTRACE,
+			    "PT_DETACH: pid %d reparented to pid %d, sig %d",
+				    p->p_pid, pp->p_pid, data);
+			} else
+				CTR2(KTR_PTRACE, "PT_DETACH: pid %d, sig %d",
+				    p->p_pid, data);
+			p->p_oppid = 0;
+			p->p_ptevents = 0;
+			FOREACH_THREAD_IN_PROC(p, td3) {
+				if ((td3->td_dbgflags & TDB_FSTP) != 0) {
+					sigqueue_delete(&td3->td_sigqueue,
+					    SIGSTOP);
+				}
+				td3->td_dbgflags &= ~(TDB_XSIG | TDB_FSTP);
 			}
-			p->p_oppid = 0;
-			p->p_flag &= ~(P_TRACED | P_WAITED | P_FOLLOWFORK);
+			if ((p->p_flag2 & P2_PTRACE_FSTP) != 0) {
+				sigqueue_delete(&p->p_sigqueue, SIGSTOP);
+				p->p_flag2 &= ~P2_PTRACE_FSTP;
+			}
 
 			/* should we send SIGCHLD? */
 			/* childproc_continued(p); */
@@ -943,6 +1073,13 @@
 		}
 
 	sendsig:
+		/*
+		 * Clear the pending event for the thread that just
+		 * reported its event (p_xthread).  This may not be
+		 * the thread passed to PT_CONTINUE, PT_STEP, etc. if
+		 * the debugger is resuming a different thread.
+		 */
+		td2 = p->p_xthread;
 		if (proctree_locked) {
 			sx_xunlock(&proctree_lock);
 			proctree_locked = 0;
@@ -954,11 +1091,19 @@
 			td2->td_dbgflags &= ~TDB_XSIG;
 			td2->td_xsig = data;
 
+			/*
+			 * P_WKILLED is insurance that a PT_KILL/SIGKILL always
+			 * works immediately, even if another thread is
+			 * unsuspended first and attempts to handle a different
+			 * signal or if the POSIX.1b style signal queue cannot
+			 * accommodate any new signals.
+			 */
+			if (data == SIGKILL)
+				p->p_flag |= P_WKILLED;
+
 			if (req == PT_DETACH) {
-				struct thread *td3;
-				FOREACH_THREAD_IN_PROC(p, td3) {
-					td3->td_dbgflags &= ~TDB_SUSPEND; 
-				}
+				FOREACH_THREAD_IN_PROC(p, td3)
+					td3->td_dbgflags &= ~TDB_SUSPEND;
 			}
 			/*
 			 * unsuspend all threads, to not let a thread run,
@@ -969,6 +1114,8 @@
 			p->p_flag &= ~(P_STOPPED_TRACE|P_STOPPED_SIG|P_WAITED);
 			thread_unsuspend(p);
 			PROC_SUNLOCK(p);
+			if (req == PT_ATTACH)
+				kern_psignal(p, data);
 		} else {
 			if (data)
 				kern_psignal(p, data);
@@ -1010,6 +1157,14 @@
 		}
 		if (!write)
 			td->td_retval[0] = tmp;
+		if (error == 0) {
+			if (write)
+				CTR3(KTR_PTRACE, "PT_WRITE: pid %d: %p <= %#x",
+				    p->p_pid, addr, data);
+			else
+				CTR3(KTR_PTRACE, "PT_READ: pid %d: %p >= %#x",
+				    p->p_pid, addr, tmp);
+		}
 		PROC_LOCK(p);
 		break;
 
@@ -1042,10 +1197,14 @@
 		switch (tmp) {
 		case PIOD_READ_D:
 		case PIOD_READ_I:
+			CTR3(KTR_PTRACE, "PT_IO: pid %d: READ (%p, %#x)",
+			    p->p_pid, (uintptr_t)uio.uio_offset, uio.uio_resid);
 			uio.uio_rw = UIO_READ;
 			break;
 		case PIOD_WRITE_D:
 		case PIOD_WRITE_I:
+			CTR3(KTR_PTRACE, "PT_IO: pid %d: WRITE (%p, %#x)",
+			    p->p_pid, (uintptr_t)uio.uio_offset, uio.uio_resid);
 			td2->td_dbgflags |= TDB_USERWR;
 			uio.uio_rw = UIO_WRITE;
 			break;
@@ -1065,33 +1224,46 @@
 		break;
 
 	case PT_KILL:
+		CTR1(KTR_PTRACE, "PT_KILL: pid %d", p->p_pid);
 		data = SIGKILL;
 		goto sendsig;	/* in PT_CONTINUE above */
 
 	case PT_SETREGS:
+		CTR2(KTR_PTRACE, "PT_SETREGS: tid %d (pid %d)", td2->td_tid,
+		    p->p_pid);
 		td2->td_dbgflags |= TDB_USERWR;
 		error = PROC_WRITE(regs, td2, addr);
 		break;
 
 	case PT_GETREGS:
+		CTR2(KTR_PTRACE, "PT_GETREGS: tid %d (pid %d)", td2->td_tid,
+		    p->p_pid);
 		error = PROC_READ(regs, td2, addr);
 		break;
 
 	case PT_SETFPREGS:
+		CTR2(KTR_PTRACE, "PT_SETFPREGS: tid %d (pid %d)", td2->td_tid,
+		    p->p_pid);
 		td2->td_dbgflags |= TDB_USERWR;
 		error = PROC_WRITE(fpregs, td2, addr);
 		break;
 
 	case PT_GETFPREGS:
+		CTR2(KTR_PTRACE, "PT_GETFPREGS: tid %d (pid %d)", td2->td_tid,
+		    p->p_pid);
 		error = PROC_READ(fpregs, td2, addr);
 		break;
 
 	case PT_SETDBREGS:
+		CTR2(KTR_PTRACE, "PT_SETDBREGS: tid %d (pid %d)", td2->td_tid,
+		    p->p_pid);
 		td2->td_dbgflags |= TDB_USERWR;
 		error = PROC_WRITE(dbregs, td2, addr);
 		break;
 
 	case PT_GETDBREGS:
+		CTR2(KTR_PTRACE, "PT_GETDBREGS: tid %d (pid %d)", td2->td_tid,
+		    p->p_pid);
 		error = PROC_READ(dbregs, td2, addr);
 		break;
 
@@ -1113,6 +1285,7 @@
 		} else
 #endif
 		pl = addr;
+		bzero(pl, sizeof(*pl));
 		pl->pl_lwpid = td2->td_tid;
 		pl->pl_event = PL_EVENT_NONE;
 		pl->pl_flags = 0;
@@ -1133,8 +1306,6 @@
 				pl->pl_siginfo = td2->td_dbgksi.ksi_info;
 			}
 		}
-		if ((pl->pl_flags & PL_FLAG_SI) == 0)
-			bzero(&pl->pl_siginfo, sizeof(pl->pl_siginfo));
 		if (td2->td_dbgflags & TDB_SCE)
 			pl->pl_flags |= PL_FLAG_SCE;
 		else if (td2->td_dbgflags & TDB_SCX)
@@ -1144,23 +1315,46 @@
 		if (td2->td_dbgflags & TDB_FORK) {
 			pl->pl_flags |= PL_FLAG_FORKED;
 			pl->pl_child_pid = td2->td_dbg_forked;
-		}
+			if (td2->td_dbgflags & TDB_VFORK)
+				pl->pl_flags |= PL_FLAG_VFORKED;
+		} else if ((td2->td_dbgflags & (TDB_SCX | TDB_VFORK)) ==
+		    TDB_VFORK)
+			pl->pl_flags |= PL_FLAG_VFORK_DONE;
 		if (td2->td_dbgflags & TDB_CHILD)
 			pl->pl_flags |= PL_FLAG_CHILD;
+		if (td2->td_dbgflags & TDB_BORN)
+			pl->pl_flags |= PL_FLAG_BORN;
+		if (td2->td_dbgflags & TDB_EXIT)
+			pl->pl_flags |= PL_FLAG_EXITED;
 		pl->pl_sigmask = td2->td_sigmask;
 		pl->pl_siglist = td2->td_siglist;
 		strcpy(pl->pl_tdname, td2->td_name);
+		if ((td2->td_dbgflags & (TDB_SCE | TDB_SCX)) != 0) {
+			pl->pl_syscall_code = td2->td_dbg_sc_code;
+			pl->pl_syscall_narg = td2->td_dbg_sc_narg;
+		} else {
+			pl->pl_syscall_code = 0;
+			pl->pl_syscall_narg = 0;
+		}
 #ifdef COMPAT_FREEBSD32
 		if (wrap32)
 			ptrace_lwpinfo_to32(pl, pl32);
 #endif
+		CTR6(KTR_PTRACE,
+    "PT_LWPINFO: tid %d (pid %d) event %d flags %#x child pid %d syscall %d",
+		    td2->td_tid, p->p_pid, pl->pl_event, pl->pl_flags,
+		    pl->pl_child_pid, pl->pl_syscall_code);
 		break;
 
 	case PT_GETNUMLWPS:
+		CTR2(KTR_PTRACE, "PT_GETNUMLWPS: pid %d: %d threads", p->p_pid,
+		    p->p_numthreads);
 		td->td_retval[0] = p->p_numthreads;
 		break;
 
 	case PT_GETLWPLIST:
+		CTR3(KTR_PTRACE, "PT_GETLWPLIST: pid %d: data %d, actual %d",
+		    p->p_pid, data, p->p_numthreads);
 		if (data <= 0) {
 			error = EINVAL;
 			break;
@@ -1184,6 +1378,8 @@
 		break;
 
 	case PT_VM_TIMESTAMP:
+		CTR2(KTR_PTRACE, "PT_VM_TIMESTAMP: pid %d: timestamp %d",
+		    p->p_pid, p->p_vmspace->vm_map.timestamp);
 		td->td_retval[0] = p->p_vmspace->vm_map.timestamp;
 		break;
 
@@ -1234,6 +1430,8 @@
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	p->p_step = 1;
+	CTR3(KTR_PTRACE, "stopevent: pid %d event %u val %u", p->p_pid, event,
+	    val);
 	do {
 		p->p_xstat = val;
 		p->p_xthread = NULL;

Modified: trunk/sys/kern/sys_socket.c
===================================================================
--- trunk/sys/kern/sys_socket.c	2018-05-26 14:27:13 UTC (rev 9956)
+++ trunk/sys/kern/sys_socket.c	2018-05-26 14:27:48 UTC (rev 9957)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1982, 1986, 1990, 1993
  *	The Regents of the University of California.  All rights reserved.
@@ -30,7 +31,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/sys_socket.c 254356 2013-08-15 07:54:31Z glebius $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -66,6 +67,7 @@
 	.fo_close = soo_close,
 	.fo_chmod = invfo_chmod,
 	.fo_chown = invfo_chown,
+	.fo_sendfile = invfo_sendfile,
 	.fo_flags = DFLAG_PASSABLE
 };