[Midnightbsd-cvs] src [9952] trunk/sys/kern: sync with freebsd

laffer1 at midnightbsd.org laffer1 at midnightbsd.org
Sat May 26 10:24:53 EDT 2018


Revision: 9952
          http://svnweb.midnightbsd.org/src/?rev=9952
Author:   laffer1
Date:     2018-05-26 10:24:52 -0400 (Sat, 26 May 2018)
Log Message:
-----------
 sync with freebsd

Modified Paths:
--------------
    trunk/sys/kern/vfs_aio.c
    trunk/sys/kern/vfs_bio.c
    trunk/sys/kern/vfs_cache.c
    trunk/sys/kern/vfs_cluster.c
    trunk/sys/kern/vfs_default.c
    trunk/sys/kern/vfs_export.c
    trunk/sys/kern/vfs_extattr.c
    trunk/sys/kern/vfs_hash.c
    trunk/sys/kern/vfs_init.c
    trunk/sys/kern/vfs_lookup.c
    trunk/sys/kern/vfs_mount.c
    trunk/sys/kern/vfs_mountroot.c
    trunk/sys/kern/vfs_subr.c
    trunk/sys/kern/vfs_syscalls.c

Modified: trunk/sys/kern/vfs_aio.c
===================================================================
--- trunk/sys/kern/vfs_aio.c	2018-05-25 21:07:58 UTC (rev 9951)
+++ trunk/sys/kern/vfs_aio.c	2018-05-26 14:24:52 UTC (rev 9952)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1997 John S. Dyson.  All rights reserved.
  *
@@ -19,7 +20,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/vfs_aio.c 304739 2016-08-24 09:20:27Z kib $");
 
 #include "opt_compat.h"
 
@@ -28,7 +29,7 @@
 #include <sys/malloc.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
-#include <sys/capability.h>
+#include <sys/capsicum.h>
 #include <sys/eventhandler.h>
 #include <sys/sysproto.h>
 #include <sys/filedesc.h>
@@ -46,6 +47,7 @@
 #include <sys/resourcevar.h>
 #include <sys/signalvar.h>
 #include <sys/protosw.h>
+#include <sys/rwlock.h>
 #include <sys/sema.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
@@ -58,10 +60,12 @@
 #include <sys/conf.h>
 #include <sys/event.h>
 #include <sys/mount.h>
+#include <geom/geom.h>
 
 #include <machine/atomic.h>
 
 #include <vm/vm.h>
+#include <vm/vm_page.h>
 #include <vm/vm_extern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
@@ -231,9 +235,10 @@
 	int	jobstate;		/* (b) job state */
 	int	inputcharge;		/* (*) input blocks */
 	int	outputcharge;		/* (*) output blocks */
-	struct	buf *bp;		/* (*) private to BIO backend,
-				  	 * buffer pointer
-					 */
+	struct	bio *bp;		/* (*) BIO backend BIO pointer */
+	struct	buf *pbuf;		/* (*) BIO backend buffer pointer */
+	struct	vm_page *pages[btoc(MAXPHYS)+1]; /* BIO backend pages */
+	int	npages;			/* BIO backend number of pages */
 	struct	proc *userproc;		/* (*) user process */
 	struct  ucred *cred;		/* (*) active credential when created */
 	struct	file *fd_file;		/* (*) pointer to file structure */
@@ -242,7 +247,6 @@
 	struct	knlist klist;		/* (a) list of knotes */
 	struct	aiocb uaiocb;		/* (*) kernel I/O control block */
 	ksiginfo_t ksi;			/* (a) realtime signal info */
-	struct	task biotask;		/* (*) private to BIO backend */
 	uint64_t seqno;			/* (*) job number */
 	int	pending;		/* (a) number of pending I/O, aio_fsync only */
 };
@@ -337,15 +341,16 @@
 void		aio_init_aioinfo(struct proc *p);
 static int	aio_onceonly(void);
 static int	aio_free_entry(struct aiocblist *aiocbe);
-static void	aio_process(struct aiocblist *aiocbe);
+static void	aio_process_rw(struct aiocblist *aiocbe);
+static void	aio_process_sync(struct aiocblist *aiocbe);
+static void	aio_process_mlock(struct aiocblist *aiocbe);
 static int	aio_newproc(int *);
 int		aio_aqueue(struct thread *td, struct aiocb *job,
 			struct aioliojob *lio, int type, struct aiocb_ops *ops);
-static void	aio_physwakeup(struct buf *bp);
+static void	aio_physwakeup(struct bio *bp);
 static void	aio_proc_rundown(void *arg, struct proc *p);
 static void	aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp);
 static int	aio_qphysio(struct proc *p, struct aiocblist *iocb);
-static void	biohelper(void *, int);
 static void	aio_daemon(void *param);
 static void	aio_swake_cb(struct socket *, struct sockbuf *);
 static int	aio_unload(void);
@@ -424,6 +429,7 @@
 	SYSCALL_INIT_HELPER(aio_cancel),
 	SYSCALL_INIT_HELPER(aio_error),
 	SYSCALL_INIT_HELPER(aio_fsync),
+	SYSCALL_INIT_HELPER(aio_mlock),
 	SYSCALL_INIT_HELPER(aio_read),
 	SYSCALL_INIT_HELPER(aio_return),
 	SYSCALL_INIT_HELPER(aio_suspend),
@@ -451,6 +457,7 @@
 	SYSCALL32_INIT_HELPER(freebsd32_aio_cancel),
 	SYSCALL32_INIT_HELPER(freebsd32_aio_error),
 	SYSCALL32_INIT_HELPER(freebsd32_aio_fsync),
+	SYSCALL32_INIT_HELPER(freebsd32_aio_mlock),
 	SYSCALL32_INIT_HELPER(freebsd32_aio_read),
 	SYSCALL32_INIT_HELPER(freebsd32_aio_write),
 	SYSCALL32_INIT_HELPER(freebsd32_aio_waitcomplete),
@@ -700,7 +707,8 @@
 	 * at open time, but this is already true of file descriptors in
 	 * a multithreaded process.
 	 */
-	fdrop(aiocbe->fd_file, curthread);
+	if (aiocbe->fd_file)
+		fdrop(aiocbe->fd_file, curthread);
 	crfree(aiocbe->cred);
 	uma_zfree(aiocb_zone, aiocbe);
 	AIO_LOCK(ki);
@@ -835,17 +843,15 @@
 aio_fsync_vnode(struct thread *td, struct vnode *vp)
 {
 	struct mount *mp;
-	int vfslocked;
 	int error;
 
-	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 		goto drop;
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	if (vp->v_object != NULL) {
-		VM_OBJECT_LOCK(vp->v_object);
+		VM_OBJECT_WLOCK(vp->v_object);
 		vm_object_page_clean(vp->v_object, 0, 0, 0);
-		VM_OBJECT_UNLOCK(vp->v_object);
+		VM_OBJECT_WUNLOCK(vp->v_object);
 	}
 	error = VOP_FSYNC(vp, MNT_WAIT, td);
 
@@ -852,20 +858,19 @@
 	VOP_UNLOCK(vp, 0);
 	vn_finished_write(mp);
 drop:
-	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
- * The AIO processing activity.  This is the code that does the I/O request for
- * the non-physio version of the operations.  The normal vn operations are used,
- * and this code should work in all instances for every type of file, including
- * pipes, sockets, fifos, and regular files.
+ * The AIO processing activity for LIO_READ/LIO_WRITE.  This is the code that
+ * does the I/O request for the non-physio version of the operations.  The
+ * normal vn operations are used, and this code should work in all instances
+ * for every type of file, including pipes, sockets, fifos, and regular files.
  *
  * XXX I don't think it works well for socket, pipe, and fifo.
  */
 static void
-aio_process(struct aiocblist *aiocbe)
+aio_process_rw(struct aiocblist *aiocbe)
 {
 	struct ucred *td_savedcred;
 	struct thread *td;
@@ -879,6 +884,10 @@
 	int oublock_st, oublock_end;
 	int inblock_st, inblock_end;
 
+	KASSERT(aiocbe->uaiocb.aio_lio_opcode == LIO_READ ||
+	    aiocbe->uaiocb.aio_lio_opcode == LIO_WRITE,
+	    ("%s: opcode %d", __func__, aiocbe->uaiocb.aio_lio_opcode));
+
 	td = curthread;
 	td_savedcred = td->td_ucred;
 	td->td_ucred = aiocbe->cred;
@@ -885,17 +894,6 @@
 	cb = &aiocbe->uaiocb;
 	fp = aiocbe->fd_file;
 
-	if (cb->aio_lio_opcode == LIO_SYNC) {
-		error = 0;
-		cnt = 0;
-		if (fp->f_vnode != NULL)
-			error = aio_fsync_vnode(td, fp->f_vnode);
-		cb->_aiocb_private.error = error;
-		cb->_aiocb_private.status = 0;
-		td->td_ucred = td_savedcred;
-		return;
-	}
-
 	aiov.iov_base = (void *)(uintptr_t)cb->aio_buf;
 	aiov.iov_len = cb->aio_nbytes;
 
@@ -956,6 +954,41 @@
 }
 
 static void
+aio_process_sync(struct aiocblist *aiocbe)
+{
+	struct thread *td = curthread;
+	struct ucred *td_savedcred = td->td_ucred;
+	struct aiocb *cb = &aiocbe->uaiocb;
+	struct file *fp = aiocbe->fd_file;
+	int error = 0;
+
+	KASSERT(aiocbe->uaiocb.aio_lio_opcode == LIO_SYNC,
+	    ("%s: opcode %d", __func__, aiocbe->uaiocb.aio_lio_opcode));
+
+	td->td_ucred = aiocbe->cred;
+	if (fp->f_vnode != NULL)
+		error = aio_fsync_vnode(td, fp->f_vnode);
+	cb->_aiocb_private.error = error;
+	cb->_aiocb_private.status = 0;
+	td->td_ucred = td_savedcred;
+}
+
+static void
+aio_process_mlock(struct aiocblist *aiocbe)
+{
+	struct aiocb *cb = &aiocbe->uaiocb;
+	int error;
+
+	KASSERT(aiocbe->uaiocb.aio_lio_opcode == LIO_MLOCK,
+	    ("%s: opcode %d", __func__, aiocbe->uaiocb.aio_lio_opcode));
+
+	error = vm_mlock(aiocbe->userproc, aiocbe->cred,
+	    __DEVOLATILE(void *, cb->aio_buf), cb->aio_nbytes);
+	cb->_aiocb_private.error = error;
+	cb->_aiocb_private.status = 0;
+}
+
+static void
 aio_bio_done_notify(struct proc *userp, struct aiocblist *aiocbe, int type)
 {
 	struct aioliojob *lj;
@@ -1026,7 +1059,7 @@
 }
 
 /*
- * The AIO daemon, most of the actual work is done in aio_process,
+ * The AIO daemon, most of the actual work is done in aio_process_*,
  * but the setup (and address space mgmt) is done in this routine.
  */
 static void
@@ -1123,7 +1156,18 @@
 			ki = userp->p_aioinfo;
 
 			/* Do the I/O function. */
-			aio_process(aiocbe);
+			switch(aiocbe->uaiocb.aio_lio_opcode) {
+			case LIO_READ:
+			case LIO_WRITE:
+				aio_process_rw(aiocbe);
+				break;
+			case LIO_SYNC:
+				aio_process_sync(aiocbe);
+				break;
+			case LIO_MLOCK:
+				aio_process_mlock(aiocbe);
+				break;
+			}
 
 			mtx_lock(&aio_job_mtx);
 			/* Decrement the active job count. */
@@ -1252,122 +1296,139 @@
 {
 	struct aiocb *cb;
 	struct file *fp;
-	struct buf *bp;
+	struct bio *bp;
+	struct buf *pbuf;
 	struct vnode *vp;
 	struct cdevsw *csw;
 	struct cdev *dev;
 	struct kaioinfo *ki;
 	struct aioliojob *lj;
-	int error, ref;
+	int error, ref, unmap, poff;
+	vm_prot_t prot;
 
 	cb = &aiocbe->uaiocb;
 	fp = aiocbe->fd_file;
 
-	if (fp->f_type != DTYPE_VNODE)
+	if (fp == NULL || fp->f_type != DTYPE_VNODE)
 		return (-1);
 
 	vp = fp->f_vnode;
-
-	/*
-	 * If its not a disk, we don't want to return a positive error.
-	 * It causes the aio code to not fall through to try the thread
-	 * way when you're talking to a regular file.
-	 */
-	if (!vn_isdisk(vp, &error)) {
-		if (error == ENOTBLK)
-			return (-1);
-		else
-			return (error);
-	}
-
+	if (vp->v_type != VCHR)
+		return (-1);
 	if (vp->v_bufobj.bo_bsize == 0)
 		return (-1);
-
- 	if (cb->aio_nbytes % vp->v_bufobj.bo_bsize)
+	if (cb->aio_nbytes % vp->v_bufobj.bo_bsize)
 		return (-1);
 
-	if (cb->aio_nbytes >
-	    MAXPHYS - (((vm_offset_t) cb->aio_buf) & PAGE_MASK))
-		return (-1);
-
-	ki = p->p_aioinfo;
-	if (ki->kaio_buffer_count >= ki->kaio_ballowed_count)
-		return (-1);
-
 	ref = 0;
 	csw = devvn_refthread(vp, &dev, &ref);
 	if (csw == NULL)
 		return (ENXIO);
+
+	if ((csw->d_flags & D_DISK) == 0) {
+		error = -1;
+		goto unref;
+	}
 	if (cb->aio_nbytes > dev->si_iosize_max) {
 		error = -1;
 		goto unref;
 	}
 
-	/* Create and build a buffer header for a transfer. */
-	bp = (struct buf *)getpbuf(NULL);
-	BUF_KERNPROC(bp);
+	ki = p->p_aioinfo;
+	poff = (vm_offset_t)cb->aio_buf & PAGE_MASK;
+	unmap = ((dev->si_flags & SI_UNMAPPED) && unmapped_buf_allowed);
+	if (unmap) {
+		if (cb->aio_nbytes > MAXPHYS) {
+			error = -1;
+			goto unref;
+		}
+	} else {
+		if (cb->aio_nbytes > MAXPHYS - poff) {
+			error = -1;
+			goto unref;
+		}
+		if (ki->kaio_buffer_count >= ki->kaio_ballowed_count) {
+			error = -1;
+			goto unref;
+		}
+	}
+	aiocbe->bp = bp = g_alloc_bio();
+	if (!unmap) {
+		aiocbe->pbuf = pbuf = (struct buf *)getpbuf(NULL);
+		BUF_KERNPROC(pbuf);
+	} else
+		pbuf = NULL;
 
 	AIO_LOCK(ki);
 	ki->kaio_count++;
-	ki->kaio_buffer_count++;
+	if (!unmap)
+		ki->kaio_buffer_count++;
 	lj = aiocbe->lio;
 	if (lj)
 		lj->lioj_count++;
+	TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
+	TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
+	aiocbe->jobstate = JOBST_JOBQBUF;
+	cb->_aiocb_private.status = cb->aio_nbytes;
 	AIO_UNLOCK(ki);
 
-	/*
-	 * Get a copy of the kva from the physical buffer.
-	 */
-	error = 0;
+	bp->bio_length = cb->aio_nbytes;
+	bp->bio_bcount = cb->aio_nbytes;
+	bp->bio_done = aio_physwakeup;
+	bp->bio_data = (void *)(uintptr_t)cb->aio_buf;
+	bp->bio_offset = cb->aio_offset;
+	bp->bio_cmd = cb->aio_lio_opcode == LIO_WRITE ? BIO_WRITE : BIO_READ;
+	bp->bio_dev = dev;
+	bp->bio_caller1 = (void *)aiocbe;
 
-	bp->b_bcount = cb->aio_nbytes;
-	bp->b_bufsize = cb->aio_nbytes;
-	bp->b_iodone = aio_physwakeup;
-	bp->b_saveaddr = bp->b_data;
-	bp->b_data = (void *)(uintptr_t)cb->aio_buf;
-	bp->b_offset = cb->aio_offset;
-	bp->b_iooffset = cb->aio_offset;
-	bp->b_blkno = btodb(cb->aio_offset);
-	bp->b_iocmd = cb->aio_lio_opcode == LIO_WRITE ? BIO_WRITE : BIO_READ;
-
-	/*
-	 * Bring buffer into kernel space.
-	 */
-	if (vmapbuf(bp) < 0) {
+	prot = VM_PROT_READ;
+	if (cb->aio_lio_opcode == LIO_READ)
+		prot |= VM_PROT_WRITE;	/* Less backwards than it looks */
+	if ((aiocbe->npages = vm_fault_quick_hold_pages(
+	    &curproc->p_vmspace->vm_map,
+	    (vm_offset_t)bp->bio_data, bp->bio_length, prot, aiocbe->pages,
+	    sizeof(aiocbe->pages)/sizeof(aiocbe->pages[0]))) < 0) {
 		error = EFAULT;
 		goto doerror;
 	}
+	if (!unmap) {
+		pmap_qenter((vm_offset_t)pbuf->b_data,
+		    aiocbe->pages, aiocbe->npages);
+		bp->bio_data = pbuf->b_data + poff;
+	} else {
+		bp->bio_ma = aiocbe->pages;
+		bp->bio_ma_n = aiocbe->npages;
+		bp->bio_ma_offset = poff;
+		bp->bio_data = unmapped_buf;
+		bp->bio_flags |= BIO_UNMAPPED;
+	}
 
-	AIO_LOCK(ki);
-	aiocbe->bp = bp;
-	bp->b_caller1 = (void *)aiocbe;
-	TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
-	TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
-	aiocbe->jobstate = JOBST_JOBQBUF;
-	cb->_aiocb_private.status = cb->aio_nbytes;
-	AIO_UNLOCK(ki);
-
 	atomic_add_int(&num_queue_count, 1);
-	atomic_add_int(&num_buf_aio, 1);
+	if (!unmap)
+		atomic_add_int(&num_buf_aio, 1);
 
-	bp->b_error = 0;
-
-	TASK_INIT(&aiocbe->biotask, 0, biohelper, aiocbe);
-
 	/* Perform transfer. */
-	dev_strategy_csw(dev, csw, bp);
+	csw->d_strategy(bp);
 	dev_relthread(dev, ref);
 	return (0);
 
 doerror:
 	AIO_LOCK(ki);
+	aiocbe->jobstate = JOBST_NULL;
+	TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
+	TAILQ_REMOVE(&ki->kaio_all, aiocbe, allist);
 	ki->kaio_count--;
-	ki->kaio_buffer_count--;
+	if (!unmap)
+		ki->kaio_buffer_count--;
 	if (lj)
 		lj->lioj_count--;
+	AIO_UNLOCK(ki);
+	if (pbuf) {
+		relpbuf(pbuf, NULL);
+		aiocbe->pbuf = NULL;
+	}
+	g_destroy_bio(bp);
 	aiocbe->bp = NULL;
-	AIO_UNLOCK(ki);
-	relpbuf(bp, NULL);
 unref:
 	dev_relthread(dev, ref);
 	return (error);
@@ -1522,9 +1583,10 @@
  */
 int
 aio_aqueue(struct thread *td, struct aiocb *job, struct aioliojob *lj,
-	int type, struct aiocb_ops *ops)
+    int type, struct aiocb_ops *ops)
 {
 	struct proc *p = td->td_proc;
+	cap_rights_t rights;
 	struct file *fp;
 	struct socket *so;
 	struct aiocblist *aiocbe, *cb;
@@ -1553,8 +1615,6 @@
 	}
 
 	aiocbe = uma_zalloc(aiocb_zone, M_WAITOK | M_ZERO);
-	aiocbe->inputcharge = 0;
-	aiocbe->outputcharge = 0;
 	knlist_init_mtx(&aiocbe->klist, AIO_MTX(ki));
 
 	error = ops->copyin(job, &aiocbe->uaiocb);
@@ -1564,6 +1624,12 @@
 		return (error);
 	}
 
+	/* XXX: aio_nbytes is later casted to signed types. */
+	if (aiocbe->uaiocb.aio_nbytes > INT_MAX) {
+		uma_zfree(aiocb_zone, aiocbe);
+		return (EINVAL);
+	}
+
 	if (aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT &&
 	    aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_SIGNAL &&
 	    aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_THREAD_ID &&
@@ -1601,16 +1667,21 @@
 	fd = aiocbe->uaiocb.aio_fildes;
 	switch (opcode) {
 	case LIO_WRITE:
-		error = fget_write(td, fd, CAP_WRITE | CAP_SEEK, &fp);
+		error = fget_write(td, fd,
+		    cap_rights_init(&rights, CAP_PWRITE), &fp);
 		break;
 	case LIO_READ:
-		error = fget_read(td, fd, CAP_READ | CAP_SEEK, &fp);
+		error = fget_read(td, fd,
+		    cap_rights_init(&rights, CAP_PREAD), &fp);
 		break;
 	case LIO_SYNC:
-		error = fget(td, fd, CAP_FSYNC, &fp);
+		error = fget(td, fd, cap_rights_init(&rights, CAP_FSYNC), &fp);
 		break;
+	case LIO_MLOCK:
+		fp = NULL;
+		break;
 	case LIO_NOP:
-		error = fget(td, fd, 0, &fp);
+		error = fget(td, fd, cap_rights_init(&rights), &fp);
 		break;
 	default:
 		error = EINVAL;
@@ -1666,7 +1737,8 @@
 	error = kqfd_register(kqfd, &kev, td, 1);
 aqueue_fail:
 	if (error) {
-		fdrop(fp, td);
+		if (fp)
+			fdrop(fp, td);
 		uma_zfree(aiocb_zone, aiocbe);
 		ops->store_error(job, error);
 		goto done;
@@ -1683,7 +1755,7 @@
 	if (opcode == LIO_SYNC)
 		goto queueit;
 
-	if (fp->f_type == DTYPE_SOCKET) {
+	if (fp && fp->f_type == DTYPE_SOCKET) {
 		/*
 		 * Alternate queueing for socket ops: Reach down into the
 		 * descriptor to get the socket data.  Then check to see if the
@@ -1734,8 +1806,6 @@
 	}
 #endif
 queueit:
-	/* No buffer for daemon I/O. */
-	aiocbe->bp = NULL;
 	atomic_add_int(&num_queue_count, 1);
 
 	AIO_LOCK(ki);
@@ -1997,7 +2067,7 @@
 	struct vnode *vp;
 
 	/* Lookup file object. */
-	error = fget(td, uap->fd, 0, &fp);
+	error = fget(td, uap->fd, NULL, &fp);
 	if (error)
 		return (error);
 
@@ -2161,6 +2231,13 @@
 	return (aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITE, &aiocb_ops));
 }
 
+int
+sys_aio_mlock(struct thread *td, struct aio_mlock_args *uap)
+{
+
+	return (aio_aqueue(td, uap->aiocbp, NULL, LIO_MLOCK, &aiocb_ops));
+}
+
 static int
 kern_lio_listio(struct thread *td, int mode, struct aiocb * const *uacb_list,
     struct aiocb **acb_list, int nent, struct sigevent *sig,
@@ -2365,54 +2442,43 @@
 	return (error);
 }
 
-/*
- * Called from interrupt thread for physio, we should return as fast
- * as possible, so we schedule a biohelper task.
- */
 static void
-aio_physwakeup(struct buf *bp)
+aio_physwakeup(struct bio *bp)
 {
-	struct aiocblist *aiocbe;
-
-	aiocbe = (struct aiocblist *)bp->b_caller1;
-	taskqueue_enqueue(taskqueue_aiod_bio, &aiocbe->biotask);
-}
-
-/*
- * Task routine to perform heavy tasks, process wakeup, and signals.
- */
-static void
-biohelper(void *context, int pending)
-{
-	struct aiocblist *aiocbe = context;
-	struct buf *bp;
+	struct aiocblist *aiocbe = (struct aiocblist *)bp->bio_caller1;
 	struct proc *userp;
 	struct kaioinfo *ki;
 	int nblks;
 
+	/* Release mapping into kernel space. */
+	if (aiocbe->pbuf) {
+		pmap_qremove((vm_offset_t)aiocbe->pbuf->b_data, aiocbe->npages);
+		relpbuf(aiocbe->pbuf, NULL);
+		aiocbe->pbuf = NULL;
+		atomic_subtract_int(&num_buf_aio, 1);
+	}
+	vm_page_unhold_pages(aiocbe->pages, aiocbe->npages);
+
 	bp = aiocbe->bp;
+	aiocbe->bp = NULL;
 	userp = aiocbe->userproc;
 	ki = userp->p_aioinfo;
 	AIO_LOCK(ki);
-	aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
+	aiocbe->uaiocb._aiocb_private.status -= bp->bio_resid;
 	aiocbe->uaiocb._aiocb_private.error = 0;
-	if (bp->b_ioflags & BIO_ERROR)
-		aiocbe->uaiocb._aiocb_private.error = bp->b_error;
+	if (bp->bio_flags & BIO_ERROR)
+		aiocbe->uaiocb._aiocb_private.error = bp->bio_error;
 	nblks = btodb(aiocbe->uaiocb.aio_nbytes);
 	if (aiocbe->uaiocb.aio_lio_opcode == LIO_WRITE)
 		aiocbe->outputcharge += nblks;
 	else
 		aiocbe->inputcharge += nblks;
-	aiocbe->bp = NULL;
 	TAILQ_REMOVE(&userp->p_aioinfo->kaio_bufqueue, aiocbe, plist);
 	ki->kaio_buffer_count--;
 	aio_bio_done_notify(userp, aiocbe, DONE_BUF);
 	AIO_UNLOCK(ki);
 
-	/* Release mapping into kernel space. */
-	vunmapbuf(bp);
-	relpbuf(bp, NULL);
-	atomic_subtract_int(&num_buf_aio, 1);
+	g_destroy_bio(bp);
 }
 
 /* syscall - wait for the next completion of an aio request */
@@ -2503,14 +2569,9 @@
 kern_aio_fsync(struct thread *td, int op, struct aiocb *aiocbp,
     struct aiocb_ops *ops)
 {
-	struct proc *p = td->td_proc;
-	struct kaioinfo *ki;
 
 	if (op != O_SYNC) /* XXX lack of O_DSYNC */
 		return (EINVAL);
-	ki = p->p_aioinfo;
-	if (ki == NULL)
-		aio_init_aioinfo(p);
 	return (aio_aqueue(td, aiocbp, NULL, LIO_SYNC, ops));
 }
 
@@ -2696,31 +2757,6 @@
 }
 
 static int
-convert_sigevent32(struct sigevent32 *sig32, struct sigevent *sig)
-{
-
-	CP(*sig32, *sig, sigev_notify);
-	switch (sig->sigev_notify) {
-	case SIGEV_NONE:
-		break;
-	case SIGEV_THREAD_ID:
-		CP(*sig32, *sig, sigev_notify_thread_id);
-		/* FALLTHROUGH */
-	case SIGEV_SIGNAL:
-		CP(*sig32, *sig, sigev_signo);
-		break;
-	case SIGEV_KEVENT:
-		CP(*sig32, *sig, sigev_notify_kqueue);
-		CP(*sig32, *sig, sigev_notify_kevent_flags);
-		PTRIN_CP(*sig32, *sig, sigev_value.sival_ptr);
-		break;
-	default:
-		return (EINVAL);
-	}
-	return (0);
-}
-
-static int
 aiocb32_copyin(struct aiocb *ujob, struct aiocb *kjob)
 {
 	struct aiocb32 job32;
@@ -2903,6 +2939,14 @@
 }
 
 int
+freebsd32_aio_mlock(struct thread *td, struct freebsd32_aio_mlock_args *uap)
+{
+
+	return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_MLOCK,
+	    &aiocb32_ops));
+}
+
+int
 freebsd32_aio_waitcomplete(struct thread *td,
     struct freebsd32_aio_waitcomplete_args *uap)
 {

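For context, a minimal userland sketch of how the aio_mlock(2) interface wired
up above is typically driven.  This is an illustrative assumption and not part
of the commit; it presumes a FreeBSD 10-era libc that exposes aio_mlock() and a
kernel with the aio facility available:

#include <aio.h>
#include <errno.h>
#include <string.h>

/*
 * Wire a user buffer into memory asynchronously and wait for the request
 * to complete.  Returns 0 on success, -1 on failure.
 */
static int
wire_buffer(void *buf, size_t len)
{
	struct aiocb cb;
	const struct aiocb *list[1];

	memset(&cb, 0, sizeof(cb));	/* sigev_notify == SIGEV_NONE */
	cb.aio_buf = buf;
	cb.aio_nbytes = len;

	if (aio_mlock(&cb) != 0)
		return (-1);		/* e.g. EAGAIN or ENOSYS */
	list[0] = &cb;
	while (aio_error(&cb) == EINPROGRESS)
		(void)aio_suspend(list, 1, NULL);
	return (aio_return(&cb) == 0 ? 0 : -1);
}

On the kernel side such a request is dispatched to aio_process_mlock(), which
services it from an aio daemon thread via vm_mlock() against the submitting
process's vmspace.
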
Modified: trunk/sys/kern/vfs_bio.c
===================================================================
--- trunk/sys/kern/vfs_bio.c	2018-05-25 21:07:58 UTC (rev 9951)
+++ trunk/sys/kern/vfs_bio.c	2018-05-26 14:24:52 UTC (rev 9952)
@@ -1,8 +1,13 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 2004 Poul-Henning Kamp
  * Copyright (c) 1994,1997 John S. Dyson
+ * Copyright (c) 2013 The FreeBSD Foundation
  * All rights reserved.
  *
+ * Portions of this software were developed by Konstantin Belousov
+ * under sponsorship from the FreeBSD Foundation.
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
@@ -39,7 +44,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/vfs_bio.c 307672 2016-10-20 13:12:19Z kib $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -58,7 +63,9 @@
 #include <sys/kthread.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
+#include <sys/rwlock.h>
 #include <sys/sysctl.h>
+#include <sys/vmem.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 #include <geom/geom.h>
@@ -71,7 +78,6 @@
 #include <vm/vm_extern.h>
 #include <vm/vm_map.h>
 #include "opt_compat.h"
-#include "opt_directio.h"
 #include "opt_swap.h"
 
 static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer");
@@ -91,8 +97,10 @@
  * carnal knowledge of buffers.  This knowledge should be moved to vfs_bio.c.
  */
 struct buf *buf;		/* buffer header pool */
+caddr_t unmapped_buf;
 
-static struct proc *bufdaemonproc;
+/* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */
+struct proc *bufdaemonproc;
 
 static int inmem(struct vnode *vp, daddr_t blkno);
 static void vm_hold_free_pages(struct buf *bp, int newbsize);
@@ -101,16 +109,16 @@
 static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m);
 static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off,
 		vm_page_t m);
-static void vfs_drain_busy_pages(struct buf *bp);
 static void vfs_clean_pages_dirty_buf(struct buf *bp);
 static void vfs_setdirty_locked_object(struct buf *bp);
 static void vfs_vmio_release(struct buf *bp);
 static int vfs_bio_clcheck(struct vnode *vp, int size,
 		daddr_t lblkno, daddr_t blkno);
-static int buf_do_flush(struct vnode *vp);
+static int buf_flush(struct vnode *vp, int);
 static int flushbufqueues(struct vnode *, int, int);
 static void buf_daemon(void);
 static void bremfreel(struct buf *bp);
+static __inline void bd_wakeup(void);
 #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
     defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
 static int sysctl_bufspace(SYSCTL_HANDLER_ARGS);
@@ -131,6 +139,10 @@
 SYSCTL_LONG(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0,
     "Virtual memory used for buffers");
 #endif
+static long unmapped_bufspace;
+SYSCTL_LONG(_vfs, OID_AUTO, unmapped_bufspace, CTLFLAG_RD,
+    &unmapped_bufspace, 0,
+    "Amount of unmapped buffers, inclusive in the bufspace");
 static long maxbufspace;
 SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0,
     "Maximum allowed value of bufspace (including buf_daemon)");
@@ -199,18 +211,56 @@
    "Number of calls to getnewbuf");
 static int getnewbufrestarts;
 SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0,
-    "Number of times getnewbuf has had to restart a buffer aquisition");
+    "Number of times getnewbuf has had to restart a buffer acquisition");
+static int mappingrestarts;
+SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0,
+    "Number of times getblk has had to restart a buffer mapping for "
+    "unmapped buffer");
 static int flushbufqtarget = 100;
 SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0,
     "Amount of work to do in flushbufqueues when helping bufdaemon");
-static long notbufdflashes;
-SYSCTL_LONG(_vfs, OID_AUTO, notbufdflashes, CTLFLAG_RD, &notbufdflashes, 0,
+static long notbufdflushes;
+SYSCTL_LONG(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, &notbufdflushes, 0,
     "Number of dirty buffer flushes done by the bufdaemon helpers");
 static long barrierwrites;
 SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0,
     "Number of barrier writes");
+SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RD,
+    &unmapped_buf_allowed, 0,
+    "Permit the use of the unmapped i/o");
 
 /*
+ * Lock for the non-dirty bufqueues
+ */
+static struct mtx_padalign bqclean;
+
+/*
+ * Lock for the dirty queue.
+ */
+static struct mtx_padalign bqdirty;
+
+/*
+ * This lock synchronizes access to bd_request.
+ */
+static struct mtx_padalign bdlock;
+
+/*
+ * This lock protects the runningbufreq and synchronizes runningbufwakeup and
+ * waitrunningbufspace().
+ */
+static struct mtx_padalign rbreqlock;
+
+/*
+ * Lock that protects needsbuffer and the sleeps/wakeups surrounding it.
+ */
+static struct rwlock_padalign nblock;
+
+/*
+ * Lock that protects bdirtywait.
+ */
+static struct mtx_padalign bdirtylock;
+
+/*
  * Wakeup point for bufdaemon, as well as indicator of whether it is already
  * active.  Set to 1 when the bufdaemon is already "on" the queue, 0 when it
  * is idling.
@@ -226,11 +276,6 @@
 static int bd_speedupreq;
 
 /*
- * This lock synchronizes access to bd_request.
- */
-static struct mtx bdlock;
-
-/*
  * bogus page -- for I/O to/from partially complete buffers
  * this is a temporary solution to the problem, but it is not
  * really that bad.  it would be better to split the buffer
@@ -246,45 +291,38 @@
  */
 static int runningbufreq;
 
-/*
- * This lock protects the runningbufreq and synchronizes runningbufwakeup and
- * waitrunningbufspace().
- */
-static struct mtx rbreqlock;
-
 /* 
  * Synchronization (sleep/wakeup) variable for buffer requests.
  * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done
  * by and/or.
- * Used in numdirtywakeup(), bufspacewakeup(), bufcountwakeup(), bwillwrite(),
+ * Used in numdirtywakeup(), bufspacewakeup(), bufcountadd(), bwillwrite(),
  * getnewbuf(), and getblk().
  */
-static int needsbuffer;
+static volatile int needsbuffer;
 
 /*
- * Lock that protects needsbuffer and the sleeps/wakeups surrounding it.
+ * Synchronization for bwillwrite() waiters.
  */
-static struct mtx nblock;
+static int bdirtywait;
 
 /*
  * Definitions for the buffer free lists.
  */
-#define BUFFER_QUEUES	6	/* number of free buffer queues */
+#define BUFFER_QUEUES	5	/* number of free buffer queues */
 
 #define QUEUE_NONE	0	/* on no queue */
 #define QUEUE_CLEAN	1	/* non-B_DELWRI buffers */
 #define QUEUE_DIRTY	2	/* B_DELWRI buffers */
-#define QUEUE_DIRTY_GIANT 3	/* B_DELWRI buffers that need giant */
-#define QUEUE_EMPTYKVA	4	/* empty buffer headers w/KVA assignment */
-#define QUEUE_EMPTY	5	/* empty buffer headers */
+#define QUEUE_EMPTYKVA	3	/* empty buffer headers w/KVA assignment */
+#define QUEUE_EMPTY	4	/* empty buffer headers */
 #define QUEUE_SENTINEL	1024	/* not a queue index, but mark for sentinel */
 
 /* Queues for free buffers with various properties */
 static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };
+#ifdef INVARIANTS
+static int bq_len[BUFFER_QUEUES];
+#endif
 
-/* Lock for the bufqueues */
-static struct mtx bqlock;
-
 /*
  * Single global constant for BUF_WMESG, to avoid getting multiple references.
  * buf_wmesg is referred from macros.
@@ -292,7 +330,6 @@
 const char *buf_wmesg = BUF_WMESG;
 
 #define VFS_BIO_NEED_ANY	0x01	/* any freeable buffer */
-#define VFS_BIO_NEED_DIRTYFLUSH	0x02	/* waiting for dirty buffer flush */
 #define VFS_BIO_NEED_FREE	0x04	/* wait for free bufs, hi hysteresis */
 #define VFS_BIO_NEED_BUFSPACE	0x08	/* wait for buf space, lo hysteresis */
 
@@ -315,31 +352,71 @@
 }
 #endif
 
-#ifdef DIRECTIO
-extern void ffs_rawread_setup(void);
-#endif /* DIRECTIO */
 /*
- *	numdirtywakeup:
+ *	bqlock:
  *
- *	If someone is blocked due to there being too many dirty buffers,
- *	and numdirtybuffers is now reasonable, wake them up.
+ *	Return the appropriate queue lock based on the index.
  */
+static inline struct mtx *
+bqlock(int qindex)
+{
 
-static __inline void
-numdirtywakeup(int level)
+	if (qindex == QUEUE_DIRTY)
+		return (struct mtx *)(&bqdirty);
+	return (struct mtx *)(&bqclean);
+}
+
+/*
+ *	bdirtywakeup:
+ *
+ *	Wakeup any bwillwrite() waiters.
+ */
+static void
+bdirtywakeup(void)
 {
-
-	if (numdirtybuffers <= level) {
-		mtx_lock(&nblock);
-		if (needsbuffer & VFS_BIO_NEED_DIRTYFLUSH) {
-			needsbuffer &= ~VFS_BIO_NEED_DIRTYFLUSH;
-			wakeup(&needsbuffer);
-		}
-		mtx_unlock(&nblock);
+	mtx_lock(&bdirtylock);
+	if (bdirtywait) {
+		bdirtywait = 0;
+		wakeup(&bdirtywait);
 	}
+	mtx_unlock(&bdirtylock);
 }
 
 /*
+ *	bdirtysub:
+ *
+ *	Decrement the numdirtybuffers count by one and wakeup any
+ *	threads blocked in bwillwrite().
+ */
+static void
+bdirtysub(void)
+{
+
+	if (atomic_fetchadd_int(&numdirtybuffers, -1) ==
+	    (lodirtybuffers + hidirtybuffers) / 2)
+		bdirtywakeup();
+}
+
+/*
+ *	bdirtyadd:
+ *
+ *	Increment the numdirtybuffers count by one and wakeup the buf 
+ *	daemon if needed.
+ */
+static void
+bdirtyadd(void)
+{
+
+	/*
+	 * Only do the wakeup once as we cross the boundary.  The
+	 * buf daemon will keep running until the condition clears.
+	 */
+	if (atomic_fetchadd_int(&numdirtybuffers, 1) ==
+	    (lodirtybuffers + hidirtybuffers) / 2)
+		bd_wakeup();
+}
+
+/*
  *	bufspacewakeup:
  *
  *	Called when buffer space is potentially available for recovery.
@@ -351,6 +428,7 @@
 static __inline void
 bufspacewakeup(void)
 {
+	int need_wakeup, on;
 
 	/*
 	 * If someone is waiting for BUF space, wake them up.  Even
@@ -357,36 +435,70 @@
 	 * though we haven't freed the kva space yet, the waiting
 	 * process will be able to now.
 	 */
-	mtx_lock(&nblock);
-	if (needsbuffer & VFS_BIO_NEED_BUFSPACE) {
-		needsbuffer &= ~VFS_BIO_NEED_BUFSPACE;
-		wakeup(&needsbuffer);
+	rw_rlock(&nblock);
+	for (;;) {
+		need_wakeup = 0;
+		on = needsbuffer;
+		if ((on & VFS_BIO_NEED_BUFSPACE) == 0)
+			break;
+		need_wakeup = 1;
+		if (atomic_cmpset_rel_int(&needsbuffer, on,
+		    on & ~VFS_BIO_NEED_BUFSPACE))
+			break;
 	}
-	mtx_unlock(&nblock);
+	if (need_wakeup)
+		wakeup(__DEVOLATILE(void *, &needsbuffer));
+	rw_runlock(&nblock);
 }
 
 /*
- * runningbufwakeup() - in-progress I/O accounting.
+ *	runningwakeup:
  *
+ *	Wake up processes that are waiting on asynchronous writes to fall
+ *	below lorunningspace.
  */
+static void
+runningwakeup(void)
+{
+
+	mtx_lock(&rbreqlock);
+	if (runningbufreq) {
+		runningbufreq = 0;
+		wakeup(&runningbufreq);
+	}
+	mtx_unlock(&rbreqlock);
+}
+
+/*
+ *	runningbufwakeup:
+ *
+ *	Decrement the outstanding write count accordingly.
+ */
 void
 runningbufwakeup(struct buf *bp)
 {
+	long space, bspace;
 
-	if (bp->b_runningbufspace) {
-		atomic_subtract_long(&runningbufspace, bp->b_runningbufspace);
-		bp->b_runningbufspace = 0;
-		mtx_lock(&rbreqlock);
-		if (runningbufreq && runningbufspace <= lorunningspace) {
-			runningbufreq = 0;
-			wakeup(&runningbufreq);
-		}
-		mtx_unlock(&rbreqlock);
-	}
+	bspace = bp->b_runningbufspace;
+	if (bspace == 0)
+		return;
+	space = atomic_fetchadd_long(&runningbufspace, -bspace);
+	KASSERT(space >= bspace, ("runningbufspace underflow %ld %ld",
+	    space, bspace));
+	bp->b_runningbufspace = 0;
+	/*
+	 * Only acquire the lock and wakeup on the transition from exceeding
+	 * the threshold to falling below it.
+	 */
+	if (space < lorunningspace)
+		return;
+	if (space - bspace > lorunningspace)
+		return;
+	runningwakeup();
 }
 
 /*
- *	bufcountwakeup:
+ *	bufcountadd:
  *
  *	Called when a buffer has been added to one of the free queues to
  *	account for the buffer and to wakeup anyone waiting for free buffers.
@@ -393,31 +505,60 @@
  *	This typically occurs when large amounts of metadata are being handled
  *	by the buffer cache ( else buffer space runs out first, usually ).
  */
-
 static __inline void
-bufcountwakeup(struct buf *bp) 
+bufcountadd(struct buf *bp)
 {
-	int old;
+	int mask, need_wakeup, old, on;
 
-	KASSERT((bp->b_vflags & BV_INFREECNT) == 0,
+	KASSERT((bp->b_flags & B_INFREECNT) == 0,
 	    ("buf %p already counted as free", bp));
-	if (bp->b_bufobj != NULL)
-		mtx_assert(BO_MTX(bp->b_bufobj), MA_OWNED);
-	bp->b_vflags |= BV_INFREECNT;
+	bp->b_flags |= B_INFREECNT;
 	old = atomic_fetchadd_int(&numfreebuffers, 1);
 	KASSERT(old >= 0 && old < nbuf,
 	    ("numfreebuffers climbed to %d", old + 1));
-	mtx_lock(&nblock);
-	if (needsbuffer) {
-		needsbuffer &= ~VFS_BIO_NEED_ANY;
-		if (numfreebuffers >= hifreebuffers)
-			needsbuffer &= ~VFS_BIO_NEED_FREE;
-		wakeup(&needsbuffer);
+	mask = VFS_BIO_NEED_ANY;
+	if (numfreebuffers >= hifreebuffers)
+		mask |= VFS_BIO_NEED_FREE;
+	rw_rlock(&nblock);
+	for (;;) {
+		need_wakeup = 0;
+		on = needsbuffer;
+		if (on == 0)
+			break;
+		need_wakeup = 1;
+		if (atomic_cmpset_rel_int(&needsbuffer, on, on & ~mask))
+			break;
 	}
-	mtx_unlock(&nblock);
+	if (need_wakeup)
+		wakeup(__DEVOLATILE(void *, &needsbuffer));
+	rw_runlock(&nblock);
 }
 
 /*
+ *	bufcountsub:
+ *
+ *	Decrement the numfreebuffers count as needed.
+ */
+static void
+bufcountsub(struct buf *bp)
+{
+	int old;
+
+	/*
+	 * Fixup numfreebuffers count.  If the buffer is invalid or not
+	 * delayed-write, the buffer was free and we must decrement
+	 * numfreebuffers.
+	 */
+	if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) {
+		KASSERT((bp->b_flags & B_INFREECNT) != 0,
+		    ("buf %p not counted in numfreebuffers", bp));
+		bp->b_flags &= ~B_INFREECNT;
+		old = atomic_fetchadd_int(&numfreebuffers, -1);
+		KASSERT(old > 0, ("numfreebuffers dropped to %d", old - 1));
+	}
+}
+
+/*
  *	waitrunningbufspace()
  *
  *	runningbufspace is a measure of the amount of I/O currently
@@ -425,9 +566,6 @@
  *	prevent creating huge backups of pending writes to a device.
  *	Only asynchronous writes are governed by this function.
  *
- *	Reads will adjust runningbufspace, but will not block based on it.
- *	The read load has a side effect of reducing the allowed write load.
- *
  *	This does NOT turn an async write into a sync write.  It waits  
  *	for earlier writes to complete and generally returns before the
  *	caller's write has reached the device.
@@ -438,7 +576,7 @@
 
 	mtx_lock(&rbreqlock);
 	while (runningbufspace > hirunningspace) {
-		++runningbufreq;
+		runningbufreq = 1;
 		msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0);
 	}
 	mtx_unlock(&rbreqlock);
@@ -459,7 +597,7 @@
 		  vm_page_t m)
 {
 
-	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
+	VM_OBJECT_ASSERT_LOCKED(m->object);
 	if (bp->b_flags & B_CACHE) {
 		int base = (foff + off) & PAGE_MASK;
 		if (vm_page_is_valid(m, base, size) == 0)
@@ -468,13 +606,12 @@
 }
 
 /* Wake up the buffer daemon if necessary */
-static __inline
-void
-bd_wakeup(int dirtybuflevel)
+static __inline void
+bd_wakeup(void)
 {
 
 	mtx_lock(&bdlock);
-	if (bd_request == 0 && numdirtybuffers >= dirtybuflevel) {
+	if (bd_request == 0) {
 		bd_request = 1;
 		wakeup(&bd_request);
 	}
@@ -484,7 +621,6 @@
 /*
  * bd_speedup - speedup the buffer cache flushing code
  */
-
 void
 bd_speedup(void)
 {
@@ -501,6 +637,16 @@
 	mtx_unlock(&bdlock);
 }
 
+#ifndef NSWBUF_MIN
+#define	NSWBUF_MIN	16
+#endif
+
+#ifdef __i386__
+#define	TRANSIENT_DENOM	5
+#else
+#define	TRANSIENT_DENOM 10
+#endif
+
 /*
  * Calculating buffer cache scaling values and reserve space for buffer
  * headers.  This is called during low level kernel initialization and
@@ -511,7 +657,7 @@
 kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est)
 {
 	int tuned_nbuf;
-	long maxbuf;
+	long maxbuf, maxbuf_sz, buf_sz,	biotmap_sz;
 
 	/*
 	 * physmem_est is in pages.  Convert it to kilobytes (assumes
@@ -537,7 +683,8 @@
 			nbuf += min((physmem_est - 4096) / factor,
 			    65536 / factor);
 		if (physmem_est > 65536)
-			nbuf += (physmem_est - 65536) * 2 / (factor * 5);
+			nbuf += min((physmem_est - 65536) * 2 / (factor * 5),
+			    32 * 1024 * 1024 / (factor * 5));
 
 		if (maxbcache && nbuf > maxbcache / BKVASIZE)
 			nbuf = maxbcache / BKVASIZE;
@@ -555,17 +702,60 @@
 	}
 
 	/*
+	 * Ideal allocation size for the transient bio submap is 10%
+	 * of the maximal space buffer map.  This roughly corresponds
+	 * to the amount of the buffer mapped for typical UFS load.
+	 *
+	 * Clip the buffer map to reserve space for the transient
+	 * BIOs, if its extent is bigger than 90% (80% on i386) of the
+	 * maximum buffer map extent on the platform.
+	 *
+	 * The fall-back to maxbuf when maxbcache is unset allows the
+	 * buffer KVA not to be trimmed on architectures with ample
+	 * KVA space.
+	 */
+	if (bio_transient_maxcnt == 0 && unmapped_buf_allowed) {
+		maxbuf_sz = maxbcache != 0 ? maxbcache : maxbuf * BKVASIZE;
+		buf_sz = (long)nbuf * BKVASIZE;
+		if (buf_sz < maxbuf_sz / TRANSIENT_DENOM *
+		    (TRANSIENT_DENOM - 1)) {
+			/*
+			 * There is more KVA than memory.  Do not
+			 * adjust buffer map size, and assign the rest
+			 * of maxbuf to transient map.
+			 */
+			biotmap_sz = maxbuf_sz - buf_sz;
+		} else {
+			/*
+			 * Buffer map spans all KVA we could afford on
+			 * this platform.  Give 10% (20% on i386) of
+			 * the buffer map to the transient bio map.
+			 */
+			biotmap_sz = buf_sz / TRANSIENT_DENOM;
+			buf_sz -= biotmap_sz;
+		}
+		if (biotmap_sz / INT_MAX > MAXPHYS)
+			bio_transient_maxcnt = INT_MAX;
+		else
+			bio_transient_maxcnt = biotmap_sz / MAXPHYS;
+		/*
+		 * Artificially limit to 1024 simultaneous in-flight I/Os
+		 * using the transient mapping.
+		 */
+		if (bio_transient_maxcnt > 1024)
+			bio_transient_maxcnt = 1024;
+		if (tuned_nbuf)
+			nbuf = buf_sz / BKVASIZE;
+	}
+
+	/*
 	 * swbufs are used as temporary holders for I/O, such as paging I/O.
 	 * We have no fewer than 16 and no more than 256.
 	 */
-	nswbuf = max(min(nbuf/4, 256), 16);
-#ifdef NSWBUF_MIN
+	nswbuf = min(nbuf / 4, 256);
+	TUNABLE_INT_FETCH("kern.nswbuf", &nswbuf);
 	if (nswbuf < NSWBUF_MIN)
 		nswbuf = NSWBUF_MIN;
-#endif
-#ifdef DIRECTIO
-	ffs_rawread_setup();
-#endif
 
 	/*
 	 * Reserve space for the buffer cache buffers
@@ -585,10 +775,13 @@
 	struct buf *bp;
 	int i;
 
-	mtx_init(&bqlock, "buf queue lock", NULL, MTX_DEF);
+	CTASSERT(MAXBCACHEBUF >= MAXBSIZE);
+	mtx_init(&bqclean, "bufq clean lock", NULL, MTX_DEF);
+	mtx_init(&bqdirty, "bufq dirty lock", NULL, MTX_DEF);
 	mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF);
-	mtx_init(&nblock, "needsbuffer lock", NULL, MTX_DEF);
+	rw_init(&nblock, "needsbuffer lock");
 	mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
+	mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF);
 
 	/* next, make a null set of free lists */
 	for (i = 0; i < BUFFER_QUEUES; i++)
@@ -598,15 +791,17 @@
 	for (i = 0; i < nbuf; i++) {
 		bp = &buf[i];
 		bzero(bp, sizeof *bp);
-		bp->b_flags = B_INVAL;	/* we're just an empty header */
+		bp->b_flags = B_INVAL | B_INFREECNT;
 		bp->b_rcred = NOCRED;
 		bp->b_wcred = NOCRED;
 		bp->b_qindex = QUEUE_EMPTY;
-		bp->b_vflags = BV_INFREECNT;	/* buf is counted as free */
 		bp->b_xflags = 0;
 		LIST_INIT(&bp->b_dep);
 		BUF_LOCKINIT(bp);
 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
+#ifdef INVARIANTS
+		bq_len[QUEUE_EMPTY]++;
+#endif
 	}
 
 	/*
@@ -622,8 +817,8 @@
 	 * by the system.
 	 */
 	maxbufspace = (long)nbuf * BKVASIZE;
-	hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 10);
-	lobufspace = hibufspace - MAXBSIZE;
+	hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - MAXBCACHEBUF * 10);
+	lobufspace = hibufspace - MAXBCACHEBUF;
 
 	/*
 	 * Note: The 16 MiB upper limit for hirunningspace was chosen
@@ -633,9 +828,9 @@
 	 * The lower 1 MiB limit is the historical upper limit for
 	 * hirunningspace.
 	 */
-	hirunningspace = lmax(lmin(roundup(hibufspace / 64, MAXBSIZE),
+	hirunningspace = lmax(lmin(roundup(hibufspace / 64, MAXBCACHEBUF),
 	    16 * 1024 * 1024), 1024 * 1024);
-	lorunningspace = roundup((hirunningspace * 2) / 3, MAXBSIZE);
+	lorunningspace = roundup((hirunningspace * 2) / 3, MAXBCACHEBUF);
 
 /*
  * Limit the amount of malloc memory since it is wired permanently into
@@ -675,8 +870,57 @@
 
 	bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ |
 	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
+	unmapped_buf = (caddr_t)kva_alloc(MAXPHYS);
 }
 
+#ifdef INVARIANTS
+static inline void
+vfs_buf_check_mapped(struct buf *bp)
+{
+
+	KASSERT((bp->b_flags & B_UNMAPPED) == 0,
+	    ("mapped buf %p %x", bp, bp->b_flags));
+	KASSERT(bp->b_kvabase != unmapped_buf,
+	    ("mapped buf: b_kvabase was not updated %p", bp));
+	KASSERT(bp->b_data != unmapped_buf,
+	    ("mapped buf: b_data was not updated %p", bp));
+}
+
+static inline void
+vfs_buf_check_unmapped(struct buf *bp)
+{
+
+	KASSERT((bp->b_flags & B_UNMAPPED) == B_UNMAPPED,
+	    ("unmapped buf %p %x", bp, bp->b_flags));
+	KASSERT(bp->b_kvabase == unmapped_buf,
+	    ("unmapped buf: corrupted b_kvabase %p", bp));
+	KASSERT(bp->b_data == unmapped_buf,
+	    ("unmapped buf: corrupted b_data %p", bp));
+}
+
+#define	BUF_CHECK_MAPPED(bp) vfs_buf_check_mapped(bp)
+#define	BUF_CHECK_UNMAPPED(bp) vfs_buf_check_unmapped(bp)
+#else
+#define	BUF_CHECK_MAPPED(bp) do {} while (0)
+#define	BUF_CHECK_UNMAPPED(bp) do {} while (0)
+#endif
+
+static void
+bpmap_qenter(struct buf *bp)
+{
+
+	BUF_CHECK_MAPPED(bp);
+
+	/*
+	 * bp->b_data is relative to bp->b_offset, but
+	 * bp->b_offset may be offset into the first page.
+	 */
+	bp->b_data = (caddr_t)trunc_page((vm_offset_t)bp->b_data);
+	pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages);
+	bp->b_data = (caddr_t)((vm_offset_t)bp->b_data |
+	    (vm_offset_t)(bp->b_offset & PAGE_MASK));
+}
+
 /*
  * bfreekva() - free the kva allocation for a buffer.
  *
@@ -686,26 +930,85 @@
 bfreekva(struct buf *bp)
 {
 
-	if (bp->b_kvasize) {
-		atomic_add_int(&buffreekvacnt, 1);
-		atomic_subtract_long(&bufspace, bp->b_kvasize);
-		vm_map_remove(buffer_map, (vm_offset_t) bp->b_kvabase,
-		    (vm_offset_t) bp->b_kvabase + bp->b_kvasize);
-		bp->b_kvasize = 0;
-		bufspacewakeup();
+	if (bp->b_kvasize == 0)
+		return;
+
+	atomic_add_int(&buffreekvacnt, 1);
+	atomic_subtract_long(&bufspace, bp->b_kvasize);
+	if ((bp->b_flags & B_UNMAPPED) == 0) {
+		BUF_CHECK_MAPPED(bp);
+		vmem_free(buffer_arena, (vm_offset_t)bp->b_kvabase,
+		    bp->b_kvasize);
+	} else {
+		BUF_CHECK_UNMAPPED(bp);
+		if ((bp->b_flags & B_KVAALLOC) != 0) {
+			vmem_free(buffer_arena, (vm_offset_t)bp->b_kvaalloc,
+			    bp->b_kvasize);
+		}
+		atomic_subtract_long(&unmapped_bufspace, bp->b_kvasize);
+		bp->b_flags &= ~(B_UNMAPPED | B_KVAALLOC);
 	}
+	bp->b_kvasize = 0;
+	bufspacewakeup();
 }
 
 /*
+ *	binsfree:
+ *
+ *	Insert the buffer into the appropriate free list.
+ */
+static void
+binsfree(struct buf *bp, int qindex)
+{
+	struct mtx *olock, *nlock;
+
+	BUF_ASSERT_XLOCKED(bp);
+
+	nlock = bqlock(qindex);
+	/* Handle delayed bremfree() processing. */
+	if (bp->b_flags & B_REMFREE) {
+		olock = bqlock(bp->b_qindex);
+		mtx_lock(olock);
+		bremfreel(bp);
+		if (olock != nlock) {
+			mtx_unlock(olock);
+			mtx_lock(nlock);
+		}
+	} else
+		mtx_lock(nlock);
+
+	if (bp->b_qindex != QUEUE_NONE)
+		panic("binsfree: free buffer onto another queue???");
+
+	bp->b_qindex = qindex;
+	if (bp->b_flags & B_AGE)
+		TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
+	else
+		TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
+#ifdef INVARIANTS
+	bq_len[bp->b_qindex]++;
+#endif
+	mtx_unlock(nlock);
+
+	/*
+	 * Something we can maybe free or reuse.
+	 */
+	if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
+		bufspacewakeup();
+
+	if ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))
+		bufcountadd(bp);
+}
+
+/*
  *	bremfree:
  *
- *	Mark the buffer for removal from the appropriate free list in brelse.
+ *	Mark the buffer for removal from the appropriate free list.
  *	
  */
 void
 bremfree(struct buf *bp)
 {
-	int old;
 
 	CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	KASSERT((bp->b_flags & B_REMFREE) == 0,
@@ -712,19 +1015,10 @@
 	    ("bremfree: buffer %p already marked for delayed removal.", bp));
 	KASSERT(bp->b_qindex != QUEUE_NONE,
 	    ("bremfree: buffer %p not on a queue.", bp));
-	BUF_ASSERT_HELD(bp);
+	BUF_ASSERT_XLOCKED(bp);
 
 	bp->b_flags |= B_REMFREE;
-	/* Fixup numfreebuffers count.  */
-	if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) {
-		KASSERT((bp->b_vflags & BV_INFREECNT) != 0,
-		    ("buf %p not counted in numfreebuffers", bp));
-		if (bp->b_bufobj != NULL)
-			mtx_assert(BO_MTX(bp->b_bufobj), MA_OWNED);
-		bp->b_vflags &= ~BV_INFREECNT;
-		old = atomic_fetchadd_int(&numfreebuffers, -1);
-		KASSERT(old > 0, ("numfreebuffers dropped to %d", old - 1));
-	}
+	bufcountsub(bp);
 }
 
 /*
@@ -736,9 +1030,12 @@
 void
 bremfreef(struct buf *bp)
 {
-	mtx_lock(&bqlock);
+	struct mtx *qlock;
+
+	qlock = bqlock(bp->b_qindex);
+	mtx_lock(qlock);
 	bremfreel(bp);
-	mtx_unlock(&bqlock);
+	mtx_unlock(qlock);
 }
 
 /*
@@ -745,21 +1042,25 @@
  *	bremfreel:
  *
  *	Removes a buffer from the free list, must be called with the
- *	bqlock held.
+ *	correct qlock held.
  */
 static void
 bremfreel(struct buf *bp)
 {
-	int old;
 
 	CTR3(KTR_BUF, "bremfreel(%p) vp %p flags %X",
 	    bp, bp->b_vp, bp->b_flags);
 	KASSERT(bp->b_qindex != QUEUE_NONE,
 	    ("bremfreel: buffer %p not on a queue.", bp));
-	BUF_ASSERT_HELD(bp);
-	mtx_assert(&bqlock, MA_OWNED);
+	BUF_ASSERT_XLOCKED(bp);
+	mtx_assert(bqlock(bp->b_qindex), MA_OWNED);
 
 	TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
+#ifdef INVARIANTS
+	KASSERT(bq_len[bp->b_qindex] >= 1, ("queue %d underflow",
+	    bp->b_qindex));
+	bq_len[bp->b_qindex]--;
+#endif
 	bp->b_qindex = QUEUE_NONE;
 	/*
 	 * If this was a delayed bremfree() we only need to remove the buffer
@@ -769,34 +1070,10 @@
 		bp->b_flags &= ~B_REMFREE;
 		return;
 	}
-	/*
-	 * Fixup numfreebuffers count.  If the buffer is invalid or not
-	 * delayed-write, the buffer was free and we must decrement
-	 * numfreebuffers.
-	 */
-	if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) {
-		KASSERT((bp->b_vflags & BV_INFREECNT) != 0,
-		    ("buf %p not counted in numfreebuffers", bp));
-		if (bp->b_bufobj != NULL)
-			mtx_assert(BO_MTX(bp->b_bufobj), MA_OWNED);
-		bp->b_vflags &= ~BV_INFREECNT;
-		old = atomic_fetchadd_int(&numfreebuffers, -1);
-		KASSERT(old > 0, ("numfreebuffers dropped to %d", old - 1));
-	}
+	bufcountsub(bp);
 }
 
 /*
- * Get a buffer with the specified data.
- */
-int
-bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
-    struct buf **bpp)
-{
-
-	return (breadn_flags(vp, blkno, size, 0, 0, 0, cred, 0, bpp));
-}
-
-/*
  * Attempt to initiate asynchronous I/O on read-ahead blocks.  We must
  * clear BIO_ERROR and B_INVAL prior to initiating I/O . If B_CACHE is set,
  * the buffer is valid and we do not have to do anything.
@@ -833,33 +1110,7 @@
 }
 
 /*
- * Operates like bread, but with getblk flags.
- */
-int
-bread_gb(struct vnode * vp, daddr_t blkno, int cnt, struct ucred * cred,
-    int gbflags, struct buf **bpp)
-{
-
-	return (breadn_flags(vp, blkno, cnt, NULL, NULL, 0,
-		    cred, gbflags, bpp));
-}
-
-/*
- * Operates like bread, but also starts asynchronous I/O on
- * read-ahead blocks.
- */
-int
-breadn(struct vnode * vp, daddr_t blkno, int size,
-    daddr_t * rablkno, int *rabsize,
-    int cnt, struct ucred * cred, struct buf **bpp)
-{
-
-	return (breadn_flags(vp, blkno, size, rablkno, rabsize, cnt,
-		    cred, 0, bpp));
-}
-
-/*
- * Entry point for bread() and breadn().
+ * Entry point for bread() and breadn() via #defines in sys/buf.h.
  *
  * Get a buffer with the specified data.  Look in the cache first.  We
  * must clear BIO_ERROR and B_INVAL prior to initiating I/O.  If B_CACHE
@@ -920,9 +1171,16 @@
 {
 	int oldflags;
 	struct vnode *vp;
+	long space;
 	int vp_md;
 
 	CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
+	if ((bp->b_bufobj->bo_flag & BO_DEAD) != 0) {
+		bp->b_flags |= B_INVAL | B_RELBUF;
+		bp->b_flags &= ~B_CACHE;
+		brelse(bp);
+		return (ENXIO);
+	}
 	if (bp->b_flags & B_INVAL) {
 		brelse(bp);
 		return (0);
@@ -967,7 +1225,7 @@
 	 * Normal bwrites pipeline writes
 	 */
 	bp->b_runningbufspace = bp->b_bufsize;
-	atomic_add_long(&runningbufspace, bp->b_runningbufspace);
+	space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace);
 
 	if (!TD_IS_IDLETHREAD(curthread))
 		curthread->td_ru.ru_oublock++;
@@ -980,7 +1238,7 @@
 		int rtval = bufwait(bp);
 		brelse(bp);
 		return (rtval);
-	} else {
+	} else if (space > hirunningspace) {
 		/*
 		 * don't allow the async write to saturate the I/O
 		 * system.  We will not deadlock here because
@@ -1115,13 +1373,6 @@
 	bqrelse(bp);
 
 	/*
-	 * Wakeup the buffer flushing daemon if we have a lot of dirty
-	 * buffers (midpoint between our recovery point and our stall
-	 * point).
-	 */
-	bd_wakeup((lodirtybuffers + hidirtybuffers) / 2);
-
-	/*
 	 * note: we cannot initiate I/O from a bdwrite even if we wanted to,
 	 * due to the softdep code.
 	 */
@@ -1161,8 +1412,7 @@
 	if ((bp->b_flags & B_DELWRI) == 0) {
 		bp->b_flags |= /* XXX B_DONE | */ B_DELWRI;
 		reassignbuf(bp);
-		atomic_add_int(&numdirtybuffers, 1);
-		bd_wakeup((lodirtybuffers + hidirtybuffers) / 2);
+		bdirtyadd();
 	}
 }
 
@@ -1190,8 +1440,7 @@
 	if (bp->b_flags & B_DELWRI) {
 		bp->b_flags &= ~B_DELWRI;
 		reassignbuf(bp);
-		atomic_subtract_int(&numdirtybuffers, 1);
-		numdirtywakeup(lodirtybuffers);
+		bdirtysub();
 	}
 	/*
 	 * Since it is now being written, we can clear its deferred write flag.
@@ -1259,20 +1508,18 @@
  *	of any vnodes we attempt to avoid the situation where a locked vnode
  *	prevents the various system daemons from flushing related buffers.
  */
-
 void
 bwillwrite(void)
 {
 
 	if (numdirtybuffers >= hidirtybuffers) {
-		mtx_lock(&nblock);
+		mtx_lock(&bdirtylock);
 		while (numdirtybuffers >= hidirtybuffers) {
-			bd_wakeup(1);
-			needsbuffer |= VFS_BIO_NEED_DIRTYFLUSH;
-			msleep(&needsbuffer, &nblock,
-			    (PRIBIO + 4), "flswai", 0);
+			bdirtywait = 1;
+			msleep(&bdirtywait, &bdirtylock, (PRIBIO + 4),
+			    "flswai", 0);
 		}
-		mtx_unlock(&nblock);
+		mtx_unlock(&bdirtylock);
 	}
 }
 
@@ -1305,6 +1552,8 @@
 void
 brelse(struct buf *bp)
 {
+	int qindex;
+
 	CTR3(KTR_BUF, "brelse(%p) vp %p flags %X",
 	    bp, bp->b_vp, bp->b_flags);
 	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
@@ -1324,6 +1573,12 @@
 		return;
 	}
 
+	if ((bp->b_vflags & (BV_BKGRDINPROG | BV_BKGRDERR)) == BV_BKGRDERR) {
+		BO_LOCK(bp->b_bufobj);
+		bp->b_vflags &= ~BV_BKGRDERR;
+		BO_UNLOCK(bp->b_bufobj);
+		bdirty(bp);
+	}
 	if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) &&
 	    bp->b_error == EIO && !(bp->b_flags & B_INVAL)) {
 		/*
@@ -1343,10 +1598,8 @@
 		bp->b_flags |= B_INVAL;
 		if (!LIST_EMPTY(&bp->b_dep))
 			buf_deallocate(bp);
-		if (bp->b_flags & B_DELWRI) {
-			atomic_subtract_int(&numdirtybuffers, 1);
-			numdirtywakeup(lodirtybuffers);
-		}
+		if (bp->b_flags & B_DELWRI)
+			bdirtysub();
 		bp->b_flags &= ~(B_DELWRI | B_CACHE);
 		if ((bp->b_flags & B_VMIO) == 0) {
 			if (bp->b_bufsize)
@@ -1372,15 +1625,10 @@
 		bp->b_flags &= ~B_RELBUF;
 	else if (buf_vm_page_count_severe()) {
 		/*
-		 * The locking of the BO_LOCK is not necessary since
-		 * BKGRDINPROG cannot be set while we hold the buf
-		 * lock, it can only be cleared if it is already
-		 * pending.
+		 * BKGRDINPROG can only be set with the buf and bufobj
+		 * locks both held.  We tolerate a race to clear it here.
 		 */
-		if (bp->b_vp) {
-			if (!(bp->b_vflags & BV_BKGRDINPROG))
-				bp->b_flags |= B_RELBUF;
-		} else
+		if (!(bp->b_vflags & BV_BKGRDINPROG))
 			bp->b_flags |= B_RELBUF;
 	}
 
@@ -1430,7 +1678,6 @@
 		 */
 		resid = bp->b_bufsize;
 		foff = bp->b_offset;
-		VM_OBJECT_LOCK(obj);
 		for (i = 0; i < bp->b_npages; i++) {
 			int had_bogus = 0;
 
@@ -1444,6 +1691,7 @@
 				poff = OFF_TO_IDX(bp->b_offset);
 				had_bogus = 1;
 
+				VM_OBJECT_RLOCK(obj);
 				for (j = i; j < bp->b_npages; j++) {
 					vm_page_t mtmp;
 					mtmp = bp->b_pages[j];
@@ -1455,8 +1703,10 @@
 						bp->b_pages[j] = mtmp;
 					}
 				}
+				VM_OBJECT_RUNLOCK(obj);
 
-				if ((bp->b_flags & B_INVAL) == 0) {
+				if ((bp->b_flags & (B_INVAL | B_UNMAPPED)) == 0) {
+					BUF_CHECK_MAPPED(bp);
 					pmap_qenter(
 					    trunc_page((vm_offset_t)bp->b_data),
 					    bp->b_pages, bp->b_npages);
@@ -1471,7 +1721,16 @@
 					(PAGE_SIZE - poffset) : resid;
 
 				KASSERT(presid >= 0, ("brelse: extra page"));
-				vm_page_set_invalid(m, poffset, presid);
+				VM_OBJECT_WLOCK(obj);
+				while (vm_page_xbusied(m)) {
+					vm_page_lock(m);
+					VM_OBJECT_WUNLOCK(obj);
+					vm_page_busy_sleep(m, "mbncsh", true);
+					VM_OBJECT_WLOCK(obj);
+				}
+				if (pmap_page_wired_mappings(m) == 0)
+					vm_page_set_invalid(m, poffset, presid);
+				VM_OBJECT_WUNLOCK(obj);
 				if (had_bogus)
 					printf("avoided corruption bug in bogus_page/brelse code\n");
 			}
@@ -1478,7 +1737,6 @@
 			resid -= PAGE_SIZE - (foff & PAGE_MASK);
 			foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 		}
-		VM_OBJECT_UNLOCK(obj);
 		if (bp->b_flags & (B_INVAL | B_RELBUF))
 			vfs_vmio_release(bp);
 
@@ -1495,22 +1753,6 @@
 			brelvp(bp);
 	}
 			
-	/* enqueue */
-	mtx_lock(&bqlock);
-	/* Handle delayed bremfree() processing. */
-	if (bp->b_flags & B_REMFREE) {
-		struct bufobj *bo;
-
-		bo = bp->b_bufobj;
-		if (bo != NULL)
-			BO_LOCK(bo);
-		bremfreel(bp);
-		if (bo != NULL)
-			BO_UNLOCK(bo);
-	}
-	if (bp->b_qindex != QUEUE_NONE)
-		panic("brelse: free buffer onto another queue???");
-
 	/*
 	 * If the buffer has junk contents signal it and eventually
 	 * clean up B_DELWRI and disassociate the vnode so that gbincore()
@@ -1531,12 +1773,11 @@
 		bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
 		if (bp->b_vflags & BV_BKGRDINPROG)
 			panic("losing buffer 1");
-		if (bp->b_kvasize) {
-			bp->b_qindex = QUEUE_EMPTYKVA;
-		} else {
-			bp->b_qindex = QUEUE_EMPTY;
-		}
-		TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
+		if (bp->b_kvasize)
+			qindex = QUEUE_EMPTYKVA;
+		else
+			qindex = QUEUE_EMPTY;
+		bp->b_flags |= B_AGE;
 	/* buffers with junk contents */
 	} else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) ||
 	    (bp->b_ioflags & BIO_ERROR)) {
@@ -1543,48 +1784,16 @@
 		bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
 		if (bp->b_vflags & BV_BKGRDINPROG)
 			panic("losing buffer 2");
-		bp->b_qindex = QUEUE_CLEAN;
-		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
+		qindex = QUEUE_CLEAN;
+		bp->b_flags |= B_AGE;
 	/* remaining buffers */
-	} else {
-		if ((bp->b_flags & (B_DELWRI|B_NEEDSGIANT)) ==
-		    (B_DELWRI|B_NEEDSGIANT))
-			bp->b_qindex = QUEUE_DIRTY_GIANT;
-		else if (bp->b_flags & B_DELWRI)
-			bp->b_qindex = QUEUE_DIRTY;
-		else
-			bp->b_qindex = QUEUE_CLEAN;
-		if (bp->b_flags & B_AGE)
-			TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
-		else
-			TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
-	}
-	mtx_unlock(&bqlock);
+	} else if (bp->b_flags & B_DELWRI)
+		qindex = QUEUE_DIRTY;
+	else
+		qindex = QUEUE_CLEAN;
 
-	/*
-	 * Fixup numfreebuffers count.  The bp is on an appropriate queue
-	 * unless locked.  We then bump numfreebuffers if it is not B_DELWRI.
-	 * We've already handled the B_INVAL case ( B_DELWRI will be clear
-	 * if B_INVAL is set ).
-	 */
+	binsfree(bp, qindex);
 
-	if (!(bp->b_flags & B_DELWRI)) {
-		struct bufobj *bo;
-
-		bo = bp->b_bufobj;
-		if (bo != NULL)
-			BO_LOCK(bo);
-		bufcountwakeup(bp);
-		if (bo != NULL)
-			BO_UNLOCK(bo);
-	}
-
-	/*
-	 * Something we can maybe free or reuse
-	 */
-	if (bp->b_bufsize || bp->b_kvasize)
-		bufspacewakeup();
-
 	bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT);
 	if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
 		panic("brelse: not dirty");
@@ -1606,7 +1815,7 @@
 void
 bqrelse(struct buf *bp)
 {
-	struct bufobj *bo;
+	int qindex;
 
 	CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
@@ -1617,83 +1826,44 @@
 		BUF_UNLOCK(bp);
 		return;
 	}
+	bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
 
-	bo = bp->b_bufobj;
 	if (bp->b_flags & B_MANAGED) {
-		if (bp->b_flags & B_REMFREE) {
-			mtx_lock(&bqlock);
-			if (bo != NULL)
-				BO_LOCK(bo);
-			bremfreel(bp);
-			if (bo != NULL)
-				BO_UNLOCK(bo);
-			mtx_unlock(&bqlock);
-		}
-		bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
-		BUF_UNLOCK(bp);
-		return;
+		if (bp->b_flags & B_REMFREE)
+			bremfreef(bp);
+		goto out;
 	}
 
-	mtx_lock(&bqlock);
-	/* Handle delayed bremfree() processing. */
-	if (bp->b_flags & B_REMFREE) {
-		if (bo != NULL)
-			BO_LOCK(bo);
-		bremfreel(bp);
-		if (bo != NULL)
-			BO_UNLOCK(bo);
-	}
-	if (bp->b_qindex != QUEUE_NONE)
-		panic("bqrelse: free buffer onto another queue???");
 	/* buffers with stale but valid contents */
-	if (bp->b_flags & B_DELWRI) {
-		if (bp->b_flags & B_NEEDSGIANT)
-			bp->b_qindex = QUEUE_DIRTY_GIANT;
-		else
-			bp->b_qindex = QUEUE_DIRTY;
-		TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
+	if ((bp->b_flags & B_DELWRI) != 0 || (bp->b_vflags & (BV_BKGRDINPROG |
+	    BV_BKGRDERR)) == BV_BKGRDERR) {
+		BO_LOCK(bp->b_bufobj);
+		bp->b_vflags &= ~BV_BKGRDERR;
+		BO_UNLOCK(bp->b_bufobj);
+		qindex = QUEUE_DIRTY;
 	} else {
+		if ((bp->b_flags & B_DELWRI) == 0 &&
+		    (bp->b_xflags & BX_VNDIRTY))
+			panic("bqrelse: not dirty");
 		/*
-		 * The locking of the BO_LOCK for checking of the
-		 * BV_BKGRDINPROG is not necessary since the
-		 * BV_BKGRDINPROG cannot be set while we hold the buf
-		 * lock, it can only be cleared if it is already
-		 * pending.
+		 * BKGRDINPROG can only be set with the buf and bufobj
+		 * locks both held.  We tolerate a race to clear it here.
 		 */
-		if (!buf_vm_page_count_severe() || (bp->b_vflags & BV_BKGRDINPROG)) {
-			bp->b_qindex = QUEUE_CLEAN;
-			TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp,
-			    b_freelist);
-		} else {
+		if (buf_vm_page_count_severe() &&
+		    (bp->b_vflags & BV_BKGRDINPROG) == 0) {
 			/*
 			 * We are too low on memory, we have to try to free
 			 * the buffer (most importantly: the wired pages
 			 * making up its backing store) *now*.
 			 */
-			mtx_unlock(&bqlock);
 			brelse(bp);
 			return;
 		}
+		qindex = QUEUE_CLEAN;
 	}
-	mtx_unlock(&bqlock);
+	binsfree(bp, qindex);
 
-	if ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI)) {
-		if (bo != NULL)
-			BO_LOCK(bo);
-		bufcountwakeup(bp);
-		if (bo != NULL)
-			BO_UNLOCK(bo);
-	}
-
-	/*
-	 * Something we can maybe free or reuse.
-	 */
-	if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
-		bufspacewakeup();
-
-	bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
-	if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
-		panic("bqrelse: not dirty");
+out:
 	/* unlock */
 	BUF_UNLOCK(bp);
 }
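
With bqlock split into bqclean and bqdirty, brelse() and bqrelse() above are reduced to computing a target queue index and handing the buffer to binsfree(), which takes the appropriate queue lock internally (binsfree() is added earlier in vfs_bio.c, outside the quoted hunks). Below is a minimal user-space sketch of that shape, not the kernel code: the pthread mutexes, toy flag values, and malloc'ed buf are stand-ins for the kernel primitives.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/queue.h>

enum { QUEUE_CLEAN, QUEUE_DIRTY, QUEUE_EMPTY, QUEUE_EMPTYKVA, QUEUE_CNT };

#define	B_DELWRI	0x1	/* toy values, not the kernel flag values */
#define	B_AGE		0x2

struct buf {
	TAILQ_ENTRY(buf) b_freelist;
	int b_flags;
};

static TAILQ_HEAD(bufq, buf) bufqueues[QUEUE_CNT];
static pthread_mutex_t bqclean = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t bqdirty = PTHREAD_MUTEX_INITIALIZER;

/* One insertion helper owns the per-queue locks, as binsfree() does. */
static void
binsfree(struct buf *bp, int qindex)
{
	pthread_mutex_t *qlock;

	qlock = (qindex == QUEUE_DIRTY) ? &bqdirty : &bqclean;
	pthread_mutex_lock(qlock);
	if (bp->b_flags & B_AGE)
		TAILQ_INSERT_HEAD(&bufqueues[qindex], bp, b_freelist);
	else
		TAILQ_INSERT_TAIL(&bufqueues[qindex], bp, b_freelist);
	pthread_mutex_unlock(qlock);
}

/* The caller mirrors the tail of brelse()/bqrelse(): pick qindex, insert. */
int
main(void)
{
	struct buf *bp;
	int i, qindex;

	for (i = 0; i < QUEUE_CNT; i++)
		TAILQ_INIT(&bufqueues[i]);
	bp = calloc(1, sizeof(*bp));
	bp->b_flags = B_DELWRI;
	qindex = (bp->b_flags & B_DELWRI) ? QUEUE_DIRTY : QUEUE_CLEAN;
	binsfree(bp, qindex);
	printf("buffer placed on queue %d\n", qindex);
	return (0);
}

The apparent point of the split is that releasing a clean buffer no longer contends with the buf daemon walking the dirty queue.
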
@@ -1702,11 +1872,18 @@
 static void
 vfs_vmio_release(struct buf *bp)
 {
+	vm_object_t obj;
+	vm_page_t m;
 	int i;
-	vm_page_t m;
 
-	pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages);
-	VM_OBJECT_LOCK(bp->b_bufobj->bo_object);
+	if ((bp->b_flags & B_UNMAPPED) == 0) {
+		BUF_CHECK_MAPPED(bp);
+		pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages);
+	} else
+		BUF_CHECK_UNMAPPED(bp);
+	obj = bp->b_bufobj->bo_object;
+	if (obj != NULL)
+		VM_OBJECT_WLOCK(obj);
 	for (i = 0; i < bp->b_npages; i++) {
 		m = bp->b_pages[i];
 		bp->b_pages[i] = NULL;
@@ -1716,29 +1893,23 @@
 		 */
 		vm_page_lock(m);
 		vm_page_unwire(m, 0);
+
 		/*
-		 * We don't mess with busy pages, it is
-		 * the responsibility of the process that
-		 * busied the pages to deal with them.
+		 * Might as well free the page if we can and it has
+		 * no valid data.  We also free the page if the
+		 * buffer was used for direct I/O
 		 */
-		if ((m->oflags & VPO_BUSY) == 0 && m->busy == 0 &&
-		    m->wire_count == 0) {
-			/*
-			 * Might as well free the page if we can and it has
-			 * no valid data.  We also free the page if the
-			 * buffer was used for direct I/O
-			 */
-			if ((bp->b_flags & B_ASYNC) == 0 && !m->valid) {
+		if ((bp->b_flags & B_ASYNC) == 0 && !m->valid) {
+			if (m->wire_count == 0 && !vm_page_busied(m))
 				vm_page_free(m);
-			} else if (bp->b_flags & B_DIRECT) {
-				vm_page_try_to_free(m);
-			} else if (buf_vm_page_count_severe()) {
-				vm_page_try_to_cache(m);
-			}
-		}
+		} else if (bp->b_flags & B_DIRECT)
+			vm_page_try_to_free(m);
+		else if (buf_vm_page_count_severe())
+			vm_page_try_to_cache(m);
 		vm_page_unlock(m);
 	}
-	VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object);
+	if (obj != NULL)
+		VM_OBJECT_WUNLOCK(obj);
 	
 	if (bp->b_bufsize) {
 		bufspacewakeup();
@@ -1809,8 +1980,10 @@
 	int nwritten;
 	int size;
 	int maxcl;
+	int gbflags;
 
 	bo = &vp->v_bufobj;
+	gbflags = (bp->b_flags & B_UNMAPPED) != 0 ? GB_UNMAPPED : 0;
 	/*
 	 * right now we support clustered writing only to regular files.  If
 	 * we find a clusterable block we could be in the middle of a cluster
@@ -1823,7 +1996,7 @@
 		size = vp->v_mount->mnt_stat.f_iosize;
 		maxcl = MAXPHYS / size;
 
-		BO_LOCK(bo);
+		BO_RLOCK(bo);
 		for (i = 1; i < maxcl; i++)
 			if (vfs_bio_clcheck(vp, size, lblkno + i,
 			    bp->b_blkno + ((i * size) >> DEV_BSHIFT)) == 0)
@@ -1833,7 +2006,7 @@
 			if (vfs_bio_clcheck(vp, size, lblkno - j,
 			    bp->b_blkno - ((j * size) >> DEV_BSHIFT)) == 0)
 				break;
-		BO_UNLOCK(bo);
+		BO_RUNLOCK(bo);
 		--j;
 		ncl = i + j;
 		/*
@@ -1841,7 +2014,8 @@
 		 */
 		if (ncl != 1) {
 			BUF_UNLOCK(bp);
-			nwritten = cluster_wbuild(vp, size, lblkno - j, ncl);
+			nwritten = cluster_wbuild(vp, size, lblkno - j, ncl,
+			    gbflags);
 			return (nwritten);
 		}
 	}
@@ -1858,46 +2032,207 @@
 	return (nwritten);
 }
 
+static void
+setbufkva(struct buf *bp, vm_offset_t addr, int maxsize, int gbflags)
+{
+
+	KASSERT((bp->b_flags & (B_UNMAPPED | B_KVAALLOC)) == 0 &&
+	    bp->b_kvasize == 0, ("call bfreekva(%p)", bp));
+	if ((gbflags & GB_UNMAPPED) == 0) {
+		bp->b_kvabase = (caddr_t)addr;
+	} else if ((gbflags & GB_KVAALLOC) != 0) {
+		KASSERT((gbflags & GB_UNMAPPED) != 0,
+		    ("GB_KVAALLOC without GB_UNMAPPED"));
+		bp->b_kvaalloc = (caddr_t)addr;
+		bp->b_flags |= B_UNMAPPED | B_KVAALLOC;
+		atomic_add_long(&unmapped_bufspace, bp->b_kvasize);
+	}
+	bp->b_kvasize = maxsize;
+}
+
 /*
- *	getnewbuf:
- *
- *	Find and initialize a new buffer header, freeing up existing buffers 
- *	in the bufqueues as necessary.  The new buffer is returned locked.
- *
- *	Important:  B_INVAL is not set.  If the caller wishes to throw the
- *	buffer away, the caller must set B_INVAL prior to calling brelse().
- *
- *	We block if:
- *		We have insufficient buffer headers
- *		We have insufficient buffer space
- *		buffer_map is too fragmented ( space reservation fails )
- *		If we have to flush dirty buffers ( but we try to avoid this )
- *
- *	To avoid VFS layer recursion we do not flush dirty buffers ourselves.
- *	Instead we ask the buf daemon to do it for us.  We attempt to
- *	avoid piecemeal wakeups of the pageout daemon.
+ * Allocate the buffer KVA and set b_kvasize. Also set b_kvabase if
+ * needed.
  */
+static int
+allocbufkva(struct buf *bp, int maxsize, int gbflags)
+{
+	vm_offset_t addr;
 
-static struct buf *
-getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize,
-    int gbflags)
+	bfreekva(bp);
+	addr = 0;
+
+	if (vmem_alloc(buffer_arena, maxsize, M_BESTFIT | M_NOWAIT, &addr)) {
+		/*
+		 * Buffer map is too fragmented.  Request the caller
+		 * to defragment the map.
+		 */
+		atomic_add_int(&bufdefragcnt, 1);
+		return (1);
+	}
+	setbufkva(bp, addr, maxsize, gbflags);
+	atomic_add_long(&bufspace, bp->b_kvasize);
+	return (0);
+}
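
allocbufkva() above replaces the old vm_map_findspace()/vm_map_insert() dance on buffer_map: buffer address space now comes from the buffer_arena vmem(9) arena. The fragment below is only a condensed restatement for reference, not code from the commit; the helper names are made up, while the vmem_alloc()/vmem_free() calls and flags are the ones the patch itself uses (the release side lives in bfreekva(), which is outside the quoted hunks).

/*
 * Hypothetical helper: reserve 'size' bytes of buffer KVA from buffer_arena,
 * mirroring allocbufkva() above.  On failure the caller is expected to bump
 * bufdefragcnt and retry after defragmentation, as getnewbuf() does.
 */
static int
buf_kva_reserve(vm_offset_t *addrp, int size)
{

	if (vmem_alloc(buffer_arena, size, M_BESTFIT | M_NOWAIT, addrp) != 0)
		return (1);		/* arena too fragmented */
	atomic_add_long(&bufspace, size);
	return (0);
}

/*
 * The matching release; bfreekva() does roughly this for the mapped case.
 */
static void
buf_kva_release(vm_offset_t addr, int size)
{

	vmem_free(buffer_arena, addr, size);
	atomic_subtract_long(&bufspace, size);
}
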
+
+/*
+ * Ask the bufdaemon for help, or act as bufdaemon itself, when a
+ * locked vnode is supplied.
+ */
+static void
+getnewbuf_bufd_help(struct vnode *vp, int gbflags, int slpflag, int slptimeo,
+    int defrag)
 {
 	struct thread *td;
-	struct buf *bp;
-	struct buf *nbp;
-	int defrag = 0;
-	int nqindex;
-	static int flushingbufs;
+	char *waitmsg;
+	int error, fl, flags, norunbuf;
 
+	mtx_assert(&bqclean, MA_OWNED);
+
+	if (defrag) {
+		flags = VFS_BIO_NEED_BUFSPACE;
+		waitmsg = "nbufkv";
+	} else if (bufspace >= hibufspace) {
+		waitmsg = "nbufbs";
+		flags = VFS_BIO_NEED_BUFSPACE;
+	} else {
+		waitmsg = "newbuf";
+		flags = VFS_BIO_NEED_ANY;
+	}
+	atomic_set_int(&needsbuffer, flags);
+	mtx_unlock(&bqclean);
+
+	bd_speedup();	/* heeeelp */
+	if ((gbflags & GB_NOWAIT_BD) != 0)
+		return;
+
 	td = curthread;
+	rw_wlock(&nblock);
+	while ((needsbuffer & flags) != 0) {
+		if (vp != NULL && vp->v_type != VCHR &&
+		    (td->td_pflags & TDP_BUFNEED) == 0) {
+			rw_wunlock(&nblock);
+			/*
+			 * getblk() is called with a vnode locked, and
+			 * some majority of the dirty buffers may as
+			 * well belong to the vnode.  Flushing the
+			 * buffers there would make a progress that
+			 * cannot be achieved by the buf_daemon, that
+			 * cannot lock the vnode.
+			 */
+			norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) |
+			    (td->td_pflags & TDP_NORUNNINGBUF);
+
+			/*
+			 * Play bufdaemon.  The getnewbuf() function
+			 * may be called while the thread owns lock
+			 * for another dirty buffer for the same
+			 * vnode, which makes it impossible to use
+			 * VOP_FSYNC() there, due to the buffer lock
+			 * recursion.
+			 */
+			td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
+			fl = buf_flush(vp, flushbufqtarget);
+			td->td_pflags &= norunbuf;
+			rw_wlock(&nblock);
+			if (fl != 0)
+				continue;
+			if ((needsbuffer & flags) == 0)
+				break;
+		}
+		error = rw_sleep(__DEVOLATILE(void *, &needsbuffer), &nblock,
+		    (PRIBIO + 4) | slpflag, waitmsg, slptimeo);
+		if (error != 0)
+			break;
+	}
+	rw_wunlock(&nblock);
+}
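
The norunbuf computation in getnewbuf_bufd_help() is easy to misread: it builds a mask that, when ANDed back into td_pflags after the flush, always clears TDP_BUFNEED but restores TDP_NORUNNINGBUF to whatever it was before the thread started playing bufdaemon. A stand-alone demo of the same bit trick (the flag values below are made up for the example):

#include <assert.h>
#include <stdio.h>

#define	TDP_BUFNEED		0x1	/* example values only */
#define	TDP_NORUNNINGBUF	0x2

int
main(void)
{
	int pflags, norunbuf, prior;

	for (prior = 0; prior <= 1; prior++) {
		pflags = prior ? TDP_NORUNNINGBUF : 0;
		norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) |
		    (pflags & TDP_NORUNNINGBUF);
		pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF; /* play bufdaemon */
		pflags &= norunbuf;			  /* restore */
		assert((pflags & TDP_BUFNEED) == 0);
		assert((pflags & TDP_NORUNNINGBUF) ==
		    (prior ? TDP_NORUNNINGBUF : 0));
	}
	printf("TDP_BUFNEED cleared, TDP_NORUNNINGBUF restored\n");
	return (0);
}
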
+
+static void
+getnewbuf_reuse_bp(struct buf *bp, int qindex)
+{
+
+	CTR6(KTR_BUF, "getnewbuf(%p) vp %p flags %X kvasize %d bufsize %d "
+	    "queue %d (recycling)", bp, bp->b_vp, bp->b_flags,
+	     bp->b_kvasize, bp->b_bufsize, qindex);
+	mtx_assert(&bqclean, MA_NOTOWNED);
+
 	/*
-	 * We can't afford to block since we might be holding a vnode lock,
-	 * which may prevent system daemons from running.  We deal with
-	 * low-memory situations by proactively returning memory and running
-	 * async I/O rather then sync I/O.
+	 * Note: we no longer distinguish between VMIO and non-VMIO
+	 * buffers.
 	 */
-	atomic_add_int(&getnewbufcalls, 1);
-	atomic_subtract_int(&getnewbufrestarts, 1);
+	KASSERT((bp->b_flags & B_DELWRI) == 0,
+	    ("delwri buffer %p found in queue %d", bp, qindex));
+
+	if (qindex == QUEUE_CLEAN) {
+		if (bp->b_flags & B_VMIO) {
+			bp->b_flags &= ~B_ASYNC;
+			vfs_vmio_release(bp);
+		}
+		if (bp->b_vp != NULL)
+			brelvp(bp);
+	}
+
+	/*
+	 * Get the rest of the buffer freed up.  b_kva* is still valid
+	 * after this operation.
+	 */
+
+	if (bp->b_rcred != NOCRED) {
+		crfree(bp->b_rcred);
+		bp->b_rcred = NOCRED;
+	}
+	if (bp->b_wcred != NOCRED) {
+		crfree(bp->b_wcred);
+		bp->b_wcred = NOCRED;
+	}
+	if (!LIST_EMPTY(&bp->b_dep))
+		buf_deallocate(bp);
+	if (bp->b_vflags & BV_BKGRDINPROG)
+		panic("losing buffer 3");
+	KASSERT(bp->b_vp == NULL, ("bp: %p still has vnode %p.  qindex: %d",
+	    bp, bp->b_vp, qindex));
+	KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0,
+	    ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags));
+
+	if (bp->b_bufsize)
+		allocbuf(bp, 0);
+
+	bp->b_flags &= B_UNMAPPED | B_KVAALLOC;
+	bp->b_ioflags = 0;
+	bp->b_xflags = 0;
+	KASSERT((bp->b_flags & B_INFREECNT) == 0,
+	    ("buf %p still counted as free?", bp));
+	bp->b_vflags = 0;
+	bp->b_vp = NULL;
+	bp->b_blkno = bp->b_lblkno = 0;
+	bp->b_offset = NOOFFSET;
+	bp->b_iodone = 0;
+	bp->b_error = 0;
+	bp->b_resid = 0;
+	bp->b_bcount = 0;
+	bp->b_npages = 0;
+	bp->b_dirtyoff = bp->b_dirtyend = 0;
+	bp->b_bufobj = NULL;
+	bp->b_pin_count = 0;
+	bp->b_fsprivate1 = NULL;
+	bp->b_fsprivate2 = NULL;
+	bp->b_fsprivate3 = NULL;
+
+	LIST_INIT(&bp->b_dep);
+}
+
+static int flushingbufs;
+
+static struct buf *
+getnewbuf_scan(int maxsize, int defrag, int unmapped, int metadata)
+{
+	struct buf *bp, *nbp;
+	int nqindex, qindex, pass;
+
+	KASSERT(!unmapped || !defrag, ("both unmapped and defrag"));
+
+	pass = 1;
 restart:
 	atomic_add_int(&getnewbufrestarts, 1);
 
@@ -1907,66 +2242,90 @@
 	 * that if we are specially marked process, we are allowed to
 	 * dip into our reserves.
 	 *
-	 * The scanning sequence is nominally:  EMPTY->EMPTYKVA->CLEAN
+	 * The scanning sequence is nominally: EMPTY->EMPTYKVA->CLEAN
+	 * for the allocation of the mapped buffer.  For unmapped, the
+	 * easiest is to start with EMPTY outright.
 	 *
 	 * We start with EMPTYKVA.  If the list is empty we backup to EMPTY.
 	 * However, there are a number of cases (defragging, reusing, ...)
 	 * where we cannot backup.
 	 */
-	mtx_lock(&bqlock);
-	nqindex = QUEUE_EMPTYKVA;
-	nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
-
+	nbp = NULL;
+	mtx_lock(&bqclean);
+	if (!defrag && unmapped) {
+		nqindex = QUEUE_EMPTY;
+		nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
+	}
 	if (nbp == NULL) {
-		/*
-		 * If no EMPTYKVA buffers and we are either
-		 * defragging or reusing, locate a CLEAN buffer
-		 * to free or reuse.  If bufspace useage is low
-		 * skip this step so we can allocate a new buffer.
-		 */
-		if (defrag || bufspace >= lobufspace) {
-			nqindex = QUEUE_CLEAN;
-			nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
-		}
+		nqindex = QUEUE_EMPTYKVA;
+		nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
+	}
 
-		/*
-		 * If we could not find or were not allowed to reuse a
-		 * CLEAN buffer, check to see if it is ok to use an EMPTY
-		 * buffer.  We can only use an EMPTY buffer if allocating
-		 * its KVA would not otherwise run us out of buffer space.
-		 */
-		if (nbp == NULL && defrag == 0 &&
-		    bufspace + maxsize < hibufspace) {
-			nqindex = QUEUE_EMPTY;
-			nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
-		}
+	/*
+	 * If no EMPTYKVA buffers and we are either defragging or
+	 * reusing, locate a CLEAN buffer to free or reuse.  If
+	 * bufspace usage is low, skip this step so we can allocate a
+	 * new buffer.
+	 */
+	if (nbp == NULL && (defrag || bufspace >= lobufspace)) {
+		nqindex = QUEUE_CLEAN;
+		nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
 	}
 
 	/*
+	 * If we could not find or were not allowed to reuse a CLEAN
+	 * buffer, check to see if it is ok to use an EMPTY buffer.
+	 * We can only use an EMPTY buffer if allocating its KVA would
+	 * not otherwise run us out of buffer space.  No KVA is needed
+	 * for the unmapped allocation.
+	 */
+	if (nbp == NULL && defrag == 0 && (bufspace + maxsize < hibufspace ||
+	    metadata)) {
+		nqindex = QUEUE_EMPTY;
+		nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
+	}
+
+	/*
+	 * All available buffers might be clean, retry ignoring the
+	 * lobufspace as the last resort.
+	 */
+	if (nbp == NULL && !TAILQ_EMPTY(&bufqueues[QUEUE_CLEAN])) {
+		nqindex = QUEUE_CLEAN;
+		nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
+	}
+
+	/*
 	 * Run scan, possibly freeing data and/or kva mappings on the fly
 	 * depending.
 	 */
-
 	while ((bp = nbp) != NULL) {
-		int qindex = nqindex;
+		qindex = nqindex;
 
 		/*
-		 * Calculate next bp ( we can only use it if we do not block
-		 * or do other fancy things ).
+		 * Calculate next bp (we can only use it if we do not
+		 * block or do other fancy things).
 		 */
 		if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) {
-			switch(qindex) {
+			switch (qindex) {
 			case QUEUE_EMPTY:
 				nqindex = QUEUE_EMPTYKVA;
-				if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA])))
+				nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
+				if (nbp != NULL)
 					break;
 				/* FALLTHROUGH */
 			case QUEUE_EMPTYKVA:
 				nqindex = QUEUE_CLEAN;
-				if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN])))
+				nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
+				if (nbp != NULL)
 					break;
 				/* FALLTHROUGH */
 			case QUEUE_CLEAN:
+				if (metadata && pass == 1) {
+					pass = 2;
+					nqindex = QUEUE_EMPTY;
+					nbp = TAILQ_FIRST(
+					    &bufqueues[QUEUE_EMPTY]);
+				}
 				/*
 				 * nbp is NULL. 
 				 */
@@ -1990,101 +2349,38 @@
 		 */
 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
 			continue;
-		if (bp->b_vp) {
-			BO_LOCK(bp->b_bufobj);
-			if (bp->b_vflags & BV_BKGRDINPROG) {
-				BO_UNLOCK(bp->b_bufobj);
-				BUF_UNLOCK(bp);
-				continue;
-			}
-			BO_UNLOCK(bp->b_bufobj);
-		}
-		CTR6(KTR_BUF,
-		    "getnewbuf(%p) vp %p flags %X kvasize %d bufsize %d "
-		    "queue %d (recycling)", bp, bp->b_vp, bp->b_flags,
-		    bp->b_kvasize, bp->b_bufsize, qindex);
-
 		/*
-		 * Sanity Checks
+		 * BKGRDINPROG can only be set with the buf and bufobj
+		 * locks both held.  We tolerate a race to clear it here.
 		 */
-		KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistant queue %d bp %p", qindex, bp));
+		if (bp->b_vflags & BV_BKGRDINPROG) {
+			BUF_UNLOCK(bp);
+			continue;
+		}
 
 		/*
-		 * Note: we no longer distinguish between VMIO and non-VMIO
-		 * buffers.
+		 * Requeue the background write buffer with error.
 		 */
+		if ((bp->b_vflags & BV_BKGRDERR) != 0) {
+			bremfreel(bp);
+			mtx_unlock(&bqclean);
+			bqrelse(bp);
+			continue;
+		}
 
-		KASSERT((bp->b_flags & B_DELWRI) == 0, ("delwri buffer %p found in queue %d", bp, qindex));
+		KASSERT(bp->b_qindex == qindex,
+		    ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));
 
-		if (bp->b_bufobj != NULL)
-			BO_LOCK(bp->b_bufobj);
 		bremfreel(bp);
-		if (bp->b_bufobj != NULL)
-			BO_UNLOCK(bp->b_bufobj);
-		mtx_unlock(&bqlock);
-
-		if (qindex == QUEUE_CLEAN) {
-			if (bp->b_flags & B_VMIO) {
-				bp->b_flags &= ~B_ASYNC;
-				vfs_vmio_release(bp);
-			}
-			if (bp->b_vp)
-				brelvp(bp);
-		}
-
+		mtx_unlock(&bqclean);
 		/*
 		 * NOTE:  nbp is now entirely invalid.  We can only restart
 		 * the scan from this point on.
-		 *
-		 * Get the rest of the buffer freed up.  b_kva* is still
-		 * valid after this operation.
 		 */
 
-		if (bp->b_rcred != NOCRED) {
-			crfree(bp->b_rcred);
-			bp->b_rcred = NOCRED;
-		}
-		if (bp->b_wcred != NOCRED) {
-			crfree(bp->b_wcred);
-			bp->b_wcred = NOCRED;
-		}
-		if (!LIST_EMPTY(&bp->b_dep))
-			buf_deallocate(bp);
-		if (bp->b_vflags & BV_BKGRDINPROG)
-			panic("losing buffer 3");
-		KASSERT(bp->b_vp == NULL,
-		    ("bp: %p still has vnode %p.  qindex: %d",
-		    bp, bp->b_vp, qindex));
-		KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0,
-		   ("bp: %p still on a buffer list. xflags %X",
-		    bp, bp->b_xflags));
+		getnewbuf_reuse_bp(bp, qindex);
+		mtx_assert(&bqclean, MA_NOTOWNED);
 
-		if (bp->b_bufsize)
-			allocbuf(bp, 0);
-
-		bp->b_flags = 0;
-		bp->b_ioflags = 0;
-		bp->b_xflags = 0;
-		KASSERT((bp->b_vflags & BV_INFREECNT) == 0,
-		    ("buf %p still counted as free?", bp));
-		bp->b_vflags = 0;
-		bp->b_vp = NULL;
-		bp->b_blkno = bp->b_lblkno = 0;
-		bp->b_offset = NOOFFSET;
-		bp->b_iodone = 0;
-		bp->b_error = 0;
-		bp->b_resid = 0;
-		bp->b_bcount = 0;
-		bp->b_npages = 0;
-		bp->b_dirtyoff = bp->b_dirtyend = 0;
-		bp->b_bufobj = NULL;
-		bp->b_pin_count = 0;
-		bp->b_fsprivate1 = NULL;
-		bp->b_fsprivate2 = NULL;
-		bp->b_fsprivate3 = NULL;
-
-		LIST_INIT(&bp->b_dep);
-
 		/*
 		 * If we are defragging then free the buffer.
 		 */
@@ -2107,6 +2403,9 @@
 			goto restart;
 		}
 
+		if (metadata)
+			break;
+
 		/*
 		 * If we are overcomitted then recover the buffer and its
 		 * KVM space.  This occurs in rare situations when multiple
@@ -2124,72 +2423,79 @@
 			flushingbufs = 0;
 		break;
 	}
+	return (bp);
+}
 
+/*
+ *	getnewbuf:
+ *
+ *	Find and initialize a new buffer header, freeing up existing buffers
+ *	in the bufqueues as necessary.  The new buffer is returned locked.
+ *
+ *	Important:  B_INVAL is not set.  If the caller wishes to throw the
+ *	buffer away, the caller must set B_INVAL prior to calling brelse().
+ *
+ *	We block if:
+ *		We have insufficient buffer headers
+ *		We have insufficient buffer space
+ *		buffer_arena is too fragmented ( space reservation fails )
+ *		If we have to flush dirty buffers ( but we try to avoid this )
+ */
+static struct buf *
+getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize,
+    int gbflags)
+{
+	struct buf *bp;
+	int defrag, metadata;
+
+	KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
+	    ("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
+	if (!unmapped_buf_allowed)
+		gbflags &= ~(GB_UNMAPPED | GB_KVAALLOC);
+
+	defrag = 0;
+	if (vp == NULL || (vp->v_vflag & (VV_MD | VV_SYSTEM)) != 0 ||
+	    vp->v_type == VCHR)
+		metadata = 1;
+	else
+		metadata = 0;
 	/*
+	 * We can't afford to block since we might be holding a vnode lock,
+	 * which may prevent system daemons from running.  We deal with
+	 * low-memory situations by proactively returning memory and running
+	 * async I/O rather then sync I/O.
+	 */
+	atomic_add_int(&getnewbufcalls, 1);
+	atomic_subtract_int(&getnewbufrestarts, 1);
+restart:
+	bp = getnewbuf_scan(maxsize, defrag, (gbflags & (GB_UNMAPPED |
+	    GB_KVAALLOC)) == GB_UNMAPPED, metadata);
+	if (bp != NULL)
+		defrag = 0;
+
+	/*
 	 * If we exhausted our list, sleep as appropriate.  We may have to
 	 * wakeup various daemons and write out some dirty buffers.
 	 *
 	 * Generally we are sleeping due to insufficient buffer space.
 	 */
-
 	if (bp == NULL) {
-		int flags, norunbuf;
-		char *waitmsg;
-		int fl;
+		mtx_assert(&bqclean, MA_OWNED);
+		getnewbuf_bufd_help(vp, gbflags, slpflag, slptimeo, defrag);
+		mtx_assert(&bqclean, MA_NOTOWNED);
+	} else if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == GB_UNMAPPED) {
+		mtx_assert(&bqclean, MA_NOTOWNED);
 
-		if (defrag) {
-			flags = VFS_BIO_NEED_BUFSPACE;
-			waitmsg = "nbufkv";
-		} else if (bufspace >= hibufspace) {
-			waitmsg = "nbufbs";
-			flags = VFS_BIO_NEED_BUFSPACE;
-		} else {
-			waitmsg = "newbuf";
-			flags = VFS_BIO_NEED_ANY;
-		}
-		mtx_lock(&nblock);
-		needsbuffer |= flags;
-		mtx_unlock(&nblock);
-		mtx_unlock(&bqlock);
+		bfreekva(bp);
+		bp->b_flags |= B_UNMAPPED;
+		bp->b_kvabase = bp->b_data = unmapped_buf;
+		bp->b_kvasize = maxsize;
+		atomic_add_long(&bufspace, bp->b_kvasize);
+		atomic_add_long(&unmapped_bufspace, bp->b_kvasize);
+		atomic_add_int(&bufreusecnt, 1);
+	} else {
+		mtx_assert(&bqclean, MA_NOTOWNED);
 
-		bd_speedup();	/* heeeelp */
-		if (gbflags & GB_NOWAIT_BD)
-			return (NULL);
-
-		mtx_lock(&nblock);
-		while (needsbuffer & flags) {
-			if (vp != NULL && (td->td_pflags & TDP_BUFNEED) == 0) {
-				mtx_unlock(&nblock);
-				/*
-				 * getblk() is called with a vnode
-				 * locked, and some majority of the
-				 * dirty buffers may as well belong to
-				 * the vnode. Flushing the buffers
-				 * there would make a progress that
-				 * cannot be achieved by the
-				 * buf_daemon, that cannot lock the
-				 * vnode.
-				 */
-				norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) |
-				    (td->td_pflags & TDP_NORUNNINGBUF);
-				/* play bufdaemon */
-				td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
-				fl = buf_do_flush(vp);
-				td->td_pflags &= norunbuf;
-				mtx_lock(&nblock);
-				if (fl != 0)
-					continue;
-				if ((needsbuffer & flags) == 0)
-					break;
-			}
-			if (msleep(&needsbuffer, &nblock,
-			    (PRIBIO + 4) | slpflag, waitmsg, slptimeo)) {
-				mtx_unlock(&nblock);
-				return (NULL);
-			}
-		}
-		mtx_unlock(&nblock);
-	} else {
 		/*
 		 * We finally have a valid bp.  We aren't quite out of the
 		 * woods, we still have to reserve kva space.  In order
@@ -2198,39 +2504,47 @@
 		 */
 		maxsize = (maxsize + BKVAMASK) & ~BKVAMASK;
 
-		if (maxsize != bp->b_kvasize) {
-			vm_offset_t addr = 0;
-			int rv;
-
-			bfreekva(bp);
-
-			vm_map_lock(buffer_map);
-			if (vm_map_findspace(buffer_map,
-			    vm_map_min(buffer_map), maxsize, &addr)) {
-				/*
-				 * Buffer map is too fragmented.
-				 * We must defragment the map.
-				 */
-				atomic_add_int(&bufdefragcnt, 1);
-				vm_map_unlock(buffer_map);
+		if (maxsize != bp->b_kvasize || (bp->b_flags & (B_UNMAPPED |
+		    B_KVAALLOC)) == B_UNMAPPED) {
+			if (allocbufkva(bp, maxsize, gbflags)) {
 				defrag = 1;
 				bp->b_flags |= B_INVAL;
 				brelse(bp);
 				goto restart;
 			}
-			rv = vm_map_insert(buffer_map, NULL, 0, addr,
-			    addr + maxsize, VM_PROT_ALL, VM_PROT_ALL,
-			    MAP_NOFAULT);
-			KASSERT(rv == KERN_SUCCESS,
-			    ("vm_map_insert(buffer_map) rv %d", rv));
-			vm_map_unlock(buffer_map);
-			bp->b_kvabase = (caddr_t)addr;
-			bp->b_kvasize = maxsize;
-			atomic_add_long(&bufspace, bp->b_kvasize);
 			atomic_add_int(&bufreusecnt, 1);
+		} else if ((bp->b_flags & B_KVAALLOC) != 0 &&
+		    (gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == 0) {
+			/*
+			 * If the reused buffer has KVA allocated,
+			 * reassign b_kvaalloc to b_kvabase.
+			 */
+			bp->b_kvabase = bp->b_kvaalloc;
+			bp->b_flags &= ~B_KVAALLOC;
+			atomic_subtract_long(&unmapped_bufspace,
+			    bp->b_kvasize);
+			atomic_add_int(&bufreusecnt, 1);
+		} else if ((bp->b_flags & (B_UNMAPPED | B_KVAALLOC)) == 0 &&
+		    (gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == (GB_UNMAPPED |
+		    GB_KVAALLOC)) {
+			/*
+			 * The reused buffer already has KVA mapped,
+			 * but the request is for an unmapped buffer
+			 * with KVA allocated.
+			 */
+			bp->b_kvaalloc = bp->b_kvabase;
+			bp->b_data = bp->b_kvabase = unmapped_buf;
+			bp->b_flags |= B_UNMAPPED | B_KVAALLOC;
+			atomic_add_long(&unmapped_bufspace,
+			    bp->b_kvasize);
+			atomic_add_int(&bufreusecnt, 1);
 		}
-		bp->b_saveaddr = bp->b_kvabase;
-		bp->b_data = bp->b_saveaddr;
+		if ((gbflags & GB_UNMAPPED) == 0) {
+			bp->b_saveaddr = bp->b_kvabase;
+			bp->b_data = bp->b_saveaddr;
+			bp->b_flags &= ~B_UNMAPPED;
+			BUF_CHECK_MAPPED(bp);
+		}
 	}
 	return (bp);
 }
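
The tail of getnewbuf() now has to reconcile the KVA state a recycled buffer arrived with against what the caller requested through GB_UNMAPPED/GB_KVAALLOC. The decision table below is a summary of the three branches above rather than kernel code; the flag values are invented and only the returned strings describe what the real branches do.

#include <stdio.h>

#define	B_UNMAPPED	0x1	/* invented values for the example */
#define	B_KVAALLOC	0x2
#define	GB_UNMAPPED	0x1
#define	GB_KVAALLOC	0x2

static const char *
kva_action(int bflags, int gbflags, int size_changed)
{
	if (size_changed ||
	    (bflags & (B_UNMAPPED | B_KVAALLOC)) == B_UNMAPPED)
		return ("allocate fresh KVA (may set defrag and restart)");
	if ((bflags & B_KVAALLOC) != 0 &&
	    (gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == 0)
		return ("reuse reserved KVA: b_kvaalloc -> b_kvabase");
	if ((bflags & (B_UNMAPPED | B_KVAALLOC)) == 0 &&
	    (gbflags & (GB_UNMAPPED | GB_KVAALLOC)) ==
	    (GB_UNMAPPED | GB_KVAALLOC))
		return ("park mapping: b_kvabase -> b_kvaalloc, "
		    "b_data = unmapped_buf");
	return ("keep current KVA");
}

int
main(void)
{
	printf("%s\n", kva_action(0, 0, 1));
	printf("%s\n", kva_action(B_UNMAPPED | B_KVAALLOC, 0, 0));
	printf("%s\n", kva_action(0, GB_UNMAPPED | GB_KVAALLOC, 0));
	return (0);
}
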
@@ -2251,17 +2565,11 @@
 SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp);
 
 static int
-buf_do_flush(struct vnode *vp)
+buf_flush(struct vnode *vp, int target)
 {
 	int flushed;
 
-	flushed = flushbufqueues(vp, QUEUE_DIRTY, 0);
-	/* The list empty check here is slightly racy */
-	if (!TAILQ_EMPTY(&bufqueues[QUEUE_DIRTY_GIANT])) {
-		mtx_lock(&Giant);
-		flushed += flushbufqueues(vp, QUEUE_DIRTY_GIANT, 0);
-		mtx_unlock(&Giant);
-	}
+	flushed = flushbufqueues(vp, target, 0);
 	if (flushed == 0) {
 		/*
 		 * Could not find any buffers without rollback
@@ -2268,13 +2576,9 @@
 		 * dependencies, so just write the first one
 		 * in the hopes of eventually making progress.
 		 */
-		flushbufqueues(vp, QUEUE_DIRTY, 1);
-		if (!TAILQ_EMPTY(
-			    &bufqueues[QUEUE_DIRTY_GIANT])) {
-			mtx_lock(&Giant);
-			flushbufqueues(vp, QUEUE_DIRTY_GIANT, 1);
-			mtx_unlock(&Giant);
-		}
+		if (vp != NULL && target > 2)
+			target /= 2;
+		flushbufqueues(vp, target, 1);
 	}
 	return (flushed);
 }
@@ -2282,7 +2586,7 @@
 static void
 buf_daemon()
 {
-	int lodirtysave;
+	int lodirty;
 
 	/*
 	 * This process needs to be suspended prior to shutdown sync.
@@ -2300,23 +2604,21 @@
 		mtx_unlock(&bdlock);
 
 		kproc_suspend_check(bufdaemonproc);
-		lodirtysave = lodirtybuffers;
+		lodirty = lodirtybuffers;
 		if (bd_speedupreq) {
-			lodirtybuffers = numdirtybuffers / 2;
+			lodirty = numdirtybuffers / 2;
 			bd_speedupreq = 0;
 		}
 		/*
 		 * Do the flush.  Limit the amount of in-transit I/O we
 		 * allow to build up, otherwise we would completely saturate
-		 * the I/O system.  Wakeup any waiting processes before we
-		 * normally would so they can run in parallel with our drain.
+		 * the I/O system.
 		 */
-		while (numdirtybuffers > lodirtybuffers) {
-			if (buf_do_flush(NULL) == 0)
+		while (numdirtybuffers > lodirty) {
+			if (buf_flush(NULL, numdirtybuffers - lodirty) == 0)
 				break;
-			kern_yield(PRI_UNCHANGED);
+			kern_yield(PRI_USER);
 		}
-		lodirtybuffers = lodirtysave;
 
 		/*
 		 * Only clear bd_request if we have reached our low water
@@ -2325,8 +2627,8 @@
 		 * built up, within reason.
 		 *
 		 * If we were unable to hit our low water mark and couldn't
-		 * find any flushable buffers, we sleep half a second.
-		 * Otherwise we loop immediately.
+		 * find any flushable buffers, we sleep for a short period
+		 * to avoid endless loops on unlockable buffers.
 		 */
 		mtx_lock(&bdlock);
 		if (numdirtybuffers <= lodirtybuffers) {
@@ -2336,6 +2638,14 @@
 			 * The sleep is just so the suspend code works.
 			 */
 			bd_request = 0;
+			/*
+			 * Do an extra wakeup in case dirty threshold
+			 * changed via sysctl and the explicit transition
+			 * out of shortfall was missed.
+			 */
+			bdirtywakeup();
+			if (runningbufspace <= lorunningspace)
+				runningwakeup();
 			msleep(&bd_request, &bdlock, PVM, "psleep", hz);
 		} else {
 			/*
@@ -2360,7 +2670,7 @@
     0, "Number of buffers flushed with dependecies that require rollbacks");
 
 static int
-flushbufqueues(struct vnode *lvp, int queue, int flushdeps)
+flushbufqueues(struct vnode *lvp, int target, int flushdeps)
 {
 	struct buf *sentinel;
 	struct vnode *vp;
@@ -2368,61 +2678,63 @@
 	struct buf *bp;
 	int hasdeps;
 	int flushed;
-	int target;
+	int queue;
+	int error;
+	bool unlock;
 
-	if (lvp == NULL) {
-		target = numdirtybuffers - lodirtybuffers;
-		if (flushdeps && target > 2)
-			target /= 2;
-	} else
-		target = flushbufqtarget;
 	flushed = 0;
+	queue = QUEUE_DIRTY;
 	bp = NULL;
 	sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO);
 	sentinel->b_qindex = QUEUE_SENTINEL;
-	mtx_lock(&bqlock);
+	mtx_lock(&bqdirty);
 	TAILQ_INSERT_HEAD(&bufqueues[queue], sentinel, b_freelist);
+	mtx_unlock(&bqdirty);
 	while (flushed != target) {
+		maybe_yield();
+		mtx_lock(&bqdirty);
 		bp = TAILQ_NEXT(sentinel, b_freelist);
 		if (bp != NULL) {
 			TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist);
 			TAILQ_INSERT_AFTER(&bufqueues[queue], bp, sentinel,
 			    b_freelist);
-		} else
+		} else {
+			mtx_unlock(&bqdirty);
 			break;
+		}
 		/*
 		 * Skip sentinels inserted by other invocations of the
 		 * flushbufqueues(), taking care to not reorder them.
-		 */
-		if (bp->b_qindex == QUEUE_SENTINEL)
-			continue;
-		/*
+		 *
 		 * Only flush the buffers that belong to the
 		 * vnode locked by the curthread.
 		 */
-		if (lvp != NULL && bp->b_vp != lvp)
+		if (bp->b_qindex == QUEUE_SENTINEL || (lvp != NULL &&
+		    bp->b_vp != lvp)) {
+			mtx_unlock(&bqdirty);
+ 			continue;
+		}
+		error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL);
+		mtx_unlock(&bqdirty);
+		if (error != 0)
 			continue;
-		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
-			continue;
 		if (bp->b_pin_count > 0) {
 			BUF_UNLOCK(bp);
 			continue;
 		}
-		BO_LOCK(bp->b_bufobj);
+		/*
+		 * BKGRDINPROG can only be set with the buf and bufobj
+		 * locks both held.  We tolerate a race to clear it here.
+		 */
 		if ((bp->b_vflags & BV_BKGRDINPROG) != 0 ||
 		    (bp->b_flags & B_DELWRI) == 0) {
-			BO_UNLOCK(bp->b_bufobj);
 			BUF_UNLOCK(bp);
 			continue;
 		}
-		BO_UNLOCK(bp->b_bufobj);
 		if (bp->b_flags & B_INVAL) {
-			bremfreel(bp);
-			mtx_unlock(&bqlock);
+			bremfreef(bp);
 			brelse(bp);
 			flushed++;
-			numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2);
-			mtx_lock(&bqlock);
 			continue;
 		}
 
@@ -2449,19 +2761,28 @@
 			BUF_UNLOCK(bp);
 			continue;
 		}
-		if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_CANRECURSE) == 0) {
-			mtx_unlock(&bqlock);
+		if (lvp == NULL) {
+			unlock = true;
+			error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT);
+		} else {
+			ASSERT_VOP_LOCKED(vp, "getbuf");
+			unlock = false;
+			error = VOP_ISLOCKED(vp) == LK_EXCLUSIVE ? 0 :
+			    vn_lock(vp, LK_TRYUPGRADE);
+		}
+		if (error == 0) {
 			CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X",
 			    bp, bp->b_vp, bp->b_flags);
-			if (curproc == bufdaemonproc)
+			if (curproc == bufdaemonproc) {
 				vfs_bio_awrite(bp);
-			else {
+			} else {
 				bremfree(bp);
 				bwrite(bp);
-				notbufdflashes++;
+				notbufdflushes++;
 			}
 			vn_finished_write(mp);
-			VOP_UNLOCK(vp, 0);
+			if (unlock)
+				VOP_UNLOCK(vp, 0);
 			flushwithdeps += hasdeps;
 			flushed++;
 
@@ -2469,17 +2790,17 @@
 			 * Sleeping on runningbufspace while holding
 			 * vnode lock leads to deadlock.
 			 */
-			if (curproc == bufdaemonproc)
+			if (curproc == bufdaemonproc &&
+			    runningbufspace > hirunningspace)
 				waitrunningbufspace();
-			numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2);
-			mtx_lock(&bqlock);
 			continue;
 		}
 		vn_finished_write(mp);
 		BUF_UNLOCK(bp);
 	}
+	mtx_lock(&bqdirty);
 	TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist);
-	mtx_unlock(&bqlock);
+	mtx_unlock(&bqdirty);
 	free(sentinel, M_TEMP);
 	return (flushed);
 }
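
flushbufqueues() keeps its place in the dirty queue with a per-invocation sentinel buffer, so bqdirty can be dropped while each candidate is locked and written; concurrent scans simply step over each other's sentinels (QUEUE_SENTINEL). A minimal user-space sketch of the same walk, with the locking and real I/O reduced to comments:

#include <stdio.h>
#include <stdlib.h>
#include <sys/queue.h>

struct elem {
	TAILQ_ENTRY(elem) link;
	int is_sentinel;
	int value;
};

static TAILQ_HEAD(, elem) q = TAILQ_HEAD_INITIALIZER(q);

static void
scan(void)
{
	struct elem *sentinel, *e;

	sentinel = calloc(1, sizeof(*sentinel));
	sentinel->is_sentinel = 1;
	TAILQ_INSERT_HEAD(&q, sentinel, link);
	for (;;) {
		/* lock queue */
		e = TAILQ_NEXT(sentinel, link);
		if (e == NULL)
			break;
		/* Advance the sentinel past the element we will visit. */
		TAILQ_REMOVE(&q, sentinel, link);
		TAILQ_INSERT_AFTER(&q, e, sentinel, link);
		if (e->is_sentinel)
			continue;	/* another scan's marker, skip it */
		/* unlock queue, process e (may sleep), relock */
		printf("visit %d\n", e->value);
	}
	TAILQ_REMOVE(&q, sentinel, link);
	free(sentinel);
}

int
main(void)
{
	struct elem *e;
	int i;

	for (i = 1; i <= 3; i++) {
		e = calloc(1, sizeof(*e));
		e->value = i;
		TAILQ_INSERT_TAIL(&q, e, link);
	}
	scan();
	return (0);
}
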
@@ -2492,9 +2813,9 @@
 {
 	struct buf *bp;
 
-	BO_LOCK(bo);
+	BO_RLOCK(bo);
 	bp = gbincore(bo, blkno);
-	BO_UNLOCK(bo);
+	BO_RUNLOCK(bo);
 	return (bp);
 }
 
@@ -2527,7 +2848,7 @@
 		size = vp->v_mount->mnt_stat.f_iosize;
 	off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize;
 
-	VM_OBJECT_LOCK(obj);
+	VM_OBJECT_RLOCK(obj);
 	for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
 		m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
 		if (!m)
@@ -2539,11 +2860,11 @@
 		    (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0)
 			goto notinmem;
 	}
-	VM_OBJECT_UNLOCK(obj);
+	VM_OBJECT_RUNLOCK(obj);
 	return 1;
 
 notinmem:
-	VM_OBJECT_UNLOCK(obj);
+	VM_OBJECT_RUNLOCK(obj);
 	return (0);
 }
 
@@ -2573,7 +2894,7 @@
 	KASSERT(bp->b_offset != NOOFFSET,
 	    ("vfs_clean_pages_dirty_buf: no buffer offset"));
 
-	VM_OBJECT_LOCK(bp->b_bufobj->bo_object);
+	VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
 	vfs_drain_busy_pages(bp);
 	vfs_setdirty_locked_object(bp);
 	for (i = 0; i < bp->b_npages; i++) {
@@ -2586,7 +2907,7 @@
 		/* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */
 		foff = noff;
 	}
-	VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object);
+	VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
 }
 
 static void
@@ -2596,7 +2917,7 @@
 	int i;
 
 	object = bp->b_bufobj->bo_object;
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+	VM_OBJECT_ASSERT_WLOCKED(object);
 
 	/*
 	 * We qualify the scan for modified pages on whether the
@@ -2653,6 +2974,90 @@
 }
 
 /*
+ * Allocate the KVA mapping for an existing buffer. It handles the
+ * cases of both B_UNMAPPED buffer, and buffer with the preallocated
+ * KVA which is not mapped (B_KVAALLOC).
+ */
+static void
+bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags)
+{
+	struct buf *scratch_bp;
+	int bsize, maxsize, need_mapping, need_kva;
+	off_t offset;
+
+	need_mapping = (bp->b_flags & B_UNMAPPED) != 0 &&
+	    (gbflags & GB_UNMAPPED) == 0;
+	need_kva = (bp->b_flags & (B_KVAALLOC | B_UNMAPPED)) == B_UNMAPPED &&
+	    (gbflags & GB_KVAALLOC) != 0;
+	if (!need_mapping && !need_kva)
+		return;
+
+	BUF_CHECK_UNMAPPED(bp);
+
+	if (need_mapping && (bp->b_flags & B_KVAALLOC) != 0) {
+		/*
+		 * Buffer is not mapped, but the KVA was already
+		 * reserved at the time of the instantiation.  Use the
+		 * allocated space.
+		 */
+		bp->b_flags &= ~B_KVAALLOC;
+		KASSERT(bp->b_kvaalloc != 0, ("kvaalloc == 0"));
+		bp->b_kvabase = bp->b_kvaalloc;
+		atomic_subtract_long(&unmapped_bufspace, bp->b_kvasize);
+		goto has_addr;
+	}
+
+	/*
+	 * Calculate the amount of the address space we would reserve
+	 * if the buffer was mapped.
+	 */
+	bsize = vn_isdisk(bp->b_vp, NULL) ? DEV_BSIZE : bp->b_bufobj->bo_bsize;
+	offset = blkno * bsize;
+	maxsize = size + (offset & PAGE_MASK);
+	maxsize = imax(maxsize, bsize);
+
+mapping_loop:
+	if (allocbufkva(bp, maxsize, gbflags)) {
+		/*
+		 * Request defragmentation. getnewbuf() returns us the
+		 * allocated space by the scratch buffer KVA.
+		 */
+		scratch_bp = getnewbuf(bp->b_vp, 0, 0, size, maxsize, gbflags |
+		    (GB_UNMAPPED | GB_KVAALLOC));
+		if (scratch_bp == NULL) {
+			if ((gbflags & GB_NOWAIT_BD) != 0) {
+				/*
+				 * XXXKIB: defragmentation cannot
+				 * succeed, not sure what else to do.
+				 */
+				panic("GB_NOWAIT_BD and B_UNMAPPED %p", bp);
+			}
+			atomic_add_int(&mappingrestarts, 1);
+			goto mapping_loop;
+		}
+		KASSERT((scratch_bp->b_flags & B_KVAALLOC) != 0,
+		    ("scratch bp !B_KVAALLOC %p", scratch_bp));
+		setbufkva(bp, (vm_offset_t)scratch_bp->b_kvaalloc,
+		    scratch_bp->b_kvasize, gbflags);
+
+		/* Get rid of the scratch buffer. */
+		scratch_bp->b_kvasize = 0;
+		scratch_bp->b_flags |= B_INVAL;
+		scratch_bp->b_flags &= ~(B_UNMAPPED | B_KVAALLOC);
+		brelse(scratch_bp);
+	}
+	if (!need_mapping)
+		return;
+
+has_addr:
+	bp->b_saveaddr = bp->b_kvabase;
+	bp->b_data = bp->b_saveaddr; /* b_offset is handled by bpmap_qenter */
+	bp->b_flags &= ~B_UNMAPPED;
+	BUF_CHECK_MAPPED(bp);
+	bpmap_qenter(bp);
+}
+
+/*
  *	getblk:
  *
  *	Get a block given a specified block and offset into a file/device.
@@ -2684,7 +3089,7 @@
  *	to clear B_INVAL.  If the caller does this without issuing an I/O, 
  *	the caller should set B_CACHE ( as an optimization ), else the caller
  *	should issue the I/O and biodone() will set B_CACHE if the I/O was
- *	a write attempt or if it was a successfull read.  If the caller 
+ *	a write attempt or if it was a successful read.  If the caller 
  *	intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR
  *	prior to issuing the READ.  biodone() will *not* clear B_INVAL.
  */
@@ -2694,38 +3099,28 @@
 {
 	struct buf *bp;
 	struct bufobj *bo;
-	int error;
+	int bsize, error, maxsize, vmio;
+	off_t offset;
 
 	CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size);
+	KASSERT((flags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
+	    ("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
 	ASSERT_VOP_LOCKED(vp, "getblk");
-	if (size > MAXBSIZE)
-		panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE);
+	if (size > MAXBCACHEBUF)
+		panic("getblk: size(%d) > MAXBCACHEBUF(%d)\n", size,
+		    MAXBCACHEBUF);
+	if (!unmapped_buf_allowed)
+		flags &= ~(GB_UNMAPPED | GB_KVAALLOC);
 
 	bo = &vp->v_bufobj;
 loop:
-	/*
-	 * Block if we are low on buffers.   Certain processes are allowed
-	 * to completely exhaust the buffer cache.
-         *
-         * If this check ever becomes a bottleneck it may be better to
-         * move it into the else, when gbincore() fails.  At the moment
-         * it isn't a problem.
-         */
-	if (numfreebuffers == 0) {
-		if (TD_IS_IDLETHREAD(curthread))
-			return NULL;
-		mtx_lock(&nblock);
-		needsbuffer |= VFS_BIO_NEED_ANY;
-		mtx_unlock(&nblock);
-	}
-
-	BO_LOCK(bo);
+	BO_RLOCK(bo);
 	bp = gbincore(bo, blkno);
 	if (bp != NULL) {
 		int lockflags;
 		/*
-		 * Buffer is in-core.  If the buffer is not busy, it must
-		 * be on a queue.
+		 * Buffer is in-core.  If the buffer is not busy nor managed,
+		 * it must be on a queue.
 		 */
 		lockflags = LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK;
 
@@ -2733,7 +3128,7 @@
 			lockflags |= LK_NOWAIT;
 
 		error = BUF_TIMELOCK(bp, lockflags,
-		    BO_MTX(bo), "getblk", slpflag, slptimeo);
+		    BO_LOCKPTR(bo), "getblk", slpflag, slptimeo);
 
 		/*
 		 * If we slept and got the lock we have to restart in case
@@ -2758,9 +3153,10 @@
 			bp->b_flags &= ~B_CACHE;
 		else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0)
 			bp->b_flags |= B_CACHE;
-		BO_LOCK(bo);
-		bremfree(bp);
-		BO_UNLOCK(bo);
+		if (bp->b_flags & B_MANAGED)
+			MPASS(bp->b_qindex == QUEUE_NONE);
+		else
+			bremfree(bp);
 
 		/*
 		 * check for size inconsistencies for non-VMIO case.
@@ -2798,12 +3194,18 @@
 		}
 
 		/*
-		 * If the size is inconsistant in the VMIO case, we can resize
+		 * Handle the case of unmapped buffer which should
+		 * become mapped, or the buffer for which KVA
+		 * reservation is requested.
+		 */
+		bp_unmapped_get_kva(bp, blkno, size, flags);
+
+		/*
+		 * If the size is inconsistent in the VMIO case, we can resize
 		 * the buffer.  This might lead to B_CACHE getting set or
 		 * cleared.  If the size has not changed, B_CACHE remains
 		 * unchanged from its previous state.
 		 */
-
 		if (bp->b_bcount != size)
 			allocbuf(bp, size);
 
@@ -2844,15 +3246,12 @@
 		}
 		bp->b_flags &= ~B_DONE;
 	} else {
-		int bsize, maxsize, vmio;
-		off_t offset;
-
 		/*
 		 * Buffer is not in-core, create new buffer.  The buffer
 		 * returned by getnewbuf() is locked.  Note that the returned
 		 * buffer is also considered valid (not marked B_INVAL).
 		 */
-		BO_UNLOCK(bo);
+		BO_RUNLOCK(bo);
 		/*
 		 * If the user does not want us to create the buffer, bail out
 		 * here.
@@ -2859,10 +3258,19 @@
 		 */
 		if (flags & GB_NOCREAT)
 			return NULL;
+		if (numfreebuffers == 0 && TD_IS_IDLETHREAD(curthread))
+			return NULL;
+
 		bsize = vn_isdisk(vp, NULL) ? DEV_BSIZE : bo->bo_bsize;
 		offset = blkno * bsize;
 		vmio = vp->v_object != NULL;
-		maxsize = vmio ? size + (offset & PAGE_MASK) : size;
+		if (vmio) {
+			maxsize = size + (offset & PAGE_MASK);
+		} else {
+			maxsize = size;
+			/* Do not allow non-VMIO notmapped buffers. */
+			flags &= ~GB_UNMAPPED;
+		}
 		maxsize = imax(maxsize, bsize);
 
 		bp = getnewbuf(vp, slpflag, slptimeo, size, maxsize, flags);
@@ -2918,6 +3326,7 @@
 			KASSERT(bp->b_bufobj->bo_object == NULL,
 			    ("ARGH! has b_bufobj->bo_object %p %p\n",
 			    bp, bp->b_bufobj->bo_object));
+			BUF_CHECK_MAPPED(bp);
 		}
 
 		allocbuf(bp, size);
@@ -2961,7 +3370,7 @@
  * resize a buffer up or down.
  *
  * Note that this code is tricky, and has many complications to resolve
- * deadlock or inconsistant data situations.  Tread lightly!!! 
+ * deadlock or inconsistent data situations.  Tread lightly!!! 
  * There are B_CACHE and B_DELWRI interactions that must be dealt with by 
  * the caller.  Calling this code willy nilly can result in the loss of data.
  *
@@ -3093,11 +3502,15 @@
 			if (desiredpages < bp->b_npages) {
 				vm_page_t m;
 
-				pmap_qremove((vm_offset_t)trunc_page(
-				    (vm_offset_t)bp->b_data) +
-				    (desiredpages << PAGE_SHIFT),
-				    (bp->b_npages - desiredpages));
-				VM_OBJECT_LOCK(bp->b_bufobj->bo_object);
+				if ((bp->b_flags & B_UNMAPPED) == 0) {
+					BUF_CHECK_MAPPED(bp);
+					pmap_qremove((vm_offset_t)trunc_page(
+					    (vm_offset_t)bp->b_data) +
+					    (desiredpages << PAGE_SHIFT),
+					    (bp->b_npages - desiredpages));
+				} else
+					BUF_CHECK_UNMAPPED(bp);
+				VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
 				for (i = desiredpages; i < bp->b_npages; i++) {
 					/*
 					 * the page is not freed here -- it
@@ -3107,7 +3520,7 @@
 					m = bp->b_pages[i];
 					KASSERT(m != bogus_page,
 					    ("allocbuf: bogus page found"));
-					while (vm_page_sleep_if_busy(m, TRUE,
+					while (vm_page_sleep_if_busy(m,
 					    "biodep"))
 						continue;
 
@@ -3116,7 +3529,7 @@
 					vm_page_unwire(m, 0);
 					vm_page_unlock(m);
 				}
-				VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object);
+				VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
 				bp->b_npages = desiredpages;
 			}
 		} else if (size > bp->b_bcount) {
@@ -3137,7 +3550,7 @@
 
 			obj = bp->b_bufobj->bo_object;
 
-			VM_OBJECT_LOCK(obj);
+			VM_OBJECT_WLOCK(obj);
 			while (bp->b_npages < desiredpages) {
 				vm_page_t m;
 
@@ -3146,15 +3559,15 @@
 				 * here could interfere with paging I/O, no
 				 * matter which process we are.
 				 *
-				 * We can only test VPO_BUSY here.  Blocking on
-				 * m->busy might lead to a deadlock:
-				 *  vm_fault->getpages->cluster_read->allocbuf
-				 * Thus, we specify VM_ALLOC_IGN_SBUSY.
+				 * Only exclusive busy can be tested here.
+				 * Blocking on shared busy might lead to
+				 * deadlocks once allocbuf() is called after
+				 * pages are vfs_busy_pages().
 				 */
 				m = vm_page_grab(obj, OFF_TO_IDX(bp->b_offset) +
 				    bp->b_npages, VM_ALLOC_NOBUSY |
 				    VM_ALLOC_SYSTEM | VM_ALLOC_WIRED |
-				    VM_ALLOC_RETRY | VM_ALLOC_IGN_SBUSY |
+				    VM_ALLOC_IGN_SBUSY |
 				    VM_ALLOC_COUNT(desiredpages - bp->b_npages));
 				if (m->valid == 0)
 					bp->b_flags &= ~B_CACHE;
@@ -3199,24 +3612,15 @@
 				toff += tinc;
 				tinc = PAGE_SIZE;
 			}
-			VM_OBJECT_UNLOCK(obj);
+			VM_OBJECT_WUNLOCK(obj);
 
 			/*
-			 * Step 3, fixup the KVM pmap.  Remember that
-			 * bp->b_data is relative to bp->b_offset, but 
-			 * bp->b_offset may be offset into the first page.
+			 * Step 3, fixup the KVM pmap.
 			 */
-
-			bp->b_data = (caddr_t)
-			    trunc_page((vm_offset_t)bp->b_data);
-			pmap_qenter(
-			    (vm_offset_t)bp->b_data,
-			    bp->b_pages, 
-			    bp->b_npages
-			);
-			
-			bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | 
-			    (vm_offset_t)(bp->b_offset & PAGE_MASK));
+			if ((bp->b_flags & B_UNMAPPED) == 0)
+				bpmap_qenter(bp);
+			else
+				BUF_CHECK_UNMAPPED(bp);
 		}
 	}
 	if (newbsize < bp->b_bufsize)
@@ -3226,28 +3630,39 @@
 	return 1;
 }
 
+extern int inflight_transient_maps;
+
 void
 biodone(struct bio *bp)
 {
 	struct mtx *mtxp;
 	void (*done)(struct bio *);
+	vm_offset_t start, end;
 
-	mtxp = mtx_pool_find(mtxpool_sleep, bp);
-	mtx_lock(mtxp);
-	bp->bio_flags |= BIO_DONE;
+	if ((bp->bio_flags & BIO_TRANSIENT_MAPPING) != 0) {
+		bp->bio_flags &= ~BIO_TRANSIENT_MAPPING;
+		bp->bio_flags |= BIO_UNMAPPED;
+		start = trunc_page((vm_offset_t)bp->bio_data);
+		end = round_page((vm_offset_t)bp->bio_data + bp->bio_length);
+		pmap_qremove(start, OFF_TO_IDX(end - start));
+		vmem_free(transient_arena, start, end - start);
+		atomic_add_int(&inflight_transient_maps, -1);
+	}
 	done = bp->bio_done;
-	if (done == NULL)
+	if (done == NULL) {
+		mtxp = mtx_pool_find(mtxpool_sleep, bp);
+		mtx_lock(mtxp);
+		bp->bio_flags |= BIO_DONE;
 		wakeup(bp);
-	mtx_unlock(mtxp);
-	if (done != NULL)
+		mtx_unlock(mtxp);
+	} else {
+		bp->bio_flags |= BIO_DONE;
 		done(bp);
+	}
 }
 
 /*
  * Wait for a BIO to finish.
- *
- * XXX: resort to a timeout for now.  The optimal locking (if any) for this
- * case is not yet clear.
  */
 int
 biowait(struct bio *bp, const char *wchan)
@@ -3257,7 +3672,7 @@
 	mtxp = mtx_pool_find(mtxpool_sleep, bp);
 	mtx_lock(mtxp);
 	while ((bp->bio_flags & BIO_DONE) == 0)
-		msleep(bp, mtxp, PRIBIO, wchan, hz / 10);
+		msleep(bp, mtxp, PRIBIO, wchan, 0);
 	mtx_unlock(mtxp);
 	if (bp->bio_error != 0)
 		return (bp->bio_error);
@@ -3366,7 +3781,7 @@
 	bip->bio_offset = bp->b_iooffset;
 	bip->bio_length = bp->b_bcount;
 	bip->bio_bcount = bp->b_bcount;	/* XXX: remove */
-	bip->bio_data = bp->b_data;
+	bdata2bio(bp, bip);
 	bip->bio_done = bufdonebio;
 	bip->bio_caller2 = bp;
 	bip->bio_dev = dev;
@@ -3385,11 +3800,11 @@
  *	assuming B_INVAL is clear.
  *
  *	For the VMIO case, we set B_CACHE if the op was a read and no
- *	read error occured, or if the op was a write.  B_CACHE is never
+ *	read error occurred, or if the op was a write.  B_CACHE is never
  *	set if the buffer is invalid or otherwise uncacheable.
  *
  *	biodone does not mess with B_INVAL, allowing the I/O routine or the
- *	initiator to leave B_INVAL set to brelse the buffer out of existance
+ *	initiator to leave B_INVAL set to brelse the buffer out of existence
  *	in the biodone routine.
  */
 void
@@ -3455,7 +3870,7 @@
 
 		/*
 		 * Set B_CACHE if the op was a normal read and no error
-		 * occured.  B_CACHE is set for writes in the b*write()
+		 * occurred.  B_CACHE is set for writes in the b*write()
 		 * routines.
 		 */
 		iosize = bp->b_bcount - bp->b_resid;
@@ -3465,7 +3880,7 @@
 			bp->b_flags |= B_CACHE;
 		}
 		bogus = 0;
-		VM_OBJECT_LOCK(obj);
+		VM_OBJECT_WLOCK(obj);
 		for (i = 0; i < bp->b_npages; i++) {
 			int bogusflag = 0;
 			int resid;
@@ -3501,16 +3916,18 @@
 				vfs_page_set_valid(bp, foff, m);
 			}
 
-			vm_page_io_finish(m);
+			vm_page_sunbusy(m);
 			vm_object_pip_subtract(obj, 1);
 			foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 			iosize -= resid;
 		}
 		vm_object_pip_wakeupn(obj, 0);
-		VM_OBJECT_UNLOCK(obj);
-		if (bogus)
+		VM_OBJECT_WUNLOCK(obj);
+		if (bogus && (bp->b_flags & B_UNMAPPED) == 0) {
+			BUF_CHECK_MAPPED(bp);
 			pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
 			    bp->b_pages, bp->b_npages);
+		}
 	}
 
 	/*
@@ -3531,7 +3948,7 @@
 /*
  * This routine is called in lieu of iodone in the case of
  * incomplete I/O.  This keeps the busy status for pages
- * consistant.
+ * consistent.
  */
 void
 vfs_unbusy_pages(struct buf *bp)
@@ -3545,7 +3962,7 @@
 		return;
 
 	obj = bp->b_bufobj->bo_object;
-	VM_OBJECT_LOCK(obj);
+	VM_OBJECT_WLOCK(obj);
 	for (i = 0; i < bp->b_npages; i++) {
 		m = bp->b_pages[i];
 		if (m == bogus_page) {
@@ -3553,14 +3970,18 @@
 			if (!m)
 				panic("vfs_unbusy_pages: page missing\n");
 			bp->b_pages[i] = m;
-			pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
-			    bp->b_pages, bp->b_npages);
+			if ((bp->b_flags & B_UNMAPPED) == 0) {
+				BUF_CHECK_MAPPED(bp);
+				pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
+				    bp->b_pages, bp->b_npages);
+			} else
+				BUF_CHECK_UNMAPPED(bp);
 		}
 		vm_object_pip_subtract(obj, 1);
-		vm_page_io_finish(m);
+		vm_page_sunbusy(m);
 	}
 	vm_object_pip_wakeupn(obj, 0);
-	VM_OBJECT_UNLOCK(obj);
+	VM_OBJECT_WUNLOCK(obj);
 }
 
 /*
@@ -3591,7 +4012,7 @@
 	 * entire page.
 	 */
 	if (eoff > off)
-		vm_page_set_valid(m, off & PAGE_MASK, eoff - off);
+		vm_page_set_valid_range(m, off & PAGE_MASK, eoff - off);
 }
 
 /*
@@ -3607,7 +4028,7 @@
 
 	/*
 	 * Start and end offsets in buffer.  eoff - soff may not cross a
-	 * page boundry or cross the end of the buffer.  The end of the
+	 * page boundary or cross the end of the buffer.  The end of the
 	 * buffer, in this case, is our file EOF, not the allocation size
 	 * of the buffer.
 	 */
@@ -3630,28 +4051,32 @@
 }
 
 /*
- * Ensure that all buffer pages are not busied by VPO_BUSY flag. If
- * any page is busy, drain the flag.
+ * Ensure that all buffer pages are not exclusive busied.  If any page is
+ * exclusive busy, drain it.
  */
-static void
+void
 vfs_drain_busy_pages(struct buf *bp)
 {
 	vm_page_t m;
 	int i, last_busied;
 
-	VM_OBJECT_LOCK_ASSERT(bp->b_bufobj->bo_object, MA_OWNED);
+	VM_OBJECT_ASSERT_WLOCKED(bp->b_bufobj->bo_object);
 	last_busied = 0;
 	for (i = 0; i < bp->b_npages; i++) {
 		m = bp->b_pages[i];
-		if ((m->oflags & VPO_BUSY) != 0) {
+		if (vm_page_xbusied(m)) {
 			for (; last_busied < i; last_busied++)
-				vm_page_busy(bp->b_pages[last_busied]);
-			while ((m->oflags & VPO_BUSY) != 0)
-				vm_page_sleep(m, "vbpage");
+				vm_page_sbusy(bp->b_pages[last_busied]);
+			while (vm_page_xbusied(m)) {
+				vm_page_lock(m);
+				VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
+				vm_page_busy_sleep(m, "vbpage", true);
+				VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
+			}
 		}
 	}
 	for (i = 0; i < last_busied; i++)
-		vm_page_wakeup(bp->b_pages[i]);
+		vm_page_sunbusy(bp->b_pages[i]);
 }
 
 /*
@@ -3658,12 +4083,12 @@
  * This routine is called before a device strategy routine.
  * It is used to tell the VM system that paging I/O is in
  * progress, and treat the pages associated with the buffer
- * almost as being VPO_BUSY.  Also the object paging_in_progress
+ * almost as being exclusive busy.  Also the object paging_in_progress
  * flag is handled to make sure that the object doesn't become
- * inconsistant.
+ * inconsistent.
  *
  * Since I/O has not been initiated yet, certain buffer flags
- * such as BIO_ERROR or B_INVAL may be in an inconsistant state
+ * such as BIO_ERROR or B_INVAL may be in an inconsistent state
  * and should be ignored.
  */
 void
@@ -3681,7 +4106,7 @@
 	foff = bp->b_offset;
 	KASSERT(bp->b_offset != NOOFFSET,
 	    ("vfs_busy_pages: no buffer offset"));
-	VM_OBJECT_LOCK(obj);
+	VM_OBJECT_WLOCK(obj);
 	vfs_drain_busy_pages(bp);
 	if (bp->b_bufsize != 0)
 		vfs_setdirty_locked_object(bp);
@@ -3691,7 +4116,7 @@
 
 		if ((bp->b_flags & B_CLUSTER) == 0) {
 			vm_object_pip_add(obj, 1);
-			vm_page_io_start(m);
+			vm_page_sbusy(m);
 		}
 		/*
 		 * When readying a buffer for a read ( i.e
@@ -3718,10 +4143,12 @@
 		}
 		foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 	}
-	VM_OBJECT_UNLOCK(obj);
-	if (bogus)
+	VM_OBJECT_WUNLOCK(obj);
+	if (bogus && (bp->b_flags & B_UNMAPPED) == 0) {
+		BUF_CHECK_MAPPED(bp);
 		pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
 		    bp->b_pages, bp->b_npages);
+	}
 }
 
 /*
@@ -3749,17 +4176,17 @@
 	base += (bp->b_offset & PAGE_MASK);
 	n = PAGE_SIZE - (base & PAGE_MASK);
 
-	VM_OBJECT_LOCK(bp->b_bufobj->bo_object);
+	VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
 	for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
 		m = bp->b_pages[i];
 		if (n > size)
 			n = size;
-		vm_page_set_valid(m, base & PAGE_MASK, n);
+		vm_page_set_valid_range(m, base & PAGE_MASK, n);
 		base += n;
 		size -= n;
 		n = PAGE_SIZE;
 	}
-	VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object);
+	VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
 }
 
 /*
@@ -3785,13 +4212,13 @@
 	}
 	bp->b_flags &= ~B_INVAL;
 	bp->b_ioflags &= ~BIO_ERROR;
-	VM_OBJECT_LOCK(bp->b_bufobj->bo_object);
+	VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
 	if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) &&
 	    (bp->b_offset & PAGE_MASK) == 0) {
 		if (bp->b_pages[0] == bogus_page)
 			goto unlock;
 		mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1;
-		VM_OBJECT_LOCK_ASSERT(bp->b_pages[0]->object, MA_OWNED);
+		VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[0]->object);
 		if ((bp->b_pages[0]->valid & mask) == mask)
 			goto unlock;
 		if ((bp->b_pages[0]->valid & mask) == 0) {
@@ -3811,7 +4238,7 @@
 			continue;
 		j = sa / DEV_BSIZE;
 		mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j;
-		VM_OBJECT_LOCK_ASSERT(bp->b_pages[i]->object, MA_OWNED);
+		VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[i]->object);
 		if ((bp->b_pages[i]->valid & mask) == mask)
 			continue;
 		if ((bp->b_pages[i]->valid & mask) == 0)
@@ -3827,10 +4254,34 @@
 		bp->b_pages[i]->valid |= mask;
 	}
 unlock:
-	VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object);
+	VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
 	bp->b_resid = 0;
 }
 
+void
+vfs_bio_bzero_buf(struct buf *bp, int base, int size)
+{
+	vm_page_t m;
+	int i, n;
+
+	if ((bp->b_flags & B_UNMAPPED) == 0) {
+		BUF_CHECK_MAPPED(bp);
+		bzero(bp->b_data + base, size);
+	} else {
+		BUF_CHECK_UNMAPPED(bp);
+		n = PAGE_SIZE - (base & PAGE_MASK);
+		for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
+			m = bp->b_pages[i];
+			if (n > size)
+				n = size;
+			pmap_zero_page_area(m, base & PAGE_MASK, n);
+			base += n;
+			size -= n;
+			n = PAGE_SIZE;
+		}
+	}
+}
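
vfs_bio_bzero_buf() has to zero an unmapped buffer page by page, since b_data is not usable; the (base, size) range is split into per-page chunks with the same arithmetic vfs_bio_set_valid() uses. A stand-alone demonstration of how that split behaves (a 4 KB PAGE_SIZE is assumed for the example):

#include <stdio.h>

#define	PAGE_SIZE	4096
#define	PAGE_MASK	(PAGE_SIZE - 1)

int
main(void)
{
	int base = 1000, size = 9000, i, n;

	n = PAGE_SIZE - (base & PAGE_MASK);
	for (i = base / PAGE_SIZE; size > 0; i++) {
		if (n > size)
			n = size;
		printf("page %d: offset %d, %d bytes\n",
		    i, base & PAGE_MASK, n);
		base += n;
		size -= n;
		n = PAGE_SIZE;
	}
	return (0);
}
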
+
 /*
  * vm_hold_load_pages and vm_hold_free_pages get pages into
  * a buffers address space.  The pages are anonymous and are
@@ -3843,6 +4294,8 @@
 	vm_page_t p;
 	int index;
 
+	BUF_CHECK_MAPPED(bp);
+
 	to = round_page(to);
 	from = round_page(from);
 	index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
@@ -3874,6 +4327,8 @@
 	vm_page_t p;
 	int index, newnpages;
 
+	BUF_CHECK_MAPPED(bp);
+
 	from = round_page((vm_offset_t)bp->b_data + newbsize);
 	newnpages = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
 	if (bp->b_npages > newnpages)
@@ -3881,7 +4336,7 @@
 	for (index = newnpages; index < bp->b_npages; index++) {
 		p = bp->b_pages[index];
 		bp->b_pages[index] = NULL;
-		if (p->busy != 0)
+		if (vm_page_sbusied(p))
 			printf("vm_hold_free_pages: blkno: %jd, lblkno: %jd\n",
 			    (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno);
 		p->wire_count--;
@@ -3904,7 +4359,7 @@
  * check the return value.
  */
 int
-vmapbuf(struct buf *bp)
+vmapbuf(struct buf *bp, int mapbuf)
 {
 	caddr_t kva;
 	vm_prot_t prot;
@@ -3919,12 +4374,19 @@
 	    (vm_offset_t)bp->b_data, bp->b_bufsize, prot, bp->b_pages,
 	    btoc(MAXPHYS))) < 0)
 		return (-1);
-	pmap_qenter((vm_offset_t)bp->b_saveaddr, bp->b_pages, pidx);
-	
-	kva = bp->b_saveaddr;
 	bp->b_npages = pidx;
-	bp->b_saveaddr = bp->b_data;
-	bp->b_data = kva + (((vm_offset_t) bp->b_data) & PAGE_MASK);
+	if (mapbuf || !unmapped_buf_allowed) {
+		pmap_qenter((vm_offset_t)bp->b_saveaddr, bp->b_pages, pidx);
+		kva = bp->b_saveaddr;
+		bp->b_saveaddr = bp->b_data;
+		bp->b_data = kva + (((vm_offset_t)bp->b_data) & PAGE_MASK);
+		bp->b_flags &= ~B_UNMAPPED;
+	} else {
+		bp->b_flags |= B_UNMAPPED;
+		bp->b_offset = ((vm_offset_t)bp->b_data) & PAGE_MASK;
+		bp->b_saveaddr = bp->b_data;
+		bp->b_data = unmapped_buf;
+	}
 	return(0);
 }
 
@@ -3938,7 +4400,10 @@
 	int npages;
 
 	npages = bp->b_npages;
-	pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages);
+	if (bp->b_flags & B_UNMAPPED)
+		bp->b_flags &= ~B_UNMAPPED;
+	else
+		pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages);
 	vm_page_unhold_pages(bp->b_pages, npages);
 	
 	bp->b_data = bp->b_saveaddr;
@@ -3994,7 +4459,7 @@
 {
 
 	KASSERT(bo != NULL, ("NULL bo in bufobj_wref"));
-	ASSERT_BO_LOCKED(bo);
+	ASSERT_BO_WLOCKED(bo);
 	bo->bo_numoutput++;
 }
 
@@ -4028,11 +4493,11 @@
 	int error;
 
 	KASSERT(bo != NULL, ("NULL bo in bufobj_wwait"));
-	ASSERT_BO_LOCKED(bo);
+	ASSERT_BO_WLOCKED(bo);
 	error = 0;
 	while (bo->bo_numoutput) {
 		bo->bo_flag |= BO_WWAIT;
-		error = msleep(&bo->bo_numoutput, BO_MTX(bo),
+		error = msleep(&bo->bo_numoutput, BO_LOCKPTR(bo),
 		    slpflag | (PRIBIO + 1), "bo_wwait", timeo);
 		if (error)
 			break;
@@ -4075,6 +4540,30 @@
 	mtx_unlock(mtxp);
 }
 
+/*
+ * Set bio_data or bio_ma for struct bio from the struct buf.
+ */
+void
+bdata2bio(struct buf *bp, struct bio *bip)
+{
+
+	if ((bp->b_flags & B_UNMAPPED) != 0) {
+		KASSERT(unmapped_buf_allowed, ("unmapped"));
+		bip->bio_ma = bp->b_pages;
+		bip->bio_ma_n = bp->b_npages;
+		bip->bio_data = unmapped_buf;
+		bip->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK;
+		bip->bio_flags |= BIO_UNMAPPED;
+		KASSERT(round_page(bip->bio_ma_offset + bip->bio_length) /
+		    PAGE_SIZE == bp->b_npages,
+		    ("Buffer %p too short: %d %lld %d", bp, bip->bio_ma_offset,
+		    (long long)bip->bio_length, bip->bio_ma_n));
+	} else {
+		bip->bio_data = bp->b_data;
+		bip->bio_ma = NULL;
+	}
+}
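bdata2bio() is the bridge to GEOM for unmapped I/O: an unmapped buffer hands its page array to the bio through bio_ma and sets BIO_UNMAPPED, while a mapped buffer keeps using bio_data. A minimal consumer sketch (illustrative only, not from this commit):

/*
 * Illustrative consumer: a strategy-style routine distinguishing the two
 * representations bdata2bio() can produce.
 */
static void
example_start_io(struct bio *bip)
{

    if ((bip->bio_flags & BIO_UNMAPPED) != 0) {
        /*
         * bio_data is not usable; the request is described by the
         * pages bio_ma[0 .. bio_ma_n - 1], starting bio_ma_offset
         * bytes into the first page.
         */
    } else {
        /* bio_data points at bio_length bytes of mapped kernel memory. */
    }
}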
+
 #include "opt_ddb.h"
 #ifdef DDB
 #include <ddb/ddb.h>
@@ -4166,7 +4655,7 @@
 
 	for (i = 0; i < nbuf; i++) {
 		bp = &buf[i];
-		if ((bp->b_vflags & BV_INFREECNT) != 0)
+		if ((bp->b_flags & B_INFREECNT) != 0)
 			nfree++;
 		else
 			used++;

Modified: trunk/sys/kern/vfs_cache.c
===================================================================
--- trunk/sys/kern/vfs_cache.c	2018-05-25 21:07:58 UTC (rev 9951)
+++ trunk/sys/kern/vfs_cache.c	2018-05-26 14:24:52 UTC (rev 9952)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1989, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
@@ -33,7 +34,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/vfs_cache.c 324611 2017-10-13 21:58:44Z jhb $");
 
 #include "opt_kdtrace.h"
 #include "opt_ktrace.h"
@@ -62,28 +63,28 @@
 #include <vm/uma.h>
 
 SDT_PROVIDER_DECLARE(vfs);
-SDT_PROBE_DEFINE3(vfs, namecache, enter, done, done, "struct vnode *", "char *",
+SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
     "struct vnode *");
-SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, done, "struct vnode *",
+SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
     "char *");
-SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, entry, "struct vnode *");
-SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, hit, "struct vnode *",
-    "struct char *", "struct vnode *");
-SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, miss, "struct vnode *");
-SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, return, "int",
-    "struct vnode *", "struct char *");
-SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, hit, "struct vnode *", "char *",
+SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
+SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
+    "char *", "struct vnode *");
+SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
+SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
+    "struct vnode *", "char *");
+SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
     "struct vnode *");
-SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit_negative, hit-negative,
+SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
     "struct vnode *", "char *");
-SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, miss, "struct vnode *",
+SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
     "char *");
-SDT_PROBE_DEFINE1(vfs, namecache, purge, done, done, "struct vnode *");
-SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, done, "struct vnode *");
-SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, done, "struct mount *");
-SDT_PROBE_DEFINE3(vfs, namecache, zap, done, done, "struct vnode *", "char *",
+SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
+SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
+SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
+SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
     "struct vnode *");
-SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, done, "struct vnode *",
+SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
     "char *");
 
 /*
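The namecache DTrace probes drop the duplicated name argument from SDT_PROBE_DEFINEn(), and the call sites below move from the five-argument SDT_PROBE() (padded with zeros) to the argument-counted SDT_PROBEn() macros. With an illustrative probe name, the new shape is:

/* Definition: the probe name appears once, followed only by argument types. */
SDT_PROBE_DEFINE2(vfs, namecache, example, done, "struct vnode *", "char *");

/* Call site: the macro suffix matches the number of arguments passed. */
SDT_PROBE2(vfs, namecache, example, done, dvp, name);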
@@ -150,7 +151,7 @@
  */
 
 /*
- * Structures associated with name cacheing.
+ * Structures associated with name caching.
  */
 #define NCHHASH(hash) \
 	(&nchashtbl[(hash) & nchash])
@@ -265,7 +266,7 @@
     "VFS namecache enabled");
 
 /* Export size information to userland */
-SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, 0,
+SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
     sizeof(struct namecache), "sizeof(struct namecache)");
 
 /*
@@ -289,7 +290,7 @@
     "Number of cache misses");
 static u_long nummisszap; STATNODE(CTLFLAG_RD, nummisszap, &nummisszap,
     "Number of cache misses we do not want to cache");
-static u_long numposzaps; STATNODE(CTLFLAG_RD, numposzaps, &numposzaps, 
+static u_long numposzaps; STATNODE(CTLFLAG_RD, numposzaps, &numposzaps,
     "Number of cache hits (positive) we do not want to cache");
 static u_long numposhits; STATNODE(CTLFLAG_RD, numposhits, &numposhits,
     "Number of cache hits (positive)");
@@ -304,8 +305,6 @@
     &nchstats, sizeof(nchstats), "LU",
     "VFS cache effectiveness statistics");
 
-
-
 static void cache_zap(struct namecache *ncp);
 static int vn_vptocnp_locked(struct vnode **vp, struct ucred *cred, char *buf,
     u_int *buflen);
@@ -324,29 +323,31 @@
 static int
 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
 {
-	int error;
 	struct nchashhead *ncpp;
 	struct namecache *ncp;
-	int n_nchash;
-	int count;
+	int i, error, n_nchash, *cntbuf;
 
+retry:
 	n_nchash = nchash + 1;	/* nchash is max index, not count */
 	if (!req->oldptr)
 		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
-
-	/* Scan hash tables for applicable entries */
-	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
-		CACHE_RLOCK();
-		count = 0;
-		LIST_FOREACH(ncp, ncpp, nc_hash) {
-			count++;
-		}
+	cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
+	CACHE_RLOCK();
+	if (n_nchash != nchash + 1) {
 		CACHE_RUNLOCK();
-		error = SYSCTL_OUT(req, &count, sizeof(count));
-		if (error)
-			return (error);
+		free(cntbuf, M_TEMP);
+		goto retry;
 	}
-	return (0);
+	/* Scan hash tables counting entries */
+	for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
+		LIST_FOREACH(ncp, ncpp, nc_hash)
+			cntbuf[i]++;
+	CACHE_RUNLOCK();
+	for (error = 0, i = 0; i < n_nchash; i++)
+		if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
+			break;
+	free(cntbuf, M_TEMP);
+	return (error);
 }
 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
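The rewritten handler no longer copies out to userland while holding the namecache lock: it sizes and allocates a count array first, takes the read lock, rechecks that the hash table was not resized in the meantime, fills the array, and only calls SYSCTL_OUT() after unlocking. A minimal userland sketch of that allocate/lock/recheck/retry shape, with stand-in lock and size routines (all names here are hypothetical):

#include <stdlib.h>

/* Stand-ins for the real lock and for reading the current table size. */
extern void table_rlock(void), table_runlock(void);
extern int table_size(void);

int *
snapshot_counts(int *np)
{
    int *buf, n;

retry:
    n = table_size();
    buf = calloc(n, sizeof(*buf));
    if (buf == NULL)
        return (NULL);
    table_rlock();
    if (n != table_size()) {    /* table was resized while unlocked */
        table_runlock();
        free(buf);
        goto retry;
    }
    /* ... count entries into buf[0..n-1] under the lock ... */
    table_runlock();
    *np = n;
    return (buf);               /* copy out after the lock is dropped */
}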
@@ -382,7 +383,7 @@
 			maxlength = count;
 	}
 	n_nchash = nchash + 1;
-	pct = (used * 100 * 100) / n_nchash;
+	pct = (used * 100) / (n_nchash / 100);
 	error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
 	if (error)
 		return (error);
@@ -399,7 +400,7 @@
 }
 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
-    "nchash chain lengths");
+    "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
 #endif
 
 /*
@@ -409,22 +410,19 @@
  *   pointer to a vnode or if it is just a negative cache entry.
  */
 static void
-cache_zap(ncp)
-	struct namecache *ncp;
+cache_zap(struct namecache *ncp)
 {
 	struct vnode *vp;
 
 	rw_assert(&cache_lock, RA_WLOCKED);
 	CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp, ncp->nc_vp);
-#ifdef KDTRACE_HOOKS
 	if (ncp->nc_vp != NULL) {
-		SDT_PROBE(vfs, namecache, zap, done, ncp->nc_dvp,
-		    nc_get_name(ncp), ncp->nc_vp, 0, 0);
+		SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
+		    nc_get_name(ncp), ncp->nc_vp);
 	} else {
-		SDT_PROBE(vfs, namecache, zap_negative, done, ncp->nc_dvp,
-		    nc_get_name(ncp), 0, 0, 0);
+		SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
+		    nc_get_name(ncp));
 	}
-#endif
 	vp = NULL;
 	LIST_REMOVE(ncp, nc_hash);
 	if (ncp->nc_flag & NCF_ISDOTDOT) {
@@ -447,7 +445,7 @@
 	}
 	numcache--;
 	cache_free(ncp);
-	if (vp)
+	if (vp != NULL)
 		vdrop(vp);
 }
 
@@ -458,7 +456,7 @@
  * cnp pointing to the name of the entry being sought. If the lookup
  * succeeds, the vnode is returned in *vpp, and a status of -1 is
  * returned. If the lookup determines that the name does not exist
- * (negative cacheing), a status of ENOENT is returned. If the lookup
+ * (negative caching), a status of ENOENT is returned. If the lookup
  * fails, a status of zero is returned.  If the directory vnode is
  * recycled out from under us due to a forced unmount, a status of
  * ENOENT is returned.
@@ -469,12 +467,8 @@
  */
 
 int
-cache_lookup_times(dvp, vpp, cnp, tsp, ticksp)
-	struct vnode *dvp;
-	struct vnode **vpp;
-	struct componentname *cnp;
-	struct timespec *tsp;
-	int *ticksp;
+cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
+    struct timespec *tsp, int *ticksp)
 {
 	struct namecache *ncp;
 	uint32_t hash;
@@ -497,8 +491,7 @@
 			CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .",
 			    dvp, cnp->cn_nameptr);
 			dothits++;
-			SDT_PROBE(vfs, namecache, lookup, hit, dvp, ".",
-			    *vpp, 0, 0);
+			SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
 			if (tsp != NULL)
 				timespecclear(tsp);
 			if (ticksp != NULL)
@@ -508,8 +501,8 @@
 		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
 			dotdothits++;
 			if (dvp->v_cache_dd == NULL) {
-				SDT_PROBE(vfs, namecache, lookup, miss, dvp,
-				    "..", NULL, 0, 0);
+				SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
+				    "..", NULL);
 				goto unlock;
 			}
 			if ((cnp->cn_flags & MAKEENTRY) == 0) {
@@ -531,8 +524,8 @@
 				goto negative_success;
 			CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..",
 			    dvp, cnp->cn_nameptr, *vpp);
-			SDT_PROBE(vfs, namecache, lookup, hit, dvp, "..",
-			    *vpp, 0, 0);
+			SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..",
+			    *vpp);
 			cache_out_ts(ncp, tsp, ticksp);
 			if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
 			    NCF_DTS && tsp != NULL)
@@ -553,8 +546,8 @@
 
 	/* We failed to find an entry */
 	if (ncp == NULL) {
-		SDT_PROBE(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
-		    NULL, 0, 0);
+		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
+		    NULL);
 		if ((cnp->cn_flags & MAKEENTRY) == 0) {
 			nummisszap++;
 		} else {
@@ -582,8 +575,8 @@
 		*vpp = ncp->nc_vp;
 		CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p",
 		    dvp, cnp->cn_nameptr, *vpp, ncp);
-		SDT_PROBE(vfs, namecache, lookup, hit, dvp, nc_get_name(ncp),
-		    *vpp, 0, 0);
+		SDT_PROBE3(vfs, namecache, lookup, hit, dvp, nc_get_name(ncp),
+		    *vpp);
 		cache_out_ts(ncp, tsp, ticksp);
 		goto success;
 	}
@@ -614,8 +607,8 @@
 	nchstats.ncs_neghits++;
 	if (ncp->nc_flag & NCF_WHITE)
 		cnp->cn_flags |= ISWHITEOUT;
-	SDT_PROBE(vfs, namecache, lookup, hit_negative, dvp, nc_get_name(ncp),
-	    0, 0, 0);
+	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp,
+	    nc_get_name(ncp));
 	cache_out_ts(ncp, tsp, ticksp);
 	CACHE_WUNLOCK();
 	return (ENOENT);
@@ -703,12 +696,8 @@
  * Add an entry to the cache.
  */
 void
-cache_enter_time(dvp, vp, cnp, tsp, dtsp)
-	struct vnode *dvp;
-	struct vnode *vp;
-	struct componentname *cnp;
-	struct timespec *tsp;
-	struct timespec *dtsp;
+cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
+    struct timespec *tsp, struct timespec *dtsp)
 {
 	struct namecache *ncp, *n2;
 	struct namecache_ts *n3;
@@ -749,23 +738,26 @@
 			    ncp->nc_flag & NCF_ISDOTDOT) {
 				KASSERT(ncp->nc_dvp == dvp,
 				    ("wrong isdotdot parent"));
-				if (ncp->nc_vp != NULL)
+				if (ncp->nc_vp != NULL) {
 					TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst,
 					    ncp, nc_dst);
-				else
+				} else {
 					TAILQ_REMOVE(&ncneg, ncp, nc_dst);
-				if (vp != NULL)
+					numneg--;
+				}
+				if (vp != NULL) {
 					TAILQ_INSERT_HEAD(&vp->v_cache_dst,
 					    ncp, nc_dst);
-				else
+				} else {
 					TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
+					numneg++;
+				}
 				ncp->nc_vp = vp;
 				CACHE_WUNLOCK();
 				return;
 			}
 			dvp->v_cache_dd = NULL;
-			SDT_PROBE(vfs, namecache, enter, done, dvp, "..", vp,
-			    0, 0);
+			SDT_PROBE3(vfs, namecache, enter, done, dvp, "..", vp);
 			CACHE_WUNLOCK();
 			flag = NCF_ISDOTDOT;
 		}
@@ -835,9 +827,9 @@
 		 * has populated v_cache_dd pointer already.
 		 */
 		if (dvp->v_cache_dd != NULL) {
-		    CACHE_WUNLOCK();
-		    cache_free(ncp);
-		    return;
+			CACHE_WUNLOCK();
+			cache_free(ncp);
+			return;
 		}
 		KASSERT(vp == NULL || vp->v_type == VDIR,
 		    ("wrong vnode type %p", vp));
@@ -845,7 +837,7 @@
 	}
 
 	numcache++;
-	if (!vp) {
+	if (vp == NULL) {
 		numneg++;
 		if (cnp->cn_flags & ISWHITEOUT)
 			ncp->nc_flag |= NCF_WHITE;
@@ -883,17 +875,19 @@
 	 * "negative" cache queue, otherwise, we place it into the
 	 * destination vnode's cache entries queue.
 	 */
-	if (vp) {
+	if (vp != NULL) {
 		TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
-		SDT_PROBE(vfs, namecache, enter, done, dvp, nc_get_name(ncp),
-		    vp, 0, 0);
+		SDT_PROBE3(vfs, namecache, enter, done, dvp, nc_get_name(ncp),
+		    vp);
 	} else {
 		TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
-		SDT_PROBE(vfs, namecache, enter_negative, done, dvp,
-		    nc_get_name(ncp), 0, 0, 0);
+		SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
+		    nc_get_name(ncp));
 	}
 	if (numneg * ncnegfactor > numcache) {
 		ncp = TAILQ_FIRST(&ncneg);
+		KASSERT(ncp->nc_vp == NULL, ("ncp %p vp %p on ncneg",
+		    ncp, ncp->nc_vp));
 		zap = 1;
 	}
 	if (hold)
@@ -914,32 +908,73 @@
 
 	cache_zone_small = uma_zcreate("S VFS Cache",
 	    sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1,
-	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
+	    NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache),
+	    UMA_ZONE_ZINIT);
 	cache_zone_small_ts = uma_zcreate("STS VFS Cache",
 	    sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1,
-	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
+	    NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts),
+	    UMA_ZONE_ZINIT);
 	cache_zone_large = uma_zcreate("L VFS Cache",
 	    sizeof(struct namecache) + NAME_MAX + 1,
-	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
+	    NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache),
+	    UMA_ZONE_ZINIT);
 	cache_zone_large_ts = uma_zcreate("LTS VFS Cache",
 	    sizeof(struct namecache_ts) + NAME_MAX + 1,
-	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
+	    NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts),
+	    UMA_ZONE_ZINIT);
 
 	nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
 }
 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
 
+void
+cache_changesize(int newmaxvnodes)
+{
+	struct nchashhead *new_nchashtbl, *old_nchashtbl;
+	u_long new_nchash, old_nchash;
+	struct namecache *ncp;
+	uint32_t hash;
+	int i;
 
+	new_nchashtbl = hashinit(newmaxvnodes * 2, M_VFSCACHE, &new_nchash);
+	/* If same hash table size, nothing to do */
+	if (nchash == new_nchash) {
+		free(new_nchashtbl, M_VFSCACHE);
+		return;
+	}
+	/*
+	 * Move everything from the old hash table to the new table.
+	 * None of the namecache entries in the table can be removed
+	 * because to do so, they have to be removed from the hash table.
+	 */
+	CACHE_WLOCK();
+	old_nchashtbl = nchashtbl;
+	old_nchash = nchash;
+	nchashtbl = new_nchashtbl;
+	nchash = new_nchash;
+	for (i = 0; i <= old_nchash; i++) {
+		while ((ncp = LIST_FIRST(&old_nchashtbl[i])) != NULL) {
+			hash = fnv_32_buf(nc_get_name(ncp), ncp->nc_nlen,
+			    FNV1_32_INIT);
+			hash = fnv_32_buf(&ncp->nc_dvp, sizeof(ncp->nc_dvp),
+			    hash);
+			LIST_REMOVE(ncp, nc_hash);
+			LIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
+		}
+	}
+	CACHE_WUNLOCK();
+	free(old_nchashtbl, M_VFSCACHE);
+}
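cache_changesize() lets the namecache hash table be rebuilt for a new size: the replacement table is allocated up front, the table pointer and mask are swapped under the write lock, and every entry is re-bucketed by recomputing the two-step FNV-1 hash of its name and directory vnode pointer. A small userland sketch of that bucket computation; the constants are taken from the kernel's sys/fnv_hash.h:

#include <stddef.h>
#include <stdint.h>

#define FNV1_32_INIT    ((uint32_t)33554467UL)
#define FNV_32_PRIME    ((uint32_t)0x01000193UL)

static uint32_t
fnv_32_buf(const void *buf, size_t len, uint32_t hval)
{
    const unsigned char *s = buf;

    while (len-- != 0) {
        hval *= FNV_32_PRIME;
        hval ^= *s++;
    }
    return (hval);
}

/* Bucket index for a (name, directory vnode) pair; 'mask' plays the role
 * of nchash. */
static uint32_t
nc_bucket(const char *name, size_t namelen, const void *dvp, uint32_t mask)
{
    uint32_t hash;

    hash = fnv_32_buf(name, namelen, FNV1_32_INIT);
    hash = fnv_32_buf(&dvp, sizeof(dvp), hash);
    return (hash & mask);
}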
+
 /*
  * Invalidate all entries to a particular vnode.
  */
 void
-cache_purge(vp)
-	struct vnode *vp;
+cache_purge(struct vnode *vp)
 {
 
 	CTR1(KTR_VFS, "cache_purge(%p)", vp);
-	SDT_PROBE(vfs, namecache, purge, done, vp, 0, 0, 0, 0);
+	SDT_PROBE1(vfs, namecache, purge, done, vp);
 	CACHE_WLOCK();
 	while (!LIST_EMPTY(&vp->v_cache_src))
 		cache_zap(LIST_FIRST(&vp->v_cache_src));
@@ -958,13 +993,12 @@
  * Invalidate all negative entries for a particular directory vnode.
  */
 void
-cache_purge_negative(vp)
-	struct vnode *vp;
+cache_purge_negative(struct vnode *vp)
 {
 	struct namecache *cp, *ncp;
 
 	CTR1(KTR_VFS, "cache_purge_negative(%p)", vp);
-	SDT_PROBE(vfs, namecache, purge_negative, done, vp, 0, 0, 0, 0);
+	SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
 	CACHE_WLOCK();
 	LIST_FOREACH_SAFE(cp, &vp->v_cache_src, nc_src, ncp) {
 		if (cp->nc_vp == NULL)
@@ -977,14 +1011,13 @@
  * Flush all entries referencing a particular filesystem.
  */
 void
-cache_purgevfs(mp)
-	struct mount *mp;
+cache_purgevfs(struct mount *mp)
 {
 	struct nchashhead *ncpp;
 	struct namecache *ncp, *nnp;
 
 	/* Scan hash tables for applicable entries */
-	SDT_PROBE(vfs, namecache, purgevfs, done, mp, 0, 0, 0, 0);
+	SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
 	CACHE_WLOCK();
 	for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) {
 		LIST_FOREACH_SAFE(ncp, ncpp, nc_hash, nnp) {
@@ -1001,12 +1034,7 @@
  */
 
 int
-vfs_cache_lookup(ap)
-	struct vop_lookup_args /* {
-		struct vnode *a_dvp;
-		struct vnode **a_vpp;
-		struct componentname *a_cnp;
-	} */ *ap;
+vfs_cache_lookup(struct vop_lookup_args *ap)
 {
 	struct vnode *dvp;
 	int error;
@@ -1030,7 +1058,7 @@
 	if (error)
 		return (error);
 
-	error = cache_lookup(dvp, vpp, cnp);
+	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
 	if (error == 0)
 		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
 	if (error == -1)
@@ -1038,14 +1066,6 @@
 	return (error);
 }
 
-
-#ifndef _SYS_SYSPROTO_H_
-struct  __getcwd_args {
-	u_char	*buf;
-	u_int	buflen;
-};
-#endif
-
 /*
  * XXX All of these sysctls would probably be more productive dead.
  */
@@ -1055,28 +1075,28 @@
 
 /* Implementation of the getcwd syscall. */
 int
-sys___getcwd(td, uap)
-	struct thread *td;
-	struct __getcwd_args *uap;
+sys___getcwd(struct thread *td, struct __getcwd_args *uap)
 {
 
-	return (kern___getcwd(td, uap->buf, UIO_USERSPACE, uap->buflen));
+	return (kern___getcwd(td, uap->buf, UIO_USERSPACE, uap->buflen,
+	    MAXPATHLEN));
 }
 
 int
-kern___getcwd(struct thread *td, u_char *buf, enum uio_seg bufseg, u_int buflen)
+kern___getcwd(struct thread *td, char *buf, enum uio_seg bufseg, u_int buflen,
+    u_int path_max)
 {
 	char *bp, *tmpbuf;
 	struct filedesc *fdp;
 	struct vnode *cdir, *rdir;
-	int error, vfslocked;
+	int error;
 
 	if (disablecwd)
 		return (ENODEV);
 	if (buflen < 2)
 		return (EINVAL);
-	if (buflen > MAXPATHLEN)
-		buflen = MAXPATHLEN;
+	if (buflen > path_max)
+		buflen = path_max;
 
 	tmpbuf = malloc(buflen, M_TEMP, M_WAITOK);
 	fdp = td->td_proc->p_fd;
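kern___getcwd() gains a path_max bound separate from the user-supplied length, and sys___getcwd() passes MAXPATHLEN, so the kernel-side temporary buffer is clamped to the smaller of the two (presumably so other callers can pass a different limit). The userland interface is unchanged; a call like the following is typically satisfied by the __getcwd() system call via libc's getcwd(3):

#include <limits.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
    char buf[PATH_MAX];

    /* libc normally backs this with the __getcwd() system call. */
    if (getcwd(buf, sizeof(buf)) == NULL) {
        perror("getcwd");
        return (1);
    }
    printf("%s\n", buf);
    return (0);
}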
@@ -1087,12 +1107,8 @@
 	VREF(rdir);
 	FILEDESC_SUNLOCK(fdp);
 	error = vn_fullpath1(td, cdir, rdir, tmpbuf, &bp, buflen);
-	vfslocked = VFS_LOCK_GIANT(rdir->v_mount);
 	vrele(rdir);
-	VFS_UNLOCK_GIANT(vfslocked);
-	vfslocked = VFS_LOCK_GIANT(cdir->v_mount);
 	vrele(cdir);
-	VFS_UNLOCK_GIANT(vfslocked);
 
 	if (!error) {
 		if (bufseg == UIO_SYSSPACE)
@@ -1139,7 +1155,7 @@
 	char *buf;
 	struct filedesc *fdp;
 	struct vnode *rdir;
-	int error, vfslocked;
+	int error;
 
 	if (disablefullpath)
 		return (ENODEV);
@@ -1153,9 +1169,7 @@
 	VREF(rdir);
 	FILEDESC_SUNLOCK(fdp);
 	error = vn_fullpath1(td, vn, rdir, buf, retbuf, MAXPATHLEN);
-	vfslocked = VFS_LOCK_GIANT(rdir->v_mount);
 	vrele(rdir);
-	VFS_UNLOCK_GIANT(vfslocked);
 
 	if (!error)
 		*freebuf = buf;
@@ -1208,7 +1222,7 @@
 {
 	struct vnode *dvp;
 	struct namecache *ncp;
-	int error, vfslocked;
+	int error;
 
 	TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) {
 		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
@@ -1217,41 +1231,34 @@
 	if (ncp != NULL) {
 		if (*buflen < ncp->nc_nlen) {
 			CACHE_RUNLOCK();
-			vfslocked = VFS_LOCK_GIANT((*vp)->v_mount);
 			vrele(*vp);
-			VFS_UNLOCK_GIANT(vfslocked);
 			numfullpathfail4++;
 			error = ENOMEM;
-			SDT_PROBE(vfs, namecache, fullpath, return, error,
-			    vp, NULL, 0, 0);
+			SDT_PROBE3(vfs, namecache, fullpath, return, error,
+			    vp, NULL);
 			return (error);
 		}
 		*buflen -= ncp->nc_nlen;
 		memcpy(buf + *buflen, nc_get_name(ncp), ncp->nc_nlen);
-		SDT_PROBE(vfs, namecache, fullpath, hit, ncp->nc_dvp,
-		    nc_get_name(ncp), vp, 0, 0);
+		SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
+		    nc_get_name(ncp), vp);
 		dvp = *vp;
 		*vp = ncp->nc_dvp;
 		vref(*vp);
 		CACHE_RUNLOCK();
-		vfslocked = VFS_LOCK_GIANT(dvp->v_mount);
 		vrele(dvp);
-		VFS_UNLOCK_GIANT(vfslocked);
 		CACHE_RLOCK();
 		return (0);
 	}
-	SDT_PROBE(vfs, namecache, fullpath, miss, vp, 0, 0, 0, 0);
+	SDT_PROBE1(vfs, namecache, fullpath, miss, vp);
 
 	CACHE_RUNLOCK();
-	vfslocked = VFS_LOCK_GIANT((*vp)->v_mount);
 	vn_lock(*vp, LK_SHARED | LK_RETRY);
 	error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen);
 	vput(*vp);
-	VFS_UNLOCK_GIANT(vfslocked);
 	if (error) {
 		numfullpathfail2++;
-		SDT_PROBE(vfs, namecache, fullpath, return,  error, vp,
-		    NULL, 0, 0);
+		SDT_PROBE3(vfs, namecache, fullpath, return,  error, vp, NULL);
 		return (error);
 	}
 
@@ -1260,12 +1267,9 @@
 	if (dvp->v_iflag & VI_DOOMED) {
 		/* forced unmount */
 		CACHE_RUNLOCK();
-		vfslocked = VFS_LOCK_GIANT(dvp->v_mount);
 		vrele(dvp);
-		VFS_UNLOCK_GIANT(vfslocked);
 		error = ENOENT;
-		SDT_PROBE(vfs, namecache, fullpath, return, error, vp,
-		    NULL, 0, 0);
+		SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
 		return (error);
 	}
 	/*
@@ -1282,7 +1286,7 @@
 vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
     char *buf, char **retbuf, u_int buflen)
 {
-	int error, slash_prefixed, vfslocked;
+	int error, slash_prefixed;
 #ifdef KDTRACE_HOOKS
 	struct vnode *startvp = vp;
 #endif
@@ -1293,7 +1297,7 @@
 	error = 0;
 	slash_prefixed = 0;
 
-	SDT_PROBE(vfs, namecache, fullpath, entry, vp, 0, 0, 0, 0);
+	SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
 	numfullpathcalls++;
 	vref(vp);
 	CACHE_RLOCK();
@@ -1303,9 +1307,7 @@
 			return (error);
 		if (buflen == 0) {
 			CACHE_RUNLOCK();
-			vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 			vrele(vp);
-			VFS_UNLOCK_GIANT(vfslocked);
 			return (ENOMEM);
 		}
 		buf[--buflen] = '/';
@@ -1315,20 +1317,16 @@
 		if (vp->v_vflag & VV_ROOT) {
 			if (vp->v_iflag & VI_DOOMED) {	/* forced unmount */
 				CACHE_RUNLOCK();
-				vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 				vrele(vp);
-				VFS_UNLOCK_GIANT(vfslocked);
 				error = ENOENT;
-				SDT_PROBE(vfs, namecache, fullpath, return,
-				    error, vp, NULL, 0, 0);
+				SDT_PROBE3(vfs, namecache, fullpath, return,
+				    error, vp, NULL);
 				break;
 			}
 			vp1 = vp->v_mount->mnt_vnodecovered;
 			vref(vp1);
 			CACHE_RUNLOCK();
-			vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 			vrele(vp);
-			VFS_UNLOCK_GIANT(vfslocked);
 			vp = vp1;
 			CACHE_RLOCK();
 			continue;
@@ -1335,13 +1333,11 @@
 		}
 		if (vp->v_type != VDIR) {
 			CACHE_RUNLOCK();
-			vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 			vrele(vp);
-			VFS_UNLOCK_GIANT(vfslocked);
 			numfullpathfail1++;
 			error = ENOTDIR;
-			SDT_PROBE(vfs, namecache, fullpath, return,
-			    error, vp, NULL, 0, 0);
+			SDT_PROBE3(vfs, namecache, fullpath, return,
+			    error, vp, NULL);
 			break;
 		}
 		error = vn_vptocnp_locked(&vp, td->td_ucred, buf, &buflen);
@@ -1349,12 +1345,10 @@
 			break;
 		if (buflen == 0) {
 			CACHE_RUNLOCK();
-			vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 			vrele(vp);
-			VFS_UNLOCK_GIANT(vfslocked);
 			error = ENOMEM;
-			SDT_PROBE(vfs, namecache, fullpath, return, error,
-			    startvp, NULL, 0, 0);
+			SDT_PROBE3(vfs, namecache, fullpath, return, error,
+			    startvp, NULL);
 			break;
 		}
 		buf[--buflen] = '/';
@@ -1365,12 +1359,10 @@
 	if (!slash_prefixed) {
 		if (buflen == 0) {
 			CACHE_RUNLOCK();
-			vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 			vrele(vp);
-			VFS_UNLOCK_GIANT(vfslocked);
 			numfullpathfail4++;
-			SDT_PROBE(vfs, namecache, fullpath, return, ENOMEM,
-			    startvp, NULL, 0, 0);
+			SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
+			    startvp, NULL);
 			return (ENOMEM);
 		}
 		buf[--buflen] = '/';
@@ -1377,12 +1369,9 @@
 	}
 	numfullpathfound++;
 	CACHE_RUNLOCK();
-	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	vrele(vp);
-	VFS_UNLOCK_GIANT(vfslocked);
 
-	SDT_PROBE(vfs, namecache, fullpath, return, 0, startvp, buf + buflen,
-	    0, 0);
+	SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, buf + buflen);
 	*retbuf = buf + buflen;
 	return (0);
 }
@@ -1432,12 +1421,9 @@
 
 /* ABI compat shims for old kernel modules. */
 #undef cache_enter
-#undef cache_lookup
 
 void	cache_enter(struct vnode *dvp, struct vnode *vp,
 	    struct componentname *cnp);
-int	cache_lookup(struct vnode *dvp, struct vnode **vpp,
-	    struct componentname *cnp);
 
 void
 cache_enter(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
@@ -1446,13 +1432,6 @@
 	cache_enter_time(dvp, vp, cnp, NULL, NULL);
 }
 
-int
-cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp)
-{
-
-	return (cache_lookup_times(dvp, vpp, cnp, NULL, NULL));
-}
-
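With cache_lookup_times() renamed to cache_lookup(), the old three-argument compatibility shim is gone, so out-of-tree consumers must pass the two timestamp pointers themselves; NULL for both keeps the previous behaviour, exactly as vfs_cache_lookup() above now does. A hypothetical caller:

/* Hypothetical out-of-tree caller updated for the five-argument interface. */
static int
example_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp)
{

    /* NULL tsp/ticksp: the entry's timestamps are not wanted. */
    return (cache_lookup(dvp, vpp, cnp, NULL, NULL));
}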
 /*
  * This function updates path string to vnode's full global path
  * and checks the size of the new path string against the pathlen argument.
@@ -1473,9 +1452,8 @@
 	struct nameidata nd;
 	struct vnode *vp1;
 	char *rpath, *fbuf;
-	int error, vfslocked;
+	int error;
 
-	VFS_ASSERT_GIANT(vp->v_mount);
 	ASSERT_VOP_ELOCKED(vp, __func__);
 
 	/* Return ENODEV if sysctl debug.disablefullpath==1 */
@@ -1502,7 +1480,7 @@
 	 * As a side effect, the vnode is relocked.
 	 * If vnode was renamed, return ENOENT.
 	 */
-	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
+	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
 	    UIO_SYSSPACE, path, td);
 	error = namei(&nd);
 	if (error != 0) {
@@ -1509,7 +1487,6 @@
 		vrele(vp);
 		goto out;
 	}
-	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp1 = nd.ni_vp;
 	vrele(vp);
@@ -1519,7 +1496,6 @@
 		vput(vp1);
 		error = ENOENT;
 	}
-	VFS_UNLOCK_GIANT(vfslocked);
 
 out:
 	free(fbuf, M_TEMP);

Modified: trunk/sys/kern/vfs_cluster.c
===================================================================
--- trunk/sys/kern/vfs_cluster.c	2018-05-25 21:07:58 UTC (rev 9951)
+++ trunk/sys/kern/vfs_cluster.c	2018-05-26 14:24:52 UTC (rev 9952)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1993
  *	The Regents of the University of California.  All rights reserved.
@@ -32,7 +33,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/vfs_cluster.c 302234 2016-06-27 21:50:30Z bdrewery $");
 
 #include "opt_debug_cluster.h"
 
@@ -46,6 +47,7 @@
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/resourcevar.h>
+#include <sys/rwlock.h>
 #include <sys/vmmeter.h>
 #include <vm/vm.h>
 #include <vm/vm_object.h>
@@ -60,11 +62,11 @@
 
 static MALLOC_DEFINE(M_SEGMENT, "cl_savebuf", "cluster_save buffer");
 
-static struct cluster_save *
-	cluster_collectbufs(struct vnode *vp, struct buf *last_bp);
-static struct buf *
-	cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn,
-			 daddr_t blkno, long size, int run, struct buf *fbp);
+static struct cluster_save *cluster_collectbufs(struct vnode *vp,
+	    struct buf *last_bp, int gbflags);
+static struct buf *cluster_rbuild(struct vnode *vp, u_quad_t filesize,
+	    daddr_t lbn, daddr_t blkno, long size, int run, int gbflags,
+	    struct buf *fbp);
 static void cluster_callback(struct buf *);
 
 static int write_behind = 1;
@@ -88,15 +90,6 @@
  */
 int
 cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size,
-    struct ucred *cred, long totread, int seqcount, struct buf **bpp)
-{
-
-	return (cluster_read_gb(vp, filesize, lblkno, size, cred, totread,
-	    seqcount, 0, bpp));
-}
-
-int
-cluster_read_gb(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size,
     struct ucred *cred, long totread, int seqcount, int gbflags,
     struct buf **bpp)
 {
@@ -109,6 +102,8 @@
 
 	error = 0;
 	bo = &vp->v_bufobj;
+	if (!unmapped_buf_allowed)
+		gbflags &= ~GB_UNMAPPED;
 
 	/*
 	 * Try to limit the amount of read-ahead by a few
@@ -124,7 +119,7 @@
 	/*
 	 * get the requested block
 	 */
-	*bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0, 0);
+	*bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0, gbflags);
 	origblkno = lblkno;
 
 	/*
@@ -139,7 +134,7 @@
 			return 0;
 		} else {
 			bp->b_flags &= ~B_RAM;
-			BO_LOCK(bo);
+			BO_RLOCK(bo);
 			for (i = 1; i < maxra; i++) {
 				/*
 				 * Stop if the buffer does not exist or it
@@ -162,7 +157,7 @@
 					BUF_UNLOCK(rbp);
 				}			
 			}
-			BO_UNLOCK(bo);
+			BO_RUNLOCK(bo);
 			if (i >= maxra) {
 				return 0;
 			}
@@ -223,7 +218,7 @@
 			if (ncontig < nblks)
 				nblks = ncontig;
 			bp = cluster_rbuild(vp, filesize, lblkno,
-				blkno, size, nblks, bp);
+			    blkno, size, nblks, gbflags, bp);
 			lblkno += (bp->b_bufsize / size);
 		} else {
 			bp->b_flags |= B_RAM;
@@ -267,7 +262,7 @@
 		if (ncontig) {
 			ncontig = min(ncontig + 1, racluster);
 			rbp = cluster_rbuild(vp, filesize, lblkno, blkno,
-				size, ncontig, NULL);
+			    size, ncontig, gbflags, NULL);
 			lblkno += (rbp->b_bufsize / size);
 			if (rbp->b_flags & B_DELWRI) {
 				bqrelse(rbp);
@@ -274,7 +269,7 @@
 				continue;
 			}
 		} else {
-			rbp = getblk(vp, lblkno, size, 0, 0, 0);
+			rbp = getblk(vp, lblkno, size, 0, 0, gbflags);
 			lblkno += 1;
 			if (rbp->b_flags & B_DELWRI) {
 				bqrelse(rbp);
@@ -313,24 +308,17 @@
  * and then parcel them up into logical blocks in the buffer hash table.
  */
 static struct buf *
-cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp)
-	struct vnode *vp;
-	u_quad_t filesize;
-	daddr_t lbn;
-	daddr_t blkno;
-	long size;
-	int run;
-	struct buf *fbp;
+cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn,
+    daddr_t blkno, long size, int run, int gbflags, struct buf *fbp)
 {
-	struct bufobj *bo;
 	struct buf *bp, *tbp;
 	daddr_t bn;
 	off_t off;
 	long tinc, tsize;
-	int i, inc, j, toff;
+	int i, inc, j, k, toff;
 
 	KASSERT(size == vp->v_mount->mnt_stat.f_iosize,
-	    ("cluster_rbuild: size %ld != filesize %jd\n",
+	    ("cluster_rbuild: size %ld != f_iosize %jd\n",
 	    size, (intmax_t)vp->v_mount->mnt_stat.f_iosize));
 
 	/*
@@ -344,7 +332,7 @@
 		tbp = fbp;
 		tbp->b_iocmd = BIO_READ; 
 	} else {
-		tbp = getblk(vp, lbn, size, 0, 0, 0);
+		tbp = getblk(vp, lbn, size, 0, 0, gbflags);
 		if (tbp->b_flags & B_CACHE)
 			return tbp;
 		tbp->b_flags |= B_ASYNC | B_RAM;
@@ -365,9 +353,14 @@
 	 * address may not be either.  Inherit the b_data offset
 	 * from the original buffer.
 	 */
-	bp->b_data = (char *)((vm_offset_t)bp->b_data |
-	    ((vm_offset_t)tbp->b_data & PAGE_MASK));
 	bp->b_flags = B_ASYNC | B_CLUSTER | B_VMIO;
+	if ((gbflags & GB_UNMAPPED) != 0) {
+		bp->b_flags |= B_UNMAPPED;
+		bp->b_data = unmapped_buf;
+	} else {
+		bp->b_data = (char *)((vm_offset_t)bp->b_data |
+		    ((vm_offset_t)tbp->b_data & PAGE_MASK));
+	}
 	bp->b_iocmd = BIO_READ;
 	bp->b_iodone = cluster_callback;
 	bp->b_blkno = blkno;
@@ -383,15 +376,23 @@
 	bp->b_npages = 0;
 
 	inc = btodb(size);
-	bo = &vp->v_bufobj;
 	for (bn = blkno, i = 0; i < run; ++i, bn += inc) {
-		if (i != 0) {
+		if (i == 0) {
+			VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object);
+			vfs_drain_busy_pages(tbp);
+			vm_object_pip_add(tbp->b_bufobj->bo_object,
+			    tbp->b_npages);
+			for (k = 0; k < tbp->b_npages; k++)
+				vm_page_sbusy(tbp->b_pages[k]);
+			VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object);
+		} else {
 			if ((bp->b_npages * PAGE_SIZE) +
 			    round_page(size) > vp->v_mount->mnt_iosize_max) {
 				break;
 			}
 
-			tbp = getblk(vp, lbn + i, size, 0, 0, GB_LOCK_NOWAIT);
+			tbp = getblk(vp, lbn + i, size, 0, 0, GB_LOCK_NOWAIT |
+			    (gbflags & GB_UNMAPPED));
 
 			/* Don't wait around for locked bufs. */
 			if (tbp == NULL)
@@ -402,17 +403,16 @@
 			 * (marked B_CACHE), or locked (may be doing a
 			 * background write), or if the buffer is not
 			 * VMIO backed.  The clustering code can only deal
-			 * with VMIO-backed buffers.
+			 * with VMIO-backed buffers.  The bo lock is not
+			 * required for the BKGRDINPROG check since it
+			 * can not be set without the buf lock.
 			 */
-			BO_LOCK(bo);
 			if ((tbp->b_vflags & BV_BKGRDINPROG) ||
 			    (tbp->b_flags & B_CACHE) ||
 			    (tbp->b_flags & B_VMIO) == 0) {
-				BO_UNLOCK(bo);
 				bqrelse(tbp);
 				break;
 			}
-			BO_UNLOCK(bo);
 
 			/*
 			 * The buffer must be completely invalid in order to
@@ -421,25 +421,33 @@
 			 */
 			off = tbp->b_offset;
 			tsize = size;
-			VM_OBJECT_LOCK(tbp->b_bufobj->bo_object);
+			VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object);
 			for (j = 0; tsize > 0; j++) {
 				toff = off & PAGE_MASK;
 				tinc = tsize;
 				if (toff + tinc > PAGE_SIZE)
 					tinc = PAGE_SIZE - toff;
-				VM_OBJECT_LOCK_ASSERT(tbp->b_pages[j]->object,
-				    MA_OWNED);
+				VM_OBJECT_ASSERT_WLOCKED(tbp->b_pages[j]->object);
 				if ((tbp->b_pages[j]->valid &
 				    vm_page_bits(toff, tinc)) != 0)
 					break;
+				if (vm_page_xbusied(tbp->b_pages[j]))
+					break;
+				vm_object_pip_add(tbp->b_bufobj->bo_object, 1);
+				vm_page_sbusy(tbp->b_pages[j]);
 				off += tinc;
 				tsize -= tinc;
 			}
-			VM_OBJECT_UNLOCK(tbp->b_bufobj->bo_object);
 			if (tsize > 0) {
+clean_sbusy:
+				vm_object_pip_add(tbp->b_bufobj->bo_object, -j);
+				for (k = 0; k < j; k++)
+					vm_page_sunbusy(tbp->b_pages[k]);
+				VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object);
 				bqrelse(tbp);
 				break;
 			}
+			VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object);
 
 			/*
 			 * Set a read-ahead mark as appropriate
@@ -459,8 +467,8 @@
 			if (tbp->b_blkno == tbp->b_lblkno) {
 				tbp->b_blkno = bn;
 			} else if (tbp->b_blkno != bn) {
-				brelse(tbp);
-				break;
+				VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object);
+				goto clean_sbusy;
 			}
 		}
 		/*
@@ -470,14 +478,12 @@
 		BUF_KERNPROC(tbp);
 		TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
 			tbp, b_cluster.cluster_entry);
-		VM_OBJECT_LOCK(tbp->b_bufobj->bo_object);
+		VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object);
 		for (j = 0; j < tbp->b_npages; j += 1) {
 			vm_page_t m;
 			m = tbp->b_pages[j];
-			vm_page_io_start(m);
-			vm_object_pip_add(m->object, 1);
 			if ((bp->b_npages == 0) ||
-				(bp->b_pages[bp->b_npages-1] != m)) {
+			    (bp->b_pages[bp->b_npages-1] != m)) {
 				bp->b_pages[bp->b_npages] = m;
 				bp->b_npages++;
 			}
@@ -484,7 +490,7 @@
 			if (m->valid == VM_PAGE_BITS_ALL)
 				tbp->b_pages[j] = bogus_page;
 		}
-		VM_OBJECT_UNLOCK(tbp->b_bufobj->bo_object);
+		VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object);
 		/*
 		 * Don't inherit tbp->b_bufsize as it may be larger due to
 		 * a non-page-aligned size.  Instead just aggregate using
@@ -502,20 +508,22 @@
 	 * Fully valid pages in the cluster are already good and do not need
 	 * to be re-read from disk.  Replace the page with bogus_page
 	 */
-	VM_OBJECT_LOCK(bp->b_bufobj->bo_object);
+	VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
 	for (j = 0; j < bp->b_npages; j++) {
-		VM_OBJECT_LOCK_ASSERT(bp->b_pages[j]->object, MA_OWNED);
+		VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[j]->object);
 		if (bp->b_pages[j]->valid == VM_PAGE_BITS_ALL)
 			bp->b_pages[j] = bogus_page;
 	}
-	VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object);
+	VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
 	if (bp->b_bufsize > bp->b_kvasize)
 		panic("cluster_rbuild: b_bufsize(%ld) > b_kvasize(%d)\n",
 		    bp->b_bufsize, bp->b_kvasize);
 	bp->b_kvasize = bp->b_bufsize;
 
-	pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
-		(vm_page_t *)bp->b_pages, bp->b_npages);
+	if ((bp->b_flags & B_UNMAPPED) == 0) {
+		pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
+		    (vm_page_t *)bp->b_pages, bp->b_npages);
+	}
 	return (bp);
 }
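The cluster entry points absorb their former *_gb variants and take gbflags directly; GB_UNMAPPED is stripped up front on configurations without unmapped-buffer support before the flags reach getblk(). A minimal sketch of that gating, using a hypothetical wrapper:

/* Hypothetical wrapper showing the flag gating used by the cluster code. */
static struct buf *
example_getblk(struct vnode *vp, daddr_t lbn, int size, int gbflags)
{

    if (!unmapped_buf_allowed)
        gbflags &= ~GB_UNMAPPED;
    return (getblk(vp, lbn, size, 0, 0, gbflags));
}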
 
@@ -533,12 +541,15 @@
 	int error = 0;
 
 	/*
-	 * Must propogate errors to all the components.
+	 * Must propagate errors to all the components.
 	 */
 	if (bp->b_ioflags & BIO_ERROR)
 		error = bp->b_error;
 
-	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
+	if ((bp->b_flags & B_UNMAPPED) == 0) {
+		pmap_qremove(trunc_page((vm_offset_t) bp->b_data),
+		    bp->b_npages);
+	}
 	/*
 	 * Move memory from the large cluster buffer into the component
 	 * buffers and mark IO as done on these.
@@ -580,7 +591,8 @@
  */
 
 static __inline int
-cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len)
+cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len,
+    int gbflags)
 {
 	int r = 0;
 
@@ -591,7 +603,7 @@
 		start_lbn -= len;
 		/* FALLTHROUGH */
 	case 1:
-		r = cluster_wbuild(vp, size, start_lbn, len);
+		r = cluster_wbuild(vp, size, start_lbn, len, gbflags);
 		/* FALLTHROUGH */
 	default:
 		/* FALLTHROUGH */
@@ -611,21 +623,17 @@
  *	4.	end of a cluster - asynchronously write cluster
  */
 void
-cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount)
+cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount,
+    int gbflags)
 {
-
-	cluster_write_gb(vp, bp, filesize, seqcount, 0);
-}
-
-void
-cluster_write_gb(struct vnode *vp, struct buf *bp, u_quad_t filesize,
-    int seqcount, int gbflags)
-{
 	daddr_t lbn;
 	int maxclen, cursize;
 	int lblocksize;
 	int async;
 
+	if (!unmapped_buf_allowed)
+		gbflags &= ~GB_UNMAPPED;
+
 	if (vp->v_type == VREG) {
 		async = DOINGASYNC(vp);
 		lblocksize = vp->v_mount->mnt_stat.f_iosize;
@@ -665,13 +673,13 @@
 			    lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
 				if (!async && seqcount > 0) {
 					cluster_wbuild_wb(vp, lblocksize,
-						vp->v_cstart, cursize);
+					    vp->v_cstart, cursize, gbflags);
 				}
 			} else {
 				struct buf **bpp, **endbp;
 				struct cluster_save *buflist;
 
-				buflist = cluster_collectbufs(vp, bp);
+				buflist = cluster_collectbufs(vp, bp, gbflags);
 				endbp = &buflist->bs_children
 				    [buflist->bs_nchildren - 1];
 				if (VOP_REALLOCBLKS(vp, buflist)) {
@@ -690,7 +698,7 @@
 					if (seqcount > 1) {
 						cluster_wbuild_wb(vp, 
 						    lblocksize, vp->v_cstart, 
-						    cursize);
+						    cursize, gbflags);
 					}
 				} else {
 					/*
@@ -738,8 +746,10 @@
 		 * update daemon handle it.
 		 */
 		bdwrite(bp);
-		if (seqcount > 1)
-			cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, vp->v_clen + 1);
+		if (seqcount > 1) {
+			cluster_wbuild_wb(vp, lblocksize, vp->v_cstart,
+			    vp->v_clen + 1, gbflags);
+		}
 		vp->v_clen = 0;
 		vp->v_cstart = lbn + 1;
 	} else if (vm_page_count_severe()) {
@@ -765,14 +775,7 @@
  * the current block (if last_bp == NULL).
  */
 int
-cluster_wbuild(struct vnode *vp, long size, daddr_t start_lbn, int len)
-{
-
-	return (cluster_wbuild_gb(vp, size, start_lbn, len, 0));
-}
-
-int
-cluster_wbuild_gb(struct vnode *vp, long size, daddr_t start_lbn, int len,
+cluster_wbuild(struct vnode *vp, long size, daddr_t start_lbn, int len,
     int gbflags)
 {
 	struct buf *bp, *tbp;
@@ -781,6 +784,9 @@
 	int totalwritten = 0;
 	int dbsize = btodb(size);
 
+	if (!unmapped_buf_allowed)
+		gbflags &= ~GB_UNMAPPED;
+
 	bo = &vp->v_bufobj;
 	while (len > 0) {
 		/*
@@ -797,7 +803,7 @@
 			continue;
 		}
 		if (BUF_LOCK(tbp,
-		    LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, BO_MTX(bo))) {
+		    LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, BO_LOCKPTR(bo))) {
 			++start_lbn;
 			--len;
 			continue;
@@ -830,7 +836,9 @@
 		  (tbp->b_bcount != tbp->b_bufsize) ||
 		  (tbp->b_bcount != size) ||
 		  (len == 1) ||
-		  ((bp = getpbuf(&cluster_pbuf_freecnt)) == NULL)) {
+		  ((bp = (vp->v_vflag & VV_MD) != 0 ?
+		  trypbuf(&cluster_pbuf_freecnt) :
+		  getpbuf(&cluster_pbuf_freecnt)) == NULL)) {
 			totalwritten += tbp->b_bufsize;
 			bawrite(tbp);
 			++start_lbn;
@@ -859,10 +867,16 @@
 		 * address may not be either.  Inherit the b_data offset
 		 * from the original buffer.
 		 */
-		bp->b_data = (char *)((vm_offset_t)bp->b_data |
-		    ((vm_offset_t)tbp->b_data & PAGE_MASK));
-		bp->b_flags |= B_CLUSTER |
-				(tbp->b_flags & (B_VMIO | B_NEEDCOMMIT));
+		if ((gbflags & GB_UNMAPPED) == 0 ||
+		    (tbp->b_flags & B_VMIO) == 0) {
+			bp->b_data = (char *)((vm_offset_t)bp->b_data |
+			    ((vm_offset_t)tbp->b_data & PAGE_MASK));
+		} else {
+			bp->b_flags |= B_UNMAPPED;
+			bp->b_data = unmapped_buf;
+		}
+		bp->b_flags |= B_CLUSTER | (tbp->b_flags & (B_VMIO |
+		    B_NEEDCOMMIT));
 		bp->b_iodone = cluster_callback;
 		pbgetvp(vp, bp);
 		/*
@@ -892,7 +906,7 @@
 				 */
 				if (BUF_LOCK(tbp,
 				    LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK,
-				    BO_MTX(bo)))
+				    BO_LOCKPTR(bo)))
 					break;
 
 				if ((tbp->b_flags & (B_VMIO | B_CLUSTEROK |
@@ -945,12 +959,14 @@
 			if (tbp->b_flags & B_VMIO) {
 				vm_page_t m;
 
-				VM_OBJECT_LOCK(tbp->b_bufobj->bo_object);
-				if (i != 0) { /* if not first buffer */
+				VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object);
+				if (i == 0) {
+					vfs_drain_busy_pages(tbp);
+				} else { /* if not first buffer */
 					for (j = 0; j < tbp->b_npages; j += 1) {
 						m = tbp->b_pages[j];
-						if (m->oflags & VPO_BUSY) {
-							VM_OBJECT_UNLOCK(
+						if (vm_page_xbusied(m)) {
+							VM_OBJECT_WUNLOCK(
 							    tbp->b_object);
 							bqrelse(tbp);
 							goto finishcluster;
@@ -959,7 +975,7 @@
 				}
 				for (j = 0; j < tbp->b_npages; j += 1) {
 					m = tbp->b_pages[j];
-					vm_page_io_start(m);
+					vm_page_sbusy(m);
 					vm_object_pip_add(m->object, 1);
 					if ((bp->b_npages == 0) ||
 					  (bp->b_pages[bp->b_npages - 1] != m)) {
@@ -967,15 +983,21 @@
 						bp->b_npages++;
 					}
 				}
-				VM_OBJECT_UNLOCK(tbp->b_bufobj->bo_object);
+				VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object);
 			}
 			bp->b_bcount += size;
 			bp->b_bufsize += size;
-			bundirty(tbp);
-			tbp->b_flags &= ~B_DONE;
+			/*
+			 * If any of the clustered buffers have their
+			 * B_BARRIER flag set, transfer that request to
+			 * the cluster.
+			 */
+			bp->b_flags |= (tbp->b_flags & B_BARRIER);
+			tbp->b_flags &= ~(B_DONE | B_BARRIER);
+			tbp->b_flags |= B_ASYNC;
 			tbp->b_ioflags &= ~BIO_ERROR;
-			tbp->b_flags |= B_ASYNC;
 			tbp->b_iocmd = BIO_WRITE;
+			bundirty(tbp);
 			reassignbuf(tbp);		/* put on clean list */
 			bufobj_wref(tbp->b_bufobj);
 			BUF_KERNPROC(tbp);
@@ -983,8 +1005,10 @@
 				tbp, b_cluster.cluster_entry);
 		}
 	finishcluster:
-		pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
-			(vm_page_t *) bp->b_pages, bp->b_npages);
+		if ((bp->b_flags & B_UNMAPPED) == 0) {
+			pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
+			    (vm_page_t *)bp->b_pages, bp->b_npages);
+		}
 		if (bp->b_bufsize > bp->b_kvasize)
 			panic(
 			    "cluster_wbuild: b_bufsize(%ld) > b_kvasize(%d)\n",
@@ -1005,9 +1029,7 @@
  * Plus add one additional buffer.
  */
 static struct cluster_save *
-cluster_collectbufs(vp, last_bp)
-	struct vnode *vp;
-	struct buf *last_bp;
+cluster_collectbufs(struct vnode *vp, struct buf *last_bp, int gbflags)
 {
 	struct cluster_save *buflist;
 	struct buf *bp;
@@ -1020,7 +1042,8 @@
 	buflist->bs_nchildren = 0;
 	buflist->bs_children = (struct buf **) (buflist + 1);
 	for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) {
-		(void) bread(vp, lbn, last_bp->b_bcount, NOCRED, &bp);
+		(void)bread_gb(vp, lbn, last_bp->b_bcount, NOCRED,
+		    gbflags, &bp);
 		buflist->bs_children[i] = bp;
 		if (bp->b_blkno == bp->b_lblkno)
 			VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno,

Modified: trunk/sys/kern/vfs_default.c
===================================================================
--- trunk/sys/kern/vfs_default.c	2018-05-25 21:07:58 UTC (rev 9951)
+++ trunk/sys/kern/vfs_default.c	2018-05-26 14:24:52 UTC (rev 9952)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
@@ -33,7 +34,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/vfs_default.c 330266 2018-03-02 04:43:07Z mckusick $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -47,8 +48,8 @@
 #include <sys/lockf.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
-#include <sys/mutex.h>
 #include <sys/namei.h>
+#include <sys/rwlock.h>
 #include <sys/fcntl.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
@@ -354,8 +355,8 @@
 		if (error)
 			goto out;
 
-		if ((dp->d_type != DT_WHT) &&
-		    !strcmp(dp->d_name, dirname)) {
+		if (dp->d_type != DT_WHT && dp->d_fileno != 0 &&
+		    strcmp(dp->d_name, dirname) == 0) {
 			found = 1;
 			goto out;
 		}
@@ -399,17 +400,24 @@
 vop_stdadvlock(struct vop_advlock_args *ap)
 {
 	struct vnode *vp;
-	struct ucred *cred;
 	struct vattr vattr;
 	int error;
 
 	vp = ap->a_vp;
-	cred = curthread->td_ucred;
-	vn_lock(vp, LK_SHARED | LK_RETRY);
-	error = VOP_GETATTR(vp, &vattr, cred);
-	VOP_UNLOCK(vp, 0);
-	if (error)
-		return (error);
+	if (ap->a_fl->l_whence == SEEK_END) {
+		/*
+		 * The NFSv4 server must avoid doing a vn_lock() here, since it
+		 * can deadlock the nfsd threads, due to a LOR.  Fortunately
+		 * the NFSv4 server always uses SEEK_SET and this code is
+		 * only required for the SEEK_END case.
+		 */
+		vn_lock(vp, LK_SHARED | LK_RETRY);
+		error = VOP_GETATTR(vp, &vattr, curthread->td_ucred);
+		VOP_UNLOCK(vp, 0);
+		if (error)
+			return (error);
+	} else
+		vattr.va_size = 0;
 
 	return (lf_advlock(ap, &(vp->v_lockf), vattr.va_size));
 }
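The default advisory-lock methods now fetch the file size only for requests relative to the end of the file, the only case in which lf_advlock() needs it; per the comment above, the NFSv4 server always uses SEEK_SET and so never takes the vnode lock here. For reference, a userland request that still exercises the SEEK_END path:

#include <fcntl.h>
#include <unistd.h>

/* Lock the last byte of the file: the range is resolved against the current
 * end of file, so the kernel needs the file size. */
static int
lock_last_byte(int fd)
{
    struct flock fl = {
        .l_type = F_WRLCK,
        .l_whence = SEEK_END,
        .l_start = -1,
        .l_len = 1,
    };

    return (fcntl(fd, F_SETLK, &fl));
}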
@@ -418,17 +426,19 @@
 vop_stdadvlockasync(struct vop_advlockasync_args *ap)
 {
 	struct vnode *vp;
-	struct ucred *cred;
 	struct vattr vattr;
 	int error;
 
 	vp = ap->a_vp;
-	cred = curthread->td_ucred;
-	vn_lock(vp, LK_SHARED | LK_RETRY);
-	error = VOP_GETATTR(vp, &vattr, cred);
-	VOP_UNLOCK(vp, 0);
-	if (error)
-		return (error);
+	if (ap->a_fl->l_whence == SEEK_END) {
+		/* The size argument is only needed for SEEK_END. */
+		vn_lock(vp, LK_SHARED | LK_RETRY);
+		error = VOP_GETATTR(vp, &vattr, curthread->td_ucred);
+		VOP_UNLOCK(vp, 0);
+		if (error)
+			return (error);
+	} else
+		vattr.va_size = 0;
 
 	return (lf_advlockasync(ap, &(vp->v_lockf), vattr.va_size));
 }
@@ -626,18 +636,25 @@
 vop_stdfsync(ap)
 	struct vop_fsync_args /* {
 		struct vnode *a_vp;
-		struct ucred *a_cred;
 		int a_waitfor;
 		struct thread *a_td;
 	} */ *ap;
 {
-	struct vnode *vp = ap->a_vp;
-	struct buf *bp;
+	struct vnode *vp;
+	struct buf *bp, *nbp;
 	struct bufobj *bo;
-	struct buf *nbp;
-	int error = 0;
-	int maxretry = 1000;     /* large, arbitrarily chosen */
+	struct mount *mp;
+	int error, maxretry;
 
+	error = 0;
+	maxretry = 10000;     /* large, arbitrarily chosen */
+	vp = ap->a_vp;
+	mp = NULL;
+	if (vp->v_type == VCHR) {
+		VI_LOCK(vp);
+		mp = vp->v_rdev->si_mountpt;
+		VI_UNLOCK(vp);
+	}
 	bo = &vp->v_bufobj;
 	BO_LOCK(bo);
 loop1:
@@ -662,7 +679,7 @@
 				continue;
 			if (BUF_LOCK(bp,
 			    LK_EXCLUSIVE | LK_INTERLOCK | LK_SLEEPFAIL,
-			    BO_MTX(bo)) != 0) {
+			    BO_LOCKPTR(bo)) != 0) {
 				BO_LOCK(bo);
 				goto loop1;
 			}
@@ -680,6 +697,8 @@
 			bremfree(bp);
 			bawrite(bp);
 		}
+		if (maxretry < 1000)
+			pause("dirty", hz < 1000 ? 1 : hz / 1000);
 		BO_LOCK(bo);
 		goto loop2;
 	}
@@ -701,14 +720,16 @@
 			TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
 				if ((error = bp->b_error) == 0)
 					continue;
-			if (error == 0 && --maxretry >= 0)
+			if ((mp != NULL && mp->mnt_secondary_writes > 0) ||
+			    (error == 0 && --maxretry >= 0))
 				goto loop1;
-			error = EAGAIN;
+			if (error == 0)
+				error = EAGAIN;
 		}
 	}
 	BO_UNLOCK(bo);
-	if (error == EAGAIN)
-		vprint("fsync: giving up on dirty", vp);
+	if (error != 0)
+		vn_printf(vp, "fsync: giving up on dirty (error = %d) ", error);
 
 	return (error);
 }
@@ -1017,7 +1038,7 @@
 {
 	struct vnode *vp;
 	off_t start, end;
-	int error, vfslocked;
+	int error;
 
 	vp = ap->a_vp;
 	switch (ap->a_advice) {
@@ -1038,11 +1059,9 @@
 		 * requested range.
 		 */
 		error = 0;
-		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		if (vp->v_iflag & VI_DOOMED) {
 			VOP_UNLOCK(vp, 0);
-			VFS_UNLOCK_GIANT(vfslocked);
 			break;
 		}
 		vinvalbuf(vp, V_CLEANONLY, 0, 0);
@@ -1049,13 +1068,12 @@
 		if (vp->v_object != NULL) {
 			start = trunc_page(ap->a_start);
 			end = round_page(ap->a_end);
-			VM_OBJECT_LOCK(vp->v_object);
+			VM_OBJECT_WLOCK(vp->v_object);
 			vm_object_page_cache(vp->v_object, OFF_TO_IDX(start),
 			    OFF_TO_IDX(end));
-			VM_OBJECT_UNLOCK(vp->v_object);
+			VM_OBJECT_WUNLOCK(vp->v_object);
 		}
 		VOP_UNLOCK(vp, 0);
-		VFS_UNLOCK_GIANT(vfslocked);
 		break;
 	default:
 		error = EINVAL;

Modified: trunk/sys/kern/vfs_export.c
===================================================================
--- trunk/sys/kern/vfs_export.c	2018-05-25 21:07:58 UTC (rev 9951)
+++ trunk/sys/kern/vfs_export.c	2018-05-26 14:24:52 UTC (rev 9952)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
@@ -35,7 +36,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/vfs_export.c 247116 2013-02-21 19:02:50Z jhb $");
 
 #include <sys/param.h>
 #include <sys/dirent.h>

Modified: trunk/sys/kern/vfs_extattr.c
===================================================================
--- trunk/sys/kern/vfs_extattr.c	2018-05-25 21:07:58 UTC (rev 9951)
+++ trunk/sys/kern/vfs_extattr.c	2018-05-26 14:24:52 UTC (rev 9952)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1999-2001 Robert N. M. Watson
  * All rights reserved.
@@ -27,11 +28,11 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/vfs_extattr.c 280258 2015-03-19 13:37:36Z rwatson $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
-#include <sys/capability.h>
+#include <sys/capsicum.h>
 #include <sys/lock.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
@@ -69,7 +70,7 @@
 	struct nameidata nd;
 	struct mount *mp, *mp_writable;
 	char attrname[EXTATTR_MAXNAMELEN];
-	int vfslocked, fnvfslocked, error;
+	int error;
 
 	AUDIT_ARG_CMD(uap->cmd);
 	AUDIT_ARG_VALUE(uap->attrnamespace);
@@ -85,27 +86,24 @@
 	}
 	AUDIT_ARG_TEXT(attrname);
 
-	vfslocked = fnvfslocked = 0;
 	mp = NULL;
 	filename_vp = NULL;
 	if (uap->filename != NULL) {
-		NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | AUDITVNODE2,
+		NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE2,
 		    UIO_USERSPACE, uap->filename, td);
 		error = namei(&nd);
 		if (error)
 			return (error);
-		fnvfslocked = NDHASGIANT(&nd);
 		filename_vp = nd.ni_vp;
 		NDFREE(&nd, NDF_NO_VP_RELE);
 	}
 
 	/* uap->path is always defined. */
-	NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | LOCKLEAF | AUDITVNODE1,
+	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
 	    UIO_USERSPACE, uap->path, td);
 	error = namei(&nd);
 	if (error)
 		goto out;
-	vfslocked = NDHASGIANT(&nd);
 	mp = nd.ni_vp->v_mount;
 	error = vfs_busy(mp, 0);
 	if (error) {
@@ -145,8 +143,6 @@
 	 */
 	if (filename_vp != NULL)
 		vrele(filename_vp);
-	VFS_UNLOCK_GIANT(fnvfslocked);
-	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
@@ -170,7 +166,6 @@
 	ssize_t cnt;
 	int error;
 
-	VFS_ASSERT_GIANT(vp->v_mount);
 	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
 	if (error)
 		return (error);
@@ -222,7 +217,8 @@
 {
 	struct file *fp;
 	char attrname[EXTATTR_MAXNAMELEN];
-	int vfslocked, error;
+	cap_rights_t rights;
+	int error;
 
 	AUDIT_ARG_FD(uap->fd);
 	AUDIT_ARG_VALUE(uap->attrnamespace);
@@ -231,15 +227,14 @@
 		return (error);
 	AUDIT_ARG_TEXT(attrname);
 
-	error = getvnode(td->td_proc->p_fd, uap->fd, CAP_EXTATTR_SET, &fp);
+	error = getvnode(td->td_proc->p_fd, uap->fd,
+	    cap_rights_init(&rights, CAP_EXTATTR_SET), &fp);
 	if (error)
 		return (error);
 
-	vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
 	error = extattr_set_vp(fp->f_vnode, uap->attrnamespace,
 	    attrname, uap->data, uap->nbytes, td);
 	fdrop(fp, td);
-	VFS_UNLOCK_GIANT(vfslocked);
 
 	return (error);
 }
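Throughout these syscalls the bare capability constant passed to getvnode() becomes a cap_rights_t built with cap_rights_init(); because cap_rights_init() returns the rights pointer it was given, the set can be constructed inline in the call. A condensed sketch of the pattern as it now appears at each call site:

cap_rights_t rights;
struct file *fp;
int error;

/* cap_rights_init() fills 'rights' and returns &rights, so it can be
 * handed straight to getvnode(). */
error = getvnode(td->td_proc->p_fd, uap->fd,
    cap_rights_init(&rights, CAP_EXTATTR_SET), &fp);
if (error)
    return (error);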
@@ -257,7 +252,7 @@
 {
 	struct nameidata nd;
 	char attrname[EXTATTR_MAXNAMELEN];
-	int vfslocked, error;
+	int error;
 
 	AUDIT_ARG_VALUE(uap->attrnamespace);
 	error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
@@ -265,7 +260,7 @@
 		return (error);
 	AUDIT_ARG_TEXT(attrname);
 
-	NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | AUDITVNODE1, UIO_USERSPACE,
+	NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE,
 	    uap->path, td);
 	error = namei(&nd);
 	if (error)
@@ -272,12 +267,10 @@
 		return (error);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 
-	vfslocked = NDHASGIANT(&nd);
 	error = extattr_set_vp(nd.ni_vp, uap->attrnamespace, attrname,
 	    uap->data, uap->nbytes, td);
 
 	vrele(nd.ni_vp);
-	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
@@ -294,7 +287,7 @@
 {
 	struct nameidata nd;
 	char attrname[EXTATTR_MAXNAMELEN];
-	int vfslocked, error;
+	int error;
 
 	AUDIT_ARG_VALUE(uap->attrnamespace);
 	error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
@@ -302,7 +295,7 @@
 		return (error);
 	AUDIT_ARG_TEXT(attrname);
 
-	NDINIT(&nd, LOOKUP, MPSAFE | NOFOLLOW | AUDITVNODE1, UIO_USERSPACE,
+	NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, UIO_USERSPACE,
 	    uap->path, td);
 	error = namei(&nd);
 	if (error)
@@ -309,12 +302,10 @@
 		return (error);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 
-	vfslocked = NDHASGIANT(&nd);
 	error = extattr_set_vp(nd.ni_vp, uap->attrnamespace, attrname,
 	    uap->data, uap->nbytes, td);
 
 	vrele(nd.ni_vp);
-	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
@@ -338,7 +329,6 @@
 	size_t size, *sizep;
 	int error;
 
-	VFS_ASSERT_GIANT(vp->v_mount);
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 
 	/*
@@ -402,7 +392,8 @@
 {
 	struct file *fp;
 	char attrname[EXTATTR_MAXNAMELEN];
-	int vfslocked, error;
+	cap_rights_t rights;
+	int error;
 
 	AUDIT_ARG_FD(uap->fd);
 	AUDIT_ARG_VALUE(uap->attrnamespace);
@@ -411,16 +402,15 @@
 		return (error);
 	AUDIT_ARG_TEXT(attrname);
 
-	error = getvnode(td->td_proc->p_fd, uap->fd, CAP_EXTATTR_GET, &fp);
+	error = getvnode(td->td_proc->p_fd, uap->fd,
+	    cap_rights_init(&rights, CAP_EXTATTR_GET), &fp);
 	if (error)
 		return (error);
 
-	vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
 	error = extattr_get_vp(fp->f_vnode, uap->attrnamespace,
 	    attrname, uap->data, uap->nbytes, td);
 
 	fdrop(fp, td);
-	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
@@ -437,7 +427,7 @@
 {
 	struct nameidata nd;
 	char attrname[EXTATTR_MAXNAMELEN];
-	int vfslocked, error;
+	int error;
 
 	AUDIT_ARG_VALUE(uap->attrnamespace);
 	error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
@@ -445,19 +435,16 @@
 		return (error);
 	AUDIT_ARG_TEXT(attrname);
 
-	NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | AUDITVNODE1, UIO_USERSPACE,
-	    uap->path, td);
+	NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td);
 	error = namei(&nd);
 	if (error)
 		return (error);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 
-	vfslocked = NDHASGIANT(&nd);
 	error = extattr_get_vp(nd.ni_vp, uap->attrnamespace, attrname,
 	    uap->data, uap->nbytes, td);
 
 	vrele(nd.ni_vp);
-	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
@@ -474,7 +461,7 @@
 {
 	struct nameidata nd;
 	char attrname[EXTATTR_MAXNAMELEN];
-	int vfslocked, error;
+	int error;
 
 	AUDIT_ARG_VALUE(uap->attrnamespace);
 	error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
@@ -482,19 +469,17 @@
 		return (error);
 	AUDIT_ARG_TEXT(attrname);
 
-	NDINIT(&nd, LOOKUP, MPSAFE | NOFOLLOW | AUDITVNODE1, UIO_USERSPACE,
-	    uap->path, td);
+	NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path,
+	    td);
 	error = namei(&nd);
 	if (error)
 		return (error);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 
-	vfslocked = NDHASGIANT(&nd);
 	error = extattr_get_vp(nd.ni_vp, uap->attrnamespace, attrname,
 	    uap->data, uap->nbytes, td);
 
 	vrele(nd.ni_vp);
-	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
@@ -515,7 +500,6 @@
 	struct mount *mp;
 	int error;
 
-	VFS_ASSERT_GIANT(vp->v_mount);
 	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
 	if (error)
 		return (error);
@@ -552,7 +536,8 @@
 {
 	struct file *fp;
 	char attrname[EXTATTR_MAXNAMELEN];
-	int vfslocked, error;
+	cap_rights_t rights;
+	int error;
 
 	AUDIT_ARG_FD(uap->fd);
 	AUDIT_ARG_VALUE(uap->attrnamespace);
@@ -561,16 +546,14 @@
 		return (error);
 	AUDIT_ARG_TEXT(attrname);
 
-	error = getvnode(td->td_proc->p_fd, uap->fd, CAP_EXTATTR_DELETE,
-	    &fp);
+	error = getvnode(td->td_proc->p_fd, uap->fd,
+	    cap_rights_init(&rights, CAP_EXTATTR_DELETE), &fp);
 	if (error)
 		return (error);
 
-	vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
 	error = extattr_delete_vp(fp->f_vnode, uap->attrnamespace,
 	    attrname, td);
 	fdrop(fp, td);
-	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
@@ -585,7 +568,7 @@
 {
 	struct nameidata nd;
 	char attrname[EXTATTR_MAXNAMELEN];
-	int vfslocked, error;
+	int error;
 
 	AUDIT_ARG_VALUE(uap->attrnamespace);
 	error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
@@ -593,17 +576,14 @@
 		return(error);
 	AUDIT_ARG_TEXT(attrname);
 
-	NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | AUDITVNODE1, UIO_USERSPACE,
-	    uap->path, td);
+	NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td);
 	error = namei(&nd);
 	if (error)
 		return(error);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 
-	vfslocked = NDHASGIANT(&nd);
 	error = extattr_delete_vp(nd.ni_vp, uap->attrnamespace, attrname, td);
 	vrele(nd.ni_vp);
-	VFS_UNLOCK_GIANT(vfslocked);
 	return(error);
 }
 
@@ -618,7 +598,7 @@
 {
 	struct nameidata nd;
 	char attrname[EXTATTR_MAXNAMELEN];
-	int vfslocked, error;
+	int error;
 
 	AUDIT_ARG_VALUE(uap->attrnamespace);
 	error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
@@ -626,17 +606,14 @@
 		return(error);
 	AUDIT_ARG_TEXT(attrname);
 
-	NDINIT(&nd, LOOKUP, MPSAFE | NOFOLLOW | AUDITVNODE1, UIO_USERSPACE,
-	    uap->path, td);
+	NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td);
 	error = namei(&nd);
 	if (error)
 		return(error);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 
-	vfslocked = NDHASGIANT(&nd);
 	error = extattr_delete_vp(nd.ni_vp, uap->attrnamespace, attrname, td);
 	vrele(nd.ni_vp);
-	VFS_UNLOCK_GIANT(vfslocked);
 	return(error);
 }
 
@@ -660,7 +637,6 @@
 	ssize_t cnt;
 	int error;
 
-	VFS_ASSERT_GIANT(vp->v_mount);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 
 	auiop = NULL;
@@ -717,20 +693,20 @@
 	} */ *uap;
 {
 	struct file *fp;
-	int vfslocked, error;
+	cap_rights_t rights;
+	int error;
 
 	AUDIT_ARG_FD(uap->fd);
 	AUDIT_ARG_VALUE(uap->attrnamespace);
-	error = getvnode(td->td_proc->p_fd, uap->fd, CAP_EXTATTR_LIST, &fp);
+	error = getvnode(td->td_proc->p_fd, uap->fd,
+	    cap_rights_init(&rights, CAP_EXTATTR_LIST), &fp);
 	if (error)
 		return (error);
 
-	vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
 	error = extattr_list_vp(fp->f_vnode, uap->attrnamespace, uap->data,
 	    uap->nbytes, td);
 
 	fdrop(fp, td);
-	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
@@ -745,22 +721,19 @@
 	} */ *uap;
 {
 	struct nameidata nd;
-	int vfslocked, error;
+	int error;
 
 	AUDIT_ARG_VALUE(uap->attrnamespace);
-	NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | AUDITVNODE1, UIO_USERSPACE,
-	    uap->path, td);
+	NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td);
 	error = namei(&nd);
 	if (error)
 		return (error);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 
-	vfslocked = NDHASGIANT(&nd);
 	error = extattr_list_vp(nd.ni_vp, uap->attrnamespace, uap->data,
 	    uap->nbytes, td);
 
 	vrele(nd.ni_vp);
-	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
@@ -775,21 +748,19 @@
 	} */ *uap;
 {
 	struct nameidata nd;
-	int vfslocked, error;
+	int error;
 
 	AUDIT_ARG_VALUE(uap->attrnamespace);
-	NDINIT(&nd, LOOKUP, MPSAFE | NOFOLLOW | AUDITVNODE1, UIO_USERSPACE,
-	    uap->path, td);
+	NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path,
+	    td);
 	error = namei(&nd);
 	if (error)
 		return (error);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 
-	vfslocked = NDHASGIANT(&nd);
 	error = extattr_list_vp(nd.ni_vp, uap->attrnamespace, uap->data,
 	    uap->nbytes, td);
 
 	vrele(nd.ni_vp);
-	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }

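The extattr system calls above now build an explicit cap_rights_t with cap_rights_init() instead of handing a bare CAP_* constant to getvnode().  As a rough userland illustration of the same Capsicum rights model (a sketch only, not part of this change; the file path and attribute name are placeholders), a descriptor can be limited so that only plain reads and extended-attribute reads remain allowed:

#include <sys/types.h>
#include <sys/capsicum.h>
#include <sys/extattr.h>
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	cap_rights_t rights;
	char buf[64];
	ssize_t n;
	int fd;

	fd = open("somefile", O_RDONLY);	/* placeholder path */
	if (fd < 0)
		err(1, "open");

	/* Leave only plain reads and extattr reads on this descriptor. */
	cap_rights_init(&rights, CAP_READ, CAP_EXTATTR_GET);
	if (cap_rights_limit(fd, &rights) < 0 && errno != ENOSYS)
		err(1, "cap_rights_limit");
	if (cap_enter() < 0 && errno != ENOSYS)
		err(1, "cap_enter");

	/* Still allowed: the descriptor carries CAP_EXTATTR_GET. */
	n = extattr_get_fd(fd, EXTATTR_NAMESPACE_USER, "checksum",
	    buf, sizeof(buf));
	if (n < 0)
		warn("extattr_get_fd");
	else
		printf("got %zd bytes of attribute data\n", n);

	close(fd);
	return (0);
}

The ENOSYS checks let the sketch run on kernels built without Capsicum support; on a capability-enabled kernel, any operation not covered by the granted rights fails with ENOTCAPABLE.
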
Modified: trunk/sys/kern/vfs_hash.c
===================================================================
--- trunk/sys/kern/vfs_hash.c	2018-05-25 21:07:58 UTC (rev 9951)
+++ trunk/sys/kern/vfs_hash.c	2018-05-26 14:24:52 UTC (rev 9952)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 2005 Poul-Henning Kamp
  * All rights reserved.
@@ -26,7 +27,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/vfs_hash.c 300140 2016-05-18 11:58:16Z kib $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -69,7 +70,8 @@
 }
 
 int
-vfs_hash_get(const struct mount *mp, u_int hash, int flags, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg)
+vfs_hash_get(const struct mount *mp, u_int hash, int flags, struct thread *td,
+    struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg)
 {
 	struct vnode *vp;
 	int error;
@@ -102,6 +104,36 @@
 }
 
 void
+vfs_hash_ref(const struct mount *mp, u_int hash, struct thread *td,
+    struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg)
+{
+	struct vnode *vp;
+
+	while (1) {
+		mtx_lock(&vfs_hash_mtx);
+		LIST_FOREACH(vp, vfs_hash_bucket(mp, hash), v_hashlist) {
+			if (vp->v_hash != hash)
+				continue;
+			if (vp->v_mount != mp)
+				continue;
+			if (fn != NULL && fn(vp, arg))
+				continue;
+			vhold(vp);
+			mtx_unlock(&vfs_hash_mtx);
+			vref(vp);
+			vdrop(vp);
+			*vpp = vp;
+			return;
+		}
+		if (vp == NULL) {
+			mtx_unlock(&vfs_hash_mtx);
+			*vpp = NULL;
+			return;
+		}
+	}
+}
+
+void
 vfs_hash_remove(struct vnode *vp)
 {
 
@@ -111,7 +143,8 @@
 }
 
 int
-vfs_hash_insert(struct vnode *vp, u_int hash, int flags, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg)
+vfs_hash_insert(struct vnode *vp, u_int hash, int flags, struct thread *td,
+    struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg)
 {
 	struct vnode *vp2;
 	int error;
@@ -160,3 +193,40 @@
 	vp->v_hash = hash;
 	mtx_unlock(&vfs_hash_mtx);
 }
+
+void
+vfs_hash_changesize(int newmaxvnodes)
+{
+	struct vfs_hash_head *vfs_hash_newtbl, *vfs_hash_oldtbl;
+	u_long vfs_hash_newmask, vfs_hash_oldmask;
+	struct vnode *vp;
+	int i;
+
+	vfs_hash_newtbl = hashinit(newmaxvnodes, M_VFS_HASH,
+		&vfs_hash_newmask);
+	/* If same hash table size, nothing to do */
+	if (vfs_hash_mask == vfs_hash_newmask) {
+		free(vfs_hash_newtbl, M_VFS_HASH);
+		return;
+	}
+	/*
+	 * Move everything from the old hash table to the new table.
+	 * None of the vnodes in the table can be recycled because to
+	 * do so, they have to be removed from the hash table.
+	 */
+	mtx_lock(&vfs_hash_mtx);
+	vfs_hash_oldtbl = vfs_hash_tbl;
+	vfs_hash_oldmask = vfs_hash_mask;
+	vfs_hash_tbl = vfs_hash_newtbl;
+	vfs_hash_mask = vfs_hash_newmask;
+	for (i = 0; i <= vfs_hash_oldmask; i++) {
+		while ((vp = LIST_FIRST(&vfs_hash_oldtbl[i])) != NULL) {
+			LIST_REMOVE(vp, v_hashlist);
+			LIST_INSERT_HEAD(
+			    vfs_hash_bucket(vp->v_mount, vp->v_hash),
+			    vp, v_hashlist);
+		}
+	}
+	mtx_unlock(&vfs_hash_mtx);
+	free(vfs_hash_oldtbl, M_VFS_HASH);
+}

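vfs_hash_changesize() above allocates a new bucket array, swaps table and mask under vfs_hash_mtx, and rehashes every vnode out of the old buckets.  A minimal single-threaded userland sketch of that move-everything rehash (names and demo data are made up for illustration; the kernel version additionally holds vfs_hash_mtx across the swap):

#include <sys/queue.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
	LIST_ENTRY(node) link;
	unsigned key;
};
LIST_HEAD(bucket, node);

static struct bucket *tbl;
static unsigned long mask;		/* table has mask + 1 buckets */

static struct bucket *
bucket_of(unsigned key)
{

	return (&tbl[key & mask]);
}

/* Swap in a new bucket array and rehash every node, changesize-style. */
static void
resize(unsigned long newsize)
{
	struct bucket *oldtbl, *newtbl;
	unsigned long oldmask, i;
	struct node *n;

	/* calloc() returns zeroed heads, which is a valid empty LIST_HEAD. */
	newtbl = calloc(newsize, sizeof(*newtbl));
	if (newtbl == NULL)
		return;			/* keep the old table on failure */

	oldtbl = tbl;
	oldmask = mask;
	tbl = newtbl;
	mask = newsize - 1;
	for (i = 0; i <= oldmask; i++) {
		while ((n = LIST_FIRST(&oldtbl[i])) != NULL) {
			LIST_REMOVE(n, link);
			LIST_INSERT_HEAD(bucket_of(n->key), n, link);
		}
	}
	free(oldtbl);
}

int
main(void)
{
	static struct node nodes[8];
	unsigned long i;

	tbl = calloc(4, sizeof(*tbl));
	if (tbl == NULL)
		return (1);
	mask = 3;
	for (i = 0; i < 8; i++) {
		nodes[i].key = (unsigned)(i * 7);
		LIST_INSERT_HEAD(bucket_of(nodes[i].key), &nodes[i], link);
	}
	resize(16);	/* every node moves to its new bucket */
	for (i = 0; i <= mask; i++)
		printf("bucket %2lu: %s\n", i,
		    LIST_EMPTY(&tbl[i]) ? "empty" : "occupied");
	return (0);
}
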
Modified: trunk/sys/kern/vfs_init.c
===================================================================
--- trunk/sys/kern/vfs_init.c	2018-05-25 21:07:58 UTC (rev 9951)
+++ trunk/sys/kern/vfs_init.c	2018-05-26 14:24:52 UTC (rev 9952)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
@@ -35,7 +36,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/vfs_init.c 284021 2015-06-05 08:36:25Z kib $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -44,6 +45,7 @@
 #include <sys/linker.h>
 #include <sys/mount.h>
 #include <sys/proc.h>
+#include <sys/sx.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/vnode.h>
@@ -64,6 +66,8 @@
  * New entries are added/deleted by vfs_register()/vfs_unregister()
  */
 struct vfsconfhead vfsconf = TAILQ_HEAD_INITIALIZER(vfsconf);
+struct sx vfsconf_sx;
+SX_SYSINIT(vfsconf, &vfsconf_sx, "vfsconf");
 
 /*
  * Loader.conf variable vfs.typenumhash enables setting vfc_typenum using a hash
@@ -105,20 +109,33 @@
  * Routines having to do with the management of the vnode table.
  */
 
-struct vfsconf *
-vfs_byname(const char *name)
+static struct vfsconf *
+vfs_byname_locked(const char *name)
 {
 	struct vfsconf *vfsp;
 
+	sx_assert(&vfsconf_sx, SA_LOCKED);
 	if (!strcmp(name, "ffs"))
 		name = "ufs";
-	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list)
+	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
 		if (!strcmp(name, vfsp->vfc_name))
 			return (vfsp);
+	}
 	return (NULL);
 }
 
 struct vfsconf *
+vfs_byname(const char *name)
+{
+	struct vfsconf *vfsp;
+
+	vfsconf_slock();
+	vfsp = vfs_byname_locked(name);
+	vfsconf_sunlock();
+	return (vfsp);
+}
+
+struct vfsconf *
 vfs_byname_kld(const char *fstype, struct thread *td, int *error)
 {
 	struct vfsconf *vfsp;
@@ -169,8 +186,11 @@
 		    vfc->vfc_name, vfc->vfc_version);
 		return (EINVAL);
 	}
-	if (vfs_byname(vfc->vfc_name) != NULL)
-		return EEXIST;
+	vfsconf_lock();
+	if (vfs_byname_locked(vfc->vfc_name) != NULL) {
+		vfsconf_unlock();
+		return (EEXIST);
+	}
 
 	if (vfs_typenumhash != 0) {
 		/*
@@ -203,26 +223,6 @@
 	TAILQ_INSERT_TAIL(&vfsconf, vfc, vfc_list);
 
 	/*
-	 * If this filesystem has a sysctl node under vfs
-	 * (i.e. vfs.xxfs), then change the oid number of that node to 
-	 * match the filesystem's type number.  This allows user code
-	 * which uses the type number to read sysctl variables defined
-	 * by the filesystem to continue working. Since the oids are
-	 * in a sorted list, we need to make sure the order is
-	 * preserved by re-registering the oid after modifying its
-	 * number.
-	 */
-	sysctl_lock();
-	SLIST_FOREACH(oidp, &sysctl__vfs_children, oid_link)
-		if (strcmp(oidp->oid_name, vfc->vfc_name) == 0) {
-			sysctl_unregister_oid(oidp);
-			oidp->oid_number = vfc->vfc_typenum;
-			sysctl_register_oid(oidp);
-			break;
-		}
-	sysctl_unlock();
-
-	/*
 	 * Initialise unused ``struct vfsops'' fields, to use
 	 * the vfs_std*() functions.  Note, we need the mount
 	 * and unmount operations, at the least.  The check
@@ -281,8 +281,30 @@
 	 * Call init function for this VFS...
 	 */
 	(*(vfc->vfc_vfsops->vfs_init))(vfc);
+	vfsconf_unlock();
 
-	return 0;
+	/*
+	 * If this filesystem has a sysctl node under vfs
+	 * (i.e. vfs.xxfs), then change the oid number of that node to
+	 * match the filesystem's type number.  This allows user code
+	 * which uses the type number to read sysctl variables defined
+	 * by the filesystem to continue working. Since the oids are
+	 * in a sorted list, we need to make sure the order is
+	 * preserved by re-registering the oid after modifying its
+	 * number.
+	 */
+	sysctl_lock();
+	SLIST_FOREACH(oidp, &sysctl__vfs_children, oid_link) {
+		if (strcmp(oidp->oid_name, vfc->vfc_name) == 0) {
+			sysctl_unregister_oid(oidp);
+			oidp->oid_number = vfc->vfc_typenum;
+			sysctl_register_oid(oidp);
+			break;
+		}
+	}
+	sysctl_unlock();
+
+	return (0);
 }
 
 
@@ -291,19 +313,24 @@
 vfs_unregister(struct vfsconf *vfc)
 {
 	struct vfsconf *vfsp;
-	int error, i, maxtypenum;
+	int error, maxtypenum;
 
-	i = vfc->vfc_typenum;
-
-	vfsp = vfs_byname(vfc->vfc_name);
-	if (vfsp == NULL)
-		return EINVAL;
-	if (vfsp->vfc_refcount)
-		return EBUSY;
+	vfsconf_lock();
+	vfsp = vfs_byname_locked(vfc->vfc_name);
+	if (vfsp == NULL) {
+		vfsconf_unlock();
+		return (EINVAL);
+	}
+	if (vfsp->vfc_refcount != 0) {
+		vfsconf_unlock();
+		return (EBUSY);
+	}
 	if (vfc->vfc_vfsops->vfs_uninit != NULL) {
 		error = (*vfc->vfc_vfsops->vfs_uninit)(vfsp);
-		if (error)
+		if (error != 0) {
+			vfsconf_unlock();
 			return (error);
+		}
 	}
 	TAILQ_REMOVE(&vfsconf, vfsp, vfc_list);
 	maxtypenum = VFS_GENERIC;
@@ -311,7 +338,8 @@
 		if (maxtypenum < vfsp->vfc_typenum)
 			maxtypenum = vfsp->vfc_typenum;
 	maxvfsconf = maxtypenum + 1;
-	return 0;
+	vfsconf_unlock();
+	return (0);
 }
 
 /*

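The vfs_init.c changes split the lookup into vfs_byname_locked() plus a public wrapper, with vfsconf_sx serializing registration so the existence check and the list insert happen atomically.  A small userland sketch of that locked-helper pattern, using a pthread rwlock in place of the kernel sx lock (all names here are illustrative only):

#include <sys/queue.h>
#include <pthread.h>
#include <stdio.h>
#include <string.h>

struct fsconf {
	TAILQ_ENTRY(fsconf) link;
	const char *name;
};
static TAILQ_HEAD(, fsconf) conflist = TAILQ_HEAD_INITIALIZER(conflist);
static pthread_rwlock_t conflock = PTHREAD_RWLOCK_INITIALIZER;

/* Internal lookup; caller must hold conflock (shared or exclusive). */
static struct fsconf *
byname_locked(const char *name)
{
	struct fsconf *fc;

	TAILQ_FOREACH(fc, &conflist, link) {
		if (strcmp(name, fc->name) == 0)
			return (fc);
	}
	return (NULL);
}

/* Public lookup: shared lock, like vfs_byname(). */
static struct fsconf *
byname(const char *name)
{
	struct fsconf *fc;

	pthread_rwlock_rdlock(&conflock);
	fc = byname_locked(name);
	pthread_rwlock_unlock(&conflock);
	return (fc);
}

/* Registration: exclusive lock so the duplicate check and insert are atomic. */
static int
registerfs(struct fsconf *fc)
{

	pthread_rwlock_wrlock(&conflock);
	if (byname_locked(fc->name) != NULL) {
		pthread_rwlock_unlock(&conflock);
		return (-1);		/* EEXIST */
	}
	TAILQ_INSERT_TAIL(&conflist, fc, link);
	pthread_rwlock_unlock(&conflock);
	return (0);
}

int
main(void)
{
	static struct fsconf ufs = { .name = "ufs" };

	registerfs(&ufs);
	printf("ufs %sregistered\n", byname("ufs") != NULL ? "" : "not ");
	printf("duplicate register: %d\n", registerfs(&ufs));
	return (0);
}
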
Modified: trunk/sys/kern/vfs_lookup.c
===================================================================
--- trunk/sys/kern/vfs_lookup.c	2018-05-25 21:07:58 UTC (rev 9951)
+++ trunk/sys/kern/vfs_lookup.c	2018-05-26 14:24:52 UTC (rev 9952)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
@@ -35,7 +36,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/vfs_lookup.c 308469 2016-11-09 17:07:45Z kib $");
 
 #include "opt_capsicum.h"
 #include "opt_kdtrace.h"
@@ -44,7 +45,7 @@
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
-#include <sys/capability.h>
+#include <sys/capsicum.h>
 #include <sys/fcntl.h>
 #include <sys/jail.h>
 #include <sys/lock.h>
@@ -70,9 +71,9 @@
 #undef NAMEI_DIAGNOSTIC
 
 SDT_PROVIDER_DECLARE(vfs);
-SDT_PROBE_DEFINE3(vfs, namei, lookup, entry, entry, "struct vnode *", "char *",
+SDT_PROBE_DEFINE3(vfs, namei, lookup, entry, "struct vnode *", "char *",
     "unsigned long");
-SDT_PROBE_DEFINE2(vfs, namei, lookup, return, return, "int", "struct vnode *");
+SDT_PROBE_DEFINE2(vfs, namei, lookup, return, "int", "struct vnode *");
 
 /*
  * Allocation zone for namei
@@ -143,10 +144,7 @@
 	struct componentname *cnp = &ndp->ni_cnd;
 	struct thread *td = cnp->cn_thread;
 	struct proc *p = td->td_proc;
-	int vfslocked;
 
-	KASSERT((cnp->cn_flags & MPSAFE) != 0 || mtx_owned(&Giant) != 0,
-	    ("NOT MPSAFE and Giant not held"));
 	ndp->ni_cnd.cn_cred = ndp->ni_cnd.cn_thread->td_ucred;
 	KASSERT(cnp->cn_cred && p, ("namei: bad cred/proc"));
 	KASSERT((cnp->cn_nameiop & (~OPMASK)) == 0,
@@ -167,11 +165,11 @@
 	if ((cnp->cn_flags & HASBUF) == 0)
 		cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK);
 	if (ndp->ni_segflg == UIO_SYSSPACE)
-		error = copystr(ndp->ni_dirp, cnp->cn_pnbuf,
-			    MAXPATHLEN, (size_t *)&ndp->ni_pathlen);
+		error = copystr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN,
+		    &ndp->ni_pathlen);
 	else
-		error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf,
-			    MAXPATHLEN, (size_t *)&ndp->ni_pathlen);
+		error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN,
+		    &ndp->ni_pathlen);
 
 	/*
 	 * Don't allow empty pathnames.
@@ -185,10 +183,16 @@
 	 * not an absolute path, and not containing '..' components) to
 	 * a real file descriptor, not the pseudo-descriptor AT_FDCWD.
 	 */
-	if (IN_CAPABILITY_MODE(td)) {
+	if (error == 0 && IN_CAPABILITY_MODE(td) &&
+	    (cnp->cn_flags & NOCAPCHECK) == 0) {
 		ndp->ni_strictrelative = 1;
-		if (ndp->ni_dirfd == AT_FDCWD)
+		if (ndp->ni_dirfd == AT_FDCWD) {
+#ifdef KTRACE
+			if (KTRPOINT(td, KTR_CAPFAIL))
+				ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
+#endif
 			error = ECAPMODE;
+		}
 	}
 #endif
 	if (error) {
@@ -225,31 +229,36 @@
 			dp = ndp->ni_startdir;
 			error = 0;
 		} else if (ndp->ni_dirfd != AT_FDCWD) {
+			cap_rights_t rights;
+
+			rights = ndp->ni_rightsneeded;
+			cap_rights_set(&rights, CAP_LOOKUP);
+
 			if (cnp->cn_flags & AUDITVNODE1)
 				AUDIT_ARG_ATFD1(ndp->ni_dirfd);
 			if (cnp->cn_flags & AUDITVNODE2)
 				AUDIT_ARG_ATFD2(ndp->ni_dirfd);
 			error = fgetvp_rights(td, ndp->ni_dirfd,
-			    ndp->ni_rightsneeded | CAP_LOOKUP,
-			    &(ndp->ni_baserights), &dp);
+			    &rights, &ndp->ni_filecaps, &dp);
 #ifdef CAPABILITIES
 			/*
-			 * Lookups relative to a capability must also be
+			 * If file descriptor doesn't have all rights,
+			 * all lookups relative to it must also be
 			 * strictly relative.
-			 *
-			 * Note that a capability with rights CAP_MASK_VALID
-			 * is treated exactly like a regular file descriptor.
 			 */
-			if (ndp->ni_baserights != CAP_MASK_VALID)
+			CAP_ALL(&rights);
+			if (!cap_rights_contains(&ndp->ni_filecaps.fc_rights,
+			    &rights) ||
+			    ndp->ni_filecaps.fc_fcntls != CAP_FCNTL_ALL ||
+			    ndp->ni_filecaps.fc_nioctls != -1) {
 				ndp->ni_strictrelative = 1;
+			}
 #endif
 		}
 		if (error != 0 || dp != NULL) {
 			FILEDESC_SUNLOCK(fdp);
 			if (error == 0 && dp->v_type != VDIR) {
-				vfslocked = VFS_LOCK_GIANT(dp->v_mount);
 				vrele(dp);
-				VFS_UNLOCK_GIANT(vfslocked);
 				error = ENOTDIR;
 			}
 		}
@@ -262,15 +271,11 @@
 		dp = fdp->fd_cdir;
 		VREF(dp);
 		FILEDESC_SUNLOCK(fdp);
-		if (ndp->ni_startdir != NULL) {
-			vfslocked = VFS_LOCK_GIANT(ndp->ni_startdir->v_mount);
+		if (ndp->ni_startdir != NULL)
 			vrele(ndp->ni_startdir);
-			VFS_UNLOCK_GIANT(vfslocked);
-		}
 	}
-	SDT_PROBE(vfs, namei, lookup, entry, dp, cnp->cn_pnbuf,
-	    cnp->cn_flags, 0, 0);
-	vfslocked = VFS_LOCK_GIANT(dp->v_mount);
+	SDT_PROBE3(vfs, namei, lookup, entry, dp, cnp->cn_pnbuf,
+	    cnp->cn_flags);
 	for (;;) {
 		/*
 		 * Check if root directory should replace current directory.
@@ -279,8 +284,11 @@
 		cnp->cn_nameptr = cnp->cn_pnbuf;
 		if (*(cnp->cn_nameptr) == '/') {
 			vrele(dp);
-			VFS_UNLOCK_GIANT(vfslocked);
 			if (ndp->ni_strictrelative != 0) {
+#ifdef KTRACE
+				if (KTRPOINT(curthread, KTR_CAPFAIL))
+					ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
+#endif
 				namei_cleanup_cnp(cnp);
 				return (ENOTCAPABLE);
 			}
@@ -289,21 +297,15 @@
 				ndp->ni_pathlen--;
 			}
 			dp = ndp->ni_rootdir;
-			vfslocked = VFS_LOCK_GIANT(dp->v_mount);
 			VREF(dp);
 		}
-		if (vfslocked)
-			ndp->ni_cnd.cn_flags |= GIANTHELD;
 		ndp->ni_startdir = dp;
 		error = lookup(ndp);
 		if (error) {
 			namei_cleanup_cnp(cnp);
-			SDT_PROBE(vfs, namei, lookup, return, error, NULL, 0,
-			    0, 0);
+			SDT_PROBE2(vfs, namei, lookup, return, error, NULL);
 			return (error);
 		}
-		vfslocked = (ndp->ni_cnd.cn_flags & GIANTHELD) != 0;
-		ndp->ni_cnd.cn_flags &= ~GIANTHELD;
 		/*
 		 * If not a symbolic link, we're done.
 		 */
@@ -313,12 +315,7 @@
 			} else
 				cnp->cn_flags |= HASBUF;
 
-			if ((cnp->cn_flags & MPSAFE) == 0) {
-				VFS_UNLOCK_GIANT(vfslocked);
-			} else if (vfslocked)
-				ndp->ni_cnd.cn_flags |= GIANTHELD;
-			SDT_PROBE(vfs, namei, lookup, return, 0, ndp->ni_vp,
-			    0, 0, 0);
+			SDT_PROBE2(vfs, namei, lookup, return, 0, ndp->ni_vp);
 			return (0);
 		}
 		if (ndp->ni_loopcnt++ >= MAXSYMLINKS) {
@@ -379,8 +376,7 @@
 	vput(ndp->ni_vp);
 	ndp->ni_vp = NULL;
 	vrele(ndp->ni_dvp);
-	VFS_UNLOCK_GIANT(vfslocked);
-	SDT_PROBE(vfs, namei, lookup, return, error, NULL, 0, 0, 0);
+	SDT_PROBE2(vfs, namei, lookup, return, error, NULL);
 	return (error);
 }
 
@@ -395,6 +391,7 @@
 		lkflags &= ~LK_SHARED;
 		lkflags |= LK_EXCLUSIVE;
 	}
+	lkflags |= LK_NODDLKTREAT;
 	return (lkflags);
 }
 
@@ -418,13 +415,8 @@
 	 * extended shared operations, then use a shared lock for the
 	 * leaf node, otherwise use an exclusive lock.
 	 */
-	if (flags & ISOPEN) {
-		if (mp != NULL &&
-		    (mp->mnt_kern_flag & MNTK_EXTENDED_SHARED))
-			return (0);
-		else
-			return (1);
-	}
+	if ((flags & ISOPEN) != 0)
+		return (!MNT_EXTENDED_SHARED(mp));
 
 	/*
 	 * Lookup requests outside of open() that specify LOCKSHARED
@@ -485,9 +477,6 @@
 	int error = 0;
 	int dpunlocked = 0;		/* dp has already been unlocked */
 	struct componentname *cnp = &ndp->ni_cnd;
-	int vfslocked;			/* VFS Giant state for child */
-	int dvfslocked;			/* VFS Giant state for parent */
-	int tvfslocked;
 	int lkflags_save;
 	int ni_dvp_unlocked;
 	
@@ -494,10 +483,7 @@
 	/*
 	 * Setup: break out flag bits into variables.
 	 */
-	dvfslocked = (ndp->ni_cnd.cn_flags & GIANTHELD) != 0;
-	vfslocked = 0;
 	ni_dvp_unlocked = 0;
-	ndp->ni_cnd.cn_flags &= ~GIANTHELD;
 	wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT);
 	KASSERT(cnp->cn_nameiop == LOOKUP || wantparent,
 	    ("CREATE, DELETE, RENAME require LOCKPARENT or WANTPARENT."));
@@ -638,6 +624,10 @@
 	 */
 	if (cnp->cn_flags & ISDOTDOT) {
 		if (ndp->ni_strictrelative != 0) {
+#ifdef KTRACE
+			if (KTRPOINT(curthread, KTR_CAPFAIL))
+				ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
+#endif
 			error = ENOTCAPABLE;
 			goto bad;
 		}
@@ -659,7 +649,6 @@
 			     (cnp->cn_flags & NOCROSSMOUNT) != 0)) {
 				ndp->ni_dvp = dp;
 				ndp->ni_vp = dp;
-				vfslocked = VFS_LOCK_GIANT(dp->v_mount);
 				VREF(dp);
 				goto nextname;
 			}
@@ -671,11 +660,8 @@
 			}
 			tdp = dp;
 			dp = dp->v_mount->mnt_vnodecovered;
-			tvfslocked = dvfslocked;
-			dvfslocked = VFS_LOCK_GIANT(dp->v_mount);
 			VREF(dp);
 			vput(tdp);
-			VFS_UNLOCK_GIANT(tvfslocked);
 			vn_lock(dp,
 			    compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags |
 			    LK_RETRY, ISDOTDOT));
@@ -697,7 +683,6 @@
 	ndp->ni_dvp = dp;
 	ndp->ni_vp = NULL;
 	ASSERT_VOP_LOCKED(dp, "lookup");
-	VNASSERT(vfslocked == 0, dp, ("lookup: vfslocked %d", vfslocked));
 	/*
 	 * If we have a shared lock we may need to upgrade the lock for the
 	 * last operation.
@@ -733,11 +718,8 @@
 		    (dp->v_mount->mnt_flag & MNT_UNION)) {
 			tdp = dp;
 			dp = dp->v_mount->mnt_vnodecovered;
-			tvfslocked = dvfslocked;
-			dvfslocked = VFS_LOCK_GIANT(dp->v_mount);
 			VREF(dp);
 			vput(tdp);
-			VFS_UNLOCK_GIANT(tvfslocked);
 			vn_lock(dp,
 			    compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags |
 			    LK_RETRY, cnp->cn_flags));
@@ -791,7 +773,6 @@
 	}
 
 	dp = ndp->ni_vp;
-	vfslocked = VFS_LOCK_GIANT(dp->v_mount);
 
 	/*
 	 * Check to see if the vnode has been mounted on;
@@ -802,14 +783,10 @@
 		if (vfs_busy(mp, 0))
 			continue;
 		vput(dp);
-		VFS_UNLOCK_GIANT(vfslocked);
-		vfslocked = VFS_LOCK_GIANT(mp);
 		if (dp != ndp->ni_dvp)
 			vput(ndp->ni_dvp);
 		else
 			vrele(ndp->ni_dvp);
-		VFS_UNLOCK_GIANT(dvfslocked);
-		dvfslocked = 0;
 		vref(vp_crossmp);
 		ndp->ni_dvp = vp_crossmp;
 		error = VFS_ROOT(mp, compute_cn_lkflags(mp, cnp->cn_lkflags,
@@ -870,9 +847,6 @@
 			vput(ndp->ni_dvp);
 		else
 			vrele(ndp->ni_dvp);
-		VFS_UNLOCK_GIANT(dvfslocked);
-		dvfslocked = vfslocked;	/* dp becomes dvp in dirloop */
-		vfslocked = 0;
 		goto dirloop;
 	}
 	/*
@@ -901,8 +875,6 @@
 			vput(ndp->ni_dvp);
 		else
 			vrele(ndp->ni_dvp);
-		VFS_UNLOCK_GIANT(dvfslocked);
-		dvfslocked = 0;
 	} else if ((cnp->cn_flags & LOCKPARENT) == 0 && ndp->ni_dvp != dp) {
 		VOP_UNLOCK(ndp->ni_dvp, 0);
 		ni_dvp_unlocked = 1;
@@ -928,10 +900,6 @@
 			goto bad2;
 		}
 	}
-	if (vfslocked && dvfslocked)
-		VFS_UNLOCK_GIANT(dvfslocked);	/* Only need one */
-	if (vfslocked || dvfslocked)
-		ndp->ni_cnd.cn_flags |= GIANTHELD;
 	return (0);
 
 bad2:
@@ -944,9 +912,6 @@
 bad:
 	if (!dpunlocked)
 		vput(dp);
-	VFS_UNLOCK_GIANT(vfslocked);
-	VFS_UNLOCK_GIANT(dvfslocked);
-	ndp->ni_cnd.cn_flags &= ~GIANTHELD;
 	ndp->ni_vp = NULL;
 	return (error);
 }
@@ -1086,6 +1051,27 @@
 	return (error);
 }
 
+void
+NDINIT_ALL(struct nameidata *ndp, u_long op, u_long flags, enum uio_seg segflg,
+    const char *namep, int dirfd, struct vnode *startdir, cap_rights_t *rightsp,
+    struct thread *td)
+{
+
+	ndp->ni_cnd.cn_nameiop = op;
+	ndp->ni_cnd.cn_flags = flags;
+	ndp->ni_segflg = segflg;
+	ndp->ni_dirp = namep;
+	ndp->ni_dirfd = dirfd;
+	ndp->ni_startdir = startdir;
+	ndp->ni_strictrelative = 0;
+	if (rightsp != NULL)
+		ndp->ni_rightsneeded = *rightsp;
+	else
+		cap_rights_init(&ndp->ni_rightsneeded);
+	filecaps_init(&ndp->ni_filecaps);
+	ndp->ni_cnd.cn_thread = td;
+}
+
 /*
  * Free data allocated by namei(); see namei(9) for details.
  */
@@ -1142,7 +1128,7 @@
  * Determine if there is a suitable alternate filename under the specified
  * prefix for the specified path.  If the create flag is set, then the
  * alternate prefix will be used so long as the parent directory exists.
- * This is used by the various compatiblity ABIs so that Linux binaries prefer
+ * This is used by the various compatibility ABIs so that Linux binaries prefer
  * files under /compat/linux for example.  The chosen path (whether under
  * the prefix or under /) is returned in a kernel malloc'd buffer pointed
  * to by pathbuf.  The caller is responsible for free'ing the buffer from
@@ -1209,13 +1195,13 @@
 		for (cp = &ptr[len] - 1; *cp != '/'; cp--);
 		*cp = '\0';
 
-		NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, buf, td);
+		NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, buf, td);
 		error = namei(&nd);
 		*cp = '/';
 		if (error != 0)
 			goto keeporig;
 	} else {
-		NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, buf, td);
+		NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, buf, td);
 
 		error = namei(&nd);
 		if (error != 0)
@@ -1229,7 +1215,7 @@
 		 * root directory and never finding it, because "/" resolves
 		 * to the emulation root directory. This is expensive :-(
 		 */
-		NDINIT(&ndroot, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, prefix,
+		NDINIT(&ndroot, LOOKUP, FOLLOW, UIO_SYSSPACE, prefix,
 		    td);
 
 		/* We shouldn't ever get an error from this namei(). */
@@ -1240,13 +1226,11 @@
 
 			NDFREE(&ndroot, NDF_ONLY_PNBUF);
 			vrele(ndroot.ni_vp);
-			VFS_UNLOCK_GIANT(NDHASGIANT(&ndroot));
 		}
 	}
 
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vrele(nd.ni_vp);
-	VFS_UNLOCK_GIANT(NDHASGIANT(&nd));
 
 keeporig:
 	/* If there was an error, use the original path name. */

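In the namei() changes, capability-mode lookups set ni_strictrelative and then fail with ENOTCAPABLE on absolute paths or '..' components.  A simplified userland sketch of that policy (the kernel applies it component by component during lookup(), not as an up-front scan of the whole string):

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/*
 * Accept a path for a "strictly relative" lookup only if it is not
 * absolute and contains no ".." components.
 */
static bool
strictly_relative(const char *path)
{
	const char *p;
	size_t len;

	if (path[0] == '/')
		return (false);
	for (p = path; *p != '\0';) {
		len = strcspn(p, "/");
		if (len == 2 && p[0] == '.' && p[1] == '.')
			return (false);
		p += len;
		while (*p == '/')
			p++;
	}
	return (true);
}

int
main(void)
{
	const char *samples[] = { "a/b/c", "./x", "../up", "/etc/passwd", "a/../b" };
	size_t i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("%-12s %s\n", samples[i],
		    strictly_relative(samples[i]) ? "ok" : "ENOTCAPABLE");
	return (0);
}
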
Modified: trunk/sys/kern/vfs_mount.c
===================================================================
--- trunk/sys/kern/vfs_mount.c	2018-05-25 21:07:58 UTC (rev 9951)
+++ trunk/sys/kern/vfs_mount.c	2018-05-26 14:24:52 UTC (rev 9952)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1999-2004 Poul-Henning Kamp
  * Copyright (c) 1999 Michael Smith
@@ -35,7 +36,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/vfs_mount.c 332754 2018-04-19 05:52:47Z avg $");
 
 #include <sys/param.h>
 #include <sys/conf.h>
@@ -78,6 +79,10 @@
 SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0,
     "Unprivileged users may mount and unmount file systems");
 
+static int	default_autoro = false;
+SYSCTL_INT(_vfs, OID_AUTO, default_autoro, CTLFLAG_RW, &default_autoro, 0,
+    "Retry failed r/w mount as r/o if no explicit ro/rw option is specified");
+
 MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount structure");
 static uma_zone_t mount_zone;
 
@@ -232,7 +237,7 @@
 /*
  * If a mount option is specified several times,
  * (with or without the "no" prefix) only keep
- * the last occurence of it.
+ * the last occurrence of it.
  */
 static void
 vfs_sanitizeopts(struct vfsoptlist *opts)
@@ -463,9 +468,9 @@
 	mp->mnt_activevnodelistsize = 0;
 	mp->mnt_ref = 0;
 	(void) vfs_busy(mp, MBF_NOWAIT);
+	atomic_add_acq_int(&vfsp->vfc_refcount, 1);
 	mp->mnt_op = vfsp->vfc_vfsops;
 	mp->mnt_vfc = vfsp;
-	vfsp->vfc_refcount++;	/* XXX Unlocked */
 	mp->mnt_stat.f_type = vfsp->vfc_typenum;
 	mp->mnt_gen++;
 	strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
@@ -505,7 +510,7 @@
 		panic("vfs_mount_destroy: nonzero writeopcount");
 	if (mp->mnt_secondary_writes != 0)
 		panic("vfs_mount_destroy: nonzero secondary_writes");
-	mp->mnt_vfc->vfc_refcount--;
+	atomic_subtract_rel_int(&mp->mnt_vfc->vfc_refcount, 1);
 	if (!TAILQ_EMPTY(&mp->mnt_nvnodelist)) {
 		struct vnode *vp;
 
@@ -521,6 +526,8 @@
 	if (mp->mnt_lockref != 0)
 		panic("vfs_mount_destroy: nonzero lock refcount");
 	MNT_IUNLOCK(mp);
+	if (mp->mnt_vnodecovered != NULL)
+		vrele(mp->mnt_vnodecovered);
 #ifdef MAC
 	mac_mount_destroy(mp);
 #endif
@@ -530,6 +537,31 @@
 	uma_zfree(mount_zone, mp);
 }
 
+static bool
+vfs_should_downgrade_to_ro_mount(uint64_t fsflags, int error)
+{
+	/* This is an upgrade of an existing mount. */
+	if ((fsflags & MNT_UPDATE) != 0)
+		return (false);
+	/* This is already an R/O mount. */
+	if ((fsflags & MNT_RDONLY) != 0)
+		return (false);
+
+	switch (error) {
+	case ENODEV:	/* generic, geom, ... */
+	case EACCES:	/* cam/scsi, ... */
+	case EROFS:	/* md, mmcsd, ... */
+		/*
+		 * These errors can be returned by the storage layer to signal
+		 * that the media is read-only.  No harm in the R/O mount
+		 * attempt if the error was returned for some other reason.
+		 */
+		return (true);
+	default:
+		return (false);
+	}
+}
+
 int
 vfs_donmount(struct thread *td, uint64_t fsflags, struct uio *fsoptions)
 {
@@ -537,10 +569,12 @@
 	struct vfsopt *opt, *tmp_opt;
 	char *fstype, *fspath, *errmsg;
 	int error, fstypelen, fspathlen, errmsg_len, errmsg_pos;
+	bool autoro;
 
 	errmsg = fspath = NULL;
 	errmsg_len = fspathlen = 0;
 	errmsg_pos = -1;
+	autoro = default_autoro;
 
 	error = vfs_buildopts(fsoptions, &optlist);
 	if (error)
@@ -632,17 +666,28 @@
 			free(opt->name, M_MOUNT);
 			opt->name = strdup("nonosymfollow", M_MOUNT);
 		}
-		else if (strcmp(opt->name, "noro") == 0)
+		else if (strcmp(opt->name, "noro") == 0) {
 			fsflags &= ~MNT_RDONLY;
-		else if (strcmp(opt->name, "rw") == 0)
+			autoro = false;
+		}
+		else if (strcmp(opt->name, "rw") == 0) {
 			fsflags &= ~MNT_RDONLY;
-		else if (strcmp(opt->name, "ro") == 0)
+			autoro = false;
+		}
+		else if (strcmp(opt->name, "ro") == 0) {
 			fsflags |= MNT_RDONLY;
+			autoro = false;
+		}
 		else if (strcmp(opt->name, "rdonly") == 0) {
 			free(opt->name, M_MOUNT);
 			opt->name = strdup("ro", M_MOUNT);
 			fsflags |= MNT_RDONLY;
+			autoro = false;
 		}
+		else if (strcmp(opt->name, "autoro") == 0) {
+			vfs_freeopt(optlist, opt);
+			autoro = true;
+		}
 		else if (strcmp(opt->name, "suiddir") == 0)
 			fsflags |= MNT_SUIDDIR;
 		else if (strcmp(opt->name, "sync") == 0)
@@ -649,6 +694,10 @@
 			fsflags |= MNT_SYNCHRONOUS;
 		else if (strcmp(opt->name, "union") == 0)
 			fsflags |= MNT_UNION;
+		else if (strcmp(opt->name, "automounted") == 0) {
+			fsflags |= MNT_AUTOMOUNTED;
+			vfs_freeopt(optlist, opt);
+		}
 	}
 
 	/*
@@ -656,12 +705,25 @@
 	 * variables will fit in our mp buffers, including the
 	 * terminating NUL.
 	 */
-	if (fstypelen >= MFSNAMELEN - 1 || fspathlen >= MNAMELEN - 1) {
+	if (fstypelen > MFSNAMELEN || fspathlen > MNAMELEN) {
 		error = ENAMETOOLONG;
 		goto bail;
 	}
 
 	error = vfs_domount(td, fstype, fspath, fsflags, &optlist);
+
+	/*
+	 * See if we can mount in the read-only mode if the error code suggests
+	 * that it could be possible and the mount options allow for that.
+	 * Never try it if "[no]{ro|rw}" has been explicitly requested and not
+	 * overridden by "autoro".
+	 */
+	if (autoro && vfs_should_downgrade_to_ro_mount(fsflags, error)) {
+		printf("%s: R/W mount failed, possibly R/O media,"
+		    " trying R/O mount\n", __func__);
+		fsflags |= MNT_RDONLY;
+		error = vfs_domount(td, fstype, fspath, fsflags, &optlist);
+	}
 bail:
 	/* copyout the errmsg */
 	if (errmsg_pos != -1 && ((2 * errmsg_pos + 1) < fsoptions->uio_iovcnt)
@@ -736,19 +798,14 @@
 	}
 
 	AUDIT_ARG_TEXT(fstype);
-	mtx_lock(&Giant);
 	vfsp = vfs_byname_kld(fstype, td, &error);
 	free(fstype, M_TEMP);
-	if (vfsp == NULL) {
-		mtx_unlock(&Giant);
+	if (vfsp == NULL)
 		return (ENOENT);
-	}
-	if (vfsp->vfc_vfsops->vfs_cmount == NULL) {
-		mtx_unlock(&Giant);
+	if (vfsp->vfc_vfsops->vfs_cmount == NULL)
 		return (EOPNOTSUPP);
-	}
 
-	ma = mount_argsu(ma, "fstype", uap->type, MNAMELEN);
+	ma = mount_argsu(ma, "fstype", uap->type, MFSNAMELEN);
 	ma = mount_argsu(ma, "fspath", uap->path, MNAMELEN);
 	ma = mount_argb(ma, flags & MNT_RDONLY, "noro");
 	ma = mount_argb(ma, !(flags & MNT_NOSUID), "nosuid");
@@ -755,7 +812,6 @@
 	ma = mount_argb(ma, !(flags & MNT_NOEXEC), "noexec");
 
 	error = vfsp->vfc_vfsops->vfs_cmount(ma, uap->data, flags);
-	mtx_unlock(&Giant);
 	return (error);
 }
 
@@ -777,7 +833,6 @@
 	struct vnode *newdp;
 	int error;
 
-	mtx_assert(&Giant, MA_OWNED);
 	ASSERT_VOP_ELOCKED(vp, __func__);
 	KASSERT((fsflags & MNT_UPDATE) == 0, ("MNT_UPDATE shouldn't be here"));
 
@@ -821,6 +876,7 @@
 	error = VFS_MOUNT(mp);
 	if (error != 0) {
 		vfs_unbusy(mp);
+		mp->mnt_vnodecovered = NULL;
 		vfs_mount_destroy(mp);
 		VI_LOCK(vp);
 		vp->v_iflag &= ~VI_MOUNT;
@@ -861,8 +917,9 @@
 	vfs_event_signal(NULL, VQ_MOUNT, 0);
 	if (VFS_ROOT(mp, LK_EXCLUSIVE, &newdp))
 		panic("mount: lost mount");
+	VOP_UNLOCK(vp, 0);
+	EVENTHANDLER_INVOKE(vfs_mounted, mp, newdp, td);
 	VOP_UNLOCK(newdp, 0);
-	VOP_UNLOCK(vp, 0);
 	mountcheckdirs(vp, newdp);
 	vrele(newdp);
 	if ((mp->mnt_flag & MNT_RDONLY) == 0)
@@ -888,15 +945,20 @@
 	int error, export_error;
 	uint64_t flag;
 
-	mtx_assert(&Giant, MA_OWNED);
 	ASSERT_VOP_ELOCKED(vp, __func__);
 	KASSERT((fsflags & MNT_UPDATE) != 0, ("MNT_UPDATE should be here"));
+	mp = vp->v_mount;
 
 	if ((vp->v_vflag & VV_ROOT) == 0) {
+		if (vfs_copyopt(*optlist, "export", &export, sizeof(export))
+		    == 0)
+			error = EXDEV;
+		else
+			error = EINVAL;
 		vput(vp);
-		return (EINVAL);
+		return (error);
 	}
-	mp = vp->v_mount;
+
 	/*
 	 * We only allow the filesystem to be reloaded if it
 	 * is currently mounted read-only.
@@ -931,6 +993,11 @@
 	VOP_UNLOCK(vp, 0);
 
 	MNT_ILOCK(mp);
+	if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) {
+		MNT_IUNLOCK(mp);
+		error = EBUSY;
+		goto end;
+	}
 	mp->mnt_flag &= ~MNT_UPDATEMASK;
 	mp->mnt_flag |= fsflags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE |
 	    MNT_SNAPSHOT | MNT_ROOTFS | MNT_UPDATEMASK | MNT_RDONLY);
@@ -1085,13 +1152,11 @@
 	/*
 	 * Get vnode to be covered or mount point's vnode in case of MNT_UPDATE.
 	 */
-	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
+	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
 	    UIO_SYSSPACE, fspath, td);
 	error = namei(&nd);
 	if (error != 0)
 		return (error);
-	if (!NDHASGIANT(&nd))
-		mtx_lock(&Giant);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp = nd.ni_vp;
 	if ((fsflags & MNT_UPDATE) == 0) {
@@ -1106,11 +1171,7 @@
 		free(pathbuf, M_TEMP);
 	} else
 		error = vfs_domount_update(td, vp, fsflags, optlist);
-	mtx_unlock(&Giant);
 
-	ASSERT_VI_UNLOCKED(vp, __func__);
-	ASSERT_VOP_UNLOCKED(vp, __func__);
-
 	return (error);
 }
 
@@ -1128,17 +1189,12 @@
 #endif
 /* ARGSUSED */
 int
-sys_unmount(td, uap)
-	struct thread *td;
-	register struct unmount_args /* {
-		char *path;
-		int flags;
-	} */ *uap;
+sys_unmount(struct thread *td, struct unmount_args *uap)
 {
 	struct nameidata nd;
 	struct mount *mp;
 	char *pathbuf;
-	int error, id0, id1, vfslocked;
+	int error, id0, id1;
 
 	AUDIT_ARG_VALUE(uap->flags);
 	if (jailed(td->td_ucred) || usermount == 0) {
@@ -1153,12 +1209,10 @@
 		free(pathbuf, M_TEMP);
 		return (error);
 	}
-	mtx_lock(&Giant);
 	if (uap->flags & MNT_BYFSID) {
 		AUDIT_ARG_TEXT(pathbuf);
 		/* Decode the filesystem ID. */
 		if (sscanf(pathbuf, "FSID:%d:%d", &id0, &id1) != 2) {
-			mtx_unlock(&Giant);
 			free(pathbuf, M_TEMP);
 			return (EINVAL);
 		}
@@ -1166,8 +1220,10 @@
 		mtx_lock(&mountlist_mtx);
 		TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) {
 			if (mp->mnt_stat.f_fsid.val[0] == id0 &&
-			    mp->mnt_stat.f_fsid.val[1] == id1)
+			    mp->mnt_stat.f_fsid.val[1] == id1) {
+				vfs_ref(mp);
 				break;
+			}
 		}
 		mtx_unlock(&mountlist_mtx);
 	} else {
@@ -1174,22 +1230,21 @@
 		/*
 		 * Try to find global path for path argument.
 		 */
-		NDINIT(&nd, LOOKUP,
-		    FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
+		NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
 		    UIO_SYSSPACE, pathbuf, td);
 		if (namei(&nd) == 0) {
-			vfslocked = NDHASGIANT(&nd);
 			NDFREE(&nd, NDF_ONLY_PNBUF);
 			error = vn_path_to_global_path(td, nd.ni_vp, pathbuf,
 			    MNAMELEN);
 			if (error == 0 || error == ENODEV)
 				vput(nd.ni_vp);
-			VFS_UNLOCK_GIANT(vfslocked);
 		}
 		mtx_lock(&mountlist_mtx);
 		TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) {
-			if (strcmp(mp->mnt_stat.f_mntonname, pathbuf) == 0)
+			if (strcmp(mp->mnt_stat.f_mntonname, pathbuf) == 0) {
+				vfs_ref(mp);
 				break;
+			}
 		}
 		mtx_unlock(&mountlist_mtx);
 	}
@@ -1201,7 +1256,6 @@
 		 * now, so in the !MNT_BYFSID case return the more likely
 		 * EINVAL for compatibility.
 		 */
-		mtx_unlock(&Giant);
 		return ((uap->flags & MNT_BYFSID) ? ENOENT : EINVAL);
 	}
 
@@ -1209,11 +1263,10 @@
 	 * Don't allow unmounting the root filesystem.
 	 */
 	if (mp->mnt_flag & MNT_ROOTFS) {
-		mtx_unlock(&Giant);
+		vfs_rel(mp);
 		return (EINVAL);
 	}
 	error = dounmount(mp, uap->flags, td);
-	mtx_unlock(&Giant);
 	return (error);
 }
 
@@ -1221,10 +1274,7 @@
  * Do the actual filesystem unmount.
  */
 int
-dounmount(mp, flags, td)
-	struct mount *mp;
-	int flags;
-	struct thread *td;
+dounmount(struct mount *mp, int flags, struct thread *td)
 {
 	struct vnode *coveredvp, *fsrootvp;
 	int error;
@@ -1231,14 +1281,11 @@
 	uint64_t async_flag;
 	int mnt_gen_r;
 
-	mtx_assert(&Giant, MA_OWNED);
-
 	if ((coveredvp = mp->mnt_vnodecovered) != NULL) {
 		mnt_gen_r = mp->mnt_gen;
 		VI_LOCK(coveredvp);
 		vholdl(coveredvp);
 		vn_lock(coveredvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY);
-		vdrop(coveredvp);
 		/*
 		 * Check for mp being unmounted while waiting for the
 		 * covered vnode lock.
@@ -1246,34 +1293,51 @@
 		if (coveredvp->v_mountedhere != mp ||
 		    coveredvp->v_mountedhere->mnt_gen != mnt_gen_r) {
 			VOP_UNLOCK(coveredvp, 0);
+			vdrop(coveredvp);
+			vfs_rel(mp);
 			return (EBUSY);
 		}
 	}
+
 	/*
 	 * Only privileged root, or (if MNT_USER is set) the user that did the
 	 * original mount is permitted to unmount this filesystem.
 	 */
 	error = vfs_suser(mp, td);
-	if (error) {
-		if (coveredvp)
+	if (error != 0) {
+		if (coveredvp != NULL) {
 			VOP_UNLOCK(coveredvp, 0);
+			vdrop(coveredvp);
+		}
+		vfs_rel(mp);
 		return (error);
 	}
 
-	vn_start_write(NULL, &mp, V_WAIT);
+	vn_start_write(NULL, &mp, V_WAIT | V_MNTREF);
 	MNT_ILOCK(mp);
 	if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 ||
+	    (mp->mnt_flag & MNT_UPDATE) != 0 ||
 	    !TAILQ_EMPTY(&mp->mnt_uppers)) {
 		MNT_IUNLOCK(mp);
-		if (coveredvp)
+		if (coveredvp != NULL) {
 			VOP_UNLOCK(coveredvp, 0);
+			vdrop(coveredvp);
+		}
 		vn_finished_write(mp);
 		return (EBUSY);
 	}
 	mp->mnt_kern_flag |= MNTK_UNMOUNT | MNTK_NOINSMNTQ;
 	/* Allow filesystems to detect that a forced unmount is in progress. */
-	if (flags & MNT_FORCE)
+	if (flags & MNT_FORCE) {
 		mp->mnt_kern_flag |= MNTK_UNMOUNTF;
+		MNT_IUNLOCK(mp);
+		/*
+		 * Must be done after setting MNTK_UNMOUNTF and before
+		 * waiting for mnt_lockref to become 0.
+		 */
+		VFS_PURGE(mp);
+		MNT_ILOCK(mp);
+	}
 	error = 0;
 	if (mp->mnt_lockref) {
 		mp->mnt_kern_flag |= MNTK_DRAINING;
@@ -1291,6 +1355,16 @@
 	if (mp->mnt_flag & MNT_EXPUBLIC)
 		vfs_setpublicfs(NULL, NULL, NULL);
 
+	/*
+	 * From now on, we can claim that the use reference on the
+	 * coveredvp is ours, and the ref can be released only by a
+	 * successful unmount by us, or left for a later unmount
+	 * attempt.  The previously acquired hold reference is no
+	 * longer needed to protect the vnode from reuse.
+	 */
+	if (coveredvp != NULL)
+		vdrop(coveredvp);
+
 	vfs_msync(mp, MNT_WAIT);
 	MNT_ILOCK(mp);
 	async_flag = mp->mnt_flag & MNT_ASYNC;
@@ -1306,7 +1380,8 @@
 	 */
 	if ((flags & MNT_FORCE) &&
 	    VFS_ROOT(mp, LK_EXCLUSIVE, &fsrootvp) == 0) {
-		if (mp->mnt_vnodecovered != NULL)
+		if (mp->mnt_vnodecovered != NULL &&
+		    (mp->mnt_flag & MNT_IGNORE) == 0)
 			mountcheckdirs(fsrootvp, mp->mnt_vnodecovered);
 		if (fsrootvp == rootvnode) {
 			vrele(rootvnode);
@@ -1314,8 +1389,8 @@
 		}
 		vput(fsrootvp);
 	}
-	if (((mp->mnt_flag & MNT_RDONLY) ||
-	     (error = VFS_SYNC(mp, MNT_WAIT)) == 0) || (flags & MNT_FORCE) != 0)
+	if ((mp->mnt_flag & MNT_RDONLY) != 0 || (flags & MNT_FORCE) != 0 ||
+	    (error = VFS_SYNC(mp, MNT_WAIT)) == 0)
 		error = VFS_UNMOUNT(mp, flags);
 	vn_finished_write(mp);
 	/*
@@ -1327,7 +1402,8 @@
 	if (error && error != ENXIO) {
 		if ((flags & MNT_FORCE) &&
 		    VFS_ROOT(mp, LK_EXCLUSIVE, &fsrootvp) == 0) {
-			if (mp->mnt_vnodecovered != NULL)
+			if (mp->mnt_vnodecovered != NULL &&
+			    (mp->mnt_flag & MNT_IGNORE) == 0)
 				mountcheckdirs(mp->mnt_vnodecovered, fsrootvp);
 			if (rootvnode == NULL) {
 				rootvnode = fsrootvp;
@@ -1359,11 +1435,14 @@
 	mtx_lock(&mountlist_mtx);
 	TAILQ_REMOVE(&mountlist, mp, mnt_list);
 	mtx_unlock(&mountlist_mtx);
+	EVENTHANDLER_INVOKE(vfs_unmounted, mp, td);
 	if (coveredvp != NULL) {
 		coveredvp->v_mountedhere = NULL;
-		vput(coveredvp);
+		VOP_UNLOCK(coveredvp, 0);
 	}
 	vfs_event_signal(NULL, VQ_UNMOUNT, 0);
+	if (mp == rootdevmp)
+		rootdevmp = NULL;
 	vfs_mount_destroy(mp);
 	return (0);
 }
@@ -1511,6 +1590,48 @@
 	return (-1);
 }
 
+int
+vfs_getopt_size(struct vfsoptlist *opts, const char *name, off_t *value)
+{
+	char *opt_value, *vtp;
+	quad_t iv;
+	int error, opt_len;
+
+	error = vfs_getopt(opts, name, (void **)&opt_value, &opt_len);
+	if (error != 0)
+		return (error);
+	if (opt_len == 0 || opt_value == NULL)
+		return (EINVAL);
+	if (opt_value[0] == '\0' || opt_value[opt_len - 1] != '\0')
+		return (EINVAL);
+	iv = strtoq(opt_value, &vtp, 0);
+	if (vtp == opt_value || (vtp[0] != '\0' && vtp[1] != '\0'))
+		return (EINVAL);
+	if (iv < 0)
+		return (EINVAL);
+	switch (vtp[0]) {
+	case 't':
+	case 'T':
+		iv *= 1024;
+	case 'g':
+	case 'G':
+		iv *= 1024;
+	case 'm':
+	case 'M':
+		iv *= 1024;
+	case 'k':
+	case 'K':
+		iv *= 1024;
+	case '\0':
+		break;
+	default:
+		return (EINVAL);
+	}
+	*value = iv;
+
+	return (0);
+}
+
 char *
 vfs_getopts(struct vfsoptlist *opts, const char *name, int *error)
 {
@@ -1668,103 +1789,6 @@
 	return (ENOENT);
 }
 
-/*
- * These are helper functions for filesystems to traverse all
- * their vnodes.  See MNT_VNODE_FOREACH() in sys/mount.h.
- *
- * This interface has been deprecated in favor of MNT_VNODE_FOREACH_ALL.
- */
-
-MALLOC_DECLARE(M_VNODE_MARKER);
-
-struct vnode *
-__mnt_vnode_next(struct vnode **mvp, struct mount *mp)
-{
-	struct vnode *vp;
-
-	mtx_assert(MNT_MTX(mp), MA_OWNED);
-
-	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
-	if (should_yield()) {
-		MNT_IUNLOCK(mp);
-		kern_yield(PRI_UNCHANGED);
-		MNT_ILOCK(mp);
-	}
-	vp = TAILQ_NEXT(*mvp, v_nmntvnodes);
-	while (vp != NULL && vp->v_type == VMARKER)
-		vp = TAILQ_NEXT(vp, v_nmntvnodes);
-
-	/* Check if we are done */
-	if (vp == NULL) {
-		__mnt_vnode_markerfree(mvp, mp);
-		return (NULL);
-	}
-	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
-	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
-	return (vp);
-}
-
-struct vnode *
-__mnt_vnode_first(struct vnode **mvp, struct mount *mp)
-{
-	struct vnode *vp;
-
-	mtx_assert(MNT_MTX(mp), MA_OWNED);
-
-	vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
-	while (vp != NULL && vp->v_type == VMARKER)
-		vp = TAILQ_NEXT(vp, v_nmntvnodes);
-
-	/* Check if we are done */
-	if (vp == NULL) {
-		*mvp = NULL;
-		return (NULL);
-	}
-	MNT_REF(mp);
-	MNT_IUNLOCK(mp);
-	*mvp = (struct vnode *) malloc(sizeof(struct vnode),
-				       M_VNODE_MARKER,
-				       M_WAITOK | M_ZERO);
-	MNT_ILOCK(mp);
-	(*mvp)->v_type = VMARKER;
-
-	vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
-	while (vp != NULL && vp->v_type == VMARKER)
-		vp = TAILQ_NEXT(vp, v_nmntvnodes);
-
-	/* Check if we are done */
-	if (vp == NULL) {
-		MNT_IUNLOCK(mp);
-		free(*mvp, M_VNODE_MARKER);
-		MNT_ILOCK(mp);
-		*mvp = NULL;
-		MNT_REL(mp);
-		return (NULL);
-	}
-	(*mvp)->v_mount = mp;
-	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
-	return (vp);
-}
-
-
-void
-__mnt_vnode_markerfree(struct vnode **mvp, struct mount *mp)
-{
-
-	if (*mvp == NULL)
-		return;
-
-	mtx_assert(MNT_MTX(mp), MA_OWNED);
-
-	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
-	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
-	MNT_IUNLOCK(mp);
-	free(*mvp, M_VNODE_MARKER);
-	MNT_ILOCK(mp);
-	*mvp = NULL;
-	MNT_REL(mp);
-}
-
 int
 __vfs_statfs(struct mount *mp, struct statfs *sbp)
 {

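The new vfs_getopt_size() above parses a number with an optional k/m/g/t suffix, each case falling through to multiply by another 1024.  A userland sketch of the same parsing, with the fall-through made explicit (sample inputs are illustrative):

#include <errno.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>

/* Parse "number with optional k/m/g/t suffix" into a byte count. */
static int
parse_size(const char *s, int64_t *out)
{
	char *end;
	int64_t v;

	errno = 0;
	v = strtoll(s, &end, 0);
	if (end == s || v < 0 || errno != 0)
		return (EINVAL);
	if (end[0] != '\0' && end[1] != '\0')
		return (EINVAL);	/* at most one trailing character */
	switch (end[0]) {
	case 't': case 'T':
		v *= 1024;
		/* FALLTHROUGH */
	case 'g': case 'G':
		v *= 1024;
		/* FALLTHROUGH */
	case 'm': case 'M':
		v *= 1024;
		/* FALLTHROUGH */
	case 'k': case 'K':
		v *= 1024;
		/* FALLTHROUGH */
	case '\0':
		break;
	default:
		return (EINVAL);
	}
	*out = v;
	return (0);
}

int
main(void)
{
	const char *samples[] = { "512", "64k", "2M", "1g", "bogus" };
	int64_t v;
	size_t i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		if (parse_size(samples[i], &v) == 0)
			printf("%-6s -> %" PRId64 " bytes\n", samples[i], v);
		else
			printf("%-6s -> invalid\n", samples[i]);
	}
	return (0);
}
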
Modified: trunk/sys/kern/vfs_mountroot.c
===================================================================
--- trunk/sys/kern/vfs_mountroot.c	2018-05-25 21:07:58 UTC (rev 9951)
+++ trunk/sys/kern/vfs_mountroot.c	2018-05-26 14:24:52 UTC (rev 9952)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 2010 Marcel Moolenaar
  * Copyright (c) 1999-2004 Poul-Henning Kamp
@@ -38,7 +39,7 @@
 #include "opt_rootdevname.h"
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/vfs_mountroot.c 331276 2018-03-20 22:57:14Z ian $");
 
 #include <sys/param.h>
 #include <sys/conf.h>
@@ -79,7 +80,7 @@
  *
  * If the environment variable vfs.root.mountfrom is a space separated list,
  * each list element is tried in turn and the root filesystem will be mounted
- * from the first one that suceeds.
+ * from the first one that succeeds.
  *
  * The environment variable vfs.root.mountfrom.options is a comma delimited
  * set of string mount options.  These mount options must be parseable
@@ -95,8 +96,16 @@
  */
 struct vnode *rootvnode;
 
+/*
+ * Mount of the system's /dev.
+ */
+struct mount *rootdevmp;
+
 char *rootdevnames[2] = {NULL, NULL};
 
+struct mtx root_holds_mtx;
+MTX_SYSINIT(root_holds, &root_holds_mtx, "root_holds", MTX_DEF);
+
 struct root_hold_token {
 	const char			*who;
 	LIST_ENTRY(root_hold_token)	list;
@@ -119,6 +128,7 @@
 
 /* By default wait up to 3 seconds for devices to appear. */
 static int root_mount_timeout = 3;
+TUNABLE_INT("vfs.mountroot.timeout", &root_mount_timeout);
 
 struct root_hold_token *
 root_mount_hold(const char *identifier)
@@ -130,9 +140,9 @@
 
 	h = malloc(sizeof *h, M_DEVBUF, M_ZERO | M_WAITOK);
 	h->who = identifier;
-	mtx_lock(&mountlist_mtx);
+	mtx_lock(&root_holds_mtx);
 	LIST_INSERT_HEAD(&root_holds, h, list);
-	mtx_unlock(&mountlist_mtx);
+	mtx_unlock(&root_holds_mtx);
 	return (h);
 }
 
@@ -142,10 +152,10 @@
 
 	if (h == NULL)
 		return;
-	mtx_lock(&mountlist_mtx);
+	mtx_lock(&root_holds_mtx);
 	LIST_REMOVE(h, list);
 	wakeup(&root_holds);
-	mtx_unlock(&mountlist_mtx);
+	mtx_unlock(&root_holds_mtx);
 	free(h, M_DEVBUF);
 }
 
@@ -167,12 +177,12 @@
 	 */
 	KASSERT(curthread->td_proc->p_pid != 0,
 	    ("root_mount_wait: cannot be called from the swapper thread"));
-	mtx_lock(&mountlist_mtx);
+	mtx_lock(&root_holds_mtx);
 	while (!root_mount_complete) {
-		msleep(&root_mount_complete, &mountlist_mtx, PZERO, "rootwait",
+		msleep(&root_mount_complete, &root_holds_mtx, PZERO, "rootwait",
 		    hz);
 	}
-	mtx_unlock(&mountlist_mtx);
+	mtx_unlock(&root_holds_mtx);
 }
 
 static void
@@ -199,8 +209,6 @@
 	VREF(rootvnode);
 
 	FILEDESC_XUNLOCK(p->p_fd);
-
-	EVENTHANDLER_INVOKE(mountroot);
 }
 
 static int
@@ -213,27 +221,39 @@
 
 	*mpp = NULL;
 
-	vfsp = vfs_byname("devfs");
-	KASSERT(vfsp != NULL, ("Could not find devfs by name"));
-	if (vfsp == NULL)
-		return (ENOENT);
+	if (rootdevmp != NULL) {
+		/*
+		 * Already have /dev; this happens during rerooting.
+		 */
+		error = vfs_busy(rootdevmp, 0);
+		if (error != 0)
+			return (error);
+		*mpp = rootdevmp;
+	} else {
+		vfsp = vfs_byname("devfs");
+		KASSERT(vfsp != NULL, ("Could not find devfs by name"));
+		if (vfsp == NULL)
+			return (ENOENT);
 
-	mp = vfs_mount_alloc(NULLVP, vfsp, "/dev", td->td_ucred);
+		mp = vfs_mount_alloc(NULLVP, vfsp, "/dev", td->td_ucred);
 
-	error = VFS_MOUNT(mp);
-	KASSERT(error == 0, ("VFS_MOUNT(devfs) failed %d", error));
-	if (error)
-		return (error);
+		error = VFS_MOUNT(mp);
+		KASSERT(error == 0, ("VFS_MOUNT(devfs) failed %d", error));
+		if (error)
+			return (error);
 
-	opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK);
-	TAILQ_INIT(opts);
-	mp->mnt_opt = opts;
+		opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK);
+		TAILQ_INIT(opts);
+		mp->mnt_opt = opts;
 
-	mtx_lock(&mountlist_mtx);
-	TAILQ_INSERT_HEAD(&mountlist, mp, mnt_list);
-	mtx_unlock(&mountlist_mtx);
+		mtx_lock(&mountlist_mtx);
+		TAILQ_INSERT_HEAD(&mountlist, mp, mnt_list);
+		mtx_unlock(&mountlist_mtx);
 
-	*mpp = mp;
+		*mpp = mp;
+		rootdevmp = mp;
+	}
+
 	set_rootvnode();
 
 	error = kern_symlink(td, "/", "dev", UIO_SYSSPACE);
@@ -243,7 +263,7 @@
 	return (error);
 }
 
-static int
+static void
 vfs_mountroot_shuffle(struct thread *td, struct mount *mpdevfs)
 {
 	struct nameidata nd;
@@ -353,8 +373,6 @@
 			printf("mountroot: unable to unlink /dev/dev "
 			    "(error %d)\n", error);
 	}
-
-	return (0);
 }
 
 /*
@@ -390,13 +408,6 @@
 	(*conf)++;
 }
 
-static __inline int
-parse_isspace(int c)
-{
-
-	return ((c == ' ' || c == '\t' || c == '\n') ? 1 : 0);
-}
-
 static int
 parse_skipto(char **conf, int mc)
 {
@@ -711,13 +722,13 @@
 	errmsg = malloc(ERRMSGL, M_TEMP, M_WAITOK | M_ZERO);
 
 	if (vfs_byname(fs) == NULL) {
-		strlcpy(errmsg, "unknown file system", sizeof(errmsg));
+		strlcpy(errmsg, "unknown file system", ERRMSGL);
 		error = ENOENT;
 		goto out;
 	}
 
-	if (strcmp(fs, "zfs") != 0 && dev[0] != '\0' &&
-	    !parse_mount_dev_present(dev)) {
+	if (strcmp(fs, "zfs") != 0 && strstr(fs, "nfs") == NULL && 
+	    dev[0] != '\0' && !parse_mount_dev_present(dev)) {
 		printf("mountroot: waiting for device %s ...\n", dev);
 		delay = hz / 10;
 		timeout = root_mount_timeout * hz;
@@ -731,15 +742,31 @@
 		}
 	}
 
-	ma = NULL;
-	ma = mount_arg(ma, "fstype", fs, -1);
-	ma = mount_arg(ma, "fspath", "/", -1);
-	ma = mount_arg(ma, "from", dev, -1);
-	ma = mount_arg(ma, "errmsg", errmsg, ERRMSGL);
-	ma = mount_arg(ma, "ro", NULL, 0);
-	ma = parse_mountroot_options(ma, opts);
-	error = kernel_mount(ma, MNT_ROOTFS);
+	delay = hz / 10;
+	timeout = root_mount_timeout * hz;
 
+	for (;;) {
+		ma = NULL;
+		ma = mount_arg(ma, "fstype", fs, -1);
+		ma = mount_arg(ma, "fspath", "/", -1);
+		ma = mount_arg(ma, "from", dev, -1);
+		ma = mount_arg(ma, "errmsg", errmsg, ERRMSGL);
+		ma = mount_arg(ma, "ro", NULL, 0);
+		ma = parse_mountroot_options(ma, opts);
+
+		error = kernel_mount(ma, MNT_ROOTFS);
+		if (error == 0 || timeout <= 0)
+			break;
+
+		if (root_mount_timeout * hz == timeout ||
+		    (bootverbose && timeout % hz == 0)) {
+			printf("Mounting from %s:%s failed with error %d; "
+			    "retrying for %d more second%s\n", fs, dev, error,
+			    timeout / hz, (timeout / hz > 1) ? "s" : "");
+		}
+		pause("rmretry", delay);
+		timeout -= delay;
+	}
  out:
 	if (error) {
 		printf("Mounting from %s:%s failed with error %d",
@@ -875,16 +902,14 @@
 	struct nameidata nd;
 	off_t ofs;
 	ssize_t resid;
-	int error, flags, len, vfslocked;
+	int error, flags, len;
 
-	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE,
-	    "/.mount.conf", td);
+	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, "/.mount.conf", td);
 	flags = FREAD;
 	error = vn_open(&nd, &flags, 0, NULL);
 	if (error)
 		return (error);
 
-	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	ofs = 0;
 	len = sizeof(buf) - 1;
@@ -903,7 +928,6 @@
 
 	VOP_UNLOCK(nd.ni_vp, 0);
 	vn_close(nd.ni_vp, FREAD, td->td_ucred, td);
-	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
@@ -919,9 +943,9 @@
 		DROP_GIANT();
 		g_waitidle();
 		PICKUP_GIANT();
-		mtx_lock(&mountlist_mtx);
+		mtx_lock(&root_holds_mtx);
 		if (LIST_EMPTY(&root_holds)) {
-			mtx_unlock(&mountlist_mtx);
+			mtx_unlock(&root_holds_mtx);
 			break;
 		}
 		if (ppsratecheck(&lastfail, &curfail, 1)) {
@@ -930,7 +954,7 @@
 				printf(" %s", h->who);
 			printf("\n");
 		}
-		msleep(&root_holds, &mountlist_mtx, PZERO | PDROP, "roothold",
+		msleep(&root_holds, &root_holds_mtx, PZERO | PDROP, "roothold",
 		    hz);
 	}
 }
@@ -956,12 +980,10 @@
 	while (!error) {
 		error = vfs_mountroot_parse(sb, mp);
 		if (!error) {
-			error = vfs_mountroot_shuffle(td, mp);
-			if (!error) {
-				sbuf_clear(sb);
-				error = vfs_mountroot_readconf(td, sb);
-				sbuf_finish(sb);
-			}
+			vfs_mountroot_shuffle(td, mp);
+			sbuf_clear(sb);
+			error = vfs_mountroot_readconf(td, sb);
+			sbuf_finish(sb);
 		}
 	}
 
@@ -990,10 +1012,12 @@
 	vref(prison0.pr_root);
 	mtx_unlock(&prison0.pr_mtx);
 
-	mtx_lock(&mountlist_mtx);
+	mtx_lock(&root_holds_mtx);
 	atomic_store_rel_int(&root_mount_complete, 1);
 	wakeup(&root_mount_complete);
-	mtx_unlock(&mountlist_mtx);
+	mtx_unlock(&root_holds_mtx);
+
+	EVENTHANDLER_INVOKE(mountroot);
 }
 
 static struct mntarg *

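The new parse_mount() loop above retries kernel_mount() with a short pause until it succeeds or the vfs.mountroot.timeout budget is spent.  A userland sketch of that retry-with-deadline shape, where try_mount() is a stand-in for the real mount attempt:

#include <stdio.h>
#include <unistd.h>

/* Placeholder for the real work (kernel_mount() in the code above). */
static int
try_mount(int attempt)
{

	return (attempt >= 3 ? 0 : -1);	/* pretend it succeeds on the 3rd try */
}

/* Try, report the remaining budget, pause, subtract the delay, try again. */
static int
mount_with_retry(int timeout_ms, int delay_ms)
{
	int attempt, error;

	for (attempt = 1;; attempt++) {
		error = try_mount(attempt);
		if (error == 0 || timeout_ms <= 0)
			break;
		printf("attempt %d failed, retrying for %d more ms\n",
		    attempt, timeout_ms);
		usleep((useconds_t)delay_ms * 1000);
		timeout_ms -= delay_ms;
	}
	return (error);
}

int
main(void)
{

	printf("result: %d\n", mount_with_retry(1000, 100));
	return (0);
}
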
Modified: trunk/sys/kern/vfs_subr.c
===================================================================
--- trunk/sys/kern/vfs_subr.c	2018-05-25 21:07:58 UTC (rev 9951)
+++ trunk/sys/kern/vfs_subr.c	2018-05-26 14:24:52 UTC (rev 9952)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
@@ -39,7 +40,7 @@
  */
 
 #include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/vfs_subr.c 328997 2018-02-07 22:50:10Z mckusick $");
 
 #include "opt_compat.h"
 #include "opt_ddb.h"
@@ -65,8 +66,10 @@
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
+#include <sys/pctrie.h>
 #include <sys/priv.h>
 #include <sys/reboot.h>
+#include <sys/rwlock.h>
 #include <sys/sched.h>
 #include <sys/sleepqueue.h>
 #include <sys/smp.h>
@@ -94,9 +97,6 @@
 #include <ddb/ddb.h>
 #endif
 
-#define	WI_MPSAFEQ	0
-#define	WI_GIANTQ	1
-
 static void	delmntque(struct vnode *vp);
 static int	flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
 		    int slpflag, int slptimeo);
@@ -123,6 +123,10 @@
 SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0,
     "Number of vnodes in existence");
 
+static u_long vnodes_created;
+SYSCTL_ULONG(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created,
+    0, "Number of vnodes created by getnewvnode");
+
 /*
  * Conversion tables for conversion from vnode types to inode formats
  * and back.
@@ -157,6 +161,10 @@
 SYSCTL_INT(_vfs, OID_AUTO, vlru_allow_cache_src, CTLFLAG_RW,
     &vlru_allow_cache_src, 0, "Allow vlru to reclaim source vnode");
 
+static u_long recycles_count;
+SYSCTL_ULONG(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count, 0,
+    "Number of vnodes recycled to avoid exceeding kern.maxvnodes");
+
 /*
  * Various variables used for debugging the new implementation of
  * reassignbuf().
@@ -166,6 +174,11 @@
 SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0,
     "Number of calls to reassignbuf");
 
+static u_long free_owe_inact;
+SYSCTL_ULONG(_vfs, OID_AUTO, free_owe_inact, CTLFLAG_RD, &free_owe_inact, 0,
+    "Number of times free vnodes kept on active list due to VFS "
+    "owing inactivation");
+
 /*
  * Cache for the mount type id assigned to NFS.  This is used for
  * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
@@ -186,6 +199,8 @@
 /* Publicly exported FS */
 struct nfs_public nfs_pub;
 
+static uma_zone_t buf_trie_zone;
+
 /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
 static uma_zone_t vnode_zone;
 static uma_zone_t vnodepoll_zone;
@@ -218,7 +233,7 @@
 static int syncer_delayno;
 static long syncer_mask;
 LIST_HEAD(synclist, bufobj);
-static struct synclist *syncer_workitem_pending[2];
+static struct synclist *syncer_workitem_pending;
 /*
  * The sync_mtx protects:
  *	bo->bo_synclist
@@ -266,8 +281,25 @@
  * XXX desiredvnodes is historical cruft and should not exist.
  */
 int desiredvnodes;
-SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
-    &desiredvnodes, 0, "Maximum number of vnodes");
+
+static int
+sysctl_update_desiredvnodes(SYSCTL_HANDLER_ARGS)
+{
+	int error, old_desiredvnodes;
+
+	old_desiredvnodes = desiredvnodes;
+	if ((error = sysctl_handle_int(oidp, arg1, arg2, req)) != 0)
+		return (error);
+	if (old_desiredvnodes != desiredvnodes) {
+		vfs_hash_changesize(desiredvnodes);
+		cache_changesize(desiredvnodes);
+	}
+	return (0);
+}
+
+SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes,
+    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, &desiredvnodes, 0,
+    sysctl_update_desiredvnodes, "I", "Maximum number of vnodes");
 SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
     &wantfreevnodes, 0, "Minimum number of vnodes (legacy)");
 static int vnlru_nowhere;
@@ -274,17 +306,27 @@
 SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
     &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
 
+/* Shift count for (uintptr_t)vp to initialize vp->v_hash. */
+static int vnsz2log;
+
 /*
- * Macros to control when a vnode is freed and recycled.  All require
- * the vnode interlock.
+ * Support for the bufobj clean & dirty pctrie.
  */
-#define VCANRECYCLE(vp) (((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
-#define VSHOULDFREE(vp) (!((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
-#define VSHOULDBUSY(vp) (((vp)->v_iflag & VI_FREE) && (vp)->v_holdcnt)
+static void *
+buf_trie_alloc(struct pctrie *ptree)
+{
 
-/* Shift count for (uintptr_t)vp to initialize vp->v_hash. */
-static int vnsz2log;
+	return uma_zalloc(buf_trie_zone, M_NOWAIT);
+}
 
+static void
+buf_trie_free(struct pctrie *ptree, void *node)
+{
+
+	uma_zfree(buf_trie_zone, node);
+}
+PCTRIE_DEFINE(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free);
+
 /*
  * Initialize the vnode management data structures.
  *
@@ -295,7 +337,67 @@
 #ifndef	MAXVNODES_MAX
 #define	MAXVNODES_MAX	(512 * (1024 * 1024 * 1024 / (int)PAGE_SIZE / 16))
 #endif
+
+/*
+ * Initialize a vnode as it first enters the zone.
+ */
+static int
+vnode_init(void *mem, int size, int flags)
+{
+	struct vnode *vp;
+	struct bufobj *bo;
+
+	vp = mem;
+	bzero(vp, size);
+	/*
+	 * Setup locks.
+	 */
+	vp->v_vnlock = &vp->v_lock;
+	mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
+	/*
+	 * By default, don't allow shared locks unless filesystems opt-in.
+	 */
+	lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT,
+	    LK_NOSHARE | LK_IS_VNODE);
+	/*
+	 * Initialize bufobj.
+	 */
+	bo = &vp->v_bufobj;
+	bo->__bo_vnode = vp;
+	rw_init(BO_LOCKPTR(bo), "bufobj interlock");
+	bo->bo_private = vp;
+	TAILQ_INIT(&bo->bo_clean.bv_hd);
+	TAILQ_INIT(&bo->bo_dirty.bv_hd);
+	/*
+	 * Initialize namecache.
+	 */
+	LIST_INIT(&vp->v_cache_src);
+	TAILQ_INIT(&vp->v_cache_dst);
+	/*
+	 * Initialize rangelocks.
+	 */
+	rangelock_init(&vp->v_rl);
+	return (0);
+}
+
+/*
+ * Free a vnode when it is cleared from the zone.
+ */
 static void
+vnode_fini(void *mem, int size)
+{
+	struct vnode *vp;
+	struct bufobj *bo;
+
+	vp = mem;
+	rangelock_destroy(&vp->v_rl);
+	lockdestroy(vp->v_vnlock);
+	mtx_destroy(&vp->v_interlock);
+	bo = &vp->v_bufobj;
+	rw_destroy(BO_LOCKPTR(bo));
+}
+
+static void
 vntblinit(void *dummy __unused)
 {
 	u_int i;
@@ -327,16 +429,23 @@
 	TAILQ_INIT(&vnode_free_list);
 	mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
 	vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
-	    NULL, NULL, UMA_ALIGN_PTR, 0);
+	    vnode_init, vnode_fini, UMA_ALIGN_PTR, 0);
 	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	/*
+	 * Preallocate enough nodes to support one node per buf so that
+	 * an insert cannot fail.  reassignbuf() callers cannot
+	 * tolerate insertion failure.
+	 */
+	buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(),
+	    NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR, 
+	    UMA_ZONE_NOFREE | UMA_ZONE_VM);
+	uma_prealloc(buf_trie_zone, nbuf);
+	/*
 	 * Initialize the filesystem syncer.
 	 */
-	syncer_workitem_pending[WI_MPSAFEQ] = hashinit(syncer_maxdelay, M_VNODE,
+	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
 	    &syncer_mask);
-	syncer_workitem_pending[WI_GIANTQ] = hashinit(syncer_maxdelay, M_VNODE,
-	    &syncer_mask);
 	syncer_maxdelay = syncer_mask + 1;
 	mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
 	cv_init(&sync_wakeup, "syncer");
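
The hunks above move per-vnode lock, bufobj, namecache and rangelock setup into
UMA init/fini callbacks, so that state survives while a vnode sits in the zone
and does not have to be rebuilt on every allocation, and they preallocate the
buf trie zone so reassignbuf() inserts cannot fail.  A minimal userspace sketch
of the recycle-without-reinitializing idea (illustrative only, not part of this
commit; all names are invented and the free list is deliberately not
thread-safe):

/*
 * Illustrative sketch: expensive state (the mutex) is created when backing
 * memory is first obtained and stays valid while the object sits on the
 * free list; only per-use state is reset on each allocation.
 */
#include <pthread.h>
#include <stdlib.h>

struct obj {
	pthread_mutex_t	lock;		/* survives recycling, like v_interlock */
	struct obj	*next_free;
	int		data;		/* per-use state, reset on every alloc */
};

static struct obj *free_list;

struct obj *
obj_alloc(void)
{
	struct obj *o;

	if ((o = free_list) != NULL) {
		free_list = o->next_free;	/* recycled: mutex already valid */
	} else {
		o = calloc(1, sizeof(*o));	/* "zone init" path */
		if (o == NULL)
			return (NULL);
		pthread_mutex_init(&o->lock, NULL);
	}
	o->data = 0;
	return (o);
}

void
obj_free(struct obj *o)
{

	o->next_free = free_list;		/* mutex stays initialized */
	free_list = o;
}
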
@@ -393,7 +502,7 @@
 	MNT_ILOCK(mp);
 	MNT_REF(mp);
 	/*
-	 * If mount point is currenly being unmounted, sleep until the
+	 * If mount point is currently being unmounted, sleep until the
 	 * mount point fate is decided.  If thread doing the unmounting fails,
 	 * it will clear MNTK_UNMOUNT flag before waking us up, indicating
 	 * that this mount point has survived the unmount attempt and vfs_busy
@@ -474,14 +583,42 @@
 /*
  * Lookup a mount point by filesystem identifier, busying it before
  * returning.
+ *
+ * To avoid congestion on mountlist_mtx, implement a simple direct-mapped
+ * cache for popular filesystem identifiers.  The cache is lockless, using
+ * the fact that struct mount structures are never freed.  In the worst case
+ * we may get a pointer to an unmounted or even a different filesystem, so
+ * we have to check what we got, and take the slow path if so.
  */
 struct mount *
 vfs_busyfs(fsid_t *fsid)
 {
+#define	FSID_CACHE_SIZE	256
+	typedef struct mount * volatile vmp_t;
+	static vmp_t cache[FSID_CACHE_SIZE];
 	struct mount *mp;
 	int error;
+	uint32_t hash;
 
 	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
+	hash = fsid->val[0] ^ fsid->val[1];
+	hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1);
+	mp = cache[hash];
+	if (mp == NULL ||
+	    mp->mnt_stat.f_fsid.val[0] != fsid->val[0] ||
+	    mp->mnt_stat.f_fsid.val[1] != fsid->val[1])
+		goto slow;
+	if (vfs_busy(mp, 0) != 0) {
+		cache[hash] = NULL;
+		goto slow;
+	}
+	if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
+	    mp->mnt_stat.f_fsid.val[1] == fsid->val[1])
+		return (mp);
+	else
+		vfs_unbusy(mp);
+
+slow:
 	mtx_lock(&mountlist_mtx);
 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
@@ -488,9 +625,11 @@
 		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
 			error = vfs_busy(mp, MBF_MNTLSTLOCK);
 			if (error) {
+				cache[hash] = NULL;
 				mtx_unlock(&mountlist_mtx);
 				return (NULL);
 			}
+			cache[hash] = mp;
 			return (mp);
 		}
 	}
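
The vfs_busyfs() change above adds a direct-mapped, lockless cache in front of
the mountlist scan: hash the fsid into one of 256 slots, peek at the slot with
no lock held, busy the candidate and re-validate it, and fall back to the
locked slow path on any mismatch.  The same lookup pattern as a standalone
sketch (illustrative only, not part of this commit; registry[], busy() and
unbusy() are invented stand-ins for the mount list and vfs_busy()/vfs_unbusy()):

/*
 * Illustrative sketch: direct-mapped pointer cache with re-validation.
 */
#include <stddef.h>
#include <stdint.h>

#define	CACHE_SIZE	256			/* must be a power of two */

struct item {
	uint32_t	key0, key1;		/* like fsid->val[0] / val[1] */
	int		busy;
};

static struct item registry[16];		/* slow-path source of truth */
static struct item * volatile cache[CACHE_SIZE];

static int
busy(struct item *it)
{

	it->busy++;				/* the real vfs_busy() can fail */
	return (0);
}

static void
unbusy(struct item *it)
{

	it->busy--;
}

static struct item *
slow_lookup(uint32_t k0, uint32_t k1)
{
	size_t i;

	for (i = 0; i < sizeof(registry) / sizeof(registry[0]); i++)
		if (registry[i].key0 == k0 && registry[i].key1 == k1 &&
		    busy(&registry[i]) == 0)
			return (&registry[i]);
	return (NULL);
}

struct item *
lookup(uint32_t k0, uint32_t k1)
{
	struct item *it;
	uint32_t hash;

	hash = k0 ^ k1;
	hash = (hash >> 16 ^ hash) & (CACHE_SIZE - 1);
	it = cache[hash];
	if (it != NULL && it->key0 == k0 && it->key1 == k1 && busy(it) == 0) {
		/* Re-check after busying: the slot may have been reused. */
		if (it->key0 == k0 && it->key1 == k1)
			return (it);
		unbusy(it);
	}
	it = slow_lookup(k0, k1);
	cache[hash] = it;			/* a racy store is fine for a cache */
	return (it);
}
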
@@ -584,7 +723,7 @@
  */
 enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
 
-static int timestamp_precision = TSP_SEC;
+static int timestamp_precision = TSP_USEC;
 SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
     &timestamp_precision, 0, "File timestamp precision (0: seconds, "
     "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to ms, "
@@ -659,7 +798,7 @@
  * the buffer cache may have references on the vnode, a directory
  * vnode may still have references due to the namei cache representing
  * underlying files, or the vnode may be in active use.   It is not
- * desireable to reuse such vnodes.  These conditions may cause the
+ * desirable to reuse such vnodes.  These conditions may cause the
  * number of vnodes to reach some minimum value regardless of what
  * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
  */
@@ -735,10 +874,12 @@
 		    (vp->v_object != NULL &&
 		    vp->v_object->resident_page_count > trigger)) {
 			VOP_UNLOCK(vp, LK_INTERLOCK);
+			vdrop(vp);
 			goto next_iter_mntunlocked;
 		}
 		KASSERT((vp->v_iflag & VI_DOOMED) == 0,
 		    ("VI_DOOMED unexpectedly detected in vlrureclaim()"));
+		atomic_add_long(&recycles_count, 1);
 		vgonel(vp);
 		VOP_UNLOCK(vp, 0);
 		vdropl(vp);
@@ -752,7 +893,7 @@
 			continue;
 		MNT_IUNLOCK(mp);
 yield:
-		kern_yield(PRI_UNCHANGED);
+		kern_yield(PRI_USER);
 relock_mnt:
 		MNT_ILOCK(mp);
 	}
@@ -768,7 +909,6 @@
 vnlru_free(int count)
 {
 	struct vnode *vp;
-	int vfslocked;
 
 	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
 	for (; count > 0; count--) {
@@ -793,16 +933,24 @@
 			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist);
 			continue;
 		}
-		VNASSERT(VCANRECYCLE(vp), vp,
-		    ("vp inconsistent on freelist"));
+		VNASSERT((vp->v_iflag & VI_FREE) != 0 && vp->v_holdcnt == 0,
+		    vp, ("vp inconsistent on freelist"));
+
+		/*
+		 * The clear of VI_FREE prevents activation of the
+		 * vnode.  There is no sense in putting the vnode on
+		 * the mount point active list, only to remove it
+		 * later during recycling.  Inline the relevant part
+		 * of vholdl(), to avoid triggering assertions or
+		 * activating.
+		 */
 		freevnodes--;
 		vp->v_iflag &= ~VI_FREE;
-		vholdl(vp);
+		vp->v_holdcnt++;
+
 		mtx_unlock(&vnode_free_list_mtx);
 		VI_UNLOCK(vp);
-		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 		vtryrecycle(vp);
-		VFS_UNLOCK_GIANT(vfslocked);
 		/*
 		 * If the recycled succeeded this vdrop will actually free
 		 * the vnode.  If not it will simply place it back on
@@ -824,7 +972,7 @@
 vnlru_proc(void)
 {
 	struct mount *mp, *nmp;
-	int done, vfslocked;
+	int done;
 	struct proc *p = vnlruproc;
 
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
@@ -850,9 +998,7 @@
 				nmp = TAILQ_NEXT(mp, mnt_list);
 				continue;
 			}
-			vfslocked = VFS_LOCK_GIANT(mp);
 			done += vlrureclaim(mp);
-			VFS_UNLOCK_GIANT(vfslocked);
 			mtx_lock(&mountlist_mtx);
 			nmp = TAILQ_NEXT(mp, mnt_list);
 			vfs_unbusy(mp);
@@ -869,7 +1015,7 @@
 			vnlru_nowhere++;
 			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
 		} else
-			kern_yield(PRI_UNCHANGED);
+			kern_yield(PRI_USER);
 	}
 }
 
@@ -934,8 +1080,10 @@
 		    __func__, vp);
 		return (EBUSY);
 	}
-	if ((vp->v_iflag & VI_DOOMED) == 0)
+	if ((vp->v_iflag & VI_DOOMED) == 0) {
+		atomic_add_long(&recycles_count, 1);
 		vgonel(vp);
+	}
 	VOP_UNLOCK(vp, LK_INTERLOCK);
 	vn_finished_write(vnmp);
 	return (0);
@@ -975,12 +1123,19 @@
 	struct thread *td;
 
 	td = curthread;
+	/* First try to be quick and racy. */
+	if (atomic_fetchadd_long(&numvnodes, count) + count <= desiredvnodes) {
+		td->td_vp_reserv += count;
+		return;
+	} else
+		atomic_subtract_long(&numvnodes, count);
+
 	mtx_lock(&vnode_free_list_mtx);
 	while (count > 0) {
 		if (getnewvnode_wait(0) == 0) {
 			count--;
 			td->td_vp_reserv++;
-			numvnodes++;
+			atomic_add_long(&numvnodes, 1);
 		}
 	}
 	mtx_unlock(&vnode_free_list_mtx);
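
getnewvnode_reserve() above now tries an optimistic atomic reservation first:
bump numvnodes with a fetch-add, keep the bump if it stays within
desiredvnodes, and otherwise undo it and fall back to the locked wait loop.
A small C11 sketch of the same pattern (illustrative only; the names and the
slow path are invented):

/*
 * Illustrative sketch: optimistic reservation with rollback.
 */
#include <stdatomic.h>

static atomic_long	used;
static long		limit = 1000;

static void
slow_reserve(long count)
{

	/* Stands in for the locked getnewvnode_wait() loop. */
	atomic_fetch_add(&used, count);
}

void
reserve(long count)
{

	/* First try to be quick and racy. */
	if (atomic_fetch_add(&used, count) + count <= limit)
		return;				/* reservation granted */
	atomic_fetch_sub(&used, count);		/* undo, then take the slow path */
	slow_reserve(count);
}
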
@@ -992,10 +1147,7 @@
 	struct thread *td;
 
 	td = curthread;
-	mtx_lock(&vnode_free_list_mtx);
-	KASSERT(numvnodes >= td->td_vp_reserv, ("reserve too large"));
-	numvnodes -= td->td_vp_reserv;
-	mtx_unlock(&vnode_free_list_mtx);
+	atomic_subtract_long(&numvnodes, td->td_vp_reserv);
 	td->td_vp_reserv = 0;
 }
 
@@ -1007,8 +1159,8 @@
     struct vnode **vpp)
 {
 	struct vnode *vp;
-	struct bufobj *bo;
 	struct thread *td;
+	struct lock_object *lo;
 	int error;
 
 	CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);
@@ -1032,43 +1184,46 @@
 		return (error);
 	}
 #endif
-	numvnodes++;
+	atomic_add_long(&numvnodes, 1);
 	mtx_unlock(&vnode_free_list_mtx);
 alloc:
-	vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
+	atomic_add_long(&vnodes_created, 1);
+	vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK);
 	/*
-	 * Setup locks.
+	 * Locks are given the generic name "vnode" when created.
+	 * Follow the historic practice of using the filesystem
+	 * name when they are allocated, e.g., "zfs", "ufs", "nfs", etc.
+	 *
+	 * Locks live in a witness group keyed on their name. Thus,
+	 * when a lock is renamed, it must also move from the witness
+	 * group of its old name to the witness group of its new name.
+	 *
+	 * The change only needs to be made when the vnode moves
+	 * from one filesystem type to another. We ensure that each
+	 * filesystem uses a single static name pointer for its tag so
+	 * that we can compare pointers rather than doing a strcmp().
 	 */
-	vp->v_vnlock = &vp->v_lock;
-	mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
+	lo = &vp->v_vnlock->lock_object;
+	if (lo->lo_name != tag) {
+		lo->lo_name = tag;
+		WITNESS_DESTROY(lo);
+		WITNESS_INIT(lo, tag);
+	}
 	/*
-	 * By default, don't allow shared locks unless filesystems
-	 * opt-in.
+	 * By default, don't allow shared locks unless filesystems opt-in.
 	 */
-	lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOSHARE);
+	vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE;
 	/*
-	 * Initialize bufobj.
-	 */
-	bo = &vp->v_bufobj;
-	bo->__bo_vnode = vp;
-	mtx_init(BO_MTX(bo), "bufobj interlock", NULL, MTX_DEF);
-	bo->bo_ops = &buf_ops_bio;
-	bo->bo_private = vp;
-	TAILQ_INIT(&bo->bo_clean.bv_hd);
-	TAILQ_INIT(&bo->bo_dirty.bv_hd);
-	/*
-	 * Initialize namecache.
-	 */
-	LIST_INIT(&vp->v_cache_src);
-	TAILQ_INIT(&vp->v_cache_dst);
-	/*
 	 * Finalize various vnode identity bits.
 	 */
+	KASSERT(vp->v_object == NULL, ("stale v_object %p", vp));
+	KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp));
+	KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp));
 	vp->v_type = VNON;
 	vp->v_tag = tag;
 	vp->v_op = vops;
 	v_incr_usecount(vp);
-	vp->v_data = 0;
+	vp->v_bufobj.bo_ops = &buf_ops_bio;
 #ifdef MAC
 	mac_vnode_init(vp);
 	if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
@@ -1077,11 +1232,10 @@
 		printf("NULL mp in getnewvnode()\n");
 #endif
 	if (mp != NULL) {
-		bo->bo_bsize = mp->mnt_stat.f_iosize;
+		vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize;
 		if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
 			vp->v_vflag |= VV_NOKNOTE;
 	}
-	rangelock_init(&vp->v_rl);
 
 	/*
 	 * For the filesystems which do not use vfs_hash_insert(),
@@ -1136,10 +1290,6 @@
 
 	vp->v_data = NULL;
 	vp->v_op = &dead_vnodeops;
-	/* XXX non mp-safe fs may still call insmntque with vnode
-	   unlocked */
-	if (!VOP_ISLOCKED(vp))
-		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	vgone(vp);
 	vput(vp);
 }
@@ -1151,16 +1301,12 @@
 insmntque1(struct vnode *vp, struct mount *mp,
 	void (*dtr)(struct vnode *, void *), void *dtr_arg)
 {
-	int locked;
 
 	KASSERT(vp->v_mount == NULL,
 		("insmntque: vnode already on per mount vnode list"));
 	VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
-#ifdef DEBUG_VFS_LOCKS
-	if (!VFS_NEEDSGIANT(mp))
-		ASSERT_VOP_ELOCKED(vp,
-		    "insmntque: mp-safe fs and non-locked vp");
-#endif
+	ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp");
+
 	/*
 	 * We acquire the vnode interlock early to ensure that the
 	 * vnode cannot be recycled by another process releasing a
@@ -1172,18 +1318,15 @@
 	 */
 	MNT_ILOCK(mp);
 	VI_LOCK(vp);
-	if ((mp->mnt_kern_flag & MNTK_NOINSMNTQ) != 0 &&
+	if (((mp->mnt_kern_flag & MNTK_NOINSMNTQ) != 0 &&
 	    ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 ||
-	     mp->mnt_nvnodelistsize == 0)) {
-		locked = VOP_ISLOCKED(vp);
-		if (!locked || (locked == LK_EXCLUSIVE &&
-		     (vp->v_vflag & VV_FORCEINSMQ) == 0)) {
-			VI_UNLOCK(vp);
-			MNT_IUNLOCK(mp);
-			if (dtr != NULL)
-				dtr(vp, dtr_arg);
-			return (EBUSY);
-		}
+	    mp->mnt_nvnodelistsize == 0)) &&
+	    (vp->v_vflag & VV_FORCEINSMQ) == 0) {
+		VI_UNLOCK(vp);
+		MNT_IUNLOCK(mp);
+		if (dtr != NULL)
+			dtr(vp, dtr_arg);
+		return (EBUSY);
 	}
 	vp->v_mount = mp;
 	MNT_REF(mp);
@@ -1265,9 +1408,9 @@
 		bufobj_wwait(bo, 0, 0);
 		BO_UNLOCK(bo);
 		if (bo->bo_object != NULL) {
-			VM_OBJECT_LOCK(bo->bo_object);
+			VM_OBJECT_WLOCK(bo->bo_object);
 			vm_object_pip_wait(bo->bo_object, "bovlbx");
-			VM_OBJECT_UNLOCK(bo->bo_object);
+			VM_OBJECT_WUNLOCK(bo->bo_object);
 		}
 		BO_LOCK(bo);
 	} while (bo->bo_numoutput > 0);
@@ -1278,10 +1421,10 @@
 	 */
 	if (bo->bo_object != NULL &&
 	    (flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0) {
-		VM_OBJECT_LOCK(bo->bo_object);
+		VM_OBJECT_WLOCK(bo->bo_object);
 		vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ?
 		    OBJPR_CLEANONLY : 0);
-		VM_OBJECT_UNLOCK(bo->bo_object);
+		VM_OBJECT_WUNLOCK(bo->bo_object);
 	}
 
 #ifdef INVARIANTS
@@ -1304,6 +1447,8 @@
 
 	CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
 	ASSERT_VOP_LOCKED(vp, "vinvalbuf");
+	if (vp->v_object != NULL && vp->v_object->handle != vp)
+		return (0);
 	return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo));
 }
 
@@ -1312,7 +1457,7 @@
  *
  */
 static int
-flushbuflist( struct bufv *bufv, int flags, struct bufobj *bo, int slpflag,
+flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag,
     int slptimeo)
 {
 	struct buf *bp, *nbp;
@@ -1320,7 +1465,7 @@
 	daddr_t lblkno;
 	b_xflags_t xflags;
 
-	ASSERT_BO_LOCKED(bo);
+	ASSERT_BO_WLOCKED(bo);
 
 	retval = 0;
 	TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) {
@@ -1332,12 +1477,11 @@
 		xflags = 0;
 		if (nbp != NULL) {
 			lblkno = nbp->b_lblkno;
-			xflags = nbp->b_xflags &
-				(BX_BKGRDMARKER | BX_VNDIRTY | BX_VNCLEAN);
+			xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN);
 		}
 		retval = EAGAIN;
 		error = BUF_TIMELOCK(bp,
-		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_MTX(bo),
+		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo),
 		    "flushbuf", slpflag, slptimeo);
 		if (error) {
 			BO_LOCK(bo);
@@ -1359,17 +1503,13 @@
 		 */
 		if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
 		    (flags & V_SAVE)) {
-			BO_LOCK(bo);
 			bremfree(bp);
-			BO_UNLOCK(bo);
 			bp->b_flags |= B_ASYNC;
 			bwrite(bp);
 			BO_LOCK(bo);
 			return (EAGAIN);	/* XXX: why not loop ? */
 		}
-		BO_LOCK(bo);
 		bremfree(bp);
-		BO_UNLOCK(bo);
 		bp->b_flags |= (B_INVAL | B_RELBUF);
 		bp->b_flags &= ~B_ASYNC;
 		brelse(bp);
@@ -1377,8 +1517,7 @@
 		if (nbp != NULL &&
 		    (nbp->b_bufobj != bo ||
 		     nbp->b_lblkno != lblkno ||
-		     (nbp->b_xflags &
-		      (BX_BKGRDMARKER | BX_VNDIRTY | BX_VNCLEAN)) != xflags))
+		     (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) != xflags))
 			break;			/* nbp invalid */
 	}
 	return (retval);
@@ -1390,8 +1529,7 @@
  * sync activity.
  */
 int
-vtruncbuf(struct vnode *vp, struct ucred *cred, struct thread *td,
-    off_t length, int blksize)
+vtruncbuf(struct vnode *vp, struct ucred *cred, off_t length, int blksize)
 {
 	struct buf *bp, *nbp;
 	int anyfreed;
@@ -1418,12 +1556,10 @@
 				continue;
 			if (BUF_LOCK(bp,
 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
-			    BO_MTX(bo)) == ENOLCK)
+			    BO_LOCKPTR(bo)) == ENOLCK)
 				goto restart;
 
-			BO_LOCK(bo);
 			bremfree(bp);
-			BO_UNLOCK(bo);
 			bp->b_flags |= (B_INVAL | B_RELBUF);
 			bp->b_flags &= ~B_ASYNC;
 			brelse(bp);
@@ -1444,11 +1580,9 @@
 				continue;
 			if (BUF_LOCK(bp,
 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
-			    BO_MTX(bo)) == ENOLCK)
+			    BO_LOCKPTR(bo)) == ENOLCK)
 				goto restart;
-			BO_LOCK(bo);
 			bremfree(bp);
-			BO_UNLOCK(bo);
 			bp->b_flags |= (B_INVAL | B_RELBUF);
 			bp->b_flags &= ~B_ASYNC;
 			brelse(bp);
@@ -1476,15 +1610,13 @@
 			 */
 			if (BUF_LOCK(bp,
 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
-			    BO_MTX(bo)) == ENOLCK) {
+			    BO_LOCKPTR(bo)) == ENOLCK) {
 				goto restart;
 			}
 			VNASSERT((bp->b_flags & B_DELWRI), vp,
 			    ("buf(%p) on dirty queue without DELWRI", bp));
 
-			BO_LOCK(bo);
 			bremfree(bp);
-			BO_UNLOCK(bo);
 			bawrite(bp);
 			BO_LOCK(bo);
 			goto restartsync;
@@ -1498,83 +1630,13 @@
 	return (0);
 }
 
-/*
- * buf_splay() - splay tree core for the clean/dirty list of buffers in
- *		 a vnode.
- *
- *	NOTE: We have to deal with the special case of a background bitmap
- *	buffer, a situation where two buffers will have the same logical
- *	block offset.  We want (1) only the foreground buffer to be accessed
- *	in a lookup and (2) must differentiate between the foreground and
- *	background buffer in the splay tree algorithm because the splay
- *	tree cannot normally handle multiple entities with the same 'index'.
- *	We accomplish this by adding differentiating flags to the splay tree's
- *	numerical domain.
- */
-static
-struct buf *
-buf_splay(daddr_t lblkno, b_xflags_t xflags, struct buf *root)
-{
-	struct buf dummy;
-	struct buf *lefttreemax, *righttreemin, *y;
-
-	if (root == NULL)
-		return (NULL);
-	lefttreemax = righttreemin = &dummy;
-	for (;;) {
-		if (lblkno < root->b_lblkno ||
-		    (lblkno == root->b_lblkno &&
-		    (xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
-			if ((y = root->b_left) == NULL)
-				break;
-			if (lblkno < y->b_lblkno) {
-				/* Rotate right. */
-				root->b_left = y->b_right;
-				y->b_right = root;
-				root = y;
-				if ((y = root->b_left) == NULL)
-					break;
-			}
-			/* Link into the new root's right tree. */
-			righttreemin->b_left = root;
-			righttreemin = root;
-		} else if (lblkno > root->b_lblkno ||
-		    (lblkno == root->b_lblkno &&
-		    (xflags & BX_BKGRDMARKER) > (root->b_xflags & BX_BKGRDMARKER))) {
-			if ((y = root->b_right) == NULL)
-				break;
-			if (lblkno > y->b_lblkno) {
-				/* Rotate left. */
-				root->b_right = y->b_left;
-				y->b_left = root;
-				root = y;
-				if ((y = root->b_right) == NULL)
-					break;
-			}
-			/* Link into the new root's left tree. */
-			lefttreemax->b_right = root;
-			lefttreemax = root;
-		} else {
-			break;
-		}
-		root = y;
-	}
-	/* Assemble the new root. */
-	lefttreemax->b_right = root->b_left;
-	righttreemin->b_left = root->b_right;
-	root->b_left = dummy.b_right;
-	root->b_right = dummy.b_left;
-	return (root);
-}
-
 static void
 buf_vlist_remove(struct buf *bp)
 {
-	struct buf *root;
 	struct bufv *bv;
 
 	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
-	ASSERT_BO_LOCKED(bp->b_bufobj);
+	ASSERT_BO_WLOCKED(bp->b_bufobj);
 	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) !=
 	    (BX_VNDIRTY|BX_VNCLEAN),
 	    ("buf_vlist_remove: Buf %p is on two lists", bp));
@@ -1582,17 +1644,7 @@
 		bv = &bp->b_bufobj->bo_dirty;
 	else
 		bv = &bp->b_bufobj->bo_clean;
-	if (bp != bv->bv_root) {
-		root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root);
-		KASSERT(root == bp, ("splay lookup failed in remove"));
-	}
-	if (bp->b_left == NULL) {
-		root = bp->b_right;
-	} else {
-		root = buf_splay(bp->b_lblkno, bp->b_xflags, bp->b_left);
-		root->b_right = bp->b_right;
-	}
-	bv->bv_root = root;
+	BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno);
 	TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs);
 	bv->bv_cnt--;
 	bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
@@ -1599,8 +1651,7 @@
 }
 
 /*
- * Add the buffer to the sorted clean or dirty block list using a
- * splay tree algorithm.
+ * Add the buffer to the sorted clean or dirty block list.
  *
  * NOTE: xflags is passed as a constant, optimizing this inline function!
  */
@@ -1607,10 +1658,13 @@
 static void
 buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
 {
-	struct buf *root;
 	struct bufv *bv;
+	struct buf *n;
+	int error;
 
-	ASSERT_BO_LOCKED(bo);
+	ASSERT_BO_WLOCKED(bo);
+	KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0,
+	    ("dead bo %p", bo));
 	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
 	    ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags));
 	bp->b_xflags |= xflags;
@@ -1619,26 +1673,22 @@
 	else
 		bv = &bo->bo_clean;
 
-	root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root);
-	if (root == NULL) {
-		bp->b_left = NULL;
-		bp->b_right = NULL;
+	/*
+	 * Keep the list ordered.  Optimize empty list insertion.  Assume
+	 * we tend to grow at the tail so lookup_le should usually be cheaper
+	 * than _ge. 
+	 */
+	if (bv->bv_cnt == 0 ||
+	    bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno)
 		TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs);
-	} else if (bp->b_lblkno < root->b_lblkno ||
-	    (bp->b_lblkno == root->b_lblkno &&
-	    (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
-		bp->b_left = root->b_left;
-		bp->b_right = root;
-		root->b_left = NULL;
-		TAILQ_INSERT_BEFORE(root, bp, b_bobufs);
-	} else {
-		bp->b_right = root->b_right;
-		bp->b_left = root;
-		root->b_right = NULL;
-		TAILQ_INSERT_AFTER(&bv->bv_hd, root, bp, b_bobufs);
-	}
+	else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL)
+		TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs);
+	else
+		TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs);
+	error = BUF_PCTRIE_INSERT(&bv->bv_root, bp);
+	if (error)
+		panic("buf_vlist_add:  Preallocated nodes insufficient.");
 	bv->bv_cnt++;
-	bv->bv_root = bp;
 }
 
 /*
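
With the splay tree gone, buf_vlist_add() above keeps the clean and dirty buf
lists sorted by logical block number: append at the tail when the new key is
the largest (the common case), otherwise find the greatest entry not larger
than the key with a LOOKUP_LE and insert after it, or at the head when there is
none.  The same insertion logic over a plain tail queue, with a linear scan
standing in for BUF_PCTRIE_LOOKUP_LE (illustrative only; assumes a BSD
<sys/queue.h>):

/*
 * Illustrative sketch: ordered insert into a tail queue.
 */
#include <sys/queue.h>
#include <stddef.h>

struct node {
	long			key;
	TAILQ_ENTRY(node)	link;
};
TAILQ_HEAD(nodelist, node);

static struct node *
lookup_le(struct nodelist *head, long key)
{
	struct node *n, *best;

	best = NULL;
	TAILQ_FOREACH(n, head, link) {
		if (n->key > key)
			break;
		best = n;
	}
	return (best);
}

void
ordered_insert(struct nodelist *head, struct node *np)
{
	struct node *prev;

	/* Fast path: growing at the tail. */
	if (TAILQ_EMPTY(head) || np->key > TAILQ_LAST(head, nodelist)->key) {
		TAILQ_INSERT_TAIL(head, np, link);
		return;
	}
	prev = lookup_le(head, np->key);
	if (prev == NULL)
		TAILQ_INSERT_HEAD(head, np, link);
	else
		TAILQ_INSERT_AFTER(head, prev, np, link);
}
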
@@ -1659,23 +1709,10 @@
 	struct buf *bp;
 
 	ASSERT_BO_LOCKED(bo);
-	if ((bp = bo->bo_clean.bv_root) != NULL &&
-	    bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
+	bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno);
+	if (bp != NULL)
 		return (bp);
-	if ((bp = bo->bo_dirty.bv_root) != NULL &&
-	    bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
-		return (bp);
-	if ((bp = bo->bo_clean.bv_root) != NULL) {
-		bo->bo_clean.bv_root = bp = buf_splay(lblkno, 0, bp);
-		if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
-			return (bp);
-	}
-	if ((bp = bo->bo_dirty.bv_root) != NULL) {
-		bo->bo_dirty.bv_root = bp = buf_splay(lblkno, 0, bp);
-		if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
-			return (bp);
-	}
-	return (NULL);
+	return BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno);
 }
 
 /*
@@ -1687,7 +1724,7 @@
 	struct bufobj *bo;
 
 	bo = &vp->v_bufobj;
-	ASSERT_BO_LOCKED(bo);
+	ASSERT_BO_WLOCKED(bo);
 	VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free"));
 
 	CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags);
@@ -1695,8 +1732,6 @@
 	    ("bgetvp: bp already attached! %p", bp));
 
 	vhold(vp);
-	if (VFS_NEEDSGIANT(vp->v_mount) || bo->bo_flag & BO_NEEDSGIANT)
-		bp->b_flags |= B_NEEDSGIANT;
 	bp->b_vp = vp;
 	bp->b_bufobj = bo;
 	/*
@@ -1734,7 +1769,6 @@
 		syncer_worklist_len--;
 		mtx_unlock(&sync_mtx);
 	}
-	bp->b_flags &= ~B_NEEDSGIANT;
 	bp->b_vp = NULL;
 	bp->b_bufobj = NULL;
 	BO_UNLOCK(bo);
@@ -1747,9 +1781,9 @@
 static void
 vn_syncer_add_to_worklist(struct bufobj *bo, int delay)
 {
-	int queue, slot;
+	int slot;
 
-	ASSERT_BO_LOCKED(bo);
+	ASSERT_BO_WLOCKED(bo);
 
 	mtx_lock(&sync_mtx);
 	if (bo->bo_flag & BO_ONWORKLST)
@@ -1763,10 +1797,7 @@
 		delay = syncer_maxdelay - 2;
 	slot = (syncer_delayno + delay) & syncer_mask;
 
-	queue = VFS_NEEDSGIANT(bo->__bo_vnode->v_mount) ? WI_GIANTQ :
-	    WI_MPSAFEQ;
-	LIST_INSERT_HEAD(&syncer_workitem_pending[queue][slot], bo,
-	    bo_synclist);
+	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist);
 	mtx_unlock(&sync_mtx);
 }
 
@@ -1840,6 +1871,8 @@
 	return (0);
 }
 
+static int first_printf = 1;
+
 /*
  * System filesystem synchronizer daemon.
  */
@@ -1846,8 +1879,7 @@
 static void
 sched_sync(void)
 {
-	struct synclist *gnext, *next;
-	struct synclist *gslp, *slp;
+	struct synclist *next, *slp;
 	struct bufobj *bo;
 	long starttime;
 	struct thread *td = curthread;
@@ -1854,12 +1886,10 @@
 	int last_work_seen;
 	int net_worklist_len;
 	int syncer_final_iter;
-	int first_printf;
 	int error;
 
 	last_work_seen = 0;
 	syncer_final_iter = 0;
-	first_printf = 1;
 	syncer_state = SYNCER_RUNNING;
 	starttime = time_uptime;
 	td->td_pflags |= TDP_NORUNNINGBUF;
@@ -1893,13 +1923,11 @@
 		 * Skip over empty worklist slots when shutting down.
 		 */
 		do {
-			slp = &syncer_workitem_pending[WI_MPSAFEQ][syncer_delayno];
-			gslp = &syncer_workitem_pending[WI_GIANTQ][syncer_delayno];
+			slp = &syncer_workitem_pending[syncer_delayno];
 			syncer_delayno += 1;
 			if (syncer_delayno == syncer_maxdelay)
 				syncer_delayno = 0;
-			next = &syncer_workitem_pending[WI_MPSAFEQ][syncer_delayno];
-			gnext = &syncer_workitem_pending[WI_GIANTQ][syncer_delayno];
+			next = &syncer_workitem_pending[syncer_delayno];
 			/*
 			 * If the worklist has wrapped since the
 			 * it was emptied of all but syncer vnodes,
@@ -1913,7 +1941,7 @@
 				syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
 			}
 		} while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
-		    LIST_EMPTY(gslp) && syncer_worklist_len > 0);
+		    syncer_worklist_len > 0);
 
 		/*
 		 * Keep track of the last time there was anything
@@ -1937,21 +1965,6 @@
 				wdog_kern_pat(WD_LASTVAL);
 
 		}
-		if (!LIST_EMPTY(gslp)) {
-			mtx_unlock(&sync_mtx);
-			mtx_lock(&Giant);
-			mtx_lock(&sync_mtx);
-			while (!LIST_EMPTY(gslp)) {
-				error = sync_vnode(gslp, &bo, td);
-				if (error == 1) {
-					LIST_REMOVE(bo, bo_synclist);
-					LIST_INSERT_HEAD(gnext, bo,
-					    bo_synclist);
-					continue;
-				}
-			}
-			mtx_unlock(&Giant);
-		}
 		if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
 			syncer_final_iter--;
 		/*
@@ -2033,6 +2046,25 @@
 	kproc_shutdown(arg, howto);
 }
 
+void
+syncer_suspend(void)
+{
+
+	syncer_shutdown(updateproc, 0);
+}
+
+void
+syncer_resume(void)
+{
+
+	mtx_lock(&sync_mtx);
+	first_printf = 1;
+	syncer_state = SYNCER_RUNNING;
+	mtx_unlock(&sync_mtx);
+	cv_broadcast(&sync_wakeup);
+	kproc_resume(updateproc);
+}
+
 /*
  * Reassign a buffer from one vnode to another.
  * Used to assign file specific control information
@@ -2129,6 +2161,7 @@
 {
 
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
+	vholdl(vp);
 	vp->v_usecount++;
 	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
 		dev_lock();
@@ -2135,7 +2168,6 @@
 		vp->v_rdev->si_usecount++;
 		dev_unlock();
 	}
-	vholdl(vp);
 }
 
 /*
@@ -2212,7 +2244,6 @@
 	int error;
 
 	error = 0;
-	VFS_ASSERT_GIANT(vp->v_mount);
 	VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
 	    ("vget: invalid lock operation"));
 	CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
@@ -2297,7 +2328,6 @@
 		ASSERT_VOP_LOCKED(vp, "vput");
 	else
 		KASSERT(func == VPUTX_VRELE, ("vputx: wrong func"));
-	VFS_ASSERT_GIANT(vp->v_mount);
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 	VI_LOCK(vp);
 
@@ -2343,8 +2373,10 @@
 		}
 		break;
 	case VPUTX_VUNREF:
-		if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
-			error = EBUSY;
+		if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
+			error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK);
+			VI_LOCK(vp);
+		}
 		break;
 	}
 	if (vp->v_usecount > 0)
@@ -2412,11 +2444,15 @@
 	struct mount *mp;
 
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
+#ifdef INVARIANTS
+	/* getnewvnode() calls v_incr_usecount() without holding interlock. */
+	if (vp->v_type != VNON || vp->v_data != NULL)
+		ASSERT_VI_LOCKED(vp, "vholdl");
+#endif
 	vp->v_holdcnt++;
-	if (!VSHOULDBUSY(vp))
+	if ((vp->v_iflag & VI_FREE) == 0)
 		return;
-	ASSERT_VI_LOCKED(vp, "vholdl");
-	VNASSERT((vp->v_iflag & VI_FREE) != 0, vp, ("vnode not free"));
+	VNASSERT(vp->v_holdcnt == 1, vp, ("vholdl: wrong hold count"));
 	VNASSERT(vp->v_op != NULL, vp, ("vholdl: vnode already reclaimed."));
 	/*
 	 * Remove a vnode from the free list, mark it as in use,
@@ -2425,7 +2461,7 @@
 	mtx_lock(&vnode_free_list_mtx);
 	TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
 	freevnodes--;
-	vp->v_iflag &= ~(VI_FREE|VI_AGE);
+	vp->v_iflag &= ~VI_FREE;
 	KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
 	    ("Activating already active vnode"));
 	vp->v_iflag |= VI_ACTIVE;
@@ -2451,6 +2487,10 @@
  * Drop the hold count of the vnode.  If this is the last reference to
  * the vnode we place it on the free list unless it has been vgone'd
  * (marked VI_DOOMED) in which case we will free it.
+ *
+ * Because the vnode vm object keeps a hold reference on the vnode if
+ * there is at least one resident non-cached page, the vnode cannot
+ * leave the active list without the page cleanup done.
  */
 void
 vdropl(struct vnode *vp)
@@ -2477,36 +2517,40 @@
 		    ("vdropl: vnode already reclaimed."));
 		VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
 		    ("vnode already free"));
-		VNASSERT(VSHOULDFREE(vp), vp,
+		VNASSERT(vp->v_holdcnt == 0, vp,
 		    ("vdropl: freeing when we shouldn't"));
 		active = vp->v_iflag & VI_ACTIVE;
-		vp->v_iflag &= ~VI_ACTIVE;
-		mp = vp->v_mount;
-		mtx_lock(&vnode_free_list_mtx);
-		if (active) {
-			TAILQ_REMOVE(&mp->mnt_activevnodelist, vp,
+		if ((vp->v_iflag & VI_OWEINACT) == 0) {
+			vp->v_iflag &= ~VI_ACTIVE;
+			mp = vp->v_mount;
+			mtx_lock(&vnode_free_list_mtx);
+			if (active) {
+				TAILQ_REMOVE(&mp->mnt_activevnodelist, vp,
+				    v_actfreelist);
+				mp->mnt_activevnodelistsize--;
+			}
+			TAILQ_INSERT_TAIL(&vnode_free_list, vp,
 			    v_actfreelist);
-			mp->mnt_activevnodelistsize--;
-		}
-		if (vp->v_iflag & VI_AGE) {
-			TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_actfreelist);
+			freevnodes++;
+			vp->v_iflag |= VI_FREE;
+			mtx_unlock(&vnode_free_list_mtx);
 		} else {
-			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist);
+			atomic_add_long(&free_owe_inact, 1);
 		}
-		freevnodes++;
-		vp->v_iflag &= ~VI_AGE;
-		vp->v_iflag |= VI_FREE;
-		mtx_unlock(&vnode_free_list_mtx);
 		VI_UNLOCK(vp);
 		return;
 	}
 	/*
 	 * The vnode has been marked for destruction, so free it.
+	 *
+	 * The vnode will be returned to the zone where it will
+	 * normally remain until it is needed for another vnode. We
+	 * need to clean up (or verify that the cleanup has already
+	 * been done) any residual data left from its current use
+	 * so as not to contaminate the freshly allocated vnode.
 	 */
 	CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp);
-	mtx_lock(&vnode_free_list_mtx);
-	numvnodes--;
-	mtx_unlock(&vnode_free_list_mtx);
+	atomic_subtract_long(&numvnodes, 1);
 	bo = &vp->v_bufobj;
 	VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
 	    ("cleaned vnode still on the free list."));
@@ -2516,26 +2560,33 @@
 	VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
 	VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
 	VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0"));
-	VNASSERT(bo->bo_clean.bv_root == NULL, vp, ("cleanblkroot not NULL"));
+	VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp,
+	    ("clean blk trie not empty"));
 	VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
-	VNASSERT(bo->bo_dirty.bv_root == NULL, vp, ("dirtyblkroot not NULL"));
+	VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp,
+	    ("dirty blk trie not empty"));
 	VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst"));
 	VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src"));
 	VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for .."));
+	VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp,
+	    ("Dangling rangelock waiters"));
 	VI_UNLOCK(vp);
 #ifdef MAC
 	mac_vnode_destroy(vp);
 #endif
-	if (vp->v_pollinfo != NULL)
+	if (vp->v_pollinfo != NULL) {
 		destroy_vpollinfo(vp->v_pollinfo);
+		vp->v_pollinfo = NULL;
+	}
 #ifdef INVARIANTS
 	/* XXX Elsewhere we detect an already freed vnode via NULL v_op. */
 	vp->v_op = NULL;
 #endif
-	rangelock_destroy(&vp->v_rl);
-	lockdestroy(vp->v_vnlock);
-	mtx_destroy(&vp->v_interlock);
-	mtx_destroy(BO_MTX(bo));
+	bzero(&vp->v_un, sizeof(vp->v_un));
+	vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
+	vp->v_iflag = 0;
+	vp->v_vflag = 0;
+	bo->bo_flag = 0;
 	uma_zfree(vnode_zone, vp);
 }
 
@@ -2560,17 +2611,19 @@
 	VI_UNLOCK(vp);
 	/*
 	 * Before moving off the active list, we must be sure that any
-	 * modified pages are on the vnode's dirty list since these will
-	 * no longer be checked once the vnode is on the inactive list.
-	 * Because the vnode vm object keeps a hold reference on the vnode
-	 * if there is at least one resident non-cached page, the vnode
-	 * cannot leave the active list without the page cleanup done.
+	 * modified pages are converted into the vnode's dirty
+	 * buffers, since these will no longer be checked once the
+	 * vnode is on the inactive list.
+	 *
+	 * The write-out of the dirty pages is asynchronous.  At the
+	 * point that VOP_INACTIVE() is called, there could still be
+	 * pending I/O and dirty pages in the object.
 	 */
 	obj = vp->v_object;
 	if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0) {
-		VM_OBJECT_LOCK(obj);
+		VM_OBJECT_WLOCK(obj);
 		vm_object_page_clean(obj, 0, 0, OBJPC_NOSYNC);
-		VM_OBJECT_UNLOCK(obj);
+		VM_OBJECT_WUNLOCK(obj);
 	}
 	VOP_INACTIVE(vp, td);
 	VI_LOCK(vp);
@@ -2651,9 +2704,9 @@
 		 */
 		if (flags & WRITECLOSE) {
 			if (vp->v_object != NULL) {
-				VM_OBJECT_LOCK(vp->v_object);
+				VM_OBJECT_WLOCK(vp->v_object);
 				vm_object_page_clean(vp->v_object, 0, 0, 0);
-				VM_OBJECT_UNLOCK(vp->v_object);
+				VM_OBJECT_WUNLOCK(vp->v_object);
 			}
 			error = VOP_FSYNC(vp, MNT_WAIT, td);
 			if (error != 0) {
@@ -2681,9 +2734,6 @@
 		 * If FORCECLOSE is set, forcibly close the vnode.
 		 */
 		if (vp->v_usecount == 0 || (flags & FORCECLOSE)) {
-			VNASSERT(vp->v_usecount == 0 ||
-			    (vp->v_type != VCHR && vp->v_type != VBLK), vp,
-			    ("device VNODE %p is FORCECLOSED", vp));
 			vgonel(vp);
 		} else {
 			busy++;
@@ -2727,7 +2777,7 @@
  * Recycle an unused vnode to the front of the free list.
  */
 int
-vrecycle(struct vnode *vp, struct thread *td)
+vrecycle(struct vnode *vp)
 {
 	int recycled;
 
@@ -2821,7 +2871,7 @@
 /*
  * vgone, with the vp interlock held.
  */
-void
+static void
 vgonel(struct vnode *vp)
 {
 	struct thread *td;
@@ -2853,16 +2903,6 @@
 	vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM);
 
 	/*
-	 * Clean out any buffers associated with the vnode.
-	 * If the flush fails, just toss the buffers.
-	 */
-	mp = NULL;
-	if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd))
-		(void) vn_start_secondary_write(vp, &mp, V_WAIT);
-	if (vinvalbuf(vp, V_SAVE, 0, 0) != 0)
-		vinvalbuf(vp, 0, 0, 0);
-
-	/*
 	 * If purging an active vnode, it must be closed and
 	 * deactivated before being reclaimed.
 	 */
@@ -2876,7 +2916,35 @@
 	}
 	if (vp->v_type == VSOCK)
 		vfs_unp_reclaim(vp);
+
 	/*
+	 * Clean out any buffers associated with the vnode.
+	 * If the flush fails, just toss the buffers.
+	 */
+	mp = NULL;
+	if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd))
+		(void) vn_start_secondary_write(vp, &mp, V_WAIT);
+	if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) {
+		while (vinvalbuf(vp, 0, 0, 0) != 0)
+			;
+	}
+
+	BO_LOCK(&vp->v_bufobj);
+	KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) &&
+	    vp->v_bufobj.bo_dirty.bv_cnt == 0 &&
+	    TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) &&
+	    vp->v_bufobj.bo_clean.bv_cnt == 0,
+	    ("vp %p bufobj not invalidated", vp));
+
+	/*
+	 * For VMIO bufobj, BO_DEAD is set in vm_object_terminate()
+	 * after the object's page queue is flushed.
+	 */
+	if (vp->v_bufobj.bo_object == NULL)
+		vp->v_bufobj.bo_flag |= BO_DEAD;
+	BO_UNLOCK(&vp->v_bufobj);
+
+	/*
 	 * Reclaim the vnode.
 	 */
 	if (VOP_RECLAIM(vp, td))
@@ -2889,6 +2957,7 @@
 	 * Clear the advisory locks and wake up waiting threads.
 	 */
 	(void)VOP_ADVLOCKPURGE(vp);
+	vp->v_lockf = NULL;
 	/*
 	 * Delete from old mount point vnode list.
 	 */
@@ -2952,8 +3021,25 @@
 	va_end(ap);
 	printf("%p: ", (void *)vp);
 	printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]);
-	printf("    usecount %d, writecount %d, refcount %d mountedhere %p\n",
-	    vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere);
+	printf("    usecount %d, writecount %d, refcount %d",
+	    vp->v_usecount, vp->v_writecount, vp->v_holdcnt);
+	switch (vp->v_type) {
+	case VDIR:
+		printf(" mountedhere %p\n", vp->v_mountedhere);
+		break;
+	case VCHR:
+		printf(" rdev %p\n", vp->v_rdev);
+		break;
+	case VSOCK:
+		printf(" socket %p\n", vp->v_socket);
+		break;
+	case VFIFO:
+		printf(" fifoinfo %p\n", vp->v_fifoinfo);
+		break;
+	default:
+		printf("\n");
+		break;
+	}
 	buf[0] = '\0';
 	buf[1] = '\0';
 	if (vp->v_vflag & VV_ROOT)
@@ -2991,8 +3077,6 @@
 	}
 	if (vp->v_iflag & VI_MOUNT)
 		strlcat(buf, "|VI_MOUNT", sizeof(buf));
-	if (vp->v_iflag & VI_AGE)
-		strlcat(buf, "|VI_AGE", sizeof(buf));
 	if (vp->v_iflag & VI_DOOMED)
 		strlcat(buf, "|VI_DOOMED", sizeof(buf));
 	if (vp->v_iflag & VI_FREE)
@@ -3003,7 +3087,7 @@
 		strlcat(buf, "|VI_DOINGINACT", sizeof(buf));
 	if (vp->v_iflag & VI_OWEINACT)
 		strlcat(buf, "|VI_OWEINACT", sizeof(buf));
-	flags = vp->v_iflag & ~(VI_MOUNT | VI_AGE | VI_DOOMED | VI_FREE |
+	flags = vp->v_iflag & ~(VI_MOUNT | VI_DOOMED | VI_FREE |
 	    VI_ACTIVE | VI_DOINGINACT | VI_OWEINACT);
 	if (flags != 0) {
 		snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags);
@@ -3013,9 +3097,12 @@
 	if (mtx_owned(VI_MTX(vp)))
 		printf(" VI_LOCKed");
 	if (vp->v_object != NULL)
-		printf("    v_object %p ref %d pages %d\n",
+		printf("    v_object %p ref %d pages %d "
+		    "cleanbuf %d dirtybuf %d\n",
 		    vp->v_object, vp->v_object->ref_count,
-		    vp->v_object->resident_page_count);
+		    vp->v_object->resident_page_count,
+		    vp->v_bufobj.bo_clean.bv_cnt,
+		    vp->v_bufobj.bo_dirty.bv_cnt);
 	printf("    ");
 	lockmgr_printinfo(vp->v_vnlock);
 	if (vp->v_data != NULL)
@@ -3029,7 +3116,7 @@
  */
 DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
 {
-	struct mount *mp, *nmp;
+	struct mount *mp;
 	struct vnode *vp;
 
 	/*
@@ -3039,14 +3126,11 @@
 	 * about that.
 	 */
 	db_printf("Locked vnodes\n");
-	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
-		nmp = TAILQ_NEXT(mp, mnt_list);
+	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
-			if (vp->v_type != VMARKER &&
-			    VOP_ISLOCKED(vp))
+			if (vp->v_type != VMARKER && VOP_ISLOCKED(vp))
 				vprint("", vp);
 		}
-		nmp = TAILQ_NEXT(mp, mnt_list);
 	}
 }
 
@@ -3170,6 +3254,7 @@
 	MNT_KERN_FLAG(MNTK_VGONE_WAITER);
 	MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT);
 	MNT_KERN_FLAG(MNTK_MARKER);
+	MNT_KERN_FLAG(MNTK_USES_BCACHE);
 	MNT_KERN_FLAG(MNTK_NOASYNC);
 	MNT_KERN_FLAG(MNTK_UNMOUNT);
 	MNT_KERN_FLAG(MNTK_MWAIT);
@@ -3176,7 +3261,6 @@
 	MNT_KERN_FLAG(MNTK_SUSPEND);
 	MNT_KERN_FLAG(MNTK_SUSPEND2);
 	MNT_KERN_FLAG(MNTK_SUSPENDED);
-	MNT_KERN_FLAG(MNTK_MPSAFE);
 	MNT_KERN_FLAG(MNTK_LOOKUP_SHARED);
 	MNT_KERN_FLAG(MNTK_NOKNOTE);
 #undef MNT_KERN_FLAG
@@ -3228,6 +3312,7 @@
 	db_printf("    mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen);
 	db_printf("    mnt_iosize_max = %d\n", mp->mnt_iosize_max);
 	db_printf("    mnt_hashseed = %u\n", mp->mnt_hashseed);
+	db_printf("    mnt_lockref = %d\n", mp->mnt_lockref);
 	db_printf("    mnt_secondary_writes = %d\n", mp->mnt_secondary_writes);
 	db_printf("    mnt_secondary_accwrites = %d\n",
 	    mp->mnt_secondary_accwrites);
@@ -3290,12 +3375,11 @@
 {
 	struct xvfsconf32 xvfsp;
 
+	bzero(&xvfsp, sizeof(xvfsp));
 	strcpy(xvfsp.vfc_name, vfsp->vfc_name);
 	xvfsp.vfc_typenum = vfsp->vfc_typenum;
 	xvfsp.vfc_refcount = vfsp->vfc_refcount;
 	xvfsp.vfc_flags = vfsp->vfc_flags;
-	xvfsp.vfc_vfsops = 0;
-	xvfsp.vfc_next = 0;
 	return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
 }
 #endif
@@ -3310,6 +3394,7 @@
 	int error;
 
 	error = 0;
+	vfsconf_slock();
 	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
 #ifdef COMPAT_FREEBSD32
 		if (req->flags & SCTL_MASK32)
@@ -3320,11 +3405,12 @@
 		if (error)
 			break;
 	}
+	vfsconf_sunlock();
 	return (error);
 }
 
-SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD,
-    NULL, 0, sysctl_vfs_conflist,
+SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD |
+    CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist,
     "S,xvfsconf", "List of all configured filesystems");
 
 #ifndef BURN_BRIDGES
@@ -3337,7 +3423,7 @@
 	u_int namelen = arg2 + 1;	/* XXX */
 	struct vfsconf *vfsp;
 
-	printf("WARNING: userland calling deprecated sysctl, "
+	log(LOG_WARNING, "userland calling deprecated sysctl, "
 	    "please rebuild world\n");
 
 #if 1 || defined(COMPAT_PRELITE2)
@@ -3354,9 +3440,12 @@
 	case VFS_CONF:
 		if (namelen != 3)
 			return (ENOTDIR);	/* overloaded */
-		TAILQ_FOREACH(vfsp, &vfsconf, vfc_list)
+		vfsconf_slock();
+		TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
 			if (vfsp->vfc_typenum == name[2])
 				break;
+		}
+		vfsconf_sunlock();
 		if (vfsp == NULL)
 			return (EOPNOTSUPP);
 #ifdef COMPAT_FREEBSD32
@@ -3369,8 +3458,9 @@
 	return (EOPNOTSUPP);
 }
 
-static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP,
-    vfs_sysctl, "Generic filesystem");
+static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP |
+    CTLFLAG_MPSAFE, vfs_sysctl,
+    "Generic filesystem");
 
 #if 1 || defined(COMPAT_PRELITE2)
 
@@ -3381,6 +3471,7 @@
 	struct vfsconf *vfsp;
 	struct ovfsconf ovfs;
 
+	vfsconf_slock();
 	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
 		bzero(&ovfs, sizeof(ovfs));
 		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
@@ -3389,10 +3480,13 @@
 		ovfs.vfc_refcount = vfsp->vfc_refcount;
 		ovfs.vfc_flags = vfsp->vfc_flags;
 		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
-		if (error)
-			return error;
+		if (error != 0) {
+			vfsconf_sunlock();
+			return (error);
+		}
 	}
-	return 0;
+	vfsconf_sunlock();
+	return (0);
 }
 
 #endif /* 1 || COMPAT_PRELITE2 */
@@ -3490,10 +3584,26 @@
 	return (error);
 }
 
-SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
-    0, 0, sysctl_vnode, "S,xvnode", "");
+SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE | CTLFLAG_RD |
+    CTLFLAG_MPSAFE, 0, 0, sysctl_vnode, "S,xvnode",
+    "");
 #endif
 
+static void
+unmount_or_warn(struct mount *mp)
+{
+	int error;
+
+	error = dounmount(mp, MNT_FORCE, curthread);
+	if (error != 0 && strcmp(mp->mnt_vfc->vfc_name, "devfs") != 0) {
+		printf("unmount of %s failed (", mp->mnt_stat.f_mntonname);
+		if (error == EBUSY)
+			printf("BUSY)\n");
+		else
+			printf("%d)\n", error);
+	}
+}
+
 /*
  * Unmount all filesystems. The list is traversed in reverse order
  * of mounting to avoid dependencies.
@@ -3501,41 +3611,28 @@
 void
 vfs_unmountall(void)
 {
-	struct mount *mp;
-	struct thread *td;
-	int error;
+	struct mount *mp, *tmp;
 
 	CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__);
-	td = curthread;
 
 	/*
 	 * Since this only runs when rebooting, it is not interlocked.
 	 */
-	while(!TAILQ_EMPTY(&mountlist)) {
-		mp = TAILQ_LAST(&mountlist, mntlist);
-		error = dounmount(mp, MNT_FORCE, td);
-		if (error) {
-			TAILQ_REMOVE(&mountlist, mp, mnt_list);
-			/*
-			 * XXX: Due to the way in which we mount the root
-			 * file system off of devfs, devfs will generate a
-			 * "busy" warning when we try to unmount it before
-			 * the root.  Don't print a warning as a result in
-			 * order to avoid false positive errors that may
-			 * cause needless upset.
-			 */
-			if (strcmp(mp->mnt_vfc->vfc_name, "devfs") != 0) {
-				printf("unmount of %s failed (",
-				    mp->mnt_stat.f_mntonname);
-				if (error == EBUSY)
-					printf("BUSY)\n");
-				else
-					printf("%d)\n", error);
-			}
-		} else {
-			/* The unmount has removed mp from the mountlist */
-		}
+	TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) {
+		vfs_ref(mp);
+
+		/*
+		 * Forcibly unmounting "/dev" before "/" would prevent clean
+		 * unmount of the latter.
+		 */
+		if (mp == rootdevmp)
+			continue;
+
+		unmount_or_warn(mp);
 	}
+
+	if (rootdevmp != NULL)
+		unmount_or_warn(rootdevmp);
 }
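
The rewritten vfs_unmountall() walks the mount list in reverse with the _SAFE
iterator (dounmount() removes entries while we iterate), skips rootdevmp during
the walk, and unmounts it last so "/dev" does not disappear before "/".  A
standalone sketch of that deferred-special-entry traversal (illustrative only;
assumes a BSD <sys/queue.h> that provides TAILQ_FOREACH_REVERSE_SAFE):

/*
 * Illustrative sketch: reverse-order teardown that defers one special
 * entry to the very end.
 */
#include <sys/queue.h>
#include <stdio.h>
#include <stdlib.h>

struct ent {
	int			id;
	TAILQ_ENTRY(ent)	link;
};
TAILQ_HEAD(entlist, ent);

static void
teardown(struct entlist *head, struct ent *e)
{

	printf("tearing down %d\n", e->id);
	TAILQ_REMOVE(head, e, link);		/* like dounmount() removing mp */
	free(e);
}

int
main(void)
{
	struct entlist head = TAILQ_HEAD_INITIALIZER(head);
	struct ent *e, *tmp, *special;
	int i;

	special = NULL;
	for (i = 0; i < 4; i++) {
		e = malloc(sizeof(*e));
		if (e == NULL)
			return (1);
		e->id = i;
		TAILQ_INSERT_TAIL(&head, e, link);
		if (i == 0)
			special = e;		/* stands in for rootdevmp */
	}
	TAILQ_FOREACH_REVERSE_SAFE(e, &head, entlist, link, tmp) {
		if (e == special)
			continue;		/* defer, like skipping "/dev" */
		teardown(&head, e);
	}
	if (special != NULL)
		teardown(&head, special);
	return (0);
}
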
 
 /*
@@ -3563,11 +3660,11 @@
 
 				obj = vp->v_object;
 				if (obj != NULL) {
-					VM_OBJECT_LOCK(obj);
+					VM_OBJECT_WLOCK(obj);
 					vm_object_page_clean(obj, 0, 0,
 					    flags == MNT_WAIT ?
 					    OBJPC_SYNC : OBJPC_NOSYNC);
-					VM_OBJECT_UNLOCK(obj);
+					VM_OBJECT_WUNLOCK(obj);
 				}
 				vput(vp);
 			}
@@ -3577,16 +3674,25 @@
 }
 
 static void
-destroy_vpollinfo(struct vpollinfo *vi)
+destroy_vpollinfo_free(struct vpollinfo *vi)
 {
-	seldrain(&vi->vpi_selinfo);
+
 	knlist_destroy(&vi->vpi_selinfo.si_note);
 	mtx_destroy(&vi->vpi_lock);
 	uma_zfree(vnodepoll_zone, vi);
 }
 
+static void
+destroy_vpollinfo(struct vpollinfo *vi)
+{
+
+	knlist_clear(&vi->vpi_selinfo.si_note, 1);
+	seldrain(&vi->vpi_selinfo);
+	destroy_vpollinfo_free(vi);
+}
+
 /*
- * Initalize per-vnode helper structure to hold poll-related state.
+ * Initialize per-vnode helper structure to hold poll-related state.
  */
 void
 v_addpollinfo(struct vnode *vp)
@@ -3602,7 +3708,7 @@
 	VI_LOCK(vp);
 	if (vp->v_pollinfo != NULL) {
 		VI_UNLOCK(vp);
-		destroy_vpollinfo(vi);
+		destroy_vpollinfo_free(vi);
 		return;
 	}
 	vp->v_pollinfo = vi;
@@ -3763,11 +3869,8 @@
 	 * Walk the list of vnodes pushing all that are dirty and
 	 * not already on the sync list.
 	 */
-	mtx_lock(&mountlist_mtx);
-	if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) != 0) {
-		mtx_unlock(&mountlist_mtx);
+	if (vfs_busy(mp, MBF_NOWAIT) != 0)
 		return (0);
-	}
 	if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
 		vfs_unbusy(mp);
 		return (0);
@@ -3828,11 +3931,13 @@
 {
 	int error;
 
+	if (vp->v_type != VCHR) {
+		error = ENOTBLK;
+		goto out;
+	}
 	error = 0;
 	dev_lock();
-	if (vp->v_type != VCHR)
-		error = ENOTBLK;
-	else if (vp->v_rdev == NULL)
+	if (vp->v_rdev == NULL)
 		error = ENXIO;
 	else if (vp->v_rdev->si_devsw == NULL)
 		error = ENXIO;
@@ -3839,6 +3944,7 @@
 	else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK))
 		error = ENOTBLK;
 	dev_unlock();
+out:
 	if (errp != NULL)
 		*errp = error;
 	return (error == 0);
@@ -3997,7 +4103,7 @@
 
 #ifdef DEBUG_VFS_LOCKS
 /*
- * This only exists to supress warnings from unlocked specfs accesses.  It is
+ * This only exists to suppress warnings from unlocked specfs accesses.  It is
  * no longer ok to have an unlocked VFS.
  */
 #define	IGNORE_LOCK(vp) (panicstr != NULL || (vp) == NULL ||		\
@@ -4268,6 +4374,15 @@
 }
 
 void
+vop_reclaim_post(void *ap, int rc)
+{
+	struct vop_reclaim_args *a = ap;
+
+	if (!rc)
+		VFS_KNOTE_LOCKED(a->a_vp, NOTE_REVOKE);
+}
+
+void
 vop_remove_post(void *ap, int rc)
 {
 	struct vop_remove_args *a = ap;
@@ -4282,10 +4397,27 @@
 vop_rename_post(void *ap, int rc)
 {
 	struct vop_rename_args *a = ap;
+	long hint;
 
 	if (!rc) {
-		VFS_KNOTE_UNLOCKED(a->a_fdvp, NOTE_WRITE);
-		VFS_KNOTE_UNLOCKED(a->a_tdvp, NOTE_WRITE);
+		hint = NOTE_WRITE;
+		if (a->a_fdvp == a->a_tdvp) {
+			if (a->a_tvp != NULL && a->a_tvp->v_type == VDIR)
+				hint |= NOTE_LINK;
+			VFS_KNOTE_UNLOCKED(a->a_fdvp, hint);
+			VFS_KNOTE_UNLOCKED(a->a_tdvp, hint);
+		} else {
+			hint |= NOTE_EXTEND;
+			if (a->a_fvp->v_type == VDIR)
+				hint |= NOTE_LINK;
+			VFS_KNOTE_UNLOCKED(a->a_fdvp, hint);
+
+			if (a->a_fvp->v_type == VDIR && a->a_tvp != NULL &&
+			    a->a_tvp->v_type == VDIR)
+				hint &= ~NOTE_LINK;
+			VFS_KNOTE_UNLOCKED(a->a_tdvp, hint);
+		}
+
 		VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME);
 		if (a->a_tvp)
 			VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE);
@@ -4337,6 +4469,45 @@
 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
 }
 
+void
+vop_open_post(void *ap, int rc)
+{
+	struct vop_open_args *a = ap;
+
+	if (!rc)
+		VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN);
+}
+
+void
+vop_close_post(void *ap, int rc)
+{
+	struct vop_close_args *a = ap;
+
+	if (!rc && (a->a_cred != NOCRED || /* filter out revokes */
+	    (a->a_vp->v_iflag & VI_DOOMED) == 0)) {
+		VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) != 0 ?
+		    NOTE_CLOSE_WRITE : NOTE_CLOSE);
+	}
+}
+
+void
+vop_read_post(void *ap, int rc)
+{
+	struct vop_read_args *a = ap;
+
+	if (!rc)
+		VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ);
+}
+
+void
+vop_readdir_post(void *ap, int rc)
+{
+	struct vop_readdir_args *a = ap;
+
+	if (!rc)
+		VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ);
+}
+
 static struct knlist fs_knlist;
 
 static void
@@ -4516,6 +4687,7 @@
 	if (vp->v_pollinfo == NULL)
 		return (ENOMEM);
 	knl = &vp->v_pollinfo->vpi_selinfo.si_note;
+	vhold(vp);
 	knlist_add(knl, kn, 0);
 
 	return (0);
@@ -4531,6 +4703,7 @@
 
 	KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo"));
 	knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
+	vdrop(vp);
 }
 
 /*ARGSUSED*/
@@ -4545,7 +4718,7 @@
 	 * filesystem is gone, so set the EOF flag and schedule
 	 * the knote for deletion.
 	 */
-	if (hint == NOTE_REVOKE) {
+	if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) {
 		VI_LOCK(vp);
 		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
 		VI_UNLOCK(vp);
@@ -4574,7 +4747,7 @@
 	 * filesystem is gone, so set the EOF flag and schedule
 	 * the knote for deletion.
 	 */
-	if (hint == NOTE_REVOKE)
+	if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD))
 		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
 
 	kn->kn_data = 0;
@@ -4591,7 +4764,7 @@
 	VI_LOCK(vp);
 	if (kn->kn_sfflags & hint)
 		kn->kn_fflags |= hint;
-	if (hint == NOTE_REVOKE) {
+	if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) {
 		kn->kn_flags |= EV_EOF;
 		VI_UNLOCK(vp);
 		return (1);
@@ -4627,6 +4800,7 @@
 	*ap->a_cookies = realloc(*ap->a_cookies,
 	    (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO);
 	(*ap->a_cookies)[*ap->a_ncookies] = off;
+	*ap->a_ncookies += 1;
 	return (0);
 }
 
@@ -4642,7 +4816,6 @@
 	struct mount *mp;
 
 	mp = vp->v_mount;
-	VFS_ASSERT_GIANT(mp);
 	ASSERT_VOP_LOCKED(vp, "vfs_mark_atime");
 	if (mp != NULL && (mp->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0)
 		(void)VOP_MARKATIME(vp);
@@ -4710,15 +4883,21 @@
 	struct vnode *vp;
 
 	if (should_yield())
-		kern_yield(PRI_UNCHANGED);
+		kern_yield(PRI_USER);
 	MNT_ILOCK(mp);
 	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
-	vp = TAILQ_NEXT(*mvp, v_nmntvnodes);
-	while (vp != NULL && (vp->v_type == VMARKER ||
-	    (vp->v_iflag & VI_DOOMED) != 0))
-		vp = TAILQ_NEXT(vp, v_nmntvnodes);
-
-	/* Check if we are done */
+	for (vp = TAILQ_NEXT(*mvp, v_nmntvnodes); vp != NULL;
+	    vp = TAILQ_NEXT(vp, v_nmntvnodes)) {
+		/* Allow a racy peek at VI_DOOMED to save a lock acquisition. */
+		if (vp->v_type == VMARKER || (vp->v_iflag & VI_DOOMED) != 0)
+			continue;
+		VI_LOCK(vp);
+		if ((vp->v_iflag & VI_DOOMED) != 0) {
+			VI_UNLOCK(vp);
+			continue;
+		}
+		break;
+	}
 	if (vp == NULL) {
 		__mnt_vnode_markerfree_all(mvp, mp);
 		/* MNT_IUNLOCK(mp); -- done in above function */
@@ -4727,7 +4906,6 @@
 	}
 	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
 	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
-	VI_LOCK(vp);
 	MNT_IUNLOCK(mp);
 	return (vp);
 }
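
The iterator above is allowed a racy, unlocked peek at VI_DOOMED purely to skip
dead vnodes cheaply; the flag is checked again under the vnode interlock before
the vnode is handed back.  That double-check pattern in a small pthread/C11
sketch (illustrative only; the entry type and its initialization are invented):

/*
 * Illustrative sketch: cheap unlocked rejection, then a locked re-check.
 * Returns true with e->lock held, false with it released.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

struct entry {
	pthread_mutex_t	lock;		/* initialized elsewhere */
	atomic_bool	doomed;
};

bool
try_select(struct entry *e)
{

	/* Racy peek: avoids taking the lock for obviously dead entries. */
	if (atomic_load_explicit(&e->doomed, memory_order_relaxed))
		return (false);
	pthread_mutex_lock(&e->lock);
	/* Re-check under the lock; the flag may have been set meanwhile. */
	if (atomic_load_explicit(&e->doomed, memory_order_relaxed)) {
		pthread_mutex_unlock(&e->lock);
		return (false);
	}
	return (true);			/* caller proceeds with e->lock held */
}
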
@@ -4740,14 +4918,20 @@
 	*mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
 	MNT_ILOCK(mp);
 	MNT_REF(mp);
+	(*mvp)->v_mount = mp;
 	(*mvp)->v_type = VMARKER;
 
-	vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
-	while (vp != NULL && (vp->v_type == VMARKER ||
-	    (vp->v_iflag & VI_DOOMED) != 0))
-		vp = TAILQ_NEXT(vp, v_nmntvnodes);
-
-	/* Check if we are done */
+	TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
+		/* Allow a racy peek at VI_DOOMED to save a lock acquisition. */
+		if (vp->v_type == VMARKER || (vp->v_iflag & VI_DOOMED) != 0)
+			continue;
+		VI_LOCK(vp);
+		if ((vp->v_iflag & VI_DOOMED) != 0) {
+			VI_UNLOCK(vp);
+			continue;
+		}
+		break;
+	}
 	if (vp == NULL) {
 		MNT_REL(mp);
 		MNT_IUNLOCK(mp);
@@ -4755,14 +4939,11 @@
 		*mvp = NULL;
 		return (NULL);
 	}
-	(*mvp)->v_mount = mp;
 	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
-	VI_LOCK(vp);
 	MNT_IUNLOCK(mp);
 	return (vp);
 }
 
-
 void
 __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp)
 {
@@ -4799,12 +4980,6 @@
 	*mvp = NULL;
 }
 
-#ifdef SMP
-#define	ALWAYS_YIELD	(mp_ncpus == 1)
-#else
-#define	ALWAYS_YIELD	1
-#endif
-
 static struct vnode *
 mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
 {
@@ -4821,10 +4996,10 @@
 			continue;
 		}
 		if (!VI_TRYLOCK(vp)) {
-			if (ALWAYS_YIELD || should_yield()) {
+			if (mp_ncpus == 1 || should_yield()) {
 				TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist);
 				mtx_unlock(&vnode_free_list_mtx);
-				kern_yield(PRI_USER);
+				pause("vnacti", 1);
 				mtx_lock(&vnode_free_list_mtx);
 				goto restart;
 			}
@@ -4852,7 +5027,6 @@
 	KASSERT((vp->v_iflag & VI_ACTIVE) != 0, ("Non-active vp %p", vp));
 	return (vp);
 }
-#undef ALWAYS_YIELD
 
 struct vnode *
 __mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
@@ -4859,7 +5033,7 @@
 {
 
 	if (should_yield())
-		kern_yield(PRI_UNCHANGED);
+		kern_yield(PRI_USER);
 	mtx_lock(&vnode_free_list_mtx);
 	return (mnt_vnode_next_active(mvp, mp));
 }

Modified: trunk/sys/kern/vfs_syscalls.c
===================================================================
--- trunk/sys/kern/vfs_syscalls.c	2018-05-25 21:07:58 UTC (rev 9951)
+++ trunk/sys/kern/vfs_syscalls.c	2018-05-26 14:24:52 UTC (rev 9952)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
@@ -35,7 +36,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/vfs_syscalls.c 325099 2017-10-29 09:48:28Z kib $");
 
 #include "opt_capsicum.h"
 #include "opt_compat.h"
@@ -46,7 +47,7 @@
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
-#include <sys/capability.h>
+#include <sys/capsicum.h>
 #include <sys/disk.h>
 #include <sys/sysent.h>
 #include <sys/malloc.h>
@@ -61,6 +62,7 @@
 #include <sys/filio.h>
 #include <sys/limits.h>
 #include <sys/linker.h>
+#include <sys/rwlock.h>
 #include <sys/sdt.h>
 #include <sys/stat.h>
 #include <sys/sx.h>
@@ -91,16 +93,18 @@
 MALLOC_DEFINE(M_FADVISE, "fadvise", "posix_fadvise(2) information");
 
 SDT_PROVIDER_DEFINE(vfs);
-SDT_PROBE_DEFINE(vfs, , stat, mode, mode);
-SDT_PROBE_ARGTYPE(vfs, , stat, mode, 0, "char *");
-SDT_PROBE_ARGTYPE(vfs, , stat, mode, 1, "int");
-SDT_PROBE_DEFINE(vfs, , stat, reg, reg);
-SDT_PROBE_ARGTYPE(vfs, , stat, reg, 0, "char *");
-SDT_PROBE_ARGTYPE(vfs, , stat, reg, 1, "int");
+SDT_PROBE_DEFINE2(vfs, , stat, mode, "char *", "int");
+SDT_PROBE_DEFINE2(vfs, , stat, reg, "char *", "int");
 
 static int chroot_refuse_vdir_fds(struct filedesc *fdp);
+static int kern_chflags(struct thread *td, const char *path,
+    enum uio_seg pathseg, u_long flags);
+static int kern_chflagsat(struct thread *td, int fd, const char *path,
+    enum uio_seg pathseg, u_long flags, int atflag);
+static int setfflags(struct thread *td, struct vnode *, u_long);
 static int getutimes(const struct timeval *, enum uio_seg, struct timespec *);
-static int setfflags(struct thread *td, struct vnode *, int);
+static int getutimens(const struct timespec *, enum uio_seg,
+    struct timespec *, int *);
 static int setutimes(struct thread *td, struct vnode *,
     const struct timespec *, int, int);
 static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred,
@@ -114,11 +118,6 @@
  */
 int async_io_version;
 
-#ifdef DEBUG
-static int syncprt = 0;
-SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, "");
-#endif
-
 /*
  * Sync each mounted filesystem.
  */
@@ -134,7 +133,7 @@
 	struct sync_args *uap;
 {
 	struct mount *mp, *nmp;
-	int save, vfslocked;
+	int save;
 
 	mtx_lock(&mountlist_mtx);
 	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
@@ -142,7 +141,6 @@
 			nmp = TAILQ_NEXT(mp, mnt_list);
 			continue;
 		}
-		vfslocked = VFS_LOCK_GIANT(mp);
 		if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
 		    vn_start_write(NULL, &mp, V_NOWAIT) == 0) {
 			save = curthread_pflags_set(TDP_SYNCIO);
@@ -151,7 +149,6 @@
 			curthread_pflags_restore(save);
 			vn_finished_write(mp);
 		}
-		VFS_UNLOCK_GIANT(vfslocked);
 		mtx_lock(&mountlist_mtx);
 		nmp = TAILQ_NEXT(mp, mnt_list);
 		vfs_unbusy(mp);
@@ -182,19 +179,17 @@
 	} */ *uap;
 {
 	struct mount *mp;
-	int vfslocked;
+	struct nameidata nd;
 	int error;
-	struct nameidata nd;
 
 	AUDIT_ARG_CMD(uap->cmd);
 	AUDIT_ARG_UID(uap->uid);
 	if (!prison_allow(td->td_ucred, PR_ALLOW_QUOTAS))
 		return (EPERM);
-	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
-	   UIO_USERSPACE, uap->path, td);
+	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
+	    uap->path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
-	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	mp = nd.ni_vp->v_mount;
 	vfs_ref(mp);
@@ -201,10 +196,8 @@
 	vput(nd.ni_vp);
 	error = vfs_busy(mp, 0);
 	vfs_rel(mp);
-	if (error) {
-		VFS_UNLOCK_GIANT(vfslocked);
+	if (error != 0)
 		return (error);
-	}
 	error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, uap->arg);
 
 	/*
@@ -220,7 +213,6 @@
 	 */
 	if ((uap->cmd >> SUBCMDSHIFT) != Q_QUOTAON)
 		vfs_unbusy(mp);
-	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
@@ -265,6 +257,43 @@
 	sf->f_bavail >>= shift;
 }
 
+static int
+kern_do_statfs(struct thread *td, struct mount *mp, struct statfs *buf)
+{
+	struct statfs *sp;
+	int error;
+
+	if (mp == NULL)
+		return (EBADF);
+	error = vfs_busy(mp, 0);
+	vfs_rel(mp);
+	if (error != 0)
+		return (error);
+#ifdef MAC
+	error = mac_mount_check_stat(td->td_ucred, mp);
+	if (error != 0)
+		goto out;
+#endif
+	/*
+	 * Set these in case the underlying filesystem fails to do so.
+	 */
+	sp = &mp->mnt_stat;
+	sp->f_version = STATFS_VERSION;
+	sp->f_namemax = NAME_MAX;
+	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
+	error = VFS_STATFS(mp, sp);
+	if (error != 0)
+		goto out;
+	*buf = *sp;
+	if (priv_check(td, PRIV_VFS_GENERATION)) {
+		buf->f_fsid.val[0] = buf->f_fsid.val[1] = 0;
+		prison_enforce_statfs(td->td_ucred, mp, buf);
+	}
+out:
+	vfs_unbusy(mp);
+	return (error);
+}
+
 /*
  * Get filesystem statistics.
  */
@@ -296,53 +325,19 @@
     struct statfs *buf)
 {
 	struct mount *mp;
-	struct statfs *sp, sb;
-	int vfslocked;
+	struct nameidata nd;
 	int error;
-	struct nameidata nd;
 
-	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE |
-	    AUDITVNODE1, pathseg, path, td);
+	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
+	    pathseg, path, td);
 	error = namei(&nd);
-	if (error)
+	if (error != 0)
 		return (error);
-	vfslocked = NDHASGIANT(&nd);
 	mp = nd.ni_vp->v_mount;
 	vfs_ref(mp);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vput(nd.ni_vp);
-	error = vfs_busy(mp, 0);
-	vfs_rel(mp);
-	if (error) {
-		VFS_UNLOCK_GIANT(vfslocked);
-		return (error);
-	}
-#ifdef MAC
-	error = mac_mount_check_stat(td->td_ucred, mp);
-	if (error)
-		goto out;
-#endif
-	/*
-	 * Set these in case the underlying filesystem fails to do so.
-	 */
-	sp = &mp->mnt_stat;
-	sp->f_version = STATFS_VERSION;
-	sp->f_namemax = NAME_MAX;
-	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
-	error = VFS_STATFS(mp, sp);
-	if (error)
-		goto out;
-	if (priv_check(td, PRIV_VFS_GENERATION)) {
-		bcopy(sp, &sb, sizeof(sb));
-		sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
-		prison_enforce_statfs(td->td_ucred, mp, &sb);
-		sp = &sb;
-	}
-	*buf = *sp;
-out:
-	vfs_unbusy(mp);
-	VFS_UNLOCK_GIANT(vfslocked);
-	return (error);
+	return (kern_do_statfs(td, mp, buf));
 }
 
 /*
@@ -376,63 +371,26 @@
 {
 	struct file *fp;
 	struct mount *mp;
-	struct statfs *sp, sb;
-	int vfslocked;
 	struct vnode *vp;
+	cap_rights_t rights;
 	int error;
 
 	AUDIT_ARG_FD(fd);
-	error = getvnode(td->td_proc->p_fd, fd, CAP_FSTATFS, &fp);
-	if (error)
+	error = getvnode(td->td_proc->p_fd, fd,
+	    cap_rights_init(&rights, CAP_FSTATFS), &fp);
+	if (error != 0)
 		return (error);
 	vp = fp->f_vnode;
-	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 #ifdef AUDIT
 	AUDIT_ARG_VNODE1(vp);
 #endif
 	mp = vp->v_mount;
-	if (mp)
+	if (mp != NULL)
 		vfs_ref(mp);
 	VOP_UNLOCK(vp, 0);
 	fdrop(fp, td);
-	if (mp == NULL) {
-		error = EBADF;
-		goto out;
-	}
-	error = vfs_busy(mp, 0);
-	vfs_rel(mp);
-	if (error) {
-		VFS_UNLOCK_GIANT(vfslocked);
-		return (error);
-	}
-#ifdef MAC
-	error = mac_mount_check_stat(td->td_ucred, mp);
-	if (error)
-		goto out;
-#endif
-	/*
-	 * Set these in case the underlying filesystem fails to do so.
-	 */
-	sp = &mp->mnt_stat;
-	sp->f_version = STATFS_VERSION;
-	sp->f_namemax = NAME_MAX;
-	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
-	error = VFS_STATFS(mp, sp);
-	if (error)
-		goto out;
-	if (priv_check(td, PRIV_VFS_GENERATION)) {
-		bcopy(sp, &sb, sizeof(sb));
-		sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
-		prison_enforce_statfs(td->td_ucred, mp, &sb);
-		sp = &sb;
-	}
-	*buf = *sp;
-out:
-	if (mp)
-		vfs_unbusy(mp);
-	VFS_UNLOCK_GIANT(vfslocked);
-	return (error);
+	return (kern_do_statfs(td, mp, buf));
 }
 
 /*
@@ -461,7 +419,7 @@
 
 /*
  * If (bufsize > 0 && bufseg == UIO_SYSSPACE)
- * 	The caller is responsible for freeing memory which will be allocated
+ *	The caller is responsible for freeing memory which will be allocated
  *	in '*buf'.
  */
 int
@@ -471,7 +429,6 @@
 	struct mount *mp, *nmp;
 	struct statfs *sfsp, *sp, sb;
 	size_t count, maxcount;
-	int vfslocked;
 	int error;
 
 	maxcount = bufsize / sizeof(struct statfs);
@@ -508,8 +465,7 @@
 			nmp = TAILQ_NEXT(mp, mnt_list);
 			continue;
 		}
-		vfslocked = VFS_LOCK_GIANT(mp);
-		if (sfsp && count < maxcount) {
+		if (sfsp != NULL && count < maxcount) {
 			sp = &mp->mnt_stat;
 			/*
 			 * Set these in case the underlying filesystem
@@ -526,7 +482,6 @@
 			if (((flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
 			    (flags & MNT_WAIT)) &&
 			    (error = VFS_STATFS(mp, sp))) {
-				VFS_UNLOCK_GIANT(vfslocked);
 				mtx_lock(&mountlist_mtx);
 				nmp = TAILQ_NEXT(mp, mnt_list);
 				vfs_unbusy(mp);
@@ -542,15 +497,13 @@
 				bcopy(sp, sfsp, sizeof(*sp));
 			else /* if (bufseg == UIO_USERSPACE) */ {
 				error = copyout(sp, sfsp, sizeof(*sp));
-				if (error) {
+				if (error != 0) {
 					vfs_unbusy(mp);
-					VFS_UNLOCK_GIANT(vfslocked);
 					return (error);
 				}
 			}
 			sfsp++;
 		}
-		VFS_UNLOCK_GIANT(vfslocked);
 		count++;
 		mtx_lock(&mountlist_mtx);
 		nmp = TAILQ_NEXT(mp, mnt_list);
@@ -557,7 +510,7 @@
 		vfs_unbusy(mp);
 	}
 	mtx_unlock(&mountlist_mtx);
-	if (sfsp && count > maxcount)
+	if (sfsp != NULL && count > maxcount)
 		td->td_retval[0] = maxcount;
 	else
 		td->td_retval[0] = count;
@@ -589,7 +542,7 @@
 	int error;
 
 	error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf);
-	if (error)
+	if (error != 0)
 		return (error);
 	cvtstatfs(&sf, &osb);
 	return (copyout(&osb, uap->buf, sizeof(osb)));
@@ -617,7 +570,7 @@
 	int error;
 
 	error = kern_fstatfs(td, uap->fd, &sf);
-	if (error)
+	if (error != 0)
 		return (error);
 	cvtstatfs(&sf, &osb);
 	return (copyout(&osb, uap->buf, sizeof(osb)));
@@ -688,10 +641,10 @@
 	int error;
 
 	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
-	if (error)
+	if (error != 0)
 		return (error);
 	error = kern_fhstatfs(td, fh, &sf);
-	if (error)
+	if (error != 0)
 		return (error);
 	cvtstatfs(&sf, &osb);
 	return (copyout(&osb, uap->buf, sizeof(osb)));
@@ -751,49 +704,40 @@
 	struct vnode *vp, *tdp, *vpold;
 	struct mount *mp;
 	struct file *fp;
-	int vfslocked;
+	cap_rights_t rights;
 	int error;
 
 	AUDIT_ARG_FD(uap->fd);
-	if ((error = getvnode(fdp, uap->fd, CAP_FCHDIR, &fp)) != 0)
+	error = getvnode(fdp, uap->fd, cap_rights_init(&rights, CAP_FCHDIR),
+	    &fp);
+	if (error != 0)
 		return (error);
 	vp = fp->f_vnode;
 	VREF(vp);
 	fdrop(fp, td);
-	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 	AUDIT_ARG_VNODE1(vp);
 	error = change_dir(vp, td);
 	while (!error && (mp = vp->v_mountedhere) != NULL) {
-		int tvfslocked;
 		if (vfs_busy(mp, 0))
 			continue;
-		tvfslocked = VFS_LOCK_GIANT(mp);
 		error = VFS_ROOT(mp, LK_SHARED, &tdp);
 		vfs_unbusy(mp);
-		if (error) {
-			VFS_UNLOCK_GIANT(tvfslocked);
+		if (error != 0)
 			break;
-		}
 		vput(vp);
-		VFS_UNLOCK_GIANT(vfslocked);
 		vp = tdp;
-		vfslocked = tvfslocked;
 	}
-	if (error) {
+	if (error != 0) {
 		vput(vp);
-		VFS_UNLOCK_GIANT(vfslocked);
 		return (error);
 	}
 	VOP_UNLOCK(vp, 0);
-	VFS_UNLOCK_GIANT(vfslocked);
 	FILEDESC_XLOCK(fdp);
 	vpold = fdp->fd_cdir;
 	fdp->fd_cdir = vp;
 	FILEDESC_XUNLOCK(fdp);
-	vfslocked = VFS_LOCK_GIANT(vpold->v_mount);
 	vrele(vpold);
-	VFS_UNLOCK_GIANT(vfslocked);
 	return (0);
 }
 
@@ -820,32 +764,26 @@
 kern_chdir(struct thread *td, char *path, enum uio_seg pathseg)
 {
 	register struct filedesc *fdp = td->td_proc->p_fd;
-	int error;
 	struct nameidata nd;
 	struct vnode *vp;
-	int vfslocked;
+	int error;
 
-	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1 |
-	    MPSAFE, pathseg, path, td);
+	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
+	    pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
-	vfslocked = NDHASGIANT(&nd);
 	if ((error = change_dir(nd.ni_vp, td)) != 0) {
 		vput(nd.ni_vp);
-		VFS_UNLOCK_GIANT(vfslocked);
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		return (error);
 	}
 	VOP_UNLOCK(nd.ni_vp, 0);
-	VFS_UNLOCK_GIANT(vfslocked);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	FILEDESC_XLOCK(fdp);
 	vp = fdp->fd_cdir;
 	fdp->fd_cdir = nd.ni_vp;
 	FILEDESC_XUNLOCK(fdp);
-	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	vrele(vp);
-	VFS_UNLOCK_GIANT(vfslocked);
 	return (0);
 }
 
@@ -863,7 +801,7 @@
 
 	FILEDESC_LOCK_ASSERT(fdp);
 
-	for (fd = 0; fd < fdp->fd_nfiles ; fd++) {
+	for (fd = 0; fd <= fdp->fd_lastfile; fd++) {
 		fp = fget_locked(fdp, fd);
 		if (fp == NULL)
 			continue;
@@ -887,7 +825,8 @@
 static int chroot_allow_open_directories = 1;
 
 SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW,
-     &chroot_allow_open_directories, 0, "");
+     &chroot_allow_open_directories, 0,
+     "Allow a process to chroot(2) if it has a directory open");
 
 /*
  * Change notion of root (``/'') directory.
@@ -904,34 +843,32 @@
 		char *path;
 	} */ *uap;
 {
+	struct nameidata nd;
 	int error;
-	struct nameidata nd;
-	int vfslocked;
 
 	error = priv_check(td, PRIV_VFS_CHROOT);
-	if (error)
+	if (error != 0)
 		return (error);
-	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE |
-	    AUDITVNODE1, UIO_USERSPACE, uap->path, td);
+	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
+	    UIO_USERSPACE, uap->path, td);
 	error = namei(&nd);
-	if (error)
+	if (error != 0)
 		goto error;
-	vfslocked = NDHASGIANT(&nd);
-	if ((error = change_dir(nd.ni_vp, td)) != 0)
+	error = change_dir(nd.ni_vp, td);
+	if (error != 0)
 		goto e_vunlock;
 #ifdef MAC
-	if ((error = mac_vnode_check_chroot(td->td_ucred, nd.ni_vp)))
+	error = mac_vnode_check_chroot(td->td_ucred, nd.ni_vp);
+	if (error != 0)
 		goto e_vunlock;
 #endif
 	VOP_UNLOCK(nd.ni_vp, 0);
 	error = change_root(nd.ni_vp, td);
 	vrele(nd.ni_vp);
-	VFS_UNLOCK_GIANT(vfslocked);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	return (error);
 e_vunlock:
 	vput(nd.ni_vp);
-	VFS_UNLOCK_GIANT(vfslocked);
 error:
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	return (error);
@@ -946,7 +883,9 @@
 	struct vnode *vp;
 	struct thread *td;
 {
+#ifdef MAC
 	int error;
+#endif
 
 	ASSERT_VOP_LOCKED(vp, "change_dir(): vp not locked");
 	if (vp->v_type != VDIR)
@@ -953,11 +892,10 @@
 		return (ENOTDIR);
 #ifdef MAC
 	error = mac_vnode_check_chdir(td->td_ucred, vp);
-	if (error)
+	if (error != 0)
 		return (error);
 #endif
-	error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td);
-	return (error);
+	return (VOP_ACCESS(vp, VEXEC, td->td_ucred, td));
 }
 
 /*
@@ -972,16 +910,14 @@
 {
 	struct filedesc *fdp;
 	struct vnode *oldvp;
-	int vfslocked;
 	int error;
 
-	VFS_ASSERT_GIANT(vp->v_mount);
 	fdp = td->td_proc->p_fd;
 	FILEDESC_XLOCK(fdp);
 	if (chroot_allow_open_directories == 0 ||
 	    (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) {
 		error = chroot_refuse_vdir_fds(fdp);
-		if (error) {
+		if (error != 0) {
 			FILEDESC_XUNLOCK(fdp);
 			return (error);
 		}
@@ -994,45 +930,43 @@
 		VREF(fdp->fd_jdir);
 	}
 	FILEDESC_XUNLOCK(fdp);
-	vfslocked = VFS_LOCK_GIANT(oldvp->v_mount);
 	vrele(oldvp);
-	VFS_UNLOCK_GIANT(vfslocked);
 	return (0);
 }
 
-static __inline cap_rights_t
-flags_to_rights(int flags)
+static __inline void
+flags_to_rights(int flags, cap_rights_t *rightsp)
 {
-	cap_rights_t rights = 0;
 
-	switch ((flags & O_ACCMODE)) {
-	case O_RDONLY:
-		rights |= CAP_READ;
-		break;
-
-	case O_RDWR:
-		rights |= CAP_READ;
-		/* fall through */
-
-	case O_WRONLY:
-		rights |= CAP_WRITE;
-		break;
-
-	case O_EXEC:
-		rights |= CAP_FEXECVE;
-		break;
+	if (flags & O_EXEC) {
+		cap_rights_set(rightsp, CAP_FEXECVE);
+	} else {
+		switch ((flags & O_ACCMODE)) {
+		case O_RDONLY:
+			cap_rights_set(rightsp, CAP_READ);
+			break;
+		case O_RDWR:
+			cap_rights_set(rightsp, CAP_READ);
+			/* FALLTHROUGH */
+		case O_WRONLY:
+			cap_rights_set(rightsp, CAP_WRITE);
+			if (!(flags & (O_APPEND | O_TRUNC)))
+				cap_rights_set(rightsp, CAP_SEEK);
+			break;
+		}
 	}
 
 	if (flags & O_CREAT)
-		rights |= CAP_CREATE;
+		cap_rights_set(rightsp, CAP_CREATE);
 
 	if (flags & O_TRUNC)
-		rights |= CAP_FTRUNCATE;
+		cap_rights_set(rightsp, CAP_FTRUNCATE);
 
-	if ((flags & O_EXLOCK) || (flags & O_SHLOCK))
-		rights |= CAP_FLOCK;
+	if (flags & (O_SYNC | O_FSYNC))
+		cap_rights_set(rightsp, CAP_FSYNC);
 
-	return (rights);
+	if (flags & (O_EXLOCK | O_SHLOCK))
+		cap_rights_set(rightsp, CAP_FLOCK);
 }
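flags_to_rights() now composes rights with cap_rights_init()/cap_rights_set()
instead of OR-ing a bitmask. A minimal userland sketch of the same Capsicum
API, illustrative only; the file name and the particular rights chosen are
assumptions:

	#include <sys/capsicum.h>
	#include <err.h>
	#include <errno.h>
	#include <fcntl.h>

	int
	main(void)
	{
		cap_rights_t rights;
		int fd;

		if ((fd = open("data.log", O_RDONLY)) == -1)
			err(1, "open");
		/* Restrict fd to reading, fstat() and seeking. */
		cap_rights_init(&rights, CAP_READ, CAP_FSTAT, CAP_SEEK);
		if (cap_rights_limit(fd, &rights) == -1 && errno != ENOSYS)
			err(1, "cap_rights_limit");
		return (0);
	}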
 
 /*
@@ -1091,18 +1025,17 @@
 	struct filedesc *fdp = p->p_fd;
 	struct file *fp;
 	struct vnode *vp;
-	int cmode;
-	struct file *nfp;
-	int type, indx = -1, error, error_open;
-	struct flock lf;
 	struct nameidata nd;
-	int vfslocked;
-	cap_rights_t rights_needed = CAP_LOOKUP;
+	cap_rights_t rights;
+	int cmode, error, indx;
 
+	indx = -1;
+
 	AUDIT_ARG_FFLAGS(flags);
 	AUDIT_ARG_MODE(mode);
 	/* XXX: audit dirfd */
-	rights_needed |= flags_to_rights(flags);
+	cap_rights_init(&rights, CAP_LOOKUP);
+	flags_to_rights(flags, &rights);
 	/*
 	 * Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags
 	 * may be specified.
@@ -1110,27 +1043,30 @@
 	if (flags & O_EXEC) {
 		if (flags & O_ACCMODE)
 			return (EINVAL);
-	} else if ((flags & O_ACCMODE) == O_ACCMODE)
+	} else if ((flags & O_ACCMODE) == O_ACCMODE) {
 		return (EINVAL);
-	else
+	} else {
 		flags = FFLAGS(flags);
+	}
 
 	/*
-	 * allocate the file descriptor, but don't install a descriptor yet
+	 * Allocate the file descriptor, but don't install a descriptor yet.
 	 */
-	error = falloc_noinstall(td, &nfp);
-	if (error)
+	error = falloc_noinstall(td, &fp);
+	if (error != 0)
 		return (error);
-	/* An extra reference on `nfp' has been held for us by falloc_noinstall(). */
-	fp = nfp;
+	/*
+	 * An extra reference on `fp' has been held for us by
+	 * falloc_noinstall().
+	 */
 	/* Set the flags early so the finit in devfs can pick them up. */
 	fp->f_flag = flags & FMASK;
-	cmode = ((mode &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT;
-	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1 | MPSAFE, pathseg,
-	    path, fd, rights_needed, td);
+	cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
+	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd,
+	    &rights, td);
 	td->td_dupfd = -1;		/* XXX check for fdopen */
 	error = vn_open(&nd, &flags, cmode, fp);
-	if (error) {
+	if (error != 0) {
 		/*
 		 * If the vn_open replaced the method vector, something
 		 * wonderous happened deep below and we just pass it up
@@ -1140,37 +1076,24 @@
 			goto success;
 
 		/*
-		 * handle special fdopen() case.  bleh.  dupfdopen() is
-		 * responsible for dropping the old contents of ofiles[indx]
-		 * if it succeeds.
+		 * Handle special fdopen() case. bleh.
 		 *
 		 * Don't do this for relative (capability) lookups; we don't
 		 * understand exactly what would happen, and we don't think
 		 * that it ever should.
 		 */
-		if ((nd.ni_strictrelative == 0) &&
+		if (nd.ni_strictrelative == 0 &&
 		    (error == ENODEV || error == ENXIO) &&
-		    (td->td_dupfd >= 0)) {
-			/* XXX from fdopen */
-			error_open = error;
-			if ((error = finstall(td, fp, &indx, flags)) != 0)
-				goto bad_unlocked;
-			if ((error = dupfdopen(td, fdp, indx, td->td_dupfd,
-			    flags, error_open)) == 0)
+		    td->td_dupfd >= 0) {
+			error = dupfdopen(td, fdp, td->td_dupfd, flags, error,
+			    &indx);
+			if (error == 0)
 				goto success;
 		}
-		/*
-		 * Clean up the descriptor, but only if another thread hadn't
-		 * replaced or closed it.
-		 */
-		if (indx != -1)
-			fdclose(fdp, fp, indx, td);
-		fdrop(fp, td);
 
-		return (error);
+		goto bad;
 	}
 	td->td_dupfd = 0;
-	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp = nd.ni_vp;
 
@@ -1187,51 +1110,37 @@
 	if (fp->f_ops == &badfileops) {
 		KASSERT(vp->v_type != VFIFO, ("Unexpected fifo."));
 		fp->f_seqcount = 1;
-		finit(fp, flags & FMASK, DTYPE_VNODE, vp, &vnops);
+		finit(fp, (flags & FMASK) | (fp->f_flag & FHASLOCK),
+		    DTYPE_VNODE, vp, &vnops);
 	}
 
 	VOP_UNLOCK(vp, 0);
-	if (fp->f_type == DTYPE_VNODE && (flags & (O_EXLOCK | O_SHLOCK)) != 0) {
-		lf.l_whence = SEEK_SET;
-		lf.l_start = 0;
-		lf.l_len = 0;
-		if (flags & O_EXLOCK)
-			lf.l_type = F_WRLCK;
-		else
-			lf.l_type = F_RDLCK;
-		type = F_FLOCK;
-		if ((flags & FNONBLOCK) == 0)
-			type |= F_WAIT;
-		if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
-			    type)) != 0)
-			goto bad;
-		atomic_set_int(&fp->f_flag, FHASLOCK);
-	}
 	if (flags & O_TRUNC) {
 		error = fo_truncate(fp, 0, td->td_ucred, td);
-		if (error)
+		if (error != 0)
 			goto bad;
 	}
-	VFS_UNLOCK_GIANT(vfslocked);
 success:
 	/*
 	 * If we haven't already installed the FD (for dupfdopen), do so now.
 	 */
 	if (indx == -1) {
+		struct filecaps *fcaps;
+
 #ifdef CAPABILITIES
-		if (nd.ni_strictrelative == 1) {
-			/*
-			 * We are doing a strict relative lookup; wrap the
-			 * result in a capability.
-			 */
-			if ((error = kern_capwrap(td, fp, nd.ni_baserights,
-			    &indx)) != 0)
-				goto bad_unlocked;
-		} else
+		if (nd.ni_strictrelative == 1)
+			fcaps = &nd.ni_filecaps;
+		else
 #endif
-			if ((error = finstall(td, fp, &indx, flags)) != 0)
-				goto bad_unlocked;
-
+			fcaps = NULL;
+		error = finstall(td, fp, &indx, flags, fcaps);
+		/* On success finstall() consumes fcaps. */
+		if (error != 0) {
+			filecaps_free(&nd.ni_filecaps);
+			goto bad;
+		}
+	} else {
+		filecaps_free(&nd.ni_filecaps);
 	}
 
 	/*
@@ -1242,12 +1151,8 @@
 	td->td_retval[0] = indx;
 	return (0);
 bad:
-	VFS_UNLOCK_GIANT(vfslocked);
-bad_unlocked:
-	if (indx != -1)
-		fdclose(fdp, fp, indx, td);
+	KASSERT(indx == -1, ("indx=%d, should be -1", indx));
 	fdrop(fp, td);
-	td->td_retval[0] = -1;
 	return (error);
 }
 
@@ -1329,10 +1234,9 @@
 	struct vnode *vp;
 	struct mount *mp;
 	struct vattr vattr;
-	int error;
-	int whiteout = 0;
 	struct nameidata nd;
-	int vfslocked;
+	cap_rights_t rights;
+	int error, whiteout = 0;
 
 	AUDIT_ARG_MODE(mode);
 	AUDIT_ARG_DEV(dev);
@@ -1340,10 +1244,9 @@
 	case S_IFCHR:
 	case S_IFBLK:
 		error = priv_check(td, PRIV_VFS_MKNOD_DEV);
+		if (error == 0 && dev == VNOVAL)
+			error = EINVAL;
 		break;
-	case S_IFMT:
-		error = priv_check(td, PRIV_VFS_MKNOD_BAD);
-		break;
 	case S_IFWHT:
 		error = priv_check(td, PRIV_VFS_MKNOD_WHT);
 		break;
@@ -1355,16 +1258,15 @@
 		error = EINVAL;
 		break;
 	}
-	if (error)
+	if (error != 0)
 		return (error);
 restart:
 	bwillwrite();
-	NDINIT_ATRIGHTS(&nd, CREATE,
-	    LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1, pathseg, path, fd,
-	    CAP_MKFIFO, td);
+	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
+	    NOCACHE, pathseg, path, fd, cap_rights_init(&rights, CAP_MKNODAT),
+	    td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
-	vfslocked = NDHASGIANT(&nd);
 	vp = nd.ni_vp;
 	if (vp != NULL) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
@@ -1373,7 +1275,6 @@
 		else
 			vput(nd.ni_dvp);
 		vrele(vp);
-		VFS_UNLOCK_GIANT(vfslocked);
 		return (EEXIST);
 	} else {
 		VATTR_NULL(&vattr);
@@ -1383,9 +1284,6 @@
 		whiteout = 0;
 
 		switch (mode & S_IFMT) {
-		case S_IFMT:	/* used by badsect to flag bad sectors */
-			vattr.va_type = VBAD;
-			break;
 		case S_IFCHR:
 			vattr.va_type = VCHR;
 			break;
@@ -1402,7 +1300,6 @@
 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		vput(nd.ni_dvp);
-		VFS_UNLOCK_GIANT(vfslocked);
 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
 			return (error);
 		goto restart;
@@ -1412,7 +1309,7 @@
 		error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp,
 		    &nd.ni_cnd, &vattr);
 #endif
-	if (!error) {
+	if (error == 0) {
 		if (whiteout)
 			error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
 		else {
@@ -1425,7 +1322,6 @@
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vput(nd.ni_dvp);
 	vn_finished_write(mp);
-	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
@@ -1478,18 +1374,18 @@
 {
 	struct mount *mp;
 	struct vattr vattr;
+	struct nameidata nd;
+	cap_rights_t rights;
 	int error;
-	struct nameidata nd;
-	int vfslocked;
 
 	AUDIT_ARG_MODE(mode);
 restart:
 	bwillwrite();
-	NDINIT_AT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1,
-	    pathseg, path, fd, td);
+	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
+	    NOCACHE, pathseg, path, fd, cap_rights_init(&rights, CAP_MKFIFOAT),
+	    td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
-	vfslocked = NDHASGIANT(&nd);
 	if (nd.ni_vp != NULL) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		if (nd.ni_vp == nd.ni_dvp)
@@ -1497,13 +1393,11 @@
 		else
 			vput(nd.ni_dvp);
 		vrele(nd.ni_vp);
-		VFS_UNLOCK_GIANT(vfslocked);
 		return (EEXIST);
 	}
 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		vput(nd.ni_dvp);
-		VFS_UNLOCK_GIANT(vfslocked);
 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
 			return (error);
 		goto restart;
@@ -1514,7 +1408,7 @@
 #ifdef MAC
 	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
 	    &vattr);
-	if (error)
+	if (error != 0)
 		goto out;
 #endif
 	error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
@@ -1525,7 +1419,6 @@
 #endif
 	vput(nd.ni_dvp);
 	vn_finished_write(mp);
-	VFS_UNLOCK_GIANT(vfslocked);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	return (error);
 }
@@ -1599,13 +1492,13 @@
 
 	if (hardlink_check_uid && cred->cr_uid != va.va_uid) {
 		error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
-		if (error)
+		if (error != 0)
 			return (error);
 	}
 
 	if (hardlink_check_gid && !groupmember(va.va_gid, cred)) {
 		error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
-		if (error)
+		if (error != 0)
 			return (error);
 	}
 
@@ -1626,59 +1519,81 @@
 	struct vnode *vp;
 	struct mount *mp;
 	struct nameidata nd;
-	int vfslocked;
-	int lvfslocked;
+	cap_rights_t rights;
 	int error;
 
+again:
 	bwillwrite();
-	NDINIT_AT(&nd, LOOKUP, follow | MPSAFE | AUDITVNODE1, segflg, path1,
-	    fd1, td);
+	NDINIT_AT(&nd, LOOKUP, follow | AUDITVNODE1, segflg, path1, fd1, td);
 
 	if ((error = namei(&nd)) != 0)
 		return (error);
-	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp = nd.ni_vp;
 	if (vp->v_type == VDIR) {
 		vrele(vp);
-		VFS_UNLOCK_GIANT(vfslocked);
 		return (EPERM);		/* POSIX */
 	}
-	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
-		vrele(vp);
-		VFS_UNLOCK_GIANT(vfslocked);
-		return (error);
-	}
-	NDINIT_AT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE2,
-	    segflg, path2, fd2, td);
+	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE2 |
+	    NOCACHE, segflg, path2, fd2, cap_rights_init(&rights, CAP_LINKAT),
+	    td);
 	if ((error = namei(&nd)) == 0) {
-		lvfslocked = NDHASGIANT(&nd);
 		if (nd.ni_vp != NULL) {
+			NDFREE(&nd, NDF_ONLY_PNBUF);
 			if (nd.ni_dvp == nd.ni_vp)
 				vrele(nd.ni_dvp);
 			else
 				vput(nd.ni_dvp);
 			vrele(nd.ni_vp);
-			error = EEXIST;
-		} else if ((error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY))
-		    == 0) {
+			vrele(vp);
+			return (EEXIST);
+		} else if (nd.ni_dvp->v_mount != vp->v_mount) {
+			/*
+			 * Cross-device link.  No need to recheck
+			 * vp->v_type, since it cannot change, except
+			 * to VBAD.
+			 */
+			NDFREE(&nd, NDF_ONLY_PNBUF);
+			vput(nd.ni_dvp);
+			vrele(vp);
+			return (EXDEV);
+		} else if ((error = vn_lock(vp, LK_EXCLUSIVE)) == 0) {
 			error = can_hardlink(vp, td->td_ucred);
+#ifdef MAC
 			if (error == 0)
-#ifdef MAC
 				error = mac_vnode_check_link(td->td_ucred,
 				    nd.ni_dvp, vp, &nd.ni_cnd);
-			if (error == 0)
 #endif
-				error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
+			if (error != 0) {
+				vput(vp);
+				vput(nd.ni_dvp);
+				NDFREE(&nd, NDF_ONLY_PNBUF);
+				return (error);
+			}
+			error = vn_start_write(vp, &mp, V_NOWAIT);
+			if (error != 0) {
+				vput(vp);
+				vput(nd.ni_dvp);
+				NDFREE(&nd, NDF_ONLY_PNBUF);
+				error = vn_start_write(NULL, &mp,
+				    V_XSLEEP | PCATCH);
+				if (error != 0)
+					return (error);
+				goto again;
+			}
+			error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
 			VOP_UNLOCK(vp, 0);
 			vput(nd.ni_dvp);
+			vn_finished_write(mp);
+			NDFREE(&nd, NDF_ONLY_PNBUF);
+		} else {
+			vput(nd.ni_dvp);
+			NDFREE(&nd, NDF_ONLY_PNBUF);
+			vrele(vp);
+			goto again;
 		}
-		NDFREE(&nd, NDF_ONLY_PNBUF);
-		VFS_UNLOCK_GIANT(lvfslocked);
 	}
 	vrele(vp);
-	vn_finished_write(mp);
-	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
@@ -1732,9 +1647,9 @@
 	struct mount *mp;
 	struct vattr vattr;
 	char *syspath;
+	struct nameidata nd;
 	int error;
-	struct nameidata nd;
-	int vfslocked;
+	cap_rights_t rights;
 
 	if (segflg == UIO_SYSSPACE) {
 		syspath = path1;
@@ -1746,11 +1661,11 @@
 	AUDIT_ARG_TEXT(syspath);
 restart:
 	bwillwrite();
-	NDINIT_AT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1,
-	    segflg, path2, fd, td);
+	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
+	    NOCACHE, segflg, path2, fd, cap_rights_init(&rights, CAP_SYMLINKAT),
+	    td);
 	if ((error = namei(&nd)) != 0)
 		goto out;
-	vfslocked = NDHASGIANT(&nd);
 	if (nd.ni_vp) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		if (nd.ni_vp == nd.ni_dvp)
@@ -1758,7 +1673,6 @@
 		else
 			vput(nd.ni_dvp);
 		vrele(nd.ni_vp);
-		VFS_UNLOCK_GIANT(vfslocked);
 		error = EEXIST;
 		goto out;
 	}
@@ -1765,7 +1679,6 @@
 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		vput(nd.ni_dvp);
-		VFS_UNLOCK_GIANT(vfslocked);
 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
 			goto out;
 		goto restart;
@@ -1776,7 +1689,7 @@
 	vattr.va_type = VLNK;
 	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
 	    &vattr);
-	if (error)
+	if (error != 0)
 		goto out2;
 #endif
 	error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, syspath);
@@ -1788,7 +1701,6 @@
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vput(nd.ni_dvp);
 	vn_finished_write(mp);
-	VFS_UNLOCK_GIANT(vfslocked);
 out:
 	if (segflg != UIO_SYSSPACE)
 		uma_zfree(namei_zone, syspath);
@@ -1805,19 +1717,17 @@
 		char *path;
 	} */ *uap;
 {
-	int error;
 	struct mount *mp;
 	struct nameidata nd;
-	int vfslocked;
+	int error;
 
 restart:
 	bwillwrite();
-	NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | MPSAFE | AUDITVNODE1,
+	NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | AUDITVNODE1,
 	    UIO_USERSPACE, uap->path, td);
 	error = namei(&nd);
-	if (error)
+	if (error != 0)
 		return (error);
-	vfslocked = NDHASGIANT(&nd);
 
 	if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
@@ -1827,13 +1737,11 @@
 			vput(nd.ni_dvp);
 		if (nd.ni_vp)
 			vrele(nd.ni_vp);
-		VFS_UNLOCK_GIANT(vfslocked);
 		return (EEXIST);
 	}
 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		vput(nd.ni_dvp);
-		VFS_UNLOCK_GIANT(vfslocked);
 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
 			return (error);
 		goto restart;
@@ -1842,7 +1750,6 @@
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vput(nd.ni_dvp);
 	vn_finished_write(mp);
-	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
@@ -1901,18 +1808,17 @@
 {
 	struct mount *mp;
 	struct vnode *vp;
-	int error;
 	struct nameidata nd;
 	struct stat sb;
-	int vfslocked;
+	cap_rights_t rights;
+	int error;
 
 restart:
 	bwillwrite();
-	NDINIT_AT(&nd, DELETE, LOCKPARENT | LOCKLEAF | MPSAFE | AUDITVNODE1,
-	    pathseg, path, fd, td);
+	NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1,
+	    pathseg, path, fd, cap_rights_init(&rights, CAP_UNLINKAT), td);
 	if ((error = namei(&nd)) != 0)
 		return (error == EINVAL ? EPERM : error);
-	vfslocked = NDHASGIANT(&nd);
 	vp = nd.ni_vp;
 	if (vp->v_type == VDIR && oldinum == 0) {
 		error = EPERM;		/* POSIX */
@@ -1937,7 +1843,6 @@
 				vrele(vp);
 			else
 				vput(vp);
-			VFS_UNLOCK_GIANT(vfslocked);
 			if ((error = vn_start_write(NULL, &mp,
 			    V_XSLEEP | PCATCH)) != 0)
 				return (error);
@@ -1946,7 +1851,7 @@
 #ifdef MAC
 		error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
 		    &nd.ni_cnd);
-		if (error)
+		if (error != 0)
 			goto out;
 #endif
 		vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK);
@@ -1962,7 +1867,6 @@
 		vrele(vp);
 	else
 		vput(vp);
-	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
@@ -1987,80 +1891,17 @@
 		int whence;
 	} */ *uap;
 {
-	struct ucred *cred = td->td_ucred;
 	struct file *fp;
-	struct vnode *vp;
-	struct vattr vattr;
-	off_t foffset, offset, size;
-	int error, noneg;
-	int vfslocked;
+	cap_rights_t rights;
+	int error;
 
 	AUDIT_ARG_FD(uap->fd);
-	if ((error = fget(td, uap->fd, CAP_SEEK, &fp)) != 0)
+	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_SEEK), &fp);
+	if (error != 0)
 		return (error);
-	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) {
-		fdrop(fp, td);
-		return (ESPIPE);
-	}
-	vp = fp->f_vnode;
-	foffset = foffset_lock(fp, 0);
-	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
-	noneg = (vp->v_type != VCHR);
-	offset = uap->offset;
-	switch (uap->whence) {
-	case L_INCR:
-		if (noneg &&
-		    (foffset < 0 ||
-		    (offset > 0 && foffset > OFF_MAX - offset))) {
-			error = EOVERFLOW;
-			break;
-		}
-		offset += foffset;
-		break;
-	case L_XTND:
-		vn_lock(vp, LK_SHARED | LK_RETRY);
-		error = VOP_GETATTR(vp, &vattr, cred);
-		VOP_UNLOCK(vp, 0);
-		if (error)
-			break;
-
-		/*
-		 * If the file references a disk device, then fetch
-		 * the media size and use that to determine the ending
-		 * offset.
-		 */
-		if (vattr.va_size == 0 && vp->v_type == VCHR &&
-		    fo_ioctl(fp, DIOCGMEDIASIZE, &size, cred, td) == 0)
-			vattr.va_size = size;
-		if (noneg &&
-		    (vattr.va_size > OFF_MAX ||
-		    (offset > 0 && vattr.va_size > OFF_MAX - offset))) {
-			error = EOVERFLOW;
-			break;
-		}
-		offset += vattr.va_size;
-		break;
-	case L_SET:
-		break;
-	case SEEK_DATA:
-		error = fo_ioctl(fp, FIOSEEKDATA, &offset, cred, td);
-		break;
-	case SEEK_HOLE:
-		error = fo_ioctl(fp, FIOSEEKHOLE, &offset, cred, td);
-		break;
-	default:
-		error = EINVAL;
-	}
-	if (error == 0 && noneg && offset < 0)
-		error = EINVAL;
-	if (error != 0)
-		goto drop;
-	VFS_KNOTE_UNLOCKED(vp, 0);
-	*(off_t *)(td->td_retval) = offset;
-drop:
+	error = (fp->f_ops->fo_flags & DFLAG_SEEKABLE) != 0 ?
+	    fo_seek(fp, uap->offset, uap->whence, td) : ESPIPE;
 	fdrop(fp, td);
-	VFS_UNLOCK_GIANT(vfslocked);
-	foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0);
 	return (error);
 }
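sys_lseek() now defers to fo_seek(), so the whence handling (including the
SEEK_DATA/SEEK_HOLE cases the removed code handled via FIOSEEKDATA/FIOSEEKHOLE)
moves behind the file-ops vector; the userland interface is unchanged. A small
sketch of SEEK_HOLE from userland, assuming a sparse file name and a filesystem
that reports holes:

	#include <err.h>
	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <unistd.h>

	int
	main(void)
	{
		off_t hole;
		int fd;

		if ((fd = open("sparse.dat", O_RDONLY)) == -1)
			err(1, "open");
		/* Offset of the first hole at or after 0; EOF counts as a hole. */
		if ((hole = lseek(fd, 0, SEEK_HOLE)) == -1)
			err(1, "lseek");
		printf("first hole at %jd\n", (intmax_t)hole);
		return (0);
	}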
 
@@ -2122,8 +1963,8 @@
 	struct ucred	*cred;
 	struct thread	*td;
 {
+	accmode_t accmode;
 	int error;
-	accmode_t accmode;
 
 	/* Flags == 0 means only check for existence. */
 	error = 0;
@@ -2137,7 +1978,7 @@
 			accmode |= VEXEC;
 #ifdef MAC
 		error = mac_vnode_check_access(cred, vp, accmode);
-		if (error)
+		if (error != 0)
 			return (error);
 #endif
 		if ((accmode & VWRITE) == 0 || (error = vn_writechk(vp)) == 0)
@@ -2199,7 +2040,7 @@
 	struct ucred *cred, *tmpcred;
 	struct vnode *vp;
 	struct nameidata nd;
-	int vfslocked;
+	cap_rights_t rights;
 	int error;
 
 	/*
@@ -2215,17 +2056,16 @@
 	} else
 		cred = tmpcred = td->td_ucred;
 	AUDIT_ARG_VALUE(amode);
-	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE |
-	    AUDITVNODE1, pathseg, path, fd, CAP_FSTAT, td);
+	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF |
+	    AUDITVNODE1, pathseg, path, fd, cap_rights_init(&rights, CAP_FSTAT),
+	    td);
 	if ((error = namei(&nd)) != 0)
 		goto out1;
-	vfslocked = NDHASGIANT(&nd);
 	vp = nd.ni_vp;
 
 	error = vn_access(vp, amode, tmpcred, td);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vput(vp);
-	VFS_UNLOCK_GIANT(vfslocked);
 out1:
 	if (!(flag & AT_EACCESS)) {
 		td->td_ucred = cred;
@@ -2285,11 +2125,10 @@
 	int error;
 
 	error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
-	if (error)
+	if (error != 0)
 		return (error);
 	cvtstat(&sb, &osb);
-	error = copyout(&osb, uap->ub, sizeof (osb));
-	return (error);
+	return (copyout(&osb, uap->ub, sizeof (osb)));
 }
 
 /*
@@ -2314,11 +2153,10 @@
 	int error;
 
 	error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
-	if (error)
+	if (error != 0)
 		return (error);
 	cvtstat(&sb, &osb);
-	error = copyout(&osb, uap->ub, sizeof (osb));
-	return (error);
+	return (copyout(&osb, uap->ub, sizeof (osb)));
 }
 
 /*
@@ -2421,30 +2259,29 @@
 {
 	struct nameidata nd;
 	struct stat sb;
-	int error, vfslocked;
+	cap_rights_t rights;
+	int error;
 
 	if (flag & ~AT_SYMLINK_NOFOLLOW)
 		return (EINVAL);
 
 	NDINIT_ATRIGHTS(&nd, LOOKUP, ((flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW :
-	    FOLLOW) | LOCKSHARED | LOCKLEAF | AUDITVNODE1 | MPSAFE, pathseg,
-	    path, fd, CAP_FSTAT, td);
+	    FOLLOW) | LOCKSHARED | LOCKLEAF | AUDITVNODE1, pathseg, path, fd,
+	    cap_rights_init(&rights, CAP_FSTAT), td);
 
 	if ((error = namei(&nd)) != 0)
 		return (error);
-	vfslocked = NDHASGIANT(&nd);
 	error = vn_stat(nd.ni_vp, &sb, td->td_ucred, NOCRED, td);
-	if (!error) {
-		SDT_PROBE(vfs, , stat, mode, path, sb.st_mode, 0, 0, 0);
+	if (error == 0) {
+		SDT_PROBE2(vfs, , stat, mode, path, sb.st_mode);
 		if (S_ISREG(sb.st_mode))
-			SDT_PROBE(vfs, , stat, reg, path, pathseg, 0, 0, 0);
+			SDT_PROBE2(vfs, , stat, reg, path, pathseg);
 		if (__predict_false(hook != NULL))
 			hook(nd.ni_vp, &sb);
 	}
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vput(nd.ni_vp);
-	VFS_UNLOCK_GIANT(vfslocked);
-	if (error)
+	if (error != 0)
 		return (error);
 	*sbp = sb;
 #ifdef KTRACE
@@ -2496,6 +2333,7 @@
 	struct stat *sb;
 	struct nstat *nsb;
 {
+
 	bzero(nsb, sizeof *nsb);
 	nsb->st_dev = sb->st_dev;
 	nsb->st_ino = sb->st_ino;
@@ -2534,11 +2372,10 @@
 	int error;
 
 	error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
-	if (error)
+	if (error != 0)
 		return (error);
 	cvtnstat(&sb, &nsb);
-	error = copyout(&nsb, uap->ub, sizeof (nsb));
-	return (error);
+	return (copyout(&nsb, uap->ub, sizeof (nsb)));
 }
 
 /*
@@ -2563,11 +2400,10 @@
 	int error;
 
 	error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
-	if (error)
+	if (error != 0)
 		return (error);
 	cvtnstat(&sb, &nsb);
-	error = copyout(&nsb, uap->ub, sizeof (nsb));
-	return (error);
+	return (copyout(&nsb, uap->ub, sizeof (nsb)));
 }
 
 /*
@@ -2606,7 +2442,8 @@
 	} */ *uap;
 {
 
-	return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, NOFOLLOW));
+	return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name,
+	    NOFOLLOW));
 }
 
 int
@@ -2614,13 +2451,12 @@
     u_long flags)
 {
 	struct nameidata nd;
-	int error, vfslocked;
+	int error;
 
-	NDINIT(&nd, LOOKUP, LOCKSHARED | LOCKLEAF | MPSAFE | AUDITVNODE1 |
-	    flags, pathseg, path, td);
+	NDINIT(&nd, LOOKUP, LOCKSHARED | LOCKLEAF | AUDITVNODE1 | flags,
+	    pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
-	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 
 	/* If asynchronous I/O is available, it works for all files. */
@@ -2629,7 +2465,6 @@
 	else
 		error = VOP_PATHCONF(nd.ni_vp, name, td->td_retval);
 	vput(nd.ni_vp);
-	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
@@ -2688,26 +2523,23 @@
 	struct vnode *vp;
 	struct iovec aiov;
 	struct uio auio;
+	struct nameidata nd;
 	int error;
-	struct nameidata nd;
-	int vfslocked;
 
 	if (count > IOSIZE_MAX)
 		return (EINVAL);
 
-	NDINIT_AT(&nd, LOOKUP, NOFOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE |
-	    AUDITVNODE1, pathseg, path, fd, td);
+	NDINIT_AT(&nd, LOOKUP, NOFOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
+	    pathseg, path, fd, td);
 
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
-	vfslocked = NDHASGIANT(&nd);
 	vp = nd.ni_vp;
 #ifdef MAC
 	error = mac_vnode_check_readlink(td->td_ucred, vp);
-	if (error) {
+	if (error != 0) {
 		vput(vp);
-		VFS_UNLOCK_GIANT(vfslocked);
 		return (error);
 	}
 #endif
@@ -2724,10 +2556,9 @@
 		auio.uio_td = td;
 		auio.uio_resid = count;
 		error = VOP_READLINK(vp, &auio, td->td_ucred);
+		td->td_retval[0] = count - auio.uio_resid;
 	}
 	vput(vp);
-	VFS_UNLOCK_GIANT(vfslocked);
-	td->td_retval[0] = count - auio.uio_resid;
 	return (error);
 }
 
@@ -2738,11 +2569,11 @@
 setfflags(td, vp, flags)
 	struct thread *td;
 	struct vnode *vp;
-	int flags;
+	u_long flags;
 {
-	int error;
 	struct mount *mp;
 	struct vattr vattr;
+	int error;
 
 	/* We can't support the value matching VNOVAL. */
 	if (flags == VNOVAL)
@@ -2756,15 +2587,15 @@
 	 */
 	if (vp->v_type == VCHR || vp->v_type == VBLK) {
 		error = priv_check(td, PRIV_VFS_CHFLAGS_DEV);
-		if (error)
+		if (error != 0)
 			return (error);
 	}
 
 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 		return (error);
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	VATTR_NULL(&vattr);
 	vattr.va_flags = flags;
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 #ifdef MAC
 	error = mac_vnode_check_setflags(td->td_ucred, vp, vattr.va_flags);
 	if (error == 0)
@@ -2780,8 +2611,8 @@
  */
 #ifndef _SYS_SYSPROTO_H_
 struct chflags_args {
-	char	*path;
-	int	flags;
+	const char *path;
+	u_long	flags;
 };
 #endif
 int
@@ -2788,27 +2619,44 @@
 sys_chflags(td, uap)
 	struct thread *td;
 	register struct chflags_args /* {
-		char *path;
-		int flags;
+		const char *path;
+		u_long flags;
 	} */ *uap;
 {
-	int error;
-	struct nameidata nd;
-	int vfslocked;
 
-	AUDIT_ARG_FFLAGS(uap->flags);
-	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, UIO_USERSPACE,
-	    uap->path, td);
-	if ((error = namei(&nd)) != 0)
-		return (error);
-	NDFREE(&nd, NDF_ONLY_PNBUF);
-	vfslocked = NDHASGIANT(&nd);
-	error = setfflags(td, nd.ni_vp, uap->flags);
-	vrele(nd.ni_vp);
-	VFS_UNLOCK_GIANT(vfslocked);
-	return (error);
+	return (kern_chflags(td, uap->path, UIO_USERSPACE, uap->flags));
 }
 
+#ifndef _SYS_SYSPROTO_H_
+struct chflagsat_args {
+	int	fd;
+	const char *path;
+	u_long	flags;
+	int	atflag;
+};
+#endif
+int
+sys_chflagsat(struct thread *td, struct chflagsat_args *uap)
+{
+	int fd = uap->fd;
+	const char *path = uap->path;
+	u_long flags = uap->flags;
+	int atflag = uap->atflag;
+
+	if (atflag & ~AT_SYMLINK_NOFOLLOW)
+		return (EINVAL);
+
+	return (kern_chflagsat(td, fd, path, UIO_USERSPACE, flags, atflag));
+}
+
+static int
+kern_chflags(struct thread *td, const char *path, enum uio_seg pathseg,
+    u_long flags)
+{
+
+	return (kern_chflagsat(td, AT_FDCWD, path, pathseg, flags, 0));
+}
+
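sys_chflagsat() takes a directory descriptor, a relative path, the flag word
and an AT_* flag, and chflags()/lchflags() become wrappers around
kern_chflagsat(). A minimal usage sketch from userland; the directory, file
name and UF_NODUMP flag are only examples:

	#include <sys/stat.h>
	#include <err.h>
	#include <fcntl.h>

	int
	main(void)
	{
		int dfd;

		if ((dfd = open("/tmp", O_DIRECTORY)) == -1)
			err(1, "open");
		/* Mark the file "do not dump" without following a final symlink. */
		if (chflagsat(dfd, "scratch.bin", UF_NODUMP,
		    AT_SYMLINK_NOFOLLOW) == -1)
			err(1, "chflagsat");
		return (0);
	}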
 /*
  * Same as chflags() but doesn't follow symlinks.
  */
@@ -2816,24 +2664,32 @@
 sys_lchflags(td, uap)
 	struct thread *td;
 	register struct lchflags_args /* {
-		char *path;
-		int flags;
+		const char *path;
+		u_long flags;
 	} */ *uap;
 {
-	int error;
+
+	return (kern_chflagsat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
+	    uap->flags, AT_SYMLINK_NOFOLLOW));
+}
+
+static int
+kern_chflagsat(struct thread *td, int fd, const char *path,
+    enum uio_seg pathseg, u_long flags, int atflag)
+{
 	struct nameidata nd;
-	int vfslocked;
+	cap_rights_t rights;
+	int error, follow;
 
-	AUDIT_ARG_FFLAGS(uap->flags);
-	NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1, UIO_USERSPACE,
-	    uap->path, td);
+	AUDIT_ARG_FFLAGS(flags);
+	follow = (atflag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
+	NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
+	    cap_rights_init(&rights, CAP_FCHFLAGS), td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
-	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
-	error = setfflags(td, nd.ni_vp, uap->flags);
+	error = setfflags(td, nd.ni_vp, flags);
 	vrele(nd.ni_vp);
-	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
@@ -2843,7 +2699,7 @@
 #ifndef _SYS_SYSPROTO_H_
 struct fchflags_args {
 	int	fd;
-	int	flags;
+	u_long	flags;
 };
 #endif
 int
@@ -2851,19 +2707,19 @@
 	struct thread *td;
 	register struct fchflags_args /* {
 		int fd;
-		int flags;
+		u_long flags;
 	} */ *uap;
 {
 	struct file *fp;
-	int vfslocked;
+	cap_rights_t rights;
 	int error;
 
 	AUDIT_ARG_FD(uap->fd);
 	AUDIT_ARG_FFLAGS(uap->flags);
-	if ((error = getvnode(td->td_proc->p_fd, uap->fd, CAP_FCHFLAGS,
-	    &fp)) != 0)
+	error = getvnode(td->td_proc->p_fd, uap->fd,
+	    cap_rights_init(&rights, CAP_FCHFLAGS), &fp);
+	if (error != 0)
 		return (error);
-	vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
 #ifdef AUDIT
 	vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
 	AUDIT_ARG_VNODE1(fp->f_vnode);
@@ -2870,7 +2726,6 @@
 	VOP_UNLOCK(fp->f_vnode, 0);
 #endif
 	error = setfflags(td, fp->f_vnode, uap->flags);
-	VFS_UNLOCK_GIANT(vfslocked);
 	fdrop(fp, td);
 	return (error);
 }
@@ -2885,9 +2740,9 @@
 	struct vnode *vp;
 	int mode;
 {
-	int error;
 	struct mount *mp;
 	struct vattr vattr;
+	int error;
 
 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 		return (error);
@@ -2976,27 +2831,23 @@
 	    uap->mode, AT_SYMLINK_NOFOLLOW));
 }
 
-
 int
 kern_fchmodat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
     mode_t mode, int flag)
 {
-	int error;
 	struct nameidata nd;
-	int vfslocked;
-	int follow;
+	cap_rights_t rights;
+	int error, follow;
 
 	AUDIT_ARG_MODE(mode);
 	follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
-	NDINIT_ATRIGHTS(&nd, LOOKUP,  follow | MPSAFE | AUDITVNODE1, pathseg,
-	    path, fd, CAP_FCHMOD, td);
+	NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
+	    cap_rights_init(&rights, CAP_FCHMOD), td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
-	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	error = setfmode(td, td->td_ucred, nd.ni_vp, mode);
 	vrele(nd.ni_vp);
-	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
@@ -3013,12 +2864,13 @@
 sys_fchmod(struct thread *td, struct fchmod_args *uap)
 {
 	struct file *fp;
+	cap_rights_t rights;
 	int error;
 
 	AUDIT_ARG_FD(uap->fd);
 	AUDIT_ARG_MODE(uap->mode);
 
-	error = fget(td, uap->fd, CAP_FCHMOD, &fp);
+	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FCHMOD), &fp);
 	if (error != 0)
 		return (error);
 	error = fo_chmod(fp, uap->mode, td->td_ucred, td);
@@ -3037,9 +2889,9 @@
 	uid_t uid;
 	gid_t gid;
 {
-	int error;
 	struct mount *mp;
 	struct vattr vattr;
+	int error;
 
 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 		return (error);
@@ -3116,20 +2968,19 @@
     int uid, int gid, int flag)
 {
 	struct nameidata nd;
-	int error, vfslocked, follow;
+	cap_rights_t rights;
+	int error, follow;
 
 	AUDIT_ARG_OWNER(uid, gid);
 	follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
-	NDINIT_ATRIGHTS(&nd, LOOKUP, follow | MPSAFE | AUDITVNODE1, pathseg,
-	    path, fd, CAP_FCHOWN, td);
+	NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
+	    cap_rights_init(&rights, CAP_FCHOWN), td);
 
 	if ((error = namei(&nd)) != 0)
 		return (error);
-	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	error = setfown(td, td->td_ucred, nd.ni_vp, uid, gid);
 	vrele(nd.ni_vp);
-	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
@@ -3185,11 +3036,12 @@
 	} */ *uap;
 {
 	struct file *fp;
+	cap_rights_t rights;
 	int error;
 
 	AUDIT_ARG_FD(uap->fd);
 	AUDIT_ARG_OWNER(uap->uid, uap->gid);
-	error = fget(td, uap->fd, CAP_FCHOWN, &fp);
+	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FCHOWN), &fp);
 	if (error != 0)
 		return (error);
 	error = fo_chown(fp, uap->uid, uap->gid, td->td_ucred, td);
@@ -3232,9 +3084,55 @@
 }
 
 /*
- * Common implementation code for utimes(), lutimes(), and futimes().
+ * Common implementation code for futimens(), utimensat().
  */
+#define	UTIMENS_NULL	0x1
+#define	UTIMENS_EXIT	0x2
 static int
+getutimens(const struct timespec *usrtsp, enum uio_seg tspseg,
+    struct timespec *tsp, int *retflags)
+{
+	struct timespec tsnow;
+	int error;
+
+	vfs_timestamp(&tsnow);
+	*retflags = 0;
+	if (usrtsp == NULL) {
+		tsp[0] = tsnow;
+		tsp[1] = tsnow;
+		*retflags |= UTIMENS_NULL;
+		return (0);
+	}
+	if (tspseg == UIO_SYSSPACE) {
+		tsp[0] = usrtsp[0];
+		tsp[1] = usrtsp[1];
+	} else if ((error = copyin(usrtsp, tsp, sizeof(*tsp) * 2)) != 0)
+		return (error);
+	if (tsp[0].tv_nsec == UTIME_OMIT && tsp[1].tv_nsec == UTIME_OMIT)
+		*retflags |= UTIMENS_EXIT;
+	if (tsp[0].tv_nsec == UTIME_NOW && tsp[1].tv_nsec == UTIME_NOW)
+		*retflags |= UTIMENS_NULL;
+	if (tsp[0].tv_nsec == UTIME_OMIT)
+		tsp[0].tv_sec = VNOVAL;
+	else if (tsp[0].tv_nsec == UTIME_NOW)
+		tsp[0] = tsnow;
+	else if (tsp[0].tv_nsec < 0 || tsp[0].tv_nsec >= 1000000000L)
+		return (EINVAL);
+	if (tsp[1].tv_nsec == UTIME_OMIT)
+		tsp[1].tv_sec = VNOVAL;
+	else if (tsp[1].tv_nsec == UTIME_NOW)
+		tsp[1] = tsnow;
+	else if (tsp[1].tv_nsec < 0 || tsp[1].tv_nsec >= 1000000000L)
+		return (EINVAL);
+
+	return (0);
+}
+
+/*
+ * Common implementation code for utimes(), lutimes(), futimes(), futimens(),
+ * and utimensat().
+ */
+static int
 setutimes(td, vp, ts, numtimes, nullflag)
 	struct thread *td;
 	struct vnode *vp;
@@ -3242,9 +3140,9 @@
 	int numtimes;
 	int nullflag;
 {
-	int error, setbirthtime;
 	struct mount *mp;
 	struct vattr vattr;
+	int error, setbirthtime;
 
 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 		return (error);
@@ -3324,20 +3222,19 @@
 {
 	struct nameidata nd;
 	struct timespec ts[2];
-	int error, vfslocked;
+	cap_rights_t rights;
+	int error;
 
 	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
 		return (error);
-	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, pathseg,
-	    path, fd, CAP_FUTIMES, td);
+	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd,
+	    cap_rights_init(&rights, CAP_FUTIMES), td);
 
 	if ((error = namei(&nd)) != 0)
 		return (error);
-	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
 	vrele(nd.ni_vp);
-	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
@@ -3368,20 +3265,17 @@
     struct timeval *tptr, enum uio_seg tptrseg)
 {
 	struct timespec ts[2];
+	struct nameidata nd;
 	int error;
-	struct nameidata nd;
-	int vfslocked;
 
 	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
 		return (error);
-	NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td);
+	NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
-	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
 	vrele(nd.ni_vp);
-	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
@@ -3412,16 +3306,17 @@
 {
 	struct timespec ts[2];
 	struct file *fp;
-	int vfslocked;
+	cap_rights_t rights;
 	int error;
 
 	AUDIT_ARG_FD(fd);
-	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
+	error = getutimes(tptr, tptrseg, ts);
+	if (error != 0)
 		return (error);
-	if ((error = getvnode(td->td_proc->p_fd, fd, CAP_FUTIMES, &fp))
-	    != 0)
+	error = getvnode(td->td_proc->p_fd, fd,
+	    cap_rights_init(&rights, CAP_FUTIMES), &fp);
+	if (error != 0)
 		return (error);
-	vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
 #ifdef AUDIT
 	vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
 	AUDIT_ARG_VNODE1(fp->f_vnode);
@@ -3428,11 +3323,86 @@
 	VOP_UNLOCK(fp->f_vnode, 0);
 #endif
 	error = setutimes(td, fp->f_vnode, ts, 2, tptr == NULL);
-	VFS_UNLOCK_GIANT(vfslocked);
 	fdrop(fp, td);
 	return (error);
 }
 
+int
+sys_futimens(struct thread *td, struct futimens_args *uap)
+{
+
+	return (kern_futimens(td, uap->fd, uap->times, UIO_USERSPACE));
+}
+
+int
+kern_futimens(struct thread *td, int fd, struct timespec *tptr,
+    enum uio_seg tptrseg)
+{
+	struct timespec ts[2];
+	struct file *fp;
+	cap_rights_t rights;
+	int error, flags;
+
+	AUDIT_ARG_FD(fd);
+	error = getutimens(tptr, tptrseg, ts, &flags);
+	if (error != 0)
+		return (error);
+	if (flags & UTIMENS_EXIT)
+		return (0);
+	error = getvnode(td->td_proc->p_fd, fd,
+	    cap_rights_init(&rights, CAP_FUTIMES), &fp);
+	if (error != 0)
+		return (error);
+#ifdef AUDIT
+	vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
+	AUDIT_ARG_VNODE1(fp->f_vnode);
+	VOP_UNLOCK(fp->f_vnode, 0);
+#endif
+	error = setutimes(td, fp->f_vnode, ts, 2, flags & UTIMENS_NULL);
+	fdrop(fp, td);
+	return (error);
+}
+
+int
+sys_utimensat(struct thread *td, struct utimensat_args *uap)
+{
+
+	return (kern_utimensat(td, uap->fd, uap->path, UIO_USERSPACE,
+	    uap->times, UIO_USERSPACE, uap->flag));
+}
+
+int
+kern_utimensat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
+    struct timespec *tptr, enum uio_seg tptrseg, int flag)
+{
+	struct nameidata nd;
+	struct timespec ts[2];
+	cap_rights_t rights;
+	int error, flags;
+
+	if (flag & ~AT_SYMLINK_NOFOLLOW)
+		return (EINVAL);
+
+	if ((error = getutimens(tptr, tptrseg, ts, &flags)) != 0)
+		return (error);
+	NDINIT_ATRIGHTS(&nd, LOOKUP, ((flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW :
+	    FOLLOW) | AUDITVNODE1, pathseg, path, fd,
+	    cap_rights_init(&rights, CAP_FUTIMES), td);
+	if ((error = namei(&nd)) != 0)
+		return (error);
+	/*
+	 * We are allowed to call namei() regardless of 2xUTIME_OMIT.
+	 * POSIX states:
+	 * "If both tv_nsec fields are UTIME_OMIT... EACCES may be detected."
+	 * "Search permission is denied by a component of the path prefix."
+	 */
+	NDFREE(&nd, NDF_ONLY_PNBUF);
+	if ((flags & UTIMENS_EXIT) == 0)
+		error = setutimes(td, nd.ni_vp, ts, 2, flags & UTIMENS_NULL);
+	vrele(nd.ni_vp);
+	return (error);
+}
+
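futimens() and utimensat() arrive with getutimens() translating
UTIME_NOW/UTIME_OMIT (two UTIME_OMITs short-circuit to success). A minimal
userland sketch; the file name is an assumption:

	#include <sys/stat.h>
	#include <err.h>
	#include <fcntl.h>

	int
	main(void)
	{
		struct timespec ts[2];

		ts[0].tv_nsec = UTIME_OMIT;	/* leave st_atim untouched */
		ts[1].tv_nsec = UTIME_NOW;	/* set st_mtim to "now" */
		if (utimensat(AT_FDCWD, "notes.txt", ts, 0) == -1)
			err(1, "utimensat");
		return (0);
	}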
 /*
  * Truncate a file given its path name.
  */
@@ -3464,20 +3434,18 @@
 	void *rl_cookie;
 	struct vattr vattr;
 	struct nameidata nd;
-	int error, vfslocked;
+	int error;
 
 	if (length < 0)
 		return(EINVAL);
-	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td);
+	NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
-	vfslocked = NDHASGIANT(&nd);
 	vp = nd.ni_vp;
 	rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
 		vn_rangelock_unlock(vp, rl_cookie);
 		vrele(vp);
-		VFS_UNLOCK_GIANT(vfslocked);
 		return (error);
 	}
 	NDFREE(&nd, NDF_ONLY_PNBUF);
@@ -3498,7 +3466,6 @@
 	vn_finished_write(mp);
 	vn_rangelock_unlock(vp, rl_cookie);
 	vrele(vp);
-	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
@@ -3571,16 +3538,17 @@
 	struct vnode *vp;
 	struct mount *mp;
 	struct file *fp;
-	int vfslocked;
+	cap_rights_t rights;
 	int error, lock_flags;
 
 	AUDIT_ARG_FD(uap->fd);
-	if ((error = getvnode(td->td_proc->p_fd, uap->fd, CAP_FSYNC,
-	    &fp)) != 0)
+	error = getvnode(td->td_proc->p_fd, uap->fd,
+	    cap_rights_init(&rights, CAP_FSYNC), &fp);
+	if (error != 0)
 		return (error);
 	vp = fp->f_vnode;
-	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
-	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
+	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
+	if (error != 0)
 		goto drop;
 	if (MNT_SHARED_WRITES(mp) ||
 	    ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) {
@@ -3591,9 +3559,9 @@
 	vn_lock(vp, lock_flags | LK_RETRY);
 	AUDIT_ARG_VNODE1(vp);
 	if (vp->v_object != NULL) {
-		VM_OBJECT_LOCK(vp->v_object);
+		VM_OBJECT_WLOCK(vp->v_object);
 		vm_object_page_clean(vp->v_object, 0, 0, 0);
-		VM_OBJECT_UNLOCK(vp->v_object);
+		VM_OBJECT_WUNLOCK(vp->v_object);
 	}
 	error = VOP_FSYNC(vp, MNT_WAIT, td);
 
@@ -3600,7 +3568,6 @@
 	VOP_UNLOCK(vp, 0);
 	vn_finished_write(mp);
 drop:
-	VFS_UNLOCK_GIANT(vfslocked);
 	fdrop(fp, td);
 	return (error);
 }
@@ -3657,23 +3624,22 @@
 	struct mount *mp = NULL;
 	struct vnode *tvp, *fvp, *tdvp;
 	struct nameidata fromnd, tond;
-	int tvfslocked;
-	int fvfslocked;
+	cap_rights_t rights;
 	int error;
 
+again:
 	bwillwrite();
 #ifdef MAC
 	NDINIT_ATRIGHTS(&fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART |
-	    MPSAFE | AUDITVNODE1, pathseg, old, oldfd, CAP_DELETE, td);
+	    AUDITVNODE1, pathseg, old, oldfd,
+	    cap_rights_init(&rights, CAP_RENAMEAT), td);
 #else
-	NDINIT_ATRIGHTS(&fromnd, DELETE, WANTPARENT | SAVESTART | MPSAFE |
-	    AUDITVNODE1, pathseg, old, oldfd, CAP_DELETE, td);
+	NDINIT_ATRIGHTS(&fromnd, DELETE, WANTPARENT | SAVESTART | AUDITVNODE1,
+	    pathseg, old, oldfd, cap_rights_init(&rights, CAP_RENAMEAT), td);
 #endif
 
 	if ((error = namei(&fromnd)) != 0)
 		return (error);
-	fvfslocked = NDHASGIANT(&fromnd);
-	tvfslocked = 0;
 #ifdef MAC
 	error = mac_vnode_check_rename_from(td->td_ucred, fromnd.ni_dvp,
 	    fromnd.ni_vp, &fromnd.ni_cnd);
@@ -3682,17 +3648,9 @@
 		VOP_UNLOCK(fromnd.ni_vp, 0);
 #endif
 	fvp = fromnd.ni_vp;
-	if (error == 0)
-		error = vn_start_write(fvp, &mp, V_WAIT | PCATCH);
-	if (error != 0) {
-		NDFREE(&fromnd, NDF_ONLY_PNBUF);
-		vrele(fromnd.ni_dvp);
-		vrele(fvp);
-		goto out1;
-	}
 	NDINIT_ATRIGHTS(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE |
-	    SAVESTART | MPSAFE | AUDITVNODE2, pathseg, new, newfd, CAP_CREATE,
-	    td);
+	    SAVESTART | AUDITVNODE2, pathseg, new, newfd,
+	    cap_rights_init(&rights, CAP_LINKAT), td);
 	if (fromnd.ni_vp->v_type == VDIR)
 		tond.ni_cnd.cn_flags |= WILLBEDIR;
 	if ((error = namei(&tond)) != 0) {
@@ -3702,12 +3660,30 @@
 		NDFREE(&fromnd, NDF_ONLY_PNBUF);
 		vrele(fromnd.ni_dvp);
 		vrele(fvp);
-		vn_finished_write(mp);
 		goto out1;
 	}
-	tvfslocked = NDHASGIANT(&tond);
 	tdvp = tond.ni_dvp;
 	tvp = tond.ni_vp;
+	error = vn_start_write(fvp, &mp, V_NOWAIT);
+	if (error != 0) {
+		NDFREE(&fromnd, NDF_ONLY_PNBUF);
+		NDFREE(&tond, NDF_ONLY_PNBUF);
+		if (tvp != NULL)
+			vput(tvp);
+		if (tdvp == tvp)
+			vrele(tdvp);
+		else
+			vput(tdvp);
+		vrele(fromnd.ni_dvp);
+		vrele(fvp);
+		vrele(tond.ni_startdir);
+		if (fromnd.ni_startdir != NULL)
+			vrele(fromnd.ni_startdir);
+		error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH);
+		if (error != 0)
+			return (error);
+		goto again;
+	}
 	if (tvp != NULL) {
 		if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
 			error = ENOTDIR;
@@ -3716,6 +3692,18 @@
 			error = EISDIR;
 			goto out;
 		}
+#ifdef CAPABILITIES
+		if (newfd != AT_FDCWD) {
+			/*
+			 * If the target already exists we require CAP_UNLINKAT
+			 * from 'newfd'.
+			 */
+			error = cap_check(&tond.ni_filecaps.fc_rights,
+			    cap_rights_init(&rights, CAP_UNLINKAT));
+			if (error != 0)
+				goto out;
+		}
+#endif
 	}
 	if (fvp == tdvp) {
 		error = EINVAL;
@@ -3733,15 +3721,15 @@
 		    tond.ni_vp, fromnd.ni_dvp == tdvp, &tond.ni_cnd);
 #endif
 out:
-	if (!error) {
+	if (error == 0) {
 		error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
-				   tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
+		    tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
 		NDFREE(&fromnd, NDF_ONLY_PNBUF);
 		NDFREE(&tond, NDF_ONLY_PNBUF);
 	} else {
 		NDFREE(&fromnd, NDF_ONLY_PNBUF);
 		NDFREE(&tond, NDF_ONLY_PNBUF);
-		if (tvp)
+		if (tvp != NULL)
 			vput(tvp);
 		if (tdvp == tvp)
 			vrele(tdvp);
@@ -3755,8 +3743,6 @@
 out1:
 	if (fromnd.ni_startdir)
 		vrele(fromnd.ni_startdir);
-	VFS_UNLOCK_GIANT(fvfslocked);
-	VFS_UNLOCK_GIANT(tvfslocked);
 	if (error == -1)
 		return (0);
 	return (error);
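When the descriptors carry Capsicum rights, the rename path above wants
CAP_RENAMEAT on the source descriptor, CAP_LINKAT on the destination
descriptor and, if the target already exists, CAP_UNLINKAT there as well. A
minimal renameat(2) sketch; the directories and names are illustrative:

	#include <err.h>
	#include <fcntl.h>
	#include <stdio.h>

	int
	main(void)
	{
		int sdfd, ddfd;

		if ((sdfd = open("/tmp/src", O_DIRECTORY)) == -1 ||
		    (ddfd = open("/tmp/dst", O_DIRECTORY)) == -1)
			err(1, "open");
		if (renameat(sdfd, "old.name", ddfd, "new.name") == -1)
			err(1, "renameat");
		return (0);
	}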
@@ -3811,19 +3797,19 @@
 	struct mount *mp;
 	struct vnode *vp;
 	struct vattr vattr;
+	struct nameidata nd;
+	cap_rights_t rights;
 	int error;
-	struct nameidata nd;
-	int vfslocked;
 
 	AUDIT_ARG_MODE(mode);
 restart:
 	bwillwrite();
-	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE |
-	    AUDITVNODE1, segflg, path, fd, CAP_MKDIR, td);
+	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
+	    NOCACHE, segflg, path, fd, cap_rights_init(&rights, CAP_MKDIRAT),
+	    td);
 	nd.ni_cnd.cn_flags |= WILLBEDIR;
 	if ((error = namei(&nd)) != 0)
 		return (error);
-	vfslocked = NDHASGIANT(&nd);
 	vp = nd.ni_vp;
 	if (vp != NULL) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
@@ -3837,13 +3823,11 @@
 		else
 			vput(nd.ni_dvp);
 		vrele(vp);
-		VFS_UNLOCK_GIANT(vfslocked);
 		return (EEXIST);
 	}
 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		vput(nd.ni_dvp);
-		VFS_UNLOCK_GIANT(vfslocked);
 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
 			return (error);
 		goto restart;
@@ -3854,7 +3838,7 @@
 #ifdef MAC
 	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
 	    &vattr);
-	if (error)
+	if (error != 0)
 		goto out;
 #endif
 	error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
@@ -3863,10 +3847,9 @@
 #endif
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vput(nd.ni_dvp);
-	if (!error)
+	if (error == 0)
 		vput(nd.ni_vp);
 	vn_finished_write(mp);
-	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
@@ -3901,17 +3884,16 @@
 {
 	struct mount *mp;
 	struct vnode *vp;
+	struct nameidata nd;
+	cap_rights_t rights;
 	int error;
-	struct nameidata nd;
-	int vfslocked;
 
 restart:
 	bwillwrite();
-	NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | MPSAFE |
-	    AUDITVNODE1, pathseg, path, fd, CAP_RMDIR, td);
+	NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1,
+	    pathseg, path, fd, cap_rights_init(&rights, CAP_UNLINKAT), td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
-	vfslocked = NDHASGIANT(&nd);
 	vp = nd.ni_vp;
 	if (vp->v_type != VDIR) {
 		error = ENOTDIR;
@@ -3934,7 +3916,7 @@
 #ifdef MAC
 	error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
 	    &nd.ni_cnd);
-	if (error)
+	if (error != 0)
 		goto out;
 #endif
 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
@@ -3944,7 +3926,6 @@
 			vrele(nd.ni_dvp);
 		else
 			vput(nd.ni_dvp);
-		VFS_UNLOCK_GIANT(vfslocked);
 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
 			return (error);
 		goto restart;
@@ -3959,7 +3940,6 @@
 		vrele(nd.ni_dvp);
 	else
 		vput(nd.ni_dvp);
-	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
@@ -3996,8 +3976,9 @@
 	struct uio auio, kuio;
 	struct iovec aiov, kiov;
 	struct dirent *dp, *edp;
+	cap_rights_t rights;
 	caddr_t dirbuf;
-	int error, eofflag, readcnt, vfslocked;
+	int error, eofflag, readcnt;
 	long loff;
 	off_t foffset;
 
@@ -4004,8 +3985,9 @@
 	/* XXX arbitrary sanity limit on `count'. */
 	if (uap->count > 64 * 1024)
 		return (EINVAL);
-	if ((error = getvnode(td->td_proc->p_fd, uap->fd, CAP_READ,
-	    &fp)) != 0)
+	error = getvnode(td->td_proc->p_fd, uap->fd,
+	    cap_rights_init(&rights, CAP_READ), &fp);
+	if (error != 0)
 		return (error);
 	if ((fp->f_flag & FREAD) == 0) {
 		fdrop(fp, td);
@@ -4014,9 +3996,7 @@
 	vp = fp->f_vnode;
 	foffset = foffset_lock(fp, 0);
 unionread:
-	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	if (vp->v_type != VDIR) {
-		VFS_UNLOCK_GIANT(vfslocked);
 		foffset_unlock(fp, foffset, 0);
 		fdrop(fp, td);
 		return (EINVAL);
@@ -4033,9 +4013,8 @@
 	loff = auio.uio_offset = foffset;
 #ifdef MAC
 	error = mac_vnode_check_readdir(td->td_ucred, vp);
-	if (error) {
+	if (error != 0) {
 		VOP_UNLOCK(vp, 0);
-		VFS_UNLOCK_GIANT(vfslocked);
 		foffset_unlock(fp, foffset, FOF_NOUPDATE);
 		fdrop(fp, td);
 		return (error);
@@ -4092,9 +4071,8 @@
 		}
 		free(dirbuf, M_TEMP);
 	}
-	if (error) {
+	if (error != 0) {
 		VOP_UNLOCK(vp, 0);
-		VFS_UNLOCK_GIANT(vfslocked);
 		foffset_unlock(fp, foffset, 0);
 		fdrop(fp, td);
 		return (error);
@@ -4109,11 +4087,9 @@
 		fp->f_data = vp;
 		foffset = 0;
 		vput(tvp);
-		VFS_UNLOCK_GIANT(vfslocked);
 		goto unionread;
 	}
 	VOP_UNLOCK(vp, 0);
-	VFS_UNLOCK_GIANT(vfslocked);
 	foffset_unlock(fp, foffset, 0);
 	fdrop(fp, td);
 	td->td_retval[0] = uap->count - auio.uio_resid;
@@ -4147,8 +4123,9 @@
 	long base;
 	int error;
 
-	error = kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base);
-	if (error)
+	error = kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base,
+	    NULL, UIO_USERSPACE);
+	if (error != 0)
 		return (error);
 	if (uap->basep != NULL)
 		error = copyout(&base, uap->basep, sizeof(long));
@@ -4157,23 +4134,24 @@
 
 int
 kern_getdirentries(struct thread *td, int fd, char *buf, u_int count,
-    long *basep)
+    long *basep, ssize_t *residp, enum uio_seg bufseg)
 {
 	struct vnode *vp;
 	struct file *fp;
 	struct uio auio;
 	struct iovec aiov;
-	int vfslocked;
+	cap_rights_t rights;
 	long loff;
 	int error, eofflag;
 	off_t foffset;
 
 	AUDIT_ARG_FD(fd);
+	if (count > IOSIZE_MAX)
+		return (EINVAL);
 	auio.uio_resid = count;
-	if (auio.uio_resid > IOSIZE_MAX)
-		return (EINVAL);
-	if ((error = getvnode(td->td_proc->p_fd, fd, CAP_READ | CAP_SEEK,
-	    &fp)) != 0)
+	error = getvnode(td->td_proc->p_fd, fd,
+	    cap_rights_init(&rights, CAP_READ), &fp);
+	if (error != 0)
 		return (error);
 	if ((fp->f_flag & FREAD) == 0) {
 		fdrop(fp, td);
@@ -4182,9 +4160,7 @@
 	vp = fp->f_vnode;
 	foffset = foffset_lock(fp, 0);
 unionread:
-	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	if (vp->v_type != VDIR) {
-		VFS_UNLOCK_GIANT(vfslocked);
 		error = EINVAL;
 		goto fail;
 	}
@@ -4193,7 +4169,7 @@
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_rw = UIO_READ;
-	auio.uio_segflg = UIO_USERSPACE;
+	auio.uio_segflg = bufseg;
 	auio.uio_td = td;
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 	AUDIT_ARG_VNODE1(vp);
@@ -4205,9 +4181,8 @@
 		error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL,
 		    NULL);
 	foffset = auio.uio_offset;
-	if (error) {
+	if (error != 0) {
 		VOP_UNLOCK(vp, 0);
-		VFS_UNLOCK_GIANT(vfslocked);
 		goto fail;
 	}
 	if (count == auio.uio_resid &&
@@ -4214,6 +4189,7 @@
 	    (vp->v_vflag & VV_ROOT) &&
 	    (vp->v_mount->mnt_flag & MNT_UNION)) {
 		struct vnode *tvp = vp;
+
 		vp = vp->v_mount->mnt_vnodecovered;
 		VREF(vp);
 		fp->f_vnode = vp;
@@ -4220,12 +4196,12 @@
 		fp->f_data = vp;
 		foffset = 0;
 		vput(tvp);
-		VFS_UNLOCK_GIANT(vfslocked);
 		goto unionread;
 	}
 	VOP_UNLOCK(vp, 0);
-	VFS_UNLOCK_GIANT(vfslocked);
 	*basep = loff;
+	if (residp != NULL)
+		*residp = auio.uio_resid;
 	td->td_retval[0] = count - auio.uio_resid;
 fail:
 	foffset_unlock(fp, foffset, 0);
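
kern_getdirentries() now builds its rights with cap_rights_init(&rights, CAP_READ), rejects counts above IOSIZE_MAX before touching the uio, and grows residp/bufseg parameters so in-kernel callers can read into a kernel buffer and learn the residual. The userland interface is untouched; for reference, a minimal getdirentries(2) consumer, assuming the 10.x prototype with an int byte count and a long base (later releases widen these, so check the local headers):

#include <sys/types.h>
#include <dirent.h>
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
	/* Union keeps the buffer aligned for struct dirent access. */
	static union {
		struct dirent d;
		char b[4096];
	} u;
	long base;
	int fd, n;

	fd = open(argc > 1 ? argv[1] : ".", O_RDONLY | O_DIRECTORY);
	if (fd < 0)
		err(1, "open");
	while ((n = getdirentries(fd, u.b, sizeof(u.b), &base)) > 0) {
		char *cp = u.b;

		while (cp < u.b + n) {
			struct dirent *dp = (struct dirent *)(void *)cp;

			if (dp->d_fileno != 0)	/* skip deleted entries */
				printf("%s\n", dp->d_name);
			cp += dp->d_reclen;
		}
	}
	if (n < 0)
		err(1, "getdirentries");
	close(fd);
	return (0);
}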
@@ -4250,6 +4226,7 @@
 	} */ *uap;
 {
 	struct getdirentries_args ap;
+
 	ap.fd = uap->fd;
 	ap.buf = uap->buf;
 	ap.count = uap->count;
@@ -4300,15 +4277,13 @@
 {
 	struct vnode *vp;
 	struct vattr vattr;
+	struct nameidata nd;
 	int error;
-	struct nameidata nd;
-	int vfslocked;
 
-	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
-	    UIO_USERSPACE, uap->path, td);
+	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
+	    uap->path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
-	vfslocked = NDHASGIANT(&nd);
 	vp = nd.ni_vp;
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	if (vp->v_type != VCHR || vp->v_rdev == NULL) {
@@ -4317,15 +4292,15 @@
 	}
 #ifdef MAC
 	error = mac_vnode_check_revoke(td->td_ucred, vp);
-	if (error)
+	if (error != 0)
 		goto out;
 #endif
 	error = VOP_GETATTR(vp, &vattr, td->td_ucred);
-	if (error)
+	if (error != 0)
 		goto out;
 	if (td->td_ucred->cr_uid != vattr.va_uid) {
 		error = priv_check(td, PRIV_VFS_ADMIN);
-		if (error)
+		if (error != 0)
 			goto out;
 	}
 	if (vcount(vp) > 1)
@@ -4332,7 +4307,6 @@
 		VOP_REVOKE(vp, REVOKEALL);
 out:
 	vput(vp);
-	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
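
The sys_revoke() change above is mechanical (Giant removal and the error != 0 style). For completeness, a minimal sketch of the userspace call it services; the caller must own the character device or hold PRIV_VFS_ADMIN, and the device path is whatever terminal you want revoked:

#include <err.h>
#include <stdio.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
	if (argc != 2) {
		fprintf(stderr, "usage: revoke-example /dev/ttyXX\n");
		return (1);
	}
	/* Forces all existing opens of the device to be revoked. */
	if (revoke(argv[1]) < 0)
		err(1, "revoke");
	return (0);
}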
 
@@ -4342,35 +4316,14 @@
  * entry is held upon returning.
  */
 int
-getvnode(struct filedesc *fdp, int fd, cap_rights_t rights,
-    struct file **fpp)
+getvnode(struct filedesc *fdp, int fd, cap_rights_t *rightsp, struct file **fpp)
 {
 	struct file *fp;
-#ifdef CAPABILITIES
-	struct file *fp_fromcap;
-#endif
 	int error;
 
-	error = 0;
-	fp = NULL;
-	if ((fdp == NULL) || (fp = fget_unlocked(fdp, fd)) == NULL)
-		return (EBADF);
-#ifdef CAPABILITIES
-	/*
-	 * If the file descriptor is for a capability, test rights and use the
-	 * file descriptor referenced by the capability.
-	 */
-	error = cap_funwrap(fp, rights, &fp_fromcap);
-	if (error) {
-		fdrop(fp, curthread);
+	error = fget_unlocked(fdp, fd, rightsp, 0, &fp, NULL);
+	if (error != 0)
 		return (error);
-	}
-	if (fp != fp_fromcap) {
-		fhold(fp_fromcap);
-		fdrop(fp, curthread);
-		fp = fp_fromcap;
-	}
-#endif /* CAPABILITIES */
 
 	/*
 	 * The file could be not of the vnode type, or it may be not
@@ -4410,18 +4363,16 @@
 	struct nameidata nd;
 	fhandle_t fh;
 	register struct vnode *vp;
-	int vfslocked;
 	int error;
 
 	error = priv_check(td, PRIV_VFS_GETFH);
-	if (error)
+	if (error != 0)
 		return (error);
-	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
-	    UIO_USERSPACE, uap->fname, td);
+	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
+	    uap->fname, td);
 	error = namei(&nd);
-	if (error)
+	if (error != 0)
 		return (error);
-	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp = nd.ni_vp;
 	bzero(&fh, sizeof(fh));
@@ -4428,10 +4379,8 @@
 	fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
 	error = VOP_VPTOFH(vp, &fh.fh_fid);
 	vput(vp);
-	VFS_UNLOCK_GIANT(vfslocked);
-	if (error)
-		return (error);
-	error = copyout(&fh, uap->fhp, sizeof (fh));
+	if (error == 0)
+		error = copyout(&fh, uap->fhp, sizeof (fh));
 	return (error);
 }
 
@@ -4449,18 +4398,16 @@
 	struct nameidata nd;
 	fhandle_t fh;
 	register struct vnode *vp;
-	int vfslocked;
 	int error;
 
 	error = priv_check(td, PRIV_VFS_GETFH);
-	if (error)
+	if (error != 0)
 		return (error);
-	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
-	    UIO_USERSPACE, uap->fname, td);
+	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
+	    uap->fname, td);
 	error = namei(&nd);
-	if (error)
+	if (error != 0)
 		return (error);
-	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp = nd.ni_vp;
 	bzero(&fh, sizeof(fh));
@@ -4467,10 +4414,8 @@
 	fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
 	error = VOP_VPTOFH(vp, &fh.fh_fid);
 	vput(vp);
-	VFS_UNLOCK_GIANT(vfslocked);
-	if (error)
-		return (error);
-	error = copyout(&fh, uap->fhp, sizeof (fh));
+	if (error == 0)
+		error = copyout(&fh, uap->fhp, sizeof (fh));
 	return (error);
 }
 
@@ -4495,177 +4440,75 @@
 		int flags;
 	} */ *uap;
 {
-	struct proc *p = td->td_proc;
 	struct mount *mp;
 	struct vnode *vp;
 	struct fhandle fhp;
-	struct vattr vat;
-	struct vattr *vap = &vat;
-	struct flock lf;
 	struct file *fp;
-	register struct filedesc *fdp = p->p_fd;
-	int fmode, error, type;
-	accmode_t accmode;
-	struct file *nfp;
-	int vfslocked;
+	int fmode, error;
 	int indx;
 
 	error = priv_check(td, PRIV_VFS_FHOPEN);
-	if (error)
+	if (error != 0)
 		return (error);
+	indx = -1;
 	fmode = FFLAGS(uap->flags);
 	/* why not allow a non-read/write open for our lockd? */
 	if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
 		return (EINVAL);
 	error = copyin(uap->u_fhp, &fhp, sizeof(fhp));
-	if (error)
+	if (error != 0)
 		return(error);
 	/* find the mount point */
 	mp = vfs_busyfs(&fhp.fh_fsid);
 	if (mp == NULL)
 		return (ESTALE);
-	vfslocked = VFS_LOCK_GIANT(mp);
 	/* now give me my vnode, it gets returned to me locked */
 	error = VFS_FHTOVP(mp, &fhp.fh_fid, LK_EXCLUSIVE, &vp);
 	vfs_unbusy(mp);
-	if (error)
-		goto out;
+	if (error != 0)
+		return (error);
+
+	error = falloc_noinstall(td, &fp);
+	if (error != 0) {
+		vput(vp);
+		return (error);
+	}
 	/*
-	 * from now on we have to make sure not
-	 * to forget about the vnode
-	 * any error that causes an abort must vput(vp)
-	 * just set error = err and 'goto bad;'.
+	 * An extra reference on `fp' has been held for us by
+	 * falloc_noinstall().
 	 */
 
-	/*
-	 * from vn_open
-	 */
-	if (vp->v_type == VLNK) {
-		error = EMLINK;
+#ifdef INVARIANTS
+	td->td_dupfd = -1;
+#endif
+	error = vn_open_vnode(vp, fmode, td->td_ucred, td, fp);
+	if (error != 0) {
+		KASSERT(fp->f_ops == &badfileops,
+		    ("VOP_OPEN in fhopen() set f_ops"));
+		KASSERT(td->td_dupfd < 0,
+		    ("fhopen() encountered fdopen()"));
+
+		vput(vp);
 		goto bad;
 	}
-	if (vp->v_type == VSOCK) {
-		error = EOPNOTSUPP;
-		goto bad;
-	}
-	if (vp->v_type != VDIR && fmode & O_DIRECTORY) {
-		error = ENOTDIR;
-		goto bad;
-	}
-	accmode = 0;
-	if (fmode & (FWRITE | O_TRUNC)) {
-		if (vp->v_type == VDIR) {
-			error = EISDIR;
-			goto bad;
-		}
-		error = vn_writechk(vp);
-		if (error)
-			goto bad;
-		accmode |= VWRITE;
-	}
-	if (fmode & FREAD)
-		accmode |= VREAD;
-	if ((fmode & O_APPEND) && (fmode & FWRITE))
-		accmode |= VAPPEND;
-#ifdef MAC
-	error = mac_vnode_check_open(td->td_ucred, vp, accmode);
-	if (error)
-		goto bad;
+#ifdef INVARIANTS
+	td->td_dupfd = 0;
 #endif
-	if (accmode) {
-		error = VOP_ACCESS(vp, accmode, td->td_ucred, td);
-		if (error)
+	fp->f_vnode = vp;
+	fp->f_seqcount = 1;
+	finit(fp, (fmode & FMASK) | (fp->f_flag & FHASLOCK), DTYPE_VNODE, vp,
+	    &vnops);
+	VOP_UNLOCK(vp, 0);
+	if ((fmode & O_TRUNC) != 0) {
+		error = fo_truncate(fp, 0, td->td_ucred, td);
+		if (error != 0)
 			goto bad;
 	}
-	if (fmode & O_TRUNC) {
-		vfs_ref(mp);
-		VOP_UNLOCK(vp, 0);				/* XXX */
-		if ((error = vn_start_write(NULL, &mp, V_WAIT | PCATCH)) != 0) {
-			vrele(vp);
-			vfs_rel(mp);
-			goto out;
-		}
-		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);	/* XXX */
-		vfs_rel(mp);
-#ifdef MAC
-		/*
-		 * We don't yet have fp->f_cred, so use td->td_ucred, which
-		 * should be right.
-		 */
-		error = mac_vnode_check_write(td->td_ucred, td->td_ucred, vp);
-		if (error == 0) {
-#endif
-			VATTR_NULL(vap);
-			vap->va_size = 0;
-			error = VOP_SETATTR(vp, vap, td->td_ucred);
-#ifdef MAC
-		}
-#endif
-		vn_finished_write(mp);
-		if (error)
-			goto bad;
-	}
-	error = VOP_OPEN(vp, fmode, td->td_ucred, td, NULL);
-	if (error)
-		goto bad;
 
-	if (fmode & FWRITE)
-		vp->v_writecount++;
-
-	/*
-	 * end of vn_open code
-	 */
-
-	if ((error = falloc(td, &nfp, &indx, fmode)) != 0) {
-		if (fmode & FWRITE)
-			vp->v_writecount--;
-		goto bad;
-	}
-	/* An extra reference on `nfp' has been held for us by falloc(). */
-	fp = nfp;
-	nfp->f_vnode = vp;
-	finit(nfp, fmode & FMASK, DTYPE_VNODE, vp, &vnops);
-	if (fmode & (O_EXLOCK | O_SHLOCK)) {
-		lf.l_whence = SEEK_SET;
-		lf.l_start = 0;
-		lf.l_len = 0;
-		if (fmode & O_EXLOCK)
-			lf.l_type = F_WRLCK;
-		else
-			lf.l_type = F_RDLCK;
-		type = F_FLOCK;
-		if ((fmode & FNONBLOCK) == 0)
-			type |= F_WAIT;
-		VOP_UNLOCK(vp, 0);
-		if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
-			    type)) != 0) {
-			/*
-			 * The lock request failed.  Normally close the
-			 * descriptor but handle the case where someone might
-			 * have dup()d or close()d it when we weren't looking.
-			 */
-			fdclose(fdp, fp, indx, td);
-
-			/*
-			 * release our private reference
-			 */
-			fdrop(fp, td);
-			goto out;
-		}
-		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
-		atomic_set_int(&fp->f_flag, FHASLOCK);
-	}
-
-	VOP_UNLOCK(vp, 0);
+	error = finstall(td, fp, &indx, fmode, NULL);
+bad:
 	fdrop(fp, td);
-	VFS_UNLOCK_GIANT(vfslocked);
 	td->td_retval[0] = indx;
-	return (0);
-
-bad:
-	vput(vp);
-out:
-	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
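
The rewritten fhopen() drops the hand-rolled open logic (manual access checks, O_TRUNC handling, advisory-lock emulation) in favor of falloc_noinstall(), vn_open_vnode() and finstall(), the same helpers the regular open path uses. A minimal userspace sketch of the interface it implements, assuming the caller is root (PRIV_VFS_GETFH/PRIV_VFS_FHOPEN) and using /etc/motd purely as an example path:

#include <sys/param.h>
#include <sys/mount.h>
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	fhandle_t fh;
	char buf[256];
	ssize_t n;
	int fd;

	if (getfh("/etc/motd", &fh) < 0)
		err(1, "getfh");
	/* O_CREAT is rejected by fhopen(); a read and/or write mode is required. */
	fd = fhopen(&fh, O_RDONLY);
	if (fd < 0)
		err(1, "fhopen");
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, (size_t)n, stdout);
	close(fd);
	return (0);
}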
 
@@ -4687,33 +4530,36 @@
 	} */ *uap;
 {
 	struct stat sb;
-	fhandle_t fh;
+	struct fhandle fh;
+	int error;
+
+	error = copyin(uap->u_fhp, &fh, sizeof(fh));
+	if (error != 0)
+		return (error);
+	error = kern_fhstat(td, fh, &sb);
+	if (error == 0)
+		error = copyout(&sb, uap->sb, sizeof(sb));
+	return (error);
+}
+
+int
+kern_fhstat(struct thread *td, struct fhandle fh, struct stat *sb)
+{
 	struct mount *mp;
 	struct vnode *vp;
-	int vfslocked;
 	int error;
 
 	error = priv_check(td, PRIV_VFS_FHSTAT);
-	if (error)
+	if (error != 0)
 		return (error);
-	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
-	if (error)
-		return (error);
 	if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
 		return (ESTALE);
-	vfslocked = VFS_LOCK_GIANT(mp);
 	error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
 	vfs_unbusy(mp);
-	if (error) {
-		VFS_UNLOCK_GIANT(vfslocked);
+	if (error != 0)
 		return (error);
-	}
-	error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td);
+	error = vn_stat(vp, sb, td->td_ucred, NOCRED, td);
 	vput(vp);
-	VFS_UNLOCK_GIANT(vfslocked);
-	if (error)
-		return (error);
-	error = copyout(&sb, uap->sb, sizeof(sb));
 	return (error);
 }
 
@@ -4739,10 +4585,10 @@
 	int error;
 
 	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
-	if (error)
+	if (error != 0)
 		return (error);
 	error = kern_fhstatfs(td, fh, &sf);
-	if (error)
+	if (error != 0)
 		return (error);
 	return (copyout(&sf, uap->buf, sizeof(sf)));
 }
@@ -4753,28 +4599,25 @@
 	struct statfs *sp;
 	struct mount *mp;
 	struct vnode *vp;
-	int vfslocked;
 	int error;
 
 	error = priv_check(td, PRIV_VFS_FHSTATFS);
-	if (error)
+	if (error != 0)
 		return (error);
 	if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
 		return (ESTALE);
-	vfslocked = VFS_LOCK_GIANT(mp);
 	error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
-	if (error) {
+	if (error != 0) {
 		vfs_unbusy(mp);
-		VFS_UNLOCK_GIANT(vfslocked);
 		return (error);
 	}
 	vput(vp);
 	error = prison_canseemount(td->td_ucred, mp);
-	if (error)
+	if (error != 0)
 		goto out;
 #ifdef MAC
 	error = mac_mount_check_stat(td->td_ucred, mp);
-	if (error)
+	if (error != 0)
 		goto out;
 #endif
 	/*
@@ -4789,7 +4632,6 @@
 		*buf = *sp;
 out:
 	vfs_unbusy(mp);
-	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
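
kern_fhstatfs() only loses the Giant handling here. Its userspace entry point is fhstatfs(2); a minimal sketch, again assuming root privilege and using the root file system purely as an example:

#include <sys/param.h>
#include <sys/mount.h>
#include <err.h>
#include <stdio.h>

int
main(void)
{
	fhandle_t fh;
	struct statfs sf;

	if (getfh("/", &fh) < 0)
		err(1, "getfh");
	if (fhstatfs(&fh, &sf) < 0)
		err(1, "fhstatfs");
	printf("%s on %s (%s)\n", sf.f_mntfromname, sf.f_mntonname,
	    sf.f_fstypename);
	return (0);
}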
 
@@ -4799,44 +4641,35 @@
 	struct file *fp;
 	struct mount *mp;
 	struct vnode *vp;
+	cap_rights_t rights;
 	off_t olen, ooffset;
-	int error, vfslocked;
+	int error;
 
-	fp = NULL;
-	vfslocked = 0;
-	error = fget(td, fd, CAP_WRITE, &fp);
+	if (offset < 0 || len <= 0)
+		return (EINVAL);
+	/* Check for wrap. */
+	if (offset > OFF_MAX - len)
+		return (EFBIG);
+	error = fget(td, fd, cap_rights_init(&rights, CAP_WRITE), &fp);
 	if (error != 0)
-		goto out;
-
-	switch (fp->f_type) {
-	case DTYPE_VNODE:
-		break;
-	case DTYPE_PIPE:
-	case DTYPE_FIFO:
+		return (error);
+	if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) {
 		error = ESPIPE;
 		goto out;
-	default:
-		error = ENODEV;
-		goto out;
 	}
 	if ((fp->f_flag & FWRITE) == 0) {
 		error = EBADF;
 		goto out;
 	}
+	if (fp->f_type != DTYPE_VNODE) {
+		error = ENODEV;
+		goto out;
+	}
 	vp = fp->f_vnode;
 	if (vp->v_type != VREG) {
 		error = ENODEV;
 		goto out;
 	}
-	if (offset < 0 || len <= 0) {
-		error = EINVAL;
-		goto out;
-	}
-	/* Check for wrap. */
-	if (offset > OFF_MAX - len) {
-		error = EFBIG;
-		goto out;
-	}
 
 	/* Allocating blocks may take a long time, so iterate. */
 	for (;;) {
@@ -4844,17 +4677,13 @@
 		ooffset = offset;
 
 		bwillwrite();
-		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 		mp = NULL;
 		error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
-		if (error != 0) {
-			VFS_UNLOCK_GIANT(vfslocked);
+		if (error != 0)
 			break;
-		}
 		error = vn_lock(vp, LK_EXCLUSIVE);
 		if (error != 0) {
 			vn_finished_write(mp);
-			VFS_UNLOCK_GIANT(vfslocked);
 			break;
 		}
 #ifdef MAC
@@ -4864,7 +4693,6 @@
 			error = VOP_ALLOCATE(vp, &offset, &len);
 		VOP_UNLOCK(vp, 0);
 		vn_finished_write(mp);
-		VFS_UNLOCK_GIANT(vfslocked);
 
 		if (olen + ooffset != offset + len) {
 			panic("offset + len changed from %jx/%jx to %jx/%jx",
@@ -4876,8 +4704,7 @@
 		maybe_yield();
 	}
  out:
-	if (fp != NULL)
-		fdrop(fp, td);
+	fdrop(fp, td);
 	return (error);
 }
 
@@ -4885,7 +4712,9 @@
 sys_posix_fallocate(struct thread *td, struct posix_fallocate_args *uap)
 {
 
-	return (kern_posix_fallocate(td, uap->fd, uap->offset, uap->len));
+	td->td_retval[0] = kern_posix_fallocate(td, uap->fd, uap->offset,
+	    uap->len);
+	return (0);
 }
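
With the change above, kern_posix_fallocate()'s result is passed back through td_retval[0] and the syscall itself returns 0, so the libc wrapper can hand the error number straight to the caller as POSIX specifies instead of going through errno. A minimal userspace sketch of that calling convention; the file name and size are arbitrary examples:

#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	int fd, rc;

	fd = open("/tmp/prealloc.dat", O_RDWR | O_CREAT, 0644);
	if (fd < 0)
		err(1, "open");
	rc = posix_fallocate(fd, 0, 1024 * 1024);	/* reserve 1 MiB */
	if (rc != 0)		/* error number is the return value; errno is untouched */
		fprintf(stderr, "posix_fallocate: %s\n", strerror(rc));
	close(fd);
	unlink("/tmp/prealloc.dat");
	return (0);
}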
 
 /*
@@ -4901,6 +4730,7 @@
 	struct fadvise_info *fa, *new;
 	struct file *fp;
 	struct vnode *vp;
+	cap_rights_t rights;
 	off_t end;
 	int error;
 
@@ -4921,18 +4751,14 @@
 		return (EINVAL);
 	}
 	/* XXX: CAP_POSIX_FADVISE? */
-	error = fget(td, fd, 0, &fp);
+	error = fget(td, fd, cap_rights_init(&rights), &fp);
 	if (error != 0)
 		goto out;
-	
-	switch (fp->f_type) {
-	case DTYPE_VNODE:
-		break;
-	case DTYPE_PIPE:
-	case DTYPE_FIFO:
+	if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) {
 		error = ESPIPE;
 		goto out;
-	default:
+	}
+	if (fp->f_type != DTYPE_VNODE) {
 		error = ENODEV;
 		goto out;
 	}
@@ -4988,7 +4814,7 @@
 				new = fa;
 				fp->f_advice = NULL;
 			} else if (offset <= fa->fa_start &&
- 			    end >= fa->fa_start)
+			    end >= fa->fa_start)
 				fa->fa_start = end + 1;
 			else if (offset <= fa->fa_end && end >= fa->fa_end)
 				fa->fa_end = offset - 1;
@@ -5023,6 +4849,7 @@
 sys_posix_fadvise(struct thread *td, struct posix_fadvise_args *uap)
 {
 
-	return (kern_posix_fadvise(td, uap->fd, uap->offset, uap->len,
-	    uap->advice));
+	td->td_retval[0] = kern_posix_fadvise(td, uap->fd, uap->offset,
+	    uap->len, uap->advice);
+	return (0);
 }
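
sys_posix_fadvise() gets the same treatment: the error travels in td_retval[0] and the caller reads it as the function's return value. A minimal sketch, with the path and advice value chosen arbitrarily:

#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
	int fd, rc;

	fd = open(argc > 1 ? argv[1] : "/etc/motd", O_RDONLY);
	if (fd < 0)
		err(1, "open");
	rc = posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL);
	if (rc != 0)		/* error number returned directly, per POSIX */
		fprintf(stderr, "posix_fadvise: %s\n", strerror(rc));
	close(fd);
	return (0);
}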


