[Midnightbsd-cvs] src [9896] trunk/sys: sync with freebsd 10-stable

laffer1 at midnightbsd.org
Thu May 24 18:27:41 EDT 2018


Revision: 9896
          http://svnweb.midnightbsd.org/src/?rev=9896
Author:   laffer1
Date:     2018-05-24 18:27:41 -0400 (Thu, 24 May 2018)
Log Message:
-----------
sync with freebsd 10-stable

Modified Paths:
--------------
    trunk/sys/vm/default_pager.c
    trunk/sys/vm/device_pager.c
    trunk/sys/vm/memguard.c
    trunk/sys/vm/memguard.h
    trunk/sys/vm/phys_pager.c
    trunk/sys/vm/pmap.h
    trunk/sys/vm/redzone.c
    trunk/sys/vm/redzone.h
    trunk/sys/vm/sg_pager.c
    trunk/sys/vm/swap_pager.c
    trunk/sys/vm/swap_pager.h
    trunk/sys/vm/uma.h
    trunk/sys/vm/uma_core.c
    trunk/sys/vm/uma_dbg.c
    trunk/sys/vm/uma_dbg.h
    trunk/sys/vm/uma_int.h
    trunk/sys/vm/vm.h
    trunk/sys/vm/vm_extern.h
    trunk/sys/vm/vm_fault.c
    trunk/sys/vm/vm_glue.c
    trunk/sys/vm/vm_init.c
    trunk/sys/vm/vm_kern.c
    trunk/sys/vm/vm_kern.h
    trunk/sys/vm/vm_map.c
    trunk/sys/vm/vm_map.h
    trunk/sys/vm/vm_meter.c
    trunk/sys/vm/vm_mmap.c
    trunk/sys/vm/vm_object.c
    trunk/sys/vm/vm_object.h
    trunk/sys/vm/vm_page.c
    trunk/sys/vm/vm_page.h
    trunk/sys/vm/vm_pageout.c
    trunk/sys/vm/vm_pageout.h
    trunk/sys/vm/vm_pager.c
    trunk/sys/vm/vm_pager.h
    trunk/sys/vm/vm_param.h
    trunk/sys/vm/vm_phys.c
    trunk/sys/vm/vm_phys.h
    trunk/sys/vm/vm_reserv.c
    trunk/sys/vm/vm_reserv.h
    trunk/sys/vm/vm_unix.c
    trunk/sys/vm/vm_zeroidle.c
    trunk/sys/vm/vnode_pager.c
    trunk/sys/vm/vnode_pager.h

Added Paths:
-----------
    trunk/sys/vm/_vm_radix.h
    trunk/sys/vm/vm_radix.c
    trunk/sys/vm/vm_radix.h
    trunk/sys/x86/acpica/acpi_wakeup.c

Added: trunk/sys/vm/_vm_radix.h
===================================================================
--- trunk/sys/vm/_vm_radix.h	                        (rev 0)
+++ trunk/sys/vm/_vm_radix.h	2018-05-24 22:27:41 UTC (rev 9896)
@@ -0,0 +1,56 @@
+/* $MidnightBSD$ */
+/*
+ * Copyright (c) 2013 EMC Corp.
+ * Copyright (c) 2011 Jeffrey Roberson <jeff at freebsd.org>
+ * Copyright (c) 2008 Mayur Shardul <mayur.shardul at gmail.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: stable/10/sys/vm/_vm_radix.h 254141 2013-08-09 11:28:55Z attilio $
+ */
+
+#ifndef __VM_RADIX_H_
+#define __VM_RADIX_H_
+
+/*
+ * Radix tree root.
+ */
+struct vm_radix {
+	uintptr_t	rt_root;
+	uint8_t		rt_flags;
+};
+
+#define	RT_INSERT_INPROG	0x01
+#define	RT_TRIE_MODIFIED	0x02
+
+#ifdef _KERNEL
+
+static __inline boolean_t
+vm_radix_is_empty(struct vm_radix *rtree)
+{
+
+	return (rtree->rt_root == 0);
+}
+
+#endif /* _KERNEL */
+#endif /* !__VM_RADIX_H_ */


Property changes on: trunk/sys/vm/_vm_radix.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
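
A note on the new header above: the radix-tree root is just two fields, with rt_root equal to zero when the trie is empty (which is all vm_radix_is_empty() tests) and rt_flags carrying the RT_INSERT_INPROG/RT_TRIE_MODIFIED bookkeeping bits. A minimal sketch of using the inline helper, assuming the resident-page trie embedded in struct vm_object is named rtree as in FreeBSD 10 (that part of vm_object.h is not shown here) and using a hypothetical helper name:

    #include <sys/param.h>
    #include <sys/systm.h>
    #include <vm/vm.h>
    #include <vm/vm_object.h>
    #include <vm/vm_radix.h>

    /* Hypothetical sanity check: an object being torn down should no
     * longer have any pages hanging off its radix trie. */
    static void
    example_assert_trie_empty(vm_object_t object)
    {

            KASSERT(vm_radix_is_empty(&object->rtree),
                ("object %p still has resident pages", object));
    }
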
Modified: trunk/sys/vm/default_pager.c
===================================================================
--- trunk/sys/vm/default_pager.c	2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/default_pager.c	2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1995, David Greenman
  * All rights reserved.
@@ -38,7 +39,7 @@
  */
 
 #include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/vm/default_pager.c 310363 2016-12-21 11:32:08Z kib $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -45,7 +46,7 @@
 #include <sys/lock.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
-#include <sys/mutex.h>
+#include <sys/rwlock.h>
 
 #include <vm/vm.h>
 #include <vm/vm_object.h>
@@ -63,6 +64,16 @@
 		int *);
 /*
  * pagerops for OBJT_DEFAULT - "default pager".
+ *
+ * This pager handles anonymous (no handle) swap-backed memory, just
+ * like the swap pager.  It allows several optimizations based on the
+ * fact that no pages of a default object can be swapped out.  The
+ * most important optimization is in vm_fault(), where the pager is
+ * never asked for a non-resident page.  Instead, a freshly allocated
+ * zeroed page is used.
+ *
+ * On the first request to page out a page from a default object, the
+ * object is converted to swap pager type.
  */
 struct pagerops defaultpagerops = {
 	.pgo_alloc =	default_pager_alloc,
@@ -91,10 +102,10 @@
 	object = vm_object_allocate(OBJT_DEFAULT,
 	    OFF_TO_IDX(round_page(offset + size)));
 	if (cred != NULL) {
-		VM_OBJECT_LOCK(object);
+		VM_OBJECT_WLOCK(object);
 		object->cred = cred;
 		object->charge = size;
-		VM_OBJECT_UNLOCK(object);
+		VM_OBJECT_WUNLOCK(object);
 	}
 	return (object);
 }
@@ -113,6 +124,7 @@
 	/*
 	 * OBJT_DEFAULT objects have no special resources allocated to them.
 	 */
+	object->type = OBJT_DEAD;
 }
 
 /*
@@ -137,14 +149,11 @@
  * cache to the free list.
  */
 static void
-default_pager_putpages(object, m, c, sync, rtvals)
-	vm_object_t object;
-	vm_page_t *m;
-	int c;
-	boolean_t sync;
-	int *rtvals;
+default_pager_putpages(vm_object_t object, vm_page_t *m, int count,
+    int flags, int *rtvals)
 {
-	swappagerops.pgo_putpages(object, m, c, sync, rtvals);
+
+	swappagerops.pgo_putpages(object, m, count, flags, rtvals);
 }
 
 /*
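
The putpages conversion here is part of a tree-wide change: the old "boolean_t sync" argument becomes an "int flags" word, which the swap pager (further down) decodes with VM_PAGER_PUT_SYNC. A minimal sketch of a caller under the new convention, assuming the vm_pager_put_pages() inline from vm_pager.h and a page that is already dirty and busied (hypothetical helper name):

    #include <sys/param.h>
    #include <sys/rwlock.h>
    #include <vm/vm.h>
    #include <vm/vm_object.h>
    #include <vm/vm_page.h>
    #include <vm/vm_pager.h>

    /* Hypothetical: write one page back through its object's pager and
     * wait for the I/O to complete before returning the pager status. */
    static int
    example_flush_page(vm_object_t object, vm_page_t m)
    {
            int rtval;

            VM_OBJECT_ASSERT_WLOCKED(object);
            vm_pager_put_pages(object, &m, 1, VM_PAGER_PUT_SYNC, &rtval);
            return (rtval);
    }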

Modified: trunk/sys/vm/device_pager.c
===================================================================
--- trunk/sys/vm/device_pager.c	2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/device_pager.c	2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1990 University of Utah.
  * Copyright (c) 1991, 1993
@@ -35,7 +36,7 @@
  */
 
 #include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/vm/device_pager.c 320439 2017-06-28 06:13:58Z alc $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -44,6 +45,7 @@
 #include <sys/proc.h>
 #include <sys/mutex.h>
 #include <sys/mman.h>
+#include <sys/rwlock.h>
 #include <sys/sx.h>
 
 #include <vm/vm.h>
@@ -51,6 +53,7 @@
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
+#include <vm/vm_phys.h>
 #include <vm/uma.h>
 
 static void dev_pager_init(void);
@@ -58,10 +61,8 @@
     vm_ooffset_t, struct ucred *);
 static void dev_pager_dealloc(vm_object_t);
 static int dev_pager_getpages(vm_object_t, vm_page_t *, int, int);
-static void dev_pager_putpages(vm_object_t, vm_page_t *, int, 
-		boolean_t, int *);
-static boolean_t dev_pager_haspage(vm_object_t, vm_pindex_t, int *,
-		int *);
+static void dev_pager_putpages(vm_object_t, vm_page_t *, int, int, int *);
+static boolean_t dev_pager_haspage(vm_object_t, vm_pindex_t, int *, int *);
 static void dev_pager_free_page(vm_object_t object, vm_page_t m);
 
 /* list of device pager objects */
@@ -99,8 +100,9 @@
 };
 
 static void
-dev_pager_init()
+dev_pager_init(void)
 {
+
 	TAILQ_INIT(&dev_pager_object_list);
 	mtx_init(&dev_pager_mtx, "dev_pager list", NULL, MTX_DEF);
 }
@@ -157,6 +159,7 @@
 		object1->pg_color = color;
 		object1->handle = handle;
 		object1->un_pager.devp.ops = ops;
+		object1->un_pager.devp.dev = handle;
 		TAILQ_INIT(&object1->un_pager.devp.devp_pglist);
 		mtx_lock(&dev_pager_mtx);
 		object = vm_pager_object_lookup(&dev_pager_object_list, handle);
@@ -204,7 +207,7 @@
 cdev_pager_free_page(vm_object_t object, vm_page_t m)
 {
 
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+	VM_OBJECT_ASSERT_WLOCKED(object);
 	if (object->type == OBJT_MGTDEVICE) {
 		KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("unmanaged %p", m));
 		pmap_remove_all(m);
@@ -219,27 +222,26 @@
 dev_pager_free_page(vm_object_t object, vm_page_t m)
 {
 
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+	VM_OBJECT_ASSERT_WLOCKED(object);
 	KASSERT((object->type == OBJT_DEVICE &&
 	    (m->oflags & VPO_UNMANAGED) != 0),
 	    ("Managed device or page obj %p m %p", object, m));
-	TAILQ_REMOVE(&object->un_pager.devp.devp_pglist, m, pageq);
+	TAILQ_REMOVE(&object->un_pager.devp.devp_pglist, m, plinks.q);
 	vm_page_putfake(m);
 }
 
 static void
-dev_pager_dealloc(object)
-	vm_object_t object;
+dev_pager_dealloc(vm_object_t object)
 {
 	vm_page_t m;
 
-	VM_OBJECT_UNLOCK(object);
-	object->un_pager.devp.ops->cdev_pg_dtor(object->handle);
+	VM_OBJECT_WUNLOCK(object);
+	object->un_pager.devp.ops->cdev_pg_dtor(object->un_pager.devp.dev);
 
 	mtx_lock(&dev_pager_mtx);
 	TAILQ_REMOVE(&dev_pager_object_list, object, pager_object_list);
 	mtx_unlock(&dev_pager_mtx);
-	VM_OBJECT_LOCK(object);
+	VM_OBJECT_WLOCK(object);
 
 	if (object->type == OBJT_DEVICE) {
 		/*
@@ -249,6 +251,8 @@
 		    != NULL)
 			dev_pager_free_page(object, m);
 	}
+	object->handle = NULL;
+	object->type = OBJT_DEAD;
 }
 
 static int
@@ -256,11 +260,11 @@
 {
 	int error, i;
 
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+	VM_OBJECT_ASSERT_WLOCKED(object);
 	error = object->un_pager.devp.ops->cdev_pg_fault(object,
 	    IDX_TO_OFF(ma[reqpage]->pindex), PROT_READ, &ma[reqpage]);
 
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+	VM_OBJECT_ASSERT_WLOCKED(object);
 
 	for (i = 0; i < count; i++) {
 		if (i != reqpage) {
@@ -278,7 +282,7 @@
 		    ("Wrong page type %p %p", ma[reqpage], object));
 		if (object->type == OBJT_DEVICE) {
 			TAILQ_INSERT_TAIL(&object->un_pager.devp.devp_pglist,
-			    ma[reqpage], pageq);
+			    ma[reqpage], plinks.q);
 		}
 	}
 
@@ -289,7 +293,6 @@
 old_dev_pager_fault(vm_object_t object, vm_ooffset_t offset, int prot,
     vm_page_t *mres)
 {
-	vm_pindex_t pidx;
 	vm_paddr_t paddr;
 	vm_page_t m_paddr, page;
 	struct cdev *dev;
@@ -296,18 +299,17 @@
 	struct cdevsw *csw;
 	struct file *fpop;
 	struct thread *td;
-	vm_memattr_t memattr;
+	vm_memattr_t memattr, memattr1;
 	int ref, ret;
 
-	pidx = OFF_TO_IDX(offset);
 	memattr = object->memattr;
 
-	VM_OBJECT_UNLOCK(object);
+	VM_OBJECT_WUNLOCK(object);
 
 	dev = object->handle;
 	csw = dev_refthread(dev, &ref);
 	if (csw == NULL) {
-		VM_OBJECT_LOCK(object);
+		VM_OBJECT_WLOCK(object);
 		return (VM_PAGER_FAIL);
 	}
 	td = curthread;
@@ -319,16 +321,24 @@
 	if (ret != 0) {
 		printf(
 	    "WARNING: dev_pager_getpage: map function returns error %d", ret);
-		VM_OBJECT_LOCK(object);
+		VM_OBJECT_WLOCK(object);
 		return (VM_PAGER_FAIL);
 	}
 
 	/* If "paddr" is a real page, perform a sanity check on "memattr". */
 	if ((m_paddr = vm_phys_paddr_to_vm_page(paddr)) != NULL &&
-	    pmap_page_get_memattr(m_paddr) != memattr) {
-		memattr = pmap_page_get_memattr(m_paddr);
-		printf(
-	    "WARNING: A device driver has set \"memattr\" inconsistently.\n");
+	    (memattr1 = pmap_page_get_memattr(m_paddr)) != memattr) {
+		/*
+		 * For the /dev/mem d_mmap routine to return the
+		 * correct memattr, pmap_page_get_memattr() needs to
+		 * be called, which we do there.
+		 */
+		if ((csw->d_flags & D_MEM) == 0) {
+			printf("WARNING: Device driver %s has set "
+			    "\"memattr\" inconsistently (drv %u pmap %u).\n",
+			    csw->d_name, memattr, memattr1);
+		}
+		memattr = memattr1;
 	}
 	if (((*mres)->flags & PG_FICTITIOUS) != 0) {
 		/*
@@ -336,7 +346,7 @@
 		 * the new physical address.
 		 */
 		page = *mres;
-		VM_OBJECT_LOCK(object);
+		VM_OBJECT_WLOCK(object);
 		vm_page_updatefake(page, paddr, memattr);
 	} else {
 		/*
@@ -344,12 +354,13 @@
 		 * free up the all of the original pages.
 		 */
 		page = vm_page_getfake(paddr, memattr);
-		VM_OBJECT_LOCK(object);
+		VM_OBJECT_WLOCK(object);
+		if (vm_page_replace(page, object, (*mres)->pindex) != *mres)
+			panic("old_dev_pager_fault: invalid page replacement");
 		vm_page_lock(*mres);
 		vm_page_free(*mres);
 		vm_page_unlock(*mres);
 		*mres = page;
-		vm_page_insert(page, object, pidx);
 	}
 	page->valid = VM_PAGE_BITS_ALL;
 	return (VM_PAGER_OK);
@@ -356,12 +367,8 @@
 }
 
 static void
-dev_pager_putpages(object, m, count, sync, rtvals)
-	vm_object_t object;
-	vm_page_t *m;
-	int count;
-	boolean_t sync;
-	int *rtvals;
+dev_pager_putpages(vm_object_t object, vm_page_t *m, int count, int flags,
+    int *rtvals)
 {
 
 	panic("dev_pager_putpage called");
@@ -368,12 +375,10 @@
 }
 
 static boolean_t
-dev_pager_haspage(object, pindex, before, after)
-	vm_object_t object;
-	vm_pindex_t pindex;
-	int *before;
-	int *after;
+dev_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before,
+    int *after)
 {
+
 	if (before != NULL)
 		*before = 0;
 	if (after != NULL)
@@ -408,6 +413,7 @@
 	 * XXX assumes VM_PROT_* == PROT_*
 	 */
 	npages = OFF_TO_IDX(size);
+	paddr = 0; /* Make paddr initialized for the case of size == 0. */
 	for (off = foff; npages--; off += PAGE_SIZE) {
 		if (csw->d_mmap(dev, off, &paddr, (int)prot, &dummy) != 0) {
 			dev_relthread(dev, ref);
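
The fault-path change above (mirrored in sg_pager.c below) is worth calling out: instead of freeing the original page and then calling vm_page_insert() for the fake page, the pager now calls vm_page_replace(), which installs the new page at the same pindex under the object write lock and returns the page it displaced, so callers can panic if the displaced page is not the one they expected. A condensed sketch of the idiom, with the driver details omitted (hypothetical helper name):

    #include <sys/param.h>
    #include <sys/systm.h>
    #include <sys/rwlock.h>
    #include <vm/vm.h>
    #include <vm/vm_object.h>
    #include <vm/vm_page.h>

    /* Hypothetical: install a fake page describing paddr in place of the
     * page the fault handler was handed, then free the original. */
    static vm_page_t
    example_replace_with_fake(vm_object_t object, vm_page_t *mres,
        vm_paddr_t paddr, vm_memattr_t memattr)
    {
            vm_page_t page;

            page = vm_page_getfake(paddr, memattr);
            VM_OBJECT_WLOCK(object);
            if (vm_page_replace(page, object, (*mres)->pindex) != *mres)
                    panic("example: replaced the wrong page");
            vm_page_lock(*mres);
            vm_page_free(*mres);
            vm_page_unlock(*mres);
            *mres = page;
            page->valid = VM_PAGE_BITS_ALL;
            VM_OBJECT_WUNLOCK(object);
            return (page);
    }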

Modified: trunk/sys/vm/memguard.c
===================================================================
--- trunk/sys/vm/memguard.c	2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/memguard.c	2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 2005, Bosko Milekic <bmilekic at FreeBSD.org>.
  * Copyright (c) 2010 Isilon Systems, Inc. (http://www.isilon.com/)
@@ -26,7 +27,7 @@
  */
 
 #include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/vm/memguard.c 325037 2017-10-27 14:23:53Z markj $");
 
 /*
  * MemGuard is a simple replacement allocator for debugging only
@@ -48,6 +49,7 @@
 #include <sys/mutex.h>
 #include <sys/malloc.h>
 #include <sys/sysctl.h>
+#include <sys/vmem.h>
 
 #include <vm/vm.h>
 #include <vm/uma.h>
@@ -55,7 +57,9 @@
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
+#include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
+#include <vm/uma_int.h>
 #include <vm/memguard.h>
 
 static SYSCTL_NODE(_vm, OID_AUTO, memguard, CTLFLAG_RW, NULL, "MemGuard data");
@@ -86,9 +90,7 @@
 		return (error);
 
 	mtx_lock(&malloc_mtx);
-	/*
-	 * If mtp is NULL, it will be initialized in memguard_cmp().
-	 */
+	/* If mtp is NULL, it will be initialized in memguard_cmp() */
 	vm_memguard_mtype = malloc_desc2type(desc);
 	strlcpy(vm_memguard_desc, desc, sizeof(vm_memguard_desc));
 	mtx_unlock(&malloc_mtx);
@@ -98,8 +100,8 @@
     CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0,
     memguard_sysctl_desc, "A", "Short description of memory type to monitor");
 
-static vm_map_t memguard_map = NULL;
 static vm_offset_t memguard_cursor;
+static vm_offset_t memguard_base;
 static vm_size_t memguard_mapsize;
 static vm_size_t memguard_physlimit;
 static u_long memguard_wasted;
@@ -111,7 +113,7 @@
 SYSCTL_ULONG(_vm_memguard, OID_AUTO, cursor, CTLFLAG_RD,
     &memguard_cursor, 0, "MemGuard cursor");
 SYSCTL_ULONG(_vm_memguard, OID_AUTO, mapsize, CTLFLAG_RD,
-    &memguard_mapsize, 0, "MemGuard private vm_map size");
+    &memguard_mapsize, 0, "MemGuard private arena size");
 SYSCTL_ULONG(_vm_memguard, OID_AUTO, phys_limit, CTLFLAG_RD,
     &memguard_physlimit, 0, "Limit on MemGuard memory consumption");
 SYSCTL_ULONG(_vm_memguard, OID_AUTO, wasted, CTLFLAG_RD,
@@ -125,15 +127,17 @@
 SYSCTL_ULONG(_vm_memguard, OID_AUTO, fail_pgs, CTLFLAG_RD,
     &memguard_fail_pgs, 0, "MemGuard failures due to lack of pages");
 
-#define MG_GUARD	0x001
-#define MG_ALLLARGE	0x002
-static int memguard_options = MG_GUARD;
+#define MG_GUARD_AROUND		0x001
+#define MG_GUARD_ALLLARGE	0x002
+#define MG_GUARD_NOFREE		0x004
+static int memguard_options = MG_GUARD_AROUND;
 TUNABLE_INT("vm.memguard.options", &memguard_options);
 SYSCTL_INT(_vm_memguard, OID_AUTO, options, CTLFLAG_RW,
     &memguard_options, 0,
     "MemGuard options:\n"
     "\t0x001 - add guard pages around each allocation\n"
-    "\t0x002 - always use MemGuard for allocations over a page");
+    "\t0x002 - always use MemGuard for allocations over a page\n"
+    "\t0x004 - guard uma(9) zones with UMA_ZONE_NOFREE flag");
 
 static u_int memguard_minsize;
 static u_long memguard_minsize_reject;
@@ -197,21 +201,18 @@
  * out of a single VM map (contiguous chunk of address space).
  */
 void
-memguard_init(vm_map_t parent_map)
+memguard_init(vmem_t *parent)
 {
-	vm_offset_t base, limit;
+	vm_offset_t base;
 
-	memguard_map = kmem_suballoc(parent_map, &base, &limit,
-	    memguard_mapsize, FALSE);
-	memguard_map->system_map = 1;
-	KASSERT(memguard_mapsize == limit - base,
-	    ("Expected %lu, got %lu", (u_long)memguard_mapsize,
-	     (u_long)(limit - base)));
+	vmem_alloc(parent, memguard_mapsize, M_BESTFIT | M_WAITOK, &base);
+	vmem_init(memguard_arena, "memguard arena", base, memguard_mapsize,
+	    PAGE_SIZE, 0, M_WAITOK);
 	memguard_cursor = base;
+	memguard_base = base;
 
 	printf("MEMGUARD DEBUGGING ALLOCATOR INITIALIZED:\n");
 	printf("\tMEMGUARD map base: 0x%lx\n", (u_long)base);
-	printf("\tMEMGUARD map limit: 0x%lx\n", (u_long)limit);
 	printf("\tMEMGUARD map size: %jd KBytes\n",
 	    (uintmax_t)memguard_mapsize >> 10);
 }
@@ -226,12 +227,14 @@
 
 	parent = SYSCTL_STATIC_CHILDREN(_vm_memguard);
 
-	SYSCTL_ADD_ULONG(NULL, parent, OID_AUTO, "mapstart", CTLFLAG_RD,
-	    &memguard_map->min_offset, "MemGuard KVA base");
-	SYSCTL_ADD_ULONG(NULL, parent, OID_AUTO, "maplimit", CTLFLAG_RD,
-	    &memguard_map->max_offset, "MemGuard KVA end");
+	SYSCTL_ADD_UAUTO(NULL, parent, OID_AUTO, "mapstart", CTLFLAG_RD,
+	    &memguard_base, "MemGuard KVA base");
+	SYSCTL_ADD_UAUTO(NULL, parent, OID_AUTO, "maplimit", CTLFLAG_RD,
+	    &memguard_mapsize, "MemGuard KVA size");
+#if 0
 	SYSCTL_ADD_ULONG(NULL, parent, OID_AUTO, "mapused", CTLFLAG_RD,
 	    &memguard_map->size, "MemGuard KVA used");
+#endif
 }
 SYSINIT(memguard, SI_SUB_KLD, SI_ORDER_ANY, memguard_sysinit, NULL);
 
@@ -257,9 +260,24 @@
 	p = PHYS_TO_VM_PAGE(pa);
 	KASSERT(p->wire_count != 0 && p->queue == PQ_NONE,
 	    ("MEMGUARD: Expected wired page %p in vtomgfifo!", p));
-	return ((u_long *)&p->pageq.tqe_next);
+	return (&p->plinks.memguard.p);
 }
 
+static u_long *
+v2sizev(vm_offset_t va)
+{
+	vm_paddr_t pa;
+	struct vm_page *p;
+
+	pa = pmap_kextract(va);
+	if (pa == 0)
+		panic("MemGuard detected double-free of %p", (void *)va);
+	p = PHYS_TO_VM_PAGE(pa);
+	KASSERT(p->wire_count != 0 && p->queue == PQ_NONE,
+	    ("MEMGUARD: Expected wired page %p in vtomgfifo!", p));
+	return (&p->plinks.memguard.v);
+}
+
 /*
  * Allocate a single object of specified size with specified flags
  * (either M_WAITOK or M_NOWAIT).
@@ -267,7 +285,7 @@
 void *
 memguard_alloc(unsigned long req_size, int flags)
 {
-	vm_offset_t addr;
+	vm_offset_t addr, origaddr;
 	u_long size_p, size_v;
 	int do_guard, rv;
 
@@ -282,11 +300,10 @@
 	 * value.
 	 */
 	size_v = size_p;
-	do_guard = (memguard_options & MG_GUARD) != 0;
+	do_guard = (memguard_options & MG_GUARD_AROUND) != 0;
 	if (do_guard)
 		size_v += 2 * PAGE_SIZE;
 
-	vm_map_lock(memguard_map);
 	/*
 	 * When we pass our memory limit, reject sub-page allocations.
 	 * Page-size and larger allocations will use the same amount
@@ -293,7 +310,7 @@
 	 * of physical memory whether we allocate or hand off to
 	 * uma_large_alloc(), so keep those.
 	 */
-	if (memguard_map->size >= memguard_physlimit &&
+	if (vmem_size(memguard_arena, VMEM_ALLOC) >= memguard_physlimit &&
 	    req_size < PAGE_SIZE) {
 		addr = (vm_offset_t)NULL;
 		memguard_fail_pgs++;
@@ -310,9 +327,9 @@
 	 * map, unless vm_map_findspace() is tweaked.
 	 */
 	for (;;) {
-		rv = vm_map_findspace(memguard_map, memguard_cursor,
-		    size_v, &addr);
-		if (rv == KERN_SUCCESS)
+		if (vmem_xalloc(memguard_arena, size_v, 0, 0, 0,
+		    memguard_cursor, VMEM_ADDR_MAX,
+		    M_BESTFIT | M_NOWAIT, &origaddr) == 0)
 			break;
 		/*
 		 * The map has no space.  This may be due to
@@ -319,24 +336,27 @@
 		 * fragmentation, or because the cursor is near the
 		 * end of the map.
 		 */
-		if (memguard_cursor == vm_map_min(memguard_map)) {
+		if (memguard_cursor == memguard_base) {
 			memguard_fail_kva++;
 			addr = (vm_offset_t)NULL;
 			goto out;
 		}
 		memguard_wrap++;
-		memguard_cursor = vm_map_min(memguard_map);
+		memguard_cursor = memguard_base;
 	}
+	addr = origaddr;
 	if (do_guard)
 		addr += PAGE_SIZE;
-	rv = kmem_back(memguard_map, addr, size_p, flags);
+	rv = kmem_back(kmem_object, addr, size_p, flags);
 	if (rv != KERN_SUCCESS) {
+		vmem_xfree(memguard_arena, origaddr, size_v);
 		memguard_fail_pgs++;
 		addr = (vm_offset_t)NULL;
 		goto out;
 	}
-	memguard_cursor = addr + size_p;
+	memguard_cursor = addr + size_v;
 	*v2sizep(trunc_page(addr)) = req_size;
+	*v2sizev(trunc_page(addr)) = size_v;
 	memguard_succ++;
 	if (req_size < PAGE_SIZE) {
 		memguard_wasted += (PAGE_SIZE - req_size);
@@ -351,7 +371,6 @@
 		}
 	}
 out:
-	vm_map_unlock(memguard_map);
 	return ((void *)addr);
 }
 
@@ -360,7 +379,7 @@
 {
 	vm_offset_t a = (vm_offset_t)(uintptr_t)addr;
 
-	return (a >= memguard_map->min_offset && a < memguard_map->max_offset);
+	return (a >= memguard_base && a < memguard_base + memguard_mapsize);
 }
 
 /*
@@ -370,12 +389,13 @@
 memguard_free(void *ptr)
 {
 	vm_offset_t addr;
-	u_long req_size, size;
+	u_long req_size, size, sizev;
 	char *temp;
 	int i;
 
 	addr = trunc_page((uintptr_t)ptr);
 	req_size = *v2sizep(addr);
+	sizev = *v2sizev(addr);
 	size = round_page(req_size);
 
 	/*
@@ -397,11 +417,12 @@
 	 * vm_map lock to serialize updates to memguard_wasted, since
 	 * we had the lock at increment.
 	 */
-	vm_map_lock(memguard_map);
+	kmem_unback(kmem_object, addr, size);
+	if (sizev > size)
+		addr -= PAGE_SIZE;
+	vmem_xfree(memguard_arena, addr, sizev);
 	if (req_size < PAGE_SIZE)
 		memguard_wasted -= (PAGE_SIZE - req_size);
-	(void)vm_map_delete(memguard_map, addr, addr + size);
-	vm_map_unlock(memguard_map);
 }
 
 /*
@@ -429,8 +450,8 @@
 	return (newaddr);
 }
 
-int
-memguard_cmp(struct malloc_type *mtp, unsigned long size)
+static int
+memguard_cmp(unsigned long size)
 {
 
 	if (size < memguard_minsize) {
@@ -437,7 +458,7 @@
 		memguard_minsize_reject++;
 		return (0);
 	}
-	if ((memguard_options & MG_ALLLARGE) != 0 && size >= PAGE_SIZE)
+	if ((memguard_options & MG_GUARD_ALLLARGE) != 0 && size >= PAGE_SIZE)
 		return (1);
 	if (memguard_frequency > 0 &&
 	    (random() % 100000) < memguard_frequency) {
@@ -444,6 +465,17 @@
 		memguard_frequency_hits++;
 		return (1);
 	}
+
+	return (0);
+}
+
+int
+memguard_cmp_mtp(struct malloc_type *mtp, unsigned long size)
+{
+
+	if (memguard_cmp(size))
+		return(1);
+
 #if 1
 	/*
 	 * The safest way of comparsion is to always compare short description
@@ -467,3 +499,21 @@
 	return (0);
 #endif
 }
+
+int
+memguard_cmp_zone(uma_zone_t zone)
+{
+
+	if ((memguard_options & MG_GUARD_NOFREE) == 0 &&
+	    zone->uz_flags & UMA_ZONE_NOFREE)
+		return (0);
+
+	if (memguard_cmp(zone->uz_size))
+		return (1);
+
+	/*
+	 * The safest way of comparsion is to always compare zone name,
+	 * but it is also the slowest way.
+	 */
+	return (strcmp(zone->uz_name, vm_memguard_desc) == 0);
+}
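
Summary of the MemGuard rework above: the private vm_map is gone, replaced by a vmem(9) arena seeded from the parent arena in memguard_init(); KVA now comes from vmem_xalloc() starting at the cursor, pages are attached and detached with kmem_back()/kmem_unback(), and each allocation records both the requested size and the reserved KVA size. With MG_GUARD_AROUND set, the reservation is two pages larger than what gets backed, leaving unmapped guard pages on either side. A small sketch of that size arithmetic, assuming the option is enabled (hypothetical helper, no allocation performed):

    #include <sys/param.h>

    /* Hypothetical: sizes MemGuard would use for a guarded allocation.
     *
     *   origaddr                                 origaddr + size_v
     *   | guard page | size_p bytes backed ...   | guard page |
     *                 ^ address handed back to the caller
     */
    static void
    example_guard_sizes(unsigned long req_size, u_long *size_p, u_long *size_v)
    {

            *size_p = round_page(req_size);        /* physically backed */
            *size_v = *size_p + 2 * PAGE_SIZE;     /* plus both guard pages */
    }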

Modified: trunk/sys/vm/memguard.h
===================================================================
--- trunk/sys/vm/memguard.h	2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/memguard.h	2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 2005,
  *     Bosko Milekic <bmilekic at FreeBSD.org>.  All rights reserved.
@@ -23,7 +24,7 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/vm/memguard.h 254025 2013-08-07 06:21:20Z jeff $
  */
 
 #ifndef _VM_MEMGUARD_H_
@@ -33,14 +34,16 @@
 
 struct malloc_type;
 struct vm_map;
+struct vmem;
 
 #ifdef DEBUG_MEMGUARD
 unsigned long	memguard_fudge(unsigned long, const struct vm_map *);
-void	memguard_init(struct vm_map *);
+void	memguard_init(struct vmem *);
 void 	*memguard_alloc(unsigned long, int);
 void	*memguard_realloc(void *, unsigned long, struct malloc_type *, int);
 void	memguard_free(void *);
-int	memguard_cmp(struct malloc_type *, unsigned long);
+int	memguard_cmp_mtp(struct malloc_type *, unsigned long);
+int	memguard_cmp_zone(uma_zone_t);
 int	is_memguard_addr(void *);
 #else
 #define	memguard_fudge(size, xxx)	(size)
@@ -48,7 +51,8 @@
 #define	memguard_alloc(size, flags)	NULL
 #define	memguard_realloc(a, s, mtp, f)	NULL
 #define	memguard_free(addr)		do { } while (0)
-#define	memguard_cmp(mtp, size)		0
+#define	memguard_cmp_mtp(mtp, size)	0
+#define	memguard_cmp_zone(zone)		0
 #define	is_memguard_addr(addr)		0
 #endif
 

Modified: trunk/sys/vm/phys_pager.c
===================================================================
--- trunk/sys/vm/phys_pager.c	2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/phys_pager.c	2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 2000 Peter Wemm
  *
@@ -24,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/vm/phys_pager.c 310110 2016-12-15 10:47:35Z kib $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -34,9 +35,11 @@
 #include <sys/proc.h>
 #include <sys/mutex.h>
 #include <sys/mman.h>
+#include <sys/rwlock.h>
 #include <sys/sysctl.h>
 
 #include <vm/vm.h>
+#include <vm/vm_param.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
@@ -54,9 +57,6 @@
 	mtx_init(&phys_pager_mtx, "phys_pager list", NULL, MTX_DEF);
 }
 
-/*
- * MPSAFE
- */
 static vm_object_t
 phys_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
     vm_ooffset_t foff, struct ucred *cred)
@@ -99,8 +99,8 @@
 				object = object1;
 				object1 = NULL;
 				object->handle = handle;
-				TAILQ_INSERT_TAIL(&phys_pager_object_list, object,
-				    pager_object_list);
+				TAILQ_INSERT_TAIL(&phys_pager_object_list,
+				    object, pager_object_list);
 			}
 		} else {
 			if (pindex > object->size)
@@ -115,20 +115,19 @@
 	return (object);
 }
 
-/*
- * MPSAFE
- */
 static void
 phys_pager_dealloc(vm_object_t object)
 {
 
 	if (object->handle != NULL) {
-		VM_OBJECT_UNLOCK(object);
+		VM_OBJECT_WUNLOCK(object);
 		mtx_lock(&phys_pager_mtx);
 		TAILQ_REMOVE(&phys_pager_object_list, object, pager_object_list);
 		mtx_unlock(&phys_pager_mtx);
-		VM_OBJECT_LOCK(object);
+		VM_OBJECT_WLOCK(object);
 	}
+	object->handle = NULL;
+	object->type = OBJT_DEAD;
 }
 
 /*
@@ -139,7 +138,7 @@
 {
 	int i;
 
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+	VM_OBJECT_ASSERT_WLOCKED(object);
 	for (i = 0; i < count; i++) {
 		if (m[i]->valid == 0) {
 			if ((m[i]->flags & PG_ZERO) == 0)
@@ -151,10 +150,12 @@
 		KASSERT(m[i]->dirty == 0,
 		    ("phys_pager_getpages: dirty page %p", m[i]));
 		/* The requested page must remain busy, the others not. */
-		if (i == reqpage)
+		if (i == reqpage) {
+			vm_page_lock(m[i]);
 			vm_page_flash(m[i]);
-		else
-			vm_page_wakeup(m[i]);
+			vm_page_unlock(m[i]);
+		} else
+			vm_page_xunbusy(m[i]);
 	}
 	return (VM_PAGER_OK);
 }
@@ -161,7 +162,7 @@
 
 static void
 phys_pager_putpages(vm_object_t object, vm_page_t *m, int count, boolean_t sync,
-		    int *rtvals)
+    int *rtvals)
 {
 
 	panic("phys_pager_putpage called");
@@ -179,7 +180,7 @@
 #endif
 static boolean_t
 phys_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before,
-		   int *after)
+    int *after)
 {
 	vm_pindex_t base, end;
 

Modified: trunk/sys/vm/pmap.h
===================================================================
--- trunk/sys/vm/pmap.h	2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/pmap.h	2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
@@ -57,7 +58,7 @@
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  *
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/vm/pmap.h 270920 2014-09-01 07:58:15Z kib $
  */
 
 /*
@@ -97,21 +98,25 @@
  */
 extern vm_offset_t kernel_vm_end;
 
+/*
+ * Flags for pmap_enter().  The bits in the low-order byte are reserved
+ * for the protection code (vm_prot_t) that describes the fault type.
+ */
+#define	PMAP_ENTER_NOSLEEP	0x0100
+#define	PMAP_ENTER_WIRED	0x0200
+
 void		 pmap_activate(struct thread *td);
+void		 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
+		    int advice);
 void		 pmap_align_superpage(vm_object_t, vm_ooffset_t, vm_offset_t *,
 		    vm_size_t);
-#if defined(__mips__)
-void		 pmap_align_tlb(vm_offset_t *);
-#endif
-void		 pmap_change_wiring(pmap_t, vm_offset_t, boolean_t);
 void		 pmap_clear_modify(vm_page_t m);
-void		 pmap_clear_reference(vm_page_t m);
 void		 pmap_copy(pmap_t, pmap_t, vm_offset_t, vm_size_t, vm_offset_t);
 void		 pmap_copy_page(vm_page_t, vm_page_t);
 void		 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset,
 		    vm_page_t mb[], vm_offset_t b_offset, int xfersize);
-void		 pmap_enter(pmap_t, vm_offset_t, vm_prot_t, vm_page_t,
-		    vm_prot_t, boolean_t);
+int		 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m,
+		    vm_prot_t prot, u_int flags, int8_t psind);
 void		 pmap_enter_object(pmap_t pmap, vm_offset_t start,
 		    vm_offset_t end, vm_page_t m_start, vm_prot_t prot);
 void		 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m,
@@ -144,6 +149,7 @@
 void		 pmap_remove_write(vm_page_t m);
 void		 pmap_sync_icache(pmap_t, vm_offset_t, vm_size_t);
 boolean_t	 pmap_ts_referenced(vm_page_t m);
+void		 pmap_unwire(pmap_t pmap, vm_offset_t start, vm_offset_t end);
 void		 pmap_zero_page(vm_page_t);
 void		 pmap_zero_page_area(vm_page_t, int off, int size);
 void		 pmap_zero_page_idle(vm_page_t);
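
The pmap_enter() prototype change above is the notable one: it now returns a KERN_* status and takes a "u_int flags" argument whose low-order byte is reserved for the vm_prot_t describing the fault type, with PMAP_ENTER_NOSLEEP and PMAP_ENTER_WIRED or'ed in above that, plus a trailing superpage index. A minimal sketch of the new calling convention (hypothetical wrapper; psind 0 requests a base-page mapping):

    #include <sys/param.h>
    #include <vm/vm.h>
    #include <vm/pmap.h>
    #include <vm/vm_page.h>

    /* Hypothetical: enter a wired mapping without sleeping; with
     * PMAP_ENTER_NOSLEEP the caller must be prepared for a
     * KERN_RESOURCE_SHORTAGE return if page-table pages are unavailable. */
    static int
    example_enter_wired(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
    {
            u_int flags;

            flags = prot | PMAP_ENTER_WIRED | PMAP_ENTER_NOSLEEP;
            return (pmap_enter(pmap, va, m, prot, flags, 0));
    }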

Modified: trunk/sys/vm/redzone.c
===================================================================
--- trunk/sys/vm/redzone.c	2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/redzone.c	2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 2006 Pawel Jakub Dawidek <pjd at FreeBSD.org>
  * All rights reserved.
@@ -25,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/vm/redzone.c 227309 2011-11-07 15:43:11Z ed $");
 
 #include <sys/param.h>
 #include <sys/systm.h>

Modified: trunk/sys/vm/redzone.h
===================================================================
--- trunk/sys/vm/redzone.h	2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/redzone.h	2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 2006 Pawel Jakub Dawidek <pjd at FreeBSD.org>
  * All rights reserved.
@@ -23,7 +24,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/vm/redzone.h 155086 2006-01-31 11:09:21Z pjd $
  */
 
 #ifndef	_VM_REDZONE_H_

Modified: trunk/sys/vm/sg_pager.c
===================================================================
--- trunk/sys/vm/sg_pager.c	2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/sg_pager.c	2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,5 +1,6 @@
+/* $MidnightBSD$ */
 /*-
- * Copyright (c) 2009 Advanced Computing Technologies LLC
+ * Copyright (c) 2009 Hudson River Trading LLC
  * Written by: John H. Baldwin <jhb at FreeBSD.org>
  * All rights reserved.
  *
@@ -26,7 +27,7 @@
  */
 
 #include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/vm/sg_pager.c 284100 2015-06-06 20:37:40Z jhb $");
 
 /*
  * This pager manages OBJT_SG objects.  These objects are backed by
@@ -36,6 +37,7 @@
 #include <sys/param.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
+#include <sys/rwlock.h>
 #include <sys/sglist.h>
 #include <vm/vm.h>
 #include <vm/vm_param.h>
@@ -42,6 +44,7 @@
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
+#include <vm/vm_phys.h>
 #include <vm/uma.h>
 
 static vm_object_t sg_pager_alloc(void *, vm_ooffset_t, vm_prot_t,
@@ -122,12 +125,14 @@
 	 * Free up our fake pages.
 	 */
 	while ((m = TAILQ_FIRST(&object->un_pager.sgp.sgp_pglist)) != 0) {
-		TAILQ_REMOVE(&object->un_pager.sgp.sgp_pglist, m, pageq);
+		TAILQ_REMOVE(&object->un_pager.sgp.sgp_pglist, m, plinks.q);
 		vm_page_putfake(m);
 	}
 	
 	sg = object->handle;
 	sglist_free(sg);
+	object->handle = NULL;
+	object->type = OBJT_DEAD;
 }
 
 static int
@@ -141,10 +146,10 @@
 	size_t space;
 	int i;
 
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+	VM_OBJECT_ASSERT_WLOCKED(object);
 	sg = object->handle;
 	memattr = object->memattr;
-	VM_OBJECT_UNLOCK(object);
+	VM_OBJECT_WUNLOCK(object);
 	offset = m[reqpage]->pindex;
 
 	/*
@@ -179,16 +184,18 @@
 
 	/* Construct a new fake page. */
 	page = vm_page_getfake(paddr, memattr);
-	VM_OBJECT_LOCK(object);
-	TAILQ_INSERT_TAIL(&object->un_pager.sgp.sgp_pglist, page, pageq);
+	VM_OBJECT_WLOCK(object);
+	TAILQ_INSERT_TAIL(&object->un_pager.sgp.sgp_pglist, page, plinks.q);
 
 	/* Free the original pages and insert this fake page into the object. */
 	for (i = 0; i < count; i++) {
+		if (i == reqpage &&
+		    vm_page_replace(page, object, offset) != m[i])
+			panic("sg_pager_getpages: invalid place replacement");
 		vm_page_lock(m[i]);
 		vm_page_free(m[i]);
 		vm_page_unlock(m[i]);
 	}
-	vm_page_insert(page, object, offset);
 	m[reqpage] = page;
 	page->valid = VM_PAGE_BITS_ALL;
 

Modified: trunk/sys/vm/swap_pager.c
===================================================================
--- trunk/sys/vm/swap_pager.c	2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/swap_pager.c	2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1998 Matthew Dillon,
  * Copyright (c) 1994 John S. Dyson
@@ -50,7 +51,7 @@
  *
  *	- on the fly reallocation of swap during putpages.  The new system
  *	  does not try to keep previously allocated swap blocks for dirty
- *	  pages.  
+ *	  pages.
  *
  *	- on the fly deallocation of swap
  *
@@ -67,7 +68,7 @@
  */
 
 #include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/vm/swap_pager.c 320557 2017-07-01 22:21:11Z alc $");
 
 #include "opt_swap.h"
 #include "opt_vm.h"
@@ -89,6 +90,7 @@
 #include <sys/racct.h>
 #include <sys/resource.h>
 #include <sys/resourcevar.h>
+#include <sys/rwlock.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/blist.h>
@@ -114,9 +116,8 @@
 #include <geom/geom.h>
 
 /*
- * SWB_NPAGES must be a power of 2.  It may be set to 1, 2, 4, 8, 16
- * or 32 pages per allocation.
- * The 32-page limit is due to the radix code (kern/subr_blist.c).
+ * MAX_PAGEOUT_CLUSTER must be a power of 2 between 1 and 64.
+ * The 64-page limit is due to the radix code (kern/subr_blist.c).
  */
 #ifndef MAX_PAGEOUT_CLUSTER
 #define MAX_PAGEOUT_CLUSTER 16
@@ -133,7 +134,6 @@
  * Unused disk addresses within a swap area are allocated and managed
  * using a blist.
  */
-#define SWCORRECT(n) (sizeof(void *) * (n) / sizeof(daddr_t))
 #define SWAP_META_PAGES		(SWB_NPAGES * 2)
 #define SWAP_META_MASK		(SWAP_META_PAGES - 1)
 
@@ -154,15 +154,21 @@
 static int swdev_syscall_active = 0; /* serialize swap(on|off) */
 
 static vm_ooffset_t swap_total;
-SYSCTL_QUAD(_vm, OID_AUTO, swap_total, CTLFLAG_RD, &swap_total, 0, 
+SYSCTL_QUAD(_vm, OID_AUTO, swap_total, CTLFLAG_RD, &swap_total, 0,
     "Total amount of available swap storage.");
 static vm_ooffset_t swap_reserved;
-SYSCTL_QUAD(_vm, OID_AUTO, swap_reserved, CTLFLAG_RD, &swap_reserved, 0, 
+SYSCTL_QUAD(_vm, OID_AUTO, swap_reserved, CTLFLAG_RD, &swap_reserved, 0,
     "Amount of swap storage needed to back all allocated anonymous memory.");
 static int overcommit = 0;
-SYSCTL_INT(_vm, OID_AUTO, overcommit, CTLFLAG_RW, &overcommit, 0, 
+SYSCTL_INT(_vm, OID_AUTO, overcommit, CTLFLAG_RW, &overcommit, 0,
     "Configure virtual memory overcommit behavior. See tuning(7) "
     "for details.");
+static unsigned long swzone;
+SYSCTL_ULONG(_vm, OID_AUTO, swzone, CTLFLAG_RD, &swzone, 0,
+    "Actual size of swap metadata zone");
+static unsigned long swap_maxpages;
+SYSCTL_ULONG(_vm, OID_AUTO, swap_maxpages, CTLFLAG_RD, &swap_maxpages, 0,
+    "Maximum amount of swap supported");
 
 /* bits from overcommit */
 #define	SWAP_RESERVE_FORCE_ON		(1 << 0)
@@ -184,7 +190,7 @@
 	static int curfail;
 	static struct timeval lastfail;
 	struct uidinfo *uip;
-	
+
 	uip = cred->cr_ruidinfo;
 
 	if (incr & PAGE_MASK)
@@ -191,11 +197,13 @@
 		panic("swap_reserve: & PAGE_MASK");
 
 #ifdef RACCT
-	PROC_LOCK(curproc);
-	error = racct_add(curproc, RACCT_SWAP, incr);
-	PROC_UNLOCK(curproc);
-	if (error != 0)
-		return (0);
+	if (racct_enable) {
+		PROC_LOCK(curproc);
+		error = racct_add(curproc, RACCT_SWAP, incr);
+		PROC_UNLOCK(curproc);
+		if (error != 0)
+			return (0);
+	}
 #endif
 
 	res = 0;
@@ -285,7 +293,7 @@
 swap_release_by_cred(vm_ooffset_t decr, struct ucred *cred)
 {
  	struct uidinfo *uip;
-	
+
 	uip = cred->cr_ruidinfo;
 
 	if (decr & PAGE_MASK)
@@ -328,7 +336,7 @@
 
 
 SYSCTL_INT(_vm, OID_AUTO, swap_async_max,
-        CTLFLAG_RW, &swap_async_max, 0, "Maximum running async swap ops");
+	CTLFLAG_RW, &swap_async_max, 0, "Maximum running async swap ops");
 
 /*
  * "named" and "unnamed" anon region objects.  Try to reduce the overhead
@@ -340,10 +348,9 @@
 #define NOBJLIST(handle)	\
 	(&swap_pager_object_list[((int)(intptr_t)handle >> 4) & (NOBJLISTS-1)])
 
-static struct mtx sw_alloc_mtx;	/* protect list manipulation */ 
+static struct mtx sw_alloc_mtx;	/* protect list manipulation */
 static struct pagerlst	swap_pager_object_list[NOBJLISTS];
 static uma_zone_t	swap_zone;
-static struct vm_object	swap_zone_obj;
 
 /*
  * pagerops for OBJT_SWAP - "swap pager".  Some ops are also global procedure
@@ -373,18 +380,14 @@
 };
 
 /*
- * dmmax is in page-sized chunks with the new swap system.  It was
- * dev-bsized chunks in the old.  dmmax is always a power of 2.
- *
  * swap_*() routines are externally accessible.  swp_*() routines are
  * internal.
  */
-static int dmmax;
 static int nswap_lowat = 128;	/* in pages, swap_pager_almost_full warn */
 static int nswap_hiwat = 512;	/* in pages, swap_pager_almost_full warn */
 
-SYSCTL_INT(_vm, OID_AUTO, dmmax,
-	CTLFLAG_RD, &dmmax, 0, "Maximum size of a swap block");
+SYSCTL_INT(_vm, OID_AUTO, dmmax, CTLFLAG_RD, &nsw_cluster_max, 0,
+    "Maximum size of a swap block in pages");
 
 static void	swp_sizecheck(void);
 static void	swp_pager_async_iodone(struct buf *bp);
@@ -419,7 +422,7 @@
 
 /*
  * SWP_SIZECHECK() -	update swap_pager_full indication
- *	
+ *
  *	update the swap_pager_almost_full indication and warn when we are
  *	about to run out of swap space, using lowat/hiwat hysteresis.
  *
@@ -474,7 +477,7 @@
 /*
  * SWAP_PAGER_INIT() -	initialize the swap pager!
  *
- *	Expected to be started from system init.  NOTE:  This code is run 
+ *	Expected to be started from system init.  NOTE:  This code is run
  *	before much else so be careful what you depend on.  Most of the VM
  *	system has yet to be initialized at this point.
  */
@@ -490,11 +493,7 @@
 		TAILQ_INIT(&swap_pager_object_list[i]);
 	mtx_init(&sw_alloc_mtx, "swap_pager list", NULL, MTX_DEF);
 	mtx_init(&sw_dev_mtx, "swapdev", NULL, MTX_DEF);
-
-	/*
-	 * Device Stripe, in PAGE_SIZE'd blocks
-	 */
-	dmmax = SWB_NPAGES * 2;
+	sx_init(&sw_alloc_sx, "swspsx");
 }
 
 /*
@@ -506,7 +505,7 @@
 void
 swap_pager_swap_init(void)
 {
-	int n, n2;
+	unsigned long n, n2;
 
 	/*
 	 * Number of in-transit swap bp operations.  Don't
@@ -519,7 +518,7 @@
 	 * MAX_PAGEOUT_CLUSTER.   Also be aware that swap ops are
 	 * constrained by the swap device interleave stripe size.
 	 *
-	 * Currently we hardwire nsw_wcount_async to 4.  This limit is 
+	 * Currently we hardwire nsw_wcount_async to 4.  This limit is
 	 * designed to prevent other I/O from having high latencies due to
 	 * our pageout I/O.  The value 4 works well for one or two active swap
 	 * devices but is probably a little low if you have more.  Even so,
@@ -542,7 +541,7 @@
 	/*
 	 * Initialize our zone.  Right now I'm just guessing on the number
 	 * we need based on the number of pages in the system.  Each swblock
-	 * can hold 16 pages, so this is probably overkill.  This reservation
+	 * can hold 32 pages, so this is probably overkill.  This reservation
 	 * is typically limited to around 32MB by default.
 	 */
 	n = cnt.v_page_count / 2;
@@ -554,7 +553,7 @@
 	if (swap_zone == NULL)
 		panic("failed to create swap_zone.");
 	do {
-		if (uma_zone_set_obj(swap_zone, &swap_zone_obj, n))
+		if (uma_zone_reserve_kva(swap_zone, n))
 			break;
 		/*
 		 * if the allocation failed, try a zone two thirds the
@@ -563,12 +562,14 @@
 		n -= ((n + 2) / 3);
 	} while (n > 0);
 	if (n2 != n)
-		printf("Swap zone entries reduced from %d to %d.\n", n2, n);
+		printf("Swap zone entries reduced from %lu to %lu.\n", n2, n);
+	swap_maxpages = n * SWAP_META_PAGES;
+	swzone = n * sizeof(struct swblock);
 	n2 = n;
 
 	/*
 	 * Initialize our meta-data hash table.  The swapper does not need to
-	 * be quite as efficient as the VM system, so we do not use an 
+	 * be quite as efficient as the VM system, so we do not use an
 	 * oversized hash table.
 	 *
 	 * 	n: 		size of hash table, must be power of 2
@@ -622,7 +623,7 @@
 				crhold(cred);
 			}
 			object = vm_object_allocate(OBJT_DEFAULT, pindex);
-			VM_OBJECT_LOCK(object);
+			VM_OBJECT_WLOCK(object);
 			object->handle = handle;
 			if (cred != NULL) {
 				object->cred = cred;
@@ -629,7 +630,7 @@
 				object->charge = size;
 			}
 			swp_pager_meta_build(object, 0, SWAPBLK_NONE);
-			VM_OBJECT_UNLOCK(object);
+			VM_OBJECT_WUNLOCK(object);
 		}
 		sx_xunlock(&sw_alloc_sx);
 		mtx_unlock(&Giant);
@@ -640,13 +641,13 @@
 			crhold(cred);
 		}
 		object = vm_object_allocate(OBJT_DEFAULT, pindex);
-		VM_OBJECT_LOCK(object);
+		VM_OBJECT_WLOCK(object);
 		if (cred != NULL) {
 			object->cred = cred;
 			object->charge = size;
 		}
 		swp_pager_meta_build(object, 0, SWAPBLK_NONE);
-		VM_OBJECT_UNLOCK(object);
+		VM_OBJECT_WUNLOCK(object);
 	}
 	return (object);
 }
@@ -654,7 +655,7 @@
 /*
  * SWAP_PAGER_DEALLOC() -	remove swap metadata from object
  *
- *	The swap backing for the object is destroyed.  The code is 
+ *	The swap backing for the object is destroyed.  The code is
  *	designed such that we can reinstantiate it later, but this
  *	routine is typically called only when the entire object is
  *	about to be destroyed.
@@ -675,16 +676,18 @@
 		mtx_unlock(&sw_alloc_mtx);
 	}
 
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+	VM_OBJECT_ASSERT_WLOCKED(object);
 	vm_object_pip_wait(object, "swpdea");
 
 	/*
-	 * Free all remaining metadata.  We only bother to free it from 
+	 * Free all remaining metadata.  We only bother to free it from
 	 * the swap meta data.  We do not attempt to free swapblk's still
 	 * associated with vm_page_t's for this object.  We do not care
 	 * if paging is still in progress on some objects.
 	 */
 	swp_pager_meta_free_all(object);
+	object->handle = NULL;
+	object->type = OBJT_DEAD;
 }
 
 /************************************************************************
@@ -748,7 +751,7 @@
 
 	return (blk >= sp->sw_first && blk < sp->sw_end);
 }
-	
+
 static void
 swp_pager_strategy(struct buf *bp)
 {
@@ -758,6 +761,17 @@
 	TAILQ_FOREACH(sp, &swtailq, sw_list) {
 		if (bp->b_blkno >= sp->sw_first && bp->b_blkno < sp->sw_end) {
 			mtx_unlock(&sw_dev_mtx);
+			if ((sp->sw_flags & SW_UNMAPPED) != 0 &&
+			    unmapped_buf_allowed) {
+				bp->b_kvaalloc = bp->b_data;
+				bp->b_data = unmapped_buf;
+				bp->b_kvabase = unmapped_buf;
+				bp->b_offset = 0;
+				bp->b_flags |= B_UNMAPPED;
+			} else {
+				pmap_qenter((vm_offset_t)bp->b_data,
+				    &bp->b_pages[0], bp->b_bcount / PAGE_SIZE);
+			}
 			sp->sw_strategy(bp, sp);
 			return;
 		}
@@ -764,10 +778,10 @@
 	}
 	panic("Swapdev not found");
 }
-	
 
+
 /*
- * SWP_PAGER_FREESWAPSPACE() -	free raw swap space 
+ * SWP_PAGER_FREESWAPSPACE() -	free raw swap space
  *
  *	This routine returns the specified swap blocks back to the bitmap.
  *
@@ -785,7 +799,7 @@
 			/*
 			 * If we are attempting to stop swapping on
 			 * this device, we don't want to mark any
-			 * blocks free lest they be reused.  
+			 * blocks free lest they be reused.
 			 */
 			if ((sp->sw_flags & SW_CLOSING) == 0) {
 				blist_free(sp->sw_blist, blk - sp->sw_first,
@@ -808,15 +822,16 @@
  *
  *	This routine removes swapblk assignments from swap metadata.
  *
- *	The external callers of this routine typically have already destroyed 
- *	or renamed vm_page_t's associated with this range in the object so 
+ *	The external callers of this routine typically have already destroyed
+ *	or renamed vm_page_t's associated with this range in the object so
  *	we should be ok.
+ *
+ *	The object must be locked.
  */
 void
 swap_pager_freespace(vm_object_t object, vm_pindex_t start, vm_size_t size)
 {
 
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	swp_pager_meta_free(object, start, size);
 }
 
@@ -823,8 +838,8 @@
 /*
  * SWAP_PAGER_RESERVE() - reserve swap blocks in object
  *
- *	Assigns swap blocks to the specified range within the object.  The 
- *	swap blocks are not zerod.  Any previous swap assignment is destroyed.
+ *	Assigns swap blocks to the specified range within the object.  The
+ *	swap blocks are not zeroed.  Any previous swap assignment is destroyed.
  *
  *	Returns 0 on success, -1 on failure.
  */
@@ -835,7 +850,7 @@
 	daddr_t blk = SWAPBLK_NONE;
 	vm_pindex_t beg = start;	/* save start index */
 
-	VM_OBJECT_LOCK(object);
+	VM_OBJECT_WLOCK(object);
 	while (size) {
 		if (n == 0) {
 			n = BLIST_MAX_ALLOC;
@@ -843,7 +858,7 @@
 				n >>= 1;
 				if (n == 0) {
 					swp_pager_meta_free(object, beg, start - beg);
-					VM_OBJECT_UNLOCK(object);
+					VM_OBJECT_WUNLOCK(object);
 					return (-1);
 				}
 			}
@@ -855,7 +870,7 @@
 		--n;
 	}
 	swp_pager_meta_free(object, start, n);
-	VM_OBJECT_UNLOCK(object);
+	VM_OBJECT_WUNLOCK(object);
 	return (0);
 }
 
@@ -869,7 +884,7 @@
  *
  *	This routine is allowed to sleep.  It may sleep allocating metadata
  *	indirectly through swp_pager_meta_build() or if paging is still in
- *	progress on the source. 
+ *	progress on the source.
  *
  *	The source object contains no vm_page_t's (which is just as well)
  *
@@ -884,12 +899,12 @@
 {
 	vm_pindex_t i;
 
-	VM_OBJECT_LOCK_ASSERT(srcobject, MA_OWNED);
-	VM_OBJECT_LOCK_ASSERT(dstobject, MA_OWNED);
+	VM_OBJECT_ASSERT_WLOCKED(srcobject);
+	VM_OBJECT_ASSERT_WLOCKED(dstobject);
 
 	/*
-	 * If destroysource is set, we remove the source object from the 
-	 * swap_pager internal queue now. 
+	 * If destroysource is set, we remove the source object from the
+	 * swap_pager internal queue now.
 	 */
 	if (destroysource) {
 		if (srcobject->handle != NULL) {
@@ -925,7 +940,7 @@
 			daddr_t srcaddr;
 
 			srcaddr = swp_pager_meta_ctl(
-			    srcobject, 
+			    srcobject,
 			    i + offset,
 			    SWM_POP
 			);
@@ -935,11 +950,11 @@
 				 * swp_pager_meta_build() can sleep.
 				 */
 				vm_object_pip_add(srcobject, 1);
-				VM_OBJECT_UNLOCK(srcobject);
+				VM_OBJECT_WUNLOCK(srcobject);
 				vm_object_pip_add(dstobject, 1);
 				swp_pager_meta_build(dstobject, i, srcaddr);
 				vm_object_pip_wakeup(dstobject);
-				VM_OBJECT_LOCK(srcobject);
+				VM_OBJECT_WLOCK(srcobject);
 				vm_object_pip_wakeup(srcobject);
 			}
 		} else {
@@ -947,7 +962,7 @@
 			 * Destination has valid swapblk or it is represented
 			 * by a resident page.  We destroy the sourceblock.
 			 */
-			
+
 			swp_pager_meta_ctl(srcobject, i + offset, SWM_FREE);
 		}
 	}
@@ -988,7 +1003,7 @@
 {
 	daddr_t blk0;
 
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+	VM_OBJECT_ASSERT_LOCKED(object);
 	/*
 	 * do we have good backing store at the requested index ?
 	 */
@@ -1042,7 +1057,7 @@
  * SWAP_PAGER_PAGE_UNSWAPPED() - remove swap backing store related to page
  *
  *	This removes any associated swap backing store, whether valid or
- *	not, from the page.  
+ *	not, from the page.
  *
  *	This routine is typically called when a page is made dirty, at
  *	which point any associated swap can be freed.  MADV_FREE also
@@ -1054,12 +1069,13 @@
  *	depends on it.
  *
  *	This routine may not sleep.
+ *
+ *	The object containing the page must be locked.
  */
 static void
 swap_pager_unswapped(vm_page_t m)
 {
 
-	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
 	swp_pager_meta_ctl(m->object, m->pindex, SWM_FREE);
 }
 
@@ -1071,7 +1087,7 @@
  *	a chunk surrounding m[reqpage] as is contiguous in swap and which
  *	belongs to the same object.
  *
- *	The code is designed for asynchronous operation and 
+ *	The code is designed for asynchronous operation and
  *	immediate-notification of 'reqpage' but tends not to be
  *	used that way.  Please do not optimize-out this algorithmic
  *	feature, I intend to improve on it in the future.
@@ -1101,7 +1117,7 @@
 	 * Calculate range to retrieve.  The pages have already been assigned
 	 * their swapblks.  We require a *contiguous* range but we know it to
 	 * not span devices.   If we do not supply it, bad things
-	 * happen.  Note that blk, iblk & jblk can be SWAPBLK_NONE, but the 
+	 * happen.  Note that blk, iblk & jblk can be SWAPBLK_NONE, but the
 	 * loops are set up such that the case(s) are handled implicitly.
 	 *
 	 * The swp_*() calls must be made with the object locked.
@@ -1139,7 +1155,7 @@
 	}
 
 	/*
-	 * Return VM_PAGER_FAIL if we have nothing to do.  Return mreq 
+	 * Return VM_PAGER_FAIL if we have nothing to do.  Return mreq
 	 * still busy, but the others unbusied.
 	 */
 	if (blk == SWAPBLK_NONE)
@@ -1148,7 +1164,7 @@
 	/*
 	 * Getpbuf() can sleep.
 	 */
-	VM_OBJECT_UNLOCK(object);
+	VM_OBJECT_WUNLOCK(object);
 	/*
 	 * Get a swap buffer header to perform the IO
 	 */
@@ -1155,11 +1171,6 @@
 	bp = getpbuf(&nsw_rcount);
 	bp->b_flags |= B_PAGING;
 
-	/*
-	 * map our page(s) into kva for input
-	 */
-	pmap_qenter((vm_offset_t)bp->b_data, m + i, j - i);
-
 	bp->b_iocmd = BIO_READ;
 	bp->b_iodone = swp_pager_async_iodone;
 	bp->b_rcred = crhold(thread0.td_ucred);
@@ -1169,7 +1180,7 @@
 	bp->b_bufsize = PAGE_SIZE * (j - i);
 	bp->b_pager.pg_reqpage = reqpage - i;
 
-	VM_OBJECT_LOCK(object);
+	VM_OBJECT_WLOCK(object);
 	{
 		int k;
 
@@ -1188,7 +1199,7 @@
 	 * does not remove it.
 	 */
 	vm_object_pip_add(object, bp->b_npages);
-	VM_OBJECT_UNLOCK(object);
+	VM_OBJECT_WUNLOCK(object);
 
 	/*
 	 * perform the I/O.  NOTE!!!  bp cannot be considered valid after
@@ -1209,11 +1220,12 @@
 	 * cleared on completion.  If an I/O error occurs, SWAPBLK_NONE
 	 * is set in the meta-data.
 	 */
-	VM_OBJECT_LOCK(object);
+	VM_OBJECT_WLOCK(object);
 	while ((mreq->oflags & VPO_SWAPINPROG) != 0) {
-		mreq->oflags |= VPO_WANTED;
+		mreq->oflags |= VPO_SWAPSLEEP;
 		PCPU_INC(cnt.v_intrans);
-		if (msleep(mreq, VM_OBJECT_MTX(object), PSWP, "swread", hz*20)) {
+		if (VM_OBJECT_SLEEP(object, &object->paging_in_progress, PSWP,
+		    "swread", hz * 20)) {
 			printf(
 "swap_pager: indefinite wait buffer: bufobj: %p, blkno: %jd, size: %ld\n",
 			    bp->b_bufobj, (intmax_t)bp->b_blkno, bp->b_bcount);
@@ -1234,13 +1246,13 @@
 	/*
 	 * A final note: in a low swap situation, we cannot deallocate swap
 	 * and mark a page dirty here because the caller is likely to mark
-	 * the page clean when we return, causing the page to possibly revert 
+	 * the page clean when we return, causing the page to possibly revert
 	 * to all-zero's later.
 	 */
 }
 
 /*
- *	swap_pager_putpages: 
+ *	swap_pager_putpages:
  *
  *	Assign swap (if necessary) and initiate I/O on the specified pages.
  *
@@ -1247,8 +1259,8 @@
  *	We support both OBJT_DEFAULT and OBJT_SWAP objects.  DEFAULT objects
  *	are automatically converted to SWAP objects.
  *
- *	In a low memory situation we may block in VOP_STRATEGY(), but the new 
- *	vm_page reservation system coupled with properly written VFS devices 
+ *	In a low memory situation we may block in VOP_STRATEGY(), but the new
+ *	vm_page reservation system coupled with properly written VFS devices
  *	should ensure that no low-memory deadlock occurs.  This is an area
  *	which needs work.
  *
@@ -1263,14 +1275,14 @@
  */
 void
 swap_pager_putpages(vm_object_t object, vm_page_t *m, int count,
-    boolean_t sync, int *rtvals)
+    int flags, int *rtvals)
 {
-	int i;
-	int n = 0;
+	int i, n;
+	boolean_t sync;
 
 	if (count && m[0]->object != object) {
-		panic("swap_pager_putpages: object mismatch %p/%p", 
-		    object, 
+		panic("swap_pager_putpages: object mismatch %p/%p",
+		    object,
 		    m[0]->object
 		);
 	}
@@ -1284,15 +1296,18 @@
 	 */
 	if (object->type != OBJT_SWAP)
 		swp_pager_meta_build(object, 0, SWAPBLK_NONE);
-	VM_OBJECT_UNLOCK(object);
+	VM_OBJECT_WUNLOCK(object);
 
+	n = 0;
 	if (curproc != pageproc)
 		sync = TRUE;
+	else
+		sync = (flags & VM_PAGER_PUT_SYNC) != 0;
 
 	/*
 	 * Step 2
 	 *
-	 * Update nsw parameters from swap_async_max sysctl values.  
+	 * Update nsw parameters from swap_async_max sysctl values.
 	 * Do not let the sysop crash the machine with bogus numbers.
 	 */
 	mtx_lock(&pbuf_mtx);
@@ -1371,8 +1386,6 @@
 		bp->b_flags |= B_PAGING;
 		bp->b_iocmd = BIO_WRITE;
 
-		pmap_qenter((vm_offset_t)bp->b_data, &m[i], n);
-
 		bp->b_rcred = crhold(thread0.td_ucred);
 		bp->b_wcred = crhold(thread0.td_ucred);
 		bp->b_bcount = PAGE_SIZE * n;
@@ -1379,22 +1392,22 @@
 		bp->b_bufsize = PAGE_SIZE * n;
 		bp->b_blkno = blk;
 
-		VM_OBJECT_LOCK(object);
+		VM_OBJECT_WLOCK(object);
 		for (j = 0; j < n; ++j) {
 			vm_page_t mreq = m[i+j];
 
 			swp_pager_meta_build(
-			    mreq->object, 
+			    mreq->object,
 			    mreq->pindex,
 			    blk + j
 			);
-			vm_page_dirty(mreq);
+			MPASS(mreq->dirty == VM_PAGE_BITS_ALL);
 			rtvals[i+j] = VM_PAGER_OK;
 
 			mreq->oflags |= VPO_SWAPINPROG;
 			bp->b_pages[j] = mreq;
 		}
-		VM_OBJECT_UNLOCK(object);
+		VM_OBJECT_WUNLOCK(object);
 		bp->b_npages = n;
 		/*
 		 * Must set dirty range for NFS to work.
@@ -1444,7 +1457,7 @@
 		 */
 		swp_pager_async_iodone(bp);
 	}
-	VM_OBJECT_LOCK(object);
+	VM_OBJECT_WLOCK(object);
 }
 
 /*
@@ -1453,12 +1466,6 @@
  *	Completion routine for asynchronous reads and writes from/to swap.
  *	Also called manually by synchronous code to finish up a bp.
  *
- *	For READ operations, the pages are VPO_BUSY'd.  For WRITE operations, 
- *	the pages are vm_page_t->busy'd.  For READ operations, we VPO_BUSY 
- *	unbusy all pages except the 'main' request page.  For WRITE 
- *	operations, we vm_page_t->busy'd unbusy all pages ( we can do this 
- *	because we marked them all VM_PAGER_PEND on return from putpages ).
- *
  *	This routine may not sleep.
  */
 static void
@@ -1475,7 +1482,7 @@
 		    "swap_pager: I/O error - %s failed; blkno %ld,"
 			"size %ld, error %d\n",
 		    ((bp->b_iocmd == BIO_READ) ? "pagein" : "pageout"),
-		    (long)bp->b_blkno, 
+		    (long)bp->b_blkno,
 		    (long)bp->b_bcount,
 		    bp->b_error
 		);
@@ -1484,11 +1491,16 @@
 	/*
 	 * remove the mapping for kernel virtual
 	 */
-	pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages);
+	if ((bp->b_flags & B_UNMAPPED) != 0) {
+		bp->b_data = bp->b_kvaalloc;
+		bp->b_kvabase = bp->b_kvaalloc;
+		bp->b_flags &= ~B_UNMAPPED;
+	} else
+		pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages);
 
 	if (bp->b_npages) {
 		object = bp->b_pages[0]->object;
-		VM_OBJECT_LOCK(object);
+		VM_OBJECT_WLOCK(object);
 	}
 
 	/*
@@ -1495,7 +1507,7 @@
 	 * cleanup pages.  If an error occurs writing to swap, we are in
 	 * very serious trouble.  If it happens to be a disk error, though,
 	 * we may be able to recover by reassigning the swap later on.  So
-	 * in this case we remove the m->swapblk assignment for the page 
+	 * in this case we remove the m->swapblk assignment for the page
 	 * but do not free it in the rlist.  The errornous block(s) are thus
 	 * never reallocated as swap.  Redirty the page and continue.
 	 */
@@ -1503,12 +1515,16 @@
 		vm_page_t m = bp->b_pages[i];
 
 		m->oflags &= ~VPO_SWAPINPROG;
+		if (m->oflags & VPO_SWAPSLEEP) {
+			m->oflags &= ~VPO_SWAPSLEEP;
+			wakeup(&object->paging_in_progress);
+		}
 
 		if (bp->b_ioflags & BIO_ERROR) {
 			/*
 			 * If an error occurs I'd love to throw the swapblk
 			 * away without freeing it back to swapspace, so it
-			 * can never be used again.  But I can't from an 
+			 * can never be used again.  But I can't from an
 			 * interrupt.
 			 */
 			if (bp->b_iocmd == BIO_READ) {
@@ -1517,7 +1533,7 @@
 				 * locked for the parent, but all other
 				 * pages can be freed.  We still want to
 				 * wakeup the parent waiting on the page,
-				 * though.  ( also: pg_reqpage can be -1 and 
+				 * though.  ( also: pg_reqpage can be -1 and
 				 * not match anything ).
 				 *
 				 * We have to wake specifically requested pages
@@ -1531,10 +1547,13 @@
 				m->valid = 0;
 				if (i != bp->b_pager.pg_reqpage)
 					swp_pager_free_nrpage(m);
-				else
+				else {
+					vm_page_lock(m);
 					vm_page_flash(m);
+					vm_page_unlock(m);
+				}
 				/*
-				 * If i == bp->b_pager.pg_reqpage, do not wake 
+				 * If i == bp->b_pager.pg_reqpage, do not wake
 				 * the page up.  The caller needs to.
 				 */
 			} else {
@@ -1547,11 +1566,11 @@
 				vm_page_lock(m);
 				vm_page_activate(m);
 				vm_page_unlock(m);
-				vm_page_io_finish(m);
+				vm_page_sunbusy(m);
 			}
 		} else if (bp->b_iocmd == BIO_READ) {
 			/*
-			 * NOTE: for reads, m->dirty will probably be 
+			 * NOTE: for reads, m->dirty will probably be
 			 * overridden by the original caller of getpages so
 			 * we cannot set them in order to free the underlying
 			 * swap in a low-swap situation.  I don't think we'd
@@ -1563,8 +1582,8 @@
 			 *
 			 * Note that the requested page, reqpage, is left
 			 * busied, but we still have to wake it up.  The
-			 * other pages are released (unbusied) by 
-			 * vm_page_wakeup().
+			 * other pages are released (unbusied) by
+			 * vm_page_xunbusy().
 			 */
 			KASSERT(!pmap_page_is_mapped(m),
 			    ("swp_pager_async_iodone: page %p is mapped", m));
@@ -1577,7 +1596,7 @@
 			 * up too because we cleared VPO_SWAPINPROG and
 			 * could be waiting for it in getpages.  However,
 			 * be sure to not unbusy getpages specifically
-			 * requested page - getpages expects it to be 
+			 * requested page - getpages expects it to be
 			 * left busy.
 			 */
 			if (i != bp->b_pager.pg_reqpage) {
@@ -1584,13 +1603,16 @@
 				vm_page_lock(m);
 				vm_page_deactivate(m);
 				vm_page_unlock(m);
-				vm_page_wakeup(m);
-			} else
+				vm_page_xunbusy(m);
+			} else {
+				vm_page_lock(m);
 				vm_page_flash(m);
+				vm_page_unlock(m);
+			}
 		} else {
 			/*
 			 * For write success, clear the dirty
-			 * status, then finish the I/O ( which decrements the 
+			 * status, then finish the I/O ( which decrements the
 			 * busy count and possibly wakes waiter's up ).
 			 */
 			KASSERT(!pmap_page_is_write_mapped(m),
@@ -1597,7 +1619,7 @@
 			    ("swp_pager_async_iodone: page %p is not write"
 			    " protected", m));
 			vm_page_undirty(m);
-			vm_page_io_finish(m);
+			vm_page_sunbusy(m);
 			if (vm_page_count_severe()) {
 				vm_page_lock(m);
 				vm_page_try_to_cache(m);
@@ -1612,11 +1634,11 @@
 	 */
 	if (object != NULL) {
 		vm_object_pip_wakeupn(object, bp->b_npages);
-		VM_OBJECT_UNLOCK(object);
+		VM_OBJECT_WUNLOCK(object);
 	}
 
-	/* 
-	 * swapdev_strategy() manually sets b_vp and b_bufobj before calling 
+	/*
+	 * swapdev_strategy() manually sets b_vp and b_bufobj before calling
 	 * bstrategy(). Set them back to NULL now we're done with it, or we'll
 	 * trigger a KASSERT in relpbuf().
 	 */
@@ -1628,10 +1650,10 @@
 	 * release the physical I/O buffer
 	 */
 	relpbuf(
-	    bp, 
-	    ((bp->b_iocmd == BIO_READ) ? &nsw_rcount : 
-		((bp->b_flags & B_ASYNC) ? 
-		    &nsw_wcount_async : 
+	    bp,
+	    ((bp->b_iocmd == BIO_READ) ? &nsw_rcount :
+		((bp->b_flags & B_ASYNC) ?
+		    &nsw_wcount_async :
 		    &nsw_wcount_sync
 		)
 	    )
@@ -1653,7 +1675,7 @@
 	int bcount;
 	int i;
 
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+	VM_OBJECT_ASSERT_WLOCKED(object);
 	if (object->type != OBJT_SWAP)
 		return (0);
 
@@ -1695,7 +1717,7 @@
 	vm_page_t m;
 
 	vm_object_pip_add(object, 1);
-	m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL|VM_ALLOC_RETRY);
+	m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL);
 	if (m->valid == VM_PAGE_BITS_ALL) {
 		vm_object_pip_subtract(object, 1);
 		vm_page_dirty(m);
@@ -1702,7 +1724,7 @@
 		vm_page_lock(m);
 		vm_page_activate(m);
 		vm_page_unlock(m);
-		vm_page_wakeup(m);
+		vm_page_xunbusy(m);
 		vm_pager_page_unswapped(m);
 		return;
 	}
@@ -1714,7 +1736,7 @@
 	vm_page_lock(m);
 	vm_page_deactivate(m);
 	vm_page_unlock(m);
-	vm_page_wakeup(m);
+	vm_page_xunbusy(m);
 	vm_pager_page_unswapped(m);
 }
 
@@ -1732,36 +1754,49 @@
 swap_pager_swapoff(struct swdevt *sp)
 {
 	struct swblock *swap;
+	vm_object_t locked_obj, object;
+	vm_pindex_t pindex;
 	int i, j, retries;
 
 	GIANT_REQUIRED;
 
 	retries = 0;
+	locked_obj = NULL;
 full_rescan:
 	mtx_lock(&swhash_mtx);
 	for (i = 0; i <= swhash_mask; i++) { /* '<=' is correct here */
 restart:
 		for (swap = swhash[i]; swap != NULL; swap = swap->swb_hnext) {
-			vm_object_t object = swap->swb_object;
-			vm_pindex_t pindex = swap->swb_index;
-                        for (j = 0; j < SWAP_META_PAGES; ++j) {
-                                if (swp_pager_isondev(swap->swb_pages[j], sp)) {
-					/* avoid deadlock */
-					if (!VM_OBJECT_TRYLOCK(object)) {
-						break;
-					} else {
+			object = swap->swb_object;
+			pindex = swap->swb_index;
+			for (j = 0; j < SWAP_META_PAGES; ++j) {
+				if (!swp_pager_isondev(swap->swb_pages[j], sp))
+					continue;
+				if (locked_obj != object) {
+					if (locked_obj != NULL)
+						VM_OBJECT_WUNLOCK(locked_obj);
+					locked_obj = object;
+					if (!VM_OBJECT_TRYWLOCK(object)) {
 						mtx_unlock(&swhash_mtx);
-						swp_pager_force_pagein(object,
-						    pindex + j);
-						VM_OBJECT_UNLOCK(object);
+						/* Depends on type-stability. */
+						VM_OBJECT_WLOCK(object);
 						mtx_lock(&swhash_mtx);
 						goto restart;
 					}
 				}
-                        }
+				MPASS(locked_obj == object);
+				mtx_unlock(&swhash_mtx);
+				swp_pager_force_pagein(object, pindex + j);
+				mtx_lock(&swhash_mtx);
+				goto restart;
+			}
 		}
 	}
 	mtx_unlock(&swhash_mtx);
+	if (locked_obj != NULL) {
+		VM_OBJECT_WUNLOCK(locked_obj);
+		locked_obj = NULL;
+	}
 	if (sp->sw_used) {
 		/*
 		 * Objects may be locked or paging to the device being
@@ -1783,7 +1818,7 @@
  *				SWAP META DATA 				*
  ************************************************************************
  *
- *	These routines manipulate the swap metadata stored in the 
+ *	These routines manipulate the swap metadata stored in the
  *	OBJT_SWAP object.
  *
  *	Swap metadata is implemented with a global hash and not directly
@@ -1809,7 +1844,7 @@
 	struct swblock **pswap;
 	int idx;
 
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+	VM_OBJECT_ASSERT_WLOCKED(object);
 	/*
 	 * Convert default object to swap object if necessary
 	 */
@@ -1821,13 +1856,13 @@
 			mtx_lock(&sw_alloc_mtx);
 			TAILQ_INSERT_TAIL(
 			    NOBJLIST(object->handle),
-			    object, 
+			    object,
 			    pager_object_list
 			);
 			mtx_unlock(&sw_alloc_mtx);
 		}
 	}
-	
+
 	/*
 	 * Locate hash entry.  If not found create, but if we aren't adding
 	 * anything just return.  If we run out of space in the map we wait
@@ -1843,12 +1878,13 @@
 		if (swapblk == SWAPBLK_NONE)
 			goto done;
 
-		swap = *pswap = uma_zalloc(swap_zone, M_NOWAIT);
+		swap = *pswap = uma_zalloc(swap_zone, M_NOWAIT |
+		    (curproc == pageproc ? M_USE_RESERVE : 0));
 		if (swap == NULL) {
 			mtx_unlock(&swhash_mtx);
-			VM_OBJECT_UNLOCK(object);
+			VM_OBJECT_WUNLOCK(object);
 			if (uma_zone_exhausted(swap_zone)) {
-				if (atomic_cmpset_rel_int(&exhausted, 0, 1))
+				if (atomic_cmpset_int(&exhausted, 0, 1))
 					printf("swap zone exhausted, "
 					    "increase kern.maxswzone\n");
 				vm_pageout_oom(VM_OOM_SWAPZ);
@@ -1855,11 +1891,11 @@
 				pause("swzonex", 10);
 			} else
 				VM_WAIT;
-			VM_OBJECT_LOCK(object);
+			VM_OBJECT_WLOCK(object);
 			goto retry;
 		}
 
-		if (atomic_cmpset_rel_int(&exhausted, 1, 0))
+		if (atomic_cmpset_int(&exhausted, 1, 0))
 			printf("swap zone ok\n");
 
 		swap->swb_hnext = NULL;
@@ -1896,10 +1932,10 @@
 /*
  * SWP_PAGER_META_FREE() - free a range of blocks in the object's swap metadata
  *
- *	The requested range of blocks is freed, with any associated swap 
+ *	The requested range of blocks is freed, with any associated swap
  *	returned to the swap bitmap.
  *
- *	This routine will free swap metadata structures as they are cleaned 
+ *	This routine will free swap metadata structures as they are cleaned
  *	out.  This routine does *NOT* operate on swap metadata associated
  *	with resident pages.
  */
@@ -1907,7 +1943,7 @@
 swp_pager_meta_free(vm_object_t object, vm_pindex_t index, daddr_t count)
 {
 
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+	VM_OBJECT_ASSERT_LOCKED(object);
 	if (object->type != OBJT_SWAP)
 		return;
 
@@ -1951,23 +1987,22 @@
 static void
 swp_pager_meta_free_all(vm_object_t object)
 {
-	daddr_t index = 0;
+	struct swblock **pswap, *swap;
+	vm_pindex_t index;
+	daddr_t v;
+	int i;
 
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+	VM_OBJECT_ASSERT_WLOCKED(object);
 	if (object->type != OBJT_SWAP)
 		return;
 
-	while (object->un_pager.swp.swp_bcount) {
-		struct swblock **pswap;
-		struct swblock *swap;
-
+	index = 0;
+	while (object->un_pager.swp.swp_bcount != 0) {
 		mtx_lock(&swhash_mtx);
 		pswap = swp_pager_hash(object, index);
 		if ((swap = *pswap) != NULL) {
-			int i;
-
 			for (i = 0; i < SWAP_META_PAGES; ++i) {
-				daddr_t v = swap->swb_pages[i];
+				v = swap->swb_pages[i];
 				if (v != SWAPBLK_NONE) {
 					--swap->swb_count;
 					swp_pager_freeswapspace(v, 1);
@@ -1974,7 +2009,8 @@
 				}
 			}
 			if (swap->swb_count != 0)
-				panic("swap_pager_meta_free_all: swb_count != 0");
+				panic(
+				    "swap_pager_meta_free_all: swb_count != 0");
 			*pswap = swap->swb_hnext;
 			uma_zfree(swap_zone, swap);
 			--object->un_pager.swp.swp_bcount;
@@ -1991,14 +2027,14 @@
  *	swapblk assignments in the swap meta data or in the vm_page_t.
  *	The routine typically returns the swapblk being looked-up, or popped,
  *	or SWAPBLK_NONE if the block was freed, or SWAPBLK_NONE if the block
- *	was invalid.  This routine will automatically free any invalid 
+ *	was invalid.  This routine will automatically free any invalid
  *	meta-data swapblks.
  *
  *	It is not possible to store invalid swapblks in the swap meta data
  *	(other then a literal 'SWAPBLK_NONE'), so we don't bother checking.
  *
- *	When acting on a busy resident page and paging is in progress, we 
- *	have to wait until paging is complete but otherwise can act on the 
+ *	When acting on a busy resident page and paging is in progress, we
+ *	have to wait until paging is complete but otherwise can act on the
  *	busy page.
  *
  *	SWM_FREE	remove and free swap block from metadata
@@ -2012,9 +2048,9 @@
 	daddr_t r1;
 	int idx;
 
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+	VM_OBJECT_ASSERT_LOCKED(object);
 	/*
-	 * The meta data only exists of the object is OBJT_SWAP 
+	 * The meta data only exists if the object is OBJT_SWAP
 	 * and even then might not be allocated yet.
 	 */
 	if (object->type != OBJT_SWAP)
@@ -2040,7 +2076,7 @@
 					uma_zfree(swap_zone, swap);
 					--object->un_pager.swp.swp_bcount;
 				}
-			} 
+			}
 		}
 	}
 	mtx_unlock(&swhash_mtx);
@@ -2058,7 +2094,7 @@
 };
 #endif
 
-/* 
+/*
  * MPSAFE
  */
 /* ARGSUSED */
@@ -2144,7 +2180,8 @@
 }
 
 static void
-swaponsomething(struct vnode *vp, void *id, u_long nblks, sw_strategy_t *strategy, sw_close_t *close, dev_t dev)
+swaponsomething(struct vnode *vp, void *id, u_long nblks,
+    sw_strategy_t *strategy, sw_close_t *close, dev_t dev, int flags)
 {
 	struct swdevt *sp, *tsp;
 	swblk_t dvbase;
@@ -2153,7 +2190,7 @@
 	/*
 	 * nblks is in DEV_BSIZE'd chunks, convert to PAGE_SIZE'd chunks.
 	 * First chop nblks off to page-align it, then convert.
-	 * 
+	 *
 	 * sw->sw_nblks is in page-sized chunks now too.
 	 */
 	nblks &= ~(ctodb(1) - 1);
@@ -2180,6 +2217,7 @@
 	sp->sw_used = 0;
 	sp->sw_strategy = strategy;
 	sp->sw_close = close;
+	sp->sw_flags = flags;
 
 	sp->sw_blist = blist_create(nblks, M_WAITOK);
 	/*
@@ -2204,7 +2242,7 @@
 	sp->sw_end = dvbase + nblks;
 	TAILQ_INSERT_TAIL(&swtailq, sp, sw_list);
 	nswapdev++;
-	swap_pager_avail += nblks;
+	swap_pager_avail += nblks - 2;
 	swap_total += (vm_ooffset_t)nblks * PAGE_SIZE;
 	swapon_check_swzone(swap_total / PAGE_SIZE);
 	swp_sizecheck();
@@ -2276,7 +2314,7 @@
 static int
 swapoff_one(struct swdevt *sp, struct ucred *cred)
 {
-	u_long nblks, dvbase;
+	u_long nblks;
 #ifdef MAC
 	int error;
 #endif
@@ -2307,10 +2345,7 @@
 	 */
 	mtx_lock(&sw_dev_mtx);
 	sp->sw_flags |= SW_CLOSING;
-	for (dvbase = 0; dvbase < sp->sw_end; dvbase += dmmax) {
-		swap_pager_avail -= blist_fill(sp->sw_blist,
-		     dvbase, dmmax);
-	}
+	swap_pager_avail -= blist_fill(sp->sw_blist, 0, nblks);
 	swap_total -= (vm_ooffset_t)nblks * PAGE_SIZE;
 	mtx_unlock(&sw_dev_mtx);
 
@@ -2320,8 +2355,8 @@
 	swap_pager_swapoff(sp);
 
 	sp->sw_close(curthread, sp);
+	mtx_lock(&sw_dev_mtx);
 	sp->sw_id = NULL;
-	mtx_lock(&sw_dev_mtx);
 	TAILQ_REMOVE(&swtailq, sp, sw_list);
 	nswapdev--;
 	if (nswapdev == 0) {
@@ -2342,12 +2377,12 @@
 	struct swdevt *sp, *spt;
 	const char *devname;
 	int error;
- 
+
 	mtx_lock(&Giant);
 	while (swdev_syscall_active)
 		tsleep(&swdev_syscall_active, PUSER - 1, "swpoff", 0);
 	swdev_syscall_active = 1;
- 
+
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH_SAFE(sp, &swtailq, sw_list, spt) {
 		mtx_unlock(&sw_dev_mtx);
@@ -2365,7 +2400,7 @@
 		mtx_lock(&sw_dev_mtx);
 	}
 	mtx_unlock(&sw_dev_mtx);
- 
+
 	swdev_syscall_active = 0;
 	wakeup_one(&swdev_syscall_active);
 	mtx_unlock(&Giant);
@@ -2465,7 +2500,7 @@
 	for (cur = map->header.next; cur != &map->header; cur = cur->next) {
 		if ((cur->eflags & MAP_ENTRY_IS_SUB_MAP) == 0 &&
 		    (object = cur->object.vm_object) != NULL) {
-			VM_OBJECT_LOCK(object);
+			VM_OBJECT_WLOCK(object);
 			if (object->type == OBJT_SWAP &&
 			    object->un_pager.swp.swp_bcount != 0) {
 				n = (cur->end - cur->start) / PAGE_SIZE;
@@ -2472,7 +2507,7 @@
 				count += object->un_pager.swp.swp_bcount *
 				    SWAP_META_PAGES * n / object->size + 1;
 			}
-			VM_OBJECT_UNLOCK(object);
+			VM_OBJECT_WUNLOCK(object);
 		}
 	}
 	return (count);
@@ -2497,11 +2532,52 @@
 
 
 static void
+swapgeom_close_ev(void *arg, int flags)
+{
+	struct g_consumer *cp;
+
+	cp = arg;
+	g_access(cp, -1, -1, 0);
+	g_detach(cp);
+	g_destroy_consumer(cp);
+}
+
+/*
+ * Add a reference to the g_consumer for an inflight transaction.
+ */
+static void
+swapgeom_acquire(struct g_consumer *cp)
+{
+
+	mtx_assert(&sw_dev_mtx, MA_OWNED);
+	cp->index++;
+}
+
+/*
+ * Remove a reference from the g_consumer. Post a close event if
+ * all references go away.
+ */
+static void
+swapgeom_release(struct g_consumer *cp, struct swdevt *sp)
+{
+
+	mtx_assert(&sw_dev_mtx, MA_OWNED);
+	cp->index--;
+	if (cp->index == 0) {
+		if (g_post_event(swapgeom_close_ev, cp, M_NOWAIT, NULL) == 0)
+			sp->sw_id = NULL;
+	}
+}
+
+static void
 swapgeom_done(struct bio *bp2)
 {
+	struct swdevt *sp;
 	struct buf *bp;
+	struct g_consumer *cp;
 
 	bp = bp2->bio_caller2;
+	cp = bp2->bio_from;
 	bp->b_ioflags = bp2->bio_flags;
 	if (bp2->bio_error)
 		bp->b_ioflags |= BIO_ERROR;
@@ -2508,6 +2584,10 @@
 	bp->b_resid = bp->b_bcount - bp2->bio_completed;
 	bp->b_error = bp2->bio_error;
 	bufdone(bp);
+	sp = bp2->bio_caller1;
+	mtx_lock(&sw_dev_mtx);
+	swapgeom_release(cp, sp);
+	mtx_unlock(&sw_dev_mtx);
 	g_destroy_bio(bp2);
 }
 
@@ -2517,18 +2597,25 @@
 	struct bio *bio;
 	struct g_consumer *cp;
 
+	mtx_lock(&sw_dev_mtx);
 	cp = sp->sw_id;
 	if (cp == NULL) {
+		mtx_unlock(&sw_dev_mtx);
 		bp->b_error = ENXIO;
 		bp->b_ioflags |= BIO_ERROR;
 		bufdone(bp);
 		return;
 	}
+	swapgeom_acquire(cp);
+	mtx_unlock(&sw_dev_mtx);
 	if (bp->b_iocmd == BIO_WRITE)
 		bio = g_new_bio();
 	else
 		bio = g_alloc_bio();
 	if (bio == NULL) {
+		mtx_lock(&sw_dev_mtx);
+		swapgeom_release(cp, sp);
+		mtx_unlock(&sw_dev_mtx);
 		bp->b_error = ENOMEM;
 		bp->b_ioflags |= BIO_ERROR;
 		bufdone(bp);
@@ -2535,12 +2622,22 @@
 		return;
 	}
 
+	bio->bio_caller1 = sp;
 	bio->bio_caller2 = bp;
 	bio->bio_cmd = bp->b_iocmd;
-	bio->bio_data = bp->b_data;
 	bio->bio_offset = (bp->b_blkno - sp->sw_first) * PAGE_SIZE;
 	bio->bio_length = bp->b_bcount;
 	bio->bio_done = swapgeom_done;
+	if ((bp->b_flags & B_UNMAPPED) != 0) {
+		bio->bio_ma = bp->b_pages;
+		bio->bio_data = unmapped_buf;
+		bio->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK;
+		bio->bio_ma_n = bp->b_npages;
+		bio->bio_flags |= BIO_UNMAPPED;
+	} else {
+		bio->bio_data = bp->b_data;
+		bio->bio_ma = NULL;
+	}
 	g_io_request(bio, cp);
 	return;
 }
@@ -2549,31 +2646,41 @@
 swapgeom_orphan(struct g_consumer *cp)
 {
 	struct swdevt *sp;
+	int destroy;
 
 	mtx_lock(&sw_dev_mtx);
-	TAILQ_FOREACH(sp, &swtailq, sw_list)
-		if (sp->sw_id == cp)
+	TAILQ_FOREACH(sp, &swtailq, sw_list) {
+		if (sp->sw_id == cp) {
 			sp->sw_flags |= SW_CLOSING;
+			break;
+		}
+	}
+	/*
+	 * Drop the reference we were created with. Do this directly since we're in a
+	 * special context where we don't have to queue the call to
+	 * swapgeom_close_ev().
+	 */
+	cp->index--;
+	destroy = ((sp != NULL) && (cp->index == 0));
+	if (destroy)
+		sp->sw_id = NULL;
 	mtx_unlock(&sw_dev_mtx);
+	if (destroy)
+		swapgeom_close_ev(cp, 0);
 }
 
 static void
-swapgeom_close_ev(void *arg, int flags)
+swapgeom_close(struct thread *td, struct swdevt *sw)
 {
 	struct g_consumer *cp;
 
-	cp = arg;
-	g_access(cp, -1, -1, 0);
-	g_detach(cp);
-	g_destroy_consumer(cp);
-}
-
-static void
-swapgeom_close(struct thread *td, struct swdevt *sw)
-{
-
+	mtx_lock(&sw_dev_mtx);
+	cp = sw->sw_id;
+	sw->sw_id = NULL;
+	mtx_unlock(&sw_dev_mtx);
 	/* XXX: direct call when Giant untangled */
-	g_waitfor_event(swapgeom_close_ev, sw->sw_id, M_WAITOK, NULL);
+	if (cp != NULL)
+		g_waitfor_event(swapgeom_close_ev, cp, M_WAITOK, NULL);
 }
 
 
@@ -2614,6 +2721,8 @@
 	if (gp == NULL)
 		gp = g_new_geomf(&g_swap_class, "swap");
 	cp = g_new_consumer(gp);
+	cp->index = 1;		/* Number of active I/Os, plus one for being active. */
+	cp->flags |=  G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
 	g_attach(cp, pp);
 	/*
 	 * XXX: Everytime you think you can improve the margin for
@@ -2630,9 +2739,9 @@
 	}
 	nblks = pp->mediasize / DEV_BSIZE;
 	swaponsomething(swh->vp, cp, nblks, swapgeom_strategy,
-	    swapgeom_close, dev2udev(swh->dev));
+	    swapgeom_close, dev2udev(swh->dev),
+	    (pp->flags & G_PF_ACCEPT_UNMAPPED) != 0 ? SW_UNMAPPED : 0);
 	swh->error = 0;
-	return;
 }
 
 static int
@@ -2709,7 +2818,7 @@
 		}
 	}
 	mtx_unlock(&sw_dev_mtx);
-    
+
 	(void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 #ifdef MAC
 	error = mac_system_check_swapon(td->td_ucred, vp);
@@ -2721,6 +2830,6 @@
 		return (error);
 
 	swaponsomething(vp, vp, nblks, swapdev_strategy, swapdev_close,
-	    NODEV);
+	    NODEV, 0);
 	return (0);
 }

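The swapgeom_strategy() hunk above is the heart of the unmapped-swap support: when a pbuf carries B_UNMAPPED, its pages are handed to GEOM through bio_ma instead of a mapped b_data, and swp_pager_async_iodone() only calls pmap_qremove() for the mapped case. A distilled sketch of that decision, pulled into a standalone helper purely for illustration (the helper name is hypothetical, not part of the commit):

/*
 * Illustrative helper (hypothetical): attach a pbuf's data to a bio
 * either as raw machine pages (unmapped) or as a mapped kernel
 * virtual address, mirroring the logic in swapgeom_strategy().
 */
static void
swp_bio_attach_data(struct bio *bio, struct buf *bp)
{

	if ((bp->b_flags & B_UNMAPPED) != 0) {
		/* Hand the pages to GEOM directly; no KVA mapping needed. */
		bio->bio_ma = bp->b_pages;
		bio->bio_data = unmapped_buf;
		bio->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK;
		bio->bio_ma_n = bp->b_npages;
		bio->bio_flags |= BIO_UNMAPPED;
	} else {
		/* Traditional path: the buffer already has a KVA mapping. */
		bio->bio_data = bp->b_data;
		bio->bio_ma = NULL;
	}
}

Together with the removal of pmap_qenter() from swap_pager_putpages(), this lets swap I/O to providers that accept unmapped buffers skip transient kernel mappings entirely.
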
Modified: trunk/sys/vm/swap_pager.h
===================================================================
--- trunk/sys/vm/swap_pager.h	2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/swap_pager.h	2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1990 University of Utah.
  * Copyright (c) 1991 The Regents of the University of California.
@@ -32,7 +33,7 @@
  * SUCH DAMAGE.
  *
  *	from: @(#)swap_pager.h	7.1 (Berkeley) 12/5/90
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/vm/swap_pager.h 248514 2013-03-19 14:39:27Z kib $
  */
 
 #ifndef	_VM_SWAP_PAGER_H_
@@ -68,6 +69,7 @@
 	sw_close_t		*sw_close;
 };
 
+#define	SW_UNMAPPED	0x01
 #define	SW_CLOSING	0x04
 
 #ifdef _KERNEL

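SW_UNMAPPED records, per swap device, whether the backend can accept unmapped (bio_ma-based) I/O; the GEOM attach path sets it from G_PF_ACCEPT_UNMAPPED, while the vnode-backed path passes 0. A minimal sketch of the kind of check callers can build on the flag (the helper is illustrative, not declared by this header):

/*
 * Illustrative helper (hypothetical): does this swap device still
 * require its I/O buffers to be mapped into kernel virtual address
 * space before a request is issued?
 */
static __inline int
swdev_requires_kva(const struct swdevt *sp)
{

	return ((sp->sw_flags & SW_UNMAPPED) == 0);
}
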
Modified: trunk/sys/vm/uma.h
===================================================================
--- trunk/sys/vm/uma.h	2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/uma.h	2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 2002, 2003, 2004, 2005 Jeffrey Roberson <jeff at FreeBSD.org>
  * Copyright (c) 2004, 2005 Bosko Milekic <bmilekic at FreeBSD.org>
@@ -24,7 +25,7 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/vm/uma.h 324602 2017-10-13 17:11:08Z jhb $
  *
  */
 
@@ -33,8 +34,8 @@
  *
 */
 
-#ifndef VM_UMA_H
-#define VM_UMA_H
+#ifndef _VM_UMA_H_
+#define _VM_UMA_H_
 
 #include <sys/param.h>		/* For NULL */
 #include <sys/malloc.h>		/* For M_* */
@@ -50,7 +51,7 @@
 
 void zone_drain(uma_zone_t);
 
-/* 
+/*
  * Item constructor
  *
  * Arguments:
@@ -58,7 +59,7 @@
  *	arg   The arg field passed to uma_zalloc_arg
  *	size  The size of the allocated item
  *	flags See zalloc flags
- * 
+ *
  * Returns:
  *	0      on success
  *      errno  on failure
@@ -76,7 +77,7 @@
  *	item  A pointer to the memory which has been allocated.
  *	size  The size of the item being destructed.
  *	arg   Argument passed through uma_zfree_arg
- * 
+ *
  * Returns:
  *	Nothing
  *
@@ -87,7 +88,7 @@
  */
 typedef void (*uma_dtor)(void *mem, int size, void *arg);
 
-/* 
+/*
  * Item initializer
  *
  * Arguments:
@@ -94,13 +95,13 @@
  *	item  A pointer to the memory which has been allocated.
  *	size  The size of the item being initialized.
  *	flags See zalloc flags
- * 
+ *
  * Returns:
  *	0      on success
  *      errno  on failure
  *
  * Discussion:
- *	The initializer is called when the memory is cached in the uma zone. 
+ *	The initializer is called when the memory is cached in the uma zone.
  *	The initializer and the destructor should leave the object in the same
  *	state.
  */
@@ -110,7 +111,7 @@
  * Item discard function
  *
  * Arguments:
- * 	item  A pointer to memory which has been 'freed' but has not left the 
+ *	item  A pointer to memory which has been 'freed' but has not left the
  *	      zone's cache.
  *	size  The size of the item being discarded.
  *
@@ -124,9 +125,19 @@
 typedef void (*uma_fini)(void *mem, int size);
 
 /*
+ * Import new memory into a cache zone.
+ */
+typedef int (*uma_import)(void *arg, void **store, int count, int flags);
+
+/*
+ * Free memory from a cache zone.
+ */
+typedef void (*uma_release)(void *arg, void **store, int count);
+
+/*
  * What's the difference between initializing and constructing?
  *
- * The item is initialized when it is cached, and this is the state that the 
+ * The item is initialized when it is cached, and this is the state that the
  * object should be in when returned to the allocator. The purpose of this is
  * to remove some code which would otherwise be called on each allocation by
  * utilizing a known, stable state.  This differs from the constructor which
@@ -167,7 +178,7 @@
  */
 uma_zone_t uma_zcreate(const char *name, size_t size, uma_ctor ctor,
 		    uma_dtor dtor, uma_init uminit, uma_fini fini,
-		    int align, u_int32_t flags);
+		    int align, uint32_t flags);
 
 /*
  * Create a secondary uma zone
@@ -211,11 +222,24 @@
  * the only supported.
  *
  * Returns:
- * 	Error on failure, 0 on success.
+ *	Error on failure, 0 on success.
  */
 int uma_zsecond_add(uma_zone_t zone, uma_zone_t master);
 
 /*
+ * Create cache-only zones.
+ *
+ * This allows uma's per-cpu cache facilities to handle arbitrary
+ * pointers.  Consumers must specify the import and release functions to
+ * fill and destroy caches.  UMA does not allocate any memory for these
+ * zones.  The 'arg' parameter is passed to import/release and is caller
+ * specific.
+ */
+uma_zone_t uma_zcache_create(char *name, int size, uma_ctor ctor, uma_dtor dtor,
+		    uma_init zinit, uma_fini zfini, uma_import zimport,
+		    uma_release zrelease, void *arg, int flags);
+
+/*
  * Definitions for uma_zcreate flags
  *
  * These flags share space with UMA_ZFLAGs in uma_int.h.  Be careful not to
@@ -252,6 +276,10 @@
 					 * Zone's pages will not be included in
 					 * mini-dumps.
 					 */
+#define	UMA_ZONE_PCPU		0x8000	/*
+					 * Allocates mp_ncpus slabs sized to
+					 * sizeof(struct pcpu).
+					 */
 
 /*
  * These flags are shared between the keg and zone.  In zones wishing to add
@@ -259,8 +287,8 @@
  * physical parameters of the request and may not be provided by the consumer.
  */
 #define	UMA_ZONE_INHERIT						\
-    (UMA_ZONE_OFFPAGE | UMA_ZONE_MALLOC | UMA_ZONE_HASH |		\
-    UMA_ZONE_REFCNT | UMA_ZONE_VTOSLAB)
+    (UMA_ZONE_OFFPAGE | UMA_ZONE_MALLOC | UMA_ZONE_NOFREE |		\
+    UMA_ZONE_HASH | UMA_ZONE_REFCNT | UMA_ZONE_VTOSLAB | UMA_ZONE_PCPU)
 
 /* Definitions for align */
 #define UMA_ALIGN_PTR	(sizeof(void *) - 1)	/* Alignment fit for ptr */
@@ -269,6 +297,7 @@
 #define UMA_ALIGN_SHORT	(sizeof(short) - 1)	/* "" short */
 #define UMA_ALIGN_CHAR	(sizeof(char) - 1)	/* "" char */
 #define UMA_ALIGN_CACHE	(0 - 1)			/* Cache line size align */
+#define	UMA_ALIGNOF(type) (_Alignof(type) - 1)	/* Alignment fit for 'type' */
 
 /*
  * Destroys an empty uma zone.  If the zone is not empty uma complains loudly.
@@ -355,7 +384,8 @@
  *	A pointer to the allocated memory or NULL on failure.
  */
 
-typedef void *(*uma_alloc)(uma_zone_t zone, int size, u_int8_t *pflag, int wait);
+typedef void *(*uma_alloc)(uma_zone_t zone, vm_size_t size, uint8_t *pflag,
+    int wait);
 
 /*
  * Backend page free routines
@@ -368,7 +398,7 @@
  * Returns:
  *	None
  */
-typedef void (*uma_free)(void *item, int size, u_int8_t pflag);
+typedef void (*uma_free)(void *item, vm_size_t size, uint8_t pflag);
 
 
 
@@ -403,7 +433,7 @@
  * Discussion:
  *	uma_startup2 is called by kmeminit() to enable us of uma for malloc.
  */
- 
+
 void uma_startup2(void);
 
 /*
@@ -432,24 +462,29 @@
 void uma_set_align(int align);
 
 /*
- * Switches the backing object of a zone
+ * Set a reserved number of items to hold for M_USE_RESERVE allocations.  All
+ * other requests must allocate new backing pages.
+ */
+void uma_zone_reserve(uma_zone_t zone, int nitems);
+
+/*
+ * Reserves the maximum KVA space required by the zone and configures the zone
+ * to use a VM_ALLOC_NOOBJ-based backend allocator.
  *
  * Arguments:
  *	zone  The zone to update.
- *	obj   The VM object to use for future allocations.
- *	size  The size of the object to allocate.
+ *	nitems  The upper limit on the number of items that can be allocated.
  *
  * Returns:
- *	0  if kva space can not be allocated
+ *	0  if KVA space can not be allocated
  *	1  if successful
  *
  * Discussion:
- *	A NULL object can be used and uma will allocate one for you.  Setting
- *	the size will limit the amount of memory allocated to this zone.
- *
+ *	When the machine supports a direct map and the zone's items are smaller
+ *	than a page, the zone will use the direct map instead of allocating KVA
+ *	space.
  */
-struct vm_object;
-int uma_zone_set_obj(uma_zone_t zone, struct vm_object *obj, int size);
+int uma_zone_reserve_kva(uma_zone_t zone, int nitems);
 
 /*
  * Sets a high limit on the number of items allowed in a zone
@@ -476,6 +511,18 @@
 int uma_zone_get_max(uma_zone_t zone);
 
 /*
+ * Sets a warning to be printed when limit is reached
+ *
+ * Arguments:
+ *	zone  The zone we will warn about
+ *	warning  Warning content
+ *
+ * Returns:
+ *	Nothing
+ */
+void uma_zone_set_warning(uma_zone_t zone, const char *warning);
+
+/*
  * Obtains the approximate current number of items allocated from a zone
  *
  * Arguments:
@@ -509,7 +556,7 @@
 void uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini);
 
 /*
- * Replaces the standard page_alloc or obj_alloc functions for this zone
+ * Replaces the standard backend allocator for this zone.
  *
  * Arguments:
  *	zone   The zone whose backend allocator is being changed.
@@ -571,13 +618,13 @@
  * the underlying slab header.
  *
  * Arguments:
- * 	zone  The UMA_ZONE_REFCNT zone to which the item belongs.
+ *	zone  The UMA_ZONE_REFCNT zone to which the item belongs.
  *	item  The address of the item for which we want a refcnt.
  *
  * Returns:
- * 	A pointer to a u_int32_t reference counter.
+ *	A pointer to a uint32_t reference counter.
  */
-u_int32_t *uma_find_refcnt(uma_zone_t zone, void *item);
+uint32_t *uma_find_refcnt(uma_zone_t zone, void *item);
 
 /*
  * Used to determine if a fixed-size zone is exhausted.
@@ -586,12 +633,18 @@
  *	zone    The zone to check
  *
  * Returns:
- * 	Non-zero if zone is exhausted.
+ *	Non-zero if zone is exhausted.
  */
 int uma_zone_exhausted(uma_zone_t zone);
 int uma_zone_exhausted_nolock(uma_zone_t zone);
 
 /*
+ * Common UMA_ZONE_PCPU zones.
+ */
+extern uma_zone_t pcpu_zone_64;
+extern uma_zone_t pcpu_zone_ptr;
+
+/*
  * Exported statistics structures to be used by user space monitoring tools.
  * Statistics stream consists of a uma_stream_header, followed by a series of
  * alternative uma_type_header and uma_type_stat structures.
@@ -598,10 +651,10 @@
  */
 #define	UMA_STREAM_VERSION	0x00000001
 struct uma_stream_header {
-	u_int32_t	ush_version;	/* Stream format version. */
-	u_int32_t	ush_maxcpus;	/* Value of MAXCPU for stream. */
-	u_int32_t	ush_count;	/* Number of records. */
-	u_int32_t	_ush_pad;	/* Pad/reserved field. */
+	uint32_t	ush_version;	/* Stream format version. */
+	uint32_t	ush_maxcpus;	/* Value of MAXCPU for stream. */
+	uint32_t	ush_count;	/* Number of records. */
+	uint32_t	_ush_pad;	/* Pad/reserved field. */
 };
 
 #define	UTH_MAX_NAME	32
@@ -611,32 +664,35 @@
 	 * Static per-zone data, some extracted from the supporting keg.
 	 */
 	char		uth_name[UTH_MAX_NAME];
-	u_int32_t	uth_align;	/* Keg: alignment. */
-	u_int32_t	uth_size;	/* Keg: requested size of item. */
-	u_int32_t	uth_rsize;	/* Keg: real size of item. */
-	u_int32_t	uth_maxpages;	/* Keg: maximum number of pages. */
-	u_int32_t	uth_limit;	/* Keg: max items to allocate. */
+	uint32_t	uth_align;	/* Keg: alignment. */
+	uint32_t	uth_size;	/* Keg: requested size of item. */
+	uint32_t	uth_rsize;	/* Keg: real size of item. */
+	uint32_t	uth_maxpages;	/* Keg: maximum number of pages. */
+	uint32_t	uth_limit;	/* Keg: max items to allocate. */
 
 	/*
 	 * Current dynamic zone/keg-derived statistics.
 	 */
-	u_int32_t	uth_pages;	/* Keg: pages allocated. */
-	u_int32_t	uth_keg_free;	/* Keg: items free. */
-	u_int32_t	uth_zone_free;	/* Zone: items free. */
-	u_int32_t	uth_bucketsize;	/* Zone: desired bucket size. */
-	u_int32_t	uth_zone_flags;	/* Zone: flags. */
-	u_int64_t	uth_allocs;	/* Zone: number of allocations. */
-	u_int64_t	uth_frees;	/* Zone: number of frees. */
-	u_int64_t	uth_fails;	/* Zone: number of alloc failures. */
-	u_int64_t	uth_sleeps;	/* Zone: number of alloc sleeps. */
-	u_int64_t	_uth_reserved1[2];	/* Reserved. */
+	uint32_t	uth_pages;	/* Keg: pages allocated. */
+	uint32_t	uth_keg_free;	/* Keg: items free. */
+	uint32_t	uth_zone_free;	/* Zone: items free. */
+	uint32_t	uth_bucketsize;	/* Zone: desired bucket size. */
+	uint32_t	uth_zone_flags;	/* Zone: flags. */
+	uint64_t	uth_allocs;	/* Zone: number of allocations. */
+	uint64_t	uth_frees;	/* Zone: number of frees. */
+	uint64_t	uth_fails;	/* Zone: number of alloc failures. */
+	uint64_t	uth_sleeps;	/* Zone: number of alloc sleeps. */
+	uint64_t	_uth_reserved1[2];	/* Reserved. */
 };
 
 struct uma_percpu_stat {
-	u_int64_t	ups_allocs;	/* Cache: number of allocations. */
-	u_int64_t	ups_frees;	/* Cache: number of frees. */
-	u_int64_t	ups_cache_free;	/* Cache: free items in cache. */
-	u_int64_t	_ups_reserved[5];	/* Reserved. */
+	uint64_t	ups_allocs;	/* Cache: number of allocations. */
+	uint64_t	ups_frees;	/* Cache: number of frees. */
+	uint64_t	ups_cache_free;	/* Cache: free items in cache. */
+	uint64_t	_ups_reserved[5];	/* Reserved. */
 };
 
-#endif
+void uma_reclaim_wakeup(void);
+void uma_reclaim_worker(void *);
+
+#endif	/* _VM_UMA_H_ */

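The new uma_import/uma_release hooks and uma_zcache_create() let a subsystem put UMA's per-CPU bucket caching in front of its own backing store, with UMA allocating no slabs of its own. A hedged sketch of a consumer follows; the foo_* names and the malloc(9) backing store are illustrative assumptions, not part of this commit:

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/malloc.h>

#include <vm/uma.h>

struct foo {
	int	f_state;
};

static MALLOC_DEFINE(M_FOOCACHE, "foocache", "backing store for foo cache zone");
static uma_zone_t foo_zone;

/* Fill UMA's bucket with up to 'count' items; return how many were produced. */
static int
foo_import(void *arg __unused, void **store, int count, int flags)
{
	int i;

	for (i = 0; i < count; i++) {
		store[i] = malloc(sizeof(struct foo), M_FOOCACHE, flags);
		if (store[i] == NULL)
			break;
	}
	return (i);
}

/* Return cached items to the backing store when UMA drains its buckets. */
static void
foo_release(void *arg __unused, void **store, int count)
{
	int i;

	for (i = 0; i < count; i++)
		free(store[i], M_FOOCACHE);
}

/* Called from the subsystem's initialization path (e.g. a SYSINIT). */
static void
foo_zone_init(void *dummy __unused)
{

	foo_zone = uma_zcache_create("foo", sizeof(struct foo),
	    NULL, NULL, NULL, NULL, foo_import, foo_release, NULL, 0);
}
SYSINIT(foo_zone_setup, SI_SUB_KMEM, SI_ORDER_ANY, foo_zone_init, NULL);

Items are then obtained and returned through the usual uma_zalloc()/uma_zfree() entry points; UMA manages only the per-CPU and per-zone buckets, calling foo_import()/foo_release() whenever those caches need to be filled or drained.
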
Modified: trunk/sys/vm/uma_core.c
===================================================================
--- trunk/sys/vm/uma_core.c	2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/uma_core.c	2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,5 +1,6 @@
+/* $MidnightBSD$ */
 /*-
- * Copyright (c) 2002-2005, 2009 Jeffrey Roberson <jeff at FreeBSD.org>
+ * Copyright (c) 2002-2005, 2009, 2013 Jeffrey Roberson <jeff at FreeBSD.org>
  * Copyright (c) 2004, 2005 Bosko Milekic <bmilekic at FreeBSD.org>
  * Copyright (c) 2004-2006 Robert N. M. Watson
  * All rights reserved.
@@ -48,7 +49,7 @@
  */
 
 #include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/vm/uma_core.c 320440 2017-06-28 06:40:13Z alc $");
 
 /* I should really use ktr.. */
 /*
@@ -59,9 +60,12 @@
 
 #include "opt_ddb.h"
 #include "opt_param.h"
+#include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
+#include <sys/bitset.h>
+#include <sys/eventhandler.h>
 #include <sys/kernel.h>
 #include <sys/types.h>
 #include <sys/queue.h>
@@ -71,7 +75,9 @@
 #include <sys/sysctl.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
+#include <sys/rwlock.h>
 #include <sys/sbuf.h>
+#include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/vmmeter.h>
 
@@ -78,6 +84,7 @@
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
+#include <vm/vm_pageout.h>
 #include <vm/vm_param.h>
 #include <vm/vm_map.h>
 #include <vm/vm_kern.h>
@@ -88,6 +95,10 @@
 
 #include <ddb/ddb.h>
 
+#ifdef DEBUG_MEMGUARD
+#include <vm/memguard.h>
+#endif
+
 /*
  * This is the zone and keg from which all zones are spawned.  The idea is that
  * even the zone & keg heads are allocated from the allocator, so we use the
@@ -122,24 +133,32 @@
 /* Linked list of all kegs in the system */
 static LIST_HEAD(,uma_keg) uma_kegs = LIST_HEAD_INITIALIZER(uma_kegs);
 
-/* This mutex protects the keg list */
-static struct mtx uma_mtx;
+/* Linked list of all cache-only zones in the system */
+static LIST_HEAD(,uma_zone) uma_cachezones =
+    LIST_HEAD_INITIALIZER(uma_cachezones);
 
+/* This RW lock protects the keg list */
+static struct rwlock_padalign uma_rwlock;
+
 /* Linked list of boot time pages */
 static LIST_HEAD(,uma_slab) uma_boot_pages =
     LIST_HEAD_INITIALIZER(uma_boot_pages);
 
 /* This mutex protects the boot time pages list */
-static struct mtx uma_boot_pages_mtx;
+static struct mtx_padalign uma_boot_pages_mtx;
 
+static struct sx uma_drain_lock;
+
 /* Is the VM done starting up? */
 static int booted = 0;
 #define	UMA_STARTUP	1
 #define	UMA_STARTUP2	2
 
-/* Maximum number of allowed items-per-slab if the slab header is OFFPAGE */
-static u_int uma_max_ipers;
-static u_int uma_max_ipers_ref;
+/*
+ * Only mbuf clusters use ref zones.  Just provide enough references
+ * to support the one user.  New code should not use the ref facility.
+ */
+static const u_int uma_max_ipers_ref = PAGE_SIZE / MCLBYTES;
 
 /*
  * This is the handle used to schedule events that need to happen
@@ -159,9 +178,12 @@
 	uma_dtor dtor;
 	uma_init uminit;
 	uma_fini fini;
+	uma_import import;
+	uma_release release;
+	void *arg;
 	uma_keg_t keg;
 	int align;
-	u_int32_t flags;
+	uint32_t flags;
 };
 
 struct uma_kctor_args {
@@ -170,48 +192,49 @@
 	uma_init uminit;
 	uma_fini fini;
 	int align;
-	u_int32_t flags;
+	uint32_t flags;
 };
 
 struct uma_bucket_zone {
 	uma_zone_t	ubz_zone;
 	char		*ubz_name;
-	int		ubz_entries;
+	int		ubz_entries;	/* Number of items it can hold. */
+	int		ubz_maxsize;	/* Maximum allocation size per-item. */
 };
 
-#define	BUCKET_MAX	128
+/*
+ * Compute the actual number of bucket entries to pack them in power
+ * of two sizes for more efficient space utilization.
+ */
+#define	BUCKET_SIZE(n)						\
+    (((sizeof(void *) * (n)) - sizeof(struct uma_bucket)) / sizeof(void *))
 
+#define	BUCKET_MAX	BUCKET_SIZE(256)
+
 struct uma_bucket_zone bucket_zones[] = {
-	{ NULL, "16 Bucket", 16 },
-	{ NULL, "32 Bucket", 32 },
-	{ NULL, "64 Bucket", 64 },
-	{ NULL, "128 Bucket", 128 },
+	{ NULL, "4 Bucket", BUCKET_SIZE(4), 4096 },
+	{ NULL, "6 Bucket", BUCKET_SIZE(6), 3072 },
+	{ NULL, "8 Bucket", BUCKET_SIZE(8), 2048 },
+	{ NULL, "12 Bucket", BUCKET_SIZE(12), 1536 },
+	{ NULL, "16 Bucket", BUCKET_SIZE(16), 1024 },
+	{ NULL, "32 Bucket", BUCKET_SIZE(32), 512 },
+	{ NULL, "64 Bucket", BUCKET_SIZE(64), 256 },
+	{ NULL, "128 Bucket", BUCKET_SIZE(128), 128 },
+	{ NULL, "256 Bucket", BUCKET_SIZE(256), 64 },
 	{ NULL, NULL, 0}
 };
 
-#define	BUCKET_SHIFT	4
-#define	BUCKET_ZONES	((BUCKET_MAX >> BUCKET_SHIFT) + 1)
-
 /*
- * bucket_size[] maps requested bucket sizes to zones that allocate a bucket
- * of approximately the right size.
- */
-static uint8_t bucket_size[BUCKET_ZONES];
-
-/*
  * Flags and enumerations to be passed to internal functions.
  */
-enum zfreeskip { SKIP_NONE, SKIP_DTOR, SKIP_FINI };
+enum zfreeskip { SKIP_NONE = 0, SKIP_DTOR, SKIP_FINI };
 
-#define	ZFREE_STATFAIL	0x00000001	/* Update zone failure statistic. */
-#define	ZFREE_STATFREE	0x00000002	/* Update zone free statistic. */
-
 /* Prototypes.. */
 
-static void *obj_alloc(uma_zone_t, int, u_int8_t *, int);
-static void *page_alloc(uma_zone_t, int, u_int8_t *, int);
-static void *startup_alloc(uma_zone_t, int, u_int8_t *, int);
-static void page_free(void *, int, u_int8_t);
+static void *noobj_alloc(uma_zone_t, vm_size_t, uint8_t *, int);
+static void *page_alloc(uma_zone_t, vm_size_t, uint8_t *, int);
+static void *startup_alloc(uma_zone_t, vm_size_t, uint8_t *, int);
+static void page_free(void *, vm_size_t, uint8_t);
 static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int);
 static void cache_drain(uma_zone_t);
 static void bucket_drain(uma_zone_t, uma_bucket_t);
@@ -231,21 +254,22 @@
 static void uma_timeout(void *);
 static void uma_startup3(void);
 static void *zone_alloc_item(uma_zone_t, void *, int);
-static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip,
-    int);
+static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip);
 static void bucket_enable(void);
 static void bucket_init(void);
-static uma_bucket_t bucket_alloc(int, int);
-static void bucket_free(uma_bucket_t);
+static uma_bucket_t bucket_alloc(uma_zone_t zone, void *, int);
+static void bucket_free(uma_zone_t zone, uma_bucket_t, void *);
 static void bucket_zone_drain(void);
-static int zone_alloc_bucket(uma_zone_t zone, int flags);
+static uma_bucket_t zone_alloc_bucket(uma_zone_t zone, void *, int flags);
 static uma_slab_t zone_fetch_slab(uma_zone_t zone, uma_keg_t last, int flags);
 static uma_slab_t zone_fetch_slab_multi(uma_zone_t zone, uma_keg_t last, int flags);
-static void *slab_alloc_item(uma_zone_t zone, uma_slab_t slab);
+static void *slab_alloc_item(uma_keg_t keg, uma_slab_t slab);
+static void slab_free_item(uma_keg_t keg, uma_slab_t slab, void *item);
 static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit,
-    uma_fini fini, int align, u_int32_t flags);
-static inline void zone_relock(uma_zone_t zone, uma_keg_t keg);
-static inline void keg_relock(uma_keg_t keg, uma_zone_t zone);
+    uma_fini fini, int align, uint32_t flags);
+static int zone_import(uma_zone_t zone, void **bucket, int max, int flags);
+static void zone_release(uma_zone_t zone, void **bucket, int cnt);
+static void uma_zero_item(void *item, uma_zone_t zone);
 
 void uma_print_zone(uma_zone_t);
 void uma_print_stats(void);
@@ -260,10 +284,14 @@
 SYSCTL_PROC(_vm, OID_AUTO, zone_stats, CTLFLAG_RD|CTLTYPE_STRUCT,
     0, 0, sysctl_vm_zone_stats, "s,struct uma_type_header", "Zone Stats");
 
+static int zone_warnings = 1;
+TUNABLE_INT("vm.zone_warnings", &zone_warnings);
+SYSCTL_INT(_vm, OID_AUTO, zone_warnings, CTLFLAG_RW, &zone_warnings, 0,
+    "Warn when UMA zones becomes full");
+
 /*
  * This routine checks to see whether or not it's safe to enable buckets.
  */
-
 static void
 bucket_enable(void)
 {
@@ -274,27 +302,20 @@
  * Initialize bucket_zones, the array of zones of buckets of various sizes.
  *
  * For each zone, calculate the memory required for each bucket, consisting
- * of the header and an array of pointers.  Initialize bucket_size[] to point
- * the range of appropriate bucket sizes at the zone.
+ * of the header and an array of pointers.
  */
 static void
 bucket_init(void)
 {
 	struct uma_bucket_zone *ubz;
-	int i;
-	int j;
+	int size;
 
-	for (i = 0, j = 0; bucket_zones[j].ubz_entries != 0; j++) {
-		int size;
-
-		ubz = &bucket_zones[j];
+	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) {
 		size = roundup(sizeof(struct uma_bucket), sizeof(void *));
 		size += sizeof(void *) * ubz->ubz_entries;
 		ubz->ubz_zone = uma_zcreate(ubz->ubz_name, size,
 		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
-		    UMA_ZFLAG_INTERNAL | UMA_ZFLAG_BUCKET);
-		for (; i <= ubz->ubz_entries; i += (1 << BUCKET_SHIFT))
-			bucket_size[i >> BUCKET_SHIFT] = j;
+		    UMA_ZONE_MTXCLASS | UMA_ZFLAG_BUCKET);
 	}
 }
 
@@ -305,14 +326,33 @@
 static struct uma_bucket_zone *
 bucket_zone_lookup(int entries)
 {
-	int idx;
+	struct uma_bucket_zone *ubz;
 
-	idx = howmany(entries, 1 << BUCKET_SHIFT);
-	return (&bucket_zones[bucket_size[idx]]);
+	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
+		if (ubz->ubz_entries >= entries)
+			return (ubz);
+	ubz--;
+	return (ubz);
 }
 
+static int
+bucket_select(int size)
+{
+	struct uma_bucket_zone *ubz;
+
+	ubz = &bucket_zones[0];
+	if (size > ubz->ubz_maxsize)
+		return MAX((ubz->ubz_maxsize * ubz->ubz_entries) / size, 1);
+
+	for (; ubz->ubz_entries != 0; ubz++)
+		if (ubz->ubz_maxsize < size)
+			break;
+	ubz--;
+	return (ubz->ubz_entries);
+}
+
 static uma_bucket_t
-bucket_alloc(int entries, int bflags)
+bucket_alloc(uma_zone_t zone, void *udata, int flags)
 {
 	struct uma_bucket_zone *ubz;
 	uma_bucket_t bucket;
@@ -325,9 +365,29 @@
 	 */
 	if (bucketdisable)
 		return (NULL);
-
-	ubz = bucket_zone_lookup(entries);
-	bucket = zone_alloc_item(ubz->ubz_zone, NULL, bflags);
+	/*
+	 * To limit bucket recursion we store the original zone flags
+	 * in a cookie passed via zalloc_arg/zfree_arg.  This allows the
+	 * NOVM flag to persist even through deep recursions.  We also
+	 * store ZFLAG_BUCKET once we have recursed attempting to allocate
+	 * a bucket for a bucket zone so we do not allow infinite bucket
+	 * recursion.  This cookie will even persist to frees of unused
+	 * buckets via the allocation path or bucket allocations in the
+	 * free path.
+	 */
+	if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0)
+		udata = (void *)(uintptr_t)zone->uz_flags;
+	else {
+		if ((uintptr_t)udata & UMA_ZFLAG_BUCKET)
+			return (NULL);
+		udata = (void *)((uintptr_t)udata | UMA_ZFLAG_BUCKET);
+	}
+	if ((uintptr_t)udata & UMA_ZFLAG_CACHEONLY)
+		flags |= M_NOVM;
+	ubz = bucket_zone_lookup(zone->uz_count);
+	if (ubz->ubz_zone == zone && (ubz + 1)->ubz_entries != 0)
+		ubz++;
+	bucket = uma_zalloc_arg(ubz->ubz_zone, udata, flags);
 	if (bucket) {
 #ifdef INVARIANTS
 		bzero(bucket->ub_bucket, sizeof(void *) * ubz->ubz_entries);
@@ -340,13 +400,16 @@
 }
 
 static void
-bucket_free(uma_bucket_t bucket)
+bucket_free(uma_zone_t zone, uma_bucket_t bucket, void *udata)
 {
 	struct uma_bucket_zone *ubz;
 
+	KASSERT(bucket->ub_cnt == 0,
+	    ("bucket_free: Freeing a non free bucket."));
+	if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0)
+		udata = (void *)(uintptr_t)zone->uz_flags;
 	ubz = bucket_zone_lookup(bucket->ub_entries);
-	zone_free_item(ubz->ubz_zone, bucket, NULL, SKIP_NONE,
-	    ZFREE_STATFREE);
+	uma_zfree_arg(ubz->ubz_zone, bucket, udata);
 }
 
 static void
@@ -358,11 +421,16 @@
 		zone_drain(ubz->ubz_zone);
 }
 
-static inline uma_keg_t
-zone_first_keg(uma_zone_t zone)
+static void
+zone_log_warning(uma_zone_t zone)
 {
+	static const struct timeval warninterval = { 300, 0 };
 
-	return (LIST_FIRST(&zone->uz_kegs)->kl_keg);
+	if (!zone_warnings || zone->uz_warning == NULL)
+		return;
+
+	if (ratecheck(&zone->uz_ratecheck, &warninterval))
+		printf("[zone: %s] %s\n", zone->uz_name, zone->uz_warning);
 }
 
 static void
@@ -437,7 +505,7 @@
 
 			KEG_UNLOCK(keg);
 			hash_free(&oldhash);
-			KEG_LOCK(keg);
+			return;
 		}
 	}
 	KEG_UNLOCK(keg);
@@ -549,8 +617,7 @@
 	if (hash->uh_slab_hash == NULL)
 		return;
 	if (hash->uh_hashsize == UMA_HASH_SIZE_INIT)
-		zone_free_item(hashzone,
-		    hash->uh_slab_hash, NULL, SKIP_NONE, ZFREE_STATFREE);
+		zone_free_item(hashzone, hash->uh_slab_hash, NULL, SKIP_NONE);
 	else
 		free(hash->uh_slab_hash, M_UMAHASH);
 }
@@ -569,21 +636,16 @@
 static void
 bucket_drain(uma_zone_t zone, uma_bucket_t bucket)
 {
-	void *item;
+	int i;
 
 	if (bucket == NULL)
 		return;
 
-	while (bucket->ub_cnt > 0)  {
-		bucket->ub_cnt--;
-		item = bucket->ub_bucket[bucket->ub_cnt];
-#ifdef INVARIANTS
-		bucket->ub_bucket[bucket->ub_cnt] = NULL;
-		KASSERT(item != NULL,
-		    ("bucket_drain: botched ptr, item is NULL"));
-#endif
-		zone_free_item(zone, item, NULL, SKIP_DTOR, 0);
-	}
+	if (zone->uz_fini)
+		for (i = 0; i < bucket->ub_cnt; i++) 
+			zone->uz_fini(bucket->ub_bucket[i], zone->uz_size);
+	zone->uz_release(zone->uz_arg, bucket->ub_bucket, bucket->ub_cnt);
+	bucket->ub_cnt = 0;
 }
 
 /*
@@ -622,9 +684,9 @@
 		bucket_drain(zone, cache->uc_allocbucket);
 		bucket_drain(zone, cache->uc_freebucket);
 		if (cache->uc_allocbucket != NULL)
-			bucket_free(cache->uc_allocbucket);
+			bucket_free(zone, cache->uc_allocbucket, NULL);
 		if (cache->uc_freebucket != NULL)
-			bucket_free(cache->uc_freebucket);
+			bucket_free(zone, cache->uc_freebucket, NULL);
 		cache->uc_allocbucket = cache->uc_freebucket = NULL;
 	}
 	ZONE_LOCK(zone);
@@ -632,7 +694,91 @@
 	ZONE_UNLOCK(zone);
 }
 
+static void
+cache_shrink(uma_zone_t zone)
+{
+
+	if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
+		return;
+
+	ZONE_LOCK(zone);
+	zone->uz_count = (zone->uz_count_min + zone->uz_count) / 2;
+	ZONE_UNLOCK(zone);
+}
+
+static void
+cache_drain_safe_cpu(uma_zone_t zone)
+{
+	uma_cache_t cache;
+	uma_bucket_t b1, b2;
+
+	if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
+		return;
+
+	b1 = b2 = NULL;
+	ZONE_LOCK(zone);
+	critical_enter();
+	cache = &zone->uz_cpu[curcpu];
+	if (cache->uc_allocbucket) {
+		if (cache->uc_allocbucket->ub_cnt != 0)
+			LIST_INSERT_HEAD(&zone->uz_buckets,
+			    cache->uc_allocbucket, ub_link);
+		else
+			b1 = cache->uc_allocbucket;
+		cache->uc_allocbucket = NULL;
+	}
+	if (cache->uc_freebucket) {
+		if (cache->uc_freebucket->ub_cnt != 0)
+			LIST_INSERT_HEAD(&zone->uz_buckets,
+			    cache->uc_freebucket, ub_link);
+		else
+			b2 = cache->uc_freebucket;
+		cache->uc_freebucket = NULL;
+	}
+	critical_exit();
+	ZONE_UNLOCK(zone);
+	if (b1)
+		bucket_free(zone, b1, NULL);
+	if (b2)
+		bucket_free(zone, b2, NULL);
+}
+
 /*
+ * Safely drain per-CPU caches of a zone(s) to alloc bucket.
+ * This is an expensive call because it needs to bind to all CPUs
+ * one by one and enter a critical section on each of them in order
+ * to safely access their cache buckets.
+ * Zone lock must not be held when calling this function.
+ */
+static void
+cache_drain_safe(uma_zone_t zone)
+{
+	int cpu;
+
+	/*
+	 * Polite bucket size shrinking was not enough; shrink aggressively.
+	 */
+	if (zone)
+		cache_shrink(zone);
+	else
+		zone_foreach(cache_shrink);
+
+	CPU_FOREACH(cpu) {
+		thread_lock(curthread);
+		sched_bind(curthread, cpu);
+		thread_unlock(curthread);
+
+		if (zone)
+			cache_drain_safe_cpu(zone);
+		else
+			zone_foreach(cache_drain_safe_cpu);
+	}
+	thread_lock(curthread);
+	sched_unbind(curthread);
+	thread_unlock(curthread);
+}
+
+/*
  * Drain the cached buckets from a zone.  Expects a locked zone on entry.
  */
 static void
@@ -644,19 +790,44 @@
 	 * Drain the bucket queues and free the buckets, we just keep two per
 	 * cpu (alloc/free).
 	 */
-	while ((bucket = LIST_FIRST(&zone->uz_full_bucket)) != NULL) {
+	while ((bucket = LIST_FIRST(&zone->uz_buckets)) != NULL) {
 		LIST_REMOVE(bucket, ub_link);
 		ZONE_UNLOCK(zone);
 		bucket_drain(zone, bucket);
-		bucket_free(bucket);
+		bucket_free(zone, bucket, NULL);
 		ZONE_LOCK(zone);
 	}
 
-	/* Now we do the free queue.. */
-	while ((bucket = LIST_FIRST(&zone->uz_free_bucket)) != NULL) {
-		LIST_REMOVE(bucket, ub_link);
-		bucket_free(bucket);
+	/*
+	 * Shrink further bucket sizes.  The price of a single zone lock
+	 * collision is probably lower than the price of a global cache drain.
+	 */
+	if (zone->uz_count > zone->uz_count_min)
+		zone->uz_count--;
+}
+
+static void
+keg_free_slab(uma_keg_t keg, uma_slab_t slab, int start)
+{
+	uint8_t *mem;
+	int i;
+	uint8_t flags;
+
+	mem = slab->us_data;
+	flags = slab->us_flags;
+	i = start;
+	if (keg->uk_fini != NULL) {
+		for (i--; i > -1; i--)
+			keg->uk_fini(slab->us_data + (keg->uk_rsize * i),
+			    keg->uk_size);
 	}
+	if (keg->uk_flags & UMA_ZONE_OFFPAGE)
+		zone_free_item(keg->uk_slabzone, slab, NULL, SKIP_NONE);
+#ifdef UMA_DEBUG
+	printf("%s: Returning %d bytes.\n", keg->uk_name,
+	    PAGE_SIZE * keg->uk_ppera);
+#endif
+	keg->uk_freef(mem, PAGE_SIZE * keg->uk_ppera, flags);
 }
 
 /*
@@ -671,9 +842,6 @@
 	struct slabhead freeslabs = { 0 };
 	uma_slab_t slab;
 	uma_slab_t n;
-	u_int8_t flags;
-	u_int8_t *mem;
-	int i;
 
 	/*
 	 * We don't want to take pages from statically allocated kegs at this
@@ -715,35 +883,7 @@
 
 	while ((slab = SLIST_FIRST(&freeslabs)) != NULL) {
 		SLIST_REMOVE(&freeslabs, slab, uma_slab, us_hlink);
-		if (keg->uk_fini)
-			for (i = 0; i < keg->uk_ipers; i++)
-				keg->uk_fini(
-				    slab->us_data + (keg->uk_rsize * i),
-				    keg->uk_size);
-		flags = slab->us_flags;
-		mem = slab->us_data;
-
-		if (keg->uk_flags & UMA_ZONE_VTOSLAB) {
-			vm_object_t obj;
-
-			if (flags & UMA_SLAB_KMEM)
-				obj = kmem_object;
-			else if (flags & UMA_SLAB_KERNEL)
-				obj = kernel_object;
-			else
-				obj = NULL;
-			for (i = 0; i < keg->uk_ppera; i++)
-				vsetobj((vm_offset_t)mem + (i * PAGE_SIZE),
-				    obj);
-		}
-		if (keg->uk_flags & UMA_ZONE_OFFPAGE)
-			zone_free_item(keg->uk_slabzone, slab, NULL,
-			    SKIP_NONE, ZFREE_STATFREE);
-#ifdef UMA_DEBUG
-		printf("%s: Returning %d bytes.\n",
-		    keg->uk_name, UMA_SLAB_SIZE * keg->uk_ppera);
-#endif
-		keg->uk_freef(mem, UMA_SLAB_SIZE * keg->uk_ppera, flags);
+		keg_free_slab(keg, slab, keg->uk_ipers);
 	}
 }
 
@@ -761,9 +901,7 @@
 	while (zone->uz_flags & UMA_ZFLAG_DRAINING) {
 		if (waitok == M_NOWAIT)
 			goto out;
-		mtx_unlock(&uma_mtx);
-		msleep(zone, zone->uz_lock, PVM, "zonedrain", 1);
-		mtx_lock(&uma_mtx);
+		msleep(zone, zone->uz_lockptr, PVM, "zonedrain", 1);
 	}
 	zone->uz_flags |= UMA_ZFLAG_DRAINING;
 	bucket_cache_drain(zone);
@@ -770,7 +908,7 @@
 	ZONE_UNLOCK(zone);
 	/*
 	 * The DRAINING flag protects us from being freed while
-	 * we're running.  Normally the uma_mtx would protect us but we
+	 * we're running.  Normally the uma_rwlock would protect us but we
 	 * must be able to release and acquire the right lock for each keg.
 	 */
 	zone_foreach_keg(zone, &keg_drain);
@@ -804,15 +942,16 @@
 	uma_slabrefcnt_t slabref;
 	uma_alloc allocf;
 	uma_slab_t slab;
-	u_int8_t *mem;
-	u_int8_t flags;
+	uint8_t *mem;
+	uint8_t flags;
 	int i;
 
 	mtx_assert(&keg->uk_lock, MA_OWNED);
 	slab = NULL;
+	mem = NULL;
 
 #ifdef UMA_DEBUG
-	printf("slab_zalloc:  Allocating a new slab for %s\n", keg->uk_name);
+	printf("alloc_slab:  Allocating a new slab for %s\n", keg->uk_name);
 #endif
 	allocf = keg->uk_allocf;
 	KEG_UNLOCK(keg);
@@ -819,10 +958,8 @@
 
 	if (keg->uk_flags & UMA_ZONE_OFFPAGE) {
 		slab = zone_alloc_item(keg->uk_slabzone, NULL, wait);
-		if (slab == NULL) {
-			KEG_LOCK(keg);
-			return NULL;
-		}
+		if (slab == NULL)
+			goto out;
 	}
 
 	/*
@@ -841,13 +978,12 @@
 		wait |= M_NODUMP;
 
 	/* zone is passed for legacy reasons. */
-	mem = allocf(zone, keg->uk_ppera * UMA_SLAB_SIZE, &flags, wait);
+	mem = allocf(zone, keg->uk_ppera * PAGE_SIZE, &flags, wait);
 	if (mem == NULL) {
 		if (keg->uk_flags & UMA_ZONE_OFFPAGE)
-			zone_free_item(keg->uk_slabzone, slab, NULL,
-			    SKIP_NONE, ZFREE_STATFREE);
-		KEG_LOCK(keg);
-		return (NULL);
+			zone_free_item(keg->uk_slabzone, slab, NULL, SKIP_NONE);
+		slab = NULL;
+		goto out;
 	}
 
 	/* Point the slab into the allocated memory */
@@ -861,18 +997,15 @@
 	slab->us_keg = keg;
 	slab->us_data = mem;
 	slab->us_freecount = keg->uk_ipers;
-	slab->us_firstfree = 0;
 	slab->us_flags = flags;
-
+	BIT_FILL(SLAB_SETSIZE, &slab->us_free);
+#ifdef INVARIANTS
+	BIT_ZERO(SLAB_SETSIZE, &slab->us_debugfree);
+#endif
 	if (keg->uk_flags & UMA_ZONE_REFCNT) {
 		slabref = (uma_slabrefcnt_t)slab;
-		for (i = 0; i < keg->uk_ipers; i++) {
-			slabref->us_freelist[i].us_refcnt = 0;
-			slabref->us_freelist[i].us_item = i+1;
-		}
-	} else {
 		for (i = 0; i < keg->uk_ipers; i++)
-			slab->us_freelist[i].us_item = i+1;
+			slabref->us_refcnt[i] = 0;
 	}
 
 	if (keg->uk_init != NULL) {
@@ -881,41 +1014,21 @@
 			    keg->uk_size, wait) != 0)
 				break;
 		if (i != keg->uk_ipers) {
-			if (keg->uk_fini != NULL) {
-				for (i--; i > -1; i--)
-					keg->uk_fini(slab->us_data +
-					    (keg->uk_rsize * i),
-					    keg->uk_size);
-			}
-			if (keg->uk_flags & UMA_ZONE_VTOSLAB) {
-				vm_object_t obj;
-
-				if (flags & UMA_SLAB_KMEM)
-					obj = kmem_object;
-				else if (flags & UMA_SLAB_KERNEL)
-					obj = kernel_object;
-				else
-					obj = NULL;
-				for (i = 0; i < keg->uk_ppera; i++)
-					vsetobj((vm_offset_t)mem +
-					    (i * PAGE_SIZE), obj);
-			}
-			if (keg->uk_flags & UMA_ZONE_OFFPAGE)
-				zone_free_item(keg->uk_slabzone, slab,
-				    NULL, SKIP_NONE, ZFREE_STATFREE);
-			keg->uk_freef(mem, UMA_SLAB_SIZE * keg->uk_ppera,
-			    flags);
-			KEG_LOCK(keg);
-			return (NULL);
+			keg_free_slab(keg, slab, i);
+			slab = NULL;
+			goto out;
 		}
 	}
+out:
 	KEG_LOCK(keg);
 
-	if (keg->uk_flags & UMA_ZONE_HASH)
-		UMA_HASH_INSERT(&keg->uk_hash, slab, mem);
+	if (slab != NULL) {
+		if (keg->uk_flags & UMA_ZONE_HASH)
+			UMA_HASH_INSERT(&keg->uk_hash, slab, mem);
 
-	keg->uk_pages += keg->uk_ppera;
-	keg->uk_free += keg->uk_ipers;
+		keg->uk_pages += keg->uk_ppera;
+		keg->uk_free += keg->uk_ipers;
+	}
 
 	return (slab);
 }
@@ -926,7 +1039,7 @@
  * the VM is ready.
  */
 static void *
-startup_alloc(uma_zone_t zone, int bytes, u_int8_t *pflag, int wait)
+startup_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag, int wait)
 {
 	uma_keg_t keg;
 	uma_slab_t tmps;
@@ -986,12 +1099,12 @@
  *	NULL if M_NOWAIT is set.
  */
 static void *
-page_alloc(uma_zone_t zone, int bytes, u_int8_t *pflag, int wait)
+page_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag, int wait)
 {
 	void *p;	/* Returned page */
 
 	*pflag = UMA_SLAB_KMEM;
-	p = (void *) kmem_malloc(kmem_map, bytes, wait);
+	p = (void *) kmem_malloc(kmem_arena, bytes, wait);
 
 	return (p);
 }
@@ -1008,50 +1121,53 @@
  *	NULL if M_NOWAIT is set.
  */
 static void *
-obj_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
+noobj_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait)
 {
-	vm_object_t object;
+	TAILQ_HEAD(, vm_page) alloctail;
+	u_long npages;
 	vm_offset_t retkva, zkva;
-	vm_page_t p;
-	int pages, startpages;
+	vm_page_t p, p_next;
 	uma_keg_t keg;
 
+	TAILQ_INIT(&alloctail);
 	keg = zone_first_keg(zone);
-	object = keg->uk_obj;
-	retkva = 0;
 
-	/*
-	 * This looks a little weird since we're getting one page at a time.
-	 */
-	VM_OBJECT_LOCK(object);
-	p = TAILQ_LAST(&object->memq, pglist);
-	pages = p != NULL ? p->pindex + 1 : 0;
-	startpages = pages;
-	zkva = keg->uk_kva + pages * PAGE_SIZE;
-	for (; bytes > 0; bytes -= PAGE_SIZE) {
-		p = vm_page_alloc(object, pages,
-		    VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED);
-		if (p == NULL) {
-			if (pages != startpages)
-				pmap_qremove(retkva, pages - startpages);
-			while (pages != startpages) {
-				pages--;
-				p = TAILQ_LAST(&object->memq, pglist);
-				vm_page_unwire(p, 0);
-				vm_page_free(p);
-			}
-			retkva = 0;
-			goto done;
+	npages = howmany(bytes, PAGE_SIZE);
+	while (npages > 0) {
+		p = vm_page_alloc(NULL, 0, VM_ALLOC_INTERRUPT |
+		    VM_ALLOC_WIRED | VM_ALLOC_NOOBJ);
+		if (p != NULL) {
+			/*
+			 * Since the page does not belong to an object, its
+			 * listq is unused.
+			 */
+			TAILQ_INSERT_TAIL(&alloctail, p, listq);
+			npages--;
+			continue;
 		}
+		if (wait & M_WAITOK) {
+			VM_WAIT;
+			continue;
+		}
+
+		/*
+		 * Page allocation failed, free intermediate pages and
+		 * exit.
+		 */
+		TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) {
+			vm_page_unwire(p, 0);
+			vm_page_free(p); 
+		}
+		return (NULL);
+	}
+	*flags = UMA_SLAB_PRIV;
+	zkva = keg->uk_kva +
+	    atomic_fetchadd_long(&keg->uk_offset, round_page(bytes));
+	retkva = zkva;
+	TAILQ_FOREACH(p, &alloctail, listq) {
 		pmap_qenter(zkva, &p, 1);
-		if (retkva == 0)
-			retkva = zkva;
 		zkva += PAGE_SIZE;
-		pages += 1;
 	}
-done:
-	VM_OBJECT_UNLOCK(object);
-	*flags = UMA_SLAB_PRIV;
 
 	return ((void *)retkva);
 }
@@ -1068,18 +1184,18 @@
  *	Nothing
  */
 static void
-page_free(void *mem, int size, u_int8_t flags)
+page_free(void *mem, vm_size_t size, uint8_t flags)
 {
-	vm_map_t map;
+	struct vmem *vmem;
 
 	if (flags & UMA_SLAB_KMEM)
-		map = kmem_map;
+		vmem = kmem_arena;
 	else if (flags & UMA_SLAB_KERNEL)
-		map = kernel_map;
+		vmem = kernel_arena;
 	else
 		panic("UMA: page_free used with invalid flags %d", flags);
 
-	kmem_free(map, (vm_offset_t)mem, size);
+	kmem_free(vmem, (vm_offset_t)mem, size);
 }
 
 /*
@@ -1110,47 +1226,74 @@
 	u_int memused;
 	u_int wastedspace;
 	u_int shsize;
+	u_int slabsize;
 
-	KASSERT(keg != NULL, ("Keg is null in keg_small_init"));
+	if (keg->uk_flags & UMA_ZONE_PCPU) {
+		u_int ncpus = mp_ncpus ? mp_ncpus : MAXCPU;
+
+		slabsize = sizeof(struct pcpu);
+		keg->uk_ppera = howmany(ncpus * sizeof(struct pcpu),
+		    PAGE_SIZE);
+	} else {
+		slabsize = UMA_SLAB_SIZE;
+		keg->uk_ppera = 1;
+	}
+
+	/*
+	 * Calculate the size of each allocation (rsize) according to
+	 * alignment.  If the requested size is smaller than we have
+	 * allocation bits for we round it up.
+	 */
 	rsize = keg->uk_size;
-
-	if (rsize < UMA_SMALLEST_UNIT)
-		rsize = UMA_SMALLEST_UNIT;
+	if (rsize < slabsize / SLAB_SETSIZE)
+		rsize = slabsize / SLAB_SETSIZE;
 	if (rsize & keg->uk_align)
 		rsize = (rsize & ~keg->uk_align) + (keg->uk_align + 1);
-
 	keg->uk_rsize = rsize;
-	keg->uk_ppera = 1;
 
-	if (keg->uk_flags & UMA_ZONE_REFCNT) {
-		rsize += UMA_FRITMREF_SZ;	/* linkage & refcnt */
-		shsize = sizeof(struct uma_slab_refcnt);
-	} else {
-		rsize += UMA_FRITM_SZ;	/* Account for linkage */
+	KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0 ||
+	    keg->uk_rsize < sizeof(struct pcpu),
+	    ("%s: size %u too large", __func__, keg->uk_rsize));
+
+	if (keg->uk_flags & UMA_ZONE_REFCNT)
+		rsize += sizeof(uint32_t);
+
+	if (keg->uk_flags & UMA_ZONE_OFFPAGE)
+		shsize = 0;
+	else 
 		shsize = sizeof(struct uma_slab);
-	}
 
-	keg->uk_ipers = (UMA_SLAB_SIZE - shsize) / rsize;
-	KASSERT(keg->uk_ipers != 0, ("keg_small_init: ipers is 0"));
+	keg->uk_ipers = (slabsize - shsize) / rsize;
+	KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_SETSIZE,
+	    ("%s: keg->uk_ipers %u", __func__, keg->uk_ipers));
+
 	memused = keg->uk_ipers * rsize + shsize;
-	wastedspace = UMA_SLAB_SIZE - memused;
+	wastedspace = slabsize - memused;
 
 	/*
 	 * We can't do OFFPAGE if we're internal or if we've been
 	 * asked to not go to the VM for buckets.  If we do this we
-	 * may end up going to the VM (kmem_map) for slabs which we
-	 * do not want to do if we're UMA_ZFLAG_CACHEONLY as a
-	 * result of UMA_ZONE_VM, which clearly forbids it.
+	 * may end up going to the VM  for slabs which we do not
+	 * want to do if we're UMA_ZFLAG_CACHEONLY as a result
+	 * of UMA_ZONE_VM, which clearly forbids it.
 	 */
 	if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) ||
 	    (keg->uk_flags & UMA_ZFLAG_CACHEONLY))
 		return;
 
-	if ((wastedspace >= UMA_MAX_WASTE) &&
-	    (keg->uk_ipers < (UMA_SLAB_SIZE / keg->uk_rsize))) {
-		keg->uk_ipers = UMA_SLAB_SIZE / keg->uk_rsize;
-		KASSERT(keg->uk_ipers <= 255,
-		    ("keg_small_init: keg->uk_ipers too high!"));
+	/*
+	 * See if using an OFFPAGE slab will limit our waste.  Only do
+	 * this if it permits more items per-slab.
+	 *
+	 * XXX We could try growing slabsize to limit max waste as well.
+	 * Historically this was not done because the VM could not
+	 * efficiently handle contiguous allocations.
+	 */
+	if ((wastedspace >= slabsize / UMA_MAX_WASTE) &&
+	    (keg->uk_ipers < (slabsize / keg->uk_rsize))) {
+		keg->uk_ipers = slabsize / keg->uk_rsize;
+		KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_SETSIZE,
+		    ("%s: keg->uk_ipers %u", __func__, keg->uk_ipers));
 #ifdef UMA_DEBUG
 		printf("UMA decided we need offpage slab headers for "
 		    "keg: %s, calculated wastedspace = %d, "
@@ -1157,13 +1300,15 @@
 		    "maximum wasted space allowed = %d, "
 		    "calculated ipers = %d, "
 		    "new wasted space = %d\n", keg->uk_name, wastedspace,
-		    UMA_MAX_WASTE, keg->uk_ipers,
-		    UMA_SLAB_SIZE - keg->uk_ipers * keg->uk_rsize);
+		    slabsize / UMA_MAX_WASTE, keg->uk_ipers,
+		    slabsize - keg->uk_ipers * keg->uk_rsize);
 #endif
 		keg->uk_flags |= UMA_ZONE_OFFPAGE;
-		if ((keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
-			keg->uk_flags |= UMA_ZONE_HASH;
 	}
+
+	if ((keg->uk_flags & UMA_ZONE_OFFPAGE) &&
+	    (keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
+		keg->uk_flags |= UMA_ZONE_HASH;
 }
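
With the change above, keg_small_init computes ipers from the real slab size (a UMA_ZONE_PCPU keg sizes against sizeof(struct pcpu)) and only switches to off-page headers when the waste exceeds slabsize / UMA_MAX_WASTE and the move actually gains items. The arithmetic, as a standalone sketch with made-up sizes:

#include <stdio.h>

#define SLABSIZE	4096u	/* stand-in for UMA_SLAB_SIZE */
#define HDRSIZE		32u	/* stand-in for sizeof(struct uma_slab) */
#define MAX_WASTE	10u	/* stand-in for UMA_MAX_WASTE */

int
main(void)
{
	unsigned rsize = 585;	/* item size after alignment (made up) */
	unsigned ipers, wasted;

	/* Inline header: the header eats into the space available for items. */
	ipers = (SLABSIZE - HDRSIZE) / rsize;
	wasted = SLABSIZE - (ipers * rsize + HDRSIZE);
	printf("inline header:   %u items, %u bytes wasted\n", ipers, wasted);

	/* Off-page header: only worth it if the waste is big and we gain items. */
	if (wasted >= SLABSIZE / MAX_WASTE && ipers < SLABSIZE / rsize) {
		ipers = SLABSIZE / rsize;
		printf("off-page header: %u items, %u bytes wasted\n",
		    ipers, SLABSIZE - ipers * rsize);
	}
	return (0);
}
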
 
 /*
@@ -1180,19 +1325,15 @@
 static void
 keg_large_init(uma_keg_t keg)
 {
-	int pages;
+	u_int shsize;
 
 	KASSERT(keg != NULL, ("Keg is null in keg_large_init"));
 	KASSERT((keg->uk_flags & UMA_ZFLAG_CACHEONLY) == 0,
 	    ("keg_large_init: Cannot large-init a UMA_ZFLAG_CACHEONLY keg"));
+	KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0,
+	    ("%s: Cannot large-init a UMA_ZONE_PCPU keg", __func__));
 
-	pages = keg->uk_size / UMA_SLAB_SIZE;
-
-	/* Account for remainder */
-	if ((pages * UMA_SLAB_SIZE) < keg->uk_size)
-		pages++;
-
-	keg->uk_ppera = pages;
+	keg->uk_ppera = howmany(keg->uk_size, PAGE_SIZE);
 	keg->uk_ipers = 1;
 	keg->uk_rsize = keg->uk_size;
 
@@ -1200,8 +1341,21 @@
 	if (keg->uk_flags & UMA_ZFLAG_INTERNAL)
 		return;
 
-	keg->uk_flags |= UMA_ZONE_OFFPAGE;
-	if ((keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
+	/* Check whether we have enough space to not do OFFPAGE. */
+	if ((keg->uk_flags & UMA_ZONE_OFFPAGE) == 0) {
+		shsize = sizeof(struct uma_slab);
+		if (keg->uk_flags & UMA_ZONE_REFCNT)
+			shsize += keg->uk_ipers * sizeof(uint32_t);
+		if (shsize & UMA_ALIGN_PTR)
+			shsize = (shsize & ~UMA_ALIGN_PTR) +
+			    (UMA_ALIGN_PTR + 1);
+
+		if ((PAGE_SIZE * keg->uk_ppera) - keg->uk_rsize < shsize)
+			keg->uk_flags |= UMA_ZONE_OFFPAGE;
+	}
+
+	if ((keg->uk_flags & UMA_ZONE_OFFPAGE) &&
+	    (keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
 		keg->uk_flags |= UMA_ZONE_HASH;
 }
 
@@ -1213,6 +1367,9 @@
 	int pages;
 	int rsize;
 
+	KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0,
+	    ("%s: Cannot cachespread-init a UMA_ZONE_PCPU keg", __func__));
+
 	alignsize = keg->uk_align + 1;
 	rsize = keg->uk_size;
 	/*
@@ -1232,8 +1389,8 @@
 	keg->uk_ppera = pages;
 	keg->uk_ipers = ((pages * PAGE_SIZE) + trailer) / rsize;
 	keg->uk_flags |= UMA_ZONE_OFFPAGE | UMA_ZONE_VTOSLAB;
-	KASSERT(keg->uk_ipers <= uma_max_ipers,
-	    ("keg_small_init: keg->uk_ipers too high(%d) increase max_ipers",
+	KASSERT(keg->uk_ipers <= SLAB_SETSIZE,
+	    ("%s: keg->uk_ipers too high(%d) increase max_ipers", __func__,
 	    keg->uk_ipers));
 }
 
@@ -1257,11 +1414,11 @@
 	keg->uk_fini = arg->fini;
 	keg->uk_align = arg->align;
 	keg->uk_free = 0;
+	keg->uk_reserve = 0;
 	keg->uk_pages = 0;
 	keg->uk_flags = arg->flags;
 	keg->uk_allocf = page_alloc;
 	keg->uk_freef = page_free;
-	keg->uk_recurse = 0;
 	keg->uk_slabzone = NULL;
 
 	/*
@@ -1279,25 +1436,24 @@
 	if (arg->flags & UMA_ZONE_REFCNT || arg->flags & UMA_ZONE_MALLOC)
 		keg->uk_flags |= UMA_ZONE_VTOSLAB;
 
-	/*
-	 * The +UMA_FRITM_SZ added to uk_size is to account for the
-	 * linkage that is added to the size in keg_small_init().  If
-	 * we don't account for this here then we may end up in
-	 * keg_small_init() with a calculated 'ipers' of 0.
-	 */
-	if (keg->uk_flags & UMA_ZONE_REFCNT) {
-		if (keg->uk_flags & UMA_ZONE_CACHESPREAD)
-			keg_cachespread_init(keg);
-		else if ((keg->uk_size+UMA_FRITMREF_SZ) >
-		    (UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt)))
+	if (arg->flags & UMA_ZONE_PCPU)
+#ifdef SMP
+		keg->uk_flags |= UMA_ZONE_OFFPAGE;
+#else
+		keg->uk_flags &= ~UMA_ZONE_PCPU;
+#endif
+
+	if (keg->uk_flags & UMA_ZONE_CACHESPREAD) {
+		keg_cachespread_init(keg);
+	} else if (keg->uk_flags & UMA_ZONE_REFCNT) {
+		if (keg->uk_size >
+		    (UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt) -
+		    sizeof(uint32_t)))
 			keg_large_init(keg);
 		else
 			keg_small_init(keg);
 	} else {
-		if (keg->uk_flags & UMA_ZONE_CACHESPREAD)
-			keg_cachespread_init(keg);
-		else if ((keg->uk_size+UMA_FRITM_SZ) >
-		    (UMA_SLAB_SIZE - sizeof(struct uma_slab)))
+		if (keg->uk_size > (UMA_SLAB_SIZE - sizeof(struct uma_slab)))
 			keg_large_init(keg);
 		else
 			keg_small_init(keg);
@@ -1304,9 +1460,12 @@
 	}
 
 	if (keg->uk_flags & UMA_ZONE_OFFPAGE) {
-		if (keg->uk_flags & UMA_ZONE_REFCNT)
+		if (keg->uk_flags & UMA_ZONE_REFCNT) {
+			if (keg->uk_ipers > uma_max_ipers_ref)
+				panic("Too many ref items per zone: %d > %d\n",
+				    keg->uk_ipers, uma_max_ipers_ref);
 			keg->uk_slabzone = slabrefzone;
-		else
+		} else
 			keg->uk_slabzone = slabzone;
 	}
 
@@ -1330,12 +1489,9 @@
 		keg->uk_allocf = startup_alloc;
 
 	/*
-	 * Initialize keg's lock (shared among zones).
+	 * Initialize keg's lock
 	 */
-	if (arg->flags & UMA_ZONE_MTXCLASS)
-		KEG_LOCK_INIT(keg, 1);
-	else
-		KEG_LOCK_INIT(keg, 0);
+	KEG_LOCK_INIT(keg, (arg->flags & UMA_ZONE_MTXCLASS));
 
 	/*
 	 * If we're putting the slab header in the actual page we need to
@@ -1346,25 +1502,17 @@
 		u_int totsize;
 
 		/* Size of the slab struct and free list */
+		totsize = sizeof(struct uma_slab);
+
+		/* Size of the reference counts. */
 		if (keg->uk_flags & UMA_ZONE_REFCNT)
-			totsize = sizeof(struct uma_slab_refcnt) +
-			    keg->uk_ipers * UMA_FRITMREF_SZ;
-		else
-			totsize = sizeof(struct uma_slab) +
-			    keg->uk_ipers * UMA_FRITM_SZ;
+			totsize += keg->uk_ipers * sizeof(uint32_t);
 
 		if (totsize & UMA_ALIGN_PTR)
 			totsize = (totsize & ~UMA_ALIGN_PTR) +
 			    (UMA_ALIGN_PTR + 1);
-		keg->uk_pgoff = (UMA_SLAB_SIZE * keg->uk_ppera) - totsize;
+		keg->uk_pgoff = (PAGE_SIZE * keg->uk_ppera) - totsize;
 
-		if (keg->uk_flags & UMA_ZONE_REFCNT)
-			totsize = keg->uk_pgoff + sizeof(struct uma_slab_refcnt)
-			    + keg->uk_ipers * UMA_FRITMREF_SZ;
-		else
-			totsize = keg->uk_pgoff + sizeof(struct uma_slab)
-			    + keg->uk_ipers * UMA_FRITM_SZ;
-
 		/*
 		 * The only way the following is possible is if with our
 		 * UMA_ALIGN_PTR adjustments we are now bigger than
@@ -1372,7 +1520,10 @@
 		 * mathematically possible for all cases, so we make
 		 * sure here anyway.
 		 */
-		if (totsize > UMA_SLAB_SIZE * keg->uk_ppera) {
+		totsize = keg->uk_pgoff + sizeof(struct uma_slab);
+		if (keg->uk_flags & UMA_ZONE_REFCNT)
+			totsize += keg->uk_ipers * sizeof(uint32_t);
+		if (totsize > PAGE_SIZE * keg->uk_ppera) {
 			printf("zone %s ipers %d rsize %d size %d\n",
 			    zone->uz_name, keg->uk_ipers, keg->uk_rsize,
 			    keg->uk_size);
@@ -1387,14 +1538,15 @@
 	printf("UMA: %s(%p) size %d(%d) flags %#x ipers %d ppera %d out %d free %d\n",
 	    zone->uz_name, zone, keg->uk_size, keg->uk_rsize, keg->uk_flags,
 	    keg->uk_ipers, keg->uk_ppera,
-	    (keg->uk_ipers * keg->uk_pages) - keg->uk_free, keg->uk_free);
+	    (keg->uk_pages / keg->uk_ppera) * keg->uk_ipers - keg->uk_free,
+	    keg->uk_free);
 #endif
 
 	LIST_INSERT_HEAD(&keg->uk_zones, zone, uz_link);
 
-	mtx_lock(&uma_mtx);
+	rw_wlock(&uma_rwlock);
 	LIST_INSERT_HEAD(&uma_kegs, keg, uk_link);
-	mtx_unlock(&uma_mtx);
+	rw_wunlock(&uma_rwlock);
 	return (0);
 }
 
@@ -1423,17 +1575,47 @@
 	zone->uz_frees = 0;
 	zone->uz_fails = 0;
 	zone->uz_sleeps = 0;
-	zone->uz_fills = zone->uz_count = 0;
+	zone->uz_count = 0;
+	zone->uz_count_min = 0;
 	zone->uz_flags = 0;
+	zone->uz_warning = NULL;
+	timevalclear(&zone->uz_ratecheck);
 	keg = arg->keg;
 
+	ZONE_LOCK_INIT(zone, (arg->flags & UMA_ZONE_MTXCLASS));
+
+	/*
+	 * This is a pure cache zone, no kegs.
+	 */
+	if (arg->import) {
+		if (arg->flags & UMA_ZONE_VM)
+			arg->flags |= UMA_ZFLAG_CACHEONLY;
+		zone->uz_flags = arg->flags;
+		zone->uz_size = arg->size;
+		zone->uz_import = arg->import;
+		zone->uz_release = arg->release;
+		zone->uz_arg = arg->arg;
+		zone->uz_lockptr = &zone->uz_lock;
+		rw_wlock(&uma_rwlock);
+		LIST_INSERT_HEAD(&uma_cachezones, zone, uz_link);
+		rw_wunlock(&uma_rwlock);
+		goto out;
+	}
+
+	/*
+	 * Use the regular zone/keg/slab allocator.
+	 */
+	zone->uz_import = (uma_import)zone_import;
+	zone->uz_release = (uma_release)zone_release;
+	zone->uz_arg = zone; 
+
 	if (arg->flags & UMA_ZONE_SECONDARY) {
 		KASSERT(arg->keg != NULL, ("Secondary zone on zero'd keg"));
 		zone->uz_init = arg->uminit;
 		zone->uz_fini = arg->fini;
-		zone->uz_lock = &keg->uk_lock;
+		zone->uz_lockptr = &keg->uk_lock;
 		zone->uz_flags |= UMA_ZONE_SECONDARY;
-		mtx_lock(&uma_mtx);
+		rw_wlock(&uma_rwlock);
 		ZONE_LOCK(zone);
 		LIST_FOREACH(z, &keg->uk_zones, uz_link) {
 			if (LIST_NEXT(z, uz_link) == NULL) {
@@ -1442,7 +1624,7 @@
 			}
 		}
 		ZONE_UNLOCK(zone);
-		mtx_unlock(&uma_mtx);
+		rw_wunlock(&uma_rwlock);
 	} else if (keg == NULL) {
 		if ((keg = uma_kcreate(zone, arg->size, arg->uminit, arg->fini,
 		    arg->align, arg->flags)) == NULL)
@@ -1463,12 +1645,13 @@
 		if (error)
 			return (error);
 	}
+
 	/*
 	 * Link in the first keg.
 	 */
 	zone->uz_klink.kl_keg = keg;
 	LIST_INSERT_HEAD(&zone->uz_kegs, &zone->uz_klink, kl_link);
-	zone->uz_lock = &keg->uk_lock;
+	zone->uz_lockptr = &keg->uk_lock;
 	zone->uz_size = keg->uk_size;
 	zone->uz_flags |= (keg->uk_flags &
 	    (UMA_ZONE_INHERIT | UMA_ZFLAG_INHERIT));
@@ -1483,12 +1666,13 @@
 		return (0);
 	}
 
-	if (keg->uk_flags & UMA_ZONE_MAXBUCKET)
-		zone->uz_count = BUCKET_MAX;
-	else if (keg->uk_ipers <= BUCKET_MAX)
-		zone->uz_count = keg->uk_ipers;
+out:
+	if ((arg->flags & UMA_ZONE_MAXBUCKET) == 0)
+		zone->uz_count = bucket_select(zone->uz_size);
 	else
 		zone->uz_count = BUCKET_MAX;
+	zone->uz_count_min = zone->uz_count;
+
 	return (0);
 }
 
@@ -1507,8 +1691,9 @@
 	keg = (uma_keg_t)arg;
 	KEG_LOCK(keg);
 	if (keg->uk_free != 0) {
-		printf("Freed UMA keg was not empty (%d items). "
+		printf("Freed UMA keg (%s) was not empty (%d items). "
 		    " Lost %d pages of memory.\n",
+		    keg->uk_name ? keg->uk_name : "",
 		    keg->uk_free, keg->uk_pages);
 	}
 	KEG_UNLOCK(keg);
@@ -1537,9 +1722,9 @@
 	if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL))
 		cache_drain(zone);
 
-	mtx_lock(&uma_mtx);
+	rw_wlock(&uma_rwlock);
 	LIST_REMOVE(zone, uz_link);
-	mtx_unlock(&uma_mtx);
+	rw_wunlock(&uma_rwlock);
 	/*
 	 * XXX there are some races here where
 	 * the zone can be drained but zone lock
@@ -1560,13 +1745,13 @@
 	/*
 	 * We only destroy kegs from non secondary zones.
 	 */
-	if ((zone->uz_flags & UMA_ZONE_SECONDARY) == 0)  {
-		mtx_lock(&uma_mtx);
+	if (keg != NULL && (zone->uz_flags & UMA_ZONE_SECONDARY) == 0)  {
+		rw_wlock(&uma_rwlock);
 		LIST_REMOVE(keg, uk_link);
-		mtx_unlock(&uma_mtx);
-		zone_free_item(kegs, keg, NULL, SKIP_NONE,
-		    ZFREE_STATFREE);
+		rw_wunlock(&uma_rwlock);
+		zone_free_item(kegs, keg, NULL, SKIP_NONE);
 	}
+	ZONE_LOCK_FINI(zone);
 }
 
 /*
@@ -1585,12 +1770,12 @@
 	uma_keg_t keg;
 	uma_zone_t zone;
 
-	mtx_lock(&uma_mtx);
+	rw_rlock(&uma_rwlock);
 	LIST_FOREACH(keg, &uma_kegs, uk_link) {
 		LIST_FOREACH(zone, &keg->uk_zones, uz_link)
 			zfunc(zone);
 	}
-	mtx_unlock(&uma_mtx);
+	rw_runlock(&uma_rwlock);
 }
 
 /* Public functions */
@@ -1601,86 +1786,15 @@
 	struct uma_zctor_args args;
 	uma_slab_t slab;
 	u_int slabsize;
-	u_int objsize, totsize, wsize;
 	int i;
 
 #ifdef UMA_DEBUG
 	printf("Creating uma keg headers zone and keg.\n");
 #endif
-	mtx_init(&uma_mtx, "UMA lock", NULL, MTX_DEF);
+	rw_init(&uma_rwlock, "UMA lock");
 
-	/*
-	 * Figure out the maximum number of items-per-slab we'll have if
-	 * we're using the OFFPAGE slab header to track free items, given
-	 * all possible object sizes and the maximum desired wastage
-	 * (UMA_MAX_WASTE).
-	 *
-	 * We iterate until we find an object size for
-	 * which the calculated wastage in keg_small_init() will be
-	 * enough to warrant OFFPAGE.  Since wastedspace versus objsize
-	 * is an overall increasing see-saw function, we find the smallest
-	 * objsize such that the wastage is always acceptable for objects
-	 * with that objsize or smaller.  Since a smaller objsize always
-	 * generates a larger possible uma_max_ipers, we use this computed
-	 * objsize to calculate the largest ipers possible.  Since the
-	 * ipers calculated for OFFPAGE slab headers is always larger than
-	 * the ipers initially calculated in keg_small_init(), we use
-	 * the former's equation (UMA_SLAB_SIZE / keg->uk_rsize) to
-	 * obtain the maximum ipers possible for offpage slab headers.
-	 *
-	 * It should be noted that ipers versus objsize is an inversly
-	 * proportional function which drops off rather quickly so as
-	 * long as our UMA_MAX_WASTE is such that the objsize we calculate
-	 * falls into the portion of the inverse relation AFTER the steep
-	 * falloff, then uma_max_ipers shouldn't be too high (~10 on i386).
-	 *
-	 * Note that we have 8-bits (1 byte) to use as a freelist index
-	 * inside the actual slab header itself and this is enough to
-	 * accomodate us.  In the worst case, a UMA_SMALLEST_UNIT sized
-	 * object with offpage slab header would have ipers =
-	 * UMA_SLAB_SIZE / UMA_SMALLEST_UNIT (currently = 256), which is
-	 * 1 greater than what our byte-integer freelist index can
-	 * accomodate, but we know that this situation never occurs as
-	 * for UMA_SMALLEST_UNIT-sized objects, we will never calculate
-	 * that we need to go to offpage slab headers.  Or, if we do,
-	 * then we trap that condition below and panic in the INVARIANTS case.
-	 */
-	wsize = UMA_SLAB_SIZE - sizeof(struct uma_slab) - UMA_MAX_WASTE;
-	totsize = wsize;
-	objsize = UMA_SMALLEST_UNIT;
-	while (totsize >= wsize) {
-		totsize = (UMA_SLAB_SIZE - sizeof(struct uma_slab)) /
-		    (objsize + UMA_FRITM_SZ);
-		totsize *= (UMA_FRITM_SZ + objsize);
-		objsize++;
-	}
-	if (objsize > UMA_SMALLEST_UNIT)
-		objsize--;
-	uma_max_ipers = MAX(UMA_SLAB_SIZE / objsize, 64);
-
-	wsize = UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt) - UMA_MAX_WASTE;
-	totsize = wsize;
-	objsize = UMA_SMALLEST_UNIT;
-	while (totsize >= wsize) {
-		totsize = (UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt)) /
-		    (objsize + UMA_FRITMREF_SZ);
-		totsize *= (UMA_FRITMREF_SZ + objsize);
-		objsize++;
-	}
-	if (objsize > UMA_SMALLEST_UNIT)
-		objsize--;
-	uma_max_ipers_ref = MAX(UMA_SLAB_SIZE / objsize, 64);
-
-	KASSERT((uma_max_ipers_ref <= 255) && (uma_max_ipers <= 255),
-	    ("uma_startup: calculated uma_max_ipers values too large!"));
-
-#ifdef UMA_DEBUG
-	printf("Calculated uma_max_ipers (for OFFPAGE) is %d\n", uma_max_ipers);
-	printf("Calculated uma_max_ipers_slab (for OFFPAGE) is %d\n",
-	    uma_max_ipers_ref);
-#endif
-
 	/* "manually" create the initial zone */
+	memset(&args, 0, sizeof(args));
 	args.name = "UMA Kegs";
 	args.size = sizeof(struct uma_keg);
 	args.ctor = keg_ctor;
@@ -1697,8 +1811,8 @@
 	printf("Filling boot free list.\n");
 #endif
 	for (i = 0; i < boot_pages; i++) {
-		slab = (uma_slab_t)((u_int8_t *)bootmem + (i * UMA_SLAB_SIZE));
-		slab->us_data = (u_int8_t *)slab;
+		slab = (uma_slab_t)((uint8_t *)bootmem + (i * UMA_SLAB_SIZE));
+		slab->us_data = (uint8_t *)slab;
 		slab->us_flags = UMA_SLAB_BOOT;
 		LIST_INSERT_HEAD(&uma_boot_pages, slab, us_link);
 	}
@@ -1727,16 +1841,9 @@
 	printf("Creating slab and hash zones.\n");
 #endif
 
-	/*
-	 * This is the max number of free list items we'll have with
-	 * offpage slabs.
-	 */
-	slabsize = uma_max_ipers * UMA_FRITM_SZ;
-	slabsize += sizeof(struct uma_slab);
-
 	/* Now make a zone for slab headers */
 	slabzone = uma_zcreate("UMA Slabs",
-				slabsize,
+				sizeof(struct uma_slab),
 				NULL, NULL, NULL, NULL,
 				UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
 
@@ -1744,8 +1851,8 @@
 	 * We also create a zone for the bigger slabs with reference
 	 * counts in them, to accomodate UMA_ZONE_REFCNT zones.
 	 */
-	slabsize = uma_max_ipers_ref * UMA_FRITMREF_SZ;
-	slabsize += sizeof(struct uma_slab_refcnt);
+	slabsize = sizeof(struct uma_slab_refcnt);
+	slabsize += uma_max_ipers_ref * sizeof(uint32_t);
 	slabrefzone = uma_zcreate("UMA RCntSlabs",
 				  slabsize,
 				  NULL, NULL, NULL, NULL,
@@ -1772,6 +1879,7 @@
 {
 	booted = UMA_STARTUP2;
 	bucket_enable();
+	sx_init(&uma_drain_lock, "umadrain");
 #ifdef UMA_DEBUG
 	printf("UMA startup2 complete.\n");
 #endif
@@ -1788,7 +1896,7 @@
 #ifdef UMA_DEBUG
 	printf("Starting callout.\n");
 #endif
-	callout_init(&uma_callout, CALLOUT_MPSAFE);
+	callout_init(&uma_callout, 1);
 	callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
 #ifdef UMA_DEBUG
 	printf("UMA startup3 complete.\n");
@@ -1797,7 +1905,7 @@
 
 static uma_keg_t
 uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini,
-		int align, u_int32_t flags)
+		int align, uint32_t flags)
 {
 	struct uma_kctor_args args;
 
@@ -1822,12 +1930,18 @@
 /* See uma.h */
 uma_zone_t
 uma_zcreate(const char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
-		uma_init uminit, uma_fini fini, int align, u_int32_t flags)
+		uma_init uminit, uma_fini fini, int align, uint32_t flags)
 
 {
 	struct uma_zctor_args args;
+	uma_zone_t res;
+	bool locked;
 
+	KASSERT(powerof2(align + 1), ("invalid zone alignment %d for \"%s\"",
+	    align, name));
+
 	/* This stuff is essential for the zone ctor */
+	memset(&args, 0, sizeof(args));
 	args.name = name;
 	args.size = size;
 	args.ctor = ctor;
@@ -1838,7 +1952,16 @@
 	args.flags = flags;
 	args.keg = NULL;
 
-	return (zone_alloc_item(zones, &args, M_WAITOK));
+	if (booted < UMA_STARTUP2) {
+		locked = false;
+	} else {
+		sx_slock(&uma_drain_lock);
+		locked = true;
+	}
+	res = zone_alloc_item(zones, &args, M_WAITOK);
+	if (locked)
+		sx_sunlock(&uma_drain_lock);
+	return (res);
 }
 
 /* See uma.h */
@@ -1848,8 +1971,11 @@
 {
 	struct uma_zctor_args args;
 	uma_keg_t keg;
+	uma_zone_t res;
+	bool locked;
 
 	keg = zone_first_keg(master);
+	memset(&args, 0, sizeof(args));
 	args.name = name;
 	args.size = keg->uk_size;
 	args.ctor = ctor;
@@ -1860,7 +1986,40 @@
 	args.flags = keg->uk_flags | UMA_ZONE_SECONDARY;
 	args.keg = keg;
 
+	if (booted < UMA_STARTUP2) {
+		locked = false;
+	} else {
+		sx_slock(&uma_drain_lock);
+		locked = true;
+	}
 	/* XXX Attaches only one keg of potentially many. */
+	res = zone_alloc_item(zones, &args, M_WAITOK);
+	if (locked)
+		sx_sunlock(&uma_drain_lock);
+	return (res);
+}
+
+/* See uma.h */
+uma_zone_t
+uma_zcache_create(char *name, int size, uma_ctor ctor, uma_dtor dtor,
+		    uma_init zinit, uma_fini zfini, uma_import zimport,
+		    uma_release zrelease, void *arg, int flags)
+{
+	struct uma_zctor_args args;
+
+	memset(&args, 0, sizeof(args));
+	args.name = name;
+	args.size = size;
+	args.ctor = ctor;
+	args.dtor = dtor;
+	args.uminit = zinit;
+	args.fini = zfini;
+	args.import = zimport;
+	args.release = zrelease;
+	args.arg = arg;
+	args.align = 0;
+	args.flags = flags;
+
 	return (zone_alloc_item(zones, &args, M_WAITOK));
 }
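
The new uma_zcache_create() above builds a bucket-only zone whose items come from caller-supplied import/release routines rather than a keg. A loose userspace analogue of that callback shape (not the kernel API; the names and the malloc backing are made up for illustration):

#include <stdio.h>
#include <stdlib.h>

/* Callback shapes loosely modelled on uma_import/uma_release. */
typedef int  (*import_fn)(void *arg, void **store, int max);
typedef void (*release_fn)(void *arg, void **store, int cnt);

struct cache_zone {
	import_fn	 import;
	release_fn	 release;
	void		*arg;
};

/* Example backend: fetch items from plain malloc, hand them back with free. */
static int
demo_import(void *arg, void **store, int max)
{
	int i;

	for (i = 0; i < max; i++)
		if ((store[i] = malloc(64)) == NULL)
			break;
	return (i);			/* items actually imported */
}

static void
demo_release(void *arg, void **store, int cnt)
{
	int i;

	for (i = 0; i < cnt; i++)
		free(store[i]);
}

int
main(void)
{
	struct cache_zone zone = { demo_import, demo_release, NULL };
	void *bucket[4];
	int n;

	n = zone.import(zone.arg, bucket, 4);	/* like zone_alloc_bucket() */
	printf("imported %d items into a bucket\n", n);
	zone.release(zone.arg, bucket, n);	/* like zone_release() */
	return (0);
}
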
 
@@ -1869,10 +2028,10 @@
 {
 	if (a < b) {
 		ZONE_LOCK(a);
-		mtx_lock_flags(b->uz_lock, MTX_DUPOK);
+		mtx_lock_flags(b->uz_lockptr, MTX_DUPOK);
 	} else {
 		ZONE_LOCK(b);
-		mtx_lock_flags(a->uz_lock, MTX_DUPOK);
+		mtx_lock_flags(a->uz_lockptr, MTX_DUPOK);
 	}
 }
 
@@ -1955,7 +2114,9 @@
 uma_zdestroy(uma_zone_t zone)
 {
 
-	zone_free_item(zones, zone, NULL, SKIP_NONE, ZFREE_STATFREE);
+	sx_slock(&uma_drain_lock);
+	zone_free_item(zones, zone, NULL, SKIP_NONE);
+	sx_sunlock(&uma_drain_lock);
 }
 
 /* See uma.h */
@@ -1965,6 +2126,7 @@
 	void *item;
 	uma_cache_t cache;
 	uma_bucket_t bucket;
+	int lockfail;
 	int cpu;
 
 	/* This is the fast path allocation */
@@ -1978,7 +2140,30 @@
 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
 		    "uma_zalloc_arg: zone \"%s\"", zone->uz_name);
 	}
-
+#ifdef DEBUG_MEMGUARD
+	if (memguard_cmp_zone(zone)) {
+		item = memguard_alloc(zone->uz_size, flags);
+		if (item != NULL) {
+			/*
+			 * Avoid conflict with the use-after-free
+			 * protecting infrastructure from INVARIANTS.
+			 */
+			if (zone->uz_init != NULL &&
+			    zone->uz_init != mtrash_init &&
+			    zone->uz_init(item, zone->uz_size, flags) != 0)
+				return (NULL);
+			if (zone->uz_ctor != NULL &&
+			    zone->uz_ctor != mtrash_ctor &&
+			    zone->uz_ctor(item, zone->uz_size, udata,
+			    flags) != 0) {
+			    	zone->uz_fini(item, zone->uz_size);
+				return (NULL);
+			}
+			return (item);
+		}
+		/* This is unfortunate but should not be fatal. */
+	}
+#endif
 	/*
 	 * If possible, allocate from the per-CPU cache.  There are two
 	 * requirements for safe access to the per-CPU cache: (1) the thread
@@ -1990,7 +2175,6 @@
 	 * the current cache; when we re-acquire the critical section, we
 	 * must detect and handle migration if it has occurred.
 	 */
-zalloc_restart:
 	critical_enter();
 	cpu = curcpu;
 	cache = &zone->uz_cpu[cpu];
@@ -1997,54 +2181,57 @@
 
 zalloc_start:
 	bucket = cache->uc_allocbucket;
-
-	if (bucket) {
-		if (bucket->ub_cnt > 0) {
-			bucket->ub_cnt--;
-			item = bucket->ub_bucket[bucket->ub_cnt];
+	if (bucket != NULL && bucket->ub_cnt > 0) {
+		bucket->ub_cnt--;
+		item = bucket->ub_bucket[bucket->ub_cnt];
 #ifdef INVARIANTS
-			bucket->ub_bucket[bucket->ub_cnt] = NULL;
+		bucket->ub_bucket[bucket->ub_cnt] = NULL;
 #endif
-			KASSERT(item != NULL,
-			    ("uma_zalloc: Bucket pointer mangled."));
-			cache->uc_allocs++;
-			critical_exit();
+		KASSERT(item != NULL, ("uma_zalloc: Bucket pointer mangled."));
+		cache->uc_allocs++;
+		critical_exit();
+		if (zone->uz_ctor != NULL &&
+		    zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) {
+			atomic_add_long(&zone->uz_fails, 1);
+			zone_free_item(zone, item, udata, SKIP_DTOR);
+			return (NULL);
+		}
 #ifdef INVARIANTS
-			ZONE_LOCK(zone);
-			uma_dbg_alloc(zone, NULL, item);
-			ZONE_UNLOCK(zone);
+		uma_dbg_alloc(zone, NULL, item);
 #endif
-			if (zone->uz_ctor != NULL) {
-				if (zone->uz_ctor(item, zone->uz_size,
-				    udata, flags) != 0) {
-					zone_free_item(zone, item, udata,
-					    SKIP_DTOR, ZFREE_STATFAIL |
-					    ZFREE_STATFREE);
-					return (NULL);
-				}
-			}
-			if (flags & M_ZERO)
-				bzero(item, zone->uz_size);
-			return (item);
-		} else if (cache->uc_freebucket) {
-			/*
-			 * We have run out of items in our allocbucket.
-			 * See if we can switch with our free bucket.
-			 */
-			if (cache->uc_freebucket->ub_cnt > 0) {
+		if (flags & M_ZERO)
+			uma_zero_item(item, zone);
+		return (item);
+	}
+
+	/*
+	 * We have run out of items in our alloc bucket.
+	 * See if we can switch with our free bucket.
+	 */
+	bucket = cache->uc_freebucket;
+	if (bucket != NULL && bucket->ub_cnt > 0) {
 #ifdef UMA_DEBUG_ALLOC
-				printf("uma_zalloc: Swapping empty with"
-				    " alloc.\n");
+		printf("uma_zalloc: Swapping empty with alloc.\n");
 #endif
-				bucket = cache->uc_freebucket;
-				cache->uc_freebucket = cache->uc_allocbucket;
-				cache->uc_allocbucket = bucket;
+		cache->uc_freebucket = cache->uc_allocbucket;
+		cache->uc_allocbucket = bucket;
+		goto zalloc_start;
+	}
 
-				goto zalloc_start;
-			}
-		}
-	}
 	/*
+	 * Discard any empty allocation bucket while we hold no locks.
+	 */
+	bucket = cache->uc_allocbucket;
+	cache->uc_allocbucket = NULL;
+	critical_exit();
+	if (bucket != NULL)
+		bucket_free(zone, bucket, udata);
+
+	/* Short-circuit for zones without buckets and low memory. */
+	if (zone->uz_count == 0 || bucketdisable)
+		goto zalloc_item;
+
+	/*
 	 * Attempt to retrieve the item from the per-CPU cache has failed, so
 	 * we must go back to the zone.  This requires the zone lock, so we
 	 * must drop the critical section, then re-acquire it when we go back
@@ -2053,41 +2240,34 @@
 	 * thread-local state specific to the cache from prior to releasing
 	 * the critical section.
 	 */
-	critical_exit();
-	ZONE_LOCK(zone);
+	lockfail = 0;
+	if (ZONE_TRYLOCK(zone) == 0) {
+		/* Record contention to size the buckets. */
+		ZONE_LOCK(zone);
+		lockfail = 1;
+	}
 	critical_enter();
 	cpu = curcpu;
 	cache = &zone->uz_cpu[cpu];
-	bucket = cache->uc_allocbucket;
-	if (bucket != NULL) {
-		if (bucket->ub_cnt > 0) {
-			ZONE_UNLOCK(zone);
-			goto zalloc_start;
-		}
-		bucket = cache->uc_freebucket;
-		if (bucket != NULL && bucket->ub_cnt > 0) {
-			ZONE_UNLOCK(zone);
-			goto zalloc_start;
-		}
-	}
 
-	/* Since we have locked the zone we may as well send back our stats */
-	zone->uz_allocs += cache->uc_allocs;
+	/*
+	 * Since we have locked the zone we may as well send back our stats.
+	 */
+	atomic_add_long(&zone->uz_allocs, cache->uc_allocs);
+	atomic_add_long(&zone->uz_frees, cache->uc_frees);
 	cache->uc_allocs = 0;
-	zone->uz_frees += cache->uc_frees;
 	cache->uc_frees = 0;
 
-	/* Our old one is now a free bucket */
-	if (cache->uc_allocbucket) {
-		KASSERT(cache->uc_allocbucket->ub_cnt == 0,
-		    ("uma_zalloc_arg: Freeing a non free bucket."));
-		LIST_INSERT_HEAD(&zone->uz_free_bucket,
-		    cache->uc_allocbucket, ub_link);
-		cache->uc_allocbucket = NULL;
+	/* See if we lost the race to fill the cache. */
+	if (cache->uc_allocbucket != NULL) {
+		ZONE_UNLOCK(zone);
+		goto zalloc_start;
 	}
 
-	/* Check the free list for a new alloc bucket */
-	if ((bucket = LIST_FIRST(&zone->uz_full_bucket)) != NULL) {
+	/*
+	 * Check the zone's cache of buckets.
+	 */
+	if ((bucket = LIST_FIRST(&zone->uz_buckets)) != NULL) {
 		KASSERT(bucket->ub_cnt != 0,
 		    ("uma_zalloc_arg: Returning an empty bucket."));
 
@@ -2099,19 +2279,38 @@
 	/* We are no longer associated with this CPU. */
 	critical_exit();
 
-	/* Bump up our uz_count so we get here less */
-	if (zone->uz_count < BUCKET_MAX)
+	/*
+	 * We bump the uz count when the cache size is insufficient to
+	 * handle the working set.
+	 */
+	if (lockfail && zone->uz_count < BUCKET_MAX)
 		zone->uz_count++;
+	ZONE_UNLOCK(zone);
 
 	/*
 	 * Now lets just fill a bucket and put it on the free list.  If that
-	 * works we'll restart the allocation from the begining.
+	 * works we'll restart the allocation from the beginning and it
+	 * will use the just filled bucket.
 	 */
-	if (zone_alloc_bucket(zone, flags)) {
+	bucket = zone_alloc_bucket(zone, udata, flags);
+	if (bucket != NULL) {
+		ZONE_LOCK(zone);
+		critical_enter();
+		cpu = curcpu;
+		cache = &zone->uz_cpu[cpu];
+		/*
+		 * See if we lost the race or were migrated.  Cache the
+		 * initialized bucket to make this less likely or claim
+		 * the memory directly.
+		 */
+		if (cache->uc_allocbucket == NULL)
+			cache->uc_allocbucket = bucket;
+		else
+			LIST_INSERT_HEAD(&zone->uz_buckets, bucket, ub_link);
 		ZONE_UNLOCK(zone);
-		goto zalloc_restart;
+		goto zalloc_start;
 	}
-	ZONE_UNLOCK(zone);
+
 	/*
 	 * We may not be able to get a bucket so return an actual item.
 	 */
@@ -2119,7 +2318,9 @@
 	printf("uma_zalloc_arg: Bucketzone returned NULL\n");
 #endif
 
+zalloc_item:
 	item = zone_alloc_item(zone, udata, flags);
+
 	return (item);
 }
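
In the reworked fast path above, an empty alloc bucket is discarded outside any lock, the free bucket is swapped in while it still holds items, and uz_count only grows when the zone lock was actually contended. The alloc-side bucket dance reduces to roughly this (hypothetical types, single-threaded, no locking):

#include <stdio.h>

#define BUCKET_ENTRIES	4

struct bucket {
	int	cnt;
	void	*items[BUCKET_ENTRIES];
};

struct cpu_cache {
	struct bucket	*allocbucket;
	struct bucket	*freebucket;
};

/* Pop an item, swapping buckets when the alloc side runs dry. */
static void *
cache_alloc(struct cpu_cache *cc)
{
	struct bucket *b;

	for (;;) {
		b = cc->allocbucket;
		if (b != NULL && b->cnt > 0)
			return (b->items[--b->cnt]);
		b = cc->freebucket;
		if (b == NULL || b->cnt == 0)
			return (NULL);		/* would fall back to the zone */
		cc->freebucket = cc->allocbucket;
		cc->allocbucket = b;
	}
}

int
main(void)
{
	static int obj[2];
	struct bucket full = { 2, { &obj[0], &obj[1] } };
	struct cpu_cache cc = { NULL, &full };

	while (cache_alloc(&cc) != NULL)
		printf("got an item from the swapped-in bucket\n");
	return (0);
}
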
 
@@ -2127,9 +2328,13 @@
 keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int flags)
 {
 	uma_slab_t slab;
+	int reserve;
 
 	mtx_assert(&keg->uk_lock, MA_OWNED);
 	slab = NULL;
+	reserve = 0;
+	if ((flags & M_USE_RESERVE) == 0)
+		reserve = keg->uk_reserve;
 
 	for (;;) {
 		/*
@@ -2137,7 +2342,7 @@
 		 * used over those that are totally full.  This helps to reduce
 		 * fragmentation.
 		 */
-		if (keg->uk_free != 0) {
+		if (keg->uk_free > reserve) {
 			if (!LIST_EMPTY(&keg->uk_part_slab)) {
 				slab = LIST_FIRST(&keg->uk_part_slab);
 			} else {
@@ -2162,8 +2367,10 @@
 			 * If this is not a multi-zone, set the FULL bit.
 			 * Otherwise slab_multi() takes care of it.
 			 */
-			if ((zone->uz_flags & UMA_ZFLAG_MULTI) == 0)
+			if ((zone->uz_flags & UMA_ZFLAG_MULTI) == 0) {
 				zone->uz_flags |= UMA_ZFLAG_FULL;
+				zone_log_warning(zone);
+			}
 			if (flags & M_NOWAIT)
 				break;
 			zone->uz_sleeps++;
@@ -2170,9 +2377,7 @@
 			msleep(keg, &keg->uk_lock, PVM, "keglimit", 0);
 			continue;
 		}
-		keg->uk_recurse++;
 		slab = keg_alloc_slab(keg, zone, flags);
-		keg->uk_recurse--;
 		/*
 		 * If we got a slab here it's safe to mark it partially used
 		 * and return.  We assume that the caller is going to remove
@@ -2193,42 +2398,15 @@
 	return (slab);
 }
 
-static inline void
-zone_relock(uma_zone_t zone, uma_keg_t keg)
-{
-	if (zone->uz_lock != &keg->uk_lock) {
-		KEG_UNLOCK(keg);
-		ZONE_LOCK(zone);
-	}
-}
-
-static inline void
-keg_relock(uma_keg_t keg, uma_zone_t zone)
-{
-	if (zone->uz_lock != &keg->uk_lock) {
-		ZONE_UNLOCK(zone);
-		KEG_LOCK(keg);
-	}
-}
-
 static uma_slab_t
 zone_fetch_slab(uma_zone_t zone, uma_keg_t keg, int flags)
 {
 	uma_slab_t slab;
 
-	if (keg == NULL)
+	if (keg == NULL) {
 		keg = zone_first_keg(zone);
-	/*
-	 * This is to prevent us from recursively trying to allocate
-	 * buckets.  The problem is that if an allocation forces us to
-	 * grab a new bucket we will call page_alloc, which will go off
-	 * and cause the vm to allocate vm_map_entries.  If we need new
-	 * buckets there too we will recurse in kmem_alloc and bad
-	 * things happen.  So instead we return a NULL bucket, and make
-	 * the code that allocates buckets smart enough to deal with it
-	 */
-	if (keg->uk_flags & UMA_ZFLAG_BUCKET && keg->uk_recurse != 0)
-		return (NULL);
+		KEG_LOCK(keg);
+	}
 
 	for (;;) {
 		slab = keg_fetch_slab(keg, zone, flags);
@@ -2237,13 +2415,13 @@
 		if (flags & (M_NOWAIT | M_NOVM))
 			break;
 	}
+	KEG_UNLOCK(keg);
 	return (NULL);
 }
 
 /*
  * uma_zone_fetch_slab_multi:  Fetches a slab from one available keg.  Returns
- * with the keg locked.  Caller must call zone_relock() afterwards if the
- * zone lock is required.  On NULL the zone lock is held.
+ * with the keg locked.  On NULL no lock is held.
  *
  * The last pointer is used to seed the search.  It is not required.
  */
@@ -2267,12 +2445,11 @@
 	 * Use the last slab allocated as a hint for where to start
 	 * the search.
 	 */
-	if (last) {
+	if (last != NULL) {
 		slab = keg_fetch_slab(last, zone, flags);
 		if (slab)
 			return (slab);
-		zone_relock(zone, last);
-		last = NULL;
+		KEG_UNLOCK(last);
 	}
 	/*
 	 * Loop until we have a slab incase of transient failures
@@ -2288,7 +2465,7 @@
 		 */
 		LIST_FOREACH(klink, &zone->uz_kegs, kl_link) {
 			keg = klink->kl_keg;
-			keg_relock(keg, zone);
+			KEG_LOCK(keg);
 			if ((keg->uk_flags & UMA_ZFLAG_FULL) == 0) {
 				slab = keg_fetch_slab(keg, zone, flags);
 				if (slab)
@@ -2298,7 +2475,7 @@
 				full++;
 			else
 				empty++;
-			zone_relock(zone, keg);
+			KEG_UNLOCK(keg);
 		}
 		if (rflags & (M_NOWAIT | M_NOVM))
 			break;
@@ -2308,10 +2485,14 @@
 		 * and sleep so just sleep for a short period and retry.
 		 */
 		if (full && !empty) {
+			ZONE_LOCK(zone);
 			zone->uz_flags |= UMA_ZFLAG_FULL;
 			zone->uz_sleeps++;
-			msleep(zone, zone->uz_lock, PVM, "zonelimit", hz/100);
+			zone_log_warning(zone);
+			msleep(zone, zone->uz_lockptr, PVM,
+			    "zonelimit", hz/100);
 			zone->uz_flags &= ~UMA_ZFLAG_FULL;
+			ZONE_UNLOCK(zone);
 			continue;
 		}
 	}
@@ -2319,30 +2500,20 @@
 }
 
 static void *
-slab_alloc_item(uma_zone_t zone, uma_slab_t slab)
+slab_alloc_item(uma_keg_t keg, uma_slab_t slab)
 {
-	uma_keg_t keg;
-	uma_slabrefcnt_t slabref;
 	void *item;
-	u_int8_t freei;
+	uint8_t freei;
 
-	keg = slab->us_keg;
+	MPASS(keg == slab->us_keg);
 	mtx_assert(&keg->uk_lock, MA_OWNED);
 
-	freei = slab->us_firstfree;
-	if (keg->uk_flags & UMA_ZONE_REFCNT) {
-		slabref = (uma_slabrefcnt_t)slab;
-		slab->us_firstfree = slabref->us_freelist[freei].us_item;
-	} else {
-		slab->us_firstfree = slab->us_freelist[freei].us_item;
-	}
+	freei = BIT_FFS(SLAB_SETSIZE, &slab->us_free) - 1;
+	BIT_CLR(SLAB_SETSIZE, freei, &slab->us_free);
 	item = slab->us_data + (keg->uk_rsize * freei);
-
 	slab->us_freecount--;
 	keg->uk_free--;
-#ifdef INVARIANTS
-	uma_dbg_alloc(zone, slab, item);
-#endif
+
 	/* Move this slab to the full list */
 	if (slab->us_freecount == 0) {
 		LIST_REMOVE(slab, us_link);
@@ -2353,82 +2524,58 @@
 }
 
 static int
-zone_alloc_bucket(uma_zone_t zone, int flags)
+zone_import(uma_zone_t zone, void **bucket, int max, int flags)
 {
-	uma_bucket_t bucket;
 	uma_slab_t slab;
 	uma_keg_t keg;
-	int16_t saved;
-	int max, origflags = flags;
+	int i;
 
-	/*
-	 * Try this zone's free list first so we don't allocate extra buckets.
-	 */
-	if ((bucket = LIST_FIRST(&zone->uz_free_bucket)) != NULL) {
-		KASSERT(bucket->ub_cnt == 0,
-		    ("zone_alloc_bucket: Bucket on free list is not empty."));
-		LIST_REMOVE(bucket, ub_link);
-	} else {
-		int bflags;
-
-		bflags = (flags & ~M_ZERO);
-		if (zone->uz_flags & UMA_ZFLAG_CACHEONLY)
-			bflags |= M_NOVM;
-
-		ZONE_UNLOCK(zone);
-		bucket = bucket_alloc(zone->uz_count, bflags);
-		ZONE_LOCK(zone);
+	slab = NULL;
+	keg = NULL;
+	/* Try to keep the buckets totally full */
+	for (i = 0; i < max; ) {
+		if ((slab = zone->uz_slab(zone, keg, flags)) == NULL)
+			break;
+		keg = slab->us_keg;
+		while (slab->us_freecount && i < max) { 
+			bucket[i++] = slab_alloc_item(keg, slab);
+			if (keg->uk_free <= keg->uk_reserve)
+				break;
+		}
+		/* Don't grab more than one slab at a time. */
+		flags &= ~M_WAITOK;
+		flags |= M_NOWAIT;
 	}
+	if (slab != NULL)
+		KEG_UNLOCK(keg);
 
-	if (bucket == NULL) {
-		return (0);
-	}
+	return i;
+}
 
-#ifdef SMP
-	/*
-	 * This code is here to limit the number of simultaneous bucket fills
-	 * for any given zone to the number of per cpu caches in this zone. This
-	 * is done so that we don't allocate more memory than we really need.
-	 */
-	if (zone->uz_fills >= mp_ncpus)
-		goto done;
+static uma_bucket_t
+zone_alloc_bucket(uma_zone_t zone, void *udata, int flags)
+{
+	uma_bucket_t bucket;
+	int max;
 
-#endif
-	zone->uz_fills++;
+	/* Don't wait for buckets, preserve caller's NOVM setting. */
+	bucket = bucket_alloc(zone, udata, M_NOWAIT | (flags & M_NOVM));
+	if (bucket == NULL)
+		return (NULL);
 
 	max = MIN(bucket->ub_entries, zone->uz_count);
-	/* Try to keep the buckets totally full */
-	saved = bucket->ub_cnt;
-	slab = NULL;
-	keg = NULL;
-	while (bucket->ub_cnt < max &&
-	    (slab = zone->uz_slab(zone, keg, flags)) != NULL) {
-		keg = slab->us_keg;
-		while (slab->us_freecount && bucket->ub_cnt < max) {
-			bucket->ub_bucket[bucket->ub_cnt++] =
-			    slab_alloc_item(zone, slab);
-		}
+	bucket->ub_cnt = zone->uz_import(zone->uz_arg, bucket->ub_bucket,
+	    max, flags);
 
-		/* Don't block on the next fill */
-		flags |= M_NOWAIT;
-	}
-	if (slab)
-		zone_relock(zone, keg);
-
 	/*
-	 * We unlock here because we need to call the zone's init.
-	 * It should be safe to unlock because the slab dealt with
-	 * above is already on the appropriate list within the keg
-	 * and the bucket we filled is not yet on any list, so we
-	 * own it.
+	 * Initialize the memory if necessary.
 	 */
-	if (zone->uz_init != NULL) {
+	if (bucket->ub_cnt != 0 && zone->uz_init != NULL) {
 		int i;
 
-		ZONE_UNLOCK(zone);
-		for (i = saved; i < bucket->ub_cnt; i++)
+		for (i = 0; i < bucket->ub_cnt; i++)
 			if (zone->uz_init(bucket->ub_bucket[i], zone->uz_size,
-			    origflags) != 0)
+			    flags) != 0)
 				break;
 		/*
 		 * If we couldn't initialize the whole bucket, put the
@@ -2435,35 +2582,27 @@
 		 * rest back onto the freelist.
 		 */
 		if (i != bucket->ub_cnt) {
-			int j;
-
-			for (j = i; j < bucket->ub_cnt; j++) {
-				zone_free_item(zone, bucket->ub_bucket[j],
-				    NULL, SKIP_FINI, 0);
+			zone->uz_release(zone->uz_arg, &bucket->ub_bucket[i],
+			    bucket->ub_cnt - i);
 #ifdef INVARIANTS
-				bucket->ub_bucket[j] = NULL;
+			bzero(&bucket->ub_bucket[i],
+			    sizeof(void *) * (bucket->ub_cnt - i));
 #endif
-			}
 			bucket->ub_cnt = i;
 		}
-		ZONE_LOCK(zone);
 	}
 
-	zone->uz_fills--;
-	if (bucket->ub_cnt != 0) {
-		LIST_INSERT_HEAD(&zone->uz_full_bucket,
-		    bucket, ub_link);
-		return (1);
+	if (bucket->ub_cnt == 0) {
+		bucket_free(zone, bucket, udata);
+		atomic_add_long(&zone->uz_fails, 1);
+		return (NULL);
 	}
-#ifdef SMP
-done:
-#endif
-	bucket_free(bucket);
 
-	return (0);
+	return (bucket);
 }
+
 /*
- * Allocates an item for an internal zone
+ * Allocates a single item from a zone.
  *
  * Arguments
  *	zone   The zone to alloc for.
@@ -2478,7 +2617,6 @@
 static void *
 zone_alloc_item(uma_zone_t zone, void *udata, int flags)
 {
-	uma_slab_t slab;
 	void *item;
 
 	item = NULL;
@@ -2486,21 +2624,10 @@
 #ifdef UMA_DEBUG_ALLOC
 	printf("INTERNAL: Allocating one item from %s(%p)\n", zone->uz_name, zone);
 #endif
-	ZONE_LOCK(zone);
+	if (zone->uz_import(zone->uz_arg, &item, 1, flags) != 1)
+		goto fail;
+	atomic_add_long(&zone->uz_allocs, 1);
 
-	slab = zone->uz_slab(zone, NULL, flags);
-	if (slab == NULL) {
-		zone->uz_fails++;
-		ZONE_UNLOCK(zone);
-		return (NULL);
-	}
-
-	item = slab_alloc_item(zone, slab);
-
-	zone_relock(zone, slab->us_keg);
-	zone->uz_allocs++;
-	ZONE_UNLOCK(zone);
-
 	/*
 	 * We have to call both the zone's init (not the keg's init)
 	 * and the zone's ctor.  This is because the item is going from
@@ -2509,22 +2636,27 @@
 	 */
 	if (zone->uz_init != NULL) {
 		if (zone->uz_init(item, zone->uz_size, flags) != 0) {
-			zone_free_item(zone, item, udata, SKIP_FINI,
-			    ZFREE_STATFAIL | ZFREE_STATFREE);
-			return (NULL);
+			zone_free_item(zone, item, udata, SKIP_FINI);
+			goto fail;
 		}
 	}
 	if (zone->uz_ctor != NULL) {
 		if (zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) {
-			zone_free_item(zone, item, udata, SKIP_DTOR,
-			    ZFREE_STATFAIL | ZFREE_STATFREE);
-			return (NULL);
+			zone_free_item(zone, item, udata, SKIP_DTOR);
+			goto fail;
 		}
 	}
+#ifdef INVARIANTS
+	uma_dbg_alloc(zone, NULL, item);
+#endif
 	if (flags & M_ZERO)
-		bzero(item, zone->uz_size);
+		uma_zero_item(item, zone);
 
 	return (item);
+
+fail:
+	atomic_add_long(&zone->uz_fails, 1);
+	return (NULL);
 }
 
 /* See uma.h */
@@ -2533,7 +2665,7 @@
 {
 	uma_cache_t cache;
 	uma_bucket_t bucket;
-	int bflags;
+	int lockfail;
 	int cpu;
 
 #ifdef UMA_DEBUG_ALLOC_1
@@ -2545,24 +2677,31 @@
         /* uma_zfree(..., NULL) does nothing, to match free(9). */
         if (item == NULL)
                 return;
-
-	if (zone->uz_dtor)
-		zone->uz_dtor(item, zone->uz_size, udata);
-
+#ifdef DEBUG_MEMGUARD
+	if (is_memguard_addr(item)) {
+		if (zone->uz_dtor != NULL && zone->uz_dtor != mtrash_dtor)
+			zone->uz_dtor(item, zone->uz_size, udata);
+		if (zone->uz_fini != NULL && zone->uz_fini != mtrash_fini)
+			zone->uz_fini(item, zone->uz_size);
+		memguard_free(item);
+		return;
+	}
+#endif
 #ifdef INVARIANTS
-	ZONE_LOCK(zone);
 	if (zone->uz_flags & UMA_ZONE_MALLOC)
 		uma_dbg_free(zone, udata, item);
 	else
 		uma_dbg_free(zone, NULL, item);
-	ZONE_UNLOCK(zone);
 #endif
+	if (zone->uz_dtor != NULL)
+		zone->uz_dtor(item, zone->uz_size, udata);
+
 	/*
 	 * The race here is acceptable.  If we miss it we'll just have to wait
 	 * a little longer for the limits to be reset.
 	 */
 	if (zone->uz_flags & UMA_ZFLAG_FULL)
-		goto zfree_internal;
+		goto zfree_item;
 
 	/*
 	 * If possible, free to the per-CPU cache.  There are two
@@ -2581,45 +2720,25 @@
 	cache = &zone->uz_cpu[cpu];
 
 zfree_start:
-	bucket = cache->uc_freebucket;
+	/*
+	 * Try to free into the allocbucket first to give LIFO ordering
+	 * for cache-hot data structures.  Spill over into the freebucket
+	 * if necessary.  Alloc will swap them if one runs dry.
+	 */
+	bucket = cache->uc_allocbucket;
+	if (bucket == NULL || bucket->ub_cnt >= bucket->ub_entries)
+		bucket = cache->uc_freebucket;
+	if (bucket != NULL && bucket->ub_cnt < bucket->ub_entries) {
+		KASSERT(bucket->ub_bucket[bucket->ub_cnt] == NULL,
+		    ("uma_zfree: Freeing to non free bucket index."));
+		bucket->ub_bucket[bucket->ub_cnt] = item;
+		bucket->ub_cnt++;
+		cache->uc_frees++;
+		critical_exit();
+		return;
+	}
 
-	if (bucket) {
-		/*
-		 * Do we have room in our bucket? It is OK for this uz count
-		 * check to be slightly out of sync.
-		 */
-
-		if (bucket->ub_cnt < bucket->ub_entries) {
-			KASSERT(bucket->ub_bucket[bucket->ub_cnt] == NULL,
-			    ("uma_zfree: Freeing to non free bucket index."));
-			bucket->ub_bucket[bucket->ub_cnt] = item;
-			bucket->ub_cnt++;
-			cache->uc_frees++;
-			critical_exit();
-			return;
-		} else if (cache->uc_allocbucket) {
-#ifdef UMA_DEBUG_ALLOC
-			printf("uma_zfree: Swapping buckets.\n");
-#endif
-			/*
-			 * We have run out of space in our freebucket.
-			 * See if we can switch with our alloc bucket.
-			 */
-			if (cache->uc_allocbucket->ub_cnt <
-			    cache->uc_freebucket->ub_cnt) {
-				bucket = cache->uc_freebucket;
-				cache->uc_freebucket = cache->uc_allocbucket;
-				cache->uc_allocbucket = bucket;
-				goto zfree_start;
-			}
-		}
-	}
 	/*
-	 * We can get here for two reasons:
-	 *
-	 * 1) The buckets are NULL
-	 * 2) The alloc and free buckets are both somewhat full.
-	 *
 	 * We must go back the zone, which requires acquiring the zone lock,
 	 * which in turn means we must release and re-acquire the critical
 	 * section.  Since the critical section is released, we may be
@@ -2628,32 +2747,35 @@
 	 * the critical section.
 	 */
 	critical_exit();
-	ZONE_LOCK(zone);
+	if (zone->uz_count == 0 || bucketdisable)
+		goto zfree_item;
+
+	lockfail = 0;
+	if (ZONE_TRYLOCK(zone) == 0) {
+		/* Record contention to size the buckets. */
+		ZONE_LOCK(zone);
+		lockfail = 1;
+	}
 	critical_enter();
 	cpu = curcpu;
 	cache = &zone->uz_cpu[cpu];
-	if (cache->uc_freebucket != NULL) {
-		if (cache->uc_freebucket->ub_cnt <
-		    cache->uc_freebucket->ub_entries) {
-			ZONE_UNLOCK(zone);
-			goto zfree_start;
-		}
-		if (cache->uc_allocbucket != NULL &&
-		    (cache->uc_allocbucket->ub_cnt <
-		    cache->uc_freebucket->ub_cnt)) {
-			ZONE_UNLOCK(zone);
-			goto zfree_start;
-		}
-	}
 
-	/* Since we have locked the zone we may as well send back our stats */
-	zone->uz_allocs += cache->uc_allocs;
+	/*
+	 * Since we have locked the zone we may as well send back our stats.
+	 */
+	atomic_add_long(&zone->uz_allocs, cache->uc_allocs);
+	atomic_add_long(&zone->uz_frees, cache->uc_frees);
 	cache->uc_allocs = 0;
-	zone->uz_frees += cache->uc_frees;
 	cache->uc_frees = 0;
 
 	bucket = cache->uc_freebucket;
+	if (bucket != NULL && bucket->ub_cnt < bucket->ub_entries) {
+		ZONE_UNLOCK(zone);
+		goto zfree_start;
+	}
 	cache->uc_freebucket = NULL;
+	/* We are no longer associated with this CPU. */
+	critical_exit();
 
 	/* Can we throw this on the zone full list? */
 	if (bucket != NULL) {
@@ -2663,34 +2785,35 @@
 		/* ub_cnt is pointing to the last free item */
 		KASSERT(bucket->ub_cnt != 0,
 		    ("uma_zfree: Attempting to insert an empty bucket onto the full list.\n"));
-		LIST_INSERT_HEAD(&zone->uz_full_bucket,
-		    bucket, ub_link);
+		LIST_INSERT_HEAD(&zone->uz_buckets, bucket, ub_link);
 	}
-	if ((bucket = LIST_FIRST(&zone->uz_free_bucket)) != NULL) {
-		LIST_REMOVE(bucket, ub_link);
-		ZONE_UNLOCK(zone);
-		cache->uc_freebucket = bucket;
-		goto zfree_start;
-	}
-	/* We are no longer associated with this CPU. */
-	critical_exit();
 
-	/* And the zone.. */
+	/*
+	 * We bump the uz count when the cache size is insufficient to
+	 * handle the working set.
+	 */
+	if (lockfail && zone->uz_count < BUCKET_MAX)
+		zone->uz_count++;
 	ZONE_UNLOCK(zone);
 
 #ifdef UMA_DEBUG_ALLOC
 	printf("uma_zfree: Allocating new free bucket.\n");
 #endif
-	bflags = M_NOWAIT;
-
-	if (zone->uz_flags & UMA_ZFLAG_CACHEONLY)
-		bflags |= M_NOVM;
-	bucket = bucket_alloc(zone->uz_count, bflags);
+	bucket = bucket_alloc(zone, udata, M_NOWAIT);
 	if (bucket) {
-		ZONE_LOCK(zone);
-		LIST_INSERT_HEAD(&zone->uz_free_bucket,
-		    bucket, ub_link);
-		ZONE_UNLOCK(zone);
+		critical_enter();
+		cpu = curcpu;
+		cache = &zone->uz_cpu[cpu];
+		if (cache->uc_freebucket == NULL) {
+			cache->uc_freebucket = bucket;
+			goto zfree_start;
+		}
+		/*
+		 * We lost the race, start over.  We have to drop our
+		 * critical section to free the bucket.
+		 */
+		critical_exit();
+		bucket_free(zone, bucket, udata);
 		goto zfree_restart;
 	}
 
@@ -2697,63 +2820,18 @@
 	/*
 	 * If nothing else caught this, we'll just do an internal free.
 	 */
-zfree_internal:
-	zone_free_item(zone, item, udata, SKIP_DTOR, ZFREE_STATFREE);
+zfree_item:
+	zone_free_item(zone, item, udata, SKIP_DTOR);
 
 	return;
 }
 
-/*
- * Frees an item to an INTERNAL zone or allocates a free bucket
- *
- * Arguments:
- *	zone   The zone to free to
- *	item   The item we're freeing
- *	udata  User supplied data for the dtor
- *	skip   Skip dtors and finis
- */
 static void
-zone_free_item(uma_zone_t zone, void *item, void *udata,
-    enum zfreeskip skip, int flags)
+slab_free_item(uma_keg_t keg, uma_slab_t slab, void *item)
 {
-	uma_slab_t slab;
-	uma_slabrefcnt_t slabref;
-	uma_keg_t keg;
-	u_int8_t *mem;
-	u_int8_t freei;
-	int clearfull;
+	uint8_t freei;
 
-	if (skip < SKIP_DTOR && zone->uz_dtor)
-		zone->uz_dtor(item, zone->uz_size, udata);
-
-	if (skip < SKIP_FINI && zone->uz_fini)
-		zone->uz_fini(item, zone->uz_size);
-
-	ZONE_LOCK(zone);
-
-	if (flags & ZFREE_STATFAIL)
-		zone->uz_fails++;
-	if (flags & ZFREE_STATFREE)
-		zone->uz_frees++;
-
-	if (!(zone->uz_flags & UMA_ZONE_VTOSLAB)) {
-		mem = (u_int8_t *)((unsigned long)item & (~UMA_SLAB_MASK));
-		keg = zone_first_keg(zone); /* Must only be one. */
-		if (zone->uz_flags & UMA_ZONE_HASH) {
-			slab = hash_sfind(&keg->uk_hash, mem);
-		} else {
-			mem += keg->uk_pgoff;
-			slab = (uma_slab_t)mem;
-		}
-	} else {
-		/* This prevents redundant lookups via free(). */
-		if ((zone->uz_flags & UMA_ZONE_MALLOC) && udata != NULL)
-			slab = (uma_slab_t)udata;
-		else
-			slab = vtoslab((vm_offset_t)item);
-		keg = slab->us_keg;
-		keg_relock(keg, zone);
-	}
+	mtx_assert(&keg->uk_lock, MA_OWNED);
 	MPASS(keg == slab->us_keg);
 
 	/* Do we need to remove from any lists? */
@@ -2765,51 +2843,104 @@
 		LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link);
 	}
 
-	/* Slab management stuff */
-	freei = ((unsigned long)item - (unsigned long)slab->us_data)
-		/ keg->uk_rsize;
-
-#ifdef INVARIANTS
-	if (!skip)
-		uma_dbg_free(zone, slab, item);
-#endif
-
-	if (keg->uk_flags & UMA_ZONE_REFCNT) {
-		slabref = (uma_slabrefcnt_t)slab;
-		slabref->us_freelist[freei].us_item = slab->us_firstfree;
-	} else {
-		slab->us_freelist[freei].us_item = slab->us_firstfree;
-	}
-	slab->us_firstfree = freei;
+	/* Slab management. */
+	freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
+	BIT_SET(SLAB_SETSIZE, freei, &slab->us_free);
 	slab->us_freecount++;
 
-	/* Zone statistics */
+	/* Keg statistics. */
 	keg->uk_free++;
+}
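
slab_alloc_item and slab_free_item now keep per-slab free state in a bitset: BIT_FFS picks the lowest free index on allocation and BIT_SET returns it on free. The same bookkeeping in a tiny standalone form, using a plain unsigned mask and ffs(3) (illustrative names only):

#include <stdio.h>
#include <strings.h>		/* ffs() */

#define NITEMS	8

static unsigned freemask = (1u << NITEMS) - 1;	/* all items start free */

/* Allocate the lowest free index, or -1 if the slab is full. */
static int
slab_alloc_index(void)
{
	int freei = ffs((int)freemask) - 1;

	if (freei < 0)
		return (-1);
	freemask &= ~(1u << freei);		/* like BIT_CLR() */
	return (freei);
}

/* Hand an index back to the free set. */
static void
slab_free_index(int freei)
{

	freemask |= 1u << freei;		/* like BIT_SET() */
}

int
main(void)
{
	int a = slab_alloc_index();
	int b = slab_alloc_index();

	printf("allocated indexes %d and %d\n", a, b);	/* 0 and 1 */
	slab_free_index(a);
	printf("next allocation reuses index %d\n", slab_alloc_index());
	return (0);
}
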
 
+static void
+zone_release(uma_zone_t zone, void **bucket, int cnt)
+{
+	void *item;
+	uma_slab_t slab;
+	uma_keg_t keg;
+	uint8_t *mem;
+	int clearfull;
+	int i;
+
 	clearfull = 0;
-	if (keg->uk_flags & UMA_ZFLAG_FULL) {
-		if (keg->uk_pages < keg->uk_maxpages) {
-			keg->uk_flags &= ~UMA_ZFLAG_FULL;
-			clearfull = 1;
+	keg = zone_first_keg(zone);
+	KEG_LOCK(keg);
+	for (i = 0; i < cnt; i++) {
+		item = bucket[i];
+		if (!(zone->uz_flags & UMA_ZONE_VTOSLAB)) {
+			mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK));
+			if (zone->uz_flags & UMA_ZONE_HASH) {
+				slab = hash_sfind(&keg->uk_hash, mem);
+			} else {
+				mem += keg->uk_pgoff;
+				slab = (uma_slab_t)mem;
+			}
+		} else {
+			slab = vtoslab((vm_offset_t)item);
+			if (slab->us_keg != keg) {
+				KEG_UNLOCK(keg);
+				keg = slab->us_keg;
+				KEG_LOCK(keg);
+			}
 		}
+		slab_free_item(keg, slab, item);
+		if (keg->uk_flags & UMA_ZFLAG_FULL) {
+			if (keg->uk_pages < keg->uk_maxpages) {
+				keg->uk_flags &= ~UMA_ZFLAG_FULL;
+				clearfull = 1;
+			}
 
-		/* 
-		 * We can handle one more allocation. Since we're clearing ZFLAG_FULL,
-		 * wake up all procs blocked on pages. This should be uncommon, so 
-		 * keeping this simple for now (rather than adding count of blocked 
-		 * threads etc).
-		 */
-		wakeup(keg);
+			/* 
+			 * We can handle one more allocation. Since we're
+			 * clearing ZFLAG_FULL, wake up all procs blocked
+			 * on pages. This should be uncommon, so keeping this
+			 * simple for now (rather than adding count of blocked 
+			 * threads etc).
+			 */
+			wakeup(keg);
+		}
 	}
+	KEG_UNLOCK(keg);
 	if (clearfull) {
-		zone_relock(zone, keg);
+		ZONE_LOCK(zone);
 		zone->uz_flags &= ~UMA_ZFLAG_FULL;
 		wakeup(zone);
 		ZONE_UNLOCK(zone);
-	} else
-		KEG_UNLOCK(keg);
+	}
+
 }
 
+/*
+ * Frees a single item to any zone.
+ *
+ * Arguments:
+ *	zone   The zone to free to
+ *	item   The item we're freeing
+ *	udata  User supplied data for the dtor
+ *	skip   Skip dtors and finis
+ */
+static void
+zone_free_item(uma_zone_t zone, void *item, void *udata, enum zfreeskip skip)
+{
+
+#ifdef INVARIANTS
+	if (skip == SKIP_NONE) {
+		if (zone->uz_flags & UMA_ZONE_MALLOC)
+			uma_dbg_free(zone, udata, item);
+		else
+			uma_dbg_free(zone, NULL, item);
+	}
+#endif
+	if (skip < SKIP_DTOR && zone->uz_dtor)
+		zone->uz_dtor(item, zone->uz_size, udata);
+
+	if (skip < SKIP_FINI && zone->uz_fini)
+		zone->uz_fini(item, zone->uz_size);
+
+	atomic_add_long(&zone->uz_frees, 1);
+	zone->uz_release(zone->uz_arg, &item, 1);
+}
+
 /* See uma.h */
 int
 uma_zone_set_max(uma_zone_t zone, int nitems)
@@ -2816,13 +2947,15 @@
 {
 	uma_keg_t keg;
 
-	ZONE_LOCK(zone);
 	keg = zone_first_keg(zone);
+	if (keg == NULL)
+		return (0);
+	KEG_LOCK(keg);
 	keg->uk_maxpages = (nitems / keg->uk_ipers) * keg->uk_ppera;
 	if (keg->uk_maxpages * keg->uk_ipers < nitems)
 		keg->uk_maxpages += keg->uk_ppera;
-	nitems = keg->uk_maxpages * keg->uk_ipers;
-	ZONE_UNLOCK(zone);
+	nitems = (keg->uk_maxpages / keg->uk_ppera) * keg->uk_ipers;
+	KEG_UNLOCK(keg);
 
 	return (nitems);
 }
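
uma_zone_set_max and uma_zone_get_max now report capacity as (uk_maxpages / uk_ppera) * uk_ipers, which matters once a slab spans more than one page; the old maxpages * ipers form overstates it. With made-up numbers:

#include <stdio.h>

int
main(void)
{
	int ipers = 3, ppera = 2;	/* 3 items per two-page slab (made up) */
	int nitems = 10;		/* requested limit */
	int maxpages;

	maxpages = (nitems / ipers) * ppera;	/* 6 pages, i.e. 3 slabs */
	if (maxpages * ipers < nitems)
		maxpages += ppera;

	printf("old report: %d items\n", maxpages * ipers);		/* 18 */
	printf("new report: %d items\n", (maxpages / ppera) * ipers);	/* 9 */
	return (0);
}
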
@@ -2834,15 +2967,27 @@
 	int nitems;
 	uma_keg_t keg;
 
-	ZONE_LOCK(zone);
 	keg = zone_first_keg(zone);
-	nitems = keg->uk_maxpages * keg->uk_ipers;
-	ZONE_UNLOCK(zone);
+	if (keg == NULL)
+		return (0);
+	KEG_LOCK(keg);
+	nitems = (keg->uk_maxpages / keg->uk_ppera) * keg->uk_ipers;
+	KEG_UNLOCK(keg);
 
 	return (nitems);
 }
 
 /* See uma.h */
+void
+uma_zone_set_warning(uma_zone_t zone, const char *warning)
+{
+
+	ZONE_LOCK(zone);
+	zone->uz_warning = warning;
+	ZONE_UNLOCK(zone);
+}
+
+/* See uma.h */
 int
 uma_zone_get_cur(uma_zone_t zone)
 {
@@ -2871,12 +3016,13 @@
 {
 	uma_keg_t keg;
 
-	ZONE_LOCK(zone);
 	keg = zone_first_keg(zone);
+	KASSERT(keg != NULL, ("uma_zone_set_init: Invalid zone type"));
+	KEG_LOCK(keg);
 	KASSERT(keg->uk_pages == 0,
 	    ("uma_zone_set_init on non-empty keg"));
 	keg->uk_init = uminit;
-	ZONE_UNLOCK(zone);
+	KEG_UNLOCK(keg);
 }
 
 /* See uma.h */
@@ -2885,12 +3031,13 @@
 {
 	uma_keg_t keg;
 
-	ZONE_LOCK(zone);
 	keg = zone_first_keg(zone);
+	KASSERT(keg != NULL, ("uma_zone_set_fini: Invalid zone type"));
+	KEG_LOCK(keg);
 	KASSERT(keg->uk_pages == 0,
 	    ("uma_zone_set_fini on non-empty keg"));
 	keg->uk_fini = fini;
-	ZONE_UNLOCK(zone);
+	KEG_UNLOCK(keg);
 }
 
 /* See uma.h */
@@ -2897,6 +3044,7 @@
 void
 uma_zone_set_zinit(uma_zone_t zone, uma_init zinit)
 {
+
 	ZONE_LOCK(zone);
 	KASSERT(zone_first_keg(zone)->uk_pages == 0,
 	    ("uma_zone_set_zinit on non-empty keg"));
@@ -2908,6 +3056,7 @@
 void
 uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini)
 {
+
 	ZONE_LOCK(zone);
 	KASSERT(zone_first_keg(zone)->uk_pages == 0,
 	    ("uma_zone_set_zfini on non-empty keg"));
@@ -2920,10 +3069,13 @@
 void
 uma_zone_set_freef(uma_zone_t zone, uma_free freef)
 {
+	uma_keg_t keg;
 
-	ZONE_LOCK(zone);
-	zone_first_keg(zone)->uk_freef = freef;
-	ZONE_UNLOCK(zone);
+	keg = zone_first_keg(zone);
+	KASSERT(keg != NULL, ("uma_zone_set_freef: Invalid zone type"));
+	KEG_LOCK(keg);
+	keg->uk_freef = freef;
+	KEG_UNLOCK(keg);
 }
 
 /* See uma.h */
@@ -2933,44 +3085,67 @@
 {
 	uma_keg_t keg;
 
-	ZONE_LOCK(zone);
 	keg = zone_first_keg(zone);
-	keg->uk_flags |= UMA_ZFLAG_PRIVALLOC;
+	KEG_LOCK(keg);
 	keg->uk_allocf = allocf;
-	ZONE_UNLOCK(zone);
+	KEG_UNLOCK(keg);
 }
 
 /* See uma.h */
+void
+uma_zone_reserve(uma_zone_t zone, int items)
+{
+	uma_keg_t keg;
+
+	keg = zone_first_keg(zone);
+	if (keg == NULL)
+		return;
+	KEG_LOCK(keg);
+	keg->uk_reserve = items;
+	KEG_UNLOCK(keg);
+
+	return;
+}
+
+/* See uma.h */
 int
-uma_zone_set_obj(uma_zone_t zone, struct vm_object *obj, int count)
+uma_zone_reserve_kva(uma_zone_t zone, int count)
 {
 	uma_keg_t keg;
 	vm_offset_t kva;
-	int pages;
+	u_int pages;
 
 	keg = zone_first_keg(zone);
+	if (keg == NULL)
+		return (0);
 	pages = count / keg->uk_ipers;
 
 	if (pages * keg->uk_ipers < count)
 		pages++;
+	pages *= keg->uk_ppera;
 
-	kva = kmem_alloc_nofault(kernel_map, pages * UMA_SLAB_SIZE);
-
-	if (kva == 0)
-		return (0);
-	if (obj == NULL)
-		obj = vm_object_allocate(OBJT_PHYS, pages);
-	else {
-		VM_OBJECT_LOCK_INIT(obj, "uma object");
-		_vm_object_allocate(OBJT_PHYS, pages, obj);
-	}
-	ZONE_LOCK(zone);
+#ifdef UMA_MD_SMALL_ALLOC
+	if (keg->uk_ppera > 1) {
+#else
+	if (1) {
+#endif
+		kva = kva_alloc((vm_size_t)pages * PAGE_SIZE);
+		if (kva == 0)
+			return (0);
+	} else
+		kva = 0;
+	KEG_LOCK(keg);
 	keg->uk_kva = kva;
-	keg->uk_obj = obj;
+	keg->uk_offset = 0;
 	keg->uk_maxpages = pages;
-	keg->uk_allocf = obj_alloc;
-	keg->uk_flags |= UMA_ZONE_NOFREE | UMA_ZFLAG_PRIVALLOC;
-	ZONE_UNLOCK(zone);
+#ifdef UMA_MD_SMALL_ALLOC
+	keg->uk_allocf = (keg->uk_ppera > 1) ? noobj_alloc : uma_small_alloc;
+#else
+	keg->uk_allocf = noobj_alloc;
+#endif
+	keg->uk_flags |= UMA_ZONE_NOFREE;
+	KEG_UNLOCK(keg);
+
 	return (1);
 }
 
@@ -2983,7 +3158,9 @@
 	uma_keg_t keg;
 
 	keg = zone_first_keg(zone);
-	ZONE_LOCK(zone);
+	if (keg == NULL)
+		return;
+	KEG_LOCK(keg);
 	slabs = items / keg->uk_ipers;
 	if (slabs * keg->uk_ipers < items)
 		slabs++;
@@ -2995,38 +3172,44 @@
 		LIST_INSERT_HEAD(&keg->uk_free_slab, slab, us_link);
 		slabs--;
 	}
-	ZONE_UNLOCK(zone);
+	KEG_UNLOCK(keg);
 }
 
 /* See uma.h */
-u_int32_t *
+uint32_t *
 uma_find_refcnt(uma_zone_t zone, void *item)
 {
 	uma_slabrefcnt_t slabref;
+	uma_slab_t slab;
 	uma_keg_t keg;
-	u_int32_t *refcnt;
+	uint32_t *refcnt;
 	int idx;
 
-	slabref = (uma_slabrefcnt_t)vtoslab((vm_offset_t)item &
-	    (~UMA_SLAB_MASK));
-	keg = slabref->us_keg;
-	KASSERT(slabref != NULL && slabref->us_keg->uk_flags & UMA_ZONE_REFCNT,
+	slab = vtoslab((vm_offset_t)item & (~UMA_SLAB_MASK));
+	slabref = (uma_slabrefcnt_t)slab;
+	keg = slab->us_keg;
+	KASSERT(keg->uk_flags & UMA_ZONE_REFCNT,
 	    ("uma_find_refcnt(): zone possibly not UMA_ZONE_REFCNT"));
-	idx = ((unsigned long)item - (unsigned long)slabref->us_data)
-	    / keg->uk_rsize;
-	refcnt = &slabref->us_freelist[idx].us_refcnt;
+	idx = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
+	refcnt = &slabref->us_refcnt[idx];
 	return refcnt;
 }
 
 /* See uma.h */
-void
-uma_reclaim(void)
+static void
+uma_reclaim_locked(bool kmem_danger)
 {
+
 #ifdef UMA_DEBUG
 	printf("UMA: vm asked us to release pages!\n");
 #endif
+	sx_assert(&uma_drain_lock, SA_XLOCKED);
 	bucket_enable();
 	zone_foreach(zone_drain);
+	if (vm_page_count_min() || kmem_danger) {
+		cache_drain_safe(NULL);
+		zone_foreach(zone_drain);
+	}
 	/*
 	 * Some slabs may have been freed but this zone will be visited early
 	 * we visit again so that we can free pages that are empty once other
@@ -3037,6 +3220,43 @@
 	bucket_zone_drain();
 }
 
+void
+uma_reclaim(void)
+{
+
+	sx_xlock(&uma_drain_lock);
+	uma_reclaim_locked(false);
+	sx_xunlock(&uma_drain_lock);
+}
+
+static int uma_reclaim_needed;
+
+void
+uma_reclaim_wakeup(void)
+{
+
+	uma_reclaim_needed = 1;
+	wakeup(&uma_reclaim_needed);
+}
+
+void
+uma_reclaim_worker(void *arg __unused)
+{
+
+	sx_xlock(&uma_drain_lock);
+	for (;;) {
+		sx_sleep(&uma_reclaim_needed, &uma_drain_lock, PVM,
+		    "umarcl", 0);
+		if (uma_reclaim_needed) {
+			uma_reclaim_needed = 0;
+			sx_xunlock(&uma_drain_lock);
+			EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_KMEM);
+			sx_xlock(&uma_drain_lock);
+			uma_reclaim_locked(true);
+		}
+	}
+}
+
 /* See uma.h */
 int
 uma_zone_exhausted(uma_zone_t zone)
@@ -3056,11 +3276,11 @@
 }
 
 void *
-uma_large_malloc(int size, int wait)
+uma_large_malloc(vm_size_t size, int wait)
 {
 	void *mem;
 	uma_slab_t slab;
-	u_int8_t flags;
+	uint8_t flags;
 
 	slab = zone_alloc_item(slabzone, NULL, wait);
 	if (slab == NULL)
@@ -3072,8 +3292,7 @@
 		slab->us_flags = flags | UMA_SLAB_MALLOC;
 		slab->us_size = size;
 	} else {
-		zone_free_item(slabzone, slab, NULL, SKIP_NONE,
-		    ZFREE_STATFAIL | ZFREE_STATFREE);
+		zone_free_item(slabzone, slab, NULL, SKIP_NONE);
 	}
 
 	return (mem);
@@ -3082,11 +3301,22 @@
 void
 uma_large_free(uma_slab_t slab)
 {
-	vsetobj((vm_offset_t)slab->us_data, kmem_object);
+
 	page_free(slab->us_data, slab->us_size, slab->us_flags);
-	zone_free_item(slabzone, slab, NULL, SKIP_NONE, ZFREE_STATFREE);
+	zone_free_item(slabzone, slab, NULL, SKIP_NONE);
 }
 
+static void
+uma_zero_item(void *item, uma_zone_t zone)
+{
+
+	if (zone->uz_flags & UMA_ZONE_PCPU) {
+		for (int i = 0; i < mp_ncpus; i++)
+			bzero(zpcpu_get_cpu(item, i), zone->uz_size);
+	} else
+		bzero(item, zone->uz_size);
+}
+
 void
 uma_print_stats(void)
 {
@@ -3096,9 +3326,8 @@
 static void
 slab_print(uma_slab_t slab)
 {
-	printf("slab: keg %p, data %p, freecount %d, firstfree %d\n",
-		slab->us_keg, slab->us_data, slab->us_freecount,
-		slab->us_firstfree);
+	printf("slab: keg %p, data %p, freecount %d\n",
+		slab->us_keg, slab->us_data, slab->us_freecount);
 }
 
 static void
@@ -3120,8 +3349,8 @@
 	    "out %d free %d limit %d\n",
 	    keg->uk_name, keg, keg->uk_size, keg->uk_rsize, keg->uk_flags,
 	    keg->uk_ipers, keg->uk_ppera,
-	    (keg->uk_ipers * keg->uk_pages) - keg->uk_free, keg->uk_free,
-	    (keg->uk_maxpages / keg->uk_ppera) * keg->uk_ipers);
+	    (keg->uk_pages / keg->uk_ppera) * keg->uk_ipers - keg->uk_free,
+	    keg->uk_free, (keg->uk_maxpages / keg->uk_ppera) * keg->uk_ipers);
 	printf("Part slabs:\n");
 	LIST_FOREACH(slab, &keg->uk_part_slab, us_link)
 		slab_print(slab);
@@ -3164,11 +3393,11 @@
  * directly so that we don't have to.
  */
 static void
-uma_zone_sumstat(uma_zone_t z, int *cachefreep, u_int64_t *allocsp,
-    u_int64_t *freesp, u_int64_t *sleepsp)
+uma_zone_sumstat(uma_zone_t z, int *cachefreep, uint64_t *allocsp,
+    uint64_t *freesp, uint64_t *sleepsp)
 {
 	uma_cache_t cache;
-	u_int64_t allocs, frees, sleeps;
+	uint64_t allocs, frees, sleeps;
 	int cachefree, cpu;
 
 	allocs = frees = sleeps = 0;
@@ -3204,12 +3433,12 @@
 	int count;
 
 	count = 0;
-	mtx_lock(&uma_mtx);
+	rw_rlock(&uma_rwlock);
 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
 		LIST_FOREACH(z, &kz->uk_zones, uz_link)
 			count++;
 	}
-	mtx_unlock(&uma_mtx);
+	rw_runlock(&uma_rwlock);
 	return (sysctl_handle_int(oidp, &count, 0, req));
 }
 
@@ -3234,7 +3463,7 @@
 	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
 
 	count = 0;
-	mtx_lock(&uma_mtx);
+	rw_rlock(&uma_rwlock);
 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
 		LIST_FOREACH(z, &kz->uk_zones, uz_link)
 			count++;
@@ -3274,7 +3503,7 @@
 			    (LIST_FIRST(&kz->uk_zones) != z))
 				uth.uth_zone_flags = UTH_ZONE_SECONDARY;
 
-			LIST_FOREACH(bucket, &z->uz_full_bucket, ub_link)
+			LIST_FOREACH(bucket, &z->uz_buckets, ub_link)
 				uth.uth_zone_free += bucket->ub_cnt;
 			uth.uth_allocs = z->uz_allocs;
 			uth.uth_frees = z->uz_frees;
@@ -3310,23 +3539,52 @@
 			ZONE_UNLOCK(z);
 		}
 	}
-	mtx_unlock(&uma_mtx);
+	rw_runlock(&uma_rwlock);
 	error = sbuf_finish(&sbuf);
 	sbuf_delete(&sbuf);
 	return (error);
 }
 
+int
+sysctl_handle_uma_zone_max(SYSCTL_HANDLER_ARGS)
+{
+	uma_zone_t zone = *(uma_zone_t *)arg1;
+	int error, max, old;
+
+	old = max = uma_zone_get_max(zone);
+	error = sysctl_handle_int(oidp, &max, 0, req);
+	if (error || !req->newptr)
+		return (error);
+
+	if (max < old)
+		return (EINVAL);
+
+	uma_zone_set_max(zone, max);
+
+	return (0);
+}
+
+int
+sysctl_handle_uma_zone_cur(SYSCTL_HANDLER_ARGS)
+{
+	uma_zone_t zone = *(uma_zone_t *)arg1;
+	int cur;
+
+	cur = uma_zone_get_cur(zone);
+	return (sysctl_handle_int(oidp, &cur, 0, req));
+}
+
 #ifdef DDB
 DB_SHOW_COMMAND(uma, db_show_uma)
 {
-	u_int64_t allocs, frees, sleeps;
+	uint64_t allocs, frees, sleeps;
 	uma_bucket_t bucket;
 	uma_keg_t kz;
 	uma_zone_t z;
 	int cachefree;
 
-	db_printf("%18s %8s %8s %8s %12s %8s\n", "Zone", "Size", "Used", "Free",
-	    "Requests", "Sleeps");
+	db_printf("%18s %8s %8s %8s %12s %8s %8s\n", "Zone", "Size", "Used",
+	    "Free", "Requests", "Sleeps", "Bucket");
 	LIST_FOREACH(kz, &uma_kegs, uk_link) {
 		LIST_FOREACH(z, &kz->uk_zones, uz_link) {
 			if (kz->uk_flags & UMA_ZFLAG_INTERNAL) {
@@ -3340,15 +3598,37 @@
 			if (!((z->uz_flags & UMA_ZONE_SECONDARY) &&
 			    (LIST_FIRST(&kz->uk_zones) != z)))
 				cachefree += kz->uk_free;
-			LIST_FOREACH(bucket, &z->uz_full_bucket, ub_link)
+			LIST_FOREACH(bucket, &z->uz_buckets, ub_link)
 				cachefree += bucket->ub_cnt;
-			db_printf("%18s %8ju %8jd %8d %12ju %8ju\n", z->uz_name,
-			    (uintmax_t)kz->uk_size,
+			db_printf("%18s %8ju %8jd %8d %12ju %8ju %8u\n",
+			    z->uz_name, (uintmax_t)kz->uk_size,
 			    (intmax_t)(allocs - frees), cachefree,
-			    (uintmax_t)allocs, sleeps);
+			    (uintmax_t)allocs, sleeps, z->uz_count);
 			if (db_pager_quit)
 				return;
 		}
 	}
 }
+
+DB_SHOW_COMMAND(umacache, db_show_umacache)
+{
+	uint64_t allocs, frees;
+	uma_bucket_t bucket;
+	uma_zone_t z;
+	int cachefree;
+
+	db_printf("%18s %8s %8s %8s %12s %8s\n", "Zone", "Size", "Used", "Free",
+	    "Requests", "Bucket");
+	LIST_FOREACH(z, &uma_cachezones, uz_link) {
+		uma_zone_sumstat(z, &cachefree, &allocs, &frees, NULL);
+		LIST_FOREACH(bucket, &z->uz_buckets, ub_link)
+			cachefree += bucket->ub_cnt;
+		db_printf("%18s %8ju %8jd %8d %12ju %8u\n",
+		    z->uz_name, (uintmax_t)z->uz_size,
+		    (intmax_t)(allocs - frees), cachefree,
+		    (uintmax_t)allocs, z->uz_count);
+		if (db_pager_quit)
+			return;
+	}
+}
 #endif

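[Note on the uma_reclaim changes above] The new uma_reclaim_wakeup()/uma_reclaim_worker() pair is a flag-plus-sleep handoff: the waker sets uma_reclaim_needed and calls wakeup(), while the worker sleeps on that flag under uma_drain_lock, clears it, drops the lock to fire the vm_lowmem eventhandlers, then re-locks and drains. A rough userland sketch of the same pattern, using POSIX threads in place of sx_sleep()/wakeup() (all names below are illustrative, not part of UMA):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t drain_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  drain_cv = PTHREAD_COND_INITIALIZER;
static int reclaim_needed;		/* stand-in for uma_reclaim_needed */

/* Stand-in for uma_reclaim_wakeup(): set the flag and wake the worker. */
static void
reclaim_wakeup(void)
{

	pthread_mutex_lock(&drain_lock);
	reclaim_needed = 1;
	pthread_cond_signal(&drain_cv);
	pthread_mutex_unlock(&drain_lock);
}

/* Stand-in for uma_reclaim_worker(): sleep until asked, then reclaim. */
static void *
reclaim_worker(void *arg)
{

	(void)arg;
	pthread_mutex_lock(&drain_lock);
	for (;;) {
		while (reclaim_needed == 0)
			pthread_cond_wait(&drain_cv, &drain_lock);
		reclaim_needed = 0;
		/*
		 * The kernel drops the drain lock here, fires the
		 * vm_lowmem handlers, re-locks and then drains; the
		 * printf stands in for that work.
		 */
		pthread_mutex_unlock(&drain_lock);
		printf("reclaiming\n");
		pthread_mutex_lock(&drain_lock);
	}
	return (NULL);
}

int
main(void)
{
	pthread_t tid;

	pthread_create(&tid, NULL, reclaim_worker, NULL);
	reclaim_wakeup();
	sleep(1);	/* give the worker a chance to run once */
	return (0);
}

As in the kernel version, the flag is re-checked under the lock before draining, so a wakeup that races with an in-progress drain is not lost.
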
Modified: trunk/sys/vm/uma_dbg.c
===================================================================
--- trunk/sys/vm/uma_dbg.c	2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/uma_dbg.c	2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 2002, 2003, 2004, 2005 Jeffrey Roberson <jeff at FreeBSD.org>
  * Copyright (c) 2004, 2005 Bosko Milekic <bmilekic at FreeBSD.org>
@@ -31,10 +32,11 @@
  */
 
 #include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/vm/uma_dbg.c 252040 2013-06-20 19:08:12Z jeff $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
+#include <sys/bitset.h>
 #include <sys/kernel.h>
 #include <sys/types.h>
 #include <sys/queue.h>
@@ -49,7 +51,7 @@
 #include <vm/uma_int.h>
 #include <vm/uma_dbg.h>
 
-static const u_int32_t uma_junk = 0xdeadc0de;
+static const uint32_t uma_junk = 0xdeadc0de;
 
 /*
  * Checks an item to make sure it hasn't been overwritten since it was freed,
@@ -62,7 +64,7 @@
 trash_ctor(void *mem, int size, void *arg, int flags)
 {
 	int cnt;
-	u_int32_t *p;
+	uint32_t *p;
 
 	cnt = size / sizeof(uma_junk);
 
@@ -85,7 +87,7 @@
 trash_dtor(void *mem, int size, void *arg)
 {
 	int cnt;
-	u_int32_t *p;
+	uint32_t *p;
 
 	cnt = size / sizeof(uma_junk);
 
@@ -122,7 +124,7 @@
 mtrash_ctor(void *mem, int size, void *arg, int flags)
 {
 	struct malloc_type **ksp;
-	u_int32_t *p = mem;
+	uint32_t *p = mem;
 	int cnt;
 
 	size -= sizeof(struct malloc_type *);
@@ -150,7 +152,7 @@
 mtrash_dtor(void *mem, int size, void *arg)
 {
 	int cnt;
-	u_int32_t *p;
+	uint32_t *p;
 
 	size -= sizeof(struct malloc_type *);
 	cnt = size / sizeof(uma_junk);
@@ -191,22 +193,30 @@
 	(void)mtrash_ctor(mem, size, NULL, 0);
 }
 
+#ifdef INVARIANTS
 static uma_slab_t
 uma_dbg_getslab(uma_zone_t zone, void *item)
 {
 	uma_slab_t slab;
 	uma_keg_t keg;
-	u_int8_t *mem;
+	uint8_t *mem;
 
-	mem = (u_int8_t *)((unsigned long)item & (~UMA_SLAB_MASK));
+	mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK));
 	if (zone->uz_flags & UMA_ZONE_VTOSLAB) {
 		slab = vtoslab((vm_offset_t)mem);
 	} else {
+		/*
+		 * It is safe to return the slab here even though the
+		 * zone is unlocked because the item's allocation state
+		 * essentially holds a reference.
+		 */
+		ZONE_LOCK(zone);
 		keg = LIST_FIRST(&zone->uz_kegs)->kl_keg;
 		if (keg->uk_flags & UMA_ZONE_HASH)
 			slab = hash_sfind(&keg->uk_hash, mem);
 		else
 			slab = (uma_slab_t)(mem + keg->uk_pgoff);
+		ZONE_UNLOCK(zone);
 	}
 
 	return (slab);
@@ -216,14 +226,14 @@
  * Set up the slab's freei data such that uma_dbg_free can function.
  *
  */
-
 void
 uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item)
 {
 	uma_keg_t keg;
-	uma_slabrefcnt_t slabref;
 	int freei;
 
+	if (zone_first_keg(zone) == NULL)
+		return;
 	if (slab == NULL) {
 		slab = uma_dbg_getslab(zone, item);
 		if (slab == NULL) 
@@ -231,17 +241,13 @@
 			    item, zone->uz_name);
 	}
 	keg = slab->us_keg;
+	freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
 
-	freei = ((unsigned long)item - (unsigned long)slab->us_data)
-	    / keg->uk_rsize;
+	if (BIT_ISSET(SLAB_SETSIZE, freei, &slab->us_debugfree))
+		panic("Duplicate alloc of %p from zone %p(%s) slab %p(%d)\n",
+		    item, zone, zone->uz_name, slab, freei);
+	BIT_SET_ATOMIC(SLAB_SETSIZE, freei, &slab->us_debugfree);
 
-	if (keg->uk_flags & UMA_ZONE_REFCNT) {
-		slabref = (uma_slabrefcnt_t)slab;
-		slabref->us_freelist[freei].us_item = 255;
-	} else {
-		slab->us_freelist[freei].us_item = 255;
-	}
-
 	return;
 }
 
@@ -250,14 +256,14 @@
  * and duplicate frees.
  *
  */
-
 void
 uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item)
 {
 	uma_keg_t keg;
-	uma_slabrefcnt_t slabref;
 	int freei;
 
+	if (zone_first_keg(zone) == NULL)
+		return;
 	if (slab == NULL) {
 		slab = uma_dbg_getslab(zone, item);
 		if (slab == NULL) 
@@ -265,49 +271,21 @@
 			    item, zone->uz_name);
 	}
 	keg = slab->us_keg;
+	freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
 
-	freei = ((unsigned long)item - (unsigned long)slab->us_data)
-	    / keg->uk_rsize;
-
 	if (freei >= keg->uk_ipers)
-		panic("zone: %s(%p) slab %p freelist %d out of range 0-%d\n",
-		    zone->uz_name, zone, slab, freei, keg->uk_ipers-1);
+		panic("Invalid free of %p from zone %p(%s) slab %p(%d)\n",
+		    item, zone, zone->uz_name, slab, freei);
 
-	if (((freei * keg->uk_rsize) + slab->us_data) != item) {
-		printf("zone: %s(%p) slab %p freed address %p unaligned.\n",
-		    zone->uz_name, zone, slab, item);
-		panic("should be %p\n",
-		    (freei * keg->uk_rsize) + slab->us_data);
-	}
+	if (((freei * keg->uk_rsize) + slab->us_data) != item) 
+		panic("Unaligned free of %p from zone %p(%s) slab %p(%d)\n",
+		    item, zone, zone->uz_name, slab, freei);
 
-	if (keg->uk_flags & UMA_ZONE_REFCNT) {
-		slabref = (uma_slabrefcnt_t)slab;
-		if (slabref->us_freelist[freei].us_item != 255) {
-			printf("Slab at %p, freei %d = %d.\n",
-			    slab, freei, slabref->us_freelist[freei].us_item);
-			panic("Duplicate free of item %p from zone %p(%s)\n",
-			    item, zone, zone->uz_name);
-		}
+	if (!BIT_ISSET(SLAB_SETSIZE, freei, &slab->us_debugfree))
+		panic("Duplicate free of %p from zone %p(%s) slab %p(%d)\n",
+		    item, zone, zone->uz_name, slab, freei);
 
-		/*
-		 * When this is actually linked into the slab this will change.
-		 * Until then the count of valid slabs will make sure we don't
-		 * accidentally follow this and assume it's a valid index.
-		 */
-		slabref->us_freelist[freei].us_item = 0;
-	} else {
-		if (slab->us_freelist[freei].us_item != 255) {
-			printf("Slab at %p, freei %d = %d.\n",
-			    slab, freei, slab->us_freelist[freei].us_item);
-			panic("Duplicate free of item %p from zone %p(%s)\n",
-			    item, zone, zone->uz_name);
-		}
+	BIT_CLR_ATOMIC(SLAB_SETSIZE, freei, &slab->us_debugfree);
+}
 
-		/*
-		 * When this is actually linked into the slab this will change.
-		 * Until then the count of valid slabs will make sure we don't
-		 * accidentally follow this and assume it's a valid index.
-		 */
-		slab->us_freelist[freei].us_item = 0;
-	}
-}
+#endif /* INVARIANTS */

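[Note on the uma_dbg changes above] uma_dbg_alloc()/uma_dbg_free() now track item state with a per-slab bitset instead of the old per-item freelist bytes: the index is (item - us_data) / uk_rsize, an allocation must find the bit clear and then sets it, a free must find it set and then clears it, and any mismatch panics as a duplicate alloc or free. A minimal userland sketch of that bookkeeping, using plain uint64_t words rather than the kernel's BITSET macros (the structure and names here are made up for illustration):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define ITEMS_PER_SLAB	64		/* items tracked per slab */

struct dbg_slab {
	char		*data;		/* first item (like us_data) */
	size_t		 rsize;		/* item size (like uk_rsize) */
	uint64_t	 used[ITEMS_PER_SLAB / 64];	/* like us_debugfree */
};

static size_t
item_index(const struct dbg_slab *s, const void *item)
{

	return (((uintptr_t)item - (uintptr_t)s->data) / s->rsize);
}

/* Analogue of uma_dbg_alloc(): mark the item live, catch double allocs. */
static void
dbg_alloc(struct dbg_slab *s, void *item)
{
	size_t i = item_index(s, item);

	assert(i < ITEMS_PER_SLAB);
	if (s->used[i / 64] & (UINT64_C(1) << (i % 64))) {
		fprintf(stderr, "duplicate alloc of item %zu\n", i);
		abort();
	}
	s->used[i / 64] |= UINT64_C(1) << (i % 64);
}

/* Analogue of uma_dbg_free(): the item must be live, then mark it free. */
static void
dbg_free(struct dbg_slab *s, void *item)
{
	size_t i = item_index(s, item);

	assert(i < ITEMS_PER_SLAB);
	if (!(s->used[i / 64] & (UINT64_C(1) << (i % 64)))) {
		fprintf(stderr, "duplicate free of item %zu\n", i);
		abort();
	}
	s->used[i / 64] &= ~(UINT64_C(1) << (i % 64));
}

int
main(void)
{
	struct dbg_slab s;

	memset(&s, 0, sizeof(s));
	s.rsize = 32;
	s.data = malloc(ITEMS_PER_SLAB * s.rsize);
	dbg_alloc(&s, s.data + 3 * s.rsize);
	dbg_free(&s, s.data + 3 * s.rsize);
	free(s.data);
	return (0);
}

The kernel uses BIT_SET_ATOMIC()/BIT_CLR_ATOMIC() because several CPUs can touch the same slab's bitset word concurrently; the sketch above skips that detail.
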
Modified: trunk/sys/vm/uma_dbg.h
===================================================================
--- trunk/sys/vm/uma_dbg.h	2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/uma_dbg.h	2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 2002, 2003, 2004, 2005 Jeffrey Roberson <jeff at FreeBSD.org>
  * Copyright (c) 2004, 2005 Bosko Milekic <bmilekic at FreeBSD.org>
@@ -24,7 +25,7 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/vm/uma_dbg.h 148078 2005-07-16 09:51:52Z rwatson $
  *
  */
 

Modified: trunk/sys/vm/uma_int.h
===================================================================
--- trunk/sys/vm/uma_int.h	2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/uma_int.h	2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,5 +1,6 @@
+/* $MidnightBSD$ */
 /*-
- * Copyright (c) 2002-2005, 2009 Jeffrey Roberson <jeff at FreeBSD.org>
+ * Copyright (c) 2002-2005, 2009, 2013 Jeffrey Roberson <jeff at FreeBSD.org>
  * Copyright (c) 2004, 2005 Bosko Milekic <bmilekic at FreeBSD.org>
  * All rights reserved.
  *
@@ -24,7 +25,7 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/vm/uma_int.h 316835 2017-04-14 14:11:59Z avg $
  *
  */
 
@@ -45,21 +46,10 @@
  *  
  * The uma_slab_t may be embedded in a UMA_SLAB_SIZE chunk of memory or it may
  * be allocated off the page from a special slab zone.  The free list within a
- * slab is managed with a linked list of indices, which are 8 bit values.  If
- * UMA_SLAB_SIZE is defined to be too large I will have to switch to 16bit
- * values.  Currently on alpha you can get 250 or so 32 byte items and on x86
- * you can get 250 or so 16byte items.  For item sizes that would yield more
- * than 10% memory waste we potentially allocate a separate uma_slab_t if this
- * will improve the number of items per slab that will fit.  
+ * slab is managed with a bitmask.  For item sizes that would yield more than
+ * 10% memory waste we potentially allocate a separate uma_slab_t if this will
+ * improve the number of items per slab that will fit.  
  *
- * Other potential space optimizations are storing the 8bit of linkage in space
- * wasted between items due to alignment problems.  This may yield a much better
- * memory footprint for certain sizes of objects.  Another alternative is to
- * increase the UMA_SLAB_SIZE, or allow for dynamic slab sizes.  I prefer
- * dynamic slab sizes because we could stick with 8 bit indices and only use
- * large slab sizes for zones with a lot of waste per slab.  This may create
- * inefficiencies in the vm subsystem due to fragmentation in the address space.
- *
  * The only really gross cases, with regards to memory waste, are for those
  * items that are just over half the page size.   You can get nearly 50% waste,
  * so you fall back to the memory footprint of the power of two allocator. I
@@ -120,8 +110,8 @@
 
 #define UMA_BOOT_PAGES		64	/* Pages allocated for startup */
 
-/* Max waste before going to off page slab management */
-#define UMA_MAX_WASTE	(UMA_SLAB_SIZE / 10)
+/* Max waste percentage before going to off page slab management */
+#define UMA_MAX_WASTE	10
 
 /*
  * I doubt there will be many cases where this is exceeded. This is the initial
@@ -133,14 +123,9 @@
 /* 
  * I should investigate other hashing algorithms.  This should yield a low
  * number of collisions if the pages are relatively contiguous.
- *
- * This is the same algorithm that most processor caches use.
- *
- * I'm shifting and masking instead of % because it should be faster.
  */
 
-#define UMA_HASH(h, s) ((((unsigned long)s) >> UMA_SLAB_SHIFT) &	\
-    (h)->uh_hashmask)
+#define UMA_HASH(h, s) ((((uintptr_t)s) >> UMA_SLAB_SHIFT) & (h)->uh_hashmask)
 
 #define UMA_HASH_INSERT(h, s, mem)					\
 		SLIST_INSERT_HEAD(&(h)->uh_slab_hash[UMA_HASH((h),	\
@@ -184,8 +169,8 @@
 struct uma_cache {
 	uma_bucket_t	uc_freebucket;	/* Bucket we're freeing to */
 	uma_bucket_t	uc_allocbucket;	/* Bucket to allocate from */
-	u_int64_t	uc_allocs;	/* Count of allocations */
-	u_int64_t	uc_frees;	/* Count of frees */
+	uint64_t	uc_allocs;	/* Count of allocations */
+	uint64_t	uc_frees;	/* Count of frees */
 } UMA_ALIGN;
 
 typedef struct uma_cache * uma_cache_t;
@@ -197,24 +182,21 @@
  *
  */
 struct uma_keg {
-	LIST_ENTRY(uma_keg)	uk_link;	/* List of all kegs */
-
-	struct mtx	uk_lock;	/* Lock for the keg */
+	struct mtx_padalign	uk_lock;	/* Lock for the keg */
 	struct uma_hash	uk_hash;
 
-	const char	*uk_name;		/* Name of creating zone. */
 	LIST_HEAD(,uma_zone)	uk_zones;	/* Keg's zones */
 	LIST_HEAD(,uma_slab)	uk_part_slab;	/* partially allocated slabs */
 	LIST_HEAD(,uma_slab)	uk_free_slab;	/* empty slab list */
 	LIST_HEAD(,uma_slab)	uk_full_slab;	/* full slabs */
 
-	u_int32_t	uk_recurse;	/* Allocation recursion count */
-	u_int32_t	uk_align;	/* Alignment mask */
-	u_int32_t	uk_pages;	/* Total page count */
-	u_int32_t	uk_free;	/* Count of items free in slabs */
-	u_int32_t	uk_size;	/* Requested size of each item */
-	u_int32_t	uk_rsize;	/* Real size of each item */
-	u_int32_t	uk_maxpages;	/* Maximum number of pages to alloc */
+	uint32_t	uk_align;	/* Alignment mask */
+	uint32_t	uk_pages;	/* Total page count */
+	uint32_t	uk_free;	/* Count of items free in slabs */
+	uint32_t	uk_reserve;	/* Number of reserved items. */
+	uint32_t	uk_size;	/* Requested size of each item */
+	uint32_t	uk_rsize;	/* Real size of each item */
+	uint32_t	uk_maxpages;	/* Maximum number of pages to alloc */
 
 	uma_init	uk_init;	/* Keg's init routine */
 	uma_fini	uk_fini;	/* Keg's fini routine */
@@ -221,21 +203,32 @@
 	uma_alloc	uk_allocf;	/* Allocation function */
 	uma_free	uk_freef;	/* Free routine */
 
-	struct vm_object	*uk_obj;	/* Zone specific object */
-	vm_offset_t	uk_kva;		/* Base kva for zones with objs */
+	u_long		uk_offset;	/* Next free offset from base KVA */
+	vm_offset_t	uk_kva;		/* Zone base KVA */
 	uma_zone_t	uk_slabzone;	/* Slab zone backing us, if OFFPAGE */
 
-	u_int16_t	uk_pgoff;	/* Offset to uma_slab struct */
-	u_int16_t	uk_ppera;	/* pages per allocation from backend */
-	u_int16_t	uk_ipers;	/* Items per slab */
-	u_int32_t	uk_flags;	/* Internal flags */
+	uint16_t	uk_pgoff;	/* Offset to uma_slab struct */
+	uint16_t	uk_ppera;	/* pages per allocation from backend */
+	uint16_t	uk_ipers;	/* Items per slab */
+	uint32_t	uk_flags;	/* Internal flags */
+
+	/* Least used fields go to the last cache line. */
+	const char	*uk_name;		/* Name of creating zone. */
+	LIST_ENTRY(uma_keg)	uk_link;	/* List of all kegs */
 };
 typedef struct uma_keg	* uma_keg_t;
 
-/* Page management structure */
+/*
+ * Free bits per-slab.
+ */
+#define	SLAB_SETSIZE	(PAGE_SIZE / UMA_SMALLEST_UNIT)
+BITSET_DEFINE(slabbits, SLAB_SETSIZE);
 
-/* Sorry for the union, but space efficiency is important */
-struct uma_slab_head {
+/*
+ * The slab structure manages a single contiguous allocation from backing
+ * store and subdivides it into individually allocatable items.
+ */
+struct uma_slab {
 	uma_keg_t	us_keg;			/* Keg we live in */
 	union {
 		LIST_ENTRY(uma_slab)	_us_link;	/* slabs in zone */
@@ -242,19 +235,18 @@
 		unsigned long	_us_size;	/* Size of allocation */
 	} us_type;
 	SLIST_ENTRY(uma_slab)	us_hlink;	/* Link for hash table */
-	u_int8_t	*us_data;		/* First item */
-	u_int8_t	us_flags;		/* Page flags see uma.h */
-	u_int8_t	us_freecount;	/* How many are free? */
-	u_int8_t	us_firstfree;	/* First free item index */
+	uint8_t		*us_data;		/* First item */
+	struct slabbits	us_free;		/* Free bitmask. */
+#ifdef INVARIANTS
+	struct slabbits	us_debugfree;		/* Debug bitmask. */
+#endif
+	uint16_t	us_freecount;		/* How many are free? */
+	uint8_t		us_flags;		/* Page flags see uma.h */
+	uint8_t		us_pad;			/* Pad to 32bits, unused. */
 };
 
-/* The standard slab structure */
-struct uma_slab {
-	struct uma_slab_head	us_head;	/* slab header data */
-	struct {
-		u_int8_t	us_item;
-	} us_freelist[1];			/* actual number bigger */
-};
+#define	us_link	us_type._us_link
+#define	us_size	us_type._us_size
 
 /*
  * The slab structure for UMA_ZONE_REFCNT zones for whose items we
@@ -261,37 +253,14 @@
  * maintain reference counters in the slab for.
  */
 struct uma_slab_refcnt {
-	struct uma_slab_head	us_head;	/* slab header data */
-	struct {
-		u_int8_t	us_item;
-		u_int32_t	us_refcnt;
-	} us_freelist[1];			/* actual number bigger */
+	struct uma_slab		us_head;	/* slab header data */
+	uint32_t		us_refcnt[0];	/* Actually larger. */
 };
 
-#define	us_keg		us_head.us_keg
-#define	us_link		us_head.us_type._us_link
-#define	us_size		us_head.us_type._us_size
-#define	us_hlink	us_head.us_hlink
-#define	us_data		us_head.us_data
-#define	us_flags	us_head.us_flags
-#define	us_freecount	us_head.us_freecount
-#define	us_firstfree	us_head.us_firstfree
-
 typedef struct uma_slab * uma_slab_t;
 typedef struct uma_slab_refcnt * uma_slabrefcnt_t;
 typedef uma_slab_t (*uma_slaballoc)(uma_zone_t, uma_keg_t, int);
 
-
-/*
- * These give us the size of one free item reference within our corresponding
- * uma_slab structures, so that our calculations during zone setup are correct
- * regardless of what the compiler decides to do with padding the structure
- * arrays within uma_slab.
- */
-#define	UMA_FRITM_SZ	(sizeof(struct uma_slab) - sizeof(struct uma_slab_head))
-#define	UMA_FRITMREF_SZ	(sizeof(struct uma_slab_refcnt) -	\
-    sizeof(struct uma_slab_head))
-
 struct uma_klink {
 	LIST_ENTRY(uma_klink)	kl_link;
 	uma_keg_t		kl_keg;
@@ -305,12 +274,12 @@
  *
  */
 struct uma_zone {
-	const char	*uz_name;	/* Text name of the zone */
-	struct mtx	*uz_lock;	/* Lock for the zone (keg's lock) */
+	struct mtx_padalign	uz_lock;	/* Lock for the zone */
+	struct mtx_padalign	*uz_lockptr;
+	const char		*uz_name;	/* Text name of the zone */
 
 	LIST_ENTRY(uma_zone)	uz_link;	/* List of all zones in keg */
-	LIST_HEAD(,uma_bucket)	uz_full_bucket;	/* full buckets */
-	LIST_HEAD(,uma_bucket)	uz_free_bucket;	/* Buckets for frees */
+	LIST_HEAD(,uma_bucket)	uz_buckets;	/* full buckets */
 
 	LIST_HEAD(,uma_klink)	uz_kegs;	/* List of kegs. */
 	struct uma_klink	uz_klink;	/* klink for first keg. */
@@ -319,18 +288,25 @@
 	uma_ctor	uz_ctor;	/* Constructor for each allocation */
 	uma_dtor	uz_dtor;	/* Destructor */
 	uma_init	uz_init;	/* Initializer for each item */
-	uma_fini	uz_fini;	/* Discards memory */
+	uma_fini	uz_fini;	/* Finalizer for each item. */
+	uma_import	uz_import;	/* Import new memory to cache. */
+	uma_release	uz_release;	/* Release memory from cache. */
+	void		*uz_arg;	/* Import/release argument. */
 
-	u_int32_t	uz_flags;	/* Flags inherited from kegs */
-	u_int32_t	uz_size;	/* Size inherited from kegs */
+	uint32_t	uz_flags;	/* Flags inherited from kegs */
+	uint32_t	uz_size;	/* Size inherited from kegs */
 
-	u_int64_t	uz_allocs UMA_ALIGN; /* Total number of allocations */
-	u_int64_t	uz_frees;	/* Total number of frees */
-	u_int64_t	uz_fails;	/* Total number of alloc failures */
-	u_int64_t	uz_sleeps;	/* Total number of alloc sleeps */
-	uint16_t	uz_fills;	/* Outstanding bucket fills */
-	uint16_t	uz_count;	/* Highest value ub_ptr can have */
+	volatile u_long	uz_allocs UMA_ALIGN; /* Total number of allocations */
+	volatile u_long	uz_fails;	/* Total number of alloc failures */
+	volatile u_long	uz_frees;	/* Total number of frees */
+	uint64_t	uz_sleeps;	/* Total number of alloc sleeps */
+	uint16_t	uz_count;	/* Amount of items in full bucket */
+	uint16_t	uz_count_min;	/* Minimal amount of items there */
 
+	/* The next three fields are used to print rate-limited warnings. */
+	const char	*uz_warning;	/* Warning to print on failure */
+	struct timeval	uz_ratecheck;	/* Warnings rate-limiting */
+
 	/*
 	 * This HAS to be the last item because we adjust the zone size
 	 * based on NCPU and then allocate the space for the zones.
@@ -341,23 +317,31 @@
 /*
  * These flags must not overlap with the UMA_ZONE flags specified in uma.h.
  */
-#define	UMA_ZFLAG_BUCKET	0x02000000	/* Bucket zone. */
 #define	UMA_ZFLAG_MULTI		0x04000000	/* Multiple kegs in the zone. */
 #define	UMA_ZFLAG_DRAINING	0x08000000	/* Running zone_drain. */
-#define UMA_ZFLAG_PRIVALLOC	0x10000000	/* Use uz_allocf. */
+#define	UMA_ZFLAG_BUCKET	0x10000000	/* Bucket zone. */
 #define UMA_ZFLAG_INTERNAL	0x20000000	/* No offpage no PCPU. */
 #define UMA_ZFLAG_FULL		0x40000000	/* Reached uz_maxpages */
 #define UMA_ZFLAG_CACHEONLY	0x80000000	/* Don't ask VM for buckets. */
 
-#define	UMA_ZFLAG_INHERIT	(UMA_ZFLAG_INTERNAL | UMA_ZFLAG_CACHEONLY | \
-				    UMA_ZFLAG_BUCKET)
+#define	UMA_ZFLAG_INHERIT						\
+    (UMA_ZFLAG_INTERNAL | UMA_ZFLAG_CACHEONLY | UMA_ZFLAG_BUCKET)
 
+static inline uma_keg_t
+zone_first_keg(uma_zone_t zone)
+{
+	uma_klink_t klink;
+
+	klink = LIST_FIRST(&zone->uz_kegs);
+	return (klink != NULL) ? klink->kl_keg : NULL;
+}
+
 #undef UMA_ALIGN
 
 #ifdef _KERNEL
 /* Internal prototypes */
-static __inline uma_slab_t hash_sfind(struct uma_hash *hash, u_int8_t *data);
-void *uma_large_malloc(int size, int wait);
+static __inline uma_slab_t hash_sfind(struct uma_hash *hash, uint8_t *data);
+void *uma_large_malloc(vm_size_t size, int wait);
 void uma_large_free(uma_slab_t slab);
 
 /* Lock Macros */
@@ -371,13 +355,26 @@
 			mtx_init(&(k)->uk_lock, (k)->uk_name,	\
 			    "UMA zone", MTX_DEF | MTX_DUPOK);	\
 	} while (0)
-	    
+
 #define	KEG_LOCK_FINI(k)	mtx_destroy(&(k)->uk_lock)
 #define	KEG_LOCK(k)	mtx_lock(&(k)->uk_lock)
 #define	KEG_UNLOCK(k)	mtx_unlock(&(k)->uk_lock)
-#define	ZONE_LOCK(z)	mtx_lock((z)->uz_lock)
-#define ZONE_UNLOCK(z)	mtx_unlock((z)->uz_lock)
 
+#define	ZONE_LOCK_INIT(z, lc)					\
+	do {							\
+		if ((lc))					\
+			mtx_init(&(z)->uz_lock, (z)->uz_name,	\
+			    (z)->uz_name, MTX_DEF | MTX_DUPOK);	\
+		else						\
+			mtx_init(&(z)->uz_lock, (z)->uz_name,	\
+			    "UMA zone", MTX_DEF | MTX_DUPOK);	\
+	} while (0)
+	    
+#define	ZONE_LOCK(z)	mtx_lock((z)->uz_lockptr)
+#define	ZONE_TRYLOCK(z)	mtx_trylock((z)->uz_lockptr)
+#define	ZONE_UNLOCK(z)	mtx_unlock((z)->uz_lockptr)
+#define	ZONE_LOCK_FINI(z)	mtx_destroy(&(z)->uz_lock)
+
 /*
  * Find a slab within a hash table.  This is used for OFFPAGE zones to lookup
  * the slab structure.
@@ -390,7 +387,7 @@
  *	A pointer to a slab if successful, else NULL.
  */
 static __inline uma_slab_t
-hash_sfind(struct uma_hash *hash, u_int8_t *data)
+hash_sfind(struct uma_hash *hash, uint8_t *data)
 {
         uma_slab_t slab;
         int hval;
@@ -398,7 +395,7 @@
         hval = UMA_HASH(hash, data);
 
         SLIST_FOREACH(slab, &hash->uh_slab_hash[hval], us_hlink) {
-                if ((u_int8_t *)slab->us_data == data)
+                if ((uint8_t *)slab->us_data == data)
                         return (slab);
         }
         return (NULL);
@@ -408,15 +405,9 @@
 vtoslab(vm_offset_t va)
 {
 	vm_page_t p;
-	uma_slab_t slab;
 
 	p = PHYS_TO_VM_PAGE(pmap_kextract(va));
-	slab = (uma_slab_t )p->object;
-
-	if (p->flags & PG_SLAB)
-		return (slab);
-	else
-		return (NULL);
+	return ((uma_slab_t)p->plinks.s.pv);
 }
 
 static __inline void
@@ -425,27 +416,17 @@
 	vm_page_t p;
 
 	p = PHYS_TO_VM_PAGE(pmap_kextract(va));
-	p->object = (vm_object_t)slab;
-	p->flags |= PG_SLAB;
+	p->plinks.s.pv = slab;
 }
 
-static __inline void
-vsetobj(vm_offset_t va, vm_object_t obj)
-{
-	vm_page_t p;
-
-	p = PHYS_TO_VM_PAGE(pmap_kextract(va));
-	p->object = obj;
-	p->flags &= ~PG_SLAB;
-}
-
 /*
  * The following two functions may be defined by architecture specific code
 * if they can provide more efficient allocation functions.  This is useful
  * for using direct mapped addresses.
  */
-void *uma_small_alloc(uma_zone_t zone, int bytes, u_int8_t *pflag, int wait);
-void uma_small_free(void *mem, int size, u_int8_t flags);
+void *uma_small_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag,
+    int wait);
+void uma_small_free(void *mem, vm_size_t size, uint8_t flags);
 #endif /* _KERNEL */
 
 #endif /* VM_UMA_INT_H */

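[Note on the uma_int.h changes above] UMA_HASH() keeps the same power-of-two scheme it had before, just spelled with uintptr_t: the slab's page-aligned data address is shifted right by UMA_SLAB_SHIFT and masked with uh_hashmask to pick a bucket, and hash_sfind() then walks that bucket's SLIST looking for an exact us_data match. A compilable sketch of the same shift-and-mask lookup (sizes and names simplified for illustration):

#include <sys/queue.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define SLAB_SHIFT	12		/* log2 of the slab size */
#define HASH_SIZE	64		/* must be a power of two */
#define HASH_MASK	(HASH_SIZE - 1)

struct slab {
	SLIST_ENTRY(slab)	 hlink;		/* like us_hlink */
	uint8_t			*data;		/* like us_data */
};

SLIST_HEAD(slabhead, slab);
static struct slabhead hash_table[HASH_SIZE];

/* Same idea as UMA_HASH(): shift out the offset bits, mask into a bucket. */
static unsigned
slab_hash(const void *data)
{

	return (((uintptr_t)data >> SLAB_SHIFT) & HASH_MASK);
}

static void
hash_insert(struct slab *s)
{

	SLIST_INSERT_HEAD(&hash_table[slab_hash(s->data)], s, hlink);
}

/* Analogue of hash_sfind(): walk one bucket looking for an exact match. */
static struct slab *
hash_find(const void *data)
{
	struct slab *s;

	SLIST_FOREACH(s, &hash_table[slab_hash(data)], hlink)
		if (s->data == data)
			return (s);
	return (NULL);
}

int
main(void)
{
	struct slab s;
	int i;

	for (i = 0; i < HASH_SIZE; i++)
		SLIST_INIT(&hash_table[i]);
	s.data = malloc(1 << SLAB_SHIFT);
	hash_insert(&s);
	printf("found: %d\n", hash_find(s.data) == &s);
	free(s.data);
	return (0);
}

Because the key is a page-aligned address, shifting out the low bits first spreads consecutive slabs across different buckets, which is why the comment expects few collisions for relatively contiguous pages.
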
Modified: trunk/sys/vm/vm.h
===================================================================
--- trunk/sys/vm/vm.h	2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm.h	2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
@@ -55,7 +56,7 @@
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  *
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/vm/vm.h 321717 2017-07-30 10:36:20Z kib $
  */
 
 #ifndef VM_H
@@ -68,6 +69,7 @@
 #define	VM_INHERIT_SHARE	((vm_inherit_t) 0)
 #define	VM_INHERIT_COPY		((vm_inherit_t) 1)
 #define	VM_INHERIT_NONE		((vm_inherit_t) 2)
+#define	VM_INHERIT_ZERO		((vm_inherit_t) 3)
 #define	VM_INHERIT_DEFAULT	VM_INHERIT_COPY
 
 typedef u_char vm_prot_t;	/* protection codes */
@@ -77,6 +79,7 @@
 #define	VM_PROT_WRITE		((vm_prot_t) 0x02)
 #define	VM_PROT_EXECUTE		((vm_prot_t) 0x04)
 #define	VM_PROT_COPY		((vm_prot_t) 0x08)	/* copy-on-read */
+#define	VM_PROT_FAULT_LOOKUP	((vm_prot_t) 0x010)
 
 #define	VM_PROT_ALL		(VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE)
 #define VM_PROT_RW		(VM_PROT_READ|VM_PROT_WRITE)
@@ -134,8 +137,6 @@
 	vm_offset_t	buffer_eva;
 	vm_offset_t	clean_sva;
 	vm_offset_t	clean_eva;
-	vm_offset_t	pager_sva;
-	vm_offset_t	pager_eva;
 };
 
 extern struct kva_md_info	kmi;
@@ -149,6 +150,7 @@
 void swap_reserve_force(vm_ooffset_t incr);
 void swap_release(vm_ooffset_t decr);
 void swap_release_by_cred(vm_ooffset_t decr, struct ucred *cred);
+void swapper(void);
 
 #endif				/* VM_H */
 

Modified: trunk/sys/vm/vm_extern.h
===================================================================
--- trunk/sys/vm/vm_extern.h	2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm_extern.h	2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1992, 1993
  *	The Regents of the University of California.  All rights reserved.
@@ -27,36 +28,50 @@
  * SUCH DAMAGE.
  *
  *	@(#)vm_extern.h	8.2 (Berkeley) 1/12/94
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/vm/vm_extern.h 270920 2014-09-01 07:58:15Z kib $
  */
 
 #ifndef _VM_EXTERN_H_
 #define	_VM_EXTERN_H_
 
+struct pmap;
 struct proc;
 struct vmspace;
 struct vnode;
+struct vmem;
 
 #ifdef _KERNEL
 
-int kernacc(void *, int, int);
-vm_offset_t kmem_alloc(vm_map_t, vm_size_t);
-vm_offset_t kmem_alloc_attr(vm_map_t map, vm_size_t size, int flags,
+/* These operate on kernel virtual addresses only. */
+vm_offset_t kva_alloc(vm_size_t);
+void kva_free(vm_offset_t, vm_size_t);
+
+/* These operate on pageable virtual addresses. */
+vm_offset_t kmap_alloc_wait(vm_map_t, vm_size_t);
+void kmap_free_wakeup(vm_map_t, vm_offset_t, vm_size_t);
+
+/* These operate on virtual addresses backed by memory. */
+vm_offset_t kmem_alloc_attr(struct vmem *, vm_size_t size, int flags,
     vm_paddr_t low, vm_paddr_t high, vm_memattr_t memattr);
-vm_offset_t kmem_alloc_contig(vm_map_t map, vm_size_t size, int flags,
-    vm_paddr_t low, vm_paddr_t high, unsigned long alignment,
-    unsigned long boundary, vm_memattr_t memattr);
-vm_offset_t kmem_alloc_nofault(vm_map_t, vm_size_t);
-vm_offset_t kmem_alloc_nofault_space(vm_map_t, vm_size_t, int);
-vm_offset_t kmem_alloc_wait(vm_map_t, vm_size_t);
-void kmem_free(vm_map_t, vm_offset_t, vm_size_t);
-void kmem_free_wakeup(vm_map_t, vm_offset_t, vm_size_t);
-void kmem_init(vm_offset_t, vm_offset_t);
-vm_offset_t kmem_malloc(vm_map_t map, vm_size_t size, int flags);
-int kmem_back(vm_map_t, vm_offset_t, vm_size_t, int);
+vm_offset_t kmem_alloc_contig(struct vmem *, vm_size_t size, int flags,
+    vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary,
+    vm_memattr_t memattr);
+vm_offset_t kmem_malloc(struct vmem *, vm_size_t size, int flags);
+void kmem_free(struct vmem *, vm_offset_t, vm_size_t);
+
+/* This provides memory for previously allocated address space. */
+int kmem_back(vm_object_t, vm_offset_t, vm_size_t, int);
+void kmem_unback(vm_object_t, vm_offset_t, vm_size_t);
+
+/* Bootstrapping. */
 vm_map_t kmem_suballoc(vm_map_t, vm_offset_t *, vm_offset_t *, vm_size_t,
     boolean_t);
+void kmem_init(vm_offset_t, vm_offset_t);
+void kmem_init_zero_region(void);
+void kmeminit(void);
+
 void swapout_procs(int);
+int kernacc(void *, int, int);
 int useracc(void *, int, int);
 int vm_fault(vm_map_t, vm_offset_t, vm_prot_t, int);
 void vm_fault_copy_entry(vm_map_t, vm_map_t, vm_map_entry_t, vm_map_entry_t,
@@ -67,8 +82,6 @@
     int fault_flags, vm_page_t *m_hold);
 int vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len,
     vm_prot_t prot, vm_page_t *ma, int max_count);
-void vm_fault_unwire(vm_map_t, vm_offset_t, vm_offset_t, boolean_t);
-int vm_fault_wire(vm_map_t, vm_offset_t, vm_offset_t, boolean_t);
 int vm_forkproc(struct thread *, struct proc *, struct thread *, struct vmspace *, int);
 void vm_waitproc(struct proc *);
 int vm_mmap(vm_map_t, vm_offset_t *, vm_size_t, vm_prot_t, vm_prot_t, int, objtype_t, void *, vm_ooffset_t);
@@ -75,7 +88,8 @@
 int vm_mmap_to_errno(int rv);
 void vm_set_page_size(void);
 void vm_sync_icache(vm_map_t, vm_offset_t, vm_size_t);
-struct vmspace *vmspace_alloc(vm_offset_t, vm_offset_t);
+typedef int (*pmap_pinit_t)(struct pmap *pmap);
+struct vmspace *vmspace_alloc(vm_offset_t, vm_offset_t, pmap_pinit_t);
 struct vmspace *vmspace_fork(struct vmspace *, vm_ooffset_t *);
 int vmspace_exec(struct proc *, vm_offset_t, vm_offset_t);
 int vmspace_unshare(struct proc *);
@@ -90,5 +104,6 @@
 void vm_imgact_unmap_page(struct sf_buf *sf);
 void vm_thread_dispose(struct thread *td);
 int vm_thread_new(struct thread *td, int pages);
+int vm_mlock(struct proc *, struct ucred *, const void *, size_t);
 #endif				/* _KERNEL */
 #endif				/* !_VM_EXTERN_H_ */

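[Note on the vm_extern.h changes above] The reorganized prototypes separate reserving kernel address space (kva_alloc()/kva_free()) from providing memory for an already-reserved range (kmem_back()/kmem_unback()). A loose userland analogy of that reserve-then-back split, using mmap()/mprotect() rather than the kernel interfaces (this only illustrates the idea, it is not how kernel code calls these functions; a 4 KB page size is assumed):

#include <sys/mman.h>
#include <stdio.h>
#include <string.h>

#define PAGE_SZ		4096
#define RESERVE_SIZE	(16 * PAGE_SZ)

int
main(void)
{
	void *va;

	/* Reserve address space only, as kva_alloc() does: nothing usable yet. */
	va = mmap(NULL, RESERVE_SIZE, PROT_NONE,
	    MAP_ANON | MAP_PRIVATE, -1, 0);
	if (va == MAP_FAILED) {
		perror("mmap");
		return (1);
	}

	/* Back part of the range with memory, as kmem_back() does. */
	if (mprotect(va, PAGE_SZ, PROT_READ | PROT_WRITE) != 0) {
		perror("mprotect");
		return (1);
	}
	memset(va, 0, PAGE_SZ);

	/* Drop the backing and the reservation, as kmem_unback()/kva_free(). */
	munmap(va, RESERVE_SIZE);
	return (0);
}
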
Modified: trunk/sys/vm/vm_fault.c
===================================================================
--- trunk/sys/vm/vm_fault.c	2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm_fault.c	2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
@@ -72,7 +73,7 @@
  */
 
 #include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/vm/vm_fault.c 329707 2018-02-21 11:31:29Z kib $");
 
 #include "opt_ktrace.h"
 #include "opt_vm.h"
@@ -81,9 +82,9 @@
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
-#include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
+#include <sys/rwlock.h>
 #include <sys/sysctl.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
@@ -101,22 +102,12 @@
 #include <vm/vm_kern.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_extern.h>
+#include <vm/vm_reserv.h>
 
-#include <sys/mount.h>	/* XXX Temporary for VFS_LOCK_GIANT() */
-
 #define PFBAK 4
 #define PFFOR 4
-#define PAGEORDER_SIZE (PFBAK+PFFOR)
 
-static int prefault_pageorder[] = {
-	-1 * PAGE_SIZE, 1 * PAGE_SIZE,
-	-2 * PAGE_SIZE, 2 * PAGE_SIZE,
-	-3 * PAGE_SIZE, 3 * PAGE_SIZE,
-	-4 * PAGE_SIZE, 4 * PAGE_SIZE
-};
-
 static int vm_fault_additional_pages(vm_page_t, int, int, vm_page_t *, int *);
-static void vm_fault_prefault(pmap_t, vm_offset_t, vm_map_entry_t);
 
 #define	VM_FAULT_READ_BEHIND	8
 #define	VM_FAULT_READ_MAX	(1 + VM_FAULT_READ_AHEAD_MAX)
@@ -134,17 +125,19 @@
 	vm_map_t map;
 	vm_map_entry_t entry;
 	int lookup_still_valid;
+	int map_generation;
 	struct vnode *vp;
-	int vfslocked;
 };
 
 static void vm_fault_cache_behind(const struct faultstate *fs, int distance);
+static void vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra,
+	    int faultcount, int reqpage);
 
 static inline void
 release_page(struct faultstate *fs)
 {
 
-	vm_page_wakeup(fs->m);
+	vm_page_xunbusy(fs->m);
 	vm_page_lock(fs->m);
 	vm_page_deactivate(fs->m);
 	vm_page_unlock(fs->m);
@@ -162,39 +155,141 @@
 }
 
 static void
+unlock_vp(struct faultstate *fs)
+{
+
+	if (fs->vp != NULL) {
+		vput(fs->vp);
+		fs->vp = NULL;
+	}
+}
+
+static void
 unlock_and_deallocate(struct faultstate *fs)
 {
 
 	vm_object_pip_wakeup(fs->object);
-	VM_OBJECT_UNLOCK(fs->object);
+	VM_OBJECT_WUNLOCK(fs->object);
 	if (fs->object != fs->first_object) {
-		VM_OBJECT_LOCK(fs->first_object);
+		VM_OBJECT_WLOCK(fs->first_object);
 		vm_page_lock(fs->first_m);
 		vm_page_free(fs->first_m);
 		vm_page_unlock(fs->first_m);
 		vm_object_pip_wakeup(fs->first_object);
-		VM_OBJECT_UNLOCK(fs->first_object);
+		VM_OBJECT_WUNLOCK(fs->first_object);
 		fs->first_m = NULL;
 	}
 	vm_object_deallocate(fs->first_object);
-	unlock_map(fs);	
-	if (fs->vp != NULL) { 
-		vput(fs->vp);
-		fs->vp = NULL;
+	unlock_map(fs);
+	unlock_vp(fs);
+}
+
+static void
+vm_fault_dirty(vm_map_entry_t entry, vm_page_t m, vm_prot_t prot,
+    vm_prot_t fault_type, int fault_flags, bool set_wd)
+{
+	bool need_dirty;
+
+	if (((prot & VM_PROT_WRITE) == 0 &&
+	    (fault_flags & VM_FAULT_DIRTY) == 0) ||
+	    (m->oflags & VPO_UNMANAGED) != 0)
+		return;
+
+	VM_OBJECT_ASSERT_LOCKED(m->object);
+
+	need_dirty = ((fault_type & VM_PROT_WRITE) != 0 &&
+	    (fault_flags & VM_FAULT_WIRE) == 0) ||
+	    (fault_flags & VM_FAULT_DIRTY) != 0;
+
+	if (set_wd)
+		vm_object_set_writeable_dirty(m->object);
+	else
+		/*
+		 * If two callers of vm_fault_dirty() with set_wd ==
+		 * FALSE, one for the map entry with MAP_ENTRY_NOSYNC
+		 * flag set, other with flag clear, race, it is
+		 * possible for the no-NOSYNC thread to see m->dirty
+		 * != 0 and not clear VPO_NOSYNC.  Take vm_page lock
+		 * around manipulation of VPO_NOSYNC and
+		 * vm_page_dirty() call, to avoid the race and keep
+		 * m->oflags consistent.
+		 */
+		vm_page_lock(m);
+
+	/*
+	 * If this is a NOSYNC mmap we do not want to set VPO_NOSYNC
+	 * if the page is already dirty to prevent data written with
+	 * the expectation of being synced from not being synced.
+	 * Likewise if this entry does not request NOSYNC then make
+	 * sure the page isn't marked NOSYNC.  Applications sharing
+	 * data should use the same flags to avoid ping ponging.
+	 */
+	if ((entry->eflags & MAP_ENTRY_NOSYNC) != 0) {
+		if (m->dirty == 0) {
+			m->oflags |= VPO_NOSYNC;
+		}
+	} else {
+		m->oflags &= ~VPO_NOSYNC;
 	}
-	VFS_UNLOCK_GIANT(fs->vfslocked);
-	fs->vfslocked = 0;
+
+	/*
+	 * If the fault is a write, we know that this page is being
+	 * written NOW so dirty it explicitly to save on
+	 * pmap_is_modified() calls later.
+	 *
+	 * Also tell the backing pager, if any, that it should remove
+	 * any swap backing since the page is now dirty.
+	 */
+	if (need_dirty)
+		vm_page_dirty(m);
+	if (!set_wd)
+		vm_page_unlock(m);
+	if (need_dirty)
+		vm_pager_page_unswapped(m);
 }
 
+static void
+vm_fault_fill_hold(vm_page_t *m_hold, vm_page_t m)
+{
+
+	if (m_hold != NULL) {
+		*m_hold = m;
+		vm_page_lock(m);
+		vm_page_hold(m);
+		vm_page_unlock(m);
+	}
+}
+
 /*
- * TRYPAGER - used by vm_fault to calculate whether the pager for the
- *	      current object *might* contain the page.
- *
- *	      default objects are zero-fill, there is no real pager.
+ * Unlocks fs.first_object and fs.map on success.
  */
-#define TRYPAGER	(fs.object->type != OBJT_DEFAULT && \
-			((fault_flags & VM_FAULT_CHANGE_WIRING) == 0 || wired))
+static int
+vm_fault_soft_fast(struct faultstate *fs, vm_offset_t vaddr, vm_prot_t prot,
+    int fault_type, int fault_flags, boolean_t wired, vm_page_t *m_hold)
+{
+	vm_page_t m;
+	int rv;
 
+	MPASS(fs->vp == NULL);
+	m = vm_page_lookup(fs->first_object, fs->first_pindex);
+	/* A busy page can be mapped for read|execute access. */
+	if (m == NULL || ((prot & VM_PROT_WRITE) != 0 &&
+	    vm_page_busied(m)) || m->valid != VM_PAGE_BITS_ALL)
+		return (KERN_FAILURE);
+	rv = pmap_enter(fs->map->pmap, vaddr, m, prot, fault_type |
+	    PMAP_ENTER_NOSLEEP | (wired ? PMAP_ENTER_WIRED : 0), 0);
+	if (rv != KERN_SUCCESS)
+		return (rv);
+	vm_fault_fill_hold(m_hold, m);
+	vm_fault_dirty(fs->entry, m, prot, fault_type, fault_flags, false);
+	VM_OBJECT_RUNLOCK(fs->first_object);
+	if (!wired)
+		vm_fault_prefault(fs, vaddr, 0, 0);
+	vm_map_lookup_done(fs->map, fs->entry);
+	curthread->td_ru.ru_minflt++;
+	return (KERN_SUCCESS);
+}
+
 /*
  *	vm_fault:
  *
@@ -242,8 +337,7 @@
 	vm_prot_t prot;
 	long ahead, behind;
 	int alloc_req, era, faultcount, nera, reqpage, result;
-	boolean_t growstack, is_first_object_locked, wired;
-	int map_generation;
+	boolean_t dead, is_first_object_locked, wired;
 	vm_object_t next_object;
 	vm_page_t marray[VM_FAULT_READ_MAX];
 	int hardfault;
@@ -252,10 +346,8 @@
 	int locked, error;
 
 	hardfault = 0;
-	growstack = TRUE;
 	PCPU_INC(cnt.v_vm_faults);
 	fs.vp = NULL;
-	fs.vfslocked = 0;
 	faultcount = reqpage = 0;
 
 RetryFault:;
@@ -265,21 +357,15 @@
 	 * search.
 	 */
 	fs.map = map;
-	result = vm_map_lookup(&fs.map, vaddr, fault_type, &fs.entry,
-	    &fs.first_object, &fs.first_pindex, &prot, &wired);
+	result = vm_map_lookup(&fs.map, vaddr, fault_type |
+	    VM_PROT_FAULT_LOOKUP, &fs.entry, &fs.first_object,
+	    &fs.first_pindex, &prot, &wired);
 	if (result != KERN_SUCCESS) {
-		if (growstack && result == KERN_INVALID_ADDRESS &&
-		    map != kernel_map) {
-			result = vm_map_growstack(curproc, vaddr);
-			if (result != KERN_SUCCESS)
-				return (KERN_FAILURE);
-			growstack = FALSE;
-			goto RetryFault;
-		}
+		unlock_vp(&fs);
 		return (result);
 	}
 
-	map_generation = fs.map->timestamp;
+	fs.map_generation = fs.map->timestamp;
 
 	if (fs.entry->eflags & MAP_ENTRY_NOFAULT) {
 		panic("vm_fault: fault on nofault entry, addr: %lx",
@@ -286,7 +372,63 @@
 		    (u_long)vaddr);
 	}
 
+	if (fs.entry->eflags & MAP_ENTRY_IN_TRANSITION &&
+	    fs.entry->wiring_thread != curthread) {
+		vm_map_unlock_read(fs.map);
+		vm_map_lock(fs.map);
+		if (vm_map_lookup_entry(fs.map, vaddr, &fs.entry) &&
+		    (fs.entry->eflags & MAP_ENTRY_IN_TRANSITION)) {
+			unlock_vp(&fs);
+			fs.entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
+			vm_map_unlock_and_wait(fs.map, 0);
+		} else
+			vm_map_unlock(fs.map);
+		goto RetryFault;
+	}
+
+	MPASS((fs.entry->eflags & MAP_ENTRY_GUARD) == 0);
+
+	if (wired)
+		fault_type = prot | (fault_type & VM_PROT_COPY);
+	else
+		KASSERT((fault_flags & VM_FAULT_WIRE) == 0,
+		    ("!wired && VM_FAULT_WIRE"));
+
 	/*
+	 * Try to avoid lock contention on the top-level object through
+	 * special-case handling of some types of page faults, specifically,
+	 * those that are both (1) mapping an existing page from the top-
+	 * level object and (2) not having to mark that object as containing
+	 * dirty pages.  Under these conditions, a read lock on the top-level
+	 * object suffices, allowing multiple page faults of a similar type to
+	 * run in parallel on the same top-level object.
+	 */
+	if (fs.vp == NULL /* avoid locked vnode leak */ &&
+	    (fault_flags & (VM_FAULT_WIRE | VM_FAULT_DIRTY)) == 0 &&
+	    /* avoid calling vm_object_set_writeable_dirty() */
+	    ((prot & VM_PROT_WRITE) == 0 ||
+	    (fs.first_object->type != OBJT_VNODE &&
+	    (fs.first_object->flags & OBJ_TMPFS_NODE) == 0) ||
+	    (fs.first_object->flags & OBJ_MIGHTBEDIRTY) != 0)) {
+		VM_OBJECT_RLOCK(fs.first_object);
+		if ((prot & VM_PROT_WRITE) == 0 ||
+		    (fs.first_object->type != OBJT_VNODE &&
+		    (fs.first_object->flags & OBJ_TMPFS_NODE) == 0) ||
+		    (fs.first_object->flags & OBJ_MIGHTBEDIRTY) != 0) {
+			result = vm_fault_soft_fast(&fs, vaddr, prot,
+			    fault_type, fault_flags, wired, m_hold);
+			if (result == KERN_SUCCESS)
+				return (result);
+		}
+		if (!VM_OBJECT_TRYUPGRADE(fs.first_object)) {
+			VM_OBJECT_RUNLOCK(fs.first_object);
+			VM_OBJECT_WLOCK(fs.first_object);
+		}
+	} else {
+		VM_OBJECT_WLOCK(fs.first_object);
+	}
+
+	/*
 	 * Make a reference to this object to prevent its disposal while we
 	 * are messing with it.  Once we have the reference, the map is free
 	 * to be diddled.  Since objects reference their shadows (and copies),
@@ -296,15 +438,11 @@
 	 * truncation operations) during I/O.  This must be done after
 	 * obtaining the vnode lock in order to avoid possible deadlocks.
 	 */
-	VM_OBJECT_LOCK(fs.first_object);
 	vm_object_reference_locked(fs.first_object);
 	vm_object_pip_add(fs.first_object, 1);
 
 	fs.lookup_still_valid = TRUE;
 
-	if (wired)
-		fault_type = prot | (fault_type & VM_PROT_COPY);
-
 	fs.first_m = NULL;
 
 	/*
@@ -314,11 +452,18 @@
 	fs.pindex = fs.first_pindex;
 	while (TRUE) {
 		/*
-		 * If the object is dead, we stop here
+		 * If the object is marked for imminent termination,
+		 * we retry here, since the collapse pass has raced
+		 * with us.  Otherwise, if we see terminally dead
+		 * object, return fail.
 		 */
-		if (fs.object->flags & OBJ_DEAD) {
+		if ((fs.object->flags & OBJ_DEAD) != 0) {
+			dead = fs.object->type == OBJT_DEAD;
 			unlock_and_deallocate(&fs);
-			return (KERN_PROTECTION_FAILURE);
+			if (dead)
+				return (KERN_PROTECTION_FAILURE);
+			pause("vmf_de", 1);
+			goto RetryFault;
 		}
 
 		/*
@@ -326,31 +471,13 @@
 		 */
 		fs.m = vm_page_lookup(fs.object, fs.pindex);
 		if (fs.m != NULL) {
-			/* 
-			 * check for page-based copy on write.
-			 * We check fs.object == fs.first_object so
-			 * as to ensure the legacy COW mechanism is
-			 * used when the page in question is part of
-			 * a shadow object.  Otherwise, vm_page_cowfault()
-			 * removes the page from the backing object, 
-			 * which is not what we want.
-			 */
-			vm_page_lock(fs.m);
-			if ((fs.m->cow) && 
-			    (fault_type & VM_PROT_WRITE) &&
-			    (fs.object == fs.first_object)) {
-				vm_page_cowfault(fs.m);
-				unlock_and_deallocate(&fs);
-				goto RetryFault;
-			}
-
 			/*
 			 * Wait/Retry if the page is busy.  We have to do this
-			 * if the page is busy via either VPO_BUSY or 
-			 * vm_page_t->busy because the vm_pager may be using
-			 * vm_page_t->busy for pageouts ( and even pageins if
-			 * it is the vnode pager ), and we could end up trying
-			 * to pagein and pageout the same page simultaneously.
+			 * if the page is either exclusive or shared busy
+			 * because the vm_pager may be using read busy for
+			 * pageouts (and even pageins if it is the vnode
+			 * pager), and we could end up trying to pagein and
+			 * pageout the same page simultaneously.
 			 *
 			 * We can theoretically allow the busy case on a read
 			 * fault if the page is marked valid, but since such
@@ -357,10 +484,10 @@
 			 * pages are typically already pmap'd, putting that
 			 * special case in might be more effort than it is 
 			 * worth.  We cannot under any circumstances mess
-			 * around with a vm_page_t->busy page except, perhaps,
+			 * around with a shared busied page except, perhaps,
 			 * to pmap it.
 			 */
-			if ((fs.m->oflags & VPO_BUSY) || fs.m->busy) {
+			if (vm_page_busied(fs.m)) {
 				/*
 				 * Reference the page before unlocking and
 				 * sleeping so that the page daemon is less
@@ -367,34 +494,33 @@
 				 * likely to reclaim it. 
 				 */
 				vm_page_aflag_set(fs.m, PGA_REFERENCED);
-				vm_page_unlock(fs.m);
 				if (fs.object != fs.first_object) {
-					if (!VM_OBJECT_TRYLOCK(
+					if (!VM_OBJECT_TRYWLOCK(
 					    fs.first_object)) {
-						VM_OBJECT_UNLOCK(fs.object);
-						VM_OBJECT_LOCK(fs.first_object);
-						VM_OBJECT_LOCK(fs.object);
+						VM_OBJECT_WUNLOCK(fs.object);
+						VM_OBJECT_WLOCK(fs.first_object);
+						VM_OBJECT_WLOCK(fs.object);
 					}
 					vm_page_lock(fs.first_m);
 					vm_page_free(fs.first_m);
 					vm_page_unlock(fs.first_m);
 					vm_object_pip_wakeup(fs.first_object);
-					VM_OBJECT_UNLOCK(fs.first_object);
+					VM_OBJECT_WUNLOCK(fs.first_object);
 					fs.first_m = NULL;
 				}
 				unlock_map(&fs);
 				if (fs.m == vm_page_lookup(fs.object,
 				    fs.pindex)) {
-					vm_page_sleep_if_busy(fs.m, TRUE,
-					    "vmpfw");
+					vm_page_sleep_if_busy(fs.m, "vmpfw");
 				}
 				vm_object_pip_wakeup(fs.object);
-				VM_OBJECT_UNLOCK(fs.object);
+				VM_OBJECT_WUNLOCK(fs.object);
 				PCPU_INC(cnt.v_intrans);
 				vm_object_deallocate(fs.first_object);
 				goto RetryFault;
 			}
-			vm_pageq_remove(fs.m);
+			vm_page_lock(fs.m);
+			vm_page_remque(fs.m);
 			vm_page_unlock(fs.m);
 
 			/*
@@ -403,7 +529,7 @@
 			 * (readable), jump to readrest, else break-out ( we
 			 * found the page ).
 			 */
-			vm_page_busy(fs.m);
+			vm_page_xbusy(fs.m);
 			if (fs.m->valid != VM_PAGE_BITS_ALL)
 				goto readrest;
 			break;
@@ -410,10 +536,12 @@
 		}
 
 		/*
-		 * Page is not resident, If this is the search termination
+		 * Page is not resident.  If this is the search termination
 		 * or the pager might contain the page, allocate a new page.
+		 * Default objects are zero-fill, there is no real pager.
 		 */
-		if (TRYPAGER || fs.object == fs.first_object) {
+		if (fs.object->type != OBJT_DEFAULT ||
+		    fs.object == fs.first_object) {
 			if (fs.pindex >= fs.object->size) {
 				unlock_and_deallocate(&fs);
 				return (KERN_PROTECTION_FAILURE);
@@ -460,9 +588,10 @@
 		 *
 		 * Attempt to fault-in the page if there is a chance that the
 		 * pager has it, and potentially fault in additional pages
-		 * at the same time.
+		 * at the same time.  For default objects simply provide
+		 * zero-filled pages.
 		 */
-		if (TRYPAGER) {
+		if (fs.object->type != OBJT_DEFAULT) {
 			int rv;
 			u_char behavior = vm_map_entry_behavior(fs.entry);
 
@@ -509,30 +638,15 @@
 			/*
 			 * Call the pager to retrieve the data, if any, after
 			 * releasing the lock on the map.  We hold a ref on
-			 * fs.object and the pages are VPO_BUSY'd.
+			 * fs.object and the pages are exclusive busied.
 			 */
 			unlock_map(&fs);
 
-vnode_lock:
-			if (fs.object->type == OBJT_VNODE) {
-				vp = fs.object->handle;
-				if (vp == fs.vp)
-					goto vnode_locked;
-				else if (fs.vp != NULL) {
-					vput(fs.vp);
-					fs.vp = NULL;
-				}
+			if (fs.object->type == OBJT_VNODE &&
+			    (vp = fs.object->handle) != fs.vp) {
+				unlock_vp(&fs);
 				locked = VOP_ISLOCKED(vp);
 
-				if (VFS_NEEDSGIANT(vp->v_mount) && !fs.vfslocked) {
-					fs.vfslocked = 1;
-					if (!mtx_trylock(&Giant)) {
-						VM_OBJECT_UNLOCK(fs.object);
-						mtx_lock(&Giant);
-						VM_OBJECT_LOCK(fs.object);
-						goto vnode_lock;
-					}
-				}
 				if (locked != LK_EXCLUSIVE)
 					locked = LK_SHARED;
 				/* Do not sleep for vnode lock while fs.m is busy */
@@ -539,10 +653,6 @@
 				error = vget(vp, locked | LK_CANRECURSE |
 				    LK_NOWAIT, curthread);
 				if (error != 0) {
-					int vfslocked;
-
-					vfslocked = fs.vfslocked;
-					fs.vfslocked = 0; /* Keep Giant */
 					vhold(vp);
 					release_page(&fs);
 					unlock_and_deallocate(&fs);
@@ -550,7 +660,6 @@
 					    LK_CANRECURSE, curthread);
 					vdrop(vp);
 					fs.vp = vp;
-					fs.vfslocked = vfslocked;
 					KASSERT(error == 0,
 					    ("vm_fault: vget failed"));
 					goto RetryFault;
@@ -557,7 +666,6 @@
 				}
 				fs.vp = vp;
 			}
-vnode_locked:
 			KASSERT(fs.vp == NULL || !fs.map->system_map,
 			    ("vm_fault: vnode-backed object mapped by system map"));
 
@@ -573,7 +681,7 @@
 			 * return value is the index into the marray for the
 			 * vm_page_t passed to the routine.
 			 *
-			 * fs.m plus the additional pages are VPO_BUSY'd.
+			 * fs.m plus the additional pages are exclusive busied.
 			 */
 			faultcount = vm_fault_additional_pages(
 			    fs.m, behind, ahead, marray, &reqpage);
@@ -667,12 +775,12 @@
 			 */
 			if (fs.object != fs.first_object) {
 				vm_object_pip_wakeup(fs.object);
-				VM_OBJECT_UNLOCK(fs.object);
+				VM_OBJECT_WUNLOCK(fs.object);
 
 				fs.object = fs.first_object;
 				fs.pindex = fs.first_pindex;
 				fs.m = fs.first_m;
-				VM_OBJECT_LOCK(fs.object);
+				VM_OBJECT_WLOCK(fs.object);
 			}
 			fs.first_m = NULL;
 
@@ -686,21 +794,22 @@
 			}
 			PCPU_INC(cnt.v_zfod);
 			fs.m->valid = VM_PAGE_BITS_ALL;
+			/* Don't try to prefault neighboring pages. */
+			faultcount = 1;
 			break;	/* break to PAGE HAS BEEN FOUND */
 		} else {
 			KASSERT(fs.object != next_object,
 			    ("object loop %p", next_object));
-			VM_OBJECT_LOCK(next_object);
+			VM_OBJECT_WLOCK(next_object);
 			vm_object_pip_add(next_object, 1);
 			if (fs.object != fs.first_object)
 				vm_object_pip_wakeup(fs.object);
-			VM_OBJECT_UNLOCK(fs.object);
+			VM_OBJECT_WUNLOCK(fs.object);
 			fs.object = next_object;
 		}
 	}
 
-	KASSERT((fs.m->oflags & VPO_BUSY) != 0,
-	    ("vm_fault: not busy after main loop"));
+	vm_page_assert_xbusied(fs.m);
 
 	/*
 	 * PAGE HAS BEEN FOUND. [Loop invariant still holds -- the object lock
@@ -746,7 +855,7 @@
 				 */
 				((fs.object->type == OBJT_DEFAULT) ||
 				 (fs.object->type == OBJT_SWAP)) &&
-			    (is_first_object_locked = VM_OBJECT_TRYLOCK(fs.first_object)) &&
+			    (is_first_object_locked = VM_OBJECT_TRYWLOCK(fs.first_object)) &&
 				/*
 				 * We don't chase down the shadow chain
 				 */
@@ -762,10 +871,20 @@
 				 * process's object.  The page is 
 				 * automatically made dirty.
 				 */
-				vm_page_lock(fs.m);
-				vm_page_rename(fs.m, fs.first_object, fs.first_pindex);
-				vm_page_unlock(fs.m);
-				vm_page_busy(fs.m);
+				if (vm_page_rename(fs.m, fs.first_object,
+				    fs.first_pindex)) {
+					unlock_and_deallocate(&fs);
+					goto RetryFault;
+				}
+#if VM_NRESERVLEVEL > 0
+				/*
+				 * Rename the reservation.
+				 */
+				vm_reserv_rename(fs.m, fs.first_object,
+				    fs.object, OFF_TO_IDX(
+				    fs.first_object->backing_object_offset));
+#endif
+				vm_page_xbusy(fs.m);
 				fs.first_m = fs.m;
 				fs.m = NULL;
 				PCPU_INC(cnt.v_cow_optim);
@@ -775,8 +894,12 @@
 				 */
 				pmap_copy_page(fs.m, fs.first_m);
 				fs.first_m->valid = VM_PAGE_BITS_ALL;
+				if ((fault_flags & VM_FAULT_WIRE) == 0) {
+					prot &= ~VM_PROT_WRITE;
+					fault_type &= ~VM_PROT_WRITE;
+				}
 				if (wired && (fault_flags &
-				    VM_FAULT_CHANGE_WIRING) == 0) {
+				    VM_FAULT_WIRE) == 0) {
 					vm_page_lock(fs.first_m);
 					vm_page_wire(fs.first_m);
 					vm_page_unlock(fs.first_m);
@@ -795,7 +918,7 @@
 			 * conditional
 			 */
 			vm_object_pip_wakeup(fs.object);
-			VM_OBJECT_UNLOCK(fs.object);
+			VM_OBJECT_WUNLOCK(fs.object);
 			/*
 			 * Only use the new page below...
 			 */
@@ -803,7 +926,7 @@
 			fs.pindex = fs.first_pindex;
 			fs.m = fs.first_m;
 			if (!is_first_object_locked)
-				VM_OBJECT_LOCK(fs.object);
+				VM_OBJECT_WLOCK(fs.object);
 			PCPU_INC(cnt.v_cow_faults);
 			curthread->td_cow++;
 		} else {
@@ -826,7 +949,7 @@
 			goto RetryFault;
 		}
 		fs.lookup_still_valid = TRUE;
-		if (fs.map->timestamp != map_generation) {
+		if (fs.map->timestamp != fs.map_generation) {
 			result = vm_map_lookup_locked(&fs.map, vaddr, fault_type,
 			    &fs.entry, &retry_object, &retry_pindex, &retry_prot, &wired);
 
@@ -878,53 +1001,16 @@
 	if (hardfault)
 		fs.entry->next_read = fs.pindex + faultcount - reqpage;
 
-	if ((prot & VM_PROT_WRITE) != 0 ||
-	    (fault_flags & VM_FAULT_DIRTY) != 0) {
-		vm_object_set_writeable_dirty(fs.object);
+	vm_fault_dirty(fs.entry, fs.m, prot, fault_type, fault_flags, true);
+	vm_page_assert_xbusied(fs.m);
 
-		/*
-		 * If this is a NOSYNC mmap we do not want to set VPO_NOSYNC
-		 * if the page is already dirty to prevent data written with
-		 * the expectation of being synced from not being synced.
-		 * Likewise if this entry does not request NOSYNC then make
-		 * sure the page isn't marked NOSYNC.  Applications sharing
-		 * data should use the same flags to avoid ping ponging.
-		 */
-		if (fs.entry->eflags & MAP_ENTRY_NOSYNC) {
-			if (fs.m->dirty == 0)
-				fs.m->oflags |= VPO_NOSYNC;
-		} else {
-			fs.m->oflags &= ~VPO_NOSYNC;
-		}
-
-		/*
-		 * If the fault is a write, we know that this page is being
-		 * written NOW so dirty it explicitly to save on 
-		 * pmap_is_modified() calls later.
-		 *
-		 * Also tell the backing pager, if any, that it should remove
-		 * any swap backing since the page is now dirty.
-		 */
-		if (((fault_type & VM_PROT_WRITE) != 0 &&
-		    (fault_flags & VM_FAULT_CHANGE_WIRING) == 0) ||
-		    (fault_flags & VM_FAULT_DIRTY) != 0) {
-			vm_page_dirty(fs.m);
-			vm_pager_page_unswapped(fs.m);
-		}
-	}
-
 	/*
-	 * Page had better still be busy
-	 */
-	KASSERT(fs.m->oflags & VPO_BUSY,
-		("vm_fault: page %p not busy!", fs.m));
-	/*
 	 * Page must be completely valid or it is not fit to
 	 * map into user space.  vm_pager_get_pages() ensures this.
 	 */
 	KASSERT(fs.m->valid == VM_PAGE_BITS_ALL,
 	    ("vm_fault: page %p partially invalid", fs.m));
-	VM_OBJECT_UNLOCK(fs.object);
+	VM_OBJECT_WUNLOCK(fs.object);
 
 	/*
 	 * Put this page into the physical map.  We had to do the unlock above
@@ -932,10 +1018,12 @@
 	 * back on the active queue until later so that the pageout daemon
 	 * won't find it (yet).
 	 */
-	pmap_enter(fs.map->pmap, vaddr, fault_type, fs.m, prot, wired);
-	if ((fault_flags & VM_FAULT_CHANGE_WIRING) == 0 && wired == 0)
-		vm_fault_prefault(fs.map->pmap, vaddr, fs.entry);
-	VM_OBJECT_LOCK(fs.object);
+	pmap_enter(fs.map->pmap, vaddr, fs.m, prot,
+	    fault_type | (wired ? PMAP_ENTER_WIRED : 0), 0);
+	if (faultcount != 1 && (fault_flags & VM_FAULT_WIRE) == 0 &&
+	    wired == 0)
+		vm_fault_prefault(&fs, vaddr, faultcount, reqpage);
+	VM_OBJECT_WLOCK(fs.object);
 	vm_page_lock(fs.m);
 
 	/*
@@ -942,11 +1030,9 @@
 	 * If the page is not wired down, then put it where the pageout daemon
 	 * can find it.
 	 */
-	if (fault_flags & VM_FAULT_CHANGE_WIRING) {
-		if (wired)
-			vm_page_wire(fs.m);
-		else
-			vm_page_unwire(fs.m, 1);
+	if ((fault_flags & VM_FAULT_WIRE) != 0) {
+		KASSERT(wired, ("VM_FAULT_WIRE && !wired"));
+		vm_page_wire(fs.m);
 	} else
 		vm_page_activate(fs.m);
 	if (m_hold != NULL) {
@@ -954,15 +1040,16 @@
 		vm_page_hold(fs.m);
 	}
 	vm_page_unlock(fs.m);
-	vm_page_wakeup(fs.m);
+	vm_page_xunbusy(fs.m);
 
 	/*
 	 * Unlock everything, and return
 	 */
 	unlock_and_deallocate(&fs);
-	if (hardfault)
+	if (hardfault) {
+		PCPU_INC(cnt.v_io_faults);
 		curthread->td_ru.ru_majflt++;
-	else
+	} else 
 		curthread->td_ru.ru_minflt++;
 
 	return (KERN_SUCCESS);
@@ -980,17 +1067,17 @@
 	vm_pindex_t pindex;
 
 	object = fs->object;
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+	VM_OBJECT_ASSERT_WLOCKED(object);
 	first_object = fs->first_object;
 	if (first_object != object) {
-		if (!VM_OBJECT_TRYLOCK(first_object)) {
-			VM_OBJECT_UNLOCK(object);
-			VM_OBJECT_LOCK(first_object);
-			VM_OBJECT_LOCK(object);
+		if (!VM_OBJECT_TRYWLOCK(first_object)) {
+			VM_OBJECT_WUNLOCK(object);
+			VM_OBJECT_WLOCK(first_object);
+			VM_OBJECT_WLOCK(object);
 		}
 	}
-	if (first_object->type != OBJT_DEVICE &&
-	    first_object->type != OBJT_PHYS && first_object->type != OBJT_SG) {
+	/* Neither fictitious nor unmanaged pages can be cached. */
+	if ((first_object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0) {
 		if (fs->first_pindex < distance)
 			pindex = 0;
 		else
@@ -998,13 +1085,12 @@
 		if (pindex < OFF_TO_IDX(fs->entry->offset))
 			pindex = OFF_TO_IDX(fs->entry->offset);
 		m = first_object != object ? fs->first_m : fs->m;
-		KASSERT((m->oflags & VPO_BUSY) != 0,
-		    ("vm_fault_cache_behind: page %p is not busy", m));
+		vm_page_assert_xbusied(m);
 		m_prev = vm_page_prev(m);
 		while ((m = m_prev) != NULL && m->pindex >= pindex &&
 		    m->valid == VM_PAGE_BITS_ALL) {
 			m_prev = vm_page_prev(m);
-			if (m->busy != 0 || (m->oflags & VPO_BUSY) != 0)
+			if (vm_page_busied(m))
 				continue;
 			vm_page_lock(m);
 			if (m->hold_count == 0 && m->wire_count == 0) {
@@ -1019,7 +1105,7 @@
 		}
 	}
 	if (first_object != object)
-		VM_OBJECT_UNLOCK(first_object);
+		VM_OBJECT_WUNLOCK(first_object);
 }
 
 /*
@@ -1029,31 +1115,50 @@
  * of mmap time.
  */
 static void
-vm_fault_prefault(pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry)
+vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra,
+    int faultcount, int reqpage)
 {
-	int i;
+	pmap_t pmap;
+	vm_map_entry_t entry;
+	vm_object_t backing_object, lobject;
 	vm_offset_t addr, starta;
 	vm_pindex_t pindex;
 	vm_page_t m;
-	vm_object_t object;
+	int backward, forward, i;
 
+	pmap = fs->map->pmap;
 	if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace))
 		return;
 
-	object = entry->object.vm_object;
+	if (faultcount > 0) {
+		backward = reqpage;
+		forward = faultcount - reqpage - 1;
+	} else {
+		backward = PFBAK;
+		forward = PFFOR;
+	}
+	entry = fs->entry;
 
-	starta = addra - PFBAK * PAGE_SIZE;
-	if (starta < entry->start) {
+	if (addra < backward * PAGE_SIZE) {
 		starta = entry->start;
-	} else if (starta > addra) {
-		starta = 0;
+	} else {
+		starta = addra - backward * PAGE_SIZE;
+		if (starta < entry->start)
+			starta = entry->start;
 	}
 
-	for (i = 0; i < PAGEORDER_SIZE; i++) {
-		vm_object_t backing_object, lobject;
-
-		addr = addra + prefault_pageorder[i];
-		if (addr > addra + (PFFOR * PAGE_SIZE))
+	/*
+	 * Generate the sequence of virtual addresses that are candidates for
+	 * prefaulting in an outward spiral from the faulting virtual address,
+	 * "addra".  Specifically, the sequence is "addra - PAGE_SIZE", "addra
+	 * + PAGE_SIZE", "addra - 2 * PAGE_SIZE", "addra + 2 * PAGE_SIZE", ...
+	 * If the candidate address doesn't have a backing physical page, then
+	 * the loop immediately terminates.
+	 */
+	for (i = 0; i < 2 * imax(backward, forward); i++) {
+		addr = addra + ((i >> 1) + 1) * ((i & 1) == 0 ? -PAGE_SIZE :
+		    PAGE_SIZE);
+		if (addr > addra + forward * PAGE_SIZE)
 			addr = 0;
 
 		if (addr < starta || addr >= entry->end)
@@ -1063,8 +1168,8 @@
 			continue;
 
 		pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT;
-		lobject = object;
-		VM_OBJECT_LOCK(lobject);
+		lobject = entry->object.vm_object;
+		VM_OBJECT_RLOCK(lobject);
 		while ((m = vm_page_lookup(lobject, pindex)) == NULL &&
 		    lobject->type == OBJT_DEFAULT &&
 		    (backing_object = lobject->backing_object) != NULL) {
@@ -1071,21 +1176,18 @@
 			KASSERT((lobject->backing_object_offset & PAGE_MASK) ==
 			    0, ("vm_fault_prefault: unaligned object offset"));
 			pindex += lobject->backing_object_offset >> PAGE_SHIFT;
-			VM_OBJECT_LOCK(backing_object);
-			VM_OBJECT_UNLOCK(lobject);
+			VM_OBJECT_RLOCK(backing_object);
+			VM_OBJECT_RUNLOCK(lobject);
 			lobject = backing_object;
 		}
-		/*
-		 * give-up when a page is not in memory
-		 */
 		if (m == NULL) {
-			VM_OBJECT_UNLOCK(lobject);
+			VM_OBJECT_RUNLOCK(lobject);
 			break;
 		}
 		if (m->valid == VM_PAGE_BITS_ALL &&
 		    (m->flags & PG_FICTITIOUS) == 0)
 			pmap_enter_quick(pmap, addr, m, entry->protection);
-		VM_OBJECT_UNLOCK(lobject);
+		VM_OBJECT_RUNLOCK(lobject);
 	}
 }
 
@@ -1108,7 +1210,7 @@
 
 	if (len == 0)
 		return (0);
-	end = round_page(addr + len);	
+	end = round_page(addr + len);
 	addr = trunc_page(addr);
 
 	/*
@@ -1117,9 +1219,9 @@
 	if (addr < vm_map_min(map) || addr > end || end > vm_map_max(map))
 		return (-1);
 
-	count = howmany(end - addr, PAGE_SIZE);
-	if (count > max_count)
+	if (atop(end - addr) > max_count)
 		panic("vm_fault_quick_hold_pages: count > max_count");
+	count = atop(end - addr);
 
 	/*
 	 * Most likely, the physical pages are resident in the pmap, so it is
@@ -1168,68 +1270,6 @@
 }
 
 /*
- *	vm_fault_wire:
- *
- *	Wire down a range of virtual addresses in a map.
- */
-int
-vm_fault_wire(vm_map_t map, vm_offset_t start, vm_offset_t end,
-    boolean_t fictitious)
-{
-	vm_offset_t va;
-	int rv;
-
-	/*
-	 * We simulate a fault to get the page and enter it in the physical
-	 * map.  For user wiring, we only ask for read access on currently
-	 * read-only sections.
-	 */
-	for (va = start; va < end; va += PAGE_SIZE) {
-		rv = vm_fault(map, va, VM_PROT_NONE, VM_FAULT_CHANGE_WIRING);
-		if (rv) {
-			if (va != start)
-				vm_fault_unwire(map, start, va, fictitious);
-			return (rv);
-		}
-	}
-	return (KERN_SUCCESS);
-}
-
-/*
- *	vm_fault_unwire:
- *
- *	Unwire a range of virtual addresses in a map.
- */
-void
-vm_fault_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end,
-    boolean_t fictitious)
-{
-	vm_paddr_t pa;
-	vm_offset_t va;
-	vm_page_t m;
-	pmap_t pmap;
-
-	pmap = vm_map_pmap(map);
-
-	/*
-	 * Since the pages are wired down, we must be able to get their
-	 * mappings from the physical map system.
-	 */
-	for (va = start; va < end; va += PAGE_SIZE) {
-		pa = pmap_extract(pmap, va);
-		if (pa != 0) {
-			pmap_change_wiring(pmap, va, FALSE);
-			if (!fictitious) {
-				m = PHYS_TO_VM_PAGE(pa);
-				vm_page_lock(m);
-				vm_page_unwire(m, TRUE);
-				vm_page_unlock(m);
-			}
-		}
-	}
-}
-
-/*
  *	Routine:
  *		vm_fault_copy_entry
  *	Function:
@@ -1254,7 +1294,7 @@
 	vm_offset_t vaddr;
 	vm_page_t dst_m;
 	vm_page_t src_m;
-	boolean_t src_readonly, upgrade;
+	boolean_t upgrade;
 
 #ifdef	lint
 	src_map++;
@@ -1261,28 +1301,35 @@
 #endif	/* lint */
 
 	upgrade = src_entry == dst_entry;
+	access = prot = dst_entry->protection;
 
 	src_object = src_entry->object.vm_object;
 	src_pindex = OFF_TO_IDX(src_entry->offset);
-	src_readonly = (src_entry->protection & VM_PROT_WRITE) == 0;
 
-	/*
-	 * Create the top-level object for the destination entry. (Doesn't
-	 * actually shadow anything - we copy the pages directly.)
-	 */
-	dst_object = vm_object_allocate(OBJT_DEFAULT,
-	    OFF_TO_IDX(dst_entry->end - dst_entry->start));
+	if (upgrade && (dst_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) {
+		dst_object = src_object;
+		vm_object_reference(dst_object);
+	} else {
+		/*
+		 * Create the top-level object for the destination entry. (Doesn't
+		 * actually shadow anything - we copy the pages directly.)
+		 */
+		dst_object = vm_object_allocate(OBJT_DEFAULT,
+		    OFF_TO_IDX(dst_entry->end - dst_entry->start));
 #if VM_NRESERVLEVEL > 0
-	dst_object->flags |= OBJ_COLORED;
-	dst_object->pg_color = atop(dst_entry->start);
+		dst_object->flags |= OBJ_COLORED;
+		dst_object->pg_color = atop(dst_entry->start);
 #endif
+	}
 
-	VM_OBJECT_LOCK(dst_object);
+	VM_OBJECT_WLOCK(dst_object);
 	KASSERT(upgrade || dst_entry->object.vm_object == NULL,
 	    ("vm_fault_copy_entry: vm_object not NULL"));
-	dst_entry->object.vm_object = dst_object;
-	dst_entry->offset = 0;
-	dst_object->charge = dst_entry->end - dst_entry->start;
+	if (src_object != dst_object) {
+		dst_entry->object.vm_object = dst_object;
+		dst_entry->offset = 0;
+		dst_object->charge = dst_entry->end - dst_entry->start;
+	}
 	if (fork_charge != NULL) {
 		KASSERT(dst_entry->cred == NULL,
 		    ("vm_fault_copy_entry: leaked swp charge"));
@@ -1289,11 +1336,13 @@
 		dst_object->cred = curthread->td_ucred;
 		crhold(dst_object->cred);
 		*fork_charge += dst_object->charge;
-	} else {
+	} else if (dst_object->cred == NULL) {
+		KASSERT(dst_entry->cred != NULL, ("no cred for entry %p",
+		    dst_entry));
 		dst_object->cred = dst_entry->cred;
 		dst_entry->cred = NULL;
 	}
-	access = prot = dst_entry->protection;
+
 	/*
 	 * If not an upgrade, then enter the mappings in the pmap as
 	 * read and/or execute accesses.  Otherwise, enter them as
@@ -1319,75 +1368,100 @@
 	for (vaddr = dst_entry->start, dst_pindex = 0;
 	    vaddr < dst_entry->end;
 	    vaddr += PAGE_SIZE, dst_pindex++) {
-
+again:
 		/*
-		 * Allocate a page in the destination object.
-		 */
-		do {
-			dst_m = vm_page_alloc(dst_object, dst_pindex,
-			    VM_ALLOC_NORMAL);
-			if (dst_m == NULL) {
-				VM_OBJECT_UNLOCK(dst_object);
-				VM_WAIT;
-				VM_OBJECT_LOCK(dst_object);
-			}
-		} while (dst_m == NULL);
-
-		/*
 		 * Find the page in the source object, and copy it in.
-		 * (Because the source is wired down, the page will be in
-		 * memory.)
+		 * Because the source is wired down, the page will be
+		 * in memory.
 		 */
-		VM_OBJECT_LOCK(src_object);
+		if (src_object != dst_object)
+			VM_OBJECT_RLOCK(src_object);
 		object = src_object;
 		pindex = src_pindex + dst_pindex;
 		while ((src_m = vm_page_lookup(object, pindex)) == NULL &&
-		    src_readonly &&
 		    (backing_object = object->backing_object) != NULL) {
 			/*
-			 * Allow fallback to backing objects if we are reading.
+			 * Unless the source mapping is read-only or
+			 * it is presently being upgraded from
+			 * read-only, the first object in the shadow
+			 * chain should provide all of the pages.  In
+			 * other words, this loop body should never be
+			 * executed when the source mapping is already
+			 * read/write.
 			 */
-			VM_OBJECT_LOCK(backing_object);
+			KASSERT((src_entry->protection & VM_PROT_WRITE) == 0 ||
+			    upgrade,
+			    ("vm_fault_copy_entry: main object missing page"));
+
+			VM_OBJECT_RLOCK(backing_object);
 			pindex += OFF_TO_IDX(object->backing_object_offset);
-			VM_OBJECT_UNLOCK(object);
+			if (object != dst_object)
+				VM_OBJECT_RUNLOCK(object);
 			object = backing_object;
 		}
-		if (src_m == NULL)
-			panic("vm_fault_copy_wired: page missing");
-		pmap_copy_page(src_m, dst_m);
-		VM_OBJECT_UNLOCK(object);
-		dst_m->valid = VM_PAGE_BITS_ALL;
-		dst_m->dirty = VM_PAGE_BITS_ALL;
-		VM_OBJECT_UNLOCK(dst_object);
+		KASSERT(src_m != NULL, ("vm_fault_copy_entry: page missing"));
 
+		if (object != dst_object) {
+			/*
+			 * Allocate a page in the destination object.
+			 */
+			dst_m = vm_page_alloc(dst_object, (src_object ==
+			    dst_object ? src_pindex : 0) + dst_pindex,
+			    VM_ALLOC_NORMAL);
+			if (dst_m == NULL) {
+				VM_OBJECT_WUNLOCK(dst_object);
+				VM_OBJECT_RUNLOCK(object);
+				VM_WAIT;
+				VM_OBJECT_WLOCK(dst_object);
+				goto again;
+			}
+			pmap_copy_page(src_m, dst_m);
+			VM_OBJECT_RUNLOCK(object);
+			dst_m->valid = VM_PAGE_BITS_ALL;
+			dst_m->dirty = VM_PAGE_BITS_ALL;
+		} else {
+			dst_m = src_m;
+			if (vm_page_sleep_if_busy(dst_m, "fltupg"))
+				goto again;
+			vm_page_xbusy(dst_m);
+			KASSERT(dst_m->valid == VM_PAGE_BITS_ALL,
+			    ("invalid dst page %p", dst_m));
+		}
+		VM_OBJECT_WUNLOCK(dst_object);
+
 		/*
 		 * Enter it in the pmap. If a wired, copy-on-write
 		 * mapping is being replaced by a write-enabled
 		 * mapping, then wire that new mapping.
 		 */
-		pmap_enter(dst_map->pmap, vaddr, access, dst_m, prot, upgrade);
+		pmap_enter(dst_map->pmap, vaddr, dst_m, prot,
+		    access | (upgrade ? PMAP_ENTER_WIRED : 0), 0);
 
 		/*
 		 * Mark it no longer busy, and put it on the active list.
 		 */
-		VM_OBJECT_LOCK(dst_object);
+		VM_OBJECT_WLOCK(dst_object);
 		
 		if (upgrade) {
-			vm_page_lock(src_m);
-			vm_page_unwire(src_m, 0);
-			vm_page_unlock(src_m);
-
-			vm_page_lock(dst_m);
-			vm_page_wire(dst_m);
-			vm_page_unlock(dst_m);
+			if (src_m != dst_m) {
+				vm_page_lock(src_m);
+				vm_page_unwire(src_m, 0);
+				vm_page_unlock(src_m);
+				vm_page_lock(dst_m);
+				vm_page_wire(dst_m);
+				vm_page_unlock(dst_m);
+			} else {
+				KASSERT(dst_m->wire_count > 0,
+				    ("dst_m %p is not wired", dst_m));
+			}
 		} else {
 			vm_page_lock(dst_m);
 			vm_page_activate(dst_m);
 			vm_page_unlock(dst_m);
 		}
-		vm_page_wakeup(dst_m);
+		vm_page_xunbusy(dst_m);
 	}
-	VM_OBJECT_UNLOCK(dst_object);
+	VM_OBJECT_WUNLOCK(dst_object);
 	if (upgrade) {
 		dst_entry->eflags &= ~(MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY);
 		vm_object_deallocate(src_object);
@@ -1423,7 +1497,7 @@
 	vm_page_t rtm;
 	int cbehind, cahead;
 
-	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
+	VM_OBJECT_ASSERT_WLOCKED(m->object);
 
 	object = m->object;
 	pindex = m->pindex;
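
The reworked vm_fault_prefault() above drops the static prefault_pageorder[]
table in favor of computing an outward spiral around the faulting address.
A minimal userland sketch of that candidate ordering follows; PAGE_SIZE, the
sample fault address, and the backward/forward limits are illustrative
stand-ins for the kernel's values (PFBAK/PFFOR or the pager-derived counts),
and the kernel's clamping against the map entry bounds is omitted:

/*
 * Userland sketch (not kernel code) of the outward-spiral candidate
 * ordering used by the reworked vm_fault_prefault().
 */
#include <stdio.h>

#define PAGE_SIZE	4096UL

int
main(void)
{
	unsigned long addra = 0x100000UL;	/* hypothetical fault address */
	unsigned long addr;
	int backward = 3, forward = 3;		/* stand-ins for PFBAK/PFFOR */
	int i;

	for (i = 0; i < 2 * (backward > forward ? backward : forward); i++) {
		addr = addra + ((i >> 1) + 1) *
		    ((i & 1) == 0 ? -PAGE_SIZE : PAGE_SIZE);
		/* Prints addra - 1 page, addra + 1 page, addra - 2 pages, ... */
		printf("candidate %d: %#lx\n", i, addr);
	}
	return (0);
}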

Modified: trunk/sys/vm/vm_glue.c
===================================================================
--- trunk/sys/vm/vm_glue.c	2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm_glue.c	2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
@@ -57,24 +58,28 @@
  */
 
 #include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/vm/vm_glue.c 300673 2016-05-25 10:04:53Z kib $");
 
 #include "opt_vm.h"
 #include "opt_kstack_pages.h"
 #include "opt_kstack_max_pages.h"
+#include "opt_kstack_usage_prof.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
+#include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/racct.h>
 #include <sys/resourcevar.h>
+#include <sys/rwlock.h>
 #include <sys/sched.h>
 #include <sys/sf_buf.h>
 #include <sys/shm.h>
 #include <sys/vmmeter.h>
+#include <sys/vmem.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/_kstack_cache.h>
@@ -95,15 +100,7 @@
 #include <vm/vm_pager.h>
 #include <vm/swap_pager.h>
 
-/*
- * System initialization
- *
- * THIS MUST BE THE LAST INITIALIZATION ITEM!!!
- *
- * Note: run scheduling should be divorced from the vm system.
- */
-static void scheduler(void *);
-SYSINIT(scheduler, SI_SUB_RUN_SCHEDULER, SI_ORDER_ANY, scheduler, NULL);
+#include <machine/cpu.h>
 
 #ifndef NO_SWAPPING
 static int swapout(struct proc *);
@@ -238,9 +235,9 @@
 	vm_pindex_t pindex;
 	int rv;
 
-	VM_OBJECT_LOCK(object);
+	VM_OBJECT_WLOCK(object);
 	pindex = OFF_TO_IDX(offset);
-	m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
+	m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL);
 	if (m->valid != VM_PAGE_BITS_ALL) {
 		ma[0] = m;
 		rv = vm_pager_get_pages(object, ma, 1, 0);
@@ -255,12 +252,13 @@
 			goto out;
 		}
 	}
+	vm_page_xunbusy(m);
 	vm_page_lock(m);
 	vm_page_hold(m);
+	vm_page_activate(m);
 	vm_page_unlock(m);
-	vm_page_wakeup(m);
 out:
-	VM_OBJECT_UNLOCK(object);
+	VM_OBJECT_WUNLOCK(object);
 	return (m);
 }
 
@@ -307,6 +305,8 @@
 static int kstack_cache_size = 128;
 static int kstacks;
 static struct mtx kstack_cache_mtx;
+MTX_SYSINIT(kstack_cache, &kstack_cache_mtx, "kstkch", MTX_DEF);
+
 SYSCTL_INT(_vm, OID_AUTO, kstack_cache_size, CTLFLAG_RW, &kstack_cache_size, 0,
     "");
 SYSCTL_INT(_vm, OID_AUTO, kstacks, CTLFLAG_RD, &kstacks, 0,
@@ -364,11 +364,13 @@
 	 * We need to align the kstack's mapped address to fit within
 	 * a single TLB entry.
 	 */
-	ks = kmem_alloc_nofault_space(kernel_map,
-	    (pages + KSTACK_GUARD_PAGES) * PAGE_SIZE, VMFS_TLB_ALIGNED_SPACE);
+	if (vmem_xalloc(kernel_arena, (pages + KSTACK_GUARD_PAGES) * PAGE_SIZE,
+	    PAGE_SIZE * 2, 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX,
+	    M_BESTFIT | M_NOWAIT, &ks)) {
+		ks = 0;
+	}
 #else
-	ks = kmem_alloc_nofault(kernel_map,
-	   (pages + KSTACK_GUARD_PAGES) * PAGE_SIZE);
+	ks = kva_alloc((pages + KSTACK_GUARD_PAGES) * PAGE_SIZE);
 #endif
 	if (ks == 0) {
 		printf("vm_thread_new: kstack allocation failed\n");
@@ -392,17 +394,17 @@
 	 * For the length of the stack, link in a real page of ram for each
 	 * page of stack.
 	 */
-	VM_OBJECT_LOCK(ksobj);
+	VM_OBJECT_WLOCK(ksobj);
 	for (i = 0; i < pages; i++) {
 		/*
 		 * Get a kernel stack page.
 		 */
 		m = vm_page_grab(ksobj, i, VM_ALLOC_NOBUSY |
-		    VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_WIRED);
+		    VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
 		ma[i] = m;
 		m->valid = VM_PAGE_BITS_ALL;
 	}
-	VM_OBJECT_UNLOCK(ksobj);
+	VM_OBJECT_WUNLOCK(ksobj);
 	pmap_qenter(ks, ma, pages);
 	return (1);
 }
@@ -415,7 +417,7 @@
 
 	atomic_add_int(&kstacks, -1);
 	pmap_qremove(ks, pages);
-	VM_OBJECT_LOCK(ksobj);
+	VM_OBJECT_WLOCK(ksobj);
 	for (i = 0; i < pages; i++) {
 		m = vm_page_lookup(ksobj, i);
 		if (m == NULL)
@@ -425,9 +427,9 @@
 		vm_page_free(m);
 		vm_page_unlock(m);
 	}
-	VM_OBJECT_UNLOCK(ksobj);
+	VM_OBJECT_WUNLOCK(ksobj);
 	vm_object_deallocate(ksobj);
-	kmem_free(kernel_map, ks - (KSTACK_GUARD_PAGES * PAGE_SIZE),
+	kva_free(ks - (KSTACK_GUARD_PAGES * PAGE_SIZE),
 	    (pages + KSTACK_GUARD_PAGES) * PAGE_SIZE);
 }
 
@@ -486,9 +488,54 @@
 	    EVENTHANDLER_PRI_ANY);
 }
 
-MTX_SYSINIT(kstack_cache, &kstack_cache_mtx, "kstkch", MTX_DEF);
 SYSINIT(vm_kstacks, SI_SUB_KTHREAD_INIT, SI_ORDER_ANY, kstack_cache_init, NULL);
 
+#ifdef KSTACK_USAGE_PROF
+/*
+ * Track maximum stack used by a thread in kernel.
+ */
+static int max_kstack_used;
+
+SYSCTL_INT(_debug, OID_AUTO, max_kstack_used, CTLFLAG_RD,
+    &max_kstack_used, 0,
+    "Maxiumum stack depth used by a thread in kernel");
+
+void
+intr_prof_stack_use(struct thread *td, struct trapframe *frame)
+{
+	vm_offset_t stack_top;
+	vm_offset_t current;
+	int used, prev_used;
+
+	/*
+	 * Testing for interrupted kernel mode isn't strictly
+	 * needed. It optimizes the execution, since interrupts from
+	 * usermode will have only the trap frame on the stack.
+	 */
+	if (TRAPF_USERMODE(frame))
+		return;
+
+	stack_top = td->td_kstack + td->td_kstack_pages * PAGE_SIZE;
+	current = (vm_offset_t)(uintptr_t)&stack_top;
+
+	/*
+	 * Try to detect if interrupt is using kernel thread stack.
+	 * Hardware could use a dedicated stack for interrupt handling.
+	 */
+	if (stack_top <= current || current < td->td_kstack)
+		return;
+
+	used = stack_top - current;
+	for (;;) {
+		prev_used = max_kstack_used;
+		if (prev_used >= used)
+			break;
+		if (atomic_cmpset_int(&max_kstack_used, prev_used, used))
+			break;
+	}
+}
+#endif /* KSTACK_USAGE_PROF */
+
 #ifndef NO_SWAPPING
 /*
  * Allow a thread's kernel stack to be paged out.
@@ -504,7 +551,7 @@
 	pages = td->td_kstack_pages;
 	ksobj = td->td_kstack_obj;
 	pmap_qremove(td->td_kstack, pages);
-	VM_OBJECT_LOCK(ksobj);
+	VM_OBJECT_WLOCK(ksobj);
 	for (i = 0; i < pages; i++) {
 		m = vm_page_lookup(ksobj, i);
 		if (m == NULL)
@@ -514,7 +561,7 @@
 		vm_page_unwire(m, 0);
 		vm_page_unlock(m);
 	}
-	VM_OBJECT_UNLOCK(ksobj);
+	VM_OBJECT_WUNLOCK(ksobj);
 }
 
 /*
@@ -529,19 +576,17 @@
 
 	pages = td->td_kstack_pages;
 	ksobj = td->td_kstack_obj;
-	VM_OBJECT_LOCK(ksobj);
+	VM_OBJECT_WLOCK(ksobj);
 	for (i = 0; i < pages; i++)
-		ma[i] = vm_page_grab(ksobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY |
+		ma[i] = vm_page_grab(ksobj, i, VM_ALLOC_NORMAL |
 		    VM_ALLOC_WIRED);
 	for (i = 0; i < pages; i++) {
 		if (ma[i]->valid != VM_PAGE_BITS_ALL) {
-			KASSERT(ma[i]->oflags & VPO_BUSY,
-			    ("lost busy 1"));
+			vm_page_assert_xbusied(ma[i]);
 			vm_object_pip_add(ksobj, 1);
 			for (j = i + 1; j < pages; j++) {
-				KASSERT(ma[j]->valid == VM_PAGE_BITS_ALL ||
-				    (ma[j]->oflags & VPO_BUSY),
-				    ("lost busy 2"));
+				if (ma[j]->valid != VM_PAGE_BITS_ALL)
+					vm_page_assert_xbusied(ma[j]);
 				if (ma[j]->valid == VM_PAGE_BITS_ALL)
 					break;
 			}
@@ -552,11 +597,11 @@
 			vm_object_pip_wakeup(ksobj);
 			for (k = i; k < j; k++)
 				ma[k] = vm_page_lookup(ksobj, k);
-			vm_page_wakeup(ma[i]);
-		} else if (ma[i]->oflags & VPO_BUSY)
-			vm_page_wakeup(ma[i]);
+			vm_page_xunbusy(ma[i]);
+		} else if (vm_page_xbusied(ma[i]))
+			vm_page_xunbusy(ma[i]);
 	}
-	VM_OBJECT_UNLOCK(ksobj);
+	VM_OBJECT_WUNLOCK(ksobj);
 	pmap_qenter(td->td_kstack, ma, pages);
 	cpu_thread_swapin(td);
 }
@@ -688,13 +733,9 @@
  * This swapin algorithm attempts to swap-in processes only if there
  * is enough space for them.  Of course, if a process waits for a long
  * time, it will be swapped in anyway.
- *
- * Giant is held on entry.
  */
-/* ARGSUSED*/
-static void
-scheduler(dummy)
-	void *dummy;
+void
+swapper(void)
 {
 	struct proc *p;
 	struct thread *td;
@@ -704,9 +745,6 @@
 	int ppri;
 	int pri;
 
-	mtx_assert(&Giant, MA_OWNED | MA_NOTRECURSED);
-	mtx_unlock(&Giant);
-
 loop:
 	if (vm_page_count_min()) {
 		VM_WAIT;
@@ -757,7 +795,7 @@
 	 * Nothing to do, back to sleep.
 	 */
 	if ((p = pp) == NULL) {
-		tsleep(&proc0, PVM, "sched", MAXSLP * hz / 2);
+		tsleep(&proc0, PVM, "swapin", MAXSLP * hz / 2);
 		goto loop;
 	}
 	PROC_LOCK(p);
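
The new intr_prof_stack_use() above publishes a running maximum without a
lock: it rereads max_kstack_used and retries a compare-and-swap until either
the stored value is already larger or the update lands.  A small userland
sketch of that pattern, with C11 atomics standing in for the kernel's
atomic_cmpset_int():

/*
 * Lock-free "record the maximum" loop, modeled on the max_kstack_used
 * update in intr_prof_stack_use(); C11 atomics replace atomic_cmpset_int().
 */
#include <stdatomic.h>
#include <stdio.h>

static _Atomic int max_used;

static void
record_usage(int used)
{
	int prev;

	for (;;) {
		prev = atomic_load(&max_used);
		if (prev >= used)
			break;		/* a larger value is already recorded */
		if (atomic_compare_exchange_weak(&max_used, &prev, used))
			break;		/* we published the new maximum */
	}
}

int
main(void)
{
	record_usage(1234);
	record_usage(512);	/* ignored: smaller than the current maximum */
	printf("max_used = %d\n", atomic_load(&max_used));
	return (0);
}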

Modified: trunk/sys/vm/vm_init.c
===================================================================
--- trunk/sys/vm/vm_init.c	2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm_init.c	2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
@@ -63,13 +64,14 @@
  */
 
 #include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/vm/vm_init.c 255426 2013-09-09 18:11:59Z jhb $");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
-#include <sys/mutex.h>
 #include <sys/proc.h>
+#include <sys/rwlock.h>
+#include <sys/malloc.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/selinfo.h>
@@ -76,6 +78,7 @@
 #include <sys/pipe.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
+#include <sys/vmem.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
@@ -100,6 +103,26 @@
 SYSINIT(vm_mem, SI_SUB_VM, SI_ORDER_FIRST, vm_mem_init, NULL);
 
 /*
+ * Import kva into the kernel arena.
+ */
+static int
+kva_import(void *unused, vmem_size_t size, int flags, vmem_addr_t *addrp)
+{
+	vm_offset_t addr;
+	int result;
+ 
+	addr = vm_map_min(kernel_map);
+	result = vm_map_find(kernel_map, NULL, 0, &addr, size, 0,
+	    VMFS_SUPER_SPACE, VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
+	if (result != KERN_SUCCESS)
+                return (ENOMEM);
+
+	*addrp = addr;
+
+	return (0);
+}
+
+/*
  *	vm_init initializes the virtual memory system.
  *	This is done only by the first cpu up.
  *
@@ -110,6 +133,7 @@
 vm_mem_init(dummy)
 	void *dummy;
 {
+
 	/*
 	 * Initializes resident memory structures. From here on, all physical
 	 * memory is accounted for, and we use only virtual addresses.
@@ -120,9 +144,24 @@
 	/*
 	 * Initialize other VM packages
 	 */
+	vmem_startup();
 	vm_object_init();
 	vm_map_startup();
 	kmem_init(virtual_avail, virtual_end);
+
+	/*
+	 * Initialize the kernel_arena.  This can grow on demand.
+	 */
+	vmem_init(kernel_arena, "kernel arena", 0, 0, PAGE_SIZE, 0, 0);
+	vmem_set_import(kernel_arena, kva_import, NULL, NULL,
+#if VM_NRESERVLEVEL > 0
+	    1 << (VM_LEVEL_0_ORDER + PAGE_SHIFT));
+#else
+	    /* On non-superpage architectures, we want large import sizes. */
+	    PAGE_SIZE * 1024);
+#endif
+
+	kmem_init_zero_region();
 	pmap_init();
 	vm_pager_init();
 }
@@ -136,7 +175,6 @@
 	long physmem_est;
 	vm_offset_t minaddr;
 	vm_offset_t maxaddr;
-	vm_map_t clean_map;
 
 	/*
 	 * Allocate space for system data structures.
@@ -144,8 +182,6 @@
 	 * As pages of kernel virtual memory are allocated, "v" is incremented.
 	 * As pages of memory are allocated and cleared,
 	 * "firstaddr" is incremented.
-	 * An index into the kernel page table corresponding to the
-	 * virtual memory address maintained in "v" is kept in "mapaddr".
 	 */
 
 	/*
@@ -157,8 +193,6 @@
 again:
 	v = (caddr_t)firstaddr;
 
-	v = kern_timeout_callwheel_alloc(v);
-
 	/*
 	 * Discount the physical memory larger than the size of kernel_map
 	 * to avoid eating up all of KVA space.
@@ -173,7 +207,8 @@
 	 */
 	if (firstaddr == 0) {
 		size = (vm_size_t)v;
-		firstaddr = kmem_alloc(kernel_map, round_page(size));
+		firstaddr = kmem_malloc(kernel_arena, round_page(size),
+		    M_ZERO | M_WAITOK);
 		if (firstaddr == 0)
 			panic("startup: no room for tables");
 		goto again;
@@ -185,27 +220,49 @@
 	if ((vm_size_t)((char *)v - firstaddr) != size)
 		panic("startup: table size inconsistency");
 
-	clean_map = kmem_suballoc(kernel_map, &kmi->clean_sva, &kmi->clean_eva,
-	    (long)nbuf * BKVASIZE + (long)nswbuf * MAXPHYS, TRUE);
-	buffer_map = kmem_suballoc(clean_map, &kmi->buffer_sva,
-	    &kmi->buffer_eva, (long)nbuf * BKVASIZE, FALSE);
-	buffer_map->system_map = 1;
-	pager_map = kmem_suballoc(clean_map, &kmi->pager_sva, &kmi->pager_eva,
-	    (long)nswbuf * MAXPHYS, FALSE);
-	pager_map->system_map = 1;
-	exec_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr,
-	    exec_map_entries * round_page(PATH_MAX + ARG_MAX), FALSE);
-	pipe_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr, maxpipekva,
-	    FALSE);
+	/*
+	 * Allocate the clean map to hold all of the paging and I/O virtual
+	 * memory.
+	 */
+	size = (long)nbuf * BKVASIZE + (long)nswbuf * MAXPHYS +
+	    (long)bio_transient_maxcnt * MAXPHYS;
+	kmi->clean_sva = firstaddr = kva_alloc(size);
+	kmi->clean_eva = firstaddr + size;
 
 	/*
-	 * XXX: Mbuf system machine-specific initializations should
-	 *      go here, if anywhere.
+	 * Allocate the buffer arena.
 	 */
+	size = (long)nbuf * BKVASIZE;
+	kmi->buffer_sva = firstaddr;
+	kmi->buffer_eva = kmi->buffer_sva + size;
+	vmem_init(buffer_arena, "buffer arena", kmi->buffer_sva, size,
+	    PAGE_SIZE, 0, 0);
+	firstaddr += size;
 
 	/*
-	 * Initialize the callouts we just allocated.
+	 * Now swap kva.
 	 */
-	kern_timeout_callwheel_init();
+	swapbkva = firstaddr;
+	size = (long)nswbuf * MAXPHYS;
+	firstaddr += size;
+
+	/*
+	 * And optionally transient bio space.
+	 */
+	if (bio_transient_maxcnt != 0) {
+		size = (long)bio_transient_maxcnt * MAXPHYS;
+		vmem_init(transient_arena, "transient arena",
+		    firstaddr, size, PAGE_SIZE, 0, 0);
+		firstaddr += size;
+	}
+	if (firstaddr != kmi->clean_eva)
+		panic("Clean map calculation incorrect");
+
+	/*
+ 	 * Allocate the pageable submaps.
+	 */
+	exec_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr,
+	    exec_map_entries * round_page(PATH_MAX + ARG_MAX), FALSE);
+	pipe_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr, maxpipekva,
+	    FALSE);
 }
-

Modified: trunk/sys/vm/vm_kern.c
===================================================================
--- trunk/sys/vm/vm_kern.c	2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm_kern.c	2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
@@ -63,7 +64,7 @@
  */
 
 #include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/vm/vm_kern.c 324782 2017-10-20 00:38:01Z emaste $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -70,13 +71,15 @@
 #include <sys/kernel.h>		/* for ticks and hz */
 #include <sys/eventhandler.h>
 #include <sys/lock.h>
-#include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/malloc.h>
+#include <sys/rwlock.h>
 #include <sys/sysctl.h>
+#include <sys/vmem.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
+#include <vm/vm_kern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
@@ -86,16 +89,28 @@
 #include <vm/uma.h>
 
 vm_map_t kernel_map;
-vm_map_t kmem_map;
 vm_map_t exec_map;
 vm_map_t pipe_map;
-vm_map_t buffer_map;
 
 const void *zero_region;
 CTASSERT((ZERO_REGION_SIZE & PAGE_MASK) == 0);
 
+/* NB: Used by kernel debuggers. */
+const u_long vm_maxuser_address = VM_MAXUSER_ADDRESS;
+
+SYSCTL_ULONG(_vm, OID_AUTO, min_kernel_address, CTLFLAG_RD,
+    SYSCTL_NULL_ULONG_PTR, VM_MIN_KERNEL_ADDRESS, "Min kernel address");
+
+SYSCTL_ULONG(_vm, OID_AUTO, max_kernel_address, CTLFLAG_RD,
+#if defined(__arm__) || defined(__sparc64__)
+    &vm_max_kernel_address, 0,
+#else
+    SYSCTL_NULL_ULONG_PTR, VM_MAX_KERNEL_ADDRESS,
+#endif
+    "Max kernel address");
+
 /*
- *	kmem_alloc_nofault:
+ *	kva_alloc:
  *
  *	Allocate a virtual address range with no underlying object and
  *	no initial mapping to physical memory.  Any mapping from this
@@ -104,113 +119,137 @@
  *	a mapping on demand through vm_fault() will result in a panic. 
  */
 vm_offset_t
-kmem_alloc_nofault(map, size)
-	vm_map_t map;
-	vm_size_t size;
+kva_alloc(vm_size_t size)
 {
 	vm_offset_t addr;
-	int result;
 
 	size = round_page(size);
-	addr = vm_map_min(map);
-	result = vm_map_find(map, NULL, 0, &addr, size, VMFS_ANY_SPACE,
-	    VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
-	if (result != KERN_SUCCESS) {
+	if (vmem_alloc(kernel_arena, size, M_BESTFIT | M_NOWAIT, &addr))
 		return (0);
-	}
+
 	return (addr);
 }
 
 /*
- *	kmem_alloc_nofault_space:
+ *	kva_free:
  *
- *	Allocate a virtual address range with no underlying object and
- *	no initial mapping to physical memory within the specified
- *	address space.  Any mapping from this range to physical memory
- *	must be explicitly created prior to its use, typically with
- *	pmap_qenter().  Any attempt to create a mapping on demand
- *	through vm_fault() will result in a panic. 
+ *	Release a region of kernel virtual memory allocated
+ *	with kva_alloc, and return the physical pages
+ *	associated with that region.
+ *
+ *	This routine may not block on kernel maps.
  */
-vm_offset_t
-kmem_alloc_nofault_space(map, size, find_space)
-	vm_map_t map;
-	vm_size_t size;
-	int find_space;
+void
+kva_free(vm_offset_t addr, vm_size_t size)
 {
-	vm_offset_t addr;
-	int result;
 
 	size = round_page(size);
-	addr = vm_map_min(map);
-	result = vm_map_find(map, NULL, 0, &addr, size, find_space,
-	    VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
-	if (result != KERN_SUCCESS) {
-		return (0);
-	}
-	return (addr);
+	vmem_free(kernel_arena, addr, size);
 }
 
 /*
- *	Allocate wired-down memory in the kernel's address map
- *	or a submap.
+ *	Allocates a region from the kernel address map and physical pages
+ *	within the specified address range to the kernel object.  Creates a
+ *	wired mapping from this region to these pages, and returns the
+ *	region's starting virtual address.  The allocated pages are not
+ *	necessarily physically contiguous.  If M_ZERO is specified through the
+ *	given flags, then the pages are zeroed before they are mapped.
  */
 vm_offset_t
-kmem_alloc(map, size)
-	vm_map_t map;
-	vm_size_t size;
+kmem_alloc_attr(vmem_t *vmem, vm_size_t size, int flags, vm_paddr_t low,
+    vm_paddr_t high, vm_memattr_t memattr)
 {
-	vm_offset_t addr;
-	vm_offset_t offset;
+	vm_object_t object = vmem == kmem_arena ? kmem_object : kernel_object;
+	vm_offset_t addr, i;
+	vm_ooffset_t offset;
+	vm_page_t m;
+	int pflags, tries;
 
 	size = round_page(size);
-
-	/*
-	 * Use the kernel object for wired-down kernel pages. Assume that no
-	 * region of the kernel object is referenced more than once.
-	 */
-
-	/*
-	 * Locate sufficient space in the map.  This will give us the final
-	 * virtual address for the new memory, and thus will tell us the
-	 * offset within the kernel map.
-	 */
-	vm_map_lock(map);
-	if (vm_map_findspace(map, vm_map_min(map), size, &addr)) {
-		vm_map_unlock(map);
+	if (vmem_alloc(vmem, size, M_BESTFIT | flags, &addr))
 		return (0);
+	offset = addr - VM_MIN_KERNEL_ADDRESS;
+	pflags = malloc2vm_flags(flags) | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED;
+	VM_OBJECT_WLOCK(object);
+	for (i = 0; i < size; i += PAGE_SIZE) {
+		tries = 0;
+retry:
+		m = vm_page_alloc_contig(object, OFF_TO_IDX(offset + i),
+		    pflags, 1, low, high, PAGE_SIZE, 0, memattr);
+		if (m == NULL) {
+			VM_OBJECT_WUNLOCK(object);
+			if (tries < ((flags & M_NOWAIT) != 0 ? 1 : 3)) {
+				vm_pageout_grow_cache(tries, low, high);
+				VM_OBJECT_WLOCK(object);
+				tries++;
+				goto retry;
+			}
+			kmem_unback(object, addr, i);
+			vmem_free(vmem, addr, size);
+			return (0);
+		}
+		if ((flags & M_ZERO) && (m->flags & PG_ZERO) == 0)
+			pmap_zero_page(m);
+		m->valid = VM_PAGE_BITS_ALL;
+		pmap_enter(kernel_pmap, addr + i, m, VM_PROT_ALL,
+		    VM_PROT_ALL | PMAP_ENTER_WIRED, 0);
 	}
-	offset = addr - VM_MIN_KERNEL_ADDRESS;
-	vm_object_reference(kernel_object);
-	vm_map_insert(map, kernel_object, offset, addr, addr + size,
-		VM_PROT_ALL, VM_PROT_ALL, 0);
-	vm_map_unlock(map);
-
-	/*
-	 * And finally, mark the data as non-pageable.
-	 */
-	(void) vm_map_wire(map, addr, addr + size,
-	    VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES);
-
+	VM_OBJECT_WUNLOCK(object);
 	return (addr);
 }
 
 /*
- *	kmem_free:
- *
- *	Release a region of kernel virtual memory allocated
- *	with kmem_alloc, and return the physical pages
- *	associated with that region.
- *
- *	This routine may not block on kernel maps.
+ *	Allocates a region from the kernel address map and physically
+ *	contiguous pages within the specified address range to the kernel
+ *	object.  Creates a wired mapping from this region to these pages, and
+ *	returns the region's starting virtual address.  If M_ZERO is specified
+ *	through the given flags, then the pages are zeroed before they are
+ *	mapped.
  */
-void
-kmem_free(map, addr, size)
-	vm_map_t map;
-	vm_offset_t addr;
-	vm_size_t size;
+vm_offset_t
+kmem_alloc_contig(struct vmem *vmem, vm_size_t size, int flags, vm_paddr_t low,
+    vm_paddr_t high, u_long alignment, vm_paddr_t boundary,
+    vm_memattr_t memattr)
 {
-
-	(void) vm_map_remove(map, trunc_page(addr), round_page(addr + size));
+	vm_object_t object = vmem == kmem_arena ? kmem_object : kernel_object;
+	vm_offset_t addr, tmp;
+	vm_ooffset_t offset;
+	vm_page_t end_m, m;
+	int pflags, tries;
+ 
+	size = round_page(size);
+	if (vmem_alloc(vmem, size, flags | M_BESTFIT, &addr))
+		return (0);
+	offset = addr - VM_MIN_KERNEL_ADDRESS;
+	pflags = malloc2vm_flags(flags) | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED;
+	VM_OBJECT_WLOCK(object);
+	tries = 0;
+retry:
+	m = vm_page_alloc_contig(object, OFF_TO_IDX(offset), pflags,
+	    atop(size), low, high, alignment, boundary, memattr);
+	if (m == NULL) {
+		VM_OBJECT_WUNLOCK(object);
+		if (tries < ((flags & M_NOWAIT) != 0 ? 1 : 3)) {
+			vm_pageout_grow_cache(tries, low, high);
+			VM_OBJECT_WLOCK(object);
+			tries++;
+			goto retry;
+		}
+		vmem_free(vmem, addr, size);
+		return (0);
+	}
+	end_m = m + atop(size);
+	tmp = addr;
+	for (; m < end_m; m++) {
+		if ((flags & M_ZERO) && (m->flags & PG_ZERO) == 0)
+			pmap_zero_page(m);
+		m->valid = VM_PAGE_BITS_ALL;
+		pmap_enter(kernel_pmap, tmp, m, VM_PROT_ALL,
+		    VM_PROT_ALL | PMAP_ENTER_WIRED, 0);
+		tmp += PAGE_SIZE;
+	}
+	VM_OBJECT_WUNLOCK(object);
+	return (addr);
 }
 
 /*
@@ -236,8 +275,8 @@
 	size = round_page(size);
 
 	*min = vm_map_min(parent);
-	ret = vm_map_find(parent, NULL, 0, min, size, superpage_align ?
-	    VMFS_ALIGNED_SPACE : VMFS_ANY_SPACE, VM_PROT_ALL, VM_PROT_ALL,
+	ret = vm_map_find(parent, NULL, 0, min, size, 0, superpage_align ?
+	    VMFS_SUPER_SPACE : VMFS_ANY_SPACE, VM_PROT_ALL, VM_PROT_ALL,
 	    MAP_ACC_NO_CHARGE);
 	if (ret != KERN_SUCCESS)
 		panic("kmem_suballoc: bad status return of %d", ret);
@@ -253,65 +292,25 @@
 /*
  *	kmem_malloc:
  *
- * 	Allocate wired-down memory in the kernel's address map for the higher
- * 	level kernel memory allocator (kern/kern_malloc.c).  We cannot use
- * 	kmem_alloc() because we may need to allocate memory at interrupt
- * 	level where we cannot block (canwait == FALSE).
- *
- * 	This routine has its own private kernel submap (kmem_map) and object
- * 	(kmem_object).  This, combined with the fact that only malloc uses
- * 	this routine, ensures that we will never block in map or object waits.
- *
- * 	We don't worry about expanding the map (adding entries) since entries
- * 	for wired maps are statically allocated.
- *
- *	`map' is ONLY allowed to be kmem_map or one of the mbuf submaps to
- *	which we never free.
+ *	Allocate wired-down pages in the kernel's address space.
  */
 vm_offset_t
-kmem_malloc(map, size, flags)
-	vm_map_t map;
-	vm_size_t size;
-	int flags;
+kmem_malloc(struct vmem *vmem, vm_size_t size, int flags)
 {
 	vm_offset_t addr;
-	int i, rv;
+	int rv;
 
 	size = round_page(size);
-	addr = vm_map_min(map);
+	if (vmem_alloc(vmem, size, flags | M_BESTFIT, &addr))
+		return (0);
 
-	/*
-	 * Locate sufficient space in the map.  This will give us the final
-	 * virtual address for the new memory, and thus will tell us the
-	 * offset within the kernel map.
-	 */
-	vm_map_lock(map);
-	if (vm_map_findspace(map, vm_map_min(map), size, &addr)) {
-		vm_map_unlock(map);
-                if ((flags & M_NOWAIT) == 0) {
-			for (i = 0; i < 8; i++) {
-				EVENTHANDLER_INVOKE(vm_lowmem, 0);
-				uma_reclaim();
-				vm_map_lock(map);
-				if (vm_map_findspace(map, vm_map_min(map),
-				    size, &addr) == 0) {
-					break;
-				}
-				vm_map_unlock(map);
-				tsleep(&i, 0, "nokva", (hz / 4) * (i + 1));
-			}
-			if (i == 8) {
-				panic("kmem_malloc(%ld): kmem_map too small: %ld total allocated",
-				    (long)size, (long)map->size);
-			}
-		} else {
-			return (0);
-		}
+	rv = kmem_back((vmem == kmem_arena) ? kmem_object : kernel_object,
+	    addr, size, flags);
+	if (rv != KERN_SUCCESS) {
+		vmem_free(vmem, addr, size);
+		return (0);
 	}
-
-	rv = kmem_back(map, addr, size, flags);
-	vm_map_unlock(map);
-	return (rv == KERN_SUCCESS ? addr : 0);
+	return (addr);
 }
 
 /*
@@ -320,45 +319,22 @@
  *	Allocate physical pages for the specified virtual address range.
  */
 int
-kmem_back(vm_map_t map, vm_offset_t addr, vm_size_t size, int flags)
+kmem_back(vm_object_t object, vm_offset_t addr, vm_size_t size, int flags)
 {
 	vm_offset_t offset, i;
-	vm_map_entry_t entry;
 	vm_page_t m;
 	int pflags;
-	boolean_t found;
 
-	KASSERT(vm_map_locked(map), ("kmem_back: map %p is not locked", map));
+	KASSERT(object == kmem_object || object == kernel_object,
+	    ("kmem_back: only supports kernel objects."));
+
 	offset = addr - VM_MIN_KERNEL_ADDRESS;
-	vm_object_reference(kmem_object);
-	vm_map_insert(map, kmem_object, offset, addr, addr + size,
-	    VM_PROT_ALL, VM_PROT_ALL, 0);
+	pflags = malloc2vm_flags(flags) | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED;
 
-	/*
-	 * Assert: vm_map_insert() will never be able to extend the
-	 * previous entry so vm_map_lookup_entry() will find a new
-	 * entry exactly corresponding to this address range and it
-	 * will have wired_count == 0.
-	 */
-	found = vm_map_lookup_entry(map, addr, &entry);
-	KASSERT(found && entry->start == addr && entry->end == addr + size &&
-	    entry->wired_count == 0 && (entry->eflags & MAP_ENTRY_IN_TRANSITION)
-	    == 0, ("kmem_back: entry not found or misaligned"));
-
-	if ((flags & (M_NOWAIT|M_USE_RESERVE)) == M_NOWAIT)
-		pflags = VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED;
-	else
-		pflags = VM_ALLOC_SYSTEM | VM_ALLOC_WIRED;
-
-	if (flags & M_ZERO)
-		pflags |= VM_ALLOC_ZERO;
-	if (flags & M_NODUMP)
-		pflags |= VM_ALLOC_NODUMP;
-
-	VM_OBJECT_LOCK(kmem_object);
+	VM_OBJECT_WLOCK(object);
 	for (i = 0; i < size; i += PAGE_SIZE) {
 retry:
-		m = vm_page_alloc(kmem_object, OFF_TO_IDX(offset + i), pflags);
+		m = vm_page_alloc(object, OFF_TO_IDX(offset + i), pflags);
 
 		/*
 		 * Ran out of space, free everything up and return. Don't need
@@ -366,80 +342,75 @@
 		 * aren't on any queues.
 		 */
 		if (m == NULL) {
+			VM_OBJECT_WUNLOCK(object);
 			if ((flags & M_NOWAIT) == 0) {
-				VM_OBJECT_UNLOCK(kmem_object);
-				entry->eflags |= MAP_ENTRY_IN_TRANSITION;
-				vm_map_unlock(map);
 				VM_WAIT;
-				vm_map_lock(map);
-				KASSERT(
-(entry->eflags & (MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_NEEDS_WAKEUP)) ==
-				    MAP_ENTRY_IN_TRANSITION,
-				    ("kmem_back: volatile entry"));
-				entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
-				VM_OBJECT_LOCK(kmem_object);
+				VM_OBJECT_WLOCK(object);
 				goto retry;
 			}
-			/* 
-			 * Free the pages before removing the map entry.
-			 * They are already marked busy.  Calling
-			 * vm_map_delete before the pages has been freed or
-			 * unbusied will cause a deadlock.
-			 */
-			while (i != 0) {
-				i -= PAGE_SIZE;
-				m = vm_page_lookup(kmem_object,
-						   OFF_TO_IDX(offset + i));
-				vm_page_unwire(m, 0);
-				vm_page_free(m);
-			}
-			VM_OBJECT_UNLOCK(kmem_object);
-			vm_map_delete(map, addr, addr + size);
+			kmem_unback(object, addr, i);
 			return (KERN_NO_SPACE);
 		}
 		if (flags & M_ZERO && (m->flags & PG_ZERO) == 0)
 			pmap_zero_page(m);
-		m->valid = VM_PAGE_BITS_ALL;
 		KASSERT((m->oflags & VPO_UNMANAGED) != 0,
 		    ("kmem_malloc: page %p is managed", m));
+		m->valid = VM_PAGE_BITS_ALL;
+		pmap_enter(kernel_pmap, addr + i, m, VM_PROT_ALL,
+		    VM_PROT_ALL | PMAP_ENTER_WIRED, 0);
 	}
-	VM_OBJECT_UNLOCK(kmem_object);
+	VM_OBJECT_WUNLOCK(object);
 
-	/*
-	 * Mark map entry as non-pageable.  Repeat the assert.
-	 */
-	KASSERT(entry->start == addr && entry->end == addr + size &&
-	    entry->wired_count == 0,
-	    ("kmem_back: entry not found or misaligned after allocation"));
-	entry->wired_count = 1;
+	return (KERN_SUCCESS);
+}
 
-	/*
-	 * At this point, the kmem_object must be unlocked because
-	 * vm_map_simplify_entry() calls vm_object_deallocate(), which
-	 * locks the kmem_object.
-	 */
-	vm_map_simplify_entry(map, entry);
+/*
+ *	kmem_unback:
+ *
+ *	Unmap and free the physical pages underlying the specified virtual
+ *	address range.
+ *
+ *	A physical page must exist within the specified object at each index
+ *	that is being unmapped.
+ */
+void
+kmem_unback(vm_object_t object, vm_offset_t addr, vm_size_t size)
+{
+	vm_page_t m;
+	vm_offset_t i, offset;
 
-	/*
-	 * Loop thru pages, entering them in the pmap.
-	 */
-	VM_OBJECT_LOCK(kmem_object);
+	KASSERT(object == kmem_object || object == kernel_object,
+	    ("kmem_unback: only supports kernel objects."));
+
+	pmap_remove(kernel_pmap, addr, addr + size);
+	offset = addr - VM_MIN_KERNEL_ADDRESS;
+	VM_OBJECT_WLOCK(object);
 	for (i = 0; i < size; i += PAGE_SIZE) {
-		m = vm_page_lookup(kmem_object, OFF_TO_IDX(offset + i));
-		/*
-		 * Because this is kernel_pmap, this call will not block.
-		 */
-		pmap_enter(kernel_pmap, addr + i, VM_PROT_ALL, m, VM_PROT_ALL,
-		    TRUE);
-		vm_page_wakeup(m);
+		m = vm_page_lookup(object, OFF_TO_IDX(offset + i));
+		vm_page_unwire(m, 0);
+		vm_page_free(m);
 	}
-	VM_OBJECT_UNLOCK(kmem_object);
+	VM_OBJECT_WUNLOCK(object);
+}
 
-	return (KERN_SUCCESS);
+/*
+ *	kmem_free:
+ *
+ *	Free memory allocated with kmem_malloc.  The size must match the
+ *	original allocation.
+ */
+void
+kmem_free(struct vmem *vmem, vm_offset_t addr, vm_size_t size)
+{
+
+	size = round_page(size);
+	kmem_unback((vmem == kmem_arena) ? kmem_object : kernel_object,
+	    addr, size);
+	vmem_free(vmem, addr, size);
 }
 
 /*
- *	kmem_alloc_wait:
+ *	kmap_alloc_wait:
  *
  *	Allocates pageable memory from a sub-map of the kernel.  If the submap
  *	has no room, the caller sleeps waiting for more memory in the submap.
@@ -447,9 +418,7 @@
  *	This routine may block.
  */
 vm_offset_t
-kmem_alloc_wait(map, size)
-	vm_map_t map;
-	vm_size_t size;
+kmap_alloc_wait(vm_map_t map, vm_size_t size)
 {
 	vm_offset_t addr;
 
@@ -481,16 +450,13 @@
 }
 
 /*
- *	kmem_free_wakeup:
+ *	kmap_free_wakeup:
  *
  *	Returns memory to a submap of the kernel, and wakes up any processes
  *	waiting for memory in that map.
  */
 void
-kmem_free_wakeup(map, addr, size)
-	vm_map_t map;
-	vm_offset_t addr;
-	vm_size_t size;
+kmap_free_wakeup(vm_map_t map, vm_offset_t addr, vm_size_t size)
 {
 
 	vm_map_lock(map);
@@ -502,12 +468,11 @@
 	vm_map_unlock(map);
 }
 
-static void
+void
 kmem_init_zero_region(void)
 {
 	vm_offset_t addr, i;
 	vm_page_t m;
-	int error;
 
 	/*
 	 * Map a single physical page of zeros to a larger virtual range.
@@ -514,7 +479,7 @@
 	 * This requires less looping in places that want large amounts of
 	 * zeros, while not using much more physical resources.
 	 */
-	addr = kmem_alloc_nofault(kernel_map, ZERO_REGION_SIZE);
+	addr = kva_alloc(ZERO_REGION_SIZE);
 	m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
 	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
 	if ((m->flags & PG_ZERO) == 0)
@@ -521,9 +486,7 @@
 		pmap_zero_page(m);
 	for (i = 0; i < ZERO_REGION_SIZE; i += PAGE_SIZE)
 		pmap_qenter(addr + i, &m, 1);
-	error = vm_map_protect(kernel_map, addr, addr + ZERO_REGION_SIZE,
-	    VM_PROT_READ, TRUE);
-	KASSERT(error == 0, ("error=%d", error));
+	pmap_protect(kernel_pmap, addr, addr + ZERO_REGION_SIZE, VM_PROT_READ);
 
 	zero_region = (const void *)addr;
 }
@@ -537,8 +500,7 @@
  *	`start' as allocated, and the range between `start' and `end' as free.
  */
 void
-kmem_init(start, end)
-	vm_offset_t start, end;
+kmem_init(vm_offset_t start, vm_offset_t end)
 {
 	vm_map_t m;
 
@@ -556,8 +518,6 @@
 	    start, VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
 	/* ... and ending with the completion of the above `insert' */
 	vm_map_unlock(m);
-
-	kmem_init_zero_region();
 }
 
 #ifdef DIAGNOSTIC
@@ -574,11 +534,13 @@
 	error = sysctl_handle_int(oidp, &i, 0, req);
 	if (error)
 		return (error);
-	if (i)	 
-		EVENTHANDLER_INVOKE(vm_lowmem, 0);
+	if ((i & ~(VM_LOW_KMEM | VM_LOW_PAGES)) != 0)
+		return (EINVAL);
+	if (i != 0)
+		EVENTHANDLER_INVOKE(vm_lowmem, i);
 	return (0);
 }
 
 SYSCTL_PROC(_debug, OID_AUTO, vm_lowmem, CTLTYPE_INT | CTLFLAG_RW, 0, 0,
-    debug_vm_lowmem, "I", "set to trigger vm_lowmem event");
+    debug_vm_lowmem, "I", "set to trigger vm_lowmem event with given flags");
 #endif
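
kmem_alloc_attr() and kmem_alloc_contig() above share a bounded
retry-with-reclaim policy: when vm_page_alloc_contig() fails, they call
vm_pageout_grow_cache() and retry, once for M_NOWAIT callers and up to three
times for sleepable ones, before giving the KVA back.  A compact userland
sketch of that control flow; allocate() and reclaim() are stand-ins for the
page allocator and the reclaim pass:

/*
 * Bounded retry-with-reclaim loop, shaped like the "tries" logic in
 * kmem_alloc_attr()/kmem_alloc_contig().
 */
#include <stdbool.h>
#include <stdio.h>

#define M_NOWAIT	0x1		/* illustrative flag value */

static bool
allocate(void)
{
	return (false);		/* pretend the free lists are empty */
}

static void
reclaim(int tries)
{
	printf("reclaim pass %d\n", tries);
}

static bool
alloc_with_retries(int flags)
{
	int tries = 0;

	for (;;) {
		if (allocate())
			return (true);
		/* One reclaim pass for M_NOWAIT callers, three otherwise. */
		if (tries >= ((flags & M_NOWAIT) != 0 ? 1 : 3))
			return (false);
		reclaim(tries);
		tries++;
	}
}

int
main(void)
{
	printf("M_NOWAIT result: %d\n", alloc_with_retries(M_NOWAIT));
	printf("M_WAITOK result: %d\n", alloc_with_retries(0));
	return (0);
}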

Modified: trunk/sys/vm/vm_kern.h
===================================================================
--- trunk/sys/vm/vm_kern.h	2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm_kern.h	2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
@@ -57,7 +58,7 @@
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  *
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/vm/vm_kern.h 254307 2013-08-13 22:40:43Z jeff $
  */
 
 #ifndef _VM_VM_KERN_H_
@@ -64,11 +65,15 @@
 #define _VM_VM_KERN_H_ 1
 
 /* Kernel memory management definitions. */
-extern vm_map_t buffer_map;
 extern vm_map_t kernel_map;
-extern vm_map_t kmem_map;
 extern vm_map_t exec_map;
 extern vm_map_t pipe_map;
+extern struct vmem *kernel_arena;
+extern struct vmem *kmem_arena;
+extern struct vmem *buffer_arena;
+extern struct vmem *transient_arena;
+extern struct vmem *memguard_arena;
+extern vm_offset_t swapbkva;
 extern u_long vm_kmem_size;
 
 #endif				/* _VM_VM_KERN_H_ */
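
For consumers, the visible change in this header is that the old submaps
(kmem_map, buffer_map) give way to vmem arenas, so wired allocations now name
an arena.  A hedged sketch of a caller using the kmem_malloc()/kmem_free()
KPI shown in vm_kern.c above; the scratch_* helpers and the region size are
invented for illustration, and error handling is minimal:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/malloc.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>

#define	SCRATCH_SIZE	(16 * PAGE_SIZE)

static vm_offset_t scratch;

/* Allocate a wired, zeroed scratch region from the kernel arena. */
static int
scratch_init(void)
{

	scratch = kmem_malloc(kernel_arena, SCRATCH_SIZE, M_WAITOK | M_ZERO);
	return (scratch != 0 ? 0 : ENOMEM);
}

/* Release the scratch region; the size must match the allocation. */
static void
scratch_fini(void)
{

	if (scratch != 0) {
		kmem_free(kernel_arena, scratch, SCRATCH_SIZE);
		scratch = 0;
	}
}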

Modified: trunk/sys/vm/vm_map.c
===================================================================
--- trunk/sys/vm/vm_map.c	2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm_map.c	2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
@@ -63,7 +64,7 @@
  */
 
 #include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/vm/vm_map.c 326523 2017-12-04 10:05:59Z kib $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -77,6 +78,7 @@
 #include <sys/vnode.h>
 #include <sys/racct.h>
 #include <sys/resourcevar.h>
+#include <sys/rwlock.h>
 #include <sys/file.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
@@ -125,19 +127,24 @@
 static uma_zone_t kmapentzone;
 static uma_zone_t mapzone;
 static uma_zone_t vmspace_zone;
-static struct vm_object kmapentobj;
 static int vmspace_zinit(void *mem, int size, int flags);
-static void vmspace_zfini(void *mem, int size);
 static int vm_map_zinit(void *mem, int ize, int flags);
-static void vm_map_zfini(void *mem, int size);
 static void _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min,
     vm_offset_t max);
 static void vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map);
 static void vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry);
+static void vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry);
+static int vm_map_growstack(vm_map_t map, vm_offset_t addr,
+    vm_map_entry_t gap_entry);
 #ifdef INVARIANTS
 static void vm_map_zdtor(void *mem, int size, void *arg);
 static void vmspace_zdtor(void *mem, int size, void *arg);
 #endif
+static int vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos,
+    vm_size_t max_ssize, vm_size_t growsize, vm_prot_t prot, vm_prot_t max,
+    int cow);
+static void vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry,
+    vm_offset_t failed_addr);
 
 #define	ENTRY_CHARGED(e) ((e)->cred != NULL || \
     ((e)->object.vm_object != NULL && (e)->object.vm_object->cred != NULL && \
@@ -192,25 +199,22 @@
 #else
 	    NULL,
 #endif
-	    vm_map_zinit, vm_map_zfini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+	    vm_map_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	uma_prealloc(mapzone, MAX_KMAP);
 	kmapentzone = uma_zcreate("KMAP ENTRY", sizeof(struct vm_map_entry),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
 	    UMA_ZONE_MTXCLASS | UMA_ZONE_VM);
-	uma_prealloc(kmapentzone, MAX_KMAPENT);
 	mapentzone = uma_zcreate("MAP ENTRY", sizeof(struct vm_map_entry),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+	vmspace_zone = uma_zcreate("VMSPACE", sizeof(struct vmspace), NULL,
+#ifdef INVARIANTS
+	    vmspace_zdtor,
+#else
+	    NULL,
+#endif
+	    vmspace_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 }
 
-static void
-vmspace_zfini(void *mem, int size)
-{
-	struct vmspace *vm;
-
-	vm = (struct vmspace *)mem;
-	vm_map_zfini(&vm->vm_map, sizeof(vm->vm_map));
-}
-
 static int
 vmspace_zinit(void *mem, int size, int flags)
 {
@@ -220,19 +224,10 @@
 
 	vm->vm_map.pmap = NULL;
 	(void)vm_map_zinit(&vm->vm_map, sizeof(vm->vm_map), flags);
+	PMAP_LOCK_INIT(vmspace_pmap(vm));
 	return (0);
 }
 
-static void
-vm_map_zfini(void *mem, int size)
-{
-	vm_map_t map;
-
-	map = (vm_map_t)mem;
-	mtx_destroy(&map->system_mtx);
-	sx_destroy(&map->lock);
-}
-
 static int
 vm_map_zinit(void *mem, int size, int flags)
 {
@@ -239,8 +234,7 @@
 	vm_map_t map;
 
 	map = (vm_map_t)mem;
-	map->nentries = 0;
-	map->size = 0;
+	memset(map, 0, sizeof(*map));
 	mtx_init(&map->system_mtx, "vm map (system)", NULL, MTX_DEF | MTX_DUPOK);
 	sx_init(&map->lock, "vm map (user)");
 	return (0);
@@ -274,15 +268,22 @@
 /*
  * Allocate a vmspace structure, including a vm_map and pmap,
  * and initialize those structures.  The refcnt is set to 1.
+ *
+ * If 'pinit' is NULL then the embedded pmap is initialized via pmap_pinit().
  */
 struct vmspace *
-vmspace_alloc(min, max)
-	vm_offset_t min, max;
+vmspace_alloc(vm_offset_t min, vm_offset_t max, pmap_pinit_t pinit)
 {
 	struct vmspace *vm;
 
 	vm = uma_zalloc(vmspace_zone, M_WAITOK);
-	if (vm->vm_map.pmap == NULL && !pmap_pinit(vmspace_pmap(vm))) {
+
+	KASSERT(vm->vm_map.pmap == NULL, ("vm_map.pmap must be NULL"));
+
+	if (pinit == NULL)
+		pinit = &pmap_pinit;
+
+	if (!pinit(vmspace_pmap(vm))) {
 		uma_zfree(vmspace_zone, vm);
 		return (NULL);
 	}
@@ -300,26 +301,11 @@
 	return (vm);
 }
 
-void
-vm_init2(void)
-{
-	uma_zone_set_obj(kmapentzone, &kmapentobj, lmin(cnt.v_page_count,
-	    (VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) / PAGE_SIZE) / 8 +
-	     maxproc * 2 + maxfiles);
-	vmspace_zone = uma_zcreate("VMSPACE", sizeof(struct vmspace), NULL,
-#ifdef INVARIANTS
-	    vmspace_zdtor,
-#else
-	    NULL,
-#endif
-	    vmspace_zinit, vmspace_zfini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
-}
-
+#ifdef RACCT
 static void
 vmspace_container_reset(struct proc *p)
 {
 
-#ifdef RACCT
 	PROC_LOCK(p);
 	racct_set(p, RACCT_DATA, 0);
 	racct_set(p, RACCT_STACK, 0);
@@ -327,8 +313,8 @@
 	racct_set(p, RACCT_MEMLOCK, 0);
 	racct_set(p, RACCT_VMEM, 0);
 	PROC_UNLOCK(p);
+}
 #endif
-}
 
 static inline void
 vmspace_dofree(struct vmspace *vm)
@@ -359,6 +345,9 @@
 vmspace_free(struct vmspace *vm)
 {
 
+	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
+	    "vmspace_free() called with non-sleepable lock held");
+
 	if (vm->vm_refcnt == 0)
 		panic("vmspace_free: attempt to free already freed vmspace");
 
@@ -427,7 +416,10 @@
 		pmap_activate(td);
 		vmspace_dofree(vm);
 	}
-	vmspace_container_reset(p);
+#ifdef RACCT
+	if (racct_enable)
+		vmspace_container_reset(p);
+#endif
 }
 
 /* Acquire reference to vmspace owned by another process. */
@@ -960,6 +952,15 @@
 	    "vm_map_entry_link: map %p, nentries %d, entry %p, after %p", map,
 	    map->nentries, entry, after_where);
 	VM_MAP_ASSERT_LOCKED(map);
+	KASSERT(after_where == &map->header ||
+	    after_where->end <= entry->start,
+	    ("vm_map_entry_link: prev end %jx new start %jx overlap",
+	    (uintmax_t)after_where->end, (uintmax_t)entry->start));
+	KASSERT(after_where->next == &map->header ||
+	    entry->end <= after_where->next->start,
+	    ("vm_map_entry_link: new end %jx next start %jx overlap",
+	    (uintmax_t)entry->end, (uintmax_t)after_where->next->start));
+
 	map->nentries++;
 	entry->prev = after_where;
 	entry->next = after_where->next;
@@ -1132,24 +1133,26 @@
  */
 int
 vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
-	      vm_offset_t start, vm_offset_t end, vm_prot_t prot, vm_prot_t max,
-	      int cow)
+    vm_offset_t start, vm_offset_t end, vm_prot_t prot, vm_prot_t max, int cow)
 {
-	vm_map_entry_t new_entry;
-	vm_map_entry_t prev_entry;
-	vm_map_entry_t temp_entry;
+	vm_map_entry_t new_entry, prev_entry, temp_entry;
+	struct ucred *cred;
 	vm_eflags_t protoeflags;
-	struct ucred *cred;
 	vm_inherit_t inheritance;
-	boolean_t charge_prev_obj;
 
 	VM_MAP_ASSERT_LOCKED(map);
+	KASSERT((object != kmem_object && object != kernel_object) ||
+	    (cow & MAP_COPY_ON_WRITE) == 0,
+	    ("vm_map_insert: kmem or kernel object and COW"));
+	KASSERT(object == NULL || (cow & MAP_NOFAULT) == 0,
+	    ("vm_map_insert: paradoxical MAP_NOFAULT request"));
+	KASSERT((prot & ~max) == 0,
+	    ("prot %#x is not subset of max_prot %#x", prot, max));
 
 	/*
 	 * Check that the start and end points are not bogus.
 	 */
-	if ((start < map->min_offset) || (end > map->max_offset) ||
-	    (start >= end))
+	if (start < map->min_offset || end > map->max_offset || start >= end)
 		return (KERN_INVALID_ADDRESS);
 
 	/*
@@ -1164,28 +1167,34 @@
 	/*
 	 * Assert that the next entry doesn't overlap the end point.
 	 */
-	if ((prev_entry->next != &map->header) &&
-	    (prev_entry->next->start < end))
+	if (prev_entry->next != &map->header && prev_entry->next->start < end)
 		return (KERN_NO_SPACE);
 
+	if ((cow & MAP_CREATE_GUARD) != 0 && (object != NULL ||
+	    max != VM_PROT_NONE))
+		return (KERN_INVALID_ARGUMENT);
+
 	protoeflags = 0;
-	charge_prev_obj = FALSE;
-
 	if (cow & MAP_COPY_ON_WRITE)
-		protoeflags |= MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY;
-
-	if (cow & MAP_NOFAULT) {
+		protoeflags |= MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY;
+	if (cow & MAP_NOFAULT)
 		protoeflags |= MAP_ENTRY_NOFAULT;
-
-		KASSERT(object == NULL,
-			("vm_map_insert: paradoxical MAP_NOFAULT request"));
-	}
 	if (cow & MAP_DISABLE_SYNCER)
 		protoeflags |= MAP_ENTRY_NOSYNC;
 	if (cow & MAP_DISABLE_COREDUMP)
 		protoeflags |= MAP_ENTRY_NOCOREDUMP;
+	if (cow & MAP_STACK_GROWS_DOWN)
+		protoeflags |= MAP_ENTRY_GROWS_DOWN;
+	if (cow & MAP_STACK_GROWS_UP)
+		protoeflags |= MAP_ENTRY_GROWS_UP;
 	if (cow & MAP_VN_WRITECOUNT)
 		protoeflags |= MAP_ENTRY_VN_WRITECNT;
+	if ((cow & MAP_CREATE_GUARD) != 0)
+		protoeflags |= MAP_ENTRY_GUARD;
+	if ((cow & MAP_CREATE_STACK_GAP_DN) != 0)
+		protoeflags |= MAP_ENTRY_STACK_GAP_DN;
+	if ((cow & MAP_CREATE_STACK_GAP_UP) != 0)
+		protoeflags |= MAP_ENTRY_STACK_GAP_UP;
 	if (cow & MAP_INHERIT_SHARE)
 		inheritance = VM_INHERIT_SHARE;
 	else
@@ -1192,23 +1201,17 @@
 		inheritance = VM_INHERIT_DEFAULT;
 
 	cred = NULL;
-	KASSERT((object != kmem_object && object != kernel_object) ||
-	    ((object == kmem_object || object == kernel_object) &&
-		!(protoeflags & MAP_ENTRY_NEEDS_COPY)),
-	    ("kmem or kernel object and cow"));
-	if (cow & (MAP_ACC_NO_CHARGE | MAP_NOFAULT))
+	if ((cow & (MAP_ACC_NO_CHARGE | MAP_NOFAULT | MAP_CREATE_GUARD)) != 0)
 		goto charged;
 	if ((cow & MAP_ACC_CHARGED) || ((prot & VM_PROT_WRITE) &&
 	    ((protoeflags & MAP_ENTRY_NEEDS_COPY) || object == NULL))) {
 		if (!(cow & MAP_ACC_CHARGED) && !swap_reserve(end - start))
 			return (KERN_RESOURCE_SHORTAGE);
-		KASSERT(object == NULL || (protoeflags & MAP_ENTRY_NEEDS_COPY) ||
+		KASSERT(object == NULL ||
+		    (protoeflags & MAP_ENTRY_NEEDS_COPY) != 0 ||
 		    object->cred == NULL,
-		    ("OVERCOMMIT: vm_map_insert o %p", object));
+		    ("overcommit: vm_map_insert o %p", object));
 		cred = curthread->td_ucred;
-		crhold(cred);
-		if (object == NULL && !(protoeflags & MAP_ENTRY_NEEDS_COPY))
-			charge_prev_obj = TRUE;
 	}
 
 charged:
@@ -1223,37 +1226,35 @@
 		 * reference counting is insufficient to recognize
 		 * aliases with precision.)
 		 */
-		VM_OBJECT_LOCK(object);
+		VM_OBJECT_WLOCK(object);
 		if (object->ref_count > 1 || object->shadow_count != 0)
 			vm_object_clear_flag(object, OBJ_ONEMAPPING);
-		VM_OBJECT_UNLOCK(object);
-	}
-	else if ((prev_entry != &map->header) &&
-		 (prev_entry->eflags == protoeflags) &&
-		 (cow & (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP)) == 0 &&
-		 (prev_entry->end == start) &&
-		 (prev_entry->wired_count == 0) &&
-		 (prev_entry->cred == cred ||
-		  (prev_entry->object.vm_object != NULL &&
-		   (prev_entry->object.vm_object->cred == cred))) &&
-		   vm_object_coalesce(prev_entry->object.vm_object,
-		       prev_entry->offset,
-		       (vm_size_t)(prev_entry->end - prev_entry->start),
-		       (vm_size_t)(end - prev_entry->end), charge_prev_obj)) {
+		VM_OBJECT_WUNLOCK(object);
+	} else if (prev_entry != &map->header &&
+	    prev_entry->eflags == protoeflags &&
+	    (cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 &&
+	    prev_entry->end == start && prev_entry->wired_count == 0 &&
+	    (prev_entry->cred == cred ||
+	    (prev_entry->object.vm_object != NULL &&
+	    prev_entry->object.vm_object->cred == cred)) &&
+	    vm_object_coalesce(prev_entry->object.vm_object,
+	    prev_entry->offset,
+	    (vm_size_t)(prev_entry->end - prev_entry->start),
+	    (vm_size_t)(end - prev_entry->end), cred != NULL &&
+	    (protoeflags & MAP_ENTRY_NEEDS_COPY) == 0)) {
 		/*
 		 * We were able to extend the object.  Determine if we
 		 * can extend the previous map entry to include the
 		 * new range as well.
 		 */
-		if ((prev_entry->inheritance == inheritance) &&
-		    (prev_entry->protection == prot) &&
-		    (prev_entry->max_protection == max)) {
-			map->size += (end - prev_entry->end);
+		if (prev_entry->inheritance == inheritance &&
+		    prev_entry->protection == prot &&
+		    prev_entry->max_protection == max) {
+			if ((prev_entry->eflags & MAP_ENTRY_GUARD) == 0)
+				map->size += end - prev_entry->end;
 			prev_entry->end = end;
 			vm_map_entry_resize_free(map, prev_entry);
 			vm_map_simplify_entry(map, prev_entry);
-			if (cred != NULL)
-				crfree(cred);
 			return (KERN_SUCCESS);
 		}
 
@@ -1265,23 +1266,18 @@
 		 */
 		object = prev_entry->object.vm_object;
 		offset = prev_entry->offset +
-			(prev_entry->end - prev_entry->start);
+		    (prev_entry->end - prev_entry->start);
 		vm_object_reference(object);
 		if (cred != NULL && object != NULL && object->cred != NULL &&
 		    !(prev_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
 			/* Object already accounts for this uid. */
-			crfree(cred);
 			cred = NULL;
 		}
 	}
+	if (cred != NULL)
+		crhold(cred);
 
 	/*
-	 * NOTE: if conditionals fail, object can be NULL here.  This occurs
-	 * in things like the buffer map where we manage kva but do not manage
-	 * backing objects.
-	 */
-
-	/*
 	 * Create a new entry
 	 */
 	new_entry = vm_map_entry_create(map);
@@ -1292,17 +1288,17 @@
 	new_entry->eflags = protoeflags;
 	new_entry->object.vm_object = object;
 	new_entry->offset = offset;
-	new_entry->avail_ssize = 0;
 
 	new_entry->inheritance = inheritance;
 	new_entry->protection = prot;
 	new_entry->max_protection = max;
 	new_entry->wired_count = 0;
+	new_entry->wiring_thread = NULL;
 	new_entry->read_ahead = VM_FAULT_READ_AHEAD_INIT;
 	new_entry->next_read = OFF_TO_IDX(offset);
 
 	KASSERT(cred == NULL || !ENTRY_CHARGED(new_entry),
-	    ("OVERCOMMIT: vm_map_insert leaks vm_map %p", new_entry));
+	    ("overcommit: vm_map_insert leaks vm_map %p", new_entry));
 	new_entry->cred = cred;
 
 	/*
@@ -1309,20 +1305,20 @@
 	 * Insert the new entry into the list
 	 */
 	vm_map_entry_link(map, prev_entry, new_entry);
-	map->size += new_entry->end - new_entry->start;
+	if ((new_entry->eflags & MAP_ENTRY_GUARD) == 0)
+		map->size += new_entry->end - new_entry->start;
 
 	/*
-	 * It may be possible to merge the new entry with the next and/or
-	 * previous entries.  However, due to MAP_STACK_* being a hack, a
-	 * panic can result from merging such entries.
+	 * Try to coalesce the new entry with both the previous and next
+	 * entries in the list.  Previously, we only attempted to coalesce
+	 * with the previous entry when object is NULL.  Here, we handle the
+	 * other cases, which are less common.
 	 */
-	if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0)
-		vm_map_simplify_entry(map, new_entry);
+	vm_map_simplify_entry(map, new_entry);
 
-	if (cow & (MAP_PREFAULT|MAP_PREFAULT_PARTIAL)) {
-		vm_map_pmap_enter(map, start, prot,
-				    object, OFF_TO_IDX(offset), end - start,
-				    cow & MAP_PREFAULT_PARTIAL);
+	if ((cow & (MAP_PREFAULT | MAP_PREFAULT_PARTIAL)) != 0) {
+		vm_map_pmap_enter(map, start, prot, object, OFF_TO_IDX(offset),
+		    end - start, cow & MAP_PREFAULT_PARTIAL);
 	}
 
 	return (KERN_SUCCESS);
@@ -1421,11 +1417,20 @@
 	int result;
 
 	end = start + length;
+	KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 ||
+	    object == NULL,
+	    ("vm_map_fixed: non-NULL backing object for stack"));
 	vm_map_lock(map);
 	VM_MAP_RANGE_CHECK(map, start, end);
-	(void) vm_map_delete(map, start, end);
-	result = vm_map_insert(map, object, offset, start, end, prot,
-	    max, cow);
+	if ((cow & MAP_CHECK_EXCL) == 0)
+		vm_map_delete(map, start, end);
+	if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) {
+		result = vm_map_stack_locked(map, start, length, sgrowsiz,
+		    prot, max, cow);
+	} else {
+		result = vm_map_insert(map, object, offset, start, end,
+		    prot, max, cow);
+	}
 	vm_map_unlock(map);
 	return (result);
 }
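
For illustration only (not part of the patch): MAP_CHECK_EXCL makes this
fixed-address path fail instead of silently deleting whatever was already
mapped at [start, start + length).  Assuming FreeBSD's userland spelling of
the flag (MAP_EXCL), a process reaches it roughly like this:

	#include <sys/mman.h>

	/*
	 * Place an anonymous mapping exactly at 'hint' without clobbering an
	 * existing mapping; returns MAP_FAILED if the range is already used.
	 */
	static void *
	map_exactly(void *hint, size_t len)
	{

		return (mmap(hint, len, PROT_READ | PROT_WRITE,
		    MAP_ANON | MAP_FIXED | MAP_EXCL, -1, 0));
	}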
@@ -1442,48 +1447,101 @@
 int
 vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
 	    vm_offset_t *addr,	/* IN/OUT */
-	    vm_size_t length, int find_space, vm_prot_t prot,
-	    vm_prot_t max, int cow)
+	    vm_size_t length, vm_offset_t max_addr, int find_space,
+	    vm_prot_t prot, vm_prot_t max, int cow)
 {
-	vm_offset_t start;
+	vm_offset_t alignment, initial_addr, start;
 	int result;
 
-	start = *addr;
+	KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 ||
+	    object == NULL,
+	    ("vm_map_find: non-NULL backing object for stack"));
+	if (find_space == VMFS_OPTIMAL_SPACE && (object == NULL ||
+	    (object->flags & OBJ_COLORED) == 0))
+		find_space = VMFS_ANY_SPACE;
+	if (find_space >> 8 != 0) {
+		KASSERT((find_space & 0xff) == 0, ("bad VMFS flags"));
+		alignment = (vm_offset_t)1 << (find_space >> 8);
+	} else
+		alignment = 0;
+	initial_addr = *addr;
+again:
+	start = initial_addr;
 	vm_map_lock(map);
 	do {
 		if (find_space != VMFS_NO_SPACE) {
-			if (vm_map_findspace(map, start, length, addr)) {
+			if (vm_map_findspace(map, start, length, addr) ||
+			    (max_addr != 0 && *addr + length > max_addr)) {
 				vm_map_unlock(map);
+				if (find_space == VMFS_OPTIMAL_SPACE) {
+					find_space = VMFS_ANY_SPACE;
+					goto again;
+				}
 				return (KERN_NO_SPACE);
 			}
 			switch (find_space) {
-			case VMFS_ALIGNED_SPACE:
+			case VMFS_SUPER_SPACE:
+			case VMFS_OPTIMAL_SPACE:
 				pmap_align_superpage(object, offset, addr,
 				    length);
 				break;
-#ifdef VMFS_TLB_ALIGNED_SPACE
-			case VMFS_TLB_ALIGNED_SPACE:
-				pmap_align_tlb(addr);
+			case VMFS_ANY_SPACE:
 				break;
-#endif
 			default:
+				if ((*addr & (alignment - 1)) != 0) {
+					*addr &= ~(alignment - 1);
+					*addr += alignment;
+				}
 				break;
 			}
 
 			start = *addr;
 		}
-		result = vm_map_insert(map, object, offset, start, start +
-		    length, prot, max, cow);
-	} while (result == KERN_NO_SPACE && (find_space == VMFS_ALIGNED_SPACE
-#ifdef VMFS_TLB_ALIGNED_SPACE
-	    || find_space == VMFS_TLB_ALIGNED_SPACE
-#endif
-	    ));
+		if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) {
+			result = vm_map_stack_locked(map, start, length,
+			    sgrowsiz, prot, max, cow);
+		} else {
+			result = vm_map_insert(map, object, offset, start,
+			    start + length, prot, max, cow);
+		}
+	} while (result == KERN_NO_SPACE && find_space != VMFS_NO_SPACE &&
+	    find_space != VMFS_ANY_SPACE);
 	vm_map_unlock(map);
 	return (result);
 }
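
For illustration only (not part of the patch): the alignment request travels
in the upper bits of 'find_space' and is decoded above as
alignment = 1 << (find_space >> 8).  Assuming the matching
VMFS_ALIGNED_SPACE(x) macro from vm_map.h, a 64 KB-aligned allocation looks
roughly like this ('size' is a placeholder):

	vm_offset_t addr;
	int rv;

	addr = vm_map_min(map);
	rv = vm_map_find(map, NULL, 0, &addr, size,
	    0,				/* max_addr: 0 means no upper bound */
	    VMFS_ALIGNED_SPACE(16),	/* 1 << 16 = 64 KB alignment */
	    VM_PROT_ALL, VM_PROT_ALL, 0);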
 
 /*
+ *	vm_map_find_min() is a variant of vm_map_find() that takes an
+ *	additional parameter (min_addr) and treats the given address
+ *	(*addr) differently.  Specifically, it treats *addr as a hint
+ *	and not as the minimum address where the mapping is created.
+ *
+ *	This function works in two phases.  First, it tries to
+ *	allocate above the hint.  If that fails and the hint is
+ *	greater than min_addr, it performs a second pass, replacing
+ *	the hint with min_addr as the minimum address for the
+ *	allocation.
+ */
+int
+vm_map_find_min(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
+    vm_offset_t *addr, vm_size_t length, vm_offset_t min_addr,
+    vm_offset_t max_addr, int find_space, vm_prot_t prot, vm_prot_t max,
+    int cow)
+{
+	vm_offset_t hint;
+	int rv;
+
+	hint = *addr;
+	for (;;) {
+		rv = vm_map_find(map, object, offset, addr, length, max_addr,
+		    find_space, prot, max, cow);
+		if (rv == KERN_SUCCESS || min_addr >= hint)
+			return (rv);
+		*addr = hint = min_addr;
+	}
+}
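
A short usage sketch (illustrative, not part of the patch); 'hint', 'lowest',
'highest', 'obj' and 'size' are placeholders:

	vm_offset_t addr;
	int rv;

	addr = hint;		/* first pass searches [hint, highest) */
	rv = vm_map_find_min(map, obj, 0, &addr, size, lowest, highest,
	    VMFS_OPTIMAL_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0);
	/*
	 * If the pass above the hint found no space and lowest < hint, the
	 * range [lowest, highest) was retried before KERN_NO_SPACE was
	 * returned.
	 */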
+
+/*
  *	vm_map_simplify_entry:
  *
  *	Simplify the given map entry by merging with either neighbor.  This
@@ -1501,7 +1559,8 @@
 	vm_map_entry_t next, prev;
 	vm_size_t prevsize, esize;
 
-	if (entry->eflags & (MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_IS_SUB_MAP))
+	if ((entry->eflags & (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP |
+	    MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_IS_SUB_MAP)) != 0)
 		return;
 
 	prev = entry->prev;
@@ -1611,7 +1670,8 @@
 	 * map.  This is a bit of a hack, but is also about the best place to
 	 * put this improvement.
 	 */
-	if (entry->object.vm_object == NULL && !map->system_map) {
+	if (entry->object.vm_object == NULL && !map->system_map &&
+	    (entry->eflags & MAP_ENTRY_GUARD) == 0) {
 		vm_object_t object;
 		object = vm_object_allocate(OBJT_DEFAULT,
 				atop(entry->end - entry->start));
@@ -1625,12 +1685,12 @@
 	} else if (entry->object.vm_object != NULL &&
 		   ((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) &&
 		   entry->cred != NULL) {
-		VM_OBJECT_LOCK(entry->object.vm_object);
+		VM_OBJECT_WLOCK(entry->object.vm_object);
 		KASSERT(entry->object.vm_object->cred == NULL,
 		    ("OVERCOMMIT: vm_entry_clip_start: both cred e %p", entry));
 		entry->object.vm_object->cred = entry->cred;
 		entry->object.vm_object->charge = entry->end - entry->start;
-		VM_OBJECT_UNLOCK(entry->object.vm_object);
+		VM_OBJECT_WUNLOCK(entry->object.vm_object);
 		entry->cred = NULL;
 	}
 
@@ -1688,7 +1748,8 @@
 	 * map.  This is a bit of a hack, but is also about the best place to
 	 * put this improvement.
 	 */
-	if (entry->object.vm_object == NULL && !map->system_map) {
+	if (entry->object.vm_object == NULL && !map->system_map &&
+	    (entry->eflags & MAP_ENTRY_GUARD) == 0) {
 		vm_object_t object;
 		object = vm_object_allocate(OBJT_DEFAULT,
 				atop(entry->end - entry->start));
@@ -1702,12 +1763,12 @@
 	} else if (entry->object.vm_object != NULL &&
 		   ((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) &&
 		   entry->cred != NULL) {
-		VM_OBJECT_LOCK(entry->object.vm_object);
+		VM_OBJECT_WLOCK(entry->object.vm_object);
 		KASSERT(entry->object.vm_object->cred == NULL,
 		    ("OVERCOMMIT: vm_entry_clip_end: both cred e %p", entry));
 		entry->object.vm_object->cred = entry->cred;
 		entry->object.vm_object->charge = entry->end - entry->start;
-		VM_OBJECT_UNLOCK(entry->object.vm_object);
+		VM_OBJECT_WUNLOCK(entry->object.vm_object);
 		entry->cred = NULL;
 	}
 
@@ -1781,7 +1842,7 @@
 }
 
 /*
- * The maximum number of pages to map
+ * The maximum number of pages to map if MAP_PREFAULT_PARTIAL is specified
  */
 #define	MAX_INIT_PT	96
 
@@ -1788,10 +1849,16 @@
 /*
  *	vm_map_pmap_enter:
  *
- *	Preload read-only mappings for the given object's resident pages into
- *	the given map.  This eliminates the soft faults on process startup and
- *	immediately after an mmap(2).  Because these are speculative mappings,
- *	cached pages are not reactivated and mapped.
+ *	Preload the specified map's pmap with mappings to the specified
+ *	object's memory-resident pages.  No further physical pages are
+ *	allocated, and no further virtual pages are retrieved from secondary
+ *	storage.  If the specified flags include MAP_PREFAULT_PARTIAL, then a
+ *	limited number of page mappings are created at the low-end of the
+ *	specified address range.  (For this purpose, a superpage mapping
+ *	counts as one page mapping.)  Otherwise, all resident pages within
+ *	the specified address range are mapped.  Because these mappings are
+ *	being created speculatively, cached pages are not reactivated and
+ *	mapped.
  */
 void
 vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
@@ -1799,30 +1866,35 @@
 {
 	vm_offset_t start;
 	vm_page_t p, p_start;
-	vm_pindex_t psize, tmpidx;
+	vm_pindex_t mask, psize, threshold, tmpidx;
 
 	if ((prot & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0 || object == NULL)
 		return;
-	VM_OBJECT_LOCK(object);
+	VM_OBJECT_RLOCK(object);
 	if (object->type == OBJT_DEVICE || object->type == OBJT_SG) {
-		pmap_object_init_pt(map->pmap, addr, object, pindex, size);
-		goto unlock_return;
+		VM_OBJECT_RUNLOCK(object);
+		VM_OBJECT_WLOCK(object);
+		if (object->type == OBJT_DEVICE || object->type == OBJT_SG) {
+			pmap_object_init_pt(map->pmap, addr, object, pindex,
+			    size);
+			VM_OBJECT_WUNLOCK(object);
+			return;
+		}
+		VM_OBJECT_LOCK_DOWNGRADE(object);
 	}
 
 	psize = atop(size);
-
-	if ((flags & MAP_PREFAULT_PARTIAL) && psize > MAX_INIT_PT &&
-	    object->resident_page_count > MAX_INIT_PT)
-		goto unlock_return;
-
 	if (psize + pindex > object->size) {
-		if (object->size < pindex)
-			goto unlock_return;
+		if (object->size < pindex) {
+			VM_OBJECT_RUNLOCK(object);
+			return;
+		}
 		psize = object->size - pindex;
 	}
 
 	start = 0;
 	p_start = NULL;
+	threshold = MAX_INIT_PT;
 
 	p = vm_page_find_least(object, pindex);
 	/*
@@ -1837,8 +1909,10 @@
 		 * don't allow an madvise to blow away our really
 		 * free pages allocating pv entries.
 		 */
-		if ((flags & MAP_PREFAULT_MADVISE) &&
-		    cnt.v_free_count < cnt.v_free_reserved) {
+		if (((flags & MAP_PREFAULT_MADVISE) != 0 &&
+		    cnt.v_free_count < cnt.v_free_reserved) ||
+		    ((flags & MAP_PREFAULT_PARTIAL) != 0 &&
+		    tmpidx >= threshold)) {
 			psize = tmpidx;
 			break;
 		}
@@ -1847,6 +1921,16 @@
 				start = addr + ptoa(tmpidx);
 				p_start = p;
 			}
+			/* Jump ahead if a superpage mapping is possible. */
+			if (p->psind > 0 && ((addr + ptoa(tmpidx)) &
+			    (pagesizes[p->psind] - 1)) == 0) {
+				mask = atop(pagesizes[p->psind]) - 1;
+				if (tmpidx + mask < psize &&
+				    vm_page_ps_is_valid(p)) {
+					p += mask;
+					threshold += mask;
+				}
+			}
 		} else if (p_start != NULL) {
 			pmap_enter_object(map->pmap, start, addr +
 			    ptoa(tmpidx), p_start, prot);
@@ -1856,8 +1940,7 @@
 	if (p_start != NULL)
 		pmap_enter_object(map->pmap, start, addr + ptoa(psize),
 		    p_start, prot);
-unlock_return:
-	VM_OBJECT_UNLOCK(object);
+	VM_OBJECT_RUNLOCK(object);
 }
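
A worked example of the superpage jump-ahead above (illustrative, not part of
the patch), assuming amd64-style page sizes:

	/*
	 * With 4 KB base pages and a 2 MB superpage (p->psind == 1):
	 *
	 *   atop(pagesizes[1]) = 512	base pages per superpage
	 *   mask               = 511
	 *
	 * When addr + ptoa(tmpidx) is 2 MB-aligned and vm_page_ps_is_valid(p)
	 * reports the whole run valid, the loop advances p by 511 entries and
	 * raises 'threshold' by the same amount, so the superpage costs only
	 * one mapping against the MAX_INIT_PT budget used for
	 * MAP_PREFAULT_PARTIAL.
	 */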
 
 /*
@@ -1877,6 +1960,9 @@
 	struct ucred *cred;
 	vm_prot_t old_prot;
 
+	if (start == end)
+		return (KERN_SUCCESS);
+
 	vm_map_lock(map);
 
 	VM_MAP_RANGE_CHECK(map, start, end);
@@ -1890,8 +1976,10 @@
 	/*
 	 * Make a first pass to check for protection violations.
 	 */
-	current = entry;
-	while ((current != &map->header) && (current->start < end)) {
+	for (current = entry; current != &map->header && current->start < end;
+	    current = current->next) {
+		if ((current->eflags & MAP_ENTRY_GUARD) != 0)
+			continue;
 		if (current->eflags & MAP_ENTRY_IS_SUB_MAP) {
 			vm_map_unlock(map);
 			return (KERN_INVALID_ARGUMENT);
@@ -1900,23 +1988,22 @@
 			vm_map_unlock(map);
 			return (KERN_PROTECTION_FAILURE);
 		}
-		current = current->next;
 	}
 
-
 	/*
 	 * Do an accounting pass for private read-only mappings that
 	 * now will do cow due to allowed write (e.g. debugger sets
 	 * breakpoint on text segment)
 	 */
-	for (current = entry; (current != &map->header) &&
-	     (current->start < end); current = current->next) {
+	for (current = entry; current != &map->header && current->start < end;
+	    current = current->next) {
 
 		vm_map_clip_end(map, current, end);
 
 		if (set_max ||
 		    ((new_prot & ~(current->protection)) & VM_PROT_WRITE) == 0 ||
-		    ENTRY_CHARGED(current)) {
+		    ENTRY_CHARGED(current) ||
+		    (current->eflags & MAP_ENTRY_GUARD) != 0) {
 			continue;
 		}
 
@@ -1933,9 +2020,9 @@
 			continue;
 		}
 
-		VM_OBJECT_LOCK(obj);
+		VM_OBJECT_WLOCK(obj);
 		if (obj->type != OBJT_DEFAULT && obj->type != OBJT_SWAP) {
-			VM_OBJECT_UNLOCK(obj);
+			VM_OBJECT_WUNLOCK(obj);
 			continue;
 		}
 
@@ -1945,9 +2032,10 @@
 		 * charged clipped mapping of the same object later.
 		 */
 		KASSERT(obj->charge == 0,
-		    ("vm_map_protect: object %p overcharged\n", obj));
+		    ("vm_map_protect: object %p overcharged (entry %p)",
+		    obj, current));
 		if (!swap_reserve(ptoa(obj->size))) {
-			VM_OBJECT_UNLOCK(obj);
+			VM_OBJECT_WUNLOCK(obj);
 			vm_map_unlock(map);
 			return (KERN_RESOURCE_SHORTAGE);
 		}
@@ -1955,7 +2043,7 @@
 		crhold(cred);
 		obj->cred = cred;
 		obj->charge = ptoa(obj->size);
-		VM_OBJECT_UNLOCK(obj);
+		VM_OBJECT_WUNLOCK(obj);
 	}
 
 	/*
@@ -1962,8 +2050,11 @@
 	 * Go back and fix up protections. [Note that clipping is not
 	 * necessary the second time.]
 	 */
-	current = entry;
-	while ((current != &map->header) && (current->start < end)) {
+	for (current = entry; current != &map->header && current->start < end;
+	    current = current->next) {
+		if ((current->eflags & MAP_ENTRY_GUARD) != 0)
+			continue;
+
 		old_prot = current->protection;
 
 		if (set_max)
@@ -1973,12 +2064,16 @@
 		else
 			current->protection = new_prot;
 
-		if ((current->eflags & (MAP_ENTRY_COW | MAP_ENTRY_USER_WIRED))
-		     == (MAP_ENTRY_COW | MAP_ENTRY_USER_WIRED) &&
+		/*
+		 * For user wired map entries, the normal lazy evaluation of
+		 * write access upgrades through soft page faults is
+		 * undesirable.  Instead, immediately copy any pages that are
+		 * copy-on-write and enable write access in the physical map.
+		 */
+		if ((current->eflags & MAP_ENTRY_USER_WIRED) != 0 &&
 		    (current->protection & VM_PROT_WRITE) != 0 &&
-		    (old_prot & VM_PROT_WRITE) == 0) {
+		    (old_prot & VM_PROT_WRITE) == 0)
 			vm_fault_copy_entry(map, map, current, current, NULL);
-		}
 
 		/*
 		 * When restricting access, update the physical map.  Worry
@@ -1993,7 +2088,6 @@
 #undef	MASK
 		}
 		vm_map_simplify_entry(map, current);
-		current = current->next;
 	}
 	vm_map_unlock(map);
 	return (KERN_SUCCESS);
@@ -2031,6 +2125,8 @@
 	case MADV_AUTOSYNC:
 	case MADV_NOCORE:
 	case MADV_CORE:
+		if (start == end)
+			return (KERN_SUCCESS);
 		modify_map = 1;
 		vm_map_lock(map);
 		break;
@@ -2037,6 +2133,8 @@
 	case MADV_WILLNEED:
 	case MADV_DONTNEED:
 	case MADV_FREE:
+		if (start == end)
+			return (KERN_SUCCESS);
 		vm_map_lock_read(map);
 		break;
 	default:
@@ -2113,7 +2211,7 @@
 		     (current != &map->header) && (current->start < end);
 		     current = current->next
 		) {
-			vm_offset_t useStart;
+			vm_offset_t useEnd, useStart;
 
 			if (current->eflags & MAP_ENTRY_IS_SUB_MAP)
 				continue;
@@ -2121,20 +2219,44 @@
 			pstart = OFF_TO_IDX(current->offset);
 			pend = pstart + atop(current->end - current->start);
 			useStart = current->start;
+			useEnd = current->end;
 
 			if (current->start < start) {
 				pstart += atop(start - current->start);
 				useStart = start;
 			}
-			if (current->end > end)
+			if (current->end > end) {
 				pend -= atop(current->end - end);
+				useEnd = end;
+			}
 
 			if (pstart >= pend)
 				continue;
 
+			/*
+			 * Perform the pmap_advise() before clearing
+			 * PGA_REFERENCED in vm_page_advise().  Otherwise, a
+			 * concurrent pmap operation, such as pmap_remove(),
+			 * could clear a reference in the pmap and set
+			 * PGA_REFERENCED on the page before the pmap_advise()
+			 * had completed.  Consequently, the page would appear
+			 * referenced based upon an old reference that
+			 * occurred before this pmap_advise() ran.
+			 */
+			if (behav == MADV_DONTNEED || behav == MADV_FREE)
+				pmap_advise(map->pmap, useStart, useEnd,
+				    behav);
+
 			vm_object_madvise(current->object.vm_object, pstart,
 			    pend, behav);
-			if (behav == MADV_WILLNEED) {
+
+			/*
+			 * Pre-populate paging structures in the
+			 * WILLNEED case.  For wired entries, the
+			 * paging structures are already populated.
+			 */
+			if (behav == MADV_WILLNEED &&
+			    current->wired_count == 0) {
 				vm_map_pmap_enter(map,
 				    useStart,
 				    current->protection,
@@ -2170,10 +2292,13 @@
 	case VM_INHERIT_NONE:
 	case VM_INHERIT_COPY:
 	case VM_INHERIT_SHARE:
+	case VM_INHERIT_ZERO:
 		break;
 	default:
 		return (KERN_INVALID_ARGUMENT);
 	}
+	if (start == end)
+		return (KERN_SUCCESS);
 	vm_map_lock(map);
 	VM_MAP_RANGE_CHECK(map, start, end);
 	if (vm_map_lookup_entry(map, start, &temp_entry)) {
@@ -2183,7 +2308,9 @@
 		entry = temp_entry->next;
 	while ((entry != &map->header) && (entry->start < end)) {
 		vm_map_clip_end(map, entry, end);
-		entry->inheritance = new_inheritance;
+		if ((entry->eflags & MAP_ENTRY_GUARD) == 0 ||
+		    new_inheritance != VM_INHERIT_ZERO)
+			entry->inheritance = new_inheritance;
 		vm_map_simplify_entry(map, entry);
 		entry = entry->next;
 	}
@@ -2206,6 +2333,8 @@
 	int rv;
 	boolean_t need_wakeup, result, user_unwire;
 
+	if (start == end)
+		return (KERN_SUCCESS);
 	user_unwire = (flags & VM_MAP_WIRE_USER) ? TRUE : FALSE;
 	vm_map_lock(map);
 	VM_MAP_RANGE_CHECK(map, start, end);
@@ -2272,7 +2401,11 @@
 		 * Mark the entry in case the map lock is released.  (See
 		 * above.)
 		 */
+		KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 &&
+		    entry->wiring_thread == NULL,
+		    ("owned map entry %p", entry));
 		entry->eflags |= MAP_ENTRY_IN_TRANSITION;
+		entry->wiring_thread = curthread;
 		/*
 		 * Check the map for holes in the specified region.
 		 * If VM_MAP_WIRE_HOLESOK was specified, skip this check.
@@ -2305,32 +2438,44 @@
 		else
 			KASSERT(result, ("vm_map_unwire: lookup failed"));
 	}
-	entry = first_entry;
-	while (entry != &map->header && entry->start < end) {
+	for (entry = first_entry; entry != &map->header && entry->start < end;
+	    entry = entry->next) {
+		/*
+		 * If VM_MAP_WIRE_HOLESOK was specified, an empty
+		 * space in the unwired region could have been mapped
+		 * while the map lock was dropped for draining
+		 * MAP_ENTRY_IN_TRANSITION.  Moreover, another thread
+		 * could be simultaneously wiring this new mapping
+		 * entry.  Detect these cases and skip any entries
+		 * marked as in transition by us.
+		 * not marked as in transition by us.
+		if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 ||
+		    entry->wiring_thread != curthread) {
+			KASSERT((flags & VM_MAP_WIRE_HOLESOK) != 0,
+			    ("vm_map_unwire: !HOLESOK and new/changed entry"));
+			continue;
+		}
+
 		if (rv == KERN_SUCCESS && (!user_unwire ||
 		    (entry->eflags & MAP_ENTRY_USER_WIRED))) {
 			if (user_unwire)
 				entry->eflags &= ~MAP_ENTRY_USER_WIRED;
-			entry->wired_count--;
-			if (entry->wired_count == 0) {
-				/*
-				 * Retain the map lock.
-				 */
-				vm_fault_unwire(map, entry->start, entry->end,
-				    entry->object.vm_object != NULL &&
-				    (entry->object.vm_object->type == OBJT_DEVICE ||
-				    entry->object.vm_object->type == OBJT_SG));
-			}
+			if (entry->wired_count == 1)
+				vm_map_entry_unwire(map, entry);
+			else
+				entry->wired_count--;
 		}
-		KASSERT(entry->eflags & MAP_ENTRY_IN_TRANSITION,
-			("vm_map_unwire: in-transition flag missing"));
+		KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
+		    ("vm_map_unwire: in-transition flag missing %p", entry));
+		KASSERT(entry->wiring_thread == curthread,
+		    ("vm_map_unwire: alien wire %p", entry));
 		entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
+		entry->wiring_thread = NULL;
 		if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
 			entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
 			need_wakeup = TRUE;
 		}
 		vm_map_simplify_entry(map, entry);
-		entry = entry->next;
 	}
 	vm_map_unlock(map);
 	if (need_wakeup)
@@ -2339,6 +2484,42 @@
 }
 
 /*
+ *	vm_map_wire_entry_failure:
+ *
+ *	Handle a wiring failure on the given entry.
+ *
+ *	The map should be locked.
+ */
+static void
+vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry,
+    vm_offset_t failed_addr)
+{
+
+	VM_MAP_ASSERT_LOCKED(map);
+	KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 &&
+	    entry->wired_count == 1,
+	    ("vm_map_wire_entry_failure: entry %p isn't being wired", entry));
+	KASSERT(failed_addr < entry->end,
+	    ("vm_map_wire_entry_failure: entry %p was fully wired", entry));
+
+	/*
+	 * If any pages at the start of this entry were successfully wired,
+	 * then unwire them.
+	 */
+	if (failed_addr > entry->start) {
+		pmap_unwire(map->pmap, entry->start, failed_addr);
+		vm_object_unwire(entry->object.vm_object, entry->offset,
+		    failed_addr - entry->start, PQ_ACTIVE);
+	}
+
+	/*
+	 * Assign an out-of-range value to represent the failure to wire this
+	 * entry.
+	 */
+	entry->wired_count = -1;
+}
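
Illustrative summary of the state this leaves behind (not part of the patch):

	/*
	 * Wiring of [start, end) faulted at 'failed_addr':
	 *
	 *   [start, failed_addr)	pages unwired again via pmap_unwire()
	 *				and vm_object_unwire()
	 *   entry->wired_count		-1, distinguishing "wiring failed
	 *				here" from "wired once" (1) and
	 *				"unwired" (0)
	 */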
+
+/*
  *	vm_map_wire:
  *
  *	Implements both kernel and user wiring.
@@ -2348,12 +2529,14 @@
     int flags)
 {
 	vm_map_entry_t entry, first_entry, tmp_entry;
-	vm_offset_t saved_end, saved_start;
+	vm_offset_t faddr, saved_end, saved_start;
 	unsigned int last_timestamp;
 	int rv;
-	boolean_t fictitious, need_wakeup, result, user_wire;
+	boolean_t need_wakeup, result, user_wire;
 	vm_prot_t prot;
 
+	if (start == end)
+		return (KERN_SUCCESS);
 	prot = 0;
 	if (flags & VM_MAP_WIRE_WRITE)
 		prot |= VM_PROT_WRITE;
@@ -2423,7 +2606,11 @@
 		 * Mark the entry in case the map lock is released.  (See
 		 * above.)
 		 */
+		KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 &&
+		    entry->wiring_thread == NULL,
+		    ("owned map entry %p", entry));
 		entry->eflags |= MAP_ENTRY_IN_TRANSITION;
+		entry->wiring_thread = curthread;
 		if ((entry->protection & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0
 		    || (entry->protection & prot) != prot) {
 			entry->eflags |= MAP_ENTRY_WIRE_SKIPPED;
@@ -2438,9 +2625,7 @@
 			entry->wired_count++;
 			saved_start = entry->start;
 			saved_end = entry->end;
-			fictitious = entry->object.vm_object != NULL &&
-			    (entry->object.vm_object->type == OBJT_DEVICE ||
-			    entry->object.vm_object->type == OBJT_SG);
+
 			/*
 			 * Release the map lock, relying on the in-transition
 			 * mark.  Mark the map busy for fork.
@@ -2447,8 +2632,17 @@
 			 */
 			vm_map_busy(map);
 			vm_map_unlock(map);
-			rv = vm_fault_wire(map, saved_start, saved_end,
-			    fictitious);
+
+			faddr = saved_start;
+			do {
+				/*
+				 * Simulate a fault to get the page and enter
+				 * it into the physical map.
+				 */
+				if ((rv = vm_fault(map, faddr, VM_PROT_NONE,
+				    VM_FAULT_WIRE)) != KERN_SUCCESS)
+					break;
+			} while ((faddr += PAGE_SIZE) < saved_end);
 			vm_map_lock(map);
 			vm_map_unbusy(map);
 			if (last_timestamp + 1 != map->timestamp) {
@@ -2467,23 +2661,22 @@
 					first_entry = NULL;
 				entry = tmp_entry;
 				while (entry->end < saved_end) {
-					if (rv != KERN_SUCCESS) {
-						KASSERT(entry->wired_count == 1,
-						    ("vm_map_wire: bad count"));
-						entry->wired_count = -1;
-					}
+					/*
+					 * In case of failure, handle entries
+					 * that were not fully wired here;
+					 * fully wired entries are handled
+					 * later.
+					 */
+					if (rv != KERN_SUCCESS &&
+					    faddr < entry->end)
+						vm_map_wire_entry_failure(map,
+						    entry, faddr);
 					entry = entry->next;
 				}
 			}
 			last_timestamp = map->timestamp;
 			if (rv != KERN_SUCCESS) {
-				KASSERT(entry->wired_count == 1,
-				    ("vm_map_wire: bad count"));
-				/*
-				 * Assign an out-of-range value to represent
-				 * the failure to wire this entry.
-				 */
-				entry->wired_count = -1;
+				vm_map_wire_entry_failure(map, entry, faddr);
 				end = entry->end;
 				goto done;
 			}
@@ -2496,9 +2689,9 @@
 		 * If VM_MAP_WIRE_HOLESOK was specified, skip this check.
 		 */
 	next_entry:
-		if (((flags & VM_MAP_WIRE_HOLESOK) == 0) &&
-		    (entry->end < end && (entry->next == &map->header ||
-		    entry->next->start > entry->end))) {
+		if ((flags & VM_MAP_WIRE_HOLESOK) == 0 &&
+		    entry->end < end && (entry->next == &map->header ||
+		    entry->next->start > entry->end)) {
 			end = entry->end;
 			rv = KERN_INVALID_ADDRESS;
 			goto done;
@@ -2515,10 +2708,27 @@
 		else
 			KASSERT(result, ("vm_map_wire: lookup failed"));
 	}
-	entry = first_entry;
-	while (entry != &map->header && entry->start < end) {
+	for (entry = first_entry; entry != &map->header && entry->start < end;
+	    entry = entry->next) {
+		/*
+		 * If VM_MAP_WIRE_HOLESOK was specified, an empty
+		 * space in the unwired region could have been mapped
+		 * while the map lock was dropped for faulting in the
+		 * pages or draining MAP_ENTRY_IN_TRANSITION.
+		 * Moreover, another thread could be simultaneously
+		 * wiring this new mapping entry.  Detect these cases
+		 * and skip any entries not marked as in transition by us.
+		 */
+		if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 ||
+		    entry->wiring_thread != curthread) {
+			KASSERT((flags & VM_MAP_WIRE_HOLESOK) != 0,
+			    ("vm_map_wire: !HOLESOK and new/changed entry"));
+			continue;
+		}
+
 		if ((entry->eflags & MAP_ENTRY_WIRE_SKIPPED) != 0)
 			goto next_entry_done;
+
 		if (rv == KERN_SUCCESS) {
 			if (user_wire)
 				entry->eflags |= MAP_ENTRY_USER_WIRED;
@@ -2528,30 +2738,30 @@
 			 * unnecessary.
 			 */
 			entry->wired_count = 0;
-		} else {
-			if (!user_wire ||
-			    (entry->eflags & MAP_ENTRY_USER_WIRED) == 0)
+		} else if (!user_wire ||
+		    (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
+			/*
+			 * Undo the wiring.  Wiring succeeded on this entry
+			 * but failed on a later entry.
+			 */
+			if (entry->wired_count == 1)
+				vm_map_entry_unwire(map, entry);
+			else
 				entry->wired_count--;
-			if (entry->wired_count == 0) {
-				/*
-				 * Retain the map lock.
-				 */
-				vm_fault_unwire(map, entry->start, entry->end,
-				    entry->object.vm_object != NULL &&
-				    (entry->object.vm_object->type == OBJT_DEVICE ||
-				    entry->object.vm_object->type == OBJT_SG));
-			}
 		}
 	next_entry_done:
-		KASSERT(entry->eflags & MAP_ENTRY_IN_TRANSITION,
-			("vm_map_wire: in-transition flag missing"));
-		entry->eflags &= ~(MAP_ENTRY_IN_TRANSITION|MAP_ENTRY_WIRE_SKIPPED);
+		KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
+		    ("vm_map_wire: in-transition flag missing %p", entry));
+		KASSERT(entry->wiring_thread == curthread,
+		    ("vm_map_wire: alien wire %p", entry));
+		entry->eflags &= ~(MAP_ENTRY_IN_TRANSITION |
+		    MAP_ENTRY_WIRE_SKIPPED);
+		entry->wiring_thread = NULL;
 		if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
 			entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
 			need_wakeup = TRUE;
 		}
 		vm_map_simplify_entry(map, entry);
-		entry = entry->next;
 	}
 	vm_map_unlock(map);
 	if (need_wakeup)
@@ -2673,10 +2883,13 @@
 static void
 vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry)
 {
-	vm_fault_unwire(map, entry->start, entry->end,
-	    entry->object.vm_object != NULL &&
-	    (entry->object.vm_object->type == OBJT_DEVICE ||
-	    entry->object.vm_object->type == OBJT_SG));
+
+	VM_MAP_ASSERT_LOCKED(map);
+	KASSERT(entry->wired_count > 0,
+	    ("vm_map_entry_unwire: entry %p isn't wired", entry));
+	pmap_unwire(map->pmap, entry->start, entry->end);
+	vm_object_unwire(entry->object.vm_object, entry->offset, entry->end -
+	    entry->start, PQ_ACTIVE);
 	entry->wired_count = 0;
 }
 
@@ -2703,6 +2916,15 @@
 
 	vm_map_entry_unlink(map, entry);
 	object = entry->object.vm_object;
+
+	if ((entry->eflags & MAP_ENTRY_GUARD) != 0) {
+		MPASS(entry->cred == NULL);
+		MPASS((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0);
+		MPASS(object == NULL);
+		vm_map_entry_deallocate(entry, map->system_map);
+		return;
+	}
+
 	size = entry->end - entry->start;
 	map->size -= size;
 
@@ -2719,9 +2941,9 @@
 		count = OFF_TO_IDX(size);
 		offidxstart = OFF_TO_IDX(entry->offset);
 		offidxend = offidxstart + count;
-		VM_OBJECT_LOCK(object);
-		if (object->ref_count != 1 &&
-		    ((object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) == OBJ_ONEMAPPING ||
+		VM_OBJECT_WLOCK(object);
+		if (object->ref_count != 1 && ((object->flags & (OBJ_NOSPLIT |
+		    OBJ_ONEMAPPING)) == OBJ_ONEMAPPING ||
 		    object == kernel_object || object == kmem_object)) {
 			vm_object_collapse(object);
 
@@ -2734,7 +2956,8 @@
 			vm_object_page_remove(object, offidxstart, offidxend,
 			    OBJPR_NOTMAPPED);
 			if (object->type == OBJT_SWAP)
-				swap_pager_freespace(object, offidxstart, count);
+				swap_pager_freespace(object, offidxstart,
+				    count);
 			if (offidxend >= object->size &&
 			    offidxstart < object->size) {
 				size1 = object->size;
@@ -2742,13 +2965,14 @@
 				if (object->cred != NULL) {
 					size1 -= object->size;
 					KASSERT(object->charge >= ptoa(size1),
-					    ("vm_map_entry_delete: object->charge < 0"));
-					swap_release_by_cred(ptoa(size1), object->cred);
+					    ("object %p charge < 0", object));
+					swap_release_by_cred(ptoa(size1),
+					    object->cred);
 					object->charge -= ptoa(size1);
 				}
 			}
 		}
-		VM_OBJECT_UNLOCK(object);
+		VM_OBJECT_WUNLOCK(object);
 	} else
 		entry->object.vm_object = NULL;
 	if (map->system_map)
@@ -2772,6 +2996,8 @@
 	vm_map_entry_t first_entry;
 
 	VM_MAP_ASSERT_LOCKED(map);
+	if (start == end)
+		return (KERN_SUCCESS);
 
 	/*
 	 * Find the start of the region, and clip it
@@ -2938,13 +3164,14 @@
 	if ((dst_entry->eflags|src_entry->eflags) & MAP_ENTRY_IS_SUB_MAP)
 		return;
 
-	if (src_entry->wired_count == 0) {
-
+	if (src_entry->wired_count == 0 ||
+	    (src_entry->protection & VM_PROT_WRITE) == 0) {
 		/*
 		 * If the source entry is marked needs_copy, it is already
 		 * write-protected.
 		 */
-		if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) {
+		if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0 &&
+		    (src_entry->protection & VM_PROT_WRITE) != 0) {
 			pmap_protect(src_map->pmap,
 			    src_entry->start,
 			    src_entry->end,
@@ -2956,15 +3183,17 @@
 		 */
 		size = src_entry->end - src_entry->start;
 		if ((src_object = src_entry->object.vm_object) != NULL) {
-			VM_OBJECT_LOCK(src_object);
+			VM_OBJECT_WLOCK(src_object);
 			charged = ENTRY_CHARGED(src_entry);
-			if ((src_object->handle == NULL) &&
-				(src_object->type == OBJT_DEFAULT ||
-				 src_object->type == OBJT_SWAP)) {
+			if (src_object->handle == NULL &&
+			    (src_object->type == OBJT_DEFAULT ||
+			    src_object->type == OBJT_SWAP)) {
 				vm_object_collapse(src_object);
-				if ((src_object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) == OBJ_ONEMAPPING) {
+				if ((src_object->flags & (OBJ_NOSPLIT |
+				    OBJ_ONEMAPPING)) == OBJ_ONEMAPPING) {
 					vm_object_split(src_entry);
-					src_object = src_entry->object.vm_object;
+					src_object =
+					    src_entry->object.vm_object;
 				}
 			}
 			vm_object_reference_locked(src_object);
@@ -2977,7 +3206,7 @@
 				src_object->cred = src_entry->cred;
 				src_object->charge = size;
 			}
-			VM_OBJECT_UNLOCK(src_object);
+			VM_OBJECT_WUNLOCK(src_object);
 			dst_entry->object.vm_object = src_object;
 			if (charged) {
 				cred = curthread->td_ucred;
@@ -2991,8 +3220,10 @@
 					*fork_charge += size;
 				}
 			}
-			src_entry->eflags |= (MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY);
-			dst_entry->eflags |= (MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY);
+			src_entry->eflags |= MAP_ENTRY_COW |
+			    MAP_ENTRY_NEEDS_COPY;
+			dst_entry->eflags |= MAP_ENTRY_COW |
+			    MAP_ENTRY_NEEDS_COPY;
 			dst_entry->offset = src_entry->offset;
 			if (src_entry->eflags & MAP_ENTRY_VN_WRITECNT) {
 				/*
@@ -3015,6 +3246,10 @@
 				fake_entry->next = curthread->td_map_def_user;
 				curthread->td_map_def_user = fake_entry;
 			}
+
+			pmap_copy(dst_map->pmap, src_map->pmap,
+			    dst_entry->start, dst_entry->end - dst_entry->start,
+			    src_entry->start);
 		} else {
 			dst_entry->object.vm_object = NULL;
 			dst_entry->offset = 0;
@@ -3024,14 +3259,11 @@
 				*fork_charge += size;
 			}
 		}
-
-		pmap_copy(dst_map->pmap, src_map->pmap, dst_entry->start,
-		    dst_entry->end - dst_entry->start, src_entry->start);
 	} else {
 		/*
-		 * Of course, wired down pages can't be set copy-on-write.
-		 * Cause wired pages to be copied into the new map by
-		 * simulating faults (the new pages are pageable)
+		 * We don't want to make writeable wired pages copy-on-write.
+		 * Immediately copy these pages into the new map by simulating
+		 * page faults.  The new pages are pageable.
 		 */
 		vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry,
 		    fork_charge);
@@ -3051,6 +3283,8 @@
 	vm_size_t entrysize;
 	vm_offset_t newend;
 
+	if ((entry->eflags & MAP_ENTRY_GUARD) != 0)
+		return;
 	entrysize = entry->end - entry->start;
 	vm2->vm_map.size += entrysize;
 	if (entry->eflags & (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP)) {
@@ -3087,10 +3321,11 @@
 	vm_map_entry_t new_entry, old_entry;
 	vm_object_t object;
 	int locked;
+	vm_inherit_t inh;
 
 	old_map = &vm1->vm_map;
 	/* Copy immutable fields of vm1 to vm2. */
-	vm2 = vmspace_alloc(old_map->min_offset, old_map->max_offset);
+	vm2 = vmspace_alloc(old_map->min_offset, old_map->max_offset, NULL);
 	if (vm2 == NULL)
 		return (NULL);
 	vm2->vm_taddr = vm1->vm_taddr;
@@ -3109,7 +3344,12 @@
 		if (old_entry->eflags & MAP_ENTRY_IS_SUB_MAP)
 			panic("vm_map_fork: encountered a submap");
 
-		switch (old_entry->inheritance) {
+		inh = old_entry->inheritance;
+		if ((old_entry->eflags & MAP_ENTRY_GUARD) != 0 &&
+		    inh != VM_INHERIT_NONE)
+			inh = VM_INHERIT_COPY;
+
+		switch (inh) {
 		case VM_INHERIT_NONE:
 			break;
 
@@ -3153,7 +3393,7 @@
 				vm_object_deallocate(object);
 				object = old_entry->object.vm_object;
 			}
-			VM_OBJECT_LOCK(object);
+			VM_OBJECT_WLOCK(object);
 			vm_object_clear_flag(object, OBJ_ONEMAPPING);
 			if (old_entry->cred != NULL) {
 				KASSERT(object->cred == NULL, ("vmspace_fork both cred"));
@@ -3177,7 +3417,7 @@
 				    ("vmspace_fork: vnp.writecount %p",
 				    object));
 			}
-			VM_OBJECT_UNLOCK(object);
+			VM_OBJECT_WUNLOCK(object);
 
 			/*
 			 * Clone the entry, referencing the shared object.
@@ -3186,6 +3426,7 @@
 			*new_entry = *old_entry;
 			new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
 			    MAP_ENTRY_IN_TRANSITION);
+			new_entry->wiring_thread = NULL;
 			new_entry->wired_count = 0;
 			if (new_entry->eflags & MAP_ENTRY_VN_WRITECNT) {
 				vnode_pager_update_writecount(object,
@@ -3220,6 +3461,7 @@
 			 */
 			new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
 			    MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_VN_WRITECNT);
+			new_entry->wiring_thread = NULL;
 			new_entry->wired_count = 0;
 			new_entry->object.vm_object = NULL;
 			new_entry->cred = NULL;
@@ -3229,6 +3471,33 @@
 			vm_map_copy_entry(old_map, new_map, old_entry,
 			    new_entry, fork_charge);
 			break;
+
+		case VM_INHERIT_ZERO:
+			/*
+			 * Create a new anonymous mapping entry modelled from
+			 * the old one.
+			 */
+			new_entry = vm_map_entry_create(new_map);
+			memset(new_entry, 0, sizeof(*new_entry));
+
+			new_entry->start = old_entry->start;
+			new_entry->end = old_entry->end;
+			new_entry->eflags = old_entry->eflags &
+			    ~(MAP_ENTRY_USER_WIRED | MAP_ENTRY_IN_TRANSITION |
+			    MAP_ENTRY_VN_WRITECNT);
+			new_entry->protection = old_entry->protection;
+			new_entry->max_protection = old_entry->max_protection;
+			new_entry->inheritance = VM_INHERIT_ZERO;
+
+			vm_map_entry_link(new_map, new_map->header.prev,
+			    new_entry);
+			vmspace_map_entry_forked(vm1, vm2, new_entry);
+
+			new_entry->cred = curthread->td_ucred;
+			crhold(new_entry->cred);
+			*fork_charge += (new_entry->end - new_entry->start);
+
+			break;
 		}
 		old_entry = old_entry->next;
 	}
@@ -3244,73 +3513,83 @@
 	return (vm2);
 }
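
For illustration only (not part of the patch): the VM_INHERIT_ZERO case above
is what a region marked through minherit(2) receives on fork(); the child
keeps the mapping, but backed by fresh zero-filled anonymous memory rather
than a copy of the parent's pages.  Assuming the corresponding userland flag
is spelled INHERIT_ZERO:

	#include <sys/mman.h>

	/* Keep 'base' mapped across fork(), but as zero-filled pages. */
	static int
	mark_zero_on_fork(void *base, size_t len)
	{

		return (minherit(base, len, INHERIT_ZERO));
	}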
 
+/*
+ * Create a process's stack for exec_new_vmspace().  This function is never
+ * asked to wire the newly created stack.
+ */
 int
 vm_map_stack(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
     vm_prot_t prot, vm_prot_t max, int cow)
 {
+	vm_size_t growsize, init_ssize;
+	rlim_t vmemlim;
+	int rv;
+
+	MPASS((map->flags & MAP_WIREFUTURE) == 0);
+	growsize = sgrowsiz;
+	init_ssize = (max_ssize < growsize) ? max_ssize : growsize;
+	vm_map_lock(map);
+	PROC_LOCK(curproc);
+	vmemlim = lim_cur(curproc, RLIMIT_VMEM);
+	PROC_UNLOCK(curproc);
+	/* If we would blow our VMEM resource limit, no go */
+	if (map->size + init_ssize > vmemlim) {
+		rv = KERN_NO_SPACE;
+		goto out;
+	}
+	rv = vm_map_stack_locked(map, addrbos, max_ssize, growsize, prot,
+	    max, cow);
+out:
+	vm_map_unlock(map);
+	return (rv);
+}
+
+static int stack_guard_page = 1;
+SYSCTL_INT(_security_bsd, OID_AUTO, stack_guard_page, CTLFLAG_RWTUN,
+    &stack_guard_page, 0,
+    "Specifies the number of guard pages for a stack that grows");
+
+static int
+vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
+    vm_size_t growsize, vm_prot_t prot, vm_prot_t max, int cow)
+{
 	vm_map_entry_t new_entry, prev_entry;
-	vm_offset_t bot, top;
-	vm_size_t growsize, init_ssize;
+	vm_offset_t bot, gap_bot, gap_top, top;
+	vm_size_t init_ssize, sgp;
 	int orient, rv;
-	rlim_t lmemlim, vmemlim;
 
 	/*
 	 * The stack orientation is piggybacked with the cow argument.
 	 * Extract it into orient and mask the cow argument so that we
 	 * don't pass it around further.
-	 * NOTE: We explicitly allow bi-directional stacks.
 	 */
-	orient = cow & (MAP_STACK_GROWS_DOWN|MAP_STACK_GROWS_UP);
+	orient = cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP);
 	KASSERT(orient != 0, ("No stack grow direction"));
+	KASSERT(orient != (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP),
+	    ("bi-dir stack"));
 
 	if (addrbos < vm_map_min(map) ||
-	    addrbos > vm_map_max(map) ||
-	    addrbos + max_ssize < addrbos)
-		return (KERN_NO_SPACE);
+	    addrbos + max_ssize > vm_map_max(map) ||
+	    addrbos + max_ssize <= addrbos)
+		return (KERN_INVALID_ADDRESS);
+	sgp = (vm_size_t)stack_guard_page * PAGE_SIZE;
+	if (sgp >= max_ssize)
+		return (KERN_INVALID_ARGUMENT);
 
-	growsize = sgrowsiz;
-	init_ssize = (max_ssize < growsize) ? max_ssize : growsize;
+	init_ssize = growsize;
+	if (max_ssize < init_ssize + sgp)
+		init_ssize = max_ssize - sgp;
 
-	PROC_LOCK(curproc);
-	lmemlim = lim_cur(curproc, RLIMIT_MEMLOCK);
-	vmemlim = lim_cur(curproc, RLIMIT_VMEM);
-	PROC_UNLOCK(curproc);
-
-	vm_map_lock(map);
-
 	/* If addr is already mapped, no go */
-	if (vm_map_lookup_entry(map, addrbos, &prev_entry)) {
-		vm_map_unlock(map);
+	if (vm_map_lookup_entry(map, addrbos, &prev_entry))
 		return (KERN_NO_SPACE);
-	}
 
-	if (!old_mlock && map->flags & MAP_WIREFUTURE) {
-		if (ptoa(pmap_wired_count(map->pmap)) + init_ssize > lmemlim) {
-			vm_map_unlock(map);
-			return (KERN_NO_SPACE);
-		}
-	}
-
-	/* If we would blow our VMEM resource limit, no go */
-	if (map->size + init_ssize > vmemlim) {
-		vm_map_unlock(map);
-		return (KERN_NO_SPACE);
-	}
-
 	/*
 	 * If we can't accommodate max_ssize in the current mapping, no go.
-	 * However, we need to be aware that subsequent user mappings might
-	 * map into the space we have reserved for stack, and currently this
-	 * space is not protected.
-	 *
-	 * Hopefully we will at least detect this condition when we try to
-	 * grow the stack.
 	 */
 	if ((prev_entry->next != &map->header) &&
-	    (prev_entry->next->start < addrbos + max_ssize)) {
-		vm_map_unlock(map);
+	    (prev_entry->next->start < addrbos + max_ssize))
 		return (KERN_NO_SPACE);
-	}
 
 	/*
 	 * We initially map a stack of only init_ssize.  We will grow as
@@ -3322,59 +3601,53 @@
 	 * and cow to be 0.  Possibly we should eliminate these as input
 	 * parameters, and just pass these values here in the insert call.
 	 */
-	if (orient == MAP_STACK_GROWS_DOWN)
+	if (orient == MAP_STACK_GROWS_DOWN) {
 		bot = addrbos + max_ssize - init_ssize;
-	else if (orient == MAP_STACK_GROWS_UP)
+		top = bot + init_ssize;
+		gap_bot = addrbos;
+		gap_top = bot;
+	} else /* if (orient == MAP_STACK_GROWS_UP) */ {
 		bot = addrbos;
-	else
-		bot = round_page(addrbos + max_ssize/2 - init_ssize/2);
-	top = bot + init_ssize;
+		top = bot + init_ssize;
+		gap_bot = top;
+		gap_top = addrbos + max_ssize;
+	}
 	rv = vm_map_insert(map, NULL, 0, bot, top, prot, max, cow);
-
-	/* Now set the avail_ssize amount. */
-	if (rv == KERN_SUCCESS) {
-		if (prev_entry != &map->header)
-			vm_map_clip_end(map, prev_entry, bot);
-		new_entry = prev_entry->next;
-		if (new_entry->end != top || new_entry->start != bot)
-			panic("Bad entry start/end for new stack entry");
-
-		new_entry->avail_ssize = max_ssize - init_ssize;
-		if (orient & MAP_STACK_GROWS_DOWN)
-			new_entry->eflags |= MAP_ENTRY_GROWS_DOWN;
-		if (orient & MAP_STACK_GROWS_UP)
-			new_entry->eflags |= MAP_ENTRY_GROWS_UP;
-	}
-
-	vm_map_unlock(map);
+	if (rv != KERN_SUCCESS)
+		return (rv);
+	new_entry = prev_entry->next;
+	KASSERT(new_entry->end == top || new_entry->start == bot,
+	    ("Bad entry start/end for new stack entry"));
+	KASSERT((orient & MAP_STACK_GROWS_DOWN) == 0 ||
+	    (new_entry->eflags & MAP_ENTRY_GROWS_DOWN) != 0,
+	    ("new entry lacks MAP_ENTRY_GROWS_DOWN"));
+	KASSERT((orient & MAP_STACK_GROWS_UP) == 0 ||
+	    (new_entry->eflags & MAP_ENTRY_GROWS_UP) != 0,
+	    ("new entry lacks MAP_ENTRY_GROWS_UP"));
+	rv = vm_map_insert(map, NULL, 0, gap_bot, gap_top, VM_PROT_NONE,
+	    VM_PROT_NONE, MAP_CREATE_GUARD | (orient == MAP_STACK_GROWS_DOWN ?
+	    MAP_CREATE_STACK_GAP_DN : MAP_CREATE_STACK_GAP_UP));
+	if (rv != KERN_SUCCESS)
+		(void)vm_map_delete(map, bot, top);
 	return (rv);
 }
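
An illustrative layout of the result (not part of the patch), assuming
MAP_STACK_GROWS_DOWN, max_ssize = 8 MB, the usual 128 KB sgrowsiz and
stack_guard_page = 1 with 4 KB pages:

	/*
	 *  addrbos                  addrbos + 8M - 128K          addrbos + 8M
	 *  |----------- gap entry ----------|-------- stack entry --------|
	 *   MAP_ENTRY_GUARD |                MAP_ENTRY_GROWS_DOWN,
	 *   MAP_ENTRY_STACK_GAP_DN           prot/max as requested
	 *
	 * vm_map_growstack() later carves space off the top of the gap entry
	 * as the stack grows, but always leaves at least stack_guard_page
	 * pages (here 4 KB) of the gap unmapped below the stack.
	 */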
 
-static int stack_guard_page = 0;
-TUNABLE_INT("security.bsd.stack_guard_page", &stack_guard_page);
-SYSCTL_INT(_security_bsd, OID_AUTO, stack_guard_page, CTLFLAG_RW,
-    &stack_guard_page, 0,
-    "Insert stack guard page ahead of the growable segments.");
-
-/* Attempts to grow a vm stack entry.  Returns KERN_SUCCESS if the
- * desired address is already mapped, or if we successfully grow
- * the stack.  Also returns KERN_SUCCESS if addr is outside the
- * stack range (this is strange, but preserves compatibility with
- * the grow function in vm_machdep.c).
+/*
+ * Attempts to grow a vm stack entry.  Returns KERN_SUCCESS if we
+ * successfully grow the stack.
  */
-int
-vm_map_growstack(struct proc *p, vm_offset_t addr)
+static int
+vm_map_growstack(vm_map_t map, vm_offset_t addr, vm_map_entry_t gap_entry)
 {
-	vm_map_entry_t next_entry, prev_entry;
-	vm_map_entry_t new_entry, stack_entry;
-	struct vmspace *vm = p->p_vmspace;
-	vm_map_t map = &vm->vm_map;
-	vm_offset_t end;
-	vm_size_t growsize;
-	size_t grow_amount, max_grow;
+	vm_map_entry_t stack_entry;
+	struct proc *p;
+	struct vmspace *vm;
+	struct ucred *cred;
+	vm_offset_t gap_end, gap_start, grow_start;
+	size_t grow_amount, guard, max_grow;
 	rlim_t lmemlim, stacklim, vmemlim;
-	int is_procstack, rv;
-	struct ucred *cred;
+	int rv, rv1;
+	bool gap_deleted, grow_down, is_procstack;
 #ifdef notyet
 	uint64_t limit;
 #endif
@@ -3382,124 +3655,84 @@
 	int error;
 #endif
 
-Retry:
+	p = curproc;
+	vm = p->p_vmspace;
+
+	/*
+	 * Disallow stack growth when the access is performed by a
+	 * debugger or AIO daemon.  The reason is that the wrong
+	 * resource limits are applied.
+	 */
+	if (map != &p->p_vmspace->vm_map || p->p_textvp == NULL)
+		return (KERN_FAILURE);
+
+	MPASS(!map->system_map);
+
+	guard = stack_guard_page * PAGE_SIZE;
 	PROC_LOCK(p);
 	lmemlim = lim_cur(p, RLIMIT_MEMLOCK);
 	stacklim = lim_cur(p, RLIMIT_STACK);
 	vmemlim = lim_cur(p, RLIMIT_VMEM);
 	PROC_UNLOCK(p);
-
-	vm_map_lock_read(map);
-
-	/* If addr is already in the entry range, no need to grow.*/
-	if (vm_map_lookup_entry(map, addr, &prev_entry)) {
-		vm_map_unlock_read(map);
+retry:
+	/* If addr is not in a hole for a stack grow area, no need to grow. */
+	if (gap_entry == NULL && !vm_map_lookup_entry(map, addr, &gap_entry))
+		return (KERN_FAILURE);
+	if ((gap_entry->eflags & MAP_ENTRY_GUARD) == 0)
 		return (KERN_SUCCESS);
-	}
-
-	next_entry = prev_entry->next;
-	if (!(prev_entry->eflags & MAP_ENTRY_GROWS_UP)) {
-		/*
-		 * This entry does not grow upwards. Since the address lies
-		 * beyond this entry, the next entry (if one exists) has to
-		 * be a downward growable entry. The entry list header is
-		 * never a growable entry, so it suffices to check the flags.
-		 */
-		if (!(next_entry->eflags & MAP_ENTRY_GROWS_DOWN)) {
-			vm_map_unlock_read(map);
-			return (KERN_SUCCESS);
-		}
-		stack_entry = next_entry;
+	if ((gap_entry->eflags & MAP_ENTRY_STACK_GAP_DN) != 0) {
+		stack_entry = gap_entry->next;
+		if ((stack_entry->eflags & MAP_ENTRY_GROWS_DOWN) == 0 ||
+		    stack_entry->start != gap_entry->end)
+			return (KERN_FAILURE);
+		grow_amount = round_page(stack_entry->start - addr);
+		grow_down = true;
+	} else if ((gap_entry->eflags & MAP_ENTRY_STACK_GAP_UP) != 0) {
+		stack_entry = gap_entry->prev;
+		if ((stack_entry->eflags & MAP_ENTRY_GROWS_UP) == 0 ||
+		    stack_entry->end != gap_entry->start)
+			return (KERN_FAILURE);
+		grow_amount = round_page(addr + 1 - stack_entry->end);
+		grow_down = false;
 	} else {
-		/*
-		 * This entry grows upward. If the next entry does not at
-		 * least grow downwards, this is the entry we need to grow.
-		 * otherwise we have two possible choices and we have to
-		 * select one.
-		 */
-		if (next_entry->eflags & MAP_ENTRY_GROWS_DOWN) {
-			/*
-			 * We have two choices; grow the entry closest to
-			 * the address to minimize the amount of growth.
-			 */
-			if (addr - prev_entry->end <= next_entry->start - addr)
-				stack_entry = prev_entry;
-			else
-				stack_entry = next_entry;
-		} else
-			stack_entry = prev_entry;
+		return (KERN_FAILURE);
 	}
-
-	if (stack_entry == next_entry) {
-		KASSERT(stack_entry->eflags & MAP_ENTRY_GROWS_DOWN, ("foo"));
-		KASSERT(addr < stack_entry->start, ("foo"));
-		end = (prev_entry != &map->header) ? prev_entry->end :
-		    stack_entry->start - stack_entry->avail_ssize;
-		grow_amount = roundup(stack_entry->start - addr, PAGE_SIZE);
-		max_grow = stack_entry->start - end;
-	} else {
-		KASSERT(stack_entry->eflags & MAP_ENTRY_GROWS_UP, ("foo"));
-		KASSERT(addr >= stack_entry->end, ("foo"));
-		end = (next_entry != &map->header) ? next_entry->start :
-		    stack_entry->end + stack_entry->avail_ssize;
-		grow_amount = roundup(addr + 1 - stack_entry->end, PAGE_SIZE);
-		max_grow = end - stack_entry->end;
-	}
-
-	if (grow_amount > stack_entry->avail_ssize) {
-		vm_map_unlock_read(map);
+	max_grow = gap_entry->end - gap_entry->start;
+	if (guard > max_grow)
 		return (KERN_NO_SPACE);
-	}
-
-	/*
-	 * If there is no longer enough space between the entries nogo, and
-	 * adjust the available space.  Note: this  should only happen if the
-	 * user has mapped into the stack area after the stack was created,
-	 * and is probably an error.
-	 *
-	 * This also effectively destroys any guard page the user might have
-	 * intended by limiting the stack size.
-	 */
-	if (grow_amount + (stack_guard_page ? PAGE_SIZE : 0) > max_grow) {
-		if (vm_map_lock_upgrade(map))
-			goto Retry;
-
-		stack_entry->avail_ssize = max_grow;
-
-		vm_map_unlock(map);
+	max_grow -= guard;
+	if (grow_amount > max_grow)
 		return (KERN_NO_SPACE);
-	}
 
-	is_procstack = (addr >= (vm_offset_t)vm->vm_maxsaddr) ? 1 : 0;
-
 	/*
 	 * If this is the main process stack, see if we're over the stack
 	 * limit.
 	 */
-	if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) {
-		vm_map_unlock_read(map);
+	is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr &&
+	    addr < (vm_offset_t)p->p_sysent->sv_usrstack;
+	if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim))
 		return (KERN_NO_SPACE);
-	}
+
 #ifdef RACCT
-	PROC_LOCK(p);
-	if (is_procstack &&
-	    racct_set(p, RACCT_STACK, ctob(vm->vm_ssize) + grow_amount)) {
+	if (racct_enable) {
+		PROC_LOCK(p);
+		if (is_procstack && racct_set(p, RACCT_STACK,
+		    ctob(vm->vm_ssize) + grow_amount)) {
+			PROC_UNLOCK(p);
+			return (KERN_NO_SPACE);
+		}
 		PROC_UNLOCK(p);
-		vm_map_unlock_read(map);
-		return (KERN_NO_SPACE);
 	}
-	PROC_UNLOCK(p);
 #endif
 
-	/* Round up the grow amount modulo sgrowsiz */
-	growsize = sgrowsiz;
-	grow_amount = roundup(grow_amount, growsize);
-	if (grow_amount > stack_entry->avail_ssize)
-		grow_amount = stack_entry->avail_ssize;
+	grow_amount = roundup(grow_amount, sgrowsiz);
+	if (grow_amount > max_grow)
+		grow_amount = max_grow;
 	if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) {
 		grow_amount = trunc_page((vm_size_t)stacklim) -
 		    ctob(vm->vm_ssize);
 	}
+
 #ifdef notyet
 	PROC_LOCK(p);
 	limit = racct_get_available(p, RACCT_STACK);
@@ -3507,97 +3740,79 @@
 	if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > limit))
 		grow_amount = limit - ctob(vm->vm_ssize);
 #endif
-	if (!old_mlock && map->flags & MAP_WIREFUTURE) {
+
+	if (!old_mlock && (map->flags & MAP_WIREFUTURE) != 0) {
 		if (ptoa(pmap_wired_count(map->pmap)) + grow_amount > lmemlim) {
-			vm_map_unlock_read(map);
 			rv = KERN_NO_SPACE;
 			goto out;
 		}
 #ifdef RACCT
-		PROC_LOCK(p);
-		if (racct_set(p, RACCT_MEMLOCK,
-		    ptoa(pmap_wired_count(map->pmap)) + grow_amount)) {
+		if (racct_enable) {
+			PROC_LOCK(p);
+			if (racct_set(p, RACCT_MEMLOCK,
+			    ptoa(pmap_wired_count(map->pmap)) + grow_amount)) {
+				PROC_UNLOCK(p);
+				rv = KERN_NO_SPACE;
+				goto out;
+			}
 			PROC_UNLOCK(p);
-			vm_map_unlock_read(map);
-			rv = KERN_NO_SPACE;
-			goto out;
 		}
-		PROC_UNLOCK(p);
 #endif
 	}
+
 	/* If we would blow our VMEM resource limit, no go */
 	if (map->size + grow_amount > vmemlim) {
-		vm_map_unlock_read(map);
 		rv = KERN_NO_SPACE;
 		goto out;
 	}
 #ifdef RACCT
-	PROC_LOCK(p);
-	if (racct_set(p, RACCT_VMEM, map->size + grow_amount)) {
+	if (racct_enable) {
+		PROC_LOCK(p);
+		if (racct_set(p, RACCT_VMEM, map->size + grow_amount)) {
+			PROC_UNLOCK(p);
+			rv = KERN_NO_SPACE;
+			goto out;
+		}
 		PROC_UNLOCK(p);
-		vm_map_unlock_read(map);
-		rv = KERN_NO_SPACE;
-		goto out;
 	}
-	PROC_UNLOCK(p);
 #endif
 
-	if (vm_map_lock_upgrade(map))
-		goto Retry;
+	if (vm_map_lock_upgrade(map)) {
+		gap_entry = NULL;
+		vm_map_lock_read(map);
+		goto retry;
+	}
 
-	if (stack_entry == next_entry) {
-		/*
-		 * Growing downward.
-		 */
-		/* Get the preliminary new entry start value */
-		addr = stack_entry->start - grow_amount;
-
-		/*
-		 * If this puts us into the previous entry, cut back our
-		 * growth to the available space. Also, see the note above.
-		 */
-		if (addr < end) {
-			stack_entry->avail_ssize = max_grow;
-			addr = end;
-			if (stack_guard_page)
-				addr += PAGE_SIZE;
+	if (grow_down) {
+		grow_start = gap_entry->end - grow_amount;
+		if (gap_entry->start + grow_amount == gap_entry->end) {
+			gap_start = gap_entry->start;
+			gap_end = gap_entry->end;
+			vm_map_entry_delete(map, gap_entry);
+			gap_deleted = true;
+		} else {
+			MPASS(gap_entry->start < gap_entry->end - grow_amount);
+			gap_entry->end -= grow_amount;
+			vm_map_entry_resize_free(map, gap_entry);
+			gap_deleted = false;
 		}
-
-		rv = vm_map_insert(map, NULL, 0, addr, stack_entry->start,
-		    next_entry->protection, next_entry->max_protection, 0);
-
-		/* Adjust the available stack space by the amount we grew. */
-		if (rv == KERN_SUCCESS) {
-			if (prev_entry != &map->header)
-				vm_map_clip_end(map, prev_entry, addr);
-			new_entry = prev_entry->next;
-			KASSERT(new_entry == stack_entry->prev, ("foo"));
-			KASSERT(new_entry->end == stack_entry->start, ("foo"));
-			KASSERT(new_entry->start == addr, ("foo"));
-			grow_amount = new_entry->end - new_entry->start;
-			new_entry->avail_ssize = stack_entry->avail_ssize -
-			    grow_amount;
-			stack_entry->eflags &= ~MAP_ENTRY_GROWS_DOWN;
-			new_entry->eflags |= MAP_ENTRY_GROWS_DOWN;
+		rv = vm_map_insert(map, NULL, 0, grow_start,
+		    grow_start + grow_amount,
+		    stack_entry->protection, stack_entry->max_protection,
+		    MAP_STACK_GROWS_DOWN);
+		if (rv != KERN_SUCCESS) {
+			if (gap_deleted) {
+				rv1 = vm_map_insert(map, NULL, 0, gap_start,
+				    gap_end, VM_PROT_NONE, VM_PROT_NONE,
+				    MAP_CREATE_GUARD | MAP_CREATE_STACK_GAP_DN);
+				MPASS(rv1 == KERN_SUCCESS);
+			} else {
+				gap_entry->end += grow_amount;
+				vm_map_entry_resize_free(map, gap_entry);
+			}
 		}
 	} else {
-		/*
-		 * Growing upward.
-		 */
-		addr = stack_entry->end + grow_amount;
-
-		/*
-		 * If this puts us into the next entry, cut back our growth
-		 * to the available space. Also, see the note above.
-		 */
-		if (addr > end) {
-			stack_entry->avail_ssize = end - stack_entry->end;
-			addr = end;
-			if (stack_guard_page)
-				addr -= PAGE_SIZE;
-		}
-
-		grow_amount = addr - stack_entry->end;
+		grow_start = stack_entry->end;
 		cred = stack_entry->cred;
 		if (cred == NULL && stack_entry->object.vm_object != NULL)
 			cred = stack_entry->object.vm_object->cred;
@@ -3605,43 +3820,40 @@
 			rv = KERN_NO_SPACE;
 		/* Grow the underlying object if applicable. */
 		else if (stack_entry->object.vm_object == NULL ||
-			 vm_object_coalesce(stack_entry->object.vm_object,
-			 stack_entry->offset,
-			 (vm_size_t)(stack_entry->end - stack_entry->start),
-			 (vm_size_t)grow_amount, cred != NULL)) {
-			map->size += (addr - stack_entry->end);
-			/* Update the current entry. */
-			stack_entry->end = addr;
-			stack_entry->avail_ssize -= grow_amount;
+		    vm_object_coalesce(stack_entry->object.vm_object,
+		    stack_entry->offset,
+		    (vm_size_t)(stack_entry->end - stack_entry->start),
+		    (vm_size_t)grow_amount, cred != NULL)) {
+			if (gap_entry->start + grow_amount == gap_entry->end)
+				vm_map_entry_delete(map, gap_entry);
+			else
+				gap_entry->start += grow_amount;
+			stack_entry->end += grow_amount;
+			map->size += grow_amount;
 			vm_map_entry_resize_free(map, stack_entry);
 			rv = KERN_SUCCESS;
-
-			if (next_entry != &map->header)
-				vm_map_clip_start(map, next_entry, addr);
 		} else
 			rv = KERN_FAILURE;
 	}
-
 	if (rv == KERN_SUCCESS && is_procstack)
 		vm->vm_ssize += btoc(grow_amount);
 
-	vm_map_unlock(map);
-
 	/*
 	 * Heed the MAP_WIREFUTURE flag if it was set for this process.
 	 */
-	if (rv == KERN_SUCCESS && (map->flags & MAP_WIREFUTURE)) {
-		vm_map_wire(map,
-		    (stack_entry == next_entry) ? addr : addr - grow_amount,
-		    (stack_entry == next_entry) ? stack_entry->start : addr,
+	if (rv == KERN_SUCCESS && (map->flags & MAP_WIREFUTURE) != 0) {
+		vm_map_unlock(map);
+		vm_map_wire(map, grow_start, grow_start + grow_amount,
 		    (p->p_flag & P_SYSTEM)
 		    ? VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES
 		    : VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES);
-	}
+		vm_map_lock_read(map);
+	} else
+		vm_map_lock_downgrade(map);
 
 out:
 #ifdef RACCT
-	if (rv != KERN_SUCCESS) {
+	if (racct_enable && rv != KERN_SUCCESS) {
 		PROC_LOCK(p);
 		error = racct_set(p, RACCT_VMEM, map->size);
 		KASSERT(error == 0, ("decreasing RACCT_VMEM failed"));
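
The reworked growth path above sizes each request against an explicit stack-gap map entry (gap_entry) minus the reserved guard bytes, instead of tracking avail_ssize on the stack entry itself. From userland the same machinery is exercised through MAP_STACK mappings. A minimal sketch, assuming a MidnightBSD/FreeBSD 10-era <sys/mman.h> that provides MAP_STACK; everything in it is illustrative and not part of this commit:

#include <sys/mman.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	size_t len = 1024 * 1024;		/* 1 MiB reservation */
	long pgsz = sysconf(_SC_PAGESIZE);
	char *base, *top;

	/* MAP_STACK reserves the range; typically only its top is mapped up front. */
	base = mmap(NULL, len, PROT_READ | PROT_WRITE,
	    MAP_STACK | MAP_ANON, -1, 0);
	if (base == MAP_FAILED) {
		perror("mmap(MAP_STACK)");
		return (1);
	}
	top = base + len;

	/* Touching just below the top always works... */
	memset(top - pgsz, 0xa5, (size_t)pgsz);

	/* ...and faulting further down grows the stack entry downward,
	 * consuming part of the gap entry the kernel keeps below it. */
	memset(top - 64 * pgsz, 0x5a, (size_t)pgsz);

	printf("stack region %p..%p grew on demand\n",
	    (void *)base, (void *)top);
	munmap(base, len);
	return (0);
}

A touch that reached past the reservation (into the guard portion accounted for above) would take SIGSEGV instead of growing the stack.
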
@@ -3669,7 +3881,9 @@
 	struct vmspace *oldvmspace = p->p_vmspace;
 	struct vmspace *newvmspace;
 
-	newvmspace = vmspace_alloc(minuser, maxuser);
+	KASSERT((curthread->td_pflags & TDP_EXECVMSPC) == 0,
+	    ("vmspace_exec recursed"));
+	newvmspace = vmspace_alloc(minuser, maxuser, NULL);
 	if (newvmspace == NULL)
 		return (ENOMEM);
 	newvmspace->vm_swrss = oldvmspace->vm_swrss;
@@ -3685,7 +3899,7 @@
 	PROC_VMSPACE_UNLOCK(p);
 	if (p == curthread->td_proc)
 		pmap_activate(curthread);
-	vmspace_free(oldvmspace);
+	curthread->td_pflags |= TDP_EXECVMSPC;
 	return (0);
 }
 
@@ -3759,10 +3973,11 @@
 	vm_size_t size;
 	struct ucred *cred;
 
-RetryLookup:;
+RetryLookup:
 
 	vm_map_lock_read(map);
 
+RetryLookupLocked:
 	/*
 	 * Lookup the faulting address.
 	 */
@@ -3788,17 +4003,24 @@
 	 * Check whether this task is allowed to have this page.
 	 */
 	prot = entry->protection;
-	fault_type &= (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
+	if ((fault_typea & VM_PROT_FAULT_LOOKUP) != 0) {
+		fault_typea &= ~VM_PROT_FAULT_LOOKUP;
+		if (prot == VM_PROT_NONE && map != kernel_map &&
+		    (entry->eflags & MAP_ENTRY_GUARD) != 0 &&
+		    (entry->eflags & (MAP_ENTRY_STACK_GAP_DN |
+		    MAP_ENTRY_STACK_GAP_UP)) != 0 &&
+		    vm_map_growstack(map, vaddr, entry) == KERN_SUCCESS)
+			goto RetryLookupLocked;
+	}
+	fault_type &= VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE;
 	if ((fault_type & prot) != fault_type || prot == VM_PROT_NONE) {
 		vm_map_unlock_read(map);
 		return (KERN_PROTECTION_FAILURE);
 	}
-	if ((entry->eflags & MAP_ENTRY_USER_WIRED) &&
-	    (entry->eflags & MAP_ENTRY_COW) &&
-	    (fault_type & VM_PROT_WRITE)) {
-		vm_map_unlock_read(map);
-		return (KERN_PROTECTION_FAILURE);
-	}
+	KASSERT((prot & VM_PROT_WRITE) == 0 || (entry->eflags &
+	    (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY)) !=
+	    (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY),
+	    ("entry %p flags %x", entry, entry->eflags));
 	if ((fault_typea & VM_PROT_COPY) != 0 &&
 	    (entry->max_protection & VM_PROT_WRITE) == 0 &&
 	    (entry->eflags & MAP_ENTRY_COW) == 0) {
@@ -3862,10 +4084,10 @@
 				crfree(entry->cred);
 				entry->cred = NULL;
 			} else if (entry->cred != NULL) {
-				VM_OBJECT_LOCK(eobject);
+				VM_OBJECT_WLOCK(eobject);
 				eobject->cred = entry->cred;
 				eobject->charge = size;
-				VM_OBJECT_UNLOCK(eobject);
+				VM_OBJECT_WUNLOCK(eobject);
 				entry->cred = NULL;
 			}
 
@@ -3890,10 +4112,10 @@
 		    atop(size));
 		entry->offset = 0;
 		if (entry->cred != NULL) {
-			VM_OBJECT_LOCK(entry->object.vm_object);
+			VM_OBJECT_WLOCK(entry->object.vm_object);
 			entry->object.vm_object->cred = entry->cred;
 			entry->object.vm_object->charge = size;
-			VM_OBJECT_UNLOCK(entry->object.vm_object);
+			VM_OBJECT_WUNLOCK(entry->object.vm_object);
 			entry->cred = NULL;
 		}
 		vm_map_lock_downgrade(map);
@@ -3952,10 +4174,6 @@
 	fault_type &= VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE;
 	if ((fault_type & prot) != fault_type)
 		return (KERN_PROTECTION_FAILURE);
-	if ((entry->eflags & MAP_ENTRY_USER_WIRED) &&
-	    (entry->eflags & MAP_ENTRY_COW) &&
-	    (fault_type & VM_PROT_WRITE))
-		return (KERN_PROTECTION_FAILURE);
 
 	/*
 	 * If this page is not pageable, we have to get it for all possible
@@ -4016,32 +4234,21 @@
 
 #include <ddb/ddb.h>
 
-/*
- *	vm_map_print:	[ debug ]
- */
-DB_SHOW_COMMAND(map, vm_map_print)
+static void
+vm_map_print(vm_map_t map)
 {
-	static int nlines;
-	/* XXX convert args. */
-	vm_map_t map = (vm_map_t)addr;
-	boolean_t full = have_addr;
-
 	vm_map_entry_t entry;
 
 	db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n",
 	    (void *)map,
 	    (void *)map->pmap, map->nentries, map->timestamp);
-	nlines++;
 
-	if (!full && db_indent)
-		return;
-
 	db_indent += 2;
 	for (entry = map->header.next; entry != &map->header;
 	    entry = entry->next) {
-		db_iprintf("map entry %p: start=%p, end=%p\n",
-		    (void *)entry, (void *)entry->start, (void *)entry->end);
-		nlines++;
+		db_iprintf("map entry %p: start=%p, end=%p, eflags=%#x, \n",
+		    (void *)entry, (void *)entry->start, (void *)entry->end,
+		    entry->eflags);
 		{
 			static char *inheritance_name[4] =
 			{"share", "copy", "none", "donate_copy"};
@@ -4057,14 +4264,11 @@
 			db_printf(", share=%p, offset=0x%jx\n",
 			    (void *)entry->object.sub_map,
 			    (uintmax_t)entry->offset);
-			nlines++;
 			if ((entry->prev == &map->header) ||
 			    (entry->prev->object.sub_map !=
 				entry->object.sub_map)) {
 				db_indent += 2;
-				vm_map_print((db_expr_t)(intptr_t)
-					     entry->object.sub_map,
-					     full, 0, (char *)0);
+				vm_map_print((vm_map_t)entry->object.sub_map);
 				db_indent -= 2;
 			}
 		} else {
@@ -4081,7 +4285,6 @@
 				db_printf(", copy (%s)",
 				    (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
 			db_printf("\n");
-			nlines++;
 
 			if ((entry->prev == &map->header) ||
 			    (entry->prev->object.vm_object !=
@@ -4089,24 +4292,30 @@
 				db_indent += 2;
 				vm_object_print((db_expr_t)(intptr_t)
 						entry->object.vm_object,
-						full, 0, (char *)0);
-				nlines += 4;
+						0, 0, (char *)0);
 				db_indent -= 2;
 			}
 		}
 	}
 	db_indent -= 2;
-	if (db_indent == 0)
-		nlines = 0;
 }
 
+DB_SHOW_COMMAND(map, map)
+{
 
+	if (!have_addr) {
+		db_printf("usage: show map <addr>\n");
+		return;
+	}
+	vm_map_print((vm_map_t)addr);
+}
+
 DB_SHOW_COMMAND(procvm, procvm)
 {
 	struct proc *p;
 
 	if (have_addr) {
-		p = (struct proc *) addr;
+		p = db_lookup_proc(addr);
 	} else {
 		p = curproc;
 	}
@@ -4115,7 +4324,7 @@
 	    (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map,
 	    (void *)vmspace_pmap(p->p_vmspace));
 
-	vm_map_print((db_expr_t)(intptr_t)&p->p_vmspace->vm_map, 1, 0, NULL);
+	vm_map_print((vm_map_t)&p->p_vmspace->vm_map);
 }
 
 #endif /* DDB */
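
Taken together, the vm_map_lookup() and vm_map_growstack() changes above move automatic stack extension into the fault lookup: a lookup made with VM_PROT_FAULT_LOOKUP that lands in a MAP_ENTRY_GUARD entry carrying MAP_ENTRY_STACK_GAP_DN/UP grows the adjacent stack entry and retries. For an ordinary process nothing visible changes: the main stack still extends itself on demand up to RLIMIT_STACK. A hedged illustration using bounded recursion, kept well under the default limit:

#include <sys/types.h>
#include <sys/resource.h>
#include <stdint.h>
#include <stdio.h>

/* Burn roughly 64 KiB of stack per call, to a bounded depth. */
static void
descend(int depth)
{
	volatile char frame[64 * 1024];
	size_t i;

	for (i = 0; i < sizeof(frame); i++)
		frame[i] = (char)depth;
	if (depth > 0)
		descend(depth - 1);
	frame[0] = frame[sizeof(frame) - 1];	/* keep the frame live */
}

int
main(void)
{
	struct rlimit rl;

	if (getrlimit(RLIMIT_STACK, &rl) == 0)
		printf("soft stack limit: %ju bytes\n", (uintmax_t)rl.rlim_cur);

	/*
	 * Roughly 4 MiB of stack; each new page is supplied on fault as
	 * the stack entry grows into its gap entry.
	 */
	descend(64);
	printf("recursed through ~4 MiB of stack without any explicit mmap\n");
	return (0);
}
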

Modified: trunk/sys/vm/vm_map.h
===================================================================
--- trunk/sys/vm/vm_map.h	2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm_map.h	2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
@@ -57,7 +58,7 @@
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  *
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/vm/vm_map.h 321718 2017-07-30 10:49:13Z kib $
  */
 
 /*
@@ -103,7 +104,7 @@
 	struct vm_map_entry *right;	/* right child in binary search tree */
 	vm_offset_t start;		/* start address */
 	vm_offset_t end;		/* end address */
-	vm_offset_t avail_ssize;	/* amt can grow if this is a stack */
+	vm_offset_t pad0;
 	vm_size_t adj_free;		/* amount of adjacent free space */
 	vm_size_t max_free;		/* max free space in subtree */
 	union vm_map_object object;	/* object I point to */
@@ -116,6 +117,7 @@
 	int wired_count;		/* can be paged if = 0 */
 	vm_pindex_t next_read;		/* index of the next sequential read */
 	struct ucred *cred;		/* tmp storage for creator ref */
+	struct thread *wiring_thread;
 };
 
 #define MAP_ENTRY_NOSYNC		0x0001
@@ -141,6 +143,9 @@
 
 #define	MAP_ENTRY_WIRE_SKIPPED		0x4000
 #define	MAP_ENTRY_VN_WRITECNT		0x8000	/* writeable vnode mapping */
+#define	MAP_ENTRY_GUARD			0x10000
+#define	MAP_ENTRY_STACK_GAP_DN		0x20000
+#define	MAP_ENTRY_STACK_GAP_UP		0x40000
 
 #ifdef	_KERNEL
 static __inline u_char
@@ -314,6 +319,8 @@
 #define MAP_PREFAULT		0x0008
 #define MAP_PREFAULT_PARTIAL	0x0010
 #define MAP_DISABLE_SYNCER	0x0020
+#define	MAP_CHECK_EXCL		0x0040
+#define	MAP_CREATE_GUARD	0x0080
 #define MAP_DISABLE_COREDUMP	0x0100
 #define MAP_PREFAULT_MADVISE	0x0200	/* from (user) madvise request */
 #define	MAP_VN_WRITECOUNT	0x0400
@@ -321,13 +328,15 @@
 #define	MAP_STACK_GROWS_UP	0x2000
 #define	MAP_ACC_CHARGED		0x4000
 #define	MAP_ACC_NO_CHARGE	0x8000
+#define	MAP_CREATE_STACK_GAP_UP	0x10000
+#define	MAP_CREATE_STACK_GAP_DN	0x20000
 
 /*
  * vm_fault option flags
  */
-#define VM_FAULT_NORMAL 0		/* Nothing special */
-#define VM_FAULT_CHANGE_WIRING 1	/* Change the wiring as appropriate */
-#define	VM_FAULT_DIRTY 2		/* Dirty the page; use w/VM_PROT_COPY */
+#define	VM_FAULT_NORMAL	0		/* Nothing special */
+#define	VM_FAULT_WIRE	1		/* Wire the mapped page */
+#define	VM_FAULT_DIRTY	2		/* Dirty the page; use w/VM_PROT_COPY */
 
 /*
  * Initially, mappings are slightly sequential.  The maximum window size must
@@ -338,14 +347,16 @@
 #define	VM_FAULT_READ_AHEAD_MAX		min(atop(MAXPHYS) - 1, UINT8_MAX)
 
 /*
- * The following "find_space" options are supported by vm_map_find()
+ * The following "find_space" options are supported by vm_map_find().
+ *
+ * For VMFS_ALIGNED_SPACE, the desired alignment is specified to
+ * the macro argument as log base 2 of the desired alignment.
  */
 #define	VMFS_NO_SPACE		0	/* don't find; use the given range */
 #define	VMFS_ANY_SPACE		1	/* find a range with any alignment */
-#define	VMFS_ALIGNED_SPACE	2	/* find a superpage-aligned range */
-#if defined(__mips__)
-#define	VMFS_TLB_ALIGNED_SPACE	3	/* find a TLB entry aligned range */
-#endif
+#define	VMFS_OPTIMAL_SPACE	2	/* find a range with optimal alignment*/
+#define	VMFS_SUPER_SPACE	3	/* find a superpage-aligned range */
+#define	VMFS_ALIGNED_SPACE(x)	((x) << 8) /* find a range with fixed alignment */
 
 /*
  * vm_map_wire and vm_map_unwire option flags
@@ -363,7 +374,9 @@
 vm_map_t vm_map_create(pmap_t, vm_offset_t, vm_offset_t);
 int vm_map_delete(vm_map_t, vm_offset_t, vm_offset_t);
 int vm_map_find(vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t *, vm_size_t,
-    int, vm_prot_t, vm_prot_t, int);
+    vm_offset_t, int, vm_prot_t, vm_prot_t, int);
+int vm_map_find_min(vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t *,
+    vm_size_t, vm_offset_t, vm_offset_t, int, vm_prot_t, vm_prot_t, int);
 int vm_map_fixed(vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t, vm_size_t,
     vm_prot_t, vm_prot_t, int);
 int vm_map_findspace (vm_map_t, vm_offset_t, vm_size_t, vm_offset_t *);
@@ -385,9 +398,7 @@
 int vm_map_sync(vm_map_t, vm_offset_t, vm_offset_t, boolean_t, boolean_t);
 int vm_map_madvise (vm_map_t, vm_offset_t, vm_offset_t, int);
 void vm_map_simplify_entry (vm_map_t, vm_map_entry_t);
-void vm_init2 (void);
 int vm_map_stack (vm_map_t, vm_offset_t, vm_size_t, vm_prot_t, vm_prot_t, int);
-int vm_map_growstack (struct proc *p, vm_offset_t addr);
 int vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end,
     int flags);
 int vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end,

Modified: trunk/sys/vm/vm_meter.c
===================================================================
--- trunk/sys/vm/vm_meter.c	2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm_meter.c	2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
@@ -30,7 +31,7 @@
  */
 
 #include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/vm/vm_meter.c 311049 2017-01-02 08:31:29Z kib $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -39,6 +40,7 @@
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/resource.h>
+#include <sys/rwlock.h>
 #include <sys/sx.h>
 #include <sys/vmmeter.h>
 #include <sys/smp.h>
@@ -55,21 +57,21 @@
 struct vmmeter cnt;
 
 SYSCTL_UINT(_vm, VM_V_FREE_MIN, v_free_min,
-	CTLFLAG_RW, &cnt.v_free_min, 0, "");
+	CTLFLAG_RW, &cnt.v_free_min, 0, "Minimum low-free-pages threshold");
 SYSCTL_UINT(_vm, VM_V_FREE_TARGET, v_free_target,
-	CTLFLAG_RW, &cnt.v_free_target, 0, "");
+	CTLFLAG_RW, &cnt.v_free_target, 0, "Desired free pages");
 SYSCTL_UINT(_vm, VM_V_FREE_RESERVED, v_free_reserved,
-	CTLFLAG_RW, &cnt.v_free_reserved, 0, "");
+	CTLFLAG_RW, &cnt.v_free_reserved, 0, "Pages reserved for deadlock");
 SYSCTL_UINT(_vm, VM_V_INACTIVE_TARGET, v_inactive_target,
-	CTLFLAG_RW, &cnt.v_inactive_target, 0, "");
+	CTLFLAG_RW, &cnt.v_inactive_target, 0, "Pages desired inactive");
 SYSCTL_UINT(_vm, VM_V_CACHE_MIN, v_cache_min,
-	CTLFLAG_RW, &cnt.v_cache_min, 0, "");
+	CTLFLAG_RW, &cnt.v_cache_min, 0, "Min pages on cache queue");
 SYSCTL_UINT(_vm, VM_V_CACHE_MAX, v_cache_max,
-	CTLFLAG_RW, &cnt.v_cache_max, 0, "");
+	CTLFLAG_RW, &cnt.v_cache_max, 0, "Max pages on cache queue");
 SYSCTL_UINT(_vm, VM_V_PAGEOUT_FREE_MIN, v_pageout_free_min,
-	CTLFLAG_RW, &cnt.v_pageout_free_min, 0, "");
+	CTLFLAG_RW, &cnt.v_pageout_free_min, 0, "Min pages reserved for kernel");
 SYSCTL_UINT(_vm, OID_AUTO, v_free_severe,
-	CTLFLAG_RW, &cnt.v_free_severe, 0, "");
+	CTLFLAG_RW, &cnt.v_free_severe, 0, "Severe page depletion point");
 
 static int
 sysctl_vm_loadavg(SYSCTL_HANDLER_ARGS)
@@ -92,50 +94,40 @@
     CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_loadavg, "S,loadavg",
     "Machine loadaverage history");
 
+/*
+ * This function aims to determine if the object is mapped,
+ * specifically, if it is referenced by a vm_map_entry.  Because
+ * objects occasionally acquire transient references that do not
+ * represent a mapping, the method used here is inexact.  However, it
+ * has very low overhead and is good enough for the advisory
+ * vm.vmtotal sysctl.
+ */
+static bool
+is_object_active(vm_object_t obj)
+{
+
+	return (obj->ref_count > obj->shadow_count);
+}
+
 static int
 vmtotal(SYSCTL_HANDLER_ARGS)
 {
-	struct proc *p;
 	struct vmtotal total;
-	vm_map_entry_t entry;
 	vm_object_t object;
-	vm_map_t map;
-	int paging;
+	struct proc *p;
 	struct thread *td;
-	struct vmspace *vm;
 
 	bzero(&total, sizeof(total));
+
 	/*
-	 * Mark all objects as inactive.
-	 */
-	mtx_lock(&vm_object_list_mtx);
-	TAILQ_FOREACH(object, &vm_object_list, object_list) {
-		if (!VM_OBJECT_TRYLOCK(object)) {
-			/*
-			 * Avoid a lock-order reversal.  Consequently,
-			 * the reported number of active pages may be
-			 * greater than the actual number.
-			 */
-			continue;
-		}
-		vm_object_clear_flag(object, OBJ_ACTIVE);
-		VM_OBJECT_UNLOCK(object);
-	}
-	mtx_unlock(&vm_object_list_mtx);
-	/*
 	 * Calculate process statistics.
 	 */
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
-		if (p->p_flag & P_SYSTEM)
+		if ((p->p_flag & P_SYSTEM) != 0)
 			continue;
 		PROC_LOCK(p);
-		switch (p->p_state) {
-		case PRS_NEW:
-			PROC_UNLOCK(p);
-			continue;
-			break;
-		default:
+		if (p->p_state != PRS_NEW) {
 			FOREACH_THREAD_IN_PROC(p, td) {
 				thread_lock(td);
 				switch (td->td_state) {
@@ -142,13 +134,16 @@
 				case TDS_INHIBITED:
 					if (TD_IS_SWAPPED(td))
 						total.t_sw++;
-					else if (TD_IS_SLEEPING(td) &&
-					    td->td_priority <= PZERO)
-						total.t_dw++;
-					else
-						total.t_sl++;
+					else if (TD_IS_SLEEPING(td)) {
+						if (td->td_priority <= PZERO)
+							total.t_dw++;
+						else
+							total.t_sl++;
+						if (td->td_wchan ==
+						    &cnt.v_free_count)
+							total.t_pw++;
+					}
 					break;
-
 				case TDS_CAN_RUN:
 					total.t_sw++;
 					break;
@@ -155,8 +150,7 @@
 				case TDS_RUNQ:
 				case TDS_RUNNING:
 					total.t_rq++;
-					thread_unlock(td);
-					continue;
+					break;
 				default:
 					break;
 				}
@@ -164,29 +158,6 @@
 			}
 		}
 		PROC_UNLOCK(p);
-		/*
-		 * Note active objects.
-		 */
-		paging = 0;
-		vm = vmspace_acquire_ref(p);
-		if (vm == NULL)
-			continue;
-		map = &vm->vm_map;
-		vm_map_lock_read(map);
-		for (entry = map->header.next;
-		    entry != &map->header; entry = entry->next) {
-			if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) ||
-			    (object = entry->object.vm_object) == NULL)
-				continue;
-			VM_OBJECT_LOCK(object);
-			vm_object_set_flag(object, OBJ_ACTIVE);
-			paging |= object->paging_in_progress;
-			VM_OBJECT_UNLOCK(object);
-		}
-		vm_map_unlock_read(map);
-		vmspace_free(vm);
-		if (paging)
-			total.t_pw++;
 	}
 	sx_sunlock(&allproc_lock);
 	/*
@@ -195,12 +166,11 @@
 	mtx_lock(&vm_object_list_mtx);
 	TAILQ_FOREACH(object, &vm_object_list, object_list) {
 		/*
-		 * Perform unsynchronized reads on the object to avoid
-		 * a lock-order reversal.  In this case, the lack of
-		 * synchronization should not impair the accuracy of
-		 * the reported statistics. 
+		 * Perform unsynchronized reads on the object.  In
+		 * this case, the lack of synchronization should not
+		 * impair the accuracy of the reported statistics.
 		 */
-		if (object->type == OBJT_DEVICE || object->type == OBJT_SG) {
+		if ((object->flags & OBJ_FICTITIOUS) != 0) {
 			/*
 			 * Devices, like /dev/mem, will badly skew our totals.
 			 */
@@ -213,9 +183,18 @@
 			 */
 			continue;
 		}
+		if (object->ref_count == 1 &&
+		    (object->flags & OBJ_NOSPLIT) != 0) {
+			/*
+			 * Also skip otherwise unreferenced swap
+			 * objects backing tmpfs vnodes, and POSIX or
+			 * SysV shared memory.
+			 */
+			continue;
+		}
 		total.t_vm += object->size;
 		total.t_rm += object->resident_page_count;
-		if (object->flags & OBJ_ACTIVE) {
+		if (is_object_active(object)) {
 			total.t_avm += object->size;
 			total.t_arm += object->resident_page_count;
 		}
@@ -223,7 +202,7 @@
 			/* shared object */
 			total.t_vmshr += object->size;
 			total.t_rmshr += object->resident_page_count;
-			if (object->flags & OBJ_ACTIVE) {
+			if (is_object_active(object)) {
 				total.t_avmshr += object->size;
 				total.t_armshr += object->resident_page_count;
 			}
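
The rewritten vmtotal() above drops the two-pass OBJ_ACTIVE marking and the per-process map walk, and instead classifies an object as active with is_object_active(), i.e. ref_count > shadow_count, which the new comment explicitly calls an inexact but cheap approximation. The result is still published through the vm.vmtotal sysctl; a small consumer, assuming the struct vmtotal layout of <sys/vmmeter.h> on this branch (field widths have changed between releases, hence the casts):

#include <sys/types.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <stdio.h>

int
main(void)
{
	struct vmtotal t;
	size_t len = sizeof(t);

	if (sysctlbyname("vm.vmtotal", &t, &len, NULL, 0) == -1) {
		perror("sysctlbyname(vm.vmtotal)");
		return (1);
	}
	/* Page counts; t_avm/t_arm now follow the is_object_active() test. */
	printf("runnable threads      %ld\n", (long)t.t_rq);
	printf("real memory (pages)   %ld\n", (long)t.t_rm);
	printf("active real (pages)   %ld\n", (long)t.t_arm);
	printf("free pages            %ld\n", (long)t.t_free);
	return (0);
}
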
@@ -270,104 +249,63 @@
 	"VM meter vm stats");
 SYSCTL_NODE(_vm_stats, OID_AUTO, misc, CTLFLAG_RW, 0, "VM meter misc stats");
 
-SYSCTL_PROC(_vm_stats_sys, OID_AUTO, v_swtch, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
-	&cnt.v_swtch, 0, vcnt, "IU", "Context switches");
-SYSCTL_PROC(_vm_stats_sys, OID_AUTO, v_trap, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
-	&cnt.v_trap, 0, vcnt, "IU", "Traps");
-SYSCTL_PROC(_vm_stats_sys, OID_AUTO, v_syscall, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
-	&cnt.v_syscall, 0, vcnt, "IU", "Syscalls");
-SYSCTL_PROC(_vm_stats_sys, OID_AUTO, v_intr, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
-	&cnt.v_intr, 0, vcnt, "IU", "Hardware interrupts");
-SYSCTL_PROC(_vm_stats_sys, OID_AUTO, v_soft, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
-	&cnt.v_soft, 0, vcnt, "IU", "Software interrupts");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_vm_faults, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
-	&cnt.v_vm_faults, 0, vcnt, "IU", "VM faults");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_cow_faults, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
-	&cnt.v_cow_faults, 0, vcnt, "IU", "COW faults");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_cow_optim, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
-	&cnt.v_cow_optim, 0, vcnt, "IU", "Optimized COW faults");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_zfod, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
-	&cnt.v_zfod, 0, vcnt, "IU", "Zero fill");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_ozfod, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
-	&cnt.v_ozfod, 0, vcnt, "IU", "Optimized zero fill");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_swapin, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
-	&cnt.v_swapin, 0, vcnt, "IU", "Swapin operations");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_swapout, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
-	&cnt.v_swapout, 0, vcnt, "IU", "Swapout operations");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_swappgsin, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
-	&cnt.v_swappgsin, 0, vcnt, "IU", "Swapin pages");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_swappgsout, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
-	&cnt.v_swappgsout, 0, vcnt, "IU", "Swapout pages");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_vnodein, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
-	&cnt.v_vnodein, 0, vcnt, "IU", "Vnodein operations");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_vnodeout, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
-	&cnt.v_vnodeout, 0, vcnt, "IU", "Vnodeout operations");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_vnodepgsin, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
-	&cnt.v_vnodepgsin, 0, vcnt, "IU", "Vnodein pages");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_vnodepgsout, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
-	&cnt.v_vnodepgsout, 0, vcnt, "IU", "Vnodeout pages");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_intrans, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
-	&cnt.v_intrans, 0, vcnt, "IU", "In transit page blocking");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_reactivated, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
-	&cnt.v_reactivated, 0, vcnt, "IU", "Reactivated pages");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_pdwakeups, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
-	&cnt.v_pdwakeups, 0, vcnt, "IU", "Pagedaemon wakeups");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_pdpages, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
-	&cnt.v_pdpages, 0, vcnt, "IU", "Pagedaemon page scans");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_tcached, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
-	&cnt.v_tcached, 0, vcnt, "IU", "Total pages cached");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_dfree, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
-	&cnt.v_dfree, 0, vcnt, "IU", "");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_pfree, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
-	&cnt.v_pfree, 0, vcnt, "IU", "");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_tfree, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
-	&cnt.v_tfree, 0, vcnt, "IU", "");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_page_size, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
-	&cnt.v_page_size, 0, vcnt, "IU", "");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_page_count, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
-	&cnt.v_page_count, 0, vcnt, "IU", "");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_free_reserved, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
-	&cnt.v_free_reserved, 0, vcnt, "IU", "");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_free_target, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
-	&cnt.v_free_target, 0, vcnt, "IU", "");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_free_min, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
-	&cnt.v_free_min, 0, vcnt, "IU", "");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_free_count, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
-	&cnt.v_free_count, 0, vcnt, "IU", "");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_wire_count, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
-	&cnt.v_wire_count, 0, vcnt, "IU", "");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_active_count, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
-	&cnt.v_active_count, 0, vcnt, "IU", "");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_inactive_target, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
-	&cnt.v_inactive_target, 0, vcnt, "IU", "");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_inactive_count, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
-	&cnt.v_inactive_count, 0, vcnt, "IU", "");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_cache_count, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
-	&cnt.v_cache_count, 0, vcnt, "IU", "");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_cache_min, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
-	&cnt.v_cache_min, 0, vcnt, "IU", "");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_cache_max, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
-	&cnt.v_cache_max, 0, vcnt, "IU", "");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_pageout_free_min, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
-	&cnt.v_pageout_free_min, 0, vcnt, "IU", "");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_interrupt_free_min, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
-	&cnt.v_interrupt_free_min, 0, vcnt, "IU", "");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_forks, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
-	&cnt.v_forks, 0, vcnt, "IU", "Number of fork() calls");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_vforks, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
-	&cnt.v_vforks, 0, vcnt, "IU", "Number of vfork() calls");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_rforks, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
-	&cnt.v_rforks, 0, vcnt, "IU", "Number of rfork() calls");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_kthreads, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
-	&cnt.v_kthreads, 0, vcnt, "IU", "Number of fork() calls by kernel");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_forkpages, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
-	&cnt.v_forkpages, 0, vcnt, "IU", "VM pages affected by fork()");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_vforkpages, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
-	&cnt.v_vforkpages, 0, vcnt, "IU", "VM pages affected by vfork()");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_rforkpages, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
-	&cnt.v_rforkpages, 0, vcnt, "IU", "VM pages affected by rfork()");
-SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_kthreadpages, CTLTYPE_UINT|CTLFLAG_RD|CTLFLAG_MPSAFE,
-	&cnt.v_kthreadpages, 0, vcnt, "IU", "VM pages affected by fork() by kernel");
+#define	VM_STATS(parent, var, descr) \
+	SYSCTL_PROC(parent, OID_AUTO, var, \
+	    CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, &cnt.var, 0, vcnt, \
+	    "IU", descr)
+#define	VM_STATS_VM(var, descr)		VM_STATS(_vm_stats_vm, var, descr)
+#define	VM_STATS_SYS(var, descr)	VM_STATS(_vm_stats_sys, var, descr)
 
-SYSCTL_INT(_vm_stats_misc, OID_AUTO,
-	zero_page_count, CTLFLAG_RD, &vm_page_zero_count, 0, "");
+VM_STATS_SYS(v_swtch, "Context switches");
+VM_STATS_SYS(v_trap, "Traps");
+VM_STATS_SYS(v_syscall, "System calls");
+VM_STATS_SYS(v_intr, "Device interrupts");
+VM_STATS_SYS(v_soft, "Software interrupts");
+VM_STATS_VM(v_vm_faults, "Address memory faults");
+VM_STATS_VM(v_io_faults, "Page faults requiring I/O");
+VM_STATS_VM(v_cow_faults, "Copy-on-write faults");
+VM_STATS_VM(v_cow_optim, "Optimized COW faults");
+VM_STATS_VM(v_zfod, "Pages zero-filled on demand");
+VM_STATS_VM(v_ozfod, "Optimized zero fill pages");
+VM_STATS_VM(v_swapin, "Swap pager pageins");
+VM_STATS_VM(v_swapout, "Swap pager pageouts");
+VM_STATS_VM(v_swappgsin, "Swap pages swapped in");
+VM_STATS_VM(v_swappgsout, "Swap pages swapped out");
+VM_STATS_VM(v_vnodein, "Vnode pager pageins");
+VM_STATS_VM(v_vnodeout, "Vnode pager pageouts");
+VM_STATS_VM(v_vnodepgsin, "Vnode pages paged in");
+VM_STATS_VM(v_vnodepgsout, "Vnode pages paged out");
+VM_STATS_VM(v_intrans, "In transit page faults");
+VM_STATS_VM(v_reactivated, "Pages reactivated from free list");
+VM_STATS_VM(v_pdwakeups, "Pagedaemon wakeups");
+VM_STATS_VM(v_pdpages, "Pages analyzed by pagedaemon");
+VM_STATS_VM(v_tcached, "Total pages cached");
+VM_STATS_VM(v_dfree, "Pages freed by pagedaemon");
+VM_STATS_VM(v_pfree, "Pages freed by exiting processes");
+VM_STATS_VM(v_tfree, "Total pages freed");
+VM_STATS_VM(v_page_size, "Page size in bytes");
+VM_STATS_VM(v_page_count, "Total number of pages in system");
+VM_STATS_VM(v_free_reserved, "Pages reserved for deadlock");
+VM_STATS_VM(v_free_target, "Pages desired free");
+VM_STATS_VM(v_free_min, "Minimum low-free-pages threshold");
+VM_STATS_VM(v_free_count, "Free pages");
+VM_STATS_VM(v_wire_count, "Wired pages");
+VM_STATS_VM(v_active_count, "Active pages");
+VM_STATS_VM(v_inactive_target, "Desired inactive pages");
+VM_STATS_VM(v_inactive_count, "Inactive pages");
+VM_STATS_VM(v_cache_count, "Pages on cache queue");
+VM_STATS_VM(v_cache_min, "Min pages on cache queue");
+VM_STATS_VM(v_cache_max, "Max pages on cached queue");
+VM_STATS_VM(v_pageout_free_min, "Min pages reserved for kernel");
+VM_STATS_VM(v_interrupt_free_min, "Reserved pages for interrupt code");
+VM_STATS_VM(v_forks, "Number of fork() calls");
+VM_STATS_VM(v_vforks, "Number of vfork() calls");
+VM_STATS_VM(v_rforks, "Number of rfork() calls");
+VM_STATS_VM(v_kthreads, "Number of fork() calls by kernel");
+VM_STATS_VM(v_forkpages, "VM pages affected by fork()");
+VM_STATS_VM(v_vforkpages, "VM pages affected by vfork()");
+VM_STATS_VM(v_rforkpages, "VM pages affected by rfork()");
+VM_STATS_VM(v_kthreadpages, "VM pages affected by fork() by kernel");
+
+SYSCTL_INT(_vm_stats_misc, OID_AUTO, zero_page_count, CTLFLAG_RD,
+	&vm_page_zero_count, 0, "Number of zero-ed free pages");

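
The VM_STATS()/VM_STATS_VM()/VM_STATS_SYS() macros above only compact the sysctl declarations and attach real descriptions; the existing OID names are unchanged (one new counter, vm.stats.vm.v_io_faults, is added), so existing consumers keep working. For reference, the counters remain plain unsigned integers readable with sysctlbyname(3):

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

static u_int
read_stat(const char *oid)
{
	u_int val = 0;
	size_t len = sizeof(val);

	if (sysctlbyname(oid, &val, &len, NULL, 0) == -1)
		perror(oid);
	return (val);
}

int
main(void)
{
	printf("vm.stats.sys.v_swtch      = %u\n",
	    read_stat("vm.stats.sys.v_swtch"));
	printf("vm.stats.vm.v_vm_faults  = %u\n",
	    read_stat("vm.stats.vm.v_vm_faults"));
	printf("vm.stats.vm.v_free_count = %u\n",
	    read_stat("vm.stats.vm.v_free_count"));
	printf("vm.stats.vm.v_wire_count = %u\n",
	    read_stat("vm.stats.vm.v_wire_count"));
	return (0);
}
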
Modified: trunk/sys/vm/vm_mmap.c
===================================================================
--- trunk/sys/vm/vm_mmap.c	2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm_mmap.c	2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1988 University of Utah.
  * Copyright (c) 1991, 1993
@@ -41,7 +42,7 @@
  */
 
 #include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/vm/vm_mmap.c 321717 2017-07-30 10:36:20Z kib $");
 
 #include "opt_compat.h"
 #include "opt_hwpmc_hooks.h"
@@ -48,7 +49,7 @@
 
 #include <sys/param.h>
 #include <sys/systm.h>
-#include <sys/capability.h>
+#include <sys/capsicum.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
@@ -56,9 +57,11 @@
 #include <sys/filedesc.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
+#include <sys/procctl.h>
 #include <sys/racct.h>
 #include <sys/resource.h>
 #include <sys/resourcevar.h>
+#include <sys/rwlock.h>
 #include <sys/sysctl.h>
 #include <sys/vnode.h>
 #include <sys/fcntl.h>
@@ -67,6 +70,7 @@
 #include <sys/mount.h>
 #include <sys/conf.h>
 #include <sys/stat.h>
+#include <sys/syscallsubr.h>
 #include <sys/sysent.h>
 #include <sys/vmmeter.h>
 
@@ -88,15 +92,13 @@
 #include <sys/pmckern.h>
 #endif
 
-int old_mlock = 1;
+int old_mlock = 0;
 SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RW | CTLFLAG_TUN, &old_mlock, 0,
     "Do not apply RLIMIT_MEMLOCK on mlockall");
 TUNABLE_INT("vm.old_mlock", &old_mlock);
 
-#ifndef _SYS_SYSPROTO_H_
-struct sbrk_args {
-	int incr;
-};
+#ifdef MAP_32BIT
+#define	MAP_32BIT_MAX_ADDR	((vm_offset_t)1 << 31)
 #endif
 
 static int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
@@ -106,14 +108,14 @@
 static int vm_mmap_shm(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
     int *, struct shmfd *, vm_ooffset_t, vm_object_t *);
 
-/*
- * MPSAFE
- */
-/* ARGSUSED */
+#ifndef _SYS_SYSPROTO_H_
+struct sbrk_args {
+	int incr;
+};
+#endif
+
 int
-sys_sbrk(td, uap)
-	struct thread *td;
-	struct sbrk_args *uap;
+sys_sbrk(struct thread *td, struct sbrk_args *uap)
 {
 	/* Not yet implemented */
 	return (EOPNOTSUPP);
@@ -125,14 +127,8 @@
 };
 #endif
 
-/*
- * MPSAFE
- */
-/* ARGSUSED */
 int
-sys_sstk(td, uap)
-	struct thread *td;
-	struct sstk_args *uap;
+sys_sstk(struct thread *td, struct sstk_args *uap)
 {
 	/* Not yet implemented */
 	return (EOPNOTSUPP);
@@ -145,13 +141,10 @@
 };
 #endif
 
-/* ARGSUSED */
 int
-ogetpagesize(td, uap)
-	struct thread *td;
-	struct getpagesize_args *uap;
+ogetpagesize(struct thread *td, struct getpagesize_args *uap)
 {
-	/* MP SAFE */
+
 	td->td_retval[0] = PAGE_SIZE;
 	return (0);
 }
@@ -183,9 +176,6 @@
 };
 #endif
 
-/*
- * MPSAFE
- */
 int
 sys_mmap(td, uap)
 	struct thread *td;
@@ -201,7 +191,7 @@
 	vm_prot_t cap_maxprot, prot, maxprot;
 	void *handle;
 	objtype_t handle_type;
-	int flags, error;
+	int align, error, flags;
 	off_t pos;
 	struct vmspace *vms = td->td_proc->p_vmspace;
 	cap_rights_t rights;
@@ -239,6 +229,12 @@
 		flags |= MAP_ANON;
 		pos = 0;
 	}
+	if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL)
+		return (EINVAL);
+	if ((flags & MAP_GUARD) != 0 && (prot != PROT_NONE || uap->fd != -1 ||
+	    pos != 0 || (flags & (MAP_SHARED | MAP_PRIVATE | MAP_PREFAULT |
+	    MAP_PREFAULT_READ | MAP_ANON | MAP_STACK)) != 0))
+		return (EINVAL);
 
 	/*
 	 * Align the file position to a page boundary,
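
The two argument checks added above pair naturally in practice: MAP_GUARD reserves an address range that faults on any access and accepts only PROT_NONE with fd == -1, offset 0 and no sharing flags, while MAP_EXCL (valid only together with MAP_FIXED, and translated to MAP_CHECK_EXCL later in this file) refuses to replace whatever is already mapped. A sketch, guarded for headers that predate these flags, which arrive with this sync:

#include <sys/mman.h>
#include <stdio.h>

int
main(void)
{
#if defined(MAP_GUARD) && defined(MAP_EXCL)
	size_t len = 64 * 1024;
	void *guard, *p;

	/* Reserve address space: PROT_NONE, no fd, no sharing flags allowed. */
	guard = mmap(NULL, len, PROT_NONE, MAP_GUARD, -1, 0);
	if (guard == MAP_FAILED) {
		perror("mmap(MAP_GUARD)");
		return (1);
	}

	/* MAP_EXCL is only meaningful with MAP_FIXED and, per the checks
	 * above, must not displace the existing guard reservation. */
	p = mmap(guard, len, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE | MAP_FIXED | MAP_EXCL, -1, 0);
	if (p == MAP_FAILED)
		printf("guard at %p held: exclusive remap refused\n", guard);
	else
		printf("unexpected: guard at %p was replaced\n", guard);
#else
	printf("MAP_GUARD/MAP_EXCL not present in these headers\n");
#endif
	return (0);
}
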
@@ -251,6 +247,13 @@
 	size += pageoff;			/* low end... */
 	size = (vm_size_t) round_page(size);	/* hi end */
 
+	/* Ensure alignment is at least a page and fits in a pointer. */
+	align = flags & MAP_ALIGNMENT_MASK;
+	if (align != 0 && align != MAP_ALIGNED_SUPER &&
+	    (align >> MAP_ALIGNMENT_SHIFT >= sizeof(void *) * NBBY ||
+	    align >> MAP_ALIGNMENT_SHIFT < PAGE_SHIFT))
+		return (EINVAL);
+
 	/*
 	 * Check for illegal addresses.  Watch out for address wrap... Note
 	 * that VM_*_ADDRESS are not constants due to casts (argh).
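
The alignment check above admits MAP_ALIGNED_SUPER or any MAP_ALIGNED(n) with PAGE_SHIFT <= n < the pointer width; together with the MAP_32BIT clamping added in the next hunk, these requests feed the new vm_map_find()/vm_map_find_min() arguments later in this file. A hedged illustration; both flags are conditional on what the installed <sys/mman.h> defines:

#include <sys/mman.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	size_t len = 4 * 1024 * 1024;
	void *p = NULL;

#ifdef MAP_ALIGNED
	/* Ask for a 2 MiB-aligned (2^21) anonymous mapping. */
	p = mmap(NULL, len, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE | MAP_ALIGNED(21), -1, 0);
	if (p != MAP_FAILED)
		printf("aligned mapping at %p, low bits %#lx\n", p,
		    (unsigned long)((uintptr_t)p & ((1UL << 21) - 1)));
#endif
#ifdef MAP_32BIT
	/* Ask for a mapping in the low 2 GiB of the address space. */
	p = mmap(NULL, len, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE | MAP_32BIT, -1, 0);
	if (p != MAP_FAILED)
		printf("MAP_32BIT mapping at %p\n", p);
#endif
	(void)len;
	(void)p;
	return (0);
}
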
@@ -271,6 +274,18 @@
 			return (EINVAL);
 		if (addr + size < addr)
 			return (EINVAL);
+#ifdef MAP_32BIT
+		if (flags & MAP_32BIT && addr + size > MAP_32BIT_MAX_ADDR)
+			return (EINVAL);
+	} else if (flags & MAP_32BIT) {
+		/*
+		 * For MAP_32BIT, override the hint if it is too high and
+		 * do not bother moving the mapping past the heap (since
+		 * the heap is usually above 2GB).
+		 */
+		if (addr + size > MAP_32BIT_MAX_ADDR)
+			addr = 0;
+#endif
 	} else {
 		/*
 		 * XXX for non-fixed mappings where no hint is provided or
@@ -289,7 +304,12 @@
 			    lim_max(td->td_proc, RLIMIT_DATA));
 		PROC_UNLOCK(td->td_proc);
 	}
-	if (flags & MAP_ANON) {
+	if ((flags & MAP_GUARD) != 0) {
+		handle = NULL;
+		handle_type = OBJT_DEFAULT;
+		maxprot = VM_PROT_NONE;
+		cap_maxprot = VM_PROT_NONE;
+	} else if ((flags & MAP_ANON) != 0) {
 		/*
 		 * Mapping blank space is trivial.
 		 */
@@ -304,17 +324,17 @@
 		 * rights, but also return the maximum rights to be combined
 		 * with maxprot later.
 		 */
-		rights = CAP_MMAP;
+		cap_rights_init(&rights, CAP_MMAP);
 		if (prot & PROT_READ)
-			rights |= CAP_READ;
+			cap_rights_set(&rights, CAP_MMAP_R);
 		if ((flags & MAP_SHARED) != 0) {
 			if (prot & PROT_WRITE)
-				rights |= CAP_WRITE;
+				cap_rights_set(&rights, CAP_MMAP_W);
 		}
 		if (prot & PROT_EXEC)
-			rights |= CAP_MAPEXEC;
-		if ((error = fget_mmap(td, uap->fd, rights, &cap_maxprot,
-		    &fp)) != 0)
+			cap_rights_set(&rights, CAP_MMAP_X);
+		error = fget_mmap(td, uap->fd, &rights, &cap_maxprot, &fp);
+		if (error != 0)
 			goto done;
 		if (fp->f_type == DTYPE_SHM) {
 			handle = fp->f_data;
@@ -492,9 +512,6 @@
 	int flags;
 };
 #endif
-/*
- * MPSAFE
- */
 int
 sys_msync(td, uap)
 	struct thread *td;
@@ -531,7 +548,7 @@
 	case KERN_SUCCESS:
 		return (0);
 	case KERN_INVALID_ADDRESS:
-		return (EINVAL);	/* Sun returns ENOMEM? */
+		return (ENOMEM);
 	case KERN_INVALID_ARGUMENT:
 		return (EBUSY);
 	case KERN_FAILURE:
@@ -547,9 +564,6 @@
 	size_t len;
 };
 #endif
-/*
- * MPSAFE
- */
 int
 sys_munmap(td, uap)
 	struct thread *td;
@@ -623,9 +637,6 @@
 	int prot;
 };
 #endif
-/*
- * MPSAFE
- */
 int
 sys_mprotect(td, uap)
 	struct thread *td;
@@ -665,13 +676,8 @@
 	int inherit;
 };
 #endif
-/*
- * MPSAFE
- */
 int
-sys_minherit(td, uap)
-	struct thread *td;
-	struct minherit_args *uap;
+sys_minherit(struct thread *td, struct minherit_args *uap)
 {
 	vm_offset_t addr;
 	vm_size_t size, pageoff;
@@ -706,19 +712,12 @@
 };
 #endif
 
-/*
- * MPSAFE
- */
-/* ARGSUSED */
 int
-sys_madvise(td, uap)
-	struct thread *td;
-	struct madvise_args *uap;
+sys_madvise(struct thread *td, struct madvise_args *uap)
 {
 	vm_offset_t start, end;
 	vm_map_t map;
-	struct proc *p;
-	int error;
+	int flags;
 
 	/*
 	 * Check for our special case, advising the swap pager we are
@@ -725,15 +724,11 @@
 	 * "immortal."
 	 */
 	if (uap->behav == MADV_PROTECT) {
-		error = priv_check(td, PRIV_VM_MADV_PROTECT);
-		if (error == 0) {
-			p = td->td_proc;
-			PROC_LOCK(p);
-			p->p_flag |= P_PROTECTED;
-			PROC_UNLOCK(p);
-		}
-		return (error);
+		flags = PPROT_SET;
+		return (kern_procctl(td, P_PID, td->td_proc->p_pid,
+		    PROC_SPROTECT, &flags));
 	}
+
 	/*
 	 * Check for illegal behavior
 	 */
@@ -770,14 +765,8 @@
 };
 #endif
 
-/*
- * MPSAFE
- */
-/* ARGSUSED */
 int
-sys_mincore(td, uap)
-	struct thread *td;
-	struct mincore_args *uap;
+sys_mincore(struct thread *td, struct mincore_args *uap)
 {
 	vm_offset_t addr, first_addr;
 	vm_offset_t end, cend;
@@ -883,12 +872,12 @@
 				m = PHYS_TO_VM_PAGE(locked_pa);
 				if (m->object != object) {
 					if (object != NULL)
-						VM_OBJECT_UNLOCK(object);
+						VM_OBJECT_WUNLOCK(object);
 					object = m->object;
-					locked = VM_OBJECT_TRYLOCK(object);
+					locked = VM_OBJECT_TRYWLOCK(object);
 					vm_page_unlock(m);
 					if (!locked) {
-						VM_OBJECT_LOCK(object);
+						VM_OBJECT_WLOCK(object);
 						vm_page_lock(m);
 						goto retry;
 					}
@@ -906,9 +895,9 @@
 				 */
 				if (current->object.vm_object != object) {
 					if (object != NULL)
-						VM_OBJECT_UNLOCK(object);
+						VM_OBJECT_WUNLOCK(object);
 					object = current->object.vm_object;
-					VM_OBJECT_LOCK(object);
+					VM_OBJECT_WLOCK(object);
 				}
 				if (object->type == OBJT_DEFAULT ||
 				    object->type == OBJT_SWAP ||
@@ -945,7 +934,7 @@
 					mincoreinfo |= MINCORE_REFERENCED_OTHER;
 			}
 			if (object != NULL)
-				VM_OBJECT_UNLOCK(object);
+				VM_OBJECT_WUNLOCK(object);
 
 			/*
 			 * subyte may page fault.  In case it needs to modify
@@ -963,12 +952,12 @@
 			 * the byte vector is zeroed for those skipped entries.
 			 */
 			while ((lastvecindex + 1) < vecindex) {
+				++lastvecindex;
 				error = subyte(vec + lastvecindex, 0);
 				if (error) {
 					error = EFAULT;
 					goto done2;
 				}
-				++lastvecindex;
 			}
 
 			/*
@@ -1004,12 +993,12 @@
 	 */
 	vecindex = OFF_TO_IDX(end - first_addr);
 	while ((lastvecindex + 1) < vecindex) {
+		++lastvecindex;
 		error = subyte(vec + lastvecindex, 0);
 		if (error) {
 			error = EFAULT;
 			goto done2;
 		}
-		++lastvecindex;
 	}
 
 	/*
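
The two loop fixes above advance lastvecindex before the subyte() store, so the zero-fill covers the vector slots after the last reported page up to the current one, rather than starting one slot too low. The interface being filled is the ordinary mincore(2) byte vector, one entry per page:

#include <sys/mman.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int
main(void)
{
	long pgsz = sysconf(_SC_PAGESIZE);
	size_t npages = 8, len = npages * (size_t)pgsz;
	char *region, *vec;
	size_t i;

	region = mmap(NULL, len, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE, -1, 0);
	vec = malloc(npages);
	if (region == MAP_FAILED || vec == NULL)
		return (1);

	/* Touch every other page so only those become resident. */
	for (i = 0; i < npages; i += 2)
		region[i * (size_t)pgsz] = 1;

	if (mincore(region, len, vec) == -1) {
		perror("mincore");
		return (1);
	}
	for (i = 0; i < npages; i++)
		printf("page %zu: %s\n", i,
		    (vec[i] & MINCORE_INCORE) ? "resident" : "not resident");
	free(vec);
	munmap(region, len);
	return (0);
}
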
@@ -1030,15 +1019,16 @@
 	size_t len;
 };
 #endif
-/*
- * MPSAFE
- */
 int
-sys_mlock(td, uap)
-	struct thread *td;
-	struct mlock_args *uap;
+sys_mlock(struct thread *td, struct mlock_args *uap)
 {
-	struct proc *proc;
+
+	return (vm_mlock(td->td_proc, td->td_ucred, uap->addr, uap->len));
+}
+
+int
+vm_mlock(struct proc *proc, struct ucred *cred, const void *addr0, size_t len)
+{
 	vm_offset_t addr, end, last, start;
 	vm_size_t npages, size;
 	vm_map_t map;
@@ -1045,11 +1035,11 @@
 	unsigned long nsize;
 	int error;
 
-	error = priv_check(td, PRIV_VM_MLOCK);
+	error = priv_check_cred(cred, PRIV_VM_MLOCK, 0);
 	if (error)
 		return (error);
-	addr = (vm_offset_t)uap->addr;
-	size = uap->len;
+	addr = (vm_offset_t)addr0;
+	size = len;
 	last = addr + size;
 	start = trunc_page(addr);
 	end = round_page(last);
@@ -1058,7 +1048,6 @@
 	npages = atop(end - start);
 	if (npages > vm_page_max_wired)
 		return (ENOMEM);
-	proc = td->td_proc;
 	map = &proc->p_vmspace->vm_map;
 	PROC_LOCK(proc);
 	nsize = ptoa(npages + pmap_wired_count(map->pmap));
@@ -1070,16 +1059,18 @@
 	if (npages + cnt.v_wire_count > vm_page_max_wired)
 		return (EAGAIN);
 #ifdef RACCT
-	PROC_LOCK(proc);
-	error = racct_set(proc, RACCT_MEMLOCK, nsize);
-	PROC_UNLOCK(proc);
-	if (error != 0)
-		return (ENOMEM);
+	if (racct_enable) {
+		PROC_LOCK(proc);
+		error = racct_set(proc, RACCT_MEMLOCK, nsize);
+		PROC_UNLOCK(proc);
+		if (error != 0)
+			return (ENOMEM);
+	}
 #endif
 	error = vm_map_wire(map, start, end,
 	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
 #ifdef RACCT
-	if (error != KERN_SUCCESS) {
+	if (racct_enable && error != KERN_SUCCESS) {
 		PROC_LOCK(proc);
 		racct_set(proc, RACCT_MEMLOCK,
 		    ptoa(pmap_wired_count(map->pmap)));
@@ -1095,13 +1086,8 @@
 };
 #endif
 
-/*
- * MPSAFE
- */
 int
-sys_mlockall(td, uap)
-	struct thread *td;
-	struct mlockall_args *uap;
+sys_mlockall(struct thread *td, struct mlockall_args *uap)
 {
 	vm_map_t map;
 	int error;
@@ -1127,11 +1113,13 @@
 		PROC_UNLOCK(td->td_proc);
 	}
 #ifdef RACCT
-	PROC_LOCK(td->td_proc);
-	error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size);
-	PROC_UNLOCK(td->td_proc);
-	if (error != 0)
-		return (ENOMEM);
+	if (racct_enable) {
+		PROC_LOCK(td->td_proc);
+		error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size);
+		PROC_UNLOCK(td->td_proc);
+		if (error != 0)
+			return (ENOMEM);
+	}
 #endif
 
 	if (uap->how & MCL_FUTURE) {
@@ -1153,7 +1141,7 @@
 		error = (error == KERN_SUCCESS ? 0 : EAGAIN);
 	}
 #ifdef RACCT
-	if (error != KERN_SUCCESS) {
+	if (racct_enable && error != KERN_SUCCESS) {
 		PROC_LOCK(td->td_proc);
 		racct_set(td->td_proc, RACCT_MEMLOCK,
 		    ptoa(pmap_wired_count(map->pmap)));
@@ -1170,13 +1158,8 @@
 };
 #endif
 
-/*
- * MPSAFE
- */
 int
-sys_munlockall(td, uap)
-	struct thread *td;
-	struct munlockall_args *uap;
+sys_munlockall(struct thread *td, struct munlockall_args *uap)
 {
 	vm_map_t map;
 	int error;
@@ -1195,7 +1178,7 @@
 	error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
 	    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
 #ifdef RACCT
-	if (error == KERN_SUCCESS) {
+	if (racct_enable && error == KERN_SUCCESS) {
 		PROC_LOCK(td->td_proc);
 		racct_set(td->td_proc, RACCT_MEMLOCK, 0);
 		PROC_UNLOCK(td->td_proc);
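
Throughout this block the RACCT updates gain the racct_enable guard and mlock(2) is split into a vm_mlock() helper taking an explicit ucred; note also that vm.old_mlock now defaults to 0 near the top of this file, so (per its sysctl description) RLIMIT_MEMLOCK is applied on the mlockall()/MAP_WIREFUTURE paths by default. The userland contract is unchanged; a small check, for reference:

#include <sys/types.h>
#include <sys/mman.h>
#include <sys/resource.h>
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	struct rlimit rl;
	size_t len = 64 * 1024;
	void *buf;

	if (getrlimit(RLIMIT_MEMLOCK, &rl) == 0)
		printf("RLIMIT_MEMLOCK soft limit: %ju bytes\n",
		    (uintmax_t)rl.rlim_cur);

	buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE, -1, 0);
	if (buf == MAP_FAILED)
		return (1);

	if (mlock(buf, len) == -1) {
		/* ENOMEM/EAGAIN: over the per-process or global wiring
		 * limits; EPERM: missing the wiring privilege. */
		fprintf(stderr, "mlock: %s\n", strerror(errno));
	} else {
		printf("wired %zu bytes\n", len);
		munlock(buf, len);
	}
	munmap(buf, len);
	return (0);
}
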
@@ -1211,9 +1194,6 @@
 	size_t len;
 };
 #endif
-/*
- * MPSAFE
- */
 int
 sys_munlock(td, uap)
 	struct thread *td;
@@ -1221,6 +1201,9 @@
 {
 	vm_offset_t addr, end, last, start;
 	vm_size_t size;
+#ifdef RACCT
+	vm_map_t map;
+#endif
 	int error;
 
 	error = priv_check(td, PRIV_VM_MUNLOCK);
@@ -1236,9 +1219,11 @@
 	error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
 	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
 #ifdef RACCT
-	if (error == KERN_SUCCESS) {
+	if (racct_enable && error == KERN_SUCCESS) {
 		PROC_LOCK(td->td_proc);
-		racct_sub(td->td_proc, RACCT_MEMLOCK, ptoa(end - start));
+		map = &td->td_proc->p_vmspace->vm_map;
+		racct_set(td->td_proc, RACCT_MEMLOCK,
+		    ptoa(pmap_wired_count(map->pmap)));
 		PROC_UNLOCK(td->td_proc);
 	}
 #endif
@@ -1263,21 +1248,16 @@
 	struct vattr va;
 	vm_object_t obj;
 	vm_offset_t foff;
-	struct mount *mp;
 	struct ucred *cred;
-	int error, flags, locktype, vfslocked;
+	int error, flags, locktype;
 
-	mp = vp->v_mount;
 	cred = td->td_ucred;
 	if ((*maxprotp & VM_PROT_WRITE) && (*flagsp & MAP_SHARED))
 		locktype = LK_EXCLUSIVE;
 	else
 		locktype = LK_SHARED;
-	vfslocked = VFS_LOCK_GIANT(mp);
-	if ((error = vget(vp, locktype, td)) != 0) {
-		VFS_UNLOCK_GIANT(vfslocked);
+	if ((error = vget(vp, locktype, td)) != 0)
 		return (error);
-	}
 	foff = *foffp;
 	flags = *flagsp;
 	obj = vp->v_object;
@@ -1289,18 +1269,16 @@
 			error = EINVAL;
 			goto done;
 		}
-		if (obj->handle != vp) {
+		if (obj->type == OBJT_VNODE && obj->handle != vp) {
 			vput(vp);
 			vp = (struct vnode *)obj->handle;
 			/*
 			 * Bypass filesystems obey the mpsafety of the
-			 * underlying fs.
+			 * underlying fs.  Tmpfs never bypasses.
 			 */
 			error = vget(vp, locktype, td);
-			if (error != 0) {
-				VFS_UNLOCK_GIANT(vfslocked);
+			if (error != 0)
 				return (error);
-			}
 		}
 		if (locktype == LK_EXCLUSIVE) {
 			*writecounted = TRUE;
@@ -1340,7 +1318,14 @@
 	objsize = round_page(va.va_size);
 	if (va.va_nlink == 0)
 		flags |= MAP_NOSYNC;
-	obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff, cred);
+	if (obj->type == OBJT_VNODE)
+		obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff,
+		    cred);
+	else {
+		KASSERT(obj->type == OBJT_DEFAULT || obj->type == OBJT_SWAP,
+		    ("wrong object type"));
+		vm_object_reference(obj);
+	}
 	if (obj == NULL) {
 		error = ENOMEM;
 		goto done;
@@ -1357,7 +1342,6 @@
 		vnode_pager_update_writecount(obj, objsize, 0);
 	}
 	vput(vp);
-	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
@@ -1364,8 +1348,6 @@
 /*
  * vm_mmap_cdev()
  *
- * MPSAFE
- *
  * Helper function for vm_mmap.  Perform sanity check specific for mmap
  * operations on cdevs.
  */
@@ -1478,10 +1460,11 @@
 	objtype_t handle_type, void *handle,
 	vm_ooffset_t foff)
 {
-	boolean_t fitit;
+	boolean_t curmap, fitit;
+	vm_offset_t max_addr;
 	vm_object_t object = NULL;
 	struct thread *td = curthread;
-	int docow, error, rv;
+	int docow, error, findspace, rv;
 	boolean_t writecounted;
 
 	if (size == 0)
@@ -1489,9 +1472,17 @@
 
 	size = round_page(size);
 
-	PROC_LOCK(td->td_proc);
-	if (td->td_proc->p_vmspace->vm_map.size + size >
-	    lim_cur(td->td_proc, RLIMIT_VMEM)) {
+	curmap = map == &td->td_proc->p_vmspace->vm_map;
+	if (curmap) {
+		PROC_LOCK(td->td_proc);
+		if (map->size + size > lim_cur(td->td_proc, RLIMIT_VMEM)) {
+			PROC_UNLOCK(td->td_proc);
+			return (ENOMEM);
+		}
+		if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) {
+			PROC_UNLOCK(td->td_proc);
+			return (ENOMEM);
+		}
 		if (!old_mlock && map->flags & MAP_WIREFUTURE) {
 			if (ptoa(pmap_wired_count(map->pmap)) + size >
 			    lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
@@ -1510,14 +1501,7 @@
 			}
 		}
 		PROC_UNLOCK(td->td_proc);
-		return (ENOMEM);
 	}
-	if (racct_set(td->td_proc, RACCT_VMEM,
-	    td->td_proc->p_vmspace->vm_map.size + size)) {
-		PROC_UNLOCK(td->td_proc);
-		return (ENOMEM);
-	}
-	PROC_UNLOCK(td->td_proc);
 
 	/*
 	 * We currently can only deal with page aligned file offsets.
@@ -1592,17 +1576,48 @@
 		docow |= MAP_INHERIT_SHARE;
 	if (writecounted)
 		docow |= MAP_VN_WRITECOUNT;
+	if (flags & MAP_STACK) {
+		if (object != NULL)
+			return (EINVAL);
+		docow |= MAP_STACK_GROWS_DOWN;
+	}
+	if ((flags & MAP_EXCL) != 0)
+		docow |= MAP_CHECK_EXCL;
+	if ((flags & MAP_GUARD) != 0)
+		docow |= MAP_CREATE_GUARD;
 
-	if (flags & MAP_STACK)
-		rv = vm_map_stack(map, *addr, size, prot, maxprot,
-		    docow | MAP_STACK_GROWS_DOWN);
-	else if (fitit)
-		rv = vm_map_find(map, object, foff, addr, size,
-		    object != NULL && object->type == OBJT_DEVICE ?
-		    VMFS_ALIGNED_SPACE : VMFS_ANY_SPACE, prot, maxprot, docow);
-	else
+	if (fitit) {
+		if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER)
+			findspace = VMFS_SUPER_SPACE;
+		else if ((flags & MAP_ALIGNMENT_MASK) != 0)
+			findspace = VMFS_ALIGNED_SPACE(flags >>
+			    MAP_ALIGNMENT_SHIFT);
+		else
+			findspace = VMFS_OPTIMAL_SPACE;
+		max_addr = 0;
+#ifdef MAP_32BIT
+		if ((flags & MAP_32BIT) != 0)
+			max_addr = MAP_32BIT_MAX_ADDR;
+#endif
+		if (curmap) {
+			vm_offset_t min_addr;
+
+			PROC_LOCK(td->td_proc);
+			min_addr = round_page((vm_offset_t)td->td_proc->
+			    p_vmspace->vm_daddr + lim_max(td->td_proc,
+			    RLIMIT_DATA));
+			PROC_UNLOCK(td->td_proc);
+			rv = vm_map_find_min(map, object, foff, addr, size,
+			    min_addr, max_addr,
+			    findspace, prot, maxprot, docow);
+		} else {
+			rv = vm_map_find(map, object, foff, addr, size,
+			    max_addr, findspace, prot, maxprot, docow);
+		}
+	} else {
 		rv = vm_map_fixed(map, object, foff, *addr, size,
-				 prot, maxprot, docow);
+		    prot, maxprot, docow);
+	}
 
 	if (rv == KERN_SUCCESS) {
 		/*
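
For the common case of a non-fixed mapping in the calling process, the restructured placement above derives a minimum address from vm_daddr plus the RLIMIT_DATA hard limit and hands it to vm_map_find_min(), so a low hint without MAP_FIXED is advisory and the mapping should land above the data-segment reservation. A trivial way to observe that; exact placement remains policy-dependent:

#include <sys/mman.h>
#include <stdio.h>

int
main(void)
{
	size_t len = 1024 * 1024;
	void *hint = (void *)0x10000;	/* deliberately low; no MAP_FIXED */
	void *p;

	p = mmap(hint, len, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return (1);
	}
	/* Without MAP_FIXED the kernel may place the mapping above the
	 * data-segment reservation, so p will normally differ from the hint. */
	printf("hint %p -> mapping %p\n", hint, p);
	munmap(p, len);
	return (0);
}
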

Modified: trunk/sys/vm/vm_object.c
===================================================================
--- trunk/sys/vm/vm_object.c	2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm_object.c	2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
@@ -63,7 +64,7 @@
  */
 
 #include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/vm/vm_object.c 321677 2017-07-29 08:24:51Z kib $");
 
 #include "opt_vm.h"
 
@@ -78,6 +79,8 @@
 #include <sys/proc.h>		/* for curproc, pageproc */
 #include <sys/socket.h>
 #include <sys/resourcevar.h>
+#include <sys/rwlock.h>
+#include <sys/user.h>
 #include <sys/vnode.h>
 #include <sys/vmmeter.h>
 #include <sys/sx.h>
@@ -93,6 +96,7 @@
 #include <vm/swap_pager.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
+#include <vm/vm_radix.h>
 #include <vm/vm_reserv.h>
 #include <vm/uma.h>
 
@@ -164,15 +168,18 @@
 	vm_object_t object;
 
 	object = (vm_object_t)mem;
+	KASSERT(object->ref_count == 0,
+	    ("object %p ref_count = %d", object, object->ref_count));
 	KASSERT(TAILQ_EMPTY(&object->memq),
-	    ("object %p has resident pages",
-	    object));
+	    ("object %p has resident pages in its memq", object));
+	KASSERT(vm_radix_is_empty(&object->rtree),
+	    ("object %p has resident pages in its trie", object));
 #if VM_NRESERVLEVEL > 0
 	KASSERT(LIST_EMPTY(&object->rvq),
 	    ("object %p has reservations",
 	    object));
 #endif
-	KASSERT(object->cache == NULL,
+	KASSERT(vm_object_cache_is_empty(object),
 	    ("object %p has cached pages",
 	    object));
 	KASSERT(object->paging_in_progress == 0,
@@ -184,6 +191,9 @@
 	KASSERT(object->shadow_count == 0,
 	    ("object %p shadow_count = %d",
 	    object, object->shadow_count));
+	KASSERT(object->type == OBJT_DEAD,
+	    ("object %p has non-dead type %d",
+	    object, object->type));
 }
 #endif
 
@@ -193,17 +203,27 @@
 	vm_object_t object;
 
 	object = (vm_object_t)mem;
-	bzero(&object->mtx, sizeof(object->mtx));
-	VM_OBJECT_LOCK_INIT(object, "standard object");
+	bzero(&object->lock, sizeof(object->lock));
+	rw_init_flags(&object->lock, "vm object", RW_DUPOK);
 
 	/* These are true for any object that has been freed */
+	object->type = OBJT_DEAD;
+	object->ref_count = 0;
+	object->rtree.rt_root = 0;
+	object->rtree.rt_flags = 0;
 	object->paging_in_progress = 0;
 	object->resident_page_count = 0;
 	object->shadow_count = 0;
+	object->cache.rt_root = 0;
+	object->cache.rt_flags = 0;
+
+	mtx_lock(&vm_object_list_mtx);
+	TAILQ_INSERT_TAIL(&vm_object_list, object, object_list);
+	mtx_unlock(&vm_object_list_mtx);
 	return (0);
 }
 
-void
+static void
 _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object)
 {
 
@@ -210,18 +230,36 @@
 	TAILQ_INIT(&object->memq);
 	LIST_INIT(&object->shadow_head);
 
-	object->root = NULL;
 	object->type = type;
+	switch (type) {
+	case OBJT_DEAD:
+		panic("_vm_object_allocate: can't create OBJT_DEAD");
+	case OBJT_DEFAULT:
+	case OBJT_SWAP:
+		object->flags = OBJ_ONEMAPPING;
+		break;
+	case OBJT_DEVICE:
+	case OBJT_SG:
+		object->flags = OBJ_FICTITIOUS | OBJ_UNMANAGED;
+		break;
+	case OBJT_MGTDEVICE:
+		object->flags = OBJ_FICTITIOUS;
+		break;
+	case OBJT_PHYS:
+		object->flags = OBJ_UNMANAGED;
+		break;
+	case OBJT_VNODE:
+		object->flags = 0;
+		break;
+	default:
+		panic("_vm_object_allocate: type %d is undefined", type);
+	}
 	object->size = size;
 	object->generation = 1;
 	object->ref_count = 1;
 	object->memattr = VM_MEMATTR_DEFAULT;
-	object->flags = 0;
 	object->cred = NULL;
 	object->charge = 0;
-	if ((object->type == OBJT_DEFAULT) || (object->type == OBJT_SWAP))
-		object->flags = OBJ_ONEMAPPING;
-	object->pg_color = 0;
 	object->handle = NULL;
 	object->backing_object = NULL;
 	object->backing_object_offset = (vm_ooffset_t) 0;
@@ -228,11 +266,6 @@
 #if VM_NRESERVLEVEL > 0
 	LIST_INIT(&object->rvq);
 #endif
-	object->cache = NULL;
-
-	mtx_lock(&vm_object_list_mtx);
-	TAILQ_INSERT_TAIL(&vm_object_list, object, object_list);
-	mtx_unlock(&vm_object_list_mtx);
 }
 
 /*
@@ -246,7 +279,7 @@
 	TAILQ_INIT(&vm_object_list);
 	mtx_init(&vm_object_list_mtx, "vm object_list", NULL, MTX_DEF);
 	
-	VM_OBJECT_LOCK_INIT(kernel_object, "kernel object");
+	rw_init(&kernel_object->lock, "kernel vm object");
 	_vm_object_allocate(OBJT_PHYS, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS),
 	    kernel_object);
 #if VM_NRESERVLEVEL > 0
@@ -254,7 +287,7 @@
 	kernel_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS);
 #endif
 
-	VM_OBJECT_LOCK_INIT(kmem_object, "kmem object");
+	rw_init(&kmem_object->lock, "kmem vm object");
 	_vm_object_allocate(OBJT_PHYS, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS),
 	    kmem_object);
 #if VM_NRESERVLEVEL > 0
@@ -273,7 +306,9 @@
 #else
 	    NULL,
 #endif
-	    vm_object_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM|UMA_ZONE_NOFREE);
+	    vm_object_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+
+	vm_radix_init();
 }
 
 void
@@ -280,7 +315,7 @@
 vm_object_clear_flag(vm_object_t object, u_short bits)
 {
 
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+	VM_OBJECT_ASSERT_WLOCKED(object);
 	object->flags &= ~bits;
 }
 
@@ -297,10 +332,11 @@
 vm_object_set_memattr(vm_object_t object, vm_memattr_t memattr)
 {
 
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+	VM_OBJECT_ASSERT_WLOCKED(object);
 	switch (object->type) {
 	case OBJT_DEFAULT:
 	case OBJT_DEVICE:
+	case OBJT_MGTDEVICE:
 	case OBJT_PHYS:
 	case OBJT_SG:
 	case OBJT_SWAP:
@@ -310,6 +346,9 @@
 		break;
 	case OBJT_DEAD:
 		return (KERN_INVALID_ARGUMENT);
+	default:
+		panic("vm_object_set_memattr: object %p is of undefined type",
+		    object);
 	}
 	object->memattr = memattr;
 	return (KERN_SUCCESS);
@@ -319,7 +358,7 @@
 vm_object_pip_add(vm_object_t object, short i)
 {
 
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+	VM_OBJECT_ASSERT_WLOCKED(object);
 	object->paging_in_progress += i;
 }
 
@@ -327,7 +366,7 @@
 vm_object_pip_subtract(vm_object_t object, short i)
 {
 
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+	VM_OBJECT_ASSERT_WLOCKED(object);
 	object->paging_in_progress -= i;
 }
 
@@ -335,7 +374,7 @@
 vm_object_pip_wakeup(vm_object_t object)
 {
 
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+	VM_OBJECT_ASSERT_WLOCKED(object);
 	object->paging_in_progress--;
 	if ((object->flags & OBJ_PIPWNT) && object->paging_in_progress == 0) {
 		vm_object_clear_flag(object, OBJ_PIPWNT);
@@ -347,7 +386,7 @@
 vm_object_pip_wakeupn(vm_object_t object, short i)
 {
 
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+	VM_OBJECT_ASSERT_WLOCKED(object);
 	if (i)
 		object->paging_in_progress -= i;
 	if ((object->flags & OBJ_PIPWNT) && object->paging_in_progress == 0) {
@@ -360,10 +399,10 @@
 vm_object_pip_wait(vm_object_t object, char *waitid)
 {
 
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+	VM_OBJECT_ASSERT_WLOCKED(object);
 	while (object->paging_in_progress) {
 		object->flags |= OBJ_PIPWNT;
-		msleep(object, VM_OBJECT_MTX(object), PVM, waitid, 0);
+		VM_OBJECT_SLEEP(object, object, PVM, waitid, 0);
 	}
 }
 
@@ -394,9 +433,9 @@
 {
 	if (object == NULL)
 		return;
-	VM_OBJECT_LOCK(object);
+	VM_OBJECT_WLOCK(object);
 	vm_object_reference_locked(object);
-	VM_OBJECT_UNLOCK(object);
+	VM_OBJECT_WUNLOCK(object);
 }
 
 /*
@@ -411,7 +450,7 @@
 {
 	struct vnode *vp;
 
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+	VM_OBJECT_ASSERT_WLOCKED(object);
 	object->ref_count++;
 	if (object->type == OBJT_VNODE) {
 		vp = object->handle;
@@ -427,8 +466,7 @@
 {
 	struct vnode *vp = (struct vnode *) object->handle;
 
-	VFS_ASSERT_GIANT(vp->v_mount);
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+	VM_OBJECT_ASSERT_WLOCKED(object);
 	KASSERT(object->type == OBJT_VNODE,
 	    ("vm_object_vndeallocate: not a vnode object"));
 	KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp"));
@@ -439,25 +477,30 @@
 	}
 #endif
 
-	if (object->ref_count > 1) {
+	/*
+	 * The test for the vnode's VV_TEXT flag does not need a
+	 * bypass to reach the right vnode, since vp is obtained
+	 * directly from object->handle.
+	 */
+	if (object->ref_count > 1 || (vp->v_vflag & VV_TEXT) == 0) {
 		object->ref_count--;
-		VM_OBJECT_UNLOCK(object);
+		VM_OBJECT_WUNLOCK(object);
 		/* vrele may need the vnode lock. */
 		vrele(vp);
 	} else {
 		vhold(vp);
-		VM_OBJECT_UNLOCK(object);
+		VM_OBJECT_WUNLOCK(object);
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		vdrop(vp);
-		VM_OBJECT_LOCK(object);
+		VM_OBJECT_WLOCK(object);
 		object->ref_count--;
 		if (object->type == OBJT_DEAD) {
-			VM_OBJECT_UNLOCK(object);
+			VM_OBJECT_WUNLOCK(object);
 			VOP_UNLOCK(vp, 0);
 		} else {
 			if (object->ref_count == 0)
 				VOP_UNSET_TEXT(vp);
-			VM_OBJECT_UNLOCK(object);
+			VM_OBJECT_WUNLOCK(object);
 			vput(vp);
 		}
 	}
@@ -478,40 +521,14 @@
 vm_object_deallocate(vm_object_t object)
 {
 	vm_object_t temp;
+	struct vnode *vp;
 
 	while (object != NULL) {
-		int vfslocked;
-
-		vfslocked = 0;
-	restart:
-		VM_OBJECT_LOCK(object);
+		VM_OBJECT_WLOCK(object);
 		if (object->type == OBJT_VNODE) {
-			struct vnode *vp = (struct vnode *) object->handle;
-
-			/*
-			 * Conditionally acquire Giant for a vnode-backed
-			 * object.  We have to be careful since the type of
-			 * a vnode object can change while the object is
-			 * unlocked.
-			 */
-			if (VFS_NEEDSGIANT(vp->v_mount) && !vfslocked) {
-				vfslocked = 1;
-				if (!mtx_trylock(&Giant)) {
-					VM_OBJECT_UNLOCK(object);
-					mtx_lock(&Giant);
-					goto restart;
-				}
-			}
 			vm_object_vndeallocate(object);
-			VFS_UNLOCK_GIANT(vfslocked);
 			return;
-		} else
-			/*
-			 * This is to handle the case that the object
-			 * changed type while we dropped its lock to
-			 * obtain Giant.
-			 */
-			VFS_UNLOCK_GIANT(vfslocked);
+		}
 
 		KASSERT(object->ref_count != 0,
 			("vm_object_deallocate: object deallocated too many times: %d", object->type));
@@ -524,13 +541,33 @@
 		 */
 		object->ref_count--;
 		if (object->ref_count > 1) {
-			VM_OBJECT_UNLOCK(object);
+			VM_OBJECT_WUNLOCK(object);
 			return;
 		} else if (object->ref_count == 1) {
+			if (object->type == OBJT_SWAP &&
+			    (object->flags & OBJ_TMPFS) != 0) {
+				vp = object->un_pager.swp.swp_tmpfs;
+				vhold(vp);
+				VM_OBJECT_WUNLOCK(object);
+				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+				VM_OBJECT_WLOCK(object);
+				if (object->type == OBJT_DEAD ||
+				    object->ref_count != 1) {
+					VM_OBJECT_WUNLOCK(object);
+					VOP_UNLOCK(vp, 0);
+					vdrop(vp);
+					return;
+				}
+				if ((object->flags & OBJ_TMPFS) != 0)
+					VOP_UNSET_TEXT(vp);
+				VOP_UNLOCK(vp, 0);
+				vdrop(vp);
+			}
 			if (object->shadow_count == 0 &&
 			    object->handle == NULL &&
 			    (object->type == OBJT_DEFAULT ||
-			     object->type == OBJT_SWAP)) {
+			    (object->type == OBJT_SWAP &&
+			    (object->flags & OBJ_TMPFS_NODE) == 0))) {
 				vm_object_set_flag(object, OBJ_ONEMAPPING);
 			} else if ((object->shadow_count == 1) &&
 			    (object->handle == NULL) &&
@@ -543,12 +580,14 @@
 				    ("vm_object_deallocate: ref_count: %d, shadow_count: %d",
 					 object->ref_count,
 					 object->shadow_count));
-				if (!VM_OBJECT_TRYLOCK(robject)) {
+				KASSERT((robject->flags & OBJ_TMPFS_NODE) == 0,
+				    ("shadowed tmpfs v_object %p", object));
+				if (!VM_OBJECT_TRYWLOCK(robject)) {
 					/*
 					 * Avoid a potential deadlock.
 					 */
 					object->ref_count++;
-					VM_OBJECT_UNLOCK(object);
+					VM_OBJECT_WUNLOCK(object);
 					/*
 					 * More likely than not the thread
 					 * holding robject's lock has lower
@@ -572,28 +611,27 @@
 					robject->ref_count++;
 retry:
 					if (robject->paging_in_progress) {
-						VM_OBJECT_UNLOCK(object);
+						VM_OBJECT_WUNLOCK(object);
 						vm_object_pip_wait(robject,
 						    "objde1");
 						temp = robject->backing_object;
 						if (object == temp) {
-							VM_OBJECT_LOCK(object);
+							VM_OBJECT_WLOCK(object);
 							goto retry;
 						}
 					} else if (object->paging_in_progress) {
-						VM_OBJECT_UNLOCK(robject);
+						VM_OBJECT_WUNLOCK(robject);
 						object->flags |= OBJ_PIPWNT;
-						msleep(object,
-						    VM_OBJECT_MTX(object),
+						VM_OBJECT_SLEEP(object, object,
 						    PDROP | PVM, "objde2", 0);
-						VM_OBJECT_LOCK(robject);
+						VM_OBJECT_WLOCK(robject);
 						temp = robject->backing_object;
 						if (object == temp) {
-							VM_OBJECT_LOCK(object);
+							VM_OBJECT_WLOCK(object);
 							goto retry;
 						}
 					} else
-						VM_OBJECT_UNLOCK(object);
+						VM_OBJECT_WUNLOCK(object);
 
 					if (robject->ref_count == 1) {
 						robject->ref_count--;
@@ -602,21 +640,23 @@
 					}
 					object = robject;
 					vm_object_collapse(object);
-					VM_OBJECT_UNLOCK(object);
+					VM_OBJECT_WUNLOCK(object);
 					continue;
 				}
-				VM_OBJECT_UNLOCK(robject);
+				VM_OBJECT_WUNLOCK(robject);
 			}
-			VM_OBJECT_UNLOCK(object);
+			VM_OBJECT_WUNLOCK(object);
 			return;
 		}
 doterm:
 		temp = object->backing_object;
 		if (temp != NULL) {
-			VM_OBJECT_LOCK(temp);
+			KASSERT((object->flags & OBJ_TMPFS_NODE) == 0,
+			    ("shadowed tmpfs v_object 2 %p", object));
+			VM_OBJECT_WLOCK(temp);
 			LIST_REMOVE(object, shadow_list);
 			temp->shadow_count--;
-			VM_OBJECT_UNLOCK(temp);
+			VM_OBJECT_WUNLOCK(temp);
 			object->backing_object = NULL;
 		}
 		/*
@@ -627,7 +667,7 @@
 		if ((object->flags & OBJ_DEAD) == 0)
 			vm_object_terminate(object);
 		else
-			VM_OBJECT_UNLOCK(object);
+			VM_OBJECT_WUNLOCK(object);
 		object = temp;
 	}
 }
@@ -641,20 +681,9 @@
 {
 
 	/*
-	 * Remove the object from the global object list.
-	 */
-	mtx_lock(&vm_object_list_mtx);
-	TAILQ_REMOVE(&vm_object_list, object, object_list);
-	mtx_unlock(&vm_object_list_mtx);
-
-	/*
 	 * Release the allocation charge.
 	 */
 	if (object->cred != NULL) {
-		KASSERT(object->type == OBJT_DEFAULT ||
-		    object->type == OBJT_SWAP,
-		    ("vm_object_terminate: non-swap obj %p has cred",
-		     object));
 		swap_release_by_cred(object->charge, object->cred);
 		object->charge = 0;
 		crfree(object->cred);
@@ -679,7 +708,7 @@
 {
 	vm_page_t p, p_next;
 
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+	VM_OBJECT_ASSERT_WLOCKED(object);
 
 	/*
 	 * Make sure no one uses us.
@@ -705,11 +734,15 @@
 		 * Clean pages and flush buffers.
 		 */
 		vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
-		VM_OBJECT_UNLOCK(object);
+		VM_OBJECT_WUNLOCK(object);
 
 		vinvalbuf(vp, V_SAVE, 0, 0);
 
-		VM_OBJECT_LOCK(object);
+		BO_LOCK(&vp->v_bufobj);
+		vp->v_bufobj.bo_flag |= BO_DEAD;
+		BO_UNLOCK(&vp->v_bufobj);
+
+		VM_OBJECT_WLOCK(object);
 	}
 
 	KASSERT(object->ref_count == 0, 
@@ -723,8 +756,7 @@
 	 * the object, the page and object are reset to any empty state. 
 	 */
 	TAILQ_FOREACH_SAFE(p, &object->memq, listq, p_next) {
-		KASSERT(!p->busy && (p->oflags & VPO_BUSY) == 0,
-		    ("vm_object_terminate: freeing busy page %p", p));
+		vm_page_assert_unbusied(p);
 		vm_page_lock(p);
 		/*
 		 * Optimize the page's removal from the object by resetting
@@ -746,7 +778,7 @@
 	 * modified by the preceding loop.
 	 */
 	if (object->resident_page_count != 0) {
-		object->root = NULL;
+		vm_radix_reclaim_allnodes(&object->rtree);
 		TAILQ_INIT(&object->memq);
 		object->resident_page_count = 0;
 		if (object->type == OBJT_VNODE)
@@ -757,14 +789,18 @@
 	if (__predict_false(!LIST_EMPTY(&object->rvq)))
 		vm_reserv_break_all(object);
 #endif
-	if (__predict_false(object->cache != NULL))
+	if (__predict_false(!vm_object_cache_is_empty(object)))
 		vm_page_cache_free(object, 0, 0);
 
+	KASSERT(object->cred == NULL || object->type == OBJT_DEFAULT ||
+	    object->type == OBJT_SWAP,
+	    ("%s: non-swap obj %p has cred", __func__, object));
+
 	/*
 	 * Let the pager know object is dead.
 	 */
 	vm_pager_deallocate(object);
-	VM_OBJECT_UNLOCK(object);
+	VM_OBJECT_WUNLOCK(object);
 
 	vm_object_destroy(object);
 }
@@ -820,9 +856,13 @@
 	int curgeneration, n, pagerflags;
 	boolean_t clearobjflags, eio, res;
 
-	mtx_assert(&vm_page_queue_mtx, MA_NOTOWNED);
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
-	KASSERT(object->type == OBJT_VNODE, ("Not a vnode object"));
+	VM_OBJECT_ASSERT_WLOCKED(object);
+
+	/*
+	 * The OBJ_MIGHTBEDIRTY flag is only set for OBJT_VNODE
+	 * objects.  The check below prevents the function from
+	 * operating on non-vnode objects.
+	 */
 	if ((object->flags & OBJ_MIGHTBEDIRTY) == 0 ||
 	    object->resident_page_count == 0)
 		return (TRUE);
@@ -846,7 +886,7 @@
 		np = TAILQ_NEXT(p, listq);
 		if (p->valid == 0)
 			continue;
-		if (vm_page_sleep_if_busy(p, TRUE, "vpcwai")) {
+		if (vm_page_sleep_if_busy(p, "vpcwai")) {
 			if (object->generation != curgeneration) {
 				if ((flags & OBJPC_SYNC) != 0)
 					goto rescan;
@@ -906,9 +946,8 @@
 	vm_page_t ma[vm_pageout_page_count], p_first, tp;
 	int count, i, mreq, runlen;
 
-	mtx_assert(&vm_page_queue_mtx, MA_NOTOWNED);
 	vm_page_lock_assert(p, MA_NOTOWNED);
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+	VM_OBJECT_ASSERT_WLOCKED(object);
 
 	count = 1;
 	mreq = 0;
@@ -915,7 +954,7 @@
 
 	for (tp = p; count < vm_pageout_page_count; count++) {
 		tp = vm_page_next(tp);
-		if (tp == NULL || tp->busy != 0 || (tp->oflags & VPO_BUSY) != 0)
+		if (tp == NULL || vm_page_busied(tp))
 			break;
 		if (!vm_object_page_remove_write(tp, flags, clearobjflags))
 			break;
@@ -923,7 +962,7 @@
 
 	for (p_first = p; count < vm_pageout_page_count; count++) {
 		tp = vm_page_prev(p_first);
-		if (tp == NULL || tp->busy != 0 || (tp->oflags & VPO_BUSY) != 0)
+		if (tp == NULL || vm_page_busied(tp))
 			break;
 		if (!vm_object_page_remove_write(tp, flags, clearobjflags))
 			break;
@@ -966,11 +1005,11 @@
 		return (TRUE);
 	res = TRUE;
 	error = 0;
-	VM_OBJECT_LOCK(object);
+	VM_OBJECT_WLOCK(object);
 	while ((backing_object = object->backing_object) != NULL) {
-		VM_OBJECT_LOCK(backing_object);
+		VM_OBJECT_WLOCK(backing_object);
 		offset += object->backing_object_offset;
-		VM_OBJECT_UNLOCK(object);
+		VM_OBJECT_WUNLOCK(object);
 		object = backing_object;
 		if (object->size < OFF_TO_IDX(offset + size))
 			size = IDX_TO_OFF(object->size) - offset;
@@ -989,11 +1028,9 @@
 	 */
 	if (object->type == OBJT_VNODE &&
 	    (object->flags & OBJ_MIGHTBEDIRTY) != 0) {
-		int vfslocked;
 		vp = object->handle;
-		VM_OBJECT_UNLOCK(object);
+		VM_OBJECT_WUNLOCK(object);
 		(void) vn_start_write(vp, &mp, V_WAIT);
-		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		if (syncio && !invalidate && offset == 0 &&
 		    OFF_TO_IDX(size) == object->size) {
@@ -1010,18 +1047,17 @@
 			flags |= invalidate ? (OBJPC_SYNC | OBJPC_INVAL) : 0;
 			fsync_after = FALSE;
 		}
-		VM_OBJECT_LOCK(object);
+		VM_OBJECT_WLOCK(object);
 		res = vm_object_page_clean(object, offset, offset + size,
 		    flags);
-		VM_OBJECT_UNLOCK(object);
+		VM_OBJECT_WUNLOCK(object);
 		if (fsync_after)
 			error = VOP_FSYNC(vp, MNT_WAIT, curthread);
 		VOP_UNLOCK(vp, 0);
-		VFS_UNLOCK_GIANT(vfslocked);
 		vn_finished_write(mp);
 		if (error != 0)
 			res = FALSE;
-		VM_OBJECT_LOCK(object);
+		VM_OBJECT_WLOCK(object);
 	}
 	if ((object->type == OBJT_VNODE ||
 	     object->type == OBJT_DEVICE) && invalidate) {
@@ -1039,7 +1075,7 @@
 		vm_object_page_remove(object, OFF_TO_IDX(offset),
 		    OFF_TO_IDX(offset + size + PAGE_MASK), flags);
 	}
-	VM_OBJECT_UNLOCK(object);
+	VM_OBJECT_WUNLOCK(object);
 	return (res);
 }
 
@@ -1074,7 +1110,7 @@
 
 	if (object == NULL)
 		return;
-	VM_OBJECT_LOCK(object);
+	VM_OBJECT_WLOCK(object);
 	/*
 	 * Locate and adjust resident pages
 	 */
@@ -1093,7 +1129,7 @@
 			    (tobject->flags & OBJ_ONEMAPPING) == 0) {
 				goto unlock_tobject;
 			}
-		} else if (tobject->type == OBJT_PHYS)
+		} else if ((tobject->flags & OBJ_UNMANAGED) != 0)
 			goto unlock_tobject;
 		m = vm_page_lookup(tobject, tpindex);
 		if (m == NULL && advise == MADV_WILLNEED) {
@@ -1115,10 +1151,10 @@
 			backing_object = tobject->backing_object;
 			if (backing_object == NULL)
 				goto unlock_tobject;
-			VM_OBJECT_LOCK(backing_object);
+			VM_OBJECT_WLOCK(backing_object);
 			tpindex += OFF_TO_IDX(tobject->backing_object_offset);
 			if (tobject != object)
-				VM_OBJECT_UNLOCK(tobject);
+				VM_OBJECT_WUNLOCK(tobject);
 			tobject = backing_object;
 			goto shadowlookup;
 		} else if (m->valid != VM_PAGE_BITS_ALL)
@@ -1135,7 +1171,7 @@
 		    ("vm_object_madvise: page %p is fictitious", m));
 		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 		    ("vm_object_madvise: page %p is not managed", m));
-		if ((m->oflags & VPO_BUSY) || m->busy) {
+		if (vm_page_busied(m)) {
 			if (advise == MADV_WILLNEED) {
 				/*
 				 * Reference the page before unlocking and
@@ -1144,39 +1180,17 @@
 				 */
 				vm_page_aflag_set(m, PGA_REFERENCED);
 			}
-			vm_page_unlock(m);
 			if (object != tobject)
-				VM_OBJECT_UNLOCK(object);
-			m->oflags |= VPO_WANTED;
-			msleep(m, VM_OBJECT_MTX(tobject), PDROP | PVM, "madvpo",
-			    0);
-			VM_OBJECT_LOCK(object);
+				VM_OBJECT_WUNLOCK(object);
+			VM_OBJECT_WUNLOCK(tobject);
+			vm_page_busy_sleep(m, "madvpo", false);
+			VM_OBJECT_WLOCK(object);
   			goto relookup;
 		}
 		if (advise == MADV_WILLNEED) {
 			vm_page_activate(m);
-		} else if (advise == MADV_DONTNEED) {
-			vm_page_dontneed(m);
-		} else if (advise == MADV_FREE) {
-			/*
-			 * Mark the page clean.  This will allow the page
-			 * to be freed up by the system.  However, such pages
-			 * are often reused quickly by malloc()/free()
-			 * so we do not do anything that would cause
-			 * a page fault if we can help it.
-			 *
-			 * Specifically, we do not try to actually free
-			 * the page now nor do we try to put it in the
-			 * cache (which would cause a page fault on reuse).
-			 *
-			 * But we do make the page is freeable as we
-			 * can without actually taking the step of unmapping
-			 * it.
-			 */
-			pmap_clear_modify(m);
-			m->dirty = 0;
-			m->act_count = 0;
-			vm_page_dontneed(m);
+		} else {
+			vm_page_advise(m, advise);
 		}
 		vm_page_unlock(m);
 		if (advise == MADV_FREE && tobject->type == OBJT_SWAP)
@@ -1183,9 +1197,9 @@
 			swap_pager_freespace(tobject, tpindex, 1);
 unlock_tobject:
 		if (tobject != object)
-			VM_OBJECT_UNLOCK(tobject);
+			VM_OBJECT_WUNLOCK(tobject);
 	}	
-	VM_OBJECT_UNLOCK(object);
+	VM_OBJECT_WUNLOCK(object);
 }
 
 /*
@@ -1213,15 +1227,15 @@
 	 * Don't create the new object if the old object isn't shared.
 	 */
 	if (source != NULL) {
-		VM_OBJECT_LOCK(source);
+		VM_OBJECT_WLOCK(source);
 		if (source->ref_count == 1 &&
 		    source->handle == NULL &&
 		    (source->type == OBJT_DEFAULT ||
 		     source->type == OBJT_SWAP)) {
-			VM_OBJECT_UNLOCK(source);
+			VM_OBJECT_WUNLOCK(source);
 			return;
 		}
-		VM_OBJECT_UNLOCK(source);
+		VM_OBJECT_WUNLOCK(source);
 	}
 
 	/*
@@ -1246,7 +1260,7 @@
 	 */
 	result->backing_object_offset = *offset;
 	if (source != NULL) {
-		VM_OBJECT_LOCK(source);
+		VM_OBJECT_WLOCK(source);
 		LIST_INSERT_HEAD(&source->shadow_head, result, shadow_list);
 		source->shadow_count++;
 #if VM_NRESERVLEVEL > 0
@@ -1254,7 +1268,7 @@
 		result->pg_color = (source->pg_color + OFF_TO_IDX(*offset)) &
 		    ((1 << (VM_NFREEORDER - 1)) - 1);
 #endif
-		VM_OBJECT_UNLOCK(source);
+		VM_OBJECT_WUNLOCK(source);
 	}
 
 
@@ -1285,7 +1299,7 @@
 		return;
 	if (orig_object->ref_count <= 1)
 		return;
-	VM_OBJECT_UNLOCK(orig_object);
+	VM_OBJECT_WUNLOCK(orig_object);
 
 	offidxstart = OFF_TO_IDX(entry->offset);
 	size = atop(entry->end - entry->start);
@@ -1300,17 +1314,17 @@
 	 * At this point, the new object is still private, so the order in
 	 * which the original and new objects are locked does not matter.
 	 */
-	VM_OBJECT_LOCK(new_object);
-	VM_OBJECT_LOCK(orig_object);
+	VM_OBJECT_WLOCK(new_object);
+	VM_OBJECT_WLOCK(orig_object);
 	source = orig_object->backing_object;
 	if (source != NULL) {
-		VM_OBJECT_LOCK(source);
+		VM_OBJECT_WLOCK(source);
 		if ((source->flags & OBJ_DEAD) != 0) {
-			VM_OBJECT_UNLOCK(source);
-			VM_OBJECT_UNLOCK(orig_object);
-			VM_OBJECT_UNLOCK(new_object);
+			VM_OBJECT_WUNLOCK(source);
+			VM_OBJECT_WUNLOCK(orig_object);
+			VM_OBJECT_WUNLOCK(new_object);
 			vm_object_deallocate(new_object);
-			VM_OBJECT_LOCK(orig_object);
+			VM_OBJECT_WLOCK(orig_object);
 			return;
 		}
 		LIST_INSERT_HEAD(&source->shadow_head,
@@ -1318,7 +1332,7 @@
 		source->shadow_count++;
 		vm_object_reference_locked(source);	/* for new_object */
 		vm_object_clear_flag(source, OBJ_ONEMAPPING);
-		VM_OBJECT_UNLOCK(source);
+		VM_OBJECT_WUNLOCK(source);
 		new_object->backing_object_offset = 
 			orig_object->backing_object_offset + entry->offset;
 		new_object->backing_object = source;
@@ -1344,18 +1358,42 @@
 		 * We do not have to VM_PROT_NONE the page as mappings should
 		 * not be changed by this operation.
 		 */
-		if ((m->oflags & VPO_BUSY) || m->busy) {
-			VM_OBJECT_UNLOCK(new_object);
-			m->oflags |= VPO_WANTED;
-			msleep(m, VM_OBJECT_MTX(orig_object), PVM, "spltwt", 0);
-			VM_OBJECT_LOCK(new_object);
+		if (vm_page_busied(m)) {
+			VM_OBJECT_WUNLOCK(new_object);
+			vm_page_lock(m);
+			VM_OBJECT_WUNLOCK(orig_object);
+			vm_page_busy_sleep(m, "spltwt", false);
+			VM_OBJECT_WLOCK(orig_object);
+			VM_OBJECT_WLOCK(new_object);
 			goto retry;
 		}
-		vm_page_lock(m);
-		vm_page_rename(m, new_object, idx);
-		vm_page_unlock(m);
-		/* page automatically made dirty by rename and cache handled */
-		vm_page_busy(m);
+
+		/* vm_page_rename() will handle dirty and cache. */
+		if (vm_page_rename(m, new_object, idx)) {
+			VM_OBJECT_WUNLOCK(new_object);
+			VM_OBJECT_WUNLOCK(orig_object);
+			VM_WAIT;
+			VM_OBJECT_WLOCK(orig_object);
+			VM_OBJECT_WLOCK(new_object);
+			goto retry;
+		}
+#if VM_NRESERVLEVEL > 0
+		/*
+		 * If some of the reservation's allocated pages remain with
+		 * the original object, then transferring the reservation to
+		 * the new object is neither particularly beneficial nor
+		 * particularly harmful as compared to leaving the reservation
+		 * with the original object.  If, however, all of the
+		 * reservation's allocated pages are transferred to the new
+		 * object, then transferring the reservation is typically
+		 * beneficial.  Determining which of these two cases applies
+		 * would be more costly than unconditionally renaming the
+		 * reservation.
+		 */
+		vm_reserv_rename(m, new_object, orig_object, offidxstart);
+#endif
+		if (orig_object->type == OBJT_SWAP)
+			vm_page_xbusy(m);
 	}
 	if (orig_object->type == OBJT_SWAP) {
 		/*
@@ -1363,22 +1401,28 @@
 		 * and new_object's locks are released and reacquired. 
 		 */
 		swap_pager_copy(orig_object, new_object, offidxstart, 0);
+		TAILQ_FOREACH(m, &new_object->memq, listq)
+			vm_page_xunbusy(m);
 
 		/*
 		 * Transfer any cached pages from orig_object to new_object.
+		 * If swap_pager_copy() found swapped out pages within the
+		 * specified range of orig_object, then it changed
+		 * new_object's type to OBJT_SWAP when it transferred those
+		 * pages to new_object.  Otherwise, new_object's type
+		 * should still be OBJT_DEFAULT and orig_object should not
+		 * contain any cached pages within the specified range.
 		 */
-		if (__predict_false(orig_object->cache != NULL))
+		if (__predict_false(!vm_object_cache_is_empty(orig_object)))
 			vm_page_cache_transfer(orig_object, offidxstart,
 			    new_object);
 	}
-	VM_OBJECT_UNLOCK(orig_object);
-	TAILQ_FOREACH(m, &new_object->memq, listq)
-		vm_page_wakeup(m);
-	VM_OBJECT_UNLOCK(new_object);
+	VM_OBJECT_WUNLOCK(orig_object);
+	VM_OBJECT_WUNLOCK(new_object);
 	entry->object.vm_object = new_object;
 	entry->offset = 0LL;
 	vm_object_deallocate(orig_object);
-	VM_OBJECT_LOCK(new_object);
+	VM_OBJECT_WLOCK(new_object);
 }
 
 #define	OBSC_TEST_ALL_SHADOWED	0x0001
@@ -1385,16 +1429,43 @@
 #define	OBSC_COLLAPSE_NOWAIT	0x0002
 #define	OBSC_COLLAPSE_WAIT	0x0004
 
-static int
+static vm_page_t
+vm_object_backing_scan_wait(vm_object_t object, vm_page_t p, vm_page_t next,
+    int op)
+{
+	vm_object_t backing_object;
+
+	VM_OBJECT_ASSERT_WLOCKED(object);
+	backing_object = object->backing_object;
+	VM_OBJECT_ASSERT_WLOCKED(backing_object);
+
+	KASSERT(p == NULL || vm_page_busied(p), ("unbusy page %p", p));
+	KASSERT(p == NULL || p->object == object || p->object == backing_object,
+	    ("invalid ownership %p %p %p", p, object, backing_object));
+	if ((op & OBSC_COLLAPSE_NOWAIT) != 0)
+		return (next);
+	if (p != NULL)
+		vm_page_lock(p);
+	VM_OBJECT_WUNLOCK(object);
+	VM_OBJECT_WUNLOCK(backing_object);
+	if (p == NULL)
+		VM_WAIT;
+	else
+		vm_page_busy_sleep(p, "vmocol", false);
+	VM_OBJECT_WLOCK(object);
+	VM_OBJECT_WLOCK(backing_object);
+	return (TAILQ_FIRST(&backing_object->memq));
+}
+
+static bool
 vm_object_backing_scan(vm_object_t object, int op)
 {
-	int r = 1;
-	vm_page_t p;
 	vm_object_t backing_object;
-	vm_pindex_t backing_offset_index;
+	vm_page_t next, p, pp;
+	vm_pindex_t backing_offset_index, new_pindex;
 
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
-	VM_OBJECT_LOCK_ASSERT(object->backing_object, MA_OWNED);
+	VM_OBJECT_ASSERT_WLOCKED(object);
+	VM_OBJECT_ASSERT_WLOCKED(object->backing_object);
 
 	backing_object = object->backing_object;
 	backing_offset_index = OFF_TO_IDX(object->backing_object_offset);
@@ -1413,7 +1484,7 @@
 		 * shadow test may succeed! XXX
 		 */
 		if (backing_object->type != OBJT_DEFAULT) {
-			return (0);
+			return (false);
 		}
 	}
 	if (op & OBSC_COLLAPSE_WAIT) {
@@ -1425,24 +1496,19 @@
 	 */
 	p = TAILQ_FIRST(&backing_object->memq);
 	while (p) {
-		vm_page_t next = TAILQ_NEXT(p, listq);
-		vm_pindex_t new_pindex = p->pindex - backing_offset_index;
-
+		next = TAILQ_NEXT(p, listq);
+		new_pindex = p->pindex - backing_offset_index;
 		if (op & OBSC_TEST_ALL_SHADOWED) {
-			vm_page_t pp;
-
 			/*
 			 * Ignore pages outside the parent object's range
 			 * and outside the parent object's mapping of the 
 			 * backing object.
 			 *
-			 * note that we do not busy the backing object's
+			 * Note that we do not busy the backing object's
 			 * page.
 			 */
-			if (
-			    p->pindex < backing_offset_index ||
-			    new_pindex >= object->size
-			) {
+			if (p->pindex < backing_offset_index ||
+			    new_pindex >= object->size) {
 				p = next;
 				continue;
 			}
@@ -1458,13 +1524,9 @@
 			 */
 
 			pp = vm_page_lookup(object, new_pindex);
-			if (
-			    (pp == NULL || pp->valid == 0) &&
-			    !vm_pager_has_page(object, new_pindex, NULL, NULL)
-			) {
-				r = 0;
-				break;
-			}
+			if ((pp == NULL || pp->valid == 0) &&
+			    !vm_pager_has_page(object, new_pindex, NULL, NULL))
+				return (false);
 		}
 
 		/*
@@ -1471,55 +1533,21 @@
 		 * Check for busy page
 		 */
 		if (op & (OBSC_COLLAPSE_WAIT | OBSC_COLLAPSE_NOWAIT)) {
-			vm_page_t pp;
-
-			if (op & OBSC_COLLAPSE_NOWAIT) {
-				if ((p->oflags & VPO_BUSY) ||
-				    !p->valid || 
-				    p->busy) {
-					p = next;
-					continue;
-				}
-			} else if (op & OBSC_COLLAPSE_WAIT) {
-				if ((p->oflags & VPO_BUSY) || p->busy) {
-					VM_OBJECT_UNLOCK(object);
-					p->oflags |= VPO_WANTED;
-					msleep(p, VM_OBJECT_MTX(backing_object),
-					    PDROP | PVM, "vmocol", 0);
-					VM_OBJECT_LOCK(object);
-					VM_OBJECT_LOCK(backing_object);
-					/*
-					 * If we slept, anything could have
-					 * happened.  Since the object is
-					 * marked dead, the backing offset
-					 * should not have changed so we
-					 * just restart our scan.
-					 */
-					p = TAILQ_FIRST(&backing_object->memq);
-					continue;
-				}
+			if (vm_page_busied(p)) {
+				p = vm_object_backing_scan_wait(object, p,
+				    next, op);
+				continue;
 			}
 
-			KASSERT(
-			    p->object == backing_object,
-			    ("vm_object_backing_scan: object mismatch")
-			);
+			KASSERT(p->object == backing_object,
+			    ("vm_object_backing_scan: object mismatch"));
 
-			/*
-			 * Destroy any associated swap
-			 */
-			if (backing_object->type == OBJT_SWAP) {
-				swap_pager_freespace(
-				    backing_object, 
-				    p->pindex,
-				    1
-				);
-			}
+			if (p->pindex < backing_offset_index ||
+			    new_pindex >= object->size) {
+				if (backing_object->type == OBJT_SWAP)
+					swap_pager_freespace(backing_object, 
+					    p->pindex, 1);
 
-			if (
-			    p->pindex < backing_offset_index ||
-			    new_pindex >= object->size
-			) {
 				/*
 				 * Page is out of the parent object's range, we 
 				 * can simply destroy it. 
@@ -1537,35 +1565,45 @@
 			}
 
 			pp = vm_page_lookup(object, new_pindex);
-			if (
-			    (op & OBSC_COLLAPSE_NOWAIT) != 0 &&
-			    (pp != NULL && pp->valid == 0)
-			) {
+			if (pp != NULL && vm_page_busied(pp)) {
 				/*
-				 * The page in the parent is not (yet) valid.
-				 * We don't know anything about the state of
-				 * the original page.  It might be mapped,
-				 * so we must avoid the next if here.
+				 * The page in the parent is busy and
+				 * possibly not (yet) valid.  Until
+				 * its state is finalized by the busy
+				 * bit owner, we can't tell whether it
+				 * shadows the original page.
+				 * Therefore, we must either skip it
+				 * and the original (backing_object)
+				 * page or wait for its state to be
+				 * finalized.
 				 *
-				 * This is due to a race in vm_fault() where
-				 * we must unbusy the original (backing_obj)
-				 * page before we can (re)lock the parent.
-				 * Hence we can get here.
+				 * This is due to a race with vm_fault()
+				 * where we must unbusy the original
+				 * (backing_obj) page before we can
+				 * (re)lock the parent.  Hence we can
+				 * get here.
 				 */
-				p = next;
+				p = vm_object_backing_scan_wait(object, pp,
+				    next, op);
 				continue;
 			}
-			if (
-			    pp != NULL ||
-			    vm_pager_has_page(object, new_pindex, NULL, NULL)
-			) {
+
+			KASSERT(pp == NULL || pp->valid != 0,
+			    ("unbusy invalid page %p", pp));
+
+			if (pp != NULL || vm_pager_has_page(object,
+			    new_pindex, NULL, NULL)) {
 				/*
-				 * page already exists in parent OR swap exists
-				 * for this location in the parent.  Destroy 
-				 * the original page from the backing object.
-				 *
-				 * Leave the parent's page alone
+				 * The page already exists in the
+				 * parent OR swap exists for this
+				 * location in the parent.  Leave the
+				 * parent's page alone.  Destroy the
+				 * original page from the backing
+				 * object.
 				 */
+				if (backing_object->type == OBJT_SWAP)
+					swap_pager_freespace(backing_object,
+					    p->pindex, 1);
 				vm_page_lock(p);
 				KASSERT(!pmap_page_is_mapped(p),
 				    ("freeing mapped page %p", p));
@@ -1578,6 +1616,25 @@
 				continue;
 			}
 
+			/*
+			 * Page does not exist in parent, rename the
+			 * page from the backing object to the main object. 
+			 *
+			 * If the page was mapped to a process, it can remain 
+			 * mapped through the rename.
+			 * vm_page_rename() will handle dirty and cache.
+			 */
+			if (vm_page_rename(p, object, new_pindex)) {
+				p = vm_object_backing_scan_wait(object, NULL,
+				    next, op);
+				continue;
+			}
+
+			/* Use the old pindex to free the right page. */
+			if (backing_object->type == OBJT_SWAP)
+				swap_pager_freespace(backing_object,
+				    new_pindex + backing_offset_index, 1);
+
 #if VM_NRESERVLEVEL > 0
 			/*
 			 * Rename the reservation.
@@ -1585,22 +1642,10 @@
 			vm_reserv_rename(p, object, backing_object,
 			    backing_offset_index);
 #endif
-
-			/*
-			 * Page does not exist in parent, rename the
-			 * page from the backing object to the main object. 
-			 *
-			 * If the page was mapped to a process, it can remain 
-			 * mapped through the rename.
-			 */
-			vm_page_lock(p);
-			vm_page_rename(p, object, new_pindex);
-			vm_page_unlock(p);
-			/* page automatically made dirty by rename */
 		}
 		p = next;
 	}
-	return (r);
+	return (true);
 }
 
 
@@ -1614,8 +1659,8 @@
 {
 	vm_object_t backing_object = object->backing_object;
 
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
-	VM_OBJECT_LOCK_ASSERT(backing_object, MA_OWNED);
+	VM_OBJECT_ASSERT_WLOCKED(object);
+	VM_OBJECT_ASSERT_WLOCKED(backing_object);
 
 	if (backing_object->ref_count != 1)
 		return;
@@ -1633,11 +1678,11 @@
 void
 vm_object_collapse(vm_object_t object)
 {
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
-	
+	vm_object_t backing_object, new_backing_object;
+
+	VM_OBJECT_ASSERT_WLOCKED(object);
+
 	while (TRUE) {
-		vm_object_t backing_object;
-
 		/*
 		 * Verify that the conditions are right for collapse:
 		 *
@@ -1650,7 +1695,7 @@
 		 * we check the backing object first, because it is most likely
 		 * not collapsable.
 		 */
-		VM_OBJECT_LOCK(backing_object);
+		VM_OBJECT_WLOCK(backing_object);
 		if (backing_object->handle != NULL ||
 		    (backing_object->type != OBJT_DEFAULT &&
 		     backing_object->type != OBJT_SWAP) ||
@@ -1659,18 +1704,17 @@
 		    (object->type != OBJT_DEFAULT &&
 		     object->type != OBJT_SWAP) ||
 		    (object->flags & OBJ_DEAD)) {
-			VM_OBJECT_UNLOCK(backing_object);
+			VM_OBJECT_WUNLOCK(backing_object);
 			break;
 		}
 
-		if (
-		    object->paging_in_progress != 0 ||
-		    backing_object->paging_in_progress != 0
-		) {
+		if (object->paging_in_progress != 0 ||
+		    backing_object->paging_in_progress != 0) {
 			vm_object_qcollapse(object);
-			VM_OBJECT_UNLOCK(backing_object);
+			VM_OBJECT_WUNLOCK(backing_object);
 			break;
 		}
+
 		/*
 		 * We know that we can either collapse the backing object (if
 		 * the parent is the only reference to it) or (perhaps) have
@@ -1682,6 +1726,9 @@
 		 * case.
 		 */
 		if (backing_object->ref_count == 1) {
+			vm_object_pip_add(object, 1);
+			vm_object_pip_add(backing_object, 1);
+
 			/*
 			 * If there is exactly one reference to the backing
 			 * object, we can collapse it into the parent.  
@@ -1704,6 +1751,9 @@
 				 * swap_pager_copy() can sleep, in which case
 				 * the backing_object's and object's locks are
 				 * released and reacquired.
+				 * Since swap_pager_copy() is being asked to
+				 * destroy the source, it will change the
+				 * backing_object's type to OBJT_DEFAULT.
 				 */
 				swap_pager_copy(
 				    backing_object,
@@ -1713,7 +1763,8 @@
 				/*
 				 * Free any cached pages from backing_object.
 				 */
-				if (__predict_false(backing_object->cache != NULL))
+				if (__predict_false(
+				    !vm_object_cache_is_empty(backing_object)))
 					vm_page_cache_free(backing_object, 0, 0);
 			}
 			/*
@@ -1725,7 +1776,7 @@
 			LIST_REMOVE(object, shadow_list);
 			backing_object->shadow_count--;
 			if (backing_object->backing_object) {
-				VM_OBJECT_LOCK(backing_object->backing_object);
+				VM_OBJECT_WLOCK(backing_object->backing_object);
 				LIST_REMOVE(backing_object, shadow_list);
 				LIST_INSERT_HEAD(
 				    &backing_object->backing_object->shadow_head,
@@ -1733,7 +1784,7 @@
 				/*
 				 * The shadow_count has not changed.
 				 */
-				VM_OBJECT_UNLOCK(backing_object->backing_object);
+				VM_OBJECT_WUNLOCK(backing_object->backing_object);
 			}
 			object->backing_object = backing_object->backing_object;
 			object->backing_object_offset +=
@@ -1749,21 +1800,23 @@
 			KASSERT(backing_object->ref_count == 1, (
 "backing_object %p was somehow re-referenced during collapse!",
 			    backing_object));
-			VM_OBJECT_UNLOCK(backing_object);
+			vm_object_pip_wakeup(backing_object);
+			backing_object->type = OBJT_DEAD;
+			backing_object->ref_count = 0;
+			VM_OBJECT_WUNLOCK(backing_object);
 			vm_object_destroy(backing_object);
 
+			vm_object_pip_wakeup(object);
 			object_collapses++;
 		} else {
-			vm_object_t new_backing_object;
-
 			/*
 			 * If we do not entirely shadow the backing object,
 			 * there is nothing we can do so we give up.
 			 */
 			if (object->resident_page_count != object->size &&
-			    vm_object_backing_scan(object,
-			    OBSC_TEST_ALL_SHADOWED) == 0) {
-				VM_OBJECT_UNLOCK(backing_object);
+			    !vm_object_backing_scan(object,
+			    OBSC_TEST_ALL_SHADOWED)) {
+				VM_OBJECT_WUNLOCK(backing_object);
 				break;
 			}
 
@@ -1777,7 +1830,7 @@
 
 			new_backing_object = backing_object->backing_object;
 			if ((object->backing_object = new_backing_object) != NULL) {
-				VM_OBJECT_LOCK(new_backing_object);
+				VM_OBJECT_WLOCK(new_backing_object);
 				LIST_INSERT_HEAD(
 				    &new_backing_object->shadow_head,
 				    object,
@@ -1785,7 +1838,7 @@
 				);
 				new_backing_object->shadow_count++;
 				vm_object_reference_locked(new_backing_object);
-				VM_OBJECT_UNLOCK(new_backing_object);
+				VM_OBJECT_WUNLOCK(new_backing_object);
 				object->backing_object_offset +=
 					backing_object->backing_object_offset;
 			}
@@ -1795,7 +1848,7 @@
 			 * its ref_count was at least 2, it will not vanish.
 			 */
 			backing_object->ref_count--;
-			VM_OBJECT_UNLOCK(backing_object);
+			VM_OBJECT_WUNLOCK(backing_object);
 			object_bypasses++;
 		}
 
@@ -1836,10 +1889,9 @@
     int options)
 {
 	vm_page_t p, next;
-	int wirings;
 
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
-	KASSERT((object->type != OBJT_DEVICE && object->type != OBJT_PHYS) ||
+	VM_OBJECT_ASSERT_WLOCKED(object);
+	KASSERT((object->flags & OBJ_UNMANAGED) == 0 ||
 	    (options & (OBJPR_CLEANONLY | OBJPR_NOTMAPPED)) == OBJPR_NOTMAPPED,
 	    ("vm_object_page_remove: illegal options for object %p", object));
 	if (object->resident_page_count == 0)
@@ -1864,50 +1916,44 @@
 		 * not specified.
 		 */
 		vm_page_lock(p);
-		if ((wirings = p->wire_count) != 0 &&
-		    (wirings = pmap_page_wired_mappings(p)) != p->wire_count) {
-			if ((options & OBJPR_NOTMAPPED) == 0) {
+		if (vm_page_xbusied(p)) {
+			VM_OBJECT_WUNLOCK(object);
+			vm_page_busy_sleep(p, "vmopax", true);
+			VM_OBJECT_WLOCK(object);
+			goto again;
+		}
+		if (p->wire_count != 0) {
+			if ((options & OBJPR_NOTMAPPED) == 0)
 				pmap_remove_all(p);
-				/* Account for removal of wired mappings. */
-				if (wirings != 0)
-					p->wire_count -= wirings;
-			}
 			if ((options & OBJPR_CLEANONLY) == 0) {
 				p->valid = 0;
 				vm_page_undirty(p);
 			}
-			vm_page_unlock(p);
-			continue;
+			goto next;
 		}
-		if (vm_page_sleep_if_busy(p, TRUE, "vmopar"))
+		if (vm_page_busied(p)) {
+			VM_OBJECT_WUNLOCK(object);
+			vm_page_busy_sleep(p, "vmopar", false);
+			VM_OBJECT_WLOCK(object);
 			goto again;
+		}
 		KASSERT((p->flags & PG_FICTITIOUS) == 0,
 		    ("vm_object_page_remove: page %p is fictitious", p));
 		if ((options & OBJPR_CLEANONLY) != 0 && p->valid != 0) {
 			if ((options & OBJPR_NOTMAPPED) == 0)
 				pmap_remove_write(p);
-			if (p->dirty) {
-				vm_page_unlock(p);
-				continue;
-			}
+			if (p->dirty)
+				goto next;
 		}
-		if ((options & OBJPR_NOTMAPPED) == 0) {
+		if ((options & OBJPR_NOTMAPPED) == 0)
 			pmap_remove_all(p);
-			/* Account for removal of wired mappings. */
-			if (wirings != 0) {
-				KASSERT(p->wire_count == wirings,
-				    ("inconsistent wire count %d %d %p",
-				    p->wire_count, wirings, p));
-				p->wire_count = 0;
-				atomic_subtract_int(&cnt.v_wire_count, 1);
-			}
-		}
 		vm_page_free(p);
+next:
 		vm_page_unlock(p);
 	}
 	vm_object_pip_wakeup(object);
 skipmemq:
-	if (__predict_false(object->cache != NULL))
+	if (__predict_false(!vm_object_cache_is_empty(object)))
 		vm_page_cache_free(object, start, end);
 }
 
@@ -1923,7 +1969,7 @@
  *	pages are moved to the cache queue.
  *
  *	This operation should only be performed on objects that
- *	contain managed pages.
+ *	contain non-fictitious, managed pages.
  *
  *	The object must be locked.
  */
@@ -1933,9 +1979,8 @@
 	struct mtx *mtx, *new_mtx;
 	vm_page_t p, next;
 
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
-	KASSERT((object->type != OBJT_DEVICE && object->type != OBJT_SG &&
-	    object->type != OBJT_PHYS),
+	VM_OBJECT_ASSERT_WLOCKED(object);
+	KASSERT((object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0,
 	    ("vm_object_page_cache: illegal object %p", object));
 	if (object->resident_page_count == 0)
 		return;
@@ -1982,10 +2027,9 @@
 	vm_pindex_t pindex;
 	int rv;
 
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+	VM_OBJECT_ASSERT_WLOCKED(object);
 	for (pindex = start; pindex < end; pindex++) {
-		m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL |
-		    VM_ALLOC_RETRY);
+		m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL);
 		if (m->valid != VM_PAGE_BITS_ALL) {
 			ma[0] = m;
 			rv = vm_pager_get_pages(object, ma, 1, 0);
@@ -2007,7 +2051,7 @@
 	if (pindex > start) {
 		m = vm_page_lookup(object, start);
 		while (m != NULL && m->pindex < pindex) {
-			vm_page_wakeup(m);
+			vm_page_xunbusy(m);
 			m = TAILQ_NEXT(m, listq);
 		}
 	}
@@ -2043,10 +2087,11 @@
 
 	if (prev_object == NULL)
 		return (TRUE);
-	VM_OBJECT_LOCK(prev_object);
-	if (prev_object->type != OBJT_DEFAULT &&
-	    prev_object->type != OBJT_SWAP) {
-		VM_OBJECT_UNLOCK(prev_object);
+	VM_OBJECT_WLOCK(prev_object);
+	if ((prev_object->type != OBJT_DEFAULT &&
+	    prev_object->type != OBJT_SWAP) ||
+	    (prev_object->flags & OBJ_TMPFS_NODE) != 0) {
+		VM_OBJECT_WUNLOCK(prev_object);
 		return (FALSE);
 	}
 
@@ -2061,7 +2106,7 @@
 	 * pages not mapped to prev_entry may be in use anyway)
 	 */
 	if (prev_object->backing_object != NULL) {
-		VM_OBJECT_UNLOCK(prev_object);
+		VM_OBJECT_WUNLOCK(prev_object);
 		return (FALSE);
 	}
 
@@ -2071,7 +2116,7 @@
 
 	if ((prev_object->ref_count > 1) &&
 	    (prev_object->size != next_pindex)) {
-		VM_OBJECT_UNLOCK(prev_object);
+		VM_OBJECT_WUNLOCK(prev_object);
 		return (FALSE);
 	}
 
@@ -2092,6 +2137,7 @@
 		 */
 		if (!reserved && !swap_reserve_by_cred(ptoa(next_size),
 		    prev_object->cred)) {
+			VM_OBJECT_WUNLOCK(prev_object);
 			return (FALSE);
 		}
 		prev_object->charge += ptoa(next_size);
@@ -2125,7 +2171,7 @@
 	if (next_pindex + next_size > prev_object->size)
 		prev_object->size = next_pindex + next_size;
 
-	VM_OBJECT_UNLOCK(prev_object);
+	VM_OBJECT_WUNLOCK(prev_object);
 	return (TRUE);
 }
 
@@ -2133,9 +2179,14 @@
 vm_object_set_writeable_dirty(vm_object_t object)
 {
 
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
-	if (object->type != OBJT_VNODE)
+	VM_OBJECT_ASSERT_WLOCKED(object);
+	if (object->type != OBJT_VNODE) {
+		if ((object->flags & OBJ_TMPFS_NODE) != 0) {
+			KASSERT(object->type == OBJT_SWAP, ("non-swap tmpfs"));
+			vm_object_set_flag(object, OBJ_TMPFS_DIRTY);
+		}
 		return;
+	}
 	object->generation++;
 	if ((object->flags & OBJ_MIGHTBEDIRTY) != 0)
 		return;
@@ -2142,6 +2193,228 @@
 	vm_object_set_flag(object, OBJ_MIGHTBEDIRTY);
 }
 
+/*
+ *	vm_object_unwire:
+ *
+ *	For each page offset within the specified range of the given object,
+ *	find the highest-level page in the shadow chain and unwire it.  A page
+ *	must exist at every page offset, and the highest-level page must be
+ *	wired.
+ */
+void
+vm_object_unwire(vm_object_t object, vm_ooffset_t offset, vm_size_t length,
+    uint8_t queue)
+{
+	vm_object_t tobject;
+	vm_page_t m, tm;
+	vm_pindex_t end_pindex, pindex, tpindex;
+	int depth, locked_depth;
+
+	KASSERT((offset & PAGE_MASK) == 0,
+	    ("vm_object_unwire: offset is not page aligned"));
+	KASSERT((length & PAGE_MASK) == 0,
+	    ("vm_object_unwire: length is not a multiple of PAGE_SIZE"));
+	/* The wired count of a fictitious page never changes. */
+	if ((object->flags & OBJ_FICTITIOUS) != 0)
+		return;
+	pindex = OFF_TO_IDX(offset);
+	end_pindex = pindex + atop(length);
+	locked_depth = 1;
+	VM_OBJECT_RLOCK(object);
+	m = vm_page_find_least(object, pindex);
+	while (pindex < end_pindex) {
+		if (m == NULL || pindex < m->pindex) {
+			/*
+			 * The first object in the shadow chain doesn't
+			 * contain a page at the current index.  Therefore,
+			 * the page must exist in a backing object.
+			 */
+			tobject = object;
+			tpindex = pindex;
+			depth = 0;
+			do {
+				tpindex +=
+				    OFF_TO_IDX(tobject->backing_object_offset);
+				tobject = tobject->backing_object;
+				KASSERT(tobject != NULL,
+				    ("vm_object_unwire: missing page"));
+				if ((tobject->flags & OBJ_FICTITIOUS) != 0)
+					goto next_page;
+				depth++;
+				if (depth == locked_depth) {
+					locked_depth++;
+					VM_OBJECT_RLOCK(tobject);
+				}
+			} while ((tm = vm_page_lookup(tobject, tpindex)) ==
+			    NULL);
+		} else {
+			tm = m;
+			m = TAILQ_NEXT(m, listq);
+		}
+		vm_page_lock(tm);
+		vm_page_unwire(tm, queue);
+		vm_page_unlock(tm);
+next_page:
+		pindex++;
+	}
+	/* Release the accumulated object locks. */
+	for (depth = 0; depth < locked_depth; depth++) {
+		tobject = object->backing_object;
+		VM_OBJECT_RUNLOCK(object);
+		object = tobject;
+	}
+}
+
+struct vnode *
+vm_object_vnode(vm_object_t object)
+{
+
+	VM_OBJECT_ASSERT_LOCKED(object);
+	if (object->type == OBJT_VNODE)
+		return (object->handle);
+	if (object->type == OBJT_SWAP && (object->flags & OBJ_TMPFS) != 0)
+		return (object->un_pager.swp.swp_tmpfs);
+	return (NULL);
+}
+
+static int
+sysctl_vm_object_list(SYSCTL_HANDLER_ARGS)
+{
+	struct kinfo_vmobject *kvo;
+	char *fullpath, *freepath;
+	struct vnode *vp;
+	struct vattr va;
+	vm_object_t obj;
+	vm_page_t m;
+	int count, error;
+
+	if (req->oldptr == NULL) {
+		/*
+		 * If an old buffer has not been provided, generate an
+		 * estimate of the space needed for a subsequent call.
+		 */
+		mtx_lock(&vm_object_list_mtx);
+		count = 0;
+		TAILQ_FOREACH(obj, &vm_object_list, object_list) {
+			if (obj->type == OBJT_DEAD)
+				continue;
+			count++;
+		}
+		mtx_unlock(&vm_object_list_mtx);
+		return (SYSCTL_OUT(req, NULL, sizeof(struct kinfo_vmobject) *
+		    count * 11 / 10));
+	}
+
+	kvo = malloc(sizeof(*kvo), M_TEMP, M_WAITOK);
+	error = 0;
+
+	/*
+	 * VM objects are type stable and are never removed from the
+	 * list once added.  This allows us to safely read obj->object_list
+	 * after reacquiring the VM object list lock.
+	 */
+	mtx_lock(&vm_object_list_mtx);
+	TAILQ_FOREACH(obj, &vm_object_list, object_list) {
+		if (obj->type == OBJT_DEAD)
+			continue;
+		VM_OBJECT_RLOCK(obj);
+		if (obj->type == OBJT_DEAD) {
+			VM_OBJECT_RUNLOCK(obj);
+			continue;
+		}
+		mtx_unlock(&vm_object_list_mtx);
+		kvo->kvo_size = ptoa(obj->size);
+		kvo->kvo_resident = obj->resident_page_count;
+		kvo->kvo_ref_count = obj->ref_count;
+		kvo->kvo_shadow_count = obj->shadow_count;
+		kvo->kvo_memattr = obj->memattr;
+		kvo->kvo_active = 0;
+		kvo->kvo_inactive = 0;
+		TAILQ_FOREACH(m, &obj->memq, listq) {
+			/*
+			 * A page may belong to the object but be
+			 * dequeued and set to PQ_NONE while the
+			 * object lock is not held.  This makes the
+			 * reads of m->queue below racy, and we do not
+			 * count pages set to PQ_NONE.  However, this
+			 * sysctl is only meant to give an
+			 * approximation of the system anyway.
+			 */
+			if (m->queue == PQ_ACTIVE)
+				kvo->kvo_active++;
+			else if (m->queue == PQ_INACTIVE)
+				kvo->kvo_inactive++;
+		}
+
+		kvo->kvo_vn_fileid = 0;
+		kvo->kvo_vn_fsid = 0;
+		freepath = NULL;
+		fullpath = "";
+		vp = NULL;
+		switch (obj->type) {
+		case OBJT_DEFAULT:
+			kvo->kvo_type = KVME_TYPE_DEFAULT;
+			break;
+		case OBJT_VNODE:
+			kvo->kvo_type = KVME_TYPE_VNODE;
+			vp = obj->handle;
+			vref(vp);
+			break;
+		case OBJT_SWAP:
+			kvo->kvo_type = KVME_TYPE_SWAP;
+			break;
+		case OBJT_DEVICE:
+			kvo->kvo_type = KVME_TYPE_DEVICE;
+			break;
+		case OBJT_PHYS:
+			kvo->kvo_type = KVME_TYPE_PHYS;
+			break;
+		case OBJT_DEAD:
+			kvo->kvo_type = KVME_TYPE_DEAD;
+			break;
+		case OBJT_SG:
+			kvo->kvo_type = KVME_TYPE_SG;
+			break;
+		case OBJT_MGTDEVICE:
+			kvo->kvo_type = KVME_TYPE_MGTDEVICE;
+			break;
+		default:
+			kvo->kvo_type = KVME_TYPE_UNKNOWN;
+			break;
+		}
+		VM_OBJECT_RUNLOCK(obj);
+		if (vp != NULL) {
+			vn_fullpath(curthread, vp, &fullpath, &freepath);
+			vn_lock(vp, LK_SHARED | LK_RETRY);
+			if (VOP_GETATTR(vp, &va, curthread->td_ucred) == 0) {
+				kvo->kvo_vn_fileid = va.va_fileid;
+				kvo->kvo_vn_fsid = va.va_fsid;
+			}
+			vput(vp);
+		}
+
+		strlcpy(kvo->kvo_path, fullpath, sizeof(kvo->kvo_path));
+		if (freepath != NULL)
+			free(freepath, M_TEMP);
+
+		/* Pack record size down */
+		kvo->kvo_structsize = offsetof(struct kinfo_vmobject, kvo_path)
+		    + strlen(kvo->kvo_path) + 1;
+		kvo->kvo_structsize = roundup(kvo->kvo_structsize,
+		    sizeof(uint64_t));
+		error = SYSCTL_OUT(req, kvo, kvo->kvo_structsize);
+		mtx_lock(&vm_object_list_mtx);
+		if (error)
+			break;
+	}
+	mtx_unlock(&vm_object_list_mtx);
+	free(kvo, M_TEMP);
+	return (error);
+}
+SYSCTL_PROC(_vm, OID_AUTO, objects, CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_SKIP |
+    CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_object_list, "S,kinfo_vmobject",
+    "List of VM objects");
+
 #include "opt_ddb.h"
 #ifdef DDB
 #include <sys/kernel.h>
@@ -2206,12 +2479,6 @@
 	/* sx_sunlock(&allproc_lock); */
 	if (_vm_object_in_map(kernel_map, object, 0))
 		return 1;
-	if (_vm_object_in_map(kmem_map, object, 0))
-		return 1;
-	if (_vm_object_in_map(pager_map, object, 0))
-		return 1;
-	if (_vm_object_in_map(buffer_map, object, 0))
-		return 1;
 	return 0;
 }
 

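The sysctl_vm_object_list() handler added above exports one variable-length
kinfo_vmobject record per live VM object through the new vm.objects OID,
with each record trimmed to kvo_structsize bytes and rounded up to a
uint64_t boundary.  The sketch below shows how a userland consumer could
walk those packed records.  It is illustrative only and not part of this
patch; the <sys/user.h> location of struct kinfo_vmobject and the minimal
error handling are assumptions.

#include <sys/types.h>
#include <sys/sysctl.h>
#include <sys/user.h>		/* assumed home of struct kinfo_vmobject */

#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	struct kinfo_vmobject *kvo;
	char *buf, *bp;
	size_t len;

	/* First call sizes the buffer; the handler pads its estimate by 10%. */
	if (sysctlbyname("vm.objects", NULL, &len, NULL, 0) != 0)
		return (1);
	if ((buf = malloc(len)) == NULL ||
	    sysctlbyname("vm.objects", buf, &len, NULL, 0) != 0)
		return (1);
	bp = buf;
	while (bp < buf + len) {
		kvo = (struct kinfo_vmobject *)(void *)bp;
		printf("type %d resident %d refs %d %s\n", kvo->kvo_type,
		    kvo->kvo_resident, kvo->kvo_ref_count, kvo->kvo_path);
		/* Records are packed back to back at kvo_structsize strides. */
		bp += kvo->kvo_structsize;
	}
	free(buf);
	return (0);
}

Because the OID is registered with CTLFLAG_SKIP it does not appear in
sysctl -a listings, but it can still be queried by name as above.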
Modified: trunk/sys/vm/vm_object.h
===================================================================
--- trunk/sys/vm/vm_object.h	2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm_object.h	2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
@@ -57,7 +58,7 @@
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  *
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/vm/vm_object.h 313384 2017-02-07 08:33:46Z kib $
  */
 
 /*
@@ -70,24 +71,40 @@
 #include <sys/queue.h>
 #include <sys/_lock.h>
 #include <sys/_mutex.h>
+#include <sys/_rwlock.h>
 
+#include <vm/_vm_radix.h>
+
 /*
  *	Types defined:
  *
  *	vm_object_t		Virtual memory object.
  *
+ *	The root of the cached pages pool is protected by both the per-object
+ *	lock and the free pages queue mutex.
+ *	On insertion into the cache radix trie, the per-object lock is
+ *	expected to be held already, and the free pages queue mutex is
+ *	acquired during the operation as well.
+ *	On removal and lookup from the cache radix trie, only the free
+ *	pages queue mutex is expected to be held.
+ *	These rules make it possible to reliably check for the presence of
+ *	cached pages with only the per-object lock held, thereby reducing
+ *	contention for the free pages queue mutex.
+ *
  * List of locks
  *	(c)	const until freed
+ *	(o)	per-object lock 
+ *	(f)	free pages queue mutex
  *
  */
 
 struct vm_object {
-	struct mtx mtx;
+	struct rwlock lock;
 	TAILQ_ENTRY(vm_object) object_list; /* list of all objects */
 	LIST_HEAD(, vm_object) shadow_head; /* objects that this is a shadow for */
 	LIST_ENTRY(vm_object) shadow_list; /* chain of shadow objects */
-	TAILQ_HEAD(, vm_page) memq;	/* list of resident pages */
-	vm_page_t root;			/* root of the resident page splay tree */
+	TAILQ_HEAD(respgs, vm_page) memq; /* list of resident pages */
+	struct vm_radix rtree;		/* root of the resident page radix trie*/
 	vm_pindex_t size;		/* Object size */
 	int generation;			/* generation ID */
 	int ref_count;			/* How many refs?? */
@@ -96,13 +113,13 @@
 	objtype_t type;			/* type of pager */
 	u_short flags;			/* see below */
 	u_short pg_color;		/* (c) color of first page in obj */
-	u_short pad1;			/* Old pip counter */
+	u_int paging_in_progress;	/* Paging (in or out) so don't collapse or destroy */
 	int resident_page_count;	/* number of resident pages */
 	struct vm_object *backing_object; /* object that I'm a shadow of */
 	vm_ooffset_t backing_object_offset;/* Offset in backing object */
 	TAILQ_ENTRY(vm_object) pager_object_list; /* list of all objects of this pager type */
 	LIST_HEAD(, vm_reserv) rvq;	/* list of reservations */
-	vm_page_t cache;		/* root of the cache page splay tree */
+	struct vm_radix cache;		/* (o + f) root of the cache page radix trie */
 	void *handle;
 	union {
 		/*
@@ -123,6 +140,7 @@
 		struct {
 			TAILQ_HEAD(, vm_page) devp_pglist;
 			struct cdev_pager_ops *ops;
+			struct cdev *dev;
 		} devp;
 
 		/*
@@ -137,33 +155,46 @@
 		/*
 		 * Swap pager
 		 *
+		 *	swp_tmpfs - back-pointer to the tmpfs vnode,
+		 *		     if any, which uses the vm object
+		 *		     as backing store.  The handle
+		 *		     cannot be reused for linking,
+		 *		     because the vnode can be
+		 *		     reclaimed and recreated, which
+		 *		     would change the handle and
+		 *		     invalidate the hash chain.
+		 *
 		 *	swp_bcount - number of swap 'swblock' metablocks, each
 		 *		     contains up to 16 swapblk assignments.
 		 *		     see vm/swap_pager.h
 		 */
 		struct {
+			void *swp_tmpfs;
 			int swp_bcount;
 		} swp;
 	} un_pager;
 	struct ucred *cred;
 	vm_ooffset_t charge;
-	u_int paging_in_progress;	/* Paging (in or out) so don't collapse or destroy */
 };
 
 /*
  * Flags
  */
-#define OBJ_ACTIVE	0x0004		/* active objects */
+#define	OBJ_FICTITIOUS	0x0001		/* (c) contains fictitious pages */
+#define	OBJ_UNMANAGED	0x0002		/* (c) contains unmanaged pages */
 #define OBJ_DEAD	0x0008		/* dead objects (during rundown) */
 #define	OBJ_NOSPLIT	0x0010		/* dont split this object */
 #define OBJ_PIPWNT	0x0040		/* paging in progress wanted */
 #define OBJ_MIGHTBEDIRTY 0x0100		/* object might be dirty, only for vnode */
+#define	OBJ_TMPFS_NODE	0x0200		/* object belongs to tmpfs VREG node */
+#define	OBJ_TMPFS_DIRTY	0x0400		/* dirty tmpfs obj */
 #define	OBJ_COLORED	0x1000		/* pg_color is defined */
 #define	OBJ_ONEMAPPING	0x2000		/* One USE (a single, non-forked) mapping flag */
 #define	OBJ_DISCONNECTWNT 0x4000	/* disconnect from vnode wanted */
+#define	OBJ_TMPFS	0x8000		/* has tmpfs vnode allocated */
 
-#define IDX_TO_OFF(idx) (((vm_ooffset_t)(idx)) << PAGE_SHIFT)
-#define OFF_TO_IDX(off) ((vm_pindex_t)(((vm_ooffset_t)(off)) >> PAGE_SHIFT))
+#define	IDX_TO_OFF(idx) (((vm_ooffset_t)(idx)) << PAGE_SHIFT)
+#define	OFF_TO_IDX(off) ((vm_pindex_t)(((vm_ooffset_t)(off)) >> PAGE_SHIFT))
 
 #ifdef	_KERNEL
 
@@ -188,16 +219,32 @@
 #define	kernel_object	(&kernel_object_store)
 #define	kmem_object	(&kmem_object_store)
 
-#define	VM_OBJECT_LOCK(object)		mtx_lock(&(object)->mtx)
-#define	VM_OBJECT_LOCK_ASSERT(object, type) \
-					mtx_assert(&(object)->mtx, (type))
-#define	VM_OBJECT_LOCK_INIT(object, type) \
-					mtx_init(&(object)->mtx, "vm object", \
-					    (type), MTX_DEF | MTX_DUPOK)
-#define	VM_OBJECT_LOCKED(object)	mtx_owned(&(object)->mtx)
-#define	VM_OBJECT_MTX(object)		(&(object)->mtx)
-#define	VM_OBJECT_TRYLOCK(object)	mtx_trylock(&(object)->mtx)
-#define	VM_OBJECT_UNLOCK(object)	mtx_unlock(&(object)->mtx)
+#define	VM_OBJECT_ASSERT_LOCKED(object)					\
+	rw_assert(&(object)->lock, RA_LOCKED)
+#define	VM_OBJECT_ASSERT_RLOCKED(object)				\
+	rw_assert(&(object)->lock, RA_RLOCKED)
+#define	VM_OBJECT_ASSERT_WLOCKED(object)				\
+	rw_assert(&(object)->lock, RA_WLOCKED)
+#define	VM_OBJECT_ASSERT_UNLOCKED(object)				\
+	rw_assert(&(object)->lock, RA_UNLOCKED)
+#define	VM_OBJECT_LOCK_DOWNGRADE(object)				\
+	rw_downgrade(&(object)->lock)
+#define	VM_OBJECT_RLOCK(object)						\
+	rw_rlock(&(object)->lock)
+#define	VM_OBJECT_RUNLOCK(object)					\
+	rw_runlock(&(object)->lock)
+#define	VM_OBJECT_SLEEP(object, wchan, pri, wmesg, timo)		\
+	rw_sleep((wchan), &(object)->lock, (pri), (wmesg), (timo))
+#define	VM_OBJECT_TRYRLOCK(object)					\
+	rw_try_rlock(&(object)->lock)
+#define	VM_OBJECT_TRYWLOCK(object)					\
+	rw_try_wlock(&(object)->lock)
+#define	VM_OBJECT_TRYUPGRADE(object)					\
+	rw_try_upgrade(&(object)->lock)
+#define	VM_OBJECT_WLOCK(object)						\
+	rw_wlock(&(object)->lock)
+#define	VM_OBJECT_WUNLOCK(object)					\
+	rw_wunlock(&(object)->lock)
 
 /*
  *	The object must be locked or thread private.
@@ -216,8 +263,14 @@
 void vm_object_pip_wakeupn(vm_object_t object, short i);
 void vm_object_pip_wait(vm_object_t object, char *waitid);
 
+static __inline boolean_t
+vm_object_cache_is_empty(vm_object_t object)
+{
+
+	return (vm_radix_is_empty(&object->cache));
+}
+
 vm_object_t vm_object_allocate (objtype_t, vm_pindex_t);
-void _vm_object_allocate (objtype_t, vm_pindex_t, vm_object_t);
 boolean_t vm_object_coalesce(vm_object_t, vm_ooffset_t, vm_size_t, vm_size_t,
    boolean_t);
 void vm_object_collapse (vm_object_t);
@@ -242,6 +295,9 @@
 void vm_object_split(vm_map_entry_t);
 boolean_t vm_object_sync(vm_object_t, vm_ooffset_t, vm_size_t, boolean_t,
     boolean_t);
+void vm_object_unwire(vm_object_t object, vm_ooffset_t offset,
+    vm_size_t length, uint8_t queue);
+struct vnode *vm_object_vnode(vm_object_t object);
 #endif				/* _KERNEL */
 
 #endif				/* _VM_OBJECT_ */

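The macros above complete the conversion of the per-object lock from a sleep
mutex to a reader/writer lock, which is what the VM_OBJECT_WLOCK()/
VM_OBJECT_RLOCK() substitutions throughout vm_object.c rely on.  A minimal
sketch of the intended usage split follows; both functions are hypothetical
examples written against the macros in this header, not code from the patch,
and the include list is abbreviated.

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/rwlock.h>

#include <vm/vm.h>
#include <vm/vm_object.h>

/*
 * Read-only consumers can now take the object lock shared, so they no
 * longer serialize against one another.
 */
static int
example_resident_count(vm_object_t object)
{
	int resident;

	VM_OBJECT_RLOCK(object);
	resident = object->resident_page_count;
	VM_OBJECT_RUNLOCK(object);
	return (resident);
}

/*
 * Paths that modify object state still take the lock exclusive, matching
 * the VM_OBJECT_ASSERT_WLOCKED() checks added throughout vm_object.c.
 */
static void
example_clear_onemapping(vm_object_t object)
{

	VM_OBJECT_WLOCK(object);
	vm_object_clear_flag(object, OBJ_ONEMAPPING);
	VM_OBJECT_WUNLOCK(object);
}

In the patch itself the read side is used by vm_object_unwire() and
sysctl_vm_object_list(), while all mutating paths keep the exclusive lock.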
Modified: trunk/sys/vm/vm_page.c
===================================================================
--- trunk/sys/vm/vm_page.c	2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm_page.c	2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1991 Regents of the University of California.
  * All rights reserved.
@@ -63,11 +64,16 @@
 /*
  *			GENERAL RULES ON VM_PAGE MANIPULATION
  *
- *	- a pageq mutex is required when adding or removing a page from a
- *	  page queue (vm_page_queue[]), regardless of other mutexes or the
- *	  busy state of a page.
+ *	- A page queue lock is required when adding or removing a page from a
+ *	  page queue regardless of other locks or the busy state of a page.
  *
- *	- The object mutex is held when inserting or removing
+ *		* In general, no thread besides the page daemon can acquire or
+ *		  hold more than one page queue lock at a time.
+ *
+ *		* The page daemon can acquire and hold any pair of page queue
+ *		  locks in any order.
+ *
+ *	- The object lock is required when inserting or removing
  *	  pages from an object (vm_page_insert() or vm_page_remove()).
  *
  */
@@ -77,7 +83,7 @@
  */
 
 #include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/vm/vm_page.c 320190 2017-06-21 14:39:31Z jhb $");
 
 #include "opt_vm.h"
 
@@ -87,9 +93,11 @@
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/malloc.h>
+#include <sys/mman.h>
 #include <sys/msgbuf.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
+#include <sys/rwlock.h>
 #include <sys/sysctl.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
@@ -103,6 +111,7 @@
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_phys.h>
+#include <vm/vm_radix.h>
 #include <vm/vm_reserv.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
@@ -115,11 +124,10 @@
  *	page structure.
  */
 
-struct vpgqueues vm_page_queues[PQ_COUNT];
-struct vpglocks vm_page_queue_lock;
-struct vpglocks vm_page_queue_free_lock;
+struct vm_domain vm_dom[MAXMEMDOM];
+struct mtx_padalign vm_page_queue_free_mtx;
 
-struct vpglocks	pa_lock[PA_LOCK_COUNT];
+struct mtx_padalign pa_lock[PA_LOCK_COUNT];
 
 vm_page_t vm_page_array;
 long vm_page_array_size;
@@ -131,16 +139,21 @@
 SYSCTL_INT(_vm, OID_AUTO, boot_pages, CTLFLAG_RD, &boot_pages, 0,
 	"number of pages allocated for bootstrapping the VM system");
 
-int pa_tryrelock_restart;
+static int pa_tryrelock_restart;
 SYSCTL_INT(_vm, OID_AUTO, tryrelock_restart, CTLFLAG_RD,
     &pa_tryrelock_restart, 0, "Number of tryrelock restarts");
 
 static uma_zone_t fakepg_zone;
 
+static struct vnode *vm_page_alloc_init(vm_page_t m);
+static void vm_page_cache_turn_free(vm_page_t m);
 static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits);
-static void vm_page_queue_remove(int queue, vm_page_t m);
 static void vm_page_enqueue(int queue, vm_page_t m);
 static void vm_page_init_fakepg(void *dummy);
+static int vm_page_insert_after(vm_page_t m, vm_object_t object,
+    vm_pindex_t pindex, vm_page_t mpred);
+static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object,
+    vm_page_t mpred);
 
 SYSINIT(vm_page, SI_SUB_VM, SI_ORDER_SECOND, vm_page_init_fakepg, NULL);
 
@@ -233,20 +246,46 @@
 	return (0);
 }
 
+static void
+vm_page_domain_init(struct vm_domain *vmd)
+{
+	struct vm_pagequeue *pq;
+	int i;
+
+	*__DECONST(char **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_name) =
+	    "vm inactive pagequeue";
+	*__DECONST(u_int **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_vcnt) =
+	    &cnt.v_inactive_count;
+	*__DECONST(char **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_name) =
+	    "vm active pagequeue";
+	*__DECONST(u_int **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_vcnt) =
+	    &cnt.v_active_count;
+	vmd->vmd_page_count = 0;
+	vmd->vmd_free_count = 0;
+	vmd->vmd_segs = 0;
+	vmd->vmd_oom = FALSE;
+	vmd->vmd_pass = 0;
+	for (i = 0; i < PQ_COUNT; i++) {
+		pq = &vmd->vmd_pagequeues[i];
+		TAILQ_INIT(&pq->pq_pl);
+		mtx_init(&pq->pq_mutex, pq->pq_name, "vm pagequeue",
+		    MTX_DEF | MTX_DUPOK);
+	}
+}
+
 /*
  *	vm_page_startup:
  *
- *	Initializes the resident memory module.
- *
- *	Allocates memory for the page cells, and
- *	for the object/offset-to-page hash table headers.
- *	Each page cell is initialized and placed on the free list.
+ *	Initializes the resident memory module.  Allocates physical memory for
+ *	bootstrapping UMA and some data structures that are used to manage
+ *	physical pages.  Initializes these structures, and populates the free
+ *	page queues.
  */
 vm_offset_t
 vm_page_startup(vm_offset_t vaddr)
 {
 	vm_offset_t mapped;
-	vm_paddr_t page_range;
+	vm_paddr_t high_avail, low_avail, page_range, size;
 	vm_paddr_t new_end;
 	int i;
 	vm_paddr_t pa;
@@ -256,7 +295,6 @@
 	/* the biggest memory array is the second group of pages */
 	vm_paddr_t end;
 	vm_paddr_t biggestsize;
-	vm_paddr_t low_water, high_water;
 	int biggestone;
 
 	biggestsize = 0;
@@ -268,48 +306,34 @@
 		phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
 	}
 
-	low_water = phys_avail[0];
-	high_water = phys_avail[1];
+#ifdef XEN
+	/*
+	 * There is no obvious reason why i386 PV Xen needs vm_page structs
+	 * created for these pseudo-physical addresses.  XXX
+	 */
+	vm_phys_add_seg(0, phys_avail[0]);
+#endif
 
 	for (i = 0; phys_avail[i + 1]; i += 2) {
-		vm_paddr_t size = phys_avail[i + 1] - phys_avail[i];
-
+		size = phys_avail[i + 1] - phys_avail[i];
 		if (size > biggestsize) {
 			biggestone = i;
 			biggestsize = size;
 		}
-		if (phys_avail[i] < low_water)
-			low_water = phys_avail[i];
-		if (phys_avail[i + 1] > high_water)
-			high_water = phys_avail[i + 1];
 	}
 
-#ifdef XEN
-	low_water = 0;
-#endif	
-
 	end = phys_avail[biggestone+1];
 
 	/*
 	 * Initialize the page and queue locks.
 	 */
-	mtx_init(&vm_page_queue_mtx, "vm page queue", NULL, MTX_DEF |
-	    MTX_RECURSE);
 	mtx_init(&vm_page_queue_free_mtx, "vm page free queue", NULL, MTX_DEF);
 	for (i = 0; i < PA_LOCK_COUNT; i++)
-		mtx_init(&pa_lock[i].data, "vm page", NULL, MTX_DEF);
+		mtx_init(&pa_lock[i], "vm page", NULL, MTX_DEF);
+	for (i = 0; i < vm_ndomains; i++)
+		vm_page_domain_init(&vm_dom[i]);
 
 	/*
-	 * Initialize the queue headers for the hold queue, the active queue,
-	 * and the inactive queue.
-	 */
-	for (i = 0; i < PQ_COUNT; i++)
-		TAILQ_INIT(&vm_page_queues[i].pl);
-	vm_page_queues[PQ_INACTIVE].cnt = &cnt.v_inactive_count;
-	vm_page_queues[PQ_ACTIVE].cnt = &cnt.v_active_count;
-	vm_page_queues[PQ_HOLD].cnt = &cnt.v_active_count;
-
-	/*
 	 * Allocate memory for use when boot strapping the kernel memory
 	 * allocator.
 	 */
@@ -344,6 +368,16 @@
 	    new_end + vm_page_dump_size, VM_PROT_READ | VM_PROT_WRITE);
 	bzero((void *)vm_page_dump, vm_page_dump_size);
 #endif
+#if defined(__amd64__) || defined(__mips__)
+	/*
+	 * Include the UMA bootstrap pages and vm_page_dump in a crash dump.
+	 * When pmap_map() uses the direct map, they are not automatically 
+	 * included.
+	 */
+	for (pa = new_end; pa < end; pa += PAGE_SIZE)
+		dump_add_page(pa);
+#endif
+	phys_avail[biggestone + 1] = new_end;
 #ifdef __amd64__
 	/*
 	 * Request that the physical pages underlying the message buffer be
@@ -359,29 +393,80 @@
 #endif
 	/*
 	 * Compute the number of pages of memory that will be available for
-	 * use (taking into account the overhead of a page structure per
-	 * page).
+	 * use, taking into account the overhead of a page structure per page.
+	 * In other words, solve
+	 *	"available physical memory" - round_page(page_range *
+	 *	    sizeof(struct vm_page)) = page_range * PAGE_SIZE 
+	 * for page_range.  
 	 */
-	first_page = low_water / PAGE_SIZE;
+	low_avail = phys_avail[0];
+	high_avail = phys_avail[1];
+	for (i = 0; i < vm_phys_nsegs; i++) {
+		if (vm_phys_segs[i].start < low_avail)
+			low_avail = vm_phys_segs[i].start;
+		if (vm_phys_segs[i].end > high_avail)
+			high_avail = vm_phys_segs[i].end;
+	}
+	/* Skip the first chunk.  It is already accounted for. */
+	for (i = 2; phys_avail[i + 1] != 0; i += 2) {
+		if (phys_avail[i] < low_avail)
+			low_avail = phys_avail[i];
+		if (phys_avail[i + 1] > high_avail)
+			high_avail = phys_avail[i + 1];
+	}
+	first_page = low_avail / PAGE_SIZE;
 #ifdef VM_PHYSSEG_SPARSE
-	page_range = 0;
+	size = 0;
+	for (i = 0; i < vm_phys_nsegs; i++)
+		size += vm_phys_segs[i].end - vm_phys_segs[i].start;
 	for (i = 0; phys_avail[i + 1] != 0; i += 2)
-		page_range += atop(phys_avail[i + 1] - phys_avail[i]);
+		size += phys_avail[i + 1] - phys_avail[i];
 #elif defined(VM_PHYSSEG_DENSE)
-	page_range = high_water / PAGE_SIZE - first_page;
+	size = high_avail - low_avail;
 #else
 #error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined."
 #endif
+
+#ifdef VM_PHYSSEG_DENSE
+	/*
+	 * In the VM_PHYSSEG_DENSE case, the number of pages can account for
+	 * the overhead of a page structure per page only if vm_page_array is
+	 * allocated from the last physical memory chunk.  Otherwise, we must
+	 * allocate page structures representing the physical memory
+	 * underlying vm_page_array, even though they will not be used.
+	 */
+	if (new_end != high_avail)
+		page_range = size / PAGE_SIZE;
+	else
+#endif
+	{
+		page_range = size / (PAGE_SIZE + sizeof(struct vm_page));
+
+		/*
+		 * If the partial bytes remaining are large enough for
+		 * a page (PAGE_SIZE) without a corresponding
+		 * 'struct vm_page', then new_end will contain an
+		 * extra page after subtracting the length of the VM
+		 * page array.  Compensate by subtracting an extra
+		 * page from new_end.
+		 */
+		if (size % (PAGE_SIZE + sizeof(struct vm_page)) >= PAGE_SIZE) {
+			if (new_end == high_avail)
+				high_avail -= PAGE_SIZE;
+			new_end -= PAGE_SIZE;
+		}
+	}
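
A worked instance of the equation in the comment above, as a hedged sketch only: the 104-byte size of struct vm_page is an assumption (it varies by architecture), and PAGE_SIZE is taken as 4096.

	/*
	 * Illustrative arithmetic, not part of the patch: each managed page
	 * costs PAGE_SIZE + sizeof(struct vm_page) = 4096 + 104 = 4200 bytes,
	 * so for size = 1 GB,
	 *
	 *	page_range = (1UL << 30) / 4200 = 255652 pages,
	 *
	 * and page_range * PAGE_SIZE + round_page(page_range * 104)
	 * = 1047150592 + 26591232 = 1073741824, i.e. the available memory is
	 * split exactly between usable pages and the vm_page array in this
	 * case; the remainder check above handles the cases where the split
	 * is not exact.
	 */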
 	end = new_end;
 
 	/*
 	 * Reserve an unmapped guard page to trap access to vm_page_array[-1].
+	 * However, because this page is allocated from KVM, out-of-bounds
+	 * accesses using the direct map will not be trapped.
 	 */
 	vaddr += PAGE_SIZE;
 
 	/*
-	 * Initialize the mem entry structures now, and put them in the free
-	 * queue.
+	 * Allocate physical memory for the page structures, and map it.
 	 */
 	new_end = trunc_page(end - page_range * sizeof(struct vm_page));
 	mapped = pmap_map(&vaddr, new_end, end,
@@ -389,24 +474,30 @@
 	vm_page_array = (vm_page_t) mapped;
 #if VM_NRESERVLEVEL > 0
 	/*
-	 * Allocate memory for the reservation management system's data
-	 * structures.
+	 * Allocate physical memory for the reservation management system's
+	 * data structures, and map it.
 	 */
-	new_end = vm_reserv_startup(&vaddr, new_end, high_water);
+	if (high_avail == end)
+		high_avail = new_end;
+	new_end = vm_reserv_startup(&vaddr, new_end, high_avail);
 #endif
 #if defined(__amd64__) || defined(__mips__)
 	/*
-	 * pmap_map on amd64 and mips can come out of the direct-map, not kvm
-	 * like i386, so the pages must be tracked for a crashdump to include
-	 * this data.  This includes the vm_page_array and the early UMA
-	 * bootstrap pages.
+	 * Include vm_page_array and vm_reserv_array in a crash dump.
 	 */
-	for (pa = new_end; pa < phys_avail[biggestone + 1]; pa += PAGE_SIZE)
+	for (pa = new_end; pa < end; pa += PAGE_SIZE)
 		dump_add_page(pa);
 #endif	
 	phys_avail[biggestone + 1] = new_end;
 
 	/*
+	 * Add physical memory segments corresponding to the available
+	 * physical pages.
+	 */
+	for (i = 0; phys_avail[i + 1] != 0; i += 2)
+		vm_phys_add_seg(phys_avail[i], phys_avail[i + 1]);
+
+	/*
 	 * Clear all of the page structures
 	 */
 	bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page));
@@ -449,130 +540,191 @@
 	return (vaddr);
 }
 
-
-CTASSERT(offsetof(struct vm_page, aflags) % sizeof(uint32_t) == 0);
-
 void
-vm_page_aflag_set(vm_page_t m, uint8_t bits)
+vm_page_reference(vm_page_t m)
 {
-	uint32_t *addr, val;
 
-	/*
-	 * The PGA_WRITEABLE flag can only be set if the page is managed and
-	 * VPO_BUSY.  Currently, this flag is only set by pmap_enter().
-	 */
-	KASSERT((bits & PGA_WRITEABLE) == 0 ||
-	    (m->oflags & (VPO_UNMANAGED | VPO_BUSY)) == VPO_BUSY,
-	    ("PGA_WRITEABLE and !VPO_BUSY"));
+	vm_page_aflag_set(m, PGA_REFERENCED);
+}
 
-	/*
-	 * We want to use atomic updates for m->aflags, which is a
-	 * byte wide.  Not all architectures provide atomic operations
-	 * on the single-byte destination.  Punt and access the whole
-	 * 4-byte word with an atomic update.  Parallel non-atomic
-	 * updates to the fields included in the update by proximity
-	 * are handled properly by atomics.
-	 */
-	addr = (void *)&m->aflags;
-	MPASS(((uintptr_t)addr & (sizeof(uint32_t) - 1)) == 0);
-	val = bits;
-#if BYTE_ORDER == BIG_ENDIAN
-	val <<= 24;
-#endif
-	atomic_set_32(addr, val);
-} 
-
+/*
+ *	vm_page_busy_downgrade:
+ *
+ *	Downgrade an exclusive busy page into a single shared busy page.
+ */
 void
-vm_page_aflag_clear(vm_page_t m, uint8_t bits)
+vm_page_busy_downgrade(vm_page_t m)
 {
-	uint32_t *addr, val;
+	u_int x;
+	bool locked;
 
-	/*
-	 * The PGA_REFERENCED flag can only be cleared if the object
-	 * containing the page is locked.
-	 */
-	KASSERT((bits & PGA_REFERENCED) == 0 || VM_OBJECT_LOCKED(m->object),
-	    ("PGA_REFERENCED and !VM_OBJECT_LOCKED"));
+	vm_page_assert_xbusied(m);
+	locked = mtx_owned(vm_page_lockptr(m));
 
-	/*
-	 * See the comment in vm_page_aflag_set().
-	 */
-	addr = (void *)&m->aflags;
-	MPASS(((uintptr_t)addr & (sizeof(uint32_t) - 1)) == 0);
-	val = bits;
-#if BYTE_ORDER == BIG_ENDIAN
-	val <<= 24;
-#endif
-	atomic_clear_32(addr, val);
+	for (;;) {
+		x = m->busy_lock;
+		x &= VPB_BIT_WAITERS;
+		if (x != 0 && !locked)
+			vm_page_lock(m);
+		if (atomic_cmpset_rel_int(&m->busy_lock,
+		    VPB_SINGLE_EXCLUSIVER | x, VPB_SHARERS_WORD(1)))
+			break;
+		if (x != 0 && !locked)
+			vm_page_unlock(m);
+	}
+	if (x != 0) {
+		wakeup(m);
+		if (!locked)
+			vm_page_unlock(m);
+	}
 }
 
-void
-vm_page_reference(vm_page_t m)
+/*
+ *	vm_page_sbusied:
+ *
+ *	Return a positive value if the page is shared busied, 0 otherwise.
+ */
+int
+vm_page_sbusied(vm_page_t m)
 {
+	u_int x;
 
-	vm_page_aflag_set(m, PGA_REFERENCED);
+	x = m->busy_lock;
+	return ((x & VPB_BIT_SHARED) != 0 && x != VPB_UNBUSIED);
 }
 
+/*
+ *	vm_page_sunbusy:
+ *
+ *	Shared unbusy a page.
+ */
 void
-vm_page_busy(vm_page_t m)
+vm_page_sunbusy(vm_page_t m)
 {
+	u_int x;
 
-	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
-	KASSERT((m->oflags & VPO_BUSY) == 0,
-	    ("vm_page_busy: page already busy!!!"));
-	m->oflags |= VPO_BUSY;
+	vm_page_assert_sbusied(m);
+
+	for (;;) {
+		x = m->busy_lock;
+		if (VPB_SHARERS(x) > 1) {
+			if (atomic_cmpset_int(&m->busy_lock, x,
+			    x - VPB_ONE_SHARER))
+				break;
+			continue;
+		}
+		if ((x & VPB_BIT_WAITERS) == 0) {
+			KASSERT(x == VPB_SHARERS_WORD(1),
+			    ("vm_page_sunbusy: invalid lock state"));
+			if (atomic_cmpset_int(&m->busy_lock,
+			    VPB_SHARERS_WORD(1), VPB_UNBUSIED))
+				break;
+			continue;
+		}
+		KASSERT(x == (VPB_SHARERS_WORD(1) | VPB_BIT_WAITERS),
+		    ("vm_page_sunbusy: invalid lock state for waiters"));
+
+		vm_page_lock(m);
+		if (!atomic_cmpset_int(&m->busy_lock, x, VPB_UNBUSIED)) {
+			vm_page_unlock(m);
+			continue;
+		}
+		wakeup(m);
+		vm_page_unlock(m);
+		break;
+	}
 }
 
 /*
- *      vm_page_flash:
+ *	vm_page_busy_sleep:
  *
- *      wakeup anyone waiting for the page.
+ *	Sleep and release the page lock, using the page pointer as wchan.
+ *	This is used to implement the hard path of the busying mechanism.
+ *
+ *	The given page must be locked.
+ *
+ *	If nonshared is true, sleep only if the page is xbusy.
  */
 void
-vm_page_flash(vm_page_t m)
+vm_page_busy_sleep(vm_page_t m, const char *wmesg, bool nonshared)
 {
+	u_int x;
 
-	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
-	if (m->oflags & VPO_WANTED) {
-		m->oflags &= ~VPO_WANTED;
-		wakeup(m);
+	vm_page_assert_locked(m);
+
+	x = m->busy_lock;
+	if (x == VPB_UNBUSIED || (nonshared && (x & VPB_BIT_SHARED) != 0) ||
+	    ((x & VPB_BIT_WAITERS) == 0 &&
+	    !atomic_cmpset_int(&m->busy_lock, x, x | VPB_BIT_WAITERS))) {
+		vm_page_unlock(m);
+		return;
 	}
+	msleep(m, vm_page_lockptr(m), PVM | PDROP, wmesg, 0);
 }
 
 /*
- *      vm_page_wakeup:
+ *	vm_page_trysbusy:
  *
- *      clear the VPO_BUSY flag and wakeup anyone waiting for the
- *      page.
- *
+ *	Try to shared busy a page.
+ *	Returns 1 if the operation succeeds; otherwise, returns 0.
+ *	The operation never sleeps.
  */
-void
-vm_page_wakeup(vm_page_t m)
+int
+vm_page_trysbusy(vm_page_t m)
 {
+	u_int x;
 
-	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
-	KASSERT(m->oflags & VPO_BUSY, ("vm_page_wakeup: page not busy!!!"));
-	m->oflags &= ~VPO_BUSY;
-	vm_page_flash(m);
+	for (;;) {
+		x = m->busy_lock;
+		if ((x & VPB_BIT_SHARED) == 0)
+			return (0);
+		if (atomic_cmpset_acq_int(&m->busy_lock, x, x + VPB_ONE_SHARER))
+			return (1);
+	}
 }
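
A hedged sketch of how the shared-busy primitives above pair up in a caller; the error value and the read-only work are hypothetical:

	if (vm_page_trysbusy(m) == 0)
		return (EBUSY);		/* hypothetical caller error path */
	/* ... read-only access to the page contents ... */
	vm_page_sunbusy(m);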
 
+/*
+ *	vm_page_xunbusy_hard:
+ *
+ *	Called after the first attempt to exclusively unbusy a page has failed.
+ *	It is assumed that the waiters bit is on.
+ */
 void
-vm_page_io_start(vm_page_t m)
+vm_page_xunbusy_hard(vm_page_t m)
 {
 
-	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
-	m->busy++;
+	vm_page_assert_xbusied(m);
+
+	vm_page_lock(m);
+	atomic_store_rel_int(&m->busy_lock, VPB_UNBUSIED);
+	wakeup(m);
+	vm_page_unlock(m);
 }
 
+/*
+ *	vm_page_flash:
+ *
+ *	Wakeup anyone waiting for the page.
+ *	The ownership bits do not change.
+ *
+ *	The given page must be locked.
+ */
 void
-vm_page_io_finish(vm_page_t m)
+vm_page_flash(vm_page_t m)
 {
+	u_int x;
 
-	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
-	KASSERT(m->busy > 0, ("vm_page_io_finish: page %p is not busy", m));
-	m->busy--;
-	if (m->busy == 0)
-		vm_page_flash(m);
+	vm_page_lock_assert(m, MA_OWNED);
+
+	for (;;) {
+		x = m->busy_lock;
+		if ((x & VPB_BIT_WAITERS) == 0)
+			return;
+		if (atomic_cmpset_int(&m->busy_lock, x,
+		    x & (~VPB_BIT_WAITERS)))
+			break;
+	}
+	wakeup(m);
 }
 
 /*
@@ -594,9 +746,9 @@
 {
 
 	vm_page_lock_assert(mem, MA_OWNED);
+	KASSERT(mem->hold_count >= 1, ("vm_page_unhold: hold count < 0!!!"));
 	--mem->hold_count;
-	KASSERT(mem->hold_count >= 0, ("vm_page_unhold: hold count < 0!!!"));
-	if (mem->hold_count == 0 && mem->queue == PQ_HOLD)
+	if (mem->hold_count == 0 && (mem->flags & PG_UNHOLDFREE) != 0)
 		vm_page_free_toq(mem);
 }
 
@@ -687,8 +839,10 @@
 	/* Fictitious pages don't use "segind". */
 	m->flags = PG_FICTITIOUS;
 	/* Fictitious pages don't use "order" or "pool". */
-	m->oflags = VPO_BUSY | VPO_UNMANAGED;
+	m->oflags = VPO_UNMANAGED;
+	m->busy_lock = VPB_SINGLE_EXCLUSIVER;
 	m->wire_count = 1;
+	pmap_page_init(m);
 memattr:
 	pmap_page_set_memattr(m, memattr);
 }
@@ -766,16 +920,13 @@
 		 * deactivating the page is usually the best choice,
 		 * unless the page is wanted by another thread.
 		 */
-		if (m->oflags & VPO_WANTED) {
-			vm_page_lock(m);
+		vm_page_lock(m);
+		if ((m->busy_lock & VPB_BIT_WAITERS) != 0)
 			vm_page_activate(m);
-			vm_page_unlock(m);
-		} else {
-			vm_page_lock(m);
+		else
 			vm_page_deactivate(m);
-			vm_page_unlock(m);
-		}
-		vm_page_wakeup(m);
+		vm_page_unlock(m);
+		vm_page_xunbusy(m);
 	} else {
 		/*
 		 * Free the completely invalid page.  Such page state
@@ -790,35 +941,42 @@
 }
 
 /*
- *	vm_page_sleep:
+ *	vm_page_sleep_if_busy:
  *
- *	Sleep and release the page and page queues locks.
+ *	Sleep and release the page queues lock if the page is busied.
+ *	Returns TRUE if the thread slept.
  *
- *	The object containing the given page must be locked.
+ *	The given page must be unlocked and the object containing it must
+ *	be locked.
  */
-void
-vm_page_sleep(vm_page_t m, const char *msg)
+int
+vm_page_sleep_if_busy(vm_page_t m, const char *msg)
 {
+	vm_object_t obj;
 
-	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
-	if (mtx_owned(&vm_page_queue_mtx))
-		vm_page_unlock_queues();
-	if (mtx_owned(vm_page_lockptr(m)))
-		vm_page_unlock(m);
+	vm_page_lock_assert(m, MA_NOTOWNED);
+	VM_OBJECT_ASSERT_WLOCKED(m->object);
 
-	/*
-	 * It's possible that while we sleep, the page will get
-	 * unbusied and freed.  If we are holding the object
-	 * lock, we will assume we hold a reference to the object
-	 * such that even if m->object changes, we can re-lock
-	 * it.
-	 */
-	m->oflags |= VPO_WANTED;
-	msleep(m, VM_OBJECT_MTX(m->object), PVM, msg, 0);
+	if (vm_page_busied(m)) {
+		/*
+		 * The page-specific object must be cached because the page's
+		 * identity can change during the sleep, which could cause a
+		 * different object to be re-locked afterwards.
+		 * It is assumed that the caller already holds a reference to
+		 * the object.
+		 */
+		obj = m->object;
+		vm_page_lock(m);
+		VM_OBJECT_WUNLOCK(obj);
+		vm_page_busy_sleep(m, msg, false);
+		VM_OBJECT_WLOCK(obj);
+		return (TRUE);
+	}
+	return (FALSE);
 }
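
Since a TRUE return means the object lock was dropped and reacquired, callers normally restart their lookup.  A minimal sketch, with a hypothetical wait message and retry label:

retry:
	m = vm_page_lookup(object, pindex);
	if (m != NULL && vm_page_sleep_if_busy(m, "pgbusy"))
		goto retry;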
 
 /*
- *	vm_page_dirty:
+ *	vm_page_dirty_KBI:		[ internal use only ]
  *
  *	Set all bits in the page's dirty field.
  *
@@ -826,11 +984,14 @@
  *	call is made from the machine-independent layer.
  *
  *	See vm_page_clear_dirty_mask().
+ *
+ *	This function should only be called by vm_page_dirty().
  */
 void
-vm_page_dirty(vm_page_t m)
+vm_page_dirty_KBI(vm_page_t m)
 {
 
+	/* These assertions refer to this operation by its public name. */
 	KASSERT((m->flags & PG_CACHED) == 0,
 	    ("vm_page_dirty: page in cache!"));
 	KASSERT(!VM_PAGE_IS_FREE(m),
@@ -841,77 +1002,52 @@
 }
 
 /*
- *	vm_page_splay:
+ *	vm_page_insert:		[ internal use only ]
  *
- *	Implements Sleator and Tarjan's top-down splay algorithm.  Returns
- *	the vm_page containing the given pindex.  If, however, that
- *	pindex is not found in the vm_object, returns a vm_page that is
- *	adjacent to the pindex, coming before or after it.
+ *	Inserts the given mem entry into the object and object list.
+ *
+ *	The object must be locked.
  */
-vm_page_t
-vm_page_splay(vm_pindex_t pindex, vm_page_t root)
+int
+vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
 {
-	struct vm_page dummy;
-	vm_page_t lefttreemax, righttreemin, y;
+	vm_page_t mpred;
 
-	if (root == NULL)
-		return (root);
-	lefttreemax = righttreemin = &dummy;
-	for (;; root = y) {
-		if (pindex < root->pindex) {
-			if ((y = root->left) == NULL)
-				break;
-			if (pindex < y->pindex) {
-				/* Rotate right. */
-				root->left = y->right;
-				y->right = root;
-				root = y;
-				if ((y = root->left) == NULL)
-					break;
-			}
-			/* Link into the new root's right tree. */
-			righttreemin->left = root;
-			righttreemin = root;
-		} else if (pindex > root->pindex) {
-			if ((y = root->right) == NULL)
-				break;
-			if (pindex > y->pindex) {
-				/* Rotate left. */
-				root->right = y->left;
-				y->left = root;
-				root = y;
-				if ((y = root->right) == NULL)
-					break;
-			}
-			/* Link into the new root's left tree. */
-			lefttreemax->right = root;
-			lefttreemax = root;
-		} else
-			break;
-	}
-	/* Assemble the new root. */
-	lefttreemax->right = root->left;
-	righttreemin->left = root->right;
-	root->left = dummy.right;
-	root->right = dummy.left;
-	return (root);
+	VM_OBJECT_ASSERT_WLOCKED(object);
+	mpred = vm_radix_lookup_le(&object->rtree, pindex);
+	return (vm_page_insert_after(m, object, pindex, mpred));
 }
 
 /*
- *	vm_page_insert:		[ internal use only ]
+ *	vm_page_insert_after:
  *
- *	Inserts the given mem entry into the object and object list.
+ *	Inserts the page "m" into the specified object at offset "pindex".
  *
+ *	The page "mpred" must immediately precede the offset "pindex" within
+ *	the specified object.
+ *
  *	The object must be locked.
  */
-void
-vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
+static int
+vm_page_insert_after(vm_page_t m, vm_object_t object, vm_pindex_t pindex,
+    vm_page_t mpred)
 {
-	vm_page_t root;
+	vm_page_t msucc;
 
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
-	if (m->object != NULL)
-		panic("vm_page_insert: page already inserted");
+	VM_OBJECT_ASSERT_WLOCKED(object);
+	KASSERT(m->object == NULL,
+	    ("vm_page_insert_after: page already inserted"));
+	if (mpred != NULL) {
+		KASSERT(mpred->object == object,
+		    ("vm_page_insert_after: object doesn't contain mpred"));
+		KASSERT(mpred->pindex < pindex,
+		    ("vm_page_insert_after: mpred doesn't precede pindex"));
+		msucc = TAILQ_NEXT(mpred, listq);
+	} else
+		msucc = TAILQ_FIRST(&object->memq);
+	if (msucc != NULL)
+		KASSERT(msucc->pindex > pindex,
+		    ("vm_page_insert_after: msucc doesn't succeed pindex"));
 
 	/*
 	 * Record the object/offset pair in this page
@@ -922,29 +1058,45 @@
 	/*
 	 * Now link into the object's ordered list of backed pages.
 	 */
-	root = object->root;
-	if (root == NULL) {
-		m->left = NULL;
-		m->right = NULL;
-		TAILQ_INSERT_TAIL(&object->memq, m, listq);
-	} else {
-		root = vm_page_splay(pindex, root);
-		if (pindex < root->pindex) {
-			m->left = root->left;
-			m->right = root;
-			root->left = NULL;
-			TAILQ_INSERT_BEFORE(root, m, listq);
-		} else if (pindex == root->pindex)
-			panic("vm_page_insert: offset already allocated");
-		else {
-			m->right = root->right;
-			m->left = root;
-			root->right = NULL;
-			TAILQ_INSERT_AFTER(&object->memq, root, m, listq);
-		}
+	if (vm_radix_insert(&object->rtree, m)) {
+		m->object = NULL;
+		m->pindex = 0;
+		return (1);
 	}
-	object->root = m;
+	vm_page_insert_radixdone(m, object, mpred);
+	return (0);
+}
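
Because vm_page_insert() and vm_page_insert_after() can now fail when the radix trie cannot allocate an interior node, callers must check the return value.  A hedged sketch of one way a caller might recover (the cleanup and error value shown are assumptions, not taken from this patch):

	if (vm_page_insert(m, object, pindex) != 0) {
		/* Hypothetical recovery: release the page instead of leaking it. */
		vm_page_lock(m);
		vm_page_free(m);
		vm_page_unlock(m);
		return (ENOMEM);
	}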
 
+/*
+ *	vm_page_insert_radixdone:
+ *
+ *	Complete page "m" insertion into the specified object after the
+ *	radix trie hooking.
+ *
+ *	The page "mpred" must precede the offset "m->pindex" within the
+ *	specified object.
+ *
+ *	The object must be locked.
+ */
+static void
+vm_page_insert_radixdone(vm_page_t m, vm_object_t object, vm_page_t mpred)
+{
+
+	VM_OBJECT_ASSERT_WLOCKED(object);
+	KASSERT(object != NULL && m->object == object,
+	    ("vm_page_insert_radixdone: page %p has inconsistent object", m));
+	if (mpred != NULL) {
+		KASSERT(mpred->object == object,
+		    ("vm_page_insert_after: object doesn't contain mpred"));
+		KASSERT(mpred->pindex < m->pindex,
+		    ("vm_page_insert_after: mpred doesn't precede pindex"));
+	}
+
+	if (mpred != NULL)
+		TAILQ_INSERT_AFTER(&object->memq, mpred, m, listq);
+	else
+		TAILQ_INSERT_HEAD(&object->memq, m, listq);
+
 	/*
 	 * Show that the object has one more resident page.
 	 */
@@ -977,57 +1129,30 @@
 vm_page_remove(vm_page_t m)
 {
 	vm_object_t object;
-	vm_page_t next, prev, root;
+	boolean_t lockacq;
 
 	if ((m->oflags & VPO_UNMANAGED) == 0)
 		vm_page_lock_assert(m, MA_OWNED);
 	if ((object = m->object) == NULL)
 		return;
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
-	if (m->oflags & VPO_BUSY) {
-		m->oflags &= ~VPO_BUSY;
+	VM_OBJECT_ASSERT_WLOCKED(object);
+	if (vm_page_xbusied(m)) {
+		lockacq = FALSE;
+		if ((m->oflags & VPO_UNMANAGED) != 0 &&
+		    !mtx_owned(vm_page_lockptr(m))) {
+			lockacq = TRUE;
+			vm_page_lock(m);
+		}
 		vm_page_flash(m);
+		atomic_store_rel_int(&m->busy_lock, VPB_UNBUSIED);
+		if (lockacq)
+			vm_page_unlock(m);
 	}
 
 	/*
 	 * Now remove from the object's list of backed pages.
 	 */
-	if ((next = TAILQ_NEXT(m, listq)) != NULL && next->left == m) {
-		/*
-		 * Since the page's successor in the list is also its parent
-		 * in the tree, its right subtree must be empty.
-		 */
-		next->left = m->left;
-		KASSERT(m->right == NULL,
-		    ("vm_page_remove: page %p has right child", m));
-	} else if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL &&
-	    prev->right == m) {
-		/*
-		 * Since the page's predecessor in the list is also its parent
-		 * in the tree, its left subtree must be empty.
-		 */
-		KASSERT(m->left == NULL,
-		    ("vm_page_remove: page %p has left child", m));
-		prev->right = m->right;
-	} else {
-		if (m != object->root)
-			vm_page_splay(m->pindex, object->root);
-		if (m->left == NULL)
-			root = m->right;
-		else if (m->right == NULL)
-			root = m->left;
-		else {
-			/*
-			 * Move the page's successor to the root, because
-			 * pages are usually removed in ascending order.
-			 */
-			if (m->right != next)
-				vm_page_splay(m->pindex, m->right);
-			next->left = m->left;
-			root = next;
-		}
-		object->root = root;
-	}
+	vm_radix_remove(&object->rtree, m->pindex);
 	TAILQ_REMOVE(&object->memq, m, listq);
 
 	/*
@@ -1055,15 +1180,9 @@
 vm_page_t
 vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
 {
-	vm_page_t m;
 
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
-	if ((m = object->root) != NULL && m->pindex != pindex) {
-		m = vm_page_splay(pindex, m);
-		if ((object->root = m)->pindex != pindex)
-			m = NULL;
-	}
-	return (m);
+	VM_OBJECT_ASSERT_LOCKED(object);
+	return (vm_radix_lookup(&object->rtree, pindex));
 }
 
 /*
@@ -1079,14 +1198,9 @@
 {
 	vm_page_t m;
 
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
-	if ((m = TAILQ_FIRST(&object->memq)) != NULL) {
-		if (m->pindex < pindex) {
-			m = vm_page_splay(pindex, object->root);
-			if ((object->root = m)->pindex < pindex)
-				m = TAILQ_NEXT(m, listq);
-		}
-	}
+	VM_OBJECT_ASSERT_LOCKED(object);
+	if ((m = TAILQ_FIRST(&object->memq)) != NULL && m->pindex < pindex)
+		m = vm_radix_lookup_ge(&object->rtree, pindex);
 	return (m);
 }
 
@@ -1101,10 +1215,12 @@
 {
 	vm_page_t next;
 
-	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
-	if ((next = TAILQ_NEXT(m, listq)) != NULL &&
-	    next->pindex != m->pindex + 1)
-		next = NULL;
+	VM_OBJECT_ASSERT_WLOCKED(m->object);
+	if ((next = TAILQ_NEXT(m, listq)) != NULL) {
+		MPASS(next->object == m->object);
+		if (next->pindex != m->pindex + 1)
+			next = NULL;
+	}
 	return (next);
 }
 
@@ -1119,14 +1235,64 @@
 {
 	vm_page_t prev;
 
-	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
-	if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL &&
-	    prev->pindex != m->pindex - 1)
-		prev = NULL;
+	VM_OBJECT_ASSERT_WLOCKED(m->object);
+	if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL) {
+		MPASS(prev->object == m->object);
+		if (prev->pindex != m->pindex - 1)
+			prev = NULL;
+	}
 	return (prev);
 }
 
 /*
+ * Uses the page mnew as a replacement for an existing page at index
+ * pindex which must be already present in the object.
+ *
+ * The existing page must not be on a paging queue.
+ */
+vm_page_t
+vm_page_replace(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex)
+{
+	vm_page_t mold, mpred;
+
+	VM_OBJECT_ASSERT_WLOCKED(object);
+
+	/*
+	 * This function mostly follows vm_page_insert() and
+	 * vm_page_remove() without the radix, object count and vnode
+	 * dance.  Double check such functions for more comments.
+	 */
+	mpred = vm_radix_lookup(&object->rtree, pindex);
+	KASSERT(mpred != NULL,
+	    ("vm_page_replace: replacing page not present with pindex"));
+	mpred = TAILQ_PREV(mpred, respgs, listq);
+	if (mpred != NULL)
+		KASSERT(mpred->pindex < pindex,
+		    ("vm_page_insert_after: mpred doesn't precede pindex"));
+
+	mnew->object = object;
+	mnew->pindex = pindex;
+	mold = vm_radix_replace(&object->rtree, mnew);
+	KASSERT(mold->queue == PQ_NONE,
+	    ("vm_page_replace: mold is on a paging queue"));
+
+	/* Detach the old page from the resident tailq. */
+	TAILQ_REMOVE(&object->memq, mold, listq);
+
+	mold->object = NULL;
+	vm_page_xunbusy(mold);
+
+	/* Insert the new page in the resident tailq. */
+	if (mpred != NULL)
+		TAILQ_INSERT_AFTER(&object->memq, mpred, mnew, listq);
+	else
+		TAILQ_INSERT_HEAD(&object->memq, mnew, listq);
+	if (pmap_page_is_write_mapped(mnew))
+		vm_object_set_writeable_dirty(object);
+	return (mold);
+}
+
+/*
  *	vm_page_rename:
  *
  *	Move the given memory entry from its
@@ -1144,15 +1310,47 @@
  *	      or vm_page_dirty() will panic.  Dirty pages are not allowed
  *	      on the cache.
  *
- *	The objects must be locked.  The page must be locked if it is managed.
+ *	The objects must be locked.
  */
-void
+int
 vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
 {
+	vm_page_t mpred;
+	vm_pindex_t opidx;
 
+	VM_OBJECT_ASSERT_WLOCKED(new_object);
+
+	mpred = vm_radix_lookup_le(&new_object->rtree, new_pindex);
+	KASSERT(mpred == NULL || mpred->pindex != new_pindex,
+	    ("vm_page_rename: pindex already renamed"));
+
+	/*
+	 * Create a custom version of vm_page_insert() that does not depend
+	 * on mpred and can cheat on the implementation aspects of the
+	 * function.
+	 */
+	opidx = m->pindex;
+	m->pindex = new_pindex;
+	if (vm_radix_insert(&new_object->rtree, m)) {
+		m->pindex = opidx;
+		return (1);
+	}
+
+	/*
+	 * The operation cannot fail anymore.  The removal must happen before
+	 * the listq iterator is tainted.
+	 */
+	m->pindex = opidx;
+	vm_page_lock(m);
 	vm_page_remove(m);
-	vm_page_insert(m, new_object, new_pindex);
+
+	/* Return back to the new pindex to complete vm_page_insert(). */
+	m->pindex = new_pindex;
+	m->object = new_object;
+	vm_page_unlock(m);
+	vm_page_insert_radixdone(m, new_object, mpred);
 	vm_page_dirty(m);
+	return (0);
 }
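
vm_page_rename() likewise reports radix insertion failures now, and a failed rename leaves the page in its original object.  A hedged caller sketch; the VM_WAIT back-off and the retry label are assumptions about what a caller might do:

	if (vm_page_rename(m, new_object, new_pindex) != 0) {
		VM_OBJECT_WUNLOCK(new_object);
		VM_WAIT;		/* hypothetical: wait for free memory */
		VM_OBJECT_WLOCK(new_object);
		goto retry;		/* hypothetical retry label */
	}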
 
 /*
@@ -1166,55 +1364,21 @@
 void
 vm_page_cache_free(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
 {
-	vm_page_t m, m_next;
+	vm_page_t m;
 	boolean_t empty;
 
 	mtx_lock(&vm_page_queue_free_mtx);
-	if (__predict_false(object->cache == NULL)) {
+	if (__predict_false(vm_radix_is_empty(&object->cache))) {
 		mtx_unlock(&vm_page_queue_free_mtx);
 		return;
 	}
-	m = object->cache = vm_page_splay(start, object->cache);
-	if (m->pindex < start) {
-		if (m->right == NULL)
-			m = NULL;
-		else {
-			m_next = vm_page_splay(start, m->right);
-			m_next->left = m;
-			m->right = NULL;
-			m = object->cache = m_next;
-		}
+	while ((m = vm_radix_lookup_ge(&object->cache, start)) != NULL) {
+		if (end != 0 && m->pindex >= end)
+			break;
+		vm_radix_remove(&object->cache, m->pindex);
+		vm_page_cache_turn_free(m);
 	}
-
-	/*
-	 * At this point, "m" is either (1) a reference to the page
-	 * with the least pindex that is greater than or equal to
-	 * "start" or (2) NULL.
-	 */
-	for (; m != NULL && (m->pindex < end || end == 0); m = m_next) {
-		/*
-		 * Find "m"'s successor and remove "m" from the
-		 * object's cache.
-		 */
-		if (m->right == NULL) {
-			object->cache = m->left;
-			m_next = NULL;
-		} else {
-			m_next = vm_page_splay(start, m->right);
-			m_next->left = m->left;
-			object->cache = m_next;
-		}
-		/* Convert "m" to a free page. */
-		m->object = NULL;
-		m->valid = 0;
-		/* Clear PG_CACHED and set PG_FREE. */
-		m->flags ^= PG_CACHED | PG_FREE;
-		KASSERT((m->flags & (PG_CACHED | PG_FREE)) == PG_FREE,
-		    ("vm_page_cache_free: page %p has inconsistent flags", m));
-		cnt.v_cache_count--;
-		cnt.v_free_count++;
-	}
-	empty = object->cache == NULL;
+	empty = vm_radix_is_empty(&object->cache);
 	mtx_unlock(&vm_page_queue_free_mtx);
 	if (object->type == OBJT_VNODE && empty)
 		vdrop(object->handle);
@@ -1229,15 +1393,9 @@
 static inline vm_page_t
 vm_page_cache_lookup(vm_object_t object, vm_pindex_t pindex)
 {
-	vm_page_t m;
 
 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
-	if ((m = object->cache) != NULL && m->pindex != pindex) {
-		m = vm_page_splay(pindex, m);
-		if ((object->cache = m)->pindex != pindex)
-			m = NULL;
-	}
-	return (m);
+	return (vm_radix_lookup(&object->cache, pindex));
 }
 
 /*
@@ -1246,31 +1404,14 @@
  *
  *	The free page queue must be locked.
  */
-void
+static void
 vm_page_cache_remove(vm_page_t m)
 {
-	vm_object_t object;
-	vm_page_t root;
 
 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 	KASSERT((m->flags & PG_CACHED) != 0,
 	    ("vm_page_cache_remove: page %p is not cached", m));
-	object = m->object;
-	if (m != object->cache) {
-		root = vm_page_splay(m->pindex, object->cache);
-		KASSERT(root == m,
-		    ("vm_page_cache_remove: page %p is not cached in object %p",
-		    m, object));
-	}
-	if (m->left == NULL)
-		root = m->right;
-	else if (m->right == NULL)
-		root = m->left;
-	else {
-		root = vm_page_splay(m->pindex, m->left);
-		root->right = m->right;
-	}
-	object->cache = root;
+	vm_radix_remove(&m->object->cache, m->pindex);
 	m->object = NULL;
 	cnt.v_cache_count--;
 }
@@ -1290,7 +1431,7 @@
 vm_page_cache_transfer(vm_object_t orig_object, vm_pindex_t offidxstart,
     vm_object_t new_object)
 {
-	vm_page_t m, m_next;
+	vm_page_t m;
 
 	/*
 	 * Insertion into an object's collection of cached pages
@@ -1297,54 +1438,26 @@
 	 * requires the object to be locked.  In contrast, removal does
 	 * not.
 	 */
-	VM_OBJECT_LOCK_ASSERT(new_object, MA_OWNED);
-	KASSERT(new_object->cache == NULL,
+	VM_OBJECT_ASSERT_WLOCKED(new_object);
+	KASSERT(vm_radix_is_empty(&new_object->cache),
 	    ("vm_page_cache_transfer: object %p has cached pages",
 	    new_object));
 	mtx_lock(&vm_page_queue_free_mtx);
-	if ((m = orig_object->cache) != NULL) {
+	while ((m = vm_radix_lookup_ge(&orig_object->cache,
+	    offidxstart)) != NULL) {
 		/*
 		 * Transfer all of the pages with offset greater than or
 		 * equal to 'offidxstart' from the original object's
 		 * cache to the new object's cache.
 		 */
-		m = vm_page_splay(offidxstart, m);
-		if (m->pindex < offidxstart) {
-			orig_object->cache = m;
-			new_object->cache = m->right;
-			m->right = NULL;
-		} else {
-			orig_object->cache = m->left;
-			new_object->cache = m;
-			m->left = NULL;
-		}
-		while ((m = new_object->cache) != NULL) {
-			if ((m->pindex - offidxstart) >= new_object->size) {
-				/*
-				 * Return all of the cached pages with
-				 * offset greater than or equal to the
-				 * new object's size to the original
-				 * object's cache. 
-				 */
-				new_object->cache = m->left;
-				m->left = orig_object->cache;
-				orig_object->cache = m;
-				break;
-			}
-			m_next = vm_page_splay(m->pindex, m->right);
-			/* Update the page's object and offset. */
-			m->object = new_object;
-			m->pindex -= offidxstart;
-			if (m_next == NULL)
-				break;
-			m->right = NULL;
-			m_next->left = m;
-			new_object->cache = m_next;
-		}
-		KASSERT(new_object->cache == NULL ||
-		    new_object->type == OBJT_SWAP,
-		    ("vm_page_cache_transfer: object %p's type is incompatible"
-		    " with cached pages", new_object));
+		if ((m->pindex - offidxstart) >= new_object->size)
+			break;
+		vm_radix_remove(&orig_object->cache, m->pindex);
+		/* Update the page's object and offset. */
+		m->object = new_object;
+		m->pindex -= offidxstart;
+		if (vm_radix_insert(&new_object->cache, m))
+			vm_page_cache_turn_free(m);
 	}
 	mtx_unlock(&vm_page_queue_free_mtx);
 }
@@ -1367,8 +1480,8 @@
 	 * page queues lock in order to prove that the specified page doesn't
 	 * exist.
 	 */
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
-	if (object->cache == NULL)
+	VM_OBJECT_ASSERT_WLOCKED(object);
+	if (__predict_true(vm_object_cache_is_empty(object)))
 		return (FALSE);
 	mtx_lock(&vm_page_queue_free_mtx);
 	m = vm_page_cache_lookup(object, pindex);
@@ -1379,8 +1492,8 @@
 /*
  *	vm_page_alloc:
  *
- *	Allocate and return a memory cell associated
- *	with this VM object/offset pair.
+ *	Allocate and return a page that is associated with the specified
+ *	object and offset pair.  By default, this page is exclusive busied.
  *
  *	The caller must always specify an allocation class.
  *
@@ -1390,13 +1503,18 @@
  *	VM_ALLOC_INTERRUPT	interrupt time request
  *
  *	optional allocation flags:
- *	VM_ALLOC_ZERO		prefer a zeroed page
- *	VM_ALLOC_WIRED		wire the allocated page
- *	VM_ALLOC_NOOBJ		page is not associated with a vm object
- *	VM_ALLOC_NOBUSY		do not set the page busy
+ *	VM_ALLOC_COUNT(number)	the number of additional pages that the caller
+ *				intends to allocate
  *	VM_ALLOC_IFCACHED	return page only if it is cached
  *	VM_ALLOC_IFNOTCACHED	return NULL, do not reactivate if the page
  *				is cached
+ *	VM_ALLOC_NOBUSY		do not exclusive busy the page
+ *	VM_ALLOC_NODUMP		do not include the page in a kernel core dump
+ *	VM_ALLOC_NOOBJ		page is not associated with an object and
+ *				should not be exclusive busy 
+ *	VM_ALLOC_SBUSY		shared busy the allocated page
+ *	VM_ALLOC_WIRED		wire the allocated page
+ *	VM_ALLOC_ZERO		prefer a zeroed page
  *
  *	This routine may not sleep.
  */
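
A hedged example of combining an allocation class with the optional flags listed above; the object, pindex, and VM_WAIT retry are illustrative assumptions only:

	m = vm_page_alloc(object, pindex,
	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
	if (m == NULL)
		VM_WAIT;	/* hypothetical: block until pages are freed, then retry */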
@@ -1405,28 +1523,43 @@
 {
 	struct vnode *vp = NULL;
 	vm_object_t m_object;
-	vm_page_t m;
-	int flags, page_req;
+	vm_page_t m, mpred;
+	int flags, req_class;
 
-	if ((req & VM_ALLOC_NOOBJ) == 0) {
-		KASSERT(object != NULL,
-		    ("vm_page_alloc: NULL object."));
-		VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
-	}
+	mpred = 0;	/* XXX: pacify gcc */
+	KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) &&
+	    (object != NULL || (req & VM_ALLOC_SBUSY) == 0) &&
+	    ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) !=
+	    (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)),
+	    ("vm_page_alloc: inconsistent object(%p)/req(%x)", (void *)object,
+	    req));
+	if (object != NULL)
+		VM_OBJECT_ASSERT_WLOCKED(object);
 
-	page_req = req & VM_ALLOC_CLASS_MASK;
+	req_class = req & VM_ALLOC_CLASS_MASK;
 
 	/*
-	 * The pager is allowed to eat deeper into the free page list.
+	 * The page daemon is allowed to dig deeper into the free page list.
 	 */
-	if ((curproc == pageproc) && (page_req != VM_ALLOC_INTERRUPT))
-		page_req = VM_ALLOC_SYSTEM;
+	if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
+		req_class = VM_ALLOC_SYSTEM;
 
-	mtx_lock(&vm_page_queue_free_mtx);
+	if (object != NULL) {
+		mpred = vm_radix_lookup_le(&object->rtree, pindex);
+		KASSERT(mpred == NULL || mpred->pindex != pindex,
+		   ("vm_page_alloc: pindex already allocated"));
+	}
+
+	/*
+	 * The page allocation request can come from consumers that already
+	 * hold the free page queue mutex, like vm_page_insert() in
+	 * vm_page_cache().
+	 */
+	mtx_lock_flags(&vm_page_queue_free_mtx, MTX_RECURSE);
 	if (cnt.v_free_count + cnt.v_cache_count > cnt.v_free_reserved ||
-	    (page_req == VM_ALLOC_SYSTEM && 
+	    (req_class == VM_ALLOC_SYSTEM &&
 	    cnt.v_free_count + cnt.v_cache_count > cnt.v_interrupt_free_min) ||
-	    (page_req == VM_ALLOC_INTERRUPT &&
+	    (req_class == VM_ALLOC_INTERRUPT &&
 	    cnt.v_free_count + cnt.v_cache_count > 0)) {
 		/*
 		 * Allocate from the free queue if the number of free pages
@@ -1451,10 +1584,9 @@
 			mtx_unlock(&vm_page_queue_free_mtx);
 			return (NULL);
 #if VM_NRESERVLEVEL > 0
-		} else if (object == NULL || object->type == OBJT_DEVICE ||
-		    object->type == OBJT_SG ||
-		    (object->flags & OBJ_COLORED) == 0 ||
-		    (m = vm_reserv_alloc_page(object, pindex)) == NULL) {
+		} else if (object == NULL || (object->flags & (OBJ_COLORED |
+		    OBJ_FICTITIOUS)) != OBJ_COLORED || (m =
+		    vm_reserv_alloc_page(object, pindex, mpred)) == NULL) {
 #else
 		} else {
 #endif
@@ -1474,7 +1606,7 @@
 		 */
 		mtx_unlock(&vm_page_queue_free_mtx);
 		atomic_add_int(&vm_pageout_deficit,
-		    MAX((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1));
+		    max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1));
 		pagedaemon_wakeup();
 		return (NULL);
 	}
@@ -1482,18 +1614,19 @@
 	/*
 	 *  At this point we had better have found a good page.
 	 */
-
 	KASSERT(m != NULL, ("vm_page_alloc: missing page"));
 	KASSERT(m->queue == PQ_NONE,
 	    ("vm_page_alloc: page %p has unexpected queue %d", m, m->queue));
 	KASSERT(m->wire_count == 0, ("vm_page_alloc: page %p is wired", m));
 	KASSERT(m->hold_count == 0, ("vm_page_alloc: page %p is held", m));
-	KASSERT(m->busy == 0, ("vm_page_alloc: page %p is busy", m));
+	KASSERT(!vm_page_busied(m), ("vm_page_alloc: page %p is busy", m));
 	KASSERT(m->dirty == 0, ("vm_page_alloc: page %p is dirty", m));
 	KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT,
 	    ("vm_page_alloc: page %p has unexpected memattr %d", m,
 	    pmap_page_get_memattr(m)));
 	if ((m->flags & PG_CACHED) != 0) {
+		KASSERT((m->flags & PG_ZERO) == 0,
+		    ("vm_page_alloc: cached page %p is PG_ZERO", m));
 		KASSERT(m->valid != 0,
 		    ("vm_page_alloc: cached page %p is invalid", m));
 		if (m->object == object && m->pindex == pindex)
@@ -1502,7 +1635,8 @@
 			m->valid = 0;
 		m_object = m->object;
 		vm_page_cache_remove(m);
-		if (m_object->type == OBJT_VNODE && m_object->cache == NULL)
+		if (m_object->type == OBJT_VNODE &&
+		    vm_object_cache_is_empty(m_object))
 			vp = m_object->handle;
 	} else {
 		KASSERT(VM_PAGE_IS_FREE(m),
@@ -1509,7 +1643,7 @@
 		    ("vm_page_alloc: page %p is not free", m));
 		KASSERT(m->valid == 0,
 		    ("vm_page_alloc: free page %p is valid", m));
-		cnt.v_free_count--;
+		vm_phys_freecnt_adj(m, -1);
 	}
 
 	/*
@@ -1517,22 +1651,23 @@
 	 * must be cleared before the free page queues lock is released.
 	 */
 	flags = 0;
-	if (req & VM_ALLOC_NODUMP)
-		flags |= PG_NODUMP;
 	if (m->flags & PG_ZERO) {
 		vm_page_zero_count--;
 		if (req & VM_ALLOC_ZERO)
 			flags = PG_ZERO;
 	}
+	if (req & VM_ALLOC_NODUMP)
+		flags |= PG_NODUMP;
 	m->flags = flags;
 	mtx_unlock(&vm_page_queue_free_mtx);
 	m->aflags = 0;
-	if (object == NULL || object->type == OBJT_PHYS)
-		m->oflags = VPO_UNMANAGED;
-	else
-		m->oflags = 0;
-	if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ)) == 0)
-		m->oflags |= VPO_BUSY;
+	m->oflags = object == NULL || (object->flags & OBJ_UNMANAGED) != 0 ?
+	    VPO_UNMANAGED : 0;
+	m->busy_lock = VPB_UNBUSIED;
+	if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ | VM_ALLOC_SBUSY)) == 0)
+		m->busy_lock = VPB_SINGLE_EXCLUSIVER;
+	if ((req & VM_ALLOC_SBUSY) != 0)
+		m->busy_lock = VPB_SHARERS_WORD(1);
 	if (req & VM_ALLOC_WIRED) {
 		/*
 		 * The page lock is not required for wiring a page until that
@@ -1544,11 +1679,26 @@
 	m->act_count = 0;
 
 	if (object != NULL) {
+		if (vm_page_insert_after(m, object, pindex, mpred)) {
+			/* See the comment below about hold count. */
+			if (vp != NULL)
+				vdrop(vp);
+			pagedaemon_wakeup();
+			if (req & VM_ALLOC_WIRED) {
+				atomic_subtract_int(&cnt.v_wire_count, 1);
+				m->wire_count = 0;
+			}
+			m->object = NULL;
+			m->oflags = VPO_UNMANAGED;
+			m->busy_lock = VPB_UNBUSIED;
+			vm_page_free(m);
+			return (NULL);
+		}
+
 		/* Ignore device objects; the pager sets "memattr" for them. */
 		if (object->memattr != VM_MEMATTR_DEFAULT &&
-		    object->type != OBJT_DEVICE && object->type != OBJT_SG)
+		    (object->flags & OBJ_FICTITIOUS) == 0)
 			pmap_page_set_memattr(m, object->memattr);
-		vm_page_insert(m, object, pindex);
 	} else
 		m->pindex = pindex;
 
@@ -1571,13 +1721,204 @@
 	return (m);
 }
 
+static void
+vm_page_alloc_contig_vdrop(struct spglist *lst)
+{
+
+	while (!SLIST_EMPTY(lst)) {
+		vdrop((struct vnode *)SLIST_FIRST(lst)->plinks.s.pv);
+		SLIST_REMOVE_HEAD(lst, plinks.s.ss);
+	}
+}
+
 /*
+ *	vm_page_alloc_contig:
+ *
+ *	Allocate a contiguous set of physical pages of the given size "npages"
+ *	from the free lists.  All of the physical pages must be at or above
+ *	the given physical address "low" and below the given physical address
+ *	"high".  The given value "alignment" determines the alignment of the
+ *	first physical page in the set.  If the given value "boundary" is
+ *	non-zero, then the set of physical pages cannot cross any physical
+ *	address boundary that is a multiple of that value.  Both "alignment"
+ *	and "boundary" must be a power of two.
+ *
+ *	If the specified memory attribute, "memattr", is VM_MEMATTR_DEFAULT,
+ *	then the memory attribute setting for the physical pages is configured
+ *	to the object's memory attribute setting.  Otherwise, the memory
+ *	attribute setting for the physical pages is configured to "memattr",
+ *	overriding the object's memory attribute setting.  However, if the
+ *	object's memory attribute setting is not VM_MEMATTR_DEFAULT, then the
+ *	memory attribute setting for the physical pages cannot be configured
+ *	to VM_MEMATTR_DEFAULT.
+ *
+ *	The caller must always specify an allocation class.
+ *
+ *	allocation classes:
+ *	VM_ALLOC_NORMAL		normal process request
+ *	VM_ALLOC_SYSTEM		system *really* needs a page
+ *	VM_ALLOC_INTERRUPT	interrupt time request
+ *
+ *	optional allocation flags:
+ *	VM_ALLOC_NOBUSY		do not exclusive busy the page
+ *	VM_ALLOC_NODUMP		do not include the page in a kernel core dump
+ *	VM_ALLOC_NOOBJ		page is not associated with an object and
+ *				should not be exclusive busy 
+ *	VM_ALLOC_SBUSY		shared busy the allocated page
+ *	VM_ALLOC_WIRED		wire the allocated page
+ *	VM_ALLOC_ZERO		prefer a zeroed page
+ *
+ *	This routine may not sleep.
+ */
+vm_page_t
+vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req,
+    u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
+    vm_paddr_t boundary, vm_memattr_t memattr)
+{
+	struct vnode *drop;
+	struct spglist deferred_vdrop_list;
+	vm_page_t m, m_tmp, m_ret;
+	u_int flags, oflags;
+	int req_class;
+
+	KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) &&
+	    (object != NULL || (req & VM_ALLOC_SBUSY) == 0) &&
+	    ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) !=
+	    (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)),
+	    ("vm_page_alloc: inconsistent object(%p)/req(%x)", (void *)object,
+	    req));
+	if (object != NULL) {
+		VM_OBJECT_ASSERT_WLOCKED(object);
+		KASSERT(object->type == OBJT_PHYS,
+		    ("vm_page_alloc_contig: object %p isn't OBJT_PHYS",
+		    object));
+	}
+	KASSERT(npages > 0, ("vm_page_alloc_contig: npages is zero"));
+	req_class = req & VM_ALLOC_CLASS_MASK;
+
+	/*
+	 * The page daemon is allowed to dig deeper into the free page list.
+	 */
+	if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
+		req_class = VM_ALLOC_SYSTEM;
+
+	SLIST_INIT(&deferred_vdrop_list);
+	mtx_lock(&vm_page_queue_free_mtx);
+	if (cnt.v_free_count + cnt.v_cache_count >= npages +
+	    cnt.v_free_reserved || (req_class == VM_ALLOC_SYSTEM &&
+	    cnt.v_free_count + cnt.v_cache_count >= npages +
+	    cnt.v_interrupt_free_min) || (req_class == VM_ALLOC_INTERRUPT &&
+	    cnt.v_free_count + cnt.v_cache_count >= npages)) {
+#if VM_NRESERVLEVEL > 0
+retry:
+		if (object == NULL || (object->flags & OBJ_COLORED) == 0 ||
+		    (m_ret = vm_reserv_alloc_contig(object, pindex, npages,
+		    low, high, alignment, boundary)) == NULL)
+#endif
+			m_ret = vm_phys_alloc_contig(npages, low, high,
+			    alignment, boundary);
+	} else {
+		mtx_unlock(&vm_page_queue_free_mtx);
+		atomic_add_int(&vm_pageout_deficit, npages);
+		pagedaemon_wakeup();
+		return (NULL);
+	}
+	if (m_ret != NULL)
+		for (m = m_ret; m < &m_ret[npages]; m++) {
+			drop = vm_page_alloc_init(m);
+			if (drop != NULL) {
+				/*
+				 * Enqueue the vnode for deferred vdrop().
+				 */
+				m->plinks.s.pv = drop;
+				SLIST_INSERT_HEAD(&deferred_vdrop_list, m,
+				    plinks.s.ss);
+			}
+		}
+	else {
+#if VM_NRESERVLEVEL > 0
+		if (vm_reserv_reclaim_contig(npages, low, high, alignment,
+		    boundary))
+			goto retry;
+#endif
+	}
+	mtx_unlock(&vm_page_queue_free_mtx);
+	if (m_ret == NULL)
+		return (NULL);
+
+	/*
+	 * Initialize the pages.  Only the PG_ZERO flag is inherited.
+	 */
+	flags = 0;
+	if ((req & VM_ALLOC_ZERO) != 0)
+		flags = PG_ZERO;
+	if ((req & VM_ALLOC_NODUMP) != 0)
+		flags |= PG_NODUMP;
+	if ((req & VM_ALLOC_WIRED) != 0)
+		atomic_add_int(&cnt.v_wire_count, npages);
+	oflags = VPO_UNMANAGED;
+	if (object != NULL) {
+		if (object->memattr != VM_MEMATTR_DEFAULT &&
+		    memattr == VM_MEMATTR_DEFAULT)
+			memattr = object->memattr;
+	}
+	for (m = m_ret; m < &m_ret[npages]; m++) {
+		m->aflags = 0;
+		m->flags = (m->flags | PG_NODUMP) & flags;
+		m->busy_lock = VPB_UNBUSIED;
+		if (object != NULL) {
+			if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) == 0)
+				m->busy_lock = VPB_SINGLE_EXCLUSIVER;
+			if ((req & VM_ALLOC_SBUSY) != 0)
+				m->busy_lock = VPB_SHARERS_WORD(1);
+		}
+		if ((req & VM_ALLOC_WIRED) != 0)
+			m->wire_count = 1;
+		/* Unmanaged pages don't use "act_count". */
+		m->oflags = oflags;
+		if (object != NULL) {
+			if (vm_page_insert(m, object, pindex)) {
+				vm_page_alloc_contig_vdrop(
+				    &deferred_vdrop_list);
+				if (vm_paging_needed())
+					pagedaemon_wakeup();
+				if ((req & VM_ALLOC_WIRED) != 0)
+					atomic_subtract_int(&cnt.v_wire_count,
+					    npages);
+				for (m_tmp = m, m = m_ret;
+				    m < &m_ret[npages]; m++) {
+					if ((req & VM_ALLOC_WIRED) != 0)
+						m->wire_count = 0;
+					if (m >= m_tmp) {
+						m->object = NULL;
+						m->oflags |= VPO_UNMANAGED;
+					}
+					m->busy_lock = VPB_UNBUSIED;
+					vm_page_free(m);
+				}
+				return (NULL);
+			}
+		} else
+			m->pindex = pindex;
+		if (memattr != VM_MEMATTR_DEFAULT)
+			pmap_page_set_memattr(m, memattr);
+		pindex++;
+	}
+	vm_page_alloc_contig_vdrop(&deferred_vdrop_list);
+	if (vm_paging_needed())
+		pagedaemon_wakeup();
+	return (m_ret);
+}
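
A hedged usage sketch for the contiguous allocator above: 16 wired pages below 4 GB, 2 MB aligned, with no boundary restriction and the default memory attribute.  All of the numbers and the failure handling are illustrative assumptions:

	m = vm_page_alloc_contig(NULL, 0,
	    VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED,
	    16, 0, (vm_paddr_t)1 << 32, 2 * 1024 * 1024, 0,
	    VM_MEMATTR_DEFAULT);
	if (m == NULL)
		return (NULL);		/* hypothetical failure path */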
+
+/*
  * Initialize a page that has been freshly dequeued from a freelist.
  * The caller has to drop the vnode returned, if it is not NULL.
  *
+ * This function may only be used to initialize unmanaged pages.
+ *
  * To be called with vm_page_queue_free_mtx held.
  */
-struct vnode *
+static struct vnode *
 vm_page_alloc_init(vm_page_t m)
 {
 	struct vnode *drop;
@@ -1590,7 +1931,7 @@
 	    ("vm_page_alloc_init: page %p is wired", m));
 	KASSERT(m->hold_count == 0,
 	    ("vm_page_alloc_init: page %p is held", m));
-	KASSERT(m->busy == 0,
+	KASSERT(!vm_page_busied(m),
 	    ("vm_page_alloc_init: page %p is busy", m));
 	KASSERT(m->dirty == 0,
 	    ("vm_page_alloc_init: page %p is dirty", m));
@@ -1600,11 +1941,13 @@
 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 	drop = NULL;
 	if ((m->flags & PG_CACHED) != 0) {
+		KASSERT((m->flags & PG_ZERO) == 0,
+		    ("vm_page_alloc_init: cached page %p is PG_ZERO", m));
 		m->valid = 0;
 		m_object = m->object;
 		vm_page_cache_remove(m);
 		if (m_object->type == OBJT_VNODE &&
-		    m_object->cache == NULL)
+		    vm_object_cache_is_empty(m_object))
 			drop = m_object->handle;
 	} else {
 		KASSERT(VM_PAGE_IS_FREE(m),
@@ -1611,24 +1954,34 @@
 		    ("vm_page_alloc_init: page %p is not free", m));
 		KASSERT(m->valid == 0,
 		    ("vm_page_alloc_init: free page %p is valid", m));
-		cnt.v_free_count--;
+		vm_phys_freecnt_adj(m, -1);
+		if ((m->flags & PG_ZERO) != 0)
+			vm_page_zero_count--;
 	}
-	if (m->flags & PG_ZERO)
-		vm_page_zero_count--;
 	/* Don't clear the PG_ZERO flag; we'll need it later. */
 	m->flags &= PG_ZERO;
-	m->aflags = 0;
-	m->oflags = VPO_UNMANAGED;
-	/* Unmanaged pages don't use "act_count". */
 	return (drop);
 }
 
 /*
  * 	vm_page_alloc_freelist:
- * 
- *	Allocate a page from the specified freelist.
- *	Only the ALLOC_CLASS values in req are honored, other request flags
- *	are ignored.
+ *
+ *	Allocate a physical page from the specified free page list.
+ *
+ *	The caller must always specify an allocation class.
+ *
+ *	allocation classes:
+ *	VM_ALLOC_NORMAL		normal process request
+ *	VM_ALLOC_SYSTEM		system *really* needs a page
+ *	VM_ALLOC_INTERRUPT	interrupt time request
+ *
+ *	optional allocation flags:
+ *	VM_ALLOC_COUNT(number)	the number of additional pages that the caller
+ *				intends to allocate
+ *	VM_ALLOC_WIRED		wire the allocated page
+ *	VM_ALLOC_ZERO		prefer a zeroed page
+ *
+ *	This routine may not sleep.
  */
 vm_page_t
 vm_page_alloc_freelist(int flind, int req)
@@ -1635,20 +1988,33 @@
 {
 	struct vnode *drop;
 	vm_page_t m;
-	int page_req;
+	u_int flags;
+	int req_class;
 
-	m = NULL;
-	page_req = req & VM_ALLOC_CLASS_MASK;
-	mtx_lock(&vm_page_queue_free_mtx);
+	req_class = req & VM_ALLOC_CLASS_MASK;
+
 	/*
+	 * The page daemon is allowed to dig deeper into the free page list.
+	 */
+	if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
+		req_class = VM_ALLOC_SYSTEM;
+
+	/*
 	 * Do not allocate reserved pages unless the req has asked for it.
 	 */
+	mtx_lock_flags(&vm_page_queue_free_mtx, MTX_RECURSE);
 	if (cnt.v_free_count + cnt.v_cache_count > cnt.v_free_reserved ||
-	    (page_req == VM_ALLOC_SYSTEM && 
+	    (req_class == VM_ALLOC_SYSTEM &&
 	    cnt.v_free_count + cnt.v_cache_count > cnt.v_interrupt_free_min) ||
-	    (page_req == VM_ALLOC_INTERRUPT &&
-	    cnt.v_free_count + cnt.v_cache_count > 0)) {
+	    (req_class == VM_ALLOC_INTERRUPT &&
+	    cnt.v_free_count + cnt.v_cache_count > 0))
 		m = vm_phys_alloc_freelist_pages(flind, VM_FREEPOOL_DIRECT, 0);
+	else {
+		mtx_unlock(&vm_page_queue_free_mtx);
+		atomic_add_int(&vm_pageout_deficit,
+		    max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1));
+		pagedaemon_wakeup();
+		return (NULL);
 	}
 	if (m == NULL) {
 		mtx_unlock(&vm_page_queue_free_mtx);
@@ -1656,8 +2022,29 @@
 	}
 	drop = vm_page_alloc_init(m);
 	mtx_unlock(&vm_page_queue_free_mtx);
-	if (drop)
+
+	/*
+	 * Initialize the page.  Only the PG_ZERO flag is inherited.
+	 */
+	m->aflags = 0;
+	flags = 0;
+	if ((req & VM_ALLOC_ZERO) != 0)
+		flags = PG_ZERO;
+	m->flags &= flags;
+	if ((req & VM_ALLOC_WIRED) != 0) {
+		/*
+		 * The page lock is not required for wiring a page that does
+		 * not belong to an object.
+		 */
+		atomic_add_int(&cnt.v_wire_count, 1);
+		m->wire_count = 1;
+	}
+	/* Unmanaged pages don't use "act_count". */
+	m->oflags = VPO_UNMANAGED;
+	if (drop != NULL)
 		vdrop(drop);
+	if (vm_paging_needed())
+		pagedaemon_wakeup();
 	return (m);
 }
 
@@ -1709,84 +2096,117 @@
 	    "pfault", 0);
 }
 
+struct vm_pagequeue *
+vm_page_pagequeue(vm_page_t m)
+{
+
+	return (&vm_phys_domain(m)->vmd_pagequeues[m->queue]);
+}
+
 /*
- *	vm_page_requeue:
+ *	vm_page_dequeue:
  *
- *	Move the given page to the tail of its present page queue.
+ *	Remove the given page from its current page queue.
  *
- *	The page queues must be locked.
+ *	The page must be locked.
  */
 void
-vm_page_requeue(vm_page_t m)
+vm_page_dequeue(vm_page_t m)
 {
-	struct vpgqueues *vpq;
-	int queue;
+	struct vm_pagequeue *pq;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
-	queue = m->queue;
-	KASSERT(queue != PQ_NONE,
-	    ("vm_page_requeue: page %p is not queued", m));
-	vpq = &vm_page_queues[queue];
-	TAILQ_REMOVE(&vpq->pl, m, pageq);
-	TAILQ_INSERT_TAIL(&vpq->pl, m, pageq);
+	vm_page_lock_assert(m, MA_OWNED);
+	KASSERT(m->queue != PQ_NONE,
+	    ("vm_page_dequeue: page %p is not queued", m));
+	pq = vm_page_pagequeue(m);
+	vm_pagequeue_lock(pq);
+	m->queue = PQ_NONE;
+	TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
+	vm_pagequeue_cnt_dec(pq);
+	vm_pagequeue_unlock(pq);
 }
 
 /*
- *	vm_page_queue_remove:
+ *	vm_page_dequeue_locked:
  *
- *	Remove the given page from the specified queue.
+ *	Remove the given page from its current page queue.
  *
- *	The page and page queues must be locked.
+ *	The page and page queue must be locked.
  */
-static __inline void
-vm_page_queue_remove(int queue, vm_page_t m)
+void
+vm_page_dequeue_locked(vm_page_t m)
 {
-	struct vpgqueues *pq;
+	struct vm_pagequeue *pq;
 
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	vm_page_lock_assert(m, MA_OWNED);
-	pq = &vm_page_queues[queue];
-	TAILQ_REMOVE(&pq->pl, m, pageq);
-	(*pq->cnt)--;
+	pq = vm_page_pagequeue(m);
+	vm_pagequeue_assert_locked(pq);
+	m->queue = PQ_NONE;
+	TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
+	vm_pagequeue_cnt_dec(pq);
 }
 
 /*
- *	vm_pageq_remove:
+ *	vm_page_enqueue:
  *
- *	Remove a page from its queue.
+ *	Add the given page to the specified page queue.
  *
- *	The given page must be locked.
+ *	The page must be locked.
  */
+static void
+vm_page_enqueue(int queue, vm_page_t m)
+{
+	struct vm_pagequeue *pq;
+
+	vm_page_lock_assert(m, MA_OWNED);
+	pq = &vm_phys_domain(m)->vmd_pagequeues[queue];
+	vm_pagequeue_lock(pq);
+	m->queue = queue;
+	TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
+	vm_pagequeue_cnt_inc(pq);
+	vm_pagequeue_unlock(pq);
+}
+
+/*
+ *	vm_page_requeue:
+ *
+ *	Move the given page to the tail of its current page queue.
+ *
+ *	The page must be locked.
+ */
 void
-vm_pageq_remove(vm_page_t m)
+vm_page_requeue(vm_page_t m)
 {
-	int queue;
+	struct vm_pagequeue *pq;
 
 	vm_page_lock_assert(m, MA_OWNED);
-	if ((queue = m->queue) != PQ_NONE) {
-		vm_page_lock_queues();
-		m->queue = PQ_NONE;
-		vm_page_queue_remove(queue, m);
-		vm_page_unlock_queues();
-	}
+	KASSERT(m->queue != PQ_NONE,
+	    ("vm_page_requeue: page %p is not queued", m));
+	pq = vm_page_pagequeue(m);
+	vm_pagequeue_lock(pq);
+	TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
+	TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
+	vm_pagequeue_unlock(pq);
 }
 
 /*
- *	vm_page_enqueue:
+ *	vm_page_requeue_locked:
  *
- *	Add the given page to the specified queue.
+ *	Move the given page to the tail of its current page queue.
  *
- *	The page queues must be locked.
+ *	The page queue must be locked.
  */
-static void
-vm_page_enqueue(int queue, vm_page_t m)
+void
+vm_page_requeue_locked(vm_page_t m)
 {
-	struct vpgqueues *vpq;
+	struct vm_pagequeue *pq;
 
-	vpq = &vm_page_queues[queue];
-	m->queue = queue;
-	TAILQ_INSERT_TAIL(&vpq->pl, m, pageq);
-	++*vpq->cnt;
+	KASSERT(m->queue != PQ_NONE,
+	    ("vm_page_requeue_locked: page %p is not queued", m));
+	pq = vm_page_pagequeue(m);
+	vm_pagequeue_assert_locked(pq);
+	TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
+	TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
 }
 
 /*
@@ -1804,16 +2224,13 @@
 	int queue;
 
 	vm_page_lock_assert(m, MA_OWNED);
-	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
 	if ((queue = m->queue) != PQ_ACTIVE) {
 		if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) {
 			if (m->act_count < ACT_INIT)
 				m->act_count = ACT_INIT;
-			vm_page_lock_queues();
 			if (queue != PQ_NONE)
-				vm_page_queue_remove(queue, m);
+				vm_page_dequeue(m);
 			vm_page_enqueue(PQ_ACTIVE, m);
-			vm_page_unlock_queues();
 		} else
 			KASSERT(queue == PQ_NONE,
 			    ("vm_page_activate: wired page %p is queued", m));
@@ -1858,6 +2275,28 @@
 }
 
 /*
+ *	Turn a cached page into a free page, by changing its attributes.
+ *	Keep the statistics up-to-date.
+ *
+ *	The free page queue must be locked.
+ */
+static void
+vm_page_cache_turn_free(vm_page_t m)
+{
+
+	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+
+	m->object = NULL;
+	m->valid = 0;
+	/* Clear PG_CACHED and set PG_FREE. */
+	m->flags ^= PG_CACHED | PG_FREE;
+	KASSERT((m->flags & (PG_CACHED | PG_FREE)) == PG_FREE,
+	    ("vm_page_cache_turn_free: page %p has inconsistent flags", m));
+	cnt.v_cache_count--;
+	vm_phys_freecnt_adj(m, 1);
+}
+
+/*
  *	vm_page_free_toq:
  *
  *	Returns the given page to the free list,
@@ -1873,12 +2312,14 @@
 		vm_page_lock_assert(m, MA_OWNED);
 		KASSERT(!pmap_page_is_mapped(m),
 		    ("vm_page_free_toq: freeing mapped page %p", m));
-	}
+	} else
+		KASSERT(m->queue == PQ_NONE,
+		    ("vm_page_free_toq: unmanaged page %p is queued", m));
 	PCPU_INC(cnt.v_tfree);
 
 	if (VM_PAGE_IS_FREE(m))
 		panic("vm_page_free: freeing free page %p", m);
-	else if (m->busy != 0)
+	else if (vm_page_sbusied(m))
 		panic("vm_page_free: freeing busy page %p", m);
 
 	/*
@@ -1887,8 +2328,7 @@
 	 * callback routine until after we've put the page on the
 	 * appropriate free queue.
 	 */
-	if ((m->oflags & VPO_UNMANAGED) == 0)
-		vm_pageq_remove(m);
+	vm_page_remque(m);
 	vm_page_remove(m);
 
 	/*
@@ -1906,9 +2346,9 @@
 		panic("vm_page_free: freeing wired page %p", m);
 	if (m->hold_count != 0) {
 		m->flags &= ~PG_ZERO;
-		vm_page_lock_queues();
-		vm_page_enqueue(PQ_HOLD, m);
-		vm_page_unlock_queues();
+		KASSERT((m->flags & PG_UNHOLDFREE) == 0,
+		    ("vm_page_free: freeing PG_UNHOLDFREE page %p", m));
+		m->flags |= PG_UNHOLDFREE;
 	} else {
 		/*
 		 * Restore the default memory attribute to the page.
@@ -1922,7 +2362,7 @@
 		 */
 		mtx_lock(&vm_page_queue_free_mtx);
 		m->flags |= PG_FREE;
-		cnt.v_free_count++;
+		vm_phys_freecnt_adj(m, 1);
 #if VM_NRESERVLEVEL > 0
 		if (!vm_reserv_free_page(m))
 #else
@@ -1966,8 +2406,10 @@
 		return;
 	}
 	if (m->wire_count == 0) {
-		if ((m->oflags & VPO_UNMANAGED) == 0)
-			vm_pageq_remove(m);
+		KASSERT((m->oflags & VPO_UNMANAGED) == 0 ||
+		    m->queue == PQ_NONE,
+		    ("vm_page_wire: unmanaged page %p is queued", m));
+		vm_page_remque(m);
 		atomic_add_int(&cnt.v_wire_count, 1);
 	}
 	m->wire_count++;
@@ -1986,7 +2428,7 @@
  * However, unless the page belongs to an object, it is not enqueued because
  * it cannot be paged out.
  *
- * If a page is fictitious, then its wire count must alway be one.
+ * If a page is fictitious, then its wire count must always be one.
  *
  * A managed page must be locked.
  */
@@ -2010,9 +2452,7 @@
 				return;
 			if (!activate)
 				m->flags &= ~PG_WINATCFLS;
-			vm_page_lock_queues();
 			vm_page_enqueue(activate ? PQ_ACTIVE : PQ_INACTIVE, m);
-			vm_page_unlock_queues();
 		}
 	} else
 		panic("vm_page_unwire: page %p's wire count is zero", m);
@@ -2041,29 +2481,36 @@
 static inline void
 _vm_page_deactivate(vm_page_t m, int athead)
 {
+	struct vm_pagequeue *pq;
 	int queue;
 
-	vm_page_lock_assert(m, MA_OWNED);
+	vm_page_assert_locked(m);
 
 	/*
-	 * Ignore if already inactive.
+	 * Ignore if the page is already inactive, unless it is unlikely to be
+	 * reactivated.
 	 */
-	if ((queue = m->queue) == PQ_INACTIVE)
+	if ((queue = m->queue) == PQ_INACTIVE && !athead)
 		return;
 	if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) {
-		m->flags &= ~PG_WINATCFLS;
-		vm_page_lock_queues();
-		if (queue != PQ_NONE)
-			vm_page_queue_remove(queue, m);
+		pq = &vm_phys_domain(m)->vmd_pagequeues[PQ_INACTIVE];
+		/* Avoid multiple acquisitions of the inactive queue lock. */
+		if (queue == PQ_INACTIVE) {
+			vm_pagequeue_lock(pq);
+			vm_page_dequeue_locked(m);
+		} else {
+			if (queue != PQ_NONE)
+				vm_page_dequeue(m);
+			m->flags &= ~PG_WINATCFLS;
+			vm_pagequeue_lock(pq);
+		}
+		m->queue = PQ_INACTIVE;
 		if (athead)
-			TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE].pl, m,
-			    pageq);
+			TAILQ_INSERT_HEAD(&pq->pq_pl, m, plinks.q);
 		else
-			TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m,
-			    pageq);
-		m->queue = PQ_INACTIVE;
-		cnt.v_inactive_count++;
-		vm_page_unlock_queues();
+			TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
+		vm_pagequeue_cnt_inc(pq);
+		vm_pagequeue_unlock(pq);
 	}
 }
 
@@ -2089,9 +2536,9 @@
 {
 
 	vm_page_lock_assert(m, MA_OWNED);
-	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
-	if (m->dirty || m->hold_count || m->busy || m->wire_count ||
-	    (m->oflags & (VPO_BUSY | VPO_UNMANAGED)) != 0)
+	VM_OBJECT_ASSERT_WLOCKED(m->object);
+	if (m->dirty || m->hold_count || m->wire_count ||
+	    (m->oflags & VPO_UNMANAGED) != 0 || vm_page_busied(m))
 		return (0);
 	pmap_remove_all(m);
 	if (m->dirty)
@@ -2112,9 +2559,9 @@
 
 	vm_page_lock_assert(m, MA_OWNED);
 	if (m->object != NULL)
-		VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
-	if (m->dirty || m->hold_count || m->busy || m->wire_count ||
-	    (m->oflags & (VPO_BUSY | VPO_UNMANAGED)) != 0)
+		VM_OBJECT_ASSERT_WLOCKED(m->object);
+	if (m->dirty || m->hold_count || m->wire_count ||
+	    (m->oflags & VPO_UNMANAGED) != 0 || vm_page_busied(m))
 		return (0);
 	pmap_remove_all(m);
 	if (m->dirty)
@@ -2134,17 +2581,17 @@
 vm_page_cache(vm_page_t m)
 {
 	vm_object_t object;
-	vm_page_t next, prev, root;
+	boolean_t cache_was_empty;
 
 	vm_page_lock_assert(m, MA_OWNED);
 	object = m->object;
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
-	if ((m->oflags & (VPO_UNMANAGED | VPO_BUSY)) || m->busy ||
+	VM_OBJECT_ASSERT_WLOCKED(object);
+	if (vm_page_busied(m) || (m->oflags & VPO_UNMANAGED) ||
 	    m->hold_count || m->wire_count)
 		panic("vm_page_cache: attempting to cache busy page");
-	pmap_remove_all(m);
-	if (m->dirty != 0)
-		panic("vm_page_cache: page %p is dirty", m);
+	KASSERT(!pmap_page_is_mapped(m),
+	    ("vm_page_cache: page %p is mapped", m));
+	KASSERT(m->dirty == 0, ("vm_page_cache: page %p is dirty", m));
 	if (m->valid == 0 || object->type == OBJT_DEFAULT ||
 	    (object->type == OBJT_SWAP &&
 	    !vm_pager_has_page(object, m->pindex, NULL, NULL))) {
@@ -2158,53 +2605,17 @@
 	}
 	KASSERT((m->flags & PG_CACHED) == 0,
 	    ("vm_page_cache: page %p is already cached", m));
-	PCPU_INC(cnt.v_tcached);
 
 	/*
 	 * Remove the page from the paging queues.
 	 */
-	vm_pageq_remove(m);
+	vm_page_remque(m);
 
 	/*
 	 * Remove the page from the object's collection of resident
 	 * pages. 
 	 */
-	if ((next = TAILQ_NEXT(m, listq)) != NULL && next->left == m) {
-		/*
-		 * Since the page's successor in the list is also its parent
-		 * in the tree, its right subtree must be empty.
-		 */
-		next->left = m->left;
-		KASSERT(m->right == NULL,
-		    ("vm_page_cache: page %p has right child", m));
-	} else if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL &&
-	    prev->right == m) {
-		/*
-		 * Since the page's predecessor in the list is also its parent
-		 * in the tree, its left subtree must be empty.
-		 */
-		KASSERT(m->left == NULL,
-		    ("vm_page_cache: page %p has left child", m));
-		prev->right = m->right;
-	} else {
-		if (m != object->root)
-			vm_page_splay(m->pindex, object->root);
-		if (m->left == NULL)
-			root = m->right;
-		else if (m->right == NULL)
-			root = m->left;
-		else {
-			/*
-			 * Move the page's successor to the root, because
-			 * pages are usually removed in ascending order.
-			 */
-			if (m->right != next)
-				vm_page_splay(m->pindex, m->right);
-			next->left = m->left;
-			root = next;
-		}
-		object->root = root;
-	}
+	vm_radix_remove(&object->rtree, m->pindex);
 	TAILQ_REMOVE(&object->memq, m, listq);
 	object->resident_page_count--;
 
@@ -2220,27 +2631,28 @@
 	 */
 	m->flags &= ~PG_ZERO;
 	mtx_lock(&vm_page_queue_free_mtx);
+	cache_was_empty = vm_radix_is_empty(&object->cache);
+	if (vm_radix_insert(&object->cache, m)) {
+		mtx_unlock(&vm_page_queue_free_mtx);
+		if (object->type == OBJT_VNODE &&
+		    object->resident_page_count == 0)
+			vdrop(object->handle);
+		m->object = NULL;
+		vm_page_free(m);
+		return;
+	}
+
+	/*
+	 * The above call to vm_radix_insert() could reclaim the one pre-
+	 * existing cached page from this object, resulting in a call to
+	 * vdrop().
+	 */
+	if (!cache_was_empty)
+		cache_was_empty = vm_radix_is_singleton(&object->cache);
+
 	m->flags |= PG_CACHED;
 	cnt.v_cache_count++;
-	root = object->cache;
-	if (root == NULL) {
-		m->left = NULL;
-		m->right = NULL;
-	} else {
-		root = vm_page_splay(m->pindex, root);
-		if (m->pindex < root->pindex) {
-			m->left = root->left;
-			m->right = root;
-			root->left = NULL;
-		} else if (__predict_false(m->pindex == root->pindex))
-			panic("vm_page_cache: offset already cached");
-		else {
-			m->right = root->right;
-			m->left = root;
-			root->right = NULL;
-		}
-	}
-	object->cache = m;
+	PCPU_INC(cnt.v_tcached);
 #if VM_NRESERVLEVEL > 0
 	if (!vm_reserv_free_page(m)) {
 #else
@@ -2258,87 +2670,60 @@
 	 * the object's only resident page.
 	 */
 	if (object->type == OBJT_VNODE) {
-		if (root == NULL && object->resident_page_count != 0)
+		if (cache_was_empty && object->resident_page_count != 0)
 			vhold(object->handle);
-		else if (root != NULL && object->resident_page_count == 0)
+		else if (!cache_was_empty && object->resident_page_count == 0)
 			vdrop(object->handle);
 	}
 }
 
 /*
- * vm_page_dontneed
+ * vm_page_advise
  *
- *	Cache, deactivate, or do nothing as appropriate.  This routine
- *	is typically used by madvise() MADV_DONTNEED.
+ * 	Deactivate or do nothing, as appropriate.  This routine is used
+ * 	by madvise() and vop_stdadvise().
  *
- *	Generally speaking we want to move the page into the cache so
- *	it gets reused quickly.  However, this can result in a silly syndrome
- *	due to the page recycling too quickly.  Small objects will not be
- *	fully cached.  On the otherhand, if we move the page to the inactive
- *	queue we wind up with a problem whereby very large objects 
- *	unnecessarily blow away our inactive and cache queues.
- *
- *	The solution is to move the pages based on a fixed weighting.  We
- *	either leave them alone, deactivate them, or move them to the cache,
- *	where moving them to the cache has the highest weighting.
- *	By forcing some pages into other queues we eventually force the
- *	system to balance the queues, potentially recovering other unrelated
- *	space from active.  The idea is to not force this to happen too
- *	often.
- *
  *	The object and page must be locked.
  */
 void
-vm_page_dontneed(vm_page_t m)
+vm_page_advise(vm_page_t m, int advice)
 {
-	int dnw;
-	int head;
 
-	vm_page_lock_assert(m, MA_OWNED);
-	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
-	dnw = PCPU_GET(dnweight);
-	PCPU_INC(dnweight);
-
-	/*
-	 * Occasionally leave the page alone.
-	 */
-	if ((dnw & 0x01F0) == 0 || m->queue == PQ_INACTIVE) {
-		if (m->act_count >= ACT_INIT)
-			--m->act_count;
+	vm_page_assert_locked(m);
+	VM_OBJECT_ASSERT_WLOCKED(m->object);
+	if (advice == MADV_FREE)
+		/*
+		 * Mark the page clean.  This will allow the page to be freed
+		 * up by the system.  However, such pages are often reused
+		 * quickly by malloc() so we do not do anything that would
+		 * cause a page fault if we can help it.
+		 *
+		 * Specifically, we do not try to actually free the page now
+		 * nor do we try to put it in the cache (which would cause a
+		 * page fault on reuse).
+		 *
+		 * But we do make the page as freeable as we can without
+		 * actually taking the step of unmapping it.
+		 */
+		vm_page_undirty(m);
+	else if (advice != MADV_DONTNEED)
 		return;
-	}
 
 	/*
 	 * Clear any references to the page.  Otherwise, the page daemon will
 	 * immediately reactivate the page.
-	 *
-	 * Perform the pmap_clear_reference() first.  Otherwise, a concurrent
-	 * pmap operation, such as pmap_remove(), could clear a reference in
-	 * the pmap and set PGA_REFERENCED on the page before the
-	 * pmap_clear_reference() had completed.  Consequently, the page would
-	 * appear referenced based upon an old reference that occurred before
-	 * this function ran.
 	 */
-	pmap_clear_reference(m);
 	vm_page_aflag_clear(m, PGA_REFERENCED);
 
-	if (m->dirty == 0 && pmap_is_modified(m))
+	if (advice != MADV_FREE && m->dirty == 0 && pmap_is_modified(m))
 		vm_page_dirty(m);
 
-	if (m->dirty || (dnw & 0x0070) == 0) {
-		/*
-		 * Deactivate the page 3 times out of 32.
-		 */
-		head = 0;
-	} else {
-		/*
-		 * Cache the page 28 times out of every 32.  Note that
-		 * the page is deactivated instead of cached, but placed
-		 * at the head of the queue instead of the tail.
-		 */
-		head = 1;
-	}
-	_vm_page_deactivate(m, head);
+	/*
+	 * Place clean pages at the head of the inactive queue rather than the
+	 * tail, thus defeating the queue's LRU operation and ensuring that the
+	 * page will be reused quickly.
+	 */
+	_vm_page_deactivate(m, m->dirty == 0);
 }
 
 /*
@@ -2347,9 +2732,6 @@
  * to be in the object.  If the page doesn't exist, first allocate it
  * and then conditionally zero it.
  *
- * The caller must always specify the VM_ALLOC_RETRY flag.  This is intended
- * to facilitate its eventual removal.
- *
  * This routine may sleep.
  *
  * The object must be locked on entry.  The lock will, however, be released
@@ -2359,14 +2741,17 @@
 vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags)
 {
 	vm_page_t m;
+	int sleep;
 
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
-	KASSERT((allocflags & VM_ALLOC_RETRY) != 0,
-	    ("vm_page_grab: VM_ALLOC_RETRY is required"));
+	VM_OBJECT_ASSERT_WLOCKED(object);
+	KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 ||
+	    (allocflags & VM_ALLOC_IGN_SBUSY) != 0,
+	    ("vm_page_grab: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY mismatch"));
 retrylookup:
 	if ((m = vm_page_lookup(object, pindex)) != NULL) {
-		if ((m->oflags & VPO_BUSY) != 0 ||
-		    ((allocflags & VM_ALLOC_IGN_SBUSY) == 0 && m->busy != 0)) {
+		sleep = (allocflags & VM_ALLOC_IGN_SBUSY) != 0 ?
+		    vm_page_xbusied(m) : vm_page_busied(m);
+		if (sleep) {
 			/*
 			 * Reference the page before unlocking and
 			 * sleeping so that the page daemon is less
@@ -2373,7 +2758,11 @@
 			 * likely to reclaim it.
 			 */
 			vm_page_aflag_set(m, PGA_REFERENCED);
-			vm_page_sleep(m, "pgrbwt");
+			vm_page_lock(m);
+			VM_OBJECT_WUNLOCK(object);
+			vm_page_busy_sleep(m, "pgrbwt", (allocflags &
+			    VM_ALLOC_IGN_SBUSY) != 0);
+			VM_OBJECT_WLOCK(object);
 			goto retrylookup;
 		} else {
 			if ((allocflags & VM_ALLOC_WIRED) != 0) {
@@ -2381,17 +2770,19 @@
 				vm_page_wire(m);
 				vm_page_unlock(m);
 			}
-			if ((allocflags & VM_ALLOC_NOBUSY) == 0)
-				vm_page_busy(m);
+			if ((allocflags &
+			    (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) == 0)
+				vm_page_xbusy(m);
+			if ((allocflags & VM_ALLOC_SBUSY) != 0)
+				vm_page_sbusy(m);
 			return (m);
 		}
 	}
-	m = vm_page_alloc(object, pindex, allocflags & ~(VM_ALLOC_RETRY |
-	    VM_ALLOC_IGN_SBUSY));
+	m = vm_page_alloc(object, pindex, allocflags & ~VM_ALLOC_IGN_SBUSY);
 	if (m == NULL) {
-		VM_OBJECT_UNLOCK(object);
+		VM_OBJECT_WUNLOCK(object);
 		VM_WAIT;
-		VM_OBJECT_LOCK(object);
+		VM_OBJECT_WLOCK(object);
 		goto retrylookup;
 	} else if (m->valid != 0)
 		return (m);
@@ -2427,7 +2818,7 @@
 }
 
 /*
- *	vm_page_set_valid:
+ *	vm_page_set_valid_range:
  *
  *	Sets portions of a page valid.  The arguments are expected
  *	to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
@@ -2437,11 +2828,11 @@
  *	(base + size) must be less then or equal to PAGE_SIZE.
  */
 void
-vm_page_set_valid(vm_page_t m, int base, int size)
+vm_page_set_valid_range(vm_page_t m, int base, int size)
 {
 	int endoff, frag;
 
-	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
+	VM_OBJECT_ASSERT_WLOCKED(m->object);
 	if (size == 0)	/* handle degenerate case */
 		return;
 
@@ -2470,7 +2861,7 @@
 	 * is already dirty. 
 	 */
 	KASSERT((~m->valid & vm_page_bits(base, size) & m->dirty) == 0,
-	    ("vm_page_set_valid: page %p is dirty", m)); 
+	    ("vm_page_set_valid_range: page %p is dirty", m));
 
 	/*
 	 * Set valid bits inclusive of any overlap.
@@ -2490,12 +2881,12 @@
 #endif
 
 	/*
-	 * If the object is locked and the page is neither VPO_BUSY nor
+	 * If the object is locked and the page is neither exclusive busy nor
 	 * write mapped, then the page's dirty field cannot possibly be
 	 * set by a concurrent pmap operation.
 	 */
-	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
-	if ((m->oflags & VPO_BUSY) == 0 && !pmap_page_is_write_mapped(m))
+	VM_OBJECT_ASSERT_WLOCKED(m->object);
+	if (!vm_page_xbusied(m) && !pmap_page_is_write_mapped(m))
 		m->dirty &= ~pagebits;
 	else {
 		/*
@@ -2548,7 +2939,7 @@
 	vm_page_bits_t oldvalid, pagebits;
 	int endoff, frag;
 
-	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
+	VM_OBJECT_ASSERT_WLOCKED(m->object);
 	if (size == 0)	/* handle degenerate case */
 		return;
 
@@ -2637,14 +3028,20 @@
 vm_page_set_invalid(vm_page_t m, int base, int size)
 {
 	vm_page_bits_t bits;
+	vm_object_t object;
 
-	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
-	KASSERT((m->oflags & VPO_BUSY) == 0,
-	    ("vm_page_set_invalid: page %p is busy", m));
-	bits = vm_page_bits(base, size);
-	if (m->valid == VM_PAGE_BITS_ALL && bits != 0)
+	object = m->object;
+	VM_OBJECT_ASSERT_WLOCKED(object);
+	if (object->type == OBJT_VNODE && base == 0 && IDX_TO_OFF(m->pindex) +
+	    size >= object->un_pager.vnp.vnp_size)
+		bits = VM_PAGE_BITS_ALL;
+	else
+		bits = vm_page_bits(base, size);
+	if (object->ref_count != 0 && m->valid == VM_PAGE_BITS_ALL &&
+	    bits != 0)
 		pmap_remove_all(m);
-	KASSERT(!pmap_page_is_mapped(m),
+	KASSERT((bits == 0 && m->valid == VM_PAGE_BITS_ALL) ||
+	    !pmap_page_is_mapped(m),
 	    ("vm_page_set_invalid: page %p is mapped", m));
 	m->valid &= ~bits;
 	m->dirty &= ~bits;
@@ -2667,11 +3064,11 @@
 	int b;
 	int i;
 
-	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
+	VM_OBJECT_ASSERT_WLOCKED(m->object);
 	/*
 	 * Scan the valid bits looking for invalid sections that
-	 * must be zerod.  Invalid sub-DEV_BSIZE'd areas ( where the
-	 * valid bit may be set ) have already been zerod by
+	 * must be zeroed.  Invalid sub-DEV_BSIZE'd areas ( where the
+	 * valid bit may be set ) have already been zeroed by
 	 * vm_page_set_validclean().
 	 */
 	for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) {
@@ -2706,15 +3103,37 @@
 {
 	vm_page_bits_t bits;
 
-	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
+	VM_OBJECT_ASSERT_LOCKED(m->object);
 	bits = vm_page_bits(base, size);
-	if (m->valid && ((m->valid & bits) == bits))
-		return 1;
-	else
-		return 0;
+	return (m->valid != 0 && (m->valid & bits) == bits);
 }
 
 /*
+ *	vm_page_ps_is_valid:
+ *
+ *	Returns TRUE if the entire (super)page is valid and FALSE otherwise.
+ */
+boolean_t
+vm_page_ps_is_valid(vm_page_t m)
+{
+	int i, npages;
+
+	VM_OBJECT_ASSERT_LOCKED(m->object);
+	npages = atop(pagesizes[m->psind]);
+
+	/*
+	 * The physically contiguous pages that make up a superpage, i.e., a
+	 * page with a page size index ("psind") greater than zero, will
+	 * occupy adjacent entries in vm_page_array[].
+	 */
+	for (i = 0; i < npages; i++) {
+		if (m[i].valid != VM_PAGE_BITS_ALL)
+			return (FALSE);
+	}
+	return (TRUE);
+}
+
+/*
  * Set the page's dirty bits if the page is modified.
  */
 void
@@ -2721,7 +3140,7 @@
 vm_page_test_dirty(vm_page_t m)
 {
 
-	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
+	VM_OBJECT_ASSERT_WLOCKED(m->object);
 	if (m->dirty != VM_PAGE_BITS_ALL && pmap_is_modified(m))
 		vm_page_dirty(m);
 }
@@ -2749,127 +3168,51 @@
 
 #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT)
 void
-vm_page_lock_assert_KBI(vm_page_t m, int a, const char *file, int line)
+vm_page_assert_locked_KBI(vm_page_t m, const char *file, int line)
 {
 
-	mtx_assert_(vm_page_lockptr(m), a, file, line);
+	vm_page_lock_assert_KBI(m, MA_OWNED, file, line);
 }
-#endif
 
-int so_zerocp_fullpage = 0;
-
-/*
- *	Replace the given page with a copy.  The copied page assumes
- *	the portion of the given page's "wire_count" that is not the
- *	responsibility of this copy-on-write mechanism.
- *
- *	The object containing the given page must have a non-zero
- *	paging-in-progress count and be locked.
- */
 void
-vm_page_cowfault(vm_page_t m)
+vm_page_lock_assert_KBI(vm_page_t m, int a, const char *file, int line)
 {
-	vm_page_t mnew;
-	vm_object_t object;
-	vm_pindex_t pindex;
 
-	mtx_assert(&vm_page_queue_mtx, MA_NOTOWNED);
-	vm_page_lock_assert(m, MA_OWNED);
-	object = m->object;
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
-	KASSERT(object->paging_in_progress != 0,
-	    ("vm_page_cowfault: object %p's paging-in-progress count is zero.",
-	    object)); 
-	pindex = m->pindex;
-
- retry_alloc:
-	pmap_remove_all(m);
-	vm_page_remove(m);
-	mnew = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY);
-	if (mnew == NULL) {
-		vm_page_insert(m, object, pindex);
-		vm_page_unlock(m);
-		VM_OBJECT_UNLOCK(object);
-		VM_WAIT;
-		VM_OBJECT_LOCK(object);
-		if (m == vm_page_lookup(object, pindex)) {
-			vm_page_lock(m);
-			goto retry_alloc;
-		} else {
-			/*
-			 * Page disappeared during the wait.
-			 */
-			return;
-		}
-	}
-
-	if (m->cow == 0) {
-		/* 
-		 * check to see if we raced with an xmit complete when 
-		 * waiting to allocate a page.  If so, put things back 
-		 * the way they were 
-		 */
-		vm_page_unlock(m);
-		vm_page_lock(mnew);
-		vm_page_free(mnew);
-		vm_page_unlock(mnew);
-		vm_page_insert(m, object, pindex);
-	} else { /* clear COW & copy page */
-		if (!so_zerocp_fullpage)
-			pmap_copy_page(m, mnew);
-		mnew->valid = VM_PAGE_BITS_ALL;
-		vm_page_dirty(mnew);
-		mnew->wire_count = m->wire_count - m->cow;
-		m->wire_count = m->cow;
-		vm_page_unlock(m);
-	}
+	mtx_assert_(vm_page_lockptr(m), a, file, line);
 }
+#endif
 
-void 
-vm_page_cowclear(vm_page_t m)
+#ifdef INVARIANTS
+void
+vm_page_object_lock_assert(vm_page_t m)
 {
 
-	vm_page_lock_assert(m, MA_OWNED);
-	if (m->cow) {
-		m->cow--;
-		/* 
-		 * let vm_fault add back write permission  lazily
-		 */
-	} 
 	/*
-	 *  sf_buf_free() will free the page, so we needn't do it here
-	 */ 
+	 * Certain of the page's fields may only be modified by the
+	 * holder of the containing object's lock or the exclusive busy
+	 * holder.  Unfortunately, the holder of the write busy is
+	 * not recorded, and thus cannot be checked here.
+	 */
+	if (m->object != NULL && !vm_page_xbusied(m))
+		VM_OBJECT_ASSERT_WLOCKED(m->object);
 }
 
-int
-vm_page_cowsetup(vm_page_t m)
+void
+vm_page_assert_pga_writeable(vm_page_t m, uint8_t bits)
 {
 
-	vm_page_lock_assert(m, MA_OWNED);
-	if ((m->flags & PG_FICTITIOUS) != 0 ||
-	    (m->oflags & VPO_UNMANAGED) != 0 ||
-	    m->cow == USHRT_MAX - 1 || !VM_OBJECT_TRYLOCK(m->object))
-		return (EBUSY);
-	m->cow++;
-	pmap_remove_write(m);
-	VM_OBJECT_UNLOCK(m->object);
-	return (0);
-}
+	if ((bits & PGA_WRITEABLE) == 0)
+		return;
 
-#ifdef INVARIANTS
-void
-vm_page_object_lock_assert(vm_page_t m)
-{
-
 	/*
-	 * Certain of the page's fields may only be modified by the
-	 * holder of the containing object's lock or the setter of the
-	 * page's VPO_BUSY flag.  Unfortunately, the setter of the
-	 * VPO_BUSY flag is not recorded, and thus cannot be checked
-	 * here.
+	 * The PGA_WRITEABLE flag can only be set if the page is
+	 * managed, is exclusively busied or the object is locked.
+	 * Currently, this flag is only set by pmap_enter().
 	 */
-	if (m->object != NULL && (m->oflags & VPO_BUSY) == 0)
-		VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
+	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
+	    ("PGA_WRITEABLE on unmanaged page"));
+	if (!vm_page_xbusied(m))
+		VM_OBJECT_ASSERT_LOCKED(m->object);
 }
 #endif
 
@@ -2895,18 +3238,20 @@
 
 DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
 {
-		
-	db_printf("PQ_FREE:");
-	db_printf(" %d", cnt.v_free_count);
-	db_printf("\n");
-		
-	db_printf("PQ_CACHE:");
-	db_printf(" %d", cnt.v_cache_count);
-	db_printf("\n");
+	int dom;
 
-	db_printf("PQ_ACTIVE: %d, PQ_INACTIVE: %d\n",
-		*vm_page_queues[PQ_ACTIVE].cnt,
-		*vm_page_queues[PQ_INACTIVE].cnt);
+	db_printf("pq_free %d pq_cache %d\n",
+	    cnt.v_free_count, cnt.v_cache_count);
+	for (dom = 0; dom < vm_ndomains; dom++) {
+		db_printf(
+	"dom %d page_cnt %d free %d pq_act %d pq_inact %d pass %d\n",
+		    dom,
+		    vm_dom[dom].vmd_page_count,
+		    vm_dom[dom].vmd_free_count,
+		    vm_dom[dom].vmd_pagequeues[PQ_ACTIVE].pq_cnt,
+		    vm_dom[dom].vmd_pagequeues[PQ_INACTIVE].pq_cnt,
+		    vm_dom[dom].vmd_pass);
+	}
 }
 
 DB_SHOW_COMMAND(pginfo, vm_page_print_pginfo)
@@ -2926,9 +3271,9 @@
 		m = (vm_page_t)addr;
 	db_printf(
     "page %p obj %p pidx 0x%jx phys 0x%jx q %d hold %d wire %d\n"
-    "  af 0x%x of 0x%x f 0x%x act %d busy %d valid 0x%x dirty 0x%x\n",
+    "  af 0x%x of 0x%x f 0x%x act %d busy %x valid 0x%x dirty 0x%x\n",
 	    m, m->object, (uintmax_t)m->pindex, (uintmax_t)m->phys_addr,
 	    m->queue, m->hold_count, m->wire_count, m->aflags, m->oflags,
-	    m->flags, m->act_count, m->busy, m->valid, m->dirty);
+	    m->flags, m->act_count, m->busy_lock, m->valid, m->dirty);
 }
 #endif /* DDB */
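
The vm_page.c changes above retire the global page queue lock: each paging
queue now carries its own mutex and counter, and vm_page_enqueue(),
vm_page_dequeue() and vm_page_requeue() lock only the queue that the page
actually sits on, while the *_locked variants assume the caller already holds
that queue's mutex.  As a rough illustration of that discipline only, here is
a minimal userspace sketch in plain C with pthreads and <sys/queue.h>; the
struct and function names below are invented for the example and are not the
kernel's:

#include <pthread.h>
#include <stdio.h>
#include <sys/queue.h>

struct page {
	TAILQ_ENTRY(page) q;		/* stands in for plinks.q */
	int queue;			/* index of the current queue, or -1 */
};

/* Stands in for struct vm_pagequeue: the list, its own lock, and a count. */
struct pagequeue {
	pthread_mutex_t pq_mutex;
	TAILQ_HEAD(, page) pq_pl;
	int pq_cnt;
};

/* Like vm_page_enqueue(): lock only the target queue, insert at the tail. */
static void
page_enqueue(struct pagequeue *pq, struct page *m, int idx)
{
	pthread_mutex_lock(&pq->pq_mutex);
	m->queue = idx;
	TAILQ_INSERT_TAIL(&pq->pq_pl, m, q);
	pq->pq_cnt++;
	pthread_mutex_unlock(&pq->pq_mutex);
}

/* Like vm_page_dequeue(): lock the queue the page is on, then remove it. */
static void
page_dequeue(struct pagequeue *pq, struct page *m)
{
	pthread_mutex_lock(&pq->pq_mutex);
	m->queue = -1;
	TAILQ_REMOVE(&pq->pq_pl, m, q);
	pq->pq_cnt--;
	pthread_mutex_unlock(&pq->pq_mutex);
}

int
main(void)
{
	struct pagequeue inactive;
	struct page m;

	pthread_mutex_init(&inactive.pq_mutex, NULL);
	TAILQ_INIT(&inactive.pq_pl);
	inactive.pq_cnt = 0;
	m.queue = -1;

	page_enqueue(&inactive, &m, 0);
	printf("count after enqueue: %d\n", inactive.pq_cnt);
	page_dequeue(&inactive, &m);
	printf("count after dequeue: %d\n", inactive.pq_cnt);
	return (0);
}

In the real code the counter update is vm_pagequeue_cnt_inc()/_dec(), which
additionally adjusts the global vmmeter counter through the queue's pq_vcnt
pointer with an atomic add (see the vm_pagequeue_cnt_add() inline added to
vm_page.h below).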

Modified: trunk/sys/vm/vm_page.h
===================================================================
--- trunk/sys/vm/vm_page.h	2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm_page.h	2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
@@ -57,7 +58,7 @@
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  *
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/vm/vm_page.h 307672 2016-10-20 13:12:19Z kib $
  */
 
 /*
@@ -74,9 +75,9 @@
  *
  *	A small structure is kept for each resident
  *	page, indexed by page number.  Each structure
- *	is an element of several lists:
+ *	is an element of several collections:
  *
- *		A hash table bucket used to quickly
+ *		A radix tree used to quickly
  *		perform object/offset lookups
  *
  *		A list of all pages for a given object,
@@ -92,7 +93,7 @@
  *	In general, operations on this structure's mutable fields are
  *	synchronized using either one of or a combination of the lock on the
  *	object that the page belongs to (O), the pool lock for the page (P),
- *	or the lock for either the free or paging queues (Q).  If a field is
+ *	or the lock for either the free or paging queue (Q).  If a field is
  *	annotated below with two of these locks, then holding either lock is
  *	sufficient for read access, but both locks are required for write
  *	access.
@@ -111,8 +112,6 @@
  *	field is encapsulated in vm_page_clear_dirty_mask().
  */
 
-TAILQ_HEAD(pglist, vm_page);
-
 #if PAGE_SIZE == 4096
 #define VM_PAGE_BITS_ALL 0xffu
 typedef uint8_t vm_page_bits_t;
@@ -128,31 +127,38 @@
 #endif
 
 struct vm_page {
-	TAILQ_ENTRY(vm_page) pageq;	/* queue info for FIFO queue or free list (Q) */
-	TAILQ_ENTRY(vm_page) listq;	/* pages in same object (O) 	*/
-	struct vm_page *left;		/* splay tree link (O)		*/
-	struct vm_page *right;		/* splay tree link (O)		*/
-
-	vm_object_t object;		/* which object am I in (O,P)*/
+	union {
+		TAILQ_ENTRY(vm_page) q; /* page queue or free list (Q) */
+		struct {
+			SLIST_ENTRY(vm_page) ss; /* private slists */
+			void *pv;
+		} s;
+		struct {
+			u_long p;
+			u_long v;
+		} memguard;
+	} plinks;
+	TAILQ_ENTRY(vm_page) listq;	/* pages in same object (O) */
+	vm_object_t object;		/* which object am I in (O,P) */
 	vm_pindex_t pindex;		/* offset into object (O,P) */
 	vm_paddr_t phys_addr;		/* physical address of page */
 	struct md_page md;		/* machine dependant stuff */
+	u_int wire_count;		/* wired down maps refs (P) */
+	volatile u_int busy_lock;	/* busy owners lock */
+	uint16_t hold_count;		/* page hold count (P) */
+	uint16_t flags;			/* page PG_* flags (P) */
+	uint8_t aflags;			/* access is atomic */
+	uint8_t oflags;			/* page VPO_* flags (O) */
 	uint8_t	queue;			/* page queue index (P,Q) */
 	int8_t segind;
-	short hold_count;		/* page hold count (P) */
 	uint8_t	order;			/* index of the buddy queue */
 	uint8_t pool;
-	u_short cow;			/* page cow mapping count (P) */
-	u_int wire_count;		/* wired down maps refs (P) */
-	uint8_t aflags;			/* access is atomic */
-	uint8_t flags;			/* see below, often immutable after alloc */
-	u_short oflags;			/* page flags (O) */
-	u_char	act_count;		/* page usage count (O) */
-	u_char	busy;			/* page busy count (O) */
-	/* NOTE that these must support one bit per DEV_BSIZE in a page!!! */
+	u_char	act_count;		/* page usage count (P) */
+	/* NOTE that these must support one bit per DEV_BSIZE in a page */
 	/* so, on normal X86 kernels, they must be at least 8 bits wide */
 	vm_page_bits_t valid;		/* map of valid DEV_BSIZE chunks (O) */
 	vm_page_bits_t dirty;		/* map of dirty DEV_BSIZE chunks (M) */
+	int8_t psind;			/* pagesizes[] index (O) */
 };
 
 /*
@@ -169,33 +175,88 @@
  * 	 mappings, and such pages are also not on any PQ queue.
  *
  */
-#define	VPO_BUSY	0x0001	/* page is in transit */
-#define	VPO_WANTED	0x0002	/* someone is waiting for page */
-#define	VPO_UNMANAGED	0x0004		/* No PV management for page */
-#define	VPO_SWAPINPROG	0x0200	/* swap I/O in progress on page */
-#define	VPO_NOSYNC	0x0400	/* do not collect for syncer */
+#define	VPO_UNUSED01	0x01		/* --available-- */
+#define	VPO_SWAPSLEEP	0x02		/* waiting for swap to finish */
+#define	VPO_UNMANAGED	0x04		/* no PV management for page */
+#define	VPO_SWAPINPROG	0x08		/* swap I/O in progress on page */
+#define	VPO_NOSYNC	0x10		/* do not collect for syncer */
 
+/*
+ * Busy page implementation details.
+ * The algorithm is taken mostly from the rwlock(9) and sx(9) lock
+ * implementations, although support for owner identity is removed because
+ * of size constraints.  Checks on lock recursion are therefore not possible,
+ * and the effectiveness of the lock assertions is somewhat reduced.
+ */
+#define	VPB_BIT_SHARED		0x01
+#define	VPB_BIT_EXCLUSIVE	0x02
+#define	VPB_BIT_WAITERS		0x04
+#define	VPB_BIT_FLAGMASK						\
+	(VPB_BIT_SHARED | VPB_BIT_EXCLUSIVE | VPB_BIT_WAITERS)
+
+#define	VPB_SHARERS_SHIFT	3
+#define	VPB_SHARERS(x)							\
+	(((x) & ~VPB_BIT_FLAGMASK) >> VPB_SHARERS_SHIFT)
+#define	VPB_SHARERS_WORD(x)	((x) << VPB_SHARERS_SHIFT | VPB_BIT_SHARED)
+#define	VPB_ONE_SHARER		(1 << VPB_SHARERS_SHIFT)
+
+#define	VPB_SINGLE_EXCLUSIVER	VPB_BIT_EXCLUSIVE
+
+#define	VPB_UNBUSIED		VPB_SHARERS_WORD(0)
+
 #define	PQ_NONE		255
 #define	PQ_INACTIVE	0
 #define	PQ_ACTIVE	1
-#define	PQ_HOLD		2
-#define	PQ_COUNT	3
+#define	PQ_COUNT	2
 
-struct vpgqueues {
-	struct pglist pl;
-	int	*cnt;
+TAILQ_HEAD(pglist, vm_page);
+SLIST_HEAD(spglist, vm_page);
+
+struct vm_pagequeue {
+	struct mtx	pq_mutex;
+	struct pglist	pq_pl;
+	int		pq_cnt;
+	u_int		* const pq_vcnt;
+	const char	* const pq_name;
+} __aligned(CACHE_LINE_SIZE);
+
+
+struct vm_domain {
+	struct vm_pagequeue vmd_pagequeues[PQ_COUNT];
+	u_int vmd_page_count;
+	u_int vmd_free_count;
+	long vmd_segs;	/* bitmask of the segments */
+	boolean_t vmd_oom;
+	int vmd_pass;	/* local pagedaemon pass */
+	int vmd_oom_seq;
+	int vmd_last_active_scan;
+	struct vm_page vmd_marker; /* marker for pagedaemon private use */
 };
 
-extern struct vpgqueues vm_page_queues[PQ_COUNT];
+extern struct vm_domain vm_dom[MAXMEMDOM];
 
-struct vpglocks {
-	struct mtx	data;
-	char		pad[CACHE_LINE_SIZE - sizeof(struct mtx)];
-} __aligned(CACHE_LINE_SIZE);
+#define	vm_pagequeue_assert_locked(pq)	mtx_assert(&(pq)->pq_mutex, MA_OWNED)
+#define	vm_pagequeue_lock(pq)		mtx_lock(&(pq)->pq_mutex)
+#define	vm_pagequeue_unlock(pq)		mtx_unlock(&(pq)->pq_mutex)
 
-extern struct vpglocks vm_page_queue_free_lock;
-extern struct vpglocks pa_lock[];
+#ifdef _KERNEL
+static __inline void
+vm_pagequeue_cnt_add(struct vm_pagequeue *pq, int addend)
+{
 
+#ifdef notyet
+	vm_pagequeue_assert_locked(pq);
+#endif
+	pq->pq_cnt += addend;
+	atomic_add_int(pq->pq_vcnt, addend);
+}
+#define	vm_pagequeue_cnt_inc(pq)	vm_pagequeue_cnt_add((pq), 1)
+#define	vm_pagequeue_cnt_dec(pq)	vm_pagequeue_cnt_add((pq), -1)
+#endif	/* _KERNEL */
+
+extern struct mtx_padalign vm_page_queue_free_mtx;
+extern struct mtx_padalign pa_lock[];
+
 #if defined(__arm__)
 #define	PDRSHIFT	PDR_SHIFT
 #elif !defined(PDRSHIFT)
@@ -203,7 +264,7 @@
 #endif
 
 #define	pa_index(pa)	((pa) >> PDRSHIFT)
-#define	PA_LOCKPTR(pa)	&pa_lock[pa_index((pa)) % PA_LOCK_COUNT].data
+#define	PA_LOCKPTR(pa)	((struct mtx *)(&pa_lock[pa_index(pa) % PA_LOCK_COUNT]))
 #define	PA_LOCKOBJPTR(pa)	((struct lock_object *)PA_LOCKPTR((pa)))
 #define	PA_LOCK(pa)	mtx_lock(PA_LOCKPTR(pa))
 #define	PA_TRYLOCK(pa)	mtx_trylock(PA_LOCKPTR(pa))
@@ -222,35 +283,36 @@
 #define	vm_page_lock(m)		vm_page_lock_KBI((m), LOCK_FILE, LOCK_LINE)
 #define	vm_page_unlock(m)	vm_page_unlock_KBI((m), LOCK_FILE, LOCK_LINE)
 #define	vm_page_trylock(m)	vm_page_trylock_KBI((m), LOCK_FILE, LOCK_LINE)
+#else	/* !KLD_MODULE */
+#define	vm_page_lockptr(m)	(PA_LOCKPTR(VM_PAGE_TO_PHYS((m))))
+#define	vm_page_lock(m)		mtx_lock(vm_page_lockptr((m)))
+#define	vm_page_unlock(m)	mtx_unlock(vm_page_lockptr((m)))
+#define	vm_page_trylock(m)	mtx_trylock(vm_page_lockptr((m)))
+#endif
 #if defined(INVARIANTS)
+#define	vm_page_assert_locked(m)		\
+    vm_page_assert_locked_KBI((m), __FILE__, __LINE__)
 #define	vm_page_lock_assert(m, a)		\
     vm_page_lock_assert_KBI((m), (a), __FILE__, __LINE__)
 #else
+#define	vm_page_assert_locked(m)
 #define	vm_page_lock_assert(m, a)
 #endif
-#else	/* !KLD_MODULE */
-#define	vm_page_lockptr(m)	(PA_LOCKPTR(VM_PAGE_TO_PHYS((m))))
-#define	vm_page_lock(m)		mtx_lock(vm_page_lockptr((m)))
-#define	vm_page_unlock(m)	mtx_unlock(vm_page_lockptr((m)))
-#define	vm_page_trylock(m)	mtx_trylock(vm_page_lockptr((m)))
-#define	vm_page_lock_assert(m, a)	mtx_assert(vm_page_lockptr((m)), (a))
-#endif
 
-#define	vm_page_queue_free_mtx	vm_page_queue_free_lock.data
-
 /*
- * These are the flags defined for vm_page.
+ * The vm_page's aflags are updated using atomic operations.  To set or clear
+ * these flags, the functions vm_page_aflag_set() and vm_page_aflag_clear()
+ * must be used.  Neither these flags nor these functions are part of the KBI.
  *
- * aflags are updated by atomic accesses.  Use the vm_page_aflag_set()
- * and vm_page_aflag_clear() functions to set and clear the flags.
+ * PGA_REFERENCED may be cleared only if the page is locked.  It is set by
+ * both the MI and MD VM layers.  However, kernel loadable modules should not
+ * directly set this flag.  They should call vm_page_reference() instead.
  *
- * PGA_REFERENCED may be cleared only if the object containing the page is
- * locked.  It is set by both the MI and MD VM layers.
+ * PGA_WRITEABLE is set exclusively on managed pages by pmap_enter().
+ * When it does so, the object must be locked, or the page must be
+ * exclusive busied.  The MI VM layer must never access this flag
+ * directly.  Instead, it should call pmap_page_is_write_mapped().
  *
- * PGA_WRITEABLE is set exclusively on managed pages by pmap_enter().  When it
- * does so, the page must be VPO_BUSY.  The MI VM layer must never access this
- * flag directly.  Instead, it should call pmap_page_is_write_mapped().
- *
  * PGA_EXECUTABLE may be set by pmap routines, and indicates that a page has
  * at least one executable mapping.  It is not consumed by the MI VM layer.
  */
@@ -262,14 +324,14 @@
  * Page flags.  If changed at any other time than page allocation or
  * freeing, the modification must be protected by the vm_page lock.
  */
-#define	PG_CACHED	0x01		/* page is cached */
-#define	PG_FREE		0x02		/* page is free */
-#define	PG_FICTITIOUS	0x04		/* physical page doesn't exist */
-#define	PG_ZERO		0x08		/* page is zeroed */
-#define	PG_MARKER	0x10		/* special queue marker page */
-#define	PG_SLAB		0x20		/* object pointer is actually a slab */
-#define	PG_WINATCFLS	0x40		/* flush dirty page on inactive q */
-#define	PG_NODUMP	0x80		/* don't include this page in a dump */
+#define	PG_CACHED	0x0001		/* page is cached */
+#define	PG_FREE		0x0002		/* page is free */
+#define	PG_FICTITIOUS	0x0004		/* physical page doesn't exist */
+#define	PG_ZERO		0x0008		/* page is zeroed */
+#define	PG_MARKER	0x0010		/* special queue marker page */
+#define	PG_WINATCFLS	0x0040		/* flush dirty page on inactive q */
+#define	PG_NODUMP	0x0080		/* don't include this page in a dump */
+#define	PG_UNHOLDFREE	0x0100		/* delayed free of a held page */
 
 /*
  * Misc constants.
@@ -281,8 +343,12 @@
 
 #ifdef _KERNEL
 
+#include <sys/systm.h>
+
+#include <machine/atomic.h>
+
 /*
- * Each pageable resident page falls into one of five lists:
+ * Each pageable resident page falls into one of four lists:
  *
  *	free
  *		Available for allocation now.
@@ -291,10 +357,6 @@
  *		Almost available for allocation. Still associated with
  *		an object, but clean and immediately freeable.
  *
- *	hold
- *		Will become free after a pending I/O operation
- *		completes.
- *
  * The following lists are LRU sorted:
  *
  *	inactive
@@ -308,7 +370,6 @@
  *
  */
 
-struct vnode;
 extern int vm_page_zero_count;
 
 extern vm_page_t vm_page_array;		/* First resident page in table */
@@ -319,16 +380,8 @@
 
 #define VM_PAGE_TO_PHYS(entry)	((entry)->phys_addr)
 
-vm_page_t vm_phys_paddr_to_vm_page(vm_paddr_t pa);
-
 vm_page_t PHYS_TO_VM_PAGE(vm_paddr_t pa);
 
-extern struct vpglocks vm_page_queue_lock;
-
-#define	vm_page_queue_mtx	vm_page_queue_lock.data
-#define vm_page_lock_queues()   mtx_lock(&vm_page_queue_mtx)
-#define vm_page_unlock_queues() mtx_unlock(&vm_page_queue_mtx)
-
 /* page allocation classes: */
 #define VM_ALLOC_NORMAL		0
 #define VM_ALLOC_INTERRUPT	1
@@ -337,7 +390,6 @@
 /* page allocation flags: */
 #define	VM_ALLOC_WIRED		0x0020	/* non pageable */
 #define	VM_ALLOC_ZERO		0x0040	/* Try to obtain a zeroed page */
-#define	VM_ALLOC_RETRY		0x0080	/* Mandatory with vm_page_grab() */
 #define	VM_ALLOC_NOOBJ		0x0100	/* No associated object */
 #define	VM_ALLOC_NOBUSY		0x0200	/* Do not busy the page */
 #define	VM_ALLOC_IFCACHED	0x0400	/* Fail if the page is not cached */
@@ -344,61 +396,85 @@
 #define	VM_ALLOC_IFNOTCACHED	0x0800	/* Fail if the page is cached */
 #define	VM_ALLOC_IGN_SBUSY	0x1000	/* vm_page_grab() only */
 #define	VM_ALLOC_NODUMP		0x2000	/* don't include in dump */
+#define	VM_ALLOC_SBUSY		0x4000	/* Shared busy the page */
 
 #define	VM_ALLOC_COUNT_SHIFT	16
 #define	VM_ALLOC_COUNT(count)	((count) << VM_ALLOC_COUNT_SHIFT)
 
-void vm_page_aflag_set(vm_page_t m, uint8_t bits);
-void vm_page_aflag_clear(vm_page_t m, uint8_t bits);
-void vm_page_busy(vm_page_t m);
+#ifdef M_NOWAIT
+static inline int
+malloc2vm_flags(int malloc_flags)
+{
+	int pflags;
+
+	KASSERT((malloc_flags & M_USE_RESERVE) == 0 ||
+	    (malloc_flags & M_NOWAIT) != 0,
+	    ("M_USE_RESERVE requires M_NOWAIT"));
+	pflags = (malloc_flags & M_USE_RESERVE) != 0 ? VM_ALLOC_INTERRUPT :
+	    VM_ALLOC_SYSTEM;
+	if ((malloc_flags & M_ZERO) != 0)
+		pflags |= VM_ALLOC_ZERO;
+	if ((malloc_flags & M_NODUMP) != 0)
+		pflags |= VM_ALLOC_NODUMP;
+	return (pflags);
+}
+#endif
+
+void vm_page_busy_downgrade(vm_page_t m);
+void vm_page_busy_sleep(vm_page_t m, const char *msg, bool nonshared);
 void vm_page_flash(vm_page_t m);
-void vm_page_io_start(vm_page_t m);
-void vm_page_io_finish(vm_page_t m);
 void vm_page_hold(vm_page_t mem);
 void vm_page_unhold(vm_page_t mem);
 void vm_page_free(vm_page_t m);
 void vm_page_free_zero(vm_page_t m);
-void vm_page_dirty(vm_page_t m);
-void vm_page_wakeup(vm_page_t m);
 
-void vm_pageq_remove(vm_page_t m);
-
 void vm_page_activate (vm_page_t);
+void vm_page_advise(vm_page_t m, int advice);
 vm_page_t vm_page_alloc (vm_object_t, vm_pindex_t, int);
+vm_page_t vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req,
+    u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
+    vm_paddr_t boundary, vm_memattr_t memattr);
 vm_page_t vm_page_alloc_freelist(int, int);
-struct vnode *vm_page_alloc_init(vm_page_t);
 vm_page_t vm_page_grab (vm_object_t, vm_pindex_t, int);
 void vm_page_cache(vm_page_t);
 void vm_page_cache_free(vm_object_t, vm_pindex_t, vm_pindex_t);
-void vm_page_cache_remove(vm_page_t);
 void vm_page_cache_transfer(vm_object_t, vm_pindex_t, vm_object_t);
 int vm_page_try_to_cache (vm_page_t);
 int vm_page_try_to_free (vm_page_t);
-void vm_page_dontneed(vm_page_t);
 void vm_page_deactivate (vm_page_t);
+void vm_page_dequeue(vm_page_t m);
+void vm_page_dequeue_locked(vm_page_t m);
 vm_page_t vm_page_find_least(vm_object_t, vm_pindex_t);
 vm_page_t vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr);
 void vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr);
-void vm_page_insert (vm_page_t, vm_object_t, vm_pindex_t);
+int vm_page_insert (vm_page_t, vm_object_t, vm_pindex_t);
 boolean_t vm_page_is_cached(vm_object_t object, vm_pindex_t pindex);
 vm_page_t vm_page_lookup (vm_object_t, vm_pindex_t);
 vm_page_t vm_page_next(vm_page_t m);
 int vm_page_pa_tryrelock(pmap_t, vm_paddr_t, vm_paddr_t *);
+struct vm_pagequeue *vm_page_pagequeue(vm_page_t m);
 vm_page_t vm_page_prev(vm_page_t m);
+boolean_t vm_page_ps_is_valid(vm_page_t m);
 void vm_page_putfake(vm_page_t m);
 void vm_page_readahead_finish(vm_page_t m);
 void vm_page_reference(vm_page_t m);
 void vm_page_remove (vm_page_t);
-void vm_page_rename (vm_page_t, vm_object_t, vm_pindex_t);
+int vm_page_rename (vm_page_t, vm_object_t, vm_pindex_t);
+vm_page_t vm_page_replace(vm_page_t mnew, vm_object_t object,
+    vm_pindex_t pindex);
 void vm_page_requeue(vm_page_t m);
-void vm_page_set_valid(vm_page_t m, int base, int size);
-void vm_page_sleep(vm_page_t m, const char *msg);
-vm_page_t vm_page_splay(vm_pindex_t, vm_page_t);
+void vm_page_requeue_locked(vm_page_t m);
+int vm_page_sbusied(vm_page_t m);
+void vm_page_set_valid_range(vm_page_t m, int base, int size);
+int vm_page_sleep_if_busy(vm_page_t m, const char *msg);
 vm_offset_t vm_page_startup(vm_offset_t vaddr);
+void vm_page_sunbusy(vm_page_t m);
+int vm_page_trysbusy(vm_page_t m);
 void vm_page_unhold_pages(vm_page_t *ma, int count);
 void vm_page_unwire (vm_page_t, int);
 void vm_page_updatefake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr);
 void vm_page_wire (vm_page_t);
+void vm_page_xunbusy_hard(vm_page_t m);
 void vm_page_set_validclean (vm_page_t, int, int);
 void vm_page_clear_dirty (vm_page_t, int, int);
 void vm_page_set_invalid (vm_page_t, int, int);
@@ -408,46 +484,170 @@
 void vm_page_zero_invalid(vm_page_t m, boolean_t setvalid);
 void vm_page_free_toq(vm_page_t m);
 void vm_page_zero_idle_wakeup(void);
-void vm_page_cowfault (vm_page_t);
-int vm_page_cowsetup(vm_page_t);
-void vm_page_cowclear (vm_page_t);
 
+void vm_page_dirty_KBI(vm_page_t m);
 void vm_page_lock_KBI(vm_page_t m, const char *file, int line);
 void vm_page_unlock_KBI(vm_page_t m, const char *file, int line);
 int vm_page_trylock_KBI(vm_page_t m, const char *file, int line);
 #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT)
+void vm_page_assert_locked_KBI(vm_page_t m, const char *file, int line);
 void vm_page_lock_assert_KBI(vm_page_t m, int a, const char *file, int line);
 #endif
 
+#define	vm_page_assert_sbusied(m)					\
+	KASSERT(vm_page_sbusied(m),					\
+	    ("vm_page_assert_sbusied: page %p not shared busy @ %s:%d", \
+	    (void *)m, __FILE__, __LINE__));
+
+#define	vm_page_assert_unbusied(m)					\
+	KASSERT(!vm_page_busied(m),					\
+	    ("vm_page_assert_unbusied: page %p busy @ %s:%d",		\
+	    (void *)m, __FILE__, __LINE__));
+
+#define	vm_page_assert_xbusied(m)					\
+	KASSERT(vm_page_xbusied(m),					\
+	    ("vm_page_assert_xbusied: page %p not exclusive busy @ %s:%d", \
+	    (void *)m, __FILE__, __LINE__));
+
+#define	vm_page_busied(m)						\
+	((m)->busy_lock != VPB_UNBUSIED)
+
+#define	vm_page_sbusy(m) do {						\
+	if (!vm_page_trysbusy(m))					\
+		panic("%s: page %p failed shared busying", __func__, m);	\
+} while (0)
+
+#define	vm_page_tryxbusy(m)						\
+	(atomic_cmpset_acq_int(&m->busy_lock, VPB_UNBUSIED,		\
+	    VPB_SINGLE_EXCLUSIVER))
+
+#define	vm_page_xbusied(m)						\
+	((m->busy_lock & VPB_SINGLE_EXCLUSIVER) != 0)
+
+#define	vm_page_xbusy(m) do {						\
+	if (!vm_page_tryxbusy(m))					\
+		panic("%s: page %p failed exclusive busying", __func__,	\
+		    m);							\
+} while (0)
+
+#define	vm_page_xunbusy(m) do {						\
+	if (!atomic_cmpset_rel_int(&(m)->busy_lock,			\
+	    VPB_SINGLE_EXCLUSIVER, VPB_UNBUSIED))			\
+		vm_page_xunbusy_hard(m);				\
+} while (0)
+
 #ifdef INVARIANTS
 void vm_page_object_lock_assert(vm_page_t m);
 #define	VM_PAGE_OBJECT_LOCK_ASSERT(m)	vm_page_object_lock_assert(m)
+void vm_page_assert_pga_writeable(vm_page_t m, uint8_t bits);
+#define	VM_PAGE_ASSERT_PGA_WRITEABLE(m, bits)				\
+	vm_page_assert_pga_writeable(m, bits)
 #else
 #define	VM_PAGE_OBJECT_LOCK_ASSERT(m)	(void)0
+#define	VM_PAGE_ASSERT_PGA_WRITEABLE(m, bits)	(void)0
 #endif
 
 /*
- *	vm_page_sleep_if_busy:
+ * We want to use atomic updates for the aflags field, which is 8 bits wide.
+ * However, not all architectures support atomic operations on 8-bit
+ * destinations.  In order that we can easily use a 32-bit operation, we
+ * require that the aflags field be 32-bit aligned.
+ */
+CTASSERT(offsetof(struct vm_page, aflags) % sizeof(uint32_t) == 0);
+
+/*
+ *	Clear the given bits in the specified page.
+ */
+static inline void
+vm_page_aflag_clear(vm_page_t m, uint8_t bits)
+{
+	uint32_t *addr, val;
+
+	/*
+	 * The PGA_REFERENCED flag can only be cleared if the page is locked.
+	 */
+	if ((bits & PGA_REFERENCED) != 0)
+		vm_page_assert_locked(m);
+
+	/*
+	 * Access the whole 32-bit word containing the aflags field with an
+	 * atomic update.  Parallel non-atomic updates to the other fields
+	 * within this word are handled properly by the atomic update.
+	 */
+	addr = (void *)&m->aflags;
+	KASSERT(((uintptr_t)addr & (sizeof(uint32_t) - 1)) == 0,
+	    ("vm_page_aflag_clear: aflags is misaligned"));
+	val = bits;
+#if BYTE_ORDER == BIG_ENDIAN
+	val <<= 24;
+#endif
+	atomic_clear_32(addr, val);
+}
+
+/*
+ *	Set the given bits in the specified page.
+ */
+static inline void
+vm_page_aflag_set(vm_page_t m, uint8_t bits)
+{
+	uint32_t *addr, val;
+
+	VM_PAGE_ASSERT_PGA_WRITEABLE(m, bits);
+
+	/*
+	 * Access the whole 32-bit word containing the aflags field with an
+	 * atomic update.  Parallel non-atomic updates to the other fields
+	 * within this word are handled properly by the atomic update.
+	 */
+	addr = (void *)&m->aflags;
+	KASSERT(((uintptr_t)addr & (sizeof(uint32_t) - 1)) == 0,
+	    ("vm_page_aflag_set: aflags is misaligned"));
+	val = bits;
+#if BYTE_ORDER == BIG_ENDIAN
+	val <<= 24;
+#endif
+	atomic_set_32(addr, val);
+} 
+
+/*
+ *	vm_page_dirty:
  *
- *	Sleep and release the page queues lock if VPO_BUSY is set or,
- *	if also_m_busy is TRUE, busy is non-zero.  Returns TRUE if the
- *	thread slept and the page queues lock was released.
- *	Otherwise, retains the page queues lock and returns FALSE.
+ *	Set all bits in the page's dirty field.
  *
- *	The object containing the given page must be locked.
+ *	The object containing the specified page must be locked if the
+ *	call is made from the machine-independent layer.
+ *
+ *	See vm_page_clear_dirty_mask().
  */
-static __inline int
-vm_page_sleep_if_busy(vm_page_t m, int also_m_busy, const char *msg)
+static __inline void
+vm_page_dirty(vm_page_t m)
 {
 
-	if ((m->oflags & VPO_BUSY) || (also_m_busy && m->busy)) {
-		vm_page_sleep(m, msg);
-		return (TRUE);
-	}
-	return (FALSE);
+	/* Use vm_page_dirty_KBI() under INVARIANTS to save memory. */
+#if defined(KLD_MODULE) || defined(INVARIANTS)
+	vm_page_dirty_KBI(m);
+#else
+	m->dirty = VM_PAGE_BITS_ALL;
+#endif
 }
 
 /*
+ *	vm_page_remque:
+ *
+ *	If the given page is in a page queue, then remove it from that page
+ *	queue.
+ *
+ *	The page must be locked.
+ */
+static inline void
+vm_page_remque(vm_page_t m)
+{
+
+	if (m->queue != PQ_NONE)
+		vm_page_dequeue(m);
+}
+
+/*
  *	vm_page_undirty:
  *
  *	Set page to not be dirty.  Note: does not clear pmap modify bits
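
The vm_page.h hunk above folds the old VPO_BUSY flag and the separate busy
count into a single busy_lock word, encoded like a stripped-down rwlock: the
low three bits are the shared, exclusive and waiters flags, and the remaining
bits count the shared holders, so vm_page_busied() only has to compare the
word against VPB_UNBUSIED (which is just VPB_SHARERS_WORD(0)).  As a small
standalone illustration of the encoding only, the following sketch reuses the
VPB_* macros from the hunk and decodes a few sample words; the decode routine
itself is invented for the example:

#include <stdint.h>
#include <stdio.h>

/* Encoding copied from the vm_page.h hunk above. */
#define	VPB_BIT_SHARED		0x01
#define	VPB_BIT_EXCLUSIVE	0x02
#define	VPB_BIT_WAITERS		0x04
#define	VPB_BIT_FLAGMASK \
	(VPB_BIT_SHARED | VPB_BIT_EXCLUSIVE | VPB_BIT_WAITERS)
#define	VPB_SHARERS_SHIFT	3
#define	VPB_SHARERS(x)		(((x) & ~VPB_BIT_FLAGMASK) >> VPB_SHARERS_SHIFT)
#define	VPB_SHARERS_WORD(x)	((x) << VPB_SHARERS_SHIFT | VPB_BIT_SHARED)
#define	VPB_SINGLE_EXCLUSIVER	VPB_BIT_EXCLUSIVE
#define	VPB_UNBUSIED		VPB_SHARERS_WORD(0)

/* Print how a given busy_lock word is interpreted. */
static void
decode_busy_lock(uint32_t word)
{
	if (word == VPB_UNBUSIED)
		printf("0x%x: unbusied\n", word);
	else if (word & VPB_BIT_EXCLUSIVE)
		printf("0x%x: exclusive busy%s\n", word,
		    (word & VPB_BIT_WAITERS) ? ", waiters" : "");
	else
		printf("0x%x: shared busy, %u sharer(s)%s\n", word,
		    VPB_SHARERS(word),
		    (word & VPB_BIT_WAITERS) ? ", waiters" : "");
}

int
main(void)
{
	decode_busy_lock(VPB_UNBUSIED);			/* no holders */
	decode_busy_lock(VPB_SHARERS_WORD(2));		/* two shared holders */
	decode_busy_lock(VPB_SINGLE_EXCLUSIVER | VPB_BIT_WAITERS);
	return (0);
}

Because the owner of the exclusive busy state is not recorded in the word,
recursion cannot be detected, which is exactly the limitation the comment in
the header points out.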

Modified: trunk/sys/vm/vm_pageout.c
===================================================================
--- trunk/sys/vm/vm_pageout.c	2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm_pageout.c	2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1991 Regents of the University of California.
  * All rights reserved.
@@ -73,9 +74,10 @@
  */
 
 #include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/vm/vm_pageout.c 320550 2017-07-01 19:24:53Z alc $");
 
 #include "opt_vm.h"
+#include "opt_kdtrace.h"
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
@@ -89,9 +91,13 @@
 #include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
+#include <sys/sdt.h>
 #include <sys/signalvar.h>
+#include <sys/smp.h>
+#include <sys/time.h>
 #include <sys/vnode.h>
 #include <sys/vmmeter.h>
+#include <sys/rwlock.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 
@@ -102,6 +108,7 @@
 #include <vm/vm_map.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
+#include <vm/vm_phys.h>
 #include <vm/swap_pager.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
@@ -112,9 +119,15 @@
 
 /* the kernel process "vm_pageout"*/
 static void vm_pageout(void);
+static void vm_pageout_init(void);
 static int vm_pageout_clean(vm_page_t);
-static void vm_pageout_scan(int pass);
+static void vm_pageout_scan(struct vm_domain *vmd, int pass);
+static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage,
+    int starting_page_shortage);
 
+SYSINIT(pagedaemon_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, vm_pageout_init,
+    NULL);
+
 struct proc *pageproc;
 
 static struct kproc_desc page_kp = {
@@ -122,9 +135,13 @@
 	vm_pageout,
 	&pageproc
 };
-SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start,
+SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start,
     &page_kp);
 
+SDT_PROVIDER_DEFINE(vm);
+SDT_PROBE_DEFINE(vm, , , vm__lowmem_cache);
+SDT_PROBE_DEFINE(vm, , , vm__lowmem_scan);
+
 #if !defined(NO_SWAPPING)
 /* the kernel process "vm_daemon"*/
 static void vm_daemon(void);
@@ -142,6 +159,8 @@
 int vm_pages_needed;		/* Event on which pageout daemon sleeps */
 int vm_pageout_deficit;		/* Estimated number of pages deficit */
 int vm_pageout_pages_needed;	/* flag saying that the pageout daemon needs pages */
+int vm_pageout_wakeup_thresh;
+static int vm_pageout_oom_seq = 12;
 
 #if !defined(NO_SWAPPING)
 static int vm_pageout_req_swapout;	/* XXX */
@@ -151,35 +170,34 @@
 MTX_SYSINIT(vm_daemon, &vm_daemon_mtx, "vm daemon", MTX_DEF);
 #endif
 static int vm_max_launder = 32;
-static int vm_pageout_stats_max=0, vm_pageout_stats_interval = 0;
-static int vm_pageout_full_stats_interval = 0;
-static int vm_pageout_algorithm=0;
-static int defer_swap_pageouts=0;
-static int disable_swap_pageouts=0;
+static int vm_pageout_update_period;
+static int defer_swap_pageouts;
+static int disable_swap_pageouts;
+static int lowmem_period = 10;
+static time_t lowmem_uptime;
 
 #if defined(NO_SWAPPING)
-static int vm_swap_enabled=0;
-static int vm_swap_idle_enabled=0;
+static int vm_swap_enabled = 0;
+static int vm_swap_idle_enabled = 0;
 #else
-static int vm_swap_enabled=1;
-static int vm_swap_idle_enabled=0;
+static int vm_swap_enabled = 1;
+static int vm_swap_idle_enabled = 0;
 #endif
 
-SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, pageout_algorithm,
-	CTLFLAG_RW, &vm_pageout_algorithm, 0, "LRU page mgmt");
+SYSCTL_INT(_vm, OID_AUTO, pageout_wakeup_thresh,
+	CTLFLAG_RW, &vm_pageout_wakeup_thresh, 0,
+	"free page threshold for waking up the pageout daemon");
 
 SYSCTL_INT(_vm, OID_AUTO, max_launder,
 	CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout");
 
-SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max,
-	CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length");
+SYSCTL_INT(_vm, OID_AUTO, pageout_update_period,
+	CTLFLAG_RW, &vm_pageout_update_period, 0,
+	"Maximum active LRU update period");
+  
+SYSCTL_INT(_vm, OID_AUTO, lowmem_period, CTLFLAG_RW, &lowmem_period, 0,
+	"Low memory callback period");
 
-SYSCTL_INT(_vm, OID_AUTO, pageout_full_stats_interval,
-	CTLFLAG_RW, &vm_pageout_full_stats_interval, 0, "Interval for full stats scan");
-
-SYSCTL_INT(_vm, OID_AUTO, pageout_stats_interval,
-	CTLFLAG_RW, &vm_pageout_stats_interval, 0, "Interval for partial stats scan");
-
 #if defined(NO_SWAPPING)
 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
 	CTLFLAG_RD, &vm_swap_enabled, 0, "Enable entire process swapout");
@@ -202,6 +220,10 @@
 SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
 	CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");
 
+SYSCTL_INT(_vm, OID_AUTO, pageout_oom_seq,
+	CTLFLAG_RW, &vm_pageout_oom_seq, 0,
+	"back-to-back calls to oom detector to start OOM");
+
 #define VM_PAGEOUT_PAGE_COUNT 16
 int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT;
 
@@ -209,18 +231,21 @@
 SYSCTL_INT(_vm, OID_AUTO, max_wired,
 	CTLFLAG_RW, &vm_page_max_wired, 0, "System-wide limit to wired page count");
 
+static boolean_t vm_pageout_fallback_object_lock(vm_page_t, vm_page_t *);
+static boolean_t vm_pageout_launder(struct vm_pagequeue *pq, int, vm_paddr_t,
+    vm_paddr_t);
 #if !defined(NO_SWAPPING)
 static void vm_pageout_map_deactivate_pages(vm_map_t, long);
 static void vm_pageout_object_deactivate_pages(pmap_t, vm_object_t, long);
 static void vm_req_vmdaemon(int req);
 #endif
-static void vm_pageout_page_stats(void);
+static boolean_t vm_pageout_page_lock(vm_page_t, vm_page_t *);
 
 /*
  * Initialize a dummy page for marking the caller's place in the specified
  * paging queue.  In principle, this function only needs to set the flag
- * PG_MARKER.  Nonetheless, it sets the flag VPO_BUSY and initializes the hold
- * count to one as safety precautions.
+ * PG_MARKER.  Nonetheless, it write busies and initializes the hold count
+ * to one as safety precautions.
  */ 
 static void
 vm_pageout_init_marker(vm_page_t marker, u_short queue)
@@ -228,7 +253,7 @@
 
 	bzero(marker, sizeof(*marker));
 	marker->flags = PG_MARKER;
-	marker->oflags = VPO_BUSY;
+	marker->busy_lock = VPB_SINGLE_EXCLUSIVER;
 	marker->queue = queue;
 	marker->hold_count = 1;
 }
@@ -236,9 +261,9 @@
 /*
  * vm_pageout_fallback_object_lock:
  * 
- * Lock vm object currently associated with `m'. VM_OBJECT_TRYLOCK is
+ * Lock vm object currently associated with `m'. VM_OBJECT_TRYWLOCK is
  * known to have failed and page queue must be either PQ_ACTIVE or
- * PQ_INACTIVE.  To avoid lock order violation, unlock the page queues
+ * PQ_INACTIVE.  To avoid lock order violation, unlock the page queue
  * while locking the vm object.  Use marker page to detect page queue
  * changes and maintain notion of next page on page queue.  Return
  * TRUE if no changes were detected, FALSE otherwise.  vm object is
@@ -247,10 +272,11 @@
  * This function depends on both the lock portion of struct vm_object
  * and normal struct vm_page being type stable.
  */
-boolean_t
+static boolean_t
 vm_pageout_fallback_object_lock(vm_page_t m, vm_page_t *next)
 {
 	struct vm_page marker;
+	struct vm_pagequeue *pq;
 	boolean_t unchanged;
 	u_short queue;
 	vm_object_t object;
@@ -257,23 +283,32 @@
 
 	queue = m->queue;
 	vm_pageout_init_marker(&marker, queue);
+	pq = vm_page_pagequeue(m);
 	object = m->object;
 	
-	TAILQ_INSERT_AFTER(&vm_page_queues[queue].pl,
-			   m, &marker, pageq);
-	vm_page_unlock_queues();
+	TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, plinks.q);
+	vm_pagequeue_unlock(pq);
 	vm_page_unlock(m);
-	VM_OBJECT_LOCK(object);
+	VM_OBJECT_WLOCK(object);
 	vm_page_lock(m);
-	vm_page_lock_queues();
+	vm_pagequeue_lock(pq);
 
-	/* Page queue might have changed. */
-	*next = TAILQ_NEXT(&marker, pageq);
-	unchanged = (m->queue == queue &&
-		     m->object == object &&
-		     &marker == TAILQ_NEXT(m, pageq));
-	TAILQ_REMOVE(&vm_page_queues[queue].pl,
-		     &marker, pageq);
+	/*
+	 * The page's object might have changed, and/or the page might
+	 * have moved from its original position in the queue.  If the
+	 * page's object has changed, then the caller should abandon
+	 * processing the page because the wrong object lock was
+	 * acquired.  Use the marker's plinks.q, not the page's, to
+	 * determine if the page has been moved.  The state of the
+	 * page's plinks.q can be indeterminate; whereas, the marker's
+	 * plinks.q must be valid.
+	 */
+	*next = TAILQ_NEXT(&marker, plinks.q);
+	unchanged = m->object == object &&
+	    m == TAILQ_PREV(&marker, pglist, plinks.q);
+	KASSERT(!unchanged || m->queue == queue,
+	    ("page %p queue %d %d", m, queue, m->queue));
+	TAILQ_REMOVE(&pq->pq_pl, &marker, plinks.q);
 	return (unchanged);
 }
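
The marker dance above is the core trick of these helpers: a dummy PG_MARKER page is inserted after the page of interest before the queue lock is dropped, and once the lock is retaken the marker's neighbours (not the page's own, possibly stale, links) decide whether the scan position is still valid. A self-contained userland model of the pattern, with invented struct and function names (only the TAILQ usage mirrors the kernel code):

#include <sys/queue.h>
#include <stdbool.h>
#include <stdio.h>

struct page {
	TAILQ_ENTRY(page) plinks;
	int id;
};
TAILQ_HEAD(pagelist, page);

/*
 * Insert a marker after "m", pretend to drop and retake the queue lock,
 * then use the marker's links to compute "next" and to decide whether
 * "m" is still immediately in front of the marker.
 */
static bool
scan_step(struct pagelist *pl, struct page *m, struct page **next)
{
	struct page marker = { .id = -1 };
	bool unchanged;

	TAILQ_INSERT_AFTER(pl, m, &marker, plinks);
	/* ...the queue lock is dropped and retaken here in the real code... */
	*next = TAILQ_NEXT(&marker, plinks);
	unchanged = (m == TAILQ_PREV(&marker, pagelist, plinks));
	TAILQ_REMOVE(pl, &marker, plinks);
	return (unchanged);
}

int
main(void)
{
	struct pagelist pl = TAILQ_HEAD_INITIALIZER(pl);
	struct page a = { .id = 1 }, b = { .id = 2 }, *next;
	bool unchanged;

	TAILQ_INSERT_TAIL(&pl, &a, plinks);
	TAILQ_INSERT_TAIL(&pl, &b, plinks);
	unchanged = scan_step(&pl, &a, &next);
	printf("unchanged=%d next=%d\n", (int)unchanged,
	    next != NULL ? next->id : -1);
	return (0);
}

Built against a stock sys/queue.h this prints unchanged=1 next=2: the page is still directly in front of the marker, so the scan may continue with its successor.
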
 
@@ -286,31 +321,33 @@
  *
  * This function depends on normal struct vm_page being type stable.
  */
-boolean_t
+static boolean_t
 vm_pageout_page_lock(vm_page_t m, vm_page_t *next)
 {
 	struct vm_page marker;
+	struct vm_pagequeue *pq;
 	boolean_t unchanged;
 	u_short queue;
 
 	vm_page_lock_assert(m, MA_NOTOWNED);
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
-
 	if (vm_page_trylock(m))
 		return (TRUE);
 
 	queue = m->queue;
 	vm_pageout_init_marker(&marker, queue);
+	pq = vm_page_pagequeue(m);
 
-	TAILQ_INSERT_AFTER(&vm_page_queues[queue].pl, m, &marker, pageq);
-	vm_page_unlock_queues();
+	TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, plinks.q);
+	vm_pagequeue_unlock(pq);
 	vm_page_lock(m);
-	vm_page_lock_queues();
+	vm_pagequeue_lock(pq);
 
 	/* Page queue might have changed. */
-	*next = TAILQ_NEXT(&marker, pageq);
-	unchanged = (m->queue == queue && &marker == TAILQ_NEXT(m, pageq));
-	TAILQ_REMOVE(&vm_page_queues[queue].pl, &marker, pageq);
+	*next = TAILQ_NEXT(&marker, plinks.q);
+	unchanged = m == TAILQ_PREV(&marker, pglist, plinks.q);
+	KASSERT(!unchanged || m->queue == queue,
+	    ("page %p queue %d %d", m, queue, m->queue));
+	TAILQ_REMOVE(&pq->pq_pl, &marker, plinks.q);
 	return (unchanged);
 }
 
@@ -334,7 +371,7 @@
 
 	vm_page_lock_assert(m, MA_OWNED);
 	object = m->object;
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+	VM_OBJECT_ASSERT_WLOCKED(object);
 
 	/*
 	 * It doesn't cost us anything to pageout OBJT_DEFAULT or OBJT_SWAP
@@ -348,8 +385,7 @@
 	/*
 	 * Can't clean the page if it's busy or held.
 	 */
-	KASSERT(m->busy == 0 && (m->oflags & VPO_BUSY) == 0,
-	    ("vm_pageout_clean: page %p is busy", m));
+	vm_page_assert_unbusied(m);
 	KASSERT(m->hold_count == 0, ("vm_pageout_clean: page %p is held", m));
 	vm_page_unlock(m);
 
@@ -387,15 +423,17 @@
 			break;
 		}
 
-		if ((p = vm_page_prev(pb)) == NULL ||
-		    (p->oflags & VPO_BUSY) != 0 || p->busy != 0) {
+		if ((p = vm_page_prev(pb)) == NULL || vm_page_busied(p)) {
 			ib = 0;
 			break;
 		}
+		vm_page_test_dirty(p);
+		if (p->dirty == 0) {
+			ib = 0;
+			break;
+		}
 		vm_page_lock(p);
-		vm_page_test_dirty(p);
-		if (p->dirty == 0 ||
-		    p->queue != PQ_INACTIVE ||
+		if (p->queue != PQ_INACTIVE ||
 		    p->hold_count != 0) {	/* may be undergoing I/O */
 			vm_page_unlock(p);
 			ib = 0;
@@ -417,13 +455,13 @@
 	    pindex + is < object->size) {
 		vm_page_t p;
 
-		if ((p = vm_page_next(ps)) == NULL ||
-		    (p->oflags & VPO_BUSY) != 0 || p->busy != 0)
+		if ((p = vm_page_next(ps)) == NULL || vm_page_busied(p))
 			break;
+		vm_page_test_dirty(p);
+		if (p->dirty == 0)
+			break;
 		vm_page_lock(p);
-		vm_page_test_dirty(p);
-		if (p->dirty == 0 ||
-		    p->queue != PQ_INACTIVE ||
+		if (p->queue != PQ_INACTIVE ||
 		    p->hold_count != 0) {	/* may be undergoing I/O */
 			vm_page_unlock(p);
 			break;
@@ -472,8 +510,7 @@
 	int numpagedout = 0;
 	int i, runlen;
 
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
-	mtx_assert(&vm_page_queue_mtx, MA_NOTOWNED);
+	VM_OBJECT_ASSERT_WLOCKED(object);
 
 	/*
 	 * Initiate I/O.  Bump the vm_page_t->busy counter and
@@ -489,7 +526,7 @@
 		KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL,
 		    ("vm_pageout_flush: partially invalid page %p index %d/%d",
 			mc[i], i, count));
-		vm_page_io_start(mc[i]);
+		vm_page_sbusy(mc[i]);
 		pmap_remove_write(mc[i]);
 	}
 	vm_object_pip_add(object, count);
@@ -545,7 +582,7 @@
 		 */
 		if (pageout_status[i] != VM_PAGER_PEND) {
 			vm_object_pip_wakeup(object);
-			vm_page_io_finish(mt);
+			vm_page_sunbusy(mt);
 			if (vm_page_count_severe()) {
 				vm_page_lock(mt);
 				vm_page_try_to_cache(mt);
@@ -558,6 +595,170 @@
 	return (numpagedout);
 }
 
+static boolean_t
+vm_pageout_launder(struct vm_pagequeue *pq, int tries, vm_paddr_t low,
+    vm_paddr_t high)
+{
+	struct mount *mp;
+	struct vnode *vp;
+	vm_object_t object;
+	vm_paddr_t pa;
+	vm_page_t m, m_tmp, next;
+	int lockmode;
+
+	vm_pagequeue_lock(pq);
+	TAILQ_FOREACH_SAFE(m, &pq->pq_pl, plinks.q, next) {
+		if ((m->flags & PG_MARKER) != 0)
+			continue;
+		pa = VM_PAGE_TO_PHYS(m);
+		if (pa < low || pa + PAGE_SIZE > high)
+			continue;
+		if (!vm_pageout_page_lock(m, &next) || m->hold_count != 0) {
+			vm_page_unlock(m);
+			continue;
+		}
+		object = m->object;
+		if ((!VM_OBJECT_TRYWLOCK(object) &&
+		    (!vm_pageout_fallback_object_lock(m, &next) ||
+		    m->hold_count != 0)) || vm_page_busied(m)) {
+			vm_page_unlock(m);
+			VM_OBJECT_WUNLOCK(object);
+			continue;
+		}
+		vm_page_test_dirty(m);
+		if (m->dirty == 0 && object->ref_count != 0)
+			pmap_remove_all(m);
+		if (m->dirty != 0) {
+			vm_page_unlock(m);
+			if (tries == 0 || (object->flags & OBJ_DEAD) != 0) {
+				VM_OBJECT_WUNLOCK(object);
+				continue;
+			}
+			if (object->type == OBJT_VNODE) {
+				vm_pagequeue_unlock(pq);
+				vp = object->handle;
+				vm_object_reference_locked(object);
+				VM_OBJECT_WUNLOCK(object);
+				(void)vn_start_write(vp, &mp, V_WAIT);
+				lockmode = MNT_SHARED_WRITES(vp->v_mount) ?
+				    LK_SHARED : LK_EXCLUSIVE;
+				vn_lock(vp, lockmode | LK_RETRY);
+				VM_OBJECT_WLOCK(object);
+				vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
+				VM_OBJECT_WUNLOCK(object);
+				VOP_UNLOCK(vp, 0);
+				vm_object_deallocate(object);
+				vn_finished_write(mp);
+				return (TRUE);
+			} else if (object->type == OBJT_SWAP ||
+			    object->type == OBJT_DEFAULT) {
+				vm_pagequeue_unlock(pq);
+				m_tmp = m;
+				vm_pageout_flush(&m_tmp, 1, VM_PAGER_PUT_SYNC,
+				    0, NULL, NULL);
+				VM_OBJECT_WUNLOCK(object);
+				return (TRUE);
+			}
+		} else {
+			/*
+			 * Dequeue here to prevent lock recursion in
+			 * vm_page_cache().
+			 */
+			vm_page_dequeue_locked(m);
+			vm_page_cache(m);
+			vm_page_unlock(m);
+		}
+		VM_OBJECT_WUNLOCK(object);
+	}
+	vm_pagequeue_unlock(pq);
+	return (FALSE);
+}
+
+/*
+ * Increase the number of cached pages.  The specified value, "tries",
+ * determines which categories of pages are cached:
+ *
+ *  0: All clean, inactive pages within the specified physical address range
+ *     are cached.  Will not sleep.
+ *  1: The vm_lowmem handlers are called.  All inactive pages within
+ *     the specified physical address range are cached.  May sleep.
+ *  2: The vm_lowmem handlers are called.  All inactive and active pages
+ *     within the specified physical address range are cached.  May sleep.
+ */
+void
+vm_pageout_grow_cache(int tries, vm_paddr_t low, vm_paddr_t high)
+{
+	int actl, actmax, inactl, inactmax, dom, initial_dom;
+	static int start_dom = 0;
+
+	if (tries > 0) {
+		/*
+		 * Decrease registered cache sizes.  The vm_lowmem handlers
+		 * may acquire locks and/or sleep, so they can only be invoked
+		 * when "tries" is greater than zero.
+		 */
+		SDT_PROBE0(vm, , , vm__lowmem_cache);
+		EVENTHANDLER_INVOKE(vm_lowmem, 0);
+
+		/*
+		 * We do this explicitly after the caches have been drained
+		 * above.
+		 */
+		uma_reclaim();
+	}
+
+	/*
+	 * Make the next scan start on the next domain.
+	 */
+	initial_dom = atomic_fetchadd_int(&start_dom, 1) % vm_ndomains;
+
+	inactl = 0;
+	inactmax = cnt.v_inactive_count;
+	actl = 0;
+	actmax = tries < 2 ? 0 : cnt.v_active_count;
+	dom = initial_dom;
+
+	/*
+	 * Scan domains in round-robin order, first inactive queues,
+	 * then active.  Since a domain usually owns a large physically
+	 * contiguous chunk of memory, it makes sense to completely
+	 * exhaust one domain before switching to the next, while growing
+	 * the pool of contiguous physical pages.
+	 *
+	 * Do not even start laundering a domain that cannot contain
+	 * the specified address range, as indicated by the segments
+	 * constituting the domain.
+	 */
+again_inact:
+	if (inactl < inactmax) {
+		if (vm_phys_domain_intersects(vm_dom[dom].vmd_segs,
+		    low, high) &&
+		    vm_pageout_launder(&vm_dom[dom].vmd_pagequeues[PQ_INACTIVE],
+		    tries, low, high)) {
+			inactl++;
+			goto again_inact;
+		}
+		if (++dom == vm_ndomains)
+			dom = 0;
+		if (dom != initial_dom)
+			goto again_inact;
+	}
+again_act:
+	if (actl < actmax) {
+		if (vm_phys_domain_intersects(vm_dom[dom].vmd_segs,
+		    low, high) &&
+		    vm_pageout_launder(&vm_dom[dom].vmd_pagequeues[PQ_ACTIVE],
+		      tries, low, high)) {
+			actl++;
+			goto again_act;
+		}
+		if (++dom == vm_ndomains)
+			dom = 0;
+		if (dom != initial_dom)
+			goto again_act;
+	}
+}
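
vm_pageout_grow_cache() is intended to be driven with increasing values of "tries" by callers that need pages in a specific physical range and can tolerate progressively more expensive reclamation. A hedged sketch of such a caller is below; try_alloc_contig() is an invented placeholder for whatever allocation is being retried, not a real kernel routine.

/*
 * Sketch of a caller: retry a physically contiguous allocation, asking
 * the pageout code to cache more pages in [low, high) between attempts.
 * tries == 0 never sleeps; tries >= 1 also runs the vm_lowmem handlers
 * and may sleep; tries == 2 launders active pages as well.
 */
static void *
alloc_contig_retry(vm_paddr_t low, vm_paddr_t high)
{
	void *p;
	int tries;

	for (tries = 0; tries < 3; tries++) {
		if ((p = try_alloc_contig(low, high)) != NULL)	/* invented */
			return (p);
		vm_pageout_grow_cache(tries, low, high);
	}
	return (NULL);
}
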
+
 #if !defined(NO_SWAPPING)
 /*
  *	vm_pageout_object_deactivate_pages
@@ -573,17 +774,17 @@
 {
 	vm_object_t backing_object, object;
 	vm_page_t p;
-	int actcount, remove_mode;
+	int act_delta, remove_mode;
 
-	VM_OBJECT_LOCK_ASSERT(first_object, MA_OWNED);
-	if (first_object->type == OBJT_DEVICE ||
-	    first_object->type == OBJT_SG)
+	VM_OBJECT_ASSERT_LOCKED(first_object);
+	if ((first_object->flags & OBJ_FICTITIOUS) != 0)
 		return;
 	for (object = first_object;; object = backing_object) {
 		if (pmap_resident_count(pmap) <= desired)
 			goto unlock_return;
-		VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
-		if (object->type == OBJT_PHYS || object->paging_in_progress)
+		VM_OBJECT_ASSERT_LOCKED(object);
+		if ((object->flags & OBJ_UNMANAGED) != 0 ||
+		    object->paging_in_progress != 0)
 			goto unlock_return;
 
 		remove_mode = 0;
@@ -595,7 +796,7 @@
 		TAILQ_FOREACH(p, &object->memq, listq) {
 			if (pmap_resident_count(pmap) <= desired)
 				goto unlock_return;
-			if ((p->oflags & VPO_BUSY) != 0 || p->busy != 0)
+			if (vm_page_busied(p))
 				continue;
 			PCPU_INC(cnt.v_pdpages);
 			vm_page_lock(p);
@@ -604,37 +805,30 @@
 				vm_page_unlock(p);
 				continue;
 			}
-			actcount = pmap_ts_referenced(p);
+			act_delta = pmap_ts_referenced(p);
 			if ((p->aflags & PGA_REFERENCED) != 0) {
-				if (actcount == 0)
-					actcount = 1;
+				if (act_delta == 0)
+					act_delta = 1;
 				vm_page_aflag_clear(p, PGA_REFERENCED);
 			}
-			if (p->queue != PQ_ACTIVE && actcount != 0) {
+			if (p->queue != PQ_ACTIVE && act_delta != 0) {
 				vm_page_activate(p);
-				p->act_count += actcount;
+				p->act_count += act_delta;
 			} else if (p->queue == PQ_ACTIVE) {
-				if (actcount == 0) {
+				if (act_delta == 0) {
 					p->act_count -= min(p->act_count,
 					    ACT_DECLINE);
-					if (!remove_mode &&
-					    (vm_pageout_algorithm ||
-					    p->act_count == 0)) {
+					if (!remove_mode && p->act_count == 0) {
 						pmap_remove_all(p);
 						vm_page_deactivate(p);
-					} else {
-						vm_page_lock_queues();
+					} else
 						vm_page_requeue(p);
-						vm_page_unlock_queues();
-					}
 				} else {
 					vm_page_activate(p);
 					if (p->act_count < ACT_MAX -
 					    ACT_ADVANCE)
 						p->act_count += ACT_ADVANCE;
-					vm_page_lock_queues();
 					vm_page_requeue(p);
-					vm_page_unlock_queues();
 				}
 			} else if (p->queue == PQ_INACTIVE)
 				pmap_remove_all(p);
@@ -642,13 +836,13 @@
 		}
 		if ((backing_object = object->backing_object) == NULL)
 			goto unlock_return;
-		VM_OBJECT_LOCK(backing_object);
+		VM_OBJECT_RLOCK(backing_object);
 		if (object != first_object)
-			VM_OBJECT_UNLOCK(object);
+			VM_OBJECT_RUNLOCK(object);
 	}
 unlock_return:
 	if (object != first_object)
-		VM_OBJECT_UNLOCK(object);
+		VM_OBJECT_RUNLOCK(object);
 }
 
 /*
@@ -678,15 +872,15 @@
 	while (tmpe != &map->header) {
 		if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
 			obj = tmpe->object.vm_object;
-			if (obj != NULL && VM_OBJECT_TRYLOCK(obj)) {
+			if (obj != NULL && VM_OBJECT_TRYRLOCK(obj)) {
 				if (obj->shadow_count <= 1 &&
 				    (bigobj == NULL ||
 				     bigobj->resident_page_count < obj->resident_page_count)) {
 					if (bigobj != NULL)
-						VM_OBJECT_UNLOCK(bigobj);
+						VM_OBJECT_RUNLOCK(bigobj);
 					bigobj = obj;
 				} else
-					VM_OBJECT_UNLOCK(obj);
+					VM_OBJECT_RUNLOCK(obj);
 			}
 		}
 		if (tmpe->wired_count > 0)
@@ -696,7 +890,7 @@
 
 	if (bigobj != NULL) {
 		vm_pageout_object_deactivate_pages(map->pmap, bigobj, desired);
-		VM_OBJECT_UNLOCK(bigobj);
+		VM_OBJECT_RUNLOCK(bigobj);
 	}
 	/*
 	 * Next, hunt around for other pages to deactivate.  We actually
@@ -709,15 +903,23 @@
 		if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
 			obj = tmpe->object.vm_object;
 			if (obj != NULL) {
-				VM_OBJECT_LOCK(obj);
+				VM_OBJECT_RLOCK(obj);
 				vm_pageout_object_deactivate_pages(map->pmap, obj, desired);
-				VM_OBJECT_UNLOCK(obj);
+				VM_OBJECT_RUNLOCK(obj);
 			}
 		}
 		tmpe = tmpe->next;
 	}
 
+#ifdef __ia64__
 	/*
+	 * Remove all non-wired, managed mappings if a process is swapped out.
+	 * This will free page table pages.
+	 */
+	if (desired == 0)
+		pmap_remove_pages(map->pmap);
+#else
+	/*
 	 * Remove all mappings if a process is swapped out, this will free page
 	 * table pages.
 	 */
@@ -725,6 +927,8 @@
 		pmap_remove(vm_map_pmap(map), vm_map_min(map),
 		    vm_map_max(map));
 	}
+#endif
+
 	vm_map_unlock(map);
 }
 #endif		/* !defined(NO_SWAPPING) */
@@ -731,52 +935,63 @@
 
 /*
  *	vm_pageout_scan does the dirty work for the pageout daemon.
+ *
+ *	pass 0 - Update active LRU/deactivate pages
+ *	pass 1 - Move inactive to cache or free
+ *	pass 2 - Launder dirty pages
  */
 static void
-vm_pageout_scan(int pass)
+vm_pageout_scan(struct vm_domain *vmd, int pass)
 {
 	vm_page_t m, next;
-	struct vm_page marker;
-	int page_shortage, maxscan, pcount;
-	int addl_page_shortage;
+	struct vm_pagequeue *pq;
 	vm_object_t object;
-	int actcount;
+	long min_scan;
+	int act_delta, addl_page_shortage, deficit, maxscan, page_shortage;
 	int vnodes_skipped = 0;
-	int maxlaunder;
-	boolean_t queues_locked;
+	int maxlaunder, scan_tick, scanned, starting_page_shortage;
+	int lockmode;
+	boolean_t queue_locked;
 
 	/*
-	 * Decrease registered cache sizes.
+	 * If we need to reclaim memory, ask kernel caches to return
+	 * some.  We rate limit to avoid thrashing.
 	 */
-	EVENTHANDLER_INVOKE(vm_lowmem, 0);
-	/*
-	 * We do this explicitly after the caches have been drained above.
-	 */
-	uma_reclaim();
+	if (vmd == &vm_dom[0] && pass > 0 &&
+	    (time_uptime - lowmem_uptime) >= lowmem_period) {
+		/*
+		 * Decrease registered cache sizes.
+		 */
+		SDT_PROBE0(vm, , , vm__lowmem_scan);
+		EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_PAGES);
+		/*
+		 * We do this explicitly after the caches have been
+		 * drained above.
+		 */
+		uma_reclaim();
+		lowmem_uptime = time_uptime;
+	}
 
 	/*
 	 * The addl_page_shortage is the number of temporarily
 	 * stuck pages in the inactive queue.  In other words, the
-	 * number of pages from cnt.v_inactive_count that should be
+	 * number of pages from the inactive count that should be
 	 * discounted in setting the target for the active queue scan.
 	 */
-	addl_page_shortage = atomic_readandclear_int(&vm_pageout_deficit);
+	addl_page_shortage = 0;
 
 	/*
 	 * Calculate the number of pages we want to either free or move
 	 * to the cache.
 	 */
-	page_shortage = vm_paging_target() + addl_page_shortage;
+	if (pass > 0) {
+		deficit = atomic_readandclear_int(&vm_pageout_deficit);
+		page_shortage = vm_paging_target() + deficit;
+	} else
+		page_shortage = deficit = 0;
+	starting_page_shortage = page_shortage;
 
-	vm_pageout_init_marker(&marker, PQ_INACTIVE);
-
 	/*
-	 * Start scanning the inactive queue for pages we can move to the
-	 * cache or free.  The scan will stop when the target is reached or
-	 * we have scanned the entire inactive queue.  Note that m->act_count
-	 * is not used to form decisions for the inactive queue, only for the
-	 * active queue.
-	 *
 	 * maxlaunder limits the number of dirty pages we flush per scan.
 	 * For most systems a smaller value (16 or 32) is more robust under
 	 * extreme memory and disk pressure because any unnecessary writes
@@ -788,21 +1003,29 @@
 	 */
 	if ((maxlaunder = vm_max_launder) <= 1)
 		maxlaunder = 1;
-	if (pass)
+	if (pass > 1)
 		maxlaunder = 10000;
-	vm_page_lock_queues();
-	queues_locked = TRUE;
-	maxscan = cnt.v_inactive_count;
 
-	for (m = TAILQ_FIRST(&vm_page_queues[PQ_INACTIVE].pl);
+	/*
+	 * Start scanning the inactive queue for pages we can move to the
+	 * cache or free.  The scan will stop when the target is reached or
+	 * we have scanned the entire inactive queue.  Note that m->act_count
+	 * is not used to form decisions for the inactive queue, only for the
+	 * active queue.
+	 */
+	pq = &vmd->vmd_pagequeues[PQ_INACTIVE];
+	maxscan = pq->pq_cnt;
+	vm_pagequeue_lock(pq);
+	queue_locked = TRUE;
+	for (m = TAILQ_FIRST(&pq->pq_pl);
 	     m != NULL && maxscan-- > 0 && page_shortage > 0;
 	     m = next) {
-		KASSERT(queues_locked, ("unlocked queues"));
-		mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+		vm_pagequeue_assert_locked(pq);
+		KASSERT(queue_locked, ("unlocked inactive queue"));
 		KASSERT(m->queue == PQ_INACTIVE, ("Inactive queue %p", m));
 
-		cnt.v_pdpages++;
-		next = TAILQ_NEXT(m, pageq);
+		PCPU_INC(cnt.v_pdpages);
+		next = TAILQ_NEXT(m, plinks.q);
 
 		/*
 		 * skip marker pages
@@ -826,10 +1049,10 @@
 			continue;
 		}
 		object = m->object;
-		if (!VM_OBJECT_TRYLOCK(object) &&
+		if (!VM_OBJECT_TRYWLOCK(object) &&
 		    !vm_pageout_fallback_object_lock(m, &next)) {
 			vm_page_unlock(m);
-			VM_OBJECT_UNLOCK(object);
+			VM_OBJECT_WUNLOCK(object);
 			continue;
 		}
 
@@ -840,105 +1063,87 @@
 		 * pages, because they may leave the inactive queue
 		 * shortly after page scan is finished.
 		 */
-		if (m->busy != 0 || (m->oflags & VPO_BUSY) != 0) {
+		if (vm_page_busied(m)) {
 			vm_page_unlock(m);
-			VM_OBJECT_UNLOCK(object);
+			VM_OBJECT_WUNLOCK(object);
 			addl_page_shortage++;
 			continue;
 		}
 
 		/*
-		 * We unlock vm_page_queue_mtx, invalidating the
+		 * We unlock the inactive page queue, invalidating the
 		 * 'next' pointer.  Use our marker to remember our
 		 * place.
 		 */
-		TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE].pl,
-		    m, &marker, pageq);
-		vm_page_unlock_queues();
-		queues_locked = FALSE;
+		TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_marker, plinks.q);
+		vm_pagequeue_unlock(pq);
+		queue_locked = FALSE;
 
 		/*
-		 * If the object is not being used, we ignore previous 
+		 * We bump the activation count if the page has been
+		 * referenced while in the inactive queue.  This makes
+		 * it less likely that the page will be added back to the
+		 * inactive queue prematurely again.  Here we check the
+		 * page tables (or emulated bits, if any), because the upper
+		 * level VM system does not know anything about existing
+		 * references.
 		 */
-		if (object->ref_count == 0) {
+		act_delta = 0;
+		if ((m->aflags & PGA_REFERENCED) != 0) {
 			vm_page_aflag_clear(m, PGA_REFERENCED);
+			act_delta = 1;
+		}
+		if (object->ref_count != 0) {
+			act_delta += pmap_ts_referenced(m);
+		} else {
 			KASSERT(!pmap_page_is_mapped(m),
 			    ("vm_pageout_scan: page %p is mapped", m));
-
-		/*
-		 * Otherwise, if the page has been referenced while in the 
-		 * inactive queue, we bump the "activation count" upwards, 
-		 * making it less likely that the page will be added back to 
-		 * the inactive queue prematurely again.  Here we check the 
-		 * page tables (or emulated bits, if any), given the upper 
-		 * level VM system not knowing anything about existing 
-		 * references.
-		 */
-		} else if ((m->aflags & PGA_REFERENCED) == 0 &&
-		    (actcount = pmap_ts_referenced(m)) != 0) {
-			vm_page_activate(m);
-			vm_page_unlock(m);
-			m->act_count += actcount + ACT_ADVANCE;
-			VM_OBJECT_UNLOCK(object);
-			goto relock_queues;
 		}
 
 		/*
 		 * If the upper level VM system knows about any page 
-		 * references, we activate the page.  We also set the 
-		 * "activation count" higher than normal so that we will less 
-		 * likely place pages back onto the inactive queue again.
+		 * references, we reactivate the page or requeue it.
 		 */
-		if ((m->aflags & PGA_REFERENCED) != 0) {
-			vm_page_aflag_clear(m, PGA_REFERENCED);
-			actcount = pmap_ts_referenced(m);
-			vm_page_activate(m);
+		if (act_delta != 0) {
+			if (object->ref_count) {
+				vm_page_activate(m);
+				m->act_count += act_delta + ACT_ADVANCE;
+			} else {
+				vm_pagequeue_lock(pq);
+				queue_locked = TRUE;
+				vm_page_requeue_locked(m);
+			}
+			VM_OBJECT_WUNLOCK(object);
 			vm_page_unlock(m);
-			m->act_count += actcount + ACT_ADVANCE + 1;
-			VM_OBJECT_UNLOCK(object);
-			goto relock_queues;
+			goto relock_queue;
 		}
 
 		if (m->hold_count != 0) {
 			vm_page_unlock(m);
-			VM_OBJECT_UNLOCK(object);
+			VM_OBJECT_WUNLOCK(object);
 
 			/*
 			 * Held pages are essentially stuck in the
 			 * queue.  So, they ought to be discounted
-			 * from cnt.v_inactive_count.  See the
+			 * from the inactive count.  See the
 			 * calculation of the page_shortage for the
 			 * loop over the active queue below.
 			 */
 			addl_page_shortage++;
-			goto relock_queues;
+			goto relock_queue;
 		}
 
 		/*
-		 * If the upper level VM system does not believe that the page
-		 * is fully dirty, but it is mapped for write access, then we
-		 * consult the pmap to see if the page's dirty status should
-		 * be updated.
+		 * If the page appears to be clean at the machine-independent
+		 * layer, then remove all of its mappings from the pmap in
+		 * anticipation of placing it onto the cache queue.  If,
+		 * however, any of the page's mappings allow write access,
+		 * then the page may still be modified until the last of those
+		 * mappings is removed.
 		 */
-		if (m->dirty != VM_PAGE_BITS_ALL &&
-		    pmap_page_is_write_mapped(m)) {
-			/*
-			 * Avoid a race condition: Unless write access is
-			 * removed from the page, another processor could
-			 * modify it before all access is removed by the call
-			 * to vm_page_cache() below.  If vm_page_cache() finds
-			 * that the page has been modified when it removes all
-			 * access, it panics because it cannot cache dirty
-			 * pages.  In principle, we could eliminate just write
-			 * access here rather than all access.  In the expected
-			 * case, when there are no last instant modifications
-			 * to the page, removing all access will be cheaper
-			 * overall.
-			 */
-			if (pmap_is_modified(m))
-				vm_page_dirty(m);
-			else if (m->dirty == 0)
+		if (object->ref_count != 0) {
+			vm_page_test_dirty(m);
+			if (m->dirty == 0)
 				pmap_remove_all(m);
 		}
 
@@ -956,7 +1161,7 @@
 			 */
 			vm_page_cache(m);
 			--page_shortage;
-		} else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) {
+		} else if ((m->flags & PG_WINATCFLS) == 0 && pass < 2) {
 			/*
 			 * Dirty pages need to be paged out, but flushing
 			 * a page is extremely expensive versus freeing
@@ -970,9 +1175,9 @@
 			 * the thrash point for a heavily loaded machine.
 			 */
 			m->flags |= PG_WINATCFLS;
-			vm_page_lock_queues();
-			queues_locked = TRUE;
-			vm_page_requeue(m);
+			vm_pagequeue_lock(pq);
+			queue_locked = TRUE;
+			vm_page_requeue_locked(m);
 		} else if (maxlaunder > 0) {
 			/*
 			 * We always want to try to flush some dirty pages if
@@ -981,7 +1186,7 @@
 			 * pressure where there are insufficient clean pages
 			 * on the inactive queue, we may have to go all out.
 			 */
-			int swap_pageouts_ok, vfslocked = 0;
+			int swap_pageouts_ok;
 			struct vnode *vp = NULL;
 			struct mount *mp = NULL;
 
@@ -999,12 +1204,12 @@
 			 * Those objects are in a "rundown" state.
 			 */
 			if (!swap_pageouts_ok || (object->flags & OBJ_DEAD)) {
-				vm_page_lock_queues();
+				vm_pagequeue_lock(pq);
 				vm_page_unlock(m);
-				VM_OBJECT_UNLOCK(object);
-				queues_locked = TRUE;
-				vm_page_requeue(m);
-				goto relock_queues;
+				VM_OBJECT_WUNLOCK(object);
+				queue_locked = TRUE;
+				vm_page_requeue_locked(m);
+				goto relock_queue;
 			}
 
 			/*
@@ -1044,11 +1249,12 @@
 				KASSERT(mp != NULL,
 				    ("vp %p with NULL v_mount", vp));
 				vm_object_reference_locked(object);
-				VM_OBJECT_UNLOCK(object);
-				vfslocked = VFS_LOCK_GIANT(vp->v_mount);
-				if (vget(vp, LK_EXCLUSIVE | LK_TIMELOCK,
+				VM_OBJECT_WUNLOCK(object);
+				lockmode = MNT_SHARED_WRITES(vp->v_mount) ?
+				    LK_SHARED : LK_EXCLUSIVE;
+				if (vget(vp, lockmode | LK_TIMELOCK,
 				    curthread)) {
-					VM_OBJECT_LOCK(object);
+					VM_OBJECT_WLOCK(object);
 					++pageout_lock_miss;
 					if (object->flags & OBJ_MIGHTBEDIRTY)
 						vnodes_skipped++;
@@ -1055,10 +1261,10 @@
 					vp = NULL;
 					goto unlock_and_continue;
 				}
-				VM_OBJECT_LOCK(object);
+				VM_OBJECT_WLOCK(object);
 				vm_page_lock(m);
-				vm_page_lock_queues();
-				queues_locked = TRUE;
+				vm_pagequeue_lock(pq);
+				queue_locked = TRUE;
 				/*
 				 * The page might have been moved to another
 				 * queue during potential blocking in vget()
@@ -1067,7 +1273,7 @@
 				 */
 				if (m->queue != PQ_INACTIVE ||
 				    m->object != object ||
-				    TAILQ_NEXT(m, pageq) != &marker) {
+				    TAILQ_NEXT(m, plinks.q) != &vmd->vmd_marker) {
 					vm_page_unlock(m);
 					if (object->flags & OBJ_MIGHTBEDIRTY)
 						vnodes_skipped++;
@@ -1080,8 +1286,9 @@
 				 * page back onto the end of the queue so that
 				 * statistics are more correct if we don't.
 				 */
-				if (m->busy || (m->oflags & VPO_BUSY)) {
+				if (vm_page_busied(m)) {
 					vm_page_unlock(m);
+					addl_page_shortage++;
 					goto unlock_and_continue;
 				}
 
@@ -1089,15 +1296,15 @@
 				 * If the page has become held it might
 				 * be undergoing I/O, so skip it
 				 */
-				if (m->hold_count) {
+				if (m->hold_count != 0) {
 					vm_page_unlock(m);
-					vm_page_requeue(m);
+					addl_page_shortage++;
 					if (object->flags & OBJ_MIGHTBEDIRTY)
 						vnodes_skipped++;
 					goto unlock_and_continue;
 				}
-				vm_page_unlock_queues();
-				queues_locked = FALSE;
+				vm_pagequeue_unlock(pq);
+				queue_locked = FALSE;
 			}
 
 			/*
@@ -1116,60 +1323,93 @@
 			}
 unlock_and_continue:
 			vm_page_lock_assert(m, MA_NOTOWNED);
-			VM_OBJECT_UNLOCK(object);
+			VM_OBJECT_WUNLOCK(object);
 			if (mp != NULL) {
-				if (queues_locked) {
-					vm_page_unlock_queues();
-					queues_locked = FALSE;
+				if (queue_locked) {
+					vm_pagequeue_unlock(pq);
+					queue_locked = FALSE;
 				}
 				if (vp != NULL)
 					vput(vp);
-				VFS_UNLOCK_GIANT(vfslocked);
 				vm_object_deallocate(object);
 				vn_finished_write(mp);
 			}
 			vm_page_lock_assert(m, MA_NOTOWNED);
-			goto relock_queues;
+			goto relock_queue;
 		}
 		vm_page_unlock(m);
-		VM_OBJECT_UNLOCK(object);
-relock_queues:
-		if (!queues_locked) {
-			vm_page_lock_queues();
-			queues_locked = TRUE;
+		VM_OBJECT_WUNLOCK(object);
+relock_queue:
+		if (!queue_locked) {
+			vm_pagequeue_lock(pq);
+			queue_locked = TRUE;
 		}
-		next = TAILQ_NEXT(&marker, pageq);
-		TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl,
-		    &marker, pageq);
+		next = TAILQ_NEXT(&vmd->vmd_marker, plinks.q);
+		TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_marker, plinks.q);
 	}
+	vm_pagequeue_unlock(pq);
 
+#if !defined(NO_SWAPPING)
 	/*
+	 * Wakeup the swapout daemon if we didn't cache or free the targeted
+	 * number of pages. 
+	 */
+	if (vm_swap_enabled && page_shortage > 0)
+		vm_req_vmdaemon(VM_SWAP_NORMAL);
+#endif
+
+	/*
+	 * Wakeup the sync daemon if we skipped a vnode in a writeable object
+	 * and we didn't cache or free enough pages.
+	 */
+	if (vnodes_skipped > 0 && page_shortage > cnt.v_free_target -
+	    cnt.v_free_min)
+		(void)speedup_syncer();
+
+	/*
+	 * If the inactive queue scan fails repeatedly to meet its
+	 * target, kill the largest process.
+	 */
+	vm_pageout_mightbe_oom(vmd, page_shortage, starting_page_shortage);
+
+	/*
 	 * Compute the number of pages we want to try to move from the
 	 * active queue to the inactive queue.
 	 */
-	page_shortage = vm_paging_target() +
-		cnt.v_inactive_target - cnt.v_inactive_count;
-	page_shortage += addl_page_shortage;
+	page_shortage = cnt.v_inactive_target - cnt.v_inactive_count +
+	    vm_paging_target() + deficit + addl_page_shortage;
 
+	pq = &vmd->vmd_pagequeues[PQ_ACTIVE];
+	vm_pagequeue_lock(pq);
+	maxscan = pq->pq_cnt;
+
 	/*
-	 * Scan the active queue for things we can deactivate. We nominally
-	 * track the per-page activity counter and use it to locate
-	 * deactivation candidates.
+	 * If we're just idle polling attempt to visit every
+	 * active page within 'update_period' seconds.
 	 */
-	pcount = cnt.v_active_count;
-	m = TAILQ_FIRST(&vm_page_queues[PQ_ACTIVE].pl);
-	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
+	scan_tick = ticks;
+	if (vm_pageout_update_period != 0) {
+		min_scan = pq->pq_cnt;
+		min_scan *= scan_tick - vmd->vmd_last_active_scan;
+		min_scan /= hz * vm_pageout_update_period;
+	} else
+		min_scan = 0;
+	if (min_scan > 0 || (page_shortage > 0 && maxscan > 0))
+		vmd->vmd_last_active_scan = scan_tick;
 
-	while ((m != NULL) && (pcount-- > 0) && (page_shortage > 0)) {
-
+	/*
+	 * Scan the active queue for pages that can be deactivated.  Update
+	 * the per-page activity counter and use it to identify deactivation
+	 * candidates.  Held pages may be deactivated.
+	 */
+	for (m = TAILQ_FIRST(&pq->pq_pl), scanned = 0; m != NULL && (scanned <
+	    min_scan || (page_shortage > 0 && scanned < maxscan)); m = next,
+	    scanned++) {
 		KASSERT(m->queue == PQ_ACTIVE,
 		    ("vm_pageout_scan: page %p isn't active", m));
-
-		next = TAILQ_NEXT(m, pageq);
-		if ((m->flags & PG_MARKER) != 0) {
-			m = next;
+		next = TAILQ_NEXT(m, plinks.q);
+		if ((m->flags & PG_MARKER) != 0)
 			continue;
-		}
 		KASSERT((m->flags & PG_FICTITIOUS) == 0,
 		    ("Fictitious page %p cannot be in active queue", m));
 		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
@@ -1176,89 +1416,65 @@
 		    ("Unmanaged page %p cannot be in active queue", m));
 		if (!vm_pageout_page_lock(m, &next)) {
 			vm_page_unlock(m);
-			m = next;
 			continue;
 		}
-		object = m->object;
-		if (!VM_OBJECT_TRYLOCK(object) &&
-		    !vm_pageout_fallback_object_lock(m, &next)) {
-			VM_OBJECT_UNLOCK(object);
-			vm_page_unlock(m);
-			m = next;
-			continue;
-		}
 
 		/*
-		 * Don't deactivate pages that are busy.
+		 * The count for page daemon pages is updated after checking
+		 * the page for eligibility.
 		 */
-		if ((m->busy != 0) ||
-		    (m->oflags & VPO_BUSY) ||
-		    (m->hold_count != 0)) {
-			vm_page_unlock(m);
-			VM_OBJECT_UNLOCK(object);
-			vm_page_requeue(m);
-			m = next;
-			continue;
-		}
+		PCPU_INC(cnt.v_pdpages);
 
 		/*
-		 * The count for pagedaemon pages is done after checking the
-		 * page for eligibility...
-		 */
-		cnt.v_pdpages++;
-
-		/*
 		 * Check to see "how much" the page has been used.
 		 */
-		actcount = 0;
-		if (object->ref_count != 0) {
-			if (m->aflags & PGA_REFERENCED) {
-				actcount += 1;
-			}
-			actcount += pmap_ts_referenced(m);
-			if (actcount) {
-				m->act_count += ACT_ADVANCE + actcount;
-				if (m->act_count > ACT_MAX)
-					m->act_count = ACT_MAX;
-			}
+		act_delta = 0;
+		if (m->aflags & PGA_REFERENCED) {
+			vm_page_aflag_clear(m, PGA_REFERENCED);
+			act_delta += 1;
 		}
-
 		/*
-		 * Since we have "tested" this bit, we need to clear it now.
+		 * Perform an unsynchronized object ref count check.  While
+		 * the page lock ensures that the page is not reallocated to
+		 * another object, in particular, one with unmanaged mappings
+		 * that cannot support pmap_ts_referenced(), two races are,
+		 * nonetheless, possible:
+		 * 1) The count was transitioning to zero, but we saw a non-
+		 *    zero value.  pmap_ts_referenced() will return zero
+		 *    because the page is not mapped.
+		 * 2) The count was transitioning to one, but we saw zero. 
+		 *    This race delays the detection of a new reference.  At
+		 *    worst, we will deactivate and reactivate the page.
 		 */
-		vm_page_aflag_clear(m, PGA_REFERENCED);
+		if (m->object->ref_count != 0)
+			act_delta += pmap_ts_referenced(m);
 
 		/*
-		 * Only if an object is currently being used, do we use the
-		 * page activation count stats.
+		 * Advance or decay the act_count based on recent usage.
 		 */
-		if (actcount && (object->ref_count != 0)) {
-			vm_page_requeue(m);
+		if (act_delta) {
+			m->act_count += ACT_ADVANCE + act_delta;
+			if (m->act_count > ACT_MAX)
+				m->act_count = ACT_MAX;
 		} else {
 			m->act_count -= min(m->act_count, ACT_DECLINE);
-			if (vm_pageout_algorithm ||
-			    object->ref_count == 0 ||
-			    m->act_count == 0) {
-				page_shortage--;
-				if (object->ref_count == 0) {
-					KASSERT(!pmap_page_is_mapped(m),
-				    ("vm_pageout_scan: page %p is mapped", m));
-					if (m->dirty == 0)
-						vm_page_cache(m);
-					else
-						vm_page_deactivate(m);
-				} else {
-					vm_page_deactivate(m);
-				}
-			} else {
-				vm_page_requeue(m);
-			}
+			act_delta = m->act_count;
 		}
+
+		/*
+		 * Move this page to the tail of the active or inactive
+		 * queue depending on usage.
+		 */
+		if (act_delta == 0) {
+			/* Dequeue to avoid later lock recursion. */
+			vm_page_dequeue_locked(m);
+			vm_page_deactivate(m);
+			page_shortage--;
+		} else
+			vm_page_requeue_locked(m);
 		vm_page_unlock(m);
-		VM_OBJECT_UNLOCK(object);
-		m = next;
 	}
-	vm_page_unlock_queues();
+	vm_pagequeue_unlock(pq);
 #if !defined(NO_SWAPPING)
 	/*
 	 * Idle process swapout -- run once per second.
@@ -1271,35 +1487,124 @@
 		}
 	}
 #endif
-		
+}
+
+static int vm_pageout_oom_vote;
+
+/*
+ * The pagedaemon threads randomly select one to perform the
+ * OOM.  Trying to kill processes before all pagedaemons have
+ * failed to reach the free target is premature.
+ */
+static void
+vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage,
+    int starting_page_shortage)
+{
+	int old_vote;
+
+	if (starting_page_shortage <= 0 || starting_page_shortage !=
+	    page_shortage)
+		vmd->vmd_oom_seq = 0;
+	else
+		vmd->vmd_oom_seq++;
+	if (vmd->vmd_oom_seq < vm_pageout_oom_seq) {
+		if (vmd->vmd_oom) {
+			vmd->vmd_oom = FALSE;
+			atomic_subtract_int(&vm_pageout_oom_vote, 1);
+		}
+		return;
+	}
+
 	/*
-	 * If we didn't get enough free pages, and we have skipped a vnode
-	 * in a writeable object, wakeup the sync daemon.  And kick swapout
-	 * if we did not get enough free pages.
+	 * Do not follow the call sequence until OOM condition is
+	 * cleared.
 	 */
-	if (vm_paging_target() > 0) {
-		if (vnodes_skipped && vm_page_count_min())
-			(void) speedup_syncer();
-#if !defined(NO_SWAPPING)
-		if (vm_swap_enabled && vm_page_count_target())
-			vm_req_vmdaemon(VM_SWAP_NORMAL);
-#endif
-	}
+	vmd->vmd_oom_seq = 0;
 
+	if (vmd->vmd_oom)
+		return;
+
+	vmd->vmd_oom = TRUE;
+	old_vote = atomic_fetchadd_int(&vm_pageout_oom_vote, 1);
+	if (old_vote != vm_ndomains - 1)
+		return;
+
 	/*
-	 * If we are critically low on one of RAM or swap and low on
-	 * the other, kill the largest process.  However, we avoid
-	 * doing this on the first pass in order to give ourselves a
-	 * chance to flush out dirty vnode-backed pages and to allow
-	 * active pages to be moved to the inactive queue and reclaimed.
+	 * The current pagedaemon thread is the last in the quorum to
+	 * start OOM.  Initiate the selection and signaling of the
+	 * victim.
 	 */
-	if (pass != 0 &&
-	    ((swap_pager_avail < 64 && vm_page_count_min()) ||
-	     (swap_pager_full && vm_paging_target() > 0)))
-		vm_pageout_oom(VM_OOM_MEM);
+	vm_pageout_oom(VM_OOM_MEM);
+
+	/*
+	 * After one round of OOM terror, recall our vote.  On the
+	 * next pass, current pagedaemon would vote again if the low
+	 * memory condition is still there, due to vmd_oom being
+	 * false.
+	 */
+	vmd->vmd_oom = FALSE;
+	atomic_subtract_int(&vm_pageout_oom_vote, 1);
 }
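
The vote handling above only unleashes the OOM killer once every page-daemon thread has seen a persistent shortage: each domain adds one vote, and the thread whose vote completes the quorum runs vm_pageout_oom() and then withdraws its own vote. A minimal userland model of the quorum logic, using C11 atomics instead of atomic(9) and invented names:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NDOMAINS	2

static atomic_int oom_vote;

/*
 * Each "domain" casts at most one vote.  Only the caller whose vote
 * completes the quorum (old value == NDOMAINS - 1) runs the OOM action,
 * and it withdraws its vote afterwards, mirroring vm_pageout_mightbe_oom().
 */
static bool
mightbe_oom(bool *voted)
{
	int old;

	if (*voted)
		return (false);
	*voted = true;
	old = atomic_fetch_add(&oom_vote, 1);
	if (old != NDOMAINS - 1)
		return (false);
	/* ...select and kill the largest process here... */
	*voted = false;
	atomic_fetch_sub(&oom_vote, 1);
	return (true);
}

int
main(void)
{
	bool v0 = false, v1 = false;

	printf("dom0 fires OOM: %d\n", (int)mightbe_oom(&v0));
	printf("dom1 fires OOM: %d\n", (int)mightbe_oom(&v1));
	return (0);
}

With two domains the first call returns 0 and the second, quorum-completing call returns 1.
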
 
+/*
+ * The OOM killer is the page daemon's action of last resort when
+ * memory allocation requests have been stalled for a prolonged period
+ * of time because it cannot reclaim memory.  This function computes
+ * the approximate number of physical pages that could be reclaimed if
+ * the specified address space is destroyed.
+ *
+ * Private, anonymous memory owned by the address space is the
+ * principal resource that we expect to recover after an OOM kill.
+ * Since the physical pages mapped by the address space's COW entries
+ * are typically shared pages, they are unlikely to be released and so
+ * they are not counted.
+ *
+ * To get to the point where the page daemon runs the OOM killer, its
+ * efforts to write-back vnode-backed pages may have stalled.  This
+ * could be caused by a memory allocation deadlock in the write path
+ * that might be resolved by an OOM kill.  Therefore, physical pages
+ * belonging to vnode-backed objects are counted, because they might
+ * be freed without being written out first if the address space holds
+ * the last reference to an unlinked vnode.
+ *
+ * Similarly, physical pages belonging to OBJT_PHYS objects are
+ * counted because the address space might hold the last reference to
+ * the object.
+ */
+static long
+vm_pageout_oom_pagecount(struct vmspace *vmspace)
+{
+	vm_map_t map;
+	vm_map_entry_t entry;
+	vm_object_t obj;
+	long res;
 
+	map = &vmspace->vm_map;
+	KASSERT(!map->system_map, ("system map"));
+	sx_assert(&map->lock, SA_LOCKED);
+	res = 0;
+	for (entry = map->header.next; entry != &map->header;
+	    entry = entry->next) {
+		if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
+			continue;
+		obj = entry->object.vm_object;
+		if (obj == NULL)
+			continue;
+		if ((entry->eflags & MAP_ENTRY_NEEDS_COPY) != 0 &&
+		    obj->ref_count != 1)
+			continue;
+		switch (obj->type) {
+		case OBJT_DEFAULT:
+		case OBJT_SWAP:
+		case OBJT_PHYS:
+		case OBJT_VNODE:
+			res += obj->resident_page_count;
+			break;
+		}
+	}
+	return (res);
+}
+
 void
 vm_pageout_oom(int shortage)
 {
@@ -1307,6 +1612,7 @@
 	vm_offset_t size, bigsize;
 	struct thread *td;
 	struct vmspace *vm;
+	bool breakout;
 
 	/*
 	 * We keep the process bigproc locked once we find it to keep anyone
@@ -1320,17 +1626,15 @@
 	bigsize = 0;
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
-		int breakout;
+		PROC_LOCK(p);
 
-		if (PROC_TRYLOCK(p) == 0)
-			continue;
 		/*
 		 * If this is a system, protected or killed process, skip it.
 		 */
-		if (p->p_state != PRS_NORMAL ||
-		    (p->p_flag & (P_INEXEC | P_PROTECTED | P_SYSTEM)) ||
-		    (p->p_pid == 1) || P_KILLED(p) ||
-		    ((p->p_pid < 48) && (swap_pager_avail != 0))) {
+		if (p->p_state != PRS_NORMAL || (p->p_flag & (P_INEXEC |
+		    P_PROTECTED | P_SYSTEM | P_WEXIT)) != 0 ||
+		    p->p_pid == 1 || P_KILLED(p) ||
+		    (p->p_pid < 48 && swap_pager_avail != 0)) {
 			PROC_UNLOCK(p);
 			continue;
 		}
@@ -1338,15 +1642,16 @@
 		 * If the process is in a non-running type state,
 		 * don't touch it.  Check all the threads individually.
 		 */
-		breakout = 0;
+		breakout = false;
 		FOREACH_THREAD_IN_PROC(p, td) {
 			thread_lock(td);
 			if (!TD_ON_RUNQ(td) &&
 			    !TD_IS_RUNNING(td) &&
 			    !TD_IS_SLEEPING(td) &&
-			    !TD_IS_SUSPENDED(td)) {
+			    !TD_IS_SUSPENDED(td) &&
+			    !TD_IS_SWAPPED(td)) {
 				thread_unlock(td);
-				breakout = 1;
+				breakout = true;
 				break;
 			}
 			thread_unlock(td);
@@ -1363,156 +1668,119 @@
 			PROC_UNLOCK(p);
 			continue;
 		}
+		_PHOLD(p);
 		if (!vm_map_trylock_read(&vm->vm_map)) {
+			_PRELE(p);
+			PROC_UNLOCK(p);
 			vmspace_free(vm);
-			PROC_UNLOCK(p);
 			continue;
 		}
+		PROC_UNLOCK(p);
 		size = vmspace_swap_count(vm);
+		if (shortage == VM_OOM_MEM)
+			size += vm_pageout_oom_pagecount(vm);
 		vm_map_unlock_read(&vm->vm_map);
-		if (shortage == VM_OOM_MEM)
-			size += vmspace_resident_count(vm);
 		vmspace_free(vm);
+
 		/*
-		 * if the this process is bigger than the biggest one
+		 * If this process is bigger than the biggest one,
 		 * remember it.
 		 */
 		if (size > bigsize) {
 			if (bigproc != NULL)
-				PROC_UNLOCK(bigproc);
+				PRELE(bigproc);
 			bigproc = p;
 			bigsize = size;
-		} else
-			PROC_UNLOCK(p);
+		} else {
+			PRELE(p);
+		}
 	}
 	sx_sunlock(&allproc_lock);
 	if (bigproc != NULL) {
+		PROC_LOCK(bigproc);
 		killproc(bigproc, "out of swap space");
 		sched_nice(bigproc, PRIO_MIN);
+		_PRELE(bigproc);
 		PROC_UNLOCK(bigproc);
 		wakeup(&cnt.v_free_count);
 	}
 }
 
-/*
- * This routine tries to maintain the pseudo LRU active queue,
- * so that during long periods of time where there is no paging,
- * that some statistic accumulation still occurs.  This code
- * helps the situation where paging just starts to occur.
- */
 static void
-vm_pageout_page_stats()
+vm_pageout_worker(void *arg)
 {
-	vm_object_t object;
-	vm_page_t m,next;
-	int pcount,tpcount;		/* Number of pages to check */
-	static int fullintervalcount = 0;
-	int page_shortage;
+	struct vm_domain *domain;
+	int domidx;
 
-	page_shortage = 
-	    (cnt.v_inactive_target + cnt.v_cache_max + cnt.v_free_min) -
-	    (cnt.v_free_count + cnt.v_inactive_count + cnt.v_cache_count);
+	domidx = (uintptr_t)arg;
+	domain = &vm_dom[domidx];
 
-	if (page_shortage <= 0)
-		return;
+	/*
+	 * XXXKIB It could be useful to bind pageout daemon threads to
+	 * the cores belonging to the domain, from which vm_page_array
+	 * is allocated.
+	 */
 
-	vm_page_lock_queues();
-	pcount = cnt.v_active_count;
-	fullintervalcount += vm_pageout_stats_interval;
-	if (fullintervalcount < vm_pageout_full_stats_interval) {
-		tpcount = (int64_t)vm_pageout_stats_max * cnt.v_active_count /
-		    cnt.v_page_count;
-		if (pcount > tpcount)
-			pcount = tpcount;
-	} else {
-		fullintervalcount = 0;
-	}
+	KASSERT(domain->vmd_segs != 0, ("domain without segments"));
+	domain->vmd_last_active_scan = ticks;
+	vm_pageout_init_marker(&domain->vmd_marker, PQ_INACTIVE);
 
-	m = TAILQ_FIRST(&vm_page_queues[PQ_ACTIVE].pl);
-	while ((m != NULL) && (pcount-- > 0)) {
-		int actcount;
-
-		KASSERT(m->queue == PQ_ACTIVE,
-		    ("vm_pageout_page_stats: page %p isn't active", m));
-
-		next = TAILQ_NEXT(m, pageq);
-		if ((m->flags & PG_MARKER) != 0) {
-			m = next;
-			continue;
-		}
-		vm_page_lock_assert(m, MA_NOTOWNED);
-		if (!vm_pageout_page_lock(m, &next)) {
-			vm_page_unlock(m);
-			m = next;
-			continue;
-		}
-		object = m->object;
-		if (!VM_OBJECT_TRYLOCK(object) &&
-		    !vm_pageout_fallback_object_lock(m, &next)) {
-			VM_OBJECT_UNLOCK(object);
-			vm_page_unlock(m);
-			m = next;
-			continue;
-		}
-
+	/*
+	 * The pageout daemon worker is never done, so loop forever.
+	 */
+	while (TRUE) {
 		/*
-		 * Don't deactivate pages that are busy.
+		 * If we have enough free memory, wakeup waiters.  Do
+		 * not clear vm_pages_needed until we reach our target,
+		 * otherwise we may be woken up over and over again and
+		 * waste a lot of cpu.
 		 */
-		if ((m->busy != 0) ||
-		    (m->oflags & VPO_BUSY) ||
-		    (m->hold_count != 0)) {
-			vm_page_unlock(m);
-			VM_OBJECT_UNLOCK(object);
-			vm_page_requeue(m);
-			m = next;
-			continue;
+		mtx_lock(&vm_page_queue_free_mtx);
+		if (vm_pages_needed && !vm_page_count_min()) {
+			if (!vm_paging_needed())
+				vm_pages_needed = 0;
+			wakeup(&cnt.v_free_count);
 		}
-
-		actcount = 0;
-		if (m->aflags & PGA_REFERENCED) {
-			vm_page_aflag_clear(m, PGA_REFERENCED);
-			actcount += 1;
-		}
-
-		actcount += pmap_ts_referenced(m);
-		if (actcount) {
-			m->act_count += ACT_ADVANCE + actcount;
-			if (m->act_count > ACT_MAX)
-				m->act_count = ACT_MAX;
-			vm_page_requeue(m);
+		if (vm_pages_needed) {
+			/*
+			 * We're still not done.  Either vm_pages_needed was
+			 * set by another thread during the previous scan
+			 * (typically, this happens during a level 0 scan) or
+			 * vm_pages_needed was already set and the scan failed
+			 * to free enough pages.  If we haven't yet performed
+			 * a level >= 2 scan (unlimited dirty cleaning), then
+			 * upgrade the level and scan again now.  Otherwise,
+			 * sleep a bit and try again later.  While sleeping,
+			 * vm_pages_needed can be cleared.
+			 */
+			if (domain->vmd_pass > 1)
+				msleep(&vm_pages_needed,
+				    &vm_page_queue_free_mtx, PVM, "psleep",
+				    hz / 2);
 		} else {
-			if (m->act_count == 0) {
-				/*
-				 * We turn off page access, so that we have
-				 * more accurate RSS stats.  We don't do this
-				 * in the normal page deactivation when the
-				 * system is loaded VM wise, because the
-				 * cost of the large number of page protect
-				 * operations would be higher than the value
-				 * of doing the operation.
-				 */
-				pmap_remove_all(m);
-				vm_page_deactivate(m);
-			} else {
-				m->act_count -= min(m->act_count, ACT_DECLINE);
-				vm_page_requeue(m);
-			}
+			/*
+			 * Good enough, sleep until required to refresh
+			 * stats.
+			 */
+			msleep(&vm_pages_needed, &vm_page_queue_free_mtx,
+			    PVM, "psleep", hz);
 		}
-		vm_page_unlock(m);
-		VM_OBJECT_UNLOCK(object);
-		m = next;
+		if (vm_pages_needed) {
+			cnt.v_pdwakeups++;
+			domain->vmd_pass++;
+		} else
+			domain->vmd_pass = 0;
+		mtx_unlock(&vm_page_queue_free_mtx);
+		vm_pageout_scan(domain, domain->vmd_pass);
 	}
-	vm_page_unlock_queues();
 }
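
The sleep/rescan decisions in the loop above are easy to misread: the worker only sleeps between scans once it has already run a pass > 1 (unlimited-laundering) scan or once the shortage has cleared, and vmd_pass climbs by one per wakeup while vm_pages_needed stays set. A toy model of just that bookkeeping (invented names; the free-queue lock and the re-check of vm_pages_needed after sleeping are omitted):

#include <stdbool.h>
#include <stdio.h>

int
main(void)
{
	/* One simulated wakeup per entry: was vm_pages_needed set? */
	bool pages_needed[] = { false, true, true, true, false, false };
	int i, pass = 0;

	for (i = 0; i < 6; i++) {
		/* Sleep first unless a shortage persists at pass <= 1. */
		bool sleeps = !pages_needed[i] || pass > 1;

		if (pages_needed[i])
			pass++;		/* escalate: 1, 2, then >= 2 */
		else
			pass = 0;	/* shortage cleared, reset */
		printf("wakeup %d: sleeps=%d scan pass=%d\n",
		    i, (int)sleeps, pass);
	}
	return (0);
}
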
 
 /*
- *	vm_pageout is the high level pageout daemon.
+ *	vm_pageout_init initialises basic pageout daemon settings.
  */
 static void
-vm_pageout()
+vm_pageout_init(void)
 {
-	int error, pass;
-
 	/*
 	 * Initialize some paging parameters.
 	 */
@@ -1534,105 +1802,59 @@
 	cnt.v_free_reserved = vm_pageout_page_count +
 	    cnt.v_pageout_free_min + (cnt.v_page_count / 768);
 	cnt.v_free_severe = cnt.v_free_min / 2;
+	cnt.v_free_target = 4 * cnt.v_free_min + cnt.v_free_reserved;
 	cnt.v_free_min += cnt.v_free_reserved;
 	cnt.v_free_severe += cnt.v_free_reserved;
+	cnt.v_inactive_target = (3 * cnt.v_free_target) / 2;
+	if (cnt.v_inactive_target > cnt.v_free_count / 3)
+		cnt.v_inactive_target = cnt.v_free_count / 3;
 
 	/*
-	 * v_free_target and v_cache_min control pageout hysteresis.  Note
-	 * that these are more a measure of the VM cache queue hysteresis
-	 * then the VM free queue.  Specifically, v_free_target is the
-	 * high water mark (free+cache pages).
-	 *
-	 * v_free_reserved + v_cache_min (mostly means v_cache_min) is the
-	 * low water mark, while v_free_min is the stop.  v_cache_min must
-	 * be big enough to handle memory needs while the pageout daemon
-	 * is signalled and run to free more pages.
+	 * Set the default wakeup threshold to be 10% above the minimum
+	 * page limit.  This keeps the steady state out of shortfall.
 	 */
-	if (cnt.v_free_count > 6144)
-		cnt.v_free_target = 4 * cnt.v_free_min + cnt.v_free_reserved;
-	else
-		cnt.v_free_target = 2 * cnt.v_free_min + cnt.v_free_reserved;
+	vm_pageout_wakeup_thresh = (cnt.v_free_min / 10) * 11;
 
-	if (cnt.v_free_count > 2048) {
-		cnt.v_cache_min = cnt.v_free_target;
-		cnt.v_cache_max = 2 * cnt.v_cache_min;
-		cnt.v_inactive_target = (3 * cnt.v_free_target) / 2;
-	} else {
-		cnt.v_cache_min = 0;
-		cnt.v_cache_max = 0;
-		cnt.v_inactive_target = cnt.v_free_count / 4;
-	}
-	if (cnt.v_inactive_target > cnt.v_free_count / 3)
-		cnt.v_inactive_target = cnt.v_free_count / 3;
+	/*
+	 * Set interval in seconds for active scan.  We want to visit each
+	 * page at least once every ten minutes.  This is to prevent worst
+	 * case paging behaviors with stale active LRU.
+	 */
+	if (vm_pageout_update_period == 0)
+		vm_pageout_update_period = 600;
 
 	/* XXX does not really belong here */
 	if (vm_page_max_wired == 0)
 		vm_page_max_wired = cnt.v_free_count / 3;
+}
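
To make the new defaults concrete, the fragment below plugs assumed values into the formulas from vm_pageout_init(); only the formulas come from the code, the input numbers are made up.

#include <stdio.h>

int
main(void)
{
	/* Assumed inputs for a mid-sized machine; not measured values. */
	int v_free_min = 10700;		/* before the reserve is added */
	int v_free_reserved = 1300;
	int v_free_count = 500000;
	int v_free_target, v_inactive_target, wakeup_thresh;

	v_free_target = 4 * v_free_min + v_free_reserved;
	v_free_min += v_free_reserved;
	v_inactive_target = (3 * v_free_target) / 2;
	if (v_inactive_target > v_free_count / 3)
		v_inactive_target = v_free_count / 3;
	wakeup_thresh = (v_free_min / 10) * 11;	/* 10% above the minimum */

	printf("free_target=%d inactive_target=%d wakeup_thresh=%d\n",
	    v_free_target, v_inactive_target, wakeup_thresh);
	return (0);
}

For these assumed inputs it prints free_target=44100, inactive_target=66150 and wakeup_thresh=13200.
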
 
-	if (vm_pageout_stats_max == 0)
-		vm_pageout_stats_max = cnt.v_free_target;
+/*
+ *     vm_pageout is the high level pageout daemon.
+ */
+static void
+vm_pageout(void)
+{
+	int error;
+#if MAXMEMDOM > 1
+	int i;
+#endif
 
-	/*
-	 * Set interval in seconds for stats scan.
-	 */
-	if (vm_pageout_stats_interval == 0)
-		vm_pageout_stats_interval = 5;
-	if (vm_pageout_full_stats_interval == 0)
-		vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4;
-
 	swap_pager_swap_init();
-	pass = 0;
-	/*
-	 * The pageout daemon is never done, so loop forever.
-	 */
-	while (TRUE) {
-		/*
-		 * If we have enough free memory, wakeup waiters.  Do
-		 * not clear vm_pages_needed until we reach our target,
-		 * otherwise we may be woken up over and over again and
-		 * waste a lot of cpu.
-		 */
-		mtx_lock(&vm_page_queue_free_mtx);
-		if (vm_pages_needed && !vm_page_count_min()) {
-			if (!vm_paging_needed())
-				vm_pages_needed = 0;
-			wakeup(&cnt.v_free_count);
+#if MAXMEMDOM > 1
+	for (i = 1; i < vm_ndomains; i++) {
+		error = kthread_add(vm_pageout_worker, (void *)(uintptr_t)i,
+		    curproc, NULL, 0, 0, "dom%d", i);
+		if (error != 0) {
+			panic("starting pageout for domain %d, error %d\n",
+			    i, error);
 		}
-		if (vm_pages_needed) {
-			/*
-			 * Still not done, take a second pass without waiting
-			 * (unlimited dirty cleaning), otherwise sleep a bit
-			 * and try again.
-			 */
-			++pass;
-			if (pass > 1)
-				msleep(&vm_pages_needed,
-				    &vm_page_queue_free_mtx, PVM, "psleep",
-				    hz / 2);
-		} else {
-			/*
-			 * Good enough, sleep & handle stats.  Prime the pass
-			 * for the next run.
-			 */
-			if (pass > 1)
-				pass = 1;
-			else
-				pass = 0;
-			error = msleep(&vm_pages_needed,
-			    &vm_page_queue_free_mtx, PVM, "psleep",
-			    vm_pageout_stats_interval * hz);
-			if (error && !vm_pages_needed) {
-				mtx_unlock(&vm_page_queue_free_mtx);
-				pass = 0;
-				vm_pageout_page_stats();
-				continue;
-			}
-		}
-		if (vm_pages_needed)
-			cnt.v_pdwakeups++;
-		mtx_unlock(&vm_page_queue_free_mtx);
-		vm_pageout_scan(pass);
 	}
+#endif
+	error = kthread_add(uma_reclaim_worker, NULL, curproc, NULL,
+	    0, 0, "uma");
+	if (error != 0)
+		panic("starting uma_reclaim helper, error %d\n", error);
+	vm_pageout_worker((void *)(uintptr_t)0);
 }
 
 /*
@@ -1642,7 +1864,7 @@
  * the free page queue lock is held until the msleep() is performed.
  */
 void
-pagedaemon_wakeup()
+pagedaemon_wakeup(void)
 {
 
 	if (!vm_pages_needed && curthread->td_proc != pageproc) {
@@ -1667,7 +1889,7 @@
 }
 
 static void
-vm_daemon()
+vm_daemon(void)
 {
 	struct rlimit rsslim;
 	struct proc *p;
@@ -1680,11 +1902,13 @@
 
 	while (TRUE) {
 		mtx_lock(&vm_daemon_mtx);
+		msleep(&vm_daemon_needed, &vm_daemon_mtx, PPAUSE, "psleep",
 #ifdef RACCT
-		msleep(&vm_daemon_needed, &vm_daemon_mtx, PPAUSE, "psleep", hz);
+		    racct_enable ? hz : 0
 #else
-		msleep(&vm_daemon_needed, &vm_daemon_mtx, PPAUSE, "psleep", 0);
+		    0
 #endif
+		);
 		swapout_flags = vm_pageout_req_swapout;
 		vm_pageout_req_swapout = 0;
 		mtx_unlock(&vm_daemon_mtx);
@@ -1754,38 +1978,48 @@
 				continue;
 
 			size = vmspace_resident_count(vm);
-			if (limit >= 0 && size >= limit) {
+			if (size >= limit) {
 				vm_pageout_map_deactivate_pages(
 				    &vm->vm_map, limit);
+				size = vmspace_resident_count(vm);
 			}
 #ifdef RACCT
-			rsize = IDX_TO_OFF(size);
-			PROC_LOCK(p);
-			racct_set(p, RACCT_RSS, rsize);
-			ravailable = racct_get_available(p, RACCT_RSS);
-			PROC_UNLOCK(p);
-			if (rsize > ravailable) {
-				/*
-				 * Don't be overly aggressive; this might be
-				 * an innocent process, and the limit could've
-				 * been exceeded by some memory hog.  Don't
-				 * try to deactivate more than 1/4th of process'
-				 * resident set size.
-				 */
-				if (attempts <= 8) {
-					if (ravailable < rsize - (rsize / 4))
-						ravailable = rsize - (rsize / 4);
-				}
-				vm_pageout_map_deactivate_pages(
-				    &vm->vm_map, OFF_TO_IDX(ravailable));
-				/* Update RSS usage after paging out. */
-				size = vmspace_resident_count(vm);
+			if (racct_enable) {
 				rsize = IDX_TO_OFF(size);
 				PROC_LOCK(p);
-				racct_set(p, RACCT_RSS, rsize);
+				if (p->p_state == PRS_NORMAL)
+					racct_set(p, RACCT_RSS, rsize);
+				ravailable = racct_get_available(p, RACCT_RSS);
 				PROC_UNLOCK(p);
-				if (rsize > ravailable)
-					tryagain = 1;
+				if (rsize > ravailable) {
+					/*
+					 * Don't be overly aggressive; this
+					 * might be an innocent process,
+					 * and the limit could've been exceeded
+					 * by some memory hog.  Don't try
+					 * to deactivate more than 1/4th
+					 * of process' resident set size.
+					 */
+					if (attempts <= 8) {
+						if (ravailable < rsize -
+						    (rsize / 4)) {
+							ravailable = rsize -
+							    (rsize / 4);
+						}
+					}
+					vm_pageout_map_deactivate_pages(
+					    &vm->vm_map,
+					    OFF_TO_IDX(ravailable));
+					/* Update RSS usage after paging out. */
+					size = vmspace_resident_count(vm);
+					rsize = IDX_TO_OFF(size);
+					PROC_LOCK(p);
+					if (p->p_state == PRS_NORMAL)
+						racct_set(p, RACCT_RSS, rsize);
+					PROC_UNLOCK(p);
+					if (rsize > ravailable)
+						tryagain = 1;
+				}
 			}
 #endif
 			vmspace_free(vm);
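
The RACCT branch above deliberately under-shoots: for the first eight attempts it never asks vm_pageout_map_deactivate_pages() to go below three quarters of the current resident set. A small numeric illustration with made-up byte counts:

#include <stdio.h>

int
main(void)
{
	/* Assumed: 400 MB resident, 250 MB of RACCT_RSS still allowed. */
	long long rsize = 400LL << 20;
	long long ravailable = 250LL << 20;
	int attempts = 1;

	if (rsize > ravailable) {
		/* Early attempts never target less than 3/4 of the RSS. */
		if (attempts <= 8 && ravailable < rsize - (rsize / 4))
			ravailable = rsize - (rsize / 4);
		printf("deactivate down to %lld MB\n", ravailable >> 20);
	}
	return (0);
}

So despite the 250 MB limit, the first attempts only target 300 MB (three quarters of the 400 MB RSS) and rely on the tryagain loop to converge.
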

Modified: trunk/sys/vm/vm_pageout.h
===================================================================
--- trunk/sys/vm/vm_pageout.h	2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm_pageout.h	2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
@@ -57,7 +58,7 @@
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  *
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/vm/vm_pageout.h 314664 2017-03-04 12:05:50Z avg $
  */
 
 #ifndef _VM_VM_PAGEOUT_H_
@@ -87,6 +88,12 @@
 #define	VM_OOM_SWAPZ	2
 
 /*
+ * vm_lowmem flags.
+ */
+#define	VM_LOW_KMEM	0x01
+#define	VM_LOW_PAGES	0x02
+
+/*
  *	Exported routines.
  */
 
@@ -101,10 +108,8 @@
 extern void vm_waitpfault(void);
 
 #ifdef _KERNEL
-boolean_t vm_pageout_fallback_object_lock(vm_page_t, vm_page_t *);
 int vm_pageout_flush(vm_page_t *, int, int, int, int *, boolean_t *);
+void vm_pageout_grow_cache(int, vm_paddr_t, vm_paddr_t);
 void vm_pageout_oom(int shortage);
-boolean_t vm_pageout_page_lock(vm_page_t, vm_page_t *);
-void vm_contig_grow_cache(int, vm_paddr_t, vm_paddr_t);
 #endif
 #endif	/* _VM_VM_PAGEOUT_H_ */
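
The new VM_LOW_KMEM and VM_LOW_PAGES flags are passed to vm_lowmem eventhandlers, letting a consumer tell KVA pressure apart from page pressure. A hedged sketch of such a consumer follows; the mycache_* names are invented and the fragment is not compilable on its own, but EVENTHANDLER_REGISTER and the (void *, int) handler signature are the standard kernel pattern.

static eventhandler_tag mycache_lowmem_tag;

static void
mycache_lowmem(void *arg __unused, int flags)
{
	/* Shrink harder when the page daemon itself is short of pages. */
	if ((flags & VM_LOW_PAGES) != 0)
		mycache_drain_all();		/* invented helper */
	else
		mycache_trim();			/* invented helper */
}

static void
mycache_vm_init(void)
{
	mycache_lowmem_tag = EVENTHANDLER_REGISTER(vm_lowmem, mycache_lowmem,
	    NULL, EVENTHANDLER_PRI_ANY);
}
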

Modified: trunk/sys/vm/vm_pager.c
===================================================================
--- trunk/sys/vm/vm_pager.c	2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm_pager.c	2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
@@ -64,7 +65,7 @@
  */
 
 #include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/vm/vm_pager.c 311645 2017-01-07 12:04:30Z kib $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -74,9 +75,11 @@
 #include <sys/buf.h>
 #include <sys/ucred.h>
 #include <sys/malloc.h>
+#include <sys/rwlock.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
+#include <vm/vm_kern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
@@ -105,43 +108,35 @@
 dead_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
     vm_ooffset_t off, struct ucred *cred)
 {
-	return NULL;
+
+	return (NULL);
 }
 
 static void
-dead_pager_putpages(object, m, count, flags, rtvals)
-	vm_object_t object;
-	vm_page_t *m;
-	int count;
-	int flags;
-	int *rtvals;
+dead_pager_putpages(vm_object_t object, vm_page_t *m, int count,
+    int flags, int *rtvals)
 {
 	int i;
 
-	for (i = 0; i < count; i++) {
+	for (i = 0; i < count; i++)
 		rtvals[i] = VM_PAGER_AGAIN;
-	}
 }
 
 static int
-dead_pager_haspage(object, pindex, prev, next)
-	vm_object_t object;
-	vm_pindex_t pindex;
-	int *prev;
-	int *next;
+dead_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *prev, int *next)
 {
-	if (prev)
+
+	if (prev != NULL)
 		*prev = 0;
-	if (next)
+	if (next != NULL)
 		*next = 0;
-	return FALSE;
+	return (FALSE);
 }
 
 static void
-dead_pager_dealloc(object)
-	vm_object_t object;
+dead_pager_dealloc(vm_object_t object)
 {
-	return;
+
 }
 
 static struct pagerops deadpagerops = {
@@ -173,14 +168,13 @@
  * cleaning requests (NPENDINGIO == 64) * the maximum swap cluster size
  * (MAXPHYS == 64k) if you want to get the most efficiency.
  */
-vm_map_t pager_map;
+struct mtx_padalign pbuf_mtx;
+static TAILQ_HEAD(swqueue, buf) bswlist;
 static int bswneeded;
-static vm_offset_t swapbkva;		/* swap buffers kva */
-struct mtx pbuf_mtx;
-static TAILQ_HEAD(swqueue, buf) bswlist;
+vm_offset_t swapbkva;		/* swap buffers kva */
 
 void
-vm_pager_init()
+vm_pager_init(void)
 {
 	struct pagerops **pgops;
 
@@ -190,11 +184,11 @@
 	 */
 	for (pgops = pagertab; pgops < &pagertab[npagers]; pgops++)
 		if ((*pgops)->pgo_init != NULL)
-			(*(*pgops)->pgo_init) ();
+			(*(*pgops)->pgo_init)();
 }
 
 void
-vm_pager_bufferinit()
+vm_pager_bufferinit(void)
 {
 	struct buf *bp;
 	int i;
@@ -214,10 +208,6 @@
 
 	cluster_pbuf_freecnt = nswbuf / 2;
 	vnode_pbuf_freecnt = nswbuf / 2 + 1;
-
-	swapbkva = kmem_alloc_nofault(pager_map, nswbuf * MAXPHYS);
-	if (!swapbkva)
-		panic("Not enough pager_map VM space for physical buffers");
 }
 
 /*
@@ -234,7 +224,7 @@
 
 	ops = pagertab[type];
 	if (ops)
-		ret = (*ops->pgo_alloc) (handle, size, prot, off, cred);
+		ret = (*ops->pgo_alloc)(handle, size, prot, off, cred);
 	else
 		ret = NULL;
 	return (ret);
@@ -244,11 +234,10 @@
  *	The object must be locked.
  */
 void
-vm_pager_deallocate(object)
-	vm_object_t object;
+vm_pager_deallocate(vm_object_t object)
 {
 
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+	VM_OBJECT_ASSERT_WLOCKED(object);
 	(*pagertab[object->type]->pgo_dealloc) (object);
 }
 
@@ -272,13 +261,13 @@
 
 	TAILQ_FOREACH(object, pg_list, pager_object_list) {
 		if (object->handle == handle) {
-			VM_OBJECT_LOCK(object);
+			VM_OBJECT_WLOCK(object);
 			if ((object->flags & OBJ_DEAD) == 0) {
 				vm_object_reference_locked(object);
-				VM_OBJECT_UNLOCK(object);
+				VM_OBJECT_WUNLOCK(object);
 				break;
 			}
-			VM_OBJECT_UNLOCK(object);
+			VM_OBJECT_WUNLOCK(object);
 		}
 	}
 	return (object);
@@ -294,12 +283,13 @@
 static void
 initpbuf(struct buf *bp)
 {
+
 	KASSERT(bp->b_bufobj == NULL, ("initpbuf with bufobj"));
 	KASSERT(bp->b_vp == NULL, ("initpbuf with vp"));
 	bp->b_rcred = NOCRED;
 	bp->b_wcred = NOCRED;
 	bp->b_qindex = 0;	/* On no queue (QUEUE_NONE) */
-	bp->b_saveaddr = (caddr_t) (MAXPHYS * (bp - swbuf)) + swapbkva;
+	bp->b_saveaddr = (caddr_t)(MAXPHYS * (bp - swbuf)) + swapbkva;
 	bp->b_data = bp->b_saveaddr;
 	bp->b_kvabase = bp->b_saveaddr;
 	bp->b_kvasize = MAXPHYS;
@@ -332,9 +322,8 @@
 	struct buf *bp;
 
 	mtx_lock(&pbuf_mtx);
-
 	for (;;) {
-		if (pfreecnt) {
+		if (pfreecnt != NULL) {
 			while (*pfreecnt == 0) {
 				msleep(pfreecnt, &pbuf_mtx, PVM, "wswbuf0", 0);
 			}
@@ -352,9 +341,8 @@
 	if (pfreecnt)
 		--*pfreecnt;
 	mtx_unlock(&pbuf_mtx);
-
 	initpbuf(bp);
-	return bp;
+	return (bp);
 }
 
 /*
@@ -374,14 +362,10 @@
 		return NULL;
 	}
 	TAILQ_REMOVE(&bswlist, bp, b_freelist);
-
 	--*pfreecnt;
-
 	mtx_unlock(&pbuf_mtx);
-
 	initpbuf(bp);
-
-	return bp;
+	return (bp);
 }
 
 /*
@@ -468,17 +452,9 @@
 
 	KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));
 	KASSERT(bp->b_bufobj != NULL, ("pbrelvp: NULL bufobj"));
+	KASSERT((bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) == 0,
+	    ("pbrelvp: pager buf on vnode list."));
 
-	/* XXX REMOVE ME */
-	BO_LOCK(bp->b_bufobj);
-	if (TAILQ_NEXT(bp, b_bobufs) != NULL) {
-		panic(
-		    "relpbuf(): b_vp was probably reassignbuf()d %p %x",
-		    bp,
-		    (int)bp->b_flags
-		);
-	}
-	BO_UNLOCK(bp->b_bufobj);
 	bp->b_vp = NULL;
 	bp->b_bufobj = NULL;
 	bp->b_flags &= ~B_PAGING;
@@ -493,17 +469,9 @@
 
 	KASSERT(bp->b_vp == NULL, ("pbrelbo: vnode"));
 	KASSERT(bp->b_bufobj != NULL, ("pbrelbo: NULL bufobj"));
+	KASSERT((bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) == 0,
+	    ("pbrelbo: pager buf on vnode list."));
 
-	/* XXX REMOVE ME */
-	BO_LOCK(bp->b_bufobj);
-	if (TAILQ_NEXT(bp, b_bobufs) != NULL) {
-		panic(
-		    "relpbuf(): b_vp was probably reassignbuf()d %p %x",
-		    bp,
-		    (int)bp->b_flags
-		);
-	}
-	BO_UNLOCK(bp->b_bufobj);
 	bp->b_bufobj = NULL;
 	bp->b_flags &= ~B_PAGING;
 }
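
The vm_pager.c changes above convert the dead-pager stubs from K&R to ANSI prototypes, pad pbuf_mtx onto its own cache line, stop carving swapbkva out of pager_map (the swap buffer KVA is now set up outside this file), and replace the old pbrelvp()/pbrelbo() panic scan with straight assertions. A hedged sketch of the pbuf borrow/return pattern these routines serve (the counter and the surrounding function are placeholders; only getpbuf() and relpbuf() are real):

	#include <sys/param.h>
	#include <sys/systm.h>
	#include <sys/bio.h>
	#include <sys/buf.h>
	#include <vm/vm.h>
	#include <vm/vm_pager.h>

	/* Sized by a pager at init time, cf. cluster/vnode_pbuf_freecnt above. */
	static int mypager_pbuf_freecnt;

	static void
	mypager_do_io(void)
	{
		struct buf *bp;

		bp = getpbuf(&mypager_pbuf_freecnt);	/* may sleep for a free pbuf */
		/*
		 * ... set b_iocmd and b_bcount, map the pages into the
		 * MAXPHYS bytes of KVA at bp->b_data, issue the I/O ...
		 */
		relpbuf(bp, &mypager_pbuf_freecnt);	/* wakes any waiter */
	}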

Modified: trunk/sys/vm/vm_pager.h
===================================================================
--- trunk/sys/vm/vm_pager.h	2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm_pager.h	2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1990 University of Utah.
  * Copyright (c) 1991, 1993
@@ -32,7 +33,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)vm_pager.h	8.4 (Berkeley) 1/12/94
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/vm/vm_pager.h 308365 2016-11-06 13:37:33Z kib $
  */
 
 /*
@@ -95,9 +96,8 @@
 
 #ifdef _KERNEL
 
-extern vm_map_t pager_map;
 extern struct pagerops *pagertab[];
-extern struct mtx pbuf_mtx;
+extern struct mtx_padalign pbuf_mtx;
 
 vm_object_t vm_pager_allocate(objtype_t, void *, vm_ooffset_t, vm_prot_t,
     vm_ooffset_t, struct ucred *);
@@ -104,7 +104,6 @@
 void vm_pager_bufferinit(void);
 void vm_pager_deallocate(vm_object_t);
 static __inline int vm_pager_get_pages(vm_object_t, vm_page_t *, int, int);
-static __inline boolean_t vm_pager_has_page(vm_object_t, vm_pindex_t, int *, int *);
 void vm_pager_init(void);
 vm_object_t vm_pager_object_lookup(struct pagerlst *, void *);
 
@@ -124,7 +123,7 @@
 ) {
 	int r;
 
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+	VM_OBJECT_ASSERT_WLOCKED(object);
 	r = (*pagertab[object->type]->pgo_getpages)(object, m, count, reqpage);
 	if (r == VM_PAGER_OK && m[reqpage]->valid != VM_PAGE_BITS_ALL) {
 		vm_page_zero_invalid(m[reqpage], TRUE);
@@ -141,7 +140,7 @@
 	int *rtvals
 ) {
 
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+	VM_OBJECT_ASSERT_WLOCKED(object);
 	(*pagertab[object->type]->pgo_putpages)
 	    (object, m, count, flags, rtvals);
 }
@@ -165,7 +164,7 @@
 ) {
 	boolean_t ret;
 
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+	VM_OBJECT_ASSERT_WLOCKED(object);
 	ret = (*pagertab[object->type]->pgo_haspage)
 	    (object, offset, before, after);
 	return (ret);
@@ -188,7 +187,7 @@
 vm_pager_page_unswapped(vm_page_t m)
 {
 
-	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
+	VM_OBJECT_ASSERT_LOCKED(m->object);
 	if (pagertab[m->object->type]->pgo_pageunswapped)
 		(*pagertab[m->object->type]->pgo_pageunswapped)(m);
 }
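
The vm_pager.h diff above switches the pager entry-point assertions to the rwlock-based VM_OBJECT_ASSERT_WLOCKED()/VM_OBJECT_ASSERT_LOCKED() macros, drops the pager_map extern, and turns pbuf_mtx into a struct mtx_padalign. A hedged caller-side sketch of what the tightened assertions expect (the helper is illustrative; "m" is assumed to be a busied page already inserted in "object"):

	#include <sys/param.h>
	#include <sys/systm.h>
	#include <sys/rwlock.h>
	#include <vm/vm.h>
	#include <vm/vm_object.h>
	#include <vm/vm_page.h>
	#include <vm/vm_pager.h>

	static int
	read_one_page(vm_object_t object, vm_page_t m)
	{
		int rv;

		/* Pager entry points now require the object write lock. */
		VM_OBJECT_WLOCK(object);
		rv = vm_pager_get_pages(object, &m, 1, 0);
		VM_OBJECT_WUNLOCK(object);
		return (rv);		/* VM_PAGER_OK on success */
	}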

Modified: trunk/sys/vm/vm_param.h
===================================================================
--- trunk/sys/vm/vm_param.h	2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm_param.h	2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
@@ -57,7 +58,7 @@
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  *
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/vm/vm_param.h 254168 2013-08-09 23:47:43Z zont $
  */
 
 /*
@@ -82,25 +83,10 @@
 #define	VM_V_CACHE_MIN		7	/* cnt.v_cache_min */
 #define	VM_V_CACHE_MAX		8	/* cnt.v_cache_max */
 #define VM_V_PAGEOUT_FREE_MIN	9	/* cnt.v_pageout_free_min */
-#define	VM_PAGEOUT_ALGORITHM	10	/* pageout algorithm */
+#define	VM_OBSOLETE_10		10	/* pageout algorithm */
 #define VM_SWAPPING_ENABLED	11	/* swapping enabled */
 #define	VM_MAXID		12	/* number of valid vm ids */
 
-#define CTL_VM_NAMES { \
-	{ 0, 0 }, \
-	{ "vmtotal", CTLTYPE_STRUCT }, \
-	{ "loadavg", CTLTYPE_STRUCT }, \
-	{ "v_free_min", CTLTYPE_UINT }, \
-	{ "v_free_target", CTLTYPE_UINT }, \
-	{ "v_free_reserved", CTLTYPE_UINT }, \
-	{ "v_inactive_target", CTLTYPE_UINT }, \
-	{ "v_cache_min", CTLTYPE_UINT }, \
-	{ "v_cache_max", CTLTYPE_UINT }, \
-	{ "v_pageout_free_min", CTLTYPE_UINT}, \
-	{ "pageout_algorithm", CTLTYPE_INT}, \
-	{ "swap_enabled", CTLTYPE_INT},\
-}
-
 /*
  * Structure for swap device statistics
  */
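
The vm_param.h diff above retires the static CTL_VM_NAMES table along with the pageout_algorithm knob (MIB index 10 is now VM_OBSOLETE_10); the remaining numeric second-level indices keep working from userland. A hedged userland sketch reading vm.swap_enabled through its numeric MIB, assuming it is still registered at VM_SWAPPING_ENABLED:

	#include <sys/param.h>
	#include <sys/sysctl.h>
	#include <vm/vm_param.h>	/* CTL_VM second-level indices */
	#include <stdio.h>

	int
	main(void)
	{
		int mib[2], swap_enabled;
		size_t len;

		mib[0] = CTL_VM;
		mib[1] = VM_SWAPPING_ENABLED;
		len = sizeof(swap_enabled);
		if (sysctl(mib, 2, &swap_enabled, &len, NULL, 0) == -1) {
			perror("sysctl");
			return (1);
		}
		printf("vm.swap_enabled: %d\n", swap_enabled);
		return (0);
	}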

Modified: trunk/sys/vm/vm_phys.c
===================================================================
--- trunk/sys/vm/vm_phys.c	2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm_phys.c	2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 2002-2006 Rice University
  * Copyright (c) 2007 Alan L. Cox <alc at cs.rice.edu>
@@ -29,8 +30,15 @@
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
+/*
+ *	Physical memory system implementation
+ *
+ * Any external functions defined by this module are only to be used by the
+ * virtual memory system.
+ */
+
 #include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/vm/vm_phys.c 308349 2016-11-05 20:14:23Z markj $");
 
 #include "opt_ddb.h"
 #include "opt_vm.h"
@@ -41,11 +49,13 @@
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
+#if MAXMEMDOM > 1
+#include <sys/proc.h>
+#endif
 #include <sys/queue.h>
 #include <sys/sbuf.h>
 #include <sys/sysctl.h>
 #include <sys/vmmeter.h>
-#include <sys/vnode.h>
 
 #include <ddb/ddb.h>
 
@@ -55,33 +65,16 @@
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_phys.h>
-#include <vm/vm_reserv.h>
 
-/*
- * VM_FREELIST_DEFAULT is split into VM_NDOMAIN lists, one for each
- * domain.  These extra lists are stored at the end of the regular
- * free lists starting with VM_NFREELIST.
- */
-#define VM_RAW_NFREELIST	(VM_NFREELIST + VM_NDOMAIN - 1)
+_Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX,
+    "Too many physsegs.");
 
-struct vm_freelist {
-	struct pglist pl;
-	int lcnt;
-};
-
-struct vm_phys_seg {
-	vm_paddr_t	start;
-	vm_paddr_t	end;
-	vm_page_t	first_page;
-	int		domain;
-	struct vm_freelist (*free_queues)[VM_NFREEPOOL][VM_NFREEORDER];
-};
-
 struct mem_affinity *mem_affinity;
 
-static struct vm_phys_seg vm_phys_segs[VM_PHYSSEG_MAX];
+int vm_ndomains = 1;
 
-static int vm_phys_nsegs;
+struct vm_phys_seg vm_phys_segs[VM_PHYSSEG_MAX];
+int vm_phys_nsegs;
 
 #define VM_PHYS_FICTITIOUS_NSEGS	8
 static struct vm_phys_fictitious_seg {
@@ -90,15 +83,38 @@
 	vm_page_t	first_page;
 } vm_phys_fictitious_segs[VM_PHYS_FICTITIOUS_NSEGS];
 static struct mtx vm_phys_fictitious_reg_mtx;
-MALLOC_DEFINE(M_FICT_PAGES, "", "");
+MALLOC_DEFINE(M_FICT_PAGES, "vm_fictitious", "Fictitious VM pages");
 
 static struct vm_freelist
-    vm_phys_free_queues[VM_RAW_NFREELIST][VM_NFREEPOOL][VM_NFREEORDER];
-static struct vm_freelist
-(*vm_phys_lookup_lists[VM_NDOMAIN][VM_RAW_NFREELIST])[VM_NFREEPOOL][VM_NFREEORDER];
+    vm_phys_free_queues[MAXMEMDOM][VM_NFREELIST][VM_NFREEPOOL][VM_NFREEORDER];
 
-static int vm_nfreelists = VM_FREELIST_DEFAULT + 1;
+static int vm_nfreelists;
 
+/*
+ * Provides the mapping from VM_FREELIST_* to free list indices (flind).
+ */
+static int vm_freelist_to_flind[VM_NFREELIST];
+
+CTASSERT(VM_FREELIST_DEFAULT == 0);
+
+#ifdef VM_FREELIST_ISADMA
+#define	VM_ISADMA_BOUNDARY	16777216
+#endif
+#ifdef VM_FREELIST_DMA32
+#define	VM_DMA32_BOUNDARY	((vm_paddr_t)1 << 32)
+#endif
+
+/*
+ * Enforce the assumptions made by vm_phys_add_seg() and vm_phys_init() about
+ * the ordering of the free list boundaries.
+ */
+#if defined(VM_ISADMA_BOUNDARY) && defined(VM_LOWMEM_BOUNDARY)
+CTASSERT(VM_ISADMA_BOUNDARY < VM_LOWMEM_BOUNDARY);
+#endif
+#if defined(VM_LOWMEM_BOUNDARY) && defined(VM_DMA32_BOUNDARY)
+CTASSERT(VM_LOWMEM_BOUNDARY < VM_DMA32_BOUNDARY);
+#endif
+
 static int cnt_prezero;
 SYSCTL_INT(_vm_stats_misc, OID_AUTO, cnt_prezero, CTLFLAG_RD,
     &cnt_prezero, 0, "The number of physical pages prezeroed at idle time");
@@ -111,21 +127,49 @@
 SYSCTL_OID(_vm, OID_AUTO, phys_segs, CTLTYPE_STRING | CTLFLAG_RD,
     NULL, 0, sysctl_vm_phys_segs, "A", "Phys Seg Info");
 
-#if VM_NDOMAIN > 1
-static int sysctl_vm_phys_lookup_lists(SYSCTL_HANDLER_ARGS);
-SYSCTL_OID(_vm, OID_AUTO, phys_lookup_lists, CTLTYPE_STRING | CTLFLAG_RD,
-    NULL, 0, sysctl_vm_phys_lookup_lists, "A", "Phys Lookup Lists");
-#endif
+SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD,
+    &vm_ndomains, 0, "Number of physical memory domains available.");
 
 static vm_page_t vm_phys_alloc_domain_pages(int domain, int flind, int pool,
     int order);
-static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind,
-    int domain);
-static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind);
+static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain);
+static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end);
 static int vm_phys_paddr_to_segind(vm_paddr_t pa);
 static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
     int order);
 
+static __inline int
+vm_rr_selectdomain(void)
+{
+#if MAXMEMDOM > 1
+	struct thread *td;
+
+	td = curthread;
+
+	td->td_dom_rr_idx++;
+	td->td_dom_rr_idx %= vm_ndomains;
+	return (td->td_dom_rr_idx);
+#else
+	return (0);
+#endif
+}
+
+boolean_t
+vm_phys_domain_intersects(long mask, vm_paddr_t low, vm_paddr_t high)
+{
+	struct vm_phys_seg *s;
+	int idx;
+
+	while ((idx = ffsl(mask)) != 0) {
+		idx--;	/* ffsl counts from 1 */
+		mask &= ~(1UL << idx);
+		s = &vm_phys_segs[idx];
+		if (low < s->end && high > s->start)
+			return (TRUE);
+	}
+	return (FALSE);
+}
+
 /*
  * Outputs the state of the physical memory allocator, specifically,
  * the amount of physical memory in each free list.
@@ -135,30 +179,34 @@
 {
 	struct sbuf sbuf;
 	struct vm_freelist *fl;
-	int error, flind, oind, pind;
+	int dom, error, flind, oind, pind;
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
-	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
-	for (flind = 0; flind < vm_nfreelists; flind++) {
-		sbuf_printf(&sbuf, "\nFREE LIST %d:\n"
-		    "\n  ORDER (SIZE)  |  NUMBER"
-		    "\n              ", flind);
-		for (pind = 0; pind < VM_NFREEPOOL; pind++)
-			sbuf_printf(&sbuf, "  |  POOL %d", pind);
-		sbuf_printf(&sbuf, "\n--            ");
-		for (pind = 0; pind < VM_NFREEPOOL; pind++)
-			sbuf_printf(&sbuf, "-- --      ");
-		sbuf_printf(&sbuf, "--\n");
-		for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
-			sbuf_printf(&sbuf, "  %2d (%6dK)", oind,
-			    1 << (PAGE_SHIFT - 10 + oind));
-			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
-				fl = vm_phys_free_queues[flind][pind];
-				sbuf_printf(&sbuf, "  |  %6d", fl[oind].lcnt);
+	sbuf_new_for_sysctl(&sbuf, NULL, 128 * vm_ndomains, req);
+	for (dom = 0; dom < vm_ndomains; dom++) {
+		sbuf_printf(&sbuf,"\nDOMAIN %d:\n", dom);
+		for (flind = 0; flind < vm_nfreelists; flind++) {
+			sbuf_printf(&sbuf, "\nFREE LIST %d:\n"
+			    "\n  ORDER (SIZE)  |  NUMBER"
+			    "\n              ", flind);
+			for (pind = 0; pind < VM_NFREEPOOL; pind++)
+				sbuf_printf(&sbuf, "  |  POOL %d", pind);
+			sbuf_printf(&sbuf, "\n--            ");
+			for (pind = 0; pind < VM_NFREEPOOL; pind++)
+				sbuf_printf(&sbuf, "-- --      ");
+			sbuf_printf(&sbuf, "--\n");
+			for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
+				sbuf_printf(&sbuf, "  %2d (%6dK)", oind,
+				    1 << (PAGE_SHIFT - 10 + oind));
+				for (pind = 0; pind < VM_NFREEPOOL; pind++) {
+				fl = vm_phys_free_queues[dom][flind][pind];
+					sbuf_printf(&sbuf, "  |  %6d",
+					    fl[oind].lcnt);
+				}
+				sbuf_printf(&sbuf, "\n");
 			}
-			sbuf_printf(&sbuf, "\n");
 		}
 	}
 	error = sbuf_finish(&sbuf);
@@ -195,78 +243,56 @@
 	return (error);
 }
 
-#if VM_NDOMAIN > 1
-/*
- * Outputs the set of free list lookup lists.
- */
-static int
-sysctl_vm_phys_lookup_lists(SYSCTL_HANDLER_ARGS)
+static void
+vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int tail)
 {
-	struct sbuf sbuf;
-	int domain, error, flind, ndomains;
 
-	error = sysctl_wire_old_buffer(req, 0);
-	if (error != 0)
-		return (error);
-	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
-	ndomains = vm_nfreelists - VM_NFREELIST + 1;
-	for (domain = 0; domain < ndomains; domain++) {
-		sbuf_printf(&sbuf, "\nDOMAIN %d:\n\n", domain);
-		for (flind = 0; flind < vm_nfreelists; flind++)
-			sbuf_printf(&sbuf, "  [%d]:\t%p\n", flind,
-			    vm_phys_lookup_lists[domain][flind]);
-	}
-	error = sbuf_finish(&sbuf);
-	sbuf_delete(&sbuf);
-	return (error);
+	m->order = order;
+	if (tail)
+		TAILQ_INSERT_TAIL(&fl[order].pl, m, plinks.q);
+	else
+		TAILQ_INSERT_HEAD(&fl[order].pl, m, plinks.q);
+	fl[order].lcnt++;
 }
-#endif
-	
+
+static void
+vm_freelist_rem(struct vm_freelist *fl, vm_page_t m, int order)
+{
+
+	TAILQ_REMOVE(&fl[order].pl, m, plinks.q);
+	fl[order].lcnt--;
+	m->order = VM_NFREEORDER;
+}
+
 /*
  * Create a physical memory segment.
  */
 static void
-_vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind, int domain)
+_vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain)
 {
 	struct vm_phys_seg *seg;
-#ifdef VM_PHYSSEG_SPARSE
-	long pages;
-	int segind;
 
-	pages = 0;
-	for (segind = 0; segind < vm_phys_nsegs; segind++) {
-		seg = &vm_phys_segs[segind];
-		pages += atop(seg->end - seg->start);
-	}
-#endif
 	KASSERT(vm_phys_nsegs < VM_PHYSSEG_MAX,
 	    ("vm_phys_create_seg: increase VM_PHYSSEG_MAX"));
+	KASSERT(domain < vm_ndomains,
+	    ("vm_phys_create_seg: invalid domain provided"));
 	seg = &vm_phys_segs[vm_phys_nsegs++];
+	while (seg > vm_phys_segs && (seg - 1)->start >= end) {
+		*seg = *(seg - 1);
+		seg--;
+	}
 	seg->start = start;
 	seg->end = end;
 	seg->domain = domain;
-#ifdef VM_PHYSSEG_SPARSE
-	seg->first_page = &vm_page_array[pages];
-#else
-	seg->first_page = PHYS_TO_VM_PAGE(start);
-#endif
-#if VM_NDOMAIN > 1
-	if (flind == VM_FREELIST_DEFAULT && domain != 0) {
-		flind = VM_NFREELIST + (domain - 1);
-		if (flind >= vm_nfreelists)
-			vm_nfreelists = flind + 1;
-	}
-#endif
-	seg->free_queues = &vm_phys_free_queues[flind];
 }
 
 static void
-vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int flind)
+vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end)
 {
 	int i;
 
 	if (mem_affinity == NULL) {
-		_vm_phys_create_seg(start, end, flind, 0);
+		_vm_phys_create_seg(start, end, 0);
 		return;
 	}
 
@@ -279,11 +305,11 @@
 			panic("No affinity info for start %jx",
 			    (uintmax_t)start);
 		if (mem_affinity[i].end >= end) {
-			_vm_phys_create_seg(start, end, flind,
+			_vm_phys_create_seg(start, end,
 			    mem_affinity[i].domain);
 			break;
 		}
-		_vm_phys_create_seg(start, mem_affinity[i].end, flind,
+		_vm_phys_create_seg(start, mem_affinity[i].end,
 		    mem_affinity[i].domain);
 		start = mem_affinity[i].end;
 	}
@@ -290,90 +316,163 @@
 }
 
 /*
+ * Add a physical memory segment.
+ */
+void
+vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end)
+{
+	vm_paddr_t paddr;
+
+	KASSERT((start & PAGE_MASK) == 0,
+	    ("vm_phys_define_seg: start is not page aligned"));
+	KASSERT((end & PAGE_MASK) == 0,
+	    ("vm_phys_define_seg: end is not page aligned"));
+
+	/*
+	 * Split the physical memory segment if it spans two or more free
+	 * list boundaries.
+	 */
+	paddr = start;
+#ifdef	VM_FREELIST_ISADMA
+	if (paddr < VM_ISADMA_BOUNDARY && end > VM_ISADMA_BOUNDARY) {
+		vm_phys_create_seg(paddr, VM_ISADMA_BOUNDARY);
+		paddr = VM_ISADMA_BOUNDARY;
+	}
+#endif
+#ifdef	VM_FREELIST_LOWMEM
+	if (paddr < VM_LOWMEM_BOUNDARY && end > VM_LOWMEM_BOUNDARY) {
+		vm_phys_create_seg(paddr, VM_LOWMEM_BOUNDARY);
+		paddr = VM_LOWMEM_BOUNDARY;
+	}
+#endif
+#ifdef	VM_FREELIST_DMA32
+	if (paddr < VM_DMA32_BOUNDARY && end > VM_DMA32_BOUNDARY) {
+		vm_phys_create_seg(paddr, VM_DMA32_BOUNDARY);
+		paddr = VM_DMA32_BOUNDARY;
+	}
+#endif
+	vm_phys_create_seg(paddr, end);
+}
+
+/*
  * Initialize the physical memory allocator.
+ *
+ * Requires that vm_page_array is initialized!
  */
 void
 vm_phys_init(void)
 {
 	struct vm_freelist *fl;
-	int flind, i, oind, pind;
-#if VM_NDOMAIN > 1
-	int ndomains, j;
+	struct vm_phys_seg *seg;
+	u_long npages;
+	int dom, flind, freelist, oind, pind, segind;
+
+	/*
+	 * Compute the number of free lists, and generate the mapping from the
+	 * manifest constants VM_FREELIST_* to the free list indices.
+	 *
+	 * Initially, the entries of vm_freelist_to_flind[] are set to either
+	 * 0 or 1 to indicate which free lists should be created.
+	 */
+	npages = 0;
+	for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
+		seg = &vm_phys_segs[segind];
+#ifdef	VM_FREELIST_ISADMA
+		if (seg->end <= VM_ISADMA_BOUNDARY)
+			vm_freelist_to_flind[VM_FREELIST_ISADMA] = 1;
+		else
 #endif
+#ifdef	VM_FREELIST_LOWMEM
+		if (seg->end <= VM_LOWMEM_BOUNDARY)
+			vm_freelist_to_flind[VM_FREELIST_LOWMEM] = 1;
+		else
+#endif
+#ifdef	VM_FREELIST_DMA32
+		if (
+#ifdef	VM_DMA32_NPAGES_THRESHOLD
+		    /*
+		     * Create the DMA32 free list only if the amount of
+		     * physical memory above physical address 4G exceeds the
+		     * given threshold.
+		     */
+		    npages > VM_DMA32_NPAGES_THRESHOLD &&
+#endif
+		    seg->end <= VM_DMA32_BOUNDARY)
+			vm_freelist_to_flind[VM_FREELIST_DMA32] = 1;
+		else
+#endif
+		{
+			npages += atop(seg->end - seg->start);
+			vm_freelist_to_flind[VM_FREELIST_DEFAULT] = 1;
+		}
+	}
+	/* Change each entry into a running total of the free lists. */
+	for (freelist = 1; freelist < VM_NFREELIST; freelist++) {
+		vm_freelist_to_flind[freelist] +=
+		    vm_freelist_to_flind[freelist - 1];
+	}
+	vm_nfreelists = vm_freelist_to_flind[VM_NFREELIST - 1];
+	KASSERT(vm_nfreelists > 0, ("vm_phys_init: no free lists"));
+	/* Change each entry into a free list index. */
+	for (freelist = 0; freelist < VM_NFREELIST; freelist++)
+		vm_freelist_to_flind[freelist]--;
 
-	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
+	/*
+	 * Initialize the first_page and free_queues fields of each physical
+	 * memory segment.
+	 */
+#ifdef VM_PHYSSEG_SPARSE
+	npages = 0;
+#endif
+	for (segind = 0; segind < vm_phys_nsegs; segind++) {
+		seg = &vm_phys_segs[segind];
+#ifdef VM_PHYSSEG_SPARSE
+		seg->first_page = &vm_page_array[npages];
+		npages += atop(seg->end - seg->start);
+#else
+		seg->first_page = PHYS_TO_VM_PAGE(seg->start);
+#endif
 #ifdef	VM_FREELIST_ISADMA
-		if (phys_avail[i] < 16777216) {
-			if (phys_avail[i + 1] > 16777216) {
-				vm_phys_create_seg(phys_avail[i], 16777216,
-				    VM_FREELIST_ISADMA);
-				vm_phys_create_seg(16777216, phys_avail[i + 1],
-				    VM_FREELIST_DEFAULT);
-			} else {
-				vm_phys_create_seg(phys_avail[i],
-				    phys_avail[i + 1], VM_FREELIST_ISADMA);
-			}
-			if (VM_FREELIST_ISADMA >= vm_nfreelists)
-				vm_nfreelists = VM_FREELIST_ISADMA + 1;
+		if (seg->end <= VM_ISADMA_BOUNDARY) {
+			flind = vm_freelist_to_flind[VM_FREELIST_ISADMA];
+			KASSERT(flind >= 0,
+			    ("vm_phys_init: ISADMA flind < 0"));
 		} else
 #endif
-#ifdef	VM_FREELIST_HIGHMEM
-		if (phys_avail[i + 1] > VM_HIGHMEM_ADDRESS) {
-			if (phys_avail[i] < VM_HIGHMEM_ADDRESS) {
-				vm_phys_create_seg(phys_avail[i],
-				    VM_HIGHMEM_ADDRESS, VM_FREELIST_DEFAULT);
-				vm_phys_create_seg(VM_HIGHMEM_ADDRESS,
-				    phys_avail[i + 1], VM_FREELIST_HIGHMEM);
-			} else {
-				vm_phys_create_seg(phys_avail[i],
-				    phys_avail[i + 1], VM_FREELIST_HIGHMEM);
-			}
-			if (VM_FREELIST_HIGHMEM >= vm_nfreelists)
-				vm_nfreelists = VM_FREELIST_HIGHMEM + 1;
+#ifdef	VM_FREELIST_LOWMEM
+		if (seg->end <= VM_LOWMEM_BOUNDARY) {
+			flind = vm_freelist_to_flind[VM_FREELIST_LOWMEM];
+			KASSERT(flind >= 0,
+			    ("vm_phys_init: LOWMEM flind < 0"));
 		} else
 #endif
-		vm_phys_create_seg(phys_avail[i], phys_avail[i + 1],
-		    VM_FREELIST_DEFAULT);
-	}
-	for (flind = 0; flind < vm_nfreelists; flind++) {
-		for (pind = 0; pind < VM_NFREEPOOL; pind++) {
-			fl = vm_phys_free_queues[flind][pind];
-			for (oind = 0; oind < VM_NFREEORDER; oind++)
-				TAILQ_INIT(&fl[oind].pl);
+#ifdef	VM_FREELIST_DMA32
+		if (seg->end <= VM_DMA32_BOUNDARY) {
+			flind = vm_freelist_to_flind[VM_FREELIST_DMA32];
+			KASSERT(flind >= 0,
+			    ("vm_phys_init: DMA32 flind < 0"));
+		} else
+#endif
+		{
+			flind = vm_freelist_to_flind[VM_FREELIST_DEFAULT];
+			KASSERT(flind >= 0,
+			    ("vm_phys_init: DEFAULT flind < 0"));
 		}
+		seg->free_queues = &vm_phys_free_queues[seg->domain][flind];
 	}
-#if VM_NDOMAIN > 1
+
 	/*
-	 * Build a free list lookup list for each domain.  All of the
-	 * memory domain lists are inserted at the VM_FREELIST_DEFAULT
-	 * index in a round-robin order starting with the current
-	 * domain.
+	 * Initialize the free queues.
 	 */
-	ndomains = vm_nfreelists - VM_NFREELIST + 1;
-	for (flind = 0; flind < VM_FREELIST_DEFAULT; flind++)
-		for (i = 0; i < ndomains; i++)
-			vm_phys_lookup_lists[i][flind] =
-			    &vm_phys_free_queues[flind];
-	for (i = 0; i < ndomains; i++)
-		for (j = 0; j < ndomains; j++) {
-			flind = (i + j) % ndomains;
-			if (flind == 0)
-				flind = VM_FREELIST_DEFAULT;
-			else
-				flind += VM_NFREELIST - 1;
-			vm_phys_lookup_lists[i][VM_FREELIST_DEFAULT + j] =
-			    &vm_phys_free_queues[flind];
+	for (dom = 0; dom < vm_ndomains; dom++) {
+		for (flind = 0; flind < vm_nfreelists; flind++) {
+			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
+				fl = vm_phys_free_queues[dom][flind][pind];
+				for (oind = 0; oind < VM_NFREEORDER; oind++)
+					TAILQ_INIT(&fl[oind].pl);
+			}
 		}
-	for (flind = VM_FREELIST_DEFAULT + 1; flind < VM_NFREELIST;
-	     flind++)
-		for (i = 0; i < ndomains; i++)
-			vm_phys_lookup_lists[i][flind + ndomains - 1] =
-			    &vm_phys_free_queues[flind];
-#else
-	for (flind = 0; flind < vm_nfreelists; flind++)
-		vm_phys_lookup_lists[0][flind] = &vm_phys_free_queues[flind];
-#endif
-
+	}
 	mtx_init(&vm_phys_fictitious_reg_mtx, "vmfctr", NULL, MTX_DEF);
 }
 
@@ -391,9 +490,7 @@
 		KASSERT(m_buddy->order == VM_NFREEORDER,
 		    ("vm_phys_split_pages: page %p has unexpected order %d",
 		    m_buddy, m_buddy->order));
-		m_buddy->order = oind;
-		TAILQ_INSERT_HEAD(&fl[oind].pl, m_buddy, pageq);
-		fl[oind].lcnt++;
+		vm_freelist_add(fl, m_buddy, oind, 0);
         }
 }
 
@@ -404,12 +501,17 @@
 vm_phys_add_page(vm_paddr_t pa)
 {
 	vm_page_t m;
+	struct vm_domain *vmd;
 
 	cnt.v_page_count++;
 	m = vm_phys_paddr_to_vm_page(pa);
+	m->busy_lock = VPB_UNBUSIED;
 	m->phys_addr = pa;
 	m->queue = PQ_NONE;
 	m->segind = vm_phys_paddr_to_segind(pa);
+	vmd = vm_phys_domain(m);
+	vmd->vmd_page_count++;
+	vmd->vmd_segs |= 1UL << m->segind;
 	m->flags = PG_FREE;
 	KASSERT(m->order == VM_NFREEORDER,
 	    ("vm_phys_add_page: page %p has unexpected order %d",
@@ -417,7 +519,7 @@
 	m->pool = VM_FREEPOOL_DEFAULT;
 	pmap_page_init(m);
 	mtx_lock(&vm_page_queue_free_mtx);
-	cnt.v_free_count++;
+	vm_phys_freecnt_adj(m, 1);
 	vm_phys_free_pages(m, 0);
 	mtx_unlock(&vm_page_queue_free_mtx);
 }
@@ -432,7 +534,7 @@
 vm_phys_alloc_pages(int pool, int order)
 {
 	vm_page_t m;
-	int domain, flind;
+	int dom, domain, flind;
 
 	KASSERT(pool < VM_NFREEPOOL,
 	    ("vm_phys_alloc_pages: pool %d is out of range", pool));
@@ -439,63 +541,46 @@
 	KASSERT(order < VM_NFREEORDER,
 	    ("vm_phys_alloc_pages: order %d is out of range", order));
 
-#if VM_NDOMAIN > 1
-	domain = PCPU_GET(domain);
-#else
-	domain = 0;
-#endif
-	for (flind = 0; flind < vm_nfreelists; flind++) {
-		m = vm_phys_alloc_domain_pages(domain, flind, pool, order);
-		if (m != NULL)
-			return (m);
+	for (dom = 0; dom < vm_ndomains; dom++) {
+		domain = vm_rr_selectdomain();
+		for (flind = 0; flind < vm_nfreelists; flind++) {
+			m = vm_phys_alloc_domain_pages(domain, flind, pool,
+			    order);
+			if (m != NULL)
+				return (m);
+		}
 	}
 	return (NULL);
 }
 
 /*
- * Find and dequeue a free page on the given free list, with the 
- * specified pool and order
+ * Allocate a contiguous, power of two-sized set of physical pages from the
+ * specified free list.  The free list must be specified using one of the
+ * manifest constants VM_FREELIST_*.
+ *
+ * The free page queues must be locked.
  */
 vm_page_t
-vm_phys_alloc_freelist_pages(int flind, int pool, int order)
+vm_phys_alloc_freelist_pages(int freelist, int pool, int order)
 {
-#if VM_NDOMAIN > 1
 	vm_page_t m;
-	int i, ndomains;
-#endif
-	int domain;
+	int dom, domain;
 
-	KASSERT(flind < VM_NFREELIST,
-	    ("vm_phys_alloc_freelist_pages: freelist %d is out of range", flind));
+	KASSERT(freelist < VM_NFREELIST,
+	    ("vm_phys_alloc_freelist_pages: freelist %d is out of range",
+	    freelist));
 	KASSERT(pool < VM_NFREEPOOL,
 	    ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool));
 	KASSERT(order < VM_NFREEORDER,
 	    ("vm_phys_alloc_freelist_pages: order %d is out of range", order));
-
-#if VM_NDOMAIN > 1
-	/*
-	 * This routine expects to be called with a VM_FREELIST_* constant.
-	 * On a system with multiple domains we need to adjust the flind
-	 * appropriately.  If it is for VM_FREELIST_DEFAULT we need to
-	 * iterate over the per-domain lists.
-	 */
-	domain = PCPU_GET(domain);
-	ndomains = vm_nfreelists - VM_NFREELIST + 1;
-	if (flind == VM_FREELIST_DEFAULT) {
-		m = NULL;
-		for (i = 0; i < ndomains; i++, flind++) {
-			m = vm_phys_alloc_domain_pages(domain, flind, pool,
-			    order);
-			if (m != NULL)
-				break;
-		}
-		return (m);
-	} else if (flind > VM_FREELIST_DEFAULT)
-		flind += ndomains - 1;
-#else
-	domain = 0;
-#endif
-	return (vm_phys_alloc_domain_pages(domain, flind, pool, order));
+	for (dom = 0; dom < vm_ndomains; dom++) {
+		domain = vm_rr_selectdomain();
+		m = vm_phys_alloc_domain_pages(domain,
+		    vm_freelist_to_flind[freelist], pool, order);
+		if (m != NULL)
+			return (m);
+	}
+	return (NULL);
 }
 
 static vm_page_t
@@ -507,13 +592,11 @@
 	vm_page_t m;
 
 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
-	fl = (*vm_phys_lookup_lists[domain][flind])[pool];
+	fl = &vm_phys_free_queues[domain][flind][pool][0];
 	for (oind = order; oind < VM_NFREEORDER; oind++) {
 		m = TAILQ_FIRST(&fl[oind].pl);
 		if (m != NULL) {
-			TAILQ_REMOVE(&fl[oind].pl, m, pageq);
-			fl[oind].lcnt--;
-			m->order = VM_NFREEORDER;
+			vm_freelist_rem(fl, m, oind);
 			vm_phys_split_pages(m, oind, fl, order);
 			return (m);
 		}
@@ -527,12 +610,10 @@
 	 */
 	for (oind = VM_NFREEORDER - 1; oind >= order; oind--) {
 		for (pind = 0; pind < VM_NFREEPOOL; pind++) {
-			alt = (*vm_phys_lookup_lists[domain][flind])[pind];
+			alt = &vm_phys_free_queues[domain][flind][pind][0];
 			m = TAILQ_FIRST(&alt[oind].pl);
 			if (m != NULL) {
-				TAILQ_REMOVE(&alt[oind].pl, m, pageq);
-				alt[oind].lcnt--;
-				m->order = VM_NFREEORDER;
+				vm_freelist_rem(alt, m, oind);
 				vm_phys_set_pool(pool, m, oind);
 				vm_phys_split_pages(m, oind, fl, order);
 				return (m);
@@ -543,26 +624,6 @@
 }
 
 /*
- * Allocate physical memory from phys_avail[].
- */
-vm_paddr_t
-vm_phys_bootstrap_alloc(vm_size_t size, unsigned long alignment)
-{
-	vm_paddr_t pa;
-	int i;
-
-	size = round_page(size);
-	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
-		if (phys_avail[i + 1] - phys_avail[i] < size)
-			continue;
-		pa = phys_avail[i];
-		phys_avail[i] += size;
-		return (pa);
-	}
-	panic("vm_phys_bootstrap_alloc");
-}
-
-/*
  * Find the vm_page corresponding to the given physical address.
  */
 vm_page_t
@@ -616,7 +677,9 @@
 
 #ifdef VM_PHYSSEG_DENSE
 	pi = atop(start);
-	if (pi >= first_page && atop(end) < vm_page_array_size) {
+	if (pi >= first_page && pi < vm_page_array_size + first_page) {
+		if (atop(end) >= vm_page_array_size + first_page)
+			return (EINVAL);
 		fp = &vm_page_array[pi - first_page];
 		malloced = FALSE;
 	} else
@@ -630,8 +693,8 @@
 	}
 	for (i = 0; i < page_count; i++) {
 		vm_page_initfake(&fp[i], start + PAGE_SIZE * i, memattr);
-		pmap_page_init(&fp[i]);
-		fp[i].oflags &= ~(VPO_BUSY | VPO_UNMANAGED);
+		fp[i].oflags &= ~VPO_UNMANAGED;
+		fp[i].busy_lock = VPB_UNBUSIED;
 	}
 	mtx_lock(&vm_phys_fictitious_reg_mtx);
 	for (segind = 0; segind < VM_PHYS_FICTITIOUS_NSEGS; segind++) {
@@ -713,7 +776,7 @@
 {
 	struct vm_freelist *fl;
 	struct vm_phys_seg *seg;
-	vm_paddr_t pa, pa_buddy;
+	vm_paddr_t pa;
 	vm_page_t m_buddy;
 
 	KASSERT(m->order == VM_NFREEORDER,
@@ -725,33 +788,71 @@
 	KASSERT(order < VM_NFREEORDER,
 	    ("vm_phys_free_pages: order %d is out of range", order));
 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
-	pa = VM_PAGE_TO_PHYS(m);
 	seg = &vm_phys_segs[m->segind];
-	while (order < VM_NFREEORDER - 1) {
-		pa_buddy = pa ^ (1 << (PAGE_SHIFT + order));
-		if (pa_buddy < seg->start ||
-		    pa_buddy >= seg->end)
-			break;
-		m_buddy = &seg->first_page[atop(pa_buddy - seg->start)];
-		if (m_buddy->order != order)
-			break;
-		fl = (*seg->free_queues)[m_buddy->pool];
-		TAILQ_REMOVE(&fl[m_buddy->order].pl, m_buddy, pageq);
-		fl[m_buddy->order].lcnt--;
-		m_buddy->order = VM_NFREEORDER;
-		if (m_buddy->pool != m->pool)
-			vm_phys_set_pool(m->pool, m_buddy, order);
-		order++;
-		pa &= ~((1 << (PAGE_SHIFT + order)) - 1);
-		m = &seg->first_page[atop(pa - seg->start)];
+	if (order < VM_NFREEORDER - 1) {
+		pa = VM_PAGE_TO_PHYS(m);
+		do {
+			pa ^= ((vm_paddr_t)1 << (PAGE_SHIFT + order));
+			if (pa < seg->start || pa >= seg->end)
+				break;
+			m_buddy = &seg->first_page[atop(pa - seg->start)];
+			if (m_buddy->order != order)
+				break;
+			fl = (*seg->free_queues)[m_buddy->pool];
+			vm_freelist_rem(fl, m_buddy, order);
+			if (m_buddy->pool != m->pool)
+				vm_phys_set_pool(m->pool, m_buddy, order);
+			order++;
+			pa &= ~(((vm_paddr_t)1 << (PAGE_SHIFT + order)) - 1);
+			m = &seg->first_page[atop(pa - seg->start)];
+		} while (order < VM_NFREEORDER - 1);
 	}
-	m->order = order;
 	fl = (*seg->free_queues)[m->pool];
-	TAILQ_INSERT_TAIL(&fl[order].pl, m, pageq);
-	fl[order].lcnt++;
+	vm_freelist_add(fl, m, order, 1);
 }
 
 /*
+ * Free a contiguous, arbitrarily sized set of physical pages.
+ *
+ * The free page queues must be locked.
+ */
+void
+vm_phys_free_contig(vm_page_t m, u_long npages)
+{
+	u_int n;
+	int order;
+
+	/*
+	 * Avoid unnecessary coalescing by freeing the pages in the largest
+	 * possible power-of-two-sized subsets.
+	 */
+	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+	for (;; npages -= n) {
+		/*
+		 * Unsigned "min" is used here so that "order" is assigned
+		 * "VM_NFREEORDER - 1" when "m"'s physical address is zero
+		 * or the low-order bits of its physical address are zero
+		 * because the size of a physical address exceeds the size of
+		 * a long.
+		 */
+		order = min(ffsl(VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT) - 1,
+		    VM_NFREEORDER - 1);
+		n = 1 << order;
+		if (npages < n)
+			break;
+		vm_phys_free_pages(m, order);
+		m += n;
+	}
+	/* The residual "npages" is less than "1 << (VM_NFREEORDER - 1)". */
+	for (; npages > 0; npages -= n) {
+		order = flsl(npages) - 1;
+		n = 1 << order;
+		vm_phys_free_pages(m, order);
+		m += n;
+	}
+}
+
+/*
  * Set the pool for a contiguous, power of two-sized set of physical pages. 
  */
 void
@@ -812,9 +913,7 @@
 	 */
 	fl = (*seg->free_queues)[m_set->pool];
 	order = m_set->order;
-	TAILQ_REMOVE(&fl[order].pl, m_set, pageq);
-	fl[order].lcnt--;
-	m_set->order = VM_NFREEORDER;
+	vm_freelist_rem(fl, m_set, order);
 	while (order > 0) {
 		order--;
 		pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order));
@@ -824,9 +923,7 @@
 			m_tmp = m_set;
 			m_set = &seg->first_page[atop(pa_half - seg->start)];
 		}
-		m_tmp->order = order;
-		TAILQ_INSERT_HEAD(&fl[order].pl, m_tmp, pageq);
-		fl[order].lcnt++;
+		vm_freelist_add(fl, m_tmp, order, 0);
 	}
 	KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency"));
 	return (TRUE);
@@ -838,22 +935,25 @@
 boolean_t
 vm_phys_zero_pages_idle(void)
 {
-	static struct vm_freelist *fl = vm_phys_free_queues[0][0];
+	static struct vm_freelist *fl;
 	static int flind, oind, pind;
 	vm_page_t m, m_tmp;
+	int domain;
 
+	domain = vm_rr_selectdomain();
+	fl = vm_phys_free_queues[domain][0][0];
 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 	for (;;) {
-		TAILQ_FOREACH_REVERSE(m, &fl[oind].pl, pglist, pageq) {
+		TAILQ_FOREACH_REVERSE(m, &fl[oind].pl, pglist, plinks.q) {
 			for (m_tmp = m; m_tmp < &m[1 << oind]; m_tmp++) {
 				if ((m_tmp->flags & (PG_CACHED | PG_ZERO)) == 0) {
 					vm_phys_unfree_page(m_tmp);
-					cnt.v_free_count--;
+					vm_phys_freecnt_adj(m, -1);
 					mtx_unlock(&vm_page_queue_free_mtx);
 					pmap_zero_page_idle(m_tmp);
 					m_tmp->flags |= PG_ZERO;
 					mtx_lock(&vm_page_queue_free_mtx);
-					cnt.v_free_count++;
+					vm_phys_freecnt_adj(m, 1);
 					vm_phys_free_pages(m_tmp, 0);
 					vm_page_zero_count++;
 					cnt_prezero++;
@@ -871,7 +971,7 @@
 				if (flind == vm_nfreelists)
 					flind = 0;
 			}
-			fl = vm_phys_free_queues[flind][pind];
+			fl = vm_phys_free_queues[domain][flind][pind];
 		}
 	}
 }
@@ -887,21 +987,17 @@
  * "alignment" and "boundary" must be a power of two.
  */
 vm_page_t
-vm_phys_alloc_contig(unsigned long npages, vm_paddr_t low, vm_paddr_t high,
-    unsigned long alignment, unsigned long boundary)
+vm_phys_alloc_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
+    u_long alignment, vm_paddr_t boundary)
 {
 	struct vm_freelist *fl;
 	struct vm_phys_seg *seg;
-	struct vnode *vp;
 	vm_paddr_t pa, pa_last, size;
-	vm_page_t deferred_vdrop_list, m, m_ret;
-	int domain, flind, i, oind, order, pind;
+	vm_page_t m, m_ret;
+	u_long npages_end;
+	int dom, domain, flind, oind, order, pind;
 
-#if VM_NDOMAIN > 1
-	domain = PCPU_GET(domain);
-#else
-	domain = 0;
-#endif
+	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 	size = npages << PAGE_SHIFT;
 	KASSERT(size != 0,
 	    ("vm_phys_alloc_contig: size must not be 0"));
@@ -909,19 +1005,16 @@
 	    ("vm_phys_alloc_contig: alignment must be a power of 2"));
 	KASSERT((boundary & (boundary - 1)) == 0,
 	    ("vm_phys_alloc_contig: boundary must be a power of 2"));
-	deferred_vdrop_list = NULL;
 	/* Compute the queue that is the best fit for npages. */
 	for (order = 0; (1 << order) < npages; order++);
-	mtx_lock(&vm_page_queue_free_mtx);
-#if VM_NRESERVLEVEL > 0
-retry:
-#endif
+	dom = 0;
+restartdom:
+	domain = vm_rr_selectdomain();
 	for (flind = 0; flind < vm_nfreelists; flind++) {
 		for (oind = min(order, VM_NFREEORDER - 1); oind < VM_NFREEORDER; oind++) {
 			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
-				fl = (*vm_phys_lookup_lists[domain][flind])
-				    [pind];
-				TAILQ_FOREACH(m_ret, &fl[oind].pl, pageq) {
+				fl = &vm_phys_free_queues[domain][flind][pind][0];
+				TAILQ_FOREACH(m_ret, &fl[oind].pl, plinks.q) {
 					/*
 					 * A free list may contain physical pages
 					 * from one or more segments.
@@ -974,51 +1067,22 @@
 			}
 		}
 	}
-#if VM_NRESERVLEVEL > 0
-	if (vm_reserv_reclaim_contig(size, low, high, alignment, boundary))
-		goto retry;
-#endif
-	mtx_unlock(&vm_page_queue_free_mtx);
+	if (++dom < vm_ndomains)
+		goto restartdom;
 	return (NULL);
 done:
 	for (m = m_ret; m < &m_ret[npages]; m = &m[1 << oind]) {
 		fl = (*seg->free_queues)[m->pool];
-		TAILQ_REMOVE(&fl[m->order].pl, m, pageq);
-		fl[m->order].lcnt--;
-		m->order = VM_NFREEORDER;
+		vm_freelist_rem(fl, m, m->order);
 	}
 	if (m_ret->pool != VM_FREEPOOL_DEFAULT)
 		vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m_ret, oind);
 	fl = (*seg->free_queues)[m_ret->pool];
 	vm_phys_split_pages(m_ret, oind, fl, order);
-	for (i = 0; i < npages; i++) {
-		m = &m_ret[i];
-		vp = vm_page_alloc_init(m);
-		if (vp != NULL) {
-			/*
-			 * Enqueue the vnode for deferred vdrop().
-			 *
-			 * Unmanaged pages don't use "pageq", so it
-			 * can be safely abused to construct a short-
-			 * lived queue of vnodes.
-			 */
-			m->pageq.tqe_prev = (void *)vp;
-			m->pageq.tqe_next = deferred_vdrop_list;
-			deferred_vdrop_list = m;
-		}
-	}
-	for (; i < roundup2(npages, 1 << imin(oind, order)); i++) {
-		m = &m_ret[i];
-		KASSERT(m->order == VM_NFREEORDER,
-		    ("vm_phys_alloc_contig: page %p has unexpected order %d",
-		    m, m->order));
-		vm_phys_free_pages(m, 0);
-	}
-	mtx_unlock(&vm_page_queue_free_mtx);
-	while (deferred_vdrop_list != NULL) {
-		vdrop((struct vnode *)deferred_vdrop_list->pageq.tqe_prev);
-		deferred_vdrop_list = deferred_vdrop_list->pageq.tqe_next;
-	}
+	/* Return excess pages to the free lists. */
+	npages_end = roundup2(npages, 1 << imin(oind, order));
+	if (npages < npages_end)
+		vm_phys_free_contig(&m_ret[npages], npages_end - npages);
 	return (m_ret);
 }
 
@@ -1029,24 +1093,28 @@
 DB_SHOW_COMMAND(freepages, db_show_freepages)
 {
 	struct vm_freelist *fl;
-	int flind, oind, pind;
+	int flind, oind, pind, dom;
 
-	for (flind = 0; flind < vm_nfreelists; flind++) {
-		db_printf("FREE LIST %d:\n"
-		    "\n  ORDER (SIZE)  |  NUMBER"
-		    "\n              ", flind);
-		for (pind = 0; pind < VM_NFREEPOOL; pind++)
-			db_printf("  |  POOL %d", pind);
-		db_printf("\n--            ");
-		for (pind = 0; pind < VM_NFREEPOOL; pind++)
-			db_printf("-- --      ");
-		db_printf("--\n");
-		for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
-			db_printf("  %2.2d (%6.6dK)", oind,
-			    1 << (PAGE_SHIFT - 10 + oind));
-			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
-				fl = vm_phys_free_queues[flind][pind];
-				db_printf("  |  %6.6d", fl[oind].lcnt);
+	for (dom = 0; dom < vm_ndomains; dom++) {
+		db_printf("DOMAIN: %d\n", dom);
+		for (flind = 0; flind < vm_nfreelists; flind++) {
+			db_printf("FREE LIST %d:\n"
+			    "\n  ORDER (SIZE)  |  NUMBER"
+			    "\n              ", flind);
+			for (pind = 0; pind < VM_NFREEPOOL; pind++)
+				db_printf("  |  POOL %d", pind);
+			db_printf("\n--            ");
+			for (pind = 0; pind < VM_NFREEPOOL; pind++)
+				db_printf("-- --      ");
+			db_printf("--\n");
+			for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
+				db_printf("  %2.2d (%6.6dK)", oind,
+				    1 << (PAGE_SHIFT - 10 + oind));
+				for (pind = 0; pind < VM_NFREEPOOL; pind++) {
+				fl = vm_phys_free_queues[dom][flind][pind];
+					db_printf("  |  %6.6d", fl[oind].lcnt);
+				}
+				db_printf("\n");
 			}
 			db_printf("\n");
 		}
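
The vm_phys.c rework above introduces per-domain free queues selected round-robin via vm_rr_selectdomain(), moves segment creation and free-list classification into vm_phys_add_seg()/vm_phys_init(), and adds vm_phys_free_contig(), which returns an arbitrary run of pages to the buddy queues as power-of-two blocks sized by the start page's alignment. A hedged userland sketch of just that chunking arithmetic (VM_NFREEORDER is assumed to be 13; the printf calls stand in for vm_phys_free_pages()):

	#include <stdio.h>
	#include <strings.h>	/* ffsll()/flsll() on FreeBSD */

	#define	NFREEORDER	13	/* assumed, matches the usual VM_NFREEORDER */

	static void
	free_contig(unsigned long long pfn, unsigned long long npages)
	{
		unsigned long long n;
		int order;

		/* Largest blocks allowed by the alignment of the start page. */
		for (;; npages -= n, pfn += n) {
			order = ffsll(pfn) - 1;
			if (order < 0 || order > NFREEORDER - 1)
				order = NFREEORDER - 1;	/* pfn 0 or very aligned */
			n = 1ULL << order;
			if (npages < n)
				break;
			printf("free order %2d (%3llu pages) at pfn %llu\n",
			    order, n, pfn);
		}
		/* Residual is freed in descending powers of two. */
		for (; npages > 0; npages -= n, pfn += n) {
			order = flsll(npages) - 1;
			n = 1ULL << order;
			printf("free order %2d (%3llu pages) at pfn %llu\n",
			    order, n, pfn);
		}
	}

	int
	main(void)
	{
		/* 100 pages at pfn 24: 8 + 32, then 32 + 16 + 8 + 4 residual. */
		free_contig(24, 100);
		return (0);
	}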

Modified: trunk/sys/vm/vm_phys.h
===================================================================
--- trunk/sys/vm/vm_phys.h	2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm_phys.h	2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 2002-2006 Rice University
  * Copyright (c) 2007 Alan L. Cox <alc at cs.rice.edu>
@@ -28,7 +29,7 @@
  * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  *
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/vm/vm_phys.h 285634 2015-07-16 14:41:58Z kib $
  */
 
 /*
@@ -47,24 +48,76 @@
 	int domain;
 };
 
+struct vm_freelist {
+	struct pglist pl;
+	int lcnt;
+};
+
+struct vm_phys_seg {
+	vm_paddr_t	start;
+	vm_paddr_t	end;
+	vm_page_t	first_page;
+	int		domain;
+	struct vm_freelist (*free_queues)[VM_NFREEPOOL][VM_NFREEORDER];
+};
+
 extern struct mem_affinity *mem_affinity;
+extern int vm_ndomains;
+extern struct vm_phys_seg vm_phys_segs[];
+extern int vm_phys_nsegs;
 
+/*
+ * The following functions are only to be used by the virtual memory system.
+ */
 void vm_phys_add_page(vm_paddr_t pa);
-vm_page_t vm_phys_alloc_contig(unsigned long npages,
-    vm_paddr_t low, vm_paddr_t high,
-    unsigned long alignment, unsigned long boundary);
-vm_page_t vm_phys_alloc_freelist_pages(int flind, int pool, int order);
+void vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end);
+vm_page_t vm_phys_alloc_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
+    u_long alignment, vm_paddr_t boundary);
+vm_page_t vm_phys_alloc_freelist_pages(int freelist, int pool, int order);
 vm_page_t vm_phys_alloc_pages(int pool, int order);
-vm_paddr_t vm_phys_bootstrap_alloc(vm_size_t size, unsigned long alignment);
+boolean_t vm_phys_domain_intersects(long mask, vm_paddr_t low, vm_paddr_t high);
 int vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end,
     vm_memattr_t memattr);
 void vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end);
 vm_page_t vm_phys_fictitious_to_vm_page(vm_paddr_t pa);
+void vm_phys_free_contig(vm_page_t m, u_long npages);
 void vm_phys_free_pages(vm_page_t m, int order);
 void vm_phys_init(void);
+vm_page_t vm_phys_paddr_to_vm_page(vm_paddr_t pa);
 void vm_phys_set_pool(int pool, vm_page_t m, int order);
 boolean_t vm_phys_unfree_page(vm_page_t m);
 boolean_t vm_phys_zero_pages_idle(void);
 
+/*
+ *	vm_phys_domain:
+ *
+ * 	Return the memory domain the page belongs to.
+ */
+static inline struct vm_domain *
+vm_phys_domain(vm_page_t m)
+{
+#if MAXMEMDOM > 1
+	int domn, segind;
+
+	/* XXXKIB try to assert that the page is managed */
+	segind = m->segind;
+	KASSERT(segind < vm_phys_nsegs, ("segind %d m %p", segind, m));
+	domn = vm_phys_segs[segind].domain;
+	KASSERT(domn < vm_ndomains, ("domain %d m %p", domn, m));
+	return (&vm_dom[domn]);
+#else
+	return (&vm_dom[0]);
+#endif
+}
+
+static inline void
+vm_phys_freecnt_adj(vm_page_t m, int adj)
+{
+
+	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+	cnt.v_free_count += adj;
+	vm_phys_domain(m)->vmd_free_count += adj;
+}
+
 #endif	/* _KERNEL */
 #endif	/* !_VM_PHYS_H_ */
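
The vm_phys.h diff above exports the segment and free-list structures, adds vm_phys_domain() for mapping a page to its memory domain, and adds vm_phys_freecnt_adj(), which keeps cnt.v_free_count and the per-domain vmd_free_count in step under the free-queue mutex. The file added next, vm_radix.c, implements a path-compressed radix trie keyed on vm_pindex_t, consuming VM_RADIX_WIDTH bits per level (4 on LP64, so 16 children per node) with leaves tagged in the low pointer bit. A hedged userland sketch of the slot and key-difference arithmetic it builds on (the constants are copied from the added file for a 64-bit build; everything else is illustrative):

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	/* As in vm_radix.c on a 64-bit build (VM_RADIX_WIDTH == 4). */
	#define	RADIX_WIDTH	4
	#define	RADIX_COUNT	(1 << RADIX_WIDTH)
	#define	RADIX_MASK	(RADIX_COUNT - 1)
	#define	RADIX_LIMIT	((int)(sizeof(uint64_t) * 8 / RADIX_WIDTH) - 1)

	/* Child slot of "index" at a level; level 0 is least significant. */
	static int
	radix_slot(uint64_t index, int level)
	{
		return ((index >> (level * RADIX_WIDTH)) & RADIX_MASK);
	}

	/*
	 * Highest level at which two distinct keys differ; this is where
	 * a new intermediate node is placed, cf. vm_radix_keydiff() in the
	 * added file.
	 */
	static int
	radix_keydiff(uint64_t k1, uint64_t k2)
	{
		int clev;

		assert(k1 != k2);
		k1 ^= k2;
		for (clev = RADIX_LIMIT;; clev--)
			if (radix_slot(k1, clev) != 0)
				return (clev);
	}

	int
	main(void)
	{
		uint64_t a = 0x12345, b = 0x12395;
		int lev = radix_keydiff(a, b);

		/* The keys share every nibble except at level 1. */
		printf("split at level %d, slots %d vs %d\n",
		    lev, radix_slot(a, lev), radix_slot(b, lev));
		return (0);
	}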

Added: trunk/sys/vm/vm_radix.c
===================================================================
--- trunk/sys/vm/vm_radix.c	                        (rev 0)
+++ trunk/sys/vm/vm_radix.c	2018-05-24 22:27:41 UTC (rev 9896)
@@ -0,0 +1,857 @@
+/* $MidnightBSD$ */
+/*
+ * Copyright (c) 2013 EMC Corp.
+ * Copyright (c) 2011 Jeffrey Roberson <jeff at freebsd.org>
+ * Copyright (c) 2008 Mayur Shardul <mayur.shardul at gmail.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+/*
+ * Path-compressed radix trie implementation.
+ * The following code is not generalized into a general purpose library
+ * because there are way too many parameters embedded that should really
+ * be decided by the library consumers.  At the same time, consumers
+ * of this code must achieve highest possible performance.
+ *
+ * The implementation takes into account the following rationale:
+ * - Size of the nodes should be as small as possible but still big enough
+ *   to avoid a large maximum depth for the trie.  This is a balance
+ *   between the necessity to not wire too much physical memory for the nodes
+ *   and the necessity to avoid too much cache pollution during the trie
+ *   operations.
+ * - There is not a huge bias toward the number of lookup operations over
+ *   the number of insert and remove operations.  This basically implies
+ *   that optimizations supposedly helping one operation but hurting the
+ *   other might be carefully evaluated.
+ * - On average not many nodes are expected to be fully populated, hence
+ *   level compression may just complicate things.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/10/sys/vm/vm_radix.c 298653 2016-04-26 17:39:54Z pfg $");
+
+#include "opt_ddb.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/vmmeter.h>
+
+#include <vm/uma.h>
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_page.h>
+#include <vm/vm_radix.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+/*
+ * These widths should allow the pointers to a node's children to fit within
+ * a single cache line.  The extra levels from a narrow width should not be
+ * a problem thanks to path compression.
+ */
+#ifdef __LP64__
+#define	VM_RADIX_WIDTH	4
+#else
+#define	VM_RADIX_WIDTH	3
+#endif
+
+#define	VM_RADIX_COUNT	(1 << VM_RADIX_WIDTH)
+#define	VM_RADIX_MASK	(VM_RADIX_COUNT - 1)
+#define	VM_RADIX_LIMIT							\
+	(howmany(sizeof(vm_pindex_t) * NBBY, VM_RADIX_WIDTH) - 1)
+
+/* Flag bits stored in node pointers. */
+#define	VM_RADIX_ISLEAF	0x1
+#define	VM_RADIX_FLAGS	0x1
+#define	VM_RADIX_PAD	VM_RADIX_FLAGS
+
+/* Returns one unit associated with specified level. */
+#define	VM_RADIX_UNITLEVEL(lev)						\
+	((vm_pindex_t)1 << ((lev) * VM_RADIX_WIDTH))
+
+struct vm_radix_node {
+	vm_pindex_t	 rn_owner;			/* Owner of record. */
+	uint16_t	 rn_count;			/* Valid children. */
+	uint16_t	 rn_clev;			/* Current level. */
+	void		*rn_child[VM_RADIX_COUNT];	/* Child nodes. */
+};
+
+static uma_zone_t vm_radix_node_zone;
+
+/*
+ * Allocate a radix node.
+ */
+static __inline struct vm_radix_node *
+vm_radix_node_get(vm_pindex_t owner, uint16_t count, uint16_t clevel)
+{
+	struct vm_radix_node *rnode;
+
+	rnode = uma_zalloc(vm_radix_node_zone, M_NOWAIT | M_ZERO);
+	if (rnode == NULL)
+		return (NULL);
+	rnode->rn_owner = owner;
+	rnode->rn_count = count;
+	rnode->rn_clev = clevel;
+	return (rnode);
+}
+
+/*
+ * Free radix node.
+ */
+static __inline void
+vm_radix_node_put(struct vm_radix_node *rnode)
+{
+
+	uma_zfree(vm_radix_node_zone, rnode);
+}
+
+/*
+ * Return the position in the array for a given level.
+ */
+static __inline int
+vm_radix_slot(vm_pindex_t index, uint16_t level)
+{
+
+	return ((index >> (level * VM_RADIX_WIDTH)) & VM_RADIX_MASK);
+}
+
+/* Trims the key after the specified level. */
+static __inline vm_pindex_t
+vm_radix_trimkey(vm_pindex_t index, uint16_t level)
+{
+	vm_pindex_t ret;
+
+	ret = index;
+	if (level > 0) {
+		ret >>= level * VM_RADIX_WIDTH;
+		ret <<= level * VM_RADIX_WIDTH;
+	}
+	return (ret);
+}
+
+/*
+ * Get the root node for a radix tree.
+ */
+static __inline struct vm_radix_node *
+vm_radix_getroot(struct vm_radix *rtree)
+{
+
+	return ((struct vm_radix_node *)rtree->rt_root);
+}
+
+/*
+ * Set the root node for a radix tree.
+ */
+static __inline void
+vm_radix_setroot(struct vm_radix *rtree, struct vm_radix_node *rnode)
+{
+
+	rtree->rt_root = (uintptr_t)rnode;
+}
+
+/*
+ * Returns TRUE if the specified radix node is a leaf and FALSE otherwise.
+ */
+static __inline boolean_t
+vm_radix_isleaf(struct vm_radix_node *rnode)
+{
+
+	return (((uintptr_t)rnode & VM_RADIX_ISLEAF) != 0);
+}
+
+/*
+ * Returns the associated page extracted from rnode.
+ */
+static __inline vm_page_t
+vm_radix_topage(struct vm_radix_node *rnode)
+{
+
+	return ((vm_page_t)((uintptr_t)rnode & ~VM_RADIX_FLAGS));
+}
+
+/*
+ * Adds the page as a child of the provided node.
+ */
+static __inline void
+vm_radix_addpage(struct vm_radix_node *rnode, vm_pindex_t index, uint16_t clev,
+    vm_page_t page)
+{
+	int slot;
+
+	slot = vm_radix_slot(index, clev);
+	rnode->rn_child[slot] = (void *)((uintptr_t)page | VM_RADIX_ISLEAF);
+}
+
+/*
+ * Returns the slot where two keys differ.
+ * It cannot accept 2 equal keys.
+ */
+static __inline uint16_t
+vm_radix_keydiff(vm_pindex_t index1, vm_pindex_t index2)
+{
+	uint16_t clev;
+
+	KASSERT(index1 != index2, ("%s: passing the same key value %jx",
+	    __func__, (uintmax_t)index1));
+
+	index1 ^= index2;
+	for (clev = VM_RADIX_LIMIT;; clev--)
+		if (vm_radix_slot(index1, clev) != 0)
+			return (clev);
+}
+
+/*
+ * Returns TRUE if it can be determined that key does not belong to the
+ * specified rnode.  Otherwise, returns FALSE.
+ */
+static __inline boolean_t
+vm_radix_keybarr(struct vm_radix_node *rnode, vm_pindex_t idx)
+{
+
+	if (rnode->rn_clev < VM_RADIX_LIMIT) {
+		idx = vm_radix_trimkey(idx, rnode->rn_clev + 1);
+		return (idx != rnode->rn_owner);
+	}
+	return (FALSE);
+}
+
+/*
+ * Internal helper for vm_radix_reclaim_allnodes().
+ * This function is recursive.
+ */
+static void
+vm_radix_reclaim_allnodes_int(struct vm_radix_node *rnode)
+{
+	int slot;
+
+	KASSERT(rnode->rn_count <= VM_RADIX_COUNT,
+	    ("vm_radix_reclaim_allnodes_int: bad count in rnode %p", rnode));
+	for (slot = 0; rnode->rn_count != 0; slot++) {
+		if (rnode->rn_child[slot] == NULL)
+			continue;
+		if (!vm_radix_isleaf(rnode->rn_child[slot]))
+			vm_radix_reclaim_allnodes_int(rnode->rn_child[slot]);
+		rnode->rn_child[slot] = NULL;
+		rnode->rn_count--;
+	}
+	vm_radix_node_put(rnode);
+}
+
+#ifdef INVARIANTS
+/*
+ * Radix node zone destructor.
+ */
+static void
+vm_radix_node_zone_dtor(void *mem, int size __unused, void *arg __unused)
+{
+	struct vm_radix_node *rnode;
+	int slot;
+
+	rnode = mem;
+	KASSERT(rnode->rn_count == 0,
+	    ("vm_radix_node_put: rnode %p has %d children", rnode,
+	    rnode->rn_count));
+	for (slot = 0; slot < VM_RADIX_COUNT; slot++)
+		KASSERT(rnode->rn_child[slot] == NULL,
+		    ("vm_radix_node_put: rnode %p has a child", rnode));
+}
+#endif
+
+#ifndef UMA_MD_SMALL_ALLOC
+/*
+ * Reserve the KVA necessary to satisfy the node allocation.
+ * This is mandatory in architectures not supporting direct
+ * mapping as they will need otherwise to carve into the kernel maps for
+ * every node allocation, resulting into deadlocks for consumers already
+ * working with kernel maps.
+ */
+static void
+vm_radix_reserve_kva(void *arg __unused)
+{
+
+	/*
+	 * Calculate the number of reserved nodes, discounting the pages that
+	 * are needed to store them.
+	 */
+	if (!uma_zone_reserve_kva(vm_radix_node_zone,
+	    ((vm_paddr_t)cnt.v_page_count * PAGE_SIZE) / (PAGE_SIZE +
+	    sizeof(struct vm_radix_node))))
+		panic("%s: unable to reserve KVA", __func__);
+}
+SYSINIT(vm_radix_reserve_kva, SI_SUB_KMEM, SI_ORDER_SECOND,
+    vm_radix_reserve_kva, NULL);
+#endif
+
+/*
+ * Initialize the UMA slab zone.
+ * Until vm_radix_prealloc() is called, the zone will be served by the
+ * UMA boot-time pre-allocated pool of pages.
+ */
+void
+vm_radix_init(void)
+{
+
+	vm_radix_node_zone = uma_zcreate("RADIX NODE",
+	    sizeof(struct vm_radix_node), NULL,
+#ifdef INVARIANTS
+	    vm_radix_node_zone_dtor,
+#else
+	    NULL,
+#endif
+	    NULL, NULL, VM_RADIX_PAD, UMA_ZONE_VM);
+}
+
+/*
+ * Inserts the key-value pair into the trie.
+ * Panics if the key already exists.
+ */
+int
+vm_radix_insert(struct vm_radix *rtree, vm_page_t page)
+{
+	vm_pindex_t index, newind;
+	void **parentp;
+	struct vm_radix_node *rnode, *tmp;
+	vm_page_t m;
+	int slot;
+	uint16_t clev;
+
+	index = page->pindex;
+
+restart:
+
+	/*
+	 * The owner of record for root is not really important because it
+	 * will never be used.
+	 */
+	rnode = vm_radix_getroot(rtree);
+	if (rnode == NULL) {
+		rtree->rt_root = (uintptr_t)page | VM_RADIX_ISLEAF;
+		return (0);
+	}
+	parentp = (void **)&rtree->rt_root;
+	for (;;) {
+		if (vm_radix_isleaf(rnode)) {
+			m = vm_radix_topage(rnode);
+			if (m->pindex == index)
+				panic("%s: key %jx is already present",
+				    __func__, (uintmax_t)index);
+			clev = vm_radix_keydiff(m->pindex, index);
+
+			/*
+			 * During node allocation the trie that is being
+			 * walked can be modified because of recursing radix
+			 * trie operations.
+			 * If this is the case, the recursing functions signal
+			 * such situation and the insert operation must
+			 * start from scratch again.
+			 * The freed radix node will then be in the UMA
+			 * caches very likely to avoid the same situation
+			 * to happen.
+			 */
+			rtree->rt_flags |= RT_INSERT_INPROG;
+			tmp = vm_radix_node_get(vm_radix_trimkey(index,
+			    clev + 1), 2, clev);
+			rtree->rt_flags &= ~RT_INSERT_INPROG;
+			if (tmp == NULL) {
+				rtree->rt_flags &= ~RT_TRIE_MODIFIED;
+				return (ENOMEM);
+			}
+			if ((rtree->rt_flags & RT_TRIE_MODIFIED) != 0) {
+				rtree->rt_flags &= ~RT_TRIE_MODIFIED;
+				tmp->rn_count = 0;
+				vm_radix_node_put(tmp);
+				goto restart;
+			}
+			*parentp = tmp;
+			vm_radix_addpage(tmp, index, clev, page);
+			vm_radix_addpage(tmp, m->pindex, clev, m);
+			return (0);
+		} else if (vm_radix_keybarr(rnode, index))
+			break;
+		slot = vm_radix_slot(index, rnode->rn_clev);
+		if (rnode->rn_child[slot] == NULL) {
+			rnode->rn_count++;
+			vm_radix_addpage(rnode, index, rnode->rn_clev, page);
+			return (0);
+		}
+		parentp = &rnode->rn_child[slot];
+		rnode = rnode->rn_child[slot];
+	}
+
+	/*
+	 * A new node is needed because the right insertion level has been
+	 * reached.  Set up the new intermediate node and add the two
+	 * children: the new object and the older edge.
+	 */
+	newind = rnode->rn_owner;
+	clev = vm_radix_keydiff(newind, index);
+
+	/* See the comments above. */
+	rtree->rt_flags |= RT_INSERT_INPROG;
+	tmp = vm_radix_node_get(vm_radix_trimkey(index, clev + 1), 2, clev);
+	rtree->rt_flags &= ~RT_INSERT_INPROG;
+	if (tmp == NULL) {
+		rtree->rt_flags &= ~RT_TRIE_MODIFIED;
+		return (ENOMEM);
+	}
+	if ((rtree->rt_flags & RT_TRIE_MODIFIED) != 0) {
+		rtree->rt_flags &= ~RT_TRIE_MODIFIED;
+		tmp->rn_count = 0;
+		vm_radix_node_put(tmp);
+		goto restart;
+	}
+	*parentp = tmp;
+	vm_radix_addpage(tmp, index, clev, page);
+	slot = vm_radix_slot(newind, clev);
+	tmp->rn_child[slot] = rnode;
+	return (0);
+}
+
+/*
+ * Returns TRUE if the specified radix tree contains a single leaf and FALSE
+ * otherwise.
+ */
+boolean_t
+vm_radix_is_singleton(struct vm_radix *rtree)
+{
+	struct vm_radix_node *rnode;
+
+	rnode = vm_radix_getroot(rtree);
+	if (rnode == NULL)
+		return (FALSE);
+	return (vm_radix_isleaf(rnode));
+}
+
+/*
+ * Returns the value stored at the index.  If the index is not present,
+ * NULL is returned.
+ */
+vm_page_t
+vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index)
+{
+	struct vm_radix_node *rnode;
+	vm_page_t m;
+	int slot;
+
+	rnode = vm_radix_getroot(rtree);
+	while (rnode != NULL) {
+		if (vm_radix_isleaf(rnode)) {
+			m = vm_radix_topage(rnode);
+			if (m->pindex == index)
+				return (m);
+			else
+				break;
+		} else if (vm_radix_keybarr(rnode, index))
+			break;
+		slot = vm_radix_slot(index, rnode->rn_clev);
+		rnode = rnode->rn_child[slot];
+	}
+	return (NULL);
+}
+
+/*
+ * Look up the nearest entry at a position bigger than or equal to index.
+ */
+vm_page_t
+vm_radix_lookup_ge(struct vm_radix *rtree, vm_pindex_t index)
+{
+	struct vm_radix_node *stack[VM_RADIX_LIMIT];
+	vm_pindex_t inc;
+	vm_page_t m;
+	struct vm_radix_node *child, *rnode;
+#ifdef INVARIANTS
+	int loops = 0;
+#endif
+	int slot, tos;
+
+	rnode = vm_radix_getroot(rtree);
+	if (rnode == NULL)
+		return (NULL);
+	else if (vm_radix_isleaf(rnode)) {
+		m = vm_radix_topage(rnode);
+		if (m->pindex >= index)
+			return (m);
+		else
+			return (NULL);
+	}
+	tos = 0;
+	for (;;) {
+		/*
+		 * If the keys differ before the current bisection node,
+		 * then the search key might roll back to the earliest
+		 * available bisection node or to the smallest key
+		 * in the current node (if the owner is bigger than the
+		 * search key).
+		 */
+		if (vm_radix_keybarr(rnode, index)) {
+			if (index > rnode->rn_owner) {
+ascend:
+				KASSERT(++loops < 1000,
+				    ("vm_radix_lookup_ge: too many loops"));
+
+				/*
+				 * Pop nodes from the stack until either the
+				 * stack is empty or a node that could have a
+				 * matching descendant is found.
+				 */
+				do {
+					if (tos == 0)
+						return (NULL);
+					rnode = stack[--tos];
+				} while (vm_radix_slot(index,
+				    rnode->rn_clev) == (VM_RADIX_COUNT - 1));
+
+				/*
+				 * The following computation cannot overflow
+				 * because index's slot at the current level
+				 * is less than VM_RADIX_COUNT - 1.
+				 */
+				index = vm_radix_trimkey(index,
+				    rnode->rn_clev);
+				index += VM_RADIX_UNITLEVEL(rnode->rn_clev);
+			} else
+				index = rnode->rn_owner;
+			KASSERT(!vm_radix_keybarr(rnode, index),
+			    ("vm_radix_lookup_ge: keybarr failed"));
+		}
+		slot = vm_radix_slot(index, rnode->rn_clev);
+		child = rnode->rn_child[slot];
+		if (vm_radix_isleaf(child)) {
+			m = vm_radix_topage(child);
+			if (m->pindex >= index)
+				return (m);
+		} else if (child != NULL)
+			goto descend;
+
+		/*
+		 * Look for an available edge or page within the current
+		 * bisection node.
+		 */
+		if (slot < (VM_RADIX_COUNT - 1)) {
+			inc = VM_RADIX_UNITLEVEL(rnode->rn_clev);
+			index = vm_radix_trimkey(index, rnode->rn_clev);
+			do {
+				index += inc;
+				slot++;
+				child = rnode->rn_child[slot];
+				if (vm_radix_isleaf(child)) {
+					m = vm_radix_topage(child);
+					if (m->pindex >= index)
+						return (m);
+				} else if (child != NULL)
+					goto descend;
+			} while (slot < (VM_RADIX_COUNT - 1));
+		}
+		KASSERT(child == NULL || vm_radix_isleaf(child),
+		    ("vm_radix_lookup_ge: child is radix node"));
+
+		/*
+		 * If a page or edge bigger than the search slot is not found
+		 * in the current node, ascend to the next higher-level node.
+		 */
+		goto ascend;
+descend:
+		KASSERT(rnode->rn_clev > 0,
+		    ("vm_radix_lookup_ge: pushing leaf's parent"));
+		KASSERT(tos < VM_RADIX_LIMIT,
+		    ("vm_radix_lookup_ge: stack overflow"));
+		stack[tos++] = rnode;
+		rnode = child;
+	}
+}
+
+/*
+ * Look up the nearest entry at a position less than or equal to index.
+ */
+vm_page_t
+vm_radix_lookup_le(struct vm_radix *rtree, vm_pindex_t index)
+{
+	struct vm_radix_node *stack[VM_RADIX_LIMIT];
+	vm_pindex_t inc;
+	vm_page_t m;
+	struct vm_radix_node *child, *rnode;
+#ifdef INVARIANTS
+	int loops = 0;
+#endif
+	int slot, tos;
+
+	rnode = vm_radix_getroot(rtree);
+	if (rnode == NULL)
+		return (NULL);
+	else if (vm_radix_isleaf(rnode)) {
+		m = vm_radix_topage(rnode);
+		if (m->pindex <= index)
+			return (m);
+		else
+			return (NULL);
+	}
+	tos = 0;
+	for (;;) {
+		/*
+		 * If the keys differ before the current bisection node,
+		 * then the search key might roll back to the earliest
+		 * available bisection node or to the largest key
+		 * in the current node (if the owner is smaller than the
+		 * search key).
+		 */
+		if (vm_radix_keybarr(rnode, index)) {
+			if (index > rnode->rn_owner) {
+				index = rnode->rn_owner + VM_RADIX_COUNT *
+				    VM_RADIX_UNITLEVEL(rnode->rn_clev);
+			} else {
+ascend:
+				KASSERT(++loops < 1000,
+				    ("vm_radix_lookup_le: too many loops"));
+
+				/*
+				 * Pop nodes from the stack until either the
+				 * stack is empty or a node that could have a
+				 * matching descendant is found.
+				 */
+				do {
+					if (tos == 0)
+						return (NULL);
+					rnode = stack[--tos];
+				} while (vm_radix_slot(index,
+				    rnode->rn_clev) == 0);
+
+				/*
+				 * The following computation cannot overflow
+				 * because index's slot at the current level
+				 * is greater than 0.
+				 */
+				index = vm_radix_trimkey(index,
+				    rnode->rn_clev);
+			}
+			index--;
+			KASSERT(!vm_radix_keybarr(rnode, index),
+			    ("vm_radix_lookup_le: keybarr failed"));
+		}
+		slot = vm_radix_slot(index, rnode->rn_clev);
+		child = rnode->rn_child[slot];
+		if (vm_radix_isleaf(child)) {
+			m = vm_radix_topage(child);
+			if (m->pindex <= index)
+				return (m);
+		} else if (child != NULL)
+			goto descend;
+
+		/*
+		 * Look for an available edge or page within the current
+		 * bisection node.
+		 */
+		if (slot > 0) {
+			inc = VM_RADIX_UNITLEVEL(rnode->rn_clev);
+			index |= inc - 1;
+			do {
+				index -= inc;
+				slot--;
+				child = rnode->rn_child[slot];
+				if (vm_radix_isleaf(child)) {
+					m = vm_radix_topage(child);
+					if (m->pindex <= index)
+						return (m);
+				} else if (child != NULL)
+					goto descend;
+			} while (slot > 0);
+		}
+		KASSERT(child == NULL || vm_radix_isleaf(child),
+		    ("vm_radix_lookup_le: child is radix node"));
+
+		/*
+		 * If a page or edge smaller than the search slot is not found
+		 * in the current node, ascend to the next higher-level node.
+		 */
+		goto ascend;
+descend:
+		KASSERT(rnode->rn_clev > 0,
+		    ("vm_radix_lookup_le: pushing leaf's parent"));
+		KASSERT(tos < VM_RADIX_LIMIT,
+		    ("vm_radix_lookup_le: stack overflow"));
+		stack[tos++] = rnode;
+		rnode = child;
+	}
+}
+
+/*
+ * Remove the specified index from the tree.
+ * Panics if the key is not present.
+ */
+void
+vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index)
+{
+	struct vm_radix_node *rnode, *parent;
+	vm_page_t m;
+	int i, slot;
+
+	/*
+	 * Detect if a page is going to be removed from a trie which is
+	 * already undergoing another trie operation.
+	 * Right now this is only possible for vm_radix_remove() recursing
+	 * into vm_radix_insert().
+	 * If this is the case, the caller must be notified about the
+	 * situation.  It will also take care of updating the
+	 * RT_TRIE_MODIFIED flag accordingly.
+	 * The RT_TRIE_MODIFIED bit is set here because the remove operation
+	 * will always succeed.
+	 */
+	if ((rtree->rt_flags & RT_INSERT_INPROG) != 0)
+		rtree->rt_flags |= RT_TRIE_MODIFIED;
+
+	rnode = vm_radix_getroot(rtree);
+	if (vm_radix_isleaf(rnode)) {
+		m = vm_radix_topage(rnode);
+		if (m->pindex != index)
+			panic("%s: invalid key found", __func__);
+		vm_radix_setroot(rtree, NULL);
+		return;
+	}
+	parent = NULL;
+	for (;;) {
+		if (rnode == NULL)
+			panic("vm_radix_remove: impossible to locate the key");
+		slot = vm_radix_slot(index, rnode->rn_clev);
+		if (vm_radix_isleaf(rnode->rn_child[slot])) {
+			m = vm_radix_topage(rnode->rn_child[slot]);
+			if (m->pindex != index)
+				panic("%s: invalid key found", __func__);
+			rnode->rn_child[slot] = NULL;
+			rnode->rn_count--;
+			if (rnode->rn_count > 1)
+				break;
+			for (i = 0; i < VM_RADIX_COUNT; i++)
+				if (rnode->rn_child[i] != NULL)
+					break;
+			KASSERT(i != VM_RADIX_COUNT,
+			    ("%s: invalid node configuration", __func__));
+			if (parent == NULL)
+				vm_radix_setroot(rtree, rnode->rn_child[i]);
+			else {
+				slot = vm_radix_slot(index, parent->rn_clev);
+				KASSERT(parent->rn_child[slot] == rnode,
+				    ("%s: invalid child value", __func__));
+				parent->rn_child[slot] = rnode->rn_child[i];
+			}
+			rnode->rn_count--;
+			rnode->rn_child[i] = NULL;
+			vm_radix_node_put(rnode);
+			break;
+		}
+		parent = rnode;
+		rnode = rnode->rn_child[slot];
+	}
+}
+
+/*
+ * Remove and free all the nodes from the radix tree.
+ * This function is recursive, but the recursion is tightly bounded because
+ * the maximum depth of the tree is fixed.
+ */
+void
+vm_radix_reclaim_allnodes(struct vm_radix *rtree)
+{
+	struct vm_radix_node *root;
+
+	KASSERT((rtree->rt_flags & RT_INSERT_INPROG) == 0,
+	    ("vm_radix_reclaim_allnodes: unexpected trie recursion"));
+
+	root = vm_radix_getroot(rtree);
+	if (root == NULL)
+		return;
+	vm_radix_setroot(rtree, NULL);
+	if (!vm_radix_isleaf(root))
+		vm_radix_reclaim_allnodes_int(root);
+}
+
+/*
+ * Replace an existing page in the trie with another one.
+ * Panics if there is not an old page in the trie at the new page's index.
+ */
+vm_page_t
+vm_radix_replace(struct vm_radix *rtree, vm_page_t newpage)
+{
+	struct vm_radix_node *rnode;
+	vm_page_t m;
+	vm_pindex_t index;
+	int slot;
+
+	index = newpage->pindex;
+	rnode = vm_radix_getroot(rtree);
+	if (rnode == NULL)
+		panic("%s: replacing page on an empty trie", __func__);
+	if (vm_radix_isleaf(rnode)) {
+		m = vm_radix_topage(rnode);
+		if (m->pindex != index)
+			panic("%s: original replacing root key not found",
+			    __func__);
+		rtree->rt_root = (uintptr_t)newpage | VM_RADIX_ISLEAF;
+		return (m);
+	}
+	for (;;) {
+		slot = vm_radix_slot(index, rnode->rn_clev);
+		if (vm_radix_isleaf(rnode->rn_child[slot])) {
+			m = vm_radix_topage(rnode->rn_child[slot]);
+			if (m->pindex == index) {
+				rnode->rn_child[slot] =
+				    (void *)((uintptr_t)newpage |
+				    VM_RADIX_ISLEAF);
+				return (m);
+			} else
+				break;
+		} else if (rnode->rn_child[slot] == NULL ||
+		    vm_radix_keybarr(rnode->rn_child[slot], index))
+			break;
+		rnode = rnode->rn_child[slot];
+	}
+	panic("%s: original replacing page not found", __func__);
+}
+
+#ifdef DDB
+/*
+ * Show details about the given radix node.
+ */
+DB_SHOW_COMMAND(radixnode, db_show_radixnode)
+{
+	struct vm_radix_node *rnode;
+	int i;
+
+	if (!have_addr)
+		return;
+	rnode = (struct vm_radix_node *)addr;
+	db_printf("radixnode %p, owner %jx, children count %u, level %u:\n",
+	    (void *)rnode, (uintmax_t)rnode->rn_owner, rnode->rn_count,
+	    rnode->rn_clev);
+	for (i = 0; i < VM_RADIX_COUNT; i++)
+		if (rnode->rn_child[i] != NULL)
+			db_printf("slot: %d, val: %p, page: %p, clev: %d\n",
+			    i, (void *)rnode->rn_child[i],
+			    vm_radix_isleaf(rnode->rn_child[i]) ?
+			    vm_radix_topage(rnode->rn_child[i]) : NULL,
+			    rnode->rn_clev);
+}
+#endif /* DDB */


Property changes on: trunk/sys/vm/vm_radix.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
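
For reference, the vm_radix.c code above encodes leaves by setting the low
bit of an aligned vm_page pointer (VM_RADIX_ISLEAF), so a single child slot
can hold either an internal node or a page.  A minimal standalone sketch of
that tagging idiom, using hypothetical names rather than the kernel's, could
look like this:

#include <stdint.h>

#define ISLEAF_BIT	0x1UL	/* stand-in for VM_RADIX_ISLEAF */

/* Tag a pointer as a leaf; valid only because the target is aligned. */
static inline uintptr_t
leaf_encode(void *leaf)
{
	return ((uintptr_t)leaf | ISLEAF_BIT);
}

/* Test whether a slot holds a tagged leaf rather than an internal node. */
static inline int
slot_is_leaf(uintptr_t slot)
{
	return ((slot & ISLEAF_BIT) != 0);
}

/* Strip the tag to recover the original pointer. */
static inline void *
leaf_decode(uintptr_t slot)
{
	return ((void *)(slot & ~ISLEAF_BIT));
}

The same mask-and-test pattern is what vm_radix_isleaf() and
vm_radix_topage() implement for vm_page pointers.
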
Added: trunk/sys/vm/vm_radix.h
===================================================================
--- trunk/sys/vm/vm_radix.h	                        (rev 0)
+++ trunk/sys/vm/vm_radix.h	2018-05-24 22:27:41 UTC (rev 9896)
@@ -0,0 +1,50 @@
+/* $MidnightBSD$ */
+/*
+ * Copyright (c) 2013 EMC Corp.
+ * Copyright (c) 2011 Jeffrey Roberson <jeff at freebsd.org>
+ * Copyright (c) 2008 Mayur Shardul <mayur.shardul at gmail.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: stable/10/sys/vm/vm_radix.h 266591 2014-05-23 17:47:49Z alc $
+ */
+
+#ifndef _VM_RADIX_H_
+#define _VM_RADIX_H_
+
+#include <vm/_vm_radix.h>
+
+#ifdef _KERNEL
+
+void		vm_radix_init(void);
+int		vm_radix_insert(struct vm_radix *rtree, vm_page_t page);
+boolean_t	vm_radix_is_singleton(struct vm_radix *rtree);
+vm_page_t	vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index);
+vm_page_t	vm_radix_lookup_ge(struct vm_radix *rtree, vm_pindex_t index);
+vm_page_t	vm_radix_lookup_le(struct vm_radix *rtree, vm_pindex_t index);
+void		vm_radix_reclaim_allnodes(struct vm_radix *rtree);
+void		vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index);
+vm_page_t	vm_radix_replace(struct vm_radix *rtree, vm_page_t newpage);
+
+#endif /* _KERNEL */
+#endif /* !_VM_RADIX_H_ */


Property changes on: trunk/sys/vm/vm_radix.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
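
The vm_reserv.c changes below use this API to locate the page nearest to a
given pindex instead of walking the old per-object splay tree.  As a rough
usage sketch only (not code from the patch; it assumes the usual VM headers
and that the caller already holds the object lock), finding a page's
neighbors looks like:

/* Sketch: predecessor at or below pindex, successor strictly above it. */
static vm_page_t
lookup_neighbors(vm_object_t object, vm_pindex_t pindex, vm_page_t *msuccp)
{
	vm_page_t mpred;

	mpred = vm_radix_lookup_le(&object->rtree, pindex);
	*msuccp = vm_radix_lookup_ge(&object->rtree, pindex + 1);
	return (mpred);
}
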
Modified: trunk/sys/vm/vm_reserv.c
===================================================================
--- trunk/sys/vm/vm_reserv.c	2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm_reserv.c	2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 2002-2006 Rice University
  * Copyright (c) 2007-2008 Alan L. Cox <alc at cs.rice.edu>
@@ -31,10 +32,13 @@
 
 /*
  *	Superpage reservation management module
+ *
+ * Any external functions defined by this module are only to be used by the
+ * virtual memory system.
  */
 
 #include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/vm/vm_reserv.c 280045 2015-03-15 18:40:06Z kib $");
 
 #include "opt_vm.h"
 
@@ -44,6 +48,7 @@
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/queue.h>
+#include <sys/rwlock.h>
 #include <sys/sbuf.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
@@ -53,6 +58,7 @@
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_phys.h>
+#include <vm/vm_radix.h>
 #include <vm/vm_reserv.h>
 
 /*
@@ -224,6 +230,11 @@
 	if (rv->inpartpopq) {
 		TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq);
 		rv->inpartpopq = FALSE;
+	} else {
+		KASSERT(rv->pages->psind == 1,
+		    ("vm_reserv_depopulate: reserv %p is already demoted",
+		    rv));
+		rv->pages->psind = 0;
 	}
 	rv->popcnt--;
 	if (rv->popcnt == 0) {
@@ -273,6 +284,8 @@
 	    ("vm_reserv_populate: reserv %p is free", rv));
 	KASSERT(rv->popcnt < VM_LEVEL_0_NPAGES,
 	    ("vm_reserv_populate: reserv %p is already full", rv));
+	KASSERT(rv->pages->psind == 0,
+	    ("vm_reserv_populate: reserv %p is already promoted", rv));
 	if (rv->inpartpopq) {
 		TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq);
 		rv->inpartpopq = FALSE;
@@ -281,106 +294,281 @@
 	if (rv->popcnt < VM_LEVEL_0_NPAGES) {
 		rv->inpartpopq = TRUE;
 		TAILQ_INSERT_TAIL(&vm_rvq_partpop, rv, partpopq);
-	}
+	} else
+		rv->pages->psind = 1;
 }
 
 /*
- * Allocates a page from an existing or newly-created reservation.
+ * Allocates a contiguous set of physical pages of the given size "npages"
+ * from existing or newly created reservations.  All of the physical pages
+ * must be at or above the given physical address "low" and below the given
+ * physical address "high".  The given value "alignment" determines the
+ * alignment of the first physical page in the set.  If the given value
+ * "boundary" is non-zero, then the set of physical pages cannot cross any
+ * physical address boundary that is a multiple of that value.  Both
+ * "alignment" and "boundary" must be a power of two.
  *
  * The object and free page queue must be locked.
  */
 vm_page_t
-vm_reserv_alloc_page(vm_object_t object, vm_pindex_t pindex)
+vm_reserv_alloc_contig(vm_object_t object, vm_pindex_t pindex, u_long npages,
+    vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
 {
-	vm_page_t m, mpred, msucc;
+	vm_paddr_t pa, size;
+	vm_page_t m, m_ret, mpred, msucc;
 	vm_pindex_t first, leftcap, rightcap;
 	vm_reserv_t rv;
+	u_long allocpages, maxpages, minpages;
+	int i, index, n;
 
 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+	VM_OBJECT_ASSERT_WLOCKED(object);
+	KASSERT(npages != 0, ("vm_reserv_alloc_contig: npages is 0"));
 
 	/*
-	 * Is a reservation fundamentally not possible?
+	 * Is a reservation fundamentally impossible?
 	 */
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	if (pindex < VM_RESERV_INDEX(object, pindex) ||
-	    pindex >= object->size)
+	    pindex + npages > object->size)
 		return (NULL);
 
 	/*
+	 * All reservations of a particular size have the same alignment.
+	 * Assuming that the first page is allocated from a reservation, the
+	 * least significant bits of its physical address can be determined
+	 * from its offset from the beginning of the reservation and the size
+	 * of the reservation.
+	 *
+	 * Could the specified index within a reservation of the smallest
+	 * possible size satisfy the alignment and boundary requirements?
+	 */
+	pa = VM_RESERV_INDEX(object, pindex) << PAGE_SHIFT;
+	if ((pa & (alignment - 1)) != 0)
+		return (NULL);
+	size = npages << PAGE_SHIFT;
+	if (((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0)
+		return (NULL);
+
+	/*
 	 * Look for an existing reservation.
 	 */
-	msucc = NULL;
-	mpred = object->root;
-	while (mpred != NULL) {
-		KASSERT(mpred->pindex != pindex,
-		    ("vm_reserv_alloc_page: pindex already allocated"));
+	mpred = vm_radix_lookup_le(&object->rtree, pindex);
+	if (mpred != NULL) {
+		KASSERT(mpred->pindex < pindex,
+		    ("vm_reserv_alloc_contig: pindex already allocated"));
 		rv = vm_reserv_from_page(mpred);
-		if (rv->object == object && vm_reserv_has_pindex(rv, pindex)) {
-			m = &rv->pages[VM_RESERV_INDEX(object, pindex)];
-			/* Handle vm_page_rename(m, new_object, ...). */
-			if ((m->flags & (PG_CACHED | PG_FREE)) == 0)
+		if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
+			goto found;
+		msucc = TAILQ_NEXT(mpred, listq);
+	} else
+		msucc = TAILQ_FIRST(&object->memq);
+	if (msucc != NULL) {
+		KASSERT(msucc->pindex > pindex,
+		    ("vm_reserv_alloc_contig: pindex already allocated"));
+		rv = vm_reserv_from_page(msucc);
+		if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
+			goto found;
+	}
+
+	/*
+	 * Could at least one reservation fit between the first index to the
+	 * left that can be used ("leftcap") and the first index to the right
+	 * that cannot be used ("rightcap")?
+	 */
+	first = pindex - VM_RESERV_INDEX(object, pindex);
+	if (mpred != NULL) {
+		if ((rv = vm_reserv_from_page(mpred))->object != object)
+			leftcap = mpred->pindex + 1;
+		else
+			leftcap = rv->pindex + VM_LEVEL_0_NPAGES;
+		if (leftcap > first)
+			return (NULL);
+	}
+	minpages = VM_RESERV_INDEX(object, pindex) + npages;
+	maxpages = roundup2(minpages, VM_LEVEL_0_NPAGES);
+	allocpages = maxpages;
+	if (msucc != NULL) {
+		if ((rv = vm_reserv_from_page(msucc))->object != object)
+			rightcap = msucc->pindex;
+		else
+			rightcap = rv->pindex;
+		if (first + maxpages > rightcap) {
+			if (maxpages == VM_LEVEL_0_NPAGES)
 				return (NULL);
-			vm_reserv_populate(rv);
-			return (m);
-		} else if (mpred->pindex < pindex) {
-			if (msucc != NULL ||
-			    (msucc = TAILQ_NEXT(mpred, listq)) == NULL)
-				break;
-			KASSERT(msucc->pindex != pindex,
-			    ("vm_reserv_alloc_page: pindex already allocated"));
-			rv = vm_reserv_from_page(msucc);
-			if (rv->object == object &&
-			    vm_reserv_has_pindex(rv, pindex)) {
-				m = &rv->pages[VM_RESERV_INDEX(object, pindex)];
-				/* Handle vm_page_rename(m, new_object, ...). */
-				if ((m->flags & (PG_CACHED | PG_FREE)) == 0)
-					return (NULL);
-				vm_reserv_populate(rv);
-				return (m);
-			} else if (pindex < msucc->pindex)
-				break;
-		} else if (msucc == NULL) {
-			msucc = mpred;
-			mpred = TAILQ_PREV(msucc, pglist, listq);
-			continue;
+
+			/*
+			 * At least one reservation will fit between "leftcap"
+			 * and "rightcap".  However, a reservation for the
+			 * last of the requested pages will not fit.  Reduce
+			 * the size of the upcoming allocation accordingly.
+			 */
+			allocpages = minpages;
 		}
-		msucc = NULL;
-		mpred = object->root = vm_page_splay(pindex, object->root);
 	}
 
 	/*
-	 * Determine the first index to the left that can be used.
+	 * Would the last new reservation extend past the end of the object?
 	 */
-	if (mpred == NULL)
-		leftcap = 0;
-	else if ((rv = vm_reserv_from_page(mpred))->object != object)
-		leftcap = mpred->pindex + 1;
-	else
-		leftcap = rv->pindex + VM_LEVEL_0_NPAGES;
+	if (first + maxpages > object->size) {
+		/*
+		 * Don't allocate the last new reservation if the object is a
+		 * vnode or backed by another object that is a vnode. 
+		 */
+		if (object->type == OBJT_VNODE ||
+		    (object->backing_object != NULL &&
+		    object->backing_object->type == OBJT_VNODE)) {
+			if (maxpages == VM_LEVEL_0_NPAGES)
+				return (NULL);
+			allocpages = minpages;
+		}
+		/* Speculate that the object may grow. */
+	}
 
 	/*
-	 * Determine the first index to the right that cannot be used.
+	 * Allocate the physical pages.  The alignment and boundary specified
+	 * for this allocation may be different from the alignment and
+	 * boundary specified for the requested pages.  For instance, the
+	 * specified index may not be the first page within the first new
+	 * reservation.
 	 */
-	if (msucc == NULL)
-		rightcap = pindex + VM_LEVEL_0_NPAGES;
-	else if ((rv = vm_reserv_from_page(msucc))->object != object)
-		rightcap = msucc->pindex;
-	else
-		rightcap = rv->pindex;
+	m = vm_phys_alloc_contig(allocpages, low, high, ulmax(alignment,
+	    VM_LEVEL_0_SIZE), boundary > VM_LEVEL_0_SIZE ? boundary : 0);
+	if (m == NULL)
+		return (NULL);
 
 	/*
-	 * Determine if a reservation fits between the first index to
-	 * the left that can be used and the first index to the right
-	 * that cannot be used. 
+	 * The allocated physical pages always begin at a reservation
+	 * boundary, but they do not always end at a reservation boundary.
+	 * Initialize every reservation that is completely covered by the
+	 * allocated physical pages.
 	 */
-	first = pindex - VM_RESERV_INDEX(object, pindex);
-	if (first < leftcap || first + VM_LEVEL_0_NPAGES > rightcap)
+	m_ret = NULL;
+	index = VM_RESERV_INDEX(object, pindex);
+	do {
+		rv = vm_reserv_from_page(m);
+		KASSERT(rv->pages == m,
+		    ("vm_reserv_alloc_contig: reserv %p's pages is corrupted",
+		    rv));
+		KASSERT(rv->object == NULL,
+		    ("vm_reserv_alloc_contig: reserv %p isn't free", rv));
+		LIST_INSERT_HEAD(&object->rvq, rv, objq);
+		rv->object = object;
+		rv->pindex = first;
+		KASSERT(rv->popcnt == 0,
+		    ("vm_reserv_alloc_contig: reserv %p's popcnt is corrupted",
+		    rv));
+		KASSERT(!rv->inpartpopq,
+		    ("vm_reserv_alloc_contig: reserv %p's inpartpopq is TRUE",
+		    rv));
+		n = ulmin(VM_LEVEL_0_NPAGES - index, npages);
+		for (i = 0; i < n; i++)
+			vm_reserv_populate(rv);
+		npages -= n;
+		if (m_ret == NULL) {
+			m_ret = &rv->pages[index];
+			index = 0;
+		}
+		m += VM_LEVEL_0_NPAGES;
+		first += VM_LEVEL_0_NPAGES;
+		allocpages -= VM_LEVEL_0_NPAGES;
+	} while (allocpages >= VM_LEVEL_0_NPAGES);
+	return (m_ret);
+
+	/*
+	 * Found a matching reservation.
+	 */
+found:
+	index = VM_RESERV_INDEX(object, pindex);
+	/* Does the allocation fit within the reservation? */
+	if (index + npages > VM_LEVEL_0_NPAGES)
 		return (NULL);
+	m = &rv->pages[index];
+	pa = VM_PAGE_TO_PHYS(m);
+	if (pa < low || pa + size > high || (pa & (alignment - 1)) != 0 ||
+	    ((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0)
+		return (NULL);
+	/* Handle vm_page_rename(m, new_object, ...). */
+	for (i = 0; i < npages; i++)
+		if ((rv->pages[index + i].flags & (PG_CACHED | PG_FREE)) == 0)
+			return (NULL);
+	for (i = 0; i < npages; i++)
+		vm_reserv_populate(rv);
+	return (m);
+}
 
+/*
+ * Allocates a page from an existing or newly-created reservation.
+ *
+ * The page "mpred" must immediately precede the offset "pindex" within the
+ * specified object.
+ *
+ * The object and free page queue must be locked.
+ */
+vm_page_t
+vm_reserv_alloc_page(vm_object_t object, vm_pindex_t pindex, vm_page_t mpred)
+{
+	vm_page_t m, msucc;
+	vm_pindex_t first, leftcap, rightcap;
+	vm_reserv_t rv;
+
+	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+	VM_OBJECT_ASSERT_WLOCKED(object);
+
 	/*
-	 * Would a new reservation extend past the end of the given object? 
+	 * Is a reservation fundamentally impossible?
 	 */
-	if (object->size < first + VM_LEVEL_0_NPAGES) {
+	if (pindex < VM_RESERV_INDEX(object, pindex) ||
+	    pindex >= object->size)
+		return (NULL);
+
+	/*
+	 * Look for an existing reservation.
+	 */
+	if (mpred != NULL) {
+		KASSERT(mpred->object == object,
+		    ("vm_reserv_alloc_page: object doesn't contain mpred"));
+		KASSERT(mpred->pindex < pindex,
+		    ("vm_reserv_alloc_page: mpred doesn't precede pindex"));
+		rv = vm_reserv_from_page(mpred);
+		if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
+			goto found;
+		msucc = TAILQ_NEXT(mpred, listq);
+	} else
+		msucc = TAILQ_FIRST(&object->memq);
+	if (msucc != NULL) {
+		KASSERT(msucc->pindex > pindex,
+		    ("vm_reserv_alloc_page: msucc doesn't succeed pindex"));
+		rv = vm_reserv_from_page(msucc);
+		if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
+			goto found;
+	}
+
+	/*
+	 * Could a reservation fit between the first index to the left that
+	 * can be used and the first index to the right that cannot be used?
+	 */
+	first = pindex - VM_RESERV_INDEX(object, pindex);
+	if (mpred != NULL) {
+		if ((rv = vm_reserv_from_page(mpred))->object != object)
+			leftcap = mpred->pindex + 1;
+		else
+			leftcap = rv->pindex + VM_LEVEL_0_NPAGES;
+		if (leftcap > first)
+			return (NULL);
+	}
+	if (msucc != NULL) {
+		if ((rv = vm_reserv_from_page(msucc))->object != object)
+			rightcap = msucc->pindex;
+		else
+			rightcap = rv->pindex;
+		if (first + VM_LEVEL_0_NPAGES > rightcap)
+			return (NULL);
+	}
+
+	/*
+	 * Would a new reservation extend past the end of the object? 
+	 */
+	if (first + VM_LEVEL_0_NPAGES > object->size) {
 		/*
 		 * Don't allocate a new reservation if the object is a vnode or
 		 * backed by another object that is a vnode. 
@@ -393,28 +581,35 @@
 	}
 
 	/*
-	 * Allocate a new reservation.
+	 * Allocate and populate the new reservation.
 	 */
 	m = vm_phys_alloc_pages(VM_FREEPOOL_DEFAULT, VM_LEVEL_0_ORDER);
-	if (m != NULL) {
-		rv = vm_reserv_from_page(m);
-		KASSERT(rv->pages == m,
-		    ("vm_reserv_alloc_page: reserv %p's pages is corrupted",
-		    rv));
-		KASSERT(rv->object == NULL,
-		    ("vm_reserv_alloc_page: reserv %p isn't free", rv));
-		LIST_INSERT_HEAD(&object->rvq, rv, objq);
-		rv->object = object;
-		rv->pindex = first;
-		KASSERT(rv->popcnt == 0,
-		    ("vm_reserv_alloc_page: reserv %p's popcnt is corrupted",
-		    rv));
-		KASSERT(!rv->inpartpopq,
-		    ("vm_reserv_alloc_page: reserv %p's inpartpopq is TRUE",
-		    rv));
-		vm_reserv_populate(rv);
-		m = &rv->pages[VM_RESERV_INDEX(object, pindex)];
-	}
+	if (m == NULL)
+		return (NULL);
+	rv = vm_reserv_from_page(m);
+	KASSERT(rv->pages == m,
+	    ("vm_reserv_alloc_page: reserv %p's pages is corrupted", rv));
+	KASSERT(rv->object == NULL,
+	    ("vm_reserv_alloc_page: reserv %p isn't free", rv));
+	LIST_INSERT_HEAD(&object->rvq, rv, objq);
+	rv->object = object;
+	rv->pindex = first;
+	KASSERT(rv->popcnt == 0,
+	    ("vm_reserv_alloc_page: reserv %p's popcnt is corrupted", rv));
+	KASSERT(!rv->inpartpopq,
+	    ("vm_reserv_alloc_page: reserv %p's inpartpopq is TRUE", rv));
+	vm_reserv_populate(rv);
+	return (&rv->pages[VM_RESERV_INDEX(object, pindex)]);
+
+	/*
+	 * Found a matching reservation.
+	 */
+found:
+	m = &rv->pages[VM_RESERV_INDEX(object, pindex)];
+	/* Handle vm_page_rename(m, new_object, ...). */
+	if ((m->flags & (PG_CACHED | PG_FREE)) == 0)
+		return (NULL);
+	vm_reserv_populate(rv);
 	return (m);
 }
 
@@ -629,16 +824,17 @@
  * The free page queue lock must be held.
  */
 boolean_t
-vm_reserv_reclaim_contig(vm_paddr_t size, vm_paddr_t low, vm_paddr_t high,
-    unsigned long alignment, unsigned long boundary)
+vm_reserv_reclaim_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
+    u_long alignment, vm_paddr_t boundary)
 {
-	vm_paddr_t pa, pa_length;
+	vm_paddr_t pa, pa_length, size;
 	vm_reserv_t rv;
 	int i;
 
 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
-	if (size > VM_LEVEL_0_SIZE - PAGE_SIZE)
+	if (npages > VM_LEVEL_0_NPAGES - 1)
 		return (FALSE);
+	size = npages << PAGE_SHIFT;
 	TAILQ_FOREACH(rv, &vm_rvq_partpop, partpopq) {
 		pa = VM_PAGE_TO_PHYS(&rv->pages[VM_LEVEL_0_NPAGES - 1]);
 		if (pa + PAGE_SIZE - size < low) {
@@ -681,7 +877,7 @@
 {
 	vm_reserv_t rv;
 
-	VM_OBJECT_LOCK_ASSERT(new_object, MA_OWNED);
+	VM_OBJECT_ASSERT_WLOCKED(new_object);
 	rv = vm_reserv_from_page(m);
 	if (rv->object == old_object) {
 		mtx_lock(&vm_page_queue_free_mtx);
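
The alignment and boundary screens added to vm_reserv_alloc_contig() above
are pure bit arithmetic on power-of-two values.  A small self-contained
illustration of the two tests (not kernel code; plain integers stand in for
vm_paddr_t):

#include <stdbool.h>
#include <stdint.h>

/* Both helpers assume "alignment" and "boundary" are powers of two. */
static bool
is_aligned(uint64_t pa, uint64_t alignment)
{
	/* pa is a multiple of alignment iff its low bits are all clear. */
	return ((pa & (alignment - 1)) == 0);
}

static bool
crosses_boundary(uint64_t pa, uint64_t size, uint64_t boundary)
{
	/*
	 * The run [pa, pa + size - 1] stays inside one boundary-sized
	 * window iff its first and last byte agree in every bit at or above
	 * log2(boundary); XOR exposes any bit where they differ.
	 */
	return (((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0);
}

These correspond to the "(pa & (alignment - 1))" and
"((pa ^ (pa + size - 1)) & ~(boundary - 1))" expressions in the hunks above.
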

Modified: trunk/sys/vm/vm_reserv.h
===================================================================
--- trunk/sys/vm/vm_reserv.h	2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm_reserv.h	2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 2002-2006 Rice University
  * Copyright (c) 2007-2008 Alan L. Cox <alc at cs.rice.edu>
@@ -28,7 +29,7 @@
  * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  *
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/vm/vm_reserv.h 250577 2013-05-12 16:50:18Z alc $
  */
 
 /*
@@ -42,15 +43,21 @@
 
 #if VM_NRESERVLEVEL > 0
 
-vm_page_t	vm_reserv_alloc_page(vm_object_t object, vm_pindex_t pindex);
+/*
+ * The following functions are only to be used by the virtual memory system.
+ */
+vm_page_t	vm_reserv_alloc_contig(vm_object_t object, vm_pindex_t pindex,
+		    u_long npages, vm_paddr_t low, vm_paddr_t high,
+		    u_long alignment, vm_paddr_t boundary);
+vm_page_t	vm_reserv_alloc_page(vm_object_t object, vm_pindex_t pindex,
+		    vm_page_t mpred);
 void		vm_reserv_break_all(vm_object_t object);
 boolean_t	vm_reserv_free_page(vm_page_t m);
 void		vm_reserv_init(void);
 int		vm_reserv_level_iffullpop(vm_page_t m);
 boolean_t	vm_reserv_reactivate_page(vm_page_t m);
-boolean_t	vm_reserv_reclaim_contig(vm_paddr_t size, vm_paddr_t low,
-		    vm_paddr_t high, unsigned long alignment,
-		    unsigned long boundary);
+boolean_t	vm_reserv_reclaim_contig(u_long npages, vm_paddr_t low,
+		    vm_paddr_t high, u_long alignment, vm_paddr_t boundary);
 boolean_t	vm_reserv_reclaim_inactive(void);
 void		vm_reserv_rename(vm_page_t m, vm_object_t new_object,
 		    vm_object_t old_object, vm_pindex_t old_object_offset);

Modified: trunk/sys/vm/vm_unix.c
===================================================================
--- trunk/sys/vm/vm_unix.c	2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm_unix.c	2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1988 University of Utah.
  * Copyright (c) 1991, 1993
@@ -43,7 +44,7 @@
  */
 
 #include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/vm/vm_unix.c 284665 2015-06-21 06:28:26Z trasz $");
 
 #include <sys/param.h>
 #include <sys/lock.h>
@@ -130,35 +131,39 @@
 			goto done;
 		}
 #ifdef RACCT
-		PROC_LOCK(td->td_proc);
-		error = racct_set(td->td_proc, RACCT_DATA, new - base);
-		if (error != 0) {
-			PROC_UNLOCK(td->td_proc);
-			error = ENOMEM;
-			goto done;
-		}
-		error = racct_set(td->td_proc, RACCT_VMEM,
-		    map->size + (new - old));
-		if (error != 0) {
-			racct_set_force(td->td_proc, RACCT_DATA, old - base);
-			PROC_UNLOCK(td->td_proc);
-			error = ENOMEM;
-			goto done;
-		}
-		if (!old_mlock && map->flags & MAP_WIREFUTURE) {
-			error = racct_set(td->td_proc, RACCT_MEMLOCK,
-			    ptoa(pmap_wired_count(map->pmap)) + (new - old));
+		if (racct_enable) {
+			PROC_LOCK(td->td_proc);
+			error = racct_set(td->td_proc, RACCT_DATA, new - base);
 			if (error != 0) {
+				PROC_UNLOCK(td->td_proc);
+				error = ENOMEM;
+				goto done;
+			}
+			error = racct_set(td->td_proc, RACCT_VMEM,
+			    map->size + (new - old));
+			if (error != 0) {
 				racct_set_force(td->td_proc, RACCT_DATA,
 				    old - base);
-				racct_set_force(td->td_proc, RACCT_VMEM,
-				    map->size);
 				PROC_UNLOCK(td->td_proc);
 				error = ENOMEM;
 				goto done;
 			}
+			if (!old_mlock && map->flags & MAP_WIREFUTURE) {
+				error = racct_set(td->td_proc, RACCT_MEMLOCK,
+				    ptoa(pmap_wired_count(map->pmap)) +
+				    (new - old));
+				if (error != 0) {
+					racct_set_force(td->td_proc, RACCT_DATA,
+					    old - base);
+					racct_set_force(td->td_proc, RACCT_VMEM,
+					    map->size);
+					PROC_UNLOCK(td->td_proc);
+					error = ENOMEM;
+					goto done;
+				}
+			}
+			PROC_UNLOCK(td->td_proc);
 		}
-		PROC_UNLOCK(td->td_proc);
 #endif
 		prot = VM_PROT_RW;
 #ifdef COMPAT_FREEBSD32
@@ -170,14 +175,19 @@
 		rv = vm_map_insert(map, NULL, 0, old, new, prot, VM_PROT_ALL, 0);
 		if (rv != KERN_SUCCESS) {
 #ifdef RACCT
-			PROC_LOCK(td->td_proc);
-			racct_set_force(td->td_proc, RACCT_DATA, old - base);
-			racct_set_force(td->td_proc, RACCT_VMEM, map->size);
-			if (!old_mlock && map->flags & MAP_WIREFUTURE) {
-				racct_set_force(td->td_proc, RACCT_MEMLOCK,
-				    ptoa(pmap_wired_count(map->pmap)));
+			if (racct_enable) {
+				PROC_LOCK(td->td_proc);
+				racct_set_force(td->td_proc,
+				    RACCT_DATA, old - base);
+				racct_set_force(td->td_proc,
+				    RACCT_VMEM, map->size);
+				if (!old_mlock && map->flags & MAP_WIREFUTURE) {
+					racct_set_force(td->td_proc,
+					    RACCT_MEMLOCK,
+					    ptoa(pmap_wired_count(map->pmap)));
+				}
+				PROC_UNLOCK(td->td_proc);
 			}
-			PROC_UNLOCK(td->td_proc);
 #endif
 			error = ENOMEM;
 			goto done;
@@ -205,14 +215,16 @@
 		}
 		vm->vm_dsize -= btoc(old - new);
 #ifdef RACCT
-		PROC_LOCK(td->td_proc);
-		racct_set_force(td->td_proc, RACCT_DATA, new - base);
-		racct_set_force(td->td_proc, RACCT_VMEM, map->size);
-		if (!old_mlock && map->flags & MAP_WIREFUTURE) {
-			racct_set_force(td->td_proc, RACCT_MEMLOCK,
-			    ptoa(pmap_wired_count(map->pmap)));
+		if (racct_enable) {
+			PROC_LOCK(td->td_proc);
+			racct_set_force(td->td_proc, RACCT_DATA, new - base);
+			racct_set_force(td->td_proc, RACCT_VMEM, map->size);
+			if (!old_mlock && map->flags & MAP_WIREFUTURE) {
+				racct_set_force(td->td_proc, RACCT_MEMLOCK,
+				    ptoa(pmap_wired_count(map->pmap)));
+			}
+			PROC_UNLOCK(td->td_proc);
 		}
-		PROC_UNLOCK(td->td_proc);
 #endif
 	}
 done:
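
The reshuffled RACCT hunks above all follow the same charge-then-rollback
discipline: each racct_set() that succeeds must be force-reset if a later
call fails, so the accounting never ends up half-applied.  A stripped-down,
self-contained sketch of that pattern (a hypothetical ledger type, not the
racct(9) API):

#include <errno.h>

struct ledger {
	long data;	/* RACCT_DATA stand-in */
	long vmem;	/* RACCT_VMEM stand-in */
	long limit;
};

static int
ledger_set(struct ledger *l, long *slot, long value)
{
	if (value > l->limit)
		return (EDQUOT);	/* charge would exceed the limit */
	*slot = value;
	return (0);
}

/* Charge both resources, or leave the ledger exactly as it was. */
static int
ledger_charge(struct ledger *l, long new_data, long new_vmem, long old_data)
{
	if (ledger_set(l, &l->data, new_data) != 0)
		return (ENOMEM);
	if (ledger_set(l, &l->vmem, new_vmem) != 0) {
		l->data = old_data;	/* roll back the first charge */
		return (ENOMEM);
	}
	return (0);
}
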

Modified: trunk/sys/vm/vm_zeroidle.c
===================================================================
--- trunk/sys/vm/vm_zeroidle.c	2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vm_zeroidle.c	2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1994 John Dyson
  * Copyright (c) 2001 Matt Dillon
@@ -33,7 +34,7 @@
  */
 
 #include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/vm/vm_zeroidle.c 254065 2013-08-07 16:36:38Z kib $");
 
 #include <opt_sched.h>
 
@@ -50,6 +51,7 @@
 #include <sys/unistd.h>
 
 #include <vm/vm.h>
+#include <vm/vm_param.h>
 #include <vm/vm_page.h>
 #include <vm/vm_phys.h>
 

Modified: trunk/sys/vm/vnode_pager.c
===================================================================
--- trunk/sys/vm/vnode_pager.c	2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vnode_pager.c	2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1990 University of Utah.
  * Copyright (c) 1991 The Regents of the University of California.
@@ -51,7 +52,7 @@
  */
 
 #include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/vm/vnode_pager.c 291454 2015-11-29 14:44:40Z kib $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -63,6 +64,7 @@
 #include <sys/vmmeter.h>
 #include <sys/limits.h>
 #include <sys/conf.h>
+#include <sys/rwlock.h>
 #include <sys/sf_buf.h>
 
 #include <machine/atomic.h>
@@ -82,7 +84,7 @@
 static int vnode_pager_input_old(vm_object_t object, vm_page_t m);
 static void vnode_pager_dealloc(vm_object_t);
 static int vnode_pager_getpages(vm_object_t, vm_page_t *, int, int);
-static void vnode_pager_putpages(vm_object_t, vm_page_t *, int, boolean_t, int *);
+static void vnode_pager_putpages(vm_object_t, vm_page_t *, int, int, int *);
 static boolean_t vnode_pager_haspage(vm_object_t, vm_pindex_t, int *, int *);
 static vm_object_t vnode_pager_alloc(void *, vm_ooffset_t, vm_prot_t,
     vm_ooffset_t, struct ucred *cred);
@@ -109,14 +111,14 @@
 		return (0);
 
 	while ((object = vp->v_object) != NULL) {
-		VM_OBJECT_LOCK(object);
+		VM_OBJECT_WLOCK(object);
 		if (!(object->flags & OBJ_DEAD)) {
-			VM_OBJECT_UNLOCK(object);
+			VM_OBJECT_WUNLOCK(object);
 			return (0);
 		}
 		VOP_UNLOCK(vp, 0);
 		vm_object_set_flag(object, OBJ_DISCONNECTWNT);
-		msleep(object, VM_OBJECT_MTX(object), PDROP | PVM, "vodead", 0);
+		VM_OBJECT_SLEEP(object, object, PDROP | PVM, "vodead", 0);
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	}
 
@@ -135,9 +137,9 @@
 	 * Dereference the reference we just created.  This assumes
 	 * that the object is associated with the vp.
 	 */
-	VM_OBJECT_LOCK(object);
+	VM_OBJECT_WLOCK(object);
 	object->ref_count--;
-	VM_OBJECT_UNLOCK(object);
+	VM_OBJECT_WUNLOCK(object);
 	vrele(vp);
 
 	KASSERT(vp->v_object != NULL, ("vnode_create_vobject: NULL object"));
@@ -154,26 +156,21 @@
 	if (obj == NULL)
 		return;
 	ASSERT_VOP_ELOCKED(vp, "vnode_destroy_vobject");
-	VM_OBJECT_LOCK(obj);
+	VM_OBJECT_WLOCK(obj);
 	if (obj->ref_count == 0) {
 		/*
-		 * vclean() may be called twice. The first time
-		 * removes the primary reference to the object,
-		 * the second time goes one further and is a
-		 * special-case to terminate the object.
-		 *
 		 * don't double-terminate the object
 		 */
 		if ((obj->flags & OBJ_DEAD) == 0)
 			vm_object_terminate(obj);
 		else
-			VM_OBJECT_UNLOCK(obj);
+			VM_OBJECT_WUNLOCK(obj);
 	} else {
 		/*
 		 * Woe to the process that tries to page now :-).
 		 */
 		vm_pager_deallocate(obj);
-		VM_OBJECT_UNLOCK(obj);
+		VM_OBJECT_WUNLOCK(obj);
 	}
 	vp->v_object = NULL;
 }
@@ -206,11 +203,11 @@
 	 */
 retry:
 	while ((object = vp->v_object) != NULL) {
-		VM_OBJECT_LOCK(object);
+		VM_OBJECT_WLOCK(object);
 		if ((object->flags & OBJ_DEAD) == 0)
 			break;
 		vm_object_set_flag(object, OBJ_DISCONNECTWNT);
-		msleep(object, VM_OBJECT_MTX(object), PDROP | PVM, "vadead", 0);
+		VM_OBJECT_SLEEP(object, object, PDROP | PVM, "vadead", 0);
 	}
 
 	KASSERT(vp->v_usecount != 0, ("vnode_pager_alloc: no vnode reference"));
@@ -231,6 +228,12 @@
 			 * Object has been created while we were sleeping
 			 */
 			VI_UNLOCK(vp);
+			VM_OBJECT_WLOCK(object);
+			KASSERT(object->ref_count == 1,
+			    ("leaked ref %p %d", object, object->ref_count));
+			object->type = OBJT_DEAD;
+			object->ref_count = 0;
+			VM_OBJECT_WUNLOCK(object);
 			vm_object_destroy(object);
 			goto retry;
 		}
@@ -238,7 +241,7 @@
 		VI_UNLOCK(vp);
 	} else {
 		object->ref_count++;
-		VM_OBJECT_UNLOCK(object);
+		VM_OBJECT_WUNLOCK(object);
 	}
 	vref(vp);
 	return (object);
@@ -258,7 +261,7 @@
 	if (vp == NULL)
 		panic("vnode_pager_dealloc: pager already dealloced");
 
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+	VM_OBJECT_ASSERT_WLOCKED(object);
 	vm_object_pip_wait(object, "vnpdea");
 	refs = object->ref_count;
 
@@ -272,13 +275,15 @@
 	if (object->un_pager.vnp.writemappings > 0) {
 		object->un_pager.vnp.writemappings = 0;
 		VOP_ADD_WRITECOUNT(vp, -1);
+		CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d",
+		    __func__, vp, vp->v_writecount);
 	}
 	vp->v_object = NULL;
 	VOP_UNSET_TEXT(vp);
-	VM_OBJECT_UNLOCK(object);
+	VM_OBJECT_WUNLOCK(object);
 	while (refs-- > 0)
 		vunref(vp);
-	VM_OBJECT_LOCK(object);
+	VM_OBJECT_WLOCK(object);
 }
 
 static boolean_t
@@ -295,9 +300,8 @@
 	int poff;
 	int bsize;
 	int pagesperblock, blocksperpage;
-	int vfslocked;
 
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+	VM_OBJECT_ASSERT_WLOCKED(object);
 	/*
 	 * If no vp or vp is doomed or marked transparent to VM, we do not
 	 * have the page.
@@ -320,11 +324,9 @@
 		blocksperpage = (PAGE_SIZE / bsize);
 		reqblock = pindex * blocksperpage;
 	}
-	VM_OBJECT_UNLOCK(object);
-	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+	VM_OBJECT_WUNLOCK(object);
 	err = VOP_BMAP(vp, reqblock, NULL, &bn, after, before);
-	VFS_UNLOCK_GIANT(vfslocked);
-	VM_OBJECT_LOCK(object);
+	VM_OBJECT_WLOCK(object);
 	if (err)
 		return TRUE;
 	if (bn == -1)
@@ -379,9 +381,9 @@
 	if ((object = vp->v_object) == NULL)
 		return;
 /* 	ASSERT_VOP_ELOCKED(vp, "vnode_pager_setsize and not locked vnode"); */
-	VM_OBJECT_LOCK(object);
+	VM_OBJECT_WLOCK(object);
 	if (object->type == OBJT_DEAD) {
-		VM_OBJECT_UNLOCK(object);
+		VM_OBJECT_WUNLOCK(object);
 		return;
 	}
 	KASSERT(object->type == OBJT_VNODE,
@@ -390,7 +392,7 @@
 		/*
 		 * Hasn't changed size
 		 */
-		VM_OBJECT_UNLOCK(object);
+		VM_OBJECT_WUNLOCK(object);
 		return;
 	}
 	nobjsize = OFF_TO_IDX(nsize + PAGE_MASK);
@@ -426,7 +428,7 @@
 			 * have been zeroed.  Some of these valid bits may
 			 * have already been set.
 			 */
-			vm_page_set_valid(m, base, size);
+			vm_page_set_valid_range(m, base, size);
 
 			/*
 			 * Round "base" to the next block boundary so that the
@@ -444,7 +446,7 @@
 			 */
 			vm_page_clear_dirty(m, base, PAGE_SIZE - base);
 		} else if ((nsize & PAGE_MASK) &&
-		    __predict_false(object->cache != NULL)) {
+		    vm_page_is_cached(object, OFF_TO_IDX(nsize))) {
 			vm_page_cache_free(object, OFF_TO_IDX(nsize),
 			    nobjsize);
 		}
@@ -451,7 +453,7 @@
 	}
 	object->un_pager.vnp.vnp_size = nsize;
 	object->size = nobjsize;
-	VM_OBJECT_UNLOCK(object);
+	VM_OBJECT_WUNLOCK(object);
 }
 
 /*
@@ -574,9 +576,9 @@
 			bzero((caddr_t)sf_buf_kva(sf) + i * bsize, bsize);
 		KASSERT((m->dirty & bits) == 0,
 		    ("vnode_pager_input_smlfs: page %p is dirty", m));
-		VM_OBJECT_LOCK(object);
+		VM_OBJECT_WLOCK(object);
 		m->valid |= bits;
-		VM_OBJECT_UNLOCK(object);
+		VM_OBJECT_WUNLOCK(object);
 	}
 	sf_buf_free(sf);
 	if (error) {
@@ -600,7 +602,7 @@
 	struct sf_buf *sf;
 	struct vnode *vp;
 
-	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+	VM_OBJECT_ASSERT_WLOCKED(object);
 	error = 0;
 
 	/*
@@ -613,7 +615,7 @@
 		if (IDX_TO_OFF(m->pindex) + size > object->un_pager.vnp.vnp_size)
 			size = object->un_pager.vnp.vnp_size - IDX_TO_OFF(m->pindex);
 		vp = object->handle;
-		VM_OBJECT_UNLOCK(object);
+		VM_OBJECT_WUNLOCK(object);
 
 		/*
 		 * Allocate a kernel virtual address and initialize so that
@@ -643,7 +645,7 @@
 		}
 		sf_buf_free(sf);
 
-		VM_OBJECT_LOCK(object);
+		VM_OBJECT_WLOCK(object);
 	}
 	KASSERT(m->dirty == 0, ("vnode_pager_input_old: page %p is dirty", m));
 	if (!error)
@@ -673,16 +675,13 @@
 	int rtval;
 	struct vnode *vp;
 	int bytes = count * PAGE_SIZE;
-	int vfslocked;
 
 	vp = object->handle;
-	VM_OBJECT_UNLOCK(object);
-	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+	VM_OBJECT_WUNLOCK(object);
 	rtval = VOP_GETPAGES(vp, m, bytes, reqpage, 0);
 	KASSERT(rtval != EOPNOTSUPP,
 	    ("vnode_pager: FS getpages not implemented\n"));
-	VFS_UNLOCK_GIANT(vfslocked);
-	VM_OBJECT_LOCK(object);
+	VM_OBJECT_WLOCK(object);
 	return rtval;
 }
 
@@ -698,25 +697,22 @@
 	int reqpage;
 {
 	vm_object_t object;
+	struct bufobj *bo;
+	struct buf *bp;
+	struct mount *mp;
 	vm_offset_t kva;
-	off_t foff, tfoff, nextoff;
-	int i, j, size, bsize, first;
 	daddr_t firstaddr, reqblock;
-	struct bufobj *bo;
-	int runpg;
-	int runend;
-	struct buf *bp;
-	int count;
-	int error;
+	off_t foff, nextoff, tfoff, pib;
+	int pbefore, pafter, i, size, bsize, first, last;
+	int count, error, before, after, secmask;
 
-	object = vp->v_object;
-	count = bytecount / PAGE_SIZE;
-
 	KASSERT(vp->v_type != VCHR && vp->v_type != VBLK,
 	    ("vnode_pager_generic_getpages does not support devices"));
 	if (vp->v_iflag & VI_DOOMED)
-		return VM_PAGER_BAD;
+		return (VM_PAGER_BAD);
 
+	object = vp->v_object;
+	count = bytecount / PAGE_SIZE;
 	bsize = vp->v_mount->mnt_stat.f_iosize;
 
 	/* get the UNDERLYING device for the file with VOP_BMAP() */
@@ -730,9 +726,10 @@
 	/*
 	 * if we can't bmap, use old VOP code
 	 */
-	error = VOP_BMAP(vp, foff / bsize, &bo, &reqblock, NULL, NULL);
+	error = VOP_BMAP(vp, IDX_TO_OFF(m[reqpage]->pindex) / bsize, &bo,
+	    &reqblock, &after, &before);
 	if (error == EOPNOTSUPP) {
-		VM_OBJECT_LOCK(object);
+		VM_OBJECT_WLOCK(object);
 		
 		for (i = 0; i < count; i++)
 			if (i != reqpage) {
@@ -743,10 +740,10 @@
 		PCPU_INC(cnt.v_vnodein);
 		PCPU_INC(cnt.v_vnodepgsin);
 		error = vnode_pager_input_old(object, m[reqpage]);
-		VM_OBJECT_UNLOCK(object);
+		VM_OBJECT_WUNLOCK(object);
 		return (error);
 	} else if (error != 0) {
-		VM_OBJECT_LOCK(object);
+		VM_OBJECT_WLOCK(object);
 		for (i = 0; i < count; i++)
 			if (i != reqpage) {
 				vm_page_lock(m[i]);
@@ -753,7 +750,7 @@
 				vm_page_free(m[i]);
 				vm_page_unlock(m[i]);
 			}
-		VM_OBJECT_UNLOCK(object);
+		VM_OBJECT_WUNLOCK(object);
 		return (VM_PAGER_ERROR);
 
 		/*
@@ -763,7 +760,7 @@
 		 */
 	} else if ((PAGE_SIZE / bsize) > 1 &&
 	    (vp->v_mount->mnt_stat.f_type != nfs_mount_type)) {
-		VM_OBJECT_LOCK(object);
+		VM_OBJECT_WLOCK(object);
 		for (i = 0; i < count; i++)
 			if (i != reqpage) {
 				vm_page_lock(m[i]);
@@ -770,10 +767,10 @@
 				vm_page_free(m[i]);
 				vm_page_unlock(m[i]);
 			}
-		VM_OBJECT_UNLOCK(object);
+		VM_OBJECT_WUNLOCK(object);
 		PCPU_INC(cnt.v_vnodein);
 		PCPU_INC(cnt.v_vnodepgsin);
-		return vnode_pager_input_smlfs(object, m[reqpage]);
+		return (vnode_pager_input_smlfs(object, m[reqpage]));
 	}
 
 	/*
@@ -781,7 +778,7 @@
 	 * clean up and return.  Otherwise we have to re-read the
 	 * media.
 	 */
-	VM_OBJECT_LOCK(object);
+	VM_OBJECT_WLOCK(object);
 	if (m[reqpage]->valid == VM_PAGE_BITS_ALL) {
 		for (i = 0; i < count; i++)
 			if (i != reqpage) {
@@ -789,7 +786,7 @@
 				vm_page_free(m[i]);
 				vm_page_unlock(m[i]);
 			}
-		VM_OBJECT_UNLOCK(object);
+		VM_OBJECT_WUNLOCK(object);
 		return VM_PAGER_OK;
 	} else if (reqblock == -1) {
 		pmap_zero_page(m[reqpage]);
@@ -802,87 +799,48 @@
 				vm_page_free(m[i]);
 				vm_page_unlock(m[i]);
 			}
-		VM_OBJECT_UNLOCK(object);
+		VM_OBJECT_WUNLOCK(object);
 		return (VM_PAGER_OK);
 	}
 	m[reqpage]->valid = 0;
-	VM_OBJECT_UNLOCK(object);
+	VM_OBJECT_WUNLOCK(object);
 
-	/*
-	 * here on direct device I/O
-	 */
-	firstaddr = -1;
-
-	/*
-	 * calculate the run that includes the required page
-	 */
-	for (first = 0, i = 0; i < count; i = runend) {
-		if (vnode_pager_addr(vp, IDX_TO_OFF(m[i]->pindex), &firstaddr,
-		    &runpg) != 0) {
-			VM_OBJECT_LOCK(object);
-			for (; i < count; i++)
-				if (i != reqpage) {
-					vm_page_lock(m[i]);
-					vm_page_free(m[i]);
-					vm_page_unlock(m[i]);
-				}
-			VM_OBJECT_UNLOCK(object);
-			return (VM_PAGER_ERROR);
+	pib = IDX_TO_OFF(m[reqpage]->pindex) % bsize;
+	pbefore = ((daddr_t)before * bsize + pib) / PAGE_SIZE;
+	pafter = ((daddr_t)(after + 1) * bsize - pib) / PAGE_SIZE - 1;
+	first = reqpage < pbefore ? 0 : reqpage - pbefore;
+	last = reqpage + pafter >= count ? count - 1 : reqpage + pafter;
+	if (first > 0 || last + 1 < count) {
+		VM_OBJECT_WLOCK(object);
+		for (i = 0; i < first; i++) {
+			vm_page_lock(m[i]);
+			vm_page_free(m[i]);
+			vm_page_unlock(m[i]);
 		}
-		if (firstaddr == -1) {
-			VM_OBJECT_LOCK(object);
-			if (i == reqpage && foff < object->un_pager.vnp.vnp_size) {
-				panic("vnode_pager_getpages: unexpected missing page: firstaddr: %jd, foff: 0x%jx%08jx, vnp_size: 0x%jx%08jx",
-				    (intmax_t)firstaddr, (uintmax_t)(foff >> 32),
-				    (uintmax_t)foff,
-				    (uintmax_t)
-				    (object->un_pager.vnp.vnp_size >> 32),
-				    (uintmax_t)object->un_pager.vnp.vnp_size);
-			}
+		for (i = last + 1; i < count; i++) {
 			vm_page_lock(m[i]);
 			vm_page_free(m[i]);
 			vm_page_unlock(m[i]);
-			VM_OBJECT_UNLOCK(object);
-			runend = i + 1;
-			first = runend;
-			continue;
 		}
-		runend = i + runpg;
-		if (runend <= reqpage) {
-			VM_OBJECT_LOCK(object);
-			for (j = i; j < runend; j++) {
-				vm_page_lock(m[j]);
-				vm_page_free(m[j]);
-				vm_page_unlock(m[j]);
-			}
-			VM_OBJECT_UNLOCK(object);
-		} else {
-			if (runpg < (count - first)) {
-				VM_OBJECT_LOCK(object);
-				for (i = first + runpg; i < count; i++) {
-					vm_page_lock(m[i]);
-					vm_page_free(m[i]);
-					vm_page_unlock(m[i]);
-				}
-				VM_OBJECT_UNLOCK(object);
-				count = first + runpg;
-			}
-			break;
-		}
-		first = runend;
+		VM_OBJECT_WUNLOCK(object);
 	}
 
 	/*
-	 * the first and last page have been calculated now, move input pages
-	 * to be zero based...
+	 * here on direct device I/O
 	 */
-	if (first != 0) {
-		m += first;
-		count -= first;
-		reqpage -= first;
-	}
+	firstaddr = reqblock;
+	firstaddr += pib / DEV_BSIZE;
+	firstaddr -= IDX_TO_OFF(reqpage - first) / DEV_BSIZE;
 
 	/*
+	 * The first and last page have been calculated now, move
+	 * input pages to be zero based, and adjust the count.
+	 */
+	m += first;
+	reqpage -= first;
+	count = last - first + 1;
+
+	/*
 	 * calculate the file virtual address for the transfer
 	 */
 	foff = IDX_TO_OFF(m[0]->pindex);
@@ -899,21 +857,31 @@
 	/*
 	 * round up physical size for real devices.
 	 */
-	if (1) {
-		int secmask = bo->bo_bsize - 1;
-		KASSERT(secmask < PAGE_SIZE && secmask > 0,
-		    ("vnode_pager_generic_getpages: sector size %d too large",
-		    secmask + 1));
-		size = (size + secmask) & ~secmask;
-	}
+	secmask = bo->bo_bsize - 1;
+	KASSERT(secmask < PAGE_SIZE && secmask > 0,
+	    ("vnode_pager_generic_getpages: sector size %d too large",
+	    secmask + 1));
+	size = (size + secmask) & ~secmask;
 
 	bp = getpbuf(&vnode_pbuf_freecnt);
 	kva = (vm_offset_t)bp->b_data;
 
 	/*
-	 * and map the pages to be read into the kva
+	 * and map the pages to be read into the kva, if the filesystem
+	 * requires mapped buffers.
 	 */
-	pmap_qenter(kva, m, count);
+	mp = vp->v_mount;
+	if (mp != NULL && (mp->mnt_kern_flag & MNTK_UNMAPPED_BUFS) != 0 &&
+	    unmapped_buf_allowed) {
+		bp->b_data = unmapped_buf;
+		bp->b_kvabase = unmapped_buf;
+		bp->b_offset = 0;
+		bp->b_flags |= B_UNMAPPED;
+		bp->b_npages = count;
+		for (i = 0; i < count; i++)
+			bp->b_pages[i] = m[i];
+	} else
+		pmap_qenter(kva, m, count);
 
 	/* build a minimal buffer header */
 	bp->b_iocmd = BIO_READ;
@@ -942,11 +910,22 @@
 	if ((bp->b_ioflags & BIO_ERROR) != 0)
 		error = EIO;
 
-	if (!error) {
-		if (size != count * PAGE_SIZE)
-			bzero((caddr_t) kva + size, PAGE_SIZE * count - size);
+	if (error == 0 && size != count * PAGE_SIZE) {
+		if ((bp->b_flags & B_UNMAPPED) != 0) {
+			bp->b_flags &= ~B_UNMAPPED;
+			pmap_qenter(kva, m, count);
+		}
+		bzero((caddr_t)kva + size, PAGE_SIZE * count - size);
 	}
-	pmap_qremove(kva, count);
+	if ((bp->b_flags & B_UNMAPPED) == 0)
+		pmap_qremove(kva, count);
+	if (mp != NULL && (mp->mnt_kern_flag & MNTK_UNMAPPED_BUFS) != 0) {
+		bp->b_data = (caddr_t)kva;
+		bp->b_kvabase = (caddr_t)kva;
+		bp->b_flags &= ~B_UNMAPPED;
+		for (i = 0; i < count; i++)
+			bp->b_pages[i] = NULL;
+	}
 
 	/*
 	 * free the buffer header back to the swap buffer pool
@@ -955,7 +934,7 @@
 	pbrelbo(bp);
 	relpbuf(bp, &vnode_pbuf_freecnt);
 
-	VM_OBJECT_LOCK(object);
+	VM_OBJECT_WLOCK(object);
 	for (i = 0, tfoff = foff; i < count; i++, tfoff = nextoff) {
 		vm_page_t mt;
 
@@ -981,7 +960,7 @@
 			 * we just try to clear the piece that we couldn't
 			 * read.
 			 */
-			vm_page_set_valid(mt, 0,
+			vm_page_set_valid_range(mt, 0,
 			    object->un_pager.vnp.vnp_size - tfoff);
 			KASSERT((mt->dirty & vm_page_bits(0,
 			    object->un_pager.vnp.vnp_size - tfoff)) == 0,
@@ -992,7 +971,7 @@
 		if (i != reqpage)
 			vm_page_readahead_finish(mt);
 	}
-	VM_OBJECT_UNLOCK(object);
+	VM_OBJECT_WUNLOCK(object);
 	if (error) {
 		printf("vnode_pager_getpages: I/O read error\n");
 	}
@@ -1008,12 +987,8 @@
  * backing vp's VOP_PUTPAGES.
  */
 static void
-vnode_pager_putpages(object, m, count, sync, rtvals)
-	vm_object_t object;
-	vm_page_t *m;
-	int count;
-	boolean_t sync;
-	int *rtvals;
+vnode_pager_putpages(vm_object_t object, vm_page_t *m, int count,
+    int flags, int *rtvals)
 {
 	int rtval;
 	struct vnode *vp;
@@ -1022,7 +997,7 @@
 	/*
 	 * Force synchronous operation if we are extremely low on memory
 	 * to prevent a low-memory deadlock.  VOP operations often need to
-	 * allocate more memory to initiate the I/O ( i.e. do a BMAP 
+	 * allocate more memory to initiate the I/O ( i.e. do a BMAP
 	 * operation ).  The swapper handles the case by limiting the amount
 	 * of asynchronous I/O, but that sort of solution doesn't scale well
 	 * for the vnode pager without a lot of work.
@@ -1031,18 +1006,18 @@
 	 * daemon up.  This should be probably be addressed XXX.
 	 */
 
-	if ((cnt.v_free_count + cnt.v_cache_count) < cnt.v_pageout_free_min)
-		sync |= OBJPC_SYNC;
+	if (cnt.v_free_count + cnt.v_cache_count < cnt.v_pageout_free_min)
+		flags |= VM_PAGER_PUT_SYNC;
 
 	/*
 	 * Call device-specific putpages function
 	 */
 	vp = object->handle;
-	VM_OBJECT_UNLOCK(object);
-	rtval = VOP_PUTPAGES(vp, m, bytes, sync, rtvals, 0);
+	VM_OBJECT_WUNLOCK(object);
+	rtval = VOP_PUTPAGES(vp, m, bytes, flags, rtvals, 0);
 	KASSERT(rtval != EOPNOTSUPP, 
 	    ("vnode_pager: stale FS putpages\n"));
-	VM_OBJECT_LOCK(object);
+	VM_OBJECT_WLOCK(object);
 }
 
 
@@ -1104,7 +1079,7 @@
 	 * We do not under any circumstances truncate the valid bits, as
 	 * this will screw up bogus page replacement.
 	 */
-	VM_OBJECT_LOCK(object);
+	VM_OBJECT_WLOCK(object);
 	if (maxsize + poffset > object->un_pager.vnp.vnp_size) {
 		if (object->un_pager.vnp.vnp_size > poffset) {
 			int pgoff;
@@ -1119,8 +1094,7 @@
 				 * pmap operation.
 				 */
 				m = ma[ncount - 1];
-				KASSERT(m->busy > 0,
-		("vnode_pager_generic_putpages: page %p is not busy", m));
+				vm_page_assert_sbusied(m);
 				KASSERT(!pmap_page_is_write_mapped(m),
 		("vnode_pager_generic_putpages: page %p is not read-only", m));
 				vm_page_clear_dirty(m, pgoff, PAGE_SIZE -
@@ -1136,10 +1110,10 @@
 			}
 		}
 	}
-	VM_OBJECT_UNLOCK(object);
+	VM_OBJECT_WUNLOCK(object);
 
 	/*
-	 * pageouts are already clustered, use IO_ASYNC t o force a bawrite()
+	 * pageouts are already clustered, use IO_ASYNC to force a bawrite()
 	 * rather than a bdwrite() to prevent paging I/O from saturating
 	 * the buffer cache.  Dummy-up the sequential heuristic to cause
 	 * large ranges to cluster.  If neither IO_SYNC nor IO_ASYNC is set,
@@ -1190,7 +1164,7 @@
 	if (written == 0)
 		return;
 	obj = ma[0]->object;
-	VM_OBJECT_LOCK(obj);
+	VM_OBJECT_WLOCK(obj);
 	for (i = 0, pos = 0; pos < written; i++, pos += PAGE_SIZE) {
 		if (pos < trunc_page(written)) {
 			rtvals[i] = VM_PAGER_OK;
@@ -1201,7 +1175,7 @@
 			vm_page_clear_dirty(ma[i], 0, written & PAGE_MASK);
 		}
 	}
-	VM_OBJECT_UNLOCK(obj);
+	VM_OBJECT_WUNLOCK(obj);
 }
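
The loop above turns a raw byte count into per-page pager results: every
page lying wholly below trunc_page(written) gets VM_PAGER_OK, and the
first partially written page only has its written prefix (written &
PAGE_MASK bytes) cleared from its dirty bits.  A small self-contained
sketch of the same arithmetic, assuming a 4 KB page size purely for
illustration:

    #include <stdio.h>

    #define PAGE_SIZE       4096
    #define PAGE_MASK       (PAGE_SIZE - 1)
    #define trunc_page(x)   ((x) & ~(size_t)PAGE_MASK)

    int
    main(void)
    {
            size_t written = 10000;         /* example byte count */
            size_t pos;
            int i;

            for (i = 0, pos = 0; pos < written; i++, pos += PAGE_SIZE) {
                    if (pos < trunc_page(written))
                            printf("page %d: fully written\n", i);
                    else
                            printf("page %d: partial, %zu bytes now clean\n",
                                i, written & PAGE_MASK);
            }
            return (0);
    }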
 
 void
@@ -1211,9 +1185,9 @@
 	struct vnode *vp;
 	vm_ooffset_t old_wm;
 
-	VM_OBJECT_LOCK(object);
+	VM_OBJECT_WLOCK(object);
 	if (object->type != OBJT_VNODE) {
-		VM_OBJECT_UNLOCK(object);
+		VM_OBJECT_WUNLOCK(object);
 		return;
 	}
 	old_wm = object->un_pager.vnp.writemappings;
@@ -1222,11 +1196,15 @@
 	if (old_wm == 0 && object->un_pager.vnp.writemappings != 0) {
 		ASSERT_VOP_ELOCKED(vp, "v_writecount inc");
 		VOP_ADD_WRITECOUNT(vp, 1);
+		CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d",
+		    __func__, vp, vp->v_writecount);
 	} else if (old_wm != 0 && object->un_pager.vnp.writemappings == 0) {
 		ASSERT_VOP_ELOCKED(vp, "v_writecount dec");
 		VOP_ADD_WRITECOUNT(vp, -1);
+		CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d",
+		    __func__, vp, vp->v_writecount);
 	}
-	VM_OBJECT_UNLOCK(object);
+	VM_OBJECT_WUNLOCK(object);
 }
 
 void
@@ -1236,9 +1214,8 @@
 	struct vnode *vp;
 	struct mount *mp;
 	vm_offset_t inc;
-	int vfslocked;
 
-	VM_OBJECT_LOCK(object);
+	VM_OBJECT_WLOCK(object);
 
 	/*
 	 * First, recheck the object type to account for the race when
@@ -1245,7 +1222,7 @@
 	 * the vnode is reclaimed.
 	 */
 	if (object->type != OBJT_VNODE) {
-		VM_OBJECT_UNLOCK(object);
+		VM_OBJECT_WUNLOCK(object);
 		return;
 	}
 
@@ -1256,14 +1233,13 @@
 	inc = end - start;
 	if (object->un_pager.vnp.writemappings != inc) {
 		object->un_pager.vnp.writemappings -= inc;
-		VM_OBJECT_UNLOCK(object);
+		VM_OBJECT_WUNLOCK(object);
 		return;
 	}
 
 	vp = object->handle;
 	vhold(vp);
-	VM_OBJECT_UNLOCK(object);
-	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+	VM_OBJECT_WUNLOCK(object);
 	mp = NULL;
 	vn_start_write(vp, &mp, V_WAIT);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
@@ -1279,5 +1255,4 @@
 	vdrop(vp);
 	if (mp != NULL)
 		vn_finished_write(mp);
-	VFS_UNLOCK_GIANT(vfslocked);
 }

Modified: trunk/sys/vm/vnode_pager.h
===================================================================
--- trunk/sys/vm/vnode_pager.h	2018-05-24 22:26:03 UTC (rev 9895)
+++ trunk/sys/vm/vnode_pager.h	2018-05-24 22:27:41 UTC (rev 9896)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1990 University of Utah.
  * Copyright (c) 1991, 1993
@@ -32,7 +33,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)vnode_pager.h	8.1 (Berkeley) 6/11/93
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/vm/vnode_pager.h 232071 2012-02-23 21:07:16Z kib $
  */
 
 #ifndef	_VNODE_PAGER_

Added: trunk/sys/x86/acpica/acpi_wakeup.c
===================================================================
--- trunk/sys/x86/acpica/acpi_wakeup.c	                        (rev 0)
+++ trunk/sys/x86/acpica/acpi_wakeup.c	2018-05-24 22:27:41 UTC (rev 9896)
@@ -0,0 +1,409 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 2001 Takanori Watanabe <takawata at jp.freebsd.org>
+ * Copyright (c) 2001-2012 Mitsuru IWASAKI <iwasaki at jp.freebsd.org>
+ * Copyright (c) 2003 Peter Wemm
+ * Copyright (c) 2008-2012 Jung-uk Kim <jkim at FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/10/sys/x86/acpica/acpi_wakeup.c 331910 2018-04-03 07:52:06Z avg $");
+
+#ifdef __i386__
+#include "opt_npx.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/eventhandler.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/memrange.h>
+#include <sys/smp.h>
+#include <sys/systm.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/clock.h>
+#include <machine/cpu.h>
+#include <machine/intr_machdep.h>
+#include <x86/mca.h>
+#include <machine/pcb.h>
+#include <machine/pmap.h>
+#include <machine/specialreg.h>
+#include <machine/md_var.h>
+
+#ifdef SMP
+#include <x86/apicreg.h>
+#include <machine/smp.h>
+#include <machine/vmparam.h>
+#endif
+
+#include <contrib/dev/acpica/include/acpi.h>
+
+#include <dev/acpica/acpivar.h>
+
+#include "acpi_wakecode.h"
+#include "acpi_wakedata.h"
+
+/* Make sure the code is less than a page and leave room for the stack. */
+CTASSERT(sizeof(wakecode) < PAGE_SIZE - 1024);
+
+extern int		acpi_resume_beep;
+extern int		acpi_reset_video;
+
+#ifdef SMP
+extern struct susppcb	**susppcbs;
+static cpuset_t		suspcpus;
+#else
+static struct susppcb	**susppcbs;
+#endif
+
+static void		*acpi_alloc_wakeup_handler(void);
+static void		acpi_stop_beep(void *);
+
+#ifdef SMP
+static int		acpi_wakeup_ap(struct acpi_softc *, int);
+static void		acpi_wakeup_cpus(struct acpi_softc *);
+#endif
+
+#ifdef __amd64__
+#define ACPI_PAGETABLES	3
+#else
+#define ACPI_PAGETABLES	0
+#endif
+
+#define	WAKECODE_VADDR(sc)				\
+    ((sc)->acpi_wakeaddr + (ACPI_PAGETABLES * PAGE_SIZE))
+#define	WAKECODE_PADDR(sc)				\
+    ((sc)->acpi_wakephys + (ACPI_PAGETABLES * PAGE_SIZE))
+#define	WAKECODE_FIXUP(offset, type, val)	do {	\
+	type	*addr;					\
+	addr = (type *)(WAKECODE_VADDR(sc) + offset);	\
+	*addr = val;					\
+} while (0)
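
The WAKECODE_FIXUP() macro above patches a typed value at a byte offset
inside the wake code copied to WAKECODE_VADDR(sc); the offsets
(wakeup_pcb, wakeup_gdt, ...) come from the generated headers included
earlier in this file.  A stand-alone sketch of the same patching pattern,
with fixup_u32() and its arguments being illustrative names only:

    #include <stdint.h>
    #include <string.h>

    /*
     * Patch a 32-bit value at a byte offset inside a copied code/data
     * blob.  The sketch uses memcpy() so the store is alignment-safe;
     * the macro in the diff performs a direct typed store, which is
     * fine for the known wake code layout.
     */
    static void
    fixup_u32(uint8_t *blob, size_t offset, uint32_t val)
    {
            memcpy(blob + offset, &val, sizeof(val));
    }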
+
+static void
+acpi_stop_beep(void *arg)
+{
+
+	if (acpi_resume_beep != 0)
+		timer_spkr_release();
+}
+
+#ifdef SMP
+static int
+acpi_wakeup_ap(struct acpi_softc *sc, int cpu)
+{
+	struct pcb *pcb;
+	int		vector = (WAKECODE_PADDR(sc) >> 12) & 0xff;
+	int		apic_id = cpu_apic_ids[cpu];
+	int		ms;
+
+	pcb = &susppcbs[cpu]->sp_pcb;
+	WAKECODE_FIXUP(wakeup_pcb, struct pcb *, pcb);
+	WAKECODE_FIXUP(wakeup_gdt, uint16_t, pcb->pcb_gdt.rd_limit);
+	WAKECODE_FIXUP(wakeup_gdt + 2, uint64_t, pcb->pcb_gdt.rd_base);
+
+	ipi_startup(apic_id, vector);
+
+	/* Wait up to 5 seconds for it to resume. */
+	for (ms = 0; ms < 5000; ms++) {
+		if (!CPU_ISSET(cpu, &suspended_cpus))
+			return (1);	/* return SUCCESS */
+		DELAY(1000);
+	}
+	return (0);		/* return FAILURE */
+}
+
+#define	WARMBOOT_TARGET		0
+#define	WARMBOOT_OFF		(KERNBASE + 0x0467)
+#define	WARMBOOT_SEG		(KERNBASE + 0x0469)
+
+#define	CMOS_REG		(0x70)
+#define	CMOS_DATA		(0x71)
+#define	BIOS_RESET		(0x0f)
+#define	BIOS_WARM		(0x0a)
+
+static void
+acpi_wakeup_cpus(struct acpi_softc *sc)
+{
+	uint32_t	mpbioswarmvec;
+	int		cpu;
+	u_char		mpbiosreason;
+
+	/* save the current value of the warm-start vector */
+	mpbioswarmvec = *((uint32_t *)WARMBOOT_OFF);
+	outb(CMOS_REG, BIOS_RESET);
+	mpbiosreason = inb(CMOS_DATA);
+
+	/* setup a vector to our boot code */
+	*((volatile u_short *)WARMBOOT_OFF) = WARMBOOT_TARGET;
+	*((volatile u_short *)WARMBOOT_SEG) = WAKECODE_PADDR(sc) >> 4;
+	outb(CMOS_REG, BIOS_RESET);
+	outb(CMOS_DATA, BIOS_WARM);	/* 'warm-start' */
+
+	/* Wake up each AP. */
+	for (cpu = 1; cpu < mp_ncpus; cpu++) {
+		if (!CPU_ISSET(cpu, &suspcpus))
+			continue;
+		if (acpi_wakeup_ap(sc, cpu) == 0) {
+			/* restore the warmstart vector */
+			*(uint32_t *)WARMBOOT_OFF = mpbioswarmvec;
+			panic("acpi_wakeup: failed to resume AP #%d (PHY #%d)",
+			    cpu, cpu_apic_ids[cpu]);
+		}
+	}
+
+	/* restore the warmstart vector */
+	*(uint32_t *)WARMBOOT_OFF = mpbioswarmvec;
+
+	outb(CMOS_REG, BIOS_RESET);
+	outb(CMOS_DATA, mpbiosreason);
+}
+#endif
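
For the warm start above, the BIOS warm-boot vector at physical 0x467
(written here through its KERNBASE + 0x0467 mapping) is a real-mode
offset:segment pair: the offset word (WARMBOOT_OFF) gets WARMBOOT_TARGET
(0) and the segment word (WARMBOOT_SEG) gets WAKECODE_PADDR(sc) >> 4, so
a woken AP begins execution at the wake code's physical address.  The
segment arithmetic as a stand-alone helper (realmode_phys() is an
illustrative name):

    #include <stdint.h>

    /* Real-mode segment:offset to physical address. */
    static inline uint32_t
    realmode_phys(uint16_t seg, uint16_t off)
    {
            return (((uint32_t)seg << 4) + off);
    }

    /*
     * realmode_phys(WAKECODE_PADDR(sc) >> 4, 0) == WAKECODE_PADDR(sc),
     * provided the wake code is page aligned so its low four address
     * bits are zero.
     */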
+
+int
+acpi_sleep_machdep(struct acpi_softc *sc, int state)
+{
+	ACPI_STATUS	status;
+	struct pcb	*pcb;
+
+	if (sc->acpi_wakeaddr == 0ul)
+		return (-1);	/* couldn't alloc wake memory */
+
+#ifdef SMP
+	suspcpus = all_cpus;
+	CPU_CLR(PCPU_GET(cpuid), &suspcpus);
+#endif
+
+	if (acpi_resume_beep != 0)
+		timer_spkr_acquire();
+
+	AcpiSetFirmwareWakingVector(WAKECODE_PADDR(sc), 0);
+
+	intr_suspend();
+
+	pcb = &susppcbs[0]->sp_pcb;
+	if (savectx(pcb)) {
+#ifdef __amd64__
+		fpususpend(susppcbs[0]->sp_fpususpend);
+#elif defined(DEV_NPX)
+		npxsuspend(susppcbs[0]->sp_fpususpend);
+#endif
+#ifdef SMP
+		if (!CPU_EMPTY(&suspcpus) && suspend_cpus(suspcpus) == 0) {
+			device_printf(sc->acpi_dev, "Failed to suspend APs\n");
+			return (0);	/* couldn't sleep */
+		}
+#endif
+
+		WAKECODE_FIXUP(resume_beep, uint8_t, (acpi_resume_beep != 0));
+		WAKECODE_FIXUP(reset_video, uint8_t, (acpi_reset_video != 0));
+
+#ifndef __amd64__
+		WAKECODE_FIXUP(wakeup_cr4, register_t, pcb->pcb_cr4);
+#endif
+		WAKECODE_FIXUP(wakeup_pcb, struct pcb *, pcb);
+		WAKECODE_FIXUP(wakeup_gdt, uint16_t, pcb->pcb_gdt.rd_limit);
+		WAKECODE_FIXUP(wakeup_gdt + 2, uint64_t, pcb->pcb_gdt.rd_base);
+
+		/* Call ACPICA to enter the desired sleep state */
+		if (state == ACPI_STATE_S4 && sc->acpi_s4bios)
+			status = AcpiEnterSleepStateS4bios();
+		else
+			status = AcpiEnterSleepState(state);
+		if (ACPI_FAILURE(status)) {
+			device_printf(sc->acpi_dev,
+			    "AcpiEnterSleepState failed - %s\n",
+			    AcpiFormatException(status));
+			return (0);	/* couldn't sleep */
+		}
+
+		for (;;)
+			ia32_pause();
+	} else {
+#ifdef __amd64__
+		fpuresume(susppcbs[0]->sp_fpususpend);
+#elif defined(DEV_NPX)
+		npxresume(susppcbs[0]->sp_fpususpend);
+#endif
+	}
+
+	return (1);	/* wakeup successfully */
+}
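
savectx()/resumectx() above give the classic returns-twice control flow:
the if branch runs on the initial call (suspend FPU state, park the APs,
enter the sleep state), and on wakeup the wake code, whose return target
is patched to resumectx() via the wakeup_ret fixup further below, makes
savectx() appear to return a second time into the else branch, where FPU
state is restored.  A userland analogy only, using setjmp()/longjmp();
note the branch sense is inverted relative to setjmp(), where the direct
call is the one that returns 0:

    #include <setjmp.h>
    #include <stdio.h>

    static jmp_buf ctx;

    static void
    fake_resume(void)
    {
            longjmp(ctx, 1);        /* stands in for resumectx(pcb) */
    }

    int
    main(void)
    {
            if (setjmp(ctx) == 0) {         /* "save context, go to sleep" */
                    printf("context saved, suspending\n");
                    fake_resume();          /* wake path jumps back */
            } else {
                    printf("resumed, restoring FPU state\n");
            }
            return (0);
    }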
+
+int
+acpi_wakeup_machdep(struct acpi_softc *sc, int state, int sleep_result,
+    int intr_enabled)
+{
+
+	if (sleep_result == -1)
+		return (sleep_result);
+
+	if (!intr_enabled) {
+		/* Wakeup MD procedures in interrupt disabled context */
+		if (sleep_result == 1) {
+			pmap_init_pat();
+			initializecpu();
+			PCPU_SET(switchtime, 0);
+			PCPU_SET(switchticks, ticks);
+#ifdef SMP
+			if (!CPU_EMPTY(&suspcpus))
+				acpi_wakeup_cpus(sc);
+#endif
+		}
+
+#ifdef SMP
+		if (!CPU_EMPTY(&suspcpus))
+			resume_cpus(suspcpus);
+#endif
+		mca_resume();
+#ifdef __amd64__
+		if (vmm_resume_p != NULL)
+			vmm_resume_p();
+#endif
+		intr_resume(/*suspend_cancelled*/false);
+
+		AcpiSetFirmwareWakingVector(0, 0);
+	} else {
+		/* Wakeup MD procedures in interrupt enabled context */
+		if (sleep_result == 1 && mem_range_softc.mr_op != NULL &&
+		    mem_range_softc.mr_op->reinit != NULL)
+			mem_range_softc.mr_op->reinit(&mem_range_softc);
+	}
+
+	return (sleep_result);
+}
+
+static void *
+acpi_alloc_wakeup_handler(void)
+{
+	void		*wakeaddr;
+	int		i;
+
+	/*
+	 * Specify the region for our wakeup code.  We want it in the low 1 MB
+	 * region, excluding real mode IVT (0-0x3ff), BDA (0x400-0x4ff), EBDA
+	 * (less than 128KB, below 0xa0000, must be excluded by SMAP and DSDT),
+	 * and ROM area (0xa0000 and above).  The temporary page tables must be
+	 * page-aligned.
+	 */
+	wakeaddr = contigmalloc((ACPI_PAGETABLES + 1) * PAGE_SIZE, M_DEVBUF,
+	    M_WAITOK, 0x500, 0xa0000, PAGE_SIZE, 0ul);
+	if (wakeaddr == NULL) {
+		printf("%s: can't alloc wake memory\n", __func__);
+		return (NULL);
+	}
+	if (EVENTHANDLER_REGISTER(power_resume, acpi_stop_beep, NULL,
+	    EVENTHANDLER_PRI_LAST) == NULL) {
+		printf("%s: can't register event handler\n", __func__);
+		contigfree(wakeaddr, (ACPI_PAGETABLES + 1) * PAGE_SIZE,
+		    M_DEVBUF);
+		return (NULL);
+	}
+	susppcbs = malloc(mp_ncpus * sizeof(*susppcbs), M_DEVBUF, M_WAITOK);
+	for (i = 0; i < mp_ncpus; i++) {
+		susppcbs[i] = malloc(sizeof(**susppcbs), M_DEVBUF, M_WAITOK);
+		susppcbs[i]->sp_fpususpend = alloc_fpusave(M_WAITOK);
+	}
+
+	return (wakeaddr);
+}
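
The contigmalloc() call above encodes the placement rules spelled out in
the comment.  Reading it against the 10-stable contigmalloc(9) signature
(size, malloc type, flags, low, high, alignment, boundary), the arguments
line up as follows; this is annotation of the existing call, not new code:

    /*
     *  contigmalloc(
     *      (ACPI_PAGETABLES + 1) * PAGE_SIZE,  size: page tables + wake code
     *      M_DEVBUF, M_WAITOK,                 malloc type and flags
     *      0x500,                              lowest usable physical address
     *      0xa0000,                            highest usable physical address
     *      PAGE_SIZE,                          alignment (page tables need it)
     *      0ul);                               no boundary-crossing limit
     */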
+
+void
+acpi_install_wakeup_handler(struct acpi_softc *sc)
+{
+	static void	*wakeaddr = NULL;
+#ifdef __amd64__
+	uint64_t	*pt4, *pt3, *pt2;
+	int		i;
+#endif
+
+	if (wakeaddr != NULL)
+		return;
+
+	wakeaddr = acpi_alloc_wakeup_handler();
+	if (wakeaddr == NULL)
+		return;
+
+	sc->acpi_wakeaddr = (vm_offset_t)wakeaddr;
+	sc->acpi_wakephys = vtophys(wakeaddr);
+
+	bcopy(wakecode, (void *)WAKECODE_VADDR(sc), sizeof(wakecode));
+
+	/* Patch GDT base address, ljmp targets. */
+	WAKECODE_FIXUP((bootgdtdesc + 2), uint32_t,
+	    WAKECODE_PADDR(sc) + bootgdt);
+	WAKECODE_FIXUP((wakeup_sw32 + 2), uint32_t,
+	    WAKECODE_PADDR(sc) + wakeup_32);
+#ifdef __amd64__
+	WAKECODE_FIXUP((wakeup_sw64 + 1), uint32_t,
+	    WAKECODE_PADDR(sc) + wakeup_64);
+	WAKECODE_FIXUP(wakeup_pagetables, uint32_t, sc->acpi_wakephys);
+#endif
+
+	/* Save pointers to some global data. */
+	WAKECODE_FIXUP(wakeup_ret, void *, resumectx);
+#ifndef __amd64__
+#if defined(PAE) || defined(PAE_TABLES)
+	WAKECODE_FIXUP(wakeup_cr3, register_t, vtophys(kernel_pmap->pm_pdpt));
+#else
+	WAKECODE_FIXUP(wakeup_cr3, register_t, vtophys(kernel_pmap->pm_pdir));
+#endif
+
+#else
+	/* Build temporary page tables below realmode code. */
+	pt4 = wakeaddr;
+	pt3 = pt4 + (PAGE_SIZE) / sizeof(uint64_t);
+	pt2 = pt3 + (PAGE_SIZE) / sizeof(uint64_t);
+
+	/* Create the initial 1GB replicated page tables */
+	for (i = 0; i < 512; i++) {
+		/*
+		 * Each slot of the level 4 pages points
+		 * to the same level 3 page
+		 */
+		pt4[i] = (uint64_t)(sc->acpi_wakephys + PAGE_SIZE);
+		pt4[i] |= PG_V | PG_RW | PG_U;
+
+		/*
+		 * Each slot of the level 3 pages points
+		 * to the same level 2 page
+		 */
+		pt3[i] = (uint64_t)(sc->acpi_wakephys + (2 * PAGE_SIZE));
+		pt3[i] |= PG_V | PG_RW | PG_U;
+
+		/* The level 2 page slots are mapped with 2MB pages for 1GB. */
+		pt2[i] = i * (2 * 1024 * 1024);
+		pt2[i] |= PG_V | PG_RW | PG_PS | PG_U;
+	}
+#endif
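
A quick check of the coverage arithmetic in the loop above: 512 level-2
entries of 2 MB each map exactly 1 GB, and because every level-4 and
level-3 slot points at the same next-level page, that identity-mapped
gigabyte is replicated across the whole temporary address space.

    #include <stdio.h>

    int
    main(void)
    {
            unsigned long long covered = 512ULL * 2 * 1024 * 1024;

            /* Prints: covered = 1073741824 bytes = 1 GB */
            printf("covered = %llu bytes = %llu GB\n",
                covered, covered >> 30);
            return (0);
    }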
+
+	if (bootverbose)
+		device_printf(sc->acpi_dev, "wakeup code va %#jx pa %#jx\n",
+		    (uintmax_t)sc->acpi_wakeaddr, (uintmax_t)sc->acpi_wakephys);
+}


Property changes on: trunk/sys/x86/acpica/acpi_wakeup.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property

