[Midnightbsd-cvs] src [12314] trunk/sys/vm: sync with FreeBSD 11-stable

laffer1 at midnightbsd.org
Sat Feb 8 14:35:49 EST 2020


Revision: 12314
          http://svnweb.midnightbsd.org/src/?rev=12314
Author:   laffer1
Date:     2020-02-08 14:35:48 -0500 (Sat, 08 Feb 2020)
Log Message:
-----------
sync with FreeBSD 11-stable

Modified Paths:
--------------
    trunk/sys/vm/_vm_radix.h
    trunk/sys/vm/default_pager.c
    trunk/sys/vm/device_pager.c
    trunk/sys/vm/memguard.c
    trunk/sys/vm/memguard.h
    trunk/sys/vm/phys_pager.c
    trunk/sys/vm/pmap.h
    trunk/sys/vm/redzone.c
    trunk/sys/vm/redzone.h
    trunk/sys/vm/sg_pager.c
    trunk/sys/vm/swap_pager.c
    trunk/sys/vm/swap_pager.h
    trunk/sys/vm/uma.h
    trunk/sys/vm/uma_core.c
    trunk/sys/vm/uma_dbg.c
    trunk/sys/vm/uma_dbg.h
    trunk/sys/vm/uma_int.h
    trunk/sys/vm/vm.h
    trunk/sys/vm/vm_extern.h
    trunk/sys/vm/vm_fault.c
    trunk/sys/vm/vm_glue.c
    trunk/sys/vm/vm_init.c
    trunk/sys/vm/vm_kern.c
    trunk/sys/vm/vm_kern.h
    trunk/sys/vm/vm_map.c
    trunk/sys/vm/vm_map.h
    trunk/sys/vm/vm_meter.c
    trunk/sys/vm/vm_mmap.c
    trunk/sys/vm/vm_object.c
    trunk/sys/vm/vm_object.h
    trunk/sys/vm/vm_page.c
    trunk/sys/vm/vm_page.h
    trunk/sys/vm/vm_pageout.c
    trunk/sys/vm/vm_pageout.h
    trunk/sys/vm/vm_pager.c
    trunk/sys/vm/vm_pager.h
    trunk/sys/vm/vm_param.h
    trunk/sys/vm/vm_phys.c
    trunk/sys/vm/vm_phys.h
    trunk/sys/vm/vm_radix.c
    trunk/sys/vm/vm_radix.h
    trunk/sys/vm/vm_reserv.c
    trunk/sys/vm/vm_reserv.h
    trunk/sys/vm/vm_unix.c
    trunk/sys/vm/vm_zeroidle.c
    trunk/sys/vm/vnode_pager.c
    trunk/sys/vm/vnode_pager.h

Added Paths:
-----------
    trunk/sys/vm/vm_domain.c
    trunk/sys/vm/vm_domain.h
    trunk/sys/vm/vm_swapout.c
    trunk/sys/vm/vm_swapout_dummy.c

Modified: trunk/sys/vm/_vm_radix.h
===================================================================
--- trunk/sys/vm/_vm_radix.h	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/_vm_radix.h	2020-02-08 19:35:48 UTC (rev 12314)
@@ -26,7 +26,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/vm/_vm_radix.h 254141 2013-08-09 11:28:55Z attilio $
+ * $FreeBSD: stable/11/sys/vm/_vm_radix.h 321513 2017-07-26 06:52:45Z kib $
  */
 
 #ifndef __VM_RADIX_H_
@@ -37,20 +37,6 @@
  */
 struct vm_radix {
 	uintptr_t	rt_root;
-	uint8_t		rt_flags;
 };
 
-#define	RT_INSERT_INPROG	0x01
-#define	RT_TRIE_MODIFIED	0x02
-
-#ifdef _KERNEL
-
-static __inline boolean_t
-vm_radix_is_empty(struct vm_radix *rtree)
-{
-
-	return (rtree->rt_root == 0);
-}
-
-#endif /* _KERNEL */
 #endif /* !__VM_RADIX_H_ */

Modified: trunk/sys/vm/default_pager.c
===================================================================
--- trunk/sys/vm/default_pager.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/default_pager.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -28,18 +28,10 @@
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
- *
- * The default pager is responsible for supplying backing store to unbacked
- * storage.  The backing store is usually swap so we just fall through to
- * the swap routines.  However, since swap metadata has not been assigned,
- * the swap routines assign and manage the swap backing store through the
- * vm_page->swapblk field.  The object is only converted when the page is 
- * physically freed after having been cleaned and even then vm_page->swapblk
- * is maintained whenever a resident page also has swap backing store.
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/default_pager.c 310363 2016-12-21 11:32:08Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/default_pager.c 315473 2017-03-18 05:38:10Z alc $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -54,14 +46,16 @@
 #include <vm/vm_pager.h>
 #include <vm/swap_pager.h>
 
-static vm_object_t default_pager_alloc(void *, vm_ooffset_t, vm_prot_t,
-    vm_ooffset_t, struct ucred *);
-static void default_pager_dealloc(vm_object_t);
-static int default_pager_getpages(vm_object_t, vm_page_t *, int, int);
-static void default_pager_putpages(vm_object_t, vm_page_t *, int, 
-		boolean_t, int *);
-static boolean_t default_pager_haspage(vm_object_t, vm_pindex_t, int *, 
-		int *);
+static vm_object_t	default_pager_alloc(void *, vm_ooffset_t, vm_prot_t,
+			    vm_ooffset_t, struct ucred *);
+static void		default_pager_dealloc(vm_object_t);
+static int		default_pager_getpages(vm_object_t, vm_page_t *, int,
+			    int *, int *);
+static void		default_pager_putpages(vm_object_t, vm_page_t *, int, 
+			    boolean_t, int *);
+static boolean_t	default_pager_haspage(vm_object_t, vm_pindex_t, int *, 
+			    int *);
+
 /*
  * pagerops for OBJT_DEFAULT - "default pager".
  *
@@ -84,7 +78,7 @@
 };
 
 /*
- * no_pager_alloc just returns an initialized object.
+ * Return an initialized object.
  */
 static vm_object_t
 default_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
@@ -102,51 +96,41 @@
 	object = vm_object_allocate(OBJT_DEFAULT,
 	    OFF_TO_IDX(round_page(offset + size)));
 	if (cred != NULL) {
-		VM_OBJECT_WLOCK(object);
 		object->cred = cred;
 		object->charge = size;
-		VM_OBJECT_WUNLOCK(object);
 	}
 	return (object);
 }
 
 /*
- * deallocate resources associated with default objects.   The default objects
- * have no special resources allocated to them, but the vm_page's being used
- * in this object might.  Still, we do not have to do anything - we will free
- * the swapblk in the underlying vm_page's when we free the vm_page or
- * garbage collect the vm_page cache list.
+ * Deallocate resources associated with the object.
  */
 static void
-default_pager_dealloc(object)
-	vm_object_t object;
+default_pager_dealloc(vm_object_t object)
 {
-	/*
-	 * OBJT_DEFAULT objects have no special resources allocated to them.
-	 */
+
+	/* Reserved swap is released by vm_object_destroy(). */
 	object->type = OBJT_DEAD;
 }
 
 /*
- * Load pages from backing store.  Since OBJT_DEFAULT is converted to
- * OBJT_SWAP at the time a swap-backed vm_page_t is freed, we will never
- * see a vm_page with assigned swap here.
+ * Load pages from backing store.
  */
 static int
-default_pager_getpages(object, m, count, reqpage)
-	vm_object_t object;
-	vm_page_t *m;
-	int count;
-	int reqpage;
+default_pager_getpages(vm_object_t object, vm_page_t *m, int count,
+    int *rbehind, int *rahead)
 {
-	return VM_PAGER_FAIL;
+
+	/*
+	 * Since an OBJT_DEFAULT object is converted to OBJT_SWAP by the first
+	 * call to the putpages method, this function will never be called on
+	 * a vm_page with assigned swap.
+	 */
+	return (VM_PAGER_FAIL);
 }
 
 /*
- * Store pages to backing store.  We should assign swap and initiate
- * I/O.  We do not actually convert the object to OBJT_SWAP here.  The
- * object will be converted when the written-out vm_page_t is moved from the
- * cache to the free list.
+ * Store pages to backing store.
  */
 static void
 default_pager_putpages(vm_object_t object, vm_page_t *m, int count,
@@ -153,28 +137,20 @@
     int flags, int *rtvals)
 {
 
+	/* The swap pager will convert the object to OBJT_SWAP. */
 	swappagerops.pgo_putpages(object, m, count, flags, rtvals);
 }
 
 /*
- * Tell us whether the backing store for the requested (object,index) is
- * synchronized.  i.e. tell us whether we can throw the page away and 
- * reload it later.  So, for example, if we are in the process of writing
- * the page to its backing store, or if no backing store has been assigned,
- * it is not yet synchronized.
- *
- * It is possible to have fully-synchronized swap assigned without the
- * object having been converted.  We just call swap_pager_haspage() to
- * deal with it since it must already deal with it plus deal with swap
- * meta-data structures.
+ * Tell us whether the requested (object,index) is available from the object's
+ * backing store.
  */
 static boolean_t
-default_pager_haspage(object, pindex, before, after)
-	vm_object_t object;
-	vm_pindex_t pindex;
-	int *before;
-	int *after;
+default_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before,
+    int *after)
 {
-	return FALSE;
+
+	/* An OBJT_DEFAULT object has no backing store. */
+	return (FALSE);
 }
 

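The trimmed comments above capture the default pager's contract: an OBJT_DEFAULT object has no backing store, so getpages always fails and haspage always answers FALSE, while the first putpages call is handed to the swap pager, which converts the object to OBJT_SWAP. The standalone C sketch below illustrates only that lazy-conversion idea; the types and helpers are illustrative and are not the kernel API.

    #include <stdio.h>

    /* Illustrative object types, mirroring OBJT_DEFAULT/OBJT_SWAP in spirit. */
    enum obj_type { OBJ_DEFAULT, OBJ_SWAP };

    struct obj {
        enum obj_type type;
        int swap_blocks;        /* stand-in for swap metadata */
    };

    /* getpages: a default object never has anything to read back. */
    static int
    getpages(struct obj *o)
    {
        return (o->type == OBJ_DEFAULT ? -1 : 0);       /* -1 ~ VM_PAGER_FAIL */
    }

    /* putpages: the first pageout assigns swap and converts the object. */
    static void
    putpages(struct obj *o, int npages)
    {
        if (o->type == OBJ_DEFAULT)
            o->type = OBJ_SWAP;
        o->swap_blocks += npages;
    }

    int
    main(void)
    {
        struct obj o = { OBJ_DEFAULT, 0 };

        printf("getpages before pageout: %d\n", getpages(&o));  /* fails */
        putpages(&o, 4);
        printf("type after pageout: %s, blocks: %d\n",
            o.type == OBJ_SWAP ? "swap" : "default", o.swap_blocks);
        return (0);
    }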
Modified: trunk/sys/vm/device_pager.c
===================================================================
--- trunk/sys/vm/device_pager.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/device_pager.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -36,7 +36,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/device_pager.c 320439 2017-06-28 06:13:58Z alc $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/device_pager.c 331722 2018-03-29 02:50:57Z eadler $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -47,6 +47,7 @@
 #include <sys/mman.h>
 #include <sys/rwlock.h>
 #include <sys/sx.h>
+#include <sys/vmmeter.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
@@ -60,10 +61,12 @@
 static vm_object_t dev_pager_alloc(void *, vm_ooffset_t, vm_prot_t,
     vm_ooffset_t, struct ucred *);
 static void dev_pager_dealloc(vm_object_t);
-static int dev_pager_getpages(vm_object_t, vm_page_t *, int, int);
+static int dev_pager_getpages(vm_object_t, vm_page_t *, int, int *, int *);
 static void dev_pager_putpages(vm_object_t, vm_page_t *, int, int, int *);
 static boolean_t dev_pager_haspage(vm_object_t, vm_pindex_t, int *, int *);
 static void dev_pager_free_page(vm_object_t object, vm_page_t m);
+static int dev_pager_populate(vm_object_t object, vm_pindex_t pidx,
+    int fault_type, vm_prot_t, vm_pindex_t *first, vm_pindex_t *last);
 
 /* list of device pager objects */
 static struct pagerlst dev_pager_object_list;
@@ -85,6 +88,7 @@
 	.pgo_getpages =	dev_pager_getpages,
 	.pgo_putpages =	dev_pager_putpages,
 	.pgo_haspage =	dev_pager_haspage,
+	.pgo_populate =	dev_pager_populate,
 };
 
 static int old_dev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot,
@@ -128,6 +132,8 @@
 
 	if (tp != OBJT_DEVICE && tp != OBJT_MGTDEVICE)
 		return (NULL);
+	KASSERT(tp == OBJT_MGTDEVICE || ops->cdev_pg_populate == NULL,
+	    ("populate on unmanaged device pager"));
 
 	/*
 	 * Offset should be page aligned.
@@ -135,8 +141,18 @@
 	if (foff & PAGE_MASK)
 		return (NULL);
 
+	/*
+	 * Treat the mmap(2) file offset as an unsigned value for a
+	 * device mapping.  This, in effect, allows a user to pass all
+	 * possible off_t values as the mapping cookie to the driver.  At
+	 * this point, we know that both foff and size are a multiple
+	 * of the page size.  Do a check to avoid wrap.
+	 */
 	size = round_page(size);
-	pindex = OFF_TO_IDX(foff + size);
+	pindex = UOFF_TO_IDX(foff) + UOFF_TO_IDX(size);
+	if (pindex > OBJ_MAX_SIZE || pindex < UOFF_TO_IDX(foff) ||
+	    pindex < UOFF_TO_IDX(size))
+		return (NULL);
 
 	if (ops->cdev_pg_ctor(handle, size, prot, foff, cred, &color) != 0)
 		return (NULL);
@@ -169,6 +185,11 @@
 			 */
 			if (pindex > object->size)
 				object->size = pindex;
+			KASSERT(object->type == tp,
+			    ("Inconsistent device pager type %p %d",
+			    object, tp));
+			KASSERT(object->un_pager.devp.ops == ops,
+			    ("Inconsistent devops %p %p", object, ops));
 		} else {
 			object = object1;
 			object1 = NULL;
@@ -175,12 +196,14 @@
 			object->handle = handle;
 			TAILQ_INSERT_TAIL(&dev_pager_object_list, object,
 			    pager_object_list);
-			KASSERT(object->type == tp,
-		("Inconsistent device pager type %p %d", object, tp));
+			if (ops->cdev_pg_populate != NULL)
+				vm_object_set_flag(object, OBJ_POPULATE);
 		}
 	} else {
 		if (pindex > object->size)
 			object->size = pindex;
+		KASSERT(object->type == tp,
+		    ("Inconsistent device pager type %p %d", object, tp));
 	}
 	mtx_unlock(&dev_pager_mtx);
 	if (object1 != NULL) {
@@ -256,34 +279,35 @@
 }
 
 static int
-dev_pager_getpages(vm_object_t object, vm_page_t *ma, int count, int reqpage)
+dev_pager_getpages(vm_object_t object, vm_page_t *ma, int count, int *rbehind,
+    int *rahead)
 {
-	int error, i;
+	int error;
 
+	/* Since our haspage reports zero after/before, the count is 1. */
+	KASSERT(count == 1, ("%s: count %d", __func__, count));
 	VM_OBJECT_ASSERT_WLOCKED(object);
+	if (object->un_pager.devp.ops->cdev_pg_fault == NULL)
+		return (VM_PAGER_FAIL);
 	error = object->un_pager.devp.ops->cdev_pg_fault(object,
-	    IDX_TO_OFF(ma[reqpage]->pindex), PROT_READ, &ma[reqpage]);
+	    IDX_TO_OFF(ma[0]->pindex), PROT_READ, &ma[0]);
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 
-	for (i = 0; i < count; i++) {
-		if (i != reqpage) {
-			vm_page_lock(ma[i]);
-			vm_page_free(ma[i]);
-			vm_page_unlock(ma[i]);
-		}
-	}
-
 	if (error == VM_PAGER_OK) {
 		KASSERT((object->type == OBJT_DEVICE &&
-		     (ma[reqpage]->oflags & VPO_UNMANAGED) != 0) ||
+		     (ma[0]->oflags & VPO_UNMANAGED) != 0) ||
 		    (object->type == OBJT_MGTDEVICE &&
-		     (ma[reqpage]->oflags & VPO_UNMANAGED) == 0),
-		    ("Wrong page type %p %p", ma[reqpage], object));
+		     (ma[0]->oflags & VPO_UNMANAGED) == 0),
+		    ("Wrong page type %p %p", ma[0], object));
 		if (object->type == OBJT_DEVICE) {
 			TAILQ_INSERT_TAIL(&object->un_pager.devp.devp_pglist,
-			    ma[reqpage], plinks.q);
+			    ma[0], plinks.q);
 		}
+		if (rbehind)
+			*rbehind = 0;
+		if (rahead)
+			*rahead = 0;
 	}
 
 	return (error);
@@ -290,6 +314,18 @@
 }
 
 static int
+dev_pager_populate(vm_object_t object, vm_pindex_t pidx, int fault_type,
+    vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last)
+{
+
+	VM_OBJECT_ASSERT_WLOCKED(object);
+	if (object->un_pager.devp.ops->cdev_pg_populate == NULL)
+		return (VM_PAGER_FAIL);
+	return (object->un_pager.devp.ops->cdev_pg_populate(object, pidx,
+	    fault_type, max_prot, first, last));
+}
+
+static int
 old_dev_pager_fault(vm_object_t object, vm_ooffset_t offset, int prot,
     vm_page_t *mres)
 {
@@ -355,8 +391,7 @@
 		 */
 		page = vm_page_getfake(paddr, memattr);
 		VM_OBJECT_WLOCK(object);
-		if (vm_page_replace(page, object, (*mres)->pindex) != *mres)
-			panic("old_dev_pager_fault: invalid page replacement");
+		vm_page_replace_checked(page, object, (*mres)->pindex, *mres);
 		vm_page_lock(*mres);
 		vm_page_free(*mres);
 		vm_page_unlock(*mres);

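The new size check in dev_pager_alloc() treats the mmap(2) offset as unsigned, converts foff and size to page indices, and rejects the mapping if the sum exceeds the object-size limit or wraps (a wrapped unsigned sum is smaller than either addend). Below is a small standalone sketch of that test; PAGE_SHIFT, the helper name, and the max_pindex limit are assumptions for illustration, not the kernel's OBJ_MAX_SIZE machinery.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SHIFT          12                      /* assumed 4 KB pages */
    #define UOFF_TO_IDX(off)    ((uint64_t)(off) >> PAGE_SHIFT)

    /*
     * Accept a mapping only if the page-index sum neither exceeds the
     * object-size limit nor wraps around.
     */
    static bool
    mapping_fits(uint64_t foff, uint64_t size, uint64_t max_pindex)
    {
        uint64_t pindex;

        pindex = UOFF_TO_IDX(foff) + UOFF_TO_IDX(size);
        if (pindex > max_pindex ||
            pindex < UOFF_TO_IDX(foff) || pindex < UOFF_TO_IDX(size))
            return (false);
        return (true);
    }

    int
    main(void)
    {
        /* A 64 KB mapping at offset 4 KB into a 1024-page object: accepted. */
        printf("%d\n", mapping_fits(0x1000, 0x10000, 1024));
        /* A mapping whose index range exceeds the object: rejected. */
        printf("%d\n", mapping_fits(0, (uint64_t)1 << 40, 1024));
        return (0);
    }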
Modified: trunk/sys/vm/memguard.c
===================================================================
--- trunk/sys/vm/memguard.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/memguard.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -27,7 +27,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/memguard.c 325037 2017-10-27 14:23:53Z markj $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/memguard.c 331017 2018-03-15 19:08:33Z kevans $");
 
 /*
  * MemGuard is a simple replacement allocator for debugging only
@@ -50,6 +50,7 @@
 #include <sys/malloc.h>
 #include <sys/sysctl.h>
 #include <sys/vmem.h>
+#include <sys/vmmeter.h>
 
 #include <vm/vm.h>
 #include <vm/uma.h>
@@ -68,9 +69,9 @@
  * reserved for MemGuard.
  */
 static u_int vm_memguard_divisor;
-SYSCTL_UINT(_vm_memguard, OID_AUTO, divisor, CTLFLAG_RDTUN,
+SYSCTL_UINT(_vm_memguard, OID_AUTO, divisor, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
     &vm_memguard_divisor,
-    0, "(kmem_size/memguard_divisor) == memguard submap size");     
+    0, "(kmem_size/memguard_divisor) == memguard submap size");
 
 /*
  * Short description (ks_shortdesc) of memory type to monitor.
@@ -131,8 +132,7 @@
 #define MG_GUARD_ALLLARGE	0x002
 #define MG_GUARD_NOFREE		0x004
 static int memguard_options = MG_GUARD_AROUND;
-TUNABLE_INT("vm.memguard.options", &memguard_options);
-SYSCTL_INT(_vm_memguard, OID_AUTO, options, CTLFLAG_RW,
+SYSCTL_INT(_vm_memguard, OID_AUTO, options, CTLFLAG_RWTUN,
     &memguard_options, 0,
     "MemGuard options:\n"
     "\t0x001 - add guard pages around each allocation\n"
@@ -148,8 +148,7 @@
 
 static u_int memguard_frequency;
 static u_long memguard_frequency_hits;
-TUNABLE_INT("vm.memguard.frequency", &memguard_frequency);
-SYSCTL_UINT(_vm_memguard, OID_AUTO, frequency, CTLFLAG_RW,
+SYSCTL_UINT(_vm_memguard, OID_AUTO, frequency, CTLFLAG_RWTUN,
     &memguard_frequency, 0, "Times in 100000 that MemGuard will randomly run");
 SYSCTL_ULONG(_vm_memguard, OID_AUTO, frequency_hits, CTLFLAG_RD,
     &memguard_frequency_hits, 0, "# times MemGuard randomly chose");
@@ -165,6 +164,7 @@
 	u_long mem_pgs, parent_size;
 
 	vm_memguard_divisor = 10;
+	/* CTFLAG_RDTUN doesn't work during the early boot process. */
 	TUNABLE_INT_FETCH("vm.memguard.divisor", &vm_memguard_divisor);
 
 	parent_size = vm_map_max(parent_map) - vm_map_min(parent_map) +
@@ -180,7 +180,7 @@
 	 * This prevents memguard's page promotions from completely
 	 * using up memory, since most malloc(9) calls are sub-page.
 	 */
-	mem_pgs = cnt.v_page_count;
+	mem_pgs = vm_cnt.v_page_count;
 	memguard_physlimit = (mem_pgs / vm_memguard_divisor) * PAGE_SIZE;
 	/*
 	 * We want as much KVA as we can take safely.  Use at most our

Modified: trunk/sys/vm/memguard.h
===================================================================
--- trunk/sys/vm/memguard.h	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/memguard.h	2020-02-08 19:35:48 UTC (rev 12314)
@@ -24,7 +24,7 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/vm/memguard.h 254025 2013-08-07 06:21:20Z jeff $
+ * $FreeBSD: stable/11/sys/vm/memguard.h 254025 2013-08-07 06:21:20Z jeff $
  */
 
 #ifndef _VM_MEMGUARD_H_

Modified: trunk/sys/vm/phys_pager.c
===================================================================
--- trunk/sys/vm/phys_pager.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/phys_pager.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/phys_pager.c 310110 2016-12-15 10:47:35Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/phys_pager.c 327785 2018-01-10 20:39:26Z markj $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -42,6 +42,7 @@
 #include <vm/vm_param.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
+#include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 
 /* list of phys pager objects */
@@ -99,6 +100,7 @@
 				object = object1;
 				object1 = NULL;
 				object->handle = handle;
+				vm_object_set_flag(object, OBJ_POPULATE);
 				TAILQ_INSERT_TAIL(&phys_pager_object_list,
 				    object, pager_object_list);
 			}
@@ -110,6 +112,7 @@
 		vm_object_deallocate(object1);
 	} else {
 		object = vm_object_allocate(OBJT_PHYS, pindex);
+		vm_object_set_flag(object, OBJ_POPULATE);
 	}
 
 	return (object);
@@ -134,7 +137,8 @@
  * Fill as many pages as vm_fault has allocated for us.
  */
 static int
-phys_pager_getpages(vm_object_t object, vm_page_t *m, int count, int reqpage)
+phys_pager_getpages(vm_object_t object, vm_page_t *m, int count, int *rbehind,
+    int *rahead)
 {
 	int i;
 
@@ -149,35 +153,98 @@
 		    ("phys_pager_getpages: partially valid page %p", m[i]));
 		KASSERT(m[i]->dirty == 0,
 		    ("phys_pager_getpages: dirty page %p", m[i]));
-		/* The requested page must remain busy, the others not. */
-		if (i == reqpage) {
-			vm_page_lock(m[i]);
-			vm_page_flash(m[i]);
-			vm_page_unlock(m[i]);
-		} else
-			vm_page_xunbusy(m[i]);
 	}
+	if (rbehind)
+		*rbehind = 0;
+	if (rahead)
+		*rahead = 0;
 	return (VM_PAGER_OK);
 }
 
-static void
-phys_pager_putpages(vm_object_t object, vm_page_t *m, int count, boolean_t sync,
-    int *rtvals)
-{
-
-	panic("phys_pager_putpage called");
-}
-
 /*
  * Implement a pretty aggressive clustered getpages strategy.  Hint that
  * everything in an entire 4MB window should be prefaulted at once.
  *
- * XXX 4MB (1024 slots per page table page) is convenient for x86,
+ * 4MB (1024 slots per page table page) is convenient for x86,
  * but may not be for other arches.
  */
 #ifndef PHYSCLUSTER
 #define PHYSCLUSTER 1024
 #endif
+static int phys_pager_cluster = PHYSCLUSTER;
+SYSCTL_INT(_vm, OID_AUTO, phys_pager_cluster, CTLFLAG_RWTUN,
+    &phys_pager_cluster, 0,
+    "prefault window size for phys pager");
+
+/*
+ * Max hint to vm_page_alloc() about the further allocation needs
+ * inside the phys_pager_populate() loop.  The number of bits used to
+ * implement VM_ALLOC_COUNT() determines the hard limit on this value.
+ * That limit is currently 65535.
+ */
+#define	PHYSALLOC	16
+
+static int
+phys_pager_populate(vm_object_t object, vm_pindex_t pidx,
+    int fault_type __unused, vm_prot_t max_prot __unused, vm_pindex_t *first,
+    vm_pindex_t *last)
+{
+	vm_page_t m;
+	vm_pindex_t base, end, i;
+	int ahead;
+
+	base = rounddown(pidx, phys_pager_cluster);
+	end = base + phys_pager_cluster - 1;
+	if (end >= object->size)
+		end = object->size - 1;
+	if (*first > base)
+		base = *first;
+	if (end > *last)
+		end = *last;
+	*first = base;
+	*last = end;
+
+	for (i = base; i <= end; i++) {
+retry:
+		m = vm_page_lookup(object, i);
+		if (m == NULL) {
+			ahead = MIN(end - i, PHYSALLOC);
+			m = vm_page_alloc(object, i, VM_ALLOC_NORMAL |
+			    VM_ALLOC_ZERO | VM_ALLOC_WAITFAIL |
+			    VM_ALLOC_COUNT(ahead));
+			if (m == NULL)
+				goto retry;
+			if ((m->flags & PG_ZERO) == 0)
+				pmap_zero_page(m);
+			m->valid = VM_PAGE_BITS_ALL;
+		} else if (vm_page_xbusied(m)) {
+			vm_page_lock(m);
+			VM_OBJECT_WUNLOCK(object);
+			vm_page_busy_sleep(m, "physb", true);
+			VM_OBJECT_WLOCK(object);
+			goto retry;
+		} else {
+			vm_page_xbusy(m);
+			if (m->valid != VM_PAGE_BITS_ALL)
+				vm_page_zero_invalid(m, TRUE);
+		}
+
+		KASSERT(m->valid == VM_PAGE_BITS_ALL,
+		    ("phys_pager_populate: partially valid page %p", m));
+		KASSERT(m->dirty == 0,
+		    ("phys_pager_populate: dirty page %p", m));
+	}
+	return (VM_PAGER_OK);
+}
+
+static void
+phys_pager_putpages(vm_object_t object, vm_page_t *m, int count, boolean_t sync,
+    int *rtvals)
+{
+
+	panic("phys_pager_putpage called");
+}
+
 static boolean_t
 phys_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before,
     int *after)
@@ -184,8 +251,8 @@
 {
 	vm_pindex_t base, end;
 
-	base = pindex & (~(PHYSCLUSTER - 1));
-	end = base + (PHYSCLUSTER - 1);
+	base = rounddown(pindex, phys_pager_cluster);
+	end = base + phys_pager_cluster - 1;
 	if (before != NULL)
 		*before = pindex - base;
 	if (after != NULL)
@@ -200,4 +267,5 @@
 	.pgo_getpages =	phys_pager_getpages,
 	.pgo_putpages =	phys_pager_putpages,
 	.pgo_haspage =	phys_pager_haspage,
+	.pgo_populate =	phys_pager_populate,
 };

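phys_pager_populate() prefaults a whole cluster around the faulting index: it rounds pidx down to a phys_pager_cluster boundary, extends the window to the end of the cluster, clamps it to the object size, and finally intersects it with the [*first, *last] range supplied by the fault code. The window arithmetic alone is sketched below as a standalone program; the names are illustrative, not the kernel interface.

    #include <stdint.h>
    #include <stdio.h>

    #define rounddown(x, y)     (((x) / (y)) * (y))     /* as in sys/param.h */

    /*
     * Compute the populate window around pidx: a cluster-aligned range,
     * clamped to the object size and intersected with [*first, *last].
     */
    static void
    populate_window(uint64_t pidx, uint64_t object_size, uint64_t cluster,
        uint64_t *first, uint64_t *last)
    {
        uint64_t base, end;

        base = rounddown(pidx, cluster);
        end = base + cluster - 1;
        if (end >= object_size)
            end = object_size - 1;
        if (*first > base)
            base = *first;
        if (end > *last)
            end = *last;
        *first = base;
        *last = end;
    }

    int
    main(void)
    {
        uint64_t first = 0, last = 5000;

        /* Fault at index 1500 in a 4000-page object, 1024-page clusters. */
        populate_window(1500, 4000, 1024, &first, &last);
        printf("window: [%ju, %ju]\n", (uintmax_t)first, (uintmax_t)last);
        /* Prints "window: [1024, 2047]". */
        return (0);
    }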
Modified: trunk/sys/vm/pmap.h
===================================================================
--- trunk/sys/vm/pmap.h	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/pmap.h	2020-02-08 19:35:48 UTC (rev 12314)
@@ -58,7 +58,7 @@
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  *
- * $FreeBSD: stable/10/sys/vm/pmap.h 270920 2014-09-01 07:58:15Z kib $
+ * $FreeBSD: stable/11/sys/vm/pmap.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 /*
@@ -101,10 +101,22 @@
 /*
  * Flags for pmap_enter().  The bits in the low-order byte are reserved
  * for the protection code (vm_prot_t) that describes the fault type.
+ * Bits 24 through 31 are reserved for the pmap's internal use.
  */
-#define	PMAP_ENTER_NOSLEEP	0x0100
-#define	PMAP_ENTER_WIRED	0x0200
+#define	PMAP_ENTER_NOSLEEP	0x00000100
+#define	PMAP_ENTER_WIRED	0x00000200
+#define	PMAP_ENTER_RESERVED	0xFF000000
 
+/*
+ * Define the maximum number of machine-dependent reference bits that are
+ * cleared by a call to pmap_ts_referenced().  This limit serves two purposes.
+ * First, it bounds the cost of reference bit maintenance on widely shared
+ * pages.  Second, it prevents numeric overflow during maintenance of a
+ * widely shared page's "act_count" field.  An overflow could result in the
+ * premature deactivation of the page.
+ */
+#define	PMAP_TS_REFERENCED_MAX	5
+
 void		 pmap_activate(struct thread *td);
 void		 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
 		    int advice);
@@ -142,6 +154,8 @@
 void		 pmap_protect(pmap_t, vm_offset_t, vm_offset_t, vm_prot_t);
 void		 pmap_qenter(vm_offset_t, vm_page_t *, int);
 void		 pmap_qremove(vm_offset_t, int);
+vm_offset_t	 pmap_quick_enter_page(vm_page_t);
+void		 pmap_quick_remove_page(vm_offset_t);
 void		 pmap_release(pmap_t);
 void		 pmap_remove(pmap_t, vm_offset_t, vm_offset_t);
 void		 pmap_remove_all(vm_page_t m);

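The widened pmap_enter() flag word keeps the low-order byte for the vm_prot_t fault type, places PMAP_ENTER_NOSLEEP and PMAP_ENTER_WIRED just above it, and reserves bits 24 through 31 for the pmap's internal use, so the three groups must stay disjoint. The compile-time sketch below (C11) checks that layout; the VM_PROT_ALL mask used here is an assumption for illustration.

    #include <assert.h>
    #include <stdio.h>

    #define VM_PROT_ALL             0x000000ff  /* assumed: low byte is prot */
    #define PMAP_ENTER_NOSLEEP      0x00000100
    #define PMAP_ENTER_WIRED        0x00000200
    #define PMAP_ENTER_RESERVED     0xFF000000u

    /* The flag groups must occupy disjoint bit ranges. */
    static_assert((VM_PROT_ALL & (PMAP_ENTER_NOSLEEP | PMAP_ENTER_WIRED)) == 0,
        "pmap_enter flags overlap the protection byte");
    static_assert(((PMAP_ENTER_NOSLEEP | PMAP_ENTER_WIRED) &
        PMAP_ENTER_RESERVED) == 0,
        "pmap_enter flags overlap the pmap-reserved byte");

    int
    main(void)
    {
        unsigned flags = PMAP_ENTER_WIRED | 0x03;   /* e.g. read|write prot */

        printf("prot bits: 0x%02x, flag bits: 0x%06x\n",
            flags & VM_PROT_ALL, flags & ~VM_PROT_ALL);
        return (0);
    }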
Modified: trunk/sys/vm/redzone.c
===================================================================
--- trunk/sys/vm/redzone.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/redzone.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/redzone.c 227309 2011-11-07 15:43:11Z ed $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/redzone.c 267992 2014-06-28 03:56:17Z hselasky $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -42,8 +42,7 @@
 SYSCTL_ULONG(_vm_redzone, OID_AUTO, extra_mem, CTLFLAG_RD, &redzone_extra_mem,
     0, "Extra memory allocated by redzone");     
 static int redzone_panic = 0;
-TUNABLE_INT("vm.redzone.panic", &redzone_panic);
-SYSCTL_INT(_vm_redzone, OID_AUTO, panic, CTLFLAG_RW, &redzone_panic, 0,
+SYSCTL_INT(_vm_redzone, OID_AUTO, panic, CTLFLAG_RWTUN, &redzone_panic, 0,
     "Panic when buffer corruption is detected");     
 
 #define	REDZONE_CHSIZE	(16)

Modified: trunk/sys/vm/redzone.h
===================================================================
--- trunk/sys/vm/redzone.h	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/redzone.h	2020-02-08 19:35:48 UTC (rev 12314)
@@ -24,7 +24,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/vm/redzone.h 155086 2006-01-31 11:09:21Z pjd $
+ * $FreeBSD: stable/11/sys/vm/redzone.h 155086 2006-01-31 11:09:21Z pjd $
  */
 
 #ifndef	_VM_REDZONE_H_

Modified: trunk/sys/vm/sg_pager.c
===================================================================
--- trunk/sys/vm/sg_pager.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/sg_pager.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -27,7 +27,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/sg_pager.c 284100 2015-06-06 20:37:40Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/sg_pager.c 331017 2018-03-15 19:08:33Z kevans $");
 
 /*
  * This pager manages OBJT_SG objects.  These objects are backed by
@@ -39,6 +39,8 @@
 #include <sys/mutex.h>
 #include <sys/rwlock.h>
 #include <sys/sglist.h>
+#include <sys/vmmeter.h>
+
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_object.h>
@@ -50,7 +52,7 @@
 static vm_object_t sg_pager_alloc(void *, vm_ooffset_t, vm_prot_t,
     vm_ooffset_t, struct ucred *);
 static void sg_pager_dealloc(vm_object_t);
-static int sg_pager_getpages(vm_object_t, vm_page_t *, int, int);
+static int sg_pager_getpages(vm_object_t, vm_page_t *, int, int *, int *);
 static void sg_pager_putpages(vm_object_t, vm_page_t *, int, 
 		boolean_t, int *);
 static boolean_t sg_pager_haspage(vm_object_t, vm_pindex_t, int *,
@@ -97,8 +99,9 @@
 	 * to map beyond that.
 	 */
 	size = round_page(size);
-	pindex = OFF_TO_IDX(foff + size);
-	if (pindex > npages)
+	pindex = UOFF_TO_IDX(foff) + UOFF_TO_IDX(size);
+	if (pindex > npages || pindex < UOFF_TO_IDX(foff) ||
+	    pindex < UOFF_TO_IDX(size))
 		return (NULL);
 
 	/*
@@ -136,7 +139,8 @@
 }
 
 static int
-sg_pager_getpages(vm_object_t object, vm_page_t *m, int count, int reqpage)
+sg_pager_getpages(vm_object_t object, vm_page_t *m, int count, int *rbehind,
+    int *rahead)
 {
 	struct sglist *sg;
 	vm_page_t m_paddr, page;
@@ -146,11 +150,13 @@
 	size_t space;
 	int i;
 
+	/* Since our haspage reports zero after/before, the count is 1. */
+	KASSERT(count == 1, ("%s: count %d", __func__, count));
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	sg = object->handle;
 	memattr = object->memattr;
 	VM_OBJECT_WUNLOCK(object);
-	offset = m[reqpage]->pindex;
+	offset = m[0]->pindex;
 
 	/*
 	 * Lookup the physical address of the requested page.  An initial
@@ -179,7 +185,7 @@
 	}
 
 	/* Return a fake page for the requested page. */
-	KASSERT(!(m[reqpage]->flags & PG_FICTITIOUS),
+	KASSERT(!(m[0]->flags & PG_FICTITIOUS),
 	    ("backing page for SG is fake"));
 
 	/* Construct a new fake page. */
@@ -186,19 +192,18 @@
 	page = vm_page_getfake(paddr, memattr);
 	VM_OBJECT_WLOCK(object);
 	TAILQ_INSERT_TAIL(&object->un_pager.sgp.sgp_pglist, page, plinks.q);
-
-	/* Free the original pages and insert this fake page into the object. */
-	for (i = 0; i < count; i++) {
-		if (i == reqpage &&
-		    vm_page_replace(page, object, offset) != m[i])
-			panic("sg_pager_getpages: invalid place replacement");
-		vm_page_lock(m[i]);
-		vm_page_free(m[i]);
-		vm_page_unlock(m[i]);
-	}
-	m[reqpage] = page;
+	vm_page_replace_checked(page, object, offset, m[0]);
+	vm_page_lock(m[0]);
+	vm_page_free(m[0]);
+	vm_page_unlock(m[0]);
+	m[0] = page;
 	page->valid = VM_PAGE_BITS_ALL;
 
+	if (rbehind)
+		*rbehind = 0;
+	if (rahead)
+		*rahead = 0;
+
 	return (VM_PAGER_OK);
 }
 

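The scatter/gather and device pagers simply report zero read-behind and read-ahead, while the reworked swap pager below clips the caller's requested rbehind/rahead against the contiguous backing store that haspage() reported around the request (the kernel additionally clips against resident neighboring pages). A minimal standalone sketch of that clipping follows; the names are illustrative.

    #include <stdio.h>

    static int
    imin(int a, int b)
    {
        return (a < b ? a : b);
    }

    /*
     * Clip the caller's requested read-behind/read-ahead to what the pager's
     * haspage() reported as contiguous backing store around the request.
     */
    static void
    clip_readaround(int *rbehind, int *rahead, int maxbehind, int maxahead,
        int reqcount)
    {
        if (rahead != NULL)
            *rahead = imin(*rahead, maxahead - (reqcount - 1));
        if (rbehind != NULL)
            *rbehind = imin(*rbehind, maxbehind);
    }

    int
    main(void)
    {
        int rbehind = 8, rahead = 8;

        /* haspage saw 2 contiguous pages behind and 5 ahead; request is 3. */
        clip_readaround(&rbehind, &rahead, 2, 5, 3);
        printf("rbehind %d rahead %d\n", rbehind, rahead);  /* 2 and 3 */
        return (0);
    }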
Modified: trunk/sys/vm/swap_pager.c
===================================================================
--- trunk/sys/vm/swap_pager.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/swap_pager.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -68,7 +68,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/swap_pager.c 320557 2017-07-01 22:21:11Z alc $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/swap_pager.c 350355 2019-07-26 10:36:07Z kib $");
 
 #include "opt_swap.h"
 #include "opt_vm.h"
@@ -87,10 +87,12 @@
 #include <sys/namei.h>
 #include <sys/vnode.h>
 #include <sys/malloc.h>
+#include <sys/pctrie.h>
 #include <sys/racct.h>
 #include <sys/resource.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
+#include <sys/sbuf.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/blist.h>
@@ -120,7 +122,7 @@
  * The 64-page limit is due to the radix code (kern/subr_blist.c).
  */
 #ifndef MAX_PAGEOUT_CLUSTER
-#define MAX_PAGEOUT_CLUSTER 16
+#define	MAX_PAGEOUT_CLUSTER	32
 #endif
 
 #if !defined(SWB_NPAGES)
@@ -127,22 +129,17 @@
 #define SWB_NPAGES	MAX_PAGEOUT_CLUSTER
 #endif
 
+#define	SWAP_META_PAGES		PCTRIE_COUNT
+
 /*
- * The swblock structure maps an object and a small, fixed-size range
- * of page indices to disk addresses within a swap area.
- * The collection of these mappings is implemented as a hash table.
- * Unused disk addresses within a swap area are allocated and managed
- * using a blist.
+ * A swblk structure maps each page index within a
+ * SWAP_META_PAGES-aligned and sized range to the address of an
+ * on-disk swap block (or SWAPBLK_NONE). The collection of these
+ * mappings for an entire vm object is implemented as a pc-trie.
  */
-#define SWAP_META_PAGES		(SWB_NPAGES * 2)
-#define SWAP_META_MASK		(SWAP_META_PAGES - 1)
-
-struct swblock {
-	struct swblock	*swb_hnext;
-	vm_object_t	swb_object;
-	vm_pindex_t	swb_index;
-	int		swb_count;
-	daddr_t		swb_pages[SWAP_META_PAGES];
+struct swblk {
+	vm_pindex_t	p;
+	daddr_t		d[SWAP_META_PAGES];
 };
 
 static MALLOC_DEFINE(M_VMPGDATA, "vm_pgdata", "swap pager private data");
@@ -151,7 +148,7 @@
 static struct swdevt *swdevhd;	/* Allocate from here next */
 static int nswapdev;		/* Number of swap devices */
 int swap_pager_avail;
-static int swdev_syscall_active = 0; /* serialize swap(on|off) */
+static struct sx swdev_syscall_lock;	/* serialize swap(on|off) */
 
 static vm_ooffset_t swap_total;
 SYSCTL_QUAD(_vm, OID_AUTO, swap_total, CTLFLAG_RD, &swap_total, 0,
@@ -160,7 +157,7 @@
 SYSCTL_QUAD(_vm, OID_AUTO, swap_reserved, CTLFLAG_RD, &swap_reserved, 0,
     "Amount of swap storage needed to back all allocated anonymous memory.");
 static int overcommit = 0;
-SYSCTL_INT(_vm, OID_AUTO, overcommit, CTLFLAG_RW, &overcommit, 0,
+SYSCTL_INT(_vm, VM_OVERCOMMIT, overcommit, CTLFLAG_RW, &overcommit, 0,
     "Configure virtual memory overcommit behavior. See tuning(7) "
     "for details.");
 static unsigned long swzone;
@@ -210,7 +207,7 @@
 	mtx_lock(&sw_dev_mtx);
 	r = swap_reserved + incr;
 	if (overcommit & SWAP_RESERVE_ALLOW_NONWIRED) {
-		s = cnt.v_page_count - cnt.v_free_reserved - cnt.v_wire_count;
+		s = vm_cnt.v_page_count - vm_cnt.v_free_reserved - vm_cnt.v_wire_count;
 		s *= PAGE_SIZE;
 	} else
 		s = 0;
@@ -223,16 +220,14 @@
 	mtx_unlock(&sw_dev_mtx);
 
 	if (res) {
-		PROC_LOCK(curproc);
 		UIDINFO_VMSIZE_LOCK(uip);
 		if ((overcommit & SWAP_RESERVE_RLIMIT_ON) != 0 &&
-		    uip->ui_vmsize + incr > lim_cur(curproc, RLIMIT_SWAP) &&
+		    uip->ui_vmsize + incr > lim_cur(curthread, RLIMIT_SWAP) &&
 		    priv_check(curthread, PRIV_VM_SWAP_NORLIMIT))
 			res = 0;
 		else
 			uip->ui_vmsize += incr;
 		UIDINFO_VMSIZE_UNLOCK(uip);
-		PROC_UNLOCK(curproc);
 		if (!res) {
 			mtx_lock(&sw_dev_mtx);
 			swap_reserved -= incr;
@@ -314,12 +309,10 @@
 	racct_sub_cred(cred, RACCT_SWAP, decr);
 }
 
-static void swapdev_strategy(struct buf *, struct swdevt *sw);
-
 #define SWM_FREE	0x02	/* free, period			*/
 #define SWM_POP		0x04	/* pop out			*/
 
-int swap_pager_full = 2;	/* swap space exhaustion (task killing) */
+static int swap_pager_full = 2;	/* swap space exhaustion (task killing) */
 static int swap_pager_almost_full = 1; /* swap space exhaustion (w/hysteresis)*/
 static int nsw_rcount;		/* free read buffers			*/
 static int nsw_wcount_sync;	/* limit write buffers / synchronous	*/
@@ -327,17 +320,17 @@
 static int nsw_wcount_async_max;/* assigned maximum			*/
 static int nsw_cluster_max;	/* maximum VOP I/O allowed		*/
 
-static struct swblock **swhash;
-static int swhash_mask;
-static struct mtx swhash_mtx;
+static int sysctl_swap_async_max(SYSCTL_HANDLER_ARGS);
+SYSCTL_PROC(_vm, OID_AUTO, swap_async_max, CTLTYPE_INT | CTLFLAG_RW |
+    CTLFLAG_MPSAFE, NULL, 0, sysctl_swap_async_max, "I",
+    "Maximum running async swap ops");
+static int sysctl_swap_fragmentation(SYSCTL_HANDLER_ARGS);
+SYSCTL_PROC(_vm, OID_AUTO, swap_fragmentation, CTLTYPE_STRING | CTLFLAG_RD |
+    CTLFLAG_MPSAFE, NULL, 0, sysctl_swap_fragmentation, "A",
+    "Swap Fragmentation Info");
 
-static int swap_async_max = 4;	/* maximum in-progress async I/O's	*/
 static struct sx sw_alloc_sx;
 
-
-SYSCTL_INT(_vm, OID_AUTO, swap_async_max,
-	CTLFLAG_RW, &swap_async_max, 0, "Maximum running async swap ops");
-
 /*
  * "named" and "unnamed" anon region objects.  Try to reduce the overhead
  * of searching a named list by hashing it just a little.
@@ -348,9 +341,9 @@
 #define NOBJLIST(handle)	\
 	(&swap_pager_object_list[((int)(intptr_t)handle >> 4) & (NOBJLISTS-1)])
 
-static struct mtx sw_alloc_mtx;	/* protect list manipulation */
 static struct pagerlst	swap_pager_object_list[NOBJLISTS];
-static uma_zone_t	swap_zone;
+static uma_zone_t swblk_zone;
+static uma_zone_t swpctrie_zone;
 
 /*
  * pagerops for OBJT_SWAP - "swap pager".  Some ops are also global procedure
@@ -361,7 +354,10 @@
 		swap_pager_alloc(void *handle, vm_ooffset_t size,
 		    vm_prot_t prot, vm_ooffset_t offset, struct ucred *);
 static void	swap_pager_dealloc(vm_object_t object);
-static int	swap_pager_getpages(vm_object_t, vm_page_t *, int, int);
+static int	swap_pager_getpages(vm_object_t, vm_page_t *, int, int *,
+    int *);
+static int	swap_pager_getpages_async(vm_object_t, vm_page_t *, int, int *,
+    int *, pgo_getpages_iodone_t, void *);
 static void	swap_pager_putpages(vm_object_t, vm_page_t *, int, boolean_t, int *);
 static boolean_t
 		swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after);
@@ -374,6 +370,7 @@
 	.pgo_alloc =	swap_pager_alloc,	/* allocate an OBJT_SWAP object		*/
 	.pgo_dealloc =	swap_pager_dealloc,	/* deallocate an OBJT_SWAP object	*/
 	.pgo_getpages =	swap_pager_getpages,	/* pagein				*/
+	.pgo_getpages_async = swap_pager_getpages_async, /* pagein (async)		*/
 	.pgo_putpages =	swap_pager_putpages,	/* pageout				*/
 	.pgo_haspage =	swap_pager_haspage,	/* get backing store status for page	*/
 	.pgo_pageunswapped = swap_pager_unswapped,	/* remove swap related to page		*/
@@ -391,7 +388,7 @@
 
 static void	swp_sizecheck(void);
 static void	swp_pager_async_iodone(struct buf *bp);
-static int	swapongeom(struct thread *, struct vnode *);
+static int	swapongeom(struct vnode *);
 static int	swaponvp(struct thread *, struct vnode *, u_long);
 static int	swapoff_one(struct swdevt *sp, struct ucred *cred);
 
@@ -404,22 +401,28 @@
 /*
  * Metadata functions
  */
-static struct swblock **swp_pager_hash(vm_object_t object, vm_pindex_t index);
 static void swp_pager_meta_build(vm_object_t, vm_pindex_t, daddr_t);
-static void swp_pager_meta_free(vm_object_t, vm_pindex_t, daddr_t);
+static void swp_pager_meta_free(vm_object_t, vm_pindex_t, vm_pindex_t);
 static void swp_pager_meta_free_all(vm_object_t);
 static daddr_t swp_pager_meta_ctl(vm_object_t, vm_pindex_t, int);
 
+static void *
+swblk_trie_alloc(struct pctrie *ptree)
+{
+
+	return (uma_zalloc(swpctrie_zone, M_NOWAIT | (curproc == pageproc ?
+	    M_USE_RESERVE : 0)));
+}
+
 static void
-swp_pager_free_nrpage(vm_page_t m)
+swblk_trie_free(struct pctrie *ptree, void *node)
 {
 
-	vm_page_lock(m);
-	if (m->wire_count == 0)
-		vm_page_free(m);
-	vm_page_unlock(m);
+	uma_zfree(swpctrie_zone, node);
 }
 
+PCTRIE_DEFINE(SWAP, swblk, p, swblk_trie_alloc, swblk_trie_free);
+
 /*
  * SWP_SIZECHECK() -	update swap_pager_full indication
  *
@@ -448,33 +451,6 @@
 }
 
 /*
- * SWP_PAGER_HASH() -	hash swap meta data
- *
- *	This is an helper function which hashes the swapblk given
- *	the object and page index.  It returns a pointer to a pointer
- *	to the object, or a pointer to a NULL pointer if it could not
- *	find a swapblk.
- */
-static struct swblock **
-swp_pager_hash(vm_object_t object, vm_pindex_t index)
-{
-	struct swblock **pswap;
-	struct swblock *swap;
-
-	index &= ~(vm_pindex_t)SWAP_META_MASK;
-	pswap = &swhash[(index ^ (int)(intptr_t)object) & swhash_mask];
-	while ((swap = *pswap) != NULL) {
-		if (swap->swb_object == object &&
-		    swap->swb_index == index
-		) {
-			break;
-		}
-		pswap = &swap->swb_hnext;
-	}
-	return (pswap);
-}
-
-/*
  * SWAP_PAGER_INIT() -	initialize the swap pager!
  *
  *	Expected to be started from system init.  NOTE:  This code is run
@@ -491,9 +467,9 @@
 
 	for (i = 0; i < NOBJLISTS; ++i)
 		TAILQ_INIT(&swap_pager_object_list[i]);
-	mtx_init(&sw_alloc_mtx, "swap_pager list", NULL, MTX_DEF);
 	mtx_init(&sw_dev_mtx, "swapdev", NULL, MTX_DEF);
 	sx_init(&sw_alloc_sx, "swspsx");
+	sx_init(&swdev_syscall_lock, "swsysc");
 }
 
 /*
@@ -539,21 +515,25 @@
 	mtx_unlock(&pbuf_mtx);
 
 	/*
-	 * Initialize our zone.  Right now I'm just guessing on the number
-	 * we need based on the number of pages in the system.  Each swblock
-	 * can hold 32 pages, so this is probably overkill.  This reservation
-	 * is typically limited to around 32MB by default.
+	 * Initialize our zone, taking the user's requested size or
+	 * estimating the number we need based on the number of pages
+	 * in the system.
 	 */
-	n = cnt.v_page_count / 2;
-	if (maxswzone && n > maxswzone / sizeof(struct swblock))
-		n = maxswzone / sizeof(struct swblock);
+	n = maxswzone != 0 ? maxswzone / sizeof(struct swblk) :
+	    vm_cnt.v_page_count / 2;
+	swpctrie_zone = uma_zcreate("swpctrie", pctrie_node_size(), NULL, NULL,
+	    pctrie_zone_init, NULL, UMA_ALIGN_PTR,
+	    UMA_ZONE_NOFREE | UMA_ZONE_VM);
+	if (swpctrie_zone == NULL)
+		panic("failed to create swap pctrie zone.");
+	swblk_zone = uma_zcreate("swblk", sizeof(struct swblk), NULL, NULL,
+	    NULL, NULL, _Alignof(struct swblk) - 1,
+	    UMA_ZONE_NOFREE | UMA_ZONE_VM);
+	if (swblk_zone == NULL)
+		panic("failed to create swap blk zone.");
 	n2 = n;
-	swap_zone = uma_zcreate("SWAPMETA", sizeof(struct swblock), NULL, NULL,
-	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM);
-	if (swap_zone == NULL)
-		panic("failed to create swap_zone.");
 	do {
-		if (uma_zone_reserve_kva(swap_zone, n))
+		if (uma_zone_reserve_kva(swblk_zone, n))
 			break;
 		/*
 		 * if the allocation failed, try a zone two thirds the
@@ -561,25 +541,50 @@
 		 */
 		n -= ((n + 2) / 3);
 	} while (n > 0);
-	if (n2 != n)
-		printf("Swap zone entries reduced from %lu to %lu.\n", n2, n);
+
+	/*
+	 * Often uma_zone_reserve_kva() cannot reserve exactly the
+	 * requested size.  Account for the difference when
+	 * calculating swap_maxpages.
+	 */
+	n = uma_zone_get_max(swblk_zone);
+
+	if (n < n2)
+		printf("Swap blk zone entries changed from %lu to %lu.\n",
+		    n2, n);
 	swap_maxpages = n * SWAP_META_PAGES;
-	swzone = n * sizeof(struct swblock);
-	n2 = n;
+	swzone = n * sizeof(struct swblk);
+	if (!uma_zone_reserve_kva(swpctrie_zone, n))
+		printf("Cannot reserve swap pctrie zone, "
+		    "reduce kern.maxswzone.\n");
+}
 
+static vm_object_t
+swap_pager_alloc_init(void *handle, struct ucred *cred, vm_ooffset_t size,
+    vm_ooffset_t offset)
+{
+	vm_object_t object;
+
+	if (cred != NULL) {
+		if (!swap_reserve_by_cred(size, cred))
+			return (NULL);
+		crhold(cred);
+	}
+
 	/*
-	 * Initialize our meta-data hash table.  The swapper does not need to
-	 * be quite as efficient as the VM system, so we do not use an
-	 * oversized hash table.
-	 *
-	 * 	n: 		size of hash table, must be power of 2
-	 *	swhash_mask:	hash table index mask
+	 * The un_pager.swp.swp_blks trie is initialized by
+	 * vm_object_allocate() to ensure the correct order of
+	 * visibility to other threads.
 	 */
-	for (n = 1; n < n2 / 8; n *= 2)
-		;
-	swhash = malloc(sizeof(struct swblock *) * n, M_VMPGDATA, M_WAITOK | M_ZERO);
-	swhash_mask = n - 1;
-	mtx_init(&swhash_mtx, "swap_pager swhash", NULL, MTX_DEF);
+	object = vm_object_allocate(OBJT_SWAP, OFF_TO_IDX(offset +
+	    PAGE_MASK + size));
+
+	object->handle = handle;
+	if (cred != NULL) {
+		object->cred = cred;
+		object->charge = size;
+	}
+	return (object);
 }
 
 /*
@@ -587,13 +592,11 @@
  *			its metadata structures.
  *
  *	This routine is called from the mmap and fork code to create a new
- *	OBJT_SWAP object.  We do this by creating an OBJT_DEFAULT object
- *	and then converting it with swp_pager_meta_build().
+ *	OBJT_SWAP object.
  *
- *	This routine may block in vm_object_allocate() and create a named
- *	object lookup race, so we must interlock.
- *
- * MPSAFE
+ *	This routine must ensure that no live duplicate is created for
+ *	the named object request, which is protected against by
+ *	holding the sw_alloc_sx lock in case handle != NULL.
  */
 static vm_object_t
 swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
@@ -600,11 +603,8 @@
     vm_ooffset_t offset, struct ucred *cred)
 {
 	vm_object_t object;
-	vm_pindex_t pindex;
 
-	pindex = OFF_TO_IDX(offset + PAGE_MASK + size);
-	if (handle) {
-		mtx_lock(&Giant);
+	if (handle != NULL) {
 		/*
 		 * Reference existing named region or allocate new one.  There
 		 * should not be a race here against swp_pager_meta_build()
@@ -614,40 +614,16 @@
 		sx_xlock(&sw_alloc_sx);
 		object = vm_pager_object_lookup(NOBJLIST(handle), handle);
 		if (object == NULL) {
-			if (cred != NULL) {
-				if (!swap_reserve_by_cred(size, cred)) {
-					sx_xunlock(&sw_alloc_sx);
-					mtx_unlock(&Giant);
-					return (NULL);
-				}
-				crhold(cred);
+			object = swap_pager_alloc_init(handle, cred, size,
+			    offset);
+			if (object != NULL) {
+				TAILQ_INSERT_TAIL(NOBJLIST(object->handle),
+				    object, pager_object_list);
 			}
-			object = vm_object_allocate(OBJT_DEFAULT, pindex);
-			VM_OBJECT_WLOCK(object);
-			object->handle = handle;
-			if (cred != NULL) {
-				object->cred = cred;
-				object->charge = size;
-			}
-			swp_pager_meta_build(object, 0, SWAPBLK_NONE);
-			VM_OBJECT_WUNLOCK(object);
 		}
 		sx_xunlock(&sw_alloc_sx);
-		mtx_unlock(&Giant);
 	} else {
-		if (cred != NULL) {
-			if (!swap_reserve_by_cred(size, cred))
-				return (NULL);
-			crhold(cred);
-		}
-		object = vm_object_allocate(OBJT_DEFAULT, pindex);
-		VM_OBJECT_WLOCK(object);
-		if (cred != NULL) {
-			object->cred = cred;
-			object->charge = size;
-		}
-		swp_pager_meta_build(object, 0, SWAPBLK_NONE);
-		VM_OBJECT_WUNLOCK(object);
+		object = swap_pager_alloc_init(handle, cred, size, offset);
 	}
 	return (object);
 }
@@ -666,17 +642,22 @@
 swap_pager_dealloc(vm_object_t object)
 {
 
+	VM_OBJECT_ASSERT_WLOCKED(object);
+	KASSERT((object->flags & OBJ_DEAD) != 0, ("dealloc of reachable obj"));
+
 	/*
 	 * Remove from list right away so lookups will fail if we block for
 	 * pageout completion.
 	 */
 	if (object->handle != NULL) {
-		mtx_lock(&sw_alloc_mtx);
-		TAILQ_REMOVE(NOBJLIST(object->handle), object, pager_object_list);
-		mtx_unlock(&sw_alloc_mtx);
+		VM_OBJECT_WUNLOCK(object);
+		sx_xlock(&sw_alloc_sx);
+		TAILQ_REMOVE(NOBJLIST(object->handle), object,
+		    pager_object_list);
+		sx_xunlock(&sw_alloc_sx);
+		VM_OBJECT_WLOCK(object);
 	}
 
-	VM_OBJECT_ASSERT_WLOCKED(object);
 	vm_object_pip_wait(object, "swpdea");
 
 	/*
@@ -763,11 +744,8 @@
 			mtx_unlock(&sw_dev_mtx);
 			if ((sp->sw_flags & SW_UNMAPPED) != 0 &&
 			    unmapped_buf_allowed) {
-				bp->b_kvaalloc = bp->b_data;
 				bp->b_data = unmapped_buf;
-				bp->b_kvabase = unmapped_buf;
 				bp->b_offset = 0;
-				bp->b_flags |= B_UNMAPPED;
 			} else {
 				pmap_qenter((vm_offset_t)bp->b_data,
 				    &bp->b_pages[0], bp->b_bcount / PAGE_SIZE);
@@ -815,6 +793,36 @@
 }
 
 /*
+ * SYSCTL_SWAP_FRAGMENTATION() -	produce raw swap space stats
+ */
+static int
+sysctl_swap_fragmentation(SYSCTL_HANDLER_ARGS)
+{
+	struct sbuf sbuf;
+	struct swdevt *sp;
+	const char *devname;
+	int error;
+
+	error = sysctl_wire_old_buffer(req, 0);
+	if (error != 0)
+		return (error);
+	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
+	mtx_lock(&sw_dev_mtx);
+	TAILQ_FOREACH(sp, &swtailq, sw_list) {
+		if (vn_isdisk(sp->sw_vp, NULL))
+			devname = devtoname(sp->sw_vp->v_rdev);
+		else
+			devname = "[file]";
+		sbuf_printf(&sbuf, "\nFree space on device %s:\n", devname);
+		blist_stats(sp->sw_blist, &sbuf);
+	}
+	mtx_unlock(&sw_dev_mtx);
+	error = sbuf_finish(&sbuf);
+	sbuf_delete(&sbuf);
+	return (error);
+}
+
+/*
  * SWAP_PAGER_FREESPACE() -	frees swap blocks associated with a page
  *				range within an object.
  *
@@ -906,16 +914,19 @@
 	 * If destroysource is set, we remove the source object from the
 	 * swap_pager internal queue now.
 	 */
-	if (destroysource) {
-		if (srcobject->handle != NULL) {
-			mtx_lock(&sw_alloc_mtx);
-			TAILQ_REMOVE(
-			    NOBJLIST(srcobject->handle),
-			    srcobject,
-			    pager_object_list
-			);
-			mtx_unlock(&sw_alloc_mtx);
-		}
+	if (destroysource && srcobject->handle != NULL) {
+		vm_object_pip_add(srcobject, 1);
+		VM_OBJECT_WUNLOCK(srcobject);
+		vm_object_pip_add(dstobject, 1);
+		VM_OBJECT_WUNLOCK(dstobject);
+		sx_xlock(&sw_alloc_sx);
+		TAILQ_REMOVE(NOBJLIST(srcobject->handle), srcobject,
+		    pager_object_list);
+		sx_xunlock(&sw_alloc_sx);
+		VM_OBJECT_WLOCK(dstobject);
+		vm_object_pip_wakeup(dstobject);
+		VM_OBJECT_WLOCK(srcobject);
+		vm_object_pip_wakeup(srcobject);
 	}
 
 	/*
@@ -970,7 +981,7 @@
 	/*
 	 * Free left over swap blocks in source.
 	 *
-	 * We have to revert the type to OBJT_DEFAULT so we do not accidently
+	 * We have to revert the type to OBJT_DEFAULT so we do not accidentally
 	 * double-remove the object from the swap queues.
 	 */
 	if (destroysource) {
@@ -993,22 +1004,21 @@
  *	page and return TRUE if it does, FALSE if it doesn't.
  *
  *	If TRUE, we also try to determine how much valid, contiguous backing
- *	store exists before and after the requested page within a reasonable
- *	distance.  We do not try to restrict it to the swap device stripe
- *	(that is handled in getpages/putpages).  It probably isn't worth
- *	doing here.
+ *	store exists before and after the requested page.
  */
 static boolean_t
-swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after)
+swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before,
+    int *after)
 {
-	daddr_t blk0;
+	daddr_t blk, blk0;
+	int i;
 
 	VM_OBJECT_ASSERT_LOCKED(object);
+
 	/*
 	 * do we have good backing store at the requested index ?
 	 */
 	blk0 = swp_pager_meta_ctl(object, pindex, 0);
-
 	if (blk0 == SWAPBLK_NONE) {
 		if (before)
 			*before = 0;
@@ -1021,11 +1031,7 @@
 	 * find backwards-looking contiguous good backing store
 	 */
 	if (before != NULL) {
-		int i;
-
-		for (i = 1; i < (SWB_NPAGES/2); ++i) {
-			daddr_t blk;
-
+		for (i = 1; i < SWB_NPAGES; i++) {
 			if (i > pindex)
 				break;
 			blk = swp_pager_meta_ctl(object, pindex - i, 0);
@@ -1032,7 +1038,7 @@
 			if (blk != blk0 - i)
 				break;
 		}
-		*before = (i - 1);
+		*before = i - 1;
 	}
 
 	/*
@@ -1039,16 +1045,12 @@
 	 * find forward-looking contiguous good backing store
 	 */
 	if (after != NULL) {
-		int i;
-
-		for (i = 1; i < (SWB_NPAGES/2); ++i) {
-			daddr_t blk;
-
+		for (i = 1; i < SWB_NPAGES; i++) {
 			blk = swp_pager_meta_ctl(object, pindex + i, 0);
 			if (blk != blk0 + i)
 				break;
 		}
-		*after = (i - 1);
+		*after = i - 1;
 	}
 	return (TRUE);
 }
@@ -1080,134 +1082,130 @@
 }
 
 /*
- * SWAP_PAGER_GETPAGES() - bring pages in from swap
+ * swap_pager_getpages() - bring pages in from swap
  *
- *	Attempt to retrieve (m, count) pages from backing store, but make
- *	sure we retrieve at least m[reqpage].  We try to load in as large
- *	a chunk surrounding m[reqpage] as is contiguous in swap and which
- *	belongs to the same object.
+ *	Attempt to page in the pages in array "ma" of length "count".  The
+ *	caller may optionally specify that additional pages preceding and
+ *	succeeding the specified range be paged in.  The number of such pages
+ *	is returned in the "rbehind" and "rahead" parameters, and they will
+ *	be in the inactive queue upon return.
  *
- *	The code is designed for asynchronous operation and
- *	immediate-notification of 'reqpage' but tends not to be
- *	used that way.  Please do not optimize-out this algorithmic
- *	feature, I intend to improve on it in the future.
- *
- *	The parent has a single vm_object_pip_add() reference prior to
- *	calling us and we should return with the same.
- *
- *	The parent has BUSY'd the pages.  We should return with 'm'
- *	left busy, but the others adjusted.
+ *	The pages in "ma" must be busied and will remain busied upon return.
  */
 static int
-swap_pager_getpages(vm_object_t object, vm_page_t *m, int count, int reqpage)
+swap_pager_getpages(vm_object_t object, vm_page_t *ma, int count, int *rbehind,
+    int *rahead)
 {
 	struct buf *bp;
-	vm_page_t mreq;
-	int i;
-	int j;
+	vm_page_t bm, mpred, msucc, p;
+	vm_pindex_t pindex;
 	daddr_t blk;
+	int i, maxahead, maxbehind, reqcount;
 
-	mreq = m[reqpage];
+	reqcount = count;
 
-	KASSERT(mreq->object == object,
-	    ("swap_pager_getpages: object mismatch %p/%p",
-	    object, mreq->object));
+	/*
+	 * Determine the final number of read-behind pages and
+	 * allocate them BEFORE releasing the object lock.  Otherwise,
+	 * there can be a problematic race with vm_object_split().
+	 * Specifically, vm_object_split() might first transfer pages
+	 * that precede ma[0] in the current object to a new object,
+	 * and then this function incorrectly recreates those pages as
+	 * read-behind pages in the current object.
+	 */
+	if (!swap_pager_haspage(object, ma[0]->pindex, &maxbehind, &maxahead))
+		return (VM_PAGER_FAIL);
 
 	/*
-	 * Calculate range to retrieve.  The pages have already been assigned
-	 * their swapblks.  We require a *contiguous* range but we know it to
-	 * not span devices.   If we do not supply it, bad things
-	 * happen.  Note that blk, iblk & jblk can be SWAPBLK_NONE, but the
-	 * loops are set up such that the case(s) are handled implicitly.
-	 *
-	 * The swp_*() calls must be made with the object locked.
+	 * Clip the readahead and readbehind ranges to exclude resident pages.
 	 */
-	blk = swp_pager_meta_ctl(mreq->object, mreq->pindex, 0);
-
-	for (i = reqpage - 1; i >= 0; --i) {
-		daddr_t iblk;
-
-		iblk = swp_pager_meta_ctl(m[i]->object, m[i]->pindex, 0);
-		if (blk != iblk + (reqpage - i))
-			break;
+	if (rahead != NULL) {
+		KASSERT(reqcount - 1 <= maxahead,
+		    ("page count %d extends beyond swap block", reqcount));
+		*rahead = imin(*rahead, maxahead - (reqcount - 1));
+		pindex = ma[reqcount - 1]->pindex;
+		msucc = TAILQ_NEXT(ma[reqcount - 1], listq);
+		if (msucc != NULL && msucc->pindex - pindex - 1 < *rahead)
+			*rahead = msucc->pindex - pindex - 1;
 	}
-	++i;
+	if (rbehind != NULL) {
+		*rbehind = imin(*rbehind, maxbehind);
+		pindex = ma[0]->pindex;
+		mpred = TAILQ_PREV(ma[0], pglist, listq);
+		if (mpred != NULL && pindex - mpred->pindex - 1 < *rbehind)
+			*rbehind = pindex - mpred->pindex - 1;
+	}
 
-	for (j = reqpage + 1; j < count; ++j) {
-		daddr_t jblk;
+	bm = ma[0];
+	for (i = 0; i < count; i++)
+		ma[i]->oflags |= VPO_SWAPINPROG;
 
-		jblk = swp_pager_meta_ctl(m[j]->object, m[j]->pindex, 0);
-		if (blk != jblk - (j - reqpage))
-			break;
-	}
-
 	/*
-	 * free pages outside our collection range.   Note: we never free
-	 * mreq, it must remain busy throughout.
+	 * Allocate readahead and readbehind pages.
 	 */
-	if (0 < i || j < count) {
-		int k;
-
-		for (k = 0; k < i; ++k)
-			swp_pager_free_nrpage(m[k]);
-		for (k = j; k < count; ++k)
-			swp_pager_free_nrpage(m[k]);
+	if (rbehind != NULL) {
+		for (i = 1; i <= *rbehind; i++) {
+			p = vm_page_alloc(object, ma[0]->pindex - i,
+			    VM_ALLOC_NORMAL);
+			if (p == NULL)
+				break;
+			p->oflags |= VPO_SWAPINPROG;
+			bm = p;
+		}
+		*rbehind = i - 1;
 	}
+	if (rahead != NULL) {
+		for (i = 0; i < *rahead; i++) {
+			p = vm_page_alloc(object,
+			    ma[reqcount - 1]->pindex + i + 1, VM_ALLOC_NORMAL);
+			if (p == NULL)
+				break;
+			p->oflags |= VPO_SWAPINPROG;
+		}
+		*rahead = i;
+	}
+	if (rbehind != NULL)
+		count += *rbehind;
+	if (rahead != NULL)
+		count += *rahead;
 
-	/*
-	 * Return VM_PAGER_FAIL if we have nothing to do.  Return mreq
-	 * still busy, but the others unbusied.
-	 */
-	if (blk == SWAPBLK_NONE)
-		return (VM_PAGER_FAIL);
+	vm_object_pip_add(object, count);
 
-	/*
-	 * Getpbuf() can sleep.
-	 */
+	pindex = bm->pindex;
+	blk = swp_pager_meta_ctl(object, pindex, 0);
+	KASSERT(blk != SWAPBLK_NONE,
+	    ("no swap blocking containing %p(%jx)", object, (uintmax_t)pindex));
+
 	VM_OBJECT_WUNLOCK(object);
-	/*
-	 * Get a swap buffer header to perform the IO
-	 */
 	bp = getpbuf(&nsw_rcount);
+	/* Pages cannot leave the object while busy. */
+	for (i = 0, p = bm; i < count; i++, p = TAILQ_NEXT(p, listq)) {
+		MPASS(p->pindex == bm->pindex + i);
+		bp->b_pages[i] = p;
+	}
+
 	bp->b_flags |= B_PAGING;
-
 	bp->b_iocmd = BIO_READ;
 	bp->b_iodone = swp_pager_async_iodone;
 	bp->b_rcred = crhold(thread0.td_ucred);
 	bp->b_wcred = crhold(thread0.td_ucred);
-	bp->b_blkno = blk - (reqpage - i);
-	bp->b_bcount = PAGE_SIZE * (j - i);
-	bp->b_bufsize = PAGE_SIZE * (j - i);
-	bp->b_pager.pg_reqpage = reqpage - i;
+	bp->b_blkno = blk;
+	bp->b_bcount = PAGE_SIZE * count;
+	bp->b_bufsize = PAGE_SIZE * count;
+	bp->b_npages = count;
+	bp->b_pgbefore = rbehind != NULL ? *rbehind : 0;
+	bp->b_pgafter = rahead != NULL ? *rahead : 0;
 
-	VM_OBJECT_WLOCK(object);
-	{
-		int k;
-
-		for (k = i; k < j; ++k) {
-			bp->b_pages[k - i] = m[k];
-			m[k]->oflags |= VPO_SWAPINPROG;
-		}
-	}
-	bp->b_npages = j - i;
-
 	PCPU_INC(cnt.v_swapin);
-	PCPU_ADD(cnt.v_swappgsin, bp->b_npages);
+	PCPU_ADD(cnt.v_swappgsin, count);
 
 	/*
-	 * We still hold the lock on mreq, and our automatic completion routine
-	 * does not remove it.
-	 */
-	vm_object_pip_add(object, bp->b_npages);
-	VM_OBJECT_WUNLOCK(object);
-
-	/*
 	 * perform the I/O.  NOTE!!!  bp cannot be considered valid after
 	 * this point because we automatically release it on completion.
 	 * Instead, we look at the one page we are interested in which we
 	 * still hold a lock on even through the I/O completion.
 	 *
-	 * The other pages in our m[] array are also released on completion,
+	 * The other pages in our ma[] array are also released on completion,
 	 * so we cannot assume they are valid anymore either.
 	 *
 	 * NOTE: b_blkno is destroyed by the call to swapdev_strategy
@@ -1216,13 +1214,13 @@
 	swp_pager_strategy(bp);
 
 	/*
-	 * wait for the page we want to complete.  VPO_SWAPINPROG is always
+	 * Wait for the pages we want to complete.  VPO_SWAPINPROG is always
 	 * cleared on completion.  If an I/O error occurs, SWAPBLK_NONE
-	 * is set in the meta-data.
+	 * is set in the metadata for each page in the request.
 	 */
 	VM_OBJECT_WLOCK(object);
-	while ((mreq->oflags & VPO_SWAPINPROG) != 0) {
-		mreq->oflags |= VPO_SWAPSLEEP;
+	while ((ma[0]->oflags & VPO_SWAPINPROG) != 0) {
+		ma[0]->oflags |= VPO_SWAPSLEEP;
 		PCPU_INC(cnt.v_intrans);
 		if (VM_OBJECT_SLEEP(object, &object->paging_in_progress, PSWP,
 		    "swread", hz * 20)) {
@@ -1233,16 +1231,14 @@
 	}
 
 	/*
-	 * mreq is left busied after completion, but all the other pages
-	 * are freed.  If we had an unrecoverable read error the page will
-	 * not be valid.
+	 * If we had an unrecoverable read error pages will not be valid.
 	 */
-	if (mreq->valid != VM_PAGE_BITS_ALL) {
-		return (VM_PAGER_ERROR);
-	} else {
-		return (VM_PAGER_OK);
-	}
+	for (i = 0; i < reqcount; i++)
+		if (ma[i]->valid != VM_PAGE_BITS_ALL)
+			return (VM_PAGER_ERROR);
 
+	return (VM_PAGER_OK);
+
 	/*
 	 * A final note: in a low swap situation, we cannot deallocate swap
 	 * and mark a page dirty here because the caller is likely to mark
@@ -1252,6 +1248,39 @@
 }
 
 /*
+ * 	swap_pager_getpages_async():
+ *
+ *	Right now this is emulation of asynchronous operation on top of
+ *	swap_pager_getpages().
+ */
+static int
+swap_pager_getpages_async(vm_object_t object, vm_page_t *ma, int count,
+    int *rbehind, int *rahead, pgo_getpages_iodone_t iodone, void *arg)
+{
+	int r, error;
+
+	r = swap_pager_getpages(object, ma, count, rbehind, rahead);
+	VM_OBJECT_WUNLOCK(object);
+	switch (r) {
+	case VM_PAGER_OK:
+		error = 0;
+		break;
+	case VM_PAGER_ERROR:
+		error = EIO;
+		break;
+	case VM_PAGER_FAIL:
+		error = EINVAL;
+		break;
+	default:
+		panic("unhandled swap_pager_getpages() error %d", r);
+	}
+	(iodone)(arg, ma, count, error);
+	VM_OBJECT_WLOCK(object);
+
+	return (r);
+}
+
+/*
  *	swap_pager_putpages:
  *
  *	Assign swap (if necessary) and initiate I/O on the specified pages.
@@ -1273,17 +1302,17 @@
  *	those whos rtvals[] entry is not set to VM_PAGER_PEND on return.
  *	We need to unbusy the rest on I/O completion.
  */
-void
-swap_pager_putpages(vm_object_t object, vm_page_t *m, int count,
+static void
+swap_pager_putpages(vm_object_t object, vm_page_t *ma, int count,
     int flags, int *rtvals)
 {
 	int i, n;
 	boolean_t sync;
 
-	if (count && m[0]->object != object) {
+	if (count && ma[0]->object != object) {
 		panic("swap_pager_putpages: object mismatch %p/%p",
 		    object,
-		    m[0]->object
+		    ma[0]->object
 		);
 	}
 
@@ -1307,39 +1336,6 @@
 	/*
 	 * Step 2
 	 *
-	 * Update nsw parameters from swap_async_max sysctl values.
-	 * Do not let the sysop crash the machine with bogus numbers.
-	 */
-	mtx_lock(&pbuf_mtx);
-	if (swap_async_max != nsw_wcount_async_max) {
-		int n;
-
-		/*
-		 * limit range
-		 */
-		if ((n = swap_async_max) > nswbuf / 2)
-			n = nswbuf / 2;
-		if (n < 1)
-			n = 1;
-		swap_async_max = n;
-
-		/*
-		 * Adjust difference ( if possible ).  If the current async
-		 * count is too low, we may not be able to make the adjustment
-		 * at this time.
-		 */
-		n -= nsw_wcount_async_max;
-		if (nsw_wcount_async + n >= 0) {
-			nsw_wcount_async += n;
-			nsw_wcount_async_max += n;
-			wakeup(&nsw_wcount_async);
-		}
-	}
-	mtx_unlock(&pbuf_mtx);
-
-	/*
-	 * Step 3
-	 *
 	 * Assign swap blocks and issue I/O.  We reallocate swap on the fly.
 	 * The page is left dirty until the pageout operation completes
 	 * successfully.
@@ -1394,7 +1390,7 @@
 
 		VM_OBJECT_WLOCK(object);
 		for (j = 0; j < n; ++j) {
-			vm_page_t mreq = m[i+j];
+			vm_page_t mreq = ma[i+j];
 
 			swp_pager_meta_build(
 			    mreq->object,
@@ -1402,8 +1398,6 @@
 			    blk + j
 			);
 			MPASS(mreq->dirty == VM_PAGE_BITS_ALL);
-			rtvals[i+j] = VM_PAGER_OK;
-
 			mreq->oflags |= VPO_SWAPINPROG;
 			bp->b_pages[j] = mreq;
 		}
@@ -1419,6 +1413,16 @@
 		PCPU_ADD(cnt.v_swappgsout, bp->b_npages);
 
 		/*
+		 * We unconditionally set rtvals[] to VM_PAGER_PEND so that we
+		 * can call the async completion routine at the end of a
+		 * synchronous I/O operation.  Otherwise, our caller would
+		 * perform duplicate unbusy and wakeup operations on the page
+		 * and object, respectively.
+		 */
+		for (j = 0; j < n; j++)
+			rtvals[i + j] = VM_PAGER_PEND;
+
+		/*
 		 * asynchronous
 		 *
 		 * NOTE: b_blkno is destroyed by the call to swapdev_strategy
@@ -1427,10 +1431,6 @@
 			bp->b_iodone = swp_pager_async_iodone;
 			BUF_KERNPROC(bp);
 			swp_pager_strategy(bp);
-
-			for (j = 0; j < n; ++j)
-				rtvals[i+j] = VM_PAGER_PEND;
-			/* restart outter loop */
 			continue;
 		}
 
@@ -1443,14 +1443,10 @@
 		swp_pager_strategy(bp);
 
 		/*
-		 * Wait for the sync I/O to complete, then update rtvals.
-		 * We just set the rtvals[] to VM_PAGER_PEND so we can call
-		 * our async completion routine at the end, thus avoiding a
-		 * double-free.
+		 * Wait for the sync I/O to complete.
 		 */
 		bwait(bp, PVM, "swwrt");
-		for (j = 0; j < n; ++j)
-			rtvals[i+j] = VM_PAGER_PEND;
+
 		/*
 		 * Now that we are through with the bp, we can call the
 		 * normal async completion, which frees everything up.
@@ -1491,12 +1487,10 @@
 	/*
 	 * remove the mapping for kernel virtual
 	 */
-	if ((bp->b_flags & B_UNMAPPED) != 0) {
-		bp->b_data = bp->b_kvaalloc;
-		bp->b_kvabase = bp->b_kvaalloc;
-		bp->b_flags &= ~B_UNMAPPED;
-	} else
+	if (buf_mapped(bp))
 		pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages);
+	else
+		bp->b_data = bp->b_kvabase;
 
 	if (bp->b_npages) {
 		object = bp->b_pages[0]->object;
@@ -1529,33 +1523,11 @@
 			 */
 			if (bp->b_iocmd == BIO_READ) {
 				/*
-				 * When reading, reqpage needs to stay
-				 * locked for the parent, but all other
-				 * pages can be freed.  We still want to
-				 * wakeup the parent waiting on the page,
-				 * though.  ( also: pg_reqpage can be -1 and
-				 * not match anything ).
-				 *
-				 * We have to wake specifically requested pages
-				 * up too because we cleared VPO_SWAPINPROG and
-				 * someone may be waiting for that.
-				 *
 				 * NOTE: for reads, m->dirty will probably
 				 * be overridden by the original caller of
 				 * getpages so don't play cute tricks here.
 				 */
 				m->valid = 0;
-				if (i != bp->b_pager.pg_reqpage)
-					swp_pager_free_nrpage(m);
-				else {
-					vm_page_lock(m);
-					vm_page_flash(m);
-					vm_page_unlock(m);
-				}
-				/*
-				 * If i == bp->b_pager.pg_reqpage, do not wake
-				 * the page up.  The caller needs to.
-				 */
 			} else {
 				/*
 				 * If a write error occurs, reactivate page
@@ -1562,7 +1534,7 @@
 				 * so it doesn't clog the inactive list,
 				 * then finish the I/O.
 				 */
-				vm_page_dirty(m);
+				MPASS(m->dirty == VM_PAGE_BITS_ALL);
 				vm_page_lock(m);
 				vm_page_activate(m);
 				vm_page_unlock(m);
@@ -1577,54 +1549,33 @@
 			 * want to do that anyway, but it was an optimization
 			 * that existed in the old swapper for a time before
 			 * it got ripped out due to precisely this problem.
-			 *
-			 * If not the requested page then deactivate it.
-			 *
-			 * Note that the requested page, reqpage, is left
-			 * busied, but we still have to wake it up.  The
-			 * other pages are released (unbusied) by
-			 * vm_page_xunbusy().
 			 */
 			KASSERT(!pmap_page_is_mapped(m),
 			    ("swp_pager_async_iodone: page %p is mapped", m));
-			m->valid = VM_PAGE_BITS_ALL;
 			KASSERT(m->dirty == 0,
 			    ("swp_pager_async_iodone: page %p is dirty", m));
 
-			/*
-			 * We have to wake specifically requested pages
-			 * up too because we cleared VPO_SWAPINPROG and
-			 * could be waiting for it in getpages.  However,
-			 * be sure to not unbusy getpages specifically
-			 * requested page - getpages expects it to be
-			 * left busy.
-			 */
-			if (i != bp->b_pager.pg_reqpage) {
-				vm_page_lock(m);
-				vm_page_deactivate(m);
-				vm_page_unlock(m);
-				vm_page_xunbusy(m);
-			} else {
-				vm_page_lock(m);
-				vm_page_flash(m);
-				vm_page_unlock(m);
-			}
+			m->valid = VM_PAGE_BITS_ALL;
+			if (i < bp->b_pgbefore ||
+			    i >= bp->b_npages - bp->b_pgafter)
+				vm_page_readahead_finish(m);
 		} else {
 			/*
 			 * For write success, clear the dirty
 			 * status, then finish the I/O ( which decrements the
 			 * busy count and possibly wakes waiter's up ).
+			 * A page is only written to swap after a period of
+			 * inactivity.  Therefore, we do not expect it to be
+			 * reused.
 			 */
 			KASSERT(!pmap_page_is_write_mapped(m),
 			    ("swp_pager_async_iodone: page %p is not write"
 			    " protected", m));
 			vm_page_undirty(m);
+			vm_page_lock(m);
+			vm_page_deactivate_noreuse(m);
+			vm_page_unlock(m);
 			vm_page_sunbusy(m);
-			if (vm_page_count_severe()) {
-				vm_page_lock(m);
-				vm_page_try_to_cache(m);
-				vm_page_unlock(m);
-			}
 		}
 	}
 
@@ -1661,51 +1612,17 @@
 }
 
 /*
- *	swap_pager_isswapped:
+ * SWP_PAGER_FORCE_PAGEIN() - force a swap block to be paged in
  *
- *	Return 1 if at least one page in the given object is paged
- *	out to the given swap device.
+ *	This routine dissociates the page at the given index within an object
+ *	from its backing store, paging it in if it does not reside in memory.
+ *	If the page is paged in, it is marked dirty and placed in the laundry
+ *	queue.  The page is marked dirty because it no longer has backing
+ *	store.  It is placed in the laundry queue because it has not been
+ *	accessed recently.  Otherwise, it would already reside in memory.
  *
- *	This routine may not sleep.
- */
-int
-swap_pager_isswapped(vm_object_t object, struct swdevt *sp)
-{
-	daddr_t index = 0;
-	int bcount;
-	int i;
-
-	VM_OBJECT_ASSERT_WLOCKED(object);
-	if (object->type != OBJT_SWAP)
-		return (0);
-
-	mtx_lock(&swhash_mtx);
-	for (bcount = 0; bcount < object->un_pager.swp.swp_bcount; bcount++) {
-		struct swblock *swap;
-
-		if ((swap = *swp_pager_hash(object, index)) != NULL) {
-			for (i = 0; i < SWAP_META_PAGES; ++i) {
-				if (swp_pager_isondev(swap->swb_pages[i], sp)) {
-					mtx_unlock(&swhash_mtx);
-					return (1);
-				}
-			}
-		}
-		index += SWAP_META_PAGES;
-	}
-	mtx_unlock(&swhash_mtx);
-	return (0);
-}
-
-/*
- * SWP_PAGER_FORCE_PAGEIN() - force a swap block to be paged in
- *
- *	This routine dissociates the page at the given index within a
- *	swap block from its backing store, paging it in if necessary.
- *	If the page is paged in, it is placed in the inactive queue,
- *	since it had its backing store ripped out from under it.
- *	We also attempt to swap in all other pages in the swap block,
- *	we only guarantee that the one at the specified index is
+ *	We also attempt to swap in all other pages in the swap block.
+ *	However, we only guarantee that the one at the specified index is
  *	paged in.
  *
  *	XXX - The code to page the whole block in doesn't work, so we
@@ -1719,7 +1636,7 @@
 	vm_object_pip_add(object, 1);
 	m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL);
 	if (m->valid == VM_PAGE_BITS_ALL) {
-		vm_object_pip_subtract(object, 1);
+		vm_object_pip_wakeup(object);
 		vm_page_dirty(m);
 		vm_page_lock(m);
 		vm_page_activate(m);
@@ -1729,12 +1646,12 @@
 		return;
 	}
 
-	if (swap_pager_getpages(object, &m, 1, 0) != VM_PAGER_OK)
+	if (swap_pager_getpages(object, &m, 1, NULL, NULL) != VM_PAGER_OK)
 		panic("swap_pager_force_pagein: read from swap failed");/*XXX*/
-	vm_object_pip_subtract(object, 1);
+	vm_object_pip_wakeup(object);
 	vm_page_dirty(m);
 	vm_page_lock(m);
-	vm_page_deactivate(m);
+	vm_page_launder(m);
 	vm_page_unlock(m);
 	vm_page_xunbusy(m);
 	vm_pager_page_unswapped(m);
@@ -1753,50 +1670,56 @@
 static void
 swap_pager_swapoff(struct swdevt *sp)
 {
-	struct swblock *swap;
-	vm_object_t locked_obj, object;
-	vm_pindex_t pindex;
-	int i, j, retries;
+	struct swblk *sb;
+	vm_object_t object;
+	vm_pindex_t pi;
+	int i, retries;
 
-	GIANT_REQUIRED;
+	sx_assert(&swdev_syscall_lock, SA_XLOCKED);
 
 	retries = 0;
-	locked_obj = NULL;
 full_rescan:
-	mtx_lock(&swhash_mtx);
-	for (i = 0; i <= swhash_mask; i++) { /* '<=' is correct here */
-restart:
-		for (swap = swhash[i]; swap != NULL; swap = swap->swb_hnext) {
-			object = swap->swb_object;
-			pindex = swap->swb_index;
-			for (j = 0; j < SWAP_META_PAGES; ++j) {
-				if (!swp_pager_isondev(swap->swb_pages[j], sp))
+	mtx_lock(&vm_object_list_mtx);
+	TAILQ_FOREACH(object, &vm_object_list, object_list) {
+		if (object->type != OBJT_SWAP)
+			continue;
+		mtx_unlock(&vm_object_list_mtx);
+		/* Depends on type-stability. */
+		VM_OBJECT_WLOCK(object);
+
+		/*
+		 * Dead objects are eventually terminated on their own.
+		 */
+		if ((object->flags & OBJ_DEAD) != 0)
+			goto next_obj;
+
+		/*
+		 * Sync with fences placed after pctrie
+		 * initialization.  We must not access pctrie below
+		 * unless we checked that our object is swap and not
+		 * dead.
+		 */
+		atomic_thread_fence_acq();
+		if (object->type != OBJT_SWAP)
+			goto next_obj;
+
+		for (pi = 0; (sb = SWAP_PCTRIE_LOOKUP_GE(
+		    &object->un_pager.swp.swp_blks, pi)) != NULL; ) {
+			pi = sb->p + SWAP_META_PAGES;
+			for (i = 0; i < SWAP_META_PAGES; i++) {
+				if (sb->d[i] == SWAPBLK_NONE)
 					continue;
-				if (locked_obj != object) {
-					if (locked_obj != NULL)
-						VM_OBJECT_WUNLOCK(locked_obj);
-					locked_obj = object;
-					if (!VM_OBJECT_TRYWLOCK(object)) {
-						mtx_unlock(&swhash_mtx);
-						/* Depends on type-stability. */
-						VM_OBJECT_WLOCK(object);
-						mtx_lock(&swhash_mtx);
-						goto restart;
-					}
-				}
-				MPASS(locked_obj == object);
-				mtx_unlock(&swhash_mtx);
-				swp_pager_force_pagein(object, pindex + j);
-				mtx_lock(&swhash_mtx);
-				goto restart;
+				if (swp_pager_isondev(sb->d[i], sp))
+					swp_pager_force_pagein(object,
+					    sb->p + i);
 			}
 		}
+next_obj:
+		VM_OBJECT_WUNLOCK(object);
+		mtx_lock(&vm_object_list_mtx);
 	}
-	mtx_unlock(&swhash_mtx);
-	if (locked_obj != NULL) {
-		VM_OBJECT_WUNLOCK(locked_obj);
-		locked_obj = NULL;
-	}
+	mtx_unlock(&vm_object_list_mtx);
+
 	if (sp->sw_used) {
 		/*
 		 * Objects may be locked or paging to the device being
@@ -1839,94 +1762,120 @@
 static void
 swp_pager_meta_build(vm_object_t object, vm_pindex_t pindex, daddr_t swapblk)
 {
-	static volatile int exhausted;
-	struct swblock *swap;
-	struct swblock **pswap;
-	int idx;
+	static volatile int swblk_zone_exhausted, swpctrie_zone_exhausted;
+	struct swblk *sb, *sb1;
+	vm_pindex_t modpi, rdpi;
+	int error, i;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
+
 	/*
 	 * Convert default object to swap object if necessary
 	 */
 	if (object->type != OBJT_SWAP) {
+		pctrie_init(&object->un_pager.swp.swp_blks);
+
+		/*
+		 * Ensure that swap_pager_swapoff()'s iteration over
+		 * object_list does not see a garbage pctrie.
+		 */
+		atomic_thread_fence_rel();
+
 		object->type = OBJT_SWAP;
-		object->un_pager.swp.swp_bcount = 0;
-
-		if (object->handle != NULL) {
-			mtx_lock(&sw_alloc_mtx);
-			TAILQ_INSERT_TAIL(
-			    NOBJLIST(object->handle),
-			    object,
-			    pager_object_list
-			);
-			mtx_unlock(&sw_alloc_mtx);
-		}
+		KASSERT(object->handle == NULL, ("default pager with handle"));
 	}
 
-	/*
-	 * Locate hash entry.  If not found create, but if we aren't adding
-	 * anything just return.  If we run out of space in the map we wait
-	 * and, since the hash table may have changed, retry.
-	 */
-retry:
-	mtx_lock(&swhash_mtx);
-	pswap = swp_pager_hash(object, pindex);
-
-	if ((swap = *pswap) == NULL) {
-		int i;
-
+	rdpi = rounddown(pindex, SWAP_META_PAGES);
+	sb = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks, rdpi);
+	if (sb == NULL) {
 		if (swapblk == SWAPBLK_NONE)
-			goto done;
-
-		swap = *pswap = uma_zalloc(swap_zone, M_NOWAIT |
-		    (curproc == pageproc ? M_USE_RESERVE : 0));
-		if (swap == NULL) {
-			mtx_unlock(&swhash_mtx);
+			return;
+		for (;;) {
+			sb = uma_zalloc(swblk_zone, M_NOWAIT | (curproc ==
+			    pageproc ? M_USE_RESERVE : 0));
+			if (sb != NULL) {
+				sb->p = rdpi;
+				for (i = 0; i < SWAP_META_PAGES; i++)
+					sb->d[i] = SWAPBLK_NONE;
+				if (atomic_cmpset_int(&swblk_zone_exhausted,
+				    1, 0))
+					printf("swblk zone ok\n");
+				break;
+			}
 			VM_OBJECT_WUNLOCK(object);
-			if (uma_zone_exhausted(swap_zone)) {
-				if (atomic_cmpset_int(&exhausted, 0, 1))
-					printf("swap zone exhausted, "
+			if (uma_zone_exhausted(swblk_zone)) {
+				if (atomic_cmpset_int(&swblk_zone_exhausted,
+				    0, 1))
+					printf("swap blk zone exhausted, "
 					    "increase kern.maxswzone\n");
 				vm_pageout_oom(VM_OOM_SWAPZ);
-				pause("swzonex", 10);
+				pause("swzonxb", 10);
 			} else
-				VM_WAIT;
+				uma_zwait(swblk_zone);
 			VM_OBJECT_WLOCK(object);
-			goto retry;
+			sb = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks,
+			    rdpi);
+			if (sb != NULL)
+				/*
+				 * Somebody swapped out a nearby page,
+				 * allocating swblk at the rdpi index,
+				 * while we dropped the object lock.
+				 */
+				goto allocated;
 		}
+		for (;;) {
+			error = SWAP_PCTRIE_INSERT(
+			    &object->un_pager.swp.swp_blks, sb);
+			if (error == 0) {
+				if (atomic_cmpset_int(&swpctrie_zone_exhausted,
+				    1, 0))
+					printf("swpctrie zone ok\n");
+				break;
+			}
+			VM_OBJECT_WUNLOCK(object);
+			if (uma_zone_exhausted(swpctrie_zone)) {
+				if (atomic_cmpset_int(&swpctrie_zone_exhausted,
+				    0, 1))
+					printf("swap pctrie zone exhausted, "
+					    "increase kern.maxswzone\n");
+				vm_pageout_oom(VM_OOM_SWAPZ);
+				pause("swzonxp", 10);
+			} else
+				uma_zwait(swpctrie_zone);
+			VM_OBJECT_WLOCK(object);
+			sb1 = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks,
+			    rdpi);
+			if (sb1 != NULL) {
+				uma_zfree(swblk_zone, sb);
+				sb = sb1;
+				goto allocated;
+			}
+		}
+	}
+allocated:
+	MPASS(sb->p == rdpi);
 
-		if (atomic_cmpset_int(&exhausted, 1, 0))
-			printf("swap zone ok\n");
+	modpi = pindex % SWAP_META_PAGES;
+	/* Delete prior contents of metadata. */
+	if (sb->d[modpi] != SWAPBLK_NONE)
+		swp_pager_freeswapspace(sb->d[modpi], 1);
+	/* Enter block into metadata. */
+	sb->d[modpi] = swapblk;
 
-		swap->swb_hnext = NULL;
-		swap->swb_object = object;
-		swap->swb_index = pindex & ~(vm_pindex_t)SWAP_META_MASK;
-		swap->swb_count = 0;
-
-		++object->un_pager.swp.swp_bcount;
-
-		for (i = 0; i < SWAP_META_PAGES; ++i)
-			swap->swb_pages[i] = SWAPBLK_NONE;
-	}
-
 	/*
-	 * Delete prior contents of metadata
+	 * Free the swblk if we end up with an empty page run.
 	 */
-	idx = pindex & SWAP_META_MASK;
-
-	if (swap->swb_pages[idx] != SWAPBLK_NONE) {
-		swp_pager_freeswapspace(swap->swb_pages[idx], 1);
-		--swap->swb_count;
+	if (swapblk == SWAPBLK_NONE) {
+		for (i = 0; i < SWAP_META_PAGES; i++) {
+			if (sb->d[i] != SWAPBLK_NONE)
+				break;
+		}
+		if (i == SWAP_META_PAGES) {
+			SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks,
+			    rdpi);
+			uma_zfree(swblk_zone, sb);
+		}
 	}
-
-	/*
-	 * Enter block into metadata
-	 */
-	swap->swb_pages[idx] = swapblk;
-	if (swapblk != SWAPBLK_NONE)
-		++swap->swb_count;
-done:
-	mtx_unlock(&swhash_mtx);
 }
 
 /*
@@ -1940,41 +1889,39 @@
  *	with resident pages.
  */
 static void
-swp_pager_meta_free(vm_object_t object, vm_pindex_t index, daddr_t count)
+swp_pager_meta_free(vm_object_t object, vm_pindex_t pindex, vm_pindex_t count)
 {
+	struct swblk *sb;
+	vm_pindex_t last;
+	int i;
+	bool empty;
 
-	VM_OBJECT_ASSERT_LOCKED(object);
-	if (object->type != OBJT_SWAP)
+	VM_OBJECT_ASSERT_WLOCKED(object);
+	if (object->type != OBJT_SWAP || count == 0)
 		return;
 
-	while (count > 0) {
-		struct swblock **pswap;
-		struct swblock *swap;
-
-		mtx_lock(&swhash_mtx);
-		pswap = swp_pager_hash(object, index);
-
-		if ((swap = *pswap) != NULL) {
-			daddr_t v = swap->swb_pages[index & SWAP_META_MASK];
-
-			if (v != SWAPBLK_NONE) {
-				swp_pager_freeswapspace(v, 1);
-				swap->swb_pages[index & SWAP_META_MASK] =
-					SWAPBLK_NONE;
-				if (--swap->swb_count == 0) {
-					*pswap = swap->swb_hnext;
-					uma_zfree(swap_zone, swap);
-					--object->un_pager.swp.swp_bcount;
-				}
-			}
-			--count;
-			++index;
-		} else {
-			int n = SWAP_META_PAGES - (index & SWAP_META_MASK);
-			count -= n;
-			index += n;
+	last = pindex + count - 1;
+	for (;;) {
+		sb = SWAP_PCTRIE_LOOKUP_GE(&object->un_pager.swp.swp_blks,
+		    rounddown(pindex, SWAP_META_PAGES));
+		if (sb == NULL || sb->p > last)
+			break;
+		empty = true;
+		for (i = 0; i < SWAP_META_PAGES; i++) {
+			if (sb->d[i] == SWAPBLK_NONE)
+				continue;
+			if (pindex <= sb->p + i && sb->p + i <= last) {
+				swp_pager_freeswapspace(sb->d[i], 1);
+				sb->d[i] = SWAPBLK_NONE;
+			} else
+				empty = false;
 		}
-		mtx_unlock(&swhash_mtx);
+		pindex = sb->p + SWAP_META_PAGES;
+		if (empty) {
+			SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks,
+			    sb->p);
+			uma_zfree(swblk_zone, sb);
+		}
 	}
 }
 
@@ -1987,9 +1934,8 @@
 static void
 swp_pager_meta_free_all(vm_object_t object)
 {
-	struct swblock **pswap, *swap;
-	vm_pindex_t index;
-	daddr_t v;
+	struct swblk *sb;
+	vm_pindex_t pindex;
 	int i;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
@@ -1996,27 +1942,15 @@
 	if (object->type != OBJT_SWAP)
 		return;
 
-	index = 0;
-	while (object->un_pager.swp.swp_bcount != 0) {
-		mtx_lock(&swhash_mtx);
-		pswap = swp_pager_hash(object, index);
-		if ((swap = *pswap) != NULL) {
-			for (i = 0; i < SWAP_META_PAGES; ++i) {
-				v = swap->swb_pages[i];
-				if (v != SWAPBLK_NONE) {
-					--swap->swb_count;
-					swp_pager_freeswapspace(v, 1);
-				}
-			}
-			if (swap->swb_count != 0)
-				panic(
-				    "swap_pager_meta_free_all: swb_count != 0");
-			*pswap = swap->swb_hnext;
-			uma_zfree(swap_zone, swap);
-			--object->un_pager.swp.swp_bcount;
+	for (pindex = 0; (sb = SWAP_PCTRIE_LOOKUP_GE(
+	    &object->un_pager.swp.swp_blks, pindex)) != NULL;) {
+		pindex = sb->p + SWAP_META_PAGES;
+		for (i = 0; i < SWAP_META_PAGES; i++) {
+			if (sb->d[i] != SWAPBLK_NONE)
+				swp_pager_freeswapspace(sb->d[i], 1);
 		}
-		mtx_unlock(&swhash_mtx);
-		index += SWAP_META_PAGES;
+		SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks, sb->p);
+		uma_zfree(swblk_zone, sb);
 	}
 }
 
@@ -2030,9 +1964,6 @@
  *	was invalid.  This routine will automatically free any invalid
  *	meta-data swapblks.
  *
- *	It is not possible to store invalid swapblks in the swap meta data
- *	(other then a literal 'SWAPBLK_NONE'), so we don't bother checking.
- *
  *	When acting on a busy resident page and paging is in progress, we
  *	have to wait until paging is complete but otherwise can act on the
  *	busy page.
@@ -2043,44 +1974,90 @@
 static daddr_t
 swp_pager_meta_ctl(vm_object_t object, vm_pindex_t pindex, int flags)
 {
-	struct swblock **pswap;
-	struct swblock *swap;
+	struct swblk *sb;
 	daddr_t r1;
-	int idx;
+	int i;
 
-	VM_OBJECT_ASSERT_LOCKED(object);
+	if ((flags & (SWM_FREE | SWM_POP)) != 0)
+		VM_OBJECT_ASSERT_WLOCKED(object);
+	else
+		VM_OBJECT_ASSERT_LOCKED(object);
+
 	/*
-	 * The meta data only exists of the object is OBJT_SWAP
+	 * The meta data only exists if the object is OBJT_SWAP
 	 * and even then might not be allocated yet.
 	 */
 	if (object->type != OBJT_SWAP)
 		return (SWAPBLK_NONE);
 
-	r1 = SWAPBLK_NONE;
-	mtx_lock(&swhash_mtx);
-	pswap = swp_pager_hash(object, pindex);
+	sb = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks,
+	    rounddown(pindex, SWAP_META_PAGES));
+	if (sb == NULL)
+		return (SWAPBLK_NONE);
+	r1 = sb->d[pindex % SWAP_META_PAGES];
+	if (r1 == SWAPBLK_NONE)
+		return (SWAPBLK_NONE);
+	if ((flags & (SWM_FREE | SWM_POP)) != 0) {
+		sb->d[pindex % SWAP_META_PAGES] = SWAPBLK_NONE;
+		for (i = 0; i < SWAP_META_PAGES; i++) {
+			if (sb->d[i] != SWAPBLK_NONE)
+				break;
+		}
+		if (i == SWAP_META_PAGES) {
+			SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks,
+			    rounddown(pindex, SWAP_META_PAGES));
+			uma_zfree(swblk_zone, sb);
+		}
+	}
+	if ((flags & SWM_FREE) != 0) {
+		swp_pager_freeswapspace(r1, 1);
+		r1 = SWAPBLK_NONE;
+	}
+	return (r1);
+}
 
-	if ((swap = *pswap) != NULL) {
-		idx = pindex & SWAP_META_MASK;
-		r1 = swap->swb_pages[idx];
+/*
+ * Returns the least page index which is greater than or equal to the
+ * parameter pindex and for which there is a swap block allocated.
+ * Returns object's size if the object's type is not swap or if there
+ * are no allocated swap blocks for the object after the requested
+ * pindex.
+ */
+vm_pindex_t
+swap_pager_find_least(vm_object_t object, vm_pindex_t pindex)
+{
+	struct swblk *sb;
+	int i;
 
-		if (r1 != SWAPBLK_NONE) {
-			if (flags & SWM_FREE) {
-				swp_pager_freeswapspace(r1, 1);
-				r1 = SWAPBLK_NONE;
-			}
-			if (flags & (SWM_FREE|SWM_POP)) {
-				swap->swb_pages[idx] = SWAPBLK_NONE;
-				if (--swap->swb_count == 0) {
-					*pswap = swap->swb_hnext;
-					uma_zfree(swap_zone, swap);
-					--object->un_pager.swp.swp_bcount;
-				}
-			}
+	VM_OBJECT_ASSERT_LOCKED(object);
+	if (object->type != OBJT_SWAP)
+		return (object->size);
+
+	sb = SWAP_PCTRIE_LOOKUP_GE(&object->un_pager.swp.swp_blks,
+	    rounddown(pindex, SWAP_META_PAGES));
+	if (sb == NULL)
+		return (object->size);
+	if (sb->p < pindex) {
+		for (i = pindex % SWAP_META_PAGES; i < SWAP_META_PAGES; i++) {
+			if (sb->d[i] != SWAPBLK_NONE)
+				return (sb->p + i);
 		}
+		sb = SWAP_PCTRIE_LOOKUP_GE(&object->un_pager.swp.swp_blks,
+		    roundup(pindex, SWAP_META_PAGES));
+		if (sb == NULL)
+			return (object->size);
 	}
-	mtx_unlock(&swhash_mtx);
-	return (r1);
+	for (i = 0; i < SWAP_META_PAGES; i++) {
+		if (sb->d[i] != SWAPBLK_NONE)
+			return (sb->p + i);
+	}
+
+	/*
+	 * We get here if a swblk is present in the trie but it
+	 * doesn't map any blocks.
+	 */
+	MPASS(0);
+	return (object->size);
 }
 
 /*
@@ -2110,16 +2087,13 @@
 	if (error)
 		return (error);
 
-	mtx_lock(&Giant);
-	while (swdev_syscall_active)
-	    tsleep(&swdev_syscall_active, PUSER - 1, "swpon", 0);
-	swdev_syscall_active = 1;
+	sx_xlock(&swdev_syscall_lock);
 
 	/*
 	 * Swap metadata may not fit in the KVM if we have physical
 	 * memory of >1GB.
 	 */
-	if (swap_zone == NULL) {
+	if (swblk_zone == NULL) {
 		error = ENOMEM;
 		goto done;
 	}
@@ -2134,7 +2108,7 @@
 	vp = nd.ni_vp;
 
 	if (vn_isdisk(vp, &error)) {
-		error = swapongeom(td, vp);
+		error = swapongeom(vp);
 	} else if (vp->v_type == VREG &&
 	    (vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 &&
 	    (error = VOP_GETATTR(vp, &attr, td->td_ucred)) == 0) {
@@ -2148,9 +2122,7 @@
 	if (error)
 		vrele(vp);
 done:
-	swdev_syscall_active = 0;
-	wakeup_one(&swdev_syscall_active);
-	mtx_unlock(&Giant);
+	sx_xunlock(&swdev_syscall_lock);
 	return (error);
 }
 
@@ -2157,15 +2129,16 @@
 /*
  * Check that the total amount of swap currently configured does not
  * exceed half the theoretical maximum.  If it does, print a warning
- * message and return -1; otherwise, return 0.
+ * message.
  */
-static int
-swapon_check_swzone(unsigned long npages)
+static void
+swapon_check_swzone(void)
 {
-	unsigned long maxpages;
+	unsigned long maxpages, npages;
 
+	npages = swap_total / PAGE_SIZE;
 	/* absolute maximum we can handle assuming 100% efficiency */
-	maxpages = uma_zone_get_max(swap_zone) * SWAP_META_PAGES;
+	maxpages = uma_zone_get_max(swblk_zone) * SWAP_META_PAGES;
 
 	/* recommend using no more than half that amount */
 	if (npages > maxpages / 2) {
@@ -2174,9 +2147,7 @@
 		    npages, maxpages / 2);
 		printf("warning: increase kern.maxswzone "
 		    "or reduce amount of swap.\n");
-		return (-1);
 	}
-	return (0);
 }
 
 static void
@@ -2212,7 +2183,6 @@
 	sp->sw_vp = vp;
 	sp->sw_id = id;
 	sp->sw_dev = dev;
-	sp->sw_flags = 0;
 	sp->sw_nblks = nblks;
 	sp->sw_used = 0;
 	sp->sw_strategy = strategy;
@@ -2244,7 +2214,7 @@
 	nswapdev++;
 	swap_pager_avail += nblks - 2;
 	swap_total += (vm_ooffset_t)nblks * PAGE_SIZE;
-	swapon_check_swzone(swap_total / PAGE_SIZE);
+	swapon_check_swzone();
 	swp_sizecheck();
 	mtx_unlock(&sw_dev_mtx);
 }
@@ -2280,10 +2250,7 @@
 	if (error)
 		return (error);
 
-	mtx_lock(&Giant);
-	while (swdev_syscall_active)
-	    tsleep(&swdev_syscall_active, PUSER - 1, "swpoff", 0);
-	swdev_syscall_active = 1;
+	sx_xlock(&swdev_syscall_lock);
 
 	NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->name,
 	    td);
@@ -2305,9 +2272,7 @@
 	}
 	error = swapoff_one(sp, td->td_ucred);
 done:
-	swdev_syscall_active = 0;
-	wakeup_one(&swdev_syscall_active);
-	mtx_unlock(&Giant);
+	sx_xunlock(&swdev_syscall_lock);
 	return (error);
 }
 
@@ -2319,7 +2284,7 @@
 	int error;
 #endif
 
-	mtx_assert(&Giant, MA_OWNED);
+	sx_assert(&swdev_syscall_lock, SA_XLOCKED);
 #ifdef MAC
 	(void) vn_lock(sp->sw_vp, LK_EXCLUSIVE | LK_RETRY);
 	error = mac_system_check_swapoff(cred, sp->sw_vp);
@@ -2335,10 +2300,8 @@
 	 * of data we will have to page back in, plus an epsilon so
 	 * the system doesn't become critically low on swap space.
 	 */
-	if (cnt.v_free_count + cnt.v_cache_count + swap_pager_avail <
-	    nblks + nswap_lowat) {
+	if (vm_cnt.v_free_count + swap_pager_avail < nblks + nswap_lowat)
 		return (ENOMEM);
-	}
 
 	/*
 	 * Prevent further allocations on this device.
@@ -2378,10 +2341,7 @@
 	const char *devname;
 	int error;
 
-	mtx_lock(&Giant);
-	while (swdev_syscall_active)
-		tsleep(&swdev_syscall_active, PUSER - 1, "swpoff", 0);
-	swdev_syscall_active = 1;
+	sx_xlock(&swdev_syscall_lock);
 
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH_SAFE(sp, &swtailq, sw_list, spt) {
@@ -2401,9 +2361,7 @@
 	}
 	mtx_unlock(&sw_dev_mtx);
 
-	swdev_syscall_active = 0;
-	wakeup_one(&swdev_syscall_active);
-	mtx_unlock(&Giant);
+	sx_xunlock(&swdev_syscall_lock);
 }
 
 void
@@ -2472,19 +2430,14 @@
 
 SYSCTL_INT(_vm, OID_AUTO, nswapdev, CTLFLAG_RD, &nswapdev, 0,
     "Number of swap devices");
-SYSCTL_NODE(_vm, OID_AUTO, swap_info, CTLFLAG_RD, sysctl_vm_swap_info,
+SYSCTL_NODE(_vm, OID_AUTO, swap_info, CTLFLAG_RD | CTLFLAG_MPSAFE,
+    sysctl_vm_swap_info,
     "Swap statistics by device");
 
 /*
- * vmspace_swap_count() - count the approximate swap usage in pages for a
- *			  vmspace.
- *
- *	The map must be locked.
- *
- *	Swap usage is determined by taking the proportional swap used by
- *	VM objects backing the VM map.  To make up for fractional losses,
- *	if the VM object has any swap use at all the associated map entries
- *	count for at least 1 swap page.
+ * Count the approximate swap usage in pages for a vmspace.  The
+ * shadowed or not yet copied on write swap blocks are not accounted.
+ * The map must be locked.
  */
 long
 vmspace_swap_count(struct vmspace *vmspace)
@@ -2492,23 +2445,38 @@
 	vm_map_t map;
 	vm_map_entry_t cur;
 	vm_object_t object;
-	long count, n;
+	struct swblk *sb;
+	vm_pindex_t e, pi;
+	long count;
+	int i;
 
 	map = &vmspace->vm_map;
 	count = 0;
 
 	for (cur = map->header.next; cur != &map->header; cur = cur->next) {
-		if ((cur->eflags & MAP_ENTRY_IS_SUB_MAP) == 0 &&
-		    (object = cur->object.vm_object) != NULL) {
-			VM_OBJECT_WLOCK(object);
-			if (object->type == OBJT_SWAP &&
-			    object->un_pager.swp.swp_bcount != 0) {
-				n = (cur->end - cur->start) / PAGE_SIZE;
-				count += object->un_pager.swp.swp_bcount *
-				    SWAP_META_PAGES * n / object->size + 1;
+		if ((cur->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
+			continue;
+		object = cur->object.vm_object;
+		if (object == NULL || object->type != OBJT_SWAP)
+			continue;
+		VM_OBJECT_RLOCK(object);
+		if (object->type != OBJT_SWAP)
+			goto unlock;
+		pi = OFF_TO_IDX(cur->offset);
+		e = pi + OFF_TO_IDX(cur->end - cur->start);
+		for (;; pi = sb->p + SWAP_META_PAGES) {
+			sb = SWAP_PCTRIE_LOOKUP_GE(
+			    &object->un_pager.swp.swp_blks, pi);
+			if (sb == NULL || sb->p >= e)
+				break;
+			for (i = 0; i < SWAP_META_PAGES; i++) {
+				if (sb->p + i < e &&
+				    sb->d[i] != SWAPBLK_NONE)
+					count++;
 			}
-			VM_OBJECT_WUNLOCK(object);
 		}
+unlock:
+		VM_OBJECT_RUNLOCK(object);
 	}
 	return (count);
 }
@@ -2554,8 +2522,9 @@
 }
 
 /*
- * Remove a reference from the g_consumer. Post a close event if
- * all referneces go away.
+ * Remove a reference from the g_consumer.  Post a close event if all
+ * references go away, since the function might be called from the
+ * biodone context.
  */
 static void
 swapgeom_release(struct g_consumer *cp, struct swdevt *sp)
@@ -2628,7 +2597,7 @@
 	bio->bio_offset = (bp->b_blkno - sp->sw_first) * PAGE_SIZE;
 	bio->bio_length = bp->b_bcount;
 	bio->bio_done = swapgeom_done;
-	if ((bp->b_flags & B_UNMAPPED) != 0) {
+	if (!buf_mapped(bp)) {
 		bio->bio_ma = bp->b_pages;
 		bio->bio_data = unmapped_buf;
 		bio->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK;
@@ -2678,22 +2647,19 @@
 	cp = sw->sw_id;
 	sw->sw_id = NULL;
 	mtx_unlock(&sw_dev_mtx);
-	/* XXX: direct call when Giant untangled */
+
+	/*
+	 * swapgeom_close() may be called from the biodone context,
+	 * where we cannot perform topology changes.  Delegate the
+	 * work to the events thread.
+	 */
 	if (cp != NULL)
 		g_waitfor_event(swapgeom_close_ev, cp, M_WAITOK, NULL);
 }
 
-
-struct swh0h0 {
-	struct cdev *dev;
-	struct vnode *vp;
-	int	error;
-};
-
-static void
-swapongeom_ev(void *arg, int flags)
+static int
+swapongeom_locked(struct cdev *dev, struct vnode *vp)
 {
-	struct swh0h0 *swh;
 	struct g_provider *pp;
 	struct g_consumer *cp;
 	static struct g_geom *gp;
@@ -2701,20 +2667,15 @@
 	u_long nblks;
 	int error;
 
-	swh = arg;
-	swh->error = 0;
-	pp = g_dev_getprovider(swh->dev);
-	if (pp == NULL) {
-		swh->error = ENODEV;
-		return;
-	}
+	pp = g_dev_getprovider(dev);
+	if (pp == NULL)
+		return (ENODEV);
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH(sp, &swtailq, sw_list) {
 		cp = sp->sw_id;
 		if (cp != NULL && cp->provider == pp) {
 			mtx_unlock(&sw_dev_mtx);
-			swh->error = EBUSY;
-			return;
+			return (EBUSY);
 		}
 	}
 	mtx_unlock(&sw_dev_mtx);
@@ -2721,44 +2682,41 @@
 	if (gp == NULL)
 		gp = g_new_geomf(&g_swap_class, "swap");
 	cp = g_new_consumer(gp);
-	cp->index = 1;		/* Number of active I/Os, plus one for being active. */
+	cp->index = 1;	/* Number of active I/Os, plus one for being active. */
 	cp->flags |=  G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
 	g_attach(cp, pp);
 	/*
-	 * XXX: Everytime you think you can improve the margin for
+	 * XXX: Every time you think you can improve the margin for
 	 * footshooting, somebody depends on the ability to do so:
 	 * savecore(8) wants to write to our swapdev so we cannot
 	 * set an exclusive count :-(
 	 */
 	error = g_access(cp, 1, 1, 0);
-	if (error) {
+	if (error != 0) {
 		g_detach(cp);
 		g_destroy_consumer(cp);
-		swh->error = error;
-		return;
+		return (error);
 	}
 	nblks = pp->mediasize / DEV_BSIZE;
-	swaponsomething(swh->vp, cp, nblks, swapgeom_strategy,
-	    swapgeom_close, dev2udev(swh->dev),
+	swaponsomething(vp, cp, nblks, swapgeom_strategy,
+	    swapgeom_close, dev2udev(dev),
 	    (pp->flags & G_PF_ACCEPT_UNMAPPED) != 0 ? SW_UNMAPPED : 0);
-	swh->error = 0;
+	return (0);
 }
 
 static int
-swapongeom(struct thread *td, struct vnode *vp)
+swapongeom(struct vnode *vp)
 {
 	int error;
-	struct swh0h0 swh;
 
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
-
-	swh.dev = vp->v_rdev;
-	swh.vp = vp;
-	swh.error = 0;
-	/* XXX: direct call when Giant untangled */
-	error = g_waitfor_event(swapongeom_ev, &swh, M_WAITOK, NULL);
-	if (!error)
-		error = swh.error;
+	if (vp->v_type != VCHR || (vp->v_iflag & VI_DOOMED) != 0) {
+		error = ENOENT;
+	} else {
+		g_topology_lock();
+		error = swapongeom_locked(vp->v_rdev, vp);
+		g_topology_unlock();
+	}
 	VOP_UNLOCK(vp, 0);
 	return (error);
 }
@@ -2833,3 +2791,40 @@
 	    NODEV, 0);
 	return (0);
 }
+
+static int
+sysctl_swap_async_max(SYSCTL_HANDLER_ARGS)
+{
+	int error, new, n;
+
+	new = nsw_wcount_async_max;
+	error = sysctl_handle_int(oidp, &new, 0, req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+
+	if (new > nswbuf / 2 || new < 1)
+		return (EINVAL);
+
+	mtx_lock(&pbuf_mtx);
+	while (nsw_wcount_async_max != new) {
+		/*
+		 * Adjust difference.  If the current async count is too low,
+		 * we will need to squeeze our update slowly in.  Sleep with a
+		 * higher priority than getpbuf() to finish faster.
+		 */
+		n = new - nsw_wcount_async_max;
+		if (nsw_wcount_async + n >= 0) {
+			nsw_wcount_async += n;
+			nsw_wcount_async_max += n;
+			wakeup(&nsw_wcount_async);
+		} else {
+			nsw_wcount_async_max -= nsw_wcount_async;
+			nsw_wcount_async = 0;
+			msleep(&nsw_wcount_async, &pbuf_mtx, PSWP,
+			    "swpsysctl", 0);
+		}
+	}
+	mtx_unlock(&pbuf_mtx);
+
+	return (0);
+}

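The swap_pager.c changes above replace the old global swhash with small per-object buckets (struct swblk) kept in a pctrie and keyed by the page index rounded down to a SWAP_META_PAGES boundary, with the slot inside a bucket given by the remainder. The stand-alone userland C sketch below illustrates only that bucket/slot arithmetic and the "free the bucket once every slot is SWAPBLK_NONE" rule from swp_pager_meta_build() and swp_pager_meta_ctl(); the run length of 32 and the SWAPBLK_NONE encoding are assumptions for the example, and a plain struct stands in for the kernel's pctrie and UMA zone.

#include <assert.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define SWAP_META_PAGES	32U		/* assumed run length for the example */
#define SWAPBLK_NONE	UINT64_MAX	/* stand-in for "no swap block" */

struct swblk {
	uint64_t p;			/* base pindex, multiple of SWAP_META_PAGES */
	uint64_t d[SWAP_META_PAGES];	/* swap block address for each page in the run */
};

/* rounddown(pindex, SWAP_META_PAGES): the lookup key for this page. */
static uint64_t
swblk_key(uint64_t pindex)
{
	return (pindex - pindex % SWAP_META_PAGES);
}

/* pindex % SWAP_META_PAGES: the slot within the bucket. */
static unsigned
swblk_slot(uint64_t pindex)
{
	return ((unsigned)(pindex % SWAP_META_PAGES));
}

/* A bucket may be returned to its zone once every slot is empty. */
static int
swblk_empty(const struct swblk *sb)
{
	for (unsigned i = 0; i < SWAP_META_PAGES; i++)
		if (sb->d[i] != SWAPBLK_NONE)
			return (0);
	return (1);
}

int
main(void)
{
	struct swblk sb = { .p = swblk_key(70) };	/* 70 rounds down to 64 */

	for (unsigned i = 0; i < SWAP_META_PAGES; i++)
		sb.d[i] = SWAPBLK_NONE;
	assert(sb.p == 64 && swblk_slot(70) == 6);

	sb.d[swblk_slot(70)] = 12345;			/* enter block into metadata */
	printf("key %" PRIu64 " slot %u empty=%d\n",
	    sb.p, swblk_slot(70), swblk_empty(&sb));

	sb.d[swblk_slot(70)] = SWAPBLK_NONE;		/* delete prior contents */
	printf("after free: empty=%d\n", swblk_empty(&sb));
	return (0);
}

Built as ordinary userland C, this prints the computed key and slot and shows the bucket becoming reclaimable once its last slot is cleared, mirroring the empty-run checks that now free a swblk back to swblk_zone.
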
Modified: trunk/sys/vm/swap_pager.h
===================================================================
--- trunk/sys/vm/swap_pager.h	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/swap_pager.h	2020-02-08 19:35:48 UTC (rev 12314)
@@ -33,7 +33,7 @@
  * SUCH DAMAGE.
  *
  *	from: @(#)swap_pager.h	7.1 (Berkeley) 12/5/90
- * $FreeBSD: stable/10/sys/vm/swap_pager.h 248514 2013-03-19 14:39:27Z kib $
+ * $FreeBSD: stable/11/sys/vm/swap_pager.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 #ifndef	_VM_SWAP_PAGER_H_
@@ -74,15 +74,14 @@
 
 #ifdef _KERNEL
 
-extern int swap_pager_full;
 extern int swap_pager_avail;
 
 struct xswdev;
 int swap_dev_info(int name, struct xswdev *xs, char *devname, size_t len);
 void swap_pager_copy(vm_object_t, vm_object_t, vm_pindex_t, int);
+vm_pindex_t swap_pager_find_least(vm_object_t object, vm_pindex_t pindex);
 void swap_pager_freespace(vm_object_t, vm_pindex_t, vm_size_t);
 void swap_pager_swap_init(void);
-int swap_pager_isswapped(vm_object_t, struct swdevt *);
 int swap_pager_reserve(vm_object_t, vm_pindex_t, vm_size_t);
 void swap_pager_status(int *total, int *used);
 void swapoff_all(void);

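The header change below exports swap_pager_find_least(), whose implementation appears in the swap_pager.c hunks above. As a minimal userland sketch of its contract only (same assumed 32-page runs, buckets kept in a sorted array rather than the kernel pctrie): return the least populated page index at or after the argument, or the object size when no swap block remains past it.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define SWAP_META_PAGES	32U
#define SWAPBLK_NONE	UINT64_MAX

struct swblk {
	uint64_t p;			/* base pindex of this run */
	uint64_t d[SWAP_META_PAGES];	/* per-page swap block, or SWAPBLK_NONE */
};

static uint64_t
find_least(const struct swblk *sbs, unsigned nsb, uint64_t pindex, uint64_t objsize)
{
	for (unsigned k = 0; k < nsb; k++) {		/* sbs[] sorted by p */
		const struct swblk *sb = &sbs[k];

		if (sb->p + SWAP_META_PAGES <= pindex)
			continue;			/* run ends before pindex */
		for (unsigned i = 0; i < SWAP_META_PAGES; i++) {
			uint64_t pi = sb->p + i;

			if (pi >= pindex && sb->d[i] != SWAPBLK_NONE)
				return (pi);
		}
	}
	return (objsize);				/* nothing at or after pindex */
}

int
main(void)
{
	struct swblk sbs[1] = { { .p = 64 } };

	for (unsigned i = 0; i < SWAP_META_PAGES; i++)
		sbs[0].d[i] = SWAPBLK_NONE;
	sbs[0].d[9] = 777;				/* pindex 73 is swapped out */

	printf("%" PRIu64 "\n", find_least(sbs, 1, 0, 4096));	/* prints 73 */
	printf("%" PRIu64 "\n", find_least(sbs, 1, 74, 4096));	/* prints 4096 */
	return (0);
}
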
Modified: trunk/sys/vm/uma.h
===================================================================
--- trunk/sys/vm/uma.h	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/uma.h	2020-02-08 19:35:48 UTC (rev 12314)
@@ -25,7 +25,7 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/vm/uma.h 324602 2017-10-13 17:11:08Z jhb $
+ * $FreeBSD: stable/11/sys/vm/uma.h 338389 2018-08-29 17:58:01Z markj $
  *
  */
 
@@ -263,8 +263,8 @@
 					 * information in the vm_page.
 					 */
 #define	UMA_ZONE_SECONDARY	0x0200	/* Zone is a Secondary Zone */
-#define	UMA_ZONE_REFCNT		0x0400	/* Allocate refcnts in slabs */
-#define	UMA_ZONE_MAXBUCKET	0x0800	/* Use largest buckets */
+#define	UMA_ZONE_NOBUCKET	0x0400	/* Do not use buckets. */
+#define	UMA_ZONE_MAXBUCKET	0x0800	/* Use largest buckets. */
 #define	UMA_ZONE_CACHESPREAD	0x1000	/*
 					 * Spread memory start locations across
 					 * all possible cache lines.  May
@@ -277,7 +277,7 @@
 					 * mini-dumps.
 					 */
 #define	UMA_ZONE_PCPU		0x8000	/*
-					 * Allocates mp_ncpus slabs sized to
+					 * Allocates mp_maxid + 1 slabs sized to
 					 * sizeof(struct pcpu).
 					 */
 
@@ -288,7 +288,7 @@
  */
 #define	UMA_ZONE_INHERIT						\
     (UMA_ZONE_OFFPAGE | UMA_ZONE_MALLOC | UMA_ZONE_NOFREE |		\
-    UMA_ZONE_HASH | UMA_ZONE_REFCNT | UMA_ZONE_VTOSLAB | UMA_ZONE_PCPU)
+    UMA_ZONE_HASH | UMA_ZONE_VTOSLAB | UMA_ZONE_PCPU)
 
 /* Definitions for align */
 #define UMA_ALIGN_PTR	(sizeof(void *) - 1)	/* Alignment fit for ptr */
@@ -367,6 +367,11 @@
 }
 
 /*
+ * Wait until the specified zone can allocate an item.
+ */
+void uma_zwait(uma_zone_t zone);
+
+/*
  * XXX The rest of the prototypes in this header are h0h0 magic for the VM.
  * If you think you need to use it for a normal zone you're probably incorrect.
  */
@@ -523,6 +528,19 @@
 void uma_zone_set_warning(uma_zone_t zone, const char *warning);
 
 /*
+ * Sets a function to run when limit is reached
+ *
+ * Arguments:
+ *	zone  The zone to which this applies
+ *	fx  The function to run
+ *
+ * Returns:
+ *	Nothing
+ */
+typedef void (*uma_maxaction_t)(uma_zone_t, int);
+void uma_zone_set_maxaction(uma_zone_t zone, uma_maxaction_t);
+
+/*
  * Obtains the approximate current number of items allocated from a zone
  *
  * Arguments:
@@ -612,21 +630,6 @@
 void uma_prealloc(uma_zone_t zone, int itemcnt);
 
 /*
- * Used to lookup the reference counter allocated for an item
- * from a UMA_ZONE_REFCNT zone.  For UMA_ZONE_REFCNT zones,
- * reference counters are allocated for items and stored in
- * the underlying slab header.
- *
- * Arguments:
- *	zone  The UMA_ZONE_REFCNT zone to which the item belongs.
- *	item  The address of the item for which we want a refcnt.
- *
- * Returns:
- *	A pointer to a uint32_t reference counter.
- */
-uint32_t *uma_find_refcnt(uma_zone_t zone, void *item);
-
-/*
  * Used to determine if a fixed-size zone is exhausted.
  *
  * Arguments:

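Besides retiring the UMA_ZONE_REFCNT interface, the uma.h diff above adds uma_zwait() and the uma_zone_set_maxaction() limit hook. The fragment below is a rough usage sketch only, in kernel context rather than a stand-alone program; the zone name, item type, limit, and callback are made up for illustration and are not part of the commit (wiring into a SYSINIT and error handling omitted).

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>

#include <vm/uma.h>

struct foo_item {
	int	fi_refs;		/* illustrative payload */
};

static uma_zone_t foo_zone;

/* uma_maxaction_t callback, enqueued on taskqueue_thread when the zone fills. */
static void
foo_zone_full(uma_zone_t zone, int unused __unused)
{

	printf("foo_items zone reached its configured limit\n");
}

static void
foo_zone_init(void)
{

	foo_zone = uma_zcreate("foo_items", sizeof(struct foo_item),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	uma_zone_set_max(foo_zone, 1024);
	uma_zone_set_maxaction(foo_zone, foo_zone_full);
}

static struct foo_item *
foo_item_alloc(void)
{
	struct foo_item *fi;

	/*
	 * Pattern similar to swp_pager_meta_build() above: try a cheap
	 * M_NOWAIT allocation first and, if the caller is allowed to sleep,
	 * let uma_zwait() block until the zone can satisfy a request
	 * before retrying.
	 */
	while ((fi = uma_zalloc(foo_zone, M_NOWAIT)) == NULL)
		uma_zwait(foo_zone);
	return (fi);
}

As the added zone_maxaction() helper in uma_core.c shows, the callback is run from the system taskqueue rather than inline in the failing allocation path, so it should not rely on the allocating thread's lock context.
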
Modified: trunk/sys/vm/uma_core.c
===================================================================
--- trunk/sys/vm/uma_core.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/uma_core.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -32,7 +32,7 @@
  *
  * This allocator is intended to replace the multitude of similar object caches
  * in the standard FreeBSD kernel.  The intent is to be flexible as well as
- * effecient.  A primary design goal is to return unused memory to the rest of
+ * efficient.  A primary design goal is to return unused memory to the rest of
  * the system.  This will make the system as a whole more flexible due to the
  * ability to move memory to subsystems which most need it instead of leaving
  * pools of reserved memory unused.
@@ -49,7 +49,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/uma_core.c 320440 2017-06-28 06:40:13Z alc $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/uma_core.c 357046 2020-01-23 14:14:38Z markj $");
 
 /* I should really use ktr.. */
 /*
@@ -75,10 +75,12 @@
 #include <sys/sysctl.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
+#include <sys/random.h>
 #include <sys/rwlock.h>
 #include <sys/sbuf.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
+#include <sys/taskqueue.h>
 #include <sys/vmmeter.h>
 
 #include <vm/vm.h>
@@ -112,7 +114,6 @@
 
 /* This is the zone from which all of uma_slab_t's are allocated. */
 static uma_zone_t slabzone;
-static uma_zone_t slabrefzone;	/* With refcounters (for UMA_ZONE_REFCNT) */
 
 /*
  * The initial hash tables come out of this zone so they can be allocated
@@ -138,7 +139,7 @@
     LIST_HEAD_INITIALIZER(uma_cachezones);
 
 /* This RW lock protects the keg list */
-static struct rwlock_padalign uma_rwlock;
+static struct rwlock_padalign __exclusive_cache_line uma_rwlock;
 
 /* Linked list of boot time pages */
 static LIST_HEAD(,uma_slab) uma_boot_pages =
@@ -153,14 +154,9 @@
 static int booted = 0;
 #define	UMA_STARTUP	1
 #define	UMA_STARTUP2	2
+#define	UMA_SHUTDOWN	3
 
 /*
- * Only mbuf clusters use ref zones.  Just provide enough references
- * to support the one user.  New code should not use the ref facility.
- */
-static const u_int uma_max_ipers_ref = PAGE_SIZE / MCLBYTES;
-
-/*
  * This is the handle used to schedule events that need to happen
  * outside of the allocation fast path.
  */
@@ -248,11 +244,12 @@
 static void keg_large_init(uma_keg_t keg);
 static void zone_foreach(void (*zfunc)(uma_zone_t));
 static void zone_timeout(uma_zone_t zone);
-static int hash_alloc(struct uma_hash *);
+static int hash_alloc(struct uma_hash *, u_int);
 static int hash_expand(struct uma_hash *, struct uma_hash *);
 static void hash_free(struct uma_hash *hash);
 static void uma_timeout(void *);
 static void uma_startup3(void);
+static void uma_shutdown(void);
 static void *zone_alloc_item(uma_zone_t, void *, int);
 static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip);
 static void bucket_enable(void);
@@ -276,6 +273,11 @@
 static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS);
 static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS);
 
+#ifdef INVARIANTS
+static void uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item);
+static void uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item);
+#endif
+
 SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL);
 
 SYSCTL_PROC(_vm, OID_AUTO, zone_count, CTLFLAG_RD|CTLTYPE_INT,
@@ -285,8 +287,7 @@
     0, 0, sysctl_vm_zone_stats, "s,struct uma_type_header", "Zone Stats");
 
 static int zone_warnings = 1;
-TUNABLE_INT("vm.zone_warnings", &zone_warnings);
-SYSCTL_INT(_vm, OID_AUTO, zone_warnings, CTLFLAG_RW, &zone_warnings, 0,
+SYSCTL_INT(_vm, OID_AUTO, zone_warnings, CTLFLAG_RWTUN, &zone_warnings, 0,
     "Warn when UMA zones becomes full");
 
 /*
@@ -433,6 +434,14 @@
 		printf("[zone: %s] %s\n", zone->uz_name, zone->uz_warning);
 }
 
+static inline void
+zone_maxaction(uma_zone_t zone)
+{
+
+	if (zone->uz_maxaction.ta_func != NULL)
+		taskqueue_enqueue(taskqueue_thread, &zone->uz_maxaction);
+}
+
 static void
 zone_foreach_keg(uma_zone_t zone, void (*kegfn)(uma_keg_t))
 {
@@ -471,6 +480,7 @@
 static void
 keg_timeout(uma_keg_t keg)
 {
+	u_int slabs;
 
 	KEG_LOCK(keg);
 	/*
@@ -481,7 +491,8 @@
 	 * may be a little aggressive.  Should I allow for two collisions max?
 	 */
 	if (keg->uk_flags & UMA_ZONE_HASH &&
-	    keg->uk_pages / keg->uk_ppera >= keg->uk_hash.uh_hashsize) {
+	    (slabs = keg->uk_pages / keg->uk_ppera) >
+	     keg->uk_hash.uh_hashsize) {
 		struct uma_hash newhash;
 		struct uma_hash oldhash;
 		int ret;
@@ -492,9 +503,8 @@
 		 * I have to do everything in stages and check for
 		 * races.
 		 */
-		newhash = keg->uk_hash;
 		KEG_UNLOCK(keg);
-		ret = hash_alloc(&newhash);
+		ret = hash_alloc(&newhash, 1 << fls(slabs));
 		KEG_LOCK(keg);
 		if (ret) {
 			if (hash_expand(&keg->uk_hash, &newhash)) {
@@ -526,19 +536,16 @@
  *	hash  A new hash structure with the old hash size in uh_hashsize
  *
  * Returns:
- *	1 on sucess and 0 on failure.
+ *	1 on success and 0 on failure.
  */
 static int
-hash_alloc(struct uma_hash *hash)
+hash_alloc(struct uma_hash *hash, u_int size)
 {
-	int oldsize;
-	int alloc;
+	size_t alloc;
 
-	oldsize = hash->uh_hashsize;
-
-	/* We're just going to go to a power of two greater */
-	if (oldsize)  {
-		hash->uh_hashsize = oldsize * 2;
+	KASSERT(powerof2(size), ("hash size must be power of 2"));
+	if (size > UMA_HASH_SIZE_INIT)  {
+		hash->uh_hashsize = size;
 		alloc = sizeof(hash->uh_slab_hash[0]) * hash->uh_hashsize;
 		hash->uh_slab_hash = (struct slabhead *)malloc(alloc,
 		    M_UMAHASH, M_NOWAIT);
@@ -575,8 +582,8 @@
 hash_expand(struct uma_hash *oldhash, struct uma_hash *newhash)
 {
 	uma_slab_t slab;
-	int hval;
-	int i;
+	u_int hval;
+	u_int idx;
 
 	if (!newhash->uh_slab_hash)
 		return (0);
@@ -589,10 +596,10 @@
 	 * full rehash.
 	 */
 
-	for (i = 0; i < oldhash->uh_hashsize; i++)
-		while (!SLIST_EMPTY(&oldhash->uh_slab_hash[i])) {
-			slab = SLIST_FIRST(&oldhash->uh_slab_hash[i]);
-			SLIST_REMOVE_HEAD(&oldhash->uh_slab_hash[i], us_hlink);
+	for (idx = 0; idx < oldhash->uh_hashsize; idx++)
+		while (!SLIST_EMPTY(&oldhash->uh_slab_hash[idx])) {
+			slab = SLIST_FIRST(&oldhash->uh_slab_hash[idx]);
+			SLIST_REMOVE_HEAD(&oldhash->uh_slab_hash[idx], us_hlink);
 			hval = UMA_HASH(newhash, slab->us_data);
 			SLIST_INSERT_HEAD(&newhash->uh_slab_hash[hval],
 			    slab, us_hlink);
@@ -840,8 +847,7 @@
 keg_drain(uma_keg_t keg)
 {
 	struct slabhead freeslabs = { 0 };
-	uma_slab_t slab;
-	uma_slab_t n;
+	uma_slab_t slab, tmp;
 
 	/*
 	 * We don't want to take pages from statically allocated kegs at this
@@ -857,15 +863,10 @@
 	if (keg->uk_free == 0)
 		goto finished;
 
-	slab = LIST_FIRST(&keg->uk_free_slab);
-	while (slab) {
-		n = LIST_NEXT(slab, us_link);
-
-		/* We have no where to free these to */
-		if (slab->us_flags & UMA_SLAB_BOOT) {
-			slab = n;
+	LIST_FOREACH_SAFE(slab, &keg->uk_free_slab, us_link, tmp) {
+		/* We have nowhere to free these to. */
+		if (slab->us_flags & UMA_SLAB_BOOT)
 			continue;
-		}
 
 		LIST_REMOVE(slab, us_link);
 		keg->uk_pages -= keg->uk_ppera;
@@ -875,8 +876,6 @@
 			UMA_HASH_REMOVE(&keg->uk_hash, slab, slab->us_data);
 
 		SLIST_INSERT_HEAD(&freeslabs, slab, us_hlink);
-
-		slab = n;
 	}
 finished:
 	KEG_UNLOCK(keg);
@@ -939,7 +938,6 @@
 static uma_slab_t
 keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int wait)
 {
-	uma_slabrefcnt_t slabref;
 	uma_alloc allocf;
 	uma_slab_t slab;
 	uint8_t *mem;
@@ -1002,11 +1000,6 @@
 #ifdef INVARIANTS
 	BIT_ZERO(SLAB_SETSIZE, &slab->us_debugfree);
 #endif
-	if (keg->uk_flags & UMA_ZONE_REFCNT) {
-		slabref = (uma_slabrefcnt_t)slab;
-		for (i = 0; i < keg->uk_ipers; i++)
-			slabref->us_refcnt[i] = 0;
-	}
 
 	if (keg->uk_init != NULL) {
 		for (i = 0; i < keg->uk_ipers; i++)
@@ -1135,7 +1128,9 @@
 	npages = howmany(bytes, PAGE_SIZE);
 	while (npages > 0) {
 		p = vm_page_alloc(NULL, 0, VM_ALLOC_INTERRUPT |
-		    VM_ALLOC_WIRED | VM_ALLOC_NOOBJ);
+		    VM_ALLOC_WIRED | VM_ALLOC_NOOBJ |
+		    ((wait & M_WAITOK) != 0 ? VM_ALLOC_WAITOK :
+		    VM_ALLOC_NOWAIT));
 		if (p != NULL) {
 			/*
 			 * Since the page does not belong to an object, its
@@ -1145,17 +1140,12 @@
 			npages--;
 			continue;
 		}
-		if (wait & M_WAITOK) {
-			VM_WAIT;
-			continue;
-		}
-
 		/*
 		 * Page allocation failed, free intermediate pages and
 		 * exit.
 		 */
 		TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) {
-			vm_page_unwire(p, 0);
+			vm_page_unwire(p, PQ_NONE);
 			vm_page_free(p); 
 		}
 		return (NULL);
@@ -1229,7 +1219,7 @@
 	u_int slabsize;
 
 	if (keg->uk_flags & UMA_ZONE_PCPU) {
-		u_int ncpus = mp_ncpus ? mp_ncpus : MAXCPU;
+		u_int ncpus = (mp_maxid + 1) ? (mp_maxid + 1) : MAXCPU;
 
 		slabsize = sizeof(struct pcpu);
 		keg->uk_ppera = howmany(ncpus * sizeof(struct pcpu),
@@ -1255,15 +1245,20 @@
 	    keg->uk_rsize < sizeof(struct pcpu),
 	    ("%s: size %u too large", __func__, keg->uk_rsize));
 
-	if (keg->uk_flags & UMA_ZONE_REFCNT)
-		rsize += sizeof(uint32_t);
-
 	if (keg->uk_flags & UMA_ZONE_OFFPAGE)
 		shsize = 0;
 	else 
 		shsize = sizeof(struct uma_slab);
 
-	keg->uk_ipers = (slabsize - shsize) / rsize;
+	if (rsize <= slabsize - shsize)
+		keg->uk_ipers = (slabsize - shsize) / rsize;
+	else {
+		/* Handle special case when we have 1 item per slab, so
+		 * alignment requirement can be relaxed. */
+		KASSERT(keg->uk_size <= slabsize - shsize,
+		    ("%s: size %u greater than slab", __func__, keg->uk_size));
+		keg->uk_ipers = 1;
+	}
 	KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_SETSIZE,
 	    ("%s: keg->uk_ipers %u", __func__, keg->uk_ipers));
 
@@ -1337,21 +1332,24 @@
 	keg->uk_ipers = 1;
 	keg->uk_rsize = keg->uk_size;
 
-	/* We can't do OFFPAGE if we're internal, bail out here. */
-	if (keg->uk_flags & UMA_ZFLAG_INTERNAL)
-		return;
-
 	/* Check whether we have enough space to not do OFFPAGE. */
 	if ((keg->uk_flags & UMA_ZONE_OFFPAGE) == 0) {
 		shsize = sizeof(struct uma_slab);
-		if (keg->uk_flags & UMA_ZONE_REFCNT)
-			shsize += keg->uk_ipers * sizeof(uint32_t);
 		if (shsize & UMA_ALIGN_PTR)
 			shsize = (shsize & ~UMA_ALIGN_PTR) +
 			    (UMA_ALIGN_PTR + 1);
 
-		if ((PAGE_SIZE * keg->uk_ppera) - keg->uk_rsize < shsize)
-			keg->uk_flags |= UMA_ZONE_OFFPAGE;
+		if (PAGE_SIZE * keg->uk_ppera - keg->uk_rsize < shsize) {
+			/*
+			 * We can't do OFFPAGE if we're internal, in which case
+			 * we need an extra page per allocation to contain the
+			 * slab header.
+			 */
+			if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) == 0)
+				keg->uk_flags |= UMA_ZONE_OFFPAGE;
+			else
+				keg->uk_ppera++;
+		}
 	}
 
 	if ((keg->uk_flags & UMA_ZONE_OFFPAGE) &&
@@ -1433,7 +1431,7 @@
 	if (arg->flags & UMA_ZONE_ZINIT)
 		keg->uk_init = zero_init;
 
-	if (arg->flags & UMA_ZONE_REFCNT || arg->flags & UMA_ZONE_MALLOC)
+	if (arg->flags & UMA_ZONE_MALLOC)
 		keg->uk_flags |= UMA_ZONE_VTOSLAB;
 
 	if (arg->flags & UMA_ZONE_PCPU)
@@ -1445,13 +1443,6 @@
 
 	if (keg->uk_flags & UMA_ZONE_CACHESPREAD) {
 		keg_cachespread_init(keg);
-	} else if (keg->uk_flags & UMA_ZONE_REFCNT) {
-		if (keg->uk_size >
-		    (UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt) -
-		    sizeof(uint32_t)))
-			keg_large_init(keg);
-		else
-			keg_small_init(keg);
 	} else {
 		if (keg->uk_size > (UMA_SLAB_SIZE - sizeof(struct uma_slab)))
 			keg_large_init(keg);
@@ -1459,15 +1450,8 @@
 			keg_small_init(keg);
 	}
 
-	if (keg->uk_flags & UMA_ZONE_OFFPAGE) {
-		if (keg->uk_flags & UMA_ZONE_REFCNT) {
-			if (keg->uk_ipers > uma_max_ipers_ref)
-				panic("Too many ref items per zone: %d > %d\n",
-				    keg->uk_ipers, uma_max_ipers_ref);
-			keg->uk_slabzone = slabrefzone;
-		} else
-			keg->uk_slabzone = slabzone;
-	}
+	if (keg->uk_flags & UMA_ZONE_OFFPAGE)
+		keg->uk_slabzone = slabzone;
 
 	/*
 	 * If we haven't booted yet we need allocations to go through the
@@ -1504,10 +1488,6 @@
 		/* Size of the slab struct and free list */
 		totsize = sizeof(struct uma_slab);
 
-		/* Size of the reference counts. */
-		if (keg->uk_flags & UMA_ZONE_REFCNT)
-			totsize += keg->uk_ipers * sizeof(uint32_t);
-
 		if (totsize & UMA_ALIGN_PTR)
 			totsize = (totsize & ~UMA_ALIGN_PTR) +
 			    (UMA_ALIGN_PTR + 1);
@@ -1521,8 +1501,6 @@
 		 * sure here anyway.
 		 */
 		totsize = keg->uk_pgoff + sizeof(struct uma_slab);
-		if (keg->uk_flags & UMA_ZONE_REFCNT)
-			totsize += keg->uk_ipers * sizeof(uint32_t);
 		if (totsize > PAGE_SIZE * keg->uk_ppera) {
 			printf("zone %s ipers %d rsize %d size %d\n",
 			    zone->uz_name, keg->uk_ipers, keg->uk_rsize,
@@ -1532,7 +1510,7 @@
 	}
 
 	if (keg->uk_flags & UMA_ZONE_HASH)
-		hash_alloc(&keg->uk_hash);
+		hash_alloc(&keg->uk_hash, 0);
 
 #ifdef UMA_DEBUG
 	printf("UMA: %s(%p) size %d(%d) flags %#x ipers %d ppera %d out %d free %d\n",
@@ -1667,10 +1645,15 @@
 	}
 
 out:
-	if ((arg->flags & UMA_ZONE_MAXBUCKET) == 0)
+	KASSERT((arg->flags & (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET)) !=
+	    (UMA_ZONE_MAXBUCKET | UMA_ZONE_NOBUCKET),
+	    ("Invalid zone flag combination"));
+	if ((arg->flags & UMA_ZONE_MAXBUCKET) != 0)
+		zone->uz_count = BUCKET_MAX;
+	else if ((arg->flags & UMA_ZONE_NOBUCKET) != 0)
+		zone->uz_count = 0;
+	else
 		zone->uz_count = bucket_select(zone->uz_size);
-	else
-		zone->uz_count = BUCKET_MAX;
 	zone->uz_count_min = zone->uz_count;
 
 	return (0);
@@ -1785,7 +1768,6 @@
 {
 	struct uma_zctor_args args;
 	uma_slab_t slab;
-	u_int slabsize;
 	int i;
 
 #ifdef UMA_DEBUG
@@ -1835,9 +1817,6 @@
 	zone_ctor(zones, sizeof(struct uma_zone), &args, M_WAITOK);
 
 #ifdef UMA_DEBUG
-	printf("Initializing pcpu cache locks.\n");
-#endif
-#ifdef UMA_DEBUG
 	printf("Creating slab and hash zones.\n");
 #endif
 
@@ -1847,18 +1826,6 @@
 				NULL, NULL, NULL, NULL,
 				UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
 
-	/*
-	 * We also create a zone for the bigger slabs with reference
-	 * counts in them, to accomodate UMA_ZONE_REFCNT zones.
-	 */
-	slabsize = sizeof(struct uma_slab_refcnt);
-	slabsize += uma_max_ipers_ref * sizeof(uint32_t);
-	slabrefzone = uma_zcreate("UMA RCntSlabs",
-				  slabsize,
-				  NULL, NULL, NULL, NULL,
-				  UMA_ALIGN_PTR,
-				  UMA_ZFLAG_INTERNAL);
-
 	hashzone = uma_zcreate("UMA Hash",
 	    sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT,
 	    NULL, NULL, NULL, NULL,
@@ -1885,10 +1852,6 @@
 #endif
 }
 
-/*
- * Initialize our callout handle
- *
- */
 
 static void
 uma_startup3(void)
@@ -1901,8 +1864,18 @@
 #ifdef UMA_DEBUG
 	printf("UMA startup3 complete.\n");
 #endif
+
+	EVENTHANDLER_REGISTER(shutdown_post_sync, uma_shutdown, NULL,
+	    EVENTHANDLER_PRI_FIRST);
 }
 
+static void
+uma_shutdown(void)
+{
+
+	booted = UMA_SHUTDOWN;
+}
+
 static uma_keg_t
 uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini,
 		int align, uint32_t flags)
@@ -1948,6 +1921,20 @@
 	args.dtor = dtor;
 	args.uminit = uminit;
 	args.fini = fini;
+#ifdef  INVARIANTS
+	/*
+	 * If a zone is being created with an empty constructor and
+	 * destructor, pass UMA constructor/destructor which checks for
+	 * memory use after free.
+	 */
+	if ((!(flags & (UMA_ZONE_ZINIT | UMA_ZONE_NOFREE))) &&
+	    ctor == NULL && dtor == NULL && uminit == NULL && fini == NULL) {
+		args.ctor = trash_ctor;
+		args.dtor = trash_dtor;
+		args.uminit = trash_init;
+		args.fini = trash_fini;
+	}
+#endif
 	args.align = align;
 	args.flags = flags;
 	args.keg = NULL;
@@ -2070,15 +2057,8 @@
 		error = EINVAL;
 		goto out;
 	}
+
 	/*
-	 * Both must either be refcnt, or not be refcnt.
-	 */
-	if ((zone->uz_flags & UMA_ZONE_REFCNT) !=
-	    (master->uz_flags & UMA_ZONE_REFCNT)) {
-		error = EINVAL;
-		goto out;
-	}
-	/*
 	 * The underlying object must be the same size.  rsize
 	 * may be different.
 	 */
@@ -2114,11 +2094,28 @@
 uma_zdestroy(uma_zone_t zone)
 {
 
+	/*
+	 * Large slabs are expensive to reclaim, so don't bother doing
+	 * unnecessary work if we're shutting down.
+	 */
+	if (booted == UMA_SHUTDOWN &&
+	    zone->uz_fini == NULL &&
+	    zone->uz_release == (uma_release)zone_release)
+		return;
 	sx_slock(&uma_drain_lock);
 	zone_free_item(zones, zone, NULL, SKIP_NONE);
 	sx_sunlock(&uma_drain_lock);
 }
 
+void
+uma_zwait(uma_zone_t zone)
+{
+	void *item;
+
+	item = uma_zalloc_arg(zone, NULL, M_WAITOK);
+	uma_zfree(zone, item);
+}
+
 /* See uma.h */
 void *
 uma_zalloc_arg(uma_zone_t zone, void *udata, int flags)
@@ -2129,6 +2126,9 @@
 	int lockfail;
 	int cpu;
 
+	/* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
+	random_harvest_fast_uma(&zone, sizeof(zone), 1, RANDOM_UMA);
+
 	/* This is the fast path allocation */
 #ifdef UMA_DEBUG_ALLOC_1
 	printf("Allocating one item from %s(%p)\n", zone->uz_name, zone);
@@ -2140,20 +2140,17 @@
 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
 		    "uma_zalloc_arg: zone \"%s\"", zone->uz_name);
 	}
+	KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
+	    ("uma_zalloc_arg: called with spinlock or critical section held"));
+
 #ifdef DEBUG_MEMGUARD
 	if (memguard_cmp_zone(zone)) {
 		item = memguard_alloc(zone->uz_size, flags);
 		if (item != NULL) {
-			/*
-			 * Avoid conflict with the use-after-free
-			 * protecting infrastructure from INVARIANTS.
-			 */
 			if (zone->uz_init != NULL &&
-			    zone->uz_init != mtrash_init &&
 			    zone->uz_init(item, zone->uz_size, flags) != 0)
 				return (NULL);
 			if (zone->uz_ctor != NULL &&
-			    zone->uz_ctor != mtrash_ctor &&
 			    zone->uz_ctor(item, zone->uz_size, udata,
 			    flags) != 0) {
 			    	zone->uz_fini(item, zone->uz_size);
@@ -2289,7 +2286,7 @@
 
 	/*
 	 * Now lets just fill a bucket and put it on the free list.  If that
-	 * works we'll restart the allocation from the begining and it
+	 * works we'll restart the allocation from the beginning and it
 	 * will use the just filled bucket.
 	 */
 	bucket = zone_alloc_bucket(zone, udata, flags);
@@ -2370,6 +2367,7 @@
 			if ((zone->uz_flags & UMA_ZFLAG_MULTI) == 0) {
 				zone->uz_flags |= UMA_ZFLAG_FULL;
 				zone_log_warning(zone);
+				zone_maxaction(zone);
 			}
 			if (flags & M_NOWAIT)
 				break;
@@ -2489,6 +2487,7 @@
 			zone->uz_flags |= UMA_ZFLAG_FULL;
 			zone->uz_sleeps++;
 			zone_log_warning(zone);
+			zone_maxaction(zone);
 			msleep(zone, zone->uz_lockptr, PVM,
 			    "zonelimit", hz/100);
 			zone->uz_flags &= ~UMA_ZFLAG_FULL;
@@ -2668,6 +2667,9 @@
 	int lockfail;
 	int cpu;
 
+	/* Enable entropy collection for RANDOM_ENABLE_UMA kernel option */
+	random_harvest_fast_uma(&zone, sizeof(zone), 1, RANDOM_UMA);
+
 #ifdef UMA_DEBUG_ALLOC_1
 	printf("Freeing item %p to %s(%p)\n", item, zone->uz_name, zone);
 #endif
@@ -2674,14 +2676,17 @@
 	CTR2(KTR_UMA, "uma_zfree_arg thread %x zone %s", curthread,
 	    zone->uz_name);
 
+	KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
+	    ("uma_zfree_arg: called with spinlock or critical section held"));
+
         /* uma_zfree(..., NULL) does nothing, to match free(9). */
         if (item == NULL)
                 return;
 #ifdef DEBUG_MEMGUARD
 	if (is_memguard_addr(item)) {
-		if (zone->uz_dtor != NULL && zone->uz_dtor != mtrash_dtor)
+		if (zone->uz_dtor != NULL)
 			zone->uz_dtor(item, zone->uz_size, udata);
-		if (zone->uz_fini != NULL && zone->uz_fini != mtrash_fini)
+		if (zone->uz_fini != NULL)
 			zone->uz_fini(item, zone->uz_size);
 		memguard_free(item);
 		return;
@@ -2988,6 +2993,16 @@
 }
 
 /* See uma.h */
+void
+uma_zone_set_maxaction(uma_zone_t zone, uma_maxaction_t maxaction)
+{
+
+	ZONE_LOCK(zone);
+	TASK_INIT(&zone->uz_maxaction, 0, (task_fn_t *)maxaction, zone);
+	ZONE_UNLOCK(zone);
+}
+
+/* See uma.h */
 int
 uma_zone_get_cur(uma_zone_t zone)
 {
@@ -3176,26 +3191,6 @@
 }
 
 /* See uma.h */
-uint32_t *
-uma_find_refcnt(uma_zone_t zone, void *item)
-{
-	uma_slabrefcnt_t slabref;
-	uma_slab_t slab;
-	uma_keg_t keg;
-	uint32_t *refcnt;
-	int idx;
-
-	slab = vtoslab((vm_offset_t)item & (~UMA_SLAB_MASK));
-	slabref = (uma_slabrefcnt_t)slab;
-	keg = slab->us_keg;
-	KASSERT(keg->uk_flags & UMA_ZONE_REFCNT,
-	    ("uma_find_refcnt(): zone possibly not UMA_ZONE_REFCNT"));
-	idx = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
-	refcnt = &slabref->us_refcnt[idx];
-	return refcnt;
-}
-
-/* See uma.h */
 static void
 uma_reclaim_locked(bool kmem_danger)
 {
@@ -3216,7 +3211,6 @@
 	 * zones are drained.  We have to do the same for buckets.
 	 */
 	zone_drain(slabzone);
-	zone_drain(slabrefzone);
 	bucket_zone_drain();
 }
 
@@ -3309,9 +3303,10 @@
 static void
 uma_zero_item(void *item, uma_zone_t zone)
 {
+	int i;
 
 	if (zone->uz_flags & UMA_ZONE_PCPU) {
-		for (int i = 0; i < mp_ncpus; i++)
+		CPU_FOREACH(i)
 			bzero(zpcpu_get_cpu(item, i), zone->uz_size);
 	} else
 		bzero(item, zone->uz_size);
@@ -3447,7 +3442,7 @@
 {
 	struct uma_stream_header ush;
 	struct uma_type_header uth;
-	struct uma_percpu_stat ups;
+	struct uma_percpu_stat *ups;
 	uma_bucket_t bucket;
 	struct sbuf sbuf;
 	uma_cache_t cache;
@@ -3461,6 +3456,8 @@
 	if (error != 0)
 		return (error);
 	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
+	sbuf_clear_flags(&sbuf, SBUF_INCLUDENUL);
+	ups = malloc((mp_maxid + 1) * sizeof(*ups), M_TEMP, M_WAITOK);
 
 	count = 0;
 	rw_rlock(&uma_rwlock);
@@ -3509,7 +3506,6 @@
 			uth.uth_frees = z->uz_frees;
 			uth.uth_fails = z->uz_fails;
 			uth.uth_sleeps = z->uz_sleeps;
-			(void)sbuf_bcat(&sbuf, &uth, sizeof(uth));
 			/*
 			 * While it is not normally safe to access the cache
 			 * bucket pointers while not on the CPU that owns the
@@ -3518,30 +3514,31 @@
 			 * accept the possible race associated with bucket
 			 * exchange during monitoring.
 			 */
-			for (i = 0; i < (mp_maxid + 1); i++) {
-				bzero(&ups, sizeof(ups));
-				if (kz->uk_flags & UMA_ZFLAG_INTERNAL)
-					goto skip;
-				if (CPU_ABSENT(i))
-					goto skip;
+			for (i = 0; i < mp_maxid + 1; i++) {
+				bzero(&ups[i], sizeof(*ups));
+				if (kz->uk_flags & UMA_ZFLAG_INTERNAL ||
+				    CPU_ABSENT(i))
+					continue;
 				cache = &z->uz_cpu[i];
 				if (cache->uc_allocbucket != NULL)
-					ups.ups_cache_free +=
+					ups[i].ups_cache_free +=
 					    cache->uc_allocbucket->ub_cnt;
 				if (cache->uc_freebucket != NULL)
-					ups.ups_cache_free +=
+					ups[i].ups_cache_free +=
 					    cache->uc_freebucket->ub_cnt;
-				ups.ups_allocs = cache->uc_allocs;
-				ups.ups_frees = cache->uc_frees;
-skip:
-				(void)sbuf_bcat(&sbuf, &ups, sizeof(ups));
+				ups[i].ups_allocs = cache->uc_allocs;
+				ups[i].ups_frees = cache->uc_frees;
 			}
 			ZONE_UNLOCK(z);
+			(void)sbuf_bcat(&sbuf, &uth, sizeof(uth));
+			for (i = 0; i < mp_maxid + 1; i++)
+				(void)sbuf_bcat(&sbuf, &ups[i], sizeof(ups[i]));
 		}
 	}
 	rw_runlock(&uma_rwlock);
 	error = sbuf_finish(&sbuf);
 	sbuf_delete(&sbuf);
+	free(ups, M_TEMP);
 	return (error);
 }
 
@@ -3549,16 +3546,13 @@
 sysctl_handle_uma_zone_max(SYSCTL_HANDLER_ARGS)
 {
 	uma_zone_t zone = *(uma_zone_t *)arg1;
-	int error, max, old;
+	int error, max;
 
-	old = max = uma_zone_get_max(zone);
+	max = uma_zone_get_max(zone);
 	error = sysctl_handle_int(oidp, &max, 0, req);
 	if (error || !req->newptr)
 		return (error);
 
-	if (max < old)
-		return (EINVAL);
-
 	uma_zone_set_max(zone, max);
 
 	return (0);
@@ -3574,6 +3568,102 @@
 	return (sysctl_handle_int(oidp, &cur, 0, req));
 }
 
+#ifdef INVARIANTS
+static uma_slab_t
+uma_dbg_getslab(uma_zone_t zone, void *item)
+{
+	uma_slab_t slab;
+	uma_keg_t keg;
+	uint8_t *mem;
+
+	mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK));
+	if (zone->uz_flags & UMA_ZONE_VTOSLAB) {
+		slab = vtoslab((vm_offset_t)mem);
+	} else {
+		/*
+		 * It is safe to return the slab here even though the
+		 * zone is unlocked because the item's allocation state
+		 * essentially holds a reference.
+		 */
+		ZONE_LOCK(zone);
+		keg = LIST_FIRST(&zone->uz_kegs)->kl_keg;
+		if (keg->uk_flags & UMA_ZONE_HASH)
+			slab = hash_sfind(&keg->uk_hash, mem);
+		else
+			slab = (uma_slab_t)(mem + keg->uk_pgoff);
+		ZONE_UNLOCK(zone);
+	}
+
+	return (slab);
+}
+
+/*
+ * Set up the slab's freei data such that uma_dbg_free can function.
+ *
+ */
+static void
+uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item)
+{
+	uma_keg_t keg;
+	int freei;
+
+	if (zone_first_keg(zone) == NULL)
+		return;
+	if (slab == NULL) {
+		slab = uma_dbg_getslab(zone, item);
+		if (slab == NULL) 
+			panic("uma: item %p did not belong to zone %s\n",
+			    item, zone->uz_name);
+	}
+	keg = slab->us_keg;
+	freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
+
+	if (BIT_ISSET(SLAB_SETSIZE, freei, &slab->us_debugfree))
+		panic("Duplicate alloc of %p from zone %p(%s) slab %p(%d)\n",
+		    item, zone, zone->uz_name, slab, freei);
+	BIT_SET_ATOMIC(SLAB_SETSIZE, freei, &slab->us_debugfree);
+
+	return;
+}
+
+/*
+ * Verifies freed addresses.  Checks for alignment, valid slab membership
+ * and duplicate frees.
+ *
+ */
+static void
+uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item)
+{
+	uma_keg_t keg;
+	int freei;
+
+	if (zone_first_keg(zone) == NULL)
+		return;
+	if (slab == NULL) {
+		slab = uma_dbg_getslab(zone, item);
+		if (slab == NULL) 
+			panic("uma: Freed item %p did not belong to zone %s\n",
+			    item, zone->uz_name);
+	}
+	keg = slab->us_keg;
+	freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
+
+	if (freei >= keg->uk_ipers)
+		panic("Invalid free of %p from zone %p(%s) slab %p(%d)\n",
+		    item, zone, zone->uz_name, slab, freei);
+
+	if (((freei * keg->uk_rsize) + slab->us_data) != item) 
+		panic("Unaligned free of %p from zone %p(%s) slab %p(%d)\n",
+		    item, zone, zone->uz_name, slab, freei);
+
+	if (!BIT_ISSET(SLAB_SETSIZE, freei, &slab->us_debugfree))
+		panic("Duplicate free of %p from zone %p(%s) slab %p(%d)\n",
+		    item, zone, zone->uz_name, slab, freei);
+
+	BIT_CLR_ATOMIC(SLAB_SETSIZE, freei, &slab->us_debugfree);
+}
+#endif /* INVARIANTS */
+
 #ifdef DDB
 DB_SHOW_COMMAND(uma, db_show_uma)
 {
@@ -3631,4 +3721,4 @@
 			return;
 	}
 }
-#endif
+#endif	/* DDB */
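
A note on two of the uma_core.c changes above, with a short usage sketch.  With
INVARIANTS, uma_zcreate() now installs the trash_* use-after-free hooks
automatically when a zone is created with no ctor/dtor/uminit/fini (and without
UMA_ZONE_ZINIT/UMA_ZONE_NOFREE), and the new uma_zwait() helper simply allocates
and frees one item with M_WAITOK so a caller can block until the zone is usable.
The sketch below is illustrative only; the zone name and item size are made up
and are not part of this commit.

#include <sys/param.h>
#include <sys/systm.h>
#include <vm/uma.h>

/* Hypothetical zone, used only for illustration. */
static uma_zone_t example_zone;

static void
example_zone_setup(void)
{
	/*
	 * All-NULL ctor/dtor/uminit/fini: under INVARIANTS the uma_zcreate()
	 * hunk above substitutes trash_ctor/trash_dtor/trash_init/trash_fini,
	 * so freed items are junk-filled and checked when reused.
	 */
	example_zone = uma_zcreate("example items", 128,
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);

	/* Block until the zone can satisfy at least one allocation. */
	uma_zwait(example_zone);
}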

Modified: trunk/sys/vm/uma_dbg.c
===================================================================
--- trunk/sys/vm/uma_dbg.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/uma_dbg.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -32,8 +32,10 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/uma_dbg.c 252040 2013-06-20 19:08:12Z jeff $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/uma_dbg.c 301176 2016-06-01 22:31:35Z markj $");
 
+#include "opt_vm.h"
+
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bitset.h>
@@ -50,6 +52,7 @@
 #include <vm/uma.h>
 #include <vm/uma_int.h>
 #include <vm/uma_dbg.h>
+#include <vm/memguard.h>
 
 static const uint32_t uma_junk = 0xdeadc0de;
 
@@ -58,7 +61,6 @@
  * prior to subsequent reallocation.
  *
  * Complies with standard ctor arg/return
- *
  */
 int
 trash_ctor(void *mem, int size, void *arg, int flags)
@@ -66,12 +68,22 @@
 	int cnt;
 	uint32_t *p;
 
+#ifdef DEBUG_MEMGUARD
+	if (is_memguard_addr(mem))
+		return (0);
+#endif
+
 	cnt = size / sizeof(uma_junk);
 
 	for (p = mem; cnt > 0; cnt--, p++)
 		if (*p != uma_junk) {
+#ifdef INVARIANTS
+			panic("Memory modified after free %p(%d) val=%x @ %p\n",
+			    mem, size, *p, p);
+#else
 			printf("Memory modified after free %p(%d) val=%x @ %p\n",
 			    mem, size, *p, p);
+#endif
 			return (0);
 		}
 	return (0);
@@ -89,6 +101,11 @@
 	int cnt;
 	uint32_t *p;
 
+#ifdef DEBUG_MEMGUARD
+	if (is_memguard_addr(mem))
+		return;
+#endif
+
 	cnt = size / sizeof(uma_junk);
 
 	for (p = mem; cnt > 0; cnt--, p++)
@@ -127,6 +144,11 @@
 	uint32_t *p = mem;
 	int cnt;
 
+#ifdef DEBUG_MEMGUARD
+	if (is_memguard_addr(mem))
+		return (0);
+#endif
+
 	size -= sizeof(struct malloc_type *);
 	ksp = (struct malloc_type **)mem;
 	ksp += size / sizeof(struct malloc_type *);
@@ -154,6 +176,11 @@
 	int cnt;
 	uint32_t *p;
 
+#ifdef DEBUG_MEMGUARD
+	if (is_memguard_addr(mem))
+		return;
+#endif
+
 	size -= sizeof(struct malloc_type *);
 	cnt = size / sizeof(uma_junk);
 
@@ -172,6 +199,11 @@
 {
 	struct malloc_type **ksp;
 
+#ifdef DEBUG_MEMGUARD
+	if (is_memguard_addr(mem))
+		return (0);
+#endif
+
 	mtrash_dtor(mem, size, NULL);
 
 	ksp = (struct malloc_type **)mem;
@@ -192,100 +224,3 @@
 {
 	(void)mtrash_ctor(mem, size, NULL, 0);
 }
-
-#ifdef INVARIANTS
-static uma_slab_t
-uma_dbg_getslab(uma_zone_t zone, void *item)
-{
-	uma_slab_t slab;
-	uma_keg_t keg;
-	uint8_t *mem;
-
-	mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK));
-	if (zone->uz_flags & UMA_ZONE_VTOSLAB) {
-		slab = vtoslab((vm_offset_t)mem);
-	} else {
-		/*
-		 * It is safe to return the slab here even though the
-		 * zone is unlocked because the item's allocation state
-		 * essentially holds a reference.
-		 */
-		ZONE_LOCK(zone);
-		keg = LIST_FIRST(&zone->uz_kegs)->kl_keg;
-		if (keg->uk_flags & UMA_ZONE_HASH)
-			slab = hash_sfind(&keg->uk_hash, mem);
-		else
-			slab = (uma_slab_t)(mem + keg->uk_pgoff);
-		ZONE_UNLOCK(zone);
-	}
-
-	return (slab);
-}
-
-/*
- * Set up the slab's freei data such that uma_dbg_free can function.
- *
- */
-void
-uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item)
-{
-	uma_keg_t keg;
-	int freei;
-
-	if (zone_first_keg(zone) == NULL)
-		return;
-	if (slab == NULL) {
-		slab = uma_dbg_getslab(zone, item);
-		if (slab == NULL) 
-			panic("uma: item %p did not belong to zone %s\n",
-			    item, zone->uz_name);
-	}
-	keg = slab->us_keg;
-	freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
-
-	if (BIT_ISSET(SLAB_SETSIZE, freei, &slab->us_debugfree))
-		panic("Duplicate alloc of %p from zone %p(%s) slab %p(%d)\n",
-		    item, zone, zone->uz_name, slab, freei);
-	BIT_SET_ATOMIC(SLAB_SETSIZE, freei, &slab->us_debugfree);
-
-	return;
-}
-
-/*
- * Verifies freed addresses.  Checks for alignment, valid slab membership
- * and duplicate frees.
- *
- */
-void
-uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item)
-{
-	uma_keg_t keg;
-	int freei;
-
-	if (zone_first_keg(zone) == NULL)
-		return;
-	if (slab == NULL) {
-		slab = uma_dbg_getslab(zone, item);
-		if (slab == NULL) 
-			panic("uma: Freed item %p did not belong to zone %s\n",
-			    item, zone->uz_name);
-	}
-	keg = slab->us_keg;
-	freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
-
-	if (freei >= keg->uk_ipers)
-		panic("Invalid free of %p from zone %p(%s) slab %p(%d)\n",
-		    item, zone, zone->uz_name, slab, freei);
-
-	if (((freei * keg->uk_rsize) + slab->us_data) != item) 
-		panic("Unaligned free of %p from zone %p(%s) slab %p(%d)\n",
-		    item, zone, zone->uz_name, slab, freei);
-
-	if (!BIT_ISSET(SLAB_SETSIZE, freei, &slab->us_debugfree))
-		panic("Duplicate free of %p from zone %p(%s) slab %p(%d)\n",
-		    item, zone, zone->uz_name, slab, freei);
-
-	BIT_CLR_ATOMIC(SLAB_SETSIZE, freei, &slab->us_debugfree);
-}
-
-#endif /* INVARIANTS */
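
The trash_ctor()/trash_dtor() pair kept in uma_dbg.c implements the junk pattern
referenced above: freed items are filled with 0xdeadc0de and the pattern is
verified when the item is handed out again (now panicking under INVARIANTS
instead of only printing).  The standalone userland sketch below reproduces the
same fill-and-check idea; it is an illustration of the technique, not the kernel
code itself.

#include <stdint.h>
#include <stdio.h>

/* Same junk pattern as uma_junk in uma_dbg.c. */
static const uint32_t junk = 0xdeadc0de;

/* Fill a buffer with junk, as trash_dtor() does when an item is freed. */
static void
trash_fill(void *mem, size_t size)
{
	uint32_t *p = mem;
	size_t cnt;

	for (cnt = size / sizeof(junk); cnt > 0; cnt--, p++)
		*p = junk;
}

/* Verify the junk is intact, as trash_ctor() does on reallocation. */
static int
trash_check(const void *mem, size_t size)
{
	const uint32_t *p = mem;
	size_t cnt;

	for (cnt = size / sizeof(junk); cnt > 0; cnt--, p++)
		if (*p != junk) {
			printf("memory modified after free: val=%x @ %p\n",
			    *p, (const void *)p);
			return (-1);
		}
	return (0);
}

int
main(void)
{
	uint32_t buf[16];

	trash_fill(buf, sizeof(buf));
	((unsigned char *)buf)[8] = 0;	/* simulate a write to freed memory */
	if (trash_check(buf, sizeof(buf)) != 0)
		printf("use-after-free detected, as expected\n");
	return (0);
}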

Modified: trunk/sys/vm/uma_dbg.h
===================================================================
--- trunk/sys/vm/uma_dbg.h	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/uma_dbg.h	2020-02-08 19:35:48 UTC (rev 12314)
@@ -25,7 +25,7 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/vm/uma_dbg.h 148078 2005-07-16 09:51:52Z rwatson $
+ * $FreeBSD: stable/11/sys/vm/uma_dbg.h 295221 2016-02-03 22:02:36Z glebius $
  *
  */
 
@@ -50,7 +50,4 @@
 int mtrash_init(void *mem, int size, int flags);
 void mtrash_fini(void *mem, int size);
 
-void uma_dbg_free(uma_zone_t zone, uma_slab_t slab, void *item);
-void uma_dbg_alloc(uma_zone_t zone, uma_slab_t slab, void *item);
-
 #endif /* VM_UMA_DBG_H */

Modified: trunk/sys/vm/uma_int.h
===================================================================
--- trunk/sys/vm/uma_int.h	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/uma_int.h	2020-02-08 19:35:48 UTC (rev 12314)
@@ -25,10 +25,13 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/vm/uma_int.h 316835 2017-04-14 14:11:59Z avg $
+ * $FreeBSD: stable/11/sys/vm/uma_int.h 344363 2019-02-20 14:12:25Z pfg $
  *
  */
 
+#include <sys/_bitset.h>
+#include <sys/_task.h>
+
 /* 
  * This file includes definitions, structures, prototypes, and inlines that
  * should not be used outside of the actual implementation of UMA.
@@ -109,6 +112,8 @@
 #define UMA_SLAB_SHIFT	PAGE_SHIFT	/* Number of bits PAGE_MASK */
 
 #define UMA_BOOT_PAGES		64	/* Pages allocated for startup */
+#define UMA_BOOT_PAGES_ZONES	32	/* Multiplier for pages to reserve */
+					/* if uma_zone > PAGE_SIZE */
 
 /* Max waste percentage before going to off page slab management */
 #define UMA_MAX_WASTE	10
@@ -140,8 +145,8 @@
 
 struct uma_hash {
 	struct slabhead	*uh_slab_hash;	/* Hash table for slabs */
-	int		uh_hashsize;	/* Current size of the hash table */
-	int		uh_hashmask;	/* Mask used during hashing */
+	u_int		uh_hashsize;	/* Current size of the hash table */
+	u_int		uh_hashmask;	/* Mask used during hashing */
 };
 
 /*
@@ -207,7 +212,7 @@
 	vm_offset_t	uk_kva;		/* Zone base KVA */
 	uma_zone_t	uk_slabzone;	/* Slab zone backing us, if OFFPAGE */
 
-	uint16_t	uk_pgoff;	/* Offset to uma_slab struct */
+	uint32_t	uk_pgoff;	/* Offset to uma_slab struct */
 	uint16_t	uk_ppera;	/* pages per allocation from backend */
 	uint16_t	uk_ipers;	/* Items per slab */
 	uint32_t	uk_flags;	/* Internal flags */
@@ -248,17 +253,7 @@
 #define	us_link	us_type._us_link
 #define	us_size	us_type._us_size
 
-/*
- * The slab structure for UMA_ZONE_REFCNT zones for whose items we
- * maintain reference counters in the slab for.
- */
-struct uma_slab_refcnt {
-	struct uma_slab		us_head;	/* slab header data */
-	uint32_t		us_refcnt[0];	/* Actually larger. */
-};
-
 typedef struct uma_slab * uma_slab_t;
-typedef struct uma_slab_refcnt * uma_slabrefcnt_t;
 typedef uma_slab_t (*uma_slaballoc)(uma_zone_t, uma_keg_t, int);
 
 struct uma_klink {
@@ -303,10 +298,12 @@
 	uint16_t	uz_count;	/* Amount of items in full bucket */
 	uint16_t	uz_count_min;	/* Minimal amount of items there */
 
-	/* The next three fields are used to print a rate-limited warnings. */
+	/* The next two fields are used to print rate-limited warnings. */
 	const char	*uz_warning;	/* Warning to print on failure */
 	struct timeval	uz_ratecheck;	/* Warnings rate-limiting */
 
+	struct task	uz_maxaction;	/* Task to run when at limit */
+
 	/*
 	 * This HAS to be the last item because we adjust the zone size
 	 * based on NCPU and then allocate the space for the zones.
@@ -390,7 +387,7 @@
 hash_sfind(struct uma_hash *hash, uint8_t *data)
 {
         uma_slab_t slab;
-        int hval;
+        u_int hval;
 
         hval = UMA_HASH(hash, data);
 
@@ -421,7 +418,7 @@
 
 /*
  * The following two functions may be defined by architecture specific code
- * if they can provide more effecient allocation functions.  This is useful
+ * if they can provide more efficient allocation functions.  This is useful
  * for using direct mapped addresses.
  */
 void *uma_small_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag,
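
The uz_maxaction task added to struct uma_zone above backs the
uma_zone_set_maxaction() call introduced in uma_core.c: the registered handler
is set up as a task (see the TASK_INIT() in uma_zone_set_maxaction()) and fired
from the same allocation paths that call zone_log_warning() when the zone is
full.  A minimal sketch follows; the handler name is made up, and its
(uma_zone_t, int) prototype is an assumption inferred from the task_fn_t cast
rather than quoted from uma.h.

#include <sys/param.h>
#include <sys/systm.h>
#include <vm/uma.h>

/* Hypothetical handler; prototype assumed, see the note above. */
static void
example_zone_full(uma_zone_t zone __unused, int pending __unused)
{

	printf("example zone hit its limit; consider draining caches\n");
}

static void
example_zone_limit(uma_zone_t zone)
{

	uma_zone_set_max(zone, 1024);		/* cap the zone at 1024 items */
	uma_zone_set_maxaction(zone, example_zone_full);
}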

Modified: trunk/sys/vm/vm.h
===================================================================
--- trunk/sys/vm/vm.h	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm.h	2020-02-08 19:35:48 UTC (rev 12314)
@@ -56,7 +56,7 @@
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  *
- * $FreeBSD: stable/10/sys/vm/vm.h 321717 2017-07-30 10:36:20Z kib $
+ * $FreeBSD: stable/11/sys/vm/vm.h 331921 2018-04-03 09:38:53Z kib $
  */
 
 #ifndef VM_H
@@ -79,7 +79,9 @@
 #define	VM_PROT_WRITE		((vm_prot_t) 0x02)
 #define	VM_PROT_EXECUTE		((vm_prot_t) 0x04)
 #define	VM_PROT_COPY		((vm_prot_t) 0x08)	/* copy-on-read */
-#define	VM_PROT_FAULT_LOOKUP	((vm_prot_t) 0x010)
+#define	VM_PROT_PRIV_FLAG	((vm_prot_t) 0x10)
+#define	VM_PROT_FAULT_LOOKUP	VM_PROT_PRIV_FLAG
+#define	VM_PROT_QUICK_NOFAULT	VM_PROT_PRIV_FLAG	/* same to save bits */
 
 #define	VM_PROT_ALL		(VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE)
 #define VM_PROT_RW		(VM_PROT_READ|VM_PROT_WRITE)
@@ -112,8 +114,9 @@
 typedef int boolean_t;
 
 /*
- * The exact set of memory attributes is machine dependent.  However, every
- * machine is required to define VM_MEMATTR_DEFAULT.
+ * The exact set of memory attributes is machine dependent.  However,
+ * every machine is required to define VM_MEMATTR_DEFAULT and
+ * VM_MEMATTR_UNCACHEABLE.
  */
 typedef	char vm_memattr_t;	/* memory attribute codes */
 

Added: trunk/sys/vm/vm_domain.c
===================================================================
--- trunk/sys/vm/vm_domain.c	                        (rev 0)
+++ trunk/sys/vm/vm_domain.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -0,0 +1,401 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 2015 Adrian Chadd <adrian at FreeBSD.org>.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer,
+ *    without modification.
+ * 2. Redistributions in binary form must reproduce at minimum a disclaimer
+ *    similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any
+ *    redistribution must be conditioned upon including a substantially
+ *    similar Disclaimer requirement for further binary redistribution.
+ *
+ * NO WARRANTY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGES.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_domain.c 312714 2017-01-24 19:39:24Z mjg $");
+
+#include "opt_vm.h"
+#include "opt_ddb.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#ifdef VM_NUMA_ALLOC
+#include <sys/proc.h>
+#endif
+#include <sys/queue.h>
+#include <sys/rwlock.h>
+#include <sys/sbuf.h>
+#include <sys/sysctl.h>
+#include <sys/tree.h>
+#include <sys/vmmeter.h>
+#include <sys/seq.h>
+
+#include <ddb/ddb.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_phys.h>
+
+#include <vm/vm_domain.h>
+
+#ifdef VM_NUMA_ALLOC
+static __inline int
+vm_domain_rr_selectdomain(int skip_domain)
+{
+	struct thread *td;
+
+	td = curthread;
+
+	td->td_dom_rr_idx++;
+	td->td_dom_rr_idx %= vm_ndomains;
+
+	/*
+	 * If skip_domain is provided then skip over that
+	 * domain.  This is intended for round robin variants
+	 * which first try a fixed domain.
+	 */
+	if ((skip_domain > -1) && (td->td_dom_rr_idx == skip_domain)) {
+		td->td_dom_rr_idx++;
+		td->td_dom_rr_idx %= vm_ndomains;
+	}
+	return (td->td_dom_rr_idx);
+}
+#endif
+
+/*
+ * This implements a very simple set of VM domain memory allocation
+ * policies and iterators.
+ */
+
+/*
+ * A VM domain policy represents a desired VM domain policy.
+ * Iterators implement searching through VM domains in a specific
+ * order.
+ */
+
+/*
+ * When setting a policy, the caller must establish their own
+ * exclusive write protection for the contents of the domain
+ * policy.
+ */
+int
+vm_domain_policy_init(struct vm_domain_policy *vp)
+{
+
+	bzero(vp, sizeof(*vp));
+	vp->p.policy = VM_POLICY_NONE;
+	vp->p.domain = -1;
+	return (0);
+}
+
+int
+vm_domain_policy_set(struct vm_domain_policy *vp,
+    vm_domain_policy_type_t vt, int domain)
+{
+
+	seq_write_begin(&vp->seq);
+	vp->p.policy = vt;
+	vp->p.domain = domain;
+	seq_write_end(&vp->seq);
+	return (0);
+}
+
+/*
+ * Take a local copy of a policy.
+ *
+ * The destination policy isn't write-barriered; this is used
+ * for doing local copies into something that isn't shared.
+ */
+void
+vm_domain_policy_localcopy(struct vm_domain_policy *dst,
+    const struct vm_domain_policy *src)
+{
+	seq_t seq;
+
+	for (;;) {
+		seq = seq_read(&src->seq);
+		*dst = *src;
+		if (seq_consistent(&src->seq, seq))
+			return;
+	}
+}
+
+/*
+ * Take a write-barrier copy of a policy.
+ *
+ * The destination policy is write-barriered; this is used
+ * for doing copies into policies that may be read by other
+ * threads.
+ */
+void
+vm_domain_policy_copy(struct vm_domain_policy *dst,
+    const struct vm_domain_policy *src)
+{
+	seq_t seq;
+	struct vm_domain_policy d;
+
+	for (;;) {
+		seq = seq_read(&src->seq);
+		d = *src;
+		if (seq_consistent(&src->seq, seq)) {
+			seq_write_begin(&dst->seq);
+			dst->p.domain = d.p.domain;
+			dst->p.policy = d.p.policy;
+			seq_write_end(&dst->seq);
+			return;
+		}
+	}
+}
+
+int
+vm_domain_policy_validate(const struct vm_domain_policy *vp)
+{
+
+	switch (vp->p.policy) {
+	case VM_POLICY_NONE:
+	case VM_POLICY_ROUND_ROBIN:
+	case VM_POLICY_FIRST_TOUCH:
+	case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN:
+		if (vp->p.domain == -1)
+			return (0);
+		return (-1);
+	case VM_POLICY_FIXED_DOMAIN:
+	case VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN:
+#ifdef VM_NUMA_ALLOC
+		if (vp->p.domain >= 0 && vp->p.domain < vm_ndomains)
+			return (0);
+#else
+		if (vp->p.domain == 0)
+			return (0);
+#endif
+		return (-1);
+	default:
+		return (-1);
+	}
+	return (-1);
+}
+
+int
+vm_domain_policy_cleanup(struct vm_domain_policy *vp)
+{
+
+	/* For now, empty */
+	return (0);
+}
+
+int
+vm_domain_iterator_init(struct vm_domain_iterator *vi)
+{
+
+	/* Nothing to do for now */
+	return (0);
+}
+
+/*
+ * Manually setup an iterator with the given details.
+ */
+int
+vm_domain_iterator_set(struct vm_domain_iterator *vi,
+    vm_domain_policy_type_t vt, int domain)
+{
+
+#ifdef VM_NUMA_ALLOC
+	switch (vt) {
+	case VM_POLICY_FIXED_DOMAIN:
+		vi->policy = VM_POLICY_FIXED_DOMAIN;
+		vi->domain = domain;
+		vi->n = 1;
+		break;
+	case VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN:
+		vi->policy = VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN;
+		vi->domain = domain;
+		vi->n = vm_ndomains;
+		break;
+	case VM_POLICY_FIRST_TOUCH:
+		vi->policy = VM_POLICY_FIRST_TOUCH;
+		vi->domain = PCPU_GET(domain);
+		vi->n = 1;
+		break;
+	case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN:
+		vi->policy = VM_POLICY_FIRST_TOUCH_ROUND_ROBIN;
+		vi->domain = PCPU_GET(domain);
+		vi->n = vm_ndomains;
+		break;
+	case VM_POLICY_ROUND_ROBIN:
+	default:
+		vi->policy = VM_POLICY_ROUND_ROBIN;
+		vi->domain = -1;
+		vi->n = vm_ndomains;
+		break;
+	}
+#else
+	vi->domain = 0;
+	vi->n = 1;
+#endif
+	return (0);
+}
+
+/*
+ * Setup an iterator based on the given policy.
+ */
+static inline void
+_vm_domain_iterator_set_policy(struct vm_domain_iterator *vi,
+    const struct vm_domain_policy *vt)
+{
+
+#ifdef VM_NUMA_ALLOC
+	/*
+	 * Initialise the iterator.
+	 *
+	 * For first-touch, the initial domain is set
+	 * via the current thread CPU domain.
+	 *
+	 * For fixed-domain, it's assumed that the
+	 * caller has initialised the specific domain
+	 * it is after.
+	 */
+	switch (vt->p.policy) {
+	case VM_POLICY_FIXED_DOMAIN:
+		vi->policy = vt->p.policy;
+		vi->domain = vt->p.domain;
+		vi->n = 1;
+		break;
+	case VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN:
+		vi->policy = vt->p.policy;
+		vi->domain = vt->p.domain;
+		vi->n = vm_ndomains;
+		break;
+	case VM_POLICY_FIRST_TOUCH:
+		vi->policy = vt->p.policy;
+		vi->domain = PCPU_GET(domain);
+		vi->n = 1;
+		break;
+	case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN:
+		vi->policy = vt->p.policy;
+		vi->domain = PCPU_GET(domain);
+		vi->n = vm_ndomains;
+		break;
+	case VM_POLICY_ROUND_ROBIN:
+	default:
+		/*
+		 * Default to round-robin policy.
+		 */
+		vi->policy = VM_POLICY_ROUND_ROBIN;
+		vi->domain = -1;
+		vi->n = vm_ndomains;
+		break;
+	}
+#else
+	vi->domain = 0;
+	vi->n = 1;
+#endif
+}
+
+void
+vm_domain_iterator_set_policy(struct vm_domain_iterator *vi,
+    const struct vm_domain_policy *vt)
+{
+	seq_t seq;
+	struct vm_domain_policy vt_lcl;
+
+	for (;;) {
+		seq = seq_read(&vt->seq);
+		vt_lcl = *vt;
+		if (seq_consistent(&vt->seq, seq)) {
+			_vm_domain_iterator_set_policy(vi, &vt_lcl);
+			return;
+		}
+	}
+}
+
+/*
+ * Return the next VM domain to use.
+ *
+ * Returns 0 w/ domain set to the next domain to use, or
+ * -1 to indicate no more domains are available.
+ */
+int
+vm_domain_iterator_run(struct vm_domain_iterator *vi, int *domain)
+{
+
+	/* General catch-all */
+	if (vi->n <= 0)
+		return (-1);
+
+#ifdef VM_NUMA_ALLOC
+	switch (vi->policy) {
+	case VM_POLICY_FIXED_DOMAIN:
+	case VM_POLICY_FIRST_TOUCH:
+		*domain = vi->domain;
+		vi->n--;
+		break;
+	case VM_POLICY_FIXED_DOMAIN_ROUND_ROBIN:
+	case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN:
+		/*
+		 * XXX TODO: skip over the rr'ed domain
+		 * if it equals the one we started with.
+		 */
+		if (vi->n == vm_ndomains)
+			*domain = vi->domain;
+		else
+			*domain = vm_domain_rr_selectdomain(vi->domain);
+		vi->n--;
+		break;
+	case VM_POLICY_ROUND_ROBIN:
+	default:
+		*domain = vm_domain_rr_selectdomain(-1);
+		vi->n--;
+		break;
+	}
+#else
+	*domain = 0;
+	vi->n--;
+#endif
+
+	return (0);
+}
+
+/*
+ * Returns 1 if the iteration is done, or 0 if it is not.
+ *
+ * This can only be called after at least one loop through
+ * the iterator.  Ie, it's designed to be used as a tail
+ * check of a loop, not the head check of a loop.
+ */
+int
+vm_domain_iterator_isdone(struct vm_domain_iterator *vi)
+{
+
+	return (vi->n <= 0);
+}
+
+int
+vm_domain_iterator_cleanup(struct vm_domain_iterator *vi)
+{
+
+	return (0);
+}


Property changes on: trunk/sys/vm/vm_domain.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
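
For reference, the policy half of the new vm_domain.c interface above can be
driven as in the following sketch.  It uses only the functions and policy
constants introduced in this file; the helper itself is hypothetical and the
includes mirror the ones at the top of vm_domain.c.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/seq.h>

#include <vm/vm_domain.h>

/*
 * Hypothetical helper: request first-touch placement, falling back to the
 * default policy if the request does not validate.
 */
static void
example_set_first_touch(struct vm_domain_policy *vp)
{
	vm_domain_policy_init(vp);	/* VM_POLICY_NONE, domain -1 */
	vm_domain_policy_set(vp, VM_POLICY_FIRST_TOUCH, -1);

	/*
	 * The NONE, ROUND_ROBIN and FIRST_TOUCH variants must carry domain -1;
	 * only the FIXED_DOMAIN policies take a real domain index (see
	 * vm_domain_policy_validate() above).
	 */
	if (vm_domain_policy_validate(vp) != 0)
		vm_domain_policy_set(vp, VM_POLICY_NONE, -1);
}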
Added: trunk/sys/vm/vm_domain.h
===================================================================
--- trunk/sys/vm/vm_domain.h	                        (rev 0)
+++ trunk/sys/vm/vm_domain.h	2020-02-08 19:35:48 UTC (rev 12314)
@@ -0,0 +1,67 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 2015 Adrian Chadd <adrian at FreeBSD.org>.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer,
+ *    without modification.
+ * 2. Redistributions in binary form must reproduce at minimum a disclaimer
+ *    similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any
+ *    redistribution must be conditioned upon including a substantially
+ *    similar Disclaimer requirement for further binary redistribution.
+ *
+ * NO WARRANTY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY,
+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGES.
+ *
+ * $FreeBSD: stable/11/sys/vm/vm_domain.h 285387 2015-07-11 15:21:37Z adrian $
+ */
+#ifndef	__VM_DOMAIN_H__
+#define	__VM_DOMAIN_H__
+
+#include <sys/_vm_domain.h>
+
+struct vm_domain_iterator {
+	vm_domain_policy_type_t policy;
+	int domain;
+	int n;
+};
+
+/*
+ * TODO: check to see if these should just become inline functions
+ * at some point.
+ */
+extern	int vm_domain_policy_init(struct vm_domain_policy *vp);
+extern	int vm_domain_policy_set(struct vm_domain_policy *vp,
+	    vm_domain_policy_type_t vt, int domain);
+extern	int vm_domain_policy_cleanup(struct vm_domain_policy *vp);
+extern	void vm_domain_policy_localcopy(struct vm_domain_policy *dst,
+	    const struct vm_domain_policy *src);
+extern	void vm_domain_policy_copy(struct vm_domain_policy *dst,
+	    const struct vm_domain_policy *src);
+extern	int vm_domain_policy_validate(const struct vm_domain_policy *vp);
+
+extern	int vm_domain_iterator_init(struct vm_domain_iterator *vi);
+extern	int vm_domain_iterator_set(struct vm_domain_iterator *vi,
+	    vm_domain_policy_type_t vt, int domain);
+extern	void vm_domain_iterator_set_policy(struct vm_domain_iterator *vi,
+	    const struct vm_domain_policy *vt);
+extern	int vm_domain_iterator_run(struct vm_domain_iterator *vi,
+	    int *domain);
+extern	int vm_domain_iterator_isdone(struct vm_domain_iterator *vi);
+extern	int vm_domain_iterator_cleanup(struct vm_domain_iterator *vi);
+
+#endif	/* __VM_DOMAIN_H__ */


Property changes on: trunk/sys/vm/vm_domain.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
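
The iterator half of the interface declared in vm_domain.h above is intended to
be used as a simple "try each candidate domain" loop.  A sketch under stated
assumptions: try_alloc_from() is a stand-in for a real per-domain allocation
attempt and is not part of this commit.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/seq.h>

#include <vm/vm_domain.h>

static void *try_alloc_from(int domain);	/* hypothetical stub */

static void *
example_alloc(void)
{
	struct vm_domain_iterator vi;
	void *ret;
	int domain;

	vm_domain_iterator_init(&vi);
	vm_domain_iterator_set(&vi, VM_POLICY_ROUND_ROBIN, -1);

	ret = NULL;
	do {
		if (vm_domain_iterator_run(&vi, &domain) != 0)
			break;			/* no domains left to try */
		ret = try_alloc_from(domain);
		/*
		 * Per the comment in vm_domain.c, isdone() is a tail check:
		 * it is only meaningful after at least one run().
		 */
	} while (ret == NULL && !vm_domain_iterator_isdone(&vi));

	vm_domain_iterator_cleanup(&vi);
	return (ret);
}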
Modified: trunk/sys/vm/vm_extern.h
===================================================================
--- trunk/sys/vm/vm_extern.h	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_extern.h	2020-02-08 19:35:48 UTC (rev 12314)
@@ -28,7 +28,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)vm_extern.h	8.2 (Berkeley) 1/12/94
- * $FreeBSD: stable/10/sys/vm/vm_extern.h 270920 2014-09-01 07:58:15Z kib $
+ * $FreeBSD: stable/11/sys/vm/vm_extern.h 337262 2018-08-03 15:42:39Z markj $
  */
 
 #ifndef _VM_EXTERN_H_
@@ -41,6 +41,8 @@
 struct vmem;
 
 #ifdef _KERNEL
+struct cdev;
+struct cdevsw;
 
 /* These operate on kernel virtual addresses only. */
 vm_offset_t kva_alloc(vm_size_t);
@@ -64,6 +66,7 @@
 void kmem_unback(vm_object_t, vm_offset_t, vm_size_t);
 
 /* Bootstrapping. */
+void kmem_bootstrap_free(vm_offset_t, vm_size_t);
 vm_map_t kmem_suballoc(vm_map_t, vm_offset_t *, vm_offset_t *, vm_size_t,
     boolean_t);
 void kmem_init(vm_offset_t, vm_offset_t);
@@ -70,7 +73,6 @@
 void kmem_init_zero_region(void);
 void kmeminit(void);
 
-void swapout_procs(int);
 int kernacc(void *, int, int);
 int useracc(void *, int, int);
 int vm_fault(vm_map_t, vm_offset_t, vm_prot_t, int);
@@ -82,10 +84,18 @@
     int fault_flags, vm_page_t *m_hold);
 int vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len,
     vm_prot_t prot, vm_page_t *ma, int max_count);
-int vm_forkproc(struct thread *, struct proc *, struct thread *, struct vmspace *, int);
+int vm_forkproc(struct thread *, struct proc *, struct thread *,
+    struct vmspace *, int);
 void vm_waitproc(struct proc *);
-int vm_mmap(vm_map_t, vm_offset_t *, vm_size_t, vm_prot_t, vm_prot_t, int, objtype_t, void *, vm_ooffset_t);
+int vm_mmap(vm_map_t, vm_offset_t *, vm_size_t, vm_prot_t, vm_prot_t, int,
+    objtype_t, void *, vm_ooffset_t);
+int vm_mmap_object(vm_map_t, vm_offset_t *, vm_size_t, vm_prot_t,
+    vm_prot_t, int, vm_object_t, vm_ooffset_t, boolean_t, struct thread *);
 int vm_mmap_to_errno(int rv);
+int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
+    int *, struct cdev *, struct cdevsw *, vm_ooffset_t *, vm_object_t *);
+int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *, int *,
+    struct vnode *, vm_ooffset_t *, vm_object_t *, boolean_t *);
 void vm_set_page_size(void);
 void vm_sync_icache(vm_map_t, vm_offset_t, vm_size_t);
 typedef int (*pmap_pinit_t)(struct pmap *pmap);
@@ -97,6 +107,7 @@
 struct vmspace *vmspace_acquire_ref(struct proc *);
 void vmspace_free(struct vmspace *);
 void vmspace_exitfree(struct proc *);
+void vmspace_switch_aio(struct vmspace *);
 void vnode_pager_setsize(struct vnode *, vm_ooffset_t);
 int vslock(void *, size_t);
 void vsunlock(void *, size_t);
@@ -104,6 +115,5 @@
 void vm_imgact_unmap_page(struct sf_buf *sf);
 void vm_thread_dispose(struct thread *td);
 int vm_thread_new(struct thread *td, int pages);
-int vm_mlock(struct proc *, struct ucred *, const void *, size_t);
 #endif				/* _KERNEL */
 #endif				/* !_VM_EXTERN_H_ */

Modified: trunk/sys/vm/vm_fault.c
===================================================================
--- trunk/sys/vm/vm_fault.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_fault.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -73,7 +73,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vm_fault.c 329707 2018-02-21 11:31:29Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_fault.c 345572 2019-03-27 11:03:07Z kib $");
 
 #include "opt_ktrace.h"
 #include "opt_vm.h"
@@ -82,7 +82,9 @@
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
+#include <sys/mman.h>
 #include <sys/proc.h>
+#include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/sysctl.h>
@@ -107,14 +109,11 @@
 #define PFBAK 4
 #define PFFOR 4
 
-static int vm_fault_additional_pages(vm_page_t, int, int, vm_page_t *, int *);
-
-#define	VM_FAULT_READ_BEHIND	8
+#define	VM_FAULT_READ_DEFAULT	(1 + VM_FAULT_READ_AHEAD_INIT)
 #define	VM_FAULT_READ_MAX	(1 + VM_FAULT_READ_AHEAD_MAX)
-#define	VM_FAULT_NINCR		(VM_FAULT_READ_MAX / VM_FAULT_READ_BEHIND)
-#define	VM_FAULT_SUM		(VM_FAULT_NINCR * (VM_FAULT_NINCR + 1) / 2)
-#define	VM_FAULT_CACHE_BEHIND	(VM_FAULT_READ_BEHIND * VM_FAULT_SUM)
 
+#define	VM_FAULT_DONTNEED_MIN	1048576
+
 struct faultstate {
 	vm_page_t m;
 	vm_object_t object;
@@ -124,14 +123,15 @@
 	vm_pindex_t first_pindex;
 	vm_map_t map;
 	vm_map_entry_t entry;
-	int lookup_still_valid;
 	int map_generation;
+	bool lookup_still_valid;
 	struct vnode *vp;
 };
 
-static void vm_fault_cache_behind(const struct faultstate *fs, int distance);
+static void vm_fault_dontneed(const struct faultstate *fs, vm_offset_t vaddr,
+	    int ahead);
 static void vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra,
-	    int faultcount, int reqpage);
+	    int backward, int forward, bool obj_locked);
 
 static inline void
 release_page(struct faultstate *fs)
@@ -150,7 +150,7 @@
 
 	if (fs->lookup_still_valid) {
 		vm_map_lookup_done(fs->map, fs->entry);
-		fs->lookup_still_valid = FALSE;
+		fs->lookup_still_valid = false;
 	}
 }
 
@@ -237,14 +237,15 @@
 	 * written NOW so dirty it explicitly to save on
 	 * pmap_is_modified() calls later.
 	 *
-	 * Also tell the backing pager, if any, that it should remove
-	 * any swap backing since the page is now dirty.
+	 * Also, since the page is now dirty, we can possibly tell
+	 * the pager to release any swap backing the page.  Calling
+	 * the pager requires a write lock on the object.
 	 */
 	if (need_dirty)
 		vm_page_dirty(m);
 	if (!set_wd)
 		vm_page_unlock(m);
-	if (need_dirty)
+	else if (need_dirty)
 		vm_pager_page_unswapped(m);
 }
 
@@ -267,8 +268,12 @@
 vm_fault_soft_fast(struct faultstate *fs, vm_offset_t vaddr, vm_prot_t prot,
     int fault_type, int fault_flags, boolean_t wired, vm_page_t *m_hold)
 {
-	vm_page_t m;
-	int rv;
+	vm_page_t m, m_map;
+#if defined(__amd64__) && VM_NRESERVLEVEL > 0
+	vm_page_t m_super;
+	int flags;
+#endif
+	int psind, rv;
 
 	MPASS(fs->vp == NULL);
 	m = vm_page_lookup(fs->first_object, fs->first_pindex);
@@ -276,20 +281,204 @@
 	if (m == NULL || ((prot & VM_PROT_WRITE) != 0 &&
 	    vm_page_busied(m)) || m->valid != VM_PAGE_BITS_ALL)
 		return (KERN_FAILURE);
-	rv = pmap_enter(fs->map->pmap, vaddr, m, prot, fault_type |
-	    PMAP_ENTER_NOSLEEP | (wired ? PMAP_ENTER_WIRED : 0), 0);
+	m_map = m;
+	psind = 0;
+#if defined(__amd64__) && VM_NRESERVLEVEL > 0
+	if ((m->flags & PG_FICTITIOUS) == 0 &&
+	    (m_super = vm_reserv_to_superpage(m)) != NULL &&
+	    rounddown2(vaddr, pagesizes[m_super->psind]) >= fs->entry->start &&
+	    roundup2(vaddr + 1, pagesizes[m_super->psind]) <= fs->entry->end &&
+	    (vaddr & (pagesizes[m_super->psind] - 1)) == (VM_PAGE_TO_PHYS(m) &
+	    (pagesizes[m_super->psind] - 1)) &&
+	    pmap_ps_enabled(fs->map->pmap)) {
+		flags = PS_ALL_VALID;
+		if ((prot & VM_PROT_WRITE) != 0) {
+			/*
+			 * Create a superpage mapping allowing write access
+			 * only if none of the constituent pages are busy and
+			 * all of them are already dirty (except possibly for
+			 * the page that was faulted on).
+			 */
+			flags |= PS_NONE_BUSY;
+			if ((fs->first_object->flags & OBJ_UNMANAGED) == 0)
+				flags |= PS_ALL_DIRTY;
+		}
+		if (vm_page_ps_test(m_super, flags, m)) {
+			m_map = m_super;
+			psind = m_super->psind;
+			vaddr = rounddown2(vaddr, pagesizes[psind]);
+			/* Preset the modified bit for dirty superpages. */
+			if ((flags & PS_ALL_DIRTY) != 0)
+				fault_type |= VM_PROT_WRITE;
+		}
+	}
+#endif
+	rv = pmap_enter(fs->map->pmap, vaddr, m_map, prot, fault_type |
+	    PMAP_ENTER_NOSLEEP | (wired ? PMAP_ENTER_WIRED : 0), psind);
 	if (rv != KERN_SUCCESS)
 		return (rv);
 	vm_fault_fill_hold(m_hold, m);
 	vm_fault_dirty(fs->entry, m, prot, fault_type, fault_flags, false);
+	if (psind == 0 && !wired)
+		vm_fault_prefault(fs, vaddr, PFBAK, PFFOR, true);
 	VM_OBJECT_RUNLOCK(fs->first_object);
-	if (!wired)
-		vm_fault_prefault(fs, vaddr, 0, 0);
 	vm_map_lookup_done(fs->map, fs->entry);
 	curthread->td_ru.ru_minflt++;
 	return (KERN_SUCCESS);
 }
 
+static void
+vm_fault_restore_map_lock(struct faultstate *fs)
+{
+
+	VM_OBJECT_ASSERT_WLOCKED(fs->first_object);
+	MPASS(fs->first_object->paging_in_progress > 0);
+
+	if (!vm_map_trylock_read(fs->map)) {
+		VM_OBJECT_WUNLOCK(fs->first_object);
+		vm_map_lock_read(fs->map);
+		VM_OBJECT_WLOCK(fs->first_object);
+	}
+	fs->lookup_still_valid = true;
+}
+
+static void
+vm_fault_populate_check_page(vm_page_t m)
+{
+
+	/*
+	 * Check each page to ensure that the pager is obeying the
+	 * interface: the page must be installed in the object, fully
+	 * valid, and exclusively busied.
+	 */
+	MPASS(m != NULL);
+	MPASS(m->valid == VM_PAGE_BITS_ALL);
+	MPASS(vm_page_xbusied(m));
+}
+
+static void
+vm_fault_populate_cleanup(vm_object_t object, vm_pindex_t first,
+    vm_pindex_t last)
+{
+	vm_page_t m;
+	vm_pindex_t pidx;
+
+	VM_OBJECT_ASSERT_WLOCKED(object);
+	MPASS(first <= last);
+	for (pidx = first, m = vm_page_lookup(object, pidx);
+	    pidx <= last; pidx++, m = vm_page_next(m)) {
+		vm_fault_populate_check_page(m);
+		vm_page_lock(m);
+		vm_page_deactivate(m);
+		vm_page_unlock(m);
+		vm_page_xunbusy(m);
+	}
+}
+
+static int
+vm_fault_populate(struct faultstate *fs, vm_offset_t vaddr, vm_prot_t prot,
+    int fault_type, int fault_flags, boolean_t wired, vm_page_t *m_hold)
+{
+	vm_page_t m;
+	vm_pindex_t map_first, map_last, pager_first, pager_last, pidx;
+	int rv;
+
+	MPASS(fs->object == fs->first_object);
+	VM_OBJECT_ASSERT_WLOCKED(fs->first_object);
+	MPASS(fs->first_object->paging_in_progress > 0);
+	MPASS(fs->first_object->backing_object == NULL);
+	MPASS(fs->lookup_still_valid);
+
+	pager_first = OFF_TO_IDX(fs->entry->offset);
+	pager_last = pager_first + atop(fs->entry->end - fs->entry->start) - 1;
+	unlock_map(fs);
+	unlock_vp(fs);
+
+	/*
+	 * Call the pager (driver) populate() method.
+	 *
+	 * There is no guarantee that the method will be called again
+	 * if the current fault is for read, and a future fault is
+	 * for write.  Report the entry's maximum allowed protection
+	 * to the driver.
+	 */
+	rv = vm_pager_populate(fs->first_object, fs->first_pindex,
+	    fault_type, fs->entry->max_protection, &pager_first, &pager_last);
+
+	VM_OBJECT_ASSERT_WLOCKED(fs->first_object);
+	if (rv == VM_PAGER_BAD) {
+		/*
+		 * VM_PAGER_BAD is the backdoor for a pager to request
+		 * normal fault handling.
+		 */
+		vm_fault_restore_map_lock(fs);
+		if (fs->map->timestamp != fs->map_generation)
+			return (KERN_RESOURCE_SHORTAGE); /* RetryFault */
+		return (KERN_NOT_RECEIVER);
+	}
+	if (rv != VM_PAGER_OK)
+		return (KERN_FAILURE); /* AKA SIGSEGV */
+
+	/* Ensure that the driver is obeying the interface. */
+	MPASS(pager_first <= pager_last);
+	MPASS(fs->first_pindex <= pager_last);
+	MPASS(fs->first_pindex >= pager_first);
+	MPASS(pager_last < fs->first_object->size);
+
+	vm_fault_restore_map_lock(fs);
+	if (fs->map->timestamp != fs->map_generation) {
+		vm_fault_populate_cleanup(fs->first_object, pager_first,
+		    pager_last);
+		return (KERN_RESOURCE_SHORTAGE); /* RetryFault */
+	}
+
+	/*
+	 * The map is unchanged after our last unlock.  Process the fault.
+	 *
+	 * The range [pager_first, pager_last] that is given to the
+	 * pager is only a hint.  The pager may populate any range
+	 * within the object that includes the requested page index.
+	 * In case the pager expanded the range, clip it to fit into
+	 * the map entry.
+	 */
+	map_first = OFF_TO_IDX(fs->entry->offset);
+	if (map_first > pager_first) {
+		vm_fault_populate_cleanup(fs->first_object, pager_first,
+		    map_first - 1);
+		pager_first = map_first;
+	}
+	map_last = map_first + atop(fs->entry->end - fs->entry->start) - 1;
+	if (map_last < pager_last) {
+		vm_fault_populate_cleanup(fs->first_object, map_last + 1,
+		    pager_last);
+		pager_last = map_last;
+	}
+	for (pidx = pager_first, m = vm_page_lookup(fs->first_object, pidx);
+	    pidx <= pager_last; pidx++, m = vm_page_next(m)) {
+		vm_fault_populate_check_page(m);
+		vm_fault_dirty(fs->entry, m, prot, fault_type, fault_flags,
+		    true);
+		VM_OBJECT_WUNLOCK(fs->first_object);
+		pmap_enter(fs->map->pmap, fs->entry->start + IDX_TO_OFF(pidx) -
+		    fs->entry->offset, m, prot, fault_type | (wired ?
+		    PMAP_ENTER_WIRED : 0), 0);
+		VM_OBJECT_WLOCK(fs->first_object);
+		if (pidx == fs->first_pindex)
+			vm_fault_fill_hold(m_hold, m);
+		vm_page_lock(m);
+		if ((fault_flags & VM_FAULT_WIRE) != 0) {
+			KASSERT(wired, ("VM_FAULT_WIRE && !wired"));
+			vm_page_wire(m);
+		} else {
+			vm_page_activate(m);
+		}
+		vm_page_unlock(m);
+		vm_page_xunbusy(m);
+	}
+	curthread->td_ru.ru_majflt++;
+	return (KERN_SUCCESS);
+}
+
 /*
  *	vm_fault:
  *
@@ -334,21 +523,23 @@
 vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
     int fault_flags, vm_page_t *m_hold)
 {
-	vm_prot_t prot;
-	long ahead, behind;
-	int alloc_req, era, faultcount, nera, reqpage, result;
-	boolean_t dead, is_first_object_locked, wired;
-	vm_object_t next_object;
-	vm_page_t marray[VM_FAULT_READ_MAX];
-	int hardfault;
 	struct faultstate fs;
 	struct vnode *vp;
-	int locked, error;
+	vm_object_t next_object, retry_object;
+	vm_offset_t e_end, e_start;
+	vm_pindex_t retry_pindex;
+	vm_prot_t prot, retry_prot;
+	int ahead, alloc_req, behind, cluster_offset, error, era, faultcount;
+	int locked, nera, result, rv;
+	u_char behavior;
+	boolean_t wired;	/* Passed by reference. */
+	bool dead, hardfault, is_first_object_locked;
 
-	hardfault = 0;
 	PCPU_INC(cnt.v_vm_faults);
 	fs.vp = NULL;
-	faultcount = reqpage = 0;
+	faultcount = 0;
+	nera = -1;
+	hardfault = false;
 
 RetryFault:;
 
@@ -415,10 +606,10 @@
 		    (fs.first_object->type != OBJT_VNODE &&
 		    (fs.first_object->flags & OBJ_TMPFS_NODE) == 0) ||
 		    (fs.first_object->flags & OBJ_MIGHTBEDIRTY) != 0) {
-			result = vm_fault_soft_fast(&fs, vaddr, prot,
-			    fault_type, fault_flags, wired, m_hold);
-			if (result == KERN_SUCCESS)
-				return (result);
+			rv = vm_fault_soft_fast(&fs, vaddr, prot, fault_type,
+			    fault_flags, wired, m_hold);
+			if (rv == KERN_SUCCESS)
+				return (rv);
 		}
 		if (!VM_OBJECT_TRYUPGRADE(fs.first_object)) {
 			VM_OBJECT_RUNLOCK(fs.first_object);
@@ -435,13 +626,12 @@
 	 * they will stay around as well.
 	 *
 	 * Bump the paging-in-progress count to prevent size changes (e.g. 
-	 * truncation operations) during I/O.  This must be done after
-	 * obtaining the vnode lock in order to avoid possible deadlocks.
+	 * truncation operations) during I/O.
 	 */
 	vm_object_reference_locked(fs.first_object);
 	vm_object_pip_add(fs.first_object, 1);
 
-	fs.lookup_still_valid = TRUE;
+	fs.lookup_still_valid = true;
 
 	fs.first_m = NULL;
 
@@ -534,11 +724,13 @@
 				goto readrest;
 			break;
 		}
+		KASSERT(fs.m == NULL, ("fs.m should be NULL, not %p", fs.m));
 
 		/*
-		 * Page is not resident.  If this is the search termination
-		 * or the pager might contain the page, allocate a new page.
-		 * Default objects are zero-fill, there is no real pager.
+		 * Page is not resident.  If the pager might contain the page
+		 * or this is the beginning of the search, allocate a new
+		 * page.  (Default objects are zero-fill, so there is no real
+		 * pager for them.)
 		 */
 		if (fs.object->type != OBJT_DEFAULT ||
 		    fs.object == fs.first_object) {
@@ -547,6 +739,30 @@
 				return (KERN_PROTECTION_FAILURE);
 			}
 
+			if (fs.object == fs.first_object &&
+			    (fs.first_object->flags & OBJ_POPULATE) != 0 &&
+			    fs.first_object->shadow_count == 0) {
+				rv = vm_fault_populate(&fs, vaddr, prot,
+				    fault_type, fault_flags, wired, m_hold);
+				switch (rv) {
+				case KERN_SUCCESS:
+				case KERN_FAILURE:
+					unlock_and_deallocate(&fs);
+					return (rv);
+				case KERN_RESOURCE_SHORTAGE:
+					unlock_and_deallocate(&fs);
+					goto RetryFault;
+				case KERN_NOT_RECEIVER:
+					/*
+					 * Pager's populate() method
+					 * returned VM_PAGER_BAD.
+					 */
+					break;
+				default:
+					panic("inconsistent return codes");
+				}
+			}
+
 			/*
 			 * Allocate a new page for this object/offset pair.
 			 *
@@ -555,14 +771,10 @@
 			 * there, and allocation can fail, causing
 			 * restart and new reading of the p_flag.
 			 */
-			fs.m = NULL;
 			if (!vm_page_count_severe() || P_KILLED(curproc)) {
 #if VM_NRESERVLEVEL > 0
-				if ((fs.object->flags & OBJ_COLORED) == 0) {
-					fs.object->flags |= OBJ_COLORED;
-					fs.object->pg_color = atop(vaddr) -
-					    fs.pindex;
-				}
+				vm_object_color(fs.object, atop(vaddr) -
+				    fs.pindex);
 #endif
 				alloc_req = P_KILLED(curproc) ?
 				    VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL;
@@ -576,80 +788,113 @@
 				unlock_and_deallocate(&fs);
 				VM_WAITPFAULT;
 				goto RetryFault;
-			} else if (fs.m->valid == VM_PAGE_BITS_ALL)
-				break;
+			}
 		}
 
 readrest:
 		/*
-		 * We have found a valid page or we have allocated a new page.
-		 * The page thus may not be valid or may not be entirely 
-		 * valid.
+		 * At this point, we have either allocated a new page or found
+		 * an existing page that is only partially valid.
 		 *
-		 * Attempt to fault-in the page if there is a chance that the
-		 * pager has it, and potentially fault in additional pages
-		 * at the same time.  For default objects simply provide
-		 * zero-filled pages.
+		 * We hold a reference on the current object and the page is
+		 * exclusive busied.
 		 */
-		if (fs.object->type != OBJT_DEFAULT) {
-			int rv;
-			u_char behavior = vm_map_entry_behavior(fs.entry);
 
-			if (behavior == MAP_ENTRY_BEHAV_RANDOM ||
-			    P_KILLED(curproc)) {
-				behind = 0;
-				ahead = 0;
+		/*
+		 * If the pager for the current object might have the page,
+		 * then determine the number of additional pages to read and
+		 * potentially reprioritize previously read pages for earlier
+		 * reclamation.  These operations should only be performed
+		 * once per page fault.  Even if the current pager doesn't
+		 * have the page, the number of additional pages to read will
+		 * apply to subsequent objects in the shadow chain.
+		 */
+		if (fs.object->type != OBJT_DEFAULT && nera == -1 &&
+		    !P_KILLED(curproc)) {
+			KASSERT(fs.lookup_still_valid, ("map unlocked"));
+			era = fs.entry->read_ahead;
+			behavior = vm_map_entry_behavior(fs.entry);
+			if (behavior == MAP_ENTRY_BEHAV_RANDOM) {
+				nera = 0;
 			} else if (behavior == MAP_ENTRY_BEHAV_SEQUENTIAL) {
-				behind = 0;
-				ahead = atop(fs.entry->end - vaddr) - 1;
-				if (ahead > VM_FAULT_READ_AHEAD_MAX)
-					ahead = VM_FAULT_READ_AHEAD_MAX;
-				if (fs.pindex == fs.entry->next_read)
-					vm_fault_cache_behind(&fs,
-					    VM_FAULT_READ_MAX);
-			} else {
+				nera = VM_FAULT_READ_AHEAD_MAX;
+				if (vaddr == fs.entry->next_read)
+					vm_fault_dontneed(&fs, vaddr, nera);
+			} else if (vaddr == fs.entry->next_read) {
 				/*
-				 * If this is a sequential page fault, then
-				 * arithmetically increase the number of pages
-				 * in the read-ahead window.  Otherwise, reset
-				 * the read-ahead window to its smallest size.
+				 * This is a sequential fault.  Arithmetically
+				 * increase the requested number of pages in
+				 * the read-ahead window.  The requested
+				 * number of pages is "# of sequential faults
+				 * x (read ahead min + 1) + read ahead min"
 				 */
-				behind = atop(vaddr - fs.entry->start);
-				if (behind > VM_FAULT_READ_BEHIND)
-					behind = VM_FAULT_READ_BEHIND;
-				ahead = atop(fs.entry->end - vaddr) - 1;
-				era = fs.entry->read_ahead;
-				if (fs.pindex == fs.entry->next_read) {
-					nera = era + behind;
+				nera = VM_FAULT_READ_AHEAD_MIN;
+				if (era > 0) {
+					nera += era + 1;
 					if (nera > VM_FAULT_READ_AHEAD_MAX)
 						nera = VM_FAULT_READ_AHEAD_MAX;
-					behind = 0;
-					if (ahead > nera)
-						ahead = nera;
-					if (era == VM_FAULT_READ_AHEAD_MAX)
-						vm_fault_cache_behind(&fs,
-						    VM_FAULT_CACHE_BEHIND);
-				} else if (ahead > VM_FAULT_READ_AHEAD_MIN)
-					ahead = VM_FAULT_READ_AHEAD_MIN;
-				if (era != ahead)
-					fs.entry->read_ahead = ahead;
+				}
+				if (era == VM_FAULT_READ_AHEAD_MAX)
+					vm_fault_dontneed(&fs, vaddr, nera);
+			} else {
+				/*
+				 * This is a non-sequential fault.
+				 */
+				nera = 0;
 			}
+			if (era != nera) {
+				/*
+				 * A read lock on the map suffices to update
+				 * the read ahead count safely.
+				 */
+				fs.entry->read_ahead = nera;
+			}
 
 			/*
-			 * Call the pager to retrieve the data, if any, after
-			 * releasing the lock on the map.  We hold a ref on
-			 * fs.object and the pages are exclusive busied.
+			 * Prepare for unlocking the map.  Save the map
+			 * entry's start and end addresses, which are used to
+			 * optimize the size of the pager operation below.
+			 * Even if the map entry's addresses change after
+			 * unlocking the map, using the saved addresses is
+			 * safe.
 			 */
+			e_start = fs.entry->start;
+			e_end = fs.entry->end;
+		}
+
+		/*
+		 * Call the pager to retrieve the page if there is a chance
+		 * that the pager has it, and potentially retrieve additional
+		 * pages at the same time.
+		 */
+		if (fs.object->type != OBJT_DEFAULT) {
+			/*
+			 * Release the map lock before locking the vnode or
+			 * sleeping in the pager.  (If the current object has
+			 * a shadow, then an earlier iteration of this loop
+			 * may have already unlocked the map.)
+			 */
 			unlock_map(&fs);
 
 			if (fs.object->type == OBJT_VNODE &&
 			    (vp = fs.object->handle) != fs.vp) {
+				/*
+				 * Perform an unlock in case the desired vnode
+				 * changed while the map was unlocked during a
+				 * retry.
+				 */
 				unlock_vp(&fs);
+
 				locked = VOP_ISLOCKED(vp);
-
 				if (locked != LK_EXCLUSIVE)
 					locked = LK_SHARED;
-				/* Do not sleep for vnode lock while fs.m is busy */
+
+				/*
+				 * We must not sleep acquiring the vnode lock
+				 * while we have the page exclusive busied or
+				 * the object's paging-in-progress count
+				 * incremented.  Otherwise, we could deadlock.
+				 */
 				error = vget(vp, locked | LK_CANRECURSE |
 				    LK_NOWAIT, curthread);
 				if (error != 0) {
@@ -670,88 +915,85 @@
 			    ("vm_fault: vnode-backed object mapped by system map"));
 
 			/*
-			 * now we find out if any other pages should be paged
-			 * in at this time this routine checks to see if the
-			 * pages surrounding this fault reside in the same
-			 * object as the page for this fault.  If they do,
-			 * then they are faulted in also into the object.  The
-			 * array "marray" returned contains an array of
-			 * vm_page_t structs where one of them is the
-			 * vm_page_t passed to the routine.  The reqpage
-			 * return value is the index into the marray for the
-			 * vm_page_t passed to the routine.
-			 *
-			 * fs.m plus the additional pages are exclusive busied.
+			 * Page in the requested page and hint the pager,
+			 * that it may bring up surrounding pages.
 			 */
-			faultcount = vm_fault_additional_pages(
-			    fs.m, behind, ahead, marray, &reqpage);
-
-			rv = faultcount ?
-			    vm_pager_get_pages(fs.object, marray, faultcount,
-				reqpage) : VM_PAGER_FAIL;
-
+			if (nera == -1 || behavior == MAP_ENTRY_BEHAV_RANDOM ||
+			    P_KILLED(curproc)) {
+				behind = 0;
+				ahead = 0;
+			} else {
+				/* Is this a sequential fault? */
+				if (nera > 0) {
+					behind = 0;
+					ahead = nera;
+				} else {
+					/*
+					 * Request a cluster of pages that is
+					 * aligned to a VM_FAULT_READ_DEFAULT
+					 * page offset boundary within the
+					 * object.  Alignment to a page offset
+					 * boundary is more likely to coincide
+					 * with the underlying file system
+					 * block than alignment to a virtual
+					 * address boundary.
+					 */
+					cluster_offset = fs.pindex %
+					    VM_FAULT_READ_DEFAULT;
+					behind = ulmin(cluster_offset,
+					    atop(vaddr - e_start));
+					ahead = VM_FAULT_READ_DEFAULT - 1 -
+					    cluster_offset;
+				}
+				ahead = ulmin(ahead, atop(e_end - vaddr) - 1);
+			}
+			rv = vm_pager_get_pages(fs.object, &fs.m, 1,
+			    &behind, &ahead);
 			if (rv == VM_PAGER_OK) {
-				/*
-				 * Found the page. Leave it busy while we play
-				 * with it.
-				 */
-
-				/*
-				 * Relookup in case pager changed page. Pager
-				 * is responsible for disposition of old page
-				 * if moved.
-				 */
-				fs.m = vm_page_lookup(fs.object, fs.pindex);
-				if (!fs.m) {
-					unlock_and_deallocate(&fs);
-					goto RetryFault;
-				}
-
-				hardfault++;
+				faultcount = behind + 1 + ahead;
+				hardfault = true;
 				break; /* break to PAGE HAS BEEN FOUND */
 			}
-			/*
-			 * Remove the bogus page (which does not exist at this
-			 * object/offset); before doing so, we must get back
-			 * our object lock to preserve our invariant.
-			 *
-			 * Also wake up any other process that may want to bring
-			 * in this page.
-			 *
-			 * If this is the top-level object, we must leave the
-			 * busy page to prevent another process from rushing
-			 * past us, and inserting the page in that object at
-			 * the same time that we are.
-			 */
 			if (rv == VM_PAGER_ERROR)
 				printf("vm_fault: pager read error, pid %d (%s)\n",
 				    curproc->p_pid, curproc->p_comm);
+
 			/*
-			 * Data outside the range of the pager or an I/O error
+			 * If an I/O error occurred or the requested page was
+			 * outside the range of the pager, clean up and return
+			 * an error.
 			 */
-			/*
-			 * XXX - the check for kernel_map is a kludge to work
-			 * around having the machine panic on a kernel space
-			 * fault w/ I/O error.
-			 */
-			if (((fs.map != kernel_map) && (rv == VM_PAGER_ERROR)) ||
-				(rv == VM_PAGER_BAD)) {
+			if (rv == VM_PAGER_ERROR || rv == VM_PAGER_BAD) {
 				vm_page_lock(fs.m);
-				vm_page_free(fs.m);
+				if (fs.m->wire_count == 0)
+					vm_page_free(fs.m);
+				else
+					vm_page_xunbusy_maybelocked(fs.m);
 				vm_page_unlock(fs.m);
 				fs.m = NULL;
 				unlock_and_deallocate(&fs);
-				return ((rv == VM_PAGER_ERROR) ? KERN_FAILURE : KERN_PROTECTION_FAILURE);
+				return (rv == VM_PAGER_ERROR ? KERN_FAILURE :
+				    KERN_PROTECTION_FAILURE);
 			}
+
+			/*
+			 * The requested page does not exist at this object/
+			 * offset.  Remove the invalid page from the object,
+			 * waking up anyone waiting for it, and continue on to
+			 * the next object.  However, if this is the top-level
+			 * object, we must leave the busy page in place to
+			 * prevent another process from rushing past us, and
+			 * inserting the page in that object at the same time
+			 * that we are.
+			 */
 			if (fs.object != fs.first_object) {
 				vm_page_lock(fs.m);
-				vm_page_free(fs.m);
+				if (fs.m->wire_count == 0)
+					vm_page_free(fs.m);
+				else
+					vm_page_xunbusy_maybelocked(fs.m);
 				vm_page_unlock(fs.m);
 				fs.m = NULL;
-				/*
-				 * XXX - we cannot just fall out at this
-				 * point, m has been freed and is invalid!
-				 */
 			}
 		}
 
@@ -766,7 +1008,6 @@
 		 * Move on to the next object.  Lock the next object before
 		 * unlocking the current one.
 		 */
-		fs.pindex += OFF_TO_IDX(fs.object->backing_object_offset);
 		next_object = fs.object->backing_object;
 		if (next_object == NULL) {
 			/*
@@ -804,6 +1045,8 @@
 			vm_object_pip_add(next_object, 1);
 			if (fs.object != fs.first_object)
 				vm_object_pip_wakeup(fs.object);
+			fs.pindex +=
+			    OFF_TO_IDX(fs.object->backing_object_offset);
 			VM_OBJECT_WUNLOCK(fs.object);
 			fs.object = next_object;
 		}
@@ -836,7 +1079,7 @@
 			 * dirty in the first object so that it will go out 
 			 * to swap when needed.
 			 */
-			is_first_object_locked = FALSE;
+			is_first_object_locked = false;
 			if (
 				/*
 				 * Only one shadow object
@@ -860,22 +1103,15 @@
 				 * We don't chase down the shadow chain
 				 */
 			    fs.object == fs.first_object->backing_object) {
-				/*
-				 * get rid of the unnecessary page
-				 */
+				vm_page_lock(fs.m);
+				vm_page_remove(fs.m);
+				vm_page_unlock(fs.m);
 				vm_page_lock(fs.first_m);
+				vm_page_replace_checked(fs.m, fs.first_object,
+				    fs.first_pindex, fs.first_m);
 				vm_page_free(fs.first_m);
 				vm_page_unlock(fs.first_m);
-				/*
-				 * grab the page and put it into the 
-				 * process'es object.  The page is 
-				 * automatically made dirty.
-				 */
-				if (vm_page_rename(fs.m, fs.first_object,
-				    fs.first_pindex)) {
-					unlock_and_deallocate(&fs);
-					goto RetryFault;
-				}
+				vm_page_dirty(fs.m);
 #if VM_NRESERVLEVEL > 0
 				/*
 				 * Rename the reservation.
@@ -884,6 +1120,10 @@
 				    fs.object, OFF_TO_IDX(
 				    fs.first_object->backing_object_offset));
 #endif
+				/*
+				 * Removing the page from the backing object
+				 * unbusied it.
+				 */
 				vm_page_xbusy(fs.m);
 				fs.first_m = fs.m;
 				fs.m = NULL;
@@ -905,7 +1145,7 @@
 					vm_page_unlock(fs.first_m);
 					
 					vm_page_lock(fs.m);
-					vm_page_unwire(fs.m, FALSE);
+					vm_page_unwire(fs.m, PQ_INACTIVE);
 					vm_page_unlock(fs.m);
 				}
 				/*
@@ -939,16 +1179,12 @@
 	 * lookup.
 	 */
 	if (!fs.lookup_still_valid) {
-		vm_object_t retry_object;
-		vm_pindex_t retry_pindex;
-		vm_prot_t retry_prot;
-
 		if (!vm_map_trylock_read(fs.map)) {
 			release_page(&fs);
 			unlock_and_deallocate(&fs);
 			goto RetryFault;
 		}
-		fs.lookup_still_valid = TRUE;
+		fs.lookup_still_valid = true;
 		if (fs.map->timestamp != fs.map_generation) {
 			result = vm_map_lookup_locked(&fs.map, vaddr, fault_type,
 			    &fs.entry, &retry_object, &retry_pindex, &retry_prot, &wired);
@@ -986,20 +1222,23 @@
 			 * write-enabled after all.
 			 */
 			prot &= retry_prot;
+			fault_type &= retry_prot;
+			if (prot == 0) {
+				release_page(&fs);
+				unlock_and_deallocate(&fs);
+				goto RetryFault;
+			}
 		}
 	}
+
 	/*
-	 * If the page was filled by a pager, update the map entry's
-	 * last read offset.  Since the pager does not return the
-	 * actual set of pages that it read, this update is based on
-	 * the requested set.  Typically, the requested and actual
-	 * sets are the same.
-	 *
-	 * XXX The following assignment modifies the map
-	 * without holding a write lock on it.
+	 * If the page was filled by a pager, save the virtual address that
+	 * should be faulted on next under a sequential access pattern to the
+	 * map entry.  A read lock on the map suffices to update this address
+	 * safely.
 	 */
 	if (hardfault)
-		fs.entry->next_read = fs.pindex + faultcount - reqpage;
+		fs.entry->next_read = vaddr + ptoa(ahead) + PAGE_SIZE;
 
 	vm_fault_dirty(fs.entry, fs.m, prot, fault_type, fault_flags, true);
 	vm_page_assert_xbusied(fs.m);
@@ -1022,7 +1261,9 @@
 	    fault_type | (wired ? PMAP_ENTER_WIRED : 0), 0);
 	if (faultcount != 1 && (fault_flags & VM_FAULT_WIRE) == 0 &&
 	    wired == 0)
-		vm_fault_prefault(&fs, vaddr, faultcount, reqpage);
+		vm_fault_prefault(&fs, vaddr,
+		    faultcount > 0 ? behind : PFBAK,
+		    faultcount > 0 ? ahead : PFFOR, false);
 	VM_OBJECT_WLOCK(fs.object);
 	vm_page_lock(fs.m);
 
@@ -1049,6 +1290,21 @@
 	if (hardfault) {
 		PCPU_INC(cnt.v_io_faults);
 		curthread->td_ru.ru_majflt++;
+#ifdef RACCT
+		if (racct_enable && fs.object->type == OBJT_VNODE) {
+			PROC_LOCK(curproc);
+			if ((fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) {
+				racct_add_force(curproc, RACCT_WRITEBPS,
+				    PAGE_SIZE + behind * PAGE_SIZE);
+				racct_add_force(curproc, RACCT_WRITEIOPS, 1);
+			} else {
+				racct_add_force(curproc, RACCT_READBPS,
+				    PAGE_SIZE + ahead * PAGE_SIZE);
+				racct_add_force(curproc, RACCT_READIOPS, 1);
+			}
+			PROC_UNLOCK(curproc);
+		}
+#endif
 	} else 
 		curthread->td_ru.ru_minflt++;
 
@@ -1056,15 +1312,26 @@
 }
 
 /*
- * Speed up the reclamation of up to "distance" pages that precede the
- * faulting pindex within the first object of the shadow chain.
+ * Speed up the reclamation of pages that precede the faulting pindex within
+ * the first object of the shadow chain.  Essentially, perform the equivalent
+ * to madvise(..., MADV_DONTNEED) on a large cluster of pages that precedes
+ * the faulting pindex by the cluster size when the pages read by vm_fault()
+ * cross a cluster-size boundary.  The cluster size is the greater of the
+ * smallest superpage size and VM_FAULT_DONTNEED_MIN.
+ *
+ * When "fs->first_object" is a shadow object, the pages in the backing object
+ * that precede the faulting pindex are deactivated by vm_fault().  So, this
+ * function must only be concerned with pages in the first object.
  */
 static void
-vm_fault_cache_behind(const struct faultstate *fs, int distance)
+vm_fault_dontneed(const struct faultstate *fs, vm_offset_t vaddr, int ahead)
 {
+	vm_map_entry_t entry;
 	vm_object_t first_object, object;
-	vm_page_t m, m_prev;
-	vm_pindex_t pindex;
+	vm_offset_t end, start;
+	vm_page_t m, m_next;
+	vm_pindex_t pend, pstart;
+	vm_size_t size;
 
 	object = fs->object;
 	VM_OBJECT_ASSERT_WLOCKED(object);
@@ -1076,32 +1343,44 @@
 			VM_OBJECT_WLOCK(object);
 		}
 	}
-	/* Neither fictitious nor unmanaged pages can be cached. */
+	/* Neither fictitious nor unmanaged pages can be reclaimed. */
 	if ((first_object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0) {
-		if (fs->first_pindex < distance)
-			pindex = 0;
-		else
-			pindex = fs->first_pindex - distance;
-		if (pindex < OFF_TO_IDX(fs->entry->offset))
-			pindex = OFF_TO_IDX(fs->entry->offset);
-		m = first_object != object ? fs->first_m : fs->m;
-		vm_page_assert_xbusied(m);
-		m_prev = vm_page_prev(m);
-		while ((m = m_prev) != NULL && m->pindex >= pindex &&
-		    m->valid == VM_PAGE_BITS_ALL) {
-			m_prev = vm_page_prev(m);
-			if (vm_page_busied(m))
-				continue;
-			vm_page_lock(m);
-			if (m->hold_count == 0 && m->wire_count == 0) {
-				pmap_remove_all(m);
-				vm_page_aflag_clear(m, PGA_REFERENCED);
-				if (m->dirty != 0)
-					vm_page_deactivate(m);
-				else
-					vm_page_cache(m);
+		size = VM_FAULT_DONTNEED_MIN;
+		if (MAXPAGESIZES > 1 && size < pagesizes[1])
+			size = pagesizes[1];
+		end = rounddown2(vaddr, size);
+		if (vaddr - end >= size - PAGE_SIZE - ptoa(ahead) &&
+		    (entry = fs->entry)->start < end) {
+			if (end - entry->start < size)
+				start = entry->start;
+			else
+				start = end - size;
+			pmap_advise(fs->map->pmap, start, end, MADV_DONTNEED);
+			pstart = OFF_TO_IDX(entry->offset) + atop(start -
+			    entry->start);
+			m_next = vm_page_find_least(first_object, pstart);
+			pend = OFF_TO_IDX(entry->offset) + atop(end -
+			    entry->start);
+			while ((m = m_next) != NULL && m->pindex < pend) {
+				m_next = TAILQ_NEXT(m, listq);
+				if (m->valid != VM_PAGE_BITS_ALL ||
+				    vm_page_busied(m))
+					continue;
+
+				/*
+				 * Don't clear PGA_REFERENCED, since it would
+				 * likely represent a reference by a different
+				 * process.
+				 *
+				 * Typically, at this point, prefetched pages
+				 * are still in the inactive queue.  Only
+				 * pages that triggered page faults are in the
+				 * active queue.
+				 */
+				vm_page_lock(m);
+				vm_page_deactivate(m);
+				vm_page_unlock(m);
 			}
-			vm_page_unlock(m);
 		}
 	}
 	if (first_object != object)
@@ -1116,7 +1395,7 @@
  */
 static void
 vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra,
-    int faultcount, int reqpage)
+    int backward, int forward, bool obj_locked)
 {
 	pmap_t pmap;
 	vm_map_entry_t entry;
@@ -1124,19 +1403,12 @@
 	vm_offset_t addr, starta;
 	vm_pindex_t pindex;
 	vm_page_t m;
-	int backward, forward, i;
+	int i;
 
 	pmap = fs->map->pmap;
 	if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace))
 		return;
 
-	if (faultcount > 0) {
-		backward = reqpage;
-		forward = faultcount - reqpage - 1;
-	} else {
-		backward = PFBAK;
-		forward = PFFOR;
-	}
 	entry = fs->entry;
 
 	if (addra < backward * PAGE_SIZE) {
@@ -1169,7 +1441,8 @@
 
 		pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT;
 		lobject = entry->object.vm_object;
-		VM_OBJECT_RLOCK(lobject);
+		if (!obj_locked)
+			VM_OBJECT_RLOCK(lobject);
 		while ((m = vm_page_lookup(lobject, pindex)) == NULL &&
 		    lobject->type == OBJT_DEFAULT &&
 		    (backing_object = lobject->backing_object) != NULL) {
@@ -1177,17 +1450,20 @@
 			    0, ("vm_fault_prefault: unaligned object offset"));
 			pindex += lobject->backing_object_offset >> PAGE_SHIFT;
 			VM_OBJECT_RLOCK(backing_object);
-			VM_OBJECT_RUNLOCK(lobject);
+			if (!obj_locked || lobject != entry->object.vm_object)
+				VM_OBJECT_RUNLOCK(lobject);
 			lobject = backing_object;
 		}
 		if (m == NULL) {
-			VM_OBJECT_RUNLOCK(lobject);
+			if (!obj_locked || lobject != entry->object.vm_object)
+				VM_OBJECT_RUNLOCK(lobject);
 			break;
 		}
 		if (m->valid == VM_PAGE_BITS_ALL &&
 		    (m->flags & PG_FICTITIOUS) == 0)
 			pmap_enter_quick(pmap, addr, m, entry->protection);
-		VM_OBJECT_RUNLOCK(lobject);
+		if (!obj_locked || lobject != entry->object.vm_object)
+			VM_OBJECT_RUNLOCK(lobject);
 	}
 }
 
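Note (not part of the diff): vm_fault_prefault() now receives its window from the caller, which passes the pages the pager actually read on a hard fault and fixed defaults otherwise.  A minimal userland sketch of that caller-side selection, with SK_PFBAK/SK_PFFOR as illustrative stand-ins for the real PFBAK/PFFOR constants:

/*
 * Userland sketch only: mirrors the "faultcount > 0 ? behind : PFBAK"
 * selection made by the vm_fault_hold() caller above.  The window
 * values are made up for illustration.
 */
#include <stdio.h>

#define	SK_PFBAK	4	/* illustrative default backward window */
#define	SK_PFFOR	4	/* illustrative default forward window */

static void
prefault_sketch(int backward, int forward)
{
	printf("prefault %d pages back, %d pages forward\n",
	    backward, forward);
}

int
main(void)
{
	int faultcount = 11, behind = 3, ahead = 7;	/* example pager result */

	/* Hard fault: prefault the cluster the pager just brought in. */
	prefault_sketch(faultcount > 0 ? behind : SK_PFBAK,
	    faultcount > 0 ? ahead : SK_PFFOR);

	/* Soft fault: no pager I/O was done, fall back to the defaults. */
	faultcount = 0;
	prefault_sketch(faultcount > 0 ? behind : SK_PFBAK,
	    faultcount > 0 ? ahead : SK_PFFOR);
	return (0);
}
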
@@ -1252,7 +1528,18 @@
 		 * page was mapped at the specified virtual address or that
 		 * mapping had insufficient permissions.  Attempt to fault in
 		 * and hold these pages.
+		 *
+		 * If vm_fault_disable_pagefaults() was called,
+		 * i.e., TDP_NOFAULTING is set, we must not sleep nor
+		 * acquire MD VM locks, which means we must not call
+		 * vm_fault_hold().  Some (out of tree) callers mark
+		 * too wide a code area with vm_fault_disable_pagefaults()
+		 * already, use the VM_PROT_QUICK_NOFAULT flag to request
+		 * the proper behaviour explicitly.
 		 */
+		if ((prot & VM_PROT_QUICK_NOFAULT) != 0 &&
+		    (curthread->td_pflags & TDP_NOFAULTING) != 0)
+			goto error;
 		for (mp = ma, va = addr; va < end; mp++, va += PAGE_SIZE)
 			if (*mp == NULL && vm_fault_hold(map, va, prot,
 			    VM_FAULT_NORMAL, mp) != KERN_SUCCESS)
@@ -1315,11 +1602,12 @@
 		 * actually shadow anything - we copy the pages directly.)
 		 */
 		dst_object = vm_object_allocate(OBJT_DEFAULT,
-		    OFF_TO_IDX(dst_entry->end - dst_entry->start));
+		    atop(dst_entry->end - dst_entry->start));
 #if VM_NRESERVLEVEL > 0
 		dst_object->flags |= OBJ_COLORED;
 		dst_object->pg_color = atop(dst_entry->start);
 #endif
+		dst_object->charge = dst_entry->end - dst_entry->start;
 	}
 
 	VM_OBJECT_WLOCK(dst_object);
@@ -1328,7 +1616,6 @@
 	if (src_object != dst_object) {
 		dst_entry->object.vm_object = dst_object;
 		dst_entry->offset = 0;
-		dst_object->charge = dst_entry->end - dst_entry->start;
 	}
 	if (fork_charge != NULL) {
 		KASSERT(dst_entry->cred == NULL,
@@ -1336,7 +1623,9 @@
 		dst_object->cred = curthread->td_ucred;
 		crhold(dst_object->cred);
 		*fork_charge += dst_object->charge;
-	} else if (dst_object->cred == NULL) {
+	} else if ((dst_object->type == OBJT_DEFAULT ||
+	    dst_object->type == OBJT_SWAP) &&
+	    dst_object->cred == NULL) {
 		KASSERT(dst_entry->cred != NULL, ("no cred for entry %p",
 		    dst_entry));
 		dst_object->cred = dst_entry->cred;
@@ -1361,7 +1650,7 @@
 	 * range, copying each page from the source object to the
 	 * destination object.  Since the source is wired, those pages
 	 * must exist.  In contrast, the destination is pageable.
-	 * Since the destination object does share any backing storage
+	 * Since the destination object doesn't share any backing storage
 	 * with the source object, all of its pages must be dirtied,
 	 * regardless of whether they can be written.
 	 */
@@ -1417,15 +1706,19 @@
 			}
 			pmap_copy_page(src_m, dst_m);
 			VM_OBJECT_RUNLOCK(object);
-			dst_m->valid = VM_PAGE_BITS_ALL;
-			dst_m->dirty = VM_PAGE_BITS_ALL;
+			dst_m->dirty = dst_m->valid = src_m->valid;
 		} else {
 			dst_m = src_m;
 			if (vm_page_sleep_if_busy(dst_m, "fltupg"))
 				goto again;
+			if (dst_m->pindex >= dst_object->size)
+				/*
+				 * We are upgrading.  Index can occur
+				 * out of bounds if the object type is
+				 * vnode and the file was truncated.
+				 */
+				break;
 			vm_page_xbusy(dst_m);
-			KASSERT(dst_m->valid == VM_PAGE_BITS_ALL,
-			    ("invalid dst page %p", dst_m));
 		}
 		VM_OBJECT_WUNLOCK(dst_object);
 
@@ -1433,9 +1726,18 @@
 		 * Enter it in the pmap. If a wired, copy-on-write
 		 * mapping is being replaced by a write-enabled
 		 * mapping, then wire that new mapping.
+		 *
+		 * The page can be invalid if the user called
+		 * msync(MS_INVALIDATE) or truncated the backing vnode
+		 * or shared memory object.  In this case, do not
+		 * insert it into pmap, but still do the copy so that
+		 * all copies of the wired map entry have similar
+		 * backing pages.
 		 */
-		pmap_enter(dst_map->pmap, vaddr, dst_m, prot,
-		    access | (upgrade ? PMAP_ENTER_WIRED : 0), 0);
+		if (dst_m->valid == VM_PAGE_BITS_ALL) {
+			pmap_enter(dst_map->pmap, vaddr, dst_m, prot,
+			    access | (upgrade ? PMAP_ENTER_WIRED : 0), 0);
+		}
 
 		/*
 		 * Mark it no longer busy, and put it on the active list.
@@ -1445,7 +1747,7 @@
 		if (upgrade) {
 			if (src_m != dst_m) {
 				vm_page_lock(src_m);
-				vm_page_unwire(src_m, 0);
+				vm_page_unwire(src_m, PQ_INACTIVE);
 				vm_page_unlock(src_m);
 				vm_page_lock(dst_m);
 				vm_page_wire(dst_m);
@@ -1468,134 +1770,7 @@
 	}
 }
 
-
 /*
- * This routine checks around the requested page for other pages that
- * might be able to be faulted in.  This routine brackets the viable
- * pages for the pages to be paged in.
- *
- * Inputs:
- *	m, rbehind, rahead
- *
- * Outputs:
- *  marray (array of vm_page_t), reqpage (index of requested page)
- *
- * Return value:
- *  number of pages in marray
- */
-static int
-vm_fault_additional_pages(m, rbehind, rahead, marray, reqpage)
-	vm_page_t m;
-	int rbehind;
-	int rahead;
-	vm_page_t *marray;
-	int *reqpage;
-{
-	int i,j;
-	vm_object_t object;
-	vm_pindex_t pindex, startpindex, endpindex, tpindex;
-	vm_page_t rtm;
-	int cbehind, cahead;
-
-	VM_OBJECT_ASSERT_WLOCKED(m->object);
-
-	object = m->object;
-	pindex = m->pindex;
-	cbehind = cahead = 0;
-
-	/*
-	 * if the requested page is not available, then give up now
-	 */
-	if (!vm_pager_has_page(object, pindex, &cbehind, &cahead)) {
-		return 0;
-	}
-
-	if ((cbehind == 0) && (cahead == 0)) {
-		*reqpage = 0;
-		marray[0] = m;
-		return 1;
-	}
-
-	if (rahead > cahead) {
-		rahead = cahead;
-	}
-
-	if (rbehind > cbehind) {
-		rbehind = cbehind;
-	}
-
-	/*
-	 * scan backward for the read behind pages -- in memory 
-	 */
-	if (pindex > 0) {
-		if (rbehind > pindex) {
-			rbehind = pindex;
-			startpindex = 0;
-		} else {
-			startpindex = pindex - rbehind;
-		}
-
-		if ((rtm = TAILQ_PREV(m, pglist, listq)) != NULL &&
-		    rtm->pindex >= startpindex)
-			startpindex = rtm->pindex + 1;
-
-		/* tpindex is unsigned; beware of numeric underflow. */
-		for (i = 0, tpindex = pindex - 1; tpindex >= startpindex &&
-		    tpindex < pindex; i++, tpindex--) {
-
-			rtm = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL |
-			    VM_ALLOC_IFNOTCACHED);
-			if (rtm == NULL) {
-				/*
-				 * Shift the allocated pages to the
-				 * beginning of the array.
-				 */
-				for (j = 0; j < i; j++) {
-					marray[j] = marray[j + tpindex + 1 -
-					    startpindex];
-				}
-				break;
-			}
-
-			marray[tpindex - startpindex] = rtm;
-		}
-	} else {
-		startpindex = 0;
-		i = 0;
-	}
-
-	marray[i] = m;
-	/* page offset of the required page */
-	*reqpage = i;
-
-	tpindex = pindex + 1;
-	i++;
-
-	/*
-	 * scan forward for the read ahead pages
-	 */
-	endpindex = tpindex + rahead;
-	if ((rtm = TAILQ_NEXT(m, listq)) != NULL && rtm->pindex < endpindex)
-		endpindex = rtm->pindex;
-	if (endpindex > object->size)
-		endpindex = object->size;
-
-	for (; tpindex < endpindex; i++, tpindex++) {
-
-		rtm = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL |
-		    VM_ALLOC_IFNOTCACHED);
-		if (rtm == NULL) {
-			break;
-		}
-
-		marray[i] = rtm;
-	}
-
-	/* return number of pages */
-	return i;
-}
-
-/*
  * Block entry into the machine-independent layer's page fault handler by
  * the calling thread.  Subsequent calls to vm_fault() by that thread will
  * return KERN_PROTECTION_FAILURE.  Enable machine-dependent handling of

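Note (not part of the diff): the read-ahead rework above replaces vm_fault_additional_pages() with behind/ahead hints passed directly to vm_pager_get_pages().  For a non-sequential fault the cluster is aligned to a cluster-sized page offset boundary within the object and then clipped to the map entry.  A small userland sketch of that arithmetic, with SK_CLUSTER standing in for VM_FAULT_READ_DEFAULT and made-up entry distances:

/*
 * Userland sketch only: the cluster size and indices below are example
 * values, and ulmin_sk() stands in for the kernel's ulmin().
 */
#include <stdio.h>

#define	SK_CLUSTER	16	/* stand-in for VM_FAULT_READ_DEFAULT */

static unsigned long
ulmin_sk(unsigned long a, unsigned long b)
{
	return (a < b ? a : b);
}

int
main(void)
{
	unsigned long pindex = 37;		/* faulting page index */
	unsigned long pages_to_entry_start = 37;	/* atop(vaddr - e_start) */
	unsigned long pages_to_entry_end = 100;	/* atop(e_end - vaddr) */
	unsigned long cluster_offset, behind, ahead;

	/* Align the request to a cluster-sized page offset boundary. */
	cluster_offset = pindex % SK_CLUSTER;
	behind = ulmin_sk(cluster_offset, pages_to_entry_start);
	ahead = SK_CLUSTER - 1 - cluster_offset;
	/* Never read past the end of the map entry. */
	ahead = ulmin_sk(ahead, pages_to_entry_end - 1);

	printf("behind %lu, ahead %lu, cluster of %lu pages\n",
	    behind, ahead, behind + 1 + ahead);
	return (0);
}
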
Modified: trunk/sys/vm/vm_glue.c
===================================================================
--- trunk/sys/vm/vm_glue.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_glue.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -58,7 +58,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vm_glue.c 300673 2016-05-25 10:04:53Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_glue.c 341467 2018-12-04 15:04:48Z emaste $");
 
 #include "opt_vm.h"
 #include "opt_kstack_pages.h"
@@ -102,13 +102,6 @@
 
 #include <machine/cpu.h>
 
-#ifndef NO_SWAPPING
-static int swapout(struct proc *);
-static void swapclear(struct proc *);
-static void vm_thread_swapin(struct thread *td);
-static void vm_thread_swapout(struct thread *td);
-#endif
-
 /*
  * MPSAFE
  *
@@ -119,9 +112,7 @@
  * space.
  */
 int
-kernacc(addr, len, rw)
-	void *addr;
-	int len, rw;
+kernacc(void *addr, int len, int rw)
 {
 	boolean_t rv;
 	vm_offset_t saddr, eaddr;
@@ -130,7 +121,7 @@
 	KASSERT((rw & ~VM_PROT_ALL) == 0,
 	    ("illegal ``rw'' argument to kernacc (%x)\n", rw));
 
-	if ((vm_offset_t)addr + len > kernel_map->max_offset ||
+	if ((vm_offset_t)addr + len > vm_map_max(kernel_map) ||
 	    (vm_offset_t)addr + len < (vm_offset_t)addr)
 		return (FALSE);
 
@@ -150,12 +141,10 @@
  * the associated vm_map_entry range.  It does not determine whether the
  * contents of the memory is actually readable or writable.  vmapbuf(),
  * vm_fault_quick(), or copyin()/copout()/su*()/fu*() functions should be
- * used in conjuction with this call.
+ * used in conjunction with this call.
  */
 int
-useracc(addr, len, rw)
-	void *addr;
-	int len, rw;
+useracc(void *addr, int len, int rw)
 {
 	boolean_t rv;
 	vm_prot_t prot;
@@ -201,16 +190,21 @@
 	 * Also, the sysctl code, which is the only present user
 	 * of vslock(), does a hard loop on EAGAIN.
 	 */
-	if (npages + cnt.v_wire_count > vm_page_max_wired)
+	if (npages + vm_cnt.v_wire_count > vm_page_max_wired)
 		return (EAGAIN);
 #endif
 	error = vm_map_wire(&curproc->p_vmspace->vm_map, start, end,
 	    VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
+	if (error == KERN_SUCCESS) {
+		curthread->td_vslock_sz += len;
+		return (0);
+	}
+
 	/*
 	 * Return EFAULT on error to match copy{in,out}() behaviour
 	 * rather than returning ENOMEM like mlock() would.
 	 */
-	return (error == KERN_SUCCESS ? 0 : EFAULT);
+	return (EFAULT);
 }
 
 void
@@ -218,6 +212,8 @@
 {
 
 	/* Rely on the parameter sanity checks performed by vslock(). */
+	MPASS(curthread->td_vslock_sz >= len);
+	curthread->td_vslock_sz -= len;
 	(void)vm_map_unwire(&curproc->p_vmspace->vm_map,
 	    trunc_page((vm_offset_t)addr), round_page((vm_offset_t)addr + len),
 	    VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
@@ -231,19 +227,16 @@
 static vm_page_t
 vm_imgact_hold_page(vm_object_t object, vm_ooffset_t offset)
 {
-	vm_page_t m, ma[1];
+	vm_page_t m;
 	vm_pindex_t pindex;
 	int rv;
 
 	VM_OBJECT_WLOCK(object);
 	pindex = OFF_TO_IDX(offset);
-	m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL);
+	m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY);
 	if (m->valid != VM_PAGE_BITS_ALL) {
-		ma[0] = m;
-		rv = vm_pager_get_pages(object, ma, 1, 0);
-		m = vm_page_lookup(object, pindex);
-		if (m == NULL)
-			goto out;
+		vm_page_xbusy(m);
+		rv = vm_pager_get_pages(object, &m, 1, NULL, NULL);
 		if (rv != VM_PAGER_OK) {
 			vm_page_lock(m);
 			vm_page_free(m);
@@ -251,8 +244,8 @@
 			m = NULL;
 			goto out;
 		}
+		vm_page_xunbusy(m);
 	}
-	vm_page_xunbusy(m);
 	vm_page_lock(m);
 	vm_page_hold(m);
 	vm_page_activate(m);
@@ -312,10 +305,6 @@
 SYSCTL_INT(_vm, OID_AUTO, kstacks, CTLFLAG_RD, &kstacks, 0,
     "");
 
-#ifndef KSTACK_MAX_PAGES
-#define KSTACK_MAX_PAGES 32
-#endif
-
 /*
  * Create the kernel stack (including pcb for i386) for a new thread.
  * This routine directly affects the fork perf for a process and
@@ -326,17 +315,17 @@
 {
 	vm_object_t ksobj;
 	vm_offset_t ks;
-	vm_page_t m, ma[KSTACK_MAX_PAGES];
+	vm_page_t ma[KSTACK_MAX_PAGES];
 	struct kstack_cache_entry *ks_ce;
 	int i;
 
 	/* Bounds check */
 	if (pages <= 1)
-		pages = KSTACK_PAGES;
+		pages = kstack_pages;
 	else if (pages > KSTACK_MAX_PAGES)
 		pages = KSTACK_MAX_PAGES;
 
-	if (pages == KSTACK_PAGES) {
+	if (pages == kstack_pages) {
 		mtx_lock(&kstack_cache_mtx);
 		if (kstack_cache != NULL) {
 			ks_ce = kstack_cache;
@@ -345,7 +334,7 @@
 
 			td->td_kstack_obj = ks_ce->ksobj;
 			td->td_kstack = (vm_offset_t)ks_ce;
-			td->td_kstack_pages = KSTACK_PAGES;
+			td->td_kstack_pages = kstack_pages;
 			return (1);
 		}
 		mtx_unlock(&kstack_cache_mtx);
@@ -395,15 +384,10 @@
 	 * page of stack.
 	 */
 	VM_OBJECT_WLOCK(ksobj);
-	for (i = 0; i < pages; i++) {
-		/*
-		 * Get a kernel stack page.
-		 */
-		m = vm_page_grab(ksobj, i, VM_ALLOC_NOBUSY |
-		    VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
-		ma[i] = m;
-		m->valid = VM_PAGE_BITS_ALL;
-	}
+	(void)vm_page_grab_pages(ksobj, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY |
+	    VM_ALLOC_WIRED, ma, pages);
+	for (i = 0; i < pages; i++)
+		ma[i]->valid = VM_PAGE_BITS_ALL;
 	VM_OBJECT_WUNLOCK(ksobj);
 	pmap_qenter(ks, ma, pages);
 	return (1);
@@ -423,7 +407,7 @@
 		if (m == NULL)
 			panic("vm_thread_dispose: kstack already missing?");
 		vm_page_lock(m);
-		vm_page_unwire(m, 0);
+		vm_page_unwire(m, PQ_NONE);
 		vm_page_free(m);
 		vm_page_unlock(m);
 	}
@@ -449,7 +433,7 @@
 	ks = td->td_kstack;
 	td->td_kstack = 0;
 	td->td_kstack_pages = 0;
-	if (pages == KSTACK_PAGES && kstacks <= kstack_cache_size) {
+	if (pages == kstack_pages && kstacks <= kstack_cache_size) {
 		ks_ce = (struct kstack_cache_entry *)ks;
 		ks_ce->ksobj = ksobj;
 		mtx_lock(&kstack_cache_mtx);
@@ -476,7 +460,7 @@
 		ks_ce = ks_ce->next_ks_entry;
 
 		vm_thread_stack_dispose(ks_ce1->ksobj, (vm_offset_t)ks_ce1,
-		    KSTACK_PAGES);
+		    kstack_pages);
 	}
 }
 
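Note (not part of the diff): the kstack cache touched above stores its free-list entry inside the cached stack itself, so caching and reusing a stack needs no extra allocation.  A userland sketch of that intrusive free-list pattern, with malloc() standing in for the real stack allocation and a simplified cache entry:

/*
 * Userland sketch only: the list node is written into the freed buffer,
 * just as struct kstack_cache_entry is overlaid on the cached stack.
 */
#include <stdio.h>
#include <stdlib.h>

struct cache_entry {
	struct cache_entry	*next;
};

static struct cache_entry *cache_head;

static void *
stack_alloc(size_t size)
{
	struct cache_entry *ce;

	if ((ce = cache_head) != NULL) {
		cache_head = ce->next;
		return (ce);		/* reuse a cached stack */
	}
	return (malloc(size));
}

static void
stack_free(void *ks)
{
	struct cache_entry *ce = ks;

	/* The node lives inside the stack being cached. */
	ce->next = cache_head;
	cache_head = ce;
}

int
main(void)
{
	void *a = stack_alloc(16384);

	stack_free(a);
	void *b = stack_alloc(16384);	/* comes back from the cache */
	printf("reused: %s\n", a == b ? "yes" : "no");
	free(b);
	return (0);
}
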
@@ -536,78 +520,7 @@
 }
 #endif /* KSTACK_USAGE_PROF */
 
-#ifndef NO_SWAPPING
 /*
- * Allow a thread's kernel stack to be paged out.
- */
-static void
-vm_thread_swapout(struct thread *td)
-{
-	vm_object_t ksobj;
-	vm_page_t m;
-	int i, pages;
-
-	cpu_thread_swapout(td);
-	pages = td->td_kstack_pages;
-	ksobj = td->td_kstack_obj;
-	pmap_qremove(td->td_kstack, pages);
-	VM_OBJECT_WLOCK(ksobj);
-	for (i = 0; i < pages; i++) {
-		m = vm_page_lookup(ksobj, i);
-		if (m == NULL)
-			panic("vm_thread_swapout: kstack already missing?");
-		vm_page_dirty(m);
-		vm_page_lock(m);
-		vm_page_unwire(m, 0);
-		vm_page_unlock(m);
-	}
-	VM_OBJECT_WUNLOCK(ksobj);
-}
-
-/*
- * Bring the kernel stack for a specified thread back in.
- */
-static void
-vm_thread_swapin(struct thread *td)
-{
-	vm_object_t ksobj;
-	vm_page_t ma[KSTACK_MAX_PAGES];
-	int i, j, k, pages, rv;
-
-	pages = td->td_kstack_pages;
-	ksobj = td->td_kstack_obj;
-	VM_OBJECT_WLOCK(ksobj);
-	for (i = 0; i < pages; i++)
-		ma[i] = vm_page_grab(ksobj, i, VM_ALLOC_NORMAL |
-		    VM_ALLOC_WIRED);
-	for (i = 0; i < pages; i++) {
-		if (ma[i]->valid != VM_PAGE_BITS_ALL) {
-			vm_page_assert_xbusied(ma[i]);
-			vm_object_pip_add(ksobj, 1);
-			for (j = i + 1; j < pages; j++) {
-				if (ma[j]->valid != VM_PAGE_BITS_ALL)
-					vm_page_assert_xbusied(ma[j]);
-				if (ma[j]->valid == VM_PAGE_BITS_ALL)
-					break;
-			}
-			rv = vm_pager_get_pages(ksobj, ma + i, j - i, 0);
-			if (rv != VM_PAGER_OK)
-	panic("vm_thread_swapin: cannot get kstack for proc: %d",
-				    td->td_proc->p_pid);
-			vm_object_pip_wakeup(ksobj);
-			for (k = i; k < j; k++)
-				ma[k] = vm_page_lookup(ksobj, k);
-			vm_page_xunbusy(ma[i]);
-		} else if (vm_page_xbusied(ma[i]))
-			vm_page_xunbusy(ma[i]);
-	}
-	VM_OBJECT_WUNLOCK(ksobj);
-	pmap_qenter(td->td_kstack, ma, pages);
-	cpu_thread_swapin(td);
-}
-#endif /* !NO_SWAPPING */
-
-/*
  * Implement fork's actions on an address space.
  * Here we arrange for the address space to be copied or referenced,
  * allocate a user struct (pcb and kernel stack), then call the
@@ -616,12 +529,8 @@
  * to user mode to avoid stack copying and relocation problems.
  */
 int
-vm_forkproc(td, p2, td2, vm2, flags)
-	struct thread *td;
-	struct proc *p2;
-	struct thread *td2;
-	struct vmspace *vm2;
-	int flags;
+vm_forkproc(struct thread *td, struct proc *p2, struct thread *td2,
+    struct vmspace *vm2, int flags)
 {
 	struct proc *p1 = td->td_proc;
 	int error;
@@ -667,7 +576,7 @@
 }
 
 /*
- * Called after process has been wait(2)'ed apon and is being reaped.
+ * Called after process has been wait(2)'ed upon and is being reaped.
  * The idea is to reclaim resources that we could not reclaim while
  * the process was still executing.
  */
@@ -680,414 +589,8 @@
 }
 
 void
-faultin(p)
-	struct proc *p;
-{
-#ifdef NO_SWAPPING
-
-	PROC_LOCK_ASSERT(p, MA_OWNED);
-	if ((p->p_flag & P_INMEM) == 0)
-		panic("faultin: proc swapped out with NO_SWAPPING!");
-#else /* !NO_SWAPPING */
-	struct thread *td;
-
-	PROC_LOCK_ASSERT(p, MA_OWNED);
-	/*
-	 * If another process is swapping in this process,
-	 * just wait until it finishes.
-	 */
-	if (p->p_flag & P_SWAPPINGIN) {
-		while (p->p_flag & P_SWAPPINGIN)
-			msleep(&p->p_flag, &p->p_mtx, PVM, "faultin", 0);
-		return;
-	}
-	if ((p->p_flag & P_INMEM) == 0) {
-		/*
-		 * Don't let another thread swap process p out while we are
-		 * busy swapping it in.
-		 */
-		++p->p_lock;
-		p->p_flag |= P_SWAPPINGIN;
-		PROC_UNLOCK(p);
-
-		/*
-		 * We hold no lock here because the list of threads
-		 * can not change while all threads in the process are
-		 * swapped out.
-		 */
-		FOREACH_THREAD_IN_PROC(p, td)
-			vm_thread_swapin(td);
-		PROC_LOCK(p);
-		swapclear(p);
-		p->p_swtick = ticks;
-
-		wakeup(&p->p_flag);
-
-		/* Allow other threads to swap p out now. */
-		--p->p_lock;
-	}
-#endif /* NO_SWAPPING */
-}
-
-/*
- * This swapin algorithm attempts to swap-in processes only if there
- * is enough space for them.  Of course, if a process waits for a long
- * time, it will be swapped in anyway.
- */
-void
-swapper(void)
-{
-	struct proc *p;
-	struct thread *td;
-	struct proc *pp;
-	int slptime;
-	int swtime;
-	int ppri;
-	int pri;
-
-loop:
-	if (vm_page_count_min()) {
-		VM_WAIT;
-		goto loop;
-	}
-
-	pp = NULL;
-	ppri = INT_MIN;
-	sx_slock(&allproc_lock);
-	FOREACH_PROC_IN_SYSTEM(p) {
-		PROC_LOCK(p);
-		if (p->p_state == PRS_NEW ||
-		    p->p_flag & (P_SWAPPINGOUT | P_SWAPPINGIN | P_INMEM)) {
-			PROC_UNLOCK(p);
-			continue;
-		}
-		swtime = (ticks - p->p_swtick) / hz;
-		FOREACH_THREAD_IN_PROC(p, td) {
-			/*
-			 * An otherwise runnable thread of a process
-			 * swapped out has only the TDI_SWAPPED bit set.
-			 * 
-			 */
-			thread_lock(td);
-			if (td->td_inhibitors == TDI_SWAPPED) {
-				slptime = (ticks - td->td_slptick) / hz;
-				pri = swtime + slptime;
-				if ((td->td_flags & TDF_SWAPINREQ) == 0)
-					pri -= p->p_nice * 8;
-				/*
-				 * if this thread is higher priority
-				 * and there is enough space, then select
-				 * this process instead of the previous
-				 * selection.
-				 */
-				if (pri > ppri) {
-					pp = p;
-					ppri = pri;
-				}
-			}
-			thread_unlock(td);
-		}
-		PROC_UNLOCK(p);
-	}
-	sx_sunlock(&allproc_lock);
-
-	/*
-	 * Nothing to do, back to sleep.
-	 */
-	if ((p = pp) == NULL) {
-		tsleep(&proc0, PVM, "swapin", MAXSLP * hz / 2);
-		goto loop;
-	}
-	PROC_LOCK(p);
-
-	/*
-	 * Another process may be bringing or may have already
-	 * brought this process in while we traverse all threads.
-	 * Or, this process may even be being swapped out again.
-	 */
-	if (p->p_flag & (P_INMEM | P_SWAPPINGOUT | P_SWAPPINGIN)) {
-		PROC_UNLOCK(p);
-		goto loop;
-	}
-
-	/*
-	 * We would like to bring someone in. (only if there is space).
-	 * [What checks the space? ]
-	 */
-	faultin(p);
-	PROC_UNLOCK(p);
-	goto loop;
-}
-
-void
 kick_proc0(void)
 {
 
 	wakeup(&proc0);
 }
-
-#ifndef NO_SWAPPING
-
-/*
- * Swap_idle_threshold1 is the guaranteed swapped in time for a process
- */
-static int swap_idle_threshold1 = 2;
-SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold1, CTLFLAG_RW,
-    &swap_idle_threshold1, 0, "Guaranteed swapped in time for a process");
-
-/*
- * Swap_idle_threshold2 is the time that a process can be idle before
- * it will be swapped out, if idle swapping is enabled.
- */
-static int swap_idle_threshold2 = 10;
-SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold2, CTLFLAG_RW,
-    &swap_idle_threshold2, 0, "Time before a process will be swapped out");
-
-/*
- * First, if any processes have been sleeping or stopped for at least
- * "swap_idle_threshold1" seconds, they are swapped out.  If, however,
- * no such processes exist, then the longest-sleeping or stopped
- * process is swapped out.  Finally, and only as a last resort, if
- * there are no sleeping or stopped processes, the longest-resident
- * process is swapped out.
- */
-void
-swapout_procs(action)
-int action;
-{
-	struct proc *p;
-	struct thread *td;
-	int didswap = 0;
-
-retry:
-	sx_slock(&allproc_lock);
-	FOREACH_PROC_IN_SYSTEM(p) {
-		struct vmspace *vm;
-		int minslptime = 100000;
-		int slptime;
-		
-		/*
-		 * Watch out for a process in
-		 * creation.  It may have no
-		 * address space or lock yet.
-		 */
-		if (p->p_state == PRS_NEW)
-			continue;
-		/*
-		 * An aio daemon switches its
-		 * address space while running.
-		 * Perform a quick check whether
-		 * a process has P_SYSTEM.
-		 */
-		if ((p->p_flag & P_SYSTEM) != 0)
-			continue;
-		/*
-		 * Do not swapout a process that
-		 * is waiting for VM data
-		 * structures as there is a possible
-		 * deadlock.  Test this first as
-		 * this may block.
-		 *
-		 * Lock the map until swapout
-		 * finishes, or a thread of this
-		 * process may attempt to alter
-		 * the map.
-		 */
-		vm = vmspace_acquire_ref(p);
-		if (vm == NULL)
-			continue;
-		if (!vm_map_trylock(&vm->vm_map))
-			goto nextproc1;
-
-		PROC_LOCK(p);
-		if (p->p_lock != 0 ||
-		    (p->p_flag & (P_STOPPED_SINGLE|P_TRACED|P_SYSTEM|P_WEXIT)
-		    ) != 0) {
-			goto nextproc;
-		}
-		/*
-		 * only aiod changes vmspace, however it will be
-		 * skipped because of the if statement above checking 
-		 * for P_SYSTEM
-		 */
-		if ((p->p_flag & (P_INMEM|P_SWAPPINGOUT|P_SWAPPINGIN)) != P_INMEM)
-			goto nextproc;
-
-		switch (p->p_state) {
-		default:
-			/* Don't swap out processes in any sort
-			 * of 'special' state. */
-			break;
-
-		case PRS_NORMAL:
-			/*
-			 * do not swapout a realtime process
-			 * Check all the thread groups..
-			 */
-			FOREACH_THREAD_IN_PROC(p, td) {
-				thread_lock(td);
-				if (PRI_IS_REALTIME(td->td_pri_class)) {
-					thread_unlock(td);
-					goto nextproc;
-				}
-				slptime = (ticks - td->td_slptick) / hz;
-				/*
-				 * Guarantee swap_idle_threshold1
-				 * time in memory.
-				 */
-				if (slptime < swap_idle_threshold1) {
-					thread_unlock(td);
-					goto nextproc;
-				}
-
-				/*
-				 * Do not swapout a process if it is
-				 * waiting on a critical event of some
-				 * kind or there is a thread whose
-				 * pageable memory may be accessed.
-				 *
-				 * This could be refined to support
-				 * swapping out a thread.
-				 */
-				if (!thread_safetoswapout(td)) {
-					thread_unlock(td);
-					goto nextproc;
-				}
-				/*
-				 * If the system is under memory stress,
-				 * or if we are swapping
-				 * idle processes >= swap_idle_threshold2,
-				 * then swap the process out.
-				 */
-				if (((action & VM_SWAP_NORMAL) == 0) &&
-				    (((action & VM_SWAP_IDLE) == 0) ||
-				    (slptime < swap_idle_threshold2))) {
-					thread_unlock(td);
-					goto nextproc;
-				}
-
-				if (minslptime > slptime)
-					minslptime = slptime;
-				thread_unlock(td);
-			}
-
-			/*
-			 * If the pageout daemon didn't free enough pages,
-			 * or if this process is idle and the system is
-			 * configured to swap proactively, swap it out.
-			 */
-			if ((action & VM_SWAP_NORMAL) ||
-				((action & VM_SWAP_IDLE) &&
-				 (minslptime > swap_idle_threshold2))) {
-				if (swapout(p) == 0)
-					didswap++;
-				PROC_UNLOCK(p);
-				vm_map_unlock(&vm->vm_map);
-				vmspace_free(vm);
-				sx_sunlock(&allproc_lock);
-				goto retry;
-			}
-		}
-nextproc:
-		PROC_UNLOCK(p);
-		vm_map_unlock(&vm->vm_map);
-nextproc1:
-		vmspace_free(vm);
-		continue;
-	}
-	sx_sunlock(&allproc_lock);
-	/*
-	 * If we swapped something out, and another process needed memory,
-	 * then wakeup the sched process.
-	 */
-	if (didswap)
-		wakeup(&proc0);
-}
-
-static void
-swapclear(p)
-	struct proc *p;
-{
-	struct thread *td;
-
-	PROC_LOCK_ASSERT(p, MA_OWNED);
-
-	FOREACH_THREAD_IN_PROC(p, td) {
-		thread_lock(td);
-		td->td_flags |= TDF_INMEM;
-		td->td_flags &= ~TDF_SWAPINREQ;
-		TD_CLR_SWAPPED(td);
-		if (TD_CAN_RUN(td))
-			if (setrunnable(td)) {
-#ifdef INVARIANTS
-				/*
-				 * XXX: We just cleared TDI_SWAPPED
-				 * above and set TDF_INMEM, so this
-				 * should never happen.
-				 */
-				panic("not waking up swapper");
-#endif
-			}
-		thread_unlock(td);
-	}
-	p->p_flag &= ~(P_SWAPPINGIN|P_SWAPPINGOUT);
-	p->p_flag |= P_INMEM;
-}
-
-static int
-swapout(p)
-	struct proc *p;
-{
-	struct thread *td;
-
-	PROC_LOCK_ASSERT(p, MA_OWNED);
-#if defined(SWAP_DEBUG)
-	printf("swapping out %d\n", p->p_pid);
-#endif
-
-	/*
-	 * The states of this process and its threads may have changed
-	 * by now.  Assuming that there is only one pageout daemon thread,
-	 * this process should still be in memory.
-	 */
-	KASSERT((p->p_flag & (P_INMEM|P_SWAPPINGOUT|P_SWAPPINGIN)) == P_INMEM,
-		("swapout: lost a swapout race?"));
-
-	/*
-	 * remember the process resident count
-	 */
-	p->p_vmspace->vm_swrss = vmspace_resident_count(p->p_vmspace);
-	/*
-	 * Check and mark all threads before we proceed.
-	 */
-	p->p_flag &= ~P_INMEM;
-	p->p_flag |= P_SWAPPINGOUT;
-	FOREACH_THREAD_IN_PROC(p, td) {
-		thread_lock(td);
-		if (!thread_safetoswapout(td)) {
-			thread_unlock(td);
-			swapclear(p);
-			return (EBUSY);
-		}
-		td->td_flags &= ~TDF_INMEM;
-		TD_SET_SWAPPED(td);
-		thread_unlock(td);
-	}
-	td = FIRST_THREAD_IN_PROC(p);
-	++td->td_ru.ru_nswap;
-	PROC_UNLOCK(p);
-
-	/*
-	 * This list is stable because all threads are now prevented from
-	 * running.  The list is only modified in the context of a running
-	 * thread in this process.
-	 */
-	FOREACH_THREAD_IN_PROC(p, td)
-		vm_thread_swapout(td);
-
-	PROC_LOCK(p);
-	p->p_flag &= ~P_SWAPPINGOUT;
-	p->p_swtick = ticks;
-	return (0);
-}
-#endif /* !NO_SWAPPING */

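Note (not part of the diff): vslock()/vsunlock() now keep a per-thread count of bytes wired through them, adding to it only when the wiring succeeds and asserting before subtracting.  A userland sketch of that bookkeeping pattern; the structure and helper names are illustrative, not kernel interfaces:

/*
 * Userland sketch only: fake_thread stands in for struct thread and the
 * wire_ok flag stands in for the result of vm_map_wire().
 */
#include <assert.h>
#include <stddef.h>
#include <stdio.h>

struct fake_thread {
	size_t	vslock_sz;	/* bytes currently wired via the lock path */
};

static int
sketch_vslock(struct fake_thread *td, size_t len, int wire_ok)
{
	if (!wire_ok)
		return (-1);	/* would map to EFAULT in the real code */
	td->vslock_sz += len;	/* account only on success */
	return (0);
}

static void
sketch_vsunlock(struct fake_thread *td, size_t len)
{
	assert(td->vslock_sz >= len);
	td->vslock_sz -= len;
}

int
main(void)
{
	struct fake_thread td = { 0 };

	if (sketch_vslock(&td, 4096, 1) == 0) {
		printf("wired: %zu bytes\n", td.vslock_sz);
		sketch_vsunlock(&td, 4096);
	}
	printf("wired after unlock: %zu bytes\n", td.vslock_sz);
	return (0);
}
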
Modified: trunk/sys/vm/vm_init.c
===================================================================
--- trunk/sys/vm/vm_init.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_init.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -64,7 +64,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vm_init.c 255426 2013-09-09 18:11:59Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_init.c 338484 2018-09-05 21:28:33Z kib $");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
@@ -75,6 +75,7 @@
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/selinfo.h>
+#include <sys/smp.h>
 #include <sys/pipe.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
@@ -91,11 +92,6 @@
 
 long physmem;
 
-static int exec_map_entries = 16;
-TUNABLE_INT("vm.exec_map_entries", &exec_map_entries);
-SYSCTL_INT(_vm, OID_AUTO, exec_map_entries, CTLFLAG_RD, &exec_map_entries, 0,
-    "Maximum number of simultaneous execs");
-
 /*
  * System initialization
  */
@@ -197,8 +193,8 @@
 	 * Discount the physical memory larger than the size of kernel_map
 	 * to avoid eating up all of KVA space.
 	 */
-	physmem_est = lmin(physmem, btoc(kernel_map->max_offset -
-	    kernel_map->min_offset));
+	physmem_est = lmin(physmem, btoc(vm_map_max(kernel_map) -
+	    vm_map_min(kernel_map)));
 
 	v = kern_vfs_bio_buffer_alloc(v, physmem_est);
 
@@ -231,12 +227,15 @@
 
 	/*
 	 * Allocate the buffer arena.
+	 *
+	 * Enable the quantum cache if we have more than 4 cpus.  This
+	 * avoids lock contention at the expense of some fragmentation.
 	 */
 	size = (long)nbuf * BKVASIZE;
 	kmi->buffer_sva = firstaddr;
 	kmi->buffer_eva = kmi->buffer_sva + size;
 	vmem_init(buffer_arena, "buffer arena", kmi->buffer_sva, size,
-	    PAGE_SIZE, 0, 0);
+	    PAGE_SIZE, (mp_ncpus > 4) ? BKVASIZE * 8 : 0, 0);
 	firstaddr += size;
 
 	/*
@@ -259,10 +258,19 @@
 		panic("Clean map calculation incorrect");
 
 	/*
- 	 * Allocate the pageable submaps.
+	 * Allocate the pageable submaps.  We may cache an exec map entry per
+	 * CPU, so we therefore need to reserve space for at least ncpu+1
+	 * entries to avoid deadlock.  The exec map is also used by some image
+	 * activators, so we leave a fixed number of pages for their use.
 	 */
+#ifdef __LP64__
+	exec_map_entries = 8 * mp_ncpus;
+#else
+	exec_map_entries = 2 * mp_ncpus + 4;
+#endif
+	exec_map_entry_size = round_page(PATH_MAX + ARG_MAX);
 	exec_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr,
-	    exec_map_entries * round_page(PATH_MAX + ARG_MAX), FALSE);
+	    exec_map_entries * exec_map_entry_size + 64 * PAGE_SIZE, FALSE);
 	pipe_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr, maxpipekva,
 	    FALSE);
 }

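Note (not part of the diff): exec_map sizing above is now derived from the CPU count rather than a fixed tunable, with one entry of round_page(PATH_MAX + ARG_MAX) per cached slot plus a fixed page reserve for image activators.  A userland sketch of the arithmetic; the CPU count, PATH_MAX/ARG_MAX and page size are example values only:

/*
 * Userland sketch only: EX_PATH_MAX and EX_ARG_MAX are illustrative
 * stand-ins, and round_page_ul() mimics round_page().
 */
#include <stdio.h>

#define	PAGE_SIZE	4096UL
#define	EX_PATH_MAX	1024UL
#define	EX_ARG_MAX	(256UL * 1024)

static unsigned long
round_page_ul(unsigned long x)
{
	return ((x + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1));
}

int
main(void)
{
	unsigned long ncpus = 8, entries, entry_size, map_size;
	int lp64 = sizeof(long) == 8;

	entries = lp64 ? 8 * ncpus : 2 * ncpus + 4;
	entry_size = round_page_ul(EX_PATH_MAX + EX_ARG_MAX);
	/* A fixed number of extra pages is left for image activators. */
	map_size = entries * entry_size + 64 * PAGE_SIZE;

	printf("%lu entries of %lu bytes -> exec_map of %lu MB\n",
	    entries, entry_size, map_size >> 20);
	return (0);
}
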
Modified: trunk/sys/vm/vm_kern.c
===================================================================
--- trunk/sys/vm/vm_kern.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_kern.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -64,7 +64,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vm_kern.c 324782 2017-10-20 00:38:01Z emaste $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_kern.c 340660 2018-11-20 01:12:21Z markj $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -85,6 +85,8 @@
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
+#include <vm/vm_phys.h>
+#include <vm/vm_radix.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
 
@@ -98,6 +100,9 @@
 /* NB: Used by kernel debuggers. */
 const u_long vm_maxuser_address = VM_MAXUSER_ADDRESS;
 
+u_int exec_map_entry_size;
+u_int exec_map_entries;
+
 SYSCTL_ULONG(_vm, OID_AUTO, min_kernel_address, CTLFLAG_RD,
     SYSCTL_NULL_ULONG_PTR, VM_MIN_KERNEL_ADDRESS, "Min kernel address");
 
@@ -160,8 +165,7 @@
     vm_paddr_t high, vm_memattr_t memattr)
 {
 	vm_object_t object = vmem == kmem_arena ? kmem_object : kernel_object;
-	vm_offset_t addr, i;
-	vm_ooffset_t offset;
+	vm_offset_t addr, i, offset;
 	vm_page_t m;
 	int pflags, tries;
 
@@ -170,16 +174,21 @@
 		return (0);
 	offset = addr - VM_MIN_KERNEL_ADDRESS;
 	pflags = malloc2vm_flags(flags) | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED;
+	pflags &= ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL);
+	pflags |= VM_ALLOC_NOWAIT;
 	VM_OBJECT_WLOCK(object);
 	for (i = 0; i < size; i += PAGE_SIZE) {
 		tries = 0;
 retry:
-		m = vm_page_alloc_contig(object, OFF_TO_IDX(offset + i),
+		m = vm_page_alloc_contig(object, atop(offset + i),
 		    pflags, 1, low, high, PAGE_SIZE, 0, memattr);
 		if (m == NULL) {
 			VM_OBJECT_WUNLOCK(object);
 			if (tries < ((flags & M_NOWAIT) != 0 ? 1 : 3)) {
-				vm_pageout_grow_cache(tries, low, high);
+				if (!vm_page_reclaim_contig(pflags, 1,
+				    low, high, PAGE_SIZE, 0) &&
+				    (flags & M_WAITOK) != 0)
+					VM_WAIT;
 				VM_OBJECT_WLOCK(object);
 				tries++;
 				goto retry;
@@ -212,9 +221,9 @@
     vm_memattr_t memattr)
 {
 	vm_object_t object = vmem == kmem_arena ? kmem_object : kernel_object;
-	vm_offset_t addr, tmp;
-	vm_ooffset_t offset;
+	vm_offset_t addr, offset, tmp;
 	vm_page_t end_m, m;
+	u_long npages;
 	int pflags, tries;
  
 	size = round_page(size);
@@ -222,15 +231,20 @@
 		return (0);
 	offset = addr - VM_MIN_KERNEL_ADDRESS;
 	pflags = malloc2vm_flags(flags) | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED;
+	pflags &= ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL);
+	pflags |= VM_ALLOC_NOWAIT;
+	npages = atop(size);
 	VM_OBJECT_WLOCK(object);
 	tries = 0;
 retry:
-	m = vm_page_alloc_contig(object, OFF_TO_IDX(offset), pflags,
-	    atop(size), low, high, alignment, boundary, memattr);
+	m = vm_page_alloc_contig(object, atop(offset), pflags,
+	    npages, low, high, alignment, boundary, memattr);
 	if (m == NULL) {
 		VM_OBJECT_WUNLOCK(object);
 		if (tries < ((flags & M_NOWAIT) != 0 ? 1 : 3)) {
-			vm_pageout_grow_cache(tries, low, high);
+			if (!vm_page_reclaim_contig(pflags, npages, low, high,
+			    alignment, boundary) && (flags & M_WAITOK) != 0)
+				VM_WAIT;
 			VM_OBJECT_WLOCK(object);
 			tries++;
 			goto retry;
@@ -238,7 +252,7 @@
 		vmem_free(vmem, addr, size);
 		return (0);
 	}
-	end_m = m + atop(size);
+	end_m = m + npages;
 	tmp = addr;
 	for (; m < end_m; m++) {
 		if ((flags & M_ZERO) && (m->flags & PG_ZERO) == 0)
@@ -322,7 +336,7 @@
 kmem_back(vm_object_t object, vm_offset_t addr, vm_size_t size, int flags)
 {
 	vm_offset_t offset, i;
-	vm_page_t m;
+	vm_page_t m, mpred;
 	int pflags;
 
 	KASSERT(object == kmem_object || object == kernel_object,
@@ -330,11 +344,17 @@
 
 	offset = addr - VM_MIN_KERNEL_ADDRESS;
 	pflags = malloc2vm_flags(flags) | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED;
+	pflags &= ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL);
+	if (flags & M_WAITOK)
+		pflags |= VM_ALLOC_WAITFAIL;
 
+	i = 0;
 	VM_OBJECT_WLOCK(object);
-	for (i = 0; i < size; i += PAGE_SIZE) {
 retry:
-		m = vm_page_alloc(object, OFF_TO_IDX(offset + i), pflags);
+	mpred = vm_radix_lookup_le(&object->rtree, atop(offset + i));
+	for (; i < size; i += PAGE_SIZE, mpred = m) {
+		m = vm_page_alloc_after(object, atop(offset + i), pflags,
+		    mpred);
 
 		/*
 		 * Ran out of space, free everything up and return. Don't need
@@ -342,12 +362,9 @@
 		 * aren't on any queues.
 		 */
 		if (m == NULL) {
+			if ((flags & M_NOWAIT) == 0)
+				goto retry;
 			VM_OBJECT_WUNLOCK(object);
-			if ((flags & M_NOWAIT) == 0) {
-				VM_WAIT;
-				VM_OBJECT_WLOCK(object);
-				goto retry;
-			}
 			kmem_unback(object, addr, i);
 			return (KERN_NO_SPACE);
 		}
@@ -376,8 +393,8 @@
 void
 kmem_unback(vm_object_t object, vm_offset_t addr, vm_size_t size)
 {
-	vm_page_t m;
-	vm_offset_t i, offset;
+	vm_page_t m, next;
+	vm_offset_t end, offset;
 
 	KASSERT(object == kmem_object || object == kernel_object,
 	    ("kmem_unback: only supports kernel objects."));
@@ -384,10 +401,12 @@
 
 	pmap_remove(kernel_pmap, addr, addr + size);
 	offset = addr - VM_MIN_KERNEL_ADDRESS;
+	end = offset + size;
 	VM_OBJECT_WLOCK(object);
-	for (i = 0; i < size; i += PAGE_SIZE) {
-		m = vm_page_lookup(object, OFF_TO_IDX(offset + i));
-		vm_page_unwire(m, 0);
+	for (m = vm_page_lookup(object, atop(offset)); offset < end;
+	    offset += PAGE_SIZE, m = next) {
+		next = vm_page_next(m);
+		vm_page_unwire(m, PQ_NONE);
 		vm_page_free(m);
 	}
 	VM_OBJECT_WUNLOCK(object);
@@ -443,8 +462,8 @@
 		map->needs_wakeup = TRUE;
 		vm_map_unlock_and_wait(map, 0);
 	}
-	vm_map_insert(map, NULL, 0, addr, addr + size, VM_PROT_ALL,
-	    VM_PROT_ALL, MAP_ACC_CHARGED);
+	vm_map_insert(map, NULL, 0, addr, addr + size, VM_PROT_RW, VM_PROT_RW,
+	    MAP_ACC_CHARGED);
 	vm_map_unlock(map);
 	return (addr);
 }
@@ -520,6 +539,43 @@
 	vm_map_unlock(m);
 }
 
+/*
+ *	kmem_bootstrap_free:
+ *
+ *	Free pages backing preloaded data (e.g., kernel modules) to the
+ *	system.  Currently only supported on platforms that create a
+ *	vm_phys segment for preloaded data.
+ */
+void
+kmem_bootstrap_free(vm_offset_t start, vm_size_t size)
+{
+#if defined(__i386__) || defined(__amd64__)
+	struct vm_domain *vmd;
+	vm_offset_t end, va;
+	vm_paddr_t pa;
+	vm_page_t m;
+
+	end = trunc_page(start + size);
+	start = round_page(start);
+
+	for (va = start; va < end; va += PAGE_SIZE) {
+		pa = pmap_kextract(va);
+		m = PHYS_TO_VM_PAGE(pa);
+
+		vmd = vm_phys_domain(m);
+		mtx_lock(&vm_page_queue_free_mtx);
+		vm_phys_free_pages(m, 0);
+		vmd->vmd_page_count++;
+		vm_phys_freecnt_adj(m, 1);
+		mtx_unlock(&vm_page_queue_free_mtx);
+
+		vm_cnt.v_page_count++;
+	}
+	pmap_remove(kernel_pmap, start, end);
+	(void)vmem_add(kernel_arena, start, end - start, M_WAITOK);
+#endif
+}
+
 #ifdef DIAGNOSTIC
 /*
  * Allow userspace to directly trigger the VM drain routine for testing

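Note (not part of the diff): the contiguous allocators above now allocate with VM_ALLOC_NOWAIT and, on failure, try vm_page_reclaim_contig() before optionally sleeping, with the number of retries bounded by whether the caller passed M_NOWAIT or M_WAITOK.  A userland sketch of that bounded retry shape; alloc_try(), reclaim_try() and wait_for_pages() are stand-ins, not kernel interfaces:

/*
 * Userland sketch only: the "nowait" caller gets a single retry, a
 * waiting caller gets a few, and waiting only happens when reclamation
 * finds nothing.
 */
#include <stdbool.h>
#include <stdio.h>

static bool
alloc_try(int attempt)
{
	return (attempt >= 2);	/* pretend the third attempt succeeds */
}

static bool
reclaim_try(void)
{
	return (false);		/* pretend reclamation finds nothing */
}

static void
wait_for_pages(void)
{
	printf("  ...sleeping for free pages\n");
}

static bool
alloc_contig_sketch(bool can_wait)
{
	int tries, max_tries;

	max_tries = can_wait ? 3 : 1;
	for (tries = 0; ; tries++) {
		if (alloc_try(tries))
			return (true);
		if (tries >= max_tries)
			return (false);
		/* Reclaim first; sleep only if that fails and we may wait. */
		if (!reclaim_try() && can_wait)
			wait_for_pages();
	}
}

int
main(void)
{
	printf("M_WAITOK-style: %s\n",
	    alloc_contig_sketch(true) ? "succeeded" : "failed");
	printf("M_NOWAIT-style: %s\n",
	    alloc_contig_sketch(false) ? "succeeded" : "failed");
	return (0);
}
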
Modified: trunk/sys/vm/vm_kern.h
===================================================================
--- trunk/sys/vm/vm_kern.h	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_kern.h	2020-02-08 19:35:48 UTC (rev 12314)
@@ -58,11 +58,11 @@
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  *
- * $FreeBSD: stable/10/sys/vm/vm_kern.h 254307 2013-08-13 22:40:43Z jeff $
+ * $FreeBSD: stable/11/sys/vm/vm_kern.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 #ifndef _VM_VM_KERN_H_
-#define _VM_VM_KERN_H_ 1
+#define	_VM_VM_KERN_H_
 
 /* Kernel memory management definitions. */
 extern vm_map_t kernel_map;
@@ -75,5 +75,7 @@
 extern struct vmem *memguard_arena;
 extern vm_offset_t swapbkva;
 extern u_long vm_kmem_size;
+extern u_int exec_map_entries;
+extern u_int exec_map_entry_size;
 
-#endif				/* _VM_VM_KERN_H_ */
+#endif /* _VM_VM_KERN_H_ */

Modified: trunk/sys/vm/vm_map.c
===================================================================
--- trunk/sys/vm/vm_map.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_map.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -64,7 +64,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vm_map.c 326523 2017-12-04 10:05:59Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_map.c 355049 2019-11-24 06:54:17Z dougm $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -136,6 +136,8 @@
 static void vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry);
 static int vm_map_growstack(vm_map_t map, vm_offset_t addr,
     vm_map_entry_t gap_entry);
+static void vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
+    vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags);
 #ifdef INVARIANTS
 static void vm_map_zdtor(void *mem, int size, void *arg);
 static void vmspace_zdtor(void *mem, int size, void *arg);
@@ -277,12 +279,7 @@
 	struct vmspace *vm;
 
 	vm = uma_zalloc(vmspace_zone, M_WAITOK);
-
 	KASSERT(vm->vm_map.pmap == NULL, ("vm_map.pmap must be NULL"));
-
-	if (pinit == NULL)
-		pinit = &pmap_pinit;
-
 	if (!pinit(vmspace_pmap(vm))) {
 		uma_zfree(vmspace_zone, vm);
 		return (NULL);
@@ -333,8 +330,8 @@
 	 * Delete all of the mappings and pages they hold, then call
 	 * the pmap module to reclaim anything left.
 	 */
-	(void)vm_map_remove(&vm->vm_map, vm->vm_map.min_offset,
-	    vm->vm_map.max_offset);
+	(void)vm_map_remove(&vm->vm_map, vm_map_min(&vm->vm_map),
+	    vm_map_max(&vm->vm_map));
 
 	pmap_release(vmspace_pmap(vm));
 	vm->vm_map.pmap = NULL;
@@ -346,7 +343,7 @@
 {
 
 	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
-	    "vmspace_free() called with non-sleepable lock held");
+	    "vmspace_free() called");
 
 	if (vm->vm_refcnt == 0)
 		panic("vmspace_free: attempt to free already freed vmspace");
@@ -452,7 +449,48 @@
 	return (vm);
 }
 
+/*
+ * Switch between vmspaces in an AIO kernel process.
+ *
+ * The new vmspace is either the vmspace of a user process obtained
+ * from an active AIO request or the initial vmspace of the AIO kernel
+ * process (when it is idling).  Because user processes will block to
+ * drain any active AIO requests before proceeding in exit() or
+ * execve(), the reference count for vmspaces from AIO requests can
+ * never be 0.  Similarly, AIO kernel processes hold an extra
+ * reference on their initial vmspace for the life of the process.  As
+ * a result, the 'newvm' vmspace always has a non-zero reference
+ * count.  This permits an additional reference on 'newvm' to be
+ * acquired via a simple atomic increment rather than the loop in
+ * vmspace_acquire_ref() above.
+ */
 void
+vmspace_switch_aio(struct vmspace *newvm)
+{
+	struct vmspace *oldvm;
+
+	/* XXX: Need some way to assert that this is an aio daemon. */
+
+	KASSERT(newvm->vm_refcnt > 0,
+	    ("vmspace_switch_aio: newvm unreferenced"));
+
+	oldvm = curproc->p_vmspace;
+	if (oldvm == newvm)
+		return;
+
+	/*
+	 * Point to the new address space and refer to it.
+	 */
+	curproc->p_vmspace = newvm;
+	atomic_add_int(&newvm->vm_refcnt, 1);
+
+	/* Activate the new mapping. */
+	pmap_activate(curthread);
+
+	vmspace_free(oldvm);
+}
+
+void
 _vm_map_lock(vm_map_t map, const char *file, int line)
 {
 
@@ -748,8 +786,8 @@
 	map->needs_wakeup = FALSE;
 	map->system_map = 0;
 	map->pmap = pmap;
-	map->min_offset = min;
-	map->max_offset = max;
+	map->header.end = min;
+	map->header.start = max;
 	map->flags = 0;
 	map->root = NULL;
 	map->timestamp = 0;
@@ -952,12 +990,10 @@
 	    "vm_map_entry_link: map %p, nentries %d, entry %p, after %p", map,
 	    map->nentries, entry, after_where);
 	VM_MAP_ASSERT_LOCKED(map);
-	KASSERT(after_where == &map->header ||
-	    after_where->end <= entry->start,
+	KASSERT(after_where->end <= entry->start,
 	    ("vm_map_entry_link: prev end %jx new start %jx overlap",
 	    (uintmax_t)after_where->end, (uintmax_t)entry->start));
-	KASSERT(after_where->next == &map->header ||
-	    entry->end <= after_where->next->start,
+	KASSERT(entry->end <= after_where->next->start,
 	    ("vm_map_entry_link: new end %jx next start %jx overlap",
 	    (uintmax_t)entry->end, (uintmax_t)after_where->next->start));
 
@@ -979,8 +1015,7 @@
 		entry->right = map->root;
 		entry->left = NULL;
 	}
-	entry->adj_free = (entry->next == &map->header ? map->max_offset :
-	    entry->next->start) - entry->end;
+	entry->adj_free = entry->next->start - entry->end;
 	vm_map_entry_set_max_free(entry);
 	map->root = entry;
 }
@@ -999,8 +1034,7 @@
 	else {
 		root = vm_map_entry_splay(entry->start, entry->left);
 		root->right = entry->right;
-		root->adj_free = (entry->next == &map->header ? map->max_offset :
-		    entry->next->start) - root->end;
+		root->adj_free = entry->next->start - root->end;
 		vm_map_entry_set_max_free(root);
 	}
 	map->root = root;
@@ -1036,8 +1070,7 @@
 	if (entry != map->root)
 		map->root = vm_map_entry_splay(entry->start, map->root);
 
-	entry->adj_free = (entry->next == &map->header ? map->max_offset :
-	    entry->next->start) - entry->end;
+	entry->adj_free = entry->next->start - entry->end;
 	vm_map_entry_set_max_free(entry);
 }
 
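Note (not part of the diff): vm_map_init() above now sets header.end to the map minimum and header.start to the map maximum, so the header acts as a sentinel and the free-gap computation "next->start - entry->end" needs no special case for the last entry.  A userland sketch of that trick with simplified list types:

/*
 * Userland sketch only: sketch_entry is a stand-in for vm_map_entry,
 * and the addresses are made up.
 */
#include <stdio.h>

struct sketch_entry {
	unsigned long		start, end;
	struct sketch_entry	*next;
};

int
main(void)
{
	struct sketch_entry header, a, b;

	/* Map covers [0x1000, 0x100000); note the inverted header fields. */
	header.end = 0x1000;		/* map minimum */
	header.start = 0x100000;	/* map maximum */

	a.start = 0x2000; a.end = 0x4000;
	b.start = 0x8000; b.end = 0x9000;
	header.next = &a; a.next = &b; b.next = &header;

	/* The same expression works whether or not "next" is the header. */
	for (struct sketch_entry *e = &a; e != &header; e = e->next)
		printf("gap after [%#lx,%#lx): %#lx bytes\n",
		    e->start, e->end, e->next->start - e->end);
	return (0);
}
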
@@ -1152,7 +1185,8 @@
 	/*
 	 * Check that the start and end points are not bogus.
 	 */
-	if (start < map->min_offset || end > map->max_offset || start >= end)
+	if (start < vm_map_min(map) || end > vm_map_max(map) ||
+	    start >= end)
 		return (KERN_INVALID_ADDRESS);
 
 	/*
@@ -1167,7 +1201,7 @@
 	/*
 	 * Assert that the next entry doesn't overlap the end point.
 	 */
-	if (prev_entry->next != &map->header && prev_entry->next->start < end)
+	if (prev_entry->next->start < end)
 		return (KERN_NO_SPACE);
 
 	if ((cow & MAP_CREATE_GUARD) != 0 && (object != NULL ||
@@ -1295,7 +1329,7 @@
 	new_entry->wired_count = 0;
 	new_entry->wiring_thread = NULL;
 	new_entry->read_ahead = VM_FAULT_READ_AHEAD_INIT;
-	new_entry->next_read = OFF_TO_IDX(offset);
+	new_entry->next_read = start;
 
 	KASSERT(cred == NULL || !ENTRY_CHARGED(new_entry),
 	    ("overcommit: vm_map_insert leaks vm_map %p", new_entry));
@@ -1352,9 +1386,8 @@
 	 * Request must fit within min/max VM address and must avoid
 	 * address wrap.
 	 */
-	if (start < map->min_offset)
-		start = map->min_offset;
-	if (start + length > map->max_offset || start + length < start)
+	start = MAX(start, vm_map_min(map));
+	if (start + length > vm_map_max(map) || start + length < start)
 		return (1);
 
 	/* Empty tree means wide open address space. */
@@ -1456,6 +1489,8 @@
 	KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 ||
 	    object == NULL,
 	    ("vm_map_find: non-NULL backing object for stack"));
+	MPASS((cow & MAP_REMAP) == 0 || (find_space == VMFS_NO_SPACE &&
+	    (cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0));
 	if (find_space == VMFS_OPTIMAL_SPACE && (object == NULL ||
 	    (object->flags & OBJ_COLORED) == 0))
 		find_space = VMFS_ANY_SPACE;
@@ -1496,6 +1531,14 @@
 			}
 
 			start = *addr;
+		} else if ((cow & MAP_REMAP) != 0) {
+			if (start < vm_map_min(map) ||
+			    start + length > vm_map_max(map) ||
+			    start + length <= length) {
+				result = KERN_INVALID_ADDRESS;
+				break;
+			}
+			vm_map_delete(map, start, start + length);
 		}
 		if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) {
 			result = vm_map_stack_locked(map, start, length,
@@ -1549,7 +1592,7 @@
  *
  *	The map must be locked.
  *
- *	This routine guarentees that the passed entry remains valid (though
+ *	This routine guarantees that the passed entry remains valid (though
  *	possibly extended).  When merging, this routine may delete one or
  *	both neighbors.
  */
@@ -1655,6 +1698,8 @@
 	vm_map_entry_t new_entry;
 
 	VM_MAP_ASSERT_LOCKED(map);
+	KASSERT(entry->end > start && entry->start < start,
+	    ("_vm_map_clip_start: invalid clip of entry %p", entry));
 
 	/*
 	 * Split off the front portion -- note that we must insert the new
@@ -1740,6 +1785,8 @@
 	vm_map_entry_t new_entry;
 
 	VM_MAP_ASSERT_LOCKED(map);
+	KASSERT(entry->start < end && entry->end > end,
+	    ("_vm_map_clip_end: invalid clip of entry %p", entry));
 
 	/*
 	 * If there is no object backing this entry, we might as well create
@@ -1856,11 +1903,9 @@
  *	limited number of page mappings are created at the low-end of the
  *	specified address range.  (For this purpose, a superpage mapping
  *	counts as one page mapping.)  Otherwise, all resident pages within
- *	the specified address range are mapped.  Because these mappings are
- *	being created speculatively, cached pages are not reactivated and
- *	mapped.
+ *	the specified address range are mapped.
  */
-void
+static void
 vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
     vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags)
 {
@@ -1910,7 +1955,7 @@
 		 * free pages allocating pv entries.
 		 */
 		if (((flags & MAP_PREFAULT_MADVISE) != 0 &&
-		    cnt.v_free_count < cnt.v_free_reserved) ||
+		    vm_cnt.v_free_count < vm_cnt.v_free_reserved) ||
 		    ((flags & MAP_PREFAULT_PARTIAL) != 0 &&
 		    tmpidx >= threshold)) {
 			psize = tmpidx;
@@ -1926,7 +1971,7 @@
 			    (pagesizes[p->psind] - 1)) == 0) {
 				mask = atop(pagesizes[p->psind]) - 1;
 				if (tmpidx + mask < psize &&
-				    vm_page_ps_is_valid(p)) {
+				    vm_page_ps_test(p, PS_ALL_VALID, NULL)) {
 					p += mask;
 					threshold += mask;
 				}
@@ -1955,7 +2000,7 @@
 vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end,
 	       vm_prot_t new_prot, boolean_t set_max)
 {
-	vm_map_entry_t current, entry;
+	vm_map_entry_t current, entry, in_tran;
 	vm_object_t obj;
 	struct ucred *cred;
 	vm_prot_t old_prot;
@@ -1963,8 +2008,18 @@
 	if (start == end)
 		return (KERN_SUCCESS);
 
+again:
+	in_tran = NULL;
 	vm_map_lock(map);
 
+	/*
+	 * Ensure that we are not concurrently wiring pages.  vm_map_wire() may
+	 * need to fault pages into the map and will drop the map lock while
+	 * doing so, and the VM object may end up in an inconsistent state if we
+	 * update the protection on the map entry in between faults.
+	 */
+	vm_map_wait_busy(map);
+
 	VM_MAP_RANGE_CHECK(map, start, end);
 
 	if (vm_map_lookup_entry(map, start, &entry)) {
@@ -1976,8 +2031,7 @@
 	/*
 	 * Make a first pass to check for protection violations.
 	 */
-	for (current = entry; current != &map->header && current->start < end;
-	    current = current->next) {
+	for (current = entry; current->start < end; current = current->next) {
 		if ((current->eflags & MAP_ENTRY_GUARD) != 0)
 			continue;
 		if (current->eflags & MAP_ENTRY_IS_SUB_MAP) {
@@ -1988,15 +2042,29 @@
 			vm_map_unlock(map);
 			return (KERN_PROTECTION_FAILURE);
 		}
+		if ((current->eflags & MAP_ENTRY_IN_TRANSITION) != 0)
+			in_tran = current;
 	}
 
 	/*
+	 * Postpone the operation until all in-transition map entries have
+	 * stabilized.  An in-transition entry might already have its pages
+	 * wired and wired_count incremented, but not yet have its
+	 * MAP_ENTRY_USER_WIRED flag set.  In which case, we would fail to call
+	 * vm_fault_copy_entry() in the final loop below.
+	 */
+	if (in_tran != NULL) {
+		in_tran->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
+		vm_map_unlock_and_wait(map, 0);
+		goto again;
+	}
+
+	/*
 	 * Do an accounting pass for private read-only mappings that
 	 * now will do cow due to allowed write (e.g. debugger sets
 	 * breakpoint on text segment)
 	 */
-	for (current = entry; current != &map->header && current->start < end;
-	    current = current->next) {
+	for (current = entry; current->start < end; current = current->next) {
 
 		vm_map_clip_end(map, current, end);
 
@@ -2050,8 +2118,7 @@
 	 * Go back and fix up protections. [Note that clipping is not
 	 * necessary the second time.]
 	 */
-	for (current = entry; current != &map->header && current->start < end;
-	    current = current->next) {
+	for (current = entry; current->start < end; current = current->next) {
 		if ((current->eflags & MAP_ENTRY_GUARD) != 0)
 			continue;
 
@@ -2160,10 +2227,8 @@
 		 * We clip the vm_map_entry so that behavioral changes are
 		 * limited to the specified address range.
 		 */
-		for (current = entry;
-		     (current != &map->header) && (current->start < end);
-		     current = current->next
-		) {
+		for (current = entry; current->start < end;
+		    current = current->next) {
 			if (current->eflags & MAP_ENTRY_IS_SUB_MAP)
 				continue;
 
@@ -2207,15 +2272,25 @@
 		 * Since we don't clip the vm_map_entry, we have to clip
 		 * the vm_object pindex and count.
 		 */
-		for (current = entry;
-		     (current != &map->header) && (current->start < end);
-		     current = current->next
-		) {
+		for (current = entry; current->start < end;
+		    current = current->next) {
 			vm_offset_t useEnd, useStart;
 
 			if (current->eflags & MAP_ENTRY_IS_SUB_MAP)
 				continue;
 
+			/*
+			 * MADV_FREE would otherwise rewind time to
+			 * the creation of the shadow object.  Because
+			 * we hold the VM map read-locked, neither the
+			 * entry's object nor the presence of a
+			 * backing object can change.
+			 */
+			if (behav == MADV_FREE &&
+			    current->object.vm_object != NULL &&
+			    current->object.vm_object->backing_object != NULL)
+				continue;
+
 			pstart = OFF_TO_IDX(current->offset);
 			pend = pstart + atop(current->end - current->start);
 			useStart = current->start;
@@ -2306,7 +2381,7 @@
 		vm_map_clip_start(map, entry, start);
 	} else
 		entry = temp_entry->next;
-	while ((entry != &map->header) && (entry->start < end)) {
+	while (entry->start < end) {
 		vm_map_clip_end(map, entry, end);
 		if ((entry->eflags & MAP_ENTRY_GUARD) == 0 ||
 		    new_inheritance != VM_INHERIT_ZERO)
@@ -2348,7 +2423,7 @@
 	}
 	last_timestamp = map->timestamp;
 	entry = first_entry;
-	while (entry != &map->header && entry->start < end) {
+	while (entry->start < end) {
 		if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
 			/*
 			 * We have not yet clipped the entry.
@@ -2411,8 +2486,7 @@
 		 * If VM_MAP_WIRE_HOLESOK was specified, skip this check.
 		 */
 		if (((flags & VM_MAP_WIRE_HOLESOK) == 0) &&
-		    (entry->end < end && (entry->next == &map->header ||
-		    entry->next->start > entry->end))) {
+		    (entry->end < end && entry->next->start > entry->end)) {
 			end = entry->end;
 			rv = KERN_INVALID_ADDRESS;
 			goto done;
@@ -2438,8 +2512,7 @@
 		else
 			KASSERT(result, ("vm_map_unwire: lookup failed"));
 	}
-	for (entry = first_entry; entry != &map->header && entry->start < end;
-	    entry = entry->next) {
+	for (entry = first_entry; entry->start < end; entry = entry->next) {
 		/*
 		 * If VM_MAP_WIRE_HOLESOK was specified, an empty
 		 * space in the unwired region could have been mapped
@@ -2553,7 +2626,7 @@
 	}
 	last_timestamp = map->timestamp;
 	entry = first_entry;
-	while (entry != &map->header && entry->start < end) {
+	while (entry->start < end) {
 		if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
 			/*
 			 * We have not yet clipped the entry.
@@ -2690,8 +2763,7 @@
 		 */
 	next_entry:
 		if ((flags & VM_MAP_WIRE_HOLESOK) == 0 &&
-		    entry->end < end && (entry->next == &map->header ||
-		    entry->next->start > entry->end)) {
+		    entry->end < end && entry->next->start > entry->end) {
 			end = entry->end;
 			rv = KERN_INVALID_ADDRESS;
 			goto done;
@@ -2708,8 +2780,7 @@
 		else
 			KASSERT(result, ("vm_map_wire: lookup failed"));
 	}
-	for (entry = first_entry; entry != &map->header && entry->start < end;
-	    entry = entry->next) {
+	for (entry = first_entry; entry->start < end; entry = entry->next) {
 		/*
 		 * If VM_MAP_WIRE_HOLESOK was specified, an empty
 		 * space in the unwired region could have been mapped
@@ -2813,15 +2884,13 @@
 	/*
 	 * Make a first pass to check for user-wired memory and holes.
 	 */
-	for (current = entry; current != &map->header && current->start < end;
-	    current = current->next) {
+	for (current = entry; current->start < end; current = current->next) {
 		if (invalidate && (current->eflags & MAP_ENTRY_USER_WIRED)) {
 			vm_map_unlock_read(map);
 			return (KERN_INVALID_ARGUMENT);
 		}
 		if (end > current->end &&
-		    (current->next == &map->header ||
-			current->end != current->next->start)) {
+		    current->end != current->next->start) {
 			vm_map_unlock_read(map);
 			return (KERN_INVALID_ADDRESS);
 		}
@@ -2835,7 +2904,7 @@
 	 * Make a second pass, cleaning/uncaching pages from the indicated
 	 * objects as we go.
 	 */
-	for (current = entry; current != &map->header && current->start < end;) {
+	for (current = entry; current->start < end;) {
 		offset = current->offset + (start - current->start);
 		size = (end <= current->end ? end : current->end) - start;
 		if (current->eflags & MAP_ENTRY_IS_SUB_MAP) {
@@ -2912,7 +2981,7 @@
 {
 	vm_object_t object;
 	vm_pindex_t offidxstart, offidxend, count, size1;
-	vm_ooffset_t size;
+	vm_size_t size;
 
 	vm_map_entry_unlink(map, entry);
 	object = entry->object.vm_object;
@@ -2938,7 +3007,7 @@
 		KASSERT(entry->cred == NULL || object->cred == NULL ||
 		    (entry->eflags & MAP_ENTRY_NEEDS_COPY),
 		    ("OVERCOMMIT vm_map_entry_delete: both cred %p", entry));
-		count = OFF_TO_IDX(size);
+		count = atop(size);
 		offidxstart = OFF_TO_IDX(entry->offset);
 		offidxend = offidxstart + count;
 		VM_OBJECT_WLOCK(object);
@@ -3012,7 +3081,7 @@
 	/*
 	 * Step through all entries in this region
 	 */
-	while ((entry != &map->header) && (entry->start < end)) {
+	while (entry->start < end) {
 		vm_map_entry_t next;
 
 		/*
@@ -3058,11 +3127,17 @@
 		 * Unwire before removing addresses from the pmap; otherwise,
 		 * unwiring will put the entries back in the pmap.
 		 */
-		if (entry->wired_count != 0) {
+		if (entry->wired_count != 0)
 			vm_map_entry_unwire(map, entry);
-		}
 
-		pmap_remove(map->pmap, entry->start, entry->end);
+		/*
+		 * Remove mappings for the pages, but only if the
+		 * mappings could exist.  For instance, it does not
+		 * make sense to call pmap_remove() for guard entries.
+		 */
+		if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0 ||
+		    entry->object.vm_object != NULL)
+			pmap_remove(map->pmap, entry->start, entry->end);
 
 		/*
 		 * Delete the entry only after removing all pmap
@@ -3120,8 +3195,6 @@
 	entry = tmp_entry;
 
 	while (start < end) {
-		if (entry == &map->header)
-			return (FALSE);
 		/*
 		 * No holes allowed!
 		 */
@@ -3325,7 +3398,8 @@
 
 	old_map = &vm1->vm_map;
 	/* Copy immutable fields of vm1 to vm2. */
-	vm2 = vmspace_alloc(old_map->min_offset, old_map->max_offset, NULL);
+	vm2 = vmspace_alloc(vm_map_min(old_map), vm_map_max(old_map),
+	    pmap_pinit);
 	if (vm2 == NULL)
 		return (NULL);
 	vm2->vm_taddr = vm1->vm_taddr;
@@ -3529,9 +3603,7 @@
 	growsize = sgrowsiz;
 	init_ssize = (max_ssize < growsize) ? max_ssize : growsize;
 	vm_map_lock(map);
-	PROC_LOCK(curproc);
-	vmemlim = lim_cur(curproc, RLIMIT_VMEM);
-	PROC_UNLOCK(curproc);
+	vmemlim = lim_cur(curthread, RLIMIT_VMEM);
 	/* If we would blow our VMEM resource limit, no go */
 	if (map->size + init_ssize > vmemlim) {
 		rv = KERN_NO_SPACE;
@@ -3572,7 +3644,8 @@
 	    addrbos + max_ssize > vm_map_max(map) ||
 	    addrbos + max_ssize <= addrbos)
 		return (KERN_INVALID_ADDRESS);
-	sgp = (vm_size_t)stack_guard_page * PAGE_SIZE;
+	sgp = (curproc->p_flag2 & P2_STKGAP_DISABLE) != 0 ? 0 :
+	    (vm_size_t)stack_guard_page * PAGE_SIZE;
 	if (sgp >= max_ssize)
 		return (KERN_INVALID_ARGUMENT);
 
@@ -3585,10 +3658,9 @@
 		return (KERN_NO_SPACE);
 
 	/*
-	 * If we can't accomodate max_ssize in the current mapping, no go.
+	 * If we can't accommodate max_ssize in the current mapping, no go.
 	 */
-	if ((prev_entry->next != &map->header) &&
-	    (prev_entry->next->start < addrbos + max_ssize))
+	if (prev_entry->next->start < addrbos + max_ssize)
 		return (KERN_NO_SPACE);
 
 	/*
@@ -3624,11 +3696,25 @@
 	KASSERT((orient & MAP_STACK_GROWS_UP) == 0 ||
 	    (new_entry->eflags & MAP_ENTRY_GROWS_UP) != 0,
 	    ("new entry lacks MAP_ENTRY_GROWS_UP"));
+	if (gap_bot == gap_top)
+		return (KERN_SUCCESS);
 	rv = vm_map_insert(map, NULL, 0, gap_bot, gap_top, VM_PROT_NONE,
 	    VM_PROT_NONE, MAP_CREATE_GUARD | (orient == MAP_STACK_GROWS_DOWN ?
 	    MAP_CREATE_STACK_GAP_DN : MAP_CREATE_STACK_GAP_UP));
-	if (rv != KERN_SUCCESS)
+	if (rv == KERN_SUCCESS) {
+		/*
+		 * Gap can never successfully handle a fault, so
+		 * read-ahead logic is never used for it.  Re-use
+		 * next_read of the gap entry to store
+		 * stack_guard_page for vm_map_growstack().
+		 */
+		if (orient == MAP_STACK_GROWS_DOWN)
+			new_entry->prev->next_read = sgp;
+		else
+			new_entry->next->next_read = sgp;
+	} else {
 		(void)vm_map_delete(map, bot, top);
+	}
 	return (rv);
 }
 
@@ -3663,17 +3749,15 @@
 	 * debugger or AIO daemon.  The reason is that the wrong
 	 * resource limits are applied.
 	 */
-	if (map != &p->p_vmspace->vm_map || p->p_textvp == NULL)
+	if (p != initproc && (map != &p->p_vmspace->vm_map ||
+	    p->p_textvp == NULL))
 		return (KERN_FAILURE);
 
 	MPASS(!map->system_map);
 
-	guard = stack_guard_page * PAGE_SIZE;
-	PROC_LOCK(p);
-	lmemlim = lim_cur(p, RLIMIT_MEMLOCK);
-	stacklim = lim_cur(p, RLIMIT_STACK);
-	vmemlim = lim_cur(p, RLIMIT_VMEM);
-	PROC_UNLOCK(p);
+	lmemlim = lim_cur(curthread, RLIMIT_MEMLOCK);
+	stacklim = lim_cur(curthread, RLIMIT_STACK);
+	vmemlim = lim_cur(curthread, RLIMIT_VMEM);
 retry:
 	/* If addr is not in a hole for a stack grow area, no need to grow. */
 	if (gap_entry == NULL && !vm_map_lookup_entry(map, addr, &gap_entry))
@@ -3697,6 +3781,8 @@
 	} else {
 		return (KERN_FAILURE);
 	}
+	guard = (curproc->p_flag2 & P2_STKGAP_DISABLE) != 0 ? 0 :
+	    gap_entry->next_read;
 	max_grow = gap_entry->end - gap_entry->start;
 	if (guard > max_grow)
 		return (KERN_NO_SPACE);
@@ -3844,9 +3930,7 @@
 	if (rv == KERN_SUCCESS && (map->flags & MAP_WIREFUTURE) != 0) {
 		vm_map_unlock(map);
 		vm_map_wire(map, grow_start, grow_start + grow_amount,
-		    (p->p_flag & P_SYSTEM)
-		    ? VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES
-		    : VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES);
+		    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
 		vm_map_lock_read(map);
 	} else
 		vm_map_lock_downgrade(map);
@@ -3883,7 +3967,7 @@
 
 	KASSERT((curthread->td_pflags & TDP_EXECVMSPC) == 0,
 	    ("vmspace_exec recursed"));
-	newvmspace = vmspace_alloc(minuser, maxuser, NULL);
+	newvmspace = vmspace_alloc(minuser, maxuser, pmap_pinit);
 	if (newvmspace == NULL)
 		return (ENOMEM);
 	newvmspace->vm_swrss = oldvmspace->vm_swrss;
@@ -4125,7 +4209,7 @@
 	 * Return the object/offset from this entry.  If the entry was
 	 * copy-on-write or empty, it has been fixed up.
 	 */
-	*pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
+	*pindex = UOFF_TO_IDX((vaddr - entry->start) + entry->offset);
 	*object = entry->object.vm_object;
 
 	*out_prot = prot;
@@ -4206,7 +4290,7 @@
 	 * Return the object/offset from this entry.  If the entry was
 	 * copy-on-write or empty, it has been fixed up.
 	 */
-	*pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
+	*pindex = UOFF_TO_IDX((vaddr - entry->start) + entry->offset);
 	*object = entry->object.vm_object;
 
 	*out_prot = prot;
@@ -4228,6 +4312,27 @@
 	vm_map_unlock_read(map);
 }
 
+vm_offset_t
+vm_map_max_KBI(const struct vm_map *map)
+{
+
+	return (vm_map_max(map));
+}
+
+vm_offset_t
+vm_map_min_KBI(const struct vm_map *map)
+{
+
+	return (vm_map_min(map));
+}
+
+pmap_t
+vm_map_pmap_KBI(vm_map_t map)
+{
+
+	return (map->pmap);
+}
+
 #include "opt_ddb.h"
 #ifdef DDB
 #include <sys/kernel.h>
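
Most of the vm_map.c changes above follow a single pattern: loops that used to
test "entry != &map->header && entry->start < end" now test only
"entry->start < end", because the list header doubles as a sentinel whose
start field holds the map's maximum offset (see the vm_map.h hunk below).
A minimal userland sketch of that sentinel idea, using made-up types rather
than the kernel's:

#include <stdint.h>
#include <stdio.h>

struct entry {
	uintptr_t start, end;
	struct entry *next;
};

struct map {
	struct entry header;	/* header.start holds the maximum offset */
};

static void
walk(struct map *m, uintptr_t lo, uintptr_t hi)
{
	struct entry *e;

	/* No "e != &m->header" check: the sentinel's start ends the loop. */
	for (e = m->header.next; e->start < hi; e = e->next)
		if (e->end > lo)
			printf("overlaps: [%#lx, %#lx)\n",
			    (unsigned long)e->start, (unsigned long)e->end);
}

int
main(void)
{
	struct entry a = { 0x1000, 0x2000, NULL };
	struct entry b = { 0x3000, 0x4000, NULL };
	struct map m;

	m.header.start = UINTPTR_MAX;	/* sentinel: never less than any valid end */
	m.header.end = 0;
	m.header.next = &a;
	a.next = &b;
	b.next = &m.header;

	walk(&m, 0x1800, 0x3800);	/* prints both entries */
	return (0);
}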

Modified: trunk/sys/vm/vm_map.h
===================================================================
--- trunk/sys/vm/vm_map.h	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_map.h	2020-02-08 19:35:48 UTC (rev 12314)
@@ -58,7 +58,7 @@
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  *
- * $FreeBSD: stable/10/sys/vm/vm_map.h 321718 2017-07-30 10:49:13Z kib $
+ * $FreeBSD: stable/11/sys/vm/vm_map.h 343426 2019-01-25 11:46:07Z kib $
  */
 
 /*
@@ -105,6 +105,7 @@
 	vm_offset_t start;		/* start address */
 	vm_offset_t end;		/* end address */
 	vm_offset_t pad0;
+	vm_offset_t next_read;		/* vaddr of the next sequential read */
 	vm_size_t adj_free;		/* amount of adjacent free space */
 	vm_size_t max_free;		/* max free space in subtree */
 	union vm_map_object object;	/* object I point to */
@@ -115,7 +116,6 @@
 	vm_inherit_t inheritance;	/* inheritance */
 	uint8_t read_ahead;		/* pages in the read-ahead window */
 	int wired_count;		/* can be paged if = 0 */
-	vm_pindex_t next_read;		/* index of the next sequential read */
 	struct ucred *cred;		/* tmp storage for creator ref */
 	struct thread *wiring_thread;
 };
@@ -173,15 +173,26 @@
  *	A map is a set of map entries.  These map entries are
  *	organized both as a binary search tree and as a doubly-linked
  *	list.  Both structures are ordered based upon the start and
- *	end addresses contained within each map entry.  Sleator and
- *	Tarjan's top-down splay algorithm is employed to control
- *	height imbalance in the binary search tree.
+ *	end addresses contained within each map entry.
  *
- * List of locks
+ *	Counterintuitively, the map's min offset value is stored in
+ *	map->header.end, and its max offset value is stored in
+ *	map->header.start.
+ *
+ *	The list header has max start value and min end value to act
+ *	as sentinels for sequential search of the doubly-linked list.
+ *	Sleator and Tarjan's top-down splay algorithm is employed to
+ *	control height imbalance in the binary search tree.
+ *
+ *	List of locks
  *	(c)	const until freed
  */
 struct vm_map {
 	struct vm_map_entry header;	/* List of entries */
+/*
+	map min_offset	header.end	(c)
+	map max_offset	header.start	(c)
+*/
 	struct sx lock;			/* Lock for map data */
 	struct mtx system_mtx;
 	int nentries;			/* Number of entries */
@@ -192,8 +203,6 @@
 	vm_flags_t flags;		/* flags for this vm_map */
 	vm_map_entry_t root;		/* Root of a binary search tree */
 	pmap_t pmap;			/* (c) Physical map */
-#define	min_offset	header.start	/* (c) */
-#define	max_offset	header.end	/* (c) */
 	int busy;
 };
 
@@ -204,16 +213,23 @@
 #define	MAP_BUSY_WAKEUP		0x02
 
 #ifdef	_KERNEL
+#ifdef KLD_MODULE
+#define	vm_map_max(map)		vm_map_max_KBI((map))
+#define	vm_map_min(map)		vm_map_min_KBI((map))
+#define	vm_map_pmap(map)	vm_map_pmap_KBI((map))
+#else
 static __inline vm_offset_t
 vm_map_max(const struct vm_map *map)
 {
-	return (map->max_offset);
+
+	return (map->header.start);
 }
 
 static __inline vm_offset_t
 vm_map_min(const struct vm_map *map)
 {
-	return (map->min_offset);
+
+	return (map->header.end);
 }
 
 static __inline pmap_t
@@ -227,6 +243,7 @@
 {
 	map->flags = (map->flags | set) & ~clear;
 }
+#endif	/* KLD_MODULE */
 #endif	/* _KERNEL */
 
 /*
@@ -287,6 +304,9 @@
 void vm_map_busy(vm_map_t map);
 void vm_map_unbusy(vm_map_t map);
 void vm_map_wait_busy(vm_map_t map);
+vm_offset_t vm_map_max_KBI(const struct vm_map *map);
+vm_offset_t vm_map_min_KBI(const struct vm_map *map);
+pmap_t vm_map_pmap_KBI(vm_map_t map);
 
 #define	vm_map_lock(map)	_vm_map_lock(map, LOCK_FILE, LOCK_LINE)
 #define	vm_map_unlock(map)	_vm_map_unlock(map, LOCK_FILE, LOCK_LINE)
@@ -306,9 +326,8 @@
 #endif	/* _KERNEL */
 
 
-/* XXX: number of kernel maps and entries to statically allocate */
+/* XXX: number of kernel maps to statically allocate */
 #define MAX_KMAP	10
-#define	MAX_KMAPENT	128
 
 /*
  * Copy-on-write flags for vm_map operations
@@ -324,6 +343,7 @@
 #define MAP_DISABLE_COREDUMP	0x0100
 #define MAP_PREFAULT_MADVISE	0x0200	/* from (user) madvise request */
 #define	MAP_VN_WRITECOUNT	0x0400
+#define	MAP_REMAP		0x0800
 #define	MAP_STACK_GROWS_DOWN	0x1000
 #define	MAP_STACK_GROWS_UP	0x2000
 #define	MAP_ACC_CHARGED		0x4000
@@ -389,15 +409,13 @@
     vm_pindex_t *, vm_prot_t *, boolean_t *);
 void vm_map_lookup_done (vm_map_t, vm_map_entry_t);
 boolean_t vm_map_lookup_entry (vm_map_t, vm_offset_t, vm_map_entry_t *);
-void vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
-    vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags);
 int vm_map_protect (vm_map_t, vm_offset_t, vm_offset_t, vm_prot_t, boolean_t);
 int vm_map_remove (vm_map_t, vm_offset_t, vm_offset_t);
+void vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry);
 void vm_map_startup (void);
 int vm_map_submap (vm_map_t, vm_offset_t, vm_offset_t, vm_map_t);
 int vm_map_sync(vm_map_t, vm_offset_t, vm_offset_t, boolean_t, boolean_t);
 int vm_map_madvise (vm_map_t, vm_offset_t, vm_offset_t, int);
-void vm_map_simplify_entry (vm_map_t, vm_map_entry_t);
 int vm_map_stack (vm_map_t, vm_offset_t, vm_size_t, vm_prot_t, vm_prot_t, int);
 int vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end,
     int flags);
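
The KLD_MODULE block above is a kernel binary interface (KBI) shim: modules
see vm_map_max(), vm_map_min() and vm_map_pmap() as calls to the out-of-line
*_KBI functions instead of inlines that hard-code struct vm_map's layout, so
the layout (including the new header sentinel convention) can change without
breaking module compatibility.  A self-contained model of that split, with
illustrative names only:

#include <stdio.h>

struct map {
	unsigned long min;		/* plays the role of header.end */
	unsigned long max;		/* plays the role of header.start */
};

/* Out-of-line accessor; the only symbol an external module binds to. */
unsigned long
map_max_kbi(const struct map *m)
{

	return (m->max);
}

#ifdef MODULE_BUILD			/* stands in for KLD_MODULE */
#define	map_max(m)	map_max_kbi((m))
#else
static inline unsigned long
map_max(const struct map *m)
{

	return (m->max);
}
#endif

int
main(void)
{
	struct map m = { 0x1000UL, 0xbfffffffUL };

	printf("max = %#lx\n", map_max(&m));
	return (0);
}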

Modified: trunk/sys/vm/vm_meter.c
===================================================================
--- trunk/sys/vm/vm_meter.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_meter.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -31,7 +31,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vm_meter.c 311049 2017-01-02 08:31:29Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_meter.c 331722 2018-03-29 02:50:57Z eadler $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -54,24 +54,20 @@
 #include <vm/vm_object.h>
 #include <sys/sysctl.h>
 
-struct vmmeter cnt;
+struct vmmeter vm_cnt;
 
 SYSCTL_UINT(_vm, VM_V_FREE_MIN, v_free_min,
-	CTLFLAG_RW, &cnt.v_free_min, 0, "Minimum low-free-pages threshold");
+	CTLFLAG_RW, &vm_cnt.v_free_min, 0, "Minimum low-free-pages threshold");
 SYSCTL_UINT(_vm, VM_V_FREE_TARGET, v_free_target,
-	CTLFLAG_RW, &cnt.v_free_target, 0, "Desired free pages");
+	CTLFLAG_RW, &vm_cnt.v_free_target, 0, "Desired free pages");
 SYSCTL_UINT(_vm, VM_V_FREE_RESERVED, v_free_reserved,
-	CTLFLAG_RW, &cnt.v_free_reserved, 0, "Pages reserved for deadlock");
+	CTLFLAG_RW, &vm_cnt.v_free_reserved, 0, "Pages reserved for deadlock");
 SYSCTL_UINT(_vm, VM_V_INACTIVE_TARGET, v_inactive_target,
-	CTLFLAG_RW, &cnt.v_inactive_target, 0, "Pages desired inactive");
-SYSCTL_UINT(_vm, VM_V_CACHE_MIN, v_cache_min,
-	CTLFLAG_RW, &cnt.v_cache_min, 0, "Min pages on cache queue");
-SYSCTL_UINT(_vm, VM_V_CACHE_MAX, v_cache_max,
-	CTLFLAG_RW, &cnt.v_cache_max, 0, "Max pages on cache queue");
+	CTLFLAG_RW, &vm_cnt.v_inactive_target, 0, "Pages desired inactive");
 SYSCTL_UINT(_vm, VM_V_PAGEOUT_FREE_MIN, v_pageout_free_min,
-	CTLFLAG_RW, &cnt.v_pageout_free_min, 0, "Min pages reserved for kernel");
+	CTLFLAG_RW, &vm_cnt.v_pageout_free_min, 0, "Min pages reserved for kernel");
 SYSCTL_UINT(_vm, OID_AUTO, v_free_severe,
-	CTLFLAG_RW, &cnt.v_free_severe, 0, "Severe page depletion point");
+	CTLFLAG_RW, &vm_cnt.v_free_severe, 0, "Severe page depletion point");
 
 static int
 sysctl_vm_loadavg(SYSCTL_HANDLER_ARGS)
@@ -140,7 +136,7 @@
 						else
 							total.t_sl++;
 						if (td->td_wchan ==
-						    &cnt.v_free_count)
+						    &vm_cnt.v_free_count)
 							total.t_pw++;
 					}
 					break;
@@ -209,13 +205,13 @@
 		}
 	}
 	mtx_unlock(&vm_object_list_mtx);
-	total.t_free = cnt.v_free_count + cnt.v_cache_count;
+	total.t_free = vm_cnt.v_free_count;
 	return (sysctl_handle_opaque(oidp, &total, sizeof(total), req));
 }
 
 /*
- * vcnt() -	accumulate statistics from all cpus and the global cnt
- *		structure.
+ * vm_meter_cnt() -	accumulate statistics from all cpus and the global cnt
+ *			structure.
  *
  *	The vmmeter structure is now per-cpu as well as global.  Those
  *	statistics which can be kept on a per-cpu basis (to avoid cache
@@ -222,23 +218,31 @@
  *	stalls between cpus) can be moved to the per-cpu vmmeter.  Remaining
  *	statistics, such as v_free_reserved, are left in the global
  *	structure.
- *
- * (sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req)
  */
-static int
-vcnt(SYSCTL_HANDLER_ARGS)
+u_int
+vm_meter_cnt(size_t offset)
 {
-	int count = *(int *)arg1;
-	int offset = (char *)arg1 - (char *)&cnt;
+	struct pcpu *pcpu;
+	u_int count;
 	int i;
 
+	count = *(u_int *)((char *)&vm_cnt + offset);
 	CPU_FOREACH(i) {
-		struct pcpu *pcpu = pcpu_find(i);
-		count += *(int *)((char *)&pcpu->pc_cnt + offset);
+		pcpu = pcpu_find(i);
+		count += *(u_int *)((char *)&pcpu->pc_cnt + offset);
 	}
-	return (SYSCTL_OUT(req, &count, sizeof(int)));
+	return (count);
 }
 
+static int
+cnt_sysctl(SYSCTL_HANDLER_ARGS)
+{
+	u_int count;
+
+	count = vm_meter_cnt((char *)arg1 - (char *)&vm_cnt);
+	return (SYSCTL_OUT(req, &count, sizeof(count)));
+}
+
 SYSCTL_PROC(_vm, VM_TOTAL, vmtotal, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE,
     0, sizeof(struct vmtotal), vmtotal, "S,vmtotal", 
     "System virtual memory statistics");
@@ -251,8 +255,8 @@
 
 #define	VM_STATS(parent, var, descr) \
 	SYSCTL_PROC(parent, OID_AUTO, var, \
-	    CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, &cnt.var, 0, vcnt, \
-	    "IU", descr)
+	    CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE, &vm_cnt.var, 0,	\
+	    cnt_sysctl, "IU", descr)
 #define	VM_STATS_VM(var, descr)		VM_STATS(_vm_stats_vm, var, descr)
 #define	VM_STATS_SYS(var, descr)	VM_STATS(_vm_stats_sys, var, descr)
 
@@ -276,9 +280,10 @@
 VM_STATS_VM(v_vnodepgsin, "Vnode pages paged in");
 VM_STATS_VM(v_vnodepgsout, "Vnode pages paged out");
 VM_STATS_VM(v_intrans, "In transit page faults");
-VM_STATS_VM(v_reactivated, "Pages reactivated from free list");
+VM_STATS_VM(v_reactivated, "Pages reactivated by pagedaemon");
 VM_STATS_VM(v_pdwakeups, "Pagedaemon wakeups");
 VM_STATS_VM(v_pdpages, "Pages analyzed by pagedaemon");
+VM_STATS_VM(v_pdshortfalls, "Page reclamation shortfalls");
 VM_STATS_VM(v_tcached, "Total pages cached");
 VM_STATS_VM(v_dfree, "Pages freed by pagedaemon");
 VM_STATS_VM(v_pfree, "Pages freed by exiting processes");
@@ -293,9 +298,8 @@
 VM_STATS_VM(v_active_count, "Active pages");
 VM_STATS_VM(v_inactive_target, "Desired inactive pages");
 VM_STATS_VM(v_inactive_count, "Inactive pages");
+VM_STATS_VM(v_laundry_count, "Pages eligible for laundering");
 VM_STATS_VM(v_cache_count, "Pages on cache queue");
-VM_STATS_VM(v_cache_min, "Min pages on cache queue");
-VM_STATS_VM(v_cache_max, "Max pages on cached queue");
 VM_STATS_VM(v_pageout_free_min, "Min pages reserved for kernel");
 VM_STATS_VM(v_interrupt_free_min, "Reserved pages for interrupt code");
 VM_STATS_VM(v_forks, "Number of fork() calls");
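
The old vcnt() sysctl handler above is split into vm_meter_cnt(), which sums a
single vmmeter field, identified by its byte offset, across the global vm_cnt
and every CPU's pc_cnt copy, and cnt_sysctl(), which merely exports the result.
A self-contained model of that offset-based accumulation (the structure and
array here are stand-ins, not the kernel's):

#include <stddef.h>
#include <stdio.h>

struct counters {
	unsigned int free_count;
	unsigned int wire_count;
};

static struct counters global;		/* plays the role of vm_cnt */
static struct counters percpu[4];	/* plays the role of each pcpu->pc_cnt */

static unsigned int
meter_cnt(size_t offset)
{
	unsigned int count;
	int i;

	count = *(unsigned int *)((char *)&global + offset);
	for (i = 0; i < 4; i++)
		count += *(unsigned int *)((char *)&percpu[i] + offset);
	return (count);
}

int
main(void)
{

	global.free_count = 10;
	percpu[0].free_count = 1;
	percpu[2].free_count = 3;
	printf("free_count = %u\n",
	    meter_cnt(offsetof(struct counters, free_count)));	/* prints 14 */
	return (0);
}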

Modified: trunk/sys/vm/vm_mmap.c
===================================================================
--- trunk/sys/vm/vm_mmap.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_mmap.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -42,10 +42,11 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vm_mmap.c 321717 2017-07-30 10:36:20Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_mmap.c 356634 2020-01-11 15:06:06Z kevans $");
 
 #include "opt_compat.h"
 #include "opt_hwpmc_hooks.h"
+#include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -74,6 +75,7 @@
 #include <sys/sysent.h>
 #include <sys/vmmeter.h>
 
+#include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
@@ -93,21 +95,16 @@
 #endif
 
 int old_mlock = 0;
-SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RW | CTLFLAG_TUN, &old_mlock, 0,
+SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RWTUN, &old_mlock, 0,
     "Do not apply RLIMIT_MEMLOCK on mlockall");
-TUNABLE_INT("vm.old_mlock", &old_mlock);
+static int mincore_mapped = 1;
+SYSCTL_INT(_vm, OID_AUTO, mincore_mapped, CTLFLAG_RWTUN, &mincore_mapped, 0,
+    "mincore reports mappings, not residency");
 
 #ifdef MAP_32BIT
 #define	MAP_32BIT_MAX_ADDR	((vm_offset_t)1 << 31)
 #endif
 
-static int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
-    int *, struct vnode *, vm_ooffset_t *, vm_object_t *, boolean_t *);
-static int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
-    int *, struct cdev *, vm_ooffset_t *, vm_object_t *);
-static int vm_mmap_shm(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
-    int *, struct shmfd *, vm_ooffset_t, vm_object_t *);
-
 #ifndef _SYS_SYSPROTO_H_
 struct sbrk_args {
 	int incr;
@@ -177,34 +174,48 @@
 #endif
 
 int
-sys_mmap(td, uap)
-	struct thread *td;
-	struct mmap_args *uap;
+sys_mmap(struct thread *td, struct mmap_args *uap)
 {
-#ifdef HWPMC_HOOKS
-	struct pmckern_map_in pkm;
-#endif
+
+	return (kern_mmap(td, (uintptr_t)uap->addr, uap->len, uap->prot,
+	    uap->flags, uap->fd, uap->pos));
+}
+
+int
+kern_mmap(struct thread *td, uintptr_t addr0, size_t size, int prot, int flags,
+    int fd, off_t pos)
+{
+
+	return (kern_mmap_fpcheck(td, addr0, size, prot, flags, fd, pos, NULL));
+}
+
+/*
+ * When mmap'ing a file, check_fp_fn may be used for the caller to do any
+ * last-minute validation based on the referenced file in a non-racy way.
+ */
+int
+kern_mmap_fpcheck(struct thread *td, uintptr_t addr0, size_t size, int prot,
+    int flags, int fd, off_t pos, mmap_check_fp_fn check_fp_fn)
+{
+	struct vmspace *vms;
 	struct file *fp;
-	struct vnode *vp;
 	vm_offset_t addr;
-	vm_size_t size, pageoff;
-	vm_prot_t cap_maxprot, prot, maxprot;
-	void *handle;
-	objtype_t handle_type;
-	int align, error, flags;
-	off_t pos;
-	struct vmspace *vms = td->td_proc->p_vmspace;
+	vm_size_t pageoff;
+	vm_prot_t cap_maxprot;
+	int align, error;
 	cap_rights_t rights;
 
-	addr = (vm_offset_t) uap->addr;
-	size = uap->len;
-	prot = uap->prot & VM_PROT_ALL;
-	flags = uap->flags;
-	pos = uap->pos;
-
+	vms = td->td_proc->p_vmspace;
 	fp = NULL;
+	AUDIT_ARG_FD(fd);
+	addr = addr0;
 
 	/*
+	 * Ignore old flags that used to be defined but did not do anything.
+	 */
+	flags &= ~(MAP_RESERVED0020 | MAP_RESERVED0040);
+	
+	/*
 	 * Enforce the constraints.
 	 * Mapping of length 0 is only allowed for old binaries.
 	 * Anonymous mapping shall specify -1 as filedescriptor and
@@ -214,8 +225,8 @@
 	 * pos.
 	 */
 	if (!SV_CURPROC_FLAG(SV_AOUT)) {
-		if ((uap->len == 0 && curproc->p_osrel >= P_OSREL_MAP_ANON) ||
-		    ((flags & MAP_ANON) != 0 && (uap->fd != -1 || pos != 0)))
+		if ((size == 0 && curproc->p_osrel >= P_OSREL_MAP_ANON) ||
+		    ((flags & MAP_ANON) != 0 && (fd != -1 || pos != 0)))
 			return (EINVAL);
 	} else {
 		if ((flags & MAP_ANON) != 0)
@@ -223,15 +234,28 @@
 	}
 
 	if (flags & MAP_STACK) {
-		if ((uap->fd != -1) ||
+		if ((fd != -1) ||
 		    ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
 			return (EINVAL);
 		flags |= MAP_ANON;
 		pos = 0;
 	}
+	if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | MAP_HASSEMAPHORE |
+	    MAP_STACK | MAP_NOSYNC | MAP_ANON | MAP_EXCL | MAP_NOCORE |
+	    MAP_PREFAULT_READ | MAP_GUARD |
+#ifdef MAP_32BIT
+	    MAP_32BIT |
+#endif
+	    MAP_ALIGNMENT_MASK)) != 0)
+		return (EINVAL);
 	if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL)
 		return (EINVAL);
-	if ((flags & MAP_GUARD) != 0 && (prot != PROT_NONE || uap->fd != -1 ||
+	if ((flags & (MAP_SHARED | MAP_PRIVATE)) == (MAP_SHARED | MAP_PRIVATE))
+		return (EINVAL);
+	if (prot != PROT_NONE &&
+	    (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) != 0)
+		return (EINVAL);
+	if ((flags & MAP_GUARD) != 0 && (prot != PROT_NONE || fd != -1 ||
 	    pos != 0 || (flags & (MAP_SHARED | MAP_PRIVATE | MAP_PREFAULT |
 	    MAP_PREFAULT_READ | MAP_ANON | MAP_STACK)) != 0))
 		return (EINVAL);
@@ -295,28 +319,32 @@
 		 * There should really be a pmap call to determine a reasonable
 		 * location.
 		 */
-		PROC_LOCK(td->td_proc);
 		if (addr == 0 ||
 		    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
 		    addr < round_page((vm_offset_t)vms->vm_daddr +
-		    lim_max(td->td_proc, RLIMIT_DATA))))
+		    lim_max(td, RLIMIT_DATA))))
 			addr = round_page((vm_offset_t)vms->vm_daddr +
-			    lim_max(td->td_proc, RLIMIT_DATA));
-		PROC_UNLOCK(td->td_proc);
+			    lim_max(td, RLIMIT_DATA));
 	}
-	if ((flags & MAP_GUARD) != 0) {
-		handle = NULL;
-		handle_type = OBJT_DEFAULT;
-		maxprot = VM_PROT_NONE;
-		cap_maxprot = VM_PROT_NONE;
+	if (size == 0) {
+		/*
+		 * Return success without mapping anything for old
+		 * binaries that request a page-aligned mapping of
+		 * length 0.  For modern binaries, this function
+		 * returns an error earlier.
+		 */
+		error = 0;
+	} else if ((flags & MAP_GUARD) != 0) {
+		error = vm_mmap_object(&vms->vm_map, &addr, size, VM_PROT_NONE,
+		    VM_PROT_NONE, flags, NULL, pos, FALSE, td);
 	} else if ((flags & MAP_ANON) != 0) {
 		/*
 		 * Mapping blank space is trivial.
+		 *
+		 * This relies on VM_PROT_* matching PROT_*.
 		 */
-		handle = NULL;
-		handle_type = OBJT_DEFAULT;
-		maxprot = VM_PROT_ALL;
-		cap_maxprot = VM_PROT_ALL;
+		error = vm_mmap_object(&vms->vm_map, &addr, size, prot,
+		    VM_PROT_ALL, flags, NULL, pos, FALSE, td);
 	} else {
 		/*
 		 * Mapping file, get fp for validation and don't let the
@@ -333,94 +361,24 @@
 		}
 		if (prot & PROT_EXEC)
 			cap_rights_set(&rights, CAP_MMAP_X);
-		error = fget_mmap(td, uap->fd, &rights, &cap_maxprot, &fp);
+		error = fget_mmap(td, fd, &rights, &cap_maxprot, &fp);
 		if (error != 0)
 			goto done;
-		if (fp->f_type == DTYPE_SHM) {
-			handle = fp->f_data;
-			handle_type = OBJT_SWAP;
-			maxprot = VM_PROT_NONE;
-
-			/* FREAD should always be set. */
-			if (fp->f_flag & FREAD)
-				maxprot |= VM_PROT_EXECUTE | VM_PROT_READ;
-			if (fp->f_flag & FWRITE)
-				maxprot |= VM_PROT_WRITE;
-			goto map;
-		}
-		if (fp->f_type != DTYPE_VNODE) {
-			error = ENODEV;
+		if ((flags & (MAP_SHARED | MAP_PRIVATE)) == 0 &&
+		    td->td_proc->p_osrel >= P_OSREL_MAP_FSTRICT) {
+			error = EINVAL;
 			goto done;
 		}
-#if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \
-    defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4)
-		/*
-		 * POSIX shared-memory objects are defined to have
-		 * kernel persistence, and are not defined to support
-		 * read(2)/write(2) -- or even open(2).  Thus, we can
-		 * use MAP_ASYNC to trade on-disk coherence for speed.
-		 * The shm_open(3) library routine turns on the FPOSIXSHM
-		 * flag to request this behavior.
-		 */
-		if (fp->f_flag & FPOSIXSHM)
-			flags |= MAP_NOSYNC;
-#endif
-		vp = fp->f_vnode;
-		/*
-		 * Ensure that file and memory protections are
-		 * compatible.  Note that we only worry about
-		 * writability if mapping is shared; in this case,
-		 * current and max prot are dictated by the open file.
-		 * XXX use the vnode instead?  Problem is: what
-		 * credentials do we use for determination? What if
-		 * proc does a setuid?
-		 */
-		if (vp->v_mount != NULL && vp->v_mount->mnt_flag & MNT_NOEXEC)
-			maxprot = VM_PROT_NONE;
-		else
-			maxprot = VM_PROT_EXECUTE;
-		if (fp->f_flag & FREAD) {
-			maxprot |= VM_PROT_READ;
-		} else if (prot & PROT_READ) {
-			error = EACCES;
-			goto done;
-		}
-		/*
-		 * If we are sharing potential changes (either via
-		 * MAP_SHARED or via the implicit sharing of character
-		 * device mappings), and we are trying to get write
-		 * permission although we opened it without asking
-		 * for it, bail out.
-		 */
-		if ((flags & MAP_SHARED) != 0) {
-			if ((fp->f_flag & FWRITE) != 0) {
-				maxprot |= VM_PROT_WRITE;
-			} else if ((prot & PROT_WRITE) != 0) {
-				error = EACCES;
+		if (check_fp_fn != NULL) {
+			error = check_fp_fn(fp, prot, cap_maxprot, flags);
+			if (error != 0)
 				goto done;
-			}
-		} else if (vp->v_type != VCHR || (fp->f_flag & FWRITE) != 0) {
-			maxprot |= VM_PROT_WRITE;
-			cap_maxprot |= VM_PROT_WRITE;
 		}
-		handle = (void *)vp;
-		handle_type = OBJT_VNODE;
+		/* This relies on VM_PROT_* matching PROT_*. */
+		error = fo_mmap(fp, &vms->vm_map, &addr, size, prot,
+		    cap_maxprot, flags, pos, td);
 	}
-map:
-	td->td_fpop = fp;
-	maxprot &= cap_maxprot;
-	error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
-	    flags, handle_type, handle, pos);
-	td->td_fpop = NULL;
-#ifdef HWPMC_HOOKS
-	/* inform hwpmc(4) if an executable is being mapped */
-	if (error == 0 && handle_type == OBJT_VNODE &&
-	    (prot & PROT_EXEC)) {
-		pkm.pm_file = handle;
-		pkm.pm_address = (uintptr_t) addr;
-		PMC_CALL_HOOK(td, PMC_FN_MMAP, (void *) &pkm);
-	}
-#endif
+
 	if (error == 0)
 		td->td_retval[0] = (register_t) (addr + pageoff);
 done:
@@ -430,19 +388,15 @@
 	return (error);
 }
 
+#if defined(COMPAT_FREEBSD6)
 int
 freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap)
 {
-	struct mmap_args oargs;
 
-	oargs.addr = uap->addr;
-	oargs.len = uap->len;
-	oargs.prot = uap->prot;
-	oargs.flags = uap->flags;
-	oargs.fd = uap->fd;
-	oargs.pos = uap->pos;
-	return (sys_mmap(td, &oargs));
+	return (kern_mmap(td, (uintptr_t)uap->addr, uap->len, uap->prot,
+	    uap->flags, uap->fd, uap->pos));
 }
+#endif
 
 #ifdef COMPAT_43
 #ifndef _SYS_SYSPROTO_H_
@@ -456,11 +410,8 @@
 };
 #endif
 int
-ommap(td, uap)
-	struct thread *td;
-	struct ommap_args *uap;
+ommap(struct thread *td, struct ommap_args *uap)
 {
-	struct mmap_args nargs;
 	static const char cvtbsdprot[8] = {
 		0,
 		PROT_EXEC,
@@ -471,6 +422,7 @@
 		PROT_WRITE | PROT_READ,
 		PROT_EXEC | PROT_WRITE | PROT_READ,
 	};
+	int flags, prot;
 
 #define	OMAP_ANON	0x0002
 #define	OMAP_COPY	0x0020
@@ -477,30 +429,27 @@
 #define	OMAP_SHARED	0x0010
 #define	OMAP_FIXED	0x0100
 
-	nargs.addr = uap->addr;
-	nargs.len = uap->len;
-	nargs.prot = cvtbsdprot[uap->prot & 0x7];
+	prot = cvtbsdprot[uap->prot & 0x7];
 #ifdef COMPAT_FREEBSD32
-#if defined(__amd64__) || defined(__ia64__)
+#if defined(__amd64__)
 	if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) &&
-	    nargs.prot != 0)
-		nargs.prot |= PROT_EXEC;
+	    prot != 0)
+		prot |= PROT_EXEC;
 #endif
 #endif
-	nargs.flags = 0;
+	flags = 0;
 	if (uap->flags & OMAP_ANON)
-		nargs.flags |= MAP_ANON;
+		flags |= MAP_ANON;
 	if (uap->flags & OMAP_COPY)
-		nargs.flags |= MAP_COPY;
+		flags |= MAP_COPY;
 	if (uap->flags & OMAP_SHARED)
-		nargs.flags |= MAP_SHARED;
+		flags |= MAP_SHARED;
 	else
-		nargs.flags |= MAP_PRIVATE;
+		flags |= MAP_PRIVATE;
 	if (uap->flags & OMAP_FIXED)
-		nargs.flags |= MAP_FIXED;
-	nargs.fd = uap->fd;
-	nargs.pos = uap->pos;
-	return (sys_mmap(td, &nargs));
+		flags |= MAP_FIXED;
+	return (kern_mmap(td, (uintptr_t)uap->addr, uap->len, prot, flags,
+	    uap->fd, uap->pos));
 }
 #endif				/* COMPAT_43 */
 
@@ -513,20 +462,21 @@
 };
 #endif
 int
-sys_msync(td, uap)
-	struct thread *td;
-	struct msync_args *uap;
+sys_msync(struct thread *td, struct msync_args *uap)
 {
+
+	return (kern_msync(td, (uintptr_t)uap->addr, uap->len, uap->flags));
+}
+
+int
+kern_msync(struct thread *td, uintptr_t addr0, size_t size, int flags)
+{
 	vm_offset_t addr;
-	vm_size_t size, pageoff;
-	int flags;
+	vm_size_t pageoff;
 	vm_map_t map;
 	int rv;
 
-	addr = (vm_offset_t) uap->addr;
-	size = uap->len;
-	flags = uap->flags;
-
+	addr = addr0;
 	pageoff = (addr & PAGE_MASK);
 	addr -= pageoff;
 	size += pageoff;
@@ -565,23 +515,28 @@
 };
 #endif
 int
-sys_munmap(td, uap)
-	struct thread *td;
-	struct munmap_args *uap;
+sys_munmap(struct thread *td, struct munmap_args *uap)
 {
+
+	return (kern_munmap(td, (uintptr_t)uap->addr, uap->len));
+}
+
+int
+kern_munmap(struct thread *td, uintptr_t addr0, size_t size)
+{
 #ifdef HWPMC_HOOKS
 	struct pmckern_map_out pkm;
 	vm_map_entry_t entry;
+	bool pmc_handled;
 #endif
 	vm_offset_t addr;
-	vm_size_t size, pageoff;
+	vm_size_t pageoff;
 	vm_map_t map;
 
-	addr = (vm_offset_t) uap->addr;
-	size = uap->len;
 	if (size == 0)
 		return (EINVAL);
 
+	addr = addr0;
 	pageoff = (addr & PAGE_MASK);
 	addr -= pageoff;
 	size += pageoff;
@@ -597,20 +552,23 @@
 		return (EINVAL);
 	vm_map_lock(map);
 #ifdef HWPMC_HOOKS
-	/*
-	 * Inform hwpmc if the address range being unmapped contains
-	 * an executable region.
-	 */
-	pkm.pm_address = (uintptr_t) NULL;
-	if (vm_map_lookup_entry(map, addr, &entry)) {
-		for (;
-		     entry != &map->header && entry->start < addr + size;
-		     entry = entry->next) {
-			if (vm_map_check_protection(map, entry->start,
-				entry->end, VM_PROT_EXECUTE) == TRUE) {
-				pkm.pm_address = (uintptr_t) addr;
-				pkm.pm_size = (size_t) size;
-				break;
+	pmc_handled = false;
+	if (PMC_HOOK_INSTALLED(PMC_FN_MUNMAP)) {
+		pmc_handled = true;
+		/*
+		 * Inform hwpmc if the address range being unmapped contains
+		 * an executable region.
+		 */
+		pkm.pm_address = (uintptr_t) NULL;
+		if (vm_map_lookup_entry(map, addr, &entry)) {
+			for (; entry->start < addr + size;
+			    entry = entry->next) {
+				if (vm_map_check_protection(map, entry->start,
+					entry->end, VM_PROT_EXECUTE) == TRUE) {
+					pkm.pm_address = (uintptr_t) addr;
+					pkm.pm_size = (size_t) size;
+					break;
+				}
 			}
 		}
 	}
@@ -618,14 +576,16 @@
 	vm_map_delete(map, addr, addr + size);
 
 #ifdef HWPMC_HOOKS
-	/* downgrade the lock to prevent a LOR with the pmc-sx lock */
-	vm_map_lock_downgrade(map);
-	if (pkm.pm_address != (uintptr_t) NULL)
-		PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm);
-	vm_map_unlock_read(map);
-#else
-	vm_map_unlock(map);
+	if (__predict_false(pmc_handled)) {
+		/* downgrade the lock to prevent a LOR with the pmc-sx lock */
+		vm_map_lock_downgrade(map);
+		if (pkm.pm_address != (uintptr_t) NULL)
+			PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm);
+		vm_map_unlock_read(map);
+	} else
 #endif
+		vm_map_unlock(map);
+
 	/* vm_map_delete returns nothing but KERN_SUCCESS anyway */
 	return (0);
 }
@@ -638,22 +598,30 @@
 };
 #endif
 int
-sys_mprotect(td, uap)
-	struct thread *td;
-	struct mprotect_args *uap;
+sys_mprotect(struct thread *td, struct mprotect_args *uap)
 {
+
+	return (kern_mprotect(td, (uintptr_t)uap->addr, uap->len, uap->prot));
+}
+
+int
+kern_mprotect(struct thread *td, uintptr_t addr0, size_t size, int prot)
+{
 	vm_offset_t addr;
-	vm_size_t size, pageoff;
-	vm_prot_t prot;
+	vm_size_t pageoff;
 
-	addr = (vm_offset_t) uap->addr;
-	size = uap->len;
-	prot = uap->prot & VM_PROT_ALL;
-
+	addr = addr0;
+	prot = (prot & VM_PROT_ALL);
 	pageoff = (addr & PAGE_MASK);
 	addr -= pageoff;
 	size += pageoff;
 	size = (vm_size_t) round_page(size);
+#ifdef COMPAT_FREEBSD32
+	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
+		if (((addr + size) & 0xffffffff) < addr)
+			return (EINVAL);
+	} else
+#endif
 	if (addr + size < addr)
 		return (EINVAL);
 
@@ -715,8 +683,15 @@
 int
 sys_madvise(struct thread *td, struct madvise_args *uap)
 {
-	vm_offset_t start, end;
+
+	return (kern_madvise(td, (uintptr_t)uap->addr, uap->len, uap->behav));
+}
+
+int
+kern_madvise(struct thread *td, uintptr_t addr0, size_t len, int behav)
+{
 	vm_map_t map;
+	vm_offset_t addr, end, start;
 	int flags;
 
 	/*
@@ -723,7 +698,7 @@
 	 * Check for our special case, advising the swap pager we are
 	 * "immortal."
 	 */
-	if (uap->behav == MADV_PROTECT) {
+	if (behav == MADV_PROTECT) {
 		flags = PPROT_SET;
 		return (kern_procctl(td, P_PID, td->td_proc->p_pid,
 		    PROC_SPROTECT, &flags));
@@ -732,7 +707,7 @@
 	/*
 	 * Check for illegal behavior
 	 */
-	if (uap->behav < 0 || uap->behav > MADV_CORE)
+	if (behav < 0 || behav > MADV_CORE)
 		return (EINVAL);
 	/*
 	 * Check for illegal addresses.  Watch out for address wrap... Note
@@ -739,10 +714,10 @@
 	 * that VM_*_ADDRESS are not constants due to casts (argh).
 	 */
 	map = &td->td_proc->p_vmspace->vm_map;
-	if ((vm_offset_t)uap->addr < vm_map_min(map) ||
-	    (vm_offset_t)uap->addr + uap->len > vm_map_max(map))
+	addr = addr0;
+	if (addr < vm_map_min(map) || addr + len > vm_map_max(map))
 		return (EINVAL);
-	if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
+	if ((addr + len) < addr)
 		return (EINVAL);
 
 	/*
@@ -749,10 +724,10 @@
 	 * Since this routine is only advisory, we default to conservative
 	 * behavior.
 	 */
-	start = trunc_page((vm_offset_t) uap->addr);
-	end = round_page((vm_offset_t) uap->addr + uap->len);
+	start = trunc_page(addr);
+	end = round_page(addr + len);
 
-	if (vm_map_madvise(map, start, end, uap->behav))
+	if (vm_map_madvise(map, start, end, behav))
 		return (EINVAL);
 	return (0);
 }
@@ -768,11 +743,17 @@
 int
 sys_mincore(struct thread *td, struct mincore_args *uap)
 {
+
+	return (kern_mincore(td, (uintptr_t)uap->addr, uap->len, uap->vec));
+}
+
+int
+kern_mincore(struct thread *td, uintptr_t addr0, size_t len, char *vec)
+{
 	vm_offset_t addr, first_addr;
 	vm_offset_t end, cend;
 	pmap_t pmap;
 	vm_map_t map;
-	char *vec;
 	int error = 0;
 	int vecindex, lastvecindex;
 	vm_map_entry_t current;
@@ -789,17 +770,12 @@
 	 * Make sure that the addresses presented are valid for user
 	 * mode.
 	 */
-	first_addr = addr = trunc_page((vm_offset_t) uap->addr);
-	end = addr + (vm_size_t)round_page(uap->len);
+	first_addr = addr = trunc_page(addr0);
+	end = addr + (vm_size_t)round_page(len);
 	map = &td->td_proc->p_vmspace->vm_map;
 	if (end > vm_map_max(map) || end < addr)
 		return (ENOMEM);
 
-	/*
-	 * Address of byte vector
-	 */
-	vec = uap->vec;
-
 	pmap = vmspace_pmap(td->td_proc->p_vmspace);
 
 	vm_map_lock_read(map);
@@ -817,16 +793,12 @@
 	 * up the pages elsewhere.
 	 */
 	lastvecindex = -1;
-	for (current = entry;
-	    (current != &map->header) && (current->start < end);
-	    current = current->next) {
+	for (current = entry; current->start < end; current = current->next) {
 
 		/*
 		 * check for contiguity
 		 */
-		if (current->end < end &&
-		    (entry->next == &map->header ||
-		     current->next->start > current->end)) {
+		if (current->end < end && current->next->start > current->end) {
 			vm_map_unlock_read(map);
 			return (ENOMEM);
 		}
@@ -862,8 +834,17 @@
 		retry:
 			m = NULL;
 			mincoreinfo = pmap_mincore(pmap, addr, &locked_pa);
-			if (locked_pa != 0) {
+			if (mincore_mapped) {
 				/*
+				 * We only care about this pmap's
+				 * mapping of the page, if any.
+				 */
+				if (locked_pa != 0) {
+					vm_page_unlock(PHYS_TO_VM_PAGE(
+					    locked_pa));
+				}
+			} else if (locked_pa != 0) {
+				/*
 				 * The page is mapped by this process but not
 				 * both accessed and modified.  It is also
 				 * managed.  Acquire the object lock so that
@@ -905,9 +886,6 @@
 					pindex = OFF_TO_IDX(current->offset +
 					    (addr - current->start));
 					m = vm_page_lookup(object, pindex);
-					if (m == NULL &&
-					    vm_page_is_cached(object, pindex))
-						mincoreinfo = MINCORE_INCORE;
 					if (m != NULL && m->valid == 0)
 						m = NULL;
 					if (m != NULL)
@@ -945,7 +923,7 @@
 			/*
 			 * calculate index into user supplied byte vector
 			 */
-			vecindex = OFF_TO_IDX(addr - first_addr);
+			vecindex = atop(addr - first_addr);
 
 			/*
 			 * If we have skipped map entries, we need to make sure that
@@ -991,7 +969,7 @@
 	/*
 	 * Zero the last entries in the byte vector.
 	 */
-	vecindex = OFF_TO_IDX(end - first_addr);
+	vecindex = atop(end - first_addr);
 	while ((lastvecindex + 1) < vecindex) {
 		++lastvecindex;
 		error = subyte(vec + lastvecindex, 0);
@@ -1023,11 +1001,12 @@
 sys_mlock(struct thread *td, struct mlock_args *uap)
 {
 
-	return (vm_mlock(td->td_proc, td->td_ucred, uap->addr, uap->len));
+	return (kern_mlock(td->td_proc, td->td_ucred,
+	    __DECONST(uintptr_t, uap->addr), uap->len));
 }
 
 int
-vm_mlock(struct proc *proc, struct ucred *cred, const void *addr0, size_t len)
+kern_mlock(struct proc *proc, struct ucred *cred, uintptr_t addr0, size_t len)
 {
 	vm_offset_t addr, end, last, start;
 	vm_size_t npages, size;
@@ -1038,7 +1017,7 @@
 	error = priv_check_cred(cred, PRIV_VM_MLOCK, 0);
 	if (error)
 		return (error);
-	addr = (vm_offset_t)addr0;
+	addr = addr0;
 	size = len;
 	last = addr + size;
 	start = trunc_page(addr);
@@ -1051,12 +1030,12 @@
 	map = &proc->p_vmspace->vm_map;
 	PROC_LOCK(proc);
 	nsize = ptoa(npages + pmap_wired_count(map->pmap));
-	if (nsize > lim_cur(proc, RLIMIT_MEMLOCK)) {
+	if (nsize > lim_cur_proc(proc, RLIMIT_MEMLOCK)) {
 		PROC_UNLOCK(proc);
 		return (ENOMEM);
 	}
 	PROC_UNLOCK(proc);
-	if (npages + cnt.v_wire_count > vm_page_max_wired)
+	if (npages + vm_cnt.v_wire_count > vm_page_max_wired)
 		return (EAGAIN);
 #ifdef RACCT
 	if (racct_enable) {
@@ -1106,7 +1085,7 @@
 	 */
 	if (!old_mlock && uap->how & MCL_CURRENT) {
 		PROC_LOCK(td->td_proc);
-		if (map->size > lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
+		if (map->size > lim_cur(td, RLIMIT_MEMLOCK)) {
 			PROC_UNLOCK(td->td_proc);
 			return (ENOMEM);
 		}
@@ -1195,12 +1174,16 @@
 };
 #endif
 int
-sys_munlock(td, uap)
-	struct thread *td;
-	struct munlock_args *uap;
+sys_munlock(struct thread *td, struct munlock_args *uap)
 {
+
+	return (kern_munlock(td, (uintptr_t)uap->addr, uap->len));
+}
+
+int
+kern_munlock(struct thread *td, uintptr_t addr0, size_t size)
+{
 	vm_offset_t addr, end, last, start;
-	vm_size_t size;
 #ifdef RACCT
 	vm_map_t map;
 #endif
@@ -1209,8 +1192,7 @@
 	error = priv_check(td, PRIV_VM_MUNLOCK);
 	if (error)
 		return (error);
-	addr = (vm_offset_t)uap->addr;
-	size = uap->len;
+	addr = addr0;
 	last = addr + size;
 	start = trunc_page(addr);
 	end = round_page(last);
@@ -1235,9 +1217,6 @@
  *
  * Helper function for vm_mmap.  Perform sanity check specific for mmap
  * operations on vnodes.
- *
- * For VCHR vnodes, the vnode lock is held over the call to
- * vm_mmap_cdev() to keep vp->v_rdev valid.
  */
 int
 vm_mmap_vnode(struct thread *td, vm_size_t objsize,
@@ -1247,7 +1226,7 @@
 {
 	struct vattr va;
 	vm_object_t obj;
-	vm_offset_t foff;
+	vm_ooffset_t foff;
 	struct ucred *cred;
 	int error, flags, locktype;
 
@@ -1258,6 +1237,7 @@
 		locktype = LK_SHARED;
 	if ((error = vget(vp, locktype, td)) != 0)
 		return (error);
+	AUDIT_ARG_VNODE1(vp);
 	foff = *foffp;
 	flags = *flagsp;
 	obj = vp->v_object;
@@ -1284,12 +1264,6 @@
 			*writecounted = TRUE;
 			vnode_pager_update_writecount(obj, 0, objsize);
 		}
-	} else if (vp->v_type == VCHR) {
-		error = vm_mmap_cdev(td, objsize, prot, maxprotp, flagsp,
-		    vp->v_rdev, foffp, objp);
-		if (error == 0)
-			goto mark_atime;
-		goto done;
 	} else {
 		error = EINVAL;
 		goto done;
@@ -1297,13 +1271,14 @@
 	if ((error = VOP_GETATTR(vp, &va, cred)))
 		goto done;
 #ifdef MAC
-	error = mac_vnode_check_mmap(cred, vp, prot, flags);
+	/* This relies on VM_PROT_* matching PROT_*. */
+	error = mac_vnode_check_mmap(cred, vp, (int)prot, flags);
 	if (error != 0)
 		goto done;
 #endif
 	if ((flags & MAP_SHARED) != 0) {
 		if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
-			if (prot & PROT_WRITE) {
+			if (prot & VM_PROT_WRITE) {
 				error = EPERM;
 				goto done;
 			}
@@ -1318,22 +1293,26 @@
 	objsize = round_page(va.va_size);
 	if (va.va_nlink == 0)
 		flags |= MAP_NOSYNC;
-	if (obj->type == OBJT_VNODE)
+	if (obj->type == OBJT_VNODE) {
 		obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff,
 		    cred);
-	else {
+		if (obj == NULL) {
+			error = ENOMEM;
+			goto done;
+		}
+	} else {
 		KASSERT(obj->type == OBJT_DEFAULT || obj->type == OBJT_SWAP,
 		    ("wrong object type"));
-		vm_object_reference(obj);
+		VM_OBJECT_WLOCK(obj);
+		vm_object_reference_locked(obj);
+#if VM_NRESERVLEVEL > 0
+		vm_object_color(obj, 0);
+#endif
+		VM_OBJECT_WUNLOCK(obj);
 	}
-	if (obj == NULL) {
-		error = ENOMEM;
-		goto done;
-	}
 	*objp = obj;
 	*flagsp = flags;
 
-mark_atime:
 	vfs_mark_atime(vp, cred);
 
 done:
@@ -1352,21 +1331,18 @@
  * operations on cdevs.
  */
 int
-vm_mmap_cdev(struct thread *td, vm_size_t objsize,
-    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
-    struct cdev *cdev, vm_ooffset_t *foff, vm_object_t *objp)
+vm_mmap_cdev(struct thread *td, vm_size_t objsize, vm_prot_t prot,
+    vm_prot_t *maxprotp, int *flagsp, struct cdev *cdev, struct cdevsw *dsw,
+    vm_ooffset_t *foff, vm_object_t *objp)
 {
 	vm_object_t obj;
-	struct cdevsw *dsw;
-	int error, flags, ref;
+	int error, flags;
 
 	flags = *flagsp;
 
-	dsw = dev_refthread(cdev, &ref);
-	if (dsw == NULL)
-		return (ENXIO);
 	if (dsw->d_flags & D_MMAP_ANON) {
-		dev_relthread(cdev, ref);
+		*objp = NULL;
+		*foff = 0;
 		*maxprotp = VM_PROT_ALL;
 		*flagsp |= MAP_ANON;
 		return (0);
@@ -1375,24 +1351,18 @@
 	 * cdevs do not provide private mappings of any kind.
 	 */
 	if ((*maxprotp & VM_PROT_WRITE) == 0 &&
-	    (prot & PROT_WRITE) != 0) {
-		dev_relthread(cdev, ref);
+	    (prot & VM_PROT_WRITE) != 0)
 		return (EACCES);
-	}
-	if (flags & (MAP_PRIVATE|MAP_COPY)) {
-		dev_relthread(cdev, ref);
+	if (flags & (MAP_PRIVATE|MAP_COPY))
 		return (EINVAL);
-	}
 	/*
 	 * Force device mappings to be shared.
 	 */
 	flags |= MAP_SHARED;
 #ifdef MAC_XXX
-	error = mac_cdev_check_mmap(td->td_ucred, cdev, prot);
-	if (error != 0) {
-		dev_relthread(cdev, ref);
+	error = mac_cdev_check_mmap(td->td_ucred, cdev, (int)prot);
+	if (error != 0)
 		return (error);
-	}
 #endif
 	/*
 	 * First, try d_mmap_single().  If that is not implemented
@@ -1404,7 +1374,6 @@
 	 * XXX assumes VM_PROT_* == PROT_*
 	 */
 	error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot);
-	dev_relthread(cdev, ref);
 	if (error != ENODEV)
 		return (error);
 	obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff,
@@ -1417,65 +1386,96 @@
 }
 
 /*
- * vm_mmap_shm()
+ * vm_mmap()
  *
- * MPSAFE
- *
- * Helper function for vm_mmap.  Perform sanity check specific for mmap
- * operations on shm file descriptors.
+ * Internal version of mmap used by exec, sys5 shared memory, and
+ * various device drivers.  Handle is either a vnode pointer, a
+ * character device, or NULL for MAP_ANON.
  */
 int
-vm_mmap_shm(struct thread *td, vm_size_t objsize,
-    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
-    struct shmfd *shmfd, vm_ooffset_t foff, vm_object_t *objp)
+vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
+	vm_prot_t maxprot, int flags,
+	objtype_t handle_type, void *handle,
+	vm_ooffset_t foff)
 {
+	vm_object_t object;
+	struct thread *td = curthread;
 	int error;
+	boolean_t writecounted;
 
-	if ((*flagsp & MAP_SHARED) != 0 &&
-	    (*maxprotp & VM_PROT_WRITE) == 0 &&
-	    (prot & PROT_WRITE) != 0)
-		return (EACCES);
-#ifdef MAC
-	error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, *flagsp);
-	if (error != 0)
-		return (error);
-#endif
-	error = shm_mmap(shmfd, objsize, foff, objp);
+	if (size == 0)
+		return (EINVAL);
+
+	size = round_page(size);
+	object = NULL;
+	writecounted = FALSE;
+
+	/*
+	 * Lookup/allocate object.
+	 */
+	switch (handle_type) {
+	case OBJT_DEVICE: {
+		struct cdevsw *dsw;
+		struct cdev *cdev;
+		int ref;
+
+		cdev = handle;
+		dsw = dev_refthread(cdev, &ref);
+		if (dsw == NULL)
+			return (ENXIO);
+		error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, cdev,
+		    dsw, &foff, &object);
+		dev_relthread(cdev, ref);
+		break;
+	}
+	case OBJT_VNODE:
+		error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
+		    handle, &foff, &object, &writecounted);
+		break;
+	case OBJT_DEFAULT:
+		if (handle == NULL) {
+			error = 0;
+			break;
+		}
+		/* FALLTHROUGH */
+	default:
+		error = EINVAL;
+		break;
+	}
 	if (error)
 		return (error);
-	return (0);
+
+	error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
+	    foff, writecounted, td);
+	if (error != 0 && object != NULL) {
+		/*
+		 * If this mapping was accounted for in the vnode's
+		 * writecount, then undo that now.
+		 */
+		if (writecounted)
+			vnode_pager_release_writecount(object, 0, size);
+		vm_object_deallocate(object);
+	}
+	return (error);
 }
 
 /*
- * vm_mmap()
- *
- * MPSAFE
- *
- * Internal version of mmap.  Currently used by mmap, exec, and sys5
- * shared memory.  Handle is either a vnode pointer or NULL for MAP_ANON.
+ * Internal version of mmap that maps a specific VM object into an
+ * map.  Called by mmap for MAP_ANON, vm_mmap, shm_mmap, and vn_mmap.
  */
 int
-vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
-	vm_prot_t maxprot, int flags,
-	objtype_t handle_type, void *handle,
-	vm_ooffset_t foff)
+vm_mmap_object(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
+    vm_prot_t maxprot, int flags, vm_object_t object, vm_ooffset_t foff,
+    boolean_t writecounted, struct thread *td)
 {
 	boolean_t curmap, fitit;
 	vm_offset_t max_addr;
-	vm_object_t object = NULL;
-	struct thread *td = curthread;
 	int docow, error, findspace, rv;
-	boolean_t writecounted;
 
-	if (size == 0)
-		return (0);
-
-	size = round_page(size);
-
 	curmap = map == &td->td_proc->p_vmspace->vm_map;
 	if (curmap) {
 		PROC_LOCK(td->td_proc);
-		if (map->size + size > lim_cur(td->td_proc, RLIMIT_VMEM)) {
+		if (map->size + size > lim_cur_proc(td->td_proc, RLIMIT_VMEM)) {
 			PROC_UNLOCK(td->td_proc);
 			return (ENOMEM);
 		}
@@ -1485,7 +1485,7 @@
 		}
 		if (!old_mlock && map->flags & MAP_WIREFUTURE) {
 			if (ptoa(pmap_wired_count(map->pmap)) + size >
-			    lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
+			    lim_cur_proc(td->td_proc, RLIMIT_MEMLOCK)) {
 				racct_set_force(td->td_proc, RACCT_VMEM,
 				    map->size);
 				PROC_UNLOCK(td->td_proc);
@@ -1505,11 +1505,11 @@
 
 	/*
 	 * We currently can only deal with page aligned file offsets.
-	 * The check is here rather than in the syscall because the
-	 * kernel calls this function internally for other mmaping
-	 * operations (such as in exec) and non-aligned offsets will
-	 * cause pmap inconsistencies...so we want to be sure to
-	 * disallow this in all cases.
+	 * The mmap() system call already enforces this by subtracting
+	 * the page offset from the file offset, but checking here
+	 * catches errors in device drivers (e.g. d_mmap_single()
+	 * callbacks) and other internal mapping requests (such as in
+	 * exec).
 	 */
 	if (foff & PAGE_MASK)
 		return (EINVAL);
@@ -1522,44 +1522,11 @@
 			return (EINVAL);
 		fitit = FALSE;
 	}
-	writecounted = FALSE;
 
-	/*
-	 * Lookup/allocate object.
-	 */
-	switch (handle_type) {
-	case OBJT_DEVICE:
-		error = vm_mmap_cdev(td, size, prot, &maxprot, &flags,
-		    handle, &foff, &object);
-		break;
-	case OBJT_VNODE:
-		error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
-		    handle, &foff, &object, &writecounted);
-		break;
-	case OBJT_SWAP:
-		error = vm_mmap_shm(td, size, prot, &maxprot, &flags,
-		    handle, foff, &object);
-		break;
-	case OBJT_DEFAULT:
-		if (handle == NULL) {
-			error = 0;
-			break;
-		}
-		/* FALLTHROUGH */
-	default:
-		error = EINVAL;
-		break;
-	}
-	if (error)
-		return (error);
 	if (flags & MAP_ANON) {
-		object = NULL;
+		if (object != NULL || foff != 0)
+			return (EINVAL);
 		docow = 0;
-		/*
-		 * Unnamed anonymous regions always start at 0.
-		 */
-		if (handle == 0)
-			foff = 0;
 	} else if (flags & MAP_PREFAULT_READ)
 		docow = MAP_PREFAULT;
 	else
@@ -1600,15 +1567,9 @@
 			max_addr = MAP_32BIT_MAX_ADDR;
 #endif
 		if (curmap) {
-			vm_offset_t min_addr;
-
-			PROC_LOCK(td->td_proc);
-			min_addr = round_page((vm_offset_t)td->td_proc->
-			    p_vmspace->vm_daddr + lim_max(td->td_proc,
-			    RLIMIT_DATA));
-			PROC_UNLOCK(td->td_proc);
 			rv = vm_map_find_min(map, object, foff, addr, size,
-			    min_addr, max_addr,
+			    round_page((vm_offset_t)td->td_proc->p_vmspace->
+			    vm_daddr + lim_max(td, RLIMIT_DATA)), max_addr,
 			    findspace, prot, maxprot, docow);
 		} else {
 			rv = vm_map_find(map, object, foff, addr, size,
@@ -1629,19 +1590,6 @@
 			    VM_MAP_WIRE_USER | ((flags & MAP_STACK) ?
 			    VM_MAP_WIRE_HOLESOK : VM_MAP_WIRE_NOHOLES));
 		}
-	} else {
-		/*
-		 * If this mapping was accounted for in the vnode's
-		 * writecount, then undo that now.
-		 */
-		if (writecounted)
-			vnode_pager_release_writecount(object, 0, size);
-		/*
-		 * Lose the object reference.  Will destroy the
-		 * object if it's an unnamed anonymous mapping
-		 * or named anonymous without other references.
-		 */
-		vm_object_deallocate(object);
 	}
 	return (vm_mmap_to_errno(rv));
 }
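
For context on the split above: vm_mmap() now only looks up the backing object for a device or vnode handle (or accepts no handle for anonymous mappings) and hands the result to vm_mmap_object(), which performs the actual mapping and, per its comment, is also called directly by shm_mmap and vn_mmap.  The following is a minimal sketch of such a caller; the helper name and argument list are illustrative only, while round_page(), vm_mmap_object(), and vm_object_deallocate() are the interfaces shown in the patch.

/*
 * Illustrative sketch only; "example_obj_mmap" is hypothetical.  It maps
 * an already-referenced VM object into the calling process, mirroring
 * what vm_mmap() does after its object lookup.
 */
static int
example_obj_mmap(struct thread *td, vm_object_t obj, vm_offset_t *addr,
    vm_size_t size, vm_prot_t prot, vm_prot_t maxprot, int flags,
    vm_ooffset_t foff)
{
	vm_map_t map;
	int error;

	map = &td->td_proc->p_vmspace->vm_map;
	size = round_page(size);

	error = vm_mmap_object(map, addr, size, prot, maxprot, flags, obj,
	    foff, FALSE, td);
	if (error != 0) {
		/* As in vm_mmap(), the caller drops the reference on error. */
		vm_object_deallocate(obj);
	}
	return (error);
}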

Modified: trunk/sys/vm/vm_object.c
===================================================================
--- trunk/sys/vm/vm_object.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_object.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -64,7 +64,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vm_object.c 321677 2017-07-29 08:24:51Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_object.c 352331 2019-09-14 13:35:48Z kib $");
 
 #include "opt_vm.h"
 
@@ -74,6 +74,7 @@
 #include <sys/mman.h>
 #include <sys/mount.h>
 #include <sys/kernel.h>
+#include <sys/pctrie.h>
 #include <sys/sysctl.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>		/* for curproc, pageproc */
@@ -179,9 +180,6 @@
 	    ("object %p has reservations",
 	    object));
 #endif
-	KASSERT(vm_object_cache_is_empty(object),
-	    ("object %p has cached pages",
-	    object));
 	KASSERT(object->paging_in_progress == 0,
 	    ("object %p paging_in_progress = %d",
 	    object, object->paging_in_progress));
@@ -203,19 +201,16 @@
 	vm_object_t object;
 
 	object = (vm_object_t)mem;
-	bzero(&object->lock, sizeof(object->lock));
-	rw_init_flags(&object->lock, "vm object", RW_DUPOK);
+	rw_init_flags(&object->lock, "vm object", RW_DUPOK | RW_NEW);
 
 	/* These are true for any object that has been freed */
 	object->type = OBJT_DEAD;
 	object->ref_count = 0;
-	object->rtree.rt_root = 0;
-	object->rtree.rt_flags = 0;
+	vm_radix_init(&object->rtree);
 	object->paging_in_progress = 0;
 	object->resident_page_count = 0;
 	object->shadow_count = 0;
-	object->cache.rt_root = 0;
-	object->cache.rt_flags = 0;
+	object->flags = OBJ_DEAD;
 
 	mtx_lock(&vm_object_list_mtx);
 	TAILQ_INSERT_TAIL(&vm_object_list, object, object_list);
@@ -231,6 +226,16 @@
 	LIST_INIT(&object->shadow_head);
 
 	object->type = type;
+	if (type == OBJT_SWAP)
+		pctrie_init(&object->un_pager.swp.swp_blks);
+
+	/*
+	 * Ensure that swap_pager_swapoff() iteration over object_list
+	 * sees up to date type and pctrie head if it observed
+	 * non-dead object.
+	 */
+	atomic_thread_fence_rel();
+
 	switch (type) {
 	case OBJT_DEAD:
 		panic("_vm_object_allocate: can't create OBJT_DEAD");
@@ -266,6 +271,7 @@
 #if VM_NRESERVLEVEL > 0
 	LIST_INIT(&object->rvq);
 #endif
+	umtx_shm_object_init(object);
 }
 
 /*
@@ -280,8 +286,8 @@
 	mtx_init(&vm_object_list_mtx, "vm object_list", NULL, MTX_DEF);
 	
 	rw_init(&kernel_object->lock, "kernel vm object");
-	_vm_object_allocate(OBJT_PHYS, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS),
-	    kernel_object);
+	_vm_object_allocate(OBJT_PHYS, atop(VM_MAX_KERNEL_ADDRESS -
+	    VM_MIN_KERNEL_ADDRESS), kernel_object);
 #if VM_NRESERVLEVEL > 0
 	kernel_object->flags |= OBJ_COLORED;
 	kernel_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS);
@@ -288,8 +294,8 @@
 #endif
 
 	rw_init(&kmem_object->lock, "kmem vm object");
-	_vm_object_allocate(OBJT_PHYS, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS),
-	    kmem_object);
+	_vm_object_allocate(OBJT_PHYS, atop(VM_MAX_KERNEL_ADDRESS -
+	    VM_MIN_KERNEL_ADDRESS), kmem_object);
 #if VM_NRESERVLEVEL > 0
 	kmem_object->flags |= OBJ_COLORED;
 	kmem_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS);
@@ -308,7 +314,7 @@
 #endif
 	    vm_object_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 
-	vm_radix_init();
+	vm_radix_zinit();
 }
 
 void
@@ -472,11 +478,14 @@
 	KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp"));
 #ifdef INVARIANTS
 	if (object->ref_count == 0) {
-		vprint("vm_object_vndeallocate", vp);
+		vn_printf(vp, "vm_object_vndeallocate ");
 		panic("vm_object_vndeallocate: bad object reference count");
 	}
 #endif
 
+	if (!umtx_shm_vnobj_persistent && object->ref_count == 1)
+		umtx_shm_object_terminated(object);
+
 	/*
 	 * The test for text of vp vnode does not need a bypass to
 	 * reach right VV_TEXT there, since it is obtained from
@@ -649,6 +658,7 @@
 			return;
 		}
 doterm:
+		umtx_shm_object_terminated(object);
 		temp = object->backing_object;
 		if (temp != NULL) {
 			KASSERT((object->flags & OBJ_TMPFS_NODE) == 0,
@@ -697,6 +707,89 @@
 }
 
 /*
+ *	vm_object_terminate_pages removes any remaining pageable pages
+ *	from the object and resets the object to an empty state.
+ */
+static void
+vm_object_terminate_pages(vm_object_t object)
+{
+	vm_page_t p, p_next;
+	struct mtx *mtx, *mtx1;
+	struct vm_pagequeue *pq, *pq1;
+
+	VM_OBJECT_ASSERT_WLOCKED(object);
+
+	mtx = NULL;
+	pq = NULL;
+
+	/*
+	 * Free any remaining pageable pages.  This also removes them from the
+	 * paging queues.  However, don't free wired pages, just remove them
+	 * from the object.  Rather than incrementally removing each page from
+	 * the object, the page and object are reset to an empty state.
+	 */
+	TAILQ_FOREACH_SAFE(p, &object->memq, listq, p_next) {
+		vm_page_assert_unbusied(p);
+		if ((object->flags & OBJ_UNMANAGED) == 0) {
+			/*
+			 * vm_page_free_prep() only needs the page
+			 * lock for managed pages.
+			 */
+			mtx1 = vm_page_lockptr(p);
+			if (mtx1 != mtx) {
+				if (mtx != NULL)
+					mtx_unlock(mtx);
+				if (pq != NULL) {
+					vm_pagequeue_unlock(pq);
+					pq = NULL;
+				}
+				mtx = mtx1;
+				mtx_lock(mtx);
+			}
+		}
+		p->object = NULL;
+		if (p->wire_count != 0)
+			goto unlist;
+		PCPU_INC(cnt.v_pfree);
+		p->flags &= ~PG_ZERO;
+		if (p->queue != PQ_NONE) {
+			KASSERT(p->queue < PQ_COUNT, ("vm_object_terminate: "
+			    "page %p is not queued", p));
+			pq1 = vm_page_pagequeue(p);
+			if (pq != pq1) {
+				if (pq != NULL)
+					vm_pagequeue_unlock(pq);
+				pq = pq1;
+				vm_pagequeue_lock(pq);
+			}
+		}
+		if (vm_page_free_prep(p, true))
+			continue;
+unlist:
+		TAILQ_REMOVE(&object->memq, p, listq);
+	}
+	if (pq != NULL)
+		vm_pagequeue_unlock(pq);
+	if (mtx != NULL)
+		mtx_unlock(mtx);
+
+	vm_page_free_phys_pglist(&object->memq);
+
+	/*
+	 * If the object contained any pages, then reset it to an empty state.
+	 * None of the object's fields, including "resident_page_count", were
+	 * modified by the preceding loop.
+	 */
+	if (object->resident_page_count != 0) {
+		vm_radix_reclaim_allnodes(&object->rtree);
+		TAILQ_INIT(&object->memq);
+		object->resident_page_count = 0;
+		if (object->type == OBJT_VNODE)
+			vdrop(object->handle);
+	}
+}
+
+/*
  *	vm_object_terminate actually destroys the specified object, freeing
  *	up all previously used resources.
  *
@@ -706,7 +799,6 @@
 void
 vm_object_terminate(vm_object_t object)
 {
-	vm_page_t p, p_next;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 
@@ -749,48 +841,13 @@
 		("vm_object_terminate: object with references, ref_count=%d",
 		object->ref_count));
 
-	/*
-	 * Free any remaining pageable pages.  This also removes them from the
-	 * paging queues.  However, don't free wired pages, just remove them
-	 * from the object.  Rather than incrementally removing each page from
-	 * the object, the page and object are reset to any empty state. 
-	 */
-	TAILQ_FOREACH_SAFE(p, &object->memq, listq, p_next) {
-		vm_page_assert_unbusied(p);
-		vm_page_lock(p);
-		/*
-		 * Optimize the page's removal from the object by resetting
-		 * its "object" field.  Specifically, if the page is not
-		 * wired, then the effect of this assignment is that
-		 * vm_page_free()'s call to vm_page_remove() will return
-		 * immediately without modifying the page or the object.
-		 */ 
-		p->object = NULL;
-		if (p->wire_count == 0) {
-			vm_page_free(p);
-			PCPU_INC(cnt.v_pfree);
-		}
-		vm_page_unlock(p);
-	}
-	/*
-	 * If the object contained any pages, then reset it to an empty state.
-	 * None of the object's fields, including "resident_page_count", were
-	 * modified by the preceding loop.
-	 */
-	if (object->resident_page_count != 0) {
-		vm_radix_reclaim_allnodes(&object->rtree);
-		TAILQ_INIT(&object->memq);
-		object->resident_page_count = 0;
-		if (object->type == OBJT_VNODE)
-			vdrop(object->handle);
-	}
+	if ((object->flags & OBJ_PG_DTOR) == 0)
+		vm_object_terminate_pages(object);
 
 #if VM_NRESERVLEVEL > 0
 	if (__predict_false(!LIST_EMPTY(&object->rvq)))
 		vm_reserv_break_all(object);
 #endif
-	if (__predict_false(!vm_object_cache_is_empty(object)))
-		vm_page_cache_free(object, 0, 0);
 
 	KASSERT(object->cred == NULL || object->type == OBJT_DEFAULT ||
 	    object->type == OBJT_SWAP,
@@ -1027,13 +1084,13 @@
 	 * I/O.
 	 */
 	if (object->type == OBJT_VNODE &&
-	    (object->flags & OBJ_MIGHTBEDIRTY) != 0) {
-		vp = object->handle;
+	    (object->flags & OBJ_MIGHTBEDIRTY) != 0 &&
+	    ((vp = object->handle)->v_vflag & VV_NOSYNC) == 0) {
 		VM_OBJECT_WUNLOCK(object);
 		(void) vn_start_write(vp, &mp, V_WAIT);
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		if (syncio && !invalidate && offset == 0 &&
-		    OFF_TO_IDX(size) == object->size) {
+		    atop(size) == object->size) {
 			/*
 			 * If syncing the whole mapping of the file,
 			 * it is faster to schedule all the writes in
@@ -1080,6 +1137,33 @@
 }
 
 /*
+ * Determine whether the given advice can be applied to the object.  Advice is
+ * not applied to unmanaged pages since they never belong to page queues, and
+ * since MADV_FREE is destructive, it can apply only to anonymous pages that
+ * have been mapped at most once.
+ */
+static bool
+vm_object_advice_applies(vm_object_t object, int advice)
+{
+
+	if ((object->flags & OBJ_UNMANAGED) != 0)
+		return (false);
+	if (advice != MADV_FREE)
+		return (true);
+	return ((object->type == OBJT_DEFAULT || object->type == OBJT_SWAP) &&
+	    (object->flags & OBJ_ONEMAPPING) != 0);
+}
+
+static void
+vm_object_madvise_freespace(vm_object_t object, int advice, vm_pindex_t pindex,
+    vm_size_t size)
+{
+
+	if (advice == MADV_FREE && object->type == OBJT_SWAP)
+		swap_pager_freespace(object, pindex, size);
+}
+
+/*
  *	vm_object_madvise:
  *
  *	Implements the madvise function at the object/page level.
@@ -1102,103 +1186,109 @@
  */
 void
 vm_object_madvise(vm_object_t object, vm_pindex_t pindex, vm_pindex_t end,
-    int advise)
+    int advice)
 {
 	vm_pindex_t tpindex;
 	vm_object_t backing_object, tobject;
-	vm_page_t m;
+	vm_page_t m, tm;
 
 	if (object == NULL)
 		return;
+
+relookup:
 	VM_OBJECT_WLOCK(object);
-	/*
-	 * Locate and adjust resident pages
-	 */
-	for (; pindex < end; pindex += 1) {
-relookup:
+	if (!vm_object_advice_applies(object, advice)) {
+		VM_OBJECT_WUNLOCK(object);
+		return;
+	}
+	for (m = vm_page_find_least(object, pindex); pindex < end; pindex++) {
 		tobject = object;
-		tpindex = pindex;
-shadowlookup:
+
 		/*
-		 * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages
-		 * and those pages must be OBJ_ONEMAPPING.
+		 * If the next page isn't resident in the top-level object, we
+		 * need to search the shadow chain.  When applying MADV_FREE, we
+		 * take care to release any swap space used to store
+		 * non-resident pages.
 		 */
-		if (advise == MADV_FREE) {
-			if ((tobject->type != OBJT_DEFAULT &&
-			     tobject->type != OBJT_SWAP) ||
-			    (tobject->flags & OBJ_ONEMAPPING) == 0) {
-				goto unlock_tobject;
-			}
-		} else if ((tobject->flags & OBJ_UNMANAGED) != 0)
-			goto unlock_tobject;
-		m = vm_page_lookup(tobject, tpindex);
-		if (m == NULL && advise == MADV_WILLNEED) {
+		if (m == NULL || pindex < m->pindex) {
 			/*
-			 * If the page is cached, reactivate it.
+			 * Optimize a common case: if the top-level object has
+			 * no backing object, we can skip over the non-resident
+			 * range in constant time.
 			 */
-			m = vm_page_alloc(tobject, tpindex, VM_ALLOC_IFCACHED |
-			    VM_ALLOC_NOBUSY);
+			if (object->backing_object == NULL) {
+				tpindex = (m != NULL && m->pindex < end) ?
+				    m->pindex : end;
+				vm_object_madvise_freespace(object, advice,
+				    pindex, tpindex - pindex);
+				if ((pindex = tpindex) == end)
+					break;
+				goto next_page;
+			}
+
+			tpindex = pindex;
+			do {
+				vm_object_madvise_freespace(tobject, advice,
+				    tpindex, 1);
+				/*
+				 * Prepare to search the next object in the
+				 * chain.
+				 */
+				backing_object = tobject->backing_object;
+				if (backing_object == NULL)
+					goto next_pindex;
+				VM_OBJECT_WLOCK(backing_object);
+				tpindex +=
+				    OFF_TO_IDX(tobject->backing_object_offset);
+				if (tobject != object)
+					VM_OBJECT_WUNLOCK(tobject);
+				tobject = backing_object;
+				if (!vm_object_advice_applies(tobject, advice))
+					goto next_pindex;
+			} while ((tm = vm_page_lookup(tobject, tpindex)) ==
+			    NULL);
+		} else {
+next_page:
+			tm = m;
+			m = TAILQ_NEXT(m, listq);
 		}
-		if (m == NULL) {
-			/*
-			 * There may be swap even if there is no backing page
-			 */
-			if (advise == MADV_FREE && tobject->type == OBJT_SWAP)
-				swap_pager_freespace(tobject, tpindex, 1);
-			/*
-			 * next object
-			 */
-			backing_object = tobject->backing_object;
-			if (backing_object == NULL)
-				goto unlock_tobject;
-			VM_OBJECT_WLOCK(backing_object);
-			tpindex += OFF_TO_IDX(tobject->backing_object_offset);
-			if (tobject != object)
-				VM_OBJECT_WUNLOCK(tobject);
-			tobject = backing_object;
-			goto shadowlookup;
-		} else if (m->valid != VM_PAGE_BITS_ALL)
-			goto unlock_tobject;
+
 		/*
 		 * If the page is not in a normal state, skip it.
 		 */
-		vm_page_lock(m);
-		if (m->hold_count != 0 || m->wire_count != 0) {
-			vm_page_unlock(m);
-			goto unlock_tobject;
+		if (tm->valid != VM_PAGE_BITS_ALL)
+			goto next_pindex;
+		vm_page_lock(tm);
+		if (tm->hold_count != 0 || tm->wire_count != 0) {
+			vm_page_unlock(tm);
+			goto next_pindex;
 		}
-		KASSERT((m->flags & PG_FICTITIOUS) == 0,
-		    ("vm_object_madvise: page %p is fictitious", m));
-		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
-		    ("vm_object_madvise: page %p is not managed", m));
-		if (vm_page_busied(m)) {
-			if (advise == MADV_WILLNEED) {
+		KASSERT((tm->flags & PG_FICTITIOUS) == 0,
+		    ("vm_object_madvise: page %p is fictitious", tm));
+		KASSERT((tm->oflags & VPO_UNMANAGED) == 0,
+		    ("vm_object_madvise: page %p is not managed", tm));
+		if (vm_page_busied(tm)) {
+			if (object != tobject)
+				VM_OBJECT_WUNLOCK(tobject);
+			VM_OBJECT_WUNLOCK(object);
+			if (advice == MADV_WILLNEED) {
 				/*
 				 * Reference the page before unlocking and
 				 * sleeping so that the page daemon is less
-				 * likely to reclaim it. 
+				 * likely to reclaim it.
 				 */
-				vm_page_aflag_set(m, PGA_REFERENCED);
+				vm_page_aflag_set(tm, PGA_REFERENCED);
 			}
-			if (object != tobject)
-				VM_OBJECT_WUNLOCK(object);
-			VM_OBJECT_WUNLOCK(tobject);
-			vm_page_busy_sleep(m, "madvpo", false);
-			VM_OBJECT_WLOCK(object);
+			vm_page_busy_sleep(tm, "madvpo", false);
   			goto relookup;
 		}
-		if (advise == MADV_WILLNEED) {
-			vm_page_activate(m);
-		} else {
-			vm_page_advise(m, advise);
-		}
-		vm_page_unlock(m);
-		if (advise == MADV_FREE && tobject->type == OBJT_SWAP)
-			swap_pager_freespace(tobject, tpindex, 1);
-unlock_tobject:
+		vm_page_advise(tm, advice);
+		vm_page_unlock(tm);
+		vm_object_madvise_freespace(tobject, advice, tm->pindex, 1);
+next_pindex:
 		if (tobject != object)
 			VM_OBJECT_WUNLOCK(tobject);
-	}	
+	}
 	VM_OBJECT_WUNLOCK(object);
 }
 
@@ -1368,11 +1458,11 @@
 			goto retry;
 		}
 
-		/* vm_page_rename() will handle dirty and cache. */
+		/* vm_page_rename() will dirty the page. */
 		if (vm_page_rename(m, new_object, idx)) {
 			VM_OBJECT_WUNLOCK(new_object);
 			VM_OBJECT_WUNLOCK(orig_object);
-			VM_WAIT;
+			vm_radix_wait();
 			VM_OBJECT_WLOCK(orig_object);
 			VM_OBJECT_WLOCK(new_object);
 			goto retry;
@@ -1403,19 +1493,6 @@
 		swap_pager_copy(orig_object, new_object, offidxstart, 0);
 		TAILQ_FOREACH(m, &new_object->memq, listq)
 			vm_page_xunbusy(m);
-
-		/*
-		 * Transfer any cached pages from orig_object to new_object.
-		 * If swap_pager_copy() found swapped out pages within the
-		 * specified range of orig_object, then it changed
-		 * new_object's type to OBJT_SWAP when it transferred those
-		 * pages to new_object.  Otherwise, new_object's type
-		 * should still be OBJT_DEFAULT and orig_object should not
-		 * contain any cached pages within the specified range.
-		 */
-		if (__predict_false(!vm_object_cache_is_empty(orig_object)))
-			vm_page_cache_transfer(orig_object, offidxstart,
-			    new_object);
 	}
 	VM_OBJECT_WUNLOCK(orig_object);
 	VM_OBJECT_WUNLOCK(new_object);
@@ -1425,12 +1502,11 @@
 	VM_OBJECT_WLOCK(new_object);
 }
 
-#define	OBSC_TEST_ALL_SHADOWED	0x0001
 #define	OBSC_COLLAPSE_NOWAIT	0x0002
 #define	OBSC_COLLAPSE_WAIT	0x0004
 
 static vm_page_t
-vm_object_backing_scan_wait(vm_object_t object, vm_page_t p, vm_page_t next,
+vm_object_collapse_scan_wait(vm_object_t object, vm_page_t p, vm_page_t next,
     int op)
 {
 	vm_object_t backing_object;
@@ -1448,8 +1524,9 @@
 		vm_page_lock(p);
 	VM_OBJECT_WUNLOCK(object);
 	VM_OBJECT_WUNLOCK(backing_object);
+	/* The page is only NULL when rename fails. */
 	if (p == NULL)
-		VM_WAIT;
+		vm_radix_wait();
 	else
 		vm_page_busy_sleep(p, "vmocol", false);
 	VM_OBJECT_WLOCK(object);
@@ -1458,192 +1535,195 @@
 }
 
 static bool
-vm_object_backing_scan(vm_object_t object, int op)
+vm_object_scan_all_shadowed(vm_object_t object)
 {
 	vm_object_t backing_object;
-	vm_page_t next, p, pp;
-	vm_pindex_t backing_offset_index, new_pindex;
+	vm_page_t p, pp;
+	vm_pindex_t backing_offset_index, new_pindex, pi, ps;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	VM_OBJECT_ASSERT_WLOCKED(object->backing_object);
 
 	backing_object = object->backing_object;
-	backing_offset_index = OFF_TO_IDX(object->backing_object_offset);
 
 	/*
-	 * Initial conditions
+	 * Initial conditions:
+	 *
+	 * We do not want to have to test for the existence of swap
+	 * pages in the backing object.  XXX but with the new swapper this
+	 * would be pretty easy to do.
 	 */
-	if (op & OBSC_TEST_ALL_SHADOWED) {
+	if (backing_object->type != OBJT_DEFAULT &&
+	    backing_object->type != OBJT_SWAP)
+		return (false);
+
+	pi = backing_offset_index = OFF_TO_IDX(object->backing_object_offset);
+	p = vm_page_find_least(backing_object, pi);
+	ps = swap_pager_find_least(backing_object, pi);
+
+	/*
+	 * Only check pages inside the parent object's range and
+	 * inside the parent object's mapping of the backing object.
+	 */
+	for (;; pi++) {
+		if (p != NULL && p->pindex < pi)
+			p = TAILQ_NEXT(p, listq);
+		if (ps < pi)
+			ps = swap_pager_find_least(backing_object, pi);
+		if (p == NULL && ps >= backing_object->size)
+			break;
+		else if (p == NULL)
+			pi = ps;
+		else
+			pi = MIN(p->pindex, ps);
+
+		new_pindex = pi - backing_offset_index;
+		if (new_pindex >= object->size)
+			break;
+
 		/*
-		 * We do not want to have to test for the existence of cache
-		 * or swap pages in the backing object.  XXX but with the
-		 * new swapper this would be pretty easy to do.
+		 * See if the parent has the page or if the parent's object
+		 * pager has the page.  If the parent has the page but the page
+		 * is not valid, the parent's object pager must have the page.
 		 *
-		 * XXX what about anonymous MAP_SHARED memory that hasn't
-		 * been ZFOD faulted yet?  If we do not test for this, the
-		 * shadow test may succeed! XXX
+		 * If this fails, the parent does not completely shadow the
+		 * object and we might as well give up now.
 		 */
-		if (backing_object->type != OBJT_DEFAULT) {
+		pp = vm_page_lookup(object, new_pindex);
+		if ((pp == NULL || pp->valid == 0) &&
+		    !vm_pager_has_page(object, new_pindex, NULL, NULL))
 			return (false);
-		}
 	}
-	if (op & OBSC_COLLAPSE_WAIT) {
+	return (true);
+}
+
+static bool
+vm_object_collapse_scan(vm_object_t object, int op)
+{
+	vm_object_t backing_object;
+	vm_page_t next, p, pp;
+	vm_pindex_t backing_offset_index, new_pindex;
+
+	VM_OBJECT_ASSERT_WLOCKED(object);
+	VM_OBJECT_ASSERT_WLOCKED(object->backing_object);
+
+	backing_object = object->backing_object;
+	backing_offset_index = OFF_TO_IDX(object->backing_object_offset);
+
+	/*
+	 * Initial conditions
+	 */
+	if ((op & OBSC_COLLAPSE_WAIT) != 0)
 		vm_object_set_flag(backing_object, OBJ_DEAD);
-	}
 
 	/*
 	 * Our scan
 	 */
-	p = TAILQ_FIRST(&backing_object->memq);
-	while (p) {
+	for (p = TAILQ_FIRST(&backing_object->memq); p != NULL; p = next) {
 		next = TAILQ_NEXT(p, listq);
 		new_pindex = p->pindex - backing_offset_index;
-		if (op & OBSC_TEST_ALL_SHADOWED) {
-			/*
-			 * Ignore pages outside the parent object's range
-			 * and outside the parent object's mapping of the 
-			 * backing object.
-			 *
-			 * Note that we do not busy the backing object's
-			 * page.
-			 */
-			if (p->pindex < backing_offset_index ||
-			    new_pindex >= object->size) {
-				p = next;
-				continue;
-			}
 
-			/*
-			 * See if the parent has the page or if the parent's
-			 * object pager has the page.  If the parent has the
-			 * page but the page is not valid, the parent's
-			 * object pager must have the page.
-			 *
-			 * If this fails, the parent does not completely shadow
-			 * the object and we might as well give up now.
-			 */
-
-			pp = vm_page_lookup(object, new_pindex);
-			if ((pp == NULL || pp->valid == 0) &&
-			    !vm_pager_has_page(object, new_pindex, NULL, NULL))
-				return (false);
-		}
-
 		/*
 		 * Check for busy page
 		 */
-		if (op & (OBSC_COLLAPSE_WAIT | OBSC_COLLAPSE_NOWAIT)) {
-			if (vm_page_busied(p)) {
-				p = vm_object_backing_scan_wait(object, p,
-				    next, op);
-				continue;
-			}
+		if (vm_page_busied(p)) {
+			next = vm_object_collapse_scan_wait(object, p, next, op);
+			continue;
+		}
 
-			KASSERT(p->object == backing_object,
-			    ("vm_object_backing_scan: object mismatch"));
+		KASSERT(p->object == backing_object,
+		    ("vm_object_collapse_scan: object mismatch"));
 
-			if (p->pindex < backing_offset_index ||
-			    new_pindex >= object->size) {
-				if (backing_object->type == OBJT_SWAP)
-					swap_pager_freespace(backing_object, 
-					    p->pindex, 1);
+		if (p->pindex < backing_offset_index ||
+		    new_pindex >= object->size) {
+			if (backing_object->type == OBJT_SWAP)
+				swap_pager_freespace(backing_object, p->pindex,
+				    1);
 
-				/*
-				 * Page is out of the parent object's range, we 
-				 * can simply destroy it. 
-				 */
-				vm_page_lock(p);
-				KASSERT(!pmap_page_is_mapped(p),
-				    ("freeing mapped page %p", p));
-				if (p->wire_count == 0)
-					vm_page_free(p);
-				else
-					vm_page_remove(p);
-				vm_page_unlock(p);
-				p = next;
-				continue;
-			}
+			/*
+			 * Page is out of the parent object's range, so we can
+			 * simply destroy it.
+			 */
+			vm_page_lock(p);
+			KASSERT(!pmap_page_is_mapped(p),
+			    ("freeing mapped page %p", p));
+			if (p->wire_count == 0)
+				vm_page_free(p);
+			else
+				vm_page_remove(p);
+			vm_page_unlock(p);
+			continue;
+		}
 
-			pp = vm_page_lookup(object, new_pindex);
-			if (pp != NULL && vm_page_busied(pp)) {
-				/*
-				 * The page in the parent is busy and
-				 * possibly not (yet) valid.  Until
-				 * its state is finalized by the busy
-				 * bit owner, we can't tell whether it
-				 * shadows the original page.
-				 * Therefore, we must either skip it
-				 * and the original (backing_object)
-				 * page or wait for its state to be
-				 * finalized.
-				 *
-				 * This is due to a race with vm_fault()
-				 * where we must unbusy the original
-				 * (backing_obj) page before we can
-				 * (re)lock the parent.  Hence we can
-				 * get here.
-				 */
-				p = vm_object_backing_scan_wait(object, pp,
-				    next, op);
-				continue;
-			}
+		pp = vm_page_lookup(object, new_pindex);
+		if (pp != NULL && vm_page_busied(pp)) {
+			/*
+			 * The page in the parent is busy and possibly not
+			 * (yet) valid.  Until its state is finalized by the
+			 * busy bit owner, we can't tell whether it shadows the
+			 * original page.  Therefore, we must either skip it
+			 * and the original (backing_object) page or wait for
+			 * its state to be finalized.
+			 *
+			 * This is due to a race with vm_fault() where we must
+			 * unbusy the original (backing_obj) page before we can
+			 * (re)lock the parent.  Hence we can get here.
+			 */
+			next = vm_object_collapse_scan_wait(object, pp, next,
+			    op);
+			continue;
+		}
 
-			KASSERT(pp == NULL || pp->valid != 0,
-			    ("unbusy invalid page %p", pp));
+		KASSERT(pp == NULL || pp->valid != 0,
+		    ("unbusy invalid page %p", pp));
 
-			if (pp != NULL || vm_pager_has_page(object,
-			    new_pindex, NULL, NULL)) {
-				/*
-				 * The page already exists in the
-				 * parent OR swap exists for this
-				 * location in the parent.  Leave the
-				 * parent's page alone.  Destroy the
-				 * original page from the backing
-				 * object.
-				 */
-				if (backing_object->type == OBJT_SWAP)
-					swap_pager_freespace(backing_object,
-					    p->pindex, 1);
-				vm_page_lock(p);
-				KASSERT(!pmap_page_is_mapped(p),
-				    ("freeing mapped page %p", p));
-				if (p->wire_count == 0)
-					vm_page_free(p);
-				else
-					vm_page_remove(p);
-				vm_page_unlock(p);
-				p = next;
-				continue;
-			}
-
+		if (pp != NULL || vm_pager_has_page(object, new_pindex, NULL,
+			NULL)) {
 			/*
-			 * Page does not exist in parent, rename the
-			 * page from the backing object to the main object. 
-			 *
-			 * If the page was mapped to a process, it can remain 
-			 * mapped through the rename.
-			 * vm_page_rename() will handle dirty and cache.
+			 * The page already exists in the parent OR swap exists
+			 * for this location in the parent.  Leave the parent's
+			 * page alone.  Destroy the original page from the
+			 * backing object.
 			 */
-			if (vm_page_rename(p, object, new_pindex)) {
-				p = vm_object_backing_scan_wait(object, NULL,
-				    next, op);
-				continue;
-			}
-
-			/* Use the old pindex to free the right page. */
 			if (backing_object->type == OBJT_SWAP)
-				swap_pager_freespace(backing_object,
-				    new_pindex + backing_offset_index, 1);
+				swap_pager_freespace(backing_object, p->pindex,
+				    1);
+			vm_page_lock(p);
+			KASSERT(!pmap_page_is_mapped(p),
+			    ("freeing mapped page %p", p));
+			if (p->wire_count == 0)
+				vm_page_free(p);
+			else
+				vm_page_remove(p);
+			vm_page_unlock(p);
+			continue;
+		}
 
+		/*
+		 * Page does not exist in parent, rename the page from the
+		 * backing object to the main object.
+		 *
+		 * If the page was mapped to a process, it can remain mapped
+		 * through the rename.  vm_page_rename() will dirty the page.
+		 */
+		if (vm_page_rename(p, object, new_pindex)) {
+			next = vm_object_collapse_scan_wait(object, NULL, next,
+			    op);
+			continue;
+		}
+
+		/* Use the old pindex to free the right page. */
+		if (backing_object->type == OBJT_SWAP)
+			swap_pager_freespace(backing_object,
+			    new_pindex + backing_offset_index, 1);
+
 #if VM_NRESERVLEVEL > 0
-			/*
-			 * Rename the reservation.
-			 */
-			vm_reserv_rename(p, object, backing_object,
-			    backing_offset_index);
+		/*
+		 * Rename the reservation.
+		 */
+		vm_reserv_rename(p, object, backing_object,
+		    backing_offset_index);
 #endif
-		}
-		p = next;
 	}
 	return (true);
 }
@@ -1665,7 +1745,7 @@
 	if (backing_object->ref_count != 1)
 		return;
 
-	vm_object_backing_scan(object, OBSC_COLLAPSE_NOWAIT);
+	vm_object_collapse_scan(object, OBSC_COLLAPSE_NOWAIT);
 }
 
 /*
@@ -1698,8 +1778,8 @@
 		VM_OBJECT_WLOCK(backing_object);
 		if (backing_object->handle != NULL ||
 		    (backing_object->type != OBJT_DEFAULT &&
-		     backing_object->type != OBJT_SWAP) ||
-		    (backing_object->flags & OBJ_DEAD) ||
+		    backing_object->type != OBJT_SWAP) ||
+		    (backing_object->flags & (OBJ_DEAD | OBJ_NOSPLIT)) != 0 ||
 		    object->handle != NULL ||
 		    (object->type != OBJT_DEFAULT &&
 		     object->type != OBJT_SWAP) ||
@@ -1722,7 +1802,7 @@
 		 * all the resident pages in the entire backing object.
 		 *
 		 * This is ignoring pager-backed pages such as swap pages.
-		 * vm_object_backing_scan fails the shadowing test in this
+		 * vm_object_collapse_scan fails the shadowing test in this
 		 * case.
 		 */
 		if (backing_object->ref_count == 1) {
@@ -1731,9 +1811,9 @@
 
 			/*
 			 * If there is exactly one reference to the backing
-			 * object, we can collapse it into the parent.  
+			 * object, we can collapse it into the parent.
 			 */
-			vm_object_backing_scan(object, OBSC_COLLAPSE_WAIT);
+			vm_object_collapse_scan(object, OBSC_COLLAPSE_WAIT);
 
 #if VM_NRESERVLEVEL > 0
 			/*
@@ -1759,13 +1839,6 @@
 				    backing_object,
 				    object,
 				    OFF_TO_IDX(object->backing_object_offset), TRUE);
-
-				/*
-				 * Free any cached pages from backing_object.
-				 */
-				if (__predict_false(
-				    !vm_object_cache_is_empty(backing_object)))
-					vm_page_cache_free(backing_object, 0, 0);
 			}
 			/*
 			 * Object now shadows whatever backing_object did.
@@ -1814,8 +1887,7 @@
 			 * there is nothing we can do so we give up.
 			 */
 			if (object->resident_page_count != object->size &&
-			    !vm_object_backing_scan(object,
-			    OBSC_TEST_ALL_SHADOWED)) {
+			    !vm_object_scan_all_shadowed(object)) {
 				VM_OBJECT_WUNLOCK(backing_object);
 				break;
 			}
@@ -1889,6 +1961,8 @@
     int options)
 {
 	vm_page_t p, next;
+	struct mtx *mtx;
+	struct pglist pgl;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	KASSERT((object->flags & OBJ_UNMANAGED) == 0 ||
@@ -1895,10 +1969,12 @@
 	    (options & (OBJPR_CLEANONLY | OBJPR_NOTMAPPED)) == OBJPR_NOTMAPPED,
 	    ("vm_object_page_remove: illegal options for object %p", object));
 	if (object->resident_page_count == 0)
-		goto skipmemq;
+		return;
 	vm_object_pip_add(object, 1);
+	TAILQ_INIT(&pgl);
 again:
 	p = vm_page_find_least(object, start);
+	mtx = NULL;
 
 	/*
 	 * Here, the variable "p" is either (1) the page with the least pindex
@@ -1915,7 +1991,7 @@
 		 * however, be invalidated if the option OBJPR_CLEANONLY is
 		 * not specified.
 		 */
-		vm_page_lock(p);
+		vm_page_change_lock(p, &mtx);
 		if (vm_page_xbusied(p)) {
 			VM_OBJECT_WUNLOCK(object);
 			vm_page_busy_sleep(p, "vmopax", true);
@@ -1923,13 +1999,14 @@
 			goto again;
 		}
 		if (p->wire_count != 0) {
-			if ((options & OBJPR_NOTMAPPED) == 0)
+			if ((options & OBJPR_NOTMAPPED) == 0 &&
+			    object->ref_count != 0)
 				pmap_remove_all(p);
 			if ((options & OBJPR_CLEANONLY) == 0) {
 				p->valid = 0;
 				vm_page_undirty(p);
 			}
-			goto next;
+			continue;
 		}
 		if (vm_page_busied(p)) {
 			VM_OBJECT_WUNLOCK(object);
@@ -1940,33 +2017,34 @@
 		KASSERT((p->flags & PG_FICTITIOUS) == 0,
 		    ("vm_object_page_remove: page %p is fictitious", p));
 		if ((options & OBJPR_CLEANONLY) != 0 && p->valid != 0) {
-			if ((options & OBJPR_NOTMAPPED) == 0)
+			if ((options & OBJPR_NOTMAPPED) == 0 &&
+			    object->ref_count != 0)
 				pmap_remove_write(p);
-			if (p->dirty)
-				goto next;
+			if (p->dirty != 0)
+				continue;
 		}
-		if ((options & OBJPR_NOTMAPPED) == 0)
+		if ((options & OBJPR_NOTMAPPED) == 0 && object->ref_count != 0)
 			pmap_remove_all(p);
-		vm_page_free(p);
-next:
-		vm_page_unlock(p);
+		p->flags &= ~PG_ZERO;
+		if (vm_page_free_prep(p, false))
+			TAILQ_INSERT_TAIL(&pgl, p, listq);
 	}
+	if (mtx != NULL)
+		mtx_unlock(mtx);
+	vm_page_free_phys_pglist(&pgl);
 	vm_object_pip_wakeup(object);
-skipmemq:
-	if (__predict_false(!vm_object_cache_is_empty(object)))
-		vm_page_cache_free(object, start, end);
 }
 
 /*
- *	vm_object_page_cache:
+ *	vm_object_page_noreuse:
  *
- *	For the given object, attempt to move the specified clean
- *	pages to the cache queue.  If a page is wired for any reason,
- *	then it will not be changed.  Pages are specified by the given
- *	range ["start", "end").  As a special case, if "end" is zero,
- *	then the range extends from "start" to the end of the object.
- *	Any mappings to the specified pages are removed before the
- *	pages are moved to the cache queue.
+ *	For the given object, attempt to move the specified pages to
+ *	the head of the inactive queue.  This bypasses regular LRU
+ *	operation and allows the pages to be reused quickly under memory
+ *	pressure.  If a page is wired for any reason, then it will not
+ *	be queued.  Pages are specified by the range ["start", "end").
+ *	As a special case, if "end" is zero, then the range extends from
+ *	"start" to the end of the object.
  *
  *	This operation should only be performed on objects that
  *	contain non-fictitious, managed pages.
@@ -1974,14 +2052,14 @@
  *	The object must be locked.
  */
 void
-vm_object_page_cache(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
+vm_object_page_noreuse(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
 {
-	struct mtx *mtx, *new_mtx;
+	struct mtx *mtx;
 	vm_page_t p, next;
 
-	VM_OBJECT_ASSERT_WLOCKED(object);
+	VM_OBJECT_ASSERT_LOCKED(object);
 	KASSERT((object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0,
-	    ("vm_object_page_cache: illegal object %p", object));
+	    ("vm_object_page_noreuse: illegal object %p", object));
 	if (object->resident_page_count == 0)
 		return;
 	p = vm_page_find_least(object, start);
@@ -1993,18 +2071,8 @@
 	mtx = NULL;
 	for (; p != NULL && (p->pindex < end || end == 0); p = next) {
 		next = TAILQ_NEXT(p, listq);
-
-		/*
-		 * Avoid releasing and reacquiring the same page lock.
-		 */
-		new_mtx = vm_page_lockptr(p);
-		if (mtx != new_mtx) {
-			if (mtx != NULL)
-				mtx_unlock(mtx);
-			mtx = new_mtx;
-			mtx_lock(mtx);
-		}
-		vm_page_try_to_cache(p);
+		vm_page_change_lock(p, &mtx);
+		vm_page_deactivate_noreuse(p);
 	}
 	if (mtx != NULL)
 		mtx_unlock(mtx);
@@ -2023,7 +2091,7 @@
 boolean_t
 vm_object_populate(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
 {
-	vm_page_t m, ma[1];
+	vm_page_t m;
 	vm_pindex_t pindex;
 	int rv;
 
@@ -2031,11 +2099,7 @@
 	for (pindex = start; pindex < end; pindex++) {
 		m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL);
 		if (m->valid != VM_PAGE_BITS_ALL) {
-			ma[0] = m;
-			rv = vm_pager_get_pages(object, ma, 1, 0);
-			m = vm_page_lookup(object, pindex);
-			if (m == NULL)
-				break;
+			rv = vm_pager_get_pages(object, &m, 1, NULL, NULL);
 			if (rv != VM_PAGER_OK) {
 				vm_page_lock(m);
 				vm_page_free(m);
@@ -2090,7 +2154,7 @@
 	VM_OBJECT_WLOCK(prev_object);
 	if ((prev_object->type != OBJT_DEFAULT &&
 	    prev_object->type != OBJT_SWAP) ||
-	    (prev_object->flags & OBJ_TMPFS_NODE) != 0) {
+	    (prev_object->flags & OBJ_NOSPLIT) != 0) {
 		VM_OBJECT_WUNLOCK(prev_object);
 		return (FALSE);
 	}
@@ -2127,7 +2191,7 @@
 
 		/*
 		 * If prev_object was charged, then this mapping,
-		 * althought not charged now, may become writable
+		 * although not charged now, may become writable
 		 * later. Non-NULL cred in the object would prevent
 		 * swap reservation during enabling of the write
 		 * access, so reserve swap now. Failed reservation
@@ -2205,7 +2269,7 @@
 vm_object_unwire(vm_object_t object, vm_ooffset_t offset, vm_size_t length,
     uint8_t queue)
 {
-	vm_object_t tobject;
+	vm_object_t tobject, t1object;
 	vm_page_t m, tm;
 	vm_pindex_t end_pindex, pindex, tpindex;
 	int depth, locked_depth;
@@ -2219,6 +2283,7 @@
 		return;
 	pindex = OFF_TO_IDX(offset);
 	end_pindex = pindex + atop(length);
+again:
 	locked_depth = 1;
 	VM_OBJECT_RLOCK(object);
 	m = vm_page_find_least(object, pindex);
@@ -2252,6 +2317,16 @@
 			m = TAILQ_NEXT(m, listq);
 		}
 		vm_page_lock(tm);
+		if (vm_page_xbusied(tm)) {
+			for (tobject = object; locked_depth >= 1;
+			    locked_depth--) {
+				t1object = tobject->backing_object;
+				VM_OBJECT_RUNLOCK(tobject);
+				tobject = t1object;
+			}
+			vm_page_busy_sleep(tm, "unwbo", true);
+			goto again;
+		}
 		vm_page_unwire(tm, queue);
 		vm_page_unlock(tm);
 next_page:
@@ -2258,10 +2333,10 @@
 		pindex++;
 	}
 	/* Release the accumulated object locks. */
-	for (depth = 0; depth < locked_depth; depth++) {
-		tobject = object->backing_object;
-		VM_OBJECT_RUNLOCK(object);
-		object = tobject;
+	for (tobject = object; locked_depth >= 1; locked_depth--) {
+		t1object = tobject->backing_object;
+		VM_OBJECT_RUNLOCK(tobject);
+		tobject = t1object;
 	}
 }
 
@@ -2340,9 +2415,9 @@
 			 * sysctl is only meant to give an
 			 * approximation of the system anyway.
 			 */
-			if (m->queue == PQ_ACTIVE)
+			if (vm_page_active(m))
 				kvo->kvo_active++;
-			else if (m->queue == PQ_INACTIVE)
+			else if (vm_page_inactive(m))
 				kvo->kvo_inactive++;
 		}
 

Modified: trunk/sys/vm/vm_object.h
===================================================================
--- trunk/sys/vm/vm_object.h	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_object.h	2020-02-08 19:35:48 UTC (rev 12314)
@@ -58,7 +58,7 @@
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  *
- * $FreeBSD: stable/10/sys/vm/vm_object.h 313384 2017-02-07 08:33:46Z kib $
+ * $FreeBSD: stable/11/sys/vm/vm_object.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 /*
@@ -71,6 +71,7 @@
 #include <sys/queue.h>
 #include <sys/_lock.h>
 #include <sys/_mutex.h>
+#include <sys/_pctrie.h>
 #include <sys/_rwlock.h>
 
 #include <vm/_vm_radix.h>
@@ -80,17 +81,6 @@
  *
  *	vm_object_t		Virtual memory object.
  *
- *	The root of cached pages pool is protected by both the per-object lock
- *	and the free pages queue mutex.
- *	On insert in the cache radix trie, the per-object lock is expected
- *	to be already held and the free pages queue mutex will be
- *	acquired during the operation too.
- *	On remove and lookup from the cache radix trie, only the free
- *	pages queue mutex is expected to be locked.
- *	These rules allow for reliably checking for the presence of cached
- *	pages with only the per-object lock held, thereby reducing contention
- *	for the free pages queue mutex.
- *
  * List of locks
  *	(c)	const until freed
  *	(o)	per-object lock 
@@ -98,12 +88,17 @@
  *
  */
 
+#ifndef VM_PAGE_HAVE_PGLIST
+TAILQ_HEAD(pglist, vm_page);
+#define VM_PAGE_HAVE_PGLIST
+#endif
+
 struct vm_object {
 	struct rwlock lock;
 	TAILQ_ENTRY(vm_object) object_list; /* list of all objects */
 	LIST_HEAD(, vm_object) shadow_head; /* objects that this is a shadow for */
 	LIST_ENTRY(vm_object) shadow_list; /* chain of shadow objects */
-	TAILQ_HEAD(respgs, vm_page) memq; /* list of resident pages */
+	struct pglist memq;		/* list of resident pages */
 	struct vm_radix rtree;		/* root of the resident page radix trie*/
 	vm_pindex_t size;		/* Object size */
 	int generation;			/* generation ID */
@@ -119,7 +114,6 @@
 	vm_ooffset_t backing_object_offset;/* Offset in backing object */
 	TAILQ_ENTRY(vm_object) pager_object_list; /* list of all objects of this pager type */
 	LIST_HEAD(, vm_reserv) rvq;	/* list of reservations */
-	struct vm_radix cache;		/* (o + f) root of the cache page radix trie */
 	void *handle;
 	union {
 		/*
@@ -164,17 +158,17 @@
 		 *		     the handle changed and hash-chain
 		 *		     invalid.
 		 *
-		 *	swp_bcount - number of swap 'swblock' metablocks, each
-		 *		     contains up to 16 swapblk assignments.
-		 *		     see vm/swap_pager.h
+		 *	swp_blks -   pc-trie of the allocated swap blocks.
+		 *
 		 */
 		struct {
 			void *swp_tmpfs;
-			int swp_bcount;
+			struct pctrie swp_blks;
 		} swp;
 	} un_pager;
 	struct ucred *cred;
 	vm_ooffset_t charge;
+	void *umtx_data;
 };
 
 /*
@@ -182,10 +176,13 @@
  */
 #define	OBJ_FICTITIOUS	0x0001		/* (c) contains fictitious pages */
 #define	OBJ_UNMANAGED	0x0002		/* (c) contains unmanaged pages */
-#define OBJ_DEAD	0x0008		/* dead objects (during rundown) */
+#define	OBJ_POPULATE	0x0004		/* pager implements populate() */
+#define	OBJ_DEAD	0x0008		/* dead objects (during rundown) */
 #define	OBJ_NOSPLIT	0x0010		/* dont split this object */
-#define OBJ_PIPWNT	0x0040		/* paging in progress wanted */
-#define OBJ_MIGHTBEDIRTY 0x0100		/* object might be dirty, only for vnode */
+#define	OBJ_UMTXDEAD	0x0020		/* umtx pshared was terminated */
+#define	OBJ_PIPWNT	0x0040		/* paging in progress wanted */
+#define	OBJ_PG_DTOR	0x0080		/* dont reset object, leave that for dtor */
+#define	OBJ_MIGHTBEDIRTY 0x0100		/* object might be dirty, only for vnode */
 #define	OBJ_TMPFS_NODE	0x0200		/* object belongs to tmpfs VREG node */
 #define	OBJ_TMPFS_DIRTY	0x0400		/* dirty tmpfs obj */
 #define	OBJ_COLORED	0x1000		/* pg_color is defined */
@@ -193,14 +190,29 @@
 #define	OBJ_DISCONNECTWNT 0x4000	/* disconnect from vnode wanted */
 #define	OBJ_TMPFS	0x8000		/* has tmpfs vnode allocated */
 
+/*
+ * Helpers to perform conversion between vm_object page indexes and offsets.
+ * IDX_TO_OFF() converts an index into an offset.
+ * OFF_TO_IDX() converts an offset into an index.  Since offsets are signed
+ *   by default, the sign propagation in OFF_TO_IDX(), when applied to
+ *   negative offsets, is intentional and returns a vm_object page index
+ *   that cannot be created by a userspace mapping.
+ * UOFF_TO_IDX() treats the offset as an unsigned value and converts it
+ *   into an index accordingly.  Use it only when the full range of offset
+ *   values is allowed.  Currently, this only applies to device mappings.
+ * OBJ_MAX_SIZE specifies the maximum page index corresponding to the
+ *   maximum unsigned offset.
+ */
 #define	IDX_TO_OFF(idx) (((vm_ooffset_t)(idx)) << PAGE_SHIFT)
 #define	OFF_TO_IDX(off) ((vm_pindex_t)(((vm_ooffset_t)(off)) >> PAGE_SHIFT))
+#define	UOFF_TO_IDX(off) (((vm_pindex_t)(off)) >> PAGE_SHIFT)
+#define	OBJ_MAX_SIZE	(UOFF_TO_IDX(UINT64_MAX) + 1)
 
 #ifdef	_KERNEL
 
 #define OBJPC_SYNC	0x1			/* sync I/O */
 #define OBJPC_INVAL	0x2			/* invalidate */
-#define OBJPC_NOSYNC	0x4			/* skip if PG_NOSYNC */
+#define OBJPC_NOSYNC	0x4			/* skip if VPO_NOSYNC */
 
 /*
  * The following options are supported by vm_object_page_remove().
@@ -243,6 +255,8 @@
 	rw_try_upgrade(&(object)->lock)
 #define	VM_OBJECT_WLOCK(object)						\
 	rw_wlock(&(object)->lock)
+#define	VM_OBJECT_WOWNED(object)					\
+	rw_wowned(&(object)->lock)
 #define	VM_OBJECT_WUNLOCK(object)					\
 	rw_wunlock(&(object)->lock)
 
@@ -256,6 +270,30 @@
 	object->flags |= bits;
 }
 
+/*
+ *	Conditionally set the object's color, which (1) enables the allocation
+ *	of physical memory reservations for anonymous objects and larger-than-
+ *	superpage-sized named objects and (2) determines the first page offset
+ *	within the object at which a reservation may be allocated.  In other
+ *	words, the color determines the alignment of the object with respect
+ *	to the largest superpage boundary.  When mapping named objects, like
+ *	files or POSIX shared memory objects, the color should be set to zero
+ *	before a virtual address is selected for the mapping.  In contrast,
+ *	for anonymous objects, the color may be set after the virtual address
+ *	is selected.
+ *
+ *	The object must be locked.
+ */
+static __inline void
+vm_object_color(vm_object_t object, u_short color)
+{
+
+	if ((object->flags & OBJ_COLORED) == 0) {
+		object->pg_color = color;
+		object->flags |= OBJ_COLORED;
+	}
+}
+
 void vm_object_clear_flag(vm_object_t object, u_short bits);
 void vm_object_pip_add(vm_object_t object, short i);
 void vm_object_pip_subtract(vm_object_t object, short i);
@@ -263,13 +301,10 @@
 void vm_object_pip_wakeupn(vm_object_t object, short i);
 void vm_object_pip_wait(vm_object_t object, char *waitid);
 
-static __inline boolean_t
-vm_object_cache_is_empty(vm_object_t object)
-{
+void umtx_shm_object_init(vm_object_t object);
+void umtx_shm_object_terminated(vm_object_t object);
+extern int umtx_shm_vnobj_persistent;
 
-	return (vm_radix_is_empty(&object->cache));
-}
-
 vm_object_t vm_object_allocate (objtype_t, vm_pindex_t);
 boolean_t vm_object_coalesce(vm_object_t, vm_ooffset_t, vm_size_t, vm_size_t,
    boolean_t);
@@ -280,10 +315,10 @@
 void vm_object_set_writeable_dirty (vm_object_t);
 void vm_object_init (void);
 void vm_object_madvise(vm_object_t, vm_pindex_t, vm_pindex_t, int);
-void vm_object_page_cache(vm_object_t object, vm_pindex_t start,
-    vm_pindex_t end);
 boolean_t vm_object_page_clean(vm_object_t object, vm_ooffset_t start,
     vm_ooffset_t end, int flags);
+void vm_object_page_noreuse(vm_object_t object, vm_pindex_t start,
+    vm_pindex_t end);
 void vm_object_page_remove(vm_object_t object, vm_pindex_t start,
     vm_pindex_t end, int options);
 boolean_t vm_object_populate(vm_object_t, vm_pindex_t, vm_pindex_t);
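
The OFF_TO_IDX()/UOFF_TO_IDX() distinction documented above is easiest to see with concrete values.  Assuming 4 KB pages (PAGE_SHIFT == 12) and the macro definitions in the hunk above:

/*
 * Worked example (illustrative, not part of the diff):
 *
 *   off = (vm_ooffset_t)-PAGE_SIZE              = 0xfffffffffffff000
 *   OFF_TO_IDX(off)  (signed, arithmetic shift) = 0xffffffffffffffff
 *       the sign bits propagate, yielding an index that no userspace
 *       mapping can create, as the comment above states
 *   UOFF_TO_IDX(off) (unsigned, logical shift)  = 0x000fffffffffffff
 *   OBJ_MAX_SIZE = UOFF_TO_IDX(UINT64_MAX) + 1  = 2^52 pages
 */

Device mappings may legitimately pass offsets anywhere in the full unsigned range, which is why they are the one current user of UOFF_TO_IDX().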

Modified: trunk/sys/vm/vm_page.c
===================================================================
--- trunk/sys/vm/vm_page.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_page.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -83,7 +83,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vm_page.c 320190 2017-06-21 14:39:31Z jhb $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_page.c 342797 2019-01-06 00:38:28Z kib $");
 
 #include "opt_vm.h"
 
@@ -92,6 +92,7 @@
 #include <sys/lock.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
+#include <sys/linker.h>
 #include <sys/malloc.h>
 #include <sys/mman.h>
 #include <sys/msgbuf.h>
@@ -98,6 +99,8 @@
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/rwlock.h>
+#include <sys/sbuf.h>
+#include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
@@ -125,9 +128,9 @@
  */
 
 struct vm_domain vm_dom[MAXMEMDOM];
-struct mtx_padalign vm_page_queue_free_mtx;
+struct mtx_padalign __exclusive_cache_line vm_page_queue_free_mtx;
 
-struct mtx_padalign pa_lock[PA_LOCK_COUNT];
+struct mtx_padalign __exclusive_cache_line pa_lock[PA_LOCK_COUNT];
 
 vm_page_t vm_page_array;
 long vm_page_array_size;
@@ -135,25 +138,37 @@
 int vm_page_zero_count;
 
 static int boot_pages = UMA_BOOT_PAGES;
-TUNABLE_INT("vm.boot_pages", &boot_pages);
-SYSCTL_INT(_vm, OID_AUTO, boot_pages, CTLFLAG_RD, &boot_pages, 0,
-	"number of pages allocated for bootstrapping the VM system");
+SYSCTL_INT(_vm, OID_AUTO, boot_pages, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
+    &boot_pages, 0,
+    "number of pages allocated for bootstrapping the VM system");
 
 static int pa_tryrelock_restart;
 SYSCTL_INT(_vm, OID_AUTO, tryrelock_restart, CTLFLAG_RD,
     &pa_tryrelock_restart, 0, "Number of tryrelock restarts");
 
+static TAILQ_HEAD(, vm_page) blacklist_head;
+static int sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS);
+SYSCTL_PROC(_vm, OID_AUTO, page_blacklist, CTLTYPE_STRING | CTLFLAG_RD |
+    CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_page_blacklist, "A", "Blacklist pages");
+
+/* Is the page daemon waiting for free pages? */
+static int vm_pageout_pages_needed;
+
 static uma_zone_t fakepg_zone;
 
-static struct vnode *vm_page_alloc_init(vm_page_t m);
-static void vm_page_cache_turn_free(vm_page_t m);
+static void vm_page_alloc_check(vm_page_t m);
 static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits);
-static void vm_page_enqueue(int queue, vm_page_t m);
+static void vm_page_enqueue(uint8_t queue, vm_page_t m);
+static void vm_page_free_phys(vm_page_t m);
+static void vm_page_free_wakeup(void);
 static void vm_page_init_fakepg(void *dummy);
 static int vm_page_insert_after(vm_page_t m, vm_object_t object,
     vm_pindex_t pindex, vm_page_t mpred);
 static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object,
     vm_page_t mpred);
+static int vm_page_reclaim_run(int req_class, u_long npages, vm_page_t m_run,
+    vm_paddr_t high);
+static int vm_page_alloc_fail(vm_object_t object, int req);
 
 SYSINIT(vm_page, SI_SUB_VM, SI_ORDER_SECOND, vm_page_init_fakepg, NULL);
 
@@ -162,7 +177,7 @@
 {
 
 	fakepg_zone = uma_zcreate("fakepg", sizeof(struct vm_page), NULL, NULL,
-	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM); 
+	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM);
 }
 
 /* Make sure that u_long is at least 64 bits when PAGE_SIZE is 32K. */
@@ -210,43 +225,171 @@
 void
 vm_set_page_size(void)
 {
-	if (cnt.v_page_size == 0)
-		cnt.v_page_size = PAGE_SIZE;
-	if (((cnt.v_page_size - 1) & cnt.v_page_size) != 0)
+	if (vm_cnt.v_page_size == 0)
+		vm_cnt.v_page_size = PAGE_SIZE;
+	if (((vm_cnt.v_page_size - 1) & vm_cnt.v_page_size) != 0)
 		panic("vm_set_page_size: page size not a power of two");
 }
 
 /*
- *	vm_page_blacklist_lookup:
+ *	vm_page_blacklist_next:
  *
- *	See if a physical address in this page has been listed
- *	in the blacklist tunable.  Entries in the tunable are
- *	separated by spaces or commas.  If an invalid integer is
- *	encountered then the rest of the string is skipped.
+ *	Find the next entry in the provided string of blacklist
+ *	addresses.  Entries are separated by space, comma, or newline.
+ *	If an invalid integer is encountered then the rest of the
+ *	string is skipped.  Updates the list pointer to the next
+ *	character, or NULL if the string is exhausted or invalid.
  */
-static int
-vm_page_blacklist_lookup(char *list, vm_paddr_t pa)
+static vm_paddr_t
+vm_page_blacklist_next(char **list, char *end)
 {
 	vm_paddr_t bad;
 	char *cp, *pos;
 
-	for (pos = list; *pos != '\0'; pos = cp) {
+	if (list == NULL || *list == NULL)
+		return (0);
+	if (**list == '\0') {
+		*list = NULL;
+		return (0);
+	}
+
+	/*
+	 * If there's no end pointer then the buffer is coming from
+	 * the kenv and we know it's null-terminated.
+	 */
+	if (end == NULL)
+		end = *list + strlen(*list);
+
+	/* Ensure that strtoq() won't walk off the end */
+	if (*end != '\0') {
+		if (*end == '\n' || *end == ' ' || *end  == ',')
+			*end = '\0';
+		else {
+			printf("Blacklist not terminated, skipping\n");
+			*list = NULL;
+			return (0);
+		}
+	}
+
+	for (pos = *list; *pos != '\0'; pos = cp) {
 		bad = strtoq(pos, &cp, 0);
-		if (*cp != '\0') {
-			if (*cp == ' ' || *cp == ',') {
-				cp++;
-				if (cp == pos)
+		if (*cp == '\0' || *cp == ' ' || *cp == ',' || *cp == '\n') {
+			if (bad == 0) {
+				if (++cp < end)
 					continue;
-			} else
-				break;
-		}
-		if (pa == trunc_page(bad))
-			return (1);
+				else
+					break;
+			}
+		} else
+			break;
+		if (*cp == '\0' || ++cp >= end)
+			*list = NULL;
+		else
+			*list = cp;
+		return (trunc_page(bad));
 	}
+	printf("Garbage in RAM blacklist, skipping\n");
+	*list = NULL;
 	return (0);
 }
 
+bool
+vm_page_blacklist_add(vm_paddr_t pa, bool verbose)
+{
+	vm_page_t m;
+	int ret;
+
+	m = vm_phys_paddr_to_vm_page(pa);
+	if (m == NULL)
+		return (true); /* page does not exist, no failure */
+
+	mtx_lock(&vm_page_queue_free_mtx);
+	ret = vm_phys_unfree_page(m);
+	if (ret != 0)
+		vm_phys_freecnt_adj(m, -1);
+	mtx_unlock(&vm_page_queue_free_mtx);
+	if (ret != 0) {
+		TAILQ_INSERT_TAIL(&blacklist_head, m, listq);
+		if (verbose)
+			printf("Skipping page with pa 0x%jx\n", (uintmax_t)pa);
+	}
+	return (ret);
+}
+
+/*
+ *	vm_page_blacklist_check:
+ *
+ *	Iterate through the provided string of blacklist addresses, pulling
+ *	each entry out of the physical allocator free list and putting it
+ *	onto a list for reporting via the vm.page_blacklist sysctl.
+ */
 static void
+vm_page_blacklist_check(char *list, char *end)
+{
+	vm_paddr_t pa;
+	char *next;
+
+	next = list;
+	while (next != NULL) {
+		if ((pa = vm_page_blacklist_next(&next, end)) == 0)
+			continue;
+		vm_page_blacklist_add(pa, bootverbose);
+	}
+}
+
+/*
+ *	vm_page_blacklist_load:
+ *
+ *	Search for a special module named "ram_blacklist".  It'll be a
+ *	plain text file provided by the user via the loader directive
+ *	of the same name.
+ */
+static void
+vm_page_blacklist_load(char **list, char **end)
+{
+	void *mod;
+	u_char *ptr;
+	u_int len;
+
+	mod = NULL;
+	ptr = NULL;
+
+	mod = preload_search_by_type("ram_blacklist");
+	if (mod != NULL) {
+		ptr = preload_fetch_addr(mod);
+		len = preload_fetch_size(mod);
+	}
+	*list = ptr;
+	if (ptr != NULL)
+		*end = ptr + len;
+	else
+		*end = NULL;
+	return;
+}
+
+static int
+sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS)
+{
+	vm_page_t m;
+	struct sbuf sbuf;
+	int error, first;
+
+	first = 1;
+	error = sysctl_wire_old_buffer(req, 0);
+	if (error != 0)
+		return (error);
+	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
+	TAILQ_FOREACH(m, &blacklist_head, listq) {
+		sbuf_printf(&sbuf, "%s%#jx", first ? "" : ",",
+		    (uintmax_t)m->phys_addr);
+		first = 0;
+	}
+	error = sbuf_finish(&sbuf);
+	sbuf_delete(&sbuf);
+	return (error);
+}
+
+static void
 vm_page_domain_init(struct vm_domain *vmd)
 {
 	struct vm_pagequeue *pq;
@@ -255,16 +398,19 @@
 	*__DECONST(char **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_name) =
 	    "vm inactive pagequeue";
 	*__DECONST(u_int **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_vcnt) =
-	    &cnt.v_inactive_count;
+	    &vm_cnt.v_inactive_count;
 	*__DECONST(char **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_name) =
 	    "vm active pagequeue";
 	*__DECONST(u_int **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_vcnt) =
-	    &cnt.v_active_count;
+	    &vm_cnt.v_active_count;
+	*__DECONST(char **, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_name) =
+	    "vm laundry pagequeue";
+	*__DECONST(int **, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_vcnt) =
+	    &vm_cnt.v_laundry_count;
 	vmd->vmd_page_count = 0;
 	vmd->vmd_free_count = 0;
 	vmd->vmd_segs = 0;
 	vmd->vmd_oom = FALSE;
-	vmd->vmd_pass = 0;
 	for (i = 0; i < PQ_COUNT; i++) {
 		pq = &vmd->vmd_pagequeues[i];
 		TAILQ_INIT(&pq->pq_pl);
@@ -274,6 +420,29 @@
 }
 
 /*
+ * Initialize a physical page in preparation for adding it to the free
+ * lists.
+ */
+static void
+vm_page_init_page(vm_page_t m, vm_paddr_t pa, int segind)
+{
+
+	m->object = NULL;
+	m->wire_count = 0;
+	m->busy_lock = VPB_UNBUSIED;
+	m->hold_count = 0;
+	m->flags = 0;
+	m->phys_addr = pa;
+	m->queue = PQ_NONE;
+	m->psind = 0;
+	m->segind = segind;
+	m->order = VM_NFREEORDER;
+	m->pool = VM_FREEPOOL_DEFAULT;
+	m->valid = m->dirty = 0;
+	pmap_page_init(m);
+}
+
+/*
  *	vm_page_startup:
  *
  *	Initializes the resident memory module.  Allocates physical memory for
@@ -284,19 +453,16 @@
 vm_offset_t
 vm_page_startup(vm_offset_t vaddr)
 {
+	struct vm_domain *vmd;
+	struct vm_phys_seg *seg;
+	vm_page_t m;
+	char *list, *listend;
 	vm_offset_t mapped;
-	vm_paddr_t high_avail, low_avail, page_range, size;
-	vm_paddr_t new_end;
-	int i;
-	vm_paddr_t pa;
-	vm_paddr_t last_pa;
-	char *list;
+	vm_paddr_t end, high_avail, low_avail, new_end, page_range, size;
+	vm_paddr_t biggestsize, last_pa, pa;
+	u_long pagecount;
+	int biggestone, i, pages_per_zone, segind;
 
-	/* the biggest memory array is the second group of pages */
-	vm_paddr_t end;
-	vm_paddr_t biggestsize;
-	int biggestone;
-
 	biggestsize = 0;
 	biggestone = 0;
 	vaddr = round_page(vaddr);
@@ -305,15 +471,6 @@
 		phys_avail[i] = round_page(phys_avail[i]);
 		phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
 	}
-
-#ifdef XEN
-	/*
-	 * There is no obvious reason why i386 PV Xen needs vm_page structs
-	 * created for these pseudo-physical addresses.  XXX
-	 */
-	vm_phys_add_seg(0, phys_avail[0]);
-#endif
-
 	for (i = 0; phys_avail[i + 1]; i += 2) {
 		size = phys_avail[i + 1] - phys_avail[i];
 		if (size > biggestsize) {
@@ -334,9 +491,27 @@
 		vm_page_domain_init(&vm_dom[i]);
 
 	/*
+	 * Almost all of the pages needed for bootstrapping UMA are used
+	 * for zone structures, so if the number of CPUs results in those
+	 * structures taking more than one page each, we set aside more pages
+	 * in proportion to the zone structure size.
+	 */
+	pages_per_zone = howmany(sizeof(struct uma_zone) +
+	    sizeof(struct uma_cache) * (mp_maxid + 1) +
+	    roundup2(sizeof(struct uma_slab), sizeof(void *)), UMA_SLAB_SIZE);
+	if (pages_per_zone > 1) {
+		/* Reserve more pages so that we don't run out. */
+		boot_pages = UMA_BOOT_PAGES_ZONES * pages_per_zone;
+	}
+
+	/*
 	 * Allocate memory for use when boot strapping the kernel memory
 	 * allocator.
+	 *
+	 * CTFLAG_RDTUN doesn't work during the early boot process, so we must
+	 * manually fetch the value.
 	 */
+	TUNABLE_INT_FETCH("vm.boot_pages", &boot_pages);
 	new_end = end - (boot_pages * UMA_SLAB_SIZE);
 	new_end = trunc_page(new_end);
 	mapped = pmap_map(&vaddr, new_end, end,
@@ -344,8 +519,8 @@
 	bzero((void *)mapped, end - new_end);
 	uma_startup((void *)mapped, boot_pages);
 
-#if defined(__amd64__) || defined(__i386__) || defined(__arm__) || \
-    defined(__mips__)
+#if defined(__aarch64__) || defined(__amd64__) || defined(__arm__) || \
+    defined(__i386__) || defined(__mips__)
 	/*
 	 * Allocate a bitmap to indicate that a random physical page
 	 * needs to be included in a minidump.
@@ -367,8 +542,10 @@
 	vm_page_dump = (void *)(uintptr_t)pmap_map(&vaddr, new_end,
 	    new_end + vm_page_dump_size, VM_PROT_READ | VM_PROT_WRITE);
 	bzero((void *)vm_page_dump, vm_page_dump_size);
+#else
+	(void)last_pa;
 #endif
-#if defined(__amd64__) || defined(__mips__)
+#if defined(__aarch64__) || defined(__amd64__) || defined(__mips__)
 	/*
 	 * Include the UMA bootstrap pages and vm_page_dump in a crash dump.
 	 * When pmap_map() uses the direct map, they are not automatically 
@@ -471,7 +648,9 @@
 	new_end = trunc_page(end - page_range * sizeof(struct vm_page));
 	mapped = pmap_map(&vaddr, new_end, end,
 	    VM_PROT_READ | VM_PROT_WRITE);
-	vm_page_array = (vm_page_t) mapped;
+	vm_page_array = (vm_page_t)mapped;
+	vm_page_array_size = page_range;
+
 #if VM_NRESERVLEVEL > 0
 	/*
 	 * Allocate physical memory for the reservation management system's
@@ -481,13 +660,13 @@
 		high_avail = new_end;
 	new_end = vm_reserv_startup(&vaddr, new_end, high_avail);
 #endif
-#if defined(__amd64__) || defined(__mips__)
+#if defined(__aarch64__) || defined(__amd64__) || defined(__mips__)
 	/*
 	 * Include vm_page_array and vm_reserv_array in a crash dump.
 	 */
 	for (pa = new_end; pa < end; pa += PAGE_SIZE)
 		dump_add_page(pa);
-#endif	
+#endif
 	phys_avail[biggestone + 1] = new_end;
 
 	/*
@@ -498,38 +677,60 @@
 		vm_phys_add_seg(phys_avail[i], phys_avail[i + 1]);
 
 	/*
-	 * Clear all of the page structures
-	 */
-	bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page));
-	for (i = 0; i < page_range; i++)
-		vm_page_array[i].order = VM_NFREEORDER;
-	vm_page_array_size = page_range;
-
-	/*
 	 * Initialize the physical memory allocator.
 	 */
 	vm_phys_init();
 
 	/*
-	 * Add every available physical page that is not blacklisted to
-	 * the free lists.
+	 * Initialize the page structures and add every available page to the
+	 * physical memory allocator's free lists.
 	 */
-	cnt.v_page_count = 0;
-	cnt.v_free_count = 0;
-	list = getenv("vm.blacklist");
-	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
-		pa = phys_avail[i];
-		last_pa = phys_avail[i + 1];
-		while (pa < last_pa) {
-			if (list != NULL &&
-			    vm_page_blacklist_lookup(list, pa))
-				printf("Skipping page with pa 0x%jx\n",
-				    (uintmax_t)pa);
-			else
-				vm_phys_add_page(pa);
-			pa += PAGE_SIZE;
+	vm_cnt.v_page_count = 0;
+	vm_cnt.v_free_count = 0;
+	for (segind = 0; segind < vm_phys_nsegs; segind++) {
+		seg = &vm_phys_segs[segind];
+		for (m = seg->first_page, pa = seg->start; pa < seg->end;
+		    m++, pa += PAGE_SIZE)
+			vm_page_init_page(m, pa, segind);
+
+		/*
+		 * Add the segment to the free lists only if it is covered by
+		 * one of the ranges in phys_avail.  Because we've added the
+		 * ranges to the vm_phys_segs array, we can assume that each
+		 * segment is either entirely contained in one of the ranges,
+		 * or doesn't overlap any of them.
+		 */
+		for (i = 0; phys_avail[i + 1] != 0; i += 2) {
+			if (seg->start < phys_avail[i] ||
+			    seg->end > phys_avail[i + 1])
+				continue;
+
+			m = seg->first_page;
+			pagecount = (u_long)atop(seg->end - seg->start);
+
+			mtx_lock(&vm_page_queue_free_mtx);
+			vm_phys_free_contig(m, pagecount);
+			vm_phys_freecnt_adj(m, (int)pagecount);
+			mtx_unlock(&vm_page_queue_free_mtx);
+			vm_cnt.v_page_count += (u_int)pagecount;
+
+			vmd = &vm_dom[seg->domain];
+			vmd->vmd_page_count += (u_int)pagecount;
+			vmd->vmd_segs |= 1UL << m->segind;
+			break;
 		}
 	}
+
+	/*
+	 * Remove blacklisted pages from the physical memory allocator.
+	 */
+	TAILQ_INIT(&blacklist_head);
+	vm_page_blacklist_load(&list, &listend);
+	vm_page_blacklist_check(list, listend);
+
+	list = kern_getenv("vm.blacklist");
+	vm_page_blacklist_check(list, NULL);
+
 	freeenv(list);
 #if VM_NRESERVLEVEL > 0
 	/*
@@ -603,6 +804,7 @@
 {
 	u_int x;
 
+	vm_page_lock_assert(m, MA_NOTOWNED);
 	vm_page_assert_sbusied(m);
 
 	for (;;) {
@@ -683,6 +885,41 @@
 	}
 }
 
+static void
+vm_page_xunbusy_locked(vm_page_t m)
+{
+
+	vm_page_assert_xbusied(m);
+	vm_page_assert_locked(m);
+
+	atomic_store_rel_int(&m->busy_lock, VPB_UNBUSIED);
+	/* There is a waiter, do wakeup() instead of vm_page_flash(). */
+	wakeup(m);
+}
+
+void
+vm_page_xunbusy_maybelocked(vm_page_t m)
+{
+	bool lockacq;
+
+	vm_page_assert_xbusied(m);
+
+	/*
+	 * Fast path for unbusy.  If it succeeds, we know that there
+	 * are no waiters, so we do not need a wakeup.
+	 */
+	if (atomic_cmpset_rel_int(&m->busy_lock, VPB_SINGLE_EXCLUSIVER,
+	    VPB_UNBUSIED))
+		return;
+
+	lockacq = !mtx_owned(vm_page_lockptr(m));
+	if (lockacq)
+		vm_page_lock(m);
+	vm_page_xunbusy_locked(m);
+	if (lockacq)
+		vm_page_unlock(m);
+}
+
 /*
  *	vm_page_xunbusy_hard:
  *
@@ -696,8 +933,7 @@
 	vm_page_assert_xbusied(m);
 
 	vm_page_lock(m);
-	atomic_store_rel_int(&m->busy_lock, VPB_UNBUSIED);
-	wakeup(m);
+	vm_page_xunbusy_locked(m);
 	vm_page_unlock(m);
 }
 
@@ -728,6 +964,23 @@
 }
 
 /*
+ * Avoid releasing and reacquiring the same page lock.
+ */
+void
+vm_page_change_lock(vm_page_t m, struct mtx **mtx)
+{
+	struct mtx *mtx1;
+
+	mtx1 = vm_page_lockptr(m);
+	if (*mtx == mtx1)
+		return;
+	if (*mtx != NULL)
+		mtx_unlock(*mtx);
+	*mtx = mtx1;
+	mtx_lock(mtx1);
+}
+
+/*
  * Keep page from being freed by the page daemon
  * much of the same effect as wiring, except much lower
  * overhead and should be used only for *very* temporary
@@ -756,24 +1009,15 @@
  *	vm_page_unhold_pages:
  *
  *	Unhold each of the pages that is referenced by the given array.
- */ 
+ */
 void
 vm_page_unhold_pages(vm_page_t *ma, int count)
 {
-	struct mtx *mtx, *new_mtx;
+	struct mtx *mtx;
 
 	mtx = NULL;
 	for (; count != 0; count--) {
-		/*
-		 * Avoid releasing and reacquiring the same page lock.
-		 */
-		new_mtx = vm_page_lockptr(*ma);
-		if (mtx != new_mtx) {
-			if (mtx != NULL)
-				mtx_unlock(mtx);
-			mtx = new_mtx;
-			mtx_lock(mtx);
-		}
+		vm_page_change_lock(*ma, &mtx);
 		vm_page_unhold(*ma);
 		ma++;
 	}
@@ -905,39 +1149,29 @@
 }
 
 /*
- * Unbusy and handle the page queueing for a page from the VOP_GETPAGES()
- * array which is not the request page.
+ * Unbusy and handle the page queueing for a page from a getpages request that
+ * was optionally read ahead or behind.
  */
 void
 vm_page_readahead_finish(vm_page_t m)
 {
 
-	if (m->valid != 0) {
-		/*
-		 * Since the page is not the requested page, whether
-		 * it should be activated or deactivated is not
-		 * obvious.  Empirical results have shown that
-		 * deactivating the page is usually the best choice,
-		 * unless the page is wanted by another thread.
-		 */
-		vm_page_lock(m);
-		if ((m->busy_lock & VPB_BIT_WAITERS) != 0)
-			vm_page_activate(m);
-		else
-			vm_page_deactivate(m);
-		vm_page_unlock(m);
-		vm_page_xunbusy(m);
-	} else {
-		/*
-		 * Free the completely invalid page.  Such page state
-		 * occurs due to the short read operation which did
-		 * not covered our page at all, or in case when a read
-		 * error happens.
-		 */
-		vm_page_lock(m);
-		vm_page_free(m);
-		vm_page_unlock(m);
-	}
+	/* We shouldn't put invalid pages on queues. */
+	KASSERT(m->valid != 0, ("%s: %p is invalid", __func__, m));
+
+	/*
+	 * Since the page is not the actually needed one, whether it should
+	 * be activated or deactivated is not obvious.  Empirical results
+	 * have shown that deactivating the page is usually the best choice,
+	 * unless the page is wanted by another thread.
+	 */
+	vm_page_lock(m);
+	if ((m->busy_lock & VPB_BIT_WAITERS) != 0)
+		vm_page_activate(m);
+	else
+		vm_page_deactivate(m);
+	vm_page_unlock(m);
+	vm_page_xunbusy(m);
 }
 
 /*
@@ -991,11 +1225,7 @@
 vm_page_dirty_KBI(vm_page_t m)
 {
 
-	/* These assertions refer to this operation by its public name. */
-	KASSERT((m->flags & PG_CACHED) == 0,
-	    ("vm_page_dirty: page in cache!"));
-	KASSERT(!VM_PAGE_IS_FREE(m),
-	    ("vm_page_dirty: page is free!"));
+	/* Refer to this operation by its public name. */
 	KASSERT(m->valid == VM_PAGE_BITS_ALL,
 	    ("vm_page_dirty: page is invalid!"));
 	m->dirty = VM_PAGE_BITS_ALL;
@@ -1119,9 +1349,8 @@
 /*
  *	vm_page_remove:
  *
- *	Removes the given mem entry from the object/offset-page
- *	table and the object page list, but do not invalidate/terminate
- *	the backing store.
+ *	Removes the specified page from its containing object, but does not
+ *	invalidate any backing storage.
  *
  *	The object must be locked.  The page must be locked if it is managed.
  */
@@ -1129,30 +1358,21 @@
 vm_page_remove(vm_page_t m)
 {
 	vm_object_t object;
-	boolean_t lockacq;
+	vm_page_t mrem;
 
 	if ((m->oflags & VPO_UNMANAGED) == 0)
-		vm_page_lock_assert(m, MA_OWNED);
+		vm_page_assert_locked(m);
 	if ((object = m->object) == NULL)
 		return;
 	VM_OBJECT_ASSERT_WLOCKED(object);
-	if (vm_page_xbusied(m)) {
-		lockacq = FALSE;
-		if ((m->oflags & VPO_UNMANAGED) != 0 &&
-		    !mtx_owned(vm_page_lockptr(m))) {
-			lockacq = TRUE;
-			vm_page_lock(m);
-		}
-		vm_page_flash(m);
-		atomic_store_rel_int(&m->busy_lock, VPB_UNBUSIED);
-		if (lockacq)
-			vm_page_unlock(m);
-	}
+	if (vm_page_xbusied(m))
+		vm_page_xunbusy_maybelocked(m);
+	mrem = vm_radix_remove(&object->rtree, m->pindex);
+	KASSERT(mrem == m, ("removed page %p, expected page %p", mrem, m));
 
 	/*
 	 * Now remove from the object's list of backed pages.
 	 */
-	vm_radix_remove(&object->rtree, m->pindex);
 	TAILQ_REMOVE(&object->memq, m, listq);
 
 	/*
@@ -1215,7 +1435,7 @@
 {
 	vm_page_t next;
 
-	VM_OBJECT_ASSERT_WLOCKED(m->object);
+	VM_OBJECT_ASSERT_LOCKED(m->object);
 	if ((next = TAILQ_NEXT(m, listq)) != NULL) {
 		MPASS(next->object == m->object);
 		if (next->pindex != m->pindex + 1)
@@ -1235,7 +1455,7 @@
 {
 	vm_page_t prev;
 
-	VM_OBJECT_ASSERT_WLOCKED(m->object);
+	VM_OBJECT_ASSERT_LOCKED(m->object);
 	if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL) {
 		MPASS(prev->object == m->object);
 		if (prev->pindex != m->pindex - 1)
@@ -1253,9 +1473,13 @@
 vm_page_t
 vm_page_replace(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex)
 {
-	vm_page_t mold, mpred;
+	vm_page_t mold;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
+	KASSERT(mnew->object == NULL,
+	    ("vm_page_replace: page %p already in object", mnew));
+	KASSERT(mnew->queue == PQ_NONE,
+	    ("vm_page_replace: new page %p is on a paging queue", mnew));
 
 	/*
 	 * This function mostly follows vm_page_insert() and
@@ -1262,31 +1486,24 @@
 	 * vm_page_remove() without the radix, object count and vnode
 	 * dance.  Double check such functions for more comments.
 	 */
-	mpred = vm_radix_lookup(&object->rtree, pindex);
-	KASSERT(mpred != NULL,
-	    ("vm_page_replace: replacing page not present with pindex"));
-	mpred = TAILQ_PREV(mpred, respgs, listq);
-	if (mpred != NULL)
-		KASSERT(mpred->pindex < pindex,
-		    ("vm_page_insert_after: mpred doesn't precede pindex"));
 
 	mnew->object = object;
 	mnew->pindex = pindex;
 	mold = vm_radix_replace(&object->rtree, mnew);
 	KASSERT(mold->queue == PQ_NONE,
-	    ("vm_page_replace: mold is on a paging queue"));
+	    ("vm_page_replace: old page %p is on a paging queue", mold));
 
-	/* Detach the old page from the resident tailq. */
+	/* Keep the resident page list in sorted order. */
+	TAILQ_INSERT_AFTER(&object->memq, mold, mnew, listq);
 	TAILQ_REMOVE(&object->memq, mold, listq);
 
 	mold->object = NULL;
-	vm_page_xunbusy(mold);
+	vm_page_xunbusy_maybelocked(mold);
 
-	/* Insert the new page in the resident tailq. */
-	if (mpred != NULL)
-		TAILQ_INSERT_AFTER(&object->memq, mpred, mnew, listq);
-	else
-		TAILQ_INSERT_HEAD(&object->memq, mnew, listq);
+	/*
+	 * The object's resident_page_count does not change because we have
+	 * swapped one page for another, but OBJ_MIGHTBEDIRTY may need to be
+	 * set if the new page is writably mapped.
+	 */
 	if (pmap_page_is_write_mapped(mnew))
 		vm_object_set_writeable_dirty(object);
 	return (mold);
@@ -1306,9 +1523,7 @@
  *
  *	Note: we *always* dirty the page.  It is necessary both for the
  *	      fact that we moved it, and because we may be invalidating
- *	      swap.  If the page is on the cache, we have to deactivate it
- *	      or vm_page_dirty() will panic.  Dirty pages are not allowed
- *	      on the cache.
+ *	      swap.
  *
  *	The objects must be locked.
  */
@@ -1354,142 +1569,6 @@
 }
 
 /*
- *	Convert all of the given object's cached pages that have a
- *	pindex within the given range into free pages.  If the value
- *	zero is given for "end", then the range's upper bound is
- *	infinity.  If the given object is backed by a vnode and it
- *	transitions from having one or more cached pages to none, the
- *	vnode's hold count is reduced. 
- */
-void
-vm_page_cache_free(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
-{
-	vm_page_t m;
-	boolean_t empty;
-
-	mtx_lock(&vm_page_queue_free_mtx);
-	if (__predict_false(vm_radix_is_empty(&object->cache))) {
-		mtx_unlock(&vm_page_queue_free_mtx);
-		return;
-	}
-	while ((m = vm_radix_lookup_ge(&object->cache, start)) != NULL) {
-		if (end != 0 && m->pindex >= end)
-			break;
-		vm_radix_remove(&object->cache, m->pindex);
-		vm_page_cache_turn_free(m);
-	}
-	empty = vm_radix_is_empty(&object->cache);
-	mtx_unlock(&vm_page_queue_free_mtx);
-	if (object->type == OBJT_VNODE && empty)
-		vdrop(object->handle);
-}
-
-/*
- *	Returns the cached page that is associated with the given
- *	object and offset.  If, however, none exists, returns NULL.
- *
- *	The free page queue must be locked.
- */
-static inline vm_page_t
-vm_page_cache_lookup(vm_object_t object, vm_pindex_t pindex)
-{
-
-	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
-	return (vm_radix_lookup(&object->cache, pindex));
-}
-
-/*
- *	Remove the given cached page from its containing object's
- *	collection of cached pages.
- *
- *	The free page queue must be locked.
- */
-static void
-vm_page_cache_remove(vm_page_t m)
-{
-
-	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
-	KASSERT((m->flags & PG_CACHED) != 0,
-	    ("vm_page_cache_remove: page %p is not cached", m));
-	vm_radix_remove(&m->object->cache, m->pindex);
-	m->object = NULL;
-	cnt.v_cache_count--;
-}
-
-/*
- *	Transfer all of the cached pages with offset greater than or
- *	equal to 'offidxstart' from the original object's cache to the
- *	new object's cache.  However, any cached pages with offset
- *	greater than or equal to the new object's size are kept in the
- *	original object.  Initially, the new object's cache must be
- *	empty.  Offset 'offidxstart' in the original object must
- *	correspond to offset zero in the new object.
- *
- *	The new object must be locked.
- */
-void
-vm_page_cache_transfer(vm_object_t orig_object, vm_pindex_t offidxstart,
-    vm_object_t new_object)
-{
-	vm_page_t m;
-
-	/*
-	 * Insertion into an object's collection of cached pages
-	 * requires the object to be locked.  In contrast, removal does
-	 * not.
-	 */
-	VM_OBJECT_ASSERT_WLOCKED(new_object);
-	KASSERT(vm_radix_is_empty(&new_object->cache),
-	    ("vm_page_cache_transfer: object %p has cached pages",
-	    new_object));
-	mtx_lock(&vm_page_queue_free_mtx);
-	while ((m = vm_radix_lookup_ge(&orig_object->cache,
-	    offidxstart)) != NULL) {
-		/*
-		 * Transfer all of the pages with offset greater than or
-		 * equal to 'offidxstart' from the original object's
-		 * cache to the new object's cache.
-		 */
-		if ((m->pindex - offidxstart) >= new_object->size)
-			break;
-		vm_radix_remove(&orig_object->cache, m->pindex);
-		/* Update the page's object and offset. */
-		m->object = new_object;
-		m->pindex -= offidxstart;
-		if (vm_radix_insert(&new_object->cache, m))
-			vm_page_cache_turn_free(m);
-	}
-	mtx_unlock(&vm_page_queue_free_mtx);
-}
-
-/*
- *	Returns TRUE if a cached page is associated with the given object and
- *	offset, and FALSE otherwise.
- *
- *	The object must be locked.
- */
-boolean_t
-vm_page_is_cached(vm_object_t object, vm_pindex_t pindex)
-{
-	vm_page_t m;
-
-	/*
-	 * Insertion into an object's collection of cached pages requires the
-	 * object to be locked.  Therefore, if the object is locked and the
-	 * object's collection is empty, there is no need to acquire the free
-	 * page queues lock in order to prove that the specified page doesn't
-	 * exist.
-	 */
-	VM_OBJECT_ASSERT_WLOCKED(object);
-	if (__predict_true(vm_object_cache_is_empty(object)))
-		return (FALSE);
-	mtx_lock(&vm_page_queue_free_mtx);
-	m = vm_page_cache_lookup(object, pindex);
-	mtx_unlock(&vm_page_queue_free_mtx);
-	return (m != NULL);
-}
-
-/*
  *	vm_page_alloc:
  *
  *	Allocate and return a page that is associated with the specified
@@ -1505,13 +1584,10 @@
  *	optional allocation flags:
  *	VM_ALLOC_COUNT(number)	the number of additional pages that the caller
  *				intends to allocate
- *	VM_ALLOC_IFCACHED	return page only if it is cached
- *	VM_ALLOC_IFNOTCACHED	return NULL, do not reactivate if the page
- *				is cached
  *	VM_ALLOC_NOBUSY		do not exclusive busy the page
  *	VM_ALLOC_NODUMP		do not include the page in a kernel core dump
  *	VM_ALLOC_NOOBJ		page is not associated with an object and
- *				should not be exclusive busy 
+ *				should not be exclusive busy
  *	VM_ALLOC_SBUSY		shared busy the allocated page
  *	VM_ALLOC_WIRED		wire the allocated page
  *	VM_ALLOC_ZERO		prefer a zeroed page
@@ -1521,21 +1597,41 @@
 vm_page_t
 vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req)
 {
-	struct vnode *vp = NULL;
-	vm_object_t m_object;
-	vm_page_t m, mpred;
+
+	return (vm_page_alloc_after(object, pindex, req, object != NULL ?
+	    vm_radix_lookup_le(&object->rtree, pindex) : NULL));
+}
+
+/*
+ * Allocate a page in the specified object with the given page index.  To
+ * optimize insertion of the page into the object, the caller must also specifiy
+ * the resident page in the object with largest index smaller than the given
+ * page index, or NULL if no such page exists.
+ */
+vm_page_t
+vm_page_alloc_after(vm_object_t object, vm_pindex_t pindex, int req,
+    vm_page_t mpred)
+{
+	vm_page_t m;
 	int flags, req_class;
+	u_int free_count;
 
-	mpred = 0;	/* XXX: pacify gcc */
 	KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) &&
 	    (object != NULL || (req & VM_ALLOC_SBUSY) == 0) &&
 	    ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) !=
 	    (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)),
-	    ("vm_page_alloc: inconsistent object(%p)/req(%x)", (void *)object,
-	    req));
+	    ("inconsistent object(%p)/req(%x)", object, req));
+	KASSERT(object == NULL || (req & VM_ALLOC_WAITOK) == 0,
+	    ("Can't sleep and retry object insertion."));
+	KASSERT(mpred == NULL || mpred->pindex < pindex,
+	    ("mpred %p doesn't precede pindex 0x%jx", mpred,
+	    (uintmax_t)pindex));
 	if (object != NULL)
 		VM_OBJECT_ASSERT_WLOCKED(object);
 
+	if (__predict_false((req & VM_ALLOC_IFCACHED) != 0))
+		return (NULL);
+
 	req_class = req & VM_ALLOC_CLASS_MASK;
 
 	/*
@@ -1544,52 +1640,29 @@
 	if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
 		req_class = VM_ALLOC_SYSTEM;
 
-	if (object != NULL) {
-		mpred = vm_radix_lookup_le(&object->rtree, pindex);
-		KASSERT(mpred == NULL || mpred->pindex != pindex,
-		   ("vm_page_alloc: pindex already allocated"));
-	}
-
 	/*
-	 * The page allocation request can came from consumers which already
-	 * hold the free page queue mutex, like vm_page_insert() in
-	 * vm_page_cache().
+	 * Allocate a page if the number of free pages exceeds the minimum
+	 * for the request class.
 	 */
-	mtx_lock_flags(&vm_page_queue_free_mtx, MTX_RECURSE);
-	if (cnt.v_free_count + cnt.v_cache_count > cnt.v_free_reserved ||
+again:
+	mtx_lock(&vm_page_queue_free_mtx);
+	if (vm_cnt.v_free_count > vm_cnt.v_free_reserved ||
 	    (req_class == VM_ALLOC_SYSTEM &&
-	    cnt.v_free_count + cnt.v_cache_count > cnt.v_interrupt_free_min) ||
+	    vm_cnt.v_free_count > vm_cnt.v_interrupt_free_min) ||
 	    (req_class == VM_ALLOC_INTERRUPT &&
-	    cnt.v_free_count + cnt.v_cache_count > 0)) {
+	    vm_cnt.v_free_count > 0)) {
 		/*
-		 * Allocate from the free queue if the number of free pages
-		 * exceeds the minimum for the request class.
+		 * Can we allocate the page from a reservation?
 		 */
-		if (object != NULL &&
-		    (m = vm_page_cache_lookup(object, pindex)) != NULL) {
-			if ((req & VM_ALLOC_IFNOTCACHED) != 0) {
-				mtx_unlock(&vm_page_queue_free_mtx);
-				return (NULL);
-			}
-			if (vm_phys_unfree_page(m))
-				vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, 0);
 #if VM_NRESERVLEVEL > 0
-			else if (!vm_reserv_reactivate_page(m))
-#else
-			else
-#endif
-				panic("vm_page_alloc: cache page %p is missing"
-				    " from the free queue", m);
-		} else if ((req & VM_ALLOC_IFCACHED) != 0) {
-			mtx_unlock(&vm_page_queue_free_mtx);
-			return (NULL);
-#if VM_NRESERVLEVEL > 0
-		} else if (object == NULL || (object->flags & (OBJ_COLORED |
+		if (object == NULL || (object->flags & (OBJ_COLORED |
 		    OBJ_FICTITIOUS)) != OBJ_COLORED || (m =
-		    vm_reserv_alloc_page(object, pindex, mpred)) == NULL) {
-#else
-		} else {
+		    vm_reserv_alloc_page(object, pindex, mpred)) == NULL)
 #endif
+		{
+			/*
+			 * If not, allocate it from the free page queues.
+			 */
 			m = vm_phys_alloc_pages(object != NULL ?
 			    VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT, 0);
 #if VM_NRESERVLEVEL > 0
@@ -1604,10 +1677,8 @@
 		/*
 		 * Not allocatable, give up.
 		 */
-		mtx_unlock(&vm_page_queue_free_mtx);
-		atomic_add_int(&vm_pageout_deficit,
-		    max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1));
-		pagedaemon_wakeup();
+		if (vm_page_alloc_fail(object, req))
+			goto again;
 		return (NULL);
 	}
 
@@ -1614,52 +1685,23 @@
 	/*
 	 *  At this point we had better have found a good page.
 	 */
-	KASSERT(m != NULL, ("vm_page_alloc: missing page"));
-	KASSERT(m->queue == PQ_NONE,
-	    ("vm_page_alloc: page %p has unexpected queue %d", m, m->queue));
-	KASSERT(m->wire_count == 0, ("vm_page_alloc: page %p is wired", m));
-	KASSERT(m->hold_count == 0, ("vm_page_alloc: page %p is held", m));
-	KASSERT(!vm_page_busied(m), ("vm_page_alloc: page %p is busy", m));
-	KASSERT(m->dirty == 0, ("vm_page_alloc: page %p is dirty", m));
-	KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT,
-	    ("vm_page_alloc: page %p has unexpected memattr %d", m,
-	    pmap_page_get_memattr(m)));
-	if ((m->flags & PG_CACHED) != 0) {
-		KASSERT((m->flags & PG_ZERO) == 0,
-		    ("vm_page_alloc: cached page %p is PG_ZERO", m));
-		KASSERT(m->valid != 0,
-		    ("vm_page_alloc: cached page %p is invalid", m));
-		if (m->object == object && m->pindex == pindex)
-	  		cnt.v_reactivated++;
-		else
-			m->valid = 0;
-		m_object = m->object;
-		vm_page_cache_remove(m);
-		if (m_object->type == OBJT_VNODE &&
-		    vm_object_cache_is_empty(m_object))
-			vp = m_object->handle;
-	} else {
-		KASSERT(VM_PAGE_IS_FREE(m),
-		    ("vm_page_alloc: page %p is not free", m));
-		KASSERT(m->valid == 0,
-		    ("vm_page_alloc: free page %p is valid", m));
-		vm_phys_freecnt_adj(m, -1);
-	}
+	KASSERT(m != NULL, ("missing page"));
+	free_count = vm_phys_freecnt_adj(m, -1);
+	if ((m->flags & PG_ZERO) != 0)
+		vm_page_zero_count--;
+	mtx_unlock(&vm_page_queue_free_mtx);
+	vm_page_alloc_check(m);
 
 	/*
-	 * Only the PG_ZERO flag is inherited.  The PG_CACHED or PG_FREE flag
-	 * must be cleared before the free page queues lock is released.
+	 * Initialize the page.  Only the PG_ZERO flag is inherited.
 	 */
 	flags = 0;
-	if (m->flags & PG_ZERO) {
-		vm_page_zero_count--;
-		if (req & VM_ALLOC_ZERO)
-			flags = PG_ZERO;
-	}
-	if (req & VM_ALLOC_NODUMP)
+	if ((req & VM_ALLOC_ZERO) != 0)
+		flags = PG_ZERO;
+	flags &= m->flags;
+	if ((req & VM_ALLOC_NODUMP) != 0)
 		flags |= PG_NODUMP;
 	m->flags = flags;
-	mtx_unlock(&vm_page_queue_free_mtx);
 	m->aflags = 0;
 	m->oflags = object == NULL || (object->flags & OBJ_UNMANAGED) != 0 ?
 	    VPO_UNMANAGED : 0;
@@ -1673,7 +1715,7 @@
 		 * The page lock is not required for wiring a page until that
 		 * page is inserted into the object.
 		 */
-		atomic_add_int(&cnt.v_wire_count, 1);
+		atomic_add_int(&vm_cnt.v_wire_count, 1);
 		m->wire_count = 1;
 	}
 	m->act_count = 0;
@@ -1680,18 +1722,21 @@
 
 	if (object != NULL) {
 		if (vm_page_insert_after(m, object, pindex, mpred)) {
-			/* See the comment below about hold count. */
-			if (vp != NULL)
-				vdrop(vp);
 			pagedaemon_wakeup();
 			if (req & VM_ALLOC_WIRED) {
-				atomic_subtract_int(&cnt.v_wire_count, 1);
+				atomic_subtract_int(&vm_cnt.v_wire_count, 1);
 				m->wire_count = 0;
 			}
-			m->object = NULL;
+			KASSERT(m->object == NULL, ("page %p has object", m));
 			m->oflags = VPO_UNMANAGED;
 			m->busy_lock = VPB_UNBUSIED;
-			vm_page_free(m);
+			/* Don't change PG_ZERO. */
+			vm_page_free_toq(m);
+			if (req & VM_ALLOC_WAITFAIL) {
+				VM_OBJECT_WUNLOCK(object);
+				vm_radix_wait();
+				VM_OBJECT_WLOCK(object);
+			}
 			return (NULL);
 		}
 
@@ -1703,34 +1748,15 @@
 		m->pindex = pindex;
 
 	/*
-	 * The following call to vdrop() must come after the above call
-	 * to vm_page_insert() in case both affect the same object and
-	 * vnode.  Otherwise, the affected vnode's hold count could
-	 * temporarily become zero.
-	 */
-	if (vp != NULL)
-		vdrop(vp);
-
-	/*
 	 * Don't wakeup too often - wakeup the pageout daemon when
 	 * we would be nearly out of memory.
 	 */
-	if (vm_paging_needed())
+	if (vm_paging_needed(free_count))
 		pagedaemon_wakeup();
 
 	return (m);
 }
 
-static void
-vm_page_alloc_contig_vdrop(struct spglist *lst)
-{
-
-	while (!SLIST_EMPTY(lst)) {
-		vdrop((struct vnode *)SLIST_FIRST(lst)-> plinks.s.pv);
-		SLIST_REMOVE_HEAD(lst, plinks.s.ss);
-	}
-}
-
 /*
  *	vm_page_alloc_contig:
  *
@@ -1752,6 +1778,8 @@
  *	memory attribute setting for the physical pages cannot be configured
  *	to VM_MEMATTR_DEFAULT.
  *
+ *	The specified object may not contain fictitious pages.
+ *
  *	The caller must always specify an allocation class.
  *
  *	allocation classes:
@@ -1763,7 +1791,7 @@
  *	VM_ALLOC_NOBUSY		do not exclusive busy the page
  *	VM_ALLOC_NODUMP		do not include the page in a kernel core dump
  *	VM_ALLOC_NOOBJ		page is not associated with an object and
- *				should not be exclusive busy 
+ *				should not be exclusive busy
  *	VM_ALLOC_SBUSY		shared busy the allocated page
  *	VM_ALLOC_WIRED		wire the allocated page
  *	VM_ALLOC_ZERO		prefer a zeroed page
@@ -1775,22 +1803,23 @@
     u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
     vm_paddr_t boundary, vm_memattr_t memattr)
 {
-	struct vnode *drop;
-	struct spglist deferred_vdrop_list;
-	vm_page_t m, m_tmp, m_ret;
-	u_int flags, oflags;
+	vm_page_t m, m_ret, mpred;
+	u_int busy_lock, flags, oflags;
 	int req_class;
 
+	mpred = NULL;	/* XXX: pacify gcc */
 	KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) &&
 	    (object != NULL || (req & VM_ALLOC_SBUSY) == 0) &&
 	    ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) !=
 	    (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)),
-	    ("vm_page_alloc: inconsistent object(%p)/req(%x)", (void *)object,
+	    ("vm_page_alloc_contig: inconsistent object(%p)/req(%x)", object,
 	    req));
+	KASSERT(object == NULL || (req & VM_ALLOC_WAITOK) == 0,
+	    ("Can't sleep and retry object insertion."));
 	if (object != NULL) {
 		VM_OBJECT_ASSERT_WLOCKED(object);
-		KASSERT(object->type == OBJT_PHYS,
-		    ("vm_page_alloc_contig: object %p isn't OBJT_PHYS",
+		KASSERT((object->flags & OBJ_FICTITIOUS) == 0,
+		    ("vm_page_alloc_contig: object %p has fictitious pages",
 		    object));
 	}
 	KASSERT(npages > 0, ("vm_page_alloc_contig: npages is zero"));
@@ -1802,40 +1831,48 @@
 	if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
 		req_class = VM_ALLOC_SYSTEM;
 
-	SLIST_INIT(&deferred_vdrop_list);
+	if (object != NULL) {
+		mpred = vm_radix_lookup_le(&object->rtree, pindex);
+		KASSERT(mpred == NULL || mpred->pindex != pindex,
+		    ("vm_page_alloc_contig: pindex already allocated"));
+	}
+
+	/*
+	 * Can we allocate the pages without the number of free pages falling
+	 * below the lower bound for the allocation class?
+	 */
+again:
 	mtx_lock(&vm_page_queue_free_mtx);
-	if (cnt.v_free_count + cnt.v_cache_count >= npages +
-	    cnt.v_free_reserved || (req_class == VM_ALLOC_SYSTEM &&
-	    cnt.v_free_count + cnt.v_cache_count >= npages +
-	    cnt.v_interrupt_free_min) || (req_class == VM_ALLOC_INTERRUPT &&
-	    cnt.v_free_count + cnt.v_cache_count >= npages)) {
+	if (vm_cnt.v_free_count >= npages + vm_cnt.v_free_reserved ||
+	    (req_class == VM_ALLOC_SYSTEM &&
+	    vm_cnt.v_free_count >= npages + vm_cnt.v_interrupt_free_min) ||
+	    (req_class == VM_ALLOC_INTERRUPT &&
+	    vm_cnt.v_free_count >= npages)) {
+		/*
+		 * Can we allocate the pages from a reservation?
+		 */
 #if VM_NRESERVLEVEL > 0
 retry:
 		if (object == NULL || (object->flags & OBJ_COLORED) == 0 ||
 		    (m_ret = vm_reserv_alloc_contig(object, pindex, npages,
-		    low, high, alignment, boundary)) == NULL)
+		    low, high, alignment, boundary, mpred)) == NULL)
 #endif
+			/*
+			 * If not, allocate them from the free page queues.
+			 */
 			m_ret = vm_phys_alloc_contig(npages, low, high,
 			    alignment, boundary);
 	} else {
-		mtx_unlock(&vm_page_queue_free_mtx);
-		atomic_add_int(&vm_pageout_deficit, npages);
-		pagedaemon_wakeup();
+		if (vm_page_alloc_fail(object, req))
+			goto again;
 		return (NULL);
 	}
-	if (m_ret != NULL)
-		for (m = m_ret; m < &m_ret[npages]; m++) {
-			drop = vm_page_alloc_init(m);
-			if (drop != NULL) {
-				/*
-				 * Enqueue the vnode for deferred vdrop().
-				 */
-				m->plinks.s.pv = drop;
-				SLIST_INSERT_HEAD(&deferred_vdrop_list, m,
-				    plinks.s.ss);
-			}
-		}
-	else {
+	if (m_ret != NULL) {
+		vm_phys_freecnt_adj(m_ret, -npages);
+		for (m = m_ret; m < &m_ret[npages]; m++)
+			if ((m->flags & PG_ZERO) != 0)
+				vm_page_zero_count--;
+	} else {
 #if VM_NRESERVLEVEL > 0
 		if (vm_reserv_reclaim_contig(npages, low, high, alignment,
 		    boundary))
@@ -1845,6 +1882,8 @@
 	mtx_unlock(&vm_page_queue_free_mtx);
 	if (m_ret == NULL)
 		return (NULL);
+	for (m = m_ret; m < &m_ret[npages]; m++)
+		vm_page_alloc_check(m);
 
 	/*
 	 * Initialize the pages.  Only the PG_ZERO flag is inherited.
@@ -1854,9 +1893,15 @@
 		flags = PG_ZERO;
 	if ((req & VM_ALLOC_NODUMP) != 0)
 		flags |= PG_NODUMP;
+	oflags = object == NULL || (object->flags & OBJ_UNMANAGED) != 0 ?
+	    VPO_UNMANAGED : 0;
+	busy_lock = VPB_UNBUSIED;
+	if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ | VM_ALLOC_SBUSY)) == 0)
+		busy_lock = VPB_SINGLE_EXCLUSIVER;
+	if ((req & VM_ALLOC_SBUSY) != 0)
+		busy_lock = VPB_SHARERS_WORD(1);
 	if ((req & VM_ALLOC_WIRED) != 0)
-		atomic_add_int(&cnt.v_wire_count, npages);
-	oflags = VPO_UNMANAGED;
+		atomic_add_int(&vm_cnt.v_wire_count, npages);
 	if (object != NULL) {
 		if (object->memattr != VM_MEMATTR_DEFAULT &&
 		    memattr == VM_MEMATTR_DEFAULT)
@@ -1865,39 +1910,37 @@
 	for (m = m_ret; m < &m_ret[npages]; m++) {
 		m->aflags = 0;
 		m->flags = (m->flags | PG_NODUMP) & flags;
-		m->busy_lock = VPB_UNBUSIED;
-		if (object != NULL) {
-			if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) == 0)
-				m->busy_lock = VPB_SINGLE_EXCLUSIVER;
-			if ((req & VM_ALLOC_SBUSY) != 0)
-				m->busy_lock = VPB_SHARERS_WORD(1);
-		}
+		m->busy_lock = busy_lock;
 		if ((req & VM_ALLOC_WIRED) != 0)
 			m->wire_count = 1;
-		/* Unmanaged pages don't use "act_count". */
+		m->act_count = 0;
 		m->oflags = oflags;
 		if (object != NULL) {
-			if (vm_page_insert(m, object, pindex)) {
-				vm_page_alloc_contig_vdrop(
-				    &deferred_vdrop_list);
-				if (vm_paging_needed())
-					pagedaemon_wakeup();
+			if (vm_page_insert_after(m, object, pindex, mpred)) {
+				pagedaemon_wakeup();
 				if ((req & VM_ALLOC_WIRED) != 0)
-					atomic_subtract_int(&cnt.v_wire_count,
-					    npages);
-				for (m_tmp = m, m = m_ret;
-				    m < &m_ret[npages]; m++) {
-					if ((req & VM_ALLOC_WIRED) != 0)
+					atomic_subtract_int(
+					    &vm_cnt.v_wire_count, npages);
+				KASSERT(m->object == NULL,
+				    ("page %p has object", m));
+				mpred = m;
+				for (m = m_ret; m < &m_ret[npages]; m++) {
+					if (m <= mpred &&
+					    (req & VM_ALLOC_WIRED) != 0)
 						m->wire_count = 0;
-					if (m >= m_tmp) {
-						m->object = NULL;
-						m->oflags |= VPO_UNMANAGED;
-					}
+					m->oflags = VPO_UNMANAGED;
 					m->busy_lock = VPB_UNBUSIED;
-					vm_page_free(m);
+					/* Don't change PG_ZERO. */
+					vm_page_free_toq(m);
 				}
+				if (req & VM_ALLOC_WAITFAIL) {
+					VM_OBJECT_WUNLOCK(object);
+					vm_radix_wait();
+					VM_OBJECT_WLOCK(object);
+				}
 				return (NULL);
 			}
+			mpred = m;
 		} else
 			m->pindex = pindex;
 		if (memattr != VM_MEMATTR_DEFAULT)
@@ -1904,63 +1947,29 @@
 			pmap_page_set_memattr(m, memattr);
 		pindex++;
 	}
-	vm_page_alloc_contig_vdrop(&deferred_vdrop_list);
-	if (vm_paging_needed())
+	if (vm_paging_needed(vm_cnt.v_free_count))
 		pagedaemon_wakeup();
 	return (m_ret);
 }
 
 /*
- * Initialize a page that has been freshly dequeued from a freelist.
- * The caller has to drop the vnode returned, if it is not NULL.
- *
- * This function may only be used to initialize unmanaged pages.
- *
- * To be called with vm_page_queue_free_mtx held.
+ * Check a page that has been freshly dequeued from a freelist.
  */
-static struct vnode *
-vm_page_alloc_init(vm_page_t m)
+static void
+vm_page_alloc_check(vm_page_t m)
 {
-	struct vnode *drop;
-	vm_object_t m_object;
 
+	KASSERT(m->object == NULL, ("page %p has object", m));
 	KASSERT(m->queue == PQ_NONE,
-	    ("vm_page_alloc_init: page %p has unexpected queue %d",
-	    m, m->queue));
-	KASSERT(m->wire_count == 0,
-	    ("vm_page_alloc_init: page %p is wired", m));
-	KASSERT(m->hold_count == 0,
-	    ("vm_page_alloc_init: page %p is held", m));
-	KASSERT(!vm_page_busied(m),
-	    ("vm_page_alloc_init: page %p is busy", m));
-	KASSERT(m->dirty == 0,
-	    ("vm_page_alloc_init: page %p is dirty", m));
+	    ("page %p has unexpected queue %d", m, m->queue));
+	KASSERT(m->wire_count == 0, ("page %p is wired", m));
+	KASSERT(m->hold_count == 0, ("page %p is held", m));
+	KASSERT(!vm_page_busied(m), ("page %p is busy", m));
+	KASSERT(m->dirty == 0, ("page %p is dirty", m));
 	KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT,
-	    ("vm_page_alloc_init: page %p has unexpected memattr %d",
+	    ("page %p has unexpected memattr %d",
 	    m, pmap_page_get_memattr(m)));
-	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
-	drop = NULL;
-	if ((m->flags & PG_CACHED) != 0) {
-		KASSERT((m->flags & PG_ZERO) == 0,
-		    ("vm_page_alloc_init: cached page %p is PG_ZERO", m));
-		m->valid = 0;
-		m_object = m->object;
-		vm_page_cache_remove(m);
-		if (m_object->type == OBJT_VNODE &&
-		    vm_object_cache_is_empty(m_object))
-			drop = m_object->handle;
-	} else {
-		KASSERT(VM_PAGE_IS_FREE(m),
-		    ("vm_page_alloc_init: page %p is not free", m));
-		KASSERT(m->valid == 0,
-		    ("vm_page_alloc_init: free page %p is valid", m));
-		vm_phys_freecnt_adj(m, -1);
-		if ((m->flags & PG_ZERO) != 0)
-			vm_page_zero_count--;
-	}
-	/* Don't clear the PG_ZERO flag; we'll need it later. */
-	m->flags &= PG_ZERO;
-	return (drop);
+	KASSERT(m->valid == 0, ("free page %p is valid", m));
 }
 
 /*
@@ -1986,9 +1995,8 @@
 vm_page_t
 vm_page_alloc_freelist(int flind, int req)
 {
-	struct vnode *drop;
 	vm_page_t m;
-	u_int flags;
+	u_int flags, free_count;
 	int req_class;
 
 	req_class = req & VM_ALLOC_CLASS_MASK;
@@ -2002,18 +2010,17 @@
 	/*
 	 * Do not allocate reserved pages unless the req has asked for it.
 	 */
-	mtx_lock_flags(&vm_page_queue_free_mtx, MTX_RECURSE);
-	if (cnt.v_free_count + cnt.v_cache_count > cnt.v_free_reserved ||
+again:
+	mtx_lock(&vm_page_queue_free_mtx);
+	if (vm_cnt.v_free_count > vm_cnt.v_free_reserved ||
 	    (req_class == VM_ALLOC_SYSTEM &&
-	    cnt.v_free_count + cnt.v_cache_count > cnt.v_interrupt_free_min) ||
+	    vm_cnt.v_free_count > vm_cnt.v_interrupt_free_min) ||
 	    (req_class == VM_ALLOC_INTERRUPT &&
-	    cnt.v_free_count + cnt.v_cache_count > 0))
+	    vm_cnt.v_free_count > 0)) {
 		m = vm_phys_alloc_freelist_pages(flind, VM_FREEPOOL_DIRECT, 0);
-	else {
-		mtx_unlock(&vm_page_queue_free_mtx);
-		atomic_add_int(&vm_pageout_deficit,
-		    max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1));
-		pagedaemon_wakeup();
+	} else {
+		if (vm_page_alloc_fail(NULL, req))
+			goto again;
 		return (NULL);
 	}
 	if (m == NULL) {
@@ -2020,8 +2027,11 @@
 		mtx_unlock(&vm_page_queue_free_mtx);
 		return (NULL);
 	}
-	drop = vm_page_alloc_init(m);
+	free_count = vm_phys_freecnt_adj(m, -1);
+	if ((m->flags & PG_ZERO) != 0)
+		vm_page_zero_count--;
 	mtx_unlock(&vm_page_queue_free_mtx);
+	vm_page_alloc_check(m);
 
 	/*
 	 * Initialize the page.  Only the PG_ZERO flag is inherited.
@@ -2036,44 +2046,602 @@
 		 * The page lock is not required for wiring a page that does
 		 * not belong to an object.
 		 */
-		atomic_add_int(&cnt.v_wire_count, 1);
+		atomic_add_int(&vm_cnt.v_wire_count, 1);
 		m->wire_count = 1;
 	}
 	/* Unmanaged pages don't use "act_count". */
 	m->oflags = VPO_UNMANAGED;
-	if (drop != NULL)
-		vdrop(drop);
-	if (vm_paging_needed())
+	if (vm_paging_needed(free_count))
 		pagedaemon_wakeup();
 	return (m);
 }
 
+#define	VPSC_ANY	0	/* No restrictions. */
+#define	VPSC_NORESERV	1	/* Skip reservations; implies VPSC_NOSUPER. */
+#define	VPSC_NOSUPER	2	/* Skip superpages. */
+
 /*
+ *	vm_page_scan_contig:
+ *
+ *	Scan vm_page_array[] between the specified entries "m_start" and
+ *	"m_end" for a run of contiguous physical pages that satisfy the
+ *	specified conditions, and return the lowest page in the run.  The
+ *	specified "alignment" determines the alignment of the lowest physical
+ *	page in the run.  If the specified "boundary" is non-zero, then the
+ *	run of physical pages cannot span a physical address that is a
+ *	multiple of "boundary".
+ *
+ *	"m_end" is never dereferenced, so it need not point to a vm_page
+ *	structure within vm_page_array[].
+ *
+ *	"npages" must be greater than zero.  "m_start" and "m_end" must not
+ *	span a hole (or discontiguity) in the physical address space.  Both
+ *	"alignment" and "boundary" must be a power of two.
+ */
+vm_page_t
+vm_page_scan_contig(u_long npages, vm_page_t m_start, vm_page_t m_end,
+    u_long alignment, vm_paddr_t boundary, int options)
+{
+	struct mtx *m_mtx;
+	vm_object_t object;
+	vm_paddr_t pa;
+	vm_page_t m, m_run;
+#if VM_NRESERVLEVEL > 0
+	int level;
+#endif
+	int m_inc, order, run_ext, run_len;
+
+	KASSERT(npages > 0, ("npages is 0"));
+	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
+	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
+	m_run = NULL;
+	run_len = 0;
+	m_mtx = NULL;
+	for (m = m_start; m < m_end && run_len < npages; m += m_inc) {
+		KASSERT((m->flags & PG_MARKER) == 0,
+		    ("page %p is PG_MARKER", m));
+		KASSERT((m->flags & PG_FICTITIOUS) == 0 || m->wire_count == 1,
+		    ("fictitious page %p has invalid wire count", m));
+
+		/*
+		 * If the current page would be the start of a run, check its
+		 * physical address against the end, alignment, and boundary
+		 * conditions.  If it doesn't satisfy these conditions, either
+		 * terminate the scan or advance to the next page that
+		 * satisfies the failed condition.
+		 */
+		if (run_len == 0) {
+			KASSERT(m_run == NULL, ("m_run != NULL"));
+			if (m + npages > m_end)
+				break;
+			pa = VM_PAGE_TO_PHYS(m);
+			if ((pa & (alignment - 1)) != 0) {
+				m_inc = atop(roundup2(pa, alignment) - pa);
+				continue;
+			}
+			if (rounddown2(pa ^ (pa + ptoa(npages) - 1),
+			    boundary) != 0) {
+				m_inc = atop(roundup2(pa, boundary) - pa);
+				continue;
+			}
+		} else
+			KASSERT(m_run != NULL, ("m_run == NULL"));
+
+		vm_page_change_lock(m, &m_mtx);
+		m_inc = 1;
+retry:
+		if (m->wire_count != 0 || m->hold_count != 0)
+			run_ext = 0;
+#if VM_NRESERVLEVEL > 0
+		else if ((level = vm_reserv_level(m)) >= 0 &&
+		    (options & VPSC_NORESERV) != 0) {
+			run_ext = 0;
+			/* Advance to the end of the reservation. */
+			pa = VM_PAGE_TO_PHYS(m);
+			m_inc = atop(roundup2(pa + 1, vm_reserv_size(level)) -
+			    pa);
+		}
+#endif
+		else if ((object = m->object) != NULL) {
+			/*
+			 * The page is considered eligible for relocation if
+			 * and only if it could be laundered or reclaimed by
+			 * the page daemon.
+			 */
+			if (!VM_OBJECT_TRYRLOCK(object)) {
+				mtx_unlock(m_mtx);
+				VM_OBJECT_RLOCK(object);
+				mtx_lock(m_mtx);
+				if (m->object != object) {
+					/*
+					 * The page may have been freed.
+					 */
+					VM_OBJECT_RUNLOCK(object);
+					goto retry;
+				} else if (m->wire_count != 0 ||
+				    m->hold_count != 0) {
+					run_ext = 0;
+					goto unlock;
+				}
+			}
+			KASSERT((m->flags & PG_UNHOLDFREE) == 0,
+			    ("page %p is PG_UNHOLDFREE", m));
+			/* Don't care: PG_NODUMP, PG_ZERO. */
+			if (object->type != OBJT_DEFAULT &&
+			    object->type != OBJT_SWAP &&
+			    object->type != OBJT_VNODE) {
+				run_ext = 0;
+#if VM_NRESERVLEVEL > 0
+			} else if ((options & VPSC_NOSUPER) != 0 &&
+			    (level = vm_reserv_level_iffullpop(m)) >= 0) {
+				run_ext = 0;
+				/* Advance to the end of the superpage. */
+				pa = VM_PAGE_TO_PHYS(m);
+				m_inc = atop(roundup2(pa + 1,
+				    vm_reserv_size(level)) - pa);
+#endif
+			} else if (object->memattr == VM_MEMATTR_DEFAULT &&
+			    m->queue != PQ_NONE && !vm_page_busied(m)) {
+				/*
+				 * The page is allocated but eligible for
+				 * relocation.  Extend the current run by one
+				 * page.
+				 */
+				KASSERT(pmap_page_get_memattr(m) ==
+				    VM_MEMATTR_DEFAULT,
+				    ("page %p has an unexpected memattr", m));
+				KASSERT((m->oflags & (VPO_SWAPINPROG |
+				    VPO_SWAPSLEEP | VPO_UNMANAGED)) == 0,
+				    ("page %p has unexpected oflags", m));
+				/* Don't care: VPO_NOSYNC. */
+				run_ext = 1;
+			} else
+				run_ext = 0;
+unlock:
+			VM_OBJECT_RUNLOCK(object);
+#if VM_NRESERVLEVEL > 0
+		} else if (level >= 0) {
+			/*
+			 * The page is reserved but not yet allocated.  In
+			 * other words, it is still free.  Extend the current
+			 * run by one page.
+			 */
+			run_ext = 1;
+#endif
+		} else if ((order = m->order) < VM_NFREEORDER) {
+			/*
+			 * The page is enqueued in the physical memory
+			 * allocator's free page queues.  Moreover, it is the
+			 * first page in a power-of-two-sized run of
+			 * contiguous free pages.  Add these pages to the end
+			 * of the current run, and jump ahead.
+			 */
+			run_ext = 1 << order;
+			m_inc = 1 << order;
+		} else {
+			/*
+			 * Skip the page for one of the following reasons: (1)
+			 * It is enqueued in the physical memory allocator's
+			 * free page queues.  However, it is not the first
+			 * page in a run of contiguous free pages.  (This case
+			 * rarely occurs because the scan is performed in
+			 * ascending order.) (2) It is not reserved, and it is
+			 * transitioning from free to allocated.  (Conversely,
+			 * the transition from allocated to free for managed
+			 * pages is blocked by the page lock.) (3) It is
+			 * allocated but not contained by an object and not
+			 * wired, e.g., allocated by Xen's balloon driver.
+			 */
+			run_ext = 0;
+		}
+
+		/*
+		 * Extend or reset the current run of pages.
+		 */
+		if (run_ext > 0) {
+			if (run_len == 0)
+				m_run = m;
+			run_len += run_ext;
+		} else {
+			if (run_len > 0) {
+				m_run = NULL;
+				run_len = 0;
+			}
+		}
+	}
+	if (m_mtx != NULL)
+		mtx_unlock(m_mtx);
+	if (run_len >= npages)
+		return (m_run);
+	return (NULL);
+}
+
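The boundary screening near the top of the scan uses a compact XOR test: the first and last byte addresses of a candidate run differ in some bit at or above log2(boundary) exactly when the run would span a multiple of "boundary". A standalone sketch with made-up numbers (not kernel code; the 4 KB page size and sample addresses are assumptions):

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SZ          4096UL
    #define rounddown2(x, y) ((x) & ~((y) - 1))     /* y must be a power of 2 */

    int
    main(void)
    {
            uint64_t pa = 0xf000, npages = 4, boundary = 0x10000;
            uint64_t last = pa + npages * PAGE_SZ - 1;      /* 0x12fff */

            /* Non-zero result: the run crosses the 64 KB boundary at 0x10000. */
            if (rounddown2(pa ^ last, boundary) != 0)
                    printf("run at %#jx crosses a %#jx boundary\n",
                        (uintmax_t)pa, (uintmax_t)boundary);
            return (0);
    }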
+/*
+ *	vm_page_reclaim_run:
+ *
+ *	Try to relocate each of the allocated virtual pages within the
+ *	specified run of physical pages to a new physical address.  Free the
+ *	physical pages underlying the relocated virtual pages.  A virtual page
+ *	is relocatable if and only if it could be laundered or reclaimed by
+ *	the page daemon.  Whenever possible, a virtual page is relocated to a
+ *	physical address above "high".
+ *
+ *	Returns 0 if every physical page within the run was already free or
+ *	just freed by a successful relocation.  Otherwise, returns a non-zero
+ *	value indicating why the last attempt to relocate a virtual page was
+ *	unsuccessful.
+ *
+ *	"req_class" must be an allocation class.
+ */
+static int
+vm_page_reclaim_run(int req_class, u_long npages, vm_page_t m_run,
+    vm_paddr_t high)
+{
+	struct mtx *m_mtx;
+	struct spglist free;
+	vm_object_t object;
+	vm_paddr_t pa;
+	vm_page_t m, m_end, m_new;
+	int error, order, req;
+
+	KASSERT((req_class & VM_ALLOC_CLASS_MASK) == req_class,
+	    ("req_class is not an allocation class"));
+	SLIST_INIT(&free);
+	error = 0;
+	m = m_run;
+	m_end = m_run + npages;
+	m_mtx = NULL;
+	for (; error == 0 && m < m_end; m++) {
+		KASSERT((m->flags & (PG_FICTITIOUS | PG_MARKER)) == 0,
+		    ("page %p is PG_FICTITIOUS or PG_MARKER", m));
+
+		/*
+		 * Avoid releasing and reacquiring the same page lock.
+		 */
+		vm_page_change_lock(m, &m_mtx);
+retry:
+		if (m->wire_count != 0 || m->hold_count != 0)
+			error = EBUSY;
+		else if ((object = m->object) != NULL) {
+			/*
+			 * The page is relocated if and only if it could be
+			 * laundered or reclaimed by the page daemon.
+			 */
+			if (!VM_OBJECT_TRYWLOCK(object)) {
+				mtx_unlock(m_mtx);
+				VM_OBJECT_WLOCK(object);
+				mtx_lock(m_mtx);
+				if (m->object != object) {
+					/*
+					 * The page may have been freed.
+					 */
+					VM_OBJECT_WUNLOCK(object);
+					goto retry;
+				} else if (m->wire_count != 0 ||
+				    m->hold_count != 0) {
+					error = EBUSY;
+					goto unlock;
+				}
+			}
+			KASSERT((m->flags & PG_UNHOLDFREE) == 0,
+			    ("page %p is PG_UNHOLDFREE", m));
+			/* Don't care: PG_NODUMP, PG_ZERO. */
+			if (object->type != OBJT_DEFAULT &&
+			    object->type != OBJT_SWAP &&
+			    object->type != OBJT_VNODE)
+				error = EINVAL;
+			else if (object->memattr != VM_MEMATTR_DEFAULT)
+				error = EINVAL;
+			else if (m->queue != PQ_NONE && !vm_page_busied(m)) {
+				KASSERT(pmap_page_get_memattr(m) ==
+				    VM_MEMATTR_DEFAULT,
+				    ("page %p has an unexpected memattr", m));
+				KASSERT((m->oflags & (VPO_SWAPINPROG |
+				    VPO_SWAPSLEEP | VPO_UNMANAGED)) == 0,
+				    ("page %p has unexpected oflags", m));
+				/* Don't care: VPO_NOSYNC. */
+				if (m->valid != 0) {
+					/*
+					 * First, try to allocate a new page
+					 * that is above "high".  Failing
+					 * that, try to allocate a new page
+					 * that is below "m_run".  Allocate
+					 * the new page between the end of
+					 * "m_run" and "high" only as a last
+					 * resort.
+					 */
+					req = req_class | VM_ALLOC_NOOBJ;
+					if ((m->flags & PG_NODUMP) != 0)
+						req |= VM_ALLOC_NODUMP;
+					if (trunc_page(high) !=
+					    ~(vm_paddr_t)PAGE_MASK) {
+						m_new = vm_page_alloc_contig(
+						    NULL, 0, req, 1,
+						    round_page(high),
+						    ~(vm_paddr_t)0,
+						    PAGE_SIZE, 0,
+						    VM_MEMATTR_DEFAULT);
+					} else
+						m_new = NULL;
+					if (m_new == NULL) {
+						pa = VM_PAGE_TO_PHYS(m_run);
+						m_new = vm_page_alloc_contig(
+						    NULL, 0, req, 1,
+						    0, pa - 1, PAGE_SIZE, 0,
+						    VM_MEMATTR_DEFAULT);
+					}
+					if (m_new == NULL) {
+						pa += ptoa(npages);
+						m_new = vm_page_alloc_contig(
+						    NULL, 0, req, 1,
+						    pa, high, PAGE_SIZE, 0,
+						    VM_MEMATTR_DEFAULT);
+					}
+					if (m_new == NULL) {
+						error = ENOMEM;
+						goto unlock;
+					}
+					KASSERT(m_new->wire_count == 0,
+					    ("page %p is wired", m_new));
+
+					/*
+					 * Replace "m" with the new page.  For
+					 * vm_page_replace(), "m" must be busy
+					 * and dequeued.  Finally, change "m"
+					 * as if vm_page_free() was called.
+					 */
+					if (object->ref_count != 0)
+						pmap_remove_all(m);
+					m_new->aflags = m->aflags;
+					KASSERT(m_new->oflags == VPO_UNMANAGED,
+					    ("page %p is managed", m_new));
+					m_new->oflags = m->oflags & VPO_NOSYNC;
+					pmap_copy_page(m, m_new);
+					m_new->valid = m->valid;
+					m_new->dirty = m->dirty;
+					m->flags &= ~PG_ZERO;
+					vm_page_xbusy(m);
+					vm_page_remque(m);
+					vm_page_replace_checked(m_new, object,
+					    m->pindex, m);
+					m->valid = 0;
+					vm_page_undirty(m);
+
+					/*
+					 * The new page must be deactivated
+					 * before the object is unlocked.
+					 */
+					vm_page_change_lock(m_new, &m_mtx);
+					vm_page_deactivate(m_new);
+				} else {
+					m->flags &= ~PG_ZERO;
+					vm_page_remque(m);
+					vm_page_remove(m);
+					KASSERT(m->dirty == 0,
+					    ("page %p is dirty", m));
+				}
+				SLIST_INSERT_HEAD(&free, m, plinks.s.ss);
+			} else
+				error = EBUSY;
+unlock:
+			VM_OBJECT_WUNLOCK(object);
+		} else {
+			mtx_lock(&vm_page_queue_free_mtx);
+			order = m->order;
+			if (order < VM_NFREEORDER) {
+				/*
+				 * The page is enqueued in the physical memory
+				 * allocator's free page queues.  Moreover, it
+				 * is the first page in a power-of-two-sized
+				 * run of contiguous free pages.  Jump ahead
+				 * to the last page within that run, and
+				 * continue from there.
+				 */
+				m += (1 << order) - 1;
+			}
+#if VM_NRESERVLEVEL > 0
+			else if (vm_reserv_is_page_free(m))
+				order = 0;
+#endif
+			mtx_unlock(&vm_page_queue_free_mtx);
+			if (order == VM_NFREEORDER)
+				error = EINVAL;
+		}
+	}
+	if (m_mtx != NULL)
+		mtx_unlock(m_mtx);
+	if ((m = SLIST_FIRST(&free)) != NULL) {
+		mtx_lock(&vm_page_queue_free_mtx);
+		do {
+			SLIST_REMOVE_HEAD(&free, plinks.s.ss);
+			vm_page_free_phys(m);
+		} while ((m = SLIST_FIRST(&free)) != NULL);
+		vm_page_zero_idle_wakeup();
+		vm_page_free_wakeup();
+		mtx_unlock(&vm_page_queue_free_mtx);
+	}
+	return (error);
+}
+
+#define	NRUNS	16
+
+CTASSERT(powerof2(NRUNS));
+
+#define	RUN_INDEX(count)	((count) & (NRUNS - 1))
+
+#define	MIN_RECLAIM	8
+
+/*
+ *	vm_page_reclaim_contig:
+ *
+ *	Reclaim allocated, contiguous physical memory satisfying the specified
+ *	conditions by relocating the virtual pages using that physical memory.
+ *	Returns true if reclamation is successful and false otherwise.  Since
+ *	relocation requires the allocation of physical pages, reclamation may
+ *	fail due to a shortage of free pages.  When reclamation fails, callers
+ *	are expected to perform VM_WAIT before retrying a failed allocation
+ *	operation, e.g., vm_page_alloc_contig().
+ *
+ *	The caller must always specify an allocation class through "req".
+ *
+ *	allocation classes:
+ *	VM_ALLOC_NORMAL		normal process request
+ *	VM_ALLOC_SYSTEM		system *really* needs a page
+ *	VM_ALLOC_INTERRUPT	interrupt time request
+ *
+ *	The optional allocation flags are ignored.
+ *
+ *	"npages" must be greater than zero.  Both "alignment" and "boundary"
+ *	must be a power of two.
+ */
+bool
+vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high,
+    u_long alignment, vm_paddr_t boundary)
+{
+	vm_paddr_t curr_low;
+	vm_page_t m_run, m_runs[NRUNS];
+	u_long count, reclaimed;
+	int error, i, options, req_class;
+
+	KASSERT(npages > 0, ("npages is 0"));
+	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
+	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
+	req_class = req & VM_ALLOC_CLASS_MASK;
+
+	/*
+	 * The page daemon is allowed to dig deeper into the free page list.
+	 */
+	if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
+		req_class = VM_ALLOC_SYSTEM;
+
+	/*
+	 * Return if the number of free pages cannot satisfy the requested
+	 * allocation.
+	 */
+	count = vm_cnt.v_free_count;
+	if (count < npages + vm_cnt.v_free_reserved || (count < npages +
+	    vm_cnt.v_interrupt_free_min && req_class == VM_ALLOC_SYSTEM) ||
+	    (count < npages && req_class == VM_ALLOC_INTERRUPT))
+		return (false);
+
+	/*
+	 * Scan up to three times, relaxing the restrictions ("options") on
+	 * the reclamation of reservations and superpages each time.
+	 */
+	for (options = VPSC_NORESERV;;) {
+		/*
+		 * Find the highest runs that satisfy the given constraints
+		 * and restrictions, and record them in "m_runs".
+		 */
+		curr_low = low;
+		count = 0;
+		for (;;) {
+			m_run = vm_phys_scan_contig(npages, curr_low, high,
+			    alignment, boundary, options);
+			if (m_run == NULL)
+				break;
+			curr_low = VM_PAGE_TO_PHYS(m_run) + ptoa(npages);
+			m_runs[RUN_INDEX(count)] = m_run;
+			count++;
+		}
+
+		/*
+		 * Reclaim the highest runs in LIFO (descending) order until
+		 * the number of reclaimed pages, "reclaimed", is at least
+		 * MIN_RECLAIM.  Reset "reclaimed" each time because each
+		 * reclamation is idempotent, and runs will (likely) recur
+		 * from one scan to the next as restrictions are relaxed.
+		 */
+		reclaimed = 0;
+		for (i = 0; count > 0 && i < NRUNS; i++) {
+			count--;
+			m_run = m_runs[RUN_INDEX(count)];
+			error = vm_page_reclaim_run(req_class, npages, m_run,
+			    high);
+			if (error == 0) {
+				reclaimed += npages;
+				if (reclaimed >= MIN_RECLAIM)
+					return (true);
+			}
+		}
+
+		/*
+		 * Either relax the restrictions on the next scan or return if
+		 * the last scan had no restrictions.
+		 */
+		if (options == VPSC_NORESERV)
+			options = VPSC_NOSUPER;
+		else if (options == VPSC_NOSUPER)
+			options = VPSC_ANY;
+		else if (options == VPSC_ANY)
+			return (reclaimed != 0);
+	}
+}
+
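As the comment above notes, a failed reclamation is expected to be followed by VM_WAIT and a retry of the original allocation. A hedged sketch of that caller-side loop; the wrapper function and its parameter list are illustrative and assume kernel context, and are not part of this change:

    static vm_page_t
    alloc_contig_retry(vm_object_t obj, vm_pindex_t pindex, int req,
        u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
        vm_paddr_t boundary, vm_memattr_t memattr)
    {
            vm_page_t m;

            for (;;) {
                    m = vm_page_alloc_contig(obj, pindex, req, npages, low,
                        high, alignment, boundary, memattr);
                    if (m != NULL)
                            return (m);
                    /* Make room if possible; otherwise sleep for free pages. */
                    if (!vm_page_reclaim_contig(req, npages, low, high,
                        alignment, boundary))
                            VM_WAIT;
            }
    }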
+/*
  *	vm_wait:	(also see VM_WAIT macro)
  *
  *	Sleep until free pages are available for allocation.
  *	- Called in various places before memory allocations.
  */
-void
-vm_wait(void)
+static void
+_vm_wait(void)
 {
 
-	mtx_lock(&vm_page_queue_free_mtx);
+	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 	if (curproc == pageproc) {
 		vm_pageout_pages_needed = 1;
 		msleep(&vm_pageout_pages_needed, &vm_page_queue_free_mtx,
 		    PDROP | PSWP, "VMWait", 0);
 	} else {
-		if (!vm_pages_needed) {
-			vm_pages_needed = 1;
-			wakeup(&vm_pages_needed);
-		}
-		msleep(&cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PVM,
-		    "vmwait", 0);
+		if (pageproc == NULL)
+			panic("vm_wait in early boot");
+		pagedaemon_wait(PVM, "vmwait");
 	}
 }
 
+void
+vm_wait(void)
+{
+
+	mtx_lock(&vm_page_queue_free_mtx);
+	_vm_wait();
+}
+
 /*
+ *	vm_page_alloc_fail:
+ *
+ *	Called when a page allocation function fails.  Informs the
+ *	pagedaemon and performs the requested wait.  Requires the free page
+ *	queue lock and, if an object was given, the object lock on entry.
+ *	Returns with the object lock still held (when given) and the free
+ *	page queue lock released.  Returns nonzero when the caller should
+ *	retry the allocation.
+ */
+static int
+vm_page_alloc_fail(vm_object_t object, int req)
+{
+
+	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+
+	atomic_add_int(&vm_pageout_deficit,
+	    max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1));
+	if (req & (VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) {
+		if (object != NULL) 
+			VM_OBJECT_WUNLOCK(object);
+		_vm_wait();
+		if (object != NULL) 
+			VM_OBJECT_WLOCK(object);
+		if (req & VM_ALLOC_WAITOK)
+			return (EAGAIN);
+	} else {
+		mtx_unlock(&vm_page_queue_free_mtx);
+		pagedaemon_wakeup();
+	}
+	return (0);
+}
+
+/*
  *	vm_waitpfault:	(also see VM_WAITPFAULT macro)
  *
  *	Sleep until free pages are available for allocation.
@@ -2088,12 +2656,7 @@
 {
 
 	mtx_lock(&vm_page_queue_free_mtx);
-	if (!vm_pages_needed) {
-		vm_pages_needed = 1;
-		wakeup(&vm_pages_needed);
-	}
-	msleep(&cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PUSER,
-	    "pfault", 0);
+	pagedaemon_wait(PUSER, "pfault");
 }
 
 struct vm_pagequeue *
@@ -2100,7 +2663,10 @@
 vm_page_pagequeue(vm_page_t m)
 {
 
-	return (&vm_phys_domain(m)->vmd_pagequeues[m->queue]);
+	if (vm_page_in_laundry(m))
+		return (&vm_dom[0].vmd_pagequeues[m->queue]);
+	else
+		return (&vm_phys_domain(m)->vmd_pagequeues[m->queue]);
 }
 
 /*
@@ -2115,9 +2681,9 @@
 {
 	struct vm_pagequeue *pq;
 
-	vm_page_lock_assert(m, MA_OWNED);
-	KASSERT(m->queue != PQ_NONE,
-	    ("vm_page_dequeue: page %p is not queued", m));
+	vm_page_assert_locked(m);
+	KASSERT(m->queue < PQ_COUNT, ("vm_page_dequeue: page %p is not queued",
+	    m));
 	pq = vm_page_pagequeue(m);
 	vm_pagequeue_lock(pq);
 	m->queue = PQ_NONE;
@@ -2154,12 +2720,18 @@
  *	The page must be locked.
  */
 static void
-vm_page_enqueue(int queue, vm_page_t m)
+vm_page_enqueue(uint8_t queue, vm_page_t m)
 {
 	struct vm_pagequeue *pq;
 
 	vm_page_lock_assert(m, MA_OWNED);
-	pq = &vm_phys_domain(m)->vmd_pagequeues[queue];
+	KASSERT(queue < PQ_COUNT,
+	    ("vm_page_enqueue: invalid queue %u request for page %p",
+	    queue, m));
+	if (queue == PQ_LAUNDRY)
+		pq = &vm_dom[0].vmd_pagequeues[queue];
+	else
+		pq = &vm_phys_domain(m)->vmd_pagequeues[queue];
 	vm_pagequeue_lock(pq);
 	m->queue = queue;
 	TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
@@ -2243,13 +2815,12 @@
 /*
  *	vm_page_free_wakeup:
  *
- *	Helper routine for vm_page_free_toq() and vm_page_cache().  This
- *	routine is called when a page has been added to the cache or free
- *	queues.
+ *	Helper routine for vm_page_free_toq().  This routine is called
+ *	when a page is added to the free queues.
  *
  *	The page queues must be locked.
  */
-static inline void
+static void
 vm_page_free_wakeup(void)
 {
 
@@ -2259,7 +2830,7 @@
 	 * some free.
 	 */
 	if (vm_pageout_pages_needed &&
-	    cnt.v_cache_count + cnt.v_free_count >= cnt.v_pageout_free_min) {
+	    vm_cnt.v_free_count >= vm_cnt.v_pageout_free_min) {
 		wakeup(&vm_pageout_pages_needed);
 		vm_pageout_pages_needed = 0;
 	}
@@ -2269,45 +2840,36 @@
 	 * lots of memory. this process will swapin processes.
 	 */
 	if (vm_pages_needed && !vm_page_count_min()) {
-		vm_pages_needed = 0;
-		wakeup(&cnt.v_free_count);
+		vm_pages_needed = false;
+		wakeup(&vm_cnt.v_free_count);
 	}
 }
 
 /*
- *	Turn a cached page into a free page, by changing its attributes.
- *	Keep the statistics up-to-date.
+ *	vm_page_free_prep:
  *
- *	The free page queue must be locked.
- */
-static void
-vm_page_cache_turn_free(vm_page_t m)
-{
-
-	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
-
-	m->object = NULL;
-	m->valid = 0;
-	/* Clear PG_CACHED and set PG_FREE. */
-	m->flags ^= PG_CACHED | PG_FREE;
-	KASSERT((m->flags & (PG_CACHED | PG_FREE)) == PG_FREE,
-	    ("vm_page_cache_free: page %p has inconsistent flags", m));
-	cnt.v_cache_count--;
-	vm_phys_freecnt_adj(m, 1);
-}
-
-/*
- *	vm_page_free_toq:
+ *	Prepares the given page to be put on the free list,
+ *	disassociating it from any VM object. The caller may return
+ *	the page to the free list only if this function returns true.
  *
- *	Returns the given page to the free list,
- *	disassociating it with any VM object.
- *
- *	The object must be locked.  The page must be locked if it is managed.
+ *	The object must be locked.  The page must be locked if it is
+ *	managed.  For a queued managed page, the pagequeue_locked
+ *	argument specifies whether the page queue is already locked.
  */
-void
-vm_page_free_toq(vm_page_t m)
+bool
+vm_page_free_prep(vm_page_t m, bool pagequeue_locked)
 {
 
+#if defined(DIAGNOSTIC) && defined(PHYS_TO_DMAP)
+	if ((m->flags & PG_ZERO) != 0) {
+		uint64_t *p;
+		int i;
+		p = (uint64_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
+		for (i = 0; i < PAGE_SIZE / sizeof(uint64_t); i++, p++)
+			KASSERT(*p == 0, ("vm_page_free_prep %p PG_ZERO %d %jx",
+			    m, i, (uintmax_t)*p));
+	}
+#endif
 	if ((m->oflags & VPO_UNMANAGED) == 0) {
 		vm_page_lock_assert(m, MA_OWNED);
 		KASSERT(!pmap_page_is_mapped(m),
@@ -2317,9 +2879,7 @@
 		    ("vm_page_free_toq: unmanaged page %p is queued", m));
 	PCPU_INC(cnt.v_tfree);
 
-	if (VM_PAGE_IS_FREE(m))
-		panic("vm_page_free: freeing free page %p", m);
-	else if (vm_page_sbusied(m))
+	if (vm_page_sbusied(m))
 		panic("vm_page_free: freeing busy page %p", m);
 
 	/*
@@ -2328,7 +2888,12 @@
 	 * callback routine until after we've put the page on the
 	 * appropriate free queue.
 	 */
-	vm_page_remque(m);
+	if (m->queue != PQ_NONE) {
+		if (pagequeue_locked)
+			vm_page_dequeue_locked(m);
+		else
+			vm_page_dequeue(m);
+	}
 	vm_page_remove(m);
 
 	/*
@@ -2335,9 +2900,8 @@
 	 * If fictitious remove object association and
 	 * return, otherwise delay object association removal.
 	 */
-	if ((m->flags & PG_FICTITIOUS) != 0) {
-		return;
-	}
+	if ((m->flags & PG_FICTITIOUS) != 0)
+		return (false);
 
 	m->valid = 0;
 	vm_page_undirty(m);
@@ -2349,36 +2913,75 @@
 		KASSERT((m->flags & PG_UNHOLDFREE) == 0,
 		    ("vm_page_free: freeing PG_UNHOLDFREE page %p", m));
 		m->flags |= PG_UNHOLDFREE;
-	} else {
-		/*
-		 * Restore the default memory attribute to the page.
-		 */
-		if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT)
-			pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT);
+		return (false);
+	}
 
-		/*
-		 * Insert the page into the physical memory allocator's
-		 * cache/free page queues.
-		 */
-		mtx_lock(&vm_page_queue_free_mtx);
-		m->flags |= PG_FREE;
-		vm_phys_freecnt_adj(m, 1);
+	/*
+	 * Restore the default memory attribute to the page.
+	 */
+	if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT)
+		pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT);
+
+	return (true);
+}
+
+/*
+ * Insert the page into the physical memory allocator's free page
+ * queues.  This is the last step to free a page.
+ */
+static void
+vm_page_free_phys(vm_page_t m)
+{
+
+	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+
+	vm_phys_freecnt_adj(m, 1);
 #if VM_NRESERVLEVEL > 0
-		if (!vm_reserv_free_page(m))
-#else
-		if (TRUE)
+	if (!vm_reserv_free_page(m))
 #endif
 			vm_phys_free_pages(m, 0);
-		if ((m->flags & PG_ZERO) != 0)
-			++vm_page_zero_count;
-		else
-			vm_page_zero_idle_wakeup();
-		vm_page_free_wakeup();
-		mtx_unlock(&vm_page_queue_free_mtx);
-	}
+	if ((m->flags & PG_ZERO) != 0)
+		++vm_page_zero_count;
+	else
+		vm_page_zero_idle_wakeup();
 }
 
+void
+vm_page_free_phys_pglist(struct pglist *tq)
+{
+	vm_page_t m;
+
+	if (TAILQ_EMPTY(tq))
+		return;
+	mtx_lock(&vm_page_queue_free_mtx);
+	TAILQ_FOREACH(m, tq, listq)
+		vm_page_free_phys(m);
+	vm_page_free_wakeup();
+	mtx_unlock(&vm_page_queue_free_mtx);
+}
+
 /*
+ *	vm_page_free_toq:
+ *
+ *	Returns the given page to the free list, disassociating it
+ *	from any VM object.
+ *
+ *	The object must be locked.  The page must be locked if it is
+ *	managed.
+ */
+void
+vm_page_free_toq(vm_page_t m)
+{
+
+	if (!vm_page_free_prep(m, false))
+		return;
+	mtx_lock(&vm_page_queue_free_mtx);
+	vm_page_free_phys(m);
+	vm_page_free_wakeup();
+	mtx_unlock(&vm_page_queue_free_mtx);
+}
+
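The new split between vm_page_free_prep() and vm_page_free_phys() lets a caller do the per-page work without the free page queue lock and then take that lock once for a whole pglist via vm_page_free_phys_pglist(). The userland sketch below shows only the batching shape of that design; the item/free_lock names and the pthread mutex are illustrative, and the real code does not, of course, free() pages. Build with cc -pthread.

#include <sys/queue.h>

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct item {
	TAILQ_ENTRY(item) link;
	int id;
};
TAILQ_HEAD(itemlist, item);

static pthread_mutex_t free_lock = PTHREAD_MUTEX_INITIALIZER;
static int nfreed;

/*
 * Lockless per-item preparation, in the role of vm_page_free_prep(); the
 * real function may refuse, in which case the page must not be batched.
 */
static bool
item_free_prep(struct item *it)
{

	it->id = -it->id;	/* stand-in for the per-page work */
	return (true);
}

/* Must be called with free_lock held, like vm_page_free_phys(). */
static void
item_free_locked(struct item *it)
{

	nfreed++;
	free(it);
}

/* Take the contended lock once for the whole batch. */
static void
item_free_batch(struct itemlist *batch)
{
	struct item *it;

	pthread_mutex_lock(&free_lock);
	while ((it = TAILQ_FIRST(batch)) != NULL) {
		TAILQ_REMOVE(batch, it, link);
		item_free_locked(it);
	}
	pthread_mutex_unlock(&free_lock);
}

int
main(void)
{
	struct itemlist batch;
	struct item *it;
	int i;

	TAILQ_INIT(&batch);
	for (i = 0; i < 8; i++) {
		it = malloc(sizeof(*it));
		it->id = i;
		if (item_free_prep(it))
			TAILQ_INSERT_TAIL(&batch, it, link);
	}
	item_free_batch(&batch);
	printf("freed %d items under one lock acquisition\n", nfreed);
	return (0);
}
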
+/*
  *	vm_page_wire:
  *
  *	Mark this page as wired down by yet
@@ -2410,7 +3013,7 @@
 		    m->queue == PQ_NONE,
 		    ("vm_page_wire: unmanaged page %p is queued", m));
 		vm_page_remque(m);
-		atomic_add_int(&cnt.v_wire_count, 1);
+		atomic_add_int(&vm_cnt.v_wire_count, 1);
 	}
 	m->wire_count++;
 	KASSERT(m->wire_count != 0, ("vm_page_wire: wire_count overflow m=%p", m));
@@ -2419,41 +3022,43 @@
 /*
  * vm_page_unwire:
  *
- * Release one wiring of the specified page, potentially enabling it to be
- * paged again.  If paging is enabled, then the value of the parameter
- * "activate" determines to which queue the page is added.  If "activate" is
- * non-zero, then the page is added to the active queue.  Otherwise, it is
- * added to the inactive queue.
+ * Release one wiring of the specified page, potentially allowing it to be
+ * paged out.  Returns TRUE if the number of wirings transitions to zero and
+ * FALSE otherwise.
  *
- * However, unless the page belongs to an object, it is not enqueued because
- * it cannot be paged out.
+ * Only managed pages belonging to an object can be paged out.  If the number
+ * of wirings transitions to zero and the page is eligible for page out, then
+ * the page is added to the specified paging queue (unless PQ_NONE is
+ * specified).
  *
  * If a page is fictitious, then its wire count must always be one.
  *
  * A managed page must be locked.
  */
-void
-vm_page_unwire(vm_page_t m, int activate)
+boolean_t
+vm_page_unwire(vm_page_t m, uint8_t queue)
 {
 
+	KASSERT(queue < PQ_COUNT || queue == PQ_NONE,
+	    ("vm_page_unwire: invalid queue %u request for page %p",
+	    queue, m));
 	if ((m->oflags & VPO_UNMANAGED) == 0)
-		vm_page_lock_assert(m, MA_OWNED);
+		vm_page_assert_locked(m);
 	if ((m->flags & PG_FICTITIOUS) != 0) {
 		KASSERT(m->wire_count == 1,
 	    ("vm_page_unwire: fictitious page %p's wire count isn't one", m));
-		return;
+		return (FALSE);
 	}
 	if (m->wire_count > 0) {
 		m->wire_count--;
 		if (m->wire_count == 0) {
-			atomic_subtract_int(&cnt.v_wire_count, 1);
-			if ((m->oflags & VPO_UNMANAGED) != 0 ||
-			    m->object == NULL)
-				return;
-			if (!activate)
-				m->flags &= ~PG_WINATCFLS;
-			vm_page_enqueue(activate ? PQ_ACTIVE : PQ_INACTIVE, m);
-		}
+			atomic_subtract_int(&vm_cnt.v_wire_count, 1);
+			if ((m->oflags & VPO_UNMANAGED) == 0 &&
+			    m->object != NULL && queue != PQ_NONE)
+				vm_page_enqueue(queue, m);
+			return (TRUE);
+		} else
+			return (FALSE);
 	} else
 		panic("vm_page_unwire: page %p's wire count is zero", m);
 }
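
Under the new vm_page_unwire() signature the caller names the queue that should receive a page whose last wiring is dropped, and the return value reports whether that transition happened. A toy userland model of the contract; none of these names are kernel interfaces.

#include <stdbool.h>
#include <stdio.h>

enum queue { Q_NONE, Q_INACTIVE, Q_ACTIVE, Q_LAUNDRY };

struct obj {
	unsigned int wire_count;
	enum queue queue;
};

/* Returns true iff the wiring count transitions to zero. */
static bool
obj_unwire(struct obj *o, enum queue q)
{

	if (o->wire_count == 0)
		return (false);		/* the kernel would panic here */
	if (--o->wire_count > 0)
		return (false);
	if (q != Q_NONE)
		o->queue = q;		/* make the object reclaimable */
	return (true);
}

int
main(void)
{
	struct obj o = { .wire_count = 2, .queue = Q_NONE };

	printf("first unwire: %d\n", obj_unwire(&o, Q_INACTIVE));
	printf("last unwire: %d, queue %d\n", obj_unwire(&o, Q_INACTIVE),
	    o.queue);
	return (0);
}
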
@@ -2461,25 +3066,16 @@
 /*
  * Move the specified page to the inactive queue.
  *
- * Many pages placed on the inactive queue should actually go
- * into the cache, but it is difficult to figure out which.  What
- * we do instead, if the inactive target is well met, is to put
- * clean pages at the head of the inactive queue instead of the tail.
- * This will cause them to be moved to the cache more quickly and
- * if not actively re-referenced, reclaimed more quickly.  If we just
- * stick these pages at the end of the inactive queue, heavy filesystem
- * meta-data accesses can cause an unnecessary paging load on memory bound 
- * processes.  This optimization causes one-time-use metadata to be
- * reused more quickly.
+ * Normally, "noreuse" is FALSE, resulting in LRU ordering of the inactive
+ * queue.  However, setting "noreuse" to TRUE will accelerate the specified
+ * page's reclamation, but it will not unmap the page from any address space.
+ * This is implemented by inserting the page near the head of the inactive
+ * queue, using a marker page to guide FIFO insertion ordering.
  *
- * Normally athead is 0 resulting in LRU operation.  athead is set
- * to 1 if we want this page to be 'as if it were placed in the cache',
- * except without unmapping it from the process address space.
- *
  * The page must be locked.
  */
 static inline void
-_vm_page_deactivate(vm_page_t m, int athead)
+_vm_page_deactivate(vm_page_t m, boolean_t noreuse)
 {
 	struct vm_pagequeue *pq;
 	int queue;
@@ -2490,7 +3086,7 @@
 	 * Ignore if the page is already inactive, unless it is unlikely to be
 	 * reactivated.
 	 */
-	if ((queue = m->queue) == PQ_INACTIVE && !athead)
+	if ((queue = m->queue) == PQ_INACTIVE && !noreuse)
 		return;
 	if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) {
 		pq = &vm_phys_domain(m)->vmd_pagequeues[PQ_INACTIVE];
@@ -2501,12 +3097,12 @@
 		} else {
 			if (queue != PQ_NONE)
 				vm_page_dequeue(m);
-			m->flags &= ~PG_WINATCFLS;
 			vm_pagequeue_lock(pq);
 		}
 		m->queue = PQ_INACTIVE;
-		if (athead)
-			TAILQ_INSERT_HEAD(&pq->pq_pl, m, plinks.q);
+		if (noreuse)
+			TAILQ_INSERT_BEFORE(&vm_phys_domain(m)->vmd_inacthead,
+			    m, plinks.q);
 		else
 			TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
 		vm_pagequeue_cnt_inc(pq);
@@ -2523,165 +3119,73 @@
 vm_page_deactivate(vm_page_t m)
 {
 
-	_vm_page_deactivate(m, 0);
+	_vm_page_deactivate(m, FALSE);
 }
 
 /*
- * vm_page_try_to_cache:
+ * Move the specified page to the inactive queue with the expectation
+ * that it is unlikely to be reused.
  *
- * Returns 0 on failure, 1 on success
+ * The page must be locked.
  */
-int
-vm_page_try_to_cache(vm_page_t m)
+void
+vm_page_deactivate_noreuse(vm_page_t m)
 {
 
-	vm_page_lock_assert(m, MA_OWNED);
-	VM_OBJECT_ASSERT_WLOCKED(m->object);
-	if (m->dirty || m->hold_count || m->wire_count ||
-	    (m->oflags & VPO_UNMANAGED) != 0 || vm_page_busied(m))
-		return (0);
-	pmap_remove_all(m);
-	if (m->dirty)
-		return (0);
-	vm_page_cache(m);
-	return (1);
+	_vm_page_deactivate(m, TRUE);
 }
 
 /*
- * vm_page_try_to_free()
+ * vm_page_launder
  *
- *	Attempt to free the page.  If we cannot free it, we do nothing.
- *	1 is returned on success, 0 on failure.
+ * 	Put a page in the laundry.
  */
-int
-vm_page_try_to_free(vm_page_t m)
+void
+vm_page_launder(vm_page_t m)
 {
+	int queue;
 
-	vm_page_lock_assert(m, MA_OWNED);
-	if (m->object != NULL)
-		VM_OBJECT_ASSERT_WLOCKED(m->object);
-	if (m->dirty || m->hold_count || m->wire_count ||
-	    (m->oflags & VPO_UNMANAGED) != 0 || vm_page_busied(m))
-		return (0);
-	pmap_remove_all(m);
-	if (m->dirty)
-		return (0);
-	vm_page_free(m);
-	return (1);
+	vm_page_assert_locked(m);
+	if ((queue = m->queue) != PQ_LAUNDRY) {
+		if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) {
+			if (queue != PQ_NONE)
+				vm_page_dequeue(m);
+			vm_page_enqueue(PQ_LAUNDRY, m);
+		} else
+			KASSERT(queue == PQ_NONE,
+			    ("wired page %p is queued", m));
+	}
 }
 
 /*
- * vm_page_cache
+ * vm_page_try_to_free()
  *
- * Put the specified page onto the page cache queue (if appropriate).
- *
- * The object and page must be locked.
+ *	Attempt to free the page.  If we cannot free it, we do nothing.
+ *	true is returned on success, false on failure.
  */
-void
-vm_page_cache(vm_page_t m)
+bool
+vm_page_try_to_free(vm_page_t m)
 {
-	vm_object_t object;
-	boolean_t cache_was_empty;
 
-	vm_page_lock_assert(m, MA_OWNED);
-	object = m->object;
-	VM_OBJECT_ASSERT_WLOCKED(object);
-	if (vm_page_busied(m) || (m->oflags & VPO_UNMANAGED) ||
-	    m->hold_count || m->wire_count)
-		panic("vm_page_cache: attempting to cache busy page");
-	KASSERT(!pmap_page_is_mapped(m),
-	    ("vm_page_cache: page %p is mapped", m));
-	KASSERT(m->dirty == 0, ("vm_page_cache: page %p is dirty", m));
-	if (m->valid == 0 || object->type == OBJT_DEFAULT ||
-	    (object->type == OBJT_SWAP &&
-	    !vm_pager_has_page(object, m->pindex, NULL, NULL))) {
-		/*
-		 * Hypothesis: A cache-elgible page belonging to a
-		 * default object or swap object but without a backing
-		 * store must be zero filled.
-		 */
-		vm_page_free(m);
-		return;
+	vm_page_assert_locked(m);
+	if (m->object != NULL)
+		VM_OBJECT_ASSERT_WLOCKED(m->object);
+	if (m->dirty != 0 || m->hold_count != 0 || m->wire_count != 0 ||
+	    (m->oflags & VPO_UNMANAGED) != 0 || vm_page_busied(m))
+		return (false);
+	if (m->object != NULL && m->object->ref_count != 0) {
+		pmap_remove_all(m);
+		if (m->dirty != 0)
+			return (false);
 	}
-	KASSERT((m->flags & PG_CACHED) == 0,
-	    ("vm_page_cache: page %p is already cached", m));
-
-	/*
-	 * Remove the page from the paging queues.
-	 */
-	vm_page_remque(m);
-
-	/*
-	 * Remove the page from the object's collection of resident
-	 * pages. 
-	 */
-	vm_radix_remove(&object->rtree, m->pindex);
-	TAILQ_REMOVE(&object->memq, m, listq);
-	object->resident_page_count--;
-
-	/*
-	 * Restore the default memory attribute to the page.
-	 */
-	if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT)
-		pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT);
-
-	/*
-	 * Insert the page into the object's collection of cached pages
-	 * and the physical memory allocator's cache/free page queues.
-	 */
-	m->flags &= ~PG_ZERO;
-	mtx_lock(&vm_page_queue_free_mtx);
-	cache_was_empty = vm_radix_is_empty(&object->cache);
-	if (vm_radix_insert(&object->cache, m)) {
-		mtx_unlock(&vm_page_queue_free_mtx);
-		if (object->type == OBJT_VNODE &&
-		    object->resident_page_count == 0)
-			vdrop(object->handle);
-		m->object = NULL;
-		vm_page_free(m);
-		return;
-	}
-
-	/*
-	 * The above call to vm_radix_insert() could reclaim the one pre-
-	 * existing cached page from this object, resulting in a call to
-	 * vdrop().
-	 */
-	if (!cache_was_empty)
-		cache_was_empty = vm_radix_is_singleton(&object->cache);
-
-	m->flags |= PG_CACHED;
-	cnt.v_cache_count++;
-	PCPU_INC(cnt.v_tcached);
-#if VM_NRESERVLEVEL > 0
-	if (!vm_reserv_free_page(m)) {
-#else
-	if (TRUE) {
-#endif
-		vm_phys_set_pool(VM_FREEPOOL_CACHE, m, 0);
-		vm_phys_free_pages(m, 0);
-	}
-	vm_page_free_wakeup();
-	mtx_unlock(&vm_page_queue_free_mtx);
-
-	/*
-	 * Increment the vnode's hold count if this is the object's only
-	 * cached page.  Decrement the vnode's hold count if this was
-	 * the object's only resident page.
-	 */
-	if (object->type == OBJT_VNODE) {
-		if (cache_was_empty && object->resident_page_count != 0)
-			vhold(object->handle);
-		else if (!cache_was_empty && object->resident_page_count == 0)
-			vdrop(object->handle);
-	}
+	vm_page_free(m);
+	return (true);
 }
 
 /*
  * vm_page_advise
  *
- * 	Deactivate or do nothing, as appropriate.  This routine is used
- * 	by madvise() and vop_stdadvise().
+ * 	Apply the specified advice to the given page.
  *
  *	The object and page must be locked.
  */
@@ -2694,20 +3198,16 @@
 	if (advice == MADV_FREE)
 		/*
 		 * Mark the page clean.  This will allow the page to be freed
-		 * up by the system.  However, such pages are often reused
-		 * quickly by malloc() so we do not do anything that would
-		 * cause a page fault if we can help it.
-		 *
-		 * Specifically, we do not try to actually free the page now
-		 * nor do we try to put it in the cache (which would cause a
-		 * page fault on reuse).
-		 *
-		 * But we do make the page as freeable as we can without
-		 * actually taking the step of unmapping it.
+		 * without first paging it out.  MADV_FREE pages are often
+		 * quickly reused by malloc(3), so we do not do anything that
+		 * would result in a page fault on a later access.
 		 */
 		vm_page_undirty(m);
-	else if (advice != MADV_DONTNEED)
+	else if (advice != MADV_DONTNEED) {
+		if (advice == MADV_WILLNEED)
+			vm_page_activate(m);
 		return;
+	}
 
 	/*
 	 * Clear any references to the page.  Otherwise, the page daemon will
@@ -2719,11 +3219,15 @@
 		vm_page_dirty(m);
 
 	/*
-	 * Place clean pages at the head of the inactive queue rather than the
-	 * tail, thus defeating the queue's LRU operation and ensuring that the
-	 * page will be reused quickly.
+	 * Place clean pages near the head of the inactive queue rather than
+	 * the tail, thus defeating the queue's LRU operation and ensuring that
+	 * the page will be reused quickly.  Dirty pages not already in the
+	 * laundry are moved there.
 	 */
-	_vm_page_deactivate(m, m->dirty == 0);
+	if (m->dirty == 0)
+		vm_page_deactivate_noreuse(m);
+	else
+		vm_page_launder(m);
 }
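
The rewritten vm_page_advise() reduces to a small decision table: MADV_FREE discards dirtiness, MADV_WILLNEED activates the page, anything else except MADV_DONTNEED is ignored, and the remaining cases send clean pages to the head of the inactive queue and dirty pages to the laundry. A compact userland rendering of that flow; the struct page fields and queue strings are illustrative only.

#include <stdbool.h>
#include <stdio.h>

#define	MADV_WILLNEED	3
#define	MADV_DONTNEED	4
#define	MADV_FREE	5

struct page {
	bool dirty;
	bool referenced;
	bool modified_in_pmap;
	const char *queue;
};

static void
page_advise(struct page *m, int advice)
{

	if (advice == MADV_FREE)
		m->dirty = false;	/* discard the contents outright */
	else if (advice != MADV_DONTNEED) {
		if (advice == MADV_WILLNEED)
			m->queue = "active";
		return;
	}

	/* Forget references; pick up dirtiness known only to the pmap. */
	m->referenced = false;
	if (m->modified_in_pmap)
		m->dirty = true;

	if (!m->dirty)
		m->queue = "inactive (noreuse)";
	else
		m->queue = "laundry";
}

int
main(void)
{
	struct page clean = { false, true, false, "active" };
	struct page dirty = { true, true, true, "active" };

	page_advise(&clean, MADV_FREE);
	page_advise(&dirty, MADV_DONTNEED);
	printf("MADV_FREE on clean page -> %s\n", clean.queue);
	printf("MADV_DONTNEED on dirty page -> %s\n", dirty.queue);
	return (0);
}
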
 
 /*
@@ -2742,16 +3246,23 @@
 {
 	vm_page_t m;
 	int sleep;
+	int pflags;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 ||
 	    (allocflags & VM_ALLOC_IGN_SBUSY) != 0,
 	    ("vm_page_grab: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY mismatch"));
+	pflags = allocflags &
+	    ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL);
+	if ((allocflags & VM_ALLOC_NOWAIT) == 0)
+		pflags |= VM_ALLOC_WAITFAIL;
 retrylookup:
 	if ((m = vm_page_lookup(object, pindex)) != NULL) {
 		sleep = (allocflags & VM_ALLOC_IGN_SBUSY) != 0 ?
 		    vm_page_xbusied(m) : vm_page_busied(m);
 		if (sleep) {
+			if ((allocflags & VM_ALLOC_NOWAIT) != 0)
+				return (NULL);
 			/*
 			 * Reference the page before unlocking and
 			 * sleeping so that the page daemon is less
@@ -2778,14 +3289,12 @@
 			return (m);
 		}
 	}
-	m = vm_page_alloc(object, pindex, allocflags & ~VM_ALLOC_IGN_SBUSY);
+	m = vm_page_alloc(object, pindex, pflags);
 	if (m == NULL) {
-		VM_OBJECT_WUNLOCK(object);
-		VM_WAIT;
-		VM_OBJECT_WLOCK(object);
+		if ((allocflags & VM_ALLOC_NOWAIT) != 0)
+			return (NULL);
 		goto retrylookup;
-	} else if (m->valid != 0)
-		return (m);
+	}
 	if (allocflags & VM_ALLOC_ZERO && (m->flags & PG_ZERO) == 0)
 		pmap_zero_page(m);
 	return (m);
@@ -2792,6 +3301,114 @@
 }
 
 /*
+ * Return the specified range of pages from the given object.  For each
+ * page offset within the range, if a page already exists within the object
+ * at that offset and it is busy, then wait for it to change state.  If,
+ * instead, the page doesn't exist, then allocate it.
+ *
+ * The caller must always specify an allocation class.
+ *
+ * allocation classes:
+ *	VM_ALLOC_NORMAL		normal process request
+ *	VM_ALLOC_SYSTEM		system *really* needs the pages
+ *
+ * The caller must always specify that the pages are to be busied and/or
+ * wired.
+ *
+ * optional allocation flags:
+ *	VM_ALLOC_IGN_SBUSY	do not sleep on soft busy pages
+ *	VM_ALLOC_NOBUSY		do not exclusive busy the page
+ *	VM_ALLOC_NOWAIT		do not sleep
+ *	VM_ALLOC_SBUSY		set page to sbusy state
+ *	VM_ALLOC_WIRED		wire the pages
+ *	VM_ALLOC_ZERO		zero and validate any invalid pages
+ *
+ * If VM_ALLOC_NOWAIT is not specified, this routine may sleep.  Otherwise, it
+ * may return a partial prefix of the requested range.
+ */
+int
+vm_page_grab_pages(vm_object_t object, vm_pindex_t pindex, int allocflags,
+    vm_page_t *ma, int count)
+{
+	vm_page_t m, mpred;
+	int pflags;
+	int i;
+	bool sleep;
+
+	VM_OBJECT_ASSERT_WLOCKED(object);
+	KASSERT(((u_int)allocflags >> VM_ALLOC_COUNT_SHIFT) == 0,
+	    ("vm_page_grab_pages: VM_ALLOC_COUNT() is not allowed"));
+	KASSERT((allocflags & VM_ALLOC_NOBUSY) == 0 ||
+	    (allocflags & VM_ALLOC_WIRED) != 0,
+	    ("vm_page_grab_pages: the pages must be busied or wired"));
+	KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 ||
+	    (allocflags & VM_ALLOC_IGN_SBUSY) != 0,
+	    ("vm_page_grab_pages: VM_ALLOC_SBUSY/IGN_SBUSY mismatch"));
+	if (count == 0)
+		return (0);
+	pflags = allocflags & ~(VM_ALLOC_NOWAIT | VM_ALLOC_WAITOK |
+	    VM_ALLOC_WAITFAIL | VM_ALLOC_IGN_SBUSY);
+	if ((allocflags & VM_ALLOC_NOWAIT) == 0)
+		pflags |= VM_ALLOC_WAITFAIL;
+	i = 0;
+retrylookup:
+	m = vm_radix_lookup_le(&object->rtree, pindex + i);
+	if (m == NULL || m->pindex != pindex + i) {
+		mpred = m;
+		m = NULL;
+	} else
+		mpred = TAILQ_PREV(m, pglist, listq);
+	for (; i < count; i++) {
+		if (m != NULL) {
+			sleep = (allocflags & VM_ALLOC_IGN_SBUSY) != 0 ?
+			    vm_page_xbusied(m) : vm_page_busied(m);
+			if (sleep) {
+				if ((allocflags & VM_ALLOC_NOWAIT) != 0)
+					break;
+				/*
+				 * Reference the page before unlocking and
+				 * sleeping so that the page daemon is less
+				 * likely to reclaim it.
+				 */
+				vm_page_aflag_set(m, PGA_REFERENCED);
+				vm_page_lock(m);
+				VM_OBJECT_WUNLOCK(object);
+				vm_page_busy_sleep(m, "grbmaw", (allocflags &
+				    VM_ALLOC_IGN_SBUSY) != 0);
+				VM_OBJECT_WLOCK(object);
+				goto retrylookup;
+			}
+			if ((allocflags & VM_ALLOC_WIRED) != 0) {
+				vm_page_lock(m);
+				vm_page_wire(m);
+				vm_page_unlock(m);
+			}
+			if ((allocflags & (VM_ALLOC_NOBUSY |
+			    VM_ALLOC_SBUSY)) == 0)
+				vm_page_xbusy(m);
+			if ((allocflags & VM_ALLOC_SBUSY) != 0)
+				vm_page_sbusy(m);
+		} else {
+			m = vm_page_alloc_after(object, pindex + i,
+			    pflags | VM_ALLOC_COUNT(count - i), mpred);
+			if (m == NULL) {
+				if ((allocflags & VM_ALLOC_NOWAIT) != 0)
+					break;
+				goto retrylookup;
+			}
+		}
+		if (m->valid == 0 && (allocflags & VM_ALLOC_ZERO) != 0) {
+			if ((m->flags & PG_ZERO) == 0)
+				pmap_zero_page(m);
+			m->valid = VM_PAGE_BITS_ALL;
+		}
+		ma[i] = mpred = m;
+		m = vm_page_next(m);
+	}
+	return (i);
+}
+
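vm_page_grab_pages() walks the requested range, reusing resident pages and allocating the missing ones, and under VM_ALLOC_NOWAIT it may hand back only a prefix of the range. The sketch below models just that lookup-or-allocate walk with a plain array standing in for the object; page_alloc() is a made-up allocator, not a kernel routine.

#include <stdio.h>
#include <stdlib.h>

#define	OBJ_SIZE	16

static int *object[OBJ_SIZE];	/* NULL slot == page not resident */
static int alloc_budget = 2;	/* pretend free memory runs out after 2 pages */

/* Hypothetical single-attempt allocator in the role of vm_page_alloc_after(). */
static int *
page_alloc(void)
{

	if (alloc_budget-- <= 0)
		return (NULL);
	return (calloc(1, sizeof(int)));
}

/*
 * Fill ma[] with the pages at [pindex, pindex + count), reusing resident
 * pages and allocating the rest; as in the VM_ALLOC_NOWAIT case above, a
 * partial prefix is returned when allocation fails.
 */
static int
grab_pages(int pindex, int count, int **ma)
{
	int i;

	for (i = 0; i < count; i++) {
		if (object[pindex + i] == NULL &&
		    (object[pindex + i] = page_alloc()) == NULL)
			break;
		ma[i] = object[pindex + i];
	}
	return (i);
}

int
main(void)
{
	int *ma[8];

	object[1] = calloc(1, sizeof(int));	/* one page already resident */
	printf("grabbed %d of 8 pages\n", grab_pages(0, 8, ma));
	return (0);
}
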
+/*
  * Mapping function for valid or dirty bits in a page.
  *
  * Inputs are required to range within a page.
@@ -2841,17 +3458,17 @@
 	 * bit is clear, we have to zero out a portion of the
 	 * first block.
 	 */
-	if ((frag = base & ~(DEV_BSIZE - 1)) != base &&
+	if ((frag = rounddown2(base, DEV_BSIZE)) != base &&
 	    (m->valid & (1 << (base >> DEV_BSHIFT))) == 0)
 		pmap_zero_page_area(m, frag, base - frag);
 
 	/*
-	 * If the ending offset is not DEV_BSIZE aligned and the 
+	 * If the ending offset is not DEV_BSIZE aligned and the
 	 * valid bit is clear, we have to zero out a portion of
 	 * the last block.
 	 */
 	endoff = base + size;
-	if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff &&
+	if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff &&
 	    (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0)
 		pmap_zero_page_area(m, endoff,
 		    DEV_BSIZE - (endoff & (DEV_BSIZE - 1)));
@@ -2858,7 +3475,7 @@
 
 	/*
 	 * Assert that no previously invalid block that is now being validated
-	 * is already dirty. 
+	 * is already dirty.
 	 */
 	KASSERT((~m->valid & vm_page_bits(base, size) & m->dirty) == 0,
 	    ("vm_page_set_valid_range: page %p is dirty", m));
@@ -2948,17 +3565,17 @@
 	 * bit is clear, we have to zero out a portion of the
 	 * first block.
 	 */
-	if ((frag = base & ~(DEV_BSIZE - 1)) != base &&
+	if ((frag = rounddown2(base, DEV_BSIZE)) != base &&
 	    (m->valid & ((vm_page_bits_t)1 << (base >> DEV_BSHIFT))) == 0)
 		pmap_zero_page_area(m, frag, base - frag);
 
 	/*
-	 * If the ending offset is not DEV_BSIZE aligned and the 
+	 * If the ending offset is not DEV_BSIZE aligned and the
 	 * valid bit is clear, we have to zero out a portion of
 	 * the last block.
 	 */
 	endoff = base + size;
-	if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff &&
+	if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff &&
 	    (m->valid & ((vm_page_bits_t)1 << (endoff >> DEV_BSHIFT))) == 0)
 		pmap_zero_page_area(m, endoff,
 		    DEV_BSIZE - (endoff & (DEV_BSIZE - 1)));
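
rounddown2() from sys/param.h, substituted above for the open-coded mask, expands to (x) & ~((y) - 1) and matches ordinary truncating division whenever the alignment is a power of two, as DEV_BSIZE is. A quick standalone check:

#include <assert.h>
#include <stdio.h>

#define	DEV_BSIZE	512
#define	rounddown2(x, y) ((x) & ~((y) - 1))	/* as in sys/param.h */

int
main(void)
{
	unsigned int x;

	for (x = 0; x < 4096; x++)
		assert(rounddown2(x, DEV_BSIZE) == x / DEV_BSIZE * DEV_BSIZE);
	printf("rounddown2(1000, 512) = %u\n", rounddown2(1000u, DEV_BSIZE));
	return (0);
}
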
@@ -3050,12 +3667,12 @@
 /*
  * vm_page_zero_invalid()
  *
- *	The kernel assumes that the invalid portions of a page contain 
+ *	The kernel assumes that the invalid portions of a page contain
  *	garbage, but such pages can be mapped into memory by user code.
  *	When this occurs, we must zero out the non-valid portions of the
  *	page so user code sees what it expects.
  *
- *	Pages are most often semi-valid when the end of a file is mapped 
+ *	Pages are most often semi-valid when the end of a file is mapped
  *	into memory and the file's size is not page aligned.
  */
 void
@@ -3072,10 +3689,10 @@
 	 * vm_page_set_validclean().
 	 */
 	for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) {
-		if (i == (PAGE_SIZE / DEV_BSIZE) || 
+		if (i == (PAGE_SIZE / DEV_BSIZE) ||
 		    (m->valid & ((vm_page_bits_t)1 << i))) {
 			if (i > b) {
-				pmap_zero_page_area(m, 
+				pmap_zero_page_area(m,
 				    b << DEV_BSHIFT, (i - b) << DEV_BSHIFT);
 			}
 			b = i + 1;
@@ -3109,16 +3726,19 @@
 }
 
 /*
- *	vm_page_ps_is_valid:
- *
- *	Returns TRUE if the entire (super)page is valid and FALSE otherwise.
+ * Returns true if all of the specified predicates are true for the entire
+ * (super)page and false otherwise.
  */
-boolean_t
-vm_page_ps_is_valid(vm_page_t m)
+bool
+vm_page_ps_test(vm_page_t m, int flags, vm_page_t skip_m)
 {
+	vm_object_t object;
 	int i, npages;
 
-	VM_OBJECT_ASSERT_LOCKED(m->object);
+	object = m->object;
+	if (skip_m != NULL && skip_m->object != object)
+		return (false);
+	VM_OBJECT_ASSERT_LOCKED(object);
 	npages = atop(pagesizes[m->psind]);
 
 	/*
@@ -3127,10 +3747,28 @@
 	 * occupy adjacent entries in vm_page_array[].
 	 */
 	for (i = 0; i < npages; i++) {
-		if (m[i].valid != VM_PAGE_BITS_ALL)
-			return (FALSE);
+		/* Always test object consistency, including "skip_m". */
+		if (m[i].object != object)
+			return (false);
+		if (&m[i] == skip_m)
+			continue;
+		if ((flags & PS_NONE_BUSY) != 0 && vm_page_busied(&m[i]))
+			return (false);
+		if ((flags & PS_ALL_DIRTY) != 0) {
+			/*
+			 * Calling vm_page_test_dirty() or pmap_is_modified()
+			 * might stop this case from spuriously returning
+			 * "false".  However, that would require a write lock
+			 * on the object containing "m[i]".
+			 */
+			if (m[i].dirty != VM_PAGE_BITS_ALL)
+				return (false);
+		}
+		if ((flags & PS_ALL_VALID) != 0 &&
+		    m[i].valid != VM_PAGE_BITS_ALL)
+			return (false);
 	}
-	return (TRUE);
+	return (true);
 }
 
 /*
@@ -3224,16 +3862,16 @@
 
 DB_SHOW_COMMAND(page, vm_page_print_page_info)
 {
-	db_printf("cnt.v_free_count: %d\n", cnt.v_free_count);
-	db_printf("cnt.v_cache_count: %d\n", cnt.v_cache_count);
-	db_printf("cnt.v_inactive_count: %d\n", cnt.v_inactive_count);
-	db_printf("cnt.v_active_count: %d\n", cnt.v_active_count);
-	db_printf("cnt.v_wire_count: %d\n", cnt.v_wire_count);
-	db_printf("cnt.v_free_reserved: %d\n", cnt.v_free_reserved);
-	db_printf("cnt.v_free_min: %d\n", cnt.v_free_min);
-	db_printf("cnt.v_free_target: %d\n", cnt.v_free_target);
-	db_printf("cnt.v_cache_min: %d\n", cnt.v_cache_min);
-	db_printf("cnt.v_inactive_target: %d\n", cnt.v_inactive_target);
+
+	db_printf("vm_cnt.v_free_count: %d\n", vm_cnt.v_free_count);
+	db_printf("vm_cnt.v_inactive_count: %d\n", vm_cnt.v_inactive_count);
+	db_printf("vm_cnt.v_active_count: %d\n", vm_cnt.v_active_count);
+	db_printf("vm_cnt.v_laundry_count: %d\n", vm_cnt.v_laundry_count);
+	db_printf("vm_cnt.v_wire_count: %d\n", vm_cnt.v_wire_count);
+	db_printf("vm_cnt.v_free_reserved: %d\n", vm_cnt.v_free_reserved);
+	db_printf("vm_cnt.v_free_min: %d\n", vm_cnt.v_free_min);
+	db_printf("vm_cnt.v_free_target: %d\n", vm_cnt.v_free_target);
+	db_printf("vm_cnt.v_inactive_target: %d\n", vm_cnt.v_inactive_target);
 }
 
 DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
@@ -3240,17 +3878,16 @@
 {
 	int dom;
 
-	db_printf("pq_free %d pq_cache %d\n",
-	    cnt.v_free_count, cnt.v_cache_count);
+	db_printf("pq_free %d\n", vm_cnt.v_free_count);
 	for (dom = 0; dom < vm_ndomains; dom++) {
 		db_printf(
-	"dom %d page_cnt %d free %d pq_act %d pq_inact %d pass %d\n",
+	    "dom %d page_cnt %d free %d pq_act %d pq_inact %d pq_laund %d\n",
 		    dom,
 		    vm_dom[dom].vmd_page_count,
 		    vm_dom[dom].vmd_free_count,
 		    vm_dom[dom].vmd_pagequeues[PQ_ACTIVE].pq_cnt,
 		    vm_dom[dom].vmd_pagequeues[PQ_INACTIVE].pq_cnt,
-		    vm_dom[dom].vmd_pass);
+		    vm_dom[dom].vmd_pagequeues[PQ_LAUNDRY].pq_cnt);
 	}
 }
 
@@ -3257,7 +3894,7 @@
 DB_SHOW_COMMAND(pginfo, vm_page_print_pginfo)
 {
 	vm_page_t m;
-	boolean_t phys;
+	boolean_t phys, virt;
 
 	if (!have_addr) {
 		db_printf("show pginfo addr\n");
@@ -3265,7 +3902,10 @@
 	}
 
 	phys = strchr(modif, 'p') != NULL;
-	if (phys)
+	virt = strchr(modif, 'v') != NULL;
+	if (virt)
+		m = PHYS_TO_VM_PAGE(pmap_kextract(addr));
+	else if (phys)
 		m = PHYS_TO_VM_PAGE(addr);
 	else
 		m = (vm_page_t)addr;

Modified: trunk/sys/vm/vm_page.h
===================================================================
--- trunk/sys/vm/vm_page.h	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_page.h	2020-02-08 19:35:48 UTC (rev 12314)
@@ -58,7 +58,7 @@
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  *
- * $FreeBSD: stable/10/sys/vm/vm_page.h 307672 2016-10-20 13:12:19Z kib $
+ * $FreeBSD: stable/11/sys/vm/vm_page.h 332505 2018-04-14 17:41:54Z kib $
  */
 
 /*
@@ -142,7 +142,7 @@
 	vm_object_t object;		/* which object am I in (O,P) */
 	vm_pindex_t pindex;		/* offset into object (O,P) */
 	vm_paddr_t phys_addr;		/* physical address of page */
-	struct md_page md;		/* machine dependant stuff */
+	struct md_page md;		/* machine dependent stuff */
 	u_int wire_count;		/* wired down maps refs (P) */
 	volatile u_int busy_lock;	/* busy owners lock */
 	uint16_t hold_count;		/* page hold count (P) */
@@ -150,6 +150,7 @@
 	uint8_t aflags;			/* access is atomic */
 	uint8_t oflags;			/* page VPO_* flags (O) */
 	uint8_t	queue;			/* page queue index (P,Q) */
+	int8_t psind;			/* pagesizes[] index (O) */
 	int8_t segind;
 	uint8_t	order;			/* index of the buddy queue */
 	uint8_t pool;
@@ -158,7 +159,6 @@
 	/* so, on normal X86 kernels, they must be at least 8 bits wide */
 	vm_page_bits_t valid;		/* map of valid DEV_BSIZE chunks (O) */
 	vm_page_bits_t dirty;		/* map of dirty DEV_BSIZE chunks (M) */
-	int8_t psind;			/* pagesizes[] index (O) */
 };
 
 /*
@@ -207,9 +207,13 @@
 #define	PQ_NONE		255
 #define	PQ_INACTIVE	0
 #define	PQ_ACTIVE	1
-#define	PQ_COUNT	2
+#define	PQ_LAUNDRY	2
+#define	PQ_COUNT	3
 
+#ifndef VM_PAGE_HAVE_PGLIST
 TAILQ_HEAD(pglist, vm_page);
+#define VM_PAGE_HAVE_PGLIST
+#endif
 SLIST_HEAD(spglist, vm_page);
 
 struct vm_pagequeue {
@@ -227,10 +231,11 @@
 	u_int vmd_free_count;
 	long vmd_segs;	/* bitmask of the segments */
 	boolean_t vmd_oom;
-	int vmd_pass;	/* local pagedaemon pass */
 	int vmd_oom_seq;
 	int vmd_last_active_scan;
+	struct vm_page vmd_laundry_marker;
 	struct vm_page vmd_marker; /* marker for pagedaemon private use */
+	struct vm_page vmd_inacthead; /* marker for LRU-defeating insertions */
 };
 
 extern struct vm_domain vm_dom[MAXMEMDOM];
@@ -237,6 +242,7 @@
 
 #define	vm_pagequeue_assert_locked(pq)	mtx_assert(&(pq)->pq_mutex, MA_OWNED)
 #define	vm_pagequeue_lock(pq)		mtx_lock(&(pq)->pq_mutex)
+#define	vm_pagequeue_lockptr(pq)	(&(pq)->pq_mutex)
 #define	vm_pagequeue_unlock(pq)		mtx_unlock(&(pq)->pq_mutex)
 
 #ifdef _KERNEL
@@ -324,12 +330,9 @@
  * Page flags.  If changed at any other time than page allocation or
  * freeing, the modification must be protected by the vm_page lock.
  */
-#define	PG_CACHED	0x0001		/* page is cached */
-#define	PG_FREE		0x0002		/* page is free */
 #define	PG_FICTITIOUS	0x0004		/* physical page doesn't exist */
 #define	PG_ZERO		0x0008		/* page is zeroed */
 #define	PG_MARKER	0x0010		/* special queue marker page */
-#define	PG_WINATCFLS	0x0040		/* flush dirty page on inactive q */
 #define	PG_NODUMP	0x0080		/* don't include this page in a dump */
 #define	PG_UNHOLDFREE	0x0100		/* delayed free of a held page */
 
@@ -353,19 +356,16 @@
  *	free
  *		Available for allocation now.
  *
- *	cache
- *		Almost available for allocation. Still associated with
- *		an object, but clean and immediately freeable.
- *
- * The following lists are LRU sorted:
- *
  *	inactive
  *		Low activity, candidates for reclamation.
+ *		This list is approximately LRU ordered.
+ *
+ *	laundry
  *		This is the list of pages that should be
  *		paged out next.
  *
  *	active
- *		Pages that are "active" i.e. they have been
+ *		Pages that are "active", i.e., they have been
  *		recently referenced.
  *
  */
@@ -376,28 +376,51 @@
 extern long vm_page_array_size;		/* number of vm_page_t's */
 extern long first_page;			/* first physical page number */
 
-#define	VM_PAGE_IS_FREE(m)	(((m)->flags & PG_FREE) != 0)
-
 #define VM_PAGE_TO_PHYS(entry)	((entry)->phys_addr)
 
+/*
+ * PHYS_TO_VM_PAGE() returns the vm_page_t object that represents a memory
+ * page to which the given physical address belongs. The correct vm_page_t
+ * object is returned for addresses that are not page-aligned.
+ */
 vm_page_t PHYS_TO_VM_PAGE(vm_paddr_t pa);
 
-/* page allocation classes: */
+/*
+ * Page allocation parameters for vm_page for the functions
+ * vm_page_alloc(), vm_page_grab(), vm_page_alloc_contig() and
+ * vm_page_alloc_freelist().  Some functions support only a subset
+ * of the flags, and ignore others, see the flags legend.
+ *
+ * The meaning of VM_ALLOC_ZERO differs slightly between the vm_page_alloc*()
+ * and the vm_page_grab*() functions.  See these functions for details.
+ *
+ * Bits 0 - 1 define class.
+ * Bits 2 - 15 dedicated for flags.
+ * Legend:
+ * (a) - vm_page_alloc() supports the flag.
+ * (c) - vm_page_alloc_contig() supports the flag.
+ * (f) - vm_page_alloc_freelist() supports the flag.
+ * (g) - vm_page_grab() supports the flag.
+ * (p) - vm_page_grab_pages() supports the flag.
+ * Bits above 15 define the count of additional pages that the caller
+ * intends to allocate.
+ */
 #define VM_ALLOC_NORMAL		0
 #define VM_ALLOC_INTERRUPT	1
 #define VM_ALLOC_SYSTEM		2
 #define	VM_ALLOC_CLASS_MASK	3
-/* page allocation flags: */
-#define	VM_ALLOC_WIRED		0x0020	/* non pageable */
-#define	VM_ALLOC_ZERO		0x0040	/* Try to obtain a zeroed page */
-#define	VM_ALLOC_NOOBJ		0x0100	/* No associated object */
-#define	VM_ALLOC_NOBUSY		0x0200	/* Do not busy the page */
-#define	VM_ALLOC_IFCACHED	0x0400	/* Fail if the page is not cached */
-#define	VM_ALLOC_IFNOTCACHED	0x0800	/* Fail if the page is cached */
-#define	VM_ALLOC_IGN_SBUSY	0x1000	/* vm_page_grab() only */
-#define	VM_ALLOC_NODUMP		0x2000	/* don't include in dump */
-#define	VM_ALLOC_SBUSY		0x4000	/* Shared busy the page */
-
+#define	VM_ALLOC_WAITOK		0x0008	/* (acf) Sleep and retry */
+#define	VM_ALLOC_WAITFAIL	0x0010	/* (acf) Sleep and return error */
+#define	VM_ALLOC_WIRED		0x0020	/* (acfgp) Allocate a wired page */
+#define	VM_ALLOC_ZERO		0x0040	/* (acfgp) Allocate a prezeroed page */
+#define	VM_ALLOC_NOOBJ		0x0100	/* (acg) No associated object */
+#define	VM_ALLOC_NOBUSY		0x0200	/* (acgp) Do not excl busy the page */
+#define	VM_ALLOC_IFCACHED	0x0400
+#define	VM_ALLOC_IFNOTCACHED	0x0800
+#define	VM_ALLOC_IGN_SBUSY	0x1000	/* (gp) Ignore shared busy flag */
+#define	VM_ALLOC_NODUMP		0x2000	/* (ag) don't include in dump */
+#define	VM_ALLOC_SBUSY		0x4000	/* (acgp) Shared busy the page */
+#define	VM_ALLOC_NOWAIT		0x8000	/* (acfgp) Do not sleep */
 #define	VM_ALLOC_COUNT_SHIFT	16
 #define	VM_ALLOC_COUNT(count)	((count) << VM_ALLOC_COUNT_SHIFT)
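
The comment above describes how a request packs three things into one int: the class in the low two bits, the flags in bits 2 through 15, and an optional count of additional pages above bit 15 via VM_ALLOC_COUNT(). The constant values below are copied from this header; the packing and unpacking demo itself is only illustrative.

#include <stdio.h>

#define	VM_ALLOC_NORMAL		0
#define	VM_ALLOC_SYSTEM		2
#define	VM_ALLOC_CLASS_MASK	3
#define	VM_ALLOC_WIRED		0x0020
#define	VM_ALLOC_ZERO		0x0040
#define	VM_ALLOC_COUNT_SHIFT	16
#define	VM_ALLOC_COUNT(count)	((count) << VM_ALLOC_COUNT_SHIFT)

int
main(void)
{
	int req;

	/* A wired, prezeroed, system-class request hinting at 8 more pages. */
	req = VM_ALLOC_SYSTEM | VM_ALLOC_WIRED | VM_ALLOC_ZERO |
	    VM_ALLOC_COUNT(8);

	printf("class %d, wired %d, zero %d, count hint %u\n",
	    req & VM_ALLOC_CLASS_MASK,
	    (req & VM_ALLOC_WIRED) != 0,
	    (req & VM_ALLOC_ZERO) != 0,
	    (unsigned int)req >> VM_ALLOC_COUNT_SHIFT);
	return (0);
}
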
 
@@ -416,10 +439,26 @@
 		pflags |= VM_ALLOC_ZERO;
 	if ((malloc_flags & M_NODUMP) != 0)
 		pflags |= VM_ALLOC_NODUMP;
+	if ((malloc_flags & M_NOWAIT))
+		pflags |= VM_ALLOC_NOWAIT;
+	if ((malloc_flags & M_WAITOK))
+		pflags |= VM_ALLOC_WAITOK;
 	return (pflags);
 }
 #endif
 
+/*
+ * Predicates supported by vm_page_ps_test():
+ *
+ *	PS_ALL_DIRTY is true only if the entire (super)page is dirty.
+ *	However, it can be spuriously false when the (super)page has become
+ *	dirty in the pmap but that information has not been propagated to the
+ *	machine-independent layer.
+ */
+#define	PS_ALL_DIRTY	0x1
+#define	PS_ALL_VALID	0x2
+#define	PS_NONE_BUSY	0x4
+
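vm_page_ps_test() (see vm_page.c above) checks one set of these predicates across every base page of a superpage, optionally exempting a single page. A userland model of that all-pages check, using made-up struct page fields:

#include <stdbool.h>
#include <stdio.h>

#define	PS_ALL_DIRTY	0x1
#define	PS_ALL_VALID	0x2
#define	PS_NONE_BUSY	0x4

struct page {
	bool dirty, valid, busy;
};

/* Model of vm_page_ps_test(): every page must pass, except "skip". */
static bool
ps_test(struct page *m, int npages, int flags, struct page *skip)
{
	int i;

	for (i = 0; i < npages; i++) {
		if (&m[i] == skip)
			continue;
		if ((flags & PS_NONE_BUSY) != 0 && m[i].busy)
			return (false);
		if ((flags & PS_ALL_DIRTY) != 0 && !m[i].dirty)
			return (false);
		if ((flags & PS_ALL_VALID) != 0 && !m[i].valid)
			return (false);
	}
	return (true);
}

int
main(void)
{
	struct page sp[4] = {
		{ true, true, false }, { true, true, false },
		{ false, true, true }, { true, true, false },
	};

	printf("all valid: %d\n", ps_test(sp, 4, PS_ALL_VALID, NULL));
	printf("all dirty, skipping sp[2]: %d\n",
	    ps_test(sp, 4, PS_ALL_DIRTY | PS_NONE_BUSY, &sp[2]));
	return (0);
}
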
 void vm_page_busy_downgrade(vm_page_t m);
 void vm_page_busy_sleep(vm_page_t m, const char *msg, bool nonshared);
 void vm_page_flash(vm_page_t m);
@@ -430,33 +469,38 @@
 
 void vm_page_activate (vm_page_t);
 void vm_page_advise(vm_page_t m, int advice);
-vm_page_t vm_page_alloc (vm_object_t, vm_pindex_t, int);
+vm_page_t vm_page_alloc(vm_object_t, vm_pindex_t, int);
+vm_page_t vm_page_alloc_after(vm_object_t, vm_pindex_t, int, vm_page_t);
 vm_page_t vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req,
     u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
     vm_paddr_t boundary, vm_memattr_t memattr);
 vm_page_t vm_page_alloc_freelist(int, int);
+bool vm_page_blacklist_add(vm_paddr_t pa, bool verbose);
+void vm_page_change_lock(vm_page_t m, struct mtx **mtx);
 vm_page_t vm_page_grab (vm_object_t, vm_pindex_t, int);
-void vm_page_cache(vm_page_t);
-void vm_page_cache_free(vm_object_t, vm_pindex_t, vm_pindex_t);
-void vm_page_cache_transfer(vm_object_t, vm_pindex_t, vm_object_t);
-int vm_page_try_to_cache (vm_page_t);
-int vm_page_try_to_free (vm_page_t);
+int vm_page_grab_pages(vm_object_t object, vm_pindex_t pindex, int allocflags,
+    vm_page_t *ma, int count);
 void vm_page_deactivate (vm_page_t);
+void vm_page_deactivate_noreuse(vm_page_t);
 void vm_page_dequeue(vm_page_t m);
 void vm_page_dequeue_locked(vm_page_t m);
 vm_page_t vm_page_find_least(vm_object_t, vm_pindex_t);
+void vm_page_free_phys_pglist(struct pglist *tq);
+bool vm_page_free_prep(vm_page_t m, bool pagequeue_locked);
 vm_page_t vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr);
 void vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr);
 int vm_page_insert (vm_page_t, vm_object_t, vm_pindex_t);
-boolean_t vm_page_is_cached(vm_object_t object, vm_pindex_t pindex);
+void vm_page_launder(vm_page_t m);
 vm_page_t vm_page_lookup (vm_object_t, vm_pindex_t);
 vm_page_t vm_page_next(vm_page_t m);
 int vm_page_pa_tryrelock(pmap_t, vm_paddr_t, vm_paddr_t *);
 struct vm_pagequeue *vm_page_pagequeue(vm_page_t m);
 vm_page_t vm_page_prev(vm_page_t m);
-boolean_t vm_page_ps_is_valid(vm_page_t m);
+bool vm_page_ps_test(vm_page_t m, int flags, vm_page_t skip_m);
 void vm_page_putfake(vm_page_t m);
 void vm_page_readahead_finish(vm_page_t m);
+bool vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low,
+    vm_paddr_t high, u_long alignment, vm_paddr_t boundary);
 void vm_page_reference(vm_page_t m);
 void vm_page_remove (vm_page_t);
 int vm_page_rename (vm_page_t, vm_object_t, vm_pindex_t);
@@ -465,16 +509,20 @@
 void vm_page_requeue(vm_page_t m);
 void vm_page_requeue_locked(vm_page_t m);
 int vm_page_sbusied(vm_page_t m);
+vm_page_t vm_page_scan_contig(u_long npages, vm_page_t m_start,
+    vm_page_t m_end, u_long alignment, vm_paddr_t boundary, int options);
 void vm_page_set_valid_range(vm_page_t m, int base, int size);
 int vm_page_sleep_if_busy(vm_page_t m, const char *msg);
 vm_offset_t vm_page_startup(vm_offset_t vaddr);
 void vm_page_sunbusy(vm_page_t m);
+bool vm_page_try_to_free(vm_page_t m);
 int vm_page_trysbusy(vm_page_t m);
 void vm_page_unhold_pages(vm_page_t *ma, int count);
-void vm_page_unwire (vm_page_t, int);
+boolean_t vm_page_unwire(vm_page_t m, uint8_t queue);
 void vm_page_updatefake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr);
 void vm_page_wire (vm_page_t);
 void vm_page_xunbusy_hard(vm_page_t m);
+void vm_page_xunbusy_maybelocked(vm_page_t m);
 void vm_page_set_validclean (vm_page_t, int, int);
 void vm_page_clear_dirty (vm_page_t, int, int);
 void vm_page_set_invalid (vm_page_t, int, int);
@@ -497,17 +545,17 @@
 #define	vm_page_assert_sbusied(m)					\
 	KASSERT(vm_page_sbusied(m),					\
 	    ("vm_page_assert_sbusied: page %p not shared busy @ %s:%d", \
-	    (void *)m, __FILE__, __LINE__));
+	    (m), __FILE__, __LINE__))
 
 #define	vm_page_assert_unbusied(m)					\
 	KASSERT(!vm_page_busied(m),					\
 	    ("vm_page_assert_unbusied: page %p busy @ %s:%d",		\
-	    (void *)m, __FILE__, __LINE__));
+	    (m), __FILE__, __LINE__))
 
 #define	vm_page_assert_xbusied(m)					\
 	KASSERT(vm_page_xbusied(m),					\
 	    ("vm_page_assert_xbusied: page %p not exclusive busy @ %s:%d", \
-	    (void *)m, __FILE__, __LINE__));
+	    (m), __FILE__, __LINE__))
 
 #define	vm_page_busied(m)						\
 	((m)->busy_lock != VPB_UNBUSIED)
@@ -514,22 +562,24 @@
 
 #define	vm_page_sbusy(m) do {						\
 	if (!vm_page_trysbusy(m))					\
-		panic("%s: page %p failed shared busing", __func__, m);	\
+		panic("%s: page %p failed shared busying", __func__,	\
+		    (m));						\
 } while (0)
 
 #define	vm_page_tryxbusy(m)						\
-	(atomic_cmpset_acq_int(&m->busy_lock, VPB_UNBUSIED,		\
+	(atomic_cmpset_acq_int(&(m)->busy_lock, VPB_UNBUSIED,		\
 	    VPB_SINGLE_EXCLUSIVER))
 
 #define	vm_page_xbusied(m)						\
-	((m->busy_lock & VPB_SINGLE_EXCLUSIVER) != 0)
+	(((m)->busy_lock & VPB_SINGLE_EXCLUSIVER) != 0)
 
 #define	vm_page_xbusy(m) do {						\
 	if (!vm_page_tryxbusy(m))					\
-		panic("%s: page %p failed exclusive busing", __func__,	\
-		    m);							\
+		panic("%s: page %p failed exclusive busying", __func__,	\
+		    (m));						\
 } while (0)
 
+/* Note: page m's lock must not be owned by the caller. */
 #define	vm_page_xunbusy(m) do {						\
 	if (!atomic_cmpset_rel_int(&(m)->busy_lock,			\
 	    VPB_SINGLE_EXCLUSIVER, VPB_UNBUSIED))			\
@@ -660,5 +710,41 @@
 	m->dirty = 0;
 }
 
+static inline void
+vm_page_replace_checked(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex,
+    vm_page_t mold)
+{
+	vm_page_t mret;
+
+	mret = vm_page_replace(mnew, object, pindex);
+	KASSERT(mret == mold,
+	    ("invalid page replacement, mold=%p, mret=%p", mold, mret));
+
+	/* Unused if !INVARIANTS. */
+	(void)mold;
+	(void)mret;
+}
+
+static inline bool
+vm_page_active(vm_page_t m)
+{
+
+	return (m->queue == PQ_ACTIVE);
+}
+
+static inline bool
+vm_page_inactive(vm_page_t m)
+{
+
+	return (m->queue == PQ_INACTIVE);
+}
+
+static inline bool
+vm_page_in_laundry(vm_page_t m)
+{
+
+	return (m->queue == PQ_LAUNDRY);
+}
+
 #endif				/* _KERNEL */
 #endif				/* !_VM_PAGE_ */

Modified: trunk/sys/vm/vm_pageout.c
===================================================================
--- trunk/sys/vm/vm_pageout.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_pageout.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -74,10 +74,10 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vm_pageout.c 320550 2017-07-01 19:24:53Z alc $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_pageout.c 331722 2018-03-29 02:50:57Z eadler $");
 
 #include "opt_vm.h"
-#include "opt_kdtrace.h"
+
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
@@ -120,8 +120,9 @@
 /* the kernel process "vm_pageout"*/
 static void vm_pageout(void);
 static void vm_pageout_init(void);
-static int vm_pageout_clean(vm_page_t);
-static void vm_pageout_scan(struct vm_domain *vmd, int pass);
+static int vm_pageout_clean(vm_page_t m, int *numpagedout);
+static int vm_pageout_cluster(vm_page_t m);
+static bool vm_pageout_scan(struct vm_domain *vmd, int pass);
 static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage,
     int starting_page_shortage);
 
@@ -139,82 +140,49 @@
     &page_kp);
 
 SDT_PROVIDER_DEFINE(vm);
-SDT_PROBE_DEFINE(vm, , , vm__lowmem_cache);
 SDT_PROBE_DEFINE(vm, , , vm__lowmem_scan);
 
-#if !defined(NO_SWAPPING)
-/* the kernel process "vm_daemon"*/
-static void vm_daemon(void);
-static struct	proc *vmproc;
+/* Pagedaemon activity rates, in subdivisions of one second. */
+#define	VM_LAUNDER_RATE		10
+#define	VM_INACT_SCAN_RATE	2
 
-static struct kproc_desc vm_kp = {
-	"vmdaemon",
-	vm_daemon,
-	&vmproc
-};
-SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp);
-#endif
-
-
-int vm_pages_needed;		/* Event on which pageout daemon sleeps */
 int vm_pageout_deficit;		/* Estimated number of pages deficit */
-int vm_pageout_pages_needed;	/* flag saying that the pageout daemon needs pages */
-int vm_pageout_wakeup_thresh;
+u_int vm_pageout_wakeup_thresh;
 static int vm_pageout_oom_seq = 12;
+bool vm_pageout_wanted;		/* Event on which pageout daemon sleeps */
+bool vm_pages_needed;		/* Are threads waiting for free pages? */
 
-#if !defined(NO_SWAPPING)
-static int vm_pageout_req_swapout;	/* XXX */
-static int vm_daemon_needed;
-static struct mtx vm_daemon_mtx;
-/* Allow for use by vm_pageout before vm_daemon is initialized. */
-MTX_SYSINIT(vm_daemon, &vm_daemon_mtx, "vm daemon", MTX_DEF);
-#endif
-static int vm_max_launder = 32;
+/* Pending request for dirty page laundering. */
+static enum {
+	VM_LAUNDRY_IDLE,
+	VM_LAUNDRY_BACKGROUND,
+	VM_LAUNDRY_SHORTFALL
+} vm_laundry_request = VM_LAUNDRY_IDLE;
+
 static int vm_pageout_update_period;
-static int defer_swap_pageouts;
 static int disable_swap_pageouts;
 static int lowmem_period = 10;
 static time_t lowmem_uptime;
 
-#if defined(NO_SWAPPING)
-static int vm_swap_enabled = 0;
-static int vm_swap_idle_enabled = 0;
-#else
-static int vm_swap_enabled = 1;
-static int vm_swap_idle_enabled = 0;
-#endif
+static int vm_panic_on_oom = 0;
 
+SYSCTL_INT(_vm, OID_AUTO, panic_on_oom,
+	CTLFLAG_RWTUN, &vm_panic_on_oom, 0,
+	"panic on out of memory instead of killing the largest process");
+
 SYSCTL_INT(_vm, OID_AUTO, pageout_wakeup_thresh,
-	CTLFLAG_RW, &vm_pageout_wakeup_thresh, 0,
+	CTLFLAG_RWTUN, &vm_pageout_wakeup_thresh, 0,
 	"free page threshold for waking up the pageout daemon");
 
-SYSCTL_INT(_vm, OID_AUTO, max_launder,
-	CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout");
-
 SYSCTL_INT(_vm, OID_AUTO, pageout_update_period,
-	CTLFLAG_RW, &vm_pageout_update_period, 0,
+	CTLFLAG_RWTUN, &vm_pageout_update_period, 0,
 	"Maximum active LRU update period");
   
-SYSCTL_INT(_vm, OID_AUTO, lowmem_period, CTLFLAG_RW, &lowmem_period, 0,
+SYSCTL_INT(_vm, OID_AUTO, lowmem_period, CTLFLAG_RWTUN, &lowmem_period, 0,
 	"Low memory callback period");
 
-#if defined(NO_SWAPPING)
-SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
-	CTLFLAG_RD, &vm_swap_enabled, 0, "Enable entire process swapout");
-SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
-	CTLFLAG_RD, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
-#else
-SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
-	CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout");
-SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
-	CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
-#endif
-
-SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts,
-	CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem");
-
 SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
-	CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");
+	CTLFLAG_RWTUN, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");
 
 static int pageout_lock_miss;
 SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
@@ -221,24 +189,39 @@
 	CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");
 
 SYSCTL_INT(_vm, OID_AUTO, pageout_oom_seq,
-	CTLFLAG_RW, &vm_pageout_oom_seq, 0,
+	CTLFLAG_RWTUN, &vm_pageout_oom_seq, 0,
 	"back-to-back calls to oom detector to start OOM");
 
-#define VM_PAGEOUT_PAGE_COUNT 16
-int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT;
+static int act_scan_laundry_weight = 3;
+SYSCTL_INT(_vm, OID_AUTO, act_scan_laundry_weight, CTLFLAG_RWTUN,
+    &act_scan_laundry_weight, 0,
+    "weight given to clean vs. dirty pages in active queue scans");
 
+static u_int vm_background_launder_target;
+SYSCTL_UINT(_vm, OID_AUTO, background_launder_target, CTLFLAG_RWTUN,
+    &vm_background_launder_target, 0,
+    "background laundering target, in pages");
+
+static u_int vm_background_launder_rate = 4096;
+SYSCTL_UINT(_vm, OID_AUTO, background_launder_rate, CTLFLAG_RWTUN,
+    &vm_background_launder_rate, 0,
+    "background laundering rate, in kilobytes per second");
+
+static u_int vm_background_launder_max = 20 * 1024;
+SYSCTL_UINT(_vm, OID_AUTO, background_launder_max, CTLFLAG_RWTUN,
+    &vm_background_launder_max, 0, "background laundering cap, in kilobytes");
+
+int vm_pageout_page_count = 32;
+
 int vm_page_max_wired;		/* XXX max # of wired pages system-wide */
 SYSCTL_INT(_vm, OID_AUTO, max_wired,
 	CTLFLAG_RW, &vm_page_max_wired, 0, "System-wide limit to wired page count");
 
+static u_int isqrt(u_int num);
 static boolean_t vm_pageout_fallback_object_lock(vm_page_t, vm_page_t *);
-static boolean_t vm_pageout_launder(struct vm_pagequeue *pq, int, vm_paddr_t,
-    vm_paddr_t);
-#if !defined(NO_SWAPPING)
-static void vm_pageout_map_deactivate_pages(vm_map_t, long);
-static void vm_pageout_object_deactivate_pages(pmap_t, vm_object_t, long);
-static void vm_req_vmdaemon(int req);
-#endif
+static int vm_pageout_launder(struct vm_domain *vmd, int launder,
+    bool in_shortfall);
+static void vm_pageout_laundry_worker(void *arg);
 static boolean_t vm_pageout_page_lock(vm_page_t, vm_page_t *);
 
 /*
@@ -352,41 +335,30 @@
 }
 
 /*
- * vm_pageout_clean:
- *
- * Clean the page and remove it from the laundry.
- * 
- * We set the busy bit to cause potential page faults on this page to
- * block.  Note the careful timing, however, the busy bit isn't set till
- * late and we cannot do anything that will mess with the page.
+ * Scan for pages at adjacent offsets within the given page's object that are
+ * eligible for laundering, form a cluster of these pages and the given page,
+ * and launder that cluster.
  */
 static int
-vm_pageout_clean(vm_page_t m)
+vm_pageout_cluster(vm_page_t m)
 {
 	vm_object_t object;
-	vm_page_t mc[2*vm_pageout_page_count], pb, ps;
-	int pageout_count;
-	int ib, is, page_base;
-	vm_pindex_t pindex = m->pindex;
+	vm_page_t mc[2 * vm_pageout_page_count], p, pb, ps;
+	vm_pindex_t pindex;
+	int ib, is, page_base, pageout_count;
 
-	vm_page_lock_assert(m, MA_OWNED);
+	vm_page_assert_locked(m);
 	object = m->object;
 	VM_OBJECT_ASSERT_WLOCKED(object);
+	pindex = m->pindex;
 
 	/*
-	 * It doesn't cost us anything to pageout OBJT_DEFAULT or OBJT_SWAP
-	 * with the new swapper, but we could have serious problems paging
-	 * out other object types if there is insufficient memory.  
-	 *
-	 * Unfortunately, checking free memory here is far too late, so the
-	 * check has been moved up a procedural level.
+	 * We can't clean the page if it is busy or held.
 	 */
+	vm_page_assert_unbusied(m);
+	KASSERT(m->hold_count == 0, ("page %p is held", m));
 
-	/*
-	 * Can't clean the page if it's busy or held.
-	 */
-	vm_page_assert_unbusied(m);
-	KASSERT(m->hold_count == 0, ("vm_pageout_clean: page %p is held", m));
+	pmap_remove_write(m);
 	vm_page_unlock(m);
 
 	mc[vm_pageout_page_count] = pb = ps = m;
@@ -396,33 +368,23 @@
 	is = 1;
 
 	/*
-	 * Scan object for clusterable pages.
+	 * We can cluster only if the page is not clean, busy, or held, and
+	 * the page is in the laundry queue.
 	 *
-	 * We can cluster ONLY if: ->> the page is NOT
-	 * clean, wired, busy, held, or mapped into a
-	 * buffer, and one of the following:
-	 * 1) The page is inactive, or a seldom used
-	 *    active page.
-	 * -or-
-	 * 2) we force the issue.
-	 *
 	 * During heavy mmap/modification loads the pageout
 	 * daemon can really fragment the underlying file
-	 * due to flushing pages out of order and not trying
-	 * align the clusters (which leave sporatic out-of-order
+	 * due to flushing pages out of order and not trying to
+	 * align the clusters (which leaves sporadic out-of-order
 	 * holes).  To solve this problem we do the reverse scan
 	 * first and attempt to align our cluster, then do a 
 	 * forward scan if room remains.
 	 */
 more:
-	while (ib && pageout_count < vm_pageout_page_count) {
-		vm_page_t p;
-
+	while (ib != 0 && pageout_count < vm_pageout_page_count) {
 		if (ib > pindex) {
 			ib = 0;
 			break;
 		}
-
 		if ((p = vm_page_prev(pb)) == NULL || vm_page_busied(p)) {
 			ib = 0;
 			break;
@@ -433,28 +395,27 @@
 			break;
 		}
 		vm_page_lock(p);
-		if (p->queue != PQ_INACTIVE ||
+		if (!vm_page_in_laundry(p) ||
 		    p->hold_count != 0) {	/* may be undergoing I/O */
 			vm_page_unlock(p);
 			ib = 0;
 			break;
 		}
+		pmap_remove_write(p);
 		vm_page_unlock(p);
 		mc[--page_base] = pb = p;
 		++pageout_count;
 		++ib;
+
 		/*
-		 * alignment boundry, stop here and switch directions.  Do
-		 * not clear ib.
+		 * We are at an alignment boundary.  Stop here, and switch
+		 * directions.  Do not clear ib.
 		 */
 		if ((pindex - (ib - 1)) % vm_pageout_page_count == 0)
 			break;
 	}
-
 	while (pageout_count < vm_pageout_page_count && 
 	    pindex + is < object->size) {
-		vm_page_t p;
-
 		if ((p = vm_page_next(ps)) == NULL || vm_page_busied(p))
 			break;
 		vm_page_test_dirty(p);
@@ -461,11 +422,12 @@
 		if (p->dirty == 0)
 			break;
 		vm_page_lock(p);
-		if (p->queue != PQ_INACTIVE ||
+		if (!vm_page_in_laundry(p) ||
 		    p->hold_count != 0) {	/* may be undergoing I/O */
 			vm_page_unlock(p);
 			break;
 		}
+		pmap_remove_write(p);
 		vm_page_unlock(p);
 		mc[page_base + pageout_count] = ps = p;
 		++pageout_count;
@@ -474,17 +436,14 @@
 
 	/*
 	 * If we exhausted our forward scan, continue with the reverse scan
-	 * when possible, even past a page boundry.  This catches boundry
-	 * conditions.
+	 * when possible, even past an alignment boundary.  This catches
+	 * boundary conditions.
 	 */
-	if (ib && pageout_count < vm_pageout_page_count)
+	if (ib != 0 && pageout_count < vm_pageout_page_count)
 		goto more;
 
-	/*
-	 * we allow reads during pageouts...
-	 */
-	return (vm_pageout_flush(&mc[page_base], pageout_count, 0, 0, NULL,
-	    NULL));
+	return (vm_pageout_flush(&mc[page_base], pageout_count,
+	    VM_PAGER_PUT_NOREUSE, 0, NULL, NULL));
 }
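
vm_pageout_cluster() grows a cluster of dirty pages around the pivot by scanning backward first, so the cluster tends to start on a vm_pageout_page_count boundary, and then forward while room remains. The standalone model below keeps only that two-direction scan (the kernel's second backward pass after the forward scan is omitted); dirty[], cluster(), and the sizes are all illustrative.

#include <stdbool.h>
#include <stdio.h>

#define	CLUSTER_MAX	8
#define	OBJ_SIZE	32

static bool dirty[OBJ_SIZE];

static int
cluster(int pindex, int out[2 * CLUSTER_MAX], int *basep)
{
	int base, count, ib, is;

	base = CLUSTER_MAX;
	out[base] = pindex;
	count = 1;
	ib = is = 1;

	/* Backward scan: stop at a clean page or an alignment boundary. */
	while (count < CLUSTER_MAX && pindex - ib >= 0 && dirty[pindex - ib]) {
		out[--base] = pindex - ib;
		count++;
		if ((pindex - ib) % CLUSTER_MAX == 0)
			break;
		ib++;
	}

	/* Forward scan with whatever room remains. */
	while (count < CLUSTER_MAX && pindex + is < OBJ_SIZE &&
	    dirty[pindex + is]) {
		out[base + count] = pindex + is;
		count++;
		is++;
	}
	*basep = base;
	return (count);
}

int
main(void)
{
	int run[2 * CLUSTER_MAX], base, i, n;

	for (i = 10; i <= 20; i++)
		dirty[i] = true;
	n = cluster(15, run, &base);
	printf("clustered %d pages:", n);
	for (i = 0; i < n; i++)
		printf(" %d", run[base + i]);
	printf("\n");
	return (0);
}
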
 
 /*
@@ -513,8 +472,8 @@
 	VM_OBJECT_ASSERT_WLOCKED(object);
 
 	/*
-	 * Initiate I/O.  Bump the vm_page_t->busy counter and
-	 * mark the pages read-only.
+	 * Initiate I/O.  Mark the pages busy and verify that they're valid
+	 * and read-only.
 	 *
 	 * We do not have to fixup the clean/dirty bits here... we can
 	 * allow the pager to do it after the I/O completes.
@@ -526,8 +485,9 @@
 		KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL,
 		    ("vm_pageout_flush: partially invalid page %p index %d/%d",
 			mc[i], i, count));
+		KASSERT((mc[i]->aflags & PGA_WRITEABLE) == 0,
+		    ("vm_pageout_flush: writeable page %p", mc[i]));
 		vm_page_sbusy(mc[i]);
-		pmap_remove_write(mc[i]);
 	}
 	vm_object_pip_add(object, count);
 
@@ -544,23 +504,33 @@
 		    ("vm_pageout_flush: page %p is not write protected", mt));
 		switch (pageout_status[i]) {
 		case VM_PAGER_OK:
+			vm_page_lock(mt);
+			if (vm_page_in_laundry(mt))
+				vm_page_deactivate_noreuse(mt);
+			vm_page_unlock(mt);
+			/* FALLTHROUGH */
 		case VM_PAGER_PEND:
 			numpagedout++;
 			break;
 		case VM_PAGER_BAD:
 			/*
-			 * Page outside of range of object. Right now we
-			 * essentially lose the changes by pretending it
-			 * worked.
+			 * The page is outside the object's range.  We pretend
+			 * that the page out worked and clean the page, so the
+			 * changes will be lost if the page is reclaimed by
+			 * the page daemon.
 			 */
 			vm_page_undirty(mt);
+			vm_page_lock(mt);
+			if (vm_page_in_laundry(mt))
+				vm_page_deactivate_noreuse(mt);
+			vm_page_unlock(mt);
 			break;
 		case VM_PAGER_ERROR:
 		case VM_PAGER_FAIL:
 			/*
-			 * If page couldn't be paged out, then reactivate the
-			 * page so it doesn't clog the inactive list.  (We
-			 * will try paging out it again later).
+			 * If the page couldn't be paged out, then reactivate
+			 * it so that it doesn't clog the laundry and inactive
+			 * queues.  (We will try paging it out again later).
 			 */
 			vm_page_lock(mt);
 			vm_page_activate(mt);
@@ -583,11 +553,6 @@
 		if (pageout_status[i] != VM_PAGER_PEND) {
 			vm_object_pip_wakeup(object);
 			vm_page_sunbusy(mt);
-			if (vm_page_count_severe()) {
-				vm_page_lock(mt);
-				vm_page_try_to_cache(mt);
-				vm_page_unlock(mt);
-			}
 		}
 	}
 	if (prunlen != NULL)
@@ -595,24 +560,172 @@
 	return (numpagedout);
 }
 
-static boolean_t
-vm_pageout_launder(struct vm_pagequeue *pq, int tries, vm_paddr_t low,
-    vm_paddr_t high)
+/*
+ * Attempt to acquire all of the necessary locks to launder a page and
+ * then call through the clustering layer to PUTPAGES.  Wait a short
+ * time for a vnode lock.
+ *
+ * Requires the page and object lock on entry, releases both before return.
+ * Returns 0 on success and an errno otherwise.
+ */
+static int
+vm_pageout_clean(vm_page_t m, int *numpagedout)
 {
+	struct vnode *vp;
 	struct mount *mp;
-	struct vnode *vp;
 	vm_object_t object;
-	vm_paddr_t pa;
-	vm_page_t m, m_tmp, next;
-	int lockmode;
+	vm_pindex_t pindex;
+	int error, lockmode;
 
+	vm_page_assert_locked(m);
+	object = m->object;
+	VM_OBJECT_ASSERT_WLOCKED(object);
+	error = 0;
+	vp = NULL;
+	mp = NULL;
+
+	/*
+	 * The object is already known NOT to be dead.   It
+	 * is possible for the vget() to block the whole
+	 * pageout daemon, but the new low-memory handling
+	 * code should prevent it.
+	 *
+	 * We can't wait forever for the vnode lock, we might
+	 * deadlock due to a vn_read() getting stuck in
+	 * vm_wait while holding this vnode.  We skip the 
+	 * vnode if we can't get it in a reasonable amount
+	 * of time.
+	 */
+	if (object->type == OBJT_VNODE) {
+		vm_page_unlock(m);
+		vp = object->handle;
+		if (vp->v_type == VREG &&
+		    vn_start_write(vp, &mp, V_NOWAIT) != 0) {
+			mp = NULL;
+			error = EDEADLK;
+			goto unlock_all;
+		}
+		KASSERT(mp != NULL,
+		    ("vp %p with NULL v_mount", vp));
+		vm_object_reference_locked(object);
+		pindex = m->pindex;
+		VM_OBJECT_WUNLOCK(object);
+		lockmode = MNT_SHARED_WRITES(vp->v_mount) ?
+		    LK_SHARED : LK_EXCLUSIVE;
+		if (vget(vp, lockmode | LK_TIMELOCK, curthread)) {
+			vp = NULL;
+			error = EDEADLK;
+			goto unlock_mp;
+		}
+		VM_OBJECT_WLOCK(object);
+
+		/*
+		 * Ensure that the object and vnode were not disassociated
+		 * while locks were dropped.
+		 */
+		if (vp->v_object != object) {
+			error = ENOENT;
+			goto unlock_all;
+		}
+		vm_page_lock(m);
+
+		/*
+		 * While the object and page were unlocked, the page
+		 * may have been:
+		 * (1) moved to a different queue,
+		 * (2) reallocated to a different object,
+		 * (3) reallocated to a different offset, or
+		 * (4) cleaned.
+		 */
+		if (!vm_page_in_laundry(m) || m->object != object ||
+		    m->pindex != pindex || m->dirty == 0) {
+			vm_page_unlock(m);
+			error = ENXIO;
+			goto unlock_all;
+		}
+
+		/*
+		 * The page may have been busied or held while the object
+		 * and page locks were released.
+		 */
+		if (vm_page_busied(m) || m->hold_count != 0) {
+			vm_page_unlock(m);
+			error = EBUSY;
+			goto unlock_all;
+		}
+	}
+
+	/*
+	 * If a page is dirty, then it is either being washed
+	 * (but not yet cleaned) or it is still in the
+	 * laundry.  If it is still in the laundry, then we
+	 * start the cleaning operation. 
+	 */
+	if ((*numpagedout = vm_pageout_cluster(m)) == 0)
+		error = EIO;
+
+unlock_all:
+	VM_OBJECT_WUNLOCK(object);
+
+unlock_mp:
+	vm_page_lock_assert(m, MA_NOTOWNED);
+	if (mp != NULL) {
+		if (vp != NULL)
+			vput(vp);
+		vm_object_deallocate(object);
+		vn_finished_write(mp);
+	}
+
+	return (error);
+}
+
+/*
+ * Attempt to launder the specified number of pages.
+ *
+ * Returns the number of pages successfully laundered.
+ */
+static int
+vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall)
+{
+	struct vm_pagequeue *pq;
+	vm_object_t object;
+	vm_page_t m, next;
+	int act_delta, error, maxscan, numpagedout, starting_target;
+	int vnodes_skipped;
+	bool pageout_ok, queue_locked;
+
+	starting_target = launder;
+	vnodes_skipped = 0;
+
+	/*
+	 * Scan the laundry queue for pages eligible to be laundered.  We stop
+	 * once the target number of dirty pages have been laundered, or once
+	 * we've reached the end of the queue.  A single iteration of this loop
+	 * may cause more than one page to be laundered because of clustering.
+	 *
+	 * maxscan ensures that we don't re-examine requeued pages.  Any
+	 * additional pages written as part of a cluster are subtracted from
+	 * maxscan since they must be taken from the laundry queue.
+	 */
+	pq = &vmd->vmd_pagequeues[PQ_LAUNDRY];
+	maxscan = pq->pq_cnt;
+
 	vm_pagequeue_lock(pq);
-	TAILQ_FOREACH_SAFE(m, &pq->pq_pl, plinks.q, next) {
+	queue_locked = true;
+	for (m = TAILQ_FIRST(&pq->pq_pl);
+	    m != NULL && maxscan-- > 0 && launder > 0;
+	    m = next) {
+		vm_pagequeue_assert_locked(pq);
+		KASSERT(queue_locked, ("unlocked laundry queue"));
+		KASSERT(vm_page_in_laundry(m),
+		    ("page %p has an inconsistent queue", m));
+		next = TAILQ_NEXT(m, plinks.q);
 		if ((m->flags & PG_MARKER) != 0)
 			continue;
-		pa = VM_PAGE_TO_PHYS(m);
-		if (pa < low || pa + PAGE_SIZE > high)
-			continue;
+		KASSERT((m->flags & PG_FICTITIOUS) == 0,
+		    ("PG_FICTITIOUS page %p cannot be in laundry queue", m));
+		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
+		    ("VPO_UNMANAGED page %p cannot be in laundry queue", m));
 		if (!vm_pageout_page_lock(m, &next) || m->hold_count != 0) {
 			vm_page_unlock(m);
 			continue;
@@ -621,326 +734,341 @@
 		if ((!VM_OBJECT_TRYWLOCK(object) &&
 		    (!vm_pageout_fallback_object_lock(m, &next) ||
 		    m->hold_count != 0)) || vm_page_busied(m)) {
+			VM_OBJECT_WUNLOCK(object);
 			vm_page_unlock(m);
-			VM_OBJECT_WUNLOCK(object);
 			continue;
 		}
-		vm_page_test_dirty(m);
-		if (m->dirty == 0 && object->ref_count != 0)
-			pmap_remove_all(m);
-		if (m->dirty != 0) {
-			vm_page_unlock(m);
-			if (tries == 0 || (object->flags & OBJ_DEAD) != 0) {
-				VM_OBJECT_WUNLOCK(object);
-				continue;
-			}
-			if (object->type == OBJT_VNODE) {
-				vm_pagequeue_unlock(pq);
-				vp = object->handle;
-				vm_object_reference_locked(object);
-				VM_OBJECT_WUNLOCK(object);
-				(void)vn_start_write(vp, &mp, V_WAIT);
-				lockmode = MNT_SHARED_WRITES(vp->v_mount) ?
-				    LK_SHARED : LK_EXCLUSIVE;
-				vn_lock(vp, lockmode | LK_RETRY);
-				VM_OBJECT_WLOCK(object);
-				vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
-				VM_OBJECT_WUNLOCK(object);
-				VOP_UNLOCK(vp, 0);
-				vm_object_deallocate(object);
-				vn_finished_write(mp);
-				return (TRUE);
-			} else if (object->type == OBJT_SWAP ||
-			    object->type == OBJT_DEFAULT) {
-				vm_pagequeue_unlock(pq);
-				m_tmp = m;
-				vm_pageout_flush(&m_tmp, 1, VM_PAGER_PUT_SYNC,
-				    0, NULL, NULL);
-				VM_OBJECT_WUNLOCK(object);
-				return (TRUE);
-			}
-		} else {
-			/*
-			 * Dequeue here to prevent lock recursion in
-			 * vm_page_cache().
-			 */
-			vm_page_dequeue_locked(m);
-			vm_page_cache(m);
-			vm_page_unlock(m);
+
+		/*
+		 * Unlock the laundry queue, invalidating the 'next' pointer.
+		 * Use a marker to remember our place in the laundry queue.
+		 */
+		TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_laundry_marker,
+		    plinks.q);
+		vm_pagequeue_unlock(pq);
+		queue_locked = false;
+
+		/*
+		 * Invalid pages can be easily freed.  They cannot be
+		 * mapped; vm_page_free() asserts this.
+		 */
+		if (m->valid == 0)
+			goto free_page;
+
+		/*
+		 * If the page has been referenced and the object is not dead,
+		 * reactivate or requeue the page depending on whether the
+		 * object is mapped.
+		 */
+		if ((m->aflags & PGA_REFERENCED) != 0) {
+			vm_page_aflag_clear(m, PGA_REFERENCED);
+			act_delta = 1;
+		} else
+			act_delta = 0;
+		if (object->ref_count != 0)
+			act_delta += pmap_ts_referenced(m);
+		else {
+			KASSERT(!pmap_page_is_mapped(m),
+			    ("page %p is mapped", m));
 		}
-		VM_OBJECT_WUNLOCK(object);
-	}
-	vm_pagequeue_unlock(pq);
-	return (FALSE);
-}
+		if (act_delta != 0) {
+			if (object->ref_count != 0) {
+				PCPU_INC(cnt.v_reactivated);
+				vm_page_activate(m);
 
-/*
- * Increase the number of cached pages.  The specified value, "tries",
- * determines which categories of pages are cached:
- *
- *  0: All clean, inactive pages within the specified physical address range
- *     are cached.  Will not sleep.
- *  1: The vm_lowmem handlers are called.  All inactive pages within
- *     the specified physical address range are cached.  May sleep.
- *  2: The vm_lowmem handlers are called.  All inactive and active pages
- *     within the specified physical address range are cached.  May sleep.
- */
-void
-vm_pageout_grow_cache(int tries, vm_paddr_t low, vm_paddr_t high)
-{
-	int actl, actmax, inactl, inactmax, dom, initial_dom;
-	static int start_dom = 0;
+				/*
+				 * Increase the activation count if the page
+				 * was referenced while in the laundry queue.
+				 * This makes it less likely that the page will
+				 * be returned prematurely to the inactive
+				 * queue.
+ 				 */
+				m->act_count += act_delta + ACT_ADVANCE;
 
-	if (tries > 0) {
+				/*
+				 * If this was a background laundering, count
+				 * activated pages towards our target.  The
+				 * purpose of background laundering is to ensure
+				 * that pages are eventually cycled through the
+				 * laundry queue, and an activation is a valid
+				 * way out.
+				 */
+				if (!in_shortfall)
+					launder--;
+				goto drop_page;
+			} else if ((object->flags & OBJ_DEAD) == 0)
+				goto requeue_page;
+		}
+
 		/*
-		 * Decrease registered cache sizes.  The vm_lowmem handlers
-		 * may acquire locks and/or sleep, so they can only be invoked
-		 * when "tries" is greater than zero.
+		 * If the page appears to be clean at the machine-independent
+		 * layer, then remove all of its mappings from the pmap in
+		 * anticipation of freeing it.  If, however, any of the page's
+		 * mappings allow write access, then the page may still be
+		 * modified until the last of those mappings are removed.
 		 */
-		SDT_PROBE0(vm, , , vm__lowmem_cache);
-		EVENTHANDLER_INVOKE(vm_lowmem, 0);
+		if (object->ref_count != 0) {
+			vm_page_test_dirty(m);
+			if (m->dirty == 0)
+				pmap_remove_all(m);
+		}
 
 		/*
-		 * We do this explicitly after the caches have been drained
-		 * above.
+		 * Clean pages are freed, and dirty pages are paged out unless
+		 * they belong to a dead object.  Requeueing dirty pages from
+		 * dead objects is pointless, as they are being paged out and
+		 * freed by the thread that destroyed the object.
 		 */
-		uma_reclaim();
+		if (m->dirty == 0) {
+free_page:
+			vm_page_free(m);
+			PCPU_INC(cnt.v_dfree);
+		} else if ((object->flags & OBJ_DEAD) == 0) {
+			if (object->type != OBJT_SWAP &&
+			    object->type != OBJT_DEFAULT)
+				pageout_ok = true;
+			else if (disable_swap_pageouts)
+				pageout_ok = false;
+			else
+				pageout_ok = true;
+			if (!pageout_ok) {
+requeue_page:
+				vm_pagequeue_lock(pq);
+				queue_locked = true;
+				vm_page_requeue_locked(m);
+				goto drop_page;
+			}
+
+			/*
+			 * Form a cluster with adjacent, dirty pages from the
+			 * same object, and page out that entire cluster.
+			 *
+			 * The adjacent, dirty pages must also be in the
+			 * laundry.  However, their mappings are not checked
+			 * for new references.  Consequently, a recently
+			 * referenced page may be paged out.  However, that
+			 * page will not be prematurely reclaimed.  After page
+			 * out, the page will be placed in the inactive queue,
+			 * where any new references will be detected and the
+			 * page reactivated.
+			 */
+			error = vm_pageout_clean(m, &numpagedout);
+			if (error == 0) {
+				launder -= numpagedout;
+				maxscan -= numpagedout - 1;
+			} else if (error == EDEADLK) {
+				pageout_lock_miss++;
+				vnodes_skipped++;
+			}
+			goto relock_queue;
+		}
+drop_page:
+		vm_page_unlock(m);
+		VM_OBJECT_WUNLOCK(object);
+relock_queue:
+		if (!queue_locked) {
+			vm_pagequeue_lock(pq);
+			queue_locked = true;
+		}
+		next = TAILQ_NEXT(&vmd->vmd_laundry_marker, plinks.q);
+		TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_laundry_marker, plinks.q);
 	}
+	vm_pagequeue_unlock(pq);
 
 	/*
-	 * Make the next scan start on the next domain.
+	 * Wakeup the sync daemon if we skipped a vnode in a writeable object
+	 * and we didn't launder enough pages.
 	 */
-	initial_dom = atomic_fetchadd_int(&start_dom, 1) % vm_ndomains;
+	if (vnodes_skipped > 0 && launder > 0)
+		(void)speedup_syncer();
 
-	inactl = 0;
-	inactmax = cnt.v_inactive_count;
-	actl = 0;
-	actmax = tries < 2 ? 0 : cnt.v_active_count;
-	dom = initial_dom;
-
-	/*
-	 * Scan domains in round-robin order, first inactive queues,
-	 * then active.  Since domain usually owns large physically
-	 * contiguous chunk of memory, it makes sense to completely
-	 * exhaust one domain before switching to next, while growing
-	 * the pool of contiguous physical pages.
-	 *
-	 * Do not even start launder a domain which cannot contain
-	 * the specified address range, as indicated by segments
-	 * constituting the domain.
-	 */
-again_inact:
-	if (inactl < inactmax) {
-		if (vm_phys_domain_intersects(vm_dom[dom].vmd_segs,
-		    low, high) &&
-		    vm_pageout_launder(&vm_dom[dom].vmd_pagequeues[PQ_INACTIVE],
-		    tries, low, high)) {
-			inactl++;
-			goto again_inact;
-		}
-		if (++dom == vm_ndomains)
-			dom = 0;
-		if (dom != initial_dom)
-			goto again_inact;
-	}
-again_act:
-	if (actl < actmax) {
-		if (vm_phys_domain_intersects(vm_dom[dom].vmd_segs,
-		    low, high) &&
-		    vm_pageout_launder(&vm_dom[dom].vmd_pagequeues[PQ_ACTIVE],
-		      tries, low, high)) {
-			actl++;
-			goto again_act;
-		}
-		if (++dom == vm_ndomains)
-			dom = 0;
-		if (dom != initial_dom)
-			goto again_act;
-	}
+	return (starting_target - launder);
 }
 
-#if !defined(NO_SWAPPING)
 /*
- *	vm_pageout_object_deactivate_pages
- *
- *	Deactivate enough pages to satisfy the inactive target
- *	requirements.
- *
- *	The object and map must be locked.
+ * Compute the integer square root.
  */
-static void
-vm_pageout_object_deactivate_pages(pmap_t pmap, vm_object_t first_object,
-    long desired)
+static u_int
+isqrt(u_int num)
 {
-	vm_object_t backing_object, object;
-	vm_page_t p;
-	int act_delta, remove_mode;
+	u_int bit, root, tmp;
 
-	VM_OBJECT_ASSERT_LOCKED(first_object);
-	if ((first_object->flags & OBJ_FICTITIOUS) != 0)
-		return;
-	for (object = first_object;; object = backing_object) {
-		if (pmap_resident_count(pmap) <= desired)
-			goto unlock_return;
-		VM_OBJECT_ASSERT_LOCKED(object);
-		if ((object->flags & OBJ_UNMANAGED) != 0 ||
-		    object->paging_in_progress != 0)
-			goto unlock_return;
-
-		remove_mode = 0;
-		if (object->shadow_count > 1)
-			remove_mode = 1;
-		/*
-		 * Scan the object's entire memory queue.
-		 */
-		TAILQ_FOREACH(p, &object->memq, listq) {
-			if (pmap_resident_count(pmap) <= desired)
-				goto unlock_return;
-			if (vm_page_busied(p))
-				continue;
-			PCPU_INC(cnt.v_pdpages);
-			vm_page_lock(p);
-			if (p->wire_count != 0 || p->hold_count != 0 ||
-			    !pmap_page_exists_quick(pmap, p)) {
-				vm_page_unlock(p);
-				continue;
-			}
-			act_delta = pmap_ts_referenced(p);
-			if ((p->aflags & PGA_REFERENCED) != 0) {
-				if (act_delta == 0)
-					act_delta = 1;
-				vm_page_aflag_clear(p, PGA_REFERENCED);
-			}
-			if (p->queue != PQ_ACTIVE && act_delta != 0) {
-				vm_page_activate(p);
-				p->act_count += act_delta;
-			} else if (p->queue == PQ_ACTIVE) {
-				if (act_delta == 0) {
-					p->act_count -= min(p->act_count,
-					    ACT_DECLINE);
-					if (!remove_mode && p->act_count == 0) {
-						pmap_remove_all(p);
-						vm_page_deactivate(p);
-					} else
-						vm_page_requeue(p);
-				} else {
-					vm_page_activate(p);
-					if (p->act_count < ACT_MAX -
-					    ACT_ADVANCE)
-						p->act_count += ACT_ADVANCE;
-					vm_page_requeue(p);
-				}
-			} else if (p->queue == PQ_INACTIVE)
-				pmap_remove_all(p);
-			vm_page_unlock(p);
+	bit = 1u << ((NBBY * sizeof(u_int)) - 2);
+	while (bit > num)
+		bit >>= 2;
+	root = 0;
+	while (bit != 0) {
+		tmp = root + bit;
+		root >>= 1;
+		if (num >= tmp) {
+			num -= tmp;
+			root += bit;
 		}
-		if ((backing_object = object->backing_object) == NULL)
-			goto unlock_return;
-		VM_OBJECT_RLOCK(backing_object);
-		if (object != first_object)
-			VM_OBJECT_RUNLOCK(object);
+		bit >>= 2;
 	}
-unlock_return:
-	if (object != first_object)
-		VM_OBJECT_RUNLOCK(object);
+	return (root);
 }
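
The isqrt() added above is the classic bit-by-bit integer square root; the laundry thread below uses it to let the background-laundering trigger grow slowly with the number of page daemon wakeups.  The standalone copy below implements the same method with a quick self-test; NBBY (bits per byte) is written out as 8, and the snippet is illustration only, not part of the committed file.

#include <assert.h>
#include <stdio.h>

/* Bit-by-bit integer square root, same method as isqrt() above. */
static unsigned int
isqrt(unsigned int num)
{
	unsigned int bit, root, tmp;

	/* Highest power of four representable in the type. */
	bit = 1u << (8 * sizeof(unsigned int) - 2);
	while (bit > num)
		bit >>= 2;
	root = 0;
	while (bit != 0) {
		tmp = root + bit;
		root >>= 1;
		if (num >= tmp) {
			num -= tmp;
			root += bit;
		}
		bit >>= 2;
	}
	return (root);
}

int
main(void)
{
	unsigned int n, r;

	for (n = 0; n < 100000; n++) {
		r = isqrt(n);
		assert(r * r <= n && (r + 1) * (r + 1) > n);
	}
	printf("isqrt(10) = %u, isqrt(99) = %u\n", isqrt(10), isqrt(99));
	return (0);
}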
 
 /*
- * deactivate some number of pages in a map, try to do it fairly, but
- * that is really hard to do.
+ * Perform the work of the laundry thread: periodically wake up and determine
+ * whether any pages need to be laundered.  If so, determine the number of pages
+ * that need to be laundered, and launder them.
  */
 static void
-vm_pageout_map_deactivate_pages(map, desired)
-	vm_map_t map;
-	long desired;
+vm_pageout_laundry_worker(void *arg)
 {
-	vm_map_entry_t tmpe;
-	vm_object_t obj, bigobj;
-	int nothingwired;
+	struct vm_domain *domain;
+	struct vm_pagequeue *pq;
+	uint64_t nclean, ndirty;
+	u_int last_launder, wakeups;
+	int domidx, last_target, launder, shortfall, shortfall_cycle, target;
+	bool in_shortfall;
 
-	if (!vm_map_trylock(map))
-		return;
+	domidx = (uintptr_t)arg;
+	domain = &vm_dom[domidx];
+	pq = &domain->vmd_pagequeues[PQ_LAUNDRY];
+	KASSERT(domain->vmd_segs != 0, ("domain without segments"));
+	vm_pageout_init_marker(&domain->vmd_laundry_marker, PQ_LAUNDRY);
 
-	bigobj = NULL;
-	nothingwired = TRUE;
+	shortfall = 0;
+	in_shortfall = false;
+	shortfall_cycle = 0;
+	target = 0;
+	last_launder = 0;
 
 	/*
-	 * first, search out the biggest object, and try to free pages from
-	 * that.
+	 * The pageout laundry worker is never done, so loop forever.
 	 */
-	tmpe = map->header.next;
-	while (tmpe != &map->header) {
-		if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
-			obj = tmpe->object.vm_object;
-			if (obj != NULL && VM_OBJECT_TRYRLOCK(obj)) {
-				if (obj->shadow_count <= 1 &&
-				    (bigobj == NULL ||
-				     bigobj->resident_page_count < obj->resident_page_count)) {
-					if (bigobj != NULL)
-						VM_OBJECT_RUNLOCK(bigobj);
-					bigobj = obj;
-				} else
-					VM_OBJECT_RUNLOCK(obj);
-			}
+	for (;;) {
+		KASSERT(target >= 0, ("negative target %d", target));
+		KASSERT(shortfall_cycle >= 0,
+		    ("negative cycle %d", shortfall_cycle));
+		launder = 0;
+		wakeups = VM_METER_PCPU_CNT(v_pdwakeups);
+
+		/*
+		 * First determine whether we need to launder pages to meet a
+		 * shortage of free pages.
+		 */
+		if (shortfall > 0) {
+			in_shortfall = true;
+			shortfall_cycle = VM_LAUNDER_RATE / VM_INACT_SCAN_RATE;
+			target = shortfall;
+		} else if (!in_shortfall)
+			goto trybackground;
+		else if (shortfall_cycle == 0 || vm_laundry_target() <= 0) {
+			/*
+			 * We recently entered shortfall and began laundering
+			 * pages.  If we have completed that laundering run
+			 * (and we are no longer in shortfall) or we have met
+			 * our laundry target through other activity, then we
+			 * can stop laundering pages.
+			 */
+			in_shortfall = false;
+			target = 0;
+			goto trybackground;
 		}
-		if (tmpe->wired_count > 0)
-			nothingwired = FALSE;
-		tmpe = tmpe->next;
-	}
+		last_launder = wakeups;
+		launder = target / shortfall_cycle--;
+		goto dolaundry;
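
When the page daemon signals a shortfall, the code above spreads the laundering target over the remaining laundering intervals of one inactive-scan period via "target / shortfall_cycle--".  The sketch below is a minimal model of that pacing, assuming five intervals per period and that every request is laundered in full; the real interval count is VM_LAUNDER_RATE / VM_INACT_SCAN_RATE, whose values are not visible in this diff.

#include <stdio.h>

int
main(void)
{
	int target = 1000;	/* hypothetical shortfall, in pages */
	int cycle = 5;		/* assumed VM_LAUNDER_RATE / VM_INACT_SCAN_RATE */
	int launder;

	while (cycle > 0 && target > 0) {
		/* Spread what is left over the remaining intervals. */
		launder = target / cycle--;
		/* Assume the scan laundered everything it was asked to. */
		target -= launder < target ? launder : target;
		printf("laundered %d pages, %d left, %d intervals to go\n",
		    launder, target, cycle);
	}
	return (0);
}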
 
-	if (bigobj != NULL) {
-		vm_pageout_object_deactivate_pages(map->pmap, bigobj, desired);
-		VM_OBJECT_RUNLOCK(bigobj);
-	}
-	/*
-	 * Next, hunt around for other pages to deactivate.  We actually
-	 * do this search sort of wrong -- .text first is not the best idea.
-	 */
-	tmpe = map->header.next;
-	while (tmpe != &map->header) {
-		if (pmap_resident_count(vm_map_pmap(map)) <= desired)
-			break;
-		if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
-			obj = tmpe->object.vm_object;
-			if (obj != NULL) {
-				VM_OBJECT_RLOCK(obj);
-				vm_pageout_object_deactivate_pages(map->pmap, obj, desired);
-				VM_OBJECT_RUNLOCK(obj);
+		/*
+		 * There's no immediate need to launder any pages; see if we
+		 * meet the conditions to perform background laundering:
+		 *
+		 * 1. The ratio of dirty to clean inactive pages exceeds the
+		 *    background laundering threshold and the pagedaemon has
+		 *    been woken up to reclaim pages since our last
+		 *    laundering, or
+		 * 2. we haven't yet reached the target of the current
+		 *    background laundering run.
+		 *
+		 * The background laundering threshold is not a constant.
+		 * Instead, it is a slowly growing function of the number of
+		 * page daemon wakeups since the last laundering.  Thus, as the
+		 * ratio of dirty to clean inactive pages grows, the amount of
+		 * memory pressure required to trigger laundering decreases.
+		 */
+trybackground:
+		nclean = vm_cnt.v_inactive_count + vm_cnt.v_free_count;
+		ndirty = vm_cnt.v_laundry_count;
+		if (target == 0 && wakeups != last_launder &&
+		    ndirty * isqrt(wakeups - last_launder) >= nclean) {
+			target = vm_background_launder_target;
+		}
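
The trybackground test above starts a background laundering run once ndirty * isqrt(wakeups since the last run) >= nclean, so the dirty-to-clean ratio needed to trigger laundering shrinks as page daemon wakeups accumulate.  Below is a self-contained illustration with invented queue sizes; the simple loop stands in for isqrt().

#include <stdio.h>

/* Naive integer square root, adequate for small inputs. */
static unsigned int
isqrt_small(unsigned int n)
{
	unsigned int r = 0;

	while ((r + 1) * (r + 1) <= n)
		r++;
	return (r);
}

int
main(void)
{
	unsigned long long nclean = 200000;	/* inactive + free, invented */
	unsigned long long ndirty = 25000;	/* laundry queue, invented */
	unsigned int dw;

	/*
	 * The dirty:clean ratio is 1:8, so the trigger first holds once
	 * isqrt(wakeups - last_launder) reaches 8, i.e. after 64 page
	 * daemon wakeups without a laundering run.
	 */
	for (dw = 1; dw <= 100; dw++) {
		if (ndirty * isqrt_small(dw) >= nclean) {
			printf("background laundering after %u wakeups\n", dw);
			break;
		}
	}
	return (0);
}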
+
+		/*
+		 * We have a non-zero background laundering target.  If we've
+		 * laundered up to our maximum without observing a page daemon
+		 * wakeup, just stop.  This is a safety belt that ensures we
+		 * don't launder an excessive amount if memory pressure is low
+		 * and the ratio of dirty to clean pages is large.  Otherwise,
+		 * proceed at the background laundering rate.
+		 */
+		if (target > 0) {
+			if (wakeups != last_launder) {
+				last_launder = wakeups;
+				last_target = target;
+			} else if (last_target - target >=
+			    vm_background_launder_max * PAGE_SIZE / 1024) {
+				target = 0;
 			}
+			launder = vm_background_launder_rate * PAGE_SIZE / 1024;
+			launder /= VM_LAUNDER_RATE;
+			if (launder > target)
+				launder = target;
 		}
-		tmpe = tmpe->next;
-	}
 
-#ifdef __ia64__
-	/*
-	 * Remove all non-wired, managed mappings if a process is swapped out.
-	 * This will free page table pages.
-	 */
-	if (desired == 0)
-		pmap_remove_pages(map->pmap);
-#else
-	/*
-	 * Remove all mappings if a process is swapped out, this will free page
-	 * table pages.
-	 */
-	if (desired == 0 && nothingwired) {
-		pmap_remove(vm_map_pmap(map), vm_map_min(map),
-		    vm_map_max(map));
+dolaundry:
+		if (launder > 0) {
+			/*
+			 * Because of I/O clustering, the number of laundered
+			 * pages could exceed "target" by the maximum size of
+			 * a cluster minus one. 
+			 */
+			target -= min(vm_pageout_launder(domain, launder,
+			    in_shortfall), target);
+			pause("laundp", hz / VM_LAUNDER_RATE);
+		}
+
+		/*
+		 * If we're not currently laundering pages and the page daemon
+		 * hasn't posted a new request, sleep until the page daemon
+		 * kicks us.
+		 */
+		vm_pagequeue_lock(pq);
+		if (target == 0 && vm_laundry_request == VM_LAUNDRY_IDLE)
+			(void)mtx_sleep(&vm_laundry_request,
+			    vm_pagequeue_lockptr(pq), PVM, "launds", 0);
+
+		/*
+		 * If the pagedaemon has indicated that it's in shortfall, start
+		 * a shortfall laundering unless we're already in the middle of
+		 * one.  This may preempt a background laundering.
+		 */
+		if (vm_laundry_request == VM_LAUNDRY_SHORTFALL &&
+		    (!in_shortfall || shortfall_cycle == 0)) {
+			shortfall = vm_laundry_target() + vm_pageout_deficit;
+			target = 0;
+		} else
+			shortfall = 0;
+
+		if (target == 0)
+			vm_laundry_request = VM_LAUNDRY_IDLE;
+		vm_pagequeue_unlock(pq);
 	}
-#endif
-
-	vm_map_unlock(map);
 }
-#endif		/* !defined(NO_SWAPPING) */
 
 /*
  *	vm_pageout_scan does the dirty work for the pageout daemon.
  *
- *	pass 0 - Update active LRU/deactivate pages
- *	pass 1 - Move inactive to cache or free
- *	pass 2 - Launder dirty pages
+ *	pass == 0: Update active LRU/deactivate pages
+ *	pass >= 1: Free inactive pages
+ *
+ * Returns true if pass was zero or enough pages were freed by the inactive
+ * queue scan to meet the target.
  */
-static void
+static bool
 vm_pageout_scan(struct vm_domain *vmd, int pass)
 {
 	vm_page_t m, next;
@@ -947,10 +1075,8 @@
 	struct vm_pagequeue *pq;
 	vm_object_t object;
 	long min_scan;
-	int act_delta, addl_page_shortage, deficit, maxscan, page_shortage;
-	int vnodes_skipped = 0;
-	int maxlaunder, scan_tick, scanned, starting_page_shortage;
-	int lockmode;
+	int act_delta, addl_page_shortage, deficit, inactq_shortage, maxscan;
+	int page_shortage, scan_tick, scanned, starting_page_shortage;
 	boolean_t queue_locked;
 
 	/*
@@ -981,8 +1107,9 @@
 	addl_page_shortage = 0;
 
 	/*
-	 * Calculate the number of pages we want to either free or move
-	 * to the cache.
+	 * Calculate the number of pages that we want to free.  This number
+	 * can be negative if many pages are freed between the wakeup call to
+	 * the page daemon and this calculation.
 	 */
 	if (pass > 0) {
 		deficit = atomic_readandclear_int(&vm_pageout_deficit);
@@ -992,27 +1119,11 @@
 	starting_page_shortage = page_shortage;
 
 	/*
-	 * maxlaunder limits the number of dirty pages we flush per scan.
-	 * For most systems a smaller value (16 or 32) is more robust under
-	 * extreme memory and disk pressure because any unnecessary writes
-	 * to disk can result in extreme performance degredation.  However,
-	 * systems with excessive dirty pages (especially when MAP_NOSYNC is
-	 * used) will die horribly with limited laundering.  If the pageout
-	 * daemon cannot clean enough pages in the first pass, we let it go
-	 * all out in succeeding passes.
+	 * Start scanning the inactive queue for pages that we can free.  The
+	 * scan will stop when we reach the target or we have scanned the
+	 * entire queue.  (Note that m->act_count is not used to make
+	 * decisions for the inactive queue, only for the active queue.)
 	 */
-	if ((maxlaunder = vm_max_launder) <= 1)
-		maxlaunder = 1;
-	if (pass > 1)
-		maxlaunder = 10000;
-
-	/*
-	 * Start scanning the inactive queue for pages we can move to the
-	 * cache or free.  The scan will stop when the target is reached or
-	 * we have scanned the entire inactive queue.  Note that m->act_count
-	 * is not used to form decisions for the inactive queue, only for the
-	 * active queue.
-	 */
 	pq = &vmd->vmd_pagequeues[PQ_INACTIVE];
 	maxscan = pq->pq_cnt;
 	vm_pagequeue_lock(pq);
@@ -1022,7 +1133,7 @@
 	     m = next) {
 		vm_pagequeue_assert_locked(pq);
 		KASSERT(queue_locked, ("unlocked inactive queue"));
-		KASSERT(m->queue == PQ_INACTIVE, ("Inactive queue %p", m));
+		KASSERT(vm_page_inactive(m), ("Inactive queue %p", m));
 
 		PCPU_INC(cnt.v_pdpages);
 		next = TAILQ_NEXT(m, plinks.q);
@@ -1044,55 +1155,76 @@
 		 * different position within the queue.  In either
 		 * case, addl_page_shortage should not be incremented.
 		 */
-		if (!vm_pageout_page_lock(m, &next)) {
-			vm_page_unlock(m);
-			continue;
+		if (!vm_pageout_page_lock(m, &next))
+			goto unlock_page;
+		else if (m->hold_count != 0) {
+			/*
+			 * Held pages are essentially stuck in the
+			 * queue.  So, they ought to be discounted
+			 * from the inactive count.  See the
+			 * calculation of inactq_shortage before the
+			 * loop over the active queue below.
+			 */
+			addl_page_shortage++;
+			goto unlock_page;
 		}
 		object = m->object;
-		if (!VM_OBJECT_TRYWLOCK(object) &&
-		    !vm_pageout_fallback_object_lock(m, &next)) {
-			vm_page_unlock(m);
-			VM_OBJECT_WUNLOCK(object);
-			continue;
+		if (!VM_OBJECT_TRYWLOCK(object)) {
+			if (!vm_pageout_fallback_object_lock(m, &next))
+				goto unlock_object;
+			else if (m->hold_count != 0) {
+				addl_page_shortage++;
+				goto unlock_object;
+			}
 		}
-
-		/*
-		 * Don't mess with busy pages, keep them at at the
-		 * front of the queue, most likely they are being
-		 * paged out.  Increment addl_page_shortage for busy
-		 * pages, because they may leave the inactive queue
-		 * shortly after page scan is finished.
-		 */
 		if (vm_page_busied(m)) {
+			/*
+			 * Don't mess with busy pages.  Leave them at
+			 * the front of the queue.  Most likely, they
+			 * are being paged out and will leave the
+			 * queue shortly after the scan finishes.  So,
+			 * they ought to be discounted from the
+			 * inactive count.
+			 */
+			addl_page_shortage++;
+unlock_object:
+			VM_OBJECT_WUNLOCK(object);
+unlock_page:
 			vm_page_unlock(m);
-			VM_OBJECT_WUNLOCK(object);
-			addl_page_shortage++;
 			continue;
 		}
+		KASSERT(m->hold_count == 0, ("Held page %p", m));
 
 		/*
-		 * We unlock the inactive page queue, invalidating the
-		 * 'next' pointer.  Use our marker to remember our
-		 * place.
+		 * Dequeue the inactive page and unlock the inactive page
+		 * queue, invalidating the 'next' pointer.  Dequeueing the
+		 * page here avoids a later reacquisition (and release) of
+		 * the inactive page queue lock when vm_page_activate(),
+		 * vm_page_free(), or vm_page_launder() is called.  Use a
+		 * marker to remember our place in the inactive queue.
 		 */
 		TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_marker, plinks.q);
+		vm_page_dequeue_locked(m);
 		vm_pagequeue_unlock(pq);
 		queue_locked = FALSE;
 
 		/*
-		 * We bump the activation count if the page has been
-		 * referenced while in the inactive queue.  This makes
-		 * it less likely that the page will be added back to the
-		 * inactive queue prematurely again.  Here we check the 
-		 * page tables (or emulated bits, if any), given the upper 
-		 * level VM system not knowing anything about existing 
-		 * references.
+		 * Invalid pages can be easily freed. They cannot be
+		 * mapped, vm_page_free() asserts this.
 		 */
-		act_delta = 0;
+		if (m->valid == 0)
+			goto free_page;
+
+		/*
+		 * If the page has been referenced and the object is not dead,
+		 * reactivate or requeue the page depending on whether the
+		 * object is mapped.
+		 */
 		if ((m->aflags & PGA_REFERENCED) != 0) {
 			vm_page_aflag_clear(m, PGA_REFERENCED);
 			act_delta = 1;
-		}
+		} else
+			act_delta = 0;
 		if (object->ref_count != 0) {
 			act_delta += pmap_ts_referenced(m);
 		} else {
@@ -1099,47 +1231,36 @@
 			KASSERT(!pmap_page_is_mapped(m),
 			    ("vm_pageout_scan: page %p is mapped", m));
 		}
-
-		/*
-		 * If the upper level VM system knows about any page 
-		 * references, we reactivate the page or requeue it.
-		 */
 		if (act_delta != 0) {
-			if (object->ref_count) {
+			if (object->ref_count != 0) {
+				PCPU_INC(cnt.v_reactivated);
 				vm_page_activate(m);
+
+				/*
+				 * Increase the activation count if the page
+				 * was referenced while in the inactive queue.
+				 * This makes it less likely that the page will
+				 * be returned prematurely to the inactive
+				 * queue.
+ 				 */
 				m->act_count += act_delta + ACT_ADVANCE;
-			} else {
+				goto drop_page;
+			} else if ((object->flags & OBJ_DEAD) == 0) {
 				vm_pagequeue_lock(pq);
 				queue_locked = TRUE;
-				vm_page_requeue_locked(m);
+				m->queue = PQ_INACTIVE;
+				TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
+				vm_pagequeue_cnt_inc(pq);
+				goto drop_page;
 			}
-			VM_OBJECT_WUNLOCK(object);
-			vm_page_unlock(m);
-			goto relock_queue;
 		}
 
-		if (m->hold_count != 0) {
-			vm_page_unlock(m);
-			VM_OBJECT_WUNLOCK(object);
-
-			/*
-			 * Held pages are essentially stuck in the
-			 * queue.  So, they ought to be discounted
-			 * from the inactive count.  See the
-			 * calculation of the page_shortage for the
-			 * loop over the active queue below.
-			 */
-			addl_page_shortage++;
-			goto relock_queue;
-		}
-
 		/*
 		 * If the page appears to be clean at the machine-independent
 		 * layer, then remove all of its mappings from the pmap in
-		 * anticipation of placing it onto the cache queue.  If,
-		 * however, any of the page's mappings allow write access,
-		 * then the page may still be modified until the last of those
-		 * mappings are removed.
+		 * anticipation of freeing it.  If, however, any of the page's
+		 * mappings allow write access, then the page may still be
+		 * modified until the last of those mappings are removed.
 		 */
 		if (object->ref_count != 0) {
 			vm_page_test_dirty(m);
@@ -1147,199 +1268,23 @@
 				pmap_remove_all(m);
 		}
 
-		if (m->valid == 0) {
-			/*
-			 * Invalid pages can be easily freed
-			 */
+		/*
+		 * Clean pages can be freed, but dirty pages must be sent back
+		 * to the laundry, unless they belong to a dead object.
+		 * Requeueing dirty pages from dead objects is pointless, as
+		 * they are being paged out and freed by the thread that
+		 * destroyed the object.
+		 */
+		if (m->dirty == 0) {
+free_page:
 			vm_page_free(m);
 			PCPU_INC(cnt.v_dfree);
 			--page_shortage;
-		} else if (m->dirty == 0) {
-			/*
-			 * Clean pages can be placed onto the cache queue.
-			 * This effectively frees them.
-			 */
-			vm_page_cache(m);
-			--page_shortage;
-		} else if ((m->flags & PG_WINATCFLS) == 0 && pass < 2) {
-			/*
-			 * Dirty pages need to be paged out, but flushing
-			 * a page is extremely expensive verses freeing
-			 * a clean page.  Rather then artificially limiting
-			 * the number of pages we can flush, we instead give
-			 * dirty pages extra priority on the inactive queue
-			 * by forcing them to be cycled through the queue
-			 * twice before being flushed, after which the
-			 * (now clean) page will cycle through once more
-			 * before being freed.  This significantly extends
-			 * the thrash point for a heavily loaded machine.
-			 */
-			m->flags |= PG_WINATCFLS;
-			vm_pagequeue_lock(pq);
-			queue_locked = TRUE;
-			vm_page_requeue_locked(m);
-		} else if (maxlaunder > 0) {
-			/*
-			 * We always want to try to flush some dirty pages if
-			 * we encounter them, to keep the system stable.
-			 * Normally this number is small, but under extreme
-			 * pressure where there are insufficient clean pages
-			 * on the inactive queue, we may have to go all out.
-			 */
-			int swap_pageouts_ok;
-			struct vnode *vp = NULL;
-			struct mount *mp = NULL;
-
-			if ((object->type != OBJT_SWAP) && (object->type != OBJT_DEFAULT)) {
-				swap_pageouts_ok = 1;
-			} else {
-				swap_pageouts_ok = !(defer_swap_pageouts || disable_swap_pageouts);
-				swap_pageouts_ok |= (!disable_swap_pageouts && defer_swap_pageouts &&
-				vm_page_count_min());
-										
-			}
-
-			/*
-			 * We don't bother paging objects that are "dead".  
-			 * Those objects are in a "rundown" state.
-			 */
-			if (!swap_pageouts_ok || (object->flags & OBJ_DEAD)) {
-				vm_pagequeue_lock(pq);
-				vm_page_unlock(m);
-				VM_OBJECT_WUNLOCK(object);
-				queue_locked = TRUE;
-				vm_page_requeue_locked(m);
-				goto relock_queue;
-			}
-
-			/*
-			 * The object is already known NOT to be dead.   It
-			 * is possible for the vget() to block the whole
-			 * pageout daemon, but the new low-memory handling
-			 * code should prevent it.
-			 *
-			 * The previous code skipped locked vnodes and, worse,
-			 * reordered pages in the queue.  This results in
-			 * completely non-deterministic operation and, on a
-			 * busy system, can lead to extremely non-optimal
-			 * pageouts.  For example, it can cause clean pages
-			 * to be freed and dirty pages to be moved to the end
-			 * of the queue.  Since dirty pages are also moved to
-			 * the end of the queue once-cleaned, this gives
-			 * way too large a weighting to defering the freeing
-			 * of dirty pages.
-			 *
-			 * We can't wait forever for the vnode lock, we might
-			 * deadlock due to a vn_read() getting stuck in
-			 * vm_wait while holding this vnode.  We skip the 
-			 * vnode if we can't get it in a reasonable amount
-			 * of time.
-			 */
-			if (object->type == OBJT_VNODE) {
-				vm_page_unlock(m);
-				vp = object->handle;
-				if (vp->v_type == VREG &&
-				    vn_start_write(vp, &mp, V_NOWAIT) != 0) {
-					mp = NULL;
-					++pageout_lock_miss;
-					if (object->flags & OBJ_MIGHTBEDIRTY)
-						vnodes_skipped++;
-					goto unlock_and_continue;
-				}
-				KASSERT(mp != NULL,
-				    ("vp %p with NULL v_mount", vp));
-				vm_object_reference_locked(object);
-				VM_OBJECT_WUNLOCK(object);
-				lockmode = MNT_SHARED_WRITES(vp->v_mount) ?
-				    LK_SHARED : LK_EXCLUSIVE;
-				if (vget(vp, lockmode | LK_TIMELOCK,
-				    curthread)) {
-					VM_OBJECT_WLOCK(object);
-					++pageout_lock_miss;
-					if (object->flags & OBJ_MIGHTBEDIRTY)
-						vnodes_skipped++;
-					vp = NULL;
-					goto unlock_and_continue;
-				}
-				VM_OBJECT_WLOCK(object);
-				vm_page_lock(m);
-				vm_pagequeue_lock(pq);
-				queue_locked = TRUE;
-				/*
-				 * The page might have been moved to another
-				 * queue during potential blocking in vget()
-				 * above.  The page might have been freed and
-				 * reused for another vnode.
-				 */
-				if (m->queue != PQ_INACTIVE ||
-				    m->object != object ||
-				    TAILQ_NEXT(m, plinks.q) != &vmd->vmd_marker) {
-					vm_page_unlock(m);
-					if (object->flags & OBJ_MIGHTBEDIRTY)
-						vnodes_skipped++;
-					goto unlock_and_continue;
-				}
-	
-				/*
-				 * The page may have been busied during the
-				 * blocking in vget().  We don't move the
-				 * page back onto the end of the queue so that
-				 * statistics are more correct if we don't.
-				 */
-				if (vm_page_busied(m)) {
-					vm_page_unlock(m);
-					addl_page_shortage++;
-					goto unlock_and_continue;
-				}
-
-				/*
-				 * If the page has become held it might
-				 * be undergoing I/O, so skip it
-				 */
-				if (m->hold_count != 0) {
-					vm_page_unlock(m);
-					addl_page_shortage++;
-					if (object->flags & OBJ_MIGHTBEDIRTY)
-						vnodes_skipped++;
-					goto unlock_and_continue;
-				}
-				vm_pagequeue_unlock(pq);
-				queue_locked = FALSE;
-			}
-
-			/*
-			 * If a page is dirty, then it is either being washed
-			 * (but not yet cleaned) or it is still in the
-			 * laundry.  If it is still in the laundry, then we
-			 * start the cleaning operation. 
-			 *
-			 * decrement page_shortage on success to account for
-			 * the (future) cleaned page.  Otherwise we could wind
-			 * up laundering or cleaning too many pages.
-			 */
-			if (vm_pageout_clean(m) != 0) {
-				--page_shortage;
-				--maxlaunder;
-			}
-unlock_and_continue:
-			vm_page_lock_assert(m, MA_NOTOWNED);
-			VM_OBJECT_WUNLOCK(object);
-			if (mp != NULL) {
-				if (queue_locked) {
-					vm_pagequeue_unlock(pq);
-					queue_locked = FALSE;
-				}
-				if (vp != NULL)
-					vput(vp);
-				vm_object_deallocate(object);
-				vn_finished_write(mp);
-			}
-			vm_page_lock_assert(m, MA_NOTOWNED);
-			goto relock_queue;
-		}
+		} else if ((object->flags & OBJ_DEAD) == 0)
+			vm_page_launder(m);
+drop_page:
 		vm_page_unlock(m);
 		VM_OBJECT_WUNLOCK(object);
-relock_queue:
 		if (!queue_locked) {
 			vm_pagequeue_lock(pq);
 			queue_locked = TRUE;
@@ -1349,22 +1294,30 @@
 	}
 	vm_pagequeue_unlock(pq);
 
-#if !defined(NO_SWAPPING)
 	/*
-	 * Wakeup the swapout daemon if we didn't cache or free the targeted
-	 * number of pages. 
+	 * Wake up the laundry thread so that it can perform any needed
+	 * laundering.  If we didn't meet our target, we're in shortfall and
+	 * need to launder more aggressively.
 	 */
-	if (vm_swap_enabled && page_shortage > 0)
-		vm_req_vmdaemon(VM_SWAP_NORMAL);
-#endif
+	if (vm_laundry_request == VM_LAUNDRY_IDLE &&
+	    starting_page_shortage > 0) {
+		pq = &vm_dom[0].vmd_pagequeues[PQ_LAUNDRY];
+		vm_pagequeue_lock(pq);
+		if (page_shortage > 0) {
+			vm_laundry_request = VM_LAUNDRY_SHORTFALL;
+			PCPU_INC(cnt.v_pdshortfalls);
+		} else if (vm_laundry_request != VM_LAUNDRY_SHORTFALL)
+			vm_laundry_request = VM_LAUNDRY_BACKGROUND;
+		wakeup(&vm_laundry_request);
+		vm_pagequeue_unlock(pq);
+	}
 
 	/*
-	 * Wakeup the sync daemon if we skipped a vnode in a writeable object
-	 * and we didn't cache or free enough pages.
+	 * Wakeup the swapout daemon if we didn't free the targeted number of
+	 * pages.
 	 */
-	if (vnodes_skipped > 0 && page_shortage > cnt.v_free_target -
-	    cnt.v_free_min)
-		(void)speedup_syncer();
+	if (page_shortage > 0)
+		vm_swapout_run();
 
 	/*
 	 * If the inactive queue scan fails repeatedly to meet its
@@ -1374,10 +1327,20 @@
 
 	/*
 	 * Compute the number of pages we want to try to move from the
-	 * active queue to the inactive queue.
+	 * active queue to either the inactive or laundry queue.
+	 *
+	 * When scanning active pages, we make clean pages count more heavily
+	 * towards the page shortage than dirty pages.  This is because dirty
+	 * pages must be laundered before they can be reused and thus have less
+	 * utility when attempting to quickly alleviate a shortage.  However,
+	 * this weighting also causes the scan to deactivate dirty pages
+	 * more aggressively, improving the effectiveness of clustering and
+	 * ensuring that they can eventually be reused.
 	 */
-	page_shortage = cnt.v_inactive_target - cnt.v_inactive_count +
+	inactq_shortage = vm_cnt.v_inactive_target - (vm_cnt.v_inactive_count +
+	    vm_cnt.v_laundry_count / act_scan_laundry_weight) +
 	    vm_paging_target() + deficit + addl_page_shortage;
+	inactq_shortage *= act_scan_laundry_weight;
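
In the computation above a laundry page counts for only 1/act_scan_laundry_weight of an inactive page, and the result is scaled back up so that, in the active queue scan below, deactivating a clean page closes act_scan_laundry_weight units of the shortage while sending a dirty page to the laundry closes a single unit.  Below is a numeric sketch with invented counts and an assumed weight of 3; the weight is a sysctl whose default is not shown in this hunk.

#include <stdio.h>

int
main(void)
{
	int inactive_target = 30000;	/* invented */
	int inactive_count = 20000;	/* invented */
	int laundry_count = 9000;	/* invented */
	int paging_target = 1000;	/* invented vm_paging_target() */
	int deficit = 0, addl_page_shortage = 0;
	int weight = 3;			/* assumed act_scan_laundry_weight */
	int inactq_shortage;

	inactq_shortage = inactive_target -
	    (inactive_count + laundry_count / weight) +
	    paging_target + deficit + addl_page_shortage;
	inactq_shortage *= weight;

	/* 3 * (30000 - 23000 + 1000) = 24000 weighted units. */
	printf("weighted inactive queue shortage: %d\n", inactq_shortage);
	return (0);
}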
 
 	pq = &vmd->vmd_pagequeues[PQ_ACTIVE];
 	vm_pagequeue_lock(pq);
@@ -1394,7 +1357,7 @@
 		min_scan /= hz * vm_pageout_update_period;
 	} else
 		min_scan = 0;
-	if (min_scan > 0 || (page_shortage > 0 && maxscan > 0))
+	if (min_scan > 0 || (inactq_shortage > 0 && maxscan > 0))
 		vmd->vmd_last_active_scan = scan_tick;
 
 	/*
@@ -1403,7 +1366,7 @@
 	 * candidates.  Held pages may be deactivated.
 	 */
 	for (m = TAILQ_FIRST(&pq->pq_pl), scanned = 0; m != NULL && (scanned <
-	    min_scan || (page_shortage > 0 && scanned < maxscan)); m = next,
+	    min_scan || (inactq_shortage > 0 && scanned < maxscan)); m = next,
 	    scanned++) {
 		KASSERT(m->queue == PQ_ACTIVE,
 		    ("vm_pageout_scan: page %p isn't active", m));
@@ -1428,11 +1391,12 @@
 		/*
 		 * Check to see "how much" the page has been used.
 		 */
-		act_delta = 0;
-		if (m->aflags & PGA_REFERENCED) {
+		if ((m->aflags & PGA_REFERENCED) != 0) {
 			vm_page_aflag_clear(m, PGA_REFERENCED);
-			act_delta += 1;
-		}
+			act_delta = 1;
+		} else
+			act_delta = 0;
+
 		/*
 		 * Perform an unsynchronized object ref count check.  While
 		 * the page lock ensures that the page is not reallocated to
@@ -1452,41 +1416,60 @@
 		/*
 		 * Advance or decay the act_count based on recent usage.
 		 */
-		if (act_delta) {
+		if (act_delta != 0) {
 			m->act_count += ACT_ADVANCE + act_delta;
 			if (m->act_count > ACT_MAX)
 				m->act_count = ACT_MAX;
-		} else {
+		} else
 			m->act_count -= min(m->act_count, ACT_DECLINE);
-			act_delta = m->act_count;
-		}
 
 		/*
-		 * Move this page to the tail of the active or inactive
+		 * Move this page to the tail of the active, inactive or laundry
 		 * queue depending on usage.
 		 */
-		if (act_delta == 0) {
+		if (m->act_count == 0) {
 			/* Dequeue to avoid later lock recursion. */
 			vm_page_dequeue_locked(m);
-			vm_page_deactivate(m);
-			page_shortage--;
+
+			/*
+			 * When not short for inactive pages, let dirty pages go
+			 * through the inactive queue before moving to the
+			 * laundry queues.  This gives them some extra time to
+			 * be reactivated, potentially avoiding an expensive
+			 * pageout.  During a page shortage, the inactive queue
+			 * is necessarily small, so we may move dirty pages
+			 * directly to the laundry queue.
+			 */
+			if (inactq_shortage <= 0)
+				vm_page_deactivate(m);
+			else {
+				/*
+				 * Calling vm_page_test_dirty() here would
+				 * require acquisition of the object's write
+				 * lock.  However, during a page shortage,
+				 * directing dirty pages into the laundry
+				 * queue is only an optimization and not a
+				 * requirement.  Therefore, we simply rely on
+				 * the opportunistic updates to the page's
+				 * dirty field by the pmap.
+				 */
+				if (m->dirty == 0) {
+					vm_page_deactivate(m);
+					inactq_shortage -=
+					    act_scan_laundry_weight;
+				} else {
+					vm_page_launder(m);
+					inactq_shortage--;
+				}
+			}
 		} else
 			vm_page_requeue_locked(m);
 		vm_page_unlock(m);
 	}
 	vm_pagequeue_unlock(pq);
-#if !defined(NO_SWAPPING)
-	/*
-	 * Idle process swapout -- run once per second.
-	 */
-	if (vm_swap_idle_enabled) {
-		static long lsec;
-		if (time_second != lsec) {
-			vm_req_vmdaemon(VM_SWAP_IDLE);
-			lsec = time_second;
-		}
-	}
-#endif
+	if (pass > 0)
+		vm_swapout_run_idle();
+	return (page_shortage <= 0);
 }
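
In the active queue scan above, each visit either advances a page's act_count by ACT_ADVANCE plus the number of observed references or decays it by ACT_DECLINE, and the page only leaves the active queue (for the inactive or laundry queue) once the count reaches zero.  The small simulation below shows that decay; ACT_ADVANCE = 3, ACT_DECLINE = 1, ACT_MAX = 64 and the starting count are assumptions for illustration, not values taken from this commit.

#include <stdio.h>

#define	ACT_ADVANCE	3	/* assumed */
#define	ACT_DECLINE	1	/* assumed */
#define	ACT_MAX		64	/* assumed */

int
main(void)
{
	int act_count = 5;	/* hypothetical starting count */
	int scans;

	/* One referenced visit, then unreferenced visits until zero. */
	act_count += ACT_ADVANCE + 1;
	if (act_count > ACT_MAX)
		act_count = ACT_MAX;
	for (scans = 0; act_count > 0; scans++)
		act_count -= act_count < ACT_DECLINE ?
		    act_count : ACT_DECLINE;
	printf("page leaves the active queue after %d unreferenced scans\n",
	    scans);
	return (0);
}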
 
 static int vm_pageout_oom_vote;
@@ -1668,19 +1651,21 @@
 			PROC_UNLOCK(p);
 			continue;
 		}
-		_PHOLD(p);
+		_PHOLD_LITE(p);
+		PROC_UNLOCK(p);
+		sx_sunlock(&allproc_lock);
 		if (!vm_map_trylock_read(&vm->vm_map)) {
-			_PRELE(p);
-			PROC_UNLOCK(p);
 			vmspace_free(vm);
+			sx_slock(&allproc_lock);
+			PRELE(p);
 			continue;
 		}
-		PROC_UNLOCK(p);
 		size = vmspace_swap_count(vm);
 		if (shortage == VM_OOM_MEM)
 			size += vm_pageout_oom_pagecount(vm);
 		vm_map_unlock_read(&vm->vm_map);
 		vmspace_free(vm);
+		sx_slock(&allproc_lock);
 
 		/*
 		 * If this process is bigger than the biggest one,
@@ -1697,12 +1682,14 @@
 	}
 	sx_sunlock(&allproc_lock);
 	if (bigproc != NULL) {
+		if (vm_panic_on_oom != 0)
+			panic("out of swap space");
 		PROC_LOCK(bigproc);
 		killproc(bigproc, "out of swap space");
 		sched_nice(bigproc, PRIO_MIN);
 		_PRELE(bigproc);
 		PROC_UNLOCK(bigproc);
-		wakeup(&cnt.v_free_count);
+		wakeup(&vm_cnt.v_free_count);
 	}
 }
 
@@ -1710,10 +1697,13 @@
 vm_pageout_worker(void *arg)
 {
 	struct vm_domain *domain;
-	int domidx;
+	int domidx, pass;
+	bool target_met;
 
 	domidx = (uintptr_t)arg;
 	domain = &vm_dom[domidx];
+	pass = 0;
+	target_met = true;
 
 	/*
 	 * XXXKIB It could be useful to bind pageout daemon threads to
@@ -1724,54 +1714,80 @@
 	KASSERT(domain->vmd_segs != 0, ("domain without segments"));
 	domain->vmd_last_active_scan = ticks;
 	vm_pageout_init_marker(&domain->vmd_marker, PQ_INACTIVE);
+	vm_pageout_init_marker(&domain->vmd_inacthead, PQ_INACTIVE);
+	TAILQ_INSERT_HEAD(&domain->vmd_pagequeues[PQ_INACTIVE].pq_pl,
+	    &domain->vmd_inacthead, plinks.q);
 
 	/*
 	 * The pageout daemon worker is never done, so loop forever.
 	 */
 	while (TRUE) {
+		mtx_lock(&vm_page_queue_free_mtx);
+
 		/*
-		 * If we have enough free memory, wakeup waiters.  Do
-		 * not clear vm_pages_needed until we reach our target,
-		 * otherwise we may be woken up over and over again and
-		 * waste a lot of cpu.
+		 * Generally, after a level >= 1 scan, if there are enough
+		 * free pages to wakeup the waiters, then they are already
+		 * awake.  A call to vm_page_free() during the scan awakened
+		 * them.  However, in the following case, this wakeup serves
+		 * to bound the amount of time that a thread might wait.
+		 * Suppose a thread's call to vm_page_alloc() fails, but
+		 * before that thread calls VM_WAIT, enough pages are freed by
+		 * other threads to alleviate the free page shortage.  The
+		 * thread will, nonetheless, wait until another page is freed
+		 * or this wakeup is performed.
 		 */
-		mtx_lock(&vm_page_queue_free_mtx);
 		if (vm_pages_needed && !vm_page_count_min()) {
-			if (!vm_paging_needed())
-				vm_pages_needed = 0;
-			wakeup(&cnt.v_free_count);
+			vm_pages_needed = false;
+			wakeup(&vm_cnt.v_free_count);
 		}
-		if (vm_pages_needed) {
+
+		/*
+		 * Do not clear vm_pageout_wanted until we reach our free page
+		 * target.  Otherwise, we may be awakened over and over again,
+		 * wasting CPU time.
+		 */
+		if (vm_pageout_wanted && target_met)
+			vm_pageout_wanted = false;
+
+		/*
+		 * Might the page daemon receive a wakeup call?
+		 */
+		if (vm_pageout_wanted) {
 			/*
-			 * We're still not done.  Either vm_pages_needed was
-			 * set by another thread during the previous scan
-			 * (typically, this happens during a level 0 scan) or
-			 * vm_pages_needed was already set and the scan failed
-			 * to free enough pages.  If we haven't yet performed
-			 * a level >= 2 scan (unlimited dirty cleaning), then
-			 * upgrade the level and scan again now.  Otherwise,
-			 * sleep a bit and try again later.  While sleeping,
-			 * vm_pages_needed can be cleared.
+			 * No.  Either vm_pageout_wanted was set by another
+			 * thread during the previous scan, which must have
+			 * been a level 0 scan, or vm_pageout_wanted was
+			 * already set and the scan failed to free enough
+			 * pages.  If we haven't yet performed a level >= 1
+			 * (page reclamation) scan, then increase the level
+			 * and scan again now.  Otherwise, sleep a bit and
+			 * try again later.
 			 */
-			if (domain->vmd_pass > 1)
-				msleep(&vm_pages_needed,
-				    &vm_page_queue_free_mtx, PVM, "psleep",
-				    hz / 2);
+			mtx_unlock(&vm_page_queue_free_mtx);
+			if (pass >= 1)
+				pause("pwait", hz / VM_INACT_SCAN_RATE);
+			pass++;
 		} else {
 			/*
-			 * Good enough, sleep until required to refresh
-			 * stats.
+			 * Yes.  If threads are still sleeping in VM_WAIT
+			 * then we immediately start a new scan.  Otherwise,
+			 * sleep until the next wakeup or until pages need to
+			 * have their reference stats updated.
 			 */
-			msleep(&vm_pages_needed, &vm_page_queue_free_mtx,
-			    PVM, "psleep", hz);
+			if (vm_pages_needed) {
+				mtx_unlock(&vm_page_queue_free_mtx);
+				if (pass == 0)
+					pass++;
+			} else if (mtx_sleep(&vm_pageout_wanted,
+			    &vm_page_queue_free_mtx, PDROP | PVM, "psleep",
+			    hz) == 0) {
+				PCPU_INC(cnt.v_pdwakeups);
+				pass = 1;
+			} else
+				pass = 0;
 		}
-		if (vm_pages_needed) {
-			cnt.v_pdwakeups++;
-			domain->vmd_pass++;
-		} else
-			domain->vmd_pass = 0;
-		mtx_unlock(&vm_page_queue_free_mtx);
-		vm_pageout_scan(domain, domain->vmd_pass);
+
+		target_met = vm_pageout_scan(domain, pass);
 	}
 }
 
@@ -1784,8 +1800,8 @@
 	/*
 	 * Initialize some paging parameters.
 	 */
-	cnt.v_interrupt_free_min = 2;
-	if (cnt.v_page_count < 2000)
+	vm_cnt.v_interrupt_free_min = 2;
+	if (vm_cnt.v_page_count < 2000)
 		vm_pageout_page_count = 8;
 
 	/*
@@ -1793,27 +1809,27 @@
 	 * swap pager structures plus enough for any pv_entry structs
 	 * when paging. 
 	 */
-	if (cnt.v_page_count > 1024)
-		cnt.v_free_min = 4 + (cnt.v_page_count - 1024) / 200;
+	if (vm_cnt.v_page_count > 1024)
+		vm_cnt.v_free_min = 4 + (vm_cnt.v_page_count - 1024) / 200;
 	else
-		cnt.v_free_min = 4;
-	cnt.v_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE +
-	    cnt.v_interrupt_free_min;
-	cnt.v_free_reserved = vm_pageout_page_count +
-	    cnt.v_pageout_free_min + (cnt.v_page_count / 768);
-	cnt.v_free_severe = cnt.v_free_min / 2;
-	cnt.v_free_target = 4 * cnt.v_free_min + cnt.v_free_reserved;
-	cnt.v_free_min += cnt.v_free_reserved;
-	cnt.v_free_severe += cnt.v_free_reserved;
-	cnt.v_inactive_target = (3 * cnt.v_free_target) / 2;
-	if (cnt.v_inactive_target > cnt.v_free_count / 3)
-		cnt.v_inactive_target = cnt.v_free_count / 3;
+		vm_cnt.v_free_min = 4;
+	vm_cnt.v_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE +
+	    vm_cnt.v_interrupt_free_min;
+	vm_cnt.v_free_reserved = vm_pageout_page_count +
+	    vm_cnt.v_pageout_free_min + (vm_cnt.v_page_count / 768);
+	vm_cnt.v_free_severe = vm_cnt.v_free_min / 2;
+	vm_cnt.v_free_target = 4 * vm_cnt.v_free_min + vm_cnt.v_free_reserved;
+	vm_cnt.v_free_min += vm_cnt.v_free_reserved;
+	vm_cnt.v_free_severe += vm_cnt.v_free_reserved;
+	vm_cnt.v_inactive_target = (3 * vm_cnt.v_free_target) / 2;
+	if (vm_cnt.v_inactive_target > vm_cnt.v_free_count / 3)
+		vm_cnt.v_inactive_target = vm_cnt.v_free_count / 3;
 
 	/*
 	 * Set the default wakeup threshold to be 10% above the minimum
 	 * page limit.  This keeps the steady state out of shortfall.
 	 */
-	vm_pageout_wakeup_thresh = (cnt.v_free_min / 10) * 11;
+	vm_pageout_wakeup_thresh = (vm_cnt.v_free_min / 10) * 11;
 
 	/*
 	 * Set interval in seconds for active scan.  We want to visit each
@@ -1825,7 +1841,15 @@
 
 	/* XXX does not really belong here */
 	if (vm_page_max_wired == 0)
-		vm_page_max_wired = cnt.v_free_count / 3;
+		vm_page_max_wired = vm_cnt.v_free_count / 3;
+
+	/*
+	 * Target amount of memory to move out of the laundry queue during a
+	 * background laundering.  This is proportional to the amount of system
+	 * memory.
+	 */
+	vm_background_launder_target = (vm_cnt.v_free_target -
+	    vm_cnt.v_free_min) / 10;
 }
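
The thresholds recomputed in vm_pageout_init() above all derive from v_page_count.  The sketch below replays the same formulas for a hypothetical 4 GB machine with 4 KB pages; MAXBSIZE = 65536 and vm_pageout_page_count = 16 are assumptions for illustration, and the clamp of v_inactive_target against v_free_count / 3 is left out because v_free_count is not known here.

#include <stdio.h>

int
main(void)
{
	long page_count = 1048576;	/* hypothetical: 4 GB of 4 KB pages */
	long page_size = 4096;
	long maxbsize = 65536;		/* assumed MAXBSIZE */
	long pageout_page_count = 16;	/* assumed vm_pageout_page_count */
	long interrupt_free_min = 2;
	long free_min, pageout_free_min, free_reserved, free_severe;
	long free_target, inactive_target, wakeup_thresh, launder_target;

	free_min = 4 + (page_count - 1024) / 200;
	pageout_free_min = (2 * maxbsize) / page_size + interrupt_free_min;
	free_reserved = pageout_page_count + pageout_free_min +
	    page_count / 768;
	free_severe = free_min / 2;
	free_target = 4 * free_min + free_reserved;
	free_min += free_reserved;
	free_severe += free_reserved;
	inactive_target = (3 * free_target) / 2;
	wakeup_thresh = (free_min / 10) * 11;
	launder_target = (free_target - free_min) / 10;

	printf("free_min %ld, free_target %ld, free_severe %ld\n",
	    free_min, free_target, free_severe);
	printf("inactive_target %ld, wakeup_thresh %ld, "
	    "background_launder_target %ld\n",
	    inactive_target, wakeup_thresh, launder_target);
	return (0);
}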
 
 /*
@@ -1835,12 +1859,17 @@
 vm_pageout(void)
 {
 	int error;
-#if MAXMEMDOM > 1
+#ifdef VM_NUMA_ALLOC
 	int i;
 #endif
 
 	swap_pager_swap_init();
-#if MAXMEMDOM > 1
+	snprintf(curthread->td_name, sizeof(curthread->td_name), "dom0");
+	error = kthread_add(vm_pageout_laundry_worker, NULL, curproc, NULL,
+	    0, 0, "laundry: dom0");
+	if (error != 0)
+		panic("starting laundry for domain 0, error %d", error);
+#ifdef VM_NUMA_ALLOC
 	for (i = 1; i < vm_ndomains; i++) {
 		error = kthread_add(vm_pageout_worker, (void *)(uintptr_t)i,
 		    curproc, NULL, 0, 0, "dom%d", i);
@@ -1858,175 +1887,42 @@
 }
 
 /*
- * Unless the free page queue lock is held by the caller, this function
- * should be regarded as advisory.  Specifically, the caller should
- * not msleep() on &cnt.v_free_count following this function unless
- * the free page queue lock is held until the msleep() is performed.
+ * Perform an advisory wakeup of the page daemon.
  */
 void
 pagedaemon_wakeup(void)
 {
 
-	if (!vm_pages_needed && curthread->td_proc != pageproc) {
-		vm_pages_needed = 1;
-		wakeup(&vm_pages_needed);
-	}
-}
+	mtx_assert(&vm_page_queue_free_mtx, MA_NOTOWNED);
 
-#if !defined(NO_SWAPPING)
-static void
-vm_req_vmdaemon(int req)
-{
-	static int lastrun = 0;
-
-	mtx_lock(&vm_daemon_mtx);
-	vm_pageout_req_swapout |= req;
-	if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
-		wakeup(&vm_daemon_needed);
-		lastrun = ticks;
+	if (!vm_pageout_wanted && curthread->td_proc != pageproc) {
+		vm_pageout_wanted = true;
+		wakeup(&vm_pageout_wanted);
 	}
-	mtx_unlock(&vm_daemon_mtx);
 }
 
-static void
-vm_daemon(void)
+/*
+ * Wake up the page daemon and wait for it to reclaim free pages.
+ *
+ * This function returns with the free queues mutex unlocked.
+ */
+void
+pagedaemon_wait(int pri, const char *wmesg)
 {
-	struct rlimit rsslim;
-	struct proc *p;
-	struct thread *td;
-	struct vmspace *vm;
-	int breakout, swapout_flags, tryagain, attempts;
-#ifdef RACCT
-	uint64_t rsize, ravailable;
-#endif
 
-	while (TRUE) {
-		mtx_lock(&vm_daemon_mtx);
-		msleep(&vm_daemon_needed, &vm_daemon_mtx, PPAUSE, "psleep",
-#ifdef RACCT
-		    racct_enable ? hz : 0
-#else
-		    0
-#endif
-		);
-		swapout_flags = vm_pageout_req_swapout;
-		vm_pageout_req_swapout = 0;
-		mtx_unlock(&vm_daemon_mtx);
-		if (swapout_flags)
-			swapout_procs(swapout_flags);
+	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 
-		/*
-		 * scan the processes for exceeding their rlimits or if
-		 * process is swapped out -- deactivate pages
-		 */
-		tryagain = 0;
-		attempts = 0;
-again:
-		attempts++;
-		sx_slock(&allproc_lock);
-		FOREACH_PROC_IN_SYSTEM(p) {
-			vm_pindex_t limit, size;
-
-			/*
-			 * if this is a system process or if we have already
-			 * looked at this process, skip it.
-			 */
-			PROC_LOCK(p);
-			if (p->p_state != PRS_NORMAL ||
-			    p->p_flag & (P_INEXEC | P_SYSTEM | P_WEXIT)) {
-				PROC_UNLOCK(p);
-				continue;
-			}
-			/*
-			 * if the process is in a non-running type state,
-			 * don't touch it.
-			 */
-			breakout = 0;
-			FOREACH_THREAD_IN_PROC(p, td) {
-				thread_lock(td);
-				if (!TD_ON_RUNQ(td) &&
-				    !TD_IS_RUNNING(td) &&
-				    !TD_IS_SLEEPING(td) &&
-				    !TD_IS_SUSPENDED(td)) {
-					thread_unlock(td);
-					breakout = 1;
-					break;
-				}
-				thread_unlock(td);
-			}
-			if (breakout) {
-				PROC_UNLOCK(p);
-				continue;
-			}
-			/*
-			 * get a limit
-			 */
-			lim_rlimit(p, RLIMIT_RSS, &rsslim);
-			limit = OFF_TO_IDX(
-			    qmin(rsslim.rlim_cur, rsslim.rlim_max));
-
-			/*
-			 * let processes that are swapped out really be
-			 * swapped out set the limit to nothing (will force a
-			 * swap-out.)
-			 */
-			if ((p->p_flag & P_INMEM) == 0)
-				limit = 0;	/* XXX */
-			vm = vmspace_acquire_ref(p);
-			PROC_UNLOCK(p);
-			if (vm == NULL)
-				continue;
-
-			size = vmspace_resident_count(vm);
-			if (size >= limit) {
-				vm_pageout_map_deactivate_pages(
-				    &vm->vm_map, limit);
-				size = vmspace_resident_count(vm);
-			}
-#ifdef RACCT
-			if (racct_enable) {
-				rsize = IDX_TO_OFF(size);
-				PROC_LOCK(p);
-				if (p->p_state == PRS_NORMAL)
-					racct_set(p, RACCT_RSS, rsize);
-				ravailable = racct_get_available(p, RACCT_RSS);
-				PROC_UNLOCK(p);
-				if (rsize > ravailable) {
-					/*
-					 * Don't be overly aggressive; this
-					 * might be an innocent process,
-					 * and the limit could've been exceeded
-					 * by some memory hog.  Don't try
-					 * to deactivate more than 1/4th
-					 * of process' resident set size.
-					 */
-					if (attempts <= 8) {
-						if (ravailable < rsize -
-						    (rsize / 4)) {
-							ravailable = rsize -
-							    (rsize / 4);
-						}
-					}
-					vm_pageout_map_deactivate_pages(
-					    &vm->vm_map,
-					    OFF_TO_IDX(ravailable));
-					/* Update RSS usage after paging out. */
-					size = vmspace_resident_count(vm);
-					rsize = IDX_TO_OFF(size);
-					PROC_LOCK(p);
-					if (p->p_state == PRS_NORMAL)
-						racct_set(p, RACCT_RSS, rsize);
-					PROC_UNLOCK(p);
-					if (rsize > ravailable)
-						tryagain = 1;
-				}
-			}
-#endif
-			vmspace_free(vm);
-		}
-		sx_sunlock(&allproc_lock);
-		if (tryagain != 0 && attempts <= 10)
-			goto again;
+	/*
+	 * vm_pageout_wanted may have been set by an advisory wakeup, but if the
+	 * page daemon is running on a CPU, the wakeup will have been lost.
+	 * Thus, deliver a potentially spurious wakeup to ensure that the page
+	 * daemon has been notified of the shortage.
+	 */
+	if (!vm_pageout_wanted || !vm_pages_needed) {
+		vm_pageout_wanted = true;
+		wakeup(&vm_pageout_wanted);
 	}
+	vm_pages_needed = true;
+	msleep(&vm_cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | pri,
+	    wmesg, 0);
 }
-#endif			/* !defined(NO_SWAPPING) */
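
The rewritten pagedaemon_wait() above closes a lost-wakeup race: the caller marks the
daemon as wanted and wakes it before sleeping on the free-page count, so a wakeup cannot
be missed even if the daemon is already running. The same handshake can be sketched in
userland with pthreads instead of msleep()/wakeup(); this is only an illustrative analog
with made-up names, not kernel code.

	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t daemon_cv = PTHREAD_COND_INITIALIZER;	/* ~ &vm_pageout_wanted */
	static pthread_cond_t free_cv = PTHREAD_COND_INITIALIZER;	/* ~ &vm_cnt.v_free_count */
	static bool daemon_wanted;					/* ~ vm_pageout_wanted */
	static int free_count;						/* ~ vm_cnt.v_free_count */

	/* Stand-in for the page daemon: reclaims "pages" whenever it is wanted. */
	static void *
	daemon_thread(void *arg)
	{

		(void)arg;
		for (;;) {
			pthread_mutex_lock(&lock);
			while (!daemon_wanted)
				pthread_cond_wait(&daemon_cv, &lock);
			daemon_wanted = false;
			free_count += 8;			/* pretend to reclaim */
			pthread_cond_broadcast(&free_cv);	/* wakeup on the free count */
			pthread_mutex_unlock(&lock);
		}
		return (NULL);
	}

	/* Analog of pagedaemon_wait(): wake the daemon first, then sleep. */
	static void
	wait_for_free_page(void)
	{

		pthread_mutex_lock(&lock);
		/* Deliver a potentially spurious wakeup so it cannot be lost. */
		daemon_wanted = true;
		pthread_cond_signal(&daemon_cv);
		while (free_count == 0)
			pthread_cond_wait(&free_cv, &lock);
		free_count--;
		pthread_mutex_unlock(&lock);
	}

	int
	main(void)
	{
		pthread_t td;

		pthread_create(&td, NULL, daemon_thread, NULL);
		wait_for_free_page();
		printf("got a free page\n");
		return (0);
	}

Unlike msleep() with PDROP, pthread_cond_wait() reacquires the mutex, so the waiter drops
it explicitly after consuming a page.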

Modified: trunk/sys/vm/vm_pageout.h
===================================================================
--- trunk/sys/vm/vm_pageout.h	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_pageout.h	2020-02-08 19:35:48 UTC (rev 12314)
@@ -58,12 +58,14 @@
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  *
- * $FreeBSD: stable/10/sys/vm/vm_pageout.h 314664 2017-03-04 12:05:50Z avg $
+ * $FreeBSD: stable/11/sys/vm/vm_pageout.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 #ifndef _VM_VM_PAGEOUT_H_
 #define _VM_VM_PAGEOUT_H_
 
+#ifdef _KERNEL
+
 /*
  *	Header file for pageout daemon.
  */
@@ -73,17 +75,11 @@
  */
 
 extern int vm_page_max_wired;
-extern int vm_pages_needed;	/* should be some "event" structure */
-extern int vm_pageout_pages_needed;
 extern int vm_pageout_deficit;
 extern int vm_pageout_page_count;
+extern bool vm_pageout_wanted;
+extern bool vm_pages_needed;
 
-/*
- * Swap out requests
- */
-#define VM_SWAP_NORMAL 1
-#define VM_SWAP_IDLE 2
-
 #define	VM_OOM_MEM	1
 #define	VM_OOM_SWAPZ	2
 
@@ -101,15 +97,17 @@
  *	Signal pageout-daemon and wait for it.
  */
 
-extern void pagedaemon_wakeup(void);
+void pagedaemon_wait(int pri, const char *wmesg);
+void pagedaemon_wakeup(void);
 #define VM_WAIT vm_wait()
 #define VM_WAITPFAULT vm_waitpfault()
-extern void vm_wait(void);
-extern void vm_waitpfault(void);
+void vm_wait(void);
+void vm_waitpfault(void);
 
-#ifdef _KERNEL
 int vm_pageout_flush(vm_page_t *, int, int, int, int *, boolean_t *);
-void vm_pageout_grow_cache(int, vm_paddr_t, vm_paddr_t);
 void vm_pageout_oom(int shortage);
-#endif
+
+void vm_swapout_run(void);
+void vm_swapout_run_idle(void);
+#endif /* _KERNEL */
 #endif	/* _VM_VM_PAGEOUT_H_ */

Modified: trunk/sys/vm/vm_pager.c
===================================================================
--- trunk/sys/vm/vm_pager.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_pager.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -65,7 +65,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vm_pager.c 311645 2017-01-07 12:04:30Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_pager.c 331722 2018-03-29 02:50:57Z eadler $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -87,7 +87,9 @@
 
 int cluster_pbuf_freecnt = -1;	/* unlimited to begin with */
 
-static int dead_pager_getpages(vm_object_t, vm_page_t *, int, int);
+struct buf *swbuf;
+
+static int dead_pager_getpages(vm_object_t, vm_page_t *, int, int *, int *);
 static vm_object_t dead_pager_alloc(void *, vm_ooffset_t, vm_prot_t,
     vm_ooffset_t, struct ucred *);
 static void dead_pager_putpages(vm_object_t, vm_page_t *, int, int, int *);
@@ -95,13 +97,11 @@
 static void dead_pager_dealloc(vm_object_t);
 
 static int
-dead_pager_getpages(obj, ma, count, req)
-	vm_object_t obj;
-	vm_page_t *ma;
-	int count;
-	int req;
+dead_pager_getpages(vm_object_t obj, vm_page_t *ma, int count, int *rbehind,
+    int *rahead)
 {
-	return VM_PAGER_FAIL;
+
+	return (VM_PAGER_FAIL);
 }
 
 static vm_object_t
@@ -158,8 +158,6 @@
 	&mgtdevicepagerops,	/* OBJT_MGTDEVICE */
 };
 
-static const int npagers = sizeof(pagertab) / sizeof(pagertab[0]);
-
 /*
  * Kernel address space for mapping pages.
  * Used by pagers where KVAs are needed for IO.
@@ -168,7 +166,7 @@
  * cleaning requests (NPENDINGIO == 64) * the maximum swap cluster size
  * (MAXPHYS == 64k) if you want to get the most efficiency.
  */
-struct mtx_padalign pbuf_mtx;
+struct mtx_padalign __exclusive_cache_line pbuf_mtx;
 static TAILQ_HEAD(swqueue, buf) bswlist;
 static int bswneeded;
 vm_offset_t swapbkva;		/* swap buffers kva */
@@ -182,7 +180,7 @@
 	/*
 	 * Initialize known pagers
 	 */
-	for (pgops = pagertab; pgops < &pagertab[npagers]; pgops++)
+	for (pgops = pagertab; pgops < &pagertab[nitems(pagertab)]; pgops++)
 		if ((*pgops)->pgo_init != NULL)
 			(*(*pgops)->pgo_init)();
 }
@@ -208,6 +206,7 @@
 
 	cluster_pbuf_freecnt = nswbuf / 2;
 	vnode_pbuf_freecnt = nswbuf / 2 + 1;
+	vnode_async_pbuf_freecnt = nswbuf / 2;
 }
 
 /*
@@ -241,8 +240,80 @@
 	(*pagertab[object->type]->pgo_dealloc) (object);
 }
 
+static void
+vm_pager_assert_in(vm_object_t object, vm_page_t *m, int count)
+{
+#ifdef INVARIANTS
+
+	VM_OBJECT_ASSERT_WLOCKED(object);
+	KASSERT(count > 0, ("%s: 0 count", __func__));
+	/*
+	 * All pages must be busied, not mapped, not fully valid,
+	 * not dirty and belong to the proper object.
+	 */
+	for (int i = 0 ; i < count; i++) {
+		vm_page_assert_xbusied(m[i]);
+		KASSERT(!pmap_page_is_mapped(m[i]),
+		    ("%s: page %p is mapped", __func__, m[i]));
+		KASSERT(m[i]->valid != VM_PAGE_BITS_ALL,
+		    ("%s: request for a valid page %p", __func__, m[i]));
+		KASSERT(m[i]->dirty == 0,
+		    ("%s: page %p is dirty", __func__, m[i]));
+		KASSERT(m[i]->object == object,
+		    ("%s: wrong object %p/%p", __func__, object, m[i]->object));
+	}
+#endif
+}
+
 /*
- * vm_pager_get_pages() - inline, see vm/vm_pager.h
+ * Page in the pages for the object using its associated pager.
+ * The requested page must be fully valid on successful return.
+ */
+int
+vm_pager_get_pages(vm_object_t object, vm_page_t *m, int count, int *rbehind,
+    int *rahead)
+{
+#ifdef INVARIANTS
+	vm_pindex_t pindex = m[0]->pindex;
+#endif
+	int r;
+
+	vm_pager_assert_in(object, m, count);
+
+	r = (*pagertab[object->type]->pgo_getpages)(object, m, count, rbehind,
+	    rahead);
+	if (r != VM_PAGER_OK)
+		return (r);
+
+	for (int i = 0; i < count; i++) {
+		/*
+		 * If pager has replaced a page, assert that it had
+		 * updated the array.
+		 */
+		KASSERT(m[i] == vm_page_lookup(object, pindex++),
+		    ("%s: mismatch page %p pindex %ju", __func__,
+		    m[i], (uintmax_t )pindex - 1));
+		/*
+		 * Zero out partially filled data.
+		 */
+		if (m[i]->valid != VM_PAGE_BITS_ALL)
+			vm_page_zero_invalid(m[i], TRUE);
+	}
+	return (VM_PAGER_OK);
+}
+
+int
+vm_pager_get_pages_async(vm_object_t object, vm_page_t *m, int count,
+    int *rbehind, int *rahead, pgo_getpages_iodone_t iodone, void *arg)
+{
+
+	vm_pager_assert_in(object, m, count);
+
+	return ((*pagertab[object->type]->pgo_getpages_async)(object, m,
+	    count, rbehind, rahead, iodone, arg));
+}
+
+/*
  * vm_pager_put_pages() - inline, see vm/vm_pager.h
  * vm_pager_has_page() - inline, see vm/vm_pager.h
  */
@@ -289,12 +360,11 @@
 	bp->b_rcred = NOCRED;
 	bp->b_wcred = NOCRED;
 	bp->b_qindex = 0;	/* On no queue (QUEUE_NONE) */
-	bp->b_saveaddr = (caddr_t)(MAXPHYS * (bp - swbuf)) + swapbkva;
-	bp->b_data = bp->b_saveaddr;
-	bp->b_kvabase = bp->b_saveaddr;
+	bp->b_kvabase = (caddr_t)(MAXPHYS * (bp - swbuf)) + swapbkva;
+	bp->b_data = bp->b_kvabase;
 	bp->b_kvasize = MAXPHYS;
+	bp->b_flags = 0;
 	bp->b_xflags = 0;
-	bp->b_flags = 0;
 	bp->b_ioflags = 0;
 	bp->b_iodone = NULL;
 	bp->b_error = 0;

Modified: trunk/sys/vm/vm_pager.h
===================================================================
--- trunk/sys/vm/vm_pager.h	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_pager.h	2020-02-08 19:35:48 UTC (rev 12314)
@@ -33,7 +33,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)vm_pager.h	8.4 (Berkeley) 1/12/94
- * $FreeBSD: stable/10/sys/vm/vm_pager.h 308365 2016-11-06 13:37:33Z kib $
+ * $FreeBSD: stable/11/sys/vm/vm_pager.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 /*
@@ -51,19 +51,26 @@
 typedef vm_object_t pgo_alloc_t(void *, vm_ooffset_t, vm_prot_t, vm_ooffset_t,
     struct ucred *);
 typedef void pgo_dealloc_t(vm_object_t);
-typedef int pgo_getpages_t(vm_object_t, vm_page_t *, int, int);
+typedef int pgo_getpages_t(vm_object_t, vm_page_t *, int, int *, int *);
+typedef void pgo_getpages_iodone_t(void *, vm_page_t *, int, int);
+typedef int pgo_getpages_async_t(vm_object_t, vm_page_t *, int, int *, int *,
+    pgo_getpages_iodone_t, void *);
 typedef void pgo_putpages_t(vm_object_t, vm_page_t *, int, int, int *);
 typedef boolean_t pgo_haspage_t(vm_object_t, vm_pindex_t, int *, int *);
+typedef int pgo_populate_t(vm_object_t, vm_pindex_t, int, vm_prot_t,
+    vm_pindex_t *, vm_pindex_t *);
 typedef void pgo_pageunswapped_t(vm_page_t);
 
 struct pagerops {
-	pgo_init_t	*pgo_init;		/* Initialize pager. */
-	pgo_alloc_t	*pgo_alloc;		/* Allocate pager. */
-	pgo_dealloc_t	*pgo_dealloc;		/* Disassociate. */
-	pgo_getpages_t	*pgo_getpages;		/* Get (read) page. */
-	pgo_putpages_t	*pgo_putpages;		/* Put (write) page. */
-	pgo_haspage_t	*pgo_haspage;		/* Does pager have page? */
-	pgo_pageunswapped_t *pgo_pageunswapped;
+	pgo_init_t		*pgo_init;		/* Initialize pager. */
+	pgo_alloc_t		*pgo_alloc;		/* Allocate pager. */
+	pgo_dealloc_t		*pgo_dealloc;		/* Disassociate. */
+	pgo_getpages_t		*pgo_getpages;		/* Get (read) page. */
+	pgo_getpages_async_t	*pgo_getpages_async;	/* Get page asyncly. */
+	pgo_putpages_t		*pgo_putpages;		/* Put (write) page. */
+	pgo_haspage_t		*pgo_haspage;		/* Query page. */
+	pgo_populate_t		*pgo_populate;		/* Bulk spec pagein. */
+	pgo_pageunswapped_t	*pgo_pageunswapped;
 };
 
 extern struct pagerops defaultpagerops;
@@ -92,6 +99,7 @@
 
 #define	VM_PAGER_PUT_SYNC		0x0001
 #define	VM_PAGER_PUT_INVAL		0x0002
+#define	VM_PAGER_PUT_NOREUSE		0x0004
 #define VM_PAGER_CLUSTER_OK		0x0008
 
 #ifdef _KERNEL
@@ -103,34 +111,12 @@
     vm_ooffset_t, struct ucred *);
 void vm_pager_bufferinit(void);
 void vm_pager_deallocate(vm_object_t);
-static __inline int vm_pager_get_pages(vm_object_t, vm_page_t *, int, int);
+int vm_pager_get_pages(vm_object_t, vm_page_t *, int, int *, int *);
+int vm_pager_get_pages_async(vm_object_t, vm_page_t *, int, int *, int *,
+    pgo_getpages_iodone_t, void *);
 void vm_pager_init(void);
 vm_object_t vm_pager_object_lookup(struct pagerlst *, void *);
 
-/*
- *	vm_page_get_pages:
- *
- *	Retrieve pages from the VM system in order to map them into an object
- *	( or into VM space somewhere ).  If the pagein was successful, we
- *	must fully validate it.
- */
-static __inline int
-vm_pager_get_pages(
-	vm_object_t object,
-	vm_page_t *m,
-	int count,
-	int reqpage
-) {
-	int r;
-
-	VM_OBJECT_ASSERT_WLOCKED(object);
-	r = (*pagertab[object->type]->pgo_getpages)(object, m, count, reqpage);
-	if (r == VM_PAGER_OK && m[reqpage]->valid != VM_PAGE_BITS_ALL) {
-		vm_page_zero_invalid(m[reqpage], TRUE);
-	}
-	return (r);
-}
-
 static __inline void
 vm_pager_put_pages(
 	vm_object_t object,
@@ -170,6 +156,19 @@
 	return (ret);
 } 
 
+static __inline int
+vm_pager_populate(vm_object_t object, vm_pindex_t pidx, int fault_type,
+    vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last)
+{
+
+	MPASS((object->flags & OBJ_POPULATE) != 0);
+	MPASS(pidx < object->size);
+	MPASS(object->paging_in_progress > 0);
+	return ((*pagertab[object->type]->pgo_populate)(object, pidx,
+	    fault_type, max_prot, first, last));
+}
+
+
 /* 
  *      vm_pager_page_unswapped
  * 
@@ -195,6 +194,9 @@
 struct cdev_pager_ops {
 	int (*cdev_pg_fault)(vm_object_t vm_obj, vm_ooffset_t offset,
 	    int prot, vm_page_t *mres);
+	int (*cdev_pg_populate)(vm_object_t vm_obj, vm_pindex_t pidx,
+	    int fault_type, vm_prot_t max_prot, vm_pindex_t *first,
+	    vm_pindex_t *last);
 	int (*cdev_pg_ctor)(void *handle, vm_ooffset_t size, vm_prot_t prot,
 	    vm_ooffset_t foff, struct ucred *cred, u_short *color);
 	void (*cdev_pg_dtor)(void *handle);
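
The widened struct pagerops keeps the long-standing dispatch pattern used in vm_pager.c:
an object's type selects an operations vector, and each method is reached through a
function pointer. Below is a stripped-down userland sketch of that pattern; the names are
hypothetical and not part of the kernel API.

	#include <stdio.h>

	typedef int getpages_t(void *obj, int count);

	/* One operations vector per object type, mirroring struct pagerops. */
	struct pagelike_ops {
		getpages_t	*op_getpages;
	};

	static int
	file_getpages(void *obj, int count)
	{

		(void)obj;
		printf("file pager: paging in %d page(s)\n", count);
		return (0);
	}

	static const struct pagelike_ops file_ops = {
		.op_getpages = file_getpages,
	};

	/* Indexed by an object "type", like pagertab[object->type]. */
	static const struct pagelike_ops *optab[] = {
		&file_ops,
	};

	int
	main(void)
	{
		int type = 0;

		/* Dispatch through the table, as vm_pager_get_pages() does. */
		return ((*optab[type]->op_getpages)(NULL, 1));
	}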

Modified: trunk/sys/vm/vm_param.h
===================================================================
--- trunk/sys/vm/vm_param.h	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_param.h	2020-02-08 19:35:48 UTC (rev 12314)
@@ -58,7 +58,7 @@
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  *
- * $FreeBSD: stable/10/sys/vm/vm_param.h 254168 2013-08-09 23:47:43Z zont $
+ * $FreeBSD: stable/11/sys/vm/vm_param.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 /*
@@ -76,16 +76,17 @@
 #define	VM_TOTAL		1	/* struct vmtotal */
 #define	VM_METER                VM_TOTAL/* deprecated, use VM_TOTAL */
 #define	VM_LOADAVG	 	2	/* struct loadavg */
-#define VM_V_FREE_MIN		3	/* cnt.v_free_min */
-#define VM_V_FREE_TARGET	4	/* cnt.v_free_target */
-#define VM_V_FREE_RESERVED	5	/* cnt.v_free_reserved */
-#define VM_V_INACTIVE_TARGET	6	/* cnt.v_inactive_target */
-#define	VM_V_CACHE_MIN		7	/* cnt.v_cache_min */
-#define	VM_V_CACHE_MAX		8	/* cnt.v_cache_max */
-#define VM_V_PAGEOUT_FREE_MIN	9	/* cnt.v_pageout_free_min */
+#define VM_V_FREE_MIN		3	/* vm_cnt.v_free_min */
+#define VM_V_FREE_TARGET	4	/* vm_cnt.v_free_target */
+#define VM_V_FREE_RESERVED	5	/* vm_cnt.v_free_reserved */
+#define VM_V_INACTIVE_TARGET	6	/* vm_cnt.v_inactive_target */
+#define	VM_OBSOLETE_7		7	/* unused, formerly v_cache_min */
+#define	VM_OBSOLETE_8		8	/* unused, formerly v_cache_max */
+#define VM_V_PAGEOUT_FREE_MIN	9	/* vm_cnt.v_pageout_free_min */
 #define	VM_OBSOLETE_10		10	/* pageout algorithm */
 #define VM_SWAPPING_ENABLED	11	/* swapping enabled */
-#define	VM_MAXID		12	/* number of valid vm ids */
+#define VM_OVERCOMMIT		12	/* vm.overcommit */
+#define	VM_MAXID		13	/* number of valid vm ids */
 
 /*
  * Structure for swap device statistics
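
The renumbered CTL_VM second-level identifiers above are consumed through sysctl(3);
VM_TOTAL, for instance, still returns a struct vmtotal. A minimal userland reader,
assuming a FreeBSD or MidnightBSD libc and headers, with error handling kept short:

	#include <sys/types.h>
	#include <sys/sysctl.h>
	#include <sys/vmmeter.h>
	#include <stdint.h>
	#include <stdio.h>

	int
	main(void)
	{
		struct vmtotal vmt;
		size_t len = sizeof(vmt);
		int mib[2] = { CTL_VM, VM_TOTAL };

		if (sysctl(mib, 2, &vmt, &len, NULL, 0) == -1) {
			perror("sysctl");
			return (1);
		}
		/* t_free is the number of free pages reported by the VM system. */
		printf("free pages: %jd\n", (intmax_t)vmt.t_free);
		return (0);
	}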

Modified: trunk/sys/vm/vm_phys.c
===================================================================
--- trunk/sys/vm/vm_phys.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_phys.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -38,7 +38,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vm_phys.c 308349 2016-11-05 20:14:23Z markj $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_phys.c 331614 2018-03-27 13:09:35Z kib $");
 
 #include "opt_ddb.h"
 #include "opt_vm.h"
@@ -49,13 +49,14 @@
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
-#if MAXMEMDOM > 1
 #include <sys/proc.h>
-#endif
 #include <sys/queue.h>
+#include <sys/rwlock.h>
 #include <sys/sbuf.h>
 #include <sys/sysctl.h>
+#include <sys/tree.h>
 #include <sys/vmmeter.h>
+#include <sys/seq.h>
 
 #include <ddb/ddb.h>
 
@@ -66,10 +67,15 @@
 #include <vm/vm_page.h>
 #include <vm/vm_phys.h>
 
+#include <vm/vm_domain.h>
+
 _Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX,
     "Too many physsegs.");
 
+#ifdef VM_NUMA_ALLOC
 struct mem_affinity *mem_affinity;
+int *mem_locality;
+#endif
 
 int vm_ndomains = 1;
 
@@ -76,13 +82,25 @@
 struct vm_phys_seg vm_phys_segs[VM_PHYSSEG_MAX];
 int vm_phys_nsegs;
 
-#define VM_PHYS_FICTITIOUS_NSEGS	8
-static struct vm_phys_fictitious_seg {
+struct vm_phys_fictitious_seg;
+static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *,
+    struct vm_phys_fictitious_seg *);
+
+RB_HEAD(fict_tree, vm_phys_fictitious_seg) vm_phys_fictitious_tree =
+    RB_INITIALIZER(_vm_phys_fictitious_tree);
+
+struct vm_phys_fictitious_seg {
+	RB_ENTRY(vm_phys_fictitious_seg) node;
+	/* Memory region data */
 	vm_paddr_t	start;
 	vm_paddr_t	end;
 	vm_page_t	first_page;
-} vm_phys_fictitious_segs[VM_PHYS_FICTITIOUS_NSEGS];
-static struct mtx vm_phys_fictitious_reg_mtx;
+};
+
+RB_GENERATE_STATIC(fict_tree, vm_phys_fictitious_seg, node,
+    vm_phys_fictitious_cmp);
+
+static struct rwlock vm_phys_fictitious_reg_lock;
 MALLOC_DEFINE(M_FICT_PAGES, "vm_fictitious", "Fictitious VM pages");
 
 static struct vm_freelist
@@ -127,21 +145,139 @@
 SYSCTL_OID(_vm, OID_AUTO, phys_segs, CTLTYPE_STRING | CTLFLAG_RD,
     NULL, 0, sysctl_vm_phys_segs, "A", "Phys Seg Info");
 
+#ifdef VM_NUMA_ALLOC
+static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS);
+SYSCTL_OID(_vm, OID_AUTO, phys_locality, CTLTYPE_STRING | CTLFLAG_RD,
+    NULL, 0, sysctl_vm_phys_locality, "A", "Phys Locality Info");
+#endif
+
 SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD,
     &vm_ndomains, 0, "Number of physical memory domains available.");
 
+/*
+ * Default to first-touch + round-robin.
+ */
+static struct mtx vm_default_policy_mtx;
+MTX_SYSINIT(vm_default_policy, &vm_default_policy_mtx, "default policy mutex",
+    MTX_DEF);
+#ifdef VM_NUMA_ALLOC
+static struct vm_domain_policy vm_default_policy =
+    VM_DOMAIN_POLICY_STATIC_INITIALISER(VM_POLICY_FIRST_TOUCH_ROUND_ROBIN, 0);
+#else
+/* Use round-robin so the domain policy code will only try once per allocation */
+static struct vm_domain_policy vm_default_policy =
+    VM_DOMAIN_POLICY_STATIC_INITIALISER(VM_POLICY_ROUND_ROBIN, 0);
+#endif
+
 static vm_page_t vm_phys_alloc_domain_pages(int domain, int flind, int pool,
     int order);
+static vm_page_t vm_phys_alloc_seg_contig(struct vm_phys_seg *seg,
+    u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
+    vm_paddr_t boundary);
 static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain);
 static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end);
-static int vm_phys_paddr_to_segind(vm_paddr_t pa);
 static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
     int order);
 
+static int
+sysctl_vm_default_policy(SYSCTL_HANDLER_ARGS)
+{
+	char policy_name[32];
+	int error;
+
+	mtx_lock(&vm_default_policy_mtx);
+
+	/* Map policy to output string */
+	switch (vm_default_policy.p.policy) {
+	case VM_POLICY_FIRST_TOUCH:
+		strcpy(policy_name, "first-touch");
+		break;
+	case VM_POLICY_FIRST_TOUCH_ROUND_ROBIN:
+		strcpy(policy_name, "first-touch-rr");
+		break;
+	case VM_POLICY_ROUND_ROBIN:
+	default:
+		strcpy(policy_name, "rr");
+		break;
+	}
+	mtx_unlock(&vm_default_policy_mtx);
+
+	error = sysctl_handle_string(oidp, &policy_name[0],
+	    sizeof(policy_name), req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+
+	mtx_lock(&vm_default_policy_mtx);
+	/* Set: match on the subset of policies that make sense as a default */
+	if (strcmp("first-touch-rr", policy_name) == 0) {
+		vm_domain_policy_set(&vm_default_policy,
+		    VM_POLICY_FIRST_TOUCH_ROUND_ROBIN, 0);
+	} else if (strcmp("first-touch", policy_name) == 0) {
+		vm_domain_policy_set(&vm_default_policy,
+		    VM_POLICY_FIRST_TOUCH, 0);
+	} else if (strcmp("rr", policy_name) == 0) {
+		vm_domain_policy_set(&vm_default_policy,
+		    VM_POLICY_ROUND_ROBIN, 0);
+	} else {
+		error = EINVAL;
+		goto finish;
+	}
+
+	error = 0;
+finish:
+	mtx_unlock(&vm_default_policy_mtx);
+	return (error);
+}
+
+SYSCTL_PROC(_vm, OID_AUTO, default_policy, CTLTYPE_STRING | CTLFLAG_RW,
+    0, 0, sysctl_vm_default_policy, "A",
+    "Default policy (rr, first-touch, first-touch-rr");
+
+/*
+ * Red-black tree helpers for vm fictitious range management.
+ */
+static inline int
+vm_phys_fictitious_in_range(struct vm_phys_fictitious_seg *p,
+    struct vm_phys_fictitious_seg *range)
+{
+
+	KASSERT(range->start != 0 && range->end != 0,
+	    ("Invalid range passed on search for vm_fictitious page"));
+	if (p->start >= range->end)
+		return (1);
+	if (p->start < range->start)
+		return (-1);
+
+	return (0);
+}
+
+static int
+vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *p1,
+    struct vm_phys_fictitious_seg *p2)
+{
+
+	/* Check if this is a search for a page */
+	if (p1->end == 0)
+		return (vm_phys_fictitious_in_range(p1, p2));
+
+	KASSERT(p2->end != 0,
+    ("Invalid range passed as second parameter to vm fictitious comparison"));
+
+	/* Searching to add a new range */
+	if (p1->end <= p2->start)
+		return (-1);
+	if (p1->start >= p2->end)
+		return (1);
+
+	panic("Trying to add overlapping vm fictitious ranges:\n"
+	    "[%#jx:%#jx] and [%#jx:%#jx]", (uintmax_t)p1->start,
+	    (uintmax_t)p1->end, (uintmax_t)p2->start, (uintmax_t)p2->end);
+}
+
 static __inline int
 vm_rr_selectdomain(void)
 {
-#if MAXMEMDOM > 1
+#ifdef VM_NUMA_ALLOC
 	struct thread *td;
 
 	td = curthread;
@@ -154,6 +290,53 @@
 #endif
 }
 
+/*
+ * Initialise a VM domain iterator.
+ *
+ * Check the thread policy, then the proc policy,
+ * then default to the system policy.
+ *
+ * Later on the various layers will have this logic
+ * plumbed into them and the phys code will be explicitly
+ * handed a VM domain policy to use.
+ */
+static void
+vm_policy_iterator_init(struct vm_domain_iterator *vi)
+{
+#ifdef VM_NUMA_ALLOC
+	struct vm_domain_policy lcl;
+#endif
+
+	vm_domain_iterator_init(vi);
+
+#ifdef VM_NUMA_ALLOC
+	/* Copy out the thread policy */
+	vm_domain_policy_localcopy(&lcl, &curthread->td_vm_dom_policy);
+	if (lcl.p.policy != VM_POLICY_NONE) {
+		/* Thread policy is present; use it */
+		vm_domain_iterator_set_policy(vi, &lcl);
+		return;
+	}
+
+	vm_domain_policy_localcopy(&lcl,
+	    &curthread->td_proc->p_vm_dom_policy);
+	if (lcl.p.policy != VM_POLICY_NONE) {
+		/* Process policy is present; use it */
+		vm_domain_iterator_set_policy(vi, &lcl);
+		return;
+	}
+#endif
+	/* Use system default policy */
+	vm_domain_iterator_set_policy(vi, &vm_default_policy);
+}
+
+static void
+vm_policy_iterator_finish(struct vm_domain_iterator *vi)
+{
+
+	vm_domain_iterator_cleanup(vi);
+}
+
 boolean_t
 vm_phys_domain_intersects(long mask, vm_paddr_t low, vm_paddr_t high)
 {
@@ -243,6 +426,54 @@
 	return (error);
 }
 
+/*
+ * Return affinity, or -1 if there's no affinity information.
+ */
+int
+vm_phys_mem_affinity(int f, int t)
+{
+
+#ifdef VM_NUMA_ALLOC
+	if (mem_locality == NULL)
+		return (-1);
+	if (f >= vm_ndomains || t >= vm_ndomains)
+		return (-1);
+	return (mem_locality[f * vm_ndomains + t]);
+#else
+	return (-1);
+#endif
+}
+
+#ifdef VM_NUMA_ALLOC
+/*
+ * Outputs the VM locality table.
+ */
+static int
+sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS)
+{
+	struct sbuf sbuf;
+	int error, i, j;
+
+	error = sysctl_wire_old_buffer(req, 0);
+	if (error != 0)
+		return (error);
+	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
+
+	sbuf_printf(&sbuf, "\n");
+
+	for (i = 0; i < vm_ndomains; i++) {
+		sbuf_printf(&sbuf, "%d: ", i);
+		for (j = 0; j < vm_ndomains; j++) {
+			sbuf_printf(&sbuf, "%d ", vm_phys_mem_affinity(i, j));
+		}
+		sbuf_printf(&sbuf, "\n");
+	}
+	error = sbuf_finish(&sbuf);
+	sbuf_delete(&sbuf);
+	return (error);
+}
+#endif
+
 static void
 vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int tail)
 {
@@ -289,6 +520,7 @@
 static void
 vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end)
 {
+#ifdef VM_NUMA_ALLOC
 	int i;
 
 	if (mem_affinity == NULL) {
@@ -313,6 +545,9 @@
 		    mem_affinity[i].domain);
 		start = mem_affinity[i].end;
 	}
+#else
+	_vm_phys_create_seg(start, end, 0);
+#endif
 }
 
 /*
@@ -473,7 +708,8 @@
 			}
 		}
 	}
-	mtx_init(&vm_phys_fictitious_reg_mtx, "vmfctr", NULL, MTX_DEF);
+
+	rw_init(&vm_phys_fictitious_reg_lock, "vmfctr");
 }
 
 /*
@@ -495,36 +731,6 @@
 }
 
 /*
- * Initialize a physical page and add it to the free lists.
- */
-void
-vm_phys_add_page(vm_paddr_t pa)
-{
-	vm_page_t m;
-	struct vm_domain *vmd;
-
-	cnt.v_page_count++;
-	m = vm_phys_paddr_to_vm_page(pa);
-	m->busy_lock = VPB_UNBUSIED;
-	m->phys_addr = pa;
-	m->queue = PQ_NONE;
-	m->segind = vm_phys_paddr_to_segind(pa);
-	vmd = vm_phys_domain(m);
-	vmd->vmd_page_count++;
-	vmd->vmd_segs |= 1UL << m->segind;
-	m->flags = PG_FREE;
-	KASSERT(m->order == VM_NFREEORDER,
-	    ("vm_phys_add_page: page %p has unexpected order %d",
-	    m, m->order));
-	m->pool = VM_FREEPOOL_DEFAULT;
-	pmap_page_init(m);
-	mtx_lock(&vm_page_queue_free_mtx);
-	vm_phys_freecnt_adj(m, 1);
-	vm_phys_free_pages(m, 0);
-	mtx_unlock(&vm_page_queue_free_mtx);
-}
-
-/*
  * Allocate a contiguous, power of two-sized set of physical pages
  * from the free lists.
  *
@@ -534,7 +740,8 @@
 vm_phys_alloc_pages(int pool, int order)
 {
 	vm_page_t m;
-	int dom, domain, flind;
+	int domain, flind;
+	struct vm_domain_iterator vi;
 
 	KASSERT(pool < VM_NFREEPOOL,
 	    ("vm_phys_alloc_pages: pool %d is out of range", pool));
@@ -541,8 +748,9 @@
 	KASSERT(order < VM_NFREEORDER,
 	    ("vm_phys_alloc_pages: order %d is out of range", order));
 
-	for (dom = 0; dom < vm_ndomains; dom++) {
-		domain = vm_rr_selectdomain();
+	vm_policy_iterator_init(&vi);
+
+	while ((vm_domain_iterator_run(&vi, &domain)) == 0) {
 		for (flind = 0; flind < vm_nfreelists; flind++) {
 			m = vm_phys_alloc_domain_pages(domain, flind, pool,
 			    order);
@@ -550,6 +758,8 @@
 				return (m);
 		}
 	}
+
+	vm_policy_iterator_finish(&vi);
 	return (NULL);
 }
 
@@ -564,7 +774,8 @@
 vm_phys_alloc_freelist_pages(int freelist, int pool, int order)
 {
 	vm_page_t m;
-	int dom, domain;
+	struct vm_domain_iterator vi;
+	int domain;
 
 	KASSERT(freelist < VM_NFREELIST,
 	    ("vm_phys_alloc_freelist_pages: freelist %d is out of range",
@@ -573,13 +784,17 @@
 	    ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool));
 	KASSERT(order < VM_NFREEORDER,
 	    ("vm_phys_alloc_freelist_pages: order %d is out of range", order));
-	for (dom = 0; dom < vm_ndomains; dom++) {
-		domain = vm_rr_selectdomain();
+
+	vm_policy_iterator_init(&vi);
+
+	while ((vm_domain_iterator_run(&vi, &domain)) == 0) {
 		m = vm_phys_alloc_domain_pages(domain,
 		    vm_freelist_to_flind[freelist], pool, order);
 		if (m != NULL)
 			return (m);
 	}
+
+	vm_policy_iterator_finish(&vi);
 	return (NULL);
 }
 
@@ -643,23 +858,39 @@
 vm_page_t
 vm_phys_fictitious_to_vm_page(vm_paddr_t pa)
 {
-	struct vm_phys_fictitious_seg *seg;
+	struct vm_phys_fictitious_seg tmp, *seg;
 	vm_page_t m;
-	int segind;
 
 	m = NULL;
-	for (segind = 0; segind < VM_PHYS_FICTITIOUS_NSEGS; segind++) {
-		seg = &vm_phys_fictitious_segs[segind];
-		if (pa >= seg->start && pa < seg->end) {
-			m = &seg->first_page[atop(pa - seg->start)];
-			KASSERT((m->flags & PG_FICTITIOUS) != 0,
-			    ("%p not fictitious", m));
-			break;
-		}
-	}
+	tmp.start = pa;
+	tmp.end = 0;
+
+	rw_rlock(&vm_phys_fictitious_reg_lock);
+	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
+	rw_runlock(&vm_phys_fictitious_reg_lock);
+	if (seg == NULL)
+		return (NULL);
+
+	m = &seg->first_page[atop(pa - seg->start)];
+	KASSERT((m->flags & PG_FICTITIOUS) != 0, ("%p not fictitious", m));
+
 	return (m);
 }
 
+static inline void
+vm_phys_fictitious_init_range(vm_page_t range, vm_paddr_t start,
+    long page_count, vm_memattr_t memattr)
+{
+	long i;
+
+	bzero(range, page_count * sizeof(*range));
+	for (i = 0; i < page_count; i++) {
+		vm_page_initfake(&range[i], start + PAGE_SIZE * i, memattr);
+		range[i].oflags &= ~VPO_UNMANAGED;
+		range[i].busy_lock = VPB_UNBUSIED;
+	}
+}
+
 int
 vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end,
     vm_memattr_t memattr)
@@ -666,104 +897,145 @@
 {
 	struct vm_phys_fictitious_seg *seg;
 	vm_page_t fp;
-	long i, page_count;
-	int segind;
+	long page_count;
 #ifdef VM_PHYSSEG_DENSE
-	long pi;
-	boolean_t malloced;
+	long pi, pe;
+	long dpage_count;
 #endif
 
+	KASSERT(start < end,
+	    ("Start of segment isn't less than end (start: %jx end: %jx)",
+	    (uintmax_t)start, (uintmax_t)end));
+
 	page_count = (end - start) / PAGE_SIZE;
 
 #ifdef VM_PHYSSEG_DENSE
 	pi = atop(start);
-	if (pi >= first_page && pi < vm_page_array_size + first_page) {
-		if (atop(end) >= vm_page_array_size + first_page)
-			return (EINVAL);
+	pe = atop(end);
+	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
 		fp = &vm_page_array[pi - first_page];
-		malloced = FALSE;
-	} else
+		if ((pe - first_page) > vm_page_array_size) {
+			/*
+			 * We have a segment that starts inside
+			 * of vm_page_array, but ends outside of it.
+			 *
+			 * Use vm_page_array pages for those that are
+			 * inside of the vm_page_array range, and
+			 * allocate the remaining ones.
+			 */
+			dpage_count = vm_page_array_size - (pi - first_page);
+			vm_phys_fictitious_init_range(fp, start, dpage_count,
+			    memattr);
+			page_count -= dpage_count;
+			start += ptoa(dpage_count);
+			goto alloc;
+		}
+		/*
+		 * We can allocate the full range from vm_page_array,
+		 * so there's no need to register the range in the tree.
+		 */
+		vm_phys_fictitious_init_range(fp, start, page_count, memattr);
+		return (0);
+	} else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
+		/*
+		 * We have a segment that ends inside of vm_page_array,
+		 * but starts outside of it.
+		 */
+		fp = &vm_page_array[0];
+		dpage_count = pe - first_page;
+		vm_phys_fictitious_init_range(fp, ptoa(first_page), dpage_count,
+		    memattr);
+		end -= ptoa(dpage_count);
+		page_count -= dpage_count;
+		goto alloc;
+	} else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
+		/*
+		 * Trying to register a fictitious range that expands before
+		 * and after vm_page_array.
+		 */
+		return (EINVAL);
+	} else {
+alloc:
 #endif
-	{
 		fp = malloc(page_count * sizeof(struct vm_page), M_FICT_PAGES,
-		    M_WAITOK | M_ZERO);
+		    M_WAITOK);
 #ifdef VM_PHYSSEG_DENSE
-		malloced = TRUE;
-#endif
 	}
-	for (i = 0; i < page_count; i++) {
-		vm_page_initfake(&fp[i], start + PAGE_SIZE * i, memattr);
-		fp[i].oflags &= ~VPO_UNMANAGED;
-		fp[i].busy_lock = VPB_UNBUSIED;
-	}
-	mtx_lock(&vm_phys_fictitious_reg_mtx);
-	for (segind = 0; segind < VM_PHYS_FICTITIOUS_NSEGS; segind++) {
-		seg = &vm_phys_fictitious_segs[segind];
-		if (seg->start == 0 && seg->end == 0) {
-			seg->start = start;
-			seg->end = end;
-			seg->first_page = fp;
-			mtx_unlock(&vm_phys_fictitious_reg_mtx);
-			return (0);
-		}
-	}
-	mtx_unlock(&vm_phys_fictitious_reg_mtx);
-#ifdef VM_PHYSSEG_DENSE
-	if (malloced)
 #endif
-		free(fp, M_FICT_PAGES);
-	return (EBUSY);
+	vm_phys_fictitious_init_range(fp, start, page_count, memattr);
+
+	seg = malloc(sizeof(*seg), M_FICT_PAGES, M_WAITOK | M_ZERO);
+	seg->start = start;
+	seg->end = end;
+	seg->first_page = fp;
+
+	rw_wlock(&vm_phys_fictitious_reg_lock);
+	RB_INSERT(fict_tree, &vm_phys_fictitious_tree, seg);
+	rw_wunlock(&vm_phys_fictitious_reg_lock);
+
+	return (0);
 }
 
 void
 vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end)
 {
-	struct vm_phys_fictitious_seg *seg;
-	vm_page_t fp;
-	int segind;
+	struct vm_phys_fictitious_seg *seg, tmp;
 #ifdef VM_PHYSSEG_DENSE
-	long pi;
+	long pi, pe;
 #endif
 
+	KASSERT(start < end,
+	    ("Start of segment isn't less than end (start: %jx end: %jx)",
+	    (uintmax_t)start, (uintmax_t)end));
+
 #ifdef VM_PHYSSEG_DENSE
 	pi = atop(start);
-#endif
-
-	mtx_lock(&vm_phys_fictitious_reg_mtx);
-	for (segind = 0; segind < VM_PHYS_FICTITIOUS_NSEGS; segind++) {
-		seg = &vm_phys_fictitious_segs[segind];
-		if (seg->start == start && seg->end == end) {
-			seg->start = seg->end = 0;
-			fp = seg->first_page;
-			seg->first_page = NULL;
-			mtx_unlock(&vm_phys_fictitious_reg_mtx);
-#ifdef VM_PHYSSEG_DENSE
-			if (pi < first_page || atop(end) >= vm_page_array_size)
-#endif
-				free(fp, M_FICT_PAGES);
+	pe = atop(end);
+	if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
+		if ((pe - first_page) <= vm_page_array_size) {
+			/*
+			 * This segment was allocated using vm_page_array
+			 * only, there's nothing to do since those pages
+			 * were never added to the tree.
+			 */
 			return;
 		}
+		/*
+		 * We have a segment that starts inside
+		 * of vm_page_array, but ends outside of it.
+		 *
+		 * Calculate how many pages were added to the
+		 * tree and free them.
+		 */
+		start = ptoa(first_page + vm_page_array_size);
+	} else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
+		/*
+		 * We have a segment that ends inside of vm_page_array,
+		 * but starts outside of it.
+		 */
+		end = ptoa(first_page);
+	} else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
+		/* Since it's not possible to register such a range, panic. */
+		panic(
+		    "Unregistering not registered fictitious range [%#jx:%#jx]",
+		    (uintmax_t)start, (uintmax_t)end);
 	}
-	mtx_unlock(&vm_phys_fictitious_reg_mtx);
-	KASSERT(0, ("Unregistering not registered fictitious range"));
-}
+#endif
+	tmp.start = start;
+	tmp.end = 0;
 
-/*
- * Find the segment containing the given physical address.
- */
-static int
-vm_phys_paddr_to_segind(vm_paddr_t pa)
-{
-	struct vm_phys_seg *seg;
-	int segind;
-
-	for (segind = 0; segind < vm_phys_nsegs; segind++) {
-		seg = &vm_phys_segs[segind];
-		if (pa >= seg->start && pa < seg->end)
-			return (segind);
+	rw_wlock(&vm_phys_fictitious_reg_lock);
+	seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
+	if (seg->start != start || seg->end != end) {
+		rw_wunlock(&vm_phys_fictitious_reg_lock);
+		panic(
+		    "Unregistering not registered fictitious range [%#jx:%#jx]",
+		    (uintmax_t)start, (uintmax_t)end);
 	}
-	panic("vm_phys_paddr_to_segind: paddr %#jx is not in any segment" ,
-	    (uintmax_t)pa);
+	RB_REMOVE(fict_tree, &vm_phys_fictitious_tree, seg);
+	rw_wunlock(&vm_phys_fictitious_reg_lock);
+	free(seg->first_page, M_FICT_PAGES);
+	free(seg, M_FICT_PAGES);
 }
 
 /*
@@ -853,6 +1125,56 @@
 }
 
 /*
+ * Scan physical memory between the specified addresses "low" and "high" for a
+ * run of contiguous physical pages that satisfy the specified conditions, and
+ * return the lowest page in the run.  The specified "alignment" determines
+ * the alignment of the lowest physical page in the run.  If the specified
+ * "boundary" is non-zero, then the run of physical pages cannot span a
+ * physical address that is a multiple of "boundary".
+ *
+ * "npages" must be greater than zero.  Both "alignment" and "boundary" must
+ * be a power of two.
+ */
+vm_page_t
+vm_phys_scan_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
+    u_long alignment, vm_paddr_t boundary, int options)
+{
+	vm_paddr_t pa_end;
+	vm_page_t m_end, m_run, m_start;
+	struct vm_phys_seg *seg;
+	int segind;
+
+	KASSERT(npages > 0, ("npages is 0"));
+	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
+	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
+	if (low >= high)
+		return (NULL);
+	for (segind = 0; segind < vm_phys_nsegs; segind++) {
+		seg = &vm_phys_segs[segind];
+		if (seg->start >= high)
+			break;
+		if (low >= seg->end)
+			continue;
+		if (low <= seg->start)
+			m_start = seg->first_page;
+		else
+			m_start = &seg->first_page[atop(low - seg->start)];
+		if (high < seg->end)
+			pa_end = high;
+		else
+			pa_end = seg->end;
+		if (pa_end - VM_PAGE_TO_PHYS(m_start) < ptoa(npages))
+			continue;
+		m_end = &seg->first_page[atop(pa_end - seg->start)];
+		m_run = vm_page_scan_contig(npages, m_start, m_end,
+		    alignment, boundary, options);
+		if (m_run != NULL)
+			return (m_run);
+	}
+	return (NULL);
+}
+
+/*
  * Set the pool for a contiguous, power of two-sized set of physical pages. 
  */
 void
@@ -946,7 +1268,7 @@
 	for (;;) {
 		TAILQ_FOREACH_REVERSE(m, &fl[oind].pl, pglist, plinks.q) {
 			for (m_tmp = m; m_tmp < &m[1 << oind]; m_tmp++) {
-				if ((m_tmp->flags & (PG_CACHED | PG_ZERO)) == 0) {
+				if ((m_tmp->flags & PG_ZERO) == 0) {
 					vm_phys_unfree_page(m_tmp);
 					vm_phys_freecnt_adj(m, -1);
 					mtx_unlock(&vm_page_queue_free_mtx);
@@ -990,85 +1312,125 @@
 vm_phys_alloc_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
     u_long alignment, vm_paddr_t boundary)
 {
+	vm_paddr_t pa_end, pa_start;
+	vm_page_t m_run;
+	struct vm_domain_iterator vi;
+	struct vm_phys_seg *seg;
+	int domain, segind;
+
+	KASSERT(npages > 0, ("npages is 0"));
+	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
+	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
+	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+	if (low >= high)
+		return (NULL);
+	vm_policy_iterator_init(&vi);
+restartdom:
+	if (vm_domain_iterator_run(&vi, &domain) != 0) {
+		vm_policy_iterator_finish(&vi);
+		return (NULL);
+	}
+	m_run = NULL;
+	for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
+		seg = &vm_phys_segs[segind];
+		if (seg->start >= high || seg->domain != domain)
+			continue;
+		if (low >= seg->end)
+			break;
+		if (low <= seg->start)
+			pa_start = seg->start;
+		else
+			pa_start = low;
+		if (high < seg->end)
+			pa_end = high;
+		else
+			pa_end = seg->end;
+		if (pa_end - pa_start < ptoa(npages))
+			continue;
+		m_run = vm_phys_alloc_seg_contig(seg, npages, low, high,
+		    alignment, boundary);
+		if (m_run != NULL)
+			break;
+	}
+	if (m_run == NULL && !vm_domain_iterator_isdone(&vi))
+		goto restartdom;
+	vm_policy_iterator_finish(&vi);
+	return (m_run);
+}
+
+/*
+ * Allocate a run of contiguous physical pages from the free list for the
+ * specified segment.
+ */
+static vm_page_t
+vm_phys_alloc_seg_contig(struct vm_phys_seg *seg, u_long npages,
+    vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
+{
 	struct vm_freelist *fl;
-	struct vm_phys_seg *seg;
-	vm_paddr_t pa, pa_last, size;
+	vm_paddr_t pa, pa_end, size;
 	vm_page_t m, m_ret;
 	u_long npages_end;
-	int dom, domain, flind, oind, order, pind;
+	int oind, order, pind;
 
+	KASSERT(npages > 0, ("npages is 0"));
+	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
+	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
-	size = npages << PAGE_SHIFT;
-	KASSERT(size != 0,
-	    ("vm_phys_alloc_contig: size must not be 0"));
-	KASSERT((alignment & (alignment - 1)) == 0,
-	    ("vm_phys_alloc_contig: alignment must be a power of 2"));
-	KASSERT((boundary & (boundary - 1)) == 0,
-	    ("vm_phys_alloc_contig: boundary must be a power of 2"));
 	/* Compute the queue that is the best fit for npages. */
 	for (order = 0; (1 << order) < npages; order++);
-	dom = 0;
-restartdom:
-	domain = vm_rr_selectdomain();
-	for (flind = 0; flind < vm_nfreelists; flind++) {
-		for (oind = min(order, VM_NFREEORDER - 1); oind < VM_NFREEORDER; oind++) {
-			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
-				fl = &vm_phys_free_queues[domain][flind][pind][0];
-				TAILQ_FOREACH(m_ret, &fl[oind].pl, plinks.q) {
+	/* Search for a run satisfying the specified conditions. */
+	size = npages << PAGE_SHIFT;
+	for (oind = min(order, VM_NFREEORDER - 1); oind < VM_NFREEORDER;
+	    oind++) {
+		for (pind = 0; pind < VM_NFREEPOOL; pind++) {
+			fl = (*seg->free_queues)[pind];
+			TAILQ_FOREACH(m_ret, &fl[oind].pl, plinks.q) {
+				/*
+				 * Is the size of this allocation request
+				 * larger than the largest block size?
+				 */
+				if (order >= VM_NFREEORDER) {
 					/*
-					 * A free list may contain physical pages
-					 * from one or more segments.
+					 * Determine if a sufficient number of
+					 * subsequent blocks to satisfy the
+					 * allocation request are free.
 					 */
-					seg = &vm_phys_segs[m_ret->segind];
-					if (seg->start > high ||
-					    low >= seg->end)
+					pa = VM_PAGE_TO_PHYS(m_ret);
+					pa_end = pa + size;
+					if (pa_end < pa)
 						continue;
-
-					/*
-					 * Is the size of this allocation request
-					 * larger than the largest block size?
-					 */
-					if (order >= VM_NFREEORDER) {
-						/*
-						 * Determine if a sufficient number
-						 * of subsequent blocks to satisfy
-						 * the allocation request are free.
-						 */
-						pa = VM_PAGE_TO_PHYS(m_ret);
-						pa_last = pa + size;
-						for (;;) {
-							pa += 1 << (PAGE_SHIFT + VM_NFREEORDER - 1);
-							if (pa >= pa_last)
-								break;
-							if (pa < seg->start ||
-							    pa >= seg->end)
-								break;
-							m = &seg->first_page[atop(pa - seg->start)];
-							if (m->order != VM_NFREEORDER - 1)
-								break;
-						}
-						/* If not, continue to the next block. */
-						if (pa < pa_last)
-							continue;
+					for (;;) {
+						pa += 1 << (PAGE_SHIFT +
+						    VM_NFREEORDER - 1);
+						if (pa >= pa_end ||
+						    pa < seg->start ||
+						    pa >= seg->end)
+							break;
+						m = &seg->first_page[atop(pa -
+						    seg->start)];
+						if (m->order != VM_NFREEORDER -
+						    1)
+							break;
 					}
+					/* If not, go to the next block. */
+					if (pa < pa_end)
+						continue;
+				}
 
-					/*
-					 * Determine if the blocks are within the given range,
-					 * satisfy the given alignment, and do not cross the
-					 * given boundary.
-					 */
-					pa = VM_PAGE_TO_PHYS(m_ret);
-					if (pa >= low &&
-					    pa + size <= high &&
-					    (pa & (alignment - 1)) == 0 &&
-					    ((pa ^ (pa + size - 1)) & ~(boundary - 1)) == 0)
-						goto done;
-				}
+				/*
+				 * Determine if the blocks are within the
+				 * given range, satisfy the given alignment,
+				 * and do not cross the given boundary.
+				 */
+				pa = VM_PAGE_TO_PHYS(m_ret);
+				pa_end = pa + size;
+				if (pa >= low && pa_end <= high &&
+				    (pa & (alignment - 1)) == 0 &&
+				    rounddown2(pa ^ (pa_end - 1), boundary) == 0)
+					goto done;
 			}
 		}
 	}
-	if (++dom < vm_ndomains)
-		goto restartdom;
 	return (NULL);
 done:
 	for (m = m_ret; m < &m_ret[npages]; m = &m[1 << oind]) {
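
The fictitious-segment bookkeeping above replaces a fixed eight-entry array with a
sys/tree.h red-black tree keyed on [start, end), where a lookup probe carries end == 0 to
mean "find the segment containing this address". A self-contained userland sketch of that
probe convention follows, using hypothetical structure names.

	#include <sys/tree.h>
	#include <stdio.h>
	#include <stdlib.h>

	struct seg {
		RB_ENTRY(seg)	node;
		unsigned long	start;
		unsigned long	end;
	};

	static int
	seg_cmp(struct seg *a, struct seg *b)
	{

		/* A probe has end == 0 and matches the segment containing a->start. */
		if (a->end == 0) {
			if (a->start >= b->end)
				return (1);
			if (a->start < b->start)
				return (-1);
			return (0);
		}
		/* Inserting a real segment: order by non-overlapping ranges. */
		if (a->end <= b->start)
			return (-1);
		if (a->start >= b->end)
			return (1);
		return (0);	/* overlap; the kernel version panics here */
	}

	RB_HEAD(seg_tree, seg) seg_root = RB_INITIALIZER(&seg_root);
	RB_GENERATE_STATIC(seg_tree, seg, node, seg_cmp);

	int
	main(void)
	{
		struct seg *s, probe;

		s = calloc(1, sizeof(*s));
		s->start = 0x100000;
		s->end = 0x200000;
		RB_INSERT(seg_tree, &seg_root, s);

		probe.start = 0x180000;
		probe.end = 0;
		s = RB_FIND(seg_tree, &seg_root, &probe);
		printf("segment found: %s\n", s != NULL ? "yes" : "no");
		return (0);
	}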

Modified: trunk/sys/vm/vm_phys.h
===================================================================
--- trunk/sys/vm/vm_phys.h	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_phys.h	2020-02-08 19:35:48 UTC (rev 12314)
@@ -29,7 +29,7 @@
  * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/vm/vm_phys.h 285634 2015-07-16 14:41:58Z kib $
+ * $FreeBSD: stable/11/sys/vm/vm_phys.h 329381 2018-02-16 16:16:33Z mjg $
  */
 
 /*
@@ -62,6 +62,7 @@
 };
 
 extern struct mem_affinity *mem_affinity;
+extern int *mem_locality;
 extern int vm_ndomains;
 extern struct vm_phys_seg vm_phys_segs[];
 extern int vm_phys_nsegs;
@@ -69,7 +70,6 @@
 /*
  * The following functions are only to be used by the virtual memory system.
  */
-void vm_phys_add_page(vm_paddr_t pa);
 void vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end);
 vm_page_t vm_phys_alloc_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
     u_long alignment, vm_paddr_t boundary);
@@ -84,9 +84,12 @@
 void vm_phys_free_pages(vm_page_t m, int order);
 void vm_phys_init(void);
 vm_page_t vm_phys_paddr_to_vm_page(vm_paddr_t pa);
+vm_page_t vm_phys_scan_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
+    u_long alignment, vm_paddr_t boundary, int options);
 void vm_phys_set_pool(int pool, vm_page_t m, int order);
 boolean_t vm_phys_unfree_page(vm_page_t m);
 boolean_t vm_phys_zero_pages_idle(void);
+int vm_phys_mem_affinity(int f, int t);
 
 /*
  *	vm_phys_domain:
@@ -96,7 +99,7 @@
 static inline struct vm_domain *
 vm_phys_domain(vm_page_t m)
 {
-#if MAXMEMDOM > 1
+#ifdef VM_NUMA_ALLOC
 	int domn, segind;
 
 	/* XXXKIB try to assert that the page is managed */
@@ -110,13 +113,13 @@
 #endif
 }
 
-static inline void
+static inline u_int
 vm_phys_freecnt_adj(vm_page_t m, int adj)
 {
 
 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
-	cnt.v_free_count += adj;
 	vm_phys_domain(m)->vmd_free_count += adj;
+	return (vm_cnt.v_free_count += adj);
 }
 
 #endif	/* _KERNEL */
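
Both vm_phys_alloc_contig() and the new vm_phys_scan_contig() take power-of-two
"alignment" and "boundary" arguments: a candidate run is usable only if its first byte is
alignment-aligned and its first and last bytes fall between the same pair of
boundary-sized lines. The standalone check below mirrors the test in
vm_phys_alloc_seg_contig(); it is a sketch for illustration, not kernel code.

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Same helper the kernel uses; the divisor must be a power of two. */
	#define	rounddown2(x, align)	((x) & ~((uint64_t)(align) - 1))

	static int
	range_ok(uint64_t pa, uint64_t size, uint64_t alignment, uint64_t boundary)
	{
		uint64_t pa_end = pa + size;

		/* Alignment of the first page, and no boundary crossing. */
		return ((pa & (alignment - 1)) == 0 &&
		    rounddown2(pa ^ (pa_end - 1), boundary) == 0);
	}

	int
	main(void)
	{

		/* A 16 KB run at 0x10000 is 4 KB aligned and crosses no 64 KB line. */
		assert(range_ok(0x10000, 0x4000, 0x1000, 0x10000));
		/* The same run starting at 0xf000 crosses the 64 KB boundary. */
		assert(!range_ok(0xf000, 0x4000, 0x1000, 0x10000));
		printf("alignment/boundary checks behave as expected\n");
		return (0);
	}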

Modified: trunk/sys/vm/vm_radix.c
===================================================================
--- trunk/sys/vm/vm_radix.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_radix.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -50,7 +50,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vm_radix.c 298653 2016-04-26 17:39:54Z pfg $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_radix.c 327785 2018-01-10 20:39:26Z markj $");
 
 #include "opt_ddb.h"
 
@@ -299,21 +299,19 @@
 	 * are needed to store them.
 	 */
 	if (!uma_zone_reserve_kva(vm_radix_node_zone,
-	    ((vm_paddr_t)cnt.v_page_count * PAGE_SIZE) / (PAGE_SIZE +
+	    ((vm_paddr_t)vm_cnt.v_page_count * PAGE_SIZE) / (PAGE_SIZE +
 	    sizeof(struct vm_radix_node))))
 		panic("%s: unable to reserve KVA", __func__);
 }
-SYSINIT(vm_radix_reserve_kva, SI_SUB_KMEM, SI_ORDER_SECOND,
+SYSINIT(vm_radix_reserve_kva, SI_SUB_KMEM, SI_ORDER_THIRD,
     vm_radix_reserve_kva, NULL);
 #endif
 
 /*
  * Initialize the UMA slab zone.
- * Until vm_radix_prealloc() is called, the zone will be served by the
- * UMA boot-time pre-allocated pool of pages.
  */
 void
-vm_radix_init(void)
+vm_radix_zinit(void)
 {
 
 	vm_radix_node_zone = uma_zcreate("RADIX NODE",
@@ -342,8 +340,6 @@
 
 	index = page->pindex;
 
-restart:
-
 	/*
 	 * The owner of record for root is not really important because it
 	 * will never be used.
@@ -361,32 +357,10 @@
 				panic("%s: key %jx is already present",
 				    __func__, (uintmax_t)index);
 			clev = vm_radix_keydiff(m->pindex, index);
-
-			/*
-			 * During node allocation the trie that is being
-			 * walked can be modified because of recursing radix
-			 * trie operations.
-			 * If this is the case, the recursing functions signal
-			 * such situation and the insert operation must
-			 * start from scratch again.
-			 * The freed radix node will then be in the UMA
-			 * caches very likely to avoid the same situation
-			 * to happen.
-			 */
-			rtree->rt_flags |= RT_INSERT_INPROG;
 			tmp = vm_radix_node_get(vm_radix_trimkey(index,
 			    clev + 1), 2, clev);
-			rtree->rt_flags &= ~RT_INSERT_INPROG;
-			if (tmp == NULL) {
-				rtree->rt_flags &= ~RT_TRIE_MODIFIED;
+			if (tmp == NULL)
 				return (ENOMEM);
-			}
-			if ((rtree->rt_flags & RT_TRIE_MODIFIED) != 0) {
-				rtree->rt_flags &= ~RT_TRIE_MODIFIED;
-				tmp->rn_count = 0;
-				vm_radix_node_put(tmp);
-				goto restart;
-			}
 			*parentp = tmp;
 			vm_radix_addpage(tmp, index, clev, page);
 			vm_radix_addpage(tmp, m->pindex, clev, m);
@@ -410,21 +384,9 @@
 	 */
 	newind = rnode->rn_owner;
 	clev = vm_radix_keydiff(newind, index);
-
-	/* See the comments above. */
-	rtree->rt_flags |= RT_INSERT_INPROG;
 	tmp = vm_radix_node_get(vm_radix_trimkey(index, clev + 1), 2, clev);
-	rtree->rt_flags &= ~RT_INSERT_INPROG;
-	if (tmp == NULL) {
-		rtree->rt_flags &= ~RT_TRIE_MODIFIED;
+	if (tmp == NULL)
 		return (ENOMEM);
-	}
-	if ((rtree->rt_flags & RT_TRIE_MODIFIED) != 0) {
-		rtree->rt_flags &= ~RT_TRIE_MODIFIED;
-		tmp->rn_count = 0;
-		vm_radix_node_put(tmp);
-		goto restart;
-	}
 	*parentp = tmp;
 	vm_radix_addpage(tmp, index, clev, page);
 	slot = vm_radix_slot(newind, clev);
@@ -699,10 +661,10 @@
 }
 
 /*
- * Remove the specified index from the tree.
- * Panics if the key is not present.
+ * Remove the specified index from the trie, and return the value stored at
+ * that index.  If the index is not present, return NULL.
  */
-void
+vm_page_t
 vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index)
 {
 	struct vm_radix_node *rnode, *parent;
@@ -709,41 +671,27 @@
 	vm_page_t m;
 	int i, slot;
 
-	/*
-	 * Detect if a page is going to be removed from a trie which is
-	 * already undergoing another trie operation.
-	 * Right now this is only possible for vm_radix_remove() recursing
-	 * into vm_radix_insert().
-	 * If this is the case, the caller must be notified about this
-	 * situation.  It will also takecare to update the RT_TRIE_MODIFIED
-	 * accordingly.
-	 * The RT_TRIE_MODIFIED bit is set here because the remove operation
-	 * will always succeed.
-	 */
-	if ((rtree->rt_flags & RT_INSERT_INPROG) != 0)
-		rtree->rt_flags |= RT_TRIE_MODIFIED;
-
 	rnode = vm_radix_getroot(rtree);
 	if (vm_radix_isleaf(rnode)) {
 		m = vm_radix_topage(rnode);
 		if (m->pindex != index)
-			panic("%s: invalid key found", __func__);
+			return (NULL);
 		vm_radix_setroot(rtree, NULL);
-		return;
+		return (m);
 	}
 	parent = NULL;
 	for (;;) {
 		if (rnode == NULL)
-			panic("vm_radix_remove: impossible to locate the key");
+			return (NULL);
 		slot = vm_radix_slot(index, rnode->rn_clev);
 		if (vm_radix_isleaf(rnode->rn_child[slot])) {
 			m = vm_radix_topage(rnode->rn_child[slot]);
 			if (m->pindex != index)
-				panic("%s: invalid key found", __func__);
+				return (NULL);
 			rnode->rn_child[slot] = NULL;
 			rnode->rn_count--;
 			if (rnode->rn_count > 1)
-				break;
+				return (m);
 			for (i = 0; i < VM_RADIX_COUNT; i++)
 				if (rnode->rn_child[i] != NULL)
 					break;
@@ -760,7 +708,7 @@
 			rnode->rn_count--;
 			rnode->rn_child[i] = NULL;
 			vm_radix_node_put(rnode);
-			break;
+			return (m);
 		}
 		parent = rnode;
 		rnode = rnode->rn_child[slot];
@@ -777,9 +725,6 @@
 {
 	struct vm_radix_node *root;
 
-	KASSERT((rtree->rt_flags & RT_INSERT_INPROG) == 0,
-	    ("vm_radix_reclaim_allnodes: unexpected trie recursion"));
-
 	root = vm_radix_getroot(rtree);
 	if (root == NULL)
 		return;
@@ -831,6 +776,12 @@
 	panic("%s: original replacing page not found", __func__);
 }
 
+void
+vm_radix_wait(void)
+{
+	uma_zwait(vm_radix_node_zone);
+}
+
 #ifdef DDB
 /*
  * Show details about the given radix node.

Modified: trunk/sys/vm/vm_radix.h
===================================================================
--- trunk/sys/vm/vm_radix.h	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_radix.h	2020-02-08 19:35:48 UTC (rev 12314)
@@ -26,7 +26,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/vm/vm_radix.h 266591 2014-05-23 17:47:49Z alc $
+ * $FreeBSD: stable/11/sys/vm/vm_radix.h 327785 2018-01-10 20:39:26Z markj $
  */
 
 #ifndef _VM_RADIX_H_
@@ -36,15 +36,30 @@
 
 #ifdef _KERNEL
 
-void		vm_radix_init(void);
 int		vm_radix_insert(struct vm_radix *rtree, vm_page_t page);
+void		vm_radix_wait(void);
 boolean_t	vm_radix_is_singleton(struct vm_radix *rtree);
 vm_page_t	vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index);
 vm_page_t	vm_radix_lookup_ge(struct vm_radix *rtree, vm_pindex_t index);
 vm_page_t	vm_radix_lookup_le(struct vm_radix *rtree, vm_pindex_t index);
 void		vm_radix_reclaim_allnodes(struct vm_radix *rtree);
-void		vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index);
+vm_page_t	vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index);
 vm_page_t	vm_radix_replace(struct vm_radix *rtree, vm_page_t newpage);
+void		vm_radix_zinit(void);
 
+static __inline void
+vm_radix_init(struct vm_radix *rtree)
+{
+
+	rtree->rt_root = 0;
+}
+
+static __inline boolean_t
+vm_radix_is_empty(struct vm_radix *rtree)
+{
+
+	return (rtree->rt_root == 0);
+}
+
 #endif /* _KERNEL */
 #endif /* !_VM_RADIX_H_ */

Modified: trunk/sys/vm/vm_reserv.c
===================================================================
--- trunk/sys/vm/vm_reserv.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_reserv.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -1,7 +1,7 @@
 /* $MidnightBSD$ */
 /*-
  * Copyright (c) 2002-2006 Rice University
- * Copyright (c) 2007-2008 Alan L. Cox <alc at cs.rice.edu>
+ * Copyright (c) 2007-2011 Alan L. Cox <alc at cs.rice.edu>
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project by Alan L. Cox,
@@ -38,7 +38,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vm_reserv.c 280045 2015-03-15 18:40:06Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_reserv.c 351826 2019-09-04 19:31:37Z ray $");
 
 #include "opt_vm.h"
 
@@ -52,6 +52,7 @@
 #include <sys/sbuf.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
+#include <sys/vmmeter.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
@@ -63,7 +64,7 @@
 
 /*
  * The reservation system supports the speculative allocation of large physical
- * pages ("superpages").  Speculative allocation enables the fully-automatic
+ * pages ("superpages").  Speculative allocation enables the fully automatic
  * utilization of superpages by the virtual memory system.  In other words, no
  * programmatic directives are required to use superpages.
  */
@@ -94,6 +95,61 @@
     (((object)->pg_color + (pindex)) & (VM_LEVEL_0_NPAGES - 1))
 
 /*
+ * The size of a population map entry
+ */
+typedef	u_long		popmap_t;
+
+/*
+ * The number of bits in a population map entry
+ */
+#define	NBPOPMAP	(NBBY * sizeof(popmap_t))
+
+/*
+ * The number of population map entries in a reservation
+ */
+#define	NPOPMAP		howmany(VM_LEVEL_0_NPAGES, NBPOPMAP)
+
+/*
+ * Clear a bit in the population map.
+ */
+static __inline void
+popmap_clear(popmap_t popmap[], int i)
+{
+
+	popmap[i / NBPOPMAP] &= ~(1UL << (i % NBPOPMAP));
+}
+
+/*
+ * Set a bit in the population map.
+ */
+static __inline void
+popmap_set(popmap_t popmap[], int i)
+{
+
+	popmap[i / NBPOPMAP] |= 1UL << (i % NBPOPMAP);
+}
+
+/*
+ * Is a bit in the population map clear?
+ */
+static __inline boolean_t
+popmap_is_clear(popmap_t popmap[], int i)
+{
+
+	return ((popmap[i / NBPOPMAP] & (1UL << (i % NBPOPMAP))) == 0);
+}
+
+/*
+ * Is a bit in the population map set?
+ */
+static __inline boolean_t
+popmap_is_set(popmap_t popmap[], int i)
+{
+
+	return ((popmap[i / NBPOPMAP] & (1UL << (i % NBPOPMAP))) != 0);
+}
+
+/*
  * The reservation structure
  *
  * A reservation structure is constructed whenever a large physical page is
@@ -101,11 +157,11 @@
  * physical pages for the range [pindex, pindex + VM_LEVEL_0_NPAGES) of offsets
  * within that object.  The reservation's "popcnt" tracks the number of these
  * small physical pages that are in use at any given time.  When and if the
- * reservation is not fully utilized, it appears in the queue of partially-
+ * reservation is not fully utilized, it appears in the queue of partially
  * populated reservations.  The reservation always appears on the containing
  * object's list of reservations.
  *
- * A partially-populated reservation can be broken and reclaimed at any time.
+ * A partially populated reservation can be broken and reclaimed at any time.
  */
 struct vm_reserv {
 	TAILQ_ENTRY(vm_reserv) partpopq;
@@ -115,6 +171,7 @@
 	vm_page_t	pages;			/* first page of a superpage */
 	int		popcnt;			/* # of pages in use */
 	char		inpartpopq;
+	popmap_t	popmap[NPOPMAP];	/* bit vector of used pages */
 };
 
 /*
@@ -141,11 +198,11 @@
 static vm_reserv_t vm_reserv_array;
 
 /*
- * The partially-populated reservation queue
+ * The partially populated reservation queue
  *
- * This queue enables the fast recovery of an unused cached or free small page
- * from a partially-populated reservation.  The reservation at the head of
- * this queue is the least-recently-changed, partially-populated reservation.
+ * This queue enables the fast recovery of an unused free small page from a
+ * partially populated reservation.  The reservation at the head of this queue
+ * is the least recently changed, partially populated reservation.
  *
  * Access to this queue is synchronized by the free page queue lock.
  */
@@ -162,26 +219,60 @@
 SYSCTL_LONG(_vm_reserv, OID_AUTO, freed, CTLFLAG_RD,
     &vm_reserv_freed, 0, "Cumulative number of freed reservations");
 
+static int sysctl_vm_reserv_fullpop(SYSCTL_HANDLER_ARGS);
+
+SYSCTL_PROC(_vm_reserv, OID_AUTO, fullpop, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
+    sysctl_vm_reserv_fullpop, "I", "Current number of full reservations");
+
 static int sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS);
 
 SYSCTL_OID(_vm_reserv, OID_AUTO, partpopq, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0,
-    sysctl_vm_reserv_partpopq, "A", "Partially-populated reservation queues");
+    sysctl_vm_reserv_partpopq, "A", "Partially populated reservation queues");
 
 static long vm_reserv_reclaimed;
 SYSCTL_LONG(_vm_reserv, OID_AUTO, reclaimed, CTLFLAG_RD,
     &vm_reserv_reclaimed, 0, "Cumulative number of reclaimed reservations");
 
-static void		vm_reserv_depopulate(vm_reserv_t rv);
+static void		vm_reserv_break(vm_reserv_t rv);
+static void		vm_reserv_depopulate(vm_reserv_t rv, int index);
 static vm_reserv_t	vm_reserv_from_page(vm_page_t m);
 static boolean_t	vm_reserv_has_pindex(vm_reserv_t rv,
 			    vm_pindex_t pindex);
-static void		vm_reserv_populate(vm_reserv_t rv);
+static void		vm_reserv_populate(vm_reserv_t rv, int index);
 static void		vm_reserv_reclaim(vm_reserv_t rv);
 
 /*
- * Describes the current state of the partially-populated reservation queue.
+ * Returns the current number of full reservations.
+ *
+ * Since the number of full reservations is computed without acquiring the
+ * free page queue lock, the returned value may be inexact.
  */
 static int
+sysctl_vm_reserv_fullpop(SYSCTL_HANDLER_ARGS)
+{
+	vm_paddr_t paddr;
+	struct vm_phys_seg *seg;
+	vm_reserv_t rv;
+	int fullpop, segind;
+
+	fullpop = 0;
+	for (segind = 0; segind < vm_phys_nsegs; segind++) {
+		seg = &vm_phys_segs[segind];
+		paddr = roundup2(seg->start, VM_LEVEL_0_SIZE);
+		while (paddr + VM_LEVEL_0_SIZE > paddr && paddr +
+		    VM_LEVEL_0_SIZE <= seg->end) {
+			rv = &vm_reserv_array[paddr >> VM_LEVEL_0_SHIFT];
+			fullpop += rv->popcnt == VM_LEVEL_0_NPAGES;
+			paddr += VM_LEVEL_0_SIZE;
+		}
+	}
+	return (sysctl_handle_int(oidp, &fullpop, 0, req));
+}
+
+/*
+ * Describes the current state of the partially populated reservation queue.
+ */
+static int
 sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS)
 {
 	struct sbuf sbuf;
@@ -213,18 +304,21 @@
 /*
  * Reduces the given reservation's population count.  If the population count
  * becomes zero, the reservation is destroyed.  Additionally, moves the
- * reservation to the tail of the partially-populated reservations queue if the
+ * reservation to the tail of the partially populated reservation queue if the
  * population count is non-zero.
  *
  * The free page queue lock must be held.
  */
 static void
-vm_reserv_depopulate(vm_reserv_t rv)
+vm_reserv_depopulate(vm_reserv_t rv, int index)
 {
 
 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 	KASSERT(rv->object != NULL,
 	    ("vm_reserv_depopulate: reserv %p is free", rv));
+	KASSERT(popmap_is_set(rv->popmap, index),
+	    ("vm_reserv_depopulate: reserv %p's popmap[%d] is clear", rv,
+	    index));
 	KASSERT(rv->popcnt > 0,
 	    ("vm_reserv_depopulate: reserv %p's popcnt is corrupted", rv));
 	if (rv->inpartpopq) {
@@ -236,6 +330,7 @@
 		    rv));
 		rv->pages->psind = 0;
 	}
+	popmap_clear(rv->popmap, index);
 	rv->popcnt--;
 	if (rv->popcnt == 0) {
 		LIST_REMOVE(rv, objq);
@@ -271,17 +366,20 @@
 
 /*
  * Increases the given reservation's population count.  Moves the reservation
- * to the tail of the partially-populated reservation queue.
+ * to the tail of the partially populated reservation queue.
  *
  * The free page queue must be locked.
  */
 static void
-vm_reserv_populate(vm_reserv_t rv)
+vm_reserv_populate(vm_reserv_t rv, int index)
 {
 
 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 	KASSERT(rv->object != NULL,
 	    ("vm_reserv_populate: reserv %p is free", rv));
+	KASSERT(popmap_is_clear(rv->popmap, index),
+	    ("vm_reserv_populate: reserv %p's popmap[%d] is set", rv,
+	    index));
 	KASSERT(rv->popcnt < VM_LEVEL_0_NPAGES,
 	    ("vm_reserv_populate: reserv %p is already full", rv));
 	KASSERT(rv->pages->psind == 0,
@@ -290,6 +388,7 @@
 		TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq);
 		rv->inpartpopq = FALSE;
 	}
+	popmap_set(rv->popmap, index);
 	rv->popcnt++;
 	if (rv->popcnt < VM_LEVEL_0_NPAGES) {
 		rv->inpartpopq = TRUE;
@@ -308,14 +407,18 @@
  * physical address boundary that is a multiple of that value.  Both
  * "alignment" and "boundary" must be a power of two.
  *
+ * The page "mpred" must immediately precede the offset "pindex" within the
+ * specified object.
+ *
  * The object and free page queue must be locked.
  */
 vm_page_t
 vm_reserv_alloc_contig(vm_object_t object, vm_pindex_t pindex, u_long npages,
-    vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
+    vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary,
+    vm_page_t mpred)
 {
 	vm_paddr_t pa, size;
-	vm_page_t m, m_ret, mpred, msucc;
+	vm_page_t m, m_ret, msucc;
 	vm_pindex_t first, leftcap, rightcap;
 	vm_reserv_t rv;
 	u_long allocpages, maxpages, minpages;
@@ -352,10 +455,11 @@
 	/*
 	 * Look for an existing reservation.
 	 */
-	mpred = vm_radix_lookup_le(&object->rtree, pindex);
 	if (mpred != NULL) {
+		KASSERT(mpred->object == object,
+		    ("vm_reserv_alloc_contig: object doesn't contain mpred"));
 		KASSERT(mpred->pindex < pindex,
-		    ("vm_reserv_alloc_contig: pindex already allocated"));
+		    ("vm_reserv_alloc_contig: mpred doesn't precede pindex"));
 		rv = vm_reserv_from_page(mpred);
 		if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
 			goto found;
@@ -364,7 +468,7 @@
 		msucc = TAILQ_FIRST(&object->memq);
 	if (msucc != NULL) {
 		KASSERT(msucc->pindex > pindex,
-		    ("vm_reserv_alloc_contig: pindex already allocated"));
+		    ("vm_reserv_alloc_contig: msucc doesn't succeed pindex"));
 		rv = vm_reserv_from_page(msucc);
 		if (rv->object == object && vm_reserv_has_pindex(rv, pindex))
 			goto found;
@@ -460,9 +564,13 @@
 		KASSERT(!rv->inpartpopq,
 		    ("vm_reserv_alloc_contig: reserv %p's inpartpopq is TRUE",
 		    rv));
+		for (i = 0; i < NPOPMAP; i++)
+			KASSERT(rv->popmap[i] == 0,
+		    ("vm_reserv_alloc_contig: reserv %p's popmap is corrupted",
+			    rv));
 		n = ulmin(VM_LEVEL_0_NPAGES - index, npages);
 		for (i = 0; i < n; i++)
-			vm_reserv_populate(rv);
+			vm_reserv_populate(rv, index + i);
 		npages -= n;
 		if (m_ret == NULL) {
 			m_ret = &rv->pages[index];
@@ -489,15 +597,15 @@
 		return (NULL);
 	/* Handle vm_page_rename(m, new_object, ...). */
 	for (i = 0; i < npages; i++)
-		if ((rv->pages[index + i].flags & (PG_CACHED | PG_FREE)) == 0)
+		if (popmap_is_set(rv->popmap, index + i))
 			return (NULL);
 	for (i = 0; i < npages; i++)
-		vm_reserv_populate(rv);
+		vm_reserv_populate(rv, index + i);
 	return (m);
 }
 
 /*
- * Allocates a page from an existing or newly-created reservation.
+ * Allocates a page from an existing or newly created reservation.
  *
  * The page "mpred" must immediately precede the offset "pindex" within the
  * specified object.
@@ -510,6 +618,7 @@
 	vm_page_t m, msucc;
 	vm_pindex_t first, leftcap, rightcap;
 	vm_reserv_t rv;
+	int i, index;
 
 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 	VM_OBJECT_ASSERT_WLOCKED(object);
@@ -598,22 +707,93 @@
 	    ("vm_reserv_alloc_page: reserv %p's popcnt is corrupted", rv));
 	KASSERT(!rv->inpartpopq,
 	    ("vm_reserv_alloc_page: reserv %p's inpartpopq is TRUE", rv));
-	vm_reserv_populate(rv);
-	return (&rv->pages[VM_RESERV_INDEX(object, pindex)]);
+	for (i = 0; i < NPOPMAP; i++)
+		KASSERT(rv->popmap[i] == 0,
+		    ("vm_reserv_alloc_page: reserv %p's popmap is corrupted",
+		    rv));
+	index = VM_RESERV_INDEX(object, pindex);
+	vm_reserv_populate(rv, index);
+	return (&rv->pages[index]);
 
 	/*
 	 * Found a matching reservation.
 	 */
 found:
-	m = &rv->pages[VM_RESERV_INDEX(object, pindex)];
+	index = VM_RESERV_INDEX(object, pindex);
+	m = &rv->pages[index];
 	/* Handle vm_page_rename(m, new_object, ...). */
-	if ((m->flags & (PG_CACHED | PG_FREE)) == 0)
+	if (popmap_is_set(rv->popmap, index))
 		return (NULL);
-	vm_reserv_populate(rv);
+	vm_reserv_populate(rv, index);
 	return (m);
 }
 
 /*
+ * Breaks the given reservation.  All free pages in the reservation
+ * are returned to the physical memory allocator.  The reservation's
+ * population count and map are reset to their initial state.
+ *
+ * The given reservation must not be in the partially populated reservation
+ * queue.  The free page queue lock must be held.
+ */
+static void
+vm_reserv_break(vm_reserv_t rv)
+{
+	int begin_zeroes, hi, i, lo;
+
+	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+	KASSERT(rv->object != NULL,
+	    ("vm_reserv_break: reserv %p is free", rv));
+	KASSERT(!rv->inpartpopq,
+	    ("vm_reserv_break: reserv %p's inpartpopq is TRUE", rv));
+	LIST_REMOVE(rv, objq);
+	rv->object = NULL;
+	rv->pages->psind = 0;
+	i = hi = 0;
+	do {
+		/* Find the next 0 bit.  Any previous 0 bits are < "hi". */
+		lo = ffsl(~(((1UL << hi) - 1) | rv->popmap[i]));
+		if (lo == 0) {
+			/* Redundantly clears bits < "hi". */
+			rv->popmap[i] = 0;
+			rv->popcnt -= NBPOPMAP - hi;
+			while (++i < NPOPMAP) {
+				lo = ffsl(~rv->popmap[i]);
+				if (lo == 0) {
+					rv->popmap[i] = 0;
+					rv->popcnt -= NBPOPMAP;
+				} else
+					break;
+			}
+			if (i == NPOPMAP)
+				break;
+			hi = 0;
+		}
+		KASSERT(lo > 0, ("vm_reserv_break: lo is %d", lo));
+		/* Convert from ffsl() to ordinary bit numbering. */
+		lo--;
+		if (lo > 0) {
+			/* Redundantly clears bits < "hi". */
+			rv->popmap[i] &= ~((1UL << lo) - 1);
+			rv->popcnt -= lo - hi;
+		}
+		begin_zeroes = NBPOPMAP * i + lo;
+		/* Find the next 1 bit. */
+		do
+			hi = ffsl(rv->popmap[i]);
+		while (hi == 0 && ++i < NPOPMAP);
+		if (i != NPOPMAP)
+			/* Convert from ffsl() to ordinary bit numbering. */
+			hi--;
+		vm_phys_free_contig(&rv->pages[begin_zeroes], NBPOPMAP * i +
+		    hi - begin_zeroes);
+	} while (i < NPOPMAP);
+	KASSERT(rv->popcnt == 0,
+	    ("vm_reserv_break: reserv %p's popcnt is corrupted", rv));
+	vm_reserv_broken++;
+}
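
[vm_reserv_break() above frees the reservation's unused pages in maximal runs: it alternates
between finding the next clear bit ("lo") and the next set bit ("hi") in the popmap with
ffsl(), then hands each run of clear bits to vm_phys_free_contig() as one contiguous chunk.
A simplified, self-contained sketch of the same run-finding idea; the real function also
clears the bits and maintains popcnt as it goes, which is omitted here:

    #include <stdio.h>
    #include <strings.h>        /* ffsl() */

    #define NBPOPMAP        ((int)(sizeof(unsigned long) * 8))

    /*
     * Return the index of the first bit >= "start" whose value is "want"
     * (0 or 1), or "nbits" if there is none.  Scans a word at a time with
     * ffsl(), in the spirit of vm_reserv_break().
     */
    static int
    popmap_find(const unsigned long *map, int nbits, int start, int want)
    {
            unsigned long w;
            int bit, i;

            if (start >= nbits)
                    return (nbits);
            i = start / NBPOPMAP;
            /* Mask off bits below "start" in the first word examined. */
            w = (want ? map[i] : ~map[i]) & (~0UL << (start % NBPOPMAP));
            for (;;) {
                    bit = ffsl((long)w);
                    if (bit != 0) {
                            bit = i * NBPOPMAP + bit - 1;
                            return (bit < nbits ? bit : nbits);
                    }
                    if (++i * NBPOPMAP >= nbits)
                            return (nbits);
                    w = want ? map[i] : ~map[i];
            }
    }

    static void
    print_free_runs(const unsigned long *map, int nbits)
    {
            int hi, lo;

            lo = popmap_find(map, nbits, 0, 0);
            while (lo < nbits) {
                    hi = popmap_find(map, nbits, lo, 1);
                    printf("free run: pages [%d, %d)\n", lo, hi);
                    if (hi >= nbits)
                            break;
                    lo = popmap_find(map, nbits, hi, 0);
            }
    }

    int
    main(void)
    {
            /* 128-page example: pages 0-2 and 70 are in use. */
            unsigned long map[2] = { 0x7UL, 1UL << 6 };

            print_free_runs(map, 128);      /* prints [3, 70) and [71, 128) */
            return (0);
    }
]
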
+
+/*
  * Breaks all reservations belonging to the given object.
  */
 void
@@ -620,7 +800,6 @@
 vm_reserv_break_all(vm_object_t object)
 {
 	vm_reserv_t rv;
-	int i;
 
 	mtx_lock(&vm_page_queue_free_mtx);
 	while ((rv = LIST_FIRST(&object->rvq)) != NULL) {
@@ -630,18 +809,7 @@
 			TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq);
 			rv->inpartpopq = FALSE;
 		}
-		LIST_REMOVE(rv, objq);
-		rv->object = NULL;
-		for (i = 0; i < VM_LEVEL_0_NPAGES; i++) {
-			if ((rv->pages[i].flags & (PG_CACHED | PG_FREE)) != 0)
-				vm_phys_free_pages(&rv->pages[i], 0);
-			else
-				rv->popcnt--;
-		}
-		KASSERT(rv->popcnt == 0,
-		    ("vm_reserv_break_all: reserv %p's popcnt is corrupted",
-		    rv));
-		vm_reserv_broken++;
+		vm_reserv_break(rv);
 	}
 	mtx_unlock(&vm_page_queue_free_mtx);
 }
@@ -661,10 +829,7 @@
 	rv = vm_reserv_from_page(m);
 	if (rv->object == NULL)
 		return (FALSE);
-	if ((m->flags & PG_CACHED) != 0 && m->pool != VM_FREEPOOL_CACHE)
-		vm_phys_set_pool(VM_FREEPOOL_CACHE, rv->pages,
-		    VM_LEVEL_0_ORDER);
-	vm_reserv_depopulate(rv);
+	vm_reserv_depopulate(rv, m - rv->pages);
 	return (TRUE);
 }
 
@@ -678,15 +843,18 @@
 vm_reserv_init(void)
 {
 	vm_paddr_t paddr;
-	int i;
+	struct vm_phys_seg *seg;
+	int segind;
 
 	/*
 	 * Initialize the reservation array.  Specifically, initialize the
 	 * "pages" field for every element that has an underlying superpage.
 	 */
-	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
-		paddr = roundup2(phys_avail[i], VM_LEVEL_0_SIZE);
-		while (paddr + VM_LEVEL_0_SIZE <= phys_avail[i + 1]) {
+	for (segind = 0; segind < vm_phys_nsegs; segind++) {
+		seg = &vm_phys_segs[segind];
+		paddr = roundup2(seg->start, VM_LEVEL_0_SIZE);
+		while (paddr + VM_LEVEL_0_SIZE > paddr && paddr +
+		    VM_LEVEL_0_SIZE <= seg->end) {
 			vm_reserv_array[paddr >> VM_LEVEL_0_SHIFT].pages =
 			    PHYS_TO_VM_PAGE(paddr);
 			paddr += VM_LEVEL_0_SIZE;
@@ -695,77 +863,50 @@
 }
 
 /*
- * Returns a reservation level if the given page belongs to a fully-populated
- * reservation and -1 otherwise.
+ * Returns true if the given page belongs to a reservation and that page is
+ * free.  Otherwise, returns false.
  */
+bool
+vm_reserv_is_page_free(vm_page_t m)
+{
+	vm_reserv_t rv;
+
+	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+	rv = vm_reserv_from_page(m);
+	if (rv->object == NULL)
+		return (false);
+	return (popmap_is_clear(rv->popmap, m - rv->pages));
+}
+
+/*
+ * If the given page belongs to a reservation, returns the level of that
+ * reservation.  Otherwise, returns -1.
+ */
 int
-vm_reserv_level_iffullpop(vm_page_t m)
+vm_reserv_level(vm_page_t m)
 {
 	vm_reserv_t rv;
 
 	rv = vm_reserv_from_page(m);
-	return (rv->popcnt == VM_LEVEL_0_NPAGES ? 0 : -1);
+	return (rv->object != NULL ? 0 : -1);
 }
 
 /*
- * Prepare for the reactivation of a cached page.
- *
- * First, suppose that the given page "m" was allocated individually, i.e., not
- * as part of a reservation, and cached.  Then, suppose a reservation
- * containing "m" is allocated by the same object.  Although "m" and the
- * reservation belong to the same object, "m"'s pindex may not match the
- * reservation's.
- *
- * The free page queue must be locked.
+ * Returns a reservation level if the given page belongs to a fully populated
+ * reservation and -1 otherwise.
  */
-boolean_t
-vm_reserv_reactivate_page(vm_page_t m)
+int
+vm_reserv_level_iffullpop(vm_page_t m)
 {
 	vm_reserv_t rv;
-	int i, m_index;
 
-	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 	rv = vm_reserv_from_page(m);
-	if (rv->object == NULL)
-		return (FALSE);
-	KASSERT((m->flags & PG_CACHED) != 0,
-	    ("vm_reserv_uncache_page: page %p is not cached", m));
-	if (m->object == rv->object &&
-	    m->pindex - rv->pindex == VM_RESERV_INDEX(m->object, m->pindex))
-		vm_reserv_populate(rv);
-	else {
-		KASSERT(rv->inpartpopq,
-		    ("vm_reserv_uncache_page: reserv %p's inpartpopq is FALSE",
-		    rv));
-		TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq);
-		rv->inpartpopq = FALSE;
-		LIST_REMOVE(rv, objq);
-		rv->object = NULL;
-		/* Don't vm_phys_free_pages(m, 0). */
-		m_index = m - rv->pages;
-		for (i = 0; i < m_index; i++) {
-			if ((rv->pages[i].flags & (PG_CACHED | PG_FREE)) != 0)
-				vm_phys_free_pages(&rv->pages[i], 0);
-			else
-				rv->popcnt--;
-		}
-		for (i++; i < VM_LEVEL_0_NPAGES; i++) {
-			if ((rv->pages[i].flags & (PG_CACHED | PG_FREE)) != 0)
-				vm_phys_free_pages(&rv->pages[i], 0);
-			else
-				rv->popcnt--;
-		}
-		KASSERT(rv->popcnt == 0,
-		    ("vm_reserv_uncache_page: reserv %p's popcnt is corrupted",
-		    rv));
-		vm_reserv_broken++;
-	}
-	return (TRUE);
+	return (rv->popcnt == VM_LEVEL_0_NPAGES ? 0 : -1);
 }
 
 /*
- * Breaks the given partially-populated reservation, releasing its cached and
- * free pages to the physical memory allocator.
+ * Breaks the given partially populated reservation, releasing its free pages
+ * to the physical memory allocator.
  *
  * The free page queue lock must be held.
  */
@@ -772,32 +913,20 @@
 static void
 vm_reserv_reclaim(vm_reserv_t rv)
 {
-	int i;
 
 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 	KASSERT(rv->inpartpopq,
-	    ("vm_reserv_reclaim: reserv %p's inpartpopq is corrupted", rv));
+	    ("vm_reserv_reclaim: reserv %p's inpartpopq is FALSE", rv));
 	TAILQ_REMOVE(&vm_rvq_partpop, rv, partpopq);
 	rv->inpartpopq = FALSE;
-	KASSERT(rv->object != NULL,
-	    ("vm_reserv_reclaim: reserv %p is free", rv));
-	LIST_REMOVE(rv, objq);
-	rv->object = NULL;
-	for (i = 0; i < VM_LEVEL_0_NPAGES; i++) {
-		if ((rv->pages[i].flags & (PG_CACHED | PG_FREE)) != 0)
-			vm_phys_free_pages(&rv->pages[i], 0);
-		else
-			rv->popcnt--;
-	}
-	KASSERT(rv->popcnt == 0,
-	    ("vm_reserv_reclaim: reserv %p's popcnt is corrupted", rv));
+	vm_reserv_break(rv);
 	vm_reserv_reclaimed++;
 }
 
 /*
- * Breaks the reservation at the head of the partially-populated reservation
- * queue, releasing its cached and free pages to the physical memory
- * allocator.  Returns TRUE if a reservation is broken and FALSE otherwise.
+ * Breaks the reservation at the head of the partially populated reservation
+ * queue, releasing its free pages to the physical memory allocator.  Returns
+ * TRUE if a reservation is broken and FALSE otherwise.
  *
  * The free page queue lock must be held.
  */
@@ -815,11 +944,10 @@
 }
 
 /*
- * Searches the partially-populated reservation queue for the least recently
- * active reservation with unused pages, i.e., cached or free, that satisfy the
- * given request for contiguous physical memory.  If a satisfactory reservation
- * is found, it is broken.  Returns TRUE if a reservation is broken and FALSE
- * otherwise.
+ * Searches the partially populated reservation queue for the least recently
+ * changed reservation with free pages that satisfy the given request for
+ * contiguous physical memory.  If a satisfactory reservation is found, it is
+ * broken.  Returns TRUE if a reservation is broken and FALSE otherwise.
  *
  * The free page queue lock must be held.
  */
@@ -827,9 +955,9 @@
 vm_reserv_reclaim_contig(u_long npages, vm_paddr_t low, vm_paddr_t high,
     u_long alignment, vm_paddr_t boundary)
 {
-	vm_paddr_t pa, pa_length, size;
+	vm_paddr_t pa, size;
 	vm_reserv_t rv;
-	int i;
+	int hi, i, lo, low_index, next_free;
 
 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 	if (npages > VM_LEVEL_0_NPAGES - 1)
@@ -838,30 +966,72 @@
 	TAILQ_FOREACH(rv, &vm_rvq_partpop, partpopq) {
 		pa = VM_PAGE_TO_PHYS(&rv->pages[VM_LEVEL_0_NPAGES - 1]);
 		if (pa + PAGE_SIZE - size < low) {
-			/* this entire reservation is too low; go to next */
+			/* This entire reservation is too low; go to next. */
 			continue;
 		}
-		pa_length = 0;
-		for (i = 0; i < VM_LEVEL_0_NPAGES; i++)
-			if ((rv->pages[i].flags & (PG_CACHED | PG_FREE)) != 0) {
-				pa_length += PAGE_SIZE;
-				if (pa_length == PAGE_SIZE) {
-					pa = VM_PAGE_TO_PHYS(&rv->pages[i]);
-					if (pa + size > high) {
-						/* skip to next reservation */
-						break;
-					} else if (pa < low ||
-					    (pa & (alignment - 1)) != 0 ||
-					    ((pa ^ (pa + size - 1)) &
-					    ~(boundary - 1)) != 0)
-						pa_length = 0;
+		pa = VM_PAGE_TO_PHYS(&rv->pages[0]);
+		if (pa + size > high) {
+			/* This entire reservation is too high; go to next. */
+			continue;
+		}
+		if (pa < low) {
+			/* Start the search for free pages at "low". */
+			low_index = (low + PAGE_MASK - pa) >> PAGE_SHIFT;
+			i = low_index / NBPOPMAP;
+			hi = low_index % NBPOPMAP;
+		} else
+			i = hi = 0;
+		do {
+			/* Find the next free page. */
+			lo = ffsl(~(((1UL << hi) - 1) | rv->popmap[i]));
+			while (lo == 0 && ++i < NPOPMAP)
+				lo = ffsl(~rv->popmap[i]);
+			if (i == NPOPMAP)
+				break;
+			/* Convert from ffsl() to ordinary bit numbering. */
+			lo--;
+			next_free = NBPOPMAP * i + lo;
+			pa = VM_PAGE_TO_PHYS(&rv->pages[next_free]);
+			KASSERT(pa >= low,
+			    ("vm_reserv_reclaim_contig: pa is too low"));
+			if (pa + size > high) {
+				/* The rest of this reservation is too high. */
+				break;
+			} else if ((pa & (alignment - 1)) != 0 ||
+			    ((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0) {
+				/*
+				 * The current page doesn't meet the alignment
+				 * and/or boundary requirements.  Continue
+				 * searching this reservation until the rest
+				 * of its free pages are either excluded or
+				 * exhausted.
+				 */
+				hi = lo + 1;
+				if (hi >= NBPOPMAP) {
+					hi = 0;
+					i++;
 				}
-				if (pa_length >= size) {
+				continue;
+			}
+			/* Find the next used page. */
+			hi = ffsl(rv->popmap[i] & ~((1UL << lo) - 1));
+			while (hi == 0 && ++i < NPOPMAP) {
+				if ((NBPOPMAP * i - next_free) * PAGE_SIZE >=
+				    size) {
 					vm_reserv_reclaim(rv);
 					return (TRUE);
 				}
-			} else
-				pa_length = 0;
+				hi = ffsl(rv->popmap[i]);
+			}
+			/* Convert from ffsl() to ordinary bit numbering. */
+			if (i != NPOPMAP)
+				hi--;
+			if ((NBPOPMAP * i + hi - next_free) * PAGE_SIZE >=
+			    size) {
+				vm_reserv_reclaim(rv);
+				return (TRUE);
+			}
+		} while (i < NPOPMAP);
 	}
 	return (FALSE);
 }
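
[The rejection test used in the loop above, (pa & (alignment - 1)) != 0 ||
((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0, relies on alignment and boundary being
powers of two: the first clause checks the start address's alignment, and the XOR clause is
non-zero exactly when the first and last byte of the candidate run differ in a bit at or
above the boundary, i.e. the run would cross a boundary line.  A small worked example with
made-up values:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* The same predicate applied to a candidate start address above. */
    static bool
    range_ok(uint64_t pa, uint64_t size, uint64_t alignment, uint64_t boundary)
    {
            return ((pa & (alignment - 1)) == 0 &&
                ((pa ^ (pa + size - 1)) & ~(boundary - 1)) == 0);
    }

    int
    main(void)
    {
            /* A 16 KB run with 4 KB alignment and a 64 KB boundary. */
            printf("%d\n", range_ok(0x10000, 0x4000, 0x1000, 0x10000)); /* 1: fits */
            printf("%d\n", range_ok(0x1e000, 0x4000, 0x1000, 0x10000)); /* 0: crosses 0x20000 */
            printf("%d\n", range_ok(0x10800, 0x4000, 0x1000, 0x10000)); /* 0: misaligned start */
            return (0);
    }
]
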
@@ -892,6 +1062,23 @@
 }
 
 /*
+ * Returns the size (in bytes) of a reservation of the specified level.
+ */
+int
+vm_reserv_size(int level)
+{
+
+	switch (level) {
+	case 0:
+		return (VM_LEVEL_0_SIZE);
+	case -1:
+		return (PAGE_SIZE);
+	default:
+		return (0);
+	}
+}
+
+/*
  * Allocates the virtual and physical memory required by the reservation
  * management system's data structures, in particular, the reservation array.
  */
@@ -925,4 +1112,18 @@
 	return (new_end);
 }
 
+/*
+ * Returns the superpage containing the given page.
+ */
+vm_page_t
+vm_reserv_to_superpage(vm_page_t m)
+{
+	vm_reserv_t rv;
+
+	VM_OBJECT_ASSERT_LOCKED(m->object);
+	rv = vm_reserv_from_page(m);
+	return (rv->object == m->object && rv->popcnt == VM_LEVEL_0_NPAGES ?
+	    rv->pages : NULL);
+}
+
 #endif	/* VM_NRESERVLEVEL > 0 */

Modified: trunk/sys/vm/vm_reserv.h
===================================================================
--- trunk/sys/vm/vm_reserv.h	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_reserv.h	2020-02-08 19:35:48 UTC (rev 12314)
@@ -29,7 +29,7 @@
  * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  *
- * $FreeBSD: stable/10/sys/vm/vm_reserv.h 250577 2013-05-12 16:50:18Z alc $
+ * $FreeBSD: stable/11/sys/vm/vm_reserv.h 324399 2017-10-07 20:22:04Z alc $
  */
 
 /*
@@ -48,21 +48,24 @@
  */
 vm_page_t	vm_reserv_alloc_contig(vm_object_t object, vm_pindex_t pindex,
 		    u_long npages, vm_paddr_t low, vm_paddr_t high,
-		    u_long alignment, vm_paddr_t boundary);
+		    u_long alignment, vm_paddr_t boundary, vm_page_t mpred);
 vm_page_t	vm_reserv_alloc_page(vm_object_t object, vm_pindex_t pindex,
 		    vm_page_t mpred);
 void		vm_reserv_break_all(vm_object_t object);
 boolean_t	vm_reserv_free_page(vm_page_t m);
 void		vm_reserv_init(void);
+bool		vm_reserv_is_page_free(vm_page_t m);
+int		vm_reserv_level(vm_page_t m);
 int		vm_reserv_level_iffullpop(vm_page_t m);
-boolean_t	vm_reserv_reactivate_page(vm_page_t m);
 boolean_t	vm_reserv_reclaim_contig(u_long npages, vm_paddr_t low,
 		    vm_paddr_t high, u_long alignment, vm_paddr_t boundary);
 boolean_t	vm_reserv_reclaim_inactive(void);
 void		vm_reserv_rename(vm_page_t m, vm_object_t new_object,
 		    vm_object_t old_object, vm_pindex_t old_object_offset);
+int		vm_reserv_size(int level);
 vm_paddr_t	vm_reserv_startup(vm_offset_t *vaddr, vm_paddr_t end,
 		    vm_paddr_t high_water);
+vm_page_t	vm_reserv_to_superpage(vm_page_t m);
 
 #endif	/* VM_NRESERVLEVEL > 0 */
 #endif	/* _KERNEL */

Added: trunk/sys/vm/vm_swapout.c
===================================================================
--- trunk/sys/vm/vm_swapout.c	                        (rev 0)
+++ trunk/sys/vm/vm_swapout.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -0,0 +1,955 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 1991 Regents of the University of California.
+ * All rights reserved.
+ * Copyright (c) 1994 John S. Dyson
+ * All rights reserved.
+ * Copyright (c) 1994 David Greenman
+ * All rights reserved.
+ * Copyright (c) 2005 Yahoo! Technologies Norway AS
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * The Mach Operating System project at Carnegie-Mellon University.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	from: @(#)vm_pageout.c	7.4 (Berkeley) 5/7/91
+ *
+ *
+ * Copyright (c) 1987, 1990 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Authors: Avadis Tevanian, Jr., Michael Wayne Young
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ *  Software Distribution Coordinator  or  Software.Distribution at CS.CMU.EDU
+ *  School of Computer Science
+ *  Carnegie Mellon University
+ *  Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_swapout.c 338335 2018-08-27 09:39:34Z kib $");
+
+#include "opt_kstack_pages.h"
+#include "opt_kstack_max_pages.h"
+#include "opt_vm.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/limits.h>
+#include <sys/kernel.h>
+#include <sys/eventhandler.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/_kstack_cache.h>
+#include <sys/kthread.h>
+#include <sys/ktr.h>
+#include <sys/mount.h>
+#include <sys/racct.h>
+#include <sys/resourcevar.h>
+#include <sys/sched.h>
+#include <sys/sdt.h>
+#include <sys/signalvar.h>
+#include <sys/smp.h>
+#include <sys/time.h>
+#include <sys/vnode.h>
+#include <sys/vmmeter.h>
+#include <sys/rwlock.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_map.h>
+#include <vm/vm_pageout.h>
+#include <vm/vm_pager.h>
+#include <vm/vm_phys.h>
+#include <vm/swap_pager.h>
+#include <vm/vm_extern.h>
+#include <vm/uma.h>
+
+/* the kernel process "vm_daemon" */
+static void vm_daemon(void);
+static struct proc *vmproc;
+
+static struct kproc_desc vm_kp = {
+	"vmdaemon",
+	vm_daemon,
+	&vmproc
+};
+SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp);
+
+static int vm_swap_enabled = 1;
+static int vm_swap_idle_enabled = 0;
+
+SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, CTLFLAG_RW,
+    &vm_swap_enabled, 0,
+    "Enable entire process swapout");
+SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, CTLFLAG_RW,
+    &vm_swap_idle_enabled, 0,
+    "Allow swapout on idle criteria");
+
+/*
+ * Swap_idle_threshold1 is the guaranteed swapped-in time for a process.
+ */
+static int swap_idle_threshold1 = 2;
+SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold1, CTLFLAG_RW,
+    &swap_idle_threshold1, 0,
+    "Guaranteed swapped in time for a process");
+
+/*
+ * Swap_idle_threshold2 is the time that a process can be idle before
+ * it will be swapped out, if idle swapping is enabled.
+ */
+static int swap_idle_threshold2 = 10;
+SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold2, CTLFLAG_RW,
+    &swap_idle_threshold2, 0,
+    "Time before a process will be swapped out");
+
+static int vm_pageout_req_swapout;	/* XXX */
+static int vm_daemon_needed;
+static struct mtx vm_daemon_mtx;
+/* Allow for use by vm_pageout before vm_daemon is initialized. */
+MTX_SYSINIT(vm_daemon, &vm_daemon_mtx, "vm daemon", MTX_DEF);
+
+static int swapped_cnt;
+static int swap_inprogress;	/* Pending swap-ins done outside swapper. */
+static int last_swapin;
+
+static void swapclear(struct proc *);
+static int swapout(struct proc *);
+static void vm_swapout_map_deactivate_pages(vm_map_t, long);
+static void vm_swapout_object_deactivate_pages(pmap_t, vm_object_t, long);
+static void swapout_procs(int action);
+static void vm_req_vmdaemon(int req);
+static void vm_thread_swapout(struct thread *td);
+
+/*
+ *	vm_swapout_object_deactivate_pages
+ *
+ *	Deactivate enough pages to satisfy the inactive target
+ *	requirements.
+ *
+ *	The object and map must be locked.
+ */
+static void
+vm_swapout_object_deactivate_pages(pmap_t pmap, vm_object_t first_object,
+    long desired)
+{
+	vm_object_t backing_object, object;
+	vm_page_t p;
+	int act_delta, remove_mode;
+
+	VM_OBJECT_ASSERT_LOCKED(first_object);
+	if ((first_object->flags & OBJ_FICTITIOUS) != 0)
+		return;
+	for (object = first_object;; object = backing_object) {
+		if (pmap_resident_count(pmap) <= desired)
+			goto unlock_return;
+		VM_OBJECT_ASSERT_LOCKED(object);
+		if ((object->flags & OBJ_UNMANAGED) != 0 ||
+		    object->paging_in_progress != 0)
+			goto unlock_return;
+
+		remove_mode = 0;
+		if (object->shadow_count > 1)
+			remove_mode = 1;
+		/*
+		 * Scan the object's entire memory queue.
+		 */
+		TAILQ_FOREACH(p, &object->memq, listq) {
+			if (pmap_resident_count(pmap) <= desired)
+				goto unlock_return;
+			if (should_yield())
+				goto unlock_return;
+			if (vm_page_busied(p))
+				continue;
+			PCPU_INC(cnt.v_pdpages);
+			vm_page_lock(p);
+			if (p->wire_count != 0 || p->hold_count != 0 ||
+			    !pmap_page_exists_quick(pmap, p)) {
+				vm_page_unlock(p);
+				continue;
+			}
+			act_delta = pmap_ts_referenced(p);
+			if ((p->aflags & PGA_REFERENCED) != 0) {
+				if (act_delta == 0)
+					act_delta = 1;
+				vm_page_aflag_clear(p, PGA_REFERENCED);
+			}
+			if (!vm_page_active(p) && act_delta != 0) {
+				vm_page_activate(p);
+				p->act_count += act_delta;
+			} else if (vm_page_active(p)) {
+				if (act_delta == 0) {
+					p->act_count -= min(p->act_count,
+					    ACT_DECLINE);
+					if (!remove_mode && p->act_count == 0) {
+						pmap_remove_all(p);
+						vm_page_deactivate(p);
+					} else
+						vm_page_requeue(p);
+				} else {
+					vm_page_activate(p);
+					if (p->act_count < ACT_MAX -
+					    ACT_ADVANCE)
+						p->act_count += ACT_ADVANCE;
+					vm_page_requeue(p);
+				}
+			} else if (vm_page_inactive(p))
+				pmap_remove_all(p);
+			vm_page_unlock(p);
+		}
+		if ((backing_object = object->backing_object) == NULL)
+			goto unlock_return;
+		VM_OBJECT_RLOCK(backing_object);
+		if (object != first_object)
+			VM_OBJECT_RUNLOCK(object);
+	}
+unlock_return:
+	if (object != first_object)
+		VM_OBJECT_RUNLOCK(object);
+}
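
[The active-page branch above implements a simple aging scheme: an unreferenced active page
loses ACT_DECLINE from its act_count on each pass and is deactivated when the count reaches
zero, while a referenced page gains ACT_ADVANCE up to ACT_MAX.  A stripped-down model of just
that arithmetic; the constants are assumed to match the kernel's usual definitions, and the
remove_mode and PGA_REFERENCED handling is left out:

    #include <stdio.h>

    /* Assumed values mirroring the kernel's usual definitions. */
    #define ACT_DECLINE     1
    #define ACT_ADVANCE     3
    #define ACT_MAX         64

    /*
     * Stripped-down model of the aging applied to an active page: returns
     * the new act_count and sets *deactivate when the page would be moved
     * to the inactive queue.
     */
    static int
    age_active_page(int act_count, int act_delta, int *deactivate)
    {
            *deactivate = 0;
            if (act_delta == 0) {
                    act_count -= act_count < ACT_DECLINE ? act_count : ACT_DECLINE;
                    if (act_count == 0)
                            *deactivate = 1;
            } else if (act_count < ACT_MAX - ACT_ADVANCE)
                    act_count += ACT_ADVANCE;
            return (act_count);
    }

    int
    main(void)
    {
            int count = 5, deact, round;

            /* Five unreferenced scan rounds drain an initial count of 5. */
            for (round = 1; round <= 5; round++) {
                    count = age_active_page(count, 0, &deact);
                    printf("round %d: act_count %d%s\n", round, count,
                        deact ? " -> deactivate" : "");
            }
            return (0);
    }
]
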
+
+/*
+ * Deactivate some number of pages in a map; try to do it fairly, although
+ * that is hard to do.
+ */
+static void
+vm_swapout_map_deactivate_pages(vm_map_t map, long desired)
+{
+	vm_map_entry_t tmpe;
+	vm_object_t obj, bigobj;
+	int nothingwired;
+
+	if (!vm_map_trylock_read(map))
+		return;
+
+	bigobj = NULL;
+	nothingwired = TRUE;
+
+	/*
+	 * first, search out the biggest object, and try to free pages from
+	 * that.
+	 */
+	tmpe = map->header.next;
+	while (tmpe != &map->header) {
+		if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
+			obj = tmpe->object.vm_object;
+			if (obj != NULL && VM_OBJECT_TRYRLOCK(obj)) {
+				if (obj->shadow_count <= 1 &&
+				    (bigobj == NULL ||
+				     bigobj->resident_page_count <
+				     obj->resident_page_count)) {
+					if (bigobj != NULL)
+						VM_OBJECT_RUNLOCK(bigobj);
+					bigobj = obj;
+				} else
+					VM_OBJECT_RUNLOCK(obj);
+			}
+		}
+		if (tmpe->wired_count > 0)
+			nothingwired = FALSE;
+		tmpe = tmpe->next;
+	}
+
+	if (bigobj != NULL) {
+		vm_swapout_object_deactivate_pages(map->pmap, bigobj, desired);
+		VM_OBJECT_RUNLOCK(bigobj);
+	}
+	/*
+	 * Next, hunt around for other pages to deactivate.  We actually
+	 * do this search sort of wrong -- .text first is not the best idea.
+	 */
+	tmpe = map->header.next;
+	while (tmpe != &map->header) {
+		if (pmap_resident_count(vm_map_pmap(map)) <= desired)
+			break;
+		if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
+			obj = tmpe->object.vm_object;
+			if (obj != NULL) {
+				VM_OBJECT_RLOCK(obj);
+				vm_swapout_object_deactivate_pages(map->pmap,
+				    obj, desired);
+				VM_OBJECT_RUNLOCK(obj);
+			}
+		}
+		tmpe = tmpe->next;
+	}
+
+	/*
+	 * Remove all mappings if a process is swapped out; this will free
+	 * page table pages.
+	 */
+	if (desired == 0 && nothingwired) {
+		pmap_remove(vm_map_pmap(map), vm_map_min(map),
+		    vm_map_max(map));
+	}
+
+	vm_map_unlock_read(map);
+}
+
+/*
+ * Swap out requests
+ */
+#define VM_SWAP_NORMAL 1
+#define VM_SWAP_IDLE 2
+
+void
+vm_swapout_run(void)
+{
+
+	if (vm_swap_enabled)
+		vm_req_vmdaemon(VM_SWAP_NORMAL);
+}
+
+/*
+ * Idle process swapout -- run once per second when pagedaemons are
+ * reclaiming pages.
+ */
+void
+vm_swapout_run_idle(void)
+{
+	static long lsec;
+
+	if (!vm_swap_idle_enabled || time_second == lsec)
+		return;
+	vm_req_vmdaemon(VM_SWAP_IDLE);
+	lsec = time_second;
+}
+
+static void
+vm_req_vmdaemon(int req)
+{
+	static int lastrun = 0;
+
+	mtx_lock(&vm_daemon_mtx);
+	vm_pageout_req_swapout |= req;
+	if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
+		wakeup(&vm_daemon_needed);
+		lastrun = ticks;
+	}
+	mtx_unlock(&vm_daemon_mtx);
+}
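
[The ticks test in vm_req_vmdaemon() is a one-per-second rate limiter that also copes with
the tick counter wrapping: the first comparison enforces the hz spacing and the second fires
after a wraparound so the daemon is not silenced indefinitely.  A minimal sketch of the same
pattern:

    #include <stdbool.h>
    #include <stdio.h>

    /* One wakeup per "hz" ticks; the "<" arm recovers after a wraparound. */
    static bool
    should_wake(int now, int *lastrun, int hz)
    {
            if (now > *lastrun + hz || now < *lastrun) {
                    *lastrun = now;
                    return (true);
            }
            return (false);
    }

    int
    main(void)
    {
            int hz = 1000, lastrun = 0;

            printf("%d\n", should_wake(500, &lastrun, hz));   /* 0: too soon */
            printf("%d\n", should_wake(1500, &lastrun, hz));  /* 1: a second has passed */
            printf("%d\n", should_wake(1600, &lastrun, hz));  /* 0: rate limited again */
            return (0);
    }
]
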
+
+static void
+vm_daemon(void)
+{
+	struct rlimit rsslim;
+	struct proc *p;
+	struct thread *td;
+	struct vmspace *vm;
+	int breakout, swapout_flags, tryagain, attempts;
+#ifdef RACCT
+	uint64_t rsize, ravailable;
+#endif
+
+	while (TRUE) {
+		mtx_lock(&vm_daemon_mtx);
+		msleep(&vm_daemon_needed, &vm_daemon_mtx, PPAUSE, "psleep",
+#ifdef RACCT
+		    racct_enable ? hz : 0
+#else
+		    0
+#endif
+		);
+		swapout_flags = vm_pageout_req_swapout;
+		vm_pageout_req_swapout = 0;
+		mtx_unlock(&vm_daemon_mtx);
+		if (swapout_flags)
+			swapout_procs(swapout_flags);
+
+		/*
+		 * Scan the processes for those exceeding their rlimits or
+		 * that are swapped out, and deactivate their pages.
+		 */
+		tryagain = 0;
+		attempts = 0;
+again:
+		attempts++;
+		sx_slock(&allproc_lock);
+		FOREACH_PROC_IN_SYSTEM(p) {
+			vm_pindex_t limit, size;
+
+			/*
+			 * if this is a system process or if we have already
+			 * looked at this process, skip it.
+			 */
+			PROC_LOCK(p);
+			if (p->p_state != PRS_NORMAL ||
+			    p->p_flag & (P_INEXEC | P_SYSTEM | P_WEXIT)) {
+				PROC_UNLOCK(p);
+				continue;
+			}
+			/*
+			 * if the process is in a non-running type state,
+			 * don't touch it.
+			 */
+			breakout = 0;
+			FOREACH_THREAD_IN_PROC(p, td) {
+				thread_lock(td);
+				if (!TD_ON_RUNQ(td) &&
+				    !TD_IS_RUNNING(td) &&
+				    !TD_IS_SLEEPING(td) &&
+				    !TD_IS_SUSPENDED(td)) {
+					thread_unlock(td);
+					breakout = 1;
+					break;
+				}
+				thread_unlock(td);
+			}
+			if (breakout) {
+				PROC_UNLOCK(p);
+				continue;
+			}
+			/*
+			 * get a limit
+			 */
+			lim_rlimit_proc(p, RLIMIT_RSS, &rsslim);
+			limit = OFF_TO_IDX(
+			    qmin(rsslim.rlim_cur, rsslim.rlim_max));
+
+			/*
+			 * Let processes that are swapped out really be
+			 * swapped out: set the limit to nothing (this will
+			 * force a swap-out).
+			 */
+			if ((p->p_flag & P_INMEM) == 0)
+				limit = 0;	/* XXX */
+			vm = vmspace_acquire_ref(p);
+			_PHOLD_LITE(p);
+			PROC_UNLOCK(p);
+			if (vm == NULL) {
+				PRELE(p);
+				continue;
+			}
+			sx_sunlock(&allproc_lock);
+
+			size = vmspace_resident_count(vm);
+			if (size >= limit) {
+				vm_swapout_map_deactivate_pages(
+				    &vm->vm_map, limit);
+				size = vmspace_resident_count(vm);
+			}
+#ifdef RACCT
+			if (racct_enable) {
+				rsize = IDX_TO_OFF(size);
+				PROC_LOCK(p);
+				if (p->p_state == PRS_NORMAL)
+					racct_set(p, RACCT_RSS, rsize);
+				ravailable = racct_get_available(p, RACCT_RSS);
+				PROC_UNLOCK(p);
+				if (rsize > ravailable) {
+					/*
+					 * Don't be overly aggressive; this
+					 * might be an innocent process,
+					 * and the limit could've been exceeded
+					 * by some memory hog.  Don't try
+					 * to deactivate more than a quarter
+					 * of the process's resident set size.
+					 */
+					if (attempts <= 8) {
+						if (ravailable < rsize -
+						    (rsize / 4)) {
+							ravailable = rsize -
+							    (rsize / 4);
+						}
+					}
+					vm_swapout_map_deactivate_pages(
+					    &vm->vm_map,
+					    OFF_TO_IDX(ravailable));
+					/* Update RSS usage after paging out. */
+					size = vmspace_resident_count(vm);
+					rsize = IDX_TO_OFF(size);
+					PROC_LOCK(p);
+					if (p->p_state == PRS_NORMAL)
+						racct_set(p, RACCT_RSS, rsize);
+					PROC_UNLOCK(p);
+					if (rsize > ravailable)
+						tryagain = 1;
+				}
+			}
+#endif
+			vmspace_free(vm);
+			sx_slock(&allproc_lock);
+			PRELE(p);
+		}
+		sx_sunlock(&allproc_lock);
+		if (tryagain != 0 && attempts <= 10) {
+			maybe_yield();
+			goto again;
+		}
+	}
+}
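
[As a worked example of the clamp above: if a process has 400 MB resident (rsize) but its
RACCT RSS limit leaves only 100 MB available, the first eight attempts raise the deactivation
target to rsize - rsize / 4 = 300 MB, so no more than a quarter of the resident set is taken
away in one pass; only if the process still exceeds its limit on later attempts does the
daemon press down toward the real 100 MB figure.]
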
+
+/*
+ * Allow a thread's kernel stack to be paged out.
+ */
+static void
+vm_thread_swapout(struct thread *td)
+{
+	vm_object_t ksobj;
+	vm_page_t m;
+	int i, pages;
+
+	cpu_thread_swapout(td);
+	pages = td->td_kstack_pages;
+	ksobj = td->td_kstack_obj;
+	pmap_qremove(td->td_kstack, pages);
+	VM_OBJECT_WLOCK(ksobj);
+	for (i = 0; i < pages; i++) {
+		m = vm_page_lookup(ksobj, i);
+		if (m == NULL)
+			panic("vm_thread_swapout: kstack already missing?");
+		vm_page_dirty(m);
+		vm_page_lock(m);
+		vm_page_unwire(m, PQ_INACTIVE);
+		vm_page_unlock(m);
+	}
+	VM_OBJECT_WUNLOCK(ksobj);
+}
+
+/*
+ * Bring the kernel stack for a specified thread back in.
+ */
+static void
+vm_thread_swapin(struct thread *td, int oom_alloc)
+{
+	vm_object_t ksobj;
+	vm_page_t ma[KSTACK_MAX_PAGES];
+	int a, count, i, j, pages, rv;
+
+	pages = td->td_kstack_pages;
+	ksobj = td->td_kstack_obj;
+	VM_OBJECT_WLOCK(ksobj);
+	(void)vm_page_grab_pages(ksobj, 0, oom_alloc | VM_ALLOC_WIRED, ma,
+	    pages);
+	for (i = 0; i < pages;) {
+		vm_page_assert_xbusied(ma[i]);
+		if (ma[i]->valid == VM_PAGE_BITS_ALL) {
+			vm_page_xunbusy(ma[i]);
+			i++;
+			continue;
+		}
+		vm_object_pip_add(ksobj, 1);
+		for (j = i + 1; j < pages; j++)
+			if (ma[j]->valid == VM_PAGE_BITS_ALL)
+				break;
+		rv = vm_pager_has_page(ksobj, ma[i]->pindex, NULL, &a);
+		KASSERT(rv == 1, ("%s: missing page %p", __func__, ma[i]));
+		count = min(a + 1, j - i);
+		rv = vm_pager_get_pages(ksobj, ma + i, count, NULL, NULL);
+		KASSERT(rv == VM_PAGER_OK, ("%s: cannot get kstack for proc %d",
+		    __func__, td->td_proc->p_pid));
+		vm_object_pip_wakeup(ksobj);
+		for (j = i; j < i + count; j++)
+			vm_page_xunbusy(ma[j]);
+		i += count;
+	}
+	VM_OBJECT_WUNLOCK(ksobj);
+	pmap_qenter(td->td_kstack, ma, pages);
+	cpu_thread_swapin(td);
+}
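
[The loop above clusters the pager reads: i is the first invalid stack page, j is the index
of the next page that is already valid, and vm_pager_has_page() reports in "a" how many
further pages are resident behind page i, so one vm_pager_get_pages() call covers
min(a + 1, j - i) pages.  For example, with a four-page stack where pages 0 and 3 survived in
memory and the pager reports one extra resident page after page 1 (a == 1), the function
unbusies pages 0 and 3 and issues a single read for pages 1 and 2.]
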
+
+void
+faultin(struct proc *p)
+{
+	struct thread *td;
+	int oom_alloc;
+
+	PROC_LOCK_ASSERT(p, MA_OWNED);
+
+	/*
+	 * If another process is swapping in this process,
+	 * just wait until it finishes.
+	 */
+	if (p->p_flag & P_SWAPPINGIN) {
+		while (p->p_flag & P_SWAPPINGIN)
+			msleep(&p->p_flag, &p->p_mtx, PVM, "faultin", 0);
+		return;
+	}
+
+	if ((p->p_flag & P_INMEM) == 0) {
+		oom_alloc = (p->p_flag & P_WKILLED) != 0 ? VM_ALLOC_SYSTEM :
+		    VM_ALLOC_NORMAL;
+
+		/*
+		 * Don't let another thread swap process p out while we are
+		 * busy swapping it in.
+		 */
+		++p->p_lock;
+		p->p_flag |= P_SWAPPINGIN;
+		PROC_UNLOCK(p);
+		sx_xlock(&allproc_lock);
+		MPASS(swapped_cnt > 0);
+		swapped_cnt--;
+		if (curthread != &thread0)
+			swap_inprogress++;
+		sx_xunlock(&allproc_lock);
+
+		/*
+		 * We hold no lock here because the list of threads
+		 * can not change while all threads in the process are
+		 * swapped out.
+		 */
+		FOREACH_THREAD_IN_PROC(p, td)
+			vm_thread_swapin(td, oom_alloc);
+
+		if (curthread != &thread0) {
+			sx_xlock(&allproc_lock);
+			MPASS(swap_inprogress > 0);
+			swap_inprogress--;
+			last_swapin = ticks;
+			sx_xunlock(&allproc_lock);
+		}
+		PROC_LOCK(p);
+		swapclear(p);
+		p->p_swtick = ticks;
+
+		/* Allow other threads to swap p out now. */
+		wakeup(&p->p_flag);
+		--p->p_lock;
+	}
+}
+
+/*
+ * This swapin algorithm attempts to swap in processes only if there
+ * is enough space for them.  Of course, if a process waits for a long
+ * time, it will be swapped in anyway.
+ */
+
+static struct proc *
+swapper_selector(bool wkilled_only)
+{
+	struct proc *p, *res;
+	struct thread *td;
+	int ppri, pri, slptime, swtime;
+
+	sx_assert(&allproc_lock, SA_SLOCKED);
+	if (swapped_cnt == 0)
+		return (NULL);
+	res = NULL;
+	ppri = INT_MIN;
+	FOREACH_PROC_IN_SYSTEM(p) {
+		PROC_LOCK(p);
+		if (p->p_state == PRS_NEW || (p->p_flag & (P_SWAPPINGOUT |
+		    P_SWAPPINGIN | P_INMEM)) != 0) {
+			PROC_UNLOCK(p);
+			continue;
+		}
+		if (p->p_state == PRS_NORMAL && (p->p_flag & P_WKILLED) != 0) {
+			/*
+			 * A swapped-out process might have mapped a
+			 * large portion of the system's pages as
+			 * anonymous memory.  There is no other way to
+			 * release the memory other than to kill the
+			 * process, for which we need to swap it in.
+			 */
+			return (p);
+		}
+		if (wkilled_only) {
+			PROC_UNLOCK(p);
+			continue;
+		}
+		swtime = (ticks - p->p_swtick) / hz;
+		FOREACH_THREAD_IN_PROC(p, td) {
+			/*
+			 * An otherwise runnable thread of a process
+			 * swapped out has only the TDI_SWAPPED bit set.
+			 */
+			thread_lock(td);
+			if (td->td_inhibitors == TDI_SWAPPED) {
+				slptime = (ticks - td->td_slptick) / hz;
+				pri = swtime + slptime;
+				if ((td->td_flags & TDF_SWAPINREQ) == 0)
+					pri -= p->p_nice * 8;
+				/*
+				 * if this thread is higher priority
+				 * and there is enough space, then select
+				 * this process instead of the previous
+				 * selection.
+				 */
+				if (pri > ppri) {
+					res = p;
+					ppri = pri;
+				}
+			}
+			thread_unlock(td);
+		}
+		PROC_UNLOCK(p);
+	}
+
+	if (res != NULL)
+		PROC_LOCK(res);
+	return (res);
+}
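
[The score computed above is simply pri = swtime + slptime, minus p_nice * 8 when the thread
has not explicitly requested a swap-in, so long-idle, normal-priority processes win.  For
example, a process swapped out for 30 seconds whose thread has slept 20 seconds at nice 0
scores 50, while one swapped out and asleep for 10 seconds each at nice 20 scores
10 + 10 - 160 = -140 and is brought in later.]
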
+
+#define	SWAPIN_INTERVAL	(MAXSLP * hz / 2)
+
+/*
+ * Limit swapper to swap in one non-WKILLED process in MAXSLP/2
+ * interval, assuming that there is:
+ * - no memory shortage;
+ * - no parallel swap-ins;
+ * - no other swap-ins in the current SWAPIN_INTERVAL.
+ */
+static bool
+swapper_wkilled_only(void)
+{
+
+	return (vm_page_count_min() || swap_inprogress > 0 ||
+	    (u_int)(ticks - last_swapin) < SWAPIN_INTERVAL);
+}
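
[Assuming MAXSLP keeps its usual value of 20 (seconds), SWAPIN_INTERVAL works out to
10 * hz ticks, so outside of a memory shortage, WKILLED processes, and in-flight swap-ins the
swapper volunteers at most one swap-in roughly every ten seconds.]
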
+
+void
+swapper(void)
+{
+	struct proc *p;
+
+	for (;;) {
+		sx_slock(&allproc_lock);
+		p = swapper_selector(swapper_wkilled_only());
+		sx_sunlock(&allproc_lock);
+
+		if (p == NULL) {
+			tsleep(&proc0, PVM, "swapin", SWAPIN_INTERVAL);
+		} else {
+			PROC_LOCK_ASSERT(p, MA_OWNED);
+
+			/*
+			 * Another process may be bringing or may have
+			 * already brought this process in while we
+			 * traverse all threads.  Or, this process may
+			 * have exited or even be swapped out
+			 * again.
+			 */
+			if (p->p_state == PRS_NORMAL && (p->p_flag & (P_INMEM |
+			    P_SWAPPINGOUT | P_SWAPPINGIN)) == 0) {
+				faultin(p);
+			}
+			PROC_UNLOCK(p);
+		}
+	}
+}
+
+/*
+ * First, if any processes have been sleeping or stopped for at least
+ * "swap_idle_threshold1" seconds, they are swapped out.  If, however,
+ * no such processes exist, then the longest-sleeping or stopped
+ * process is swapped out.  Finally, and only as a last resort, if
+ * there are no sleeping or stopped processes, the longest-resident
+ * process is swapped out.
+ */
+static void
+swapout_procs(int action)
+{
+	struct proc *p;
+	struct thread *td;
+	int slptime;
+	bool didswap, doswap;
+
+	MPASS((action & (VM_SWAP_NORMAL | VM_SWAP_IDLE)) != 0);
+
+	didswap = false;
+	sx_slock(&allproc_lock);
+	FOREACH_PROC_IN_SYSTEM(p) {
+		/*
+		 * Filter out not yet fully constructed processes.  Do
+		 * not swap out held processes.  Avoid processes which
+		 * are system, exiting, execing, traced, already swapped
+		 * out or are in the process of being swapped in or out.
+		 */
+		PROC_LOCK(p);
+		if (p->p_state != PRS_NORMAL || p->p_lock != 0 || (p->p_flag &
+		    (P_SYSTEM | P_WEXIT | P_INEXEC | P_STOPPED_SINGLE |
+		    P_TRACED | P_SWAPPINGOUT | P_SWAPPINGIN | P_INMEM)) !=
+		    P_INMEM) {
+			PROC_UNLOCK(p);
+			continue;
+		}
+
+		/*
+		 * Further consideration of this process for swap out
+		 * requires iterating over its threads.  We release
+		 * allproc_lock here so that process creation and
+		 * destruction are not blocked while we iterate.
+		 *
+		 * To later reacquire allproc_lock and resume
+		 * iteration over the allproc list, we will first have
+		 * to release the lock on the process.  We place a
+		 * hold on the process so that it remains in the
+		 * allproc list while it is unlocked.
+		 */
+		_PHOLD_LITE(p);
+		sx_sunlock(&allproc_lock);
+
+		/*
+		 * Do not swapout a realtime process.
+		 * Guarantee swap_idle_threshold1 time in memory.
+		 * If the system is under memory stress, or if we are
+		 * swapping idle processes >= swap_idle_threshold2,
+		 * then swap the process out.
+		 */
+		doswap = true;
+		FOREACH_THREAD_IN_PROC(p, td) {
+			thread_lock(td);
+			slptime = (ticks - td->td_slptick) / hz;
+			if (PRI_IS_REALTIME(td->td_pri_class) ||
+			    slptime < swap_idle_threshold1 ||
+			    !thread_safetoswapout(td) ||
+			    ((action & VM_SWAP_NORMAL) == 0 &&
+			    slptime < swap_idle_threshold2))
+				doswap = false;
+			thread_unlock(td);
+			if (!doswap)
+				break;
+		}
+		if (doswap && swapout(p) == 0)
+			didswap = true;
+
+		PROC_UNLOCK(p);
+		if (didswap) {
+			sx_xlock(&allproc_lock);
+			swapped_cnt++;
+			sx_downgrade(&allproc_lock);
+		} else
+			sx_slock(&allproc_lock);
+		PRELE(p);
+	}
+	sx_sunlock(&allproc_lock);
+
+	/*
+	 * If we swapped something out, and another process needed memory,
+	 * then wake up the swapper process.
+	 */
+	if (didswap)
+		wakeup(&proc0);
+}
+
+static void
+swapclear(struct proc *p)
+{
+	struct thread *td;
+
+	PROC_LOCK_ASSERT(p, MA_OWNED);
+
+	FOREACH_THREAD_IN_PROC(p, td) {
+		thread_lock(td);
+		td->td_flags |= TDF_INMEM;
+		td->td_flags &= ~TDF_SWAPINREQ;
+		TD_CLR_SWAPPED(td);
+		if (TD_CAN_RUN(td))
+			if (setrunnable(td)) {
+#ifdef INVARIANTS
+				/*
+				 * XXX: We just cleared TDI_SWAPPED
+				 * above and set TDF_INMEM, so this
+				 * should never happen.
+				 */
+				panic("not waking up swapper");
+#endif
+			}
+		thread_unlock(td);
+	}
+	p->p_flag &= ~(P_SWAPPINGIN | P_SWAPPINGOUT);
+	p->p_flag |= P_INMEM;
+}
+
+static int
+swapout(struct proc *p)
+{
+	struct thread *td;
+
+	PROC_LOCK_ASSERT(p, MA_OWNED);
+
+	/*
+	 * The states of this process and its threads may have changed
+	 * by now.  Assuming that there is only one pageout daemon thread,
+	 * this process should still be in memory.
+	 */
+	KASSERT((p->p_flag & (P_INMEM | P_SWAPPINGOUT | P_SWAPPINGIN)) ==
+	    P_INMEM, ("swapout: lost a swapout race?"));
+
+	/*
+	 * Remember the resident count.
+	 */
+	p->p_vmspace->vm_swrss = vmspace_resident_count(p->p_vmspace);
+
+	/*
+	 * Check and mark all threads before we proceed.
+	 */
+	p->p_flag &= ~P_INMEM;
+	p->p_flag |= P_SWAPPINGOUT;
+	FOREACH_THREAD_IN_PROC(p, td) {
+		thread_lock(td);
+		if (!thread_safetoswapout(td)) {
+			thread_unlock(td);
+			swapclear(p);
+			return (EBUSY);
+		}
+		td->td_flags &= ~TDF_INMEM;
+		TD_SET_SWAPPED(td);
+		thread_unlock(td);
+	}
+	td = FIRST_THREAD_IN_PROC(p);
+	++td->td_ru.ru_nswap;
+	PROC_UNLOCK(p);
+
+	/*
+	 * This list is stable because all threads are now prevented from
+	 * running.  The list is only modified in the context of a running
+	 * thread in this process.
+	 */
+	FOREACH_THREAD_IN_PROC(p, td)
+		vm_thread_swapout(td);
+
+	PROC_LOCK(p);
+	p->p_flag &= ~P_SWAPPINGOUT;
+	p->p_swtick = ticks;
+	return (0);
+}


Property changes on: trunk/sys/vm/vm_swapout.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/vm/vm_swapout_dummy.c
===================================================================
--- trunk/sys/vm/vm_swapout_dummy.c	                        (rev 0)
+++ trunk/sys/vm/vm_swapout_dummy.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -0,0 +1,123 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 1991 Regents of the University of California.
+ * All rights reserved.
+ * Copyright (c) 1994 John S. Dyson
+ * All rights reserved.
+ * Copyright (c) 1994 David Greenman
+ * All rights reserved.
+ * Copyright (c) 2005 Yahoo! Technologies Norway AS
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * The Mach Operating System project at Carnegie-Mellon University.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	from: @(#)vm_pageout.c	7.4 (Berkeley) 5/7/91
+ *
+ *
+ * Copyright (c) 1987, 1990 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Authors: Avadis Tevanian, Jr., Michael Wayne Young
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ *  Software Distribution Coordinator  or  Software.Distribution at CS.CMU.EDU
+ *  School of Computer Science
+ *  Carnegie Mellon University
+ *  Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_swapout_dummy.c 325647 2017-11-10 13:17:40Z kib $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sysctl.h>
+#include <sys/vmmeter.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_pageout.h>
+
+static int vm_swap_enabled = 0;
+SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, CTLFLAG_RD,
+    &vm_swap_enabled, 0,
+    "Enable entire process swapout");
+
+static int vm_swap_idle_enabled = 0;
+SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, CTLFLAG_RD,
+    &vm_swap_idle_enabled, 0,
+    "Allow swapout on idle criteria");
+
+void
+vm_swapout_run(void)
+{
+}
+
+void
+vm_swapout_run_idle(void)
+{
+}
+
+void
+faultin(struct proc *p)
+{
+
+	PROC_LOCK_ASSERT(p, MA_OWNED);
+	if ((p->p_flag & P_INMEM) == 0)
+		panic("faultin: proc %p swapped out with NO_SWAPPING", p);
+}
+
+void
+swapper(void)
+{
+
+	for (;;)
+		tsleep(&proc0, PVM, "swapin", MAXSLP * hz);
+}


Property changes on: trunk/sys/vm/vm_swapout_dummy.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Modified: trunk/sys/vm/vm_unix.c
===================================================================
--- trunk/sys/vm/vm_unix.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_unix.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -44,7 +44,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vm_unix.c 284665 2015-06-21 06:28:26Z trasz $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_unix.c 341467 2018-12-04 15:04:48Z emaste $");
 
 #include <sys/param.h>
 #include <sys/lock.h>
@@ -72,9 +72,7 @@
  */
 /* ARGSUSED */
 int
-sys_obreak(td, uap)
-	struct thread *td;
-	struct obreak_args *uap;
+sys_obreak(struct thread *td, struct obreak_args *uap)
 {
 	struct vmspace *vm = td->td_proc->p_vmspace;
 	vm_map_t map = &vm->vm_map;
@@ -84,11 +82,9 @@
 	int error = 0;
 	boolean_t do_map_wirefuture;
 
-	PROC_LOCK(td->td_proc);
-	datalim = lim_cur(td->td_proc, RLIMIT_DATA);
-	lmemlim = lim_cur(td->td_proc, RLIMIT_MEMLOCK);
-	vmemlim = lim_cur(td->td_proc, RLIMIT_VMEM);
-	PROC_UNLOCK(td->td_proc);
+	datalim = lim_cur(td, RLIMIT_DATA);
+	lmemlim = lim_cur(td, RLIMIT_MEMLOCK);
+	vmemlim = lim_cur(td, RLIMIT_VMEM);
 
 	do_map_wirefuture = FALSE;
 	new = round_page((vm_offset_t)uap->nsize);
@@ -167,7 +163,7 @@
 #endif
 		prot = VM_PROT_RW;
 #ifdef COMPAT_FREEBSD32
-#if defined(__amd64__) || defined(__ia64__)
+#if defined(__amd64__)
 		if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32))
 			prot |= VM_PROT_EXECUTE;
 #endif
@@ -248,9 +244,7 @@
  */
 /* ARGSUSED */
 int
-sys_ovadvise(td, uap)
-	struct thread *td;
-	struct ovadvise_args *uap;
+sys_ovadvise(struct thread *td, struct ovadvise_args *uap)
 {
 	/* START_GIANT_OPTIONAL */
 	/* END_GIANT_OPTIONAL */

Modified: trunk/sys/vm/vm_zeroidle.c
===================================================================
--- trunk/sys/vm/vm_zeroidle.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vm_zeroidle.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -34,7 +34,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vm_zeroidle.c 254065 2013-08-07 16:36:38Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vm_zeroidle.c 267992 2014-06-28 03:56:17Z hselasky $");
 
 #include <opt_sched.h>
 
@@ -56,10 +56,9 @@
 #include <vm/vm_phys.h>
 
 static int idlezero_enable_default = 0;
-TUNABLE_INT("vm.idlezero_enable", &idlezero_enable_default);
 /* Defer setting the enable flag until the kthread is running. */
 static int idlezero_enable = 0;
-SYSCTL_INT(_vm, OID_AUTO, idlezero_enable, CTLFLAG_RW, &idlezero_enable, 0,
+SYSCTL_INT(_vm, OID_AUTO, idlezero_enable, CTLFLAG_RWTUN, &idlezero_enable, 0,
     "Allow the kernel to use idle cpu cycles to zero-out pages");
 /*
  * Implement the pre-zeroed page mechanism.
@@ -85,9 +84,9 @@
 	 * fast sleeps.  We also do not want to be continuously zeroing
 	 * pages because doing so may flush our L1 and L2 caches too much.
 	 */
-	if (zero_state && vm_page_zero_count >= ZIDLE_LO(cnt.v_free_count))
+	if (zero_state && vm_page_zero_count >= ZIDLE_LO(vm_cnt.v_free_count))
 		return (0);
-	if (vm_page_zero_count >= ZIDLE_HI(cnt.v_free_count))
+	if (vm_page_zero_count >= ZIDLE_HI(vm_cnt.v_free_count))
 		return (0);
 	return (1);
 }
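
[The pair of comparisons above gives the pre-zeroing a hysteresis band: once the count of
pre-zeroed pages reaches the ZIDLE_HI fraction of the free list, zero_state is set and
zeroing stays off until the count falls back below ZIDLE_LO, so the idle thread does not
oscillate around a single threshold and repeatedly pollute the caches.]
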
@@ -99,7 +98,7 @@
 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 	zero_state = 0;
 	if (vm_phys_zero_pages_idle()) {
-		if (vm_page_zero_count >= ZIDLE_HI(cnt.v_free_count))
+		if (vm_page_zero_count >= ZIDLE_HI(vm_cnt.v_free_count))
 			zero_state = 1;
 	}
 }

Modified: trunk/sys/vm/vnode_pager.c
===================================================================
--- trunk/sys/vm/vnode_pager.c	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vnode_pager.c	2020-02-08 19:35:48 UTC (rev 12314)
@@ -52,8 +52,10 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: stable/10/sys/vm/vnode_pager.c 291454 2015-11-29 14:44:40Z kib $");
+__FBSDID("$FreeBSD: stable/11/sys/vm/vnode_pager.c 331722 2018-03-29 02:50:57Z eadler $");
 
+#include "opt_vm.h"
+
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/proc.h>
@@ -83,21 +85,27 @@
 static int vnode_pager_input_smlfs(vm_object_t object, vm_page_t m);
 static int vnode_pager_input_old(vm_object_t object, vm_page_t m);
 static void vnode_pager_dealloc(vm_object_t);
-static int vnode_pager_getpages(vm_object_t, vm_page_t *, int, int);
+static int vnode_pager_getpages(vm_object_t, vm_page_t *, int, int *, int *);
+static int vnode_pager_getpages_async(vm_object_t, vm_page_t *, int, int *,
+    int *, vop_getpages_iodone_t, void *);
 static void vnode_pager_putpages(vm_object_t, vm_page_t *, int, int, int *);
 static boolean_t vnode_pager_haspage(vm_object_t, vm_pindex_t, int *, int *);
 static vm_object_t vnode_pager_alloc(void *, vm_ooffset_t, vm_prot_t,
     vm_ooffset_t, struct ucred *cred);
+static int vnode_pager_generic_getpages_done(struct buf *);
+static void vnode_pager_generic_getpages_done_async(struct buf *);
 
 struct pagerops vnodepagerops = {
 	.pgo_alloc =	vnode_pager_alloc,
 	.pgo_dealloc =	vnode_pager_dealloc,
 	.pgo_getpages =	vnode_pager_getpages,
+	.pgo_getpages_async = vnode_pager_getpages_async,
 	.pgo_putpages =	vnode_pager_putpages,
 	.pgo_haspage =	vnode_pager_haspage,
 };
 
 int vnode_pbuf_freecnt;
+int vnode_async_pbuf_freecnt;
 
 /* Create the VM system backing object for this vnode */
 int
@@ -157,14 +165,26 @@
 		return;
 	ASSERT_VOP_ELOCKED(vp, "vnode_destroy_vobject");
 	VM_OBJECT_WLOCK(obj);
+	umtx_shm_object_terminated(obj);
 	if (obj->ref_count == 0) {
 		/*
 		 * don't double-terminate the object
 		 */
-		if ((obj->flags & OBJ_DEAD) == 0)
+		if ((obj->flags & OBJ_DEAD) == 0) {
 			vm_object_terminate(obj);
-		else
+		} else {
+			/*
+			 * Waiters were already handled during object
+			 * termination.  The exclusive vnode lock hopefully
+			 * prevented new waiters from referencing the dying
+			 * object.
+			 */
+			KASSERT((obj->flags & OBJ_DISCONNECTWNT) == 0,
+			    ("OBJ_DISCONNECTWNT set obj %p flags %x",
+			    obj, obj->flags));
+			vp->v_object = NULL;
 			VM_OBJECT_WUNLOCK(obj);
+		}
 	} else {
 		/*
 		 * Woe to the process that tries to page now :-).
@@ -172,7 +192,7 @@
 		vm_pager_deallocate(obj);
 		VM_OBJECT_WUNLOCK(obj);
 	}
-	vp->v_object = NULL;
+	KASSERT(vp->v_object == NULL, ("vp %p obj %p", vp, vp->v_object));
 }
 
 
@@ -241,9 +261,12 @@
 		VI_UNLOCK(vp);
 	} else {
 		object->ref_count++;
+#if VM_NRESERVLEVEL > 0
+		vm_object_color(object, 0);
+#endif
 		VM_OBJECT_WUNLOCK(object);
 	}
-	vref(vp);
+	vrefact(vp);
 	return (object);
 }
 
@@ -251,8 +274,7 @@
  *	The object must be locked.
  */
 static void
-vnode_pager_dealloc(object)
-	vm_object_t object;
+vnode_pager_dealloc(vm_object_t object)
 {
 	struct vnode *vp;
 	int refs;
@@ -287,11 +309,8 @@
 }
 
 static boolean_t
-vnode_pager_haspage(object, pindex, before, after)
-	vm_object_t object;
-	vm_pindex_t pindex;
-	int *before;
-	int *after;
+vnode_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before,
+    int *after)
 {
 	struct vnode *vp = object->handle;
 	daddr_t bn;
@@ -338,16 +357,21 @@
 			*before += poff;
 		}
 		if (after) {
-			int numafter;
+			/*
+			 * The BMAP vop can report a partial block in the
+			 * 'after', but must not report blocks after EOF.
+			 * Assert the latter, and truncate 'after' in case
+			 * of the former.
+			 */
+			KASSERT((reqblock + *after) * pagesperblock <
+			    roundup2(object->size, pagesperblock),
+			    ("%s: reqblock %jd after %d size %ju", __func__,
+			    (intmax_t )reqblock, *after,
+			    (uintmax_t )object->size));
 			*after *= pagesperblock;
-			numafter = pagesperblock - (poff + 1);
-			if (IDX_TO_OFF(pindex + numafter) >
-			    object->un_pager.vnp.vnp_size) {
-				numafter =
-		    		    OFF_TO_IDX(object->un_pager.vnp.vnp_size) -
-				    pindex;
-			}
-			*after += numafter;
+			*after += pagesperblock - (poff + 1);
+			if (pindex + *after >= object->size)
+				*after = object->size - 1 - pindex;
 		}
 	} else {
 		if (before) {
@@ -370,9 +394,7 @@
  * operation (possibly at object termination time), so we must be careful.
  */
 void
-vnode_pager_setsize(vp, nsize)
-	struct vnode *vp;
-	vm_ooffset_t nsize;
+vnode_pager_setsize(struct vnode *vp, vm_ooffset_t nsize)
 {
 	vm_object_t object;
 	vm_page_t m;
@@ -445,10 +467,6 @@
 			 * replacement from working properly.
 			 */
 			vm_page_clear_dirty(m, base, PAGE_SIZE - base);
-		} else if ((nsize & PAGE_MASK) &&
-		    vm_page_is_cached(object, OFF_TO_IDX(nsize))) {
-			vm_page_cache_free(object, OFF_TO_IDX(nsize),
-			    nobjsize);
 		}
 	}
 	object->un_pager.vnp.vnp_size = nsize;
@@ -497,9 +515,7 @@
  * small block filesystem vnode pager input
  */
 static int
-vnode_pager_input_smlfs(object, m)
-	vm_object_t object;
-	vm_page_t m;
+vnode_pager_input_smlfs(vm_object_t object, vm_page_t m)
 {
 	struct vnode *vp;
 	struct bufobj *bo;
@@ -591,9 +607,7 @@
  * old style vnode pager input routine
  */
 static int
-vnode_pager_input_old(object, m)
-	vm_object_t object;
-	vm_page_t m;
+vnode_pager_input_old(vm_object_t object, vm_page_t m)
 {
 	struct uio auio;
 	struct iovec aiov;
@@ -666,19 +680,15 @@
  * backing vp's VOP_GETPAGES.
  */
 static int
-vnode_pager_getpages(object, m, count, reqpage)
-	vm_object_t object;
-	vm_page_t *m;
-	int count;
-	int reqpage;
+vnode_pager_getpages(vm_object_t object, vm_page_t *m, int count, int *rbehind,
+    int *rahead)
 {
+	struct vnode *vp;
 	int rtval;
-	struct vnode *vp;
-	int bytes = count * PAGE_SIZE;
 
 	vp = object->handle;
 	VM_OBJECT_WUNLOCK(object);
-	rtval = VOP_GETPAGES(vp, m, bytes, reqpage, 0);
+	rtval = VOP_GETPAGES(vp, m, count, rbehind, rahead);
 	KASSERT(rtval != EOPNOTSUPP,
 	    ("vnode_pager: FS getpages not implemented\n"));
 	VM_OBJECT_WLOCK(object);
@@ -685,261 +695,373 @@
 	return rtval;
 }
 
+static int
+vnode_pager_getpages_async(vm_object_t object, vm_page_t *m, int count,
+    int *rbehind, int *rahead, vop_getpages_iodone_t iodone, void *arg)
+{
+	struct vnode *vp;
+	int rtval;
+
+	vp = object->handle;
+	VM_OBJECT_WUNLOCK(object);
+	rtval = VOP_GETPAGES_ASYNC(vp, m, count, rbehind, rahead, iodone, arg);
+	KASSERT(rtval != EOPNOTSUPP,
+	    ("vnode_pager: FS getpages_async not implemented\n"));
+	VM_OBJECT_WLOCK(object);
+	return (rtval);
+}
+
 /*
+ * The implementation of VOP_GETPAGES() and VOP_GETPAGES_ASYNC() for
+ * local filesystems, where partially valid pages can only occur at
+ * the end of file.
+ */
+int
+vnode_pager_local_getpages(struct vop_getpages_args *ap)
+{
+
+	return (vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count,
+	    ap->a_rbehind, ap->a_rahead, NULL, NULL));
+}
+
+int
+vnode_pager_local_getpages_async(struct vop_getpages_async_args *ap)
+{
+
+	return (vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count,
+	    ap->a_rbehind, ap->a_rahead, ap->a_iodone, ap->a_arg));
+}
+
+/*
  * This is now called from local media FS's to operate against their
  * own vnodes if they fail to implement VOP_GETPAGES.
  */
 int
-vnode_pager_generic_getpages(vp, m, bytecount, reqpage)
-	struct vnode *vp;
-	vm_page_t *m;
-	int bytecount;
-	int reqpage;
+vnode_pager_generic_getpages(struct vnode *vp, vm_page_t *m, int count,
+    int *a_rbehind, int *a_rahead, vop_getpages_iodone_t iodone, void *arg)
 {
 	vm_object_t object;
 	struct bufobj *bo;
 	struct buf *bp;
-	struct mount *mp;
-	vm_offset_t kva;
-	daddr_t firstaddr, reqblock;
-	off_t foff, nextoff, tfoff, pib;
-	int pbefore, pafter, i, size, bsize, first, last;
-	int count, error, before, after, secmask;
+	off_t foff;
+	int bsize, pagesperblock, *freecnt;
+	int error, before, after, rbehind, rahead, poff, i;
+	int bytecount, secmask;
 
 	KASSERT(vp->v_type != VCHR && vp->v_type != VBLK,
-	    ("vnode_pager_generic_getpages does not support devices"));
+	    ("%s does not support devices", __func__));
+
 	if (vp->v_iflag & VI_DOOMED)
 		return (VM_PAGER_BAD);
 
 	object = vp->v_object;
-	count = bytecount / PAGE_SIZE;
+	foff = IDX_TO_OFF(m[0]->pindex);
 	bsize = vp->v_mount->mnt_stat.f_iosize;
+	pagesperblock = bsize / PAGE_SIZE;
 
-	/* get the UNDERLYING device for the file with VOP_BMAP() */
+	KASSERT(foff < object->un_pager.vnp.vnp_size,
+	    ("%s: page %p offset beyond vp %p size", __func__, m[0], vp));
+	KASSERT(count <= sizeof(bp->b_pages),
+	    ("%s: requested %d pages", __func__, count));
 
 	/*
-	 * originally, we did not check for an error return value -- assuming
-	 * an fs always has a bmap entry point -- that assumption is wrong!!!
+	 * The last page has valid blocks.  Invalid part can only
+	 * exist at the end of file, and the page is made fully valid
+	 * by zeroing in vm_pager_get_pages().
 	 */
-	foff = IDX_TO_OFF(m[reqpage]->pindex);
+	if (m[count - 1]->valid != 0 && --count == 0) {
+		if (iodone != NULL)
+			iodone(arg, m, 1, 0);
+		return (VM_PAGER_OK);
+	}
 
 	/*
-	 * if we can't bmap, use old VOP code
+	 * Synchronous and asynchronous paging operations use different
+	 * free pbuf counters.  This is done to avoid asynchronous requests
+	 * to consume all pbufs.
+	 * Allocate the pbuf at the very beginning of the function, so that
+	 * if we are low on certain kind of pbufs don't even proceed to BMAP,
+	 * but sleep.
 	 */
-	error = VOP_BMAP(vp, IDX_TO_OFF(m[reqpage]->pindex) / bsize, &bo,
-	    &reqblock, &after, &before);
+	freecnt = iodone != NULL ?
+	    &vnode_async_pbuf_freecnt : &vnode_pbuf_freecnt;
+	bp = getpbuf(freecnt);
+
+	/*
+	 * Get the underlying device blocks for the file with VOP_BMAP().
+	 * If the file system doesn't support VOP_BMAP, use old way of
+	 * getting pages via VOP_READ.
+	 */
+	error = VOP_BMAP(vp, foff / bsize, &bo, &bp->b_blkno, &after, &before);
 	if (error == EOPNOTSUPP) {
+		relpbuf(bp, freecnt);
 		VM_OBJECT_WLOCK(object);
-		
-		for (i = 0; i < count; i++)
-			if (i != reqpage) {
-				vm_page_lock(m[i]);
-				vm_page_free(m[i]);
-				vm_page_unlock(m[i]);
-			}
-		PCPU_INC(cnt.v_vnodein);
-		PCPU_INC(cnt.v_vnodepgsin);
-		error = vnode_pager_input_old(object, m[reqpage]);
+		for (i = 0; i < count; i++) {
+			PCPU_INC(cnt.v_vnodein);
+			PCPU_INC(cnt.v_vnodepgsin);
+			error = vnode_pager_input_old(object, m[i]);
+			if (error)
+				break;
+		}
 		VM_OBJECT_WUNLOCK(object);
 		return (error);
 	} else if (error != 0) {
-		VM_OBJECT_WLOCK(object);
-		for (i = 0; i < count; i++)
-			if (i != reqpage) {
-				vm_page_lock(m[i]);
-				vm_page_free(m[i]);
-				vm_page_unlock(m[i]);
-			}
-		VM_OBJECT_WUNLOCK(object);
+		relpbuf(bp, freecnt);
 		return (VM_PAGER_ERROR);
+	}
 
-		/*
-		 * if the blocksize is smaller than a page size, then use
-		 * special small filesystem code.  NFS sometimes has a small
-		 * blocksize, but it can handle large reads itself.
-		 */
-	} else if ((PAGE_SIZE / bsize) > 1 &&
-	    (vp->v_mount->mnt_stat.f_type != nfs_mount_type)) {
-		VM_OBJECT_WLOCK(object);
-		for (i = 0; i < count; i++)
-			if (i != reqpage) {
-				vm_page_lock(m[i]);
-				vm_page_free(m[i]);
-				vm_page_unlock(m[i]);
-			}
-		VM_OBJECT_WUNLOCK(object);
-		PCPU_INC(cnt.v_vnodein);
-		PCPU_INC(cnt.v_vnodepgsin);
-		return (vnode_pager_input_smlfs(object, m[reqpage]));
+	/*
+	 * If the file system supports BMAP, but blocksize is smaller
+	 * than a page size, then use special small filesystem code.
+	 */
+	if (pagesperblock == 0) {
+		relpbuf(bp, freecnt);
+		for (i = 0; i < count; i++) {
+			PCPU_INC(cnt.v_vnodein);
+			PCPU_INC(cnt.v_vnodepgsin);
+			error = vnode_pager_input_smlfs(object, m[i]);
+			if (error)
+				break;
+		}
+		return (error);
 	}
 
 	/*
-	 * If we have a completely valid page available to us, we can
-	 * clean up and return.  Otherwise we have to re-read the
-	 * media.
+	 * A sparse file can be encountered only for a single page request,
+	 * which may not be preceded by call to vm_pager_haspage().
 	 */
-	VM_OBJECT_WLOCK(object);
-	if (m[reqpage]->valid == VM_PAGE_BITS_ALL) {
-		for (i = 0; i < count; i++)
-			if (i != reqpage) {
-				vm_page_lock(m[i]);
-				vm_page_free(m[i]);
-				vm_page_unlock(m[i]);
-			}
+	if (bp->b_blkno == -1) {
+		KASSERT(count == 1,
+		    ("%s: array[%d] request to a sparse file %p", __func__,
+		    count, vp));
+		relpbuf(bp, freecnt);
+		pmap_zero_page(m[0]);
+		KASSERT(m[0]->dirty == 0, ("%s: page %p is dirty",
+		    __func__, m[0]));
+		VM_OBJECT_WLOCK(object);
+		m[0]->valid = VM_PAGE_BITS_ALL;
 		VM_OBJECT_WUNLOCK(object);
-		return VM_PAGER_OK;
-	} else if (reqblock == -1) {
-		pmap_zero_page(m[reqpage]);
-		KASSERT(m[reqpage]->dirty == 0,
-		    ("vnode_pager_generic_getpages: page %p is dirty", m));
-		m[reqpage]->valid = VM_PAGE_BITS_ALL;
-		for (i = 0; i < count; i++)
-			if (i != reqpage) {
-				vm_page_lock(m[i]);
-				vm_page_free(m[i]);
-				vm_page_unlock(m[i]);
-			}
-		VM_OBJECT_WUNLOCK(object);
 		return (VM_PAGER_OK);
 	}
-	m[reqpage]->valid = 0;
-	VM_OBJECT_WUNLOCK(object);
 
-	pib = IDX_TO_OFF(m[reqpage]->pindex) % bsize;
-	pbefore = ((daddr_t)before * bsize + pib) / PAGE_SIZE;
-	pafter = ((daddr_t)(after + 1) * bsize - pib) / PAGE_SIZE - 1;
-	first = reqpage < pbefore ? 0 : reqpage - pbefore;
-	last = reqpage + pafter >= count ? count - 1 : reqpage + pafter;
-	if (first > 0 || last + 1 < count) {
+	bp->b_blkno += (foff % bsize) / DEV_BSIZE;
+
+	/* Recalculate blocks available after/before to pages. */
+	poff = (foff % bsize) / PAGE_SIZE;
+	before *= pagesperblock;
+	before += poff;
+	after *= pagesperblock;
+	after += pagesperblock - (poff + 1);
+	if (m[0]->pindex + after >= object->size)
+		after = object->size - 1 - m[0]->pindex;
+	KASSERT(count <= after + 1, ("%s: %d pages asked, can do only %d",
+	    __func__, count, after + 1));
+	after -= count - 1;
+
+	/* Trim requested rbehind/rahead to possible values. */   
+	rbehind = a_rbehind ? *a_rbehind : 0;
+	rahead = a_rahead ? *a_rahead : 0;
+	rbehind = min(rbehind, before);
+	rbehind = min(rbehind, m[0]->pindex);
+	rahead = min(rahead, after);
+	rahead = min(rahead, object->size - m[count - 1]->pindex);
+	KASSERT(rbehind + rahead + count <= sizeof(bp->b_pages),
+	    ("%s: behind %d ahead %d count %d", __func__,
+	    rbehind, rahead, count));
+
+	/*
+	 * Fill in the bp->b_pages[] array with requested and optional   
+	 * read behind or read ahead pages.  Read behind pages are looked
+	 * up in a backward direction, down to a first cached page.  Same
+	 * for read ahead pages, but there is no need to shift the array
+	 * in case of encountering a cached page.
+	 */
+	i = bp->b_npages = 0;
+	if (rbehind) {
+		vm_pindex_t startpindex, tpindex;
+		vm_page_t p;
+
 		VM_OBJECT_WLOCK(object);
-		for (i = 0; i < first; i++) {
-			vm_page_lock(m[i]);
-			vm_page_free(m[i]);
-			vm_page_unlock(m[i]);
+		startpindex = m[0]->pindex - rbehind;
+		if ((p = TAILQ_PREV(m[0], pglist, listq)) != NULL &&
+		    p->pindex >= startpindex)
+			startpindex = p->pindex + 1;
+
+		/* tpindex is unsigned; beware of numeric underflow. */
+		for (tpindex = m[0]->pindex - 1;
+		    tpindex >= startpindex && tpindex < m[0]->pindex;
+		    tpindex--, i++) {
+			p = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL);
+			if (p == NULL) {
+				/* Shift the array. */
+				for (int j = 0; j < i; j++)
+					bp->b_pages[j] = bp->b_pages[j + 
+					    tpindex + 1 - startpindex]; 
+				break;
+			}
+			bp->b_pages[tpindex - startpindex] = p;
 		}
-		for (i = last + 1; i < count; i++) {
-			vm_page_lock(m[i]);
-			vm_page_free(m[i]);
-			vm_page_unlock(m[i]);
+
+		bp->b_pgbefore = i;
+		bp->b_npages += i;
+		bp->b_blkno -= IDX_TO_OFF(i) / DEV_BSIZE;
+	} else
+		bp->b_pgbefore = 0;
+
+	/* Requested pages. */
+	for (int j = 0; j < count; j++, i++)
+		bp->b_pages[i] = m[j];
+	bp->b_npages += count;
+
+	if (rahead) {
+		vm_pindex_t endpindex, tpindex;
+		vm_page_t p;
+
+		if (!VM_OBJECT_WOWNED(object))
+			VM_OBJECT_WLOCK(object);
+		endpindex = m[count - 1]->pindex + rahead + 1;
+		if ((p = TAILQ_NEXT(m[count - 1], listq)) != NULL &&
+		    p->pindex < endpindex)
+			endpindex = p->pindex;
+		if (endpindex > object->size)
+			endpindex = object->size;
+
+		for (tpindex = m[count - 1]->pindex + 1;
+		    tpindex < endpindex; i++, tpindex++) {
+			p = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL);
+			if (p == NULL)
+				break;
+			bp->b_pages[i] = p;
 		}
-		VM_OBJECT_WUNLOCK(object);
-	}
 
-	/*
-	 * here on direct device I/O
-	 */
-	firstaddr = reqblock;
-	firstaddr += pib / DEV_BSIZE;
-	firstaddr -= IDX_TO_OFF(reqpage - first) / DEV_BSIZE;
+		bp->b_pgafter = i - bp->b_npages;
+		bp->b_npages = i;
+	} else
+		bp->b_pgafter = 0;
 
-	/*
-	 * The first and last page have been calculated now, move
-	 * input pages to be zero based, and adjust the count.
-	 */
-	m += first;
-	reqpage -= first;
-	count = last - first + 1;
+	if (VM_OBJECT_WOWNED(object))
+		VM_OBJECT_WUNLOCK(object);
 
-	/*
-	 * calculate the file virtual address for the transfer
-	 */
-	foff = IDX_TO_OFF(m[0]->pindex);
+	/* Report back actual behind/ahead read. */
+	if (a_rbehind)
+		*a_rbehind = bp->b_pgbefore;
+	if (a_rahead)
+		*a_rahead = bp->b_pgafter;
 
-	/*
-	 * calculate the size of the transfer
-	 */
-	size = count * PAGE_SIZE;
-	KASSERT(count > 0, ("zero count"));
-	if ((foff + size) > object->un_pager.vnp.vnp_size)
-		size = object->un_pager.vnp.vnp_size - foff;
-	KASSERT(size > 0, ("zero size"));
+	KASSERT(bp->b_npages <= sizeof(bp->b_pages),
+	    ("%s: buf %p overflowed", __func__, bp));
 
 	/*
-	 * round up physical size for real devices.
+	 * Recalculate first offset and bytecount with regards to read behind.
+	 * Truncate bytecount to vnode real size and round up physical size
+	 * for real devices.
 	 */
+	foff = IDX_TO_OFF(bp->b_pages[0]->pindex);
+	bytecount = bp->b_npages << PAGE_SHIFT;
+	if ((foff + bytecount) > object->un_pager.vnp.vnp_size)
+		bytecount = object->un_pager.vnp.vnp_size - foff;
 	secmask = bo->bo_bsize - 1;
 	KASSERT(secmask < PAGE_SIZE && secmask > 0,
-	    ("vnode_pager_generic_getpages: sector size %d too large",
-	    secmask + 1));
-	size = (size + secmask) & ~secmask;
+	    ("%s: sector size %d too large", __func__, secmask + 1));
+	bytecount = (bytecount + secmask) & ~secmask;
 
-	bp = getpbuf(&vnode_pbuf_freecnt);
-	kva = (vm_offset_t)bp->b_data;
-
 	/*
-	 * and map the pages to be read into the kva, if the filesystem
+	 * And map the pages to be read into the kva, if the filesystem
 	 * requires mapped buffers.
 	 */
-	mp = vp->v_mount;
-	if (mp != NULL && (mp->mnt_kern_flag & MNTK_UNMAPPED_BUFS) != 0 &&
+	if ((vp->v_mount->mnt_kern_flag & MNTK_UNMAPPED_BUFS) != 0 &&
 	    unmapped_buf_allowed) {
 		bp->b_data = unmapped_buf;
-		bp->b_kvabase = unmapped_buf;
 		bp->b_offset = 0;
-		bp->b_flags |= B_UNMAPPED;
-		bp->b_npages = count;
-		for (i = 0; i < count; i++)
-			bp->b_pages[i] = m[i];
-	} else
-		pmap_qenter(kva, m, count);
+	} else {
+		bp->b_data = bp->b_kvabase;
+		pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages);
+	}
 
-	/* build a minimal buffer header */
+	/* Build a minimal buffer header. */
 	bp->b_iocmd = BIO_READ;
-	bp->b_iodone = bdone;
 	KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred"));
 	KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred"));
 	bp->b_rcred = crhold(curthread->td_ucred);
 	bp->b_wcred = crhold(curthread->td_ucred);
-	bp->b_blkno = firstaddr;
 	pbgetbo(bo, bp);
 	bp->b_vp = vp;
-	bp->b_bcount = size;
-	bp->b_bufsize = size;
-	bp->b_runningbufspace = bp->b_bufsize;
+	bp->b_bcount = bp->b_bufsize = bp->b_runningbufspace = bytecount;
+	bp->b_iooffset = dbtob(bp->b_blkno);
+
 	atomic_add_long(&runningbufspace, bp->b_runningbufspace);
-
 	PCPU_INC(cnt.v_vnodein);
-	PCPU_ADD(cnt.v_vnodepgsin, count);
+	PCPU_ADD(cnt.v_vnodepgsin, bp->b_npages);
 
-	/* do the input */
-	bp->b_iooffset = dbtob(bp->b_blkno);
-	bstrategy(bp);
+	if (iodone != NULL) { /* async */
+		bp->b_pgiodone = iodone;
+		bp->b_caller1 = arg;
+		bp->b_iodone = vnode_pager_generic_getpages_done_async;
+		bp->b_flags |= B_ASYNC;
+		BUF_KERNPROC(bp);
+		bstrategy(bp);
+		return (VM_PAGER_OK);
+	} else {
+		bp->b_iodone = bdone;
+		bstrategy(bp);
+		bwait(bp, PVM, "vnread");
+		error = vnode_pager_generic_getpages_done(bp);
+		for (i = 0; i < bp->b_npages; i++)
+			bp->b_pages[i] = NULL;
+		bp->b_vp = NULL;
+		pbrelbo(bp);
+		relpbuf(bp, &vnode_pbuf_freecnt);
+		return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK);
+	}
+}
 
-	bwait(bp, PVM, "vnread");
+static void
+vnode_pager_generic_getpages_done_async(struct buf *bp)
+{
+	int error;
 
-	if ((bp->b_ioflags & BIO_ERROR) != 0)
-		error = EIO;
+	error = vnode_pager_generic_getpages_done(bp);
+	/* Run the iodone upon the requested range. */
+	bp->b_pgiodone(bp->b_caller1, bp->b_pages + bp->b_pgbefore,
+	    bp->b_npages - bp->b_pgbefore - bp->b_pgafter, error);
+	for (int i = 0; i < bp->b_npages; i++)
+		bp->b_pages[i] = NULL;
+	bp->b_vp = NULL;
+	pbrelbo(bp);
+	relpbuf(bp, &vnode_async_pbuf_freecnt);
+}
 
-	if (error == 0 && size != count * PAGE_SIZE) {
-		if ((bp->b_flags & B_UNMAPPED) != 0) {
-			bp->b_flags &= ~B_UNMAPPED;
-			pmap_qenter(kva, m, count);
+static int
+vnode_pager_generic_getpages_done(struct buf *bp)
+{
+	vm_object_t object;
+	off_t tfoff, nextoff;
+	int i, error;
+
+	error = (bp->b_ioflags & BIO_ERROR) != 0 ? EIO : 0;
+	object = bp->b_vp->v_object;
+
+	if (error == 0 && bp->b_bcount != bp->b_npages * PAGE_SIZE) {
+		if (!buf_mapped(bp)) {
+			bp->b_data = bp->b_kvabase;
+			pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages,
+			    bp->b_npages);
 		}
-		bzero((caddr_t)kva + size, PAGE_SIZE * count - size);
+		bzero(bp->b_data + bp->b_bcount,
+		    PAGE_SIZE * bp->b_npages - bp->b_bcount);
 	}
-	if ((bp->b_flags & B_UNMAPPED) == 0)
-		pmap_qremove(kva, count);
-	if (mp != NULL && (mp->mnt_kern_flag & MNTK_UNMAPPED_BUFS) != 0) {
-		bp->b_data = (caddr_t)kva;
-		bp->b_kvabase = (caddr_t)kva;
-		bp->b_flags &= ~B_UNMAPPED;
-		for (i = 0; i < count; i++)
-			bp->b_pages[i] = NULL;
+	if (buf_mapped(bp)) {
+		pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages);
+		bp->b_data = unmapped_buf;
 	}
 
-	/*
-	 * free the buffer header back to the swap buffer pool
-	 */
-	bp->b_vp = NULL;
-	pbrelbo(bp);
-	relpbuf(bp, &vnode_pbuf_freecnt);
-
 	VM_OBJECT_WLOCK(object);
-	for (i = 0, tfoff = foff; i < count; i++, tfoff = nextoff) {
+	for (i = 0, tfoff = IDX_TO_OFF(bp->b_pages[0]->pindex);
+	    i < bp->b_npages; i++, tfoff = nextoff) {
 		vm_page_t mt;
 
 		nextoff = tfoff + PAGE_SIZE;
-		mt = m[i];
+		mt = bp->b_pages[i];
 
 		if (nextoff <= object->un_pager.vnp.vnp_size) {
 			/*
@@ -947,11 +1069,9 @@
 			 */
 			mt->valid = VM_PAGE_BITS_ALL;
 			KASSERT(mt->dirty == 0,
-			    ("vnode_pager_generic_getpages: page %p is dirty",
-			    mt));
+			    ("%s: page %p is dirty", __func__, mt));
 			KASSERT(!pmap_page_is_mapped(mt),
-			    ("vnode_pager_generic_getpages: page %p is mapped",
-			    mt));
+			    ("%s: page %p is mapped", __func__, mt));
 		} else {
 			/*
 			 * Read did not fill up entire page.
@@ -964,18 +1084,17 @@
 			    object->un_pager.vnp.vnp_size - tfoff);
 			KASSERT((mt->dirty & vm_page_bits(0,
 			    object->un_pager.vnp.vnp_size - tfoff)) == 0,
-			    ("vnode_pager_generic_getpages: page %p is dirty",
-			    mt));
+			    ("%s: page %p is dirty", __func__, mt));
 		}
-		
-		if (i != reqpage)
+
+		if (i < bp->b_pgbefore || i >= bp->b_npages - bp->b_pgafter)
 			vm_page_readahead_finish(mt);
 	}
 	VM_OBJECT_WUNLOCK(object);
-	if (error) {
-		printf("vnode_pager_getpages: I/O read error\n");
-	}
-	return (error ? VM_PAGER_ERROR : VM_PAGER_OK);
+	if (error != 0)
+		printf("%s: I/O read error %d\n", __func__, error);
+
+	return (error);
 }
 
 /*
@@ -1006,7 +1125,7 @@
 	 * daemon up.  This should be probably be addressed XXX.
 	 */
 
-	if (cnt.v_free_count + cnt.v_cache_count < cnt.v_pageout_free_min)
+	if (vm_cnt.v_free_count < vm_cnt.v_pageout_free_min)
 		flags |= VM_PAGER_PUT_SYNC;
 
 	/*
@@ -1014,19 +1133,36 @@
 	 */
 	vp = object->handle;
 	VM_OBJECT_WUNLOCK(object);
-	rtval = VOP_PUTPAGES(vp, m, bytes, flags, rtvals, 0);
+	rtval = VOP_PUTPAGES(vp, m, bytes, flags, rtvals);
 	KASSERT(rtval != EOPNOTSUPP, 
 	    ("vnode_pager: stale FS putpages\n"));
 	VM_OBJECT_WLOCK(object);
 }
 
+static int
+vn_off2bidx(vm_ooffset_t offset)
+{
 
+	return ((offset & PAGE_MASK) / DEV_BSIZE);
+}
+
+static bool
+vn_dirty_blk(vm_page_t m, vm_ooffset_t offset)
+{
+
+	KASSERT(IDX_TO_OFF(m->pindex) <= offset &&
+	    offset < IDX_TO_OFF(m->pindex + 1),
+	    ("page %p pidx %ju offset %ju", m, (uintmax_t)m->pindex,
+	    (uintmax_t)offset));
+	return ((m->dirty & ((vm_page_bits_t)1 << vn_off2bidx(offset))) != 0);
+}
+
 /*
  * This is now called from local media FS's to operate against their
  * own vnodes if they fail to implement VOP_PUTPAGES.
  *
  * This is typically called indirectly via the pageout daemon and
- * clustering has already typically occured, so in general we ask the
+ * clustering has already typically occurred, so in general we ask the
  * underlying filesystem to write the data out asynchronously rather
  * then delayed.
  */
@@ -1034,18 +1170,14 @@
 vnode_pager_generic_putpages(struct vnode *vp, vm_page_t *ma, int bytecount,
     int flags, int *rtvals)
 {
-	int i;
 	vm_object_t object;
 	vm_page_t m;
-	int count;
-
-	int maxsize, ncount;
-	vm_ooffset_t poffset;
+	vm_ooffset_t maxblksz, next_offset, poffset, prev_offset;
 	struct uio auio;
 	struct iovec aiov;
-	int error;
-	int ioflags;
-	int ppscheck = 0;
+	off_t prev_resid, wrsz;
+	int count, error, i, maxsize, ncount, pgoff, ppscheck;
+	bool in_hole;
 	static struct timeval lastfail;
 	static int curfail;
 
@@ -1056,10 +1188,11 @@
 		rtvals[i] = VM_PAGER_ERROR;
 
 	if ((int64_t)ma[0]->pindex < 0) {
-		printf("vnode_pager_putpages: attempt to write meta-data!!! -- 0x%lx(%lx)\n",
-		    (long)ma[0]->pindex, (u_long)ma[0]->dirty);
+		printf("vnode_pager_generic_putpages: "
+		    "attempt to write meta-data 0x%jx(%lx)\n",
+		    (uintmax_t)ma[0]->pindex, (u_long)ma[0]->dirty);
 		rtvals[0] = VM_PAGER_BAD;
-		return VM_PAGER_BAD;
+		return (VM_PAGER_BAD);
 	}
 
 	maxsize = count * PAGE_SIZE;
@@ -1069,7 +1202,7 @@
 
 	/*
 	 * If the page-aligned write is larger then the actual file we
-	 * have to invalidate pages occuring beyond the file EOF.  However,
+	 * have to invalidate pages occurring beyond the file EOF.  However,
 	 * there is an edge case where a file may not be page-aligned where
 	 * the last page is partially invalid.  In this case the filesystem
 	 * may not properly clear the dirty bits for the entire page (which
@@ -1079,14 +1212,20 @@
 	 * We do not under any circumstances truncate the valid bits, as
 	 * this will screw up bogus page replacement.
 	 */
-	VM_OBJECT_WLOCK(object);
+	VM_OBJECT_RLOCK(object);
 	if (maxsize + poffset > object->un_pager.vnp.vnp_size) {
+		if (!VM_OBJECT_TRYUPGRADE(object)) {
+			VM_OBJECT_RUNLOCK(object);
+			VM_OBJECT_WLOCK(object);
+			if (maxsize + poffset <= object->un_pager.vnp.vnp_size)
+				goto downgrade;
+		}
 		if (object->un_pager.vnp.vnp_size > poffset) {
-			int pgoff;
-
 			maxsize = object->un_pager.vnp.vnp_size - poffset;
 			ncount = btoc(maxsize);
 			if ((pgoff = (int)maxsize & PAGE_MASK) != 0) {
+				pgoff = roundup2(pgoff, DEV_BSIZE);
+
 				/*
 				 * If the object is locked and the following
 				 * conditions hold, then the page's dirty
@@ -1097,6 +1236,7 @@
 				vm_page_assert_sbusied(m);
 				KASSERT(!pmap_page_is_write_mapped(m),
 		("vnode_pager_generic_putpages: page %p is not read-only", m));
+				MPASS(m->dirty != 0);
 				vm_page_clear_dirty(m, pgoff, PAGE_SIZE -
 				    pgoff);
 			}
@@ -1104,64 +1244,152 @@
 			maxsize = 0;
 			ncount = 0;
 		}
-		if (ncount < count) {
-			for (i = ncount; i < count; i++) {
-				rtvals[i] = VM_PAGER_BAD;
+		for (i = ncount; i < count; i++)
+			rtvals[i] = VM_PAGER_BAD;
+downgrade:
+		VM_OBJECT_LOCK_DOWNGRADE(object);
+	}
+
+	auio.uio_iov = &aiov;
+	auio.uio_segflg = UIO_NOCOPY;
+	auio.uio_rw = UIO_WRITE;
+	auio.uio_td = NULL;
+	maxblksz = roundup2(poffset + maxsize, DEV_BSIZE);
+
+	for (prev_offset = poffset; prev_offset < maxblksz;) {
+		/* Skip clean blocks. */
+		for (in_hole = true; in_hole && prev_offset < maxblksz;) {
+			m = ma[OFF_TO_IDX(prev_offset - poffset)];
+			for (i = vn_off2bidx(prev_offset);
+			    i < sizeof(vm_page_bits_t) * NBBY &&
+			    prev_offset < maxblksz; i++) {
+				if (vn_dirty_blk(m, prev_offset)) {
+					in_hole = false;
+					break;
+				}
+				prev_offset += DEV_BSIZE;
 			}
 		}
+		if (in_hole)
+			goto write_done;
+
+		/* Find longest run of dirty blocks. */
+		for (next_offset = prev_offset; next_offset < maxblksz;) {
+			m = ma[OFF_TO_IDX(next_offset - poffset)];
+			for (i = vn_off2bidx(next_offset);
+			    i < sizeof(vm_page_bits_t) * NBBY &&
+			    next_offset < maxblksz; i++) {
+				if (!vn_dirty_blk(m, next_offset))
+					goto start_write;
+				next_offset += DEV_BSIZE;
+			}
+		}
+start_write:
+		if (next_offset > poffset + maxsize)
+			next_offset = poffset + maxsize;
+
+		/*
+		 * Getting here requires finding a dirty block in the
+		 * 'skip clean blocks' loop.
+		 */
+		MPASS(prev_offset < next_offset);
+
+		VM_OBJECT_RUNLOCK(object);
+		aiov.iov_base = NULL;
+		auio.uio_iovcnt = 1;
+		auio.uio_offset = prev_offset;
+		prev_resid = auio.uio_resid = aiov.iov_len = next_offset -
+		    prev_offset;
+		error = VOP_WRITE(vp, &auio,
+		    vnode_pager_putpages_ioflags(flags), curthread->td_ucred);
+
+		wrsz = prev_resid - auio.uio_resid;
+		if (wrsz == 0) {
+			if (ppsratecheck(&lastfail, &curfail, 1) != 0) {
+				vn_printf(vp, "vnode_pager_putpages: "
+				    "zero-length write at %ju resid %zd\n",
+				    auio.uio_offset, auio.uio_resid);
+			}
+			VM_OBJECT_RLOCK(object);
+			break;
+		}
+
+		/* Adjust the starting offset for next iteration. */
+		prev_offset += wrsz;
+		MPASS(auio.uio_offset == prev_offset);
+
+		ppscheck = 0;
+		if (error != 0 && (ppscheck = ppsratecheck(&lastfail,
+		    &curfail, 1)) != 0)
+			vn_printf(vp, "vnode_pager_putpages: I/O error %d\n",
+			    error);
+		if (auio.uio_resid != 0 && (ppscheck != 0 ||
+		    ppsratecheck(&lastfail, &curfail, 1) != 0))
+			vn_printf(vp, "vnode_pager_putpages: residual I/O %zd "
+			    "at %ju\n", auio.uio_resid,
+			    (uintmax_t)ma[0]->pindex);
+		VM_OBJECT_RLOCK(object);
+		if (error != 0 || auio.uio_resid != 0)
+			break;
 	}
-	VM_OBJECT_WUNLOCK(object);
+write_done:
+	/* Mark completely processed pages. */
+	for (i = 0; i < OFF_TO_IDX(prev_offset - poffset); i++)
+		rtvals[i] = VM_PAGER_OK;
+	/* Mark partial EOF page. */
+	if (prev_offset == poffset + maxsize && (prev_offset & PAGE_MASK) != 0)
+		rtvals[i++] = VM_PAGER_OK;
+	/* Unwritten pages in range, free bonus if the page is clean. */
+	for (; i < ncount; i++)
+		rtvals[i] = ma[i]->dirty == 0 ? VM_PAGER_OK : VM_PAGER_ERROR;
+	VM_OBJECT_RUNLOCK(object);
+	PCPU_ADD(cnt.v_vnodepgsout, i);
+	PCPU_INC(cnt.v_vnodeout);
+	return (rtvals[0]);
+}
 
+int
+vnode_pager_putpages_ioflags(int pager_flags)
+{
+	int ioflags;
+
 	/*
-	 * pageouts are already clustered, use IO_ASYNC to force a bawrite()
-	 * rather then a bdwrite() to prevent paging I/O from saturating 
-	 * the buffer cache.  Dummy-up the sequential heuristic to cause
-	 * large ranges to cluster.  If neither IO_SYNC or IO_ASYNC is set,
-	 * the system decides how to cluster.
+	 * Pageouts are already clustered, use IO_ASYNC to force a
+	 * bawrite() rather then a bdwrite() to prevent paging I/O
+	 * from saturating the buffer cache.  Dummy-up the sequential
+	 * heuristic to cause large ranges to cluster.  If neither
+	 * IO_SYNC or IO_ASYNC is set, the system decides how to
+	 * cluster.
 	 */
 	ioflags = IO_VMIO;
-	if (flags & (VM_PAGER_PUT_SYNC | VM_PAGER_PUT_INVAL))
+	if ((pager_flags & (VM_PAGER_PUT_SYNC | VM_PAGER_PUT_INVAL)) != 0)
 		ioflags |= IO_SYNC;
-	else if ((flags & VM_PAGER_CLUSTER_OK) == 0)
+	else if ((pager_flags & VM_PAGER_CLUSTER_OK) == 0)
 		ioflags |= IO_ASYNC;
-	ioflags |= (flags & VM_PAGER_PUT_INVAL) ? IO_INVAL: 0;
+	ioflags |= (pager_flags & VM_PAGER_PUT_INVAL) != 0 ? IO_INVAL: 0;
+	ioflags |= (pager_flags & VM_PAGER_PUT_NOREUSE) != 0 ? IO_NOREUSE : 0;
 	ioflags |= IO_SEQMAX << IO_SEQSHIFT;
-
-	aiov.iov_base = (caddr_t) 0;
-	aiov.iov_len = maxsize;
-	auio.uio_iov = &aiov;
-	auio.uio_iovcnt = 1;
-	auio.uio_offset = poffset;
-	auio.uio_segflg = UIO_NOCOPY;
-	auio.uio_rw = UIO_WRITE;
-	auio.uio_resid = maxsize;
-	auio.uio_td = (struct thread *) 0;
-	error = VOP_WRITE(vp, &auio, ioflags, curthread->td_ucred);
-	PCPU_INC(cnt.v_vnodeout);
-	PCPU_ADD(cnt.v_vnodepgsout, ncount);
-
-	if (error) {
-		if ((ppscheck = ppsratecheck(&lastfail, &curfail, 1)))
-			printf("vnode_pager_putpages: I/O error %d\n", error);
-	}
-	if (auio.uio_resid) {
-		if (ppscheck || ppsratecheck(&lastfail, &curfail, 1))
-			printf("vnode_pager_putpages: residual I/O %zd at %lu\n",
-			    auio.uio_resid, (u_long)ma[0]->pindex);
-	}
-	for (i = 0; i < ncount; i++) {
-		rtvals[i] = VM_PAGER_OK;
-	}
-	return rtvals[0];
+	return (ioflags);
 }
 
+/*
+ * vnode_pager_undirty_pages().
+ *
+ * A helper to mark pages as clean after pageout that was possibly
+ * done with a short write.  The lpos argument specifies the page run
+ * length in bytes, and the written argument specifies how many bytes
+ * were actually written.  eof is the offset past the last valid byte
+ * in the vnode using the absolute file position of the first byte in
+ * the run as the base from which it is computed.
+ */
 void
-vnode_pager_undirty_pages(vm_page_t *ma, int *rtvals, int written)
+vnode_pager_undirty_pages(vm_page_t *ma, int *rtvals, int written, off_t eof,
+    int lpos)
 {
 	vm_object_t obj;
-	int i, pos;
+	int i, pos, pos_devb;
 
-	if (written == 0)
+	if (written == 0 && eof >= lpos)
 		return;
 	obj = ma[0]->object;
 	VM_OBJECT_WLOCK(obj);
@@ -1175,6 +1403,37 @@
 			vm_page_clear_dirty(ma[i], 0, written & PAGE_MASK);
 		}
 	}
+	if (eof >= lpos) /* avoid truncation */
+		goto done;
+	for (pos = eof, i = OFF_TO_IDX(trunc_page(pos)); pos < lpos; i++) {
+		if (pos != trunc_page(pos)) {
+			/*
+			 * The page contains the last valid byte in
+			 * the vnode, mark the rest of the page as
+			 * clean, potentially making the whole page
+			 * clean.
+			 */
+			pos_devb = roundup2(pos & PAGE_MASK, DEV_BSIZE);
+			vm_page_clear_dirty(ma[i], pos_devb, PAGE_SIZE -
+			    pos_devb);
+
+			/*
+			 * If the page was cleaned, report the pageout
+			 * on it as successful.  msync() no longer
+			 * needs to write out the page, endlessly
+			 * creating write requests and dirty buffers.
+			 */
+			if (ma[i]->dirty == 0)
+				rtvals[i] = VM_PAGER_OK;
+
+			pos = round_page(pos);
+		} else {
+			/* vm_pageout_flush() clears dirty */
+			rtvals[i] = VM_PAGER_BAD;
+			pos += PAGE_SIZE;
+		}
+	}
+done:
 	VM_OBJECT_WUNLOCK(obj);
 }
 

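In the rewritten vnode_pager_generic_putpages() above, the single VOP_WRITE() over the whole request is replaced by a scan of the pages' dirty bitmaps at DEV_BSIZE granularity: vn_off2bidx() maps a file offset to a bit in the page's dirty mask, vn_dirty_blk() tests it, and one write is issued per maximal run of dirty blocks. The standalone program below is a rough userspace model of that scan under assumed constants (PAGE_SIZE 4096, DEV_BSIZE 512, an 8-bit mask per page); it mirrors only the indexing math, not the object locking or the VOP plumbing.

/* Userspace model of the dirty-run scan in the rewritten putpages path. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define	PAGE_SIZE	4096
#define	PAGE_MASK	(PAGE_SIZE - 1)
#define	DEV_BSIZE	512

typedef uint8_t vm_page_bits_t;		/* 4096 / 512 = 8 blocks per page */

static int
off2bidx(uint64_t offset)
{
	/* Same math as vn_off2bidx(): block index within the page. */
	return ((offset & PAGE_MASK) / DEV_BSIZE);
}

static bool
dirty_blk(const vm_page_bits_t *dirty, uint64_t offset)
{
	/* Same test as vn_dirty_blk(), on a flat array of page bitmaps. */
	return ((dirty[offset / PAGE_SIZE] &
	    ((vm_page_bits_t)1 << off2bidx(offset))) != 0);
}

int
main(void)
{
	/* Two pages: page 0 dirty in blocks 2-5, page 1 dirty in block 0. */
	vm_page_bits_t dirty[2] = { 0x3c, 0x01 };
	uint64_t off = 0, end = 2 * PAGE_SIZE, start;

	while (off < end) {
		while (off < end && !dirty_blk(dirty, off))
			off += DEV_BSIZE;	/* skip clean blocks */
		if (off >= end)
			break;
		start = off;
		while (off < end && dirty_blk(dirty, off))
			off += DEV_BSIZE;	/* extend the dirty run */
		printf("write run: [%ju, %ju)\n", (uintmax_t)start,
		    (uintmax_t)off);
	}
	return (0);
}

Run against these bitmaps it reports one write covering bytes [1024, 3072) and one covering [4096, 4608), which is the same clustering the kernel code achieves by issuing VOP_WRITE() once per dirty run.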
Modified: trunk/sys/vm/vnode_pager.h
===================================================================
--- trunk/sys/vm/vnode_pager.h	2020-02-08 19:35:04 UTC (rev 12313)
+++ trunk/sys/vm/vnode_pager.h	2020-02-08 19:35:48 UTC (rev 12314)
@@ -33,7 +33,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)vnode_pager.h	8.1 (Berkeley) 6/11/93
- * $FreeBSD: stable/10/sys/vm/vnode_pager.h 232071 2012-02-23 21:07:16Z kib $
+ * $FreeBSD: stable/11/sys/vm/vnode_pager.h 331722 2018-03-29 02:50:57Z eadler $
  */
 
 #ifndef	_VNODE_PAGER_
@@ -42,14 +42,17 @@
 #ifdef _KERNEL
 
 int vnode_pager_generic_getpages(struct vnode *vp, vm_page_t *m,
-					  int count, int reqpage);
+    int count, int *rbehind, int *rahead, vop_getpages_iodone_t iodone,
+    void *arg);
 int vnode_pager_generic_putpages(struct vnode *vp, vm_page_t *m,
-					  int count, boolean_t sync,
-					  int *rtvals);
-
+    int count, int flags, int *rtvals);
+int vnode_pager_local_getpages(struct vop_getpages_args *ap);
+int vnode_pager_local_getpages_async(struct vop_getpages_async_args *ap);
+int vnode_pager_putpages_ioflags(int pager_flags);
 void vnode_pager_release_writecount(vm_object_t object, vm_offset_t start,
     vm_offset_t end);
-void vnode_pager_undirty_pages(vm_page_t *ma, int *rtvals, int written);
+void vnode_pager_undirty_pages(vm_page_t *ma, int *rtvals, int written,
+    off_t eof, int lpos);
 void vnode_pager_update_writecount(vm_object_t object, vm_offset_t start,
     vm_offset_t end);
 

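With the reqpage argument gone, VOP_GETPAGES() implementations now receive the page run plus the optional rbehind/rahead counters, and the header exports vnode_pager_local_getpages()/vnode_pager_local_getpages_async() for local filesystems whose partially valid pages can occur only at EOF. A hedged sketch of how such a filesystem might wire these into its vop_vector; "myfs" and the elided entries are hypothetical, and only the vnode_pager_* symbols come from this header.

#include <sys/param.h>
#include <sys/vnode.h>
#include <vm/vm.h>
#include <vm/vnode_pager.h>

/* Hypothetical local filesystem: delegate paging to the generic code. */
struct vop_vector myfs_vnodeops = {
	.vop_default		= &default_vnodeops,
	/* ... lookup, read, write, bmap, etc. elided ... */
	.vop_getpages		= vnode_pager_local_getpages,
	.vop_getpages_async	= vnode_pager_local_getpages_async,
};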